Michelle Fullwood / @michelleful
I'm a grad student in linguistics.
I love languages and maps.
I'm from Singapore.
© OpenStreetMap contributors
{ "type": "Feature", "properties": { "id": 5436.0, "osm_id": 48673274.0, "type": "residential", "name": "Montreal Drive", ... "class": "highway" }, "geometry": { "type": "LineString", "coordinates": [ [ 103.827628075898062, 1.45001447378366 ], [ 103.827546855256259, 1.450088485988644 ], [ 103.82724167016174 , 1.450461983594056 ], ... ] } }
>>> import geopandas as gpd
>>> df = gpd.read_file('singapore-roads.geojson')
>>> df.shape
(59218, 13)
>>> df.plot()
>>> # the `within` predicate returns True if one feature
>>> # sits within the boundary of another
>>> df = df[df.geometry.within(singapore.geometry)]
>>> df.plot()
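The `singapore` object used above isn't defined in these snippets; one plausible way to get it (the file name here is an assumption) is to read a boundary file and keep its single row, so that `singapore.geometry` is the country polygon:
>>> # hypothetical boundary file containing one polygon for the whole country
>>> singapore = gpd.read_file('singapore-boundary.geojson').iloc[0]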
>>> # filter out empty road names
>>> df = df[df['name'].notnull()]
>>> # only accept roads whose 'highway' variable is
>>> # in an accepted list (not a footpath, etc)
>>> df = df[df['highway'].isin(accepted_road_types)]
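`accepted_road_types` isn't shown in these slides; a plausible definition using common OSM highway values (this exact list is an assumption) would be:
>>> # hypothetical whitelist of OSM 'highway' values to keep
>>> accepted_road_types = ['residential', 'primary', 'secondary',
...                        'tertiary', 'trunk', 'unclassified']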
Strategies for labelling the data:
Hand label
Train → Classify
Train → Hand correct
Train → Train → Classify
# split into train and test data
from sklearn.model_selection import train_test_split
data_train, data_test, y_train, y_true = \
train_test_split(df['road_name'], df['classification'], test_size=0.2)
(Jalan) Malu-Malu
unigrams:  m(2) a(2) l(2) u(2) -(1)
bigrams:   #m(1) ma(2) al(2) lu(2) u-(1) ...
trigrams:  ##m(1) #ma(1) mal(2) alu(2) ...
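These counts can be reproduced in a few lines of Python (a sketch; the '#' padding follows the convention in the table above rather than any particular library's):
>>> from collections import Counter
>>> def char_ngrams(name, n):
...     """Count character n-grams, padding the edges with '#'."""
...     padded = '#' * (n - 1) + name.lower() + '#' * (n - 1)
...     return Counter(padded[i:i + n] for i in range(len(padded) - n + 1))
>>> char_ngrams('malu-malu', 2)
Counter({'ma': 2, 'al': 2, 'lu': 2, '#m': 1, 'u-': 1, '-m': 1, 'u#': 1})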
British | Chinese | Malay | Indian
23 | 17 | 0 | 0
Alnwick, Berwick, Brickson ... | Boon Teck, Hock Chye, Kheam Hock ... | |
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> ngram_counter = CountVectorizer(ngram_range=(1, 4), analyzer='char')
>>> X_train = ngram_counter.fit_transform(data_train)
>>> X_test = ngram_counter.transform(data_test)
>>> from sklearn.svm import LinearSVC
>>> classifier = LinearSVC()
>>> model = classifier.fit(X_train, y_train)
>>> y_test = model.predict(X_test)
>>> from sklearn.metrics import accuracy_score
>>> accuracy_score(y_true, y_test)
0.551818181818
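To see which n-grams the classifier is leaning on, the LinearSVC coefficients can be paired with the vectoriser's feature names (a sketch; `get_feature_names_out` is the method in recent scikit-learn versions, older versions call it `get_feature_names`):
>>> import numpy as np
>>> feature_names = ngram_counter.get_feature_names_out()
>>> for label, weights in zip(model.classes_, model.coef_):
...     top = np.argsort(weights)[-5:]  # indices of the five highest-weighted n-grams
...     print(label, [feature_names[i] for i in top])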
"At the end of the day, some machine learning projects succeed and some fail. What makes the difference? Easily the most important factor is the features used... This is typically where most of the effort in a machine learning project goes."
- Pedro Domingos, "A Few Useful Things to Know about Machine Learning"
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> ngram_counter = CountVectorizer(ngram_range=(1, 4), analyzer='char')
>>> X_train = ngram_counter.fit_transform(data_train)
>>> X_test = ngram_counter.transform(data_test)
>>> from sklearn.pipeline import Pipeline
>>> ppl = Pipeline([
        ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='char')),
        ('clf', LinearSVC())
    ])
>>> model = ppl.fit(data_train, y_train)
>>> y_test = model.predict(data_test)
Average word length
from sklearn.base import BaseEstimator, TransformerMixin

class SampleExtractor(BaseEstimator, TransformerMixin):

    def __init__(self, vars):
        self.vars = vars

    def transform(self, X, y=None):
        return do_something_to(X, self.vars)  # placeholder: the actual feature extraction goes here

    def fit(self, X, y=None):
        return self
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in df, extracts road name column, outputs average word length"""

    def __init__(self):
        pass

    def average_word_length(self, name):
        return np.mean([len(word) for word in name.split()])

    def transform(self, X, y=None):
        return X['road_name'].apply(self.average_word_length)

    def fit(self, X, y=None):
        return self
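The `ngram_count_pipeline` used in the FeatureUnion below isn't defined in these slides. A plausible version (the column-extracting transformer is an assumption, though the 'vect' step name matches the get_params() output further down) pairs a road-name extractor with the character n-gram counter:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

class TextExtractor(BaseEstimator, TransformerMixin):
    """Hypothetical helper: pull the road_name column out of the DataFrame."""
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X['road_name']

ngram_count_pipeline = Pipeline([
    ('extract', TextExtractor()),
    ('vect', CountVectorizer(ngram_range=(1, 4), analyzer='char')),
])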
from sklearn.pipeline import Pipeline, FeatureUnion

pipeline = Pipeline([
    ('feats', FeatureUnion([
        ('ngram', ngram_count_pipeline),        # can pass in either a pipeline
        ('ave', AverageWordLengthExtractor()),  # or a transformer
    ])),
    ('clf', LinearSVC()),  # classifier
])
>>> # When you do this:
>>> clf = LinearSVC()
>>> # You're really doing this:
>>> clf = LinearSVC(C=1.0, loss='squared_hinge', ...)
>>> # changing the values of these hyperparameters can alter performance,
>>> # sometimes quite significantly
             | C = 0.10 | C = 1.00 | C = 10.0 | C = 100 | C = 1000
gamma = 2^-2 |          |          |          |         |
gamma = 2^0  |    ✔     |          |          |         |
gamma = 2^2  |          |          |          |         |
[Illustration: 5-fold cross-validation. The training data is split into five 20% folds; each fold takes one turn as the validation set while the other four are used for training.]
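In code, the same five-fold scheme can be scored directly with cross_val_score (a sketch; X_train and y_train are the vectorised features and labels from earlier):
>>> from sklearn.model_selection import cross_val_score
>>> scores = cross_val_score(LinearSVC(), X_train, y_train, cv=5)
>>> scores.mean()  # average accuracy across the five validation folds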
>>> from sklearn.model_selection import GridSearchCV
>>> pg = {'clf__C': [0.1, 1, 10, 100]}
>>> grid = GridSearchCV(pipeline, param_grid=pg, cv=5)
>>> grid.fit(X_train, y_train)
>>> grid.best_params_
{'clf__C': 0.1}
>>> grid.best_score_
0.702290076336
>>> model = grid.best_estimator_.fit(X_train, y_train)
>>> y_test = model.predict(X_test)
>>> accuracy_score(y_test, y_true)
0.686590909091
>>> pipeline.get_params() # only works if all transformers
# inherit from BaseEstimator!
{'clf__C': 1.0,
'clf__class_weight': None,
'clf__dual': True,
...
'feats__ngram__vect__ngram_range': (1, 4),
'feats__ngram__vect__preprocessor': None,
'feats__ngram__vect__stop_words': None,
}
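Those double-underscore names are exactly what the param_grid keys above refer to, and they can also be set directly on the pipeline:
>>> pipeline.set_params(clf__C=10)  # same 'step__parameter' naming as in param_grid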
>>> ax = df.plot(column='classification', cmap='Accent')
>>> import mplleaflet
>>> mplleaflet.display(fig=ax.figure, crs=df.crs, tiles='cartodb_positron')