On Github michelleful / SingaporeRoadnameOrigins
Michelle Fullwood / @michelleful
I'm a grad student in linguistics.
I love languages and maps.
I'm from Singapore.
© OpenStreetMap contributors
{ "type": "Feature",
"properties":
{ "id": 5436.0, "osm_id": 48673274.0,
"type": "residential",
"name": "Montreal Drive", ...
"class": "highway" },
"geometry":
{ "type": "LineString",
"coordinates": [ [ 103.827628075898062, 1.45001447378366 ],
[ 103.827546855256259, 1.450088485988644 ],
[ 103.82724167016174 , 1.450461983594056 ],
... ] } }
>>> import geopandas as gpd
>>> # Load all Singapore roads exported from OpenStreetMap as GeoJSON
>>> df = gpd.read_file('singapore-roads.geojson')
>>> # (rows, columns) — one row per road feature
>>> df.shape
(59218, 13)
>>> # Quick visual sanity check of the geometries
>>> df.plot()
>>> # `within` function returns true if one feature >>> # sits within the boundary of another >>> df = df[df.geometry.within(singapore.geometry)] >>> df.plot()
>>> # filter out empty road names >>> df = df[df['name'].notnull()] >>> # only accept roads whose 'highway' variable is >>> # in an accepted list (not a footpath, etc) >>> df = df[df['highway'].isin(accepted_road_types)]
# split into train and test data (80% train / 20% held-out test)
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

data_train, data_test, y_train, y_true = \
    train_test_split(df['road_name'], df['classification'], test_size=0.2)
(Jalan) Malu-Malu
unigrams m(2) a(2) l(2) u(2) -(1) bigrams #m(1) ma(2) al(2) lu(2) u-(1) ... trigrams ##m(1) #ma(1) mal(2) alu(2) ...>>> from sklearn.feature_extraction.text import CountVectorizer >>> ngram_counter = CountVectorizer(ngram_range=(1, 4), analyzer='char') >>> X_train = ngram_counter.fit_transform(data_train) >>> X_test = ngram_counter.transform(data_test)
>>> from sklearn.svm import LinearSVC >>> classifier = LinearSVC() >>> model = classifier.fit(X_train, y_train)
>>> y_test = model.predict(X_test) >>> sklearn.metrics.accuracy_score(y_true, y_test) 0.551818181818
>>> from sklearn.feature_extraction.text import CountVectorizer >>> ngram_counter = CountVectorizer(ngram_range=(1, 4), analyzer='char') >>> X_train = ngram_counter.fit_transform(data_train) >>> X_test = ngram_counter.transform(data_test)
>>> from sklearn.pipeline import Pipeline
>>> ppl = Pipeline([
('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='char')),
('clf', LinearSVC())
])
>>> model = ppl.fit(data_train)
>>> y_test = model.predict(data_test)
Average word length
from sklearn.base import BaseEstimator, TransformerMixin


class SampleExtractor(BaseEstimator, TransformerMixin):
    """Boilerplate skeleton for a custom scikit-learn transformer.

    Stores its constructor arguments verbatim (the sklearn convention,
    required for get_params/set_params to work) and applies an arbitrary
    transformation in ``transform``.
    """

    def __init__(self, vars):
        # NOTE(review): `vars` shadows the builtin of the same name; kept
        # here because sklearn requires __init__ params to map 1:1 onto
        # attributes with the same name.
        self.vars = vars

    def transform(self, X, y=None):
        # Placeholder: `do_something_to` is pseudo-code for the actual
        # feature extraction — presumably defined elsewhere; confirm
        # before running.
        return do_something_to(X, self.vars)

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn; return self so the
        # call can be chained (sklearn API contract).
        return self
from sklearn.base import BaseEstimator, TransformerMixin


class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in df, extracts road name column, outputs average word length"""

    def __init__(self):
        pass

    def average_word_length(self, name):
        # Mean length of the whitespace-separated tokens in `name`.
        token_lengths = [len(token) for token in name.split()]
        return np.mean(token_lengths)

    def transform(self, X, y=None):
        # Apply the per-name helper down the road-name column.
        return X['road_name'].apply(self.average_word_length)

    def fit(self, X, y=None):
        # Nothing to learn; returning self keeps the sklearn API contract.
        return self
from sklearn.pipeline import Pipeline, FeatureUnion

# Combine the n-gram counts with the hand-built average-word-length
# feature side by side, then feed the stacked features to the classifier.
# NOTE(review): `ngram_count_pipeline` is defined elsewhere — presumably
# the CountVectorizer pipeline from earlier; confirm before running.
pipeline = Pipeline([
    ('feats', FeatureUnion([
        ('ngram', ngram_count_pipeline),  # can pass in either a pipeline
        ('ave', AverageWordLengthExtractor())  # or a transformer
    ])),
    ('clf', LinearSVC())  # classifier
])
>>> # When you do this: >>> clf = LinearSVC() >>> # You're really doing this: >>> clf = LinearSVC(C=1.0, loss='l2', ...) >>> # changing the values of these hyperparameters can alter performance, >>> # sometimes quite significantly
>>> from sklearn.grid_search import GridSearchCV
>>> pg = {'clf__C': [0.1, 1, 10, 100]}
>>> grid = GridSearchCV(pipeline, param_grid=pg, cv=5)
>>> grid.fit(X_train, y_train)
>>> grid.best_params_
{'clf__C': 0.1}
>>> grid.best_score_
0.702290076336
>>> model = grid.best_estimator_.fit(X_train, y_train) >>> y_test = model.predict(X_test) >>> accuracy_score(y_test, y_true) 0.686590909091
>>> # Every hyperparameter in the pipeline is addressable for grid search
>>> # as <step>__<param>; nested steps chain with double underscores.
>>> pipeline.get_params() # only works if all transformers
# inherit from BaseEstimator!
{'clf__C': 1.0,
'clf__class_weight': None,
'clf__dual': True,
...
'feats__ngram__vect__ngram_range': (1, 4),
'feats__ngram__vect__preprocessor': None,
'feats__ngram__vect__stop_words': None,
}
>>> ax = df.plot(column='classification', colormap='accent')
>>> import mplleaflet >>> mplleaflet.display(fig=ax.figure, crs=df.crs, tiles='cartodb_positron')