On GitHub michelleful / SingaporeRoadnameOrigins
Michelle Fullwood / @michelleful
I'm a grad student in linguistics.
I love languages and maps.
I'm from Singapore.
© OpenStreetMap contributors
{ "type": "Feature", "properties": { "id": 5436.0, "osm_id": 48673274.0, "type": "residential", "name": "Montreal Drive", ... "class": "highway" }, "geometry": { "type": "LineString", "coordinates": [ [ 103.827628075898062, 1.45001447378366 ], [ 103.827546855256259, 1.450088485988644 ], [ 103.82724167016174, 1.450461983594056 ], ... ] } }
# Load the raw OpenStreetMap road extract for Singapore into a GeoDataFrame:
# ~59k road segments with 13 attribute columns each.
>>> import geopandas as gpd >>> df = gpd.read_file('singapore-roads.geojson') >>> df.shape (59218, 13)
# Quick sanity-check plot of the raw geometries (the extract's bounding box
# is larger than Singapore itself -- the next slide filters to the boundary).
>>> df.plot()
>>> # `within` function returns true if one feature >>> # sits within the boundary of another >>> df = df[df.geometry.within(singapore.geometry)] >>> df.plot()
# Clean-up: drop unnamed segments, then keep only real roads.
# NOTE(review): `accepted_road_types` is defined on an earlier slide
# (a whitelist of OSM 'highway' values -- not footpaths, tracks, etc.).
>>> # filter out empty road names >>> df = df[df['name'].notnull()] >>> # only accept roads whose 'highway' variable is >>> # in an accepted list (not a footpath, etc) >>> df = df[df['highway'].isin(accepted_road_types)]
# split into train and test data (80/20)
# BUG FIX: train_test_split moved from sklearn.cross_validation to
# sklearn.model_selection in scikit-learn 0.18, and the old module has
# since been removed entirely.
# TIP: pass random_state=<int> as well if you need the split (and the
# accuracy figures on later slides) to be reproducible.
from sklearn.model_selection import train_test_split
data_train, data_test, y_train, y_true = \
    train_test_split(df['road_name'], df['classification'], test_size=0.2)
(Jalan) Malu-Malu
# Character n-gram counts (n = 1..4) for "(Jalan) Malu-Malu"; '#' marks a
# word boundary in the bigram/trigram examples.  The vectorizer is fit on
# the training names only, then applied unchanged to the test names so both
# matrices share the same feature columns.
unigrams m(2) a(2) l(2) u(2) -(1) bigrams #m(1) ma(2) al(2) lu(2) u-(1) ... trigrams ##m(1) #ma(1) mal(2) alu(2) ...>>> from sklearn.feature_extraction.text import CountVectorizer >>> ngram_counter = CountVectorizer(ngram_range=(1, 4), analyzer='char') >>> X_train = ngram_counter.fit_transform(data_train) >>> X_test = ngram_counter.transform(data_test)
# Train a linear support-vector classifier on the n-gram count matrix.
>>> from sklearn.svm import LinearSVC >>> classifier = LinearSVC() >>> model = classifier.fit(X_train, y_train)
>>> y_test = model.predict(X_test) >>> sklearn.metrics.accuracy_score(y_true, y_test) 0.551818181818
# (Recap of the earlier vectorizer slide: the same CountVectorizer must be
# fit on the training data and reused -- not refit -- on the test data.)
>>> from sklearn.feature_extraction.text import CountVectorizer >>> ngram_counter = CountVectorizer(ngram_range=(1, 4), analyzer='char') >>> X_train = ngram_counter.fit_transform(data_train) >>> X_test = ngram_counter.transform(data_test)
>>> from sklearn.pipeline import Pipeline >>> ppl = Pipeline([ ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='char')), ('clf', LinearSVC()) ]) >>> model = ppl.fit(data_train) >>> y_test = model.predict(data_test)
Average word length
# Boilerplate skeleton for a custom scikit-learn transformer: inheriting
# from BaseEstimator provides get_params/set_params (required for grid
# search and cloning); TransformerMixin supplies fit_transform for free.
# NOTE(review): the parameter name `vars` shadows the Python builtin, but
# scikit-learn requires __init__ argument names to match the stored
# attribute names exactly for clone() to work, so renaming it here would
# change the public interface.  `do_something_to` is a slide placeholder,
# not a real function.
from sklearn.base import BaseEstimator, TransformerMixin class SampleExtractor(BaseEstimator, TransformerMixin): def __init__(self, vars): self.vars = vars def transform(self, X, y=None): return do_something_to(X, self.vars) def fit(self, X, y=None): return self
from sklearn.base import BaseEstimator, TransformerMixin

class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in df, extracts road name column, outputs average word length"""

    def __init__(self):
        # Stateless transformer: no hyperparameters to store.
        pass

    def fit(self, X, y=None):
        # Nothing is learned from the data, so fitting is a no-op.
        return self

    def average_word_length(self, name):
        # Mean length of the whitespace-separated tokens in one road name.
        token_lengths = [len(token) for token in name.split()]
        return np.mean(token_lengths)

    def transform(self, X, y=None):
        # Map the per-name helper over the road-name column; the result is
        # a 1-D pandas Series of floats, one value per row of X.
        return X['road_name'].apply(self.average_word_length)
# FeatureUnion stacks heterogeneous features side-by-side: the n-gram
# pipeline's sparse counts plus the average-word-length scalar, then the
# combined matrix feeds the classifier.
# NOTE(review): `ngram_count_pipeline` is defined on an earlier slide.
from sklearn.pipeline import Pipeline, FeatureUnion pipeline = Pipeline([ ('feats', FeatureUnion([ ('ngram', ngram_count_pipeline), # can pass in either a pipeline ('ave', AverageWordLengthExtractor()) # or a transformer ])), ('clf', LinearSVC()) # classifier ])
>>> # When you do this: >>> clf = LinearSVC() >>> # You're really doing this: >>> clf = LinearSVC(C=1.0, loss='l2', ...) >>> # changing the values of these hyperparameters can alter performance, >>> # sometimes quite significantly
>>> from sklearn.grid_search import GridSearchCV >>> pg = {'clf__C': [0.1, 1, 10, 100]} >>> grid = GridSearchCV(pipeline, param_grid=pg, cv=5) >>> grid.fit(X_train, y_train) >>> grid.best_params_ {'clf__C': 0.1} >>> grid.best_score_ 0.702290076336
>>> model = grid.best_estimator_.fit(X_train, y_train) >>> y_test = model.predict(X_test) >>> accuracy_score(y_test, y_true) 0.686590909091
# get_params() walks the whole nested pipeline and exposes every
# hyperparameter under a 'step__substep__param' key -- these are exactly
# the names you use in a GridSearchCV param grid.
>>> pipeline.get_params() # only works if all transformers # inherit from BaseEstimator! {'clf__C': 1.0, 'clf__class_weight': None, 'clf__dual': True, ... 'feats__ngram__vect__ngram_range': (1, 4), 'feats__ngram__vect__preprocessor': None, 'feats__ngram__vect__stop_words': None, }
>>> ax = df.plot(column='classification', colormap='accent')
# Render the matplotlib figure as an interactive Leaflet web map; passing
# df.crs lets mplleaflet reproject the data onto the web tile layer.
>>> import mplleaflet >>> mplleaflet.display(fig=ax.figure, crs=df.crs, tiles='cartodb_positron')