On Github everdark / pycrawler_101
nodes | edges | fields
/<node_id>/<edge_name>?fields=f1,f2,...
example:
me/photos?fields=from
obviously, not all results are returned in one shot (by design); the response is paginated
change the Graph API method from GET to POST/DELETE
you can do virtually all the things you may have done on Facebook in browser over the Graph API; however those operations are not covered in this course. :)
curl -i -X GET \ "https://graph.facebook.com/v2.4/me?fields=context&access_token=CAACEdEose0cBAL34H6NiMZB3ZCnPaZBnShQoSY9GZCh81kDLbQZArxKGEPY981H7KfBUjG99jThga2OxQ7owu03IZCgoEcjMDmVSyeZAzos3JZBvWEzbRbfX0DZAl0Au2ybbbZCNZBOsZCYGmjKqCLyTHftwrnOerU07Pismb3QBxYommKEo7oGsWTIIREpbKu4VlHMJY7Q7ZBY00aAZDZD"
import requests
curl -i -X GET \ "https://graph.facebook.com/v2.4/me?metadata=1&access_token=CAACEdEose0cBAJzoEpPQfyhYg1YCDauTpjUsPXPhryjrnDO0ZCkjNrw74dDraVZByB0wGCWXs5zZB21UkRf5ZCFss1lzTiPUNZBJCdrwgMvFTi2feej1FcE9vpfErOd6F1lykYlZA8vQc8CZAOQC98bQxnM0Ed7eZBVMjT41buHTHSZCGBkZCfl2EzbRiZCJpXcewpgrRDZAFStTDAZDZD"
# the target API url
api_addr = "https://graph.facebook.com/v2.4/me"

# a valid token
token = "CAACEdEose0cBAEZAjeBtClmJ0ZAtPDGs1iLav0QUPLofXFGLkCWZAUf0hNUZBQr8cnxOsAg8PGsjlcmglGilGTQgfweUOhboycjVr95itFlmskTPhBYBUBZCZAU1yiPuTDyyvdS7vEVrZAPDa1fSxZBMZA4eL8YvthC6Yk115ZARLufW5nEXI30tnUc3b43P47cu6kJWizviDulwZDZD"

# build the query string for the GET method
qs = {"metadata": 1, "access_token": token}

# issue the GET request; `params` is encoded into the URL query string
r = requests.get(api_addr, params=qs)

# single-argument print(...) emits the same text on Python 2 and Python 3
print(r)
r.close()
r.content[:100]  # content too long, show only the first 100 char
<Response [200]>
'{"name":"Yueh Hsuan Chung","metadata":{"fields":[{"name":"id","description":"The id of this person\'s'
# parse json string into dict meta = r.json() # check top-level keys in parsed results meta.keys()
[u'id', u'name', u'metadata']
# show metadata dict meta["metadata"].keys()
[u'connections', u'fields', u'type']
# inspect the value of fields in metadata meta["metadata"]["fields"][:3] # a list of dict
[{u'description': u"The id of this person's user account. This ID is unique to each app and cannot be used across different apps. [Our upgrade guide provides more information about app-specific IDs](/docs/apps/upgrading/#upgrading_v2_0_user_ids)", u'name': u'id', u'type': u'numeric string'}, {u'description': u"The About Me section of this person's profile", u'name': u'about', u'type': u'string'}, {u'description': u'The age segment for this person expressed as a minimum and maximum age. For example, more than 18, less than 21.', u'name': u'age_range', u'type': u'agerange'}]
metafields = [ l["name"] for l in meta["metadata"]["fields"] ] for s in metafields: print s
id about age_range bio birthday context currency devices education email favorite_athletes favorite_teams first_name gender hometown inspirational_people install_type installed interested_in is_shared_login is_verified languages last_name link location locale meeting_for middle_name name name_format payment_pricepoints test_group political relationship_status religion security_settings significant_other sports quotes third_party_id timezone token_for_business updated_time shared_login_upgrade_required_by verified video_upload_limits viewer_can_send_gift website work public_key cover
curl -i -X GET \ "https://graph.facebook.com/v2.4/100000862115668_932029203502475/likes?access_token=CAACEdEose0cBAJzoEpPQfyhYg1YCDauTpjUsPXPhryjrnDO0ZCkjNrw74dDraVZByB0wGCWXs5zZB21UkRf5ZCFss1lzTiPUNZBJCdrwgMvFTi2feej1FcE9vpfErOd6F1lykYlZA8vQc8CZAOQC98bQxnM0Ed7eZBVMjT41buHTHSZCGBkZCfl2EzbRiZCJpXcewpgrRDZAFStTDAZDZD"
def getAllLikes1(token, node):
    """Return every like on `node`, paging with the `after` cursor.

    Args:
        token: Graph API access token, sent as `access_token`.
        node: id of the node whose `/likes` edge is read.

    Returns:
        A list of the dicts found under "data" on every page
        (the request asks for the `name` field).
    """
    result = []
    api_addr = "https://graph.facebook.com/v2.4/%s/likes" % node
    qs = {"fields": "name", "access_token": token}
    res = requests.get(api_addr, params=qs).json()
    if not res["data"]:
        return result
    result += res["data"]
    # A page that carries no "paging" object ends the walk instead of
    # raising KeyError.
    while "next" in res.get("paging", {}):
        # re-issue the same query, moved forward by the cursor
        qs["after"] = res["paging"]["cursors"]["after"]
        res = requests.get(api_addr, params=qs).json()
        result += res["data"]
    return result


def getAllLikes2(token, node):
    """Return every like on `node`, paging by following the `next` URL.

    Same contract as `getAllLikes1`; the only difference is that each
    following page is fetched from the ready-made URL in
    `paging.next` rather than by rebuilding the query with a cursor.

    Args:
        token: Graph API access token, sent as `access_token`.
        node: id of the node whose `/likes` edge is read.

    Returns:
        A list of the dicts found under "data" on every page.
    """
    result = []
    api_addr = "https://graph.facebook.com/v2.4/%s/likes" % node
    qs = {"fields": "name", "access_token": token}
    res = requests.get(api_addr, params=qs).json()
    if not res["data"]:
        return result
    result += res["data"]
    # A page that carries no "paging" object ends the walk instead of
    # raising KeyError.
    while "next" in res.get("paging", {}):
        res = requests.get(res["paging"]["next"]).json()
        result += res["data"]
    return result
token = "CAACEdEose0cBAEZAjeBtClmJ0ZAtPDGs1iLav0QUPLofXFGLkCWZAUf0hNUZBQr8cnxOsAg8PGsjlcmglGilGTQgfweUOhboycjVr95itFlmskTPhBYBUBZCZAU1yiPuTDyyvdS7vEVrZAPDa1fSxZBMZA4eL8YvthC6Yk115ZARLufW5nEXI30tnUc3b43P47cu6kJWizviDulwZDZD" node = "100000862115668_932029203502475" result1 = getAllLikes1(token=token, node=node) result2 = getAllLikes2(token=token, node=node) result1 == result2
True
# the result is a list of dict containing liked-users' id and name result1[:6]
[{u'id': u'315371435293768', u'name': u'Tindy Cheng'}, {u'id': u'969673423047100', u'name': u'ChunKuei Chu'}, {u'id': u'756609997705112', u'name': u'\u6797\u744b\u744b'}, {u'id': u'852448504770317', u'name': u'Mark Yang'}, {u'id': u'1487000218', u'name': u'\u8607\u4e2d\u624d'}, {u'id': u'1161704643846945', u'name': u'\u9673\u667a\u6cd3'}]
me/posts?fields=created_time,likes.fields(id,name)&since=1420070400
# think about a blueprint of your crawler
class likerCrawler:
    """Skeleton of the like crawler: stores its inputs, methods are stubs."""

    # need a user id and a starting time
    def __init__(self, user, since, token):
        # every input is normalised to a string
        self.user = str(user)
        self.since = str(since)
        self.token = str(token)

    # need a helper to convert datetime string to timestamp
    # (stubs take `self` so they can be called on an instance)
    def str2Timestamp(self):
        pass

    # need a function to crawl all post id in a given time
    def getPostId(self):
        pass

    # need a function to crawl all like info given a post id
    def getLikes(self):
        pass
import requests
import datetime
import time


class likerCrawler:
    """a FB Graph API crawler to get all like info associated with posts since a given time"""

    def __init__(self, user, since, token, tfmt="%Y-%m-%d"):
        """Store the crawl parameters; every argument is kept as a string.

        Args:
            user: node id placed in the `/posts` URL (e.g. "me").
            since: start time, as a string parseable with `tfmt`.
            token: Graph API access token.
            tfmt: `strptime` format describing `since`.
        """
        self.user = str(user)
        self.since = str(since)
        self.token = str(token)
        self.tfmt = str(tfmt)

    def getTimestamp(self):
        """convert datetime string to UNIX timestamp"""
        parsed = datetime.datetime.strptime(self.since, self.tfmt)
        # time.mktime interprets the time tuple as local time
        return int(time.mktime(parsed.timetuple()))

    def getPostIdwithTimestamp(self):
        """given user_id and start time, return a list of {post_id, time}

        Returns:
            A list of {"id": ..., "ts": ...} dicts gathered over every
            page, or None when the first page holds no posts.
        """
        posts = []
        api_addr = "https://graph.facebook.com/v2.4/%s/posts" % self.user
        qs = {"since": self.getTimestamp(), "access_token": self.token}
        res = requests.get(api_addr, params=qs).json()
        if not res["data"]:
            return None
        posts += [{"id": d["id"], "ts": d["created_time"]} for d in res["data"]]
        # The key is the string "next" (not the builtin `next`), and the
        # following pages must be appended to `posts`.
        while "next" in res.get("paging", {}):
            res = requests.get(res["paging"]["next"]).json()
            posts += [{"id": d["id"], "ts": d["created_time"]} for d in res["data"]]
        return posts

    def getLikesPerPost(self, post_id):
        """given a post id, return all user id/name liked it with timestamp

        Returns:
            A tuple (created_time, list of like dicts), or None when the
            response has no "likes" entry.
        """
        like_users = []
        api_addr = "https://graph.facebook.com/v2.4/%s" % post_id
        qs = {"fields": "likes.fields(name),created_time", "access_token": self.token}
        res = requests.get(api_addr, params=qs).json()
        if "likes" not in res:
            return None
        ts = res["created_time"]
        # The first page of likes is nested under "likes", so its paging
        # object lives there too, not at the top level of the post.
        page = res["likes"]
        like_users += page["data"]
        while "next" in page.get("paging", {}):
            page = requests.get(page["paging"]["next"]).json()
            like_users += page["data"]
        return ts, like_users

    def getAllLikes(self):
        """Return the non-None `getLikesPerPost` result of every post."""
        all_likes = [self.getLikesPerPost(d) for d in self.all_posts]
        return [l for l in all_likes if l is not None]

    @property
    def all_posts(self):
        """List of post ids only; empty when no post was found."""
        # getPostIdwithTimestamp returns None when there are no posts
        return [m["id"] for m in self.getPostIdwithTimestamp() or []]
mycrawler = likerCrawler(user="me", since="2014-01-01", token="CAACEdEose0cBAEZAjeBtClmJ0ZAtPDGs1iLav0QUPLofXFGLkCWZAUf0hNUZBQr8cnxOsAg8PGsjlcmglGilGTQgfweUOhboycjVr95itFlmskTPhBYBUBZCZAU1yiPuTDyyvdS7vEVrZAPDa1fSxZBMZA4eL8YvthC6Yk115ZARLufW5nEXI30tnUc3b43P47cu6kJWizviDulwZDZD")
# return all post ids mycrawler.getTimestamp()
1388505600
# return all post ids with timestamp mycrawler.getPostIdwithTimestamp()[:6]
[{'id': u'100000862115668_932029203502475', 'ts': u'2015-04-19T21:12:21+0000'}, {'id': u'100000862115668_930976326941096', 'ts': u'2015-04-18T03:50:24+0000'}, {'id': u'100000862115668_930036487035080', 'ts': u'2015-04-17T04:59:26+0000'}, {'id': u'100000862115668_929925230379539', 'ts': u'2015-04-16T04:26:26+0000'}, {'id': u'100000862115668_929794627059266', 'ts': u'2015-04-15T20:29:36+0000'}, {'id': u'100000862115668_929511450420917', 'ts': u'2015-04-15T05:25:38+0000'}]
# return all post ids only mycrawler.all_posts[:6]
[u'100000862115668_932029203502475', u'100000862115668_930976326941096', u'100000862115668_930036487035080', u'100000862115668_929925230379539', u'100000862115668_929794627059266', u'100000862115668_929511450420917']
# get all likers given a post_id mycrawler.getLikesPerPost("100000862115668_932029203502475")
(u'2015-04-19T21:12:21+0000', [{u'id': u'315371435293768', u'name': u'Tindy Cheng'}, {u'id': u'969673423047100', u'name': u'ChunKuei Chu'}, {u'id': u'756609997705112', u'name': u'\u6797\u744b\u744b'}, {u'id': u'852448504770317', u'name': u'Mark Yang'}, {u'id': u'1487000218', u'name': u'\u8607\u4e2d\u624d'}, {u'id': u'1161704643846945', u'name': u'\u9673\u667a\u6cd3'}, {u'id': u'10204030571792411', u'name': u'Mansun Kuo'}, {u'id': u'10152783776360960', u'name': u'Jim Pai'}, {u'id': u'748835111857876', u'name': u'Yi LinWei'}, {u'id': u'788499557828349', u'name': u'Wush Wu'}, {u'id': u'10200724712359727', u'name': u'Cedar Su'}, {u'id': u'10202285354957426', u'name': u'\u67ef\u9d3b\u5100'}, {u'id': u'878903608788910', u'name': u'\u9b4f\u5ef7\u65ed'}, {u'id': u'1182843781729426', u'name': u'\u6d2a \u5fd7\u6587'}, {u'id': u'912921898725737', u'name': u'\u77f3\u5c0f\u77f3'}, {u'id': u'10153331968758409', u'name': u'Yencheng Chen'}, {u'id': u'10152159007283224', u'name': u'\u6797\u76df\u5091'}, {u'id': u'830239116990181', u'name': u'Han-Wei Wendy Lin'}, {u'id': u'10203196401577752', u'name': u'Teresa Tc'}, {u'id': u'10201864190525667', u'name': u'Yao-Tsung Yang'}, {u'id': u'961707153842993', u'name': u'Meng-Yeh Chuang'}, {u'id': u'877391482280299', u'name': u'Leon Zhang'}, {u'id': u'541969934240', u'name': u'Shao-Chuan Yang'}, {u'id': u'609958632422912', u'name': u'Ikki Oda'}, {u'id': u'10152385217924728', u'name': u'Yuan-Ping Wu'}])
all_likes = mycrawler.getAllLikes() all_likes[:2]
[(u'2015-04-19T21:12:21+0000', [{u'id': u'315371435293768', u'name': u'Tindy Cheng'}, {u'id': u'969673423047100', u'name': u'ChunKuei Chu'}, {u'id': u'756609997705112', u'name': u'\u6797\u744b\u744b'}, {u'id': u'852448504770317', u'name': u'Mark Yang'}, {u'id': u'1487000218', u'name': u'\u8607\u4e2d\u624d'}, {u'id': u'1161704643846945', u'name': u'\u9673\u667a\u6cd3'}, {u'id': u'10204030571792411', u'name': u'Mansun Kuo'}, {u'id': u'10152783776360960', u'name': u'Jim Pai'}, {u'id': u'748835111857876', u'name': u'Yi LinWei'}, {u'id': u'788499557828349', u'name': u'Wush Wu'}, {u'id': u'10200724712359727', u'name': u'Cedar Su'}, {u'id': u'10202285354957426', u'name': u'\u67ef\u9d3b\u5100'}, {u'id': u'878903608788910', u'name': u'\u9b4f\u5ef7\u65ed'}, {u'id': u'1182843781729426', u'name': u'\u6d2a \u5fd7\u6587'}, {u'id': u'912921898725737', u'name': u'\u77f3\u5c0f\u77f3'}, {u'id': u'10153331968758409', u'name': u'Yencheng Chen'}, {u'id': u'10152159007283224', u'name': u'\u6797\u76df\u5091'}, {u'id': u'830239116990181', u'name': u'Han-Wei Wendy Lin'}, {u'id': u'10203196401577752', u'name': u'Teresa Tc'}, {u'id': u'10201864190525667', u'name': u'Yao-Tsung Yang'}, {u'id': u'961707153842993', u'name': u'Meng-Yeh Chuang'}, {u'id': u'877391482280299', u'name': u'Leon Zhang'}, {u'id': u'541969934240', u'name': u'Shao-Chuan Yang'}, {u'id': u'609958632422912', u'name': u'Ikki Oda'}, {u'id': u'10152385217924728', u'name': u'Yuan-Ping Wu'}]), (u'2015-04-17T23:06:21+0000', [{u'id': u'769927709723228', u'name': u'\u65b9\u5efa\u667a'}, {u'id': u'924054170938791', u'name': u'\u9b6f\u8b19'}, {u'id': u'10200724712359727', u'name': u'Cedar Su'}, {u'id': u'877391482280299', u'name': u'Leon Zhang'}, {u'id': u'748835111857876', u'name': u'Yi LinWei'}, {u'id': u'10152385217924728', u'name': u'Yuan-Ping Wu'}, {u'id': u'969673423047100', u'name': u'ChunKuei Chu'}, {u'id': u'100000153076447', u'name': u'Julie Woo'}, {u'id': u'858228977536894', u'name': u'Chun WU'}, {u'id': 
u'10203196401577752', u'name': u'Teresa Tc'}, {u'id': u'10152159007283224', u'name': u'\u6797\u76df\u5091'}])]
# speedup by parallelling
# issue: instance method not able to be pickled, the following is one work-around
from multiprocessing import Pool
from functools import partial


def _getLikesPerPost(obj, arg):
    """Module-level shim that forwards to `obj.getLikesPerPost(arg)`."""
    return obj.getLikesPerPost(arg)


# bind the crawler instance once; the pool then maps over post ids only
_bound_getLikesPerPost = partial(_getLikesPerPost, mycrawler)

p = Pool()  # this must go after function definition
par_res = p.map(_bound_getLikesPerPost, mycrawler.all_posts)
all_likes2 = [likes for likes in par_res if likes is not None]

# check equivalence
all_likes == all_likes2
True
%time all_likes = mycrawler.getAllLikes()
CPU times: user 603 ms, sys: 46.8 ms, total: 650 ms Wall time: 5.72 s
%time all_likes2 = [ l for l in p.map(_bound_getLikesPerPost, mycrawler.all_posts) if l is not None ]
CPU times: user 31.3 ms, sys: 3.55 ms, total: 34.8 ms Wall time: 2.12 s
import functools


# tidy likers into list of (time, name)
def decoratebyTimeUnit(by):
    """Build a decorator that truncates the time stamp of each result row.

    The decorated function must return an iterable of (time_string, value)
    pairs; the wrapper replaces each time string by a `datetime.date`.

    Args:
        by: "date" keeps the calendar day, "month" snaps to the first day
            of the month.

    Returns:
        A decorator for functions taking a single `likes` argument.

    Raises:
        ValueError: if `by` is neither "date" nor "month" (checked when the
            decorator is built, not on the first call).
    """
    if by not in ("date", "month"):
        raise ValueError("Unknown by value")

    def byTimeUnit(dtstr):
        # %z is not supported by strptime here, so the literal "+0000"
        # suffix is matched as plain text
        dt = datetime.datetime.strptime(dtstr, "%Y-%m-%dT%H:%M:%S+0000")
        if by == "date":
            return dt.date()
        return dt.date().replace(day=1)

    def real_decorator(func):
        @functools.wraps(func)  # keep the wrapped function's name and docstring
        def wrapper(likes):
            return [(byTimeUnit(like[0]), like[1]) for like in func(likes)]
        return wrapper
    return real_decorator


@decoratebyTimeUnit("month")
def countByTime(likes):
    """Flatten (time, likers) pairs into one (time, liker name) row per like.

    Args:
        likes: iterable of (time_string, list of dicts with a "name" key).

    Returns:
        A list of (date, name) tuples; the decorator converts each time
        string to the first day of its month.
    """
    return [(t[0], m["name"]) for t in likes for m in t[1]]
likes = countByTime(all_likes) likes[:10]
[(datetime.date(2015, 4, 1), u'Tindy Cheng'), (datetime.date(2015, 4, 1), u'ChunKuei Chu'), (datetime.date(2015, 4, 1), u'\u6797\u744b\u744b'), (datetime.date(2015, 4, 1), u'Mark Yang'), (datetime.date(2015, 4, 1), u'\u8607\u4e2d\u624d'), (datetime.date(2015, 4, 1), u'\u9673\u667a\u6cd3'), (datetime.date(2015, 4, 1), u'Mansun Kuo'), (datetime.date(2015, 4, 1), u'Jim Pai'), (datetime.date(2015, 4, 1), u'Yi LinWei'), (datetime.date(2015, 4, 1), u'Wush Wu')]
from collections import Counter
import pandas as pd

# group by time only
monthly_like_counts = Counter([t[0] for t in likes])
# list(...) so the rows are a concrete sequence on Python 3 as well
df = pd.DataFrame(list(monthly_like_counts.items()))
df.columns = ["time", "count"]
# DataFrame.sort was removed from pandas; sort_values is its replacement
df = df.sort_values("time")
df

# group by time by liker
monthly_like_counts_by_liker = [(t[0][0], t[0][1], t[1]) for t in Counter(likes).items()]
df2 = pd.DataFrame(monthly_like_counts_by_liker)
df2.columns = ["time", "liker", "count"]
df2 = df2.sort_values(["liker", "time"])
df2[:6]

# total like count per liker
# (restricted to "count" so the date column is never summed)
df2_count_by_liker = df2.groupby("liker")[["count"]].sum()
df2_count_by_liker[:6]

# get top 5 likers
top_likers = list(df2_count_by_liker.sort_values("count", ascending=False).index[:5])
top_likers
[u'Teresa Tc', u'Yi LinWei', u'\u6797\u76df\u5091', u'Leon Zhang', u'Yuan-Ping Wu']
# keep only top 5 likers' data df2_filtered = df2[df2["liker"].isin(top_likers)]
# to plot in ipython notebook %matplotlib inline import matplotlib import matplotlib.pyplot as plt
# plot like counts by time df.plot(x="time", y="count", marker='o', figsize=(10,5))
<matplotlib.axes._subplots.AxesSubplot at 0x10650ced0>
matplotlib預設可能不認中文字型,需要特別指定給他。
# run the following code to show all available Chinese font family on your computer
from matplotlib.font_manager import fontManager
import os

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)
plt.xticks([])
plt.yticks([])
x, y = 0.05, 0.18

# Keep fonts whose file exists and is larger than 1e6 bytes
# (presumably a cheap proxy for fonts with a CJK glyph set -- TODO confirm).
# The names are de-duplicated so each family is drawn once.
fonts = sorted(set(
    font.name for font in fontManager.ttflist
    if os.path.exists(font.fname) and os.stat(font.fname).st_size > 1e6
))

# four samples per row; at least one row so an empty list cannot divide by zero
n_rows = max(1, len(fonts) // 4 + (len(fonts) % 4 != 0))
dy = (1.0 - y) / n_rows
for font in fonts:
    # sample text rendered in the font, with the font name just below it
    t = ax.text(x, y, u"中文字體", {'fontname': font, 'fontsize': 14}, transform=ax.transAxes)
    ax.text(x, y - dy / 2, font, transform=ax.transAxes)
    x += 0.25
    if x >= 1.0:
        # row is full: move up one row and back to the left margin
        y += dy
        x = 0.05
# plt.show() # not necessary in ipython notebook
# plot like counts by time by top likers
matplotlib.rc("font", family="AppleGothic")  # set any Chinese font family
fig, ax = plt.subplots(1, 1)
# one line per liker, all drawn on the same axes
p = df2_filtered.groupby("liker").plot(x="time", y="count", marker='x', ax=ax, figsize=(10, 5))
# the group keys (liker names) label the lines;
# Index.get_values() was removed from pandas, so the index is listed directly
plt.legend(list(p.index), loc="best")
<matplotlib.legend.Legend at 0x1081afc50>
右鍵Inspect Element大法!搜尋"data-uid"...
# get list of all available friends on me node
def getAllFriendsOnAPI(token):
    """Return every entry of the `me/friends` edge, following pagination.

    Args:
        token: Graph API access token, sent as `access_token`.

    Returns:
        A list of the dicts found under "data" on every page
        (the request asks for the `name` field).
    """
    fr = []
    target = "https://graph.facebook.com/v2.4/me/friends?fields=name"
    qs = {"access_token": token}
    res = requests.get(target, params=qs).json()
    fr += res["data"]
    # A response without a "paging" object ends the walk instead of
    # raising KeyError.
    while "next" in res.get("paging", {}):
        res = requests.get(res["paging"]["next"]).json()
        fr += res["data"]
    return fr
token = "CAACEdEose0cBAL904Js5Xl2RHrf15tZBGZBOwmUotrvJE660yjwZCZAK6nJd54bFlE4ZAvEf2ZCxGZC6DfusFCZCnDJUPT36bJZBQHkr8WZByrr1HPSb7AVKq632li0RsAbC0todLXr7g4RZBk1MkdbERYzw6Yqs5lIU1I5M5JoJeIqMOpwvbJcyOtM3mYLC2SedbieObS6cVNgpgZDZD" friends = getAllFriendsOnAPI(token) friends[:6]
[{u'id': u'348900235', u'name': u'Han-Wen Chang'}, {u'id': u'503482470', u'name': u'Ning Chen'}, {u'id': u'523887614', u'name': u'\u9673\u594e\u9298'}, {u'id': u'524871968', u'name': u'Pin Wu'}, {u'id': u'710176842', u'name': u'Linhots Tsao'}, {u'id': u'1045695611', u'name': u'Chih-Peng Wu'}]
# get list of posts given a user id
def getAllPosts(user, token):
    """Return the ids of every post of `user`, following pagination.

    Args:
        user: node id placed in the `/posts` URL.
        token: Graph API access token, sent as `access_token`.

    Returns:
        A list of post ids; empty when the first page holds no posts.
    """
    posts = []
    api_addr = "https://graph.facebook.com/v2.4/%s/posts" % user
    qs = {"access_token": token}
    res = requests.get(api_addr, params=qs).json()
    if not res["data"]:
        return []
    posts += [d["id"] for d in res["data"]]
    # The key is the string "next" (not the builtin `next`), and the
    # following pages must be appended to `posts`.
    while "next" in res.get("paging", {}):
        res = requests.get(res["paging"]["next"]).json()
        posts += [d["id"] for d in res["data"]]
    return posts
from functools import partial

# fetch the post ids of every friend, keeping the friends' order
fetch_posts = partial(getAllPosts, token=token)
all_posts = [fetch_posts(f["id"]) for f in friends]
# concrete list so the pairs can be iterated more than once
results = list(zip(friends, all_posts))

for r in results:
    # r is (friend dict, list of post ids)
    print("get %s post(s) from %s" % (len(r[1]), r[0]["name"]))
get 0 post(s) from Han-Wen Chang get 0 post(s) from Ning Chen get 21 post(s) from 陳奎銘 get 0 post(s) from Pin Wu get 25 post(s) from Linhots Tsao get 0 post(s) from Chih-Peng Wu get 23 post(s) from Chia-Chi Chang get 25 post(s) from Summit Suen get 25 post(s) from Ddio Juan get 23 post(s) from Mansun Kuo get 0 post(s) from 蘇中才 get 0 post(s) from 王雅人 get 25 post(s) from 陳嘉葳 get 24 post(s) from 趙致平 get 0 post(s) from Julie Woo get 22 post(s) from Arvin Huang get 24 post(s) from Chien-Wen Chen get 2 post(s) from Haoping Liu get 0 post(s) from Felix Shiao get 25 post(s) from Ikki Oda get 22 post(s) from Ming-Yi Huang get 20 post(s) from 徐嘉泰 get 25 post(s) from Chen John
# function to like a post
def postLikes(post, token):
    """Send a POST request to the `/likes` edge of `post`.

    Args:
        post: id of the post, placed in the URL.
        token: Graph API access token, sent as `access_token`.

    Returns:
        The response object returned by `requests.post`.
    """
    api_addr = "https://graph.facebook.com/v2.4/%s/likes" % post
    qs = {"access_token": token}
    return requests.post(api_addr, params=qs)
that is beyond the scope of this course :)
from wikipedia:
a regular expression (abbreviated regex or regexp and sometimes called a rational expression) is a sequence of characters that define a search pattern, mainly for use in pattern matching with strings, or string matching, i.e. "find and replace"-like operations.
「一串字元,用來表示一種可供匹配搜尋的字串模式。」
# let's make a playground!
import re

test_str = ["kyle chung; (02)2256-1116; New Taipei City",
            "Mr. Funghi,Earth Anywhere, 07-21180293",
            "free tel: 0800 000 123 #1234",
            "Dr.A (not Dr.B) street no.123 2945 1412"]


def findFirstMatched(pattern, test_str=test_str):
    """Print, for each string, the first substring matching `pattern`.

    Args:
        pattern: regular expression source, compiled once per call.
        test_str: strings to search; defaults to the module-level samples.

    Returns:
        None; one "row <i>: ..." line is printed per string.
    """
    p = re.compile(pattern)
    # each print call takes one pre-formatted string, so the text is the
    # same on Python 2 and Python 3
    print("\n=== result of findFirstMatched ===")
    for i, s in enumerate(test_str):
        m = p.search(s)
        if m:
            print("row %s: %s" % (i, m.group()))
        else:
            print("row %s: no match" % i)


def findAllMatched(pattern, test_str=test_str):
    """Print, for each string, the `findall` result of `pattern`.

    Args:
        pattern: regular expression source, compiled once per call.
        test_str: strings to search; defaults to the module-level samples.

    Returns:
        None; one "row <i>:  [...]" line is printed per string.
    """
    p = re.compile(pattern)
    print("\n=== result of findAllMatched ===")
    for i, s in enumerate(test_str):
        matched = p.findall(s)
        # two spaces after the colon reproduce the original
        # `print "row %s: " % i, matched` output
        print("row %s:  %s" % (i, matched))
# test first-matched findFirstMatched("3")
=== result of findFirstMatched === row 0: no match row 1: 3 row 2: 3 row 3: 3
# test all-matched findAllMatched("3")
=== result of findAllMatched === row 0: [] row 1: ['3'] row 2: ['3', '3'] row 3: ['3']
# 1st try: get all digits for s in test_str: print s findFirstMatched("[0-9]") findAllMatched("[0-9]")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: 0 row 1: 0 row 2: 0 row 3: 1 === result of findAllMatched === row 0: ['0', '2', '2', '2', '5', '6', '1', '1', '1', '6'] row 1: ['0', '7', '2', '1', '1', '8', '0', '2', '9', '3'] row 2: ['0', '8', '0', '0', '0', '0', '0', '1', '2', '3', '1', '2', '3', '4'] row 3: ['1', '2', '3', '2', '9', '4', '5', '1', '4', '1', '2']
# 2nd try: get all digits, not just one for s in test_str: print s findFirstMatched("[0-9]+") findAllMatched("[0-9]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: 02 row 1: 07 row 2: 0800 row 3: 123 === result of findAllMatched === row 0: ['02', '2256', '1116'] row 1: ['07', '21180293'] row 2: ['0800', '000', '123', '1234'] row 3: ['123', '2945', '1412']
# 3rd try: get all digits, not just one, and dash for s in test_str: print s findFirstMatched("[0-9-]+") findAllMatched("[0-9-]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: 02 row 1: 07-21180293 row 2: 0800 row 3: 123 === result of findAllMatched === row 0: ['02', '2256-1116'] row 1: ['07-21180293'] row 2: ['0800', '000', '123', '1234'] row 3: ['123', '2945', '1412']
# 4th try: get all digits, not just one, and dash and paranthesis and blanks... for s in test_str: print s findFirstMatched("[0-9-()]+") findAllMatched("[0-9-()]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: (02)2256-1116 row 1: 07-21180293 row 2: 0800 row 3: ( === result of findAllMatched === row 0: ['(02)2256-1116'] row 1: ['07-21180293'] row 2: ['0800', '000', '123', '1234'] row 3: ['(', ')', '123', '2945', '1412']
# now allow blanks, oops! for s in test_str: print s findFirstMatched("[0-9-() ]+") findAllMatched("[0-9-() ]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: row 1: row 2: row 3: ( === result of findAllMatched === row 0: [' ', ' (02)2256-1116', ' ', ' ', ' '] row 1: [' ', ' ', ' 07-21180293'] row 2: [' ', ' 0800 000 123 ', '1234'] row 3: [' (', ' ', ') ', ' ', '123 2945 1412']
# test for optional parenthesis for s in test_str: print s findFirstMatched("\(?[0-9]+\)?") findAllMatched("\(?[0-9]+\)?")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: (02) row 1: 07 row 2: 0800 row 3: 123 === result of findAllMatched === row 0: ['(02)', '2256', '1116'] row 1: ['07', '21180293'] row 2: ['0800', '000', '123', '1234'] row 3: ['123', '2945', '1412']
# optional parenthesis/dash and consecutive numbers for s in test_str: print s findFirstMatched("\(?[0-9]+\)?[0-9-]+") findAllMatched("\(?[0-9]+\)?[0-9-]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: (02)2256-1116 row 1: 07-21180293 row 2: 0800 row 3: 123 === result of findAllMatched === row 0: ['(02)2256-1116'] row 1: ['07-21180293'] row 2: ['0800', '000', '123', '1234'] row 3: ['123', '2945', '1412']
# optional parenthesis/dash and consecutive numbers/blanks for s in test_str: print s findFirstMatched("\(?[0-9]+\)?[0-9- ]+") findAllMatched("\(?[0-9]+\)?[0-9- ]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: (02)2256-1116 row 1: 07-21180293 row 2: 0800 000 123 row 3: 123 2945 1412 === result of findAllMatched === row 0: ['(02)2256-1116'] row 1: ['07-21180293'] row 2: ['0800 000 123 ', '1234'] row 3: ['123 2945 1412']
# restrict numbers of digits for s in test_str: print s findFirstMatched("\(?[0-9]{0,2}\)?-?[0-9]{4}[ -]?[0-9]{4}") findAllMatched("\(?[0-9]{0,2}\)?-?[0-9]{4}[ -]?[0-9]{4}")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: (02)2256-1116 row 1: 07-21180293 row 2: no match row 3: 2945 1412 === result of findAllMatched === row 0: ['(02)2256-1116'] row 1: ['07-21180293'] row 2: [] row 3: ['2945 1412']
# build a second pattern for s in test_str: print s findFirstMatched("[0-9]{4}[ ]?[0-9]{3}[ ]?[0-9]{3}") findAllMatched("[0-9]{4}[ ]?[0-9]{3}[ ]?[0-9]{3}")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: no match row 1: no match row 2: 0800 000 123 row 3: no match === result of findAllMatched === row 0: [] row 1: [] row 2: ['0800 000 123'] row 3: []
# combine two patterns with or operator for s in test_str: print s findFirstMatched("\(?[0-9]{0,2}\)?-?[0-9]{4}[ -]?[0-9]{4}|[0-9]{4}[ ]?[0-9]{3}[ ]?[0-9]{3}") findAllMatched("\(?[0-9]{0,2}\)?-?[0-9]{4}[ -]?[0-9]{4}|[0-9]{4}[ ]?[0-9]{3}[ ]?[0-9]{3}")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: (02)2256-1116 row 1: 07-21180293 row 2: 0800 000 123 row 3: 2945 1412 === result of findAllMatched === row 0: ['(02)2256-1116'] row 1: ['07-21180293'] row 2: ['0800 000 123'] row 3: ['2945 1412']
# deal with optional extension digits for s in test_str: print s findFirstMatched("\(?[0-9]{0,2}\)?-?[0-9]{4}[ -]?[0-9]{4}|[0-9]{4}[ ]?[0-9]{3}[ ]?[0-9]{3}[ ]?#?[0-9]+") findAllMatched("\(?[0-9]{0,2}\)?-?[0-9]{4}[ -]?[0-9]{4}|[0-9]{4}[ ]?[0-9]{3}[ ]?[0-9]{3}[ ]?#?[0-9]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: (02)2256-1116 row 1: 07-21180293 row 2: 0800 000 123 #1234 row 3: 2945 1412 === result of findAllMatched === row 0: ['(02)2256-1116'] row 1: ['07-21180293'] row 2: ['0800 000 123 #1234'] row 3: ['2945 1412']
match anything except newline(\n)
for s in test_str: print s findAllMatched(".")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findAllMatched === row 0: ['k', 'y', 'l', 'e', ' ', 'c', 'h', 'u', 'n', 'g', ';', ' ', '(', '0', '2', ')', '2', '2', '5', '6', '-', '1', '1', '1', '6', ';', ' ', 'N', 'e', 'w', ' ', 'T', 'a', 'i', 'p', 'e', 'i', ' ', 'C', 'i', 't', 'y'] row 1: ['M', 'r', '.', ' ', 'F', 'u', 'n', 'g', 'h', 'i', ',', 'E', 'a', 'r', 't', 'h', ' ', 'A', 'n', 'y', 'w', 'h', 'e', 'r', 'e', ',', ' ', '0', '7', '-', '2', '1', '1', '8', '0', '2', '9', '3'] row 2: ['f', 'r', 'e', 'e', ' ', 't', 'e', 'l', ':', ' ', '0', '8', '0', '0', ' ', '0', '0', '0', ' ', '1', '2', '3', ' ', '#', '1', '2', '3', '4'] row 3: ['D', 'r', '.', 'A', ' ', '(', 'n', 'o', 't', ' ', 'D', 'r', '.', 'B', ')', ' ', 's', 't', 'r', 'e', 'e', 't', ' ', 'n', 'o', '.', '1', '2', '3', ' ', '2', '9', '4', '5', ' ', '1', '4', '1', '2']
for s in test_str: print s findFirstMatched("^kyle") findFirstMatched("^Earth")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: kyle row 1: no match row 2: no match row 3: no match === result of findFirstMatched === row 0: no match row 1: no match row 2: no match row 3: no match
to specify the occurrence condition of its previous character
for s in test_str: print s findFirstMatched("^kyle.*")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: kyle chung; (02)2256-1116; New Taipei City row 1: no match row 2: no match row 3: no match
at least m and at most n occurrence
example:
for s in test_str: print s findFirstMatched("0{3,3}") # could be "0{3}" for short
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findFirstMatched === row 0: no match row 1: no match row 2: 000 row 3: no match
for s in test_str: print s findAllMatched("[a-zA-Z]+")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findAllMatched === row 0: ['kyle', 'chung', 'New', 'Taipei', 'City'] row 1: ['Mr', 'Funghi', 'Earth', 'Anywhere'] row 2: ['free', 'tel'] row 3: ['Dr', 'A', 'not', 'Dr', 'B', 'street', 'no']
for s in test_str: print s findAllMatched("kyle|Funghi") findAllMatched("[Kk]yle|[Ff]unghi")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findAllMatched === row 0: ['kyle'] row 1: ['Funghi'] row 2: [] row 3: [] === result of findAllMatched === row 0: ['kyle'] row 1: ['Funghi'] row 2: [] row 3: []
example:
to have its following character as-is, i.e., stripping special meaning
this is required when you want to match literal meta-char
for s in test_str: print s findAllMatched("\(.*\)")
kyle chung; (02)2256-1116; New Taipei City Mr. Funghi,Earth Anywhere, 07-21180293 free tel: 0800 000 123 #1234 Dr.A (not Dr.B) street no.123 2945 1412 === result of findAllMatched === row 0: ['(02)'] row 1: [] row 2: [] row 3: ['(not Dr.B)']