Created by Frank Yang
pip3 install requests
>>> import requests
>>> req = requests.get('http://example.com')
>>> req.text
>>> payload = {'key1': 'value1', 'key2': 'value2'}
>>> req = requests.post("http://httpbin.org/post", data=payload)
>>> req.text
pip3 install beautifulsoup4
>>> from bs4 import BeautifulSoup
>>> req = requests.get('https://raw.githubusercontent.com/FrankYang0529/Parser-Tutorial/master/parse.html')
>>> soup = BeautifulSoup(req.text, 'html.parser')
>>> div = soup.find("div")
>>> div.a['href']
'http://google.com'
>>> div_all = soup.find_all("div", {"class": "item"})
[<div class="item">...</div>, <div class="item">...</div>]
>>> div_all[1].a.span.text
'Facebook'
import asyncio


async def coro_function(delay=3):
    """Print "sleeping", wait *delay* seconds without blocking, print "Done".

    Args:
        delay: seconds to sleep (default 3, matching the original example).

    Returns:
        None.

    When two of these run concurrently the sleeps overlap, so total wall
    time is ~delay seconds instead of 2*delay — the point of the demo.
    """
    print("sleeping")
    # Non-blocking sleep: yields control to the event loop so the other
    # coroutine can run during the wait.
    await asyncio.sleep(delay)
    print("Done")


async def _demo():
    # gather runs both coroutines concurrently on the same loop.
    await asyncio.gather(coro_function(), coro_function())


if __name__ == "__main__":
    # asyncio.run replaces the manual get_event_loop()/run_until_complete
    # pattern. The original used @asyncio.coroutine + yield from (removed
    # in Python 3.11) and passed bare coroutines to asyncio.wait (removed
    # in Python 3.12).
    asyncio.run(_demo())
sleeping
sleeping
Done
Done
pip3 install aiohttp
async def get(*args, **kwargs):
    """Issue an HTTP GET via aiohttp and return the raw body (bytes).

    Args:
        *args, **kwargs: passed straight through to aiohttp's request
            call — positionally the URL, plus options like compress=True.

    Returns:
        The fully-read response body.

    Rewritten from the pre-3.5 @asyncio.coroutine / yield-from style,
    which was removed in Python 3.11.
    NOTE(review): aiohttp's module-level request() API is legacy; modern
    aiohttp prefers an explicit ClientSession context manager — confirm
    against the installed aiohttp version.
    """
    response = await aiohttp.request('GET', *args, **kwargs)
    # The body must be read while the connection is still open.
    return await response.read()
def first_magnet(page):
    """Parse *page* as HTML and return its <title> tag.

    (Name is historical — despite "magnet", this returns the title
    element produced by BeautifulSoup, not a magnet link.)
    """
    return bs4.BeautifulSoup(page, 'html.parser').title
async def print_magnet(query):
    """Download *query* (a URL) and print "<url>: <title tag>".

    Args:
        query: the URL to fetch (the tutorial feeds BBC article URLs).

    Returns:
        None; output goes to stdout.

    Rewritten from the pre-3.5 @asyncio.coroutine / yield-from style,
    which was removed in Python 3.11.
    """
    url = query
    # get() is the sibling aiohttp wrapper defined above; it returns the
    # raw response body.
    page = await get(url, compress=True)
    magnet = first_magnet(page)
    print('{}: {}'.format(query, magnet))
# Three article URLs fetched concurrently to show the asyncio speedup.
distros = [
    'http://www.bbc.com/news/election-us-2016-35760148',
    'http://www.bbc.com/news/world-europe-35760985',
    'http://www.bbc.com/news/world-asia-35760797',
]


async def _fetch_all():
    # gather schedules all three downloads on the loop at once, so the
    # network waits overlap. The original passed bare coroutines to
    # asyncio.wait, which was removed in Python 3.12.
    await asyncio.gather(*(print_magnet(d) for d in distros))


if __name__ == "__main__":
    # asyncio.run replaces the manual get_event_loop()/run_until_complete
    # pattern used in the original.
    asyncio.run(_fetch_all())
python3 async_parse.py 0.38s user 0.04s system 59% cpu 0.701 total
python3 general_parse.py 0.42s user 0.05s system 16% cpu 2.790 total