Basic web scraping code
tutorial_basic_DL
Source: https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
1 Simple basics
from urllib.request import urlopen
# fetch the page and decode the UTF-8 bytes (the page contains Chinese text)
html = urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)
# regular expressions: extract the title
import re
res = re.findall(r"<title>(.+?)</title>", html)
print("\nPage title is: ", res[0])
# select paragraph text
res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)   # re.DOTALL makes '.' also match newlines
print("\nPage paragraph is: ", res[0])
# select links
res = re.findall(r'href="(.*?)"', html)
print("\nAll links: ", res)
1.1 BeautifulSoup
from bs4 import BeautifulSoup
from urllib.request import urlopen
html = urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)
soup = BeautifulSoup(html, features='lxml')   # parse the html with the 'lxml' parser to create a soup object
print(soup.h1)
print('\n', soup.p)
all_href = soup.find_all('a') # find tags.
all_href = [l['href'] for l in all_href]
print('\n', all_href)
##find by CSS class
# html: https://morvanzhou.github.io/static/scraping/list.html
''' <ul class="jan">
<li>一月一号</li>
<li>一月二号</li>
<li>一月三号</li>
</ul>'''
jan = soup.find('ul', {"class": 'jan'})
d_jan = jan.find_all('li') # use jan as a parent
for d in d_jan:
    print(d.get_text())
# or
month = soup.find_all('li', {"class": "month"})
'''一月一号
一月二号
一月三号'''
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])
# or
course_links = soup.find_all('a', {'href': re.compile('https://morvan.*')})
2 Following sub-URLs
# Excerpt: assumes `soup` was built from the current page, `his` is a list of visited hrefs,
# and `random` has been imported (see the sketch below).
print(soup.find('h1').get_text(), ' url: ', his[-1])
# find valid urls
sub_urls = soup.find_all("a", {"target": "_blank", "href": re.compile("/item/(%.{2})+$")})
if len(sub_urls) != 0:
    his.append(random.sample(sub_urls, 1)[0]['href'])
else:
    # no valid sub link found, step back
    his.pop()
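The excerpt above comes from the tutorial's random-walk crawler; below is a minimal sketch of the surrounding loop, assuming the Baidu Baike example (the base_url and the starting item are my assumptions, not verbatim from the tutorial).
from urllib.request import urlopen
from bs4 import BeautifulSoup
import random
import re

base_url = "https://baike.baidu.com"                    # assumed target site
his = ["/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB"]    # assumed starting item (percent-encoded)
for i in range(20):
    url = base_url + his[-1]
    soup = BeautifulSoup(urlopen(url).read().decode('utf-8'), features='lxml')
    print(soup.find('h1').get_text(), ' url: ', his[-1])
    # keep only links that look like percent-encoded /item/ pages
    sub_urls = soup.find_all("a", {"target": "_blank", "href": re.compile("/item/(%.{2})+$")})
    if len(sub_urls) != 0:
        his.append(random.sample(sub_urls, 1)[0]['href'])   # jump to a random sub page
    else:
        his.pop()                                           # dead end: step back one page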
3 Requests: an alternative to urllib
import requests
import webbrowser
param = {"wd": "莫烦Python"}
r = requests.get('http://www.baidu.com/s', params=param)
print(r.url)
webbrowser.open(r.url)
data = {'firstname': '莫烦', 'lastname': '周'}
r = requests.post('http://pythonscraping.com/files/processing.php', data=data)
print(r.text) #Hello there, !
file = {'uploadFile': open('./image.png', 'rb')}
r = requests.post('http://pythonscraping.com/files/processing2.php', files=file)
print(r.text)
##login
payload = {'username': 'Morvan', 'password': 'password'}
r = requests.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict()) #{'loggedin': '1', 'username': 'Morvan'}
r = requests.get('http://pythonscraping.com/pages/cookies/profile.php', cookies=r.cookies)
print(r.text)
## login 2: use a Session (cookies are kept automatically)
session = requests.Session()
payload = {'username': 'Morvan', 'password': 'password'}
r = session.post('http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
print(r.cookies.get_dict())
r = session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(r.text)
4 Download something
import os
os.makedirs('./img/', exist_ok=True)
IMAGE_URL = "https://morvanzhou.github.io/static/img/description/learning_step_flowchart.png"
from urllib.request import urlretrieve
urlretrieve(IMAGE_URL, './img/image1.png') # whole document
import requests
r = requests.get(IMAGE_URL)
with open('./img/image2.png', 'wb') as f:
    f.write(r.content)                          # write the whole document at once
r = requests.get(IMAGE_URL, stream=True)        # stream loading
with open('./img/image3.png', 'wb') as f:
    for chunk in r.iter_content(chunk_size=32):
        f.write(chunk)
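If the streaming variant above gets reused a lot, one way to wrap it is sketched below (the helper name, chunk size and error check are my own choices, not from the tutorial).
import os
import requests

def download(url, save_dir='./img/'):
    # hypothetical helper wrapping the streaming download shown above
    os.makedirs(save_dir, exist_ok=True)
    local_path = os.path.join(save_dir, url.split('/')[-1])
    r = requests.get(url, stream=True)
    r.raise_for_status()                        # fail early on HTTP errors
    with open(local_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            f.write(chunk)
    return local_path

# download(IMAGE_URL)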
5 Practice: download images from the web
from bs4 import BeautifulSoup
import requests
URL = "http://www.nationalgeographic.com.cn/animals/"
html = requests.get(URL).text
soup = BeautifulSoup(html, 'lxml')
img_ul = soup.find_all('ul', {"class": "img_list"})
import os
os.makedirs('./img/', exist_ok=True)
## save the images
for ul in img_ul:
    imgs = ul.find_all('img')
    for img in imgs:
        url = img['src']
        r = requests.get(url, stream=True)
        image_name = url.split('/')[-1]
        with open('./img/%s' % image_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=128):
                f.write(chunk)
        print('Saved %s' % image_name)
6 Distributed scraping: multiprocessing
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re
base_url = "http://127.0.0.1:4000/"
# base_url = 'https://morvanzhou.github.io/'
# DON'T OVER CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
if base_url != "http://127.0.0.1:4000/":
    restricted_crawl = True
else:
    restricted_crawl = False
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)                 # slight delay between downloads
    return response.read().decode()
def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url
6.1 Normal way
unseen = set([base_url,])
seen = set()
count, t1 = 1, time.time()
while len(unseen) != 0:                     # still have some urls to visit
    if restricted_crawl and len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    htmls = [crawl(url) for url in unseen]
    print('\nDistributed Parsing...')
    results = [parse(html) for html in htmls]
    print('\nAnalysing...')
    seen.update(unseen)                     # mark the crawled urls as seen
    unseen.clear()                          # nothing unseen
    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # collect new urls to crawl
print('Total time: %.1f s' % (time.time()-t1, ))    # 53 s
6.2 Multiprocessing
unseen = set([base_url,])
seen = set()
pool = mp.Pool(4)
count, t1 = 1, time.time()
while len(unseen) != 0:                     # still have some urls to visit
    if restricted_crawl and len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
    htmls = [j.get() for j in crawl_jobs]               # request connection
    print('\nDistributed Parsing...')
    parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
    results = [j.get() for j in parse_jobs]             # parse html
    print('\nAnalysing...')
    seen.update(unseen)                     # mark the crawled urls as seen
    unseen.clear()                          # nothing unseen
    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)     # collect new urls to crawl
print('Total time: %.1f s' % (time.time()-t1, ))    # 16 s !!!
7 Asyncio tutorial
asyncio is a standard library introduced in Python 3.4 with built-in support for asynchronous I/O; its programming model is essentially an event (message) loop.
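A tiny sketch of that "message loop" idea (my own illustration, not from the tutorial): you register work on the loop, and the loop decides when to run it.
import asyncio

def hello(name):
    print('hello from the loop,', name)

loop = asyncio.new_event_loop()
loop.call_soon(hello, 'callback')   # register a plain callback on the loop
loop.call_soon(loop.stop)           # stop once the pending callbacks have run
loop.run_forever()
loop.close()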
# 7.1 Synchronous version vs. async version
import time
def job(t):
    print('Start job ', t)
    time.sleep(t)                           # wait for "t" seconds
    print('Job ', t, ' takes ', t, ' s')
def main():
    [job(t) for t in range(1, 3)]
t1 = time.time()
main()
print("NO async total time : ", time.time() - t1)
## Translate the above to async
import asyncio
async def job(t):
    print('Start job ', t)
    await asyncio.sleep(t)                  # wait for "t" seconds; the loop can run another job while awaiting
    print('Job ', t, ' takes ', t, ' s')
async def main(loop):
    tasks = [loop.create_task(job(t)) for t in range(1, 3)]    # just create the tasks, they are not run yet
    await asyncio.wait(tasks)               # run the jobs and wait until all tasks are done
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# loop.close() # Ipython notebook gives error if close loop
print("Async total time : ", time.time() - t1)
# 7.2 Crawl web pages
import requests
URL = 'https://morvanzhou.github.io/'
def normal():
    for i in range(2):
        r = requests.get(URL)
        url = r.url
        print(url)
t1 = time.time()
normal()
print("Normal total time:", time.time()-t1)
# 7.2.2 Translate the above to async using aiohttp
import aiohttp
async def job(session):
    response = await session.get(URL)
    return str(response.url)
async def main(loop):
    async with aiohttp.ClientSession() as session:
        tasks = [loop.create_task(job(session)) for _ in range(2)]
        finished, unfinished = await asyncio.wait(tasks)
        all_results = [r.result() for r in finished]    # collect the return values from the jobs
        print(all_results)
t1 = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
# loop.close() # Ipython notebook gives error if close loop
print("Async total time:", time.time() - t1)
# 7.3 Compare async with multiprocessing
import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
from urllib.request import urljoin
import re
import multiprocessing as mp
# base_url = "https://morvanzhou.github.io/"
base_url = "http://127.0.0.1:4000/"
# DON'T OVER CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
if base_url != "http://127.0.0.1:4000/":
    restricted_crawl = True
else:
    restricted_crawl = False
seen = set()
unseen = set([base_url])
def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url
async def crawl(url, session):
    r = await session.get(url)
    html = await r.text()
    await asyncio.sleep(0.1)                # slight delay between downloads
    return html
async def main(loop):
    pool = mp.Pool(8)                       # the pool size only slightly affects speed here
    async with aiohttp.ClientSession() as session:
        count = 1
        while len(unseen) != 0:
            if restricted_crawl and len(seen) > 20:     # same restriction as above for non-local targets
                break
            print('\nAsync Crawling...')
            tasks = [loop.create_task(crawl(url, session)) for url in unseen]
            finished, unfinished = await asyncio.wait(tasks)
            htmls = [f.result() for f in finished]
            print('\nDistributed Parsing...')
            parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
            results = [j.get() for j in parse_jobs]
            print('\nAnalysing...')
            seen.update(unseen)
            unseen.clear()
            for title, page_urls, url in results:
                # print(count, title, url)
                unseen.update(page_urls - seen)
                count += 1
if __name__ == "__main__":
    t1 = time.time()
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
    # loop.close()
    print("Async total time: ", time.time() - t1)
''' Result:
Async Crawling...
Distributed Parsing...
Analysing...
Async Crawling...
Distributed Parsing...
Analysing...
Async Crawling...
Distributed Parsing...
Analysing...
Async total time: 7.21798300743103
'''
7.4 Multiprocessing only: here we try pure multiprocessing and compare the speed
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import multiprocessing as mp
import re
import time
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)                         # slight delay between downloads
    return response.read().decode()
def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url
if __name__ == '__main__':
    # base_url = 'https://morvanzhou.github.io/'
    base_url = "http://127.0.0.1:4000/"
    # DON'T OVER CRAWL THE WEBSITE OR YOU MAY NEVER VISIT AGAIN
    if base_url != "http://127.0.0.1:4000/":
        restricted_crawl = True
    else:
        restricted_crawl = False
    unseen = set([base_url,])
    seen = set()
    pool = mp.Pool(8)                       # the pool size strongly affects speed here
    count, t1 = 1, time.time()
    while len(unseen) != 0:                 # still have some urls to visit
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]           # request connection
        htmls = [h for h in htmls if h is not None]     # remove None
        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]         # parse html
        print('\nAnalysing...')
        seen.update(unseen)
        unseen.clear()
        for title, page_urls, url in results:
            # print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)
    print('Total time: %.1f s' % (time.time()-t1, ))
8 Selenium tutorial
import os
os.makedirs('./img/', exist_ok=True)
from selenium import webdriver
driver = webdriver.Chrome()
driver.get("https://morvanzhou.github.io/")
driver.find_element_by_xpath(u"//img[@alt='强化学习 (Reinforcement Learning)']").click()
driver.find_element_by_link_text("About").click()
driver.find_element_by_link_text(u"赞助").click()
driver.find_element_by_link_text(u"教程 ▾").click()
driver.find_element_by_link_text(u"数据处理 ▾").click()
driver.find_element_by_link_text(u"网页爬虫").click()
html = driver.page_source # get html
driver.get_screenshot_as_file("./img/screenshot1.png")
driver.close()
print(html[:200])
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless") # define headless
# add the option when creating driver
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get("https://morvanzhou.github.io/")
driver.find_element_by_xpath(u"//img[@alt='强化学习 (Reinforcement Learning)']").click()
driver.find_element_by_link_text("About").click()
driver.find_element_by_link_text(u"赞助").click()
driver.find_element_by_link_text(u"教程 ▾").click()
driver.find_element_by_link_text(u"数据处理 ▾").click()
driver.find_element_by_link_text(u"网页爬虫").click()
html = driver.page_source # get html
driver.get_screenshot_as_file("./img/screenshot2.png")
driver.close()
print(html[:200])
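If you are on Selenium 4+, the find_element_by_* helpers no longer exist and chrome_options= is renamed to options=; a minimal sketch of the equivalent calls:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)       # pass options= instead of chrome_options=
driver.get("https://morvanzhou.github.io/")
driver.find_element(By.LINK_TEXT, "About").click()      # find_element(By..., ...) replaces find_element_by_*
print(driver.page_source[:200])
driver.quit()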
9 Scrapy tutorial
import scrapy
class MofanSpider(scrapy.Spider):
    name = "mofan"
    start_urls = [
        'https://morvanzhou.github.io/',
    ]
    # unseen = set()
    # seen = set()      # we don't need these two; scrapy deduplicates requests automatically
    def parse(self, response):
        yield {         # return some results
            'title': response.css('h1::text').extract_first(default='Missing').strip().replace('"', ""),
            'url': response.url,
        }
        urls = response.css('a::attr(href)').re(r'^/.+?/$')    # find all sub urls
        for url in urls:
            yield response.follow(url, callback=self.parse)    # duplicate requests are filtered automatically
# lastly, run this in a terminal:
# scrapy runspider 5-2-scrapy.py -o res.json
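Alternatively, the spider can be started from Python instead of the command line; a sketch using scrapy's CrawlerProcess (the FEEDS export setting assumes Scrapy >= 2.1).
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        "FEEDS": {"res.json": {"format": "json"}},      # write results to res.json, like the -o flag
    })
    process.crawl(MofanSpider)
    process.start()                                     # blocks until the crawl is finished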