这两天把爬虫重新捡起来了，顺便看了下多线程和协程，于是分别用单线程、多线程和协程三种方式重新写了爬取豆瓣电影 Top 250 的爬虫。
单线程版本:
import requests
from lxml import etree
import re
from time import time
def get_page(total=250, page_size=25):
    """Build the listing-page URLs for the Douban movie Top 250 chart.

    Args:
        total: Upper bound (exclusive) for the ``start`` offset. The default
            of 250 yields offsets 0 through 225.
        page_size: Step between consecutive ``start`` offsets.

    Returns:
        A list of URL strings, one per offset in
        ``range(0, total, page_size)``. With the defaults this is 10 URLs.

    Raises:
        ValueError: If ``page_size`` is 0 (raised by ``range``).
    """
    url_template = 'https://movie.douban.com/top250?start={}&filter='
    return [url_template.format(offset)
            for offset in range(0, total, page_size)]
def fetch_page(url, timeout=10):
    """Request ``url`` with browser-like headers and return the response.

    Args:
        url: Address to fetch.
        timeout: Seconds forwarded to ``requests.get`` as its ``timeout``
            argument.

    Returns:
        The response object returned by ``requests.get``. The HTTP status
        code is not checked here; callers read ``response.text`` directly.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the crawler indefinitely.
    return requests.get(url, headers=send_headers, timeout=timeout)
def parse():
pages = get_page()
result = []
for url in pages:
response = fetch_page(url)
html = etree.HTML(response.text)
movie_name = '//