import re
import time
from multiprocessing import Pool

import requests

# Browser-like User-Agent so the target site serves the normal HTML page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
}

# Patterns compiled once at import time; raw strings avoid the invalid
# escape-sequence warning that '\d' triggers in a plain string literal.
_ID_RE = re.compile(r'<h2>(.*?)</h2>', re.S)
_CONTENT_RE = re.compile(r'<div class="content"><span>(.*?)</span></div>', re.S)
_LAUGH_RE = re.compile(r'<span class="stats-vote"><i class="number">(\d+)</i>', re.S)
_COMMENT_RE = re.compile(r'<i class="number">(\d+)</i>评论', re.S)


def re_scraper(url):
    """Fetch one listing page and parse its posts with regexes.

    The parsed dicts are built and immediately discarded: this script only
    benchmarks scraping throughput, it does not store results.

    Kept at module top level so it stays picklable for Pool.map.
    """
    res = requests.get(url, headers=headers)
    ids = _ID_RE.findall(res.text)
    contents = _CONTENT_RE.findall(res.text)
    laughs = _LAUGH_RE.findall(res.text)
    comments = _COMMENT_RE.findall(res.text)
    # 'id' renamed to item_id so the builtin id() is not shadowed.
    for item_id, content, laugh, comment in zip(ids, contents, laughs, comments):
        info = {
            'id': item_id,
            'content': content,
            'laught': laugh,  # key spelling kept from the original script
            'comment': comment,
        }
    # Parse only, no storage — performance test.
    return


def _timed_pool_run(urls, processes):
    """Scrape all urls with a pool of the given size; return elapsed seconds.

    The context manager closes and joins the pool, fixing the original's
    leak of one never-terminated Pool per measurement.
    """
    start = time.time()
    with Pool(processes=processes) as pool:
        pool.map(re_scraper, urls)
    return time.time() - start


if __name__ == '__main__':
    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(i)
            for i in range(1, 36)]

    # Serial baseline.
    start = time.time()
    for url in urls:
        re_scraper(url)
    print("串行爬虫耗费时间:", time.time() - start)

    # Pool sizes to benchmark, with the Chinese label used in the output.
    for processes, label in [(2, "两个"), (4, "四个"), (8, "八个"), (16, "16")]:
        elapsed = _timed_pool_run(urls, processes)
        print("{}进程爬虫耗费时间:".format(label), elapsed)
output:
串行爬虫耗费时间: 13.354042291641235
两个进程爬虫耗费时间: 7.521216869354248
四个进程爬虫耗费时间: 3.670652151107788
八个进程爬虫耗费时间: 2.4657177925109863
16个进程爬虫耗费时间: 1.6584398746490479