This walkthrough uses crawling jokes from QiuShiBaiKe (糗事百科) as the example; the page URL is: http://www.qiushibaike.com/8hr/page/1 (note that the code below actually targets the text channel, https://www.qiushibaike.com/text/page/{}/).
For the theoretical background, see: https://blog.youkuaiyun.com/apollo_miracle/article/details/84881598
- Plain single-threaded crawler implementation:
import time
from pprint import pprint

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"

    def parse_url(self, url):
        """Send the request and return the response body."""
        resp = requests.get(url, headers=self.headers)
        return resp.content

    def get_content_list(self, content):
        """Extract the data."""
        # Parse the HTML into an Element object
        html = etree.HTML(content)
        # Each joke lives in its own <article> node
        div_list = html.xpath("//article")
        content_list = []
        for div in div_list:
            term = {}
            term["author"] = div.xpath(".//a[@class='username']/text()")[0]
            term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
            term["content"] = div.xpath(".//a[@class='text']/text()")[0]
            content_list.append(term)
        return content_list

    def save_content(self, content_list):
        """Save the data."""
        pprint(content_list)

    def run(self):  # main business logic
        # 1. Build url_list
        url_list = [self.url_temp.format(i) for i in range(1, 14)]
        # 2. Send a request for each URL and get the response
        for url in url_list:
            print(url)
            content = self.parse_url(url)
            # 3. Extract the data
            content_list = self.get_content_list(content)
            # 4. Save the data
            self.save_content(content_list)


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
- The same crawler rewritten with multiple threads:
import threading
import time
from queue import Queue

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.url_queue = Queue()      # queue of URLs to fetch
        self.content_queue = Queue()  # queue of response bodies
        self.save_queue = Queue()     # queue of extracted items waiting to be saved

    def get_url_list(self):
        """Build url_list."""
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        """Send requests and collect responses."""
        while True:
            url = self.url_queue.get()
            resp = requests.get(url, headers=self.headers)
            if resp.status_code != 200:
                self.url_queue.put(url)  # re-queue a failed URL for retry
            else:
                self.content_queue.put(resp.content)
            self.url_queue.task_done()  # decrement the queue's unfinished-task count

    def get_content_list(self):
        """Extract the data."""
        while True:
            content = self.content_queue.get()
            # Parse the HTML into an Element object
            html = etree.HTML(content)
            # Each joke lives in its own <article> node
            div_list = html.xpath("//article")
            for div in div_list:
                term = {}
                term["author"] = div.xpath(".//a[@class='username']/text()")[0]
                term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
                term["content"] = div.xpath(".//a[@class='text']/text()")[0]
                self.save_queue.put(term)
            self.content_queue.task_done()

    def save_content(self):
        """Save the data."""
        while True:
            content = self.save_queue.get()
            with open("qiubai.txt", "a", encoding="utf-8") as f:
                f.write("*" * 50 + "\n" + content["author"] + ":\n")
                f.write("URL: " + content["content_url"] + "\n")
                f.write(content["content"] + "\n")
            self.save_queue.task_done()

    def run(self):  # main business logic
        # list of worker threads
        thread_list = []
        # 1. Build url_list
        th_url = threading.Thread(target=self.get_url_list)
        thread_list.append(th_url)
        # 2. Send requests and collect responses
        for _ in range(2):
            th_parse = threading.Thread(target=self.parse_url)
            thread_list.append(th_parse)
        # 3. Extract the data
        th_content = threading.Thread(target=self.get_content_list)
        thread_list.append(th_content)
        # 4. Save the data
        for _ in range(3):
            th_save = threading.Thread(target=self.save_content)
            thread_list.append(th_save)
        for th in thread_list:
            th.setDaemon(True)  # daemon threads are killed when the main thread exits
            th.start()
        for q in [self.url_queue, self.content_queue, self.save_queue]:
            q.join()  # block the main thread until the queue's unfinished-task count reaches 0


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
- The same crawler rewritten with multiple processes:
import time
# from queue import Queue
from multiprocessing import JoinableQueue as Queue  # note: different from the threading version
from multiprocessing import Process

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.url_queue = Queue()      # queue of URLs to fetch
        self.content_queue = Queue()  # queue of response bodies
        self.save_queue = Queue()     # queue of extracted items waiting to be saved

    def get_url_list(self):
        """Build url_list."""
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        """Send requests and collect responses."""
        while True:
            url = self.url_queue.get()
            resp = requests.get(url, headers=self.headers)
            if resp.status_code != 200:
                self.url_queue.put(url)  # re-queue a failed URL for retry
            else:
                self.content_queue.put(resp.content)
            self.url_queue.task_done()  # decrement the queue's unfinished-task count

    def get_content_list(self):
        """Extract the data."""
        while True:
            content = self.content_queue.get()
            # Parse the HTML into an Element object
            html = etree.HTML(content)
            # Each joke lives in its own <article> node
            div_list = html.xpath("//article")
            for div in div_list:
                term = {}
                term["author"] = div.xpath(".//a[@class='username']/text()")[0]
                term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
                term["content"] = div.xpath(".//a[@class='text']/text()")[0]
                self.save_queue.put(term)
            self.content_queue.task_done()

    def save_content(self):
        """Save the data."""
        while True:
            content = self.save_queue.get()
            with open("qiubai.txt", "a", encoding="utf-8") as f:
                f.write("*" * 50 + "\n" + content["author"] + ":\n")
                f.write("URL: " + content["content_url"] + "\n")
                f.write(content["content"] + "\n")
            self.save_queue.task_done()

    def run(self):  # main business logic
        # list of worker processes
        process_list = []
        # 1. Build url_list
        pr_url = Process(target=self.get_url_list)
        process_list.append(pr_url)
        # 2. Send requests and collect responses
        for _ in range(2):
            pr_parse = Process(target=self.parse_url)
            process_list.append(pr_parse)
        # 3. Extract the data
        pr_content = Process(target=self.get_content_list)
        process_list.append(pr_content)
        # 4. Save the data
        for _ in range(3):
            pr_save = Process(target=self.save_content)
            process_list.append(pr_save)
        for pr in process_list:
            pr.daemon = True  # daemon processes are killed when the main process exits
            pr.start()
        # time.sleep(1)  # guard against the main process exiting before children have started
        for q in [self.url_queue, self.content_queue, self.save_queue]:
            q.join()  # block the main process until the queue's unfinished-task count reaches 0


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
Differences between the multithreaded and multiprocess code:
- The Queue import differs:
  threads: from queue import Queue
  processes: from multiprocessing import JoinableQueue as Queue
- The syntax for marking a child thread/process as a daemon differs:
  threads: th.setDaemon(True) (in Python 3 the attribute form th.daemon = True works for threads as well)
  processes: pr.daemon = True
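Both queue types share the same unfinished-task bookkeeping, which is what lets q.join() act as the shutdown condition in both versions. A minimal, self-contained sketch of that mechanism (the names here are illustrative, not from the crawler above):

from queue import Queue  # multiprocessing.JoinableQueue behaves the same way across processes

q = Queue()
for i in range(3):
    q.put(i)               # each put() increments the unfinished-task count

while not q.empty():
    item = q.get()
    print("handled", item)
    q.task_done()          # each task_done() decrements the count

q.join()                   # returns as soon as the count is back to 0
print("all tasks accounted for")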
- A faster crawler using a thread pool:
import time
from multiprocessing.dummy import Pool
from pprint import pprint
from queue import Queue

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.queue = Queue()     # URL queue
        self.pool = Pool()       # thread pool
        self.is_running = True   # workers keep rescheduling themselves while True
        self.request_num = 0     # number of requests issued
        self.response_num = 0    # number of responses handled; stop once it equals request_num

    def get_url_list(self):
        """Build url_list."""
        for i in range(1, 14):
            self.queue.put(self.url_temp.format(i))
            self.request_num += 1  # one more request to account for

    def parse_url(self, url):
        """Send the request and return the response body."""
        resp = requests.get(url, headers=self.headers)
        return resp.content

    def get_content_list(self, content):
        """Extract the data."""
        # Parse the HTML into an Element object
        html = etree.HTML(content)
        # Each joke lives in its own <article> node
        div_list = html.xpath("//article")
        content_list = []
        for div in div_list:
            term = {}
            term["author"] = div.xpath(".//a[@class='username']/text()")[0]
            term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
            term["content"] = div.xpath(".//a[@class='text']/text()")[0]
            content_list.append(term)
        return content_list

    def save_content(self, content_list):
        """Save the data."""
        pprint(content_list)

    def _execute_request_content_save(self):
        """Fetch, extract, and save one URL."""
        # 1. Take a URL
        url = self.queue.get()
        # 2. Get the response
        content = self.parse_url(url)
        # 3. Extract the data
        content_list = self.get_content_list(content)
        # 4. Save the data
        self.save_content(content_list)
        self.response_num += 1  # one more response handled

    def _callback(self, temp):  # temp receives the task's return value and is unused here, but apply_async requires the parameter
        if self.is_running:
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)

    def run(self):  # main business logic
        # 1. Build url_list
        self.get_url_list()
        # 2. Seed the pool; each finished task re-submits itself via the callback
        for i in range(3):  # concurrency of 3
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)
        while True:
            time.sleep(0.0001)
            if self.response_num >= self.request_num:
                self.is_running = False
                break


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
- The crawler using a coroutine (gevent) pool:
# coding=utf-8
import gevent.monkey
gevent.monkey.patch_all()  # patch blocking calls before requests is imported

# from multiprocessing.dummy import Pool
from gevent.pool import Pool
import requests
from lxml import etree
import time
from pprint import pprint
from queue import Queue


class QiuBai(object):
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.queue = Queue()     # URL queue
        self.pool = Pool()       # coroutine pool
        self.is_running = True   # workers keep rescheduling themselves while True
        self.request_num = 0     # number of requests issued
        self.response_num = 0    # number of responses handled; stop once it equals request_num

    def get_url_list(self):
        """Build url_list."""
        for i in range(1, 14):
            self.queue.put(self.url_temp.format(i))
            self.request_num += 1  # one more request to account for

    def parse_url(self, url):
        """Send the request and return the response body."""
        resp = requests.get(url, headers=self.headers)
        return resp.content

    def get_content_list(self, content):
        """Extract the data."""
        # Parse the HTML into an Element object
        html = etree.HTML(content)
        # Each joke lives in its own <article> node
        div_list = html.xpath("//article")
        content_list = []
        for div in div_list:
            term = {}
            term["author"] = div.xpath(".//a[@class='username']/text()")[0]
            term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
            term["content"] = div.xpath(".//a[@class='text']/text()")[0]
            content_list.append(term)
        return content_list

    def save_content(self, content_list):
        """Save the data."""
        pprint(content_list)

    def _execute_request_content_save(self):
        """Fetch, extract, and save one URL."""
        # 1. Take a URL
        url = self.queue.get()
        # 2. Get the response
        content = self.parse_url(url)
        # 3. Extract the data
        content_list = self.get_content_list(content)
        # 4. Save the data
        self.save_content(content_list)
        self.response_num += 1  # one more response handled

    def _callback(self, temp):  # temp receives the task's return value and is unused here, but apply_async requires the parameter
        if self.is_running:
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)

    def run(self):  # main business logic
        # 1. Build url_list
        self.get_url_list()
        # 2. Seed the pool; each finished task re-submits itself via the callback
        for i in range(3):  # concurrency of 3
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)
        while True:
            time.sleep(0.0001)
            if self.response_num >= self.request_num:
                self.is_running = False
                break


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
Differences between the thread-pool and coroutine-pool code: apart from gevent's monkey patching (and the pprint import, restored above), only the imports differ; everything else is identical.
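To make that swap concrete, here is a minimal sketch (the fetch helper and the three-page range are mine, for illustration) showing that either Pool class can drive the same function, since both expose the same map/apply_async interface:

# Thread-pool variant:
# from multiprocessing.dummy import Pool
# Coroutine-pool variant (monkey patching must come first):
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool

import requests

def fetch(url):
    """Return the HTTP status code of one page."""
    return requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).status_code

urls = ["https://www.qiushibaike.com/text/page/{}/".format(i) for i in range(1, 4)]
pool = Pool(3)                # at most 3 concurrent greenlets (or threads)
print(pool.map(fetch, urls))  # identical call for either Pool class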