1. Crawling Qiubai (qiushibaike.com) with a single thread
import requests
from lxml import etree


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    def get_url_list(self):
        # Build the URLs for pages 1-12.
        return [self.url_temp.format(i + 1) for i in range(12)]

    def parse_url(self, url):
        # headers must be passed as a keyword argument; passing the dict
        # positionally would hand it to requests.get() as query params.
        response = requests.get(url, headers=self.headers)
        return response.content

    def get_content_list(self, html_str):
        # Pull the title and link out of every recommended item on the page.
        html = etree.HTML(html_str)
        li_list = html.xpath("//li[contains(@class, 'item typs_')]")
        content_list = []
        for li in li_list:
            item = {}
            title = li.xpath(".//a[@class='recmd-content']/text()")
            href = li.xpath(".//a[@class='recmd-content']/@href")
            item["title"] = title[0].strip() if title else None
            item["href"] = "https://www.qiushibaike.com" + href[0] if href else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        for content in content_list:
            print(content)

    def run(self):
        # url list -> fetch -> extract -> save, one page at a time.
        for url in self.get_url_list():
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list)


if __name__ == "__main__":
    qiubai = QiubaiSpider()
    qiubai.run()
2. The multi-threaded design, explained
- Wrap each step of the crawl in its own function and run each step in its own thread.
- The functions communicate through queues, which decouples them from one another (a minimal sketch follows this list).
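A minimal sketch of that layout (toy stage names and data, not taken from the crawler): two queues chain three stages, and each stage knows nothing about the others except the queue it reads and the queue it writes.

import queue
import threading

raw_q = queue.Queue()        # stage 1 -> stage 2
result_q = queue.Queue()     # stage 2 -> stage 3

def produce():
    for i in range(5):
        raw_q.put(i)

def transform():
    while True:
        n = raw_q.get()
        result_q.put(n * n)  # hand the result to the next stage
        raw_q.task_done()

def consume():
    while True:
        print(result_q.get())
        result_q.task_done()

p = threading.Thread(target=produce)
p.start()
threading.Thread(target=transform, daemon=True).start()
threading.Thread(target=consume, daemon=True).start()
p.join()                     # make sure everything has been put
for q in (raw_q, result_q):
    q.join()                 # wait until both stages have drained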

3. Multi-threaded implementation of the Qiubai crawler
- Uses daemon threads.
- Surprisingly, you can call q.join() directly on a queue q; I did not expect that.
- put() adds 1 to the queue's unfinished-task count; the count only drops by 1 when get() is paired with task_done() (see the small demo after these notes).
"""有个问题我不能解决 有时候打印结果会少一页"""
import requests
from lxml import etree
import threading
import queue
import time
import os


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
        self.url_queue = queue.Queue()            # urls waiting to be fetched
        self.html_str_queue = queue.Queue()       # raw html waiting to be parsed
        self.content_list_queue = queue.Queue()   # parsed items waiting to be saved

    def get_url_list(self):
        # Producer: fill the url queue with pages 1-13.
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        # Fetch worker: url_queue -> html_str_queue.
        while True:
            url = self.url_queue.get()
            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_str_queue.put(response.content)
            self.url_queue.task_done()

    def get_content_list(self):
        # Parse worker: html_str_queue -> content_list_queue.
        while True:
            html_str = self.html_str_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class, 'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                title = li.xpath(".//a[@class='recmd-content']/text()")
                href = li.xpath(".//a[@class='recmd-content']/@href")
                item["title"] = title[0].strip() if title else None
                item["href"] = "https://www.qiushibaike.com" + href[0] if href else None
                content_list.append(item)
            self.content_list_queue.put(content_list)
            self.html_str_queue.task_done()

    def save_content_list(self):
        # Save worker: drain content_list_queue and print every item.
        while True:
            content_list = self.content_list_queue.get()
            for content in content_list:
                print(content)
            self.content_list_queue.task_done()

    def run(self):
        print(os.getppid())  # parent process id, for comparison with the process version
        thread_list = [threading.Thread(target=self.get_url_list)]
        for i in range(10):  # ten concurrent fetchers
            thread_list.append(threading.Thread(target=self.parse_url))
        thread_list.append(threading.Thread(target=self.get_content_list))
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            t.daemon = True  # setDaemon() is deprecated; daemons die with the main thread
            t.start()
        for q in [self.url_queue, self.html_str_queue, self.content_list_queue]:
            q.join()  # block until every put() has a matching task_done()


if __name__ == "__main__":
    print(os.getppid())
    qiubai = QiubaiSpider()
    start_time = time.time()
    qiubai.run()
    print("total time :", time.time() - start_time)
4. Multi-process implementation of the Qiubai crawler
- Uses JoinableQueue; compared with a plain multiprocessing.Queue, it additionally provides join() and task_done() (a standalone demo follows these notes).
- Nothing gets printed and I do not know why (see the note after the program for a likely cause).
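A standalone JoinableQueue demo (made-up items, not part of the crawler): the parent fills the queue, a daemon child drains it, and join() in the parent blocks until every item has been marked done.

import multiprocessing
from multiprocessing import JoinableQueue

def worker(q):
    while True:
        item = q.get()
        print("got", item)
        q.task_done()      # required, or q.join() below never returns

if __name__ == "__main__":
    q = JoinableQueue()
    for i in range(3):
        q.put(i)           # fill before starting, so join() cannot return early
    p = multiprocessing.Process(target=worker, args=(q,))
    p.daemon = True        # the child dies when the parent exits
    p.start()
    q.join()               # blocks until all three items are task_done()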
import requests
from lxml import etree
import multiprocessing
from multiprocessing import JoinableQueue


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
        self.url_queue = JoinableQueue()            # urls waiting to be fetched
        self.html_str_queue = JoinableQueue()       # raw html waiting to be parsed
        self.content_list_queue = JoinableQueue()   # parsed items waiting to be saved

    def get_url_list(self):
        # Producer: fill the url queue with pages 1-13.
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        # Fetch worker: url_queue -> html_str_queue.
        while True:
            url = self.url_queue.get()
            response = requests.get(url, headers=self.headers)
            self.html_str_queue.put(response.content)
            self.url_queue.task_done()

    def get_content_list(self):
        # Parse worker: html_str_queue -> content_list_queue.
        while True:
            html_str = self.html_str_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class, 'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                title = li.xpath(".//a[@class='recmd-content']/text()")
                href = li.xpath(".//a[@class='recmd-content']/@href")
                item["title"] = title[0].strip() if title else None
                item["href"] = "https://www.qiushibaike.com" + href[0] if href else None
                content_list.append(item)
            self.content_list_queue.put(content_list)
            self.html_str_queue.task_done()

    def save_content_list(self):
        # Save worker: drain content_list_queue and print every item.
        while True:
            content_list = self.content_list_queue.get()
            for content in content_list:
                print(content)
            self.content_list_queue.task_done()

    def run(self):
        process_list = [
            multiprocessing.Process(target=self.get_url_list),
            multiprocessing.Process(target=self.parse_url),
            multiprocessing.Process(target=self.get_content_list),
            multiprocessing.Process(target=self.save_content_list),
        ]
        for p in process_list:
            p.daemon = True  # children are killed as soon as the parent exits
            p.start()
        for q in [self.url_queue, self.html_str_queue, self.content_list_queue]:
            q.join()  # block until every put() has a matching task_done()


if __name__ == "__main__":
    qiubai = QiubaiSpider()
    qiubai.run()
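A plausible cause of the silence (an educated guess, not verified): processes start much more slowly than threads, so the parent can reach its three q.join() calls while every queue is still empty; all three return immediately, the parent exits, and the daemon children are killed before they print anything. The same fix as in the threaded version applies: fill url_queue in the parent before starting any worker. A sketch of run() under that assumption:

def run(self):
    self.get_url_list()  # fill the queue in the parent, before any child starts
    process_list = [
        multiprocessing.Process(target=self.parse_url),
        multiprocessing.Process(target=self.get_content_list),
        multiprocessing.Process(target=self.save_content_list),
    ]
    for p in process_list:
        p.daemon = True
        p.start()
    for q in [self.url_queue, self.html_str_queue, self.content_list_queue]:
        q.join()  # url_queue starts non-empty, so the joins cannot all fall through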