# coding=utf-8
import requests
from lxml import etree
import json
import threading
from queue import Queue
import time
class XiuShi(object):
    """Multi-threaded scraper for qiushibaike.com (糗事百科).

    Pipeline: url_list -> get_data -> parse_page -> save_data. Each stage
    hands work to the next through a Queue and runs on daemon threads; the
    main thread waits on the queues' join() to know when the drain is done.
    """

    def __init__(self):
        # Page-number template for the listing pages (formatted with 1..13).
        self.url = 'https://www.qiushibaike.com/8hr/page/{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        # utf-8 explicitly: save_data writes non-ASCII JSON (ensure_ascii=False),
        # so relying on the platform default encoding could corrupt the output.
        self.file = open('qiushi.json', 'w', encoding='utf-8')
        # Stage queues: URLs to fetch, raw response bodies, parsed record lists.
        self.url_list_queue = Queue()
        self.get_data_queue = Queue()
        self.parse_page_queue = Queue()

    def url_list(self):
        """Producer: push the 13 listing-page URLs onto the URL queue."""
        print('正在生成url列表')
        for page in range(1, 14):
            self.url_list_queue.put(self.url.format(page))

    def get_data(self):
        """Worker: fetch each URL; requeue it on HTTP 503, else forward the body."""
        while True:
            print('正在发送请求')
            url = self.url_list_queue.get()
            try:
                response = requests.get(url, headers=self.headers)
                if response.status_code == 503:
                    # Server is throttling us: put the URL back for a retry.
                    self.url_list_queue.put(url)
                else:
                    self.get_data_queue.put(response.content)
            finally:
                # Always balance the get() so url_list_queue.join() in run()
                # cannot hang if the request raises.
                self.url_list_queue.task_done()

    def parse_page(self):
        """Worker: parse one response body into a list of post dicts."""
        while True:
            print('正在解析数据')
            data = self.get_data_queue.get()
            html = etree.HTML(data)
            # Each post sits in an element whose id contains "qiushi_tag_".
            node_list = html.xpath('//*[contains(@id,"qiushi_tag_")]')
            qiushi_list = []
            for node in node_list:
                qiu_dict = dict()
                try:
                    qiu_dict['user'] = node.xpath('./div[1]/a[2]/h2/text()')[0].strip()
                    qiu_dict['age'] = node.xpath('./div[1]/div/text()')[0]
                    qiu_dict['url'] = 'https://www.qiushibaike.com' + node.xpath('./div[1]/a[1]/@href')[0]
                    qiu_dict['gender'] = node.xpath('./div[1]/div/@class')[0].split(' ')[-1]
                except IndexError:
                    # Anonymous posts lack the author block, so the xpath hits
                    # are empty and the [0] above raises IndexError.
                    qiu_dict['user'] = '匿名用户'
                    qiu_dict['age'] = None
                    qiu_dict['url'] = None
                    qiu_dict['gender'] = None
                qiu_dict['content'] = ''.join(node.xpath('./a/div/span/text()')).strip()
                qiushi_list.append(qiu_dict)
            # Hand the parsed records to the save stage.
            self.parse_page_queue.put(qiushi_list)
            self.get_data_queue.task_done()

    def save_data(self):
        """Worker: append each record to qiushi.json, one JSON object per line."""
        while True:
            print('正在保存数据')
            qiushi_list = self.parse_page_queue.get()
            for qiushi in qiushi_list:
                json_data = json.dumps(qiushi, ensure_ascii=False) + ',\n'
                print(json_data)
                self.file.write(json_data)
            self.parse_page_queue.task_done()

    def __del__(self):
        """Best-effort close of the output file."""
        try:
            self.file.close()
        except AttributeError:
            # __init__ may have failed before self.file was bound.
            pass

    def run(self):
        """Start all pipeline threads as daemons, then wait for the queues to drain."""
        threads = [threading.Thread(target=self.url_list)]
        # Three fetchers and three parsers so network waits and parsing overlap.
        threads.extend(threading.Thread(target=self.get_data) for _ in range(3))
        threads.extend(threading.Thread(target=self.parse_page) for _ in range(3))
        threads.append(threading.Thread(target=self.save_data))
        for t in threads:
            # Daemon threads die with the main thread once the queues are joined.
            # (t.setDaemon(True) is deprecated since Python 3.10.)
            t.daemon = True
            t.start()
        # Block until every queued item has been processed by each stage.
        for q in (self.url_list_queue, self.get_data_queue, self.parse_page_queue):
            q.join()
if __name__ == '__main__':
    # Build the scraper and run the threaded pipeline to completion.
    # (Removed the stray "复制代码" text pasted after qiu.run() — it was a
    # copy-button artifact from the blog page and a SyntaxError here.)
    qiu = XiuShi()
    qiu.run()
python多线程爬取糗事百科

最新推荐文章于 2025-09-08 09:22:18 发布
