Scraping Qiushibaike with Python Multithreading

A Hands-On Qiushibaike Scraper
This post walks through a Qiushibaike scraper written in Python. Worker threads coordinate through queues to fetch pages with requests, parse them with lxml, and write every scraped post to a file as one JSON object per line.
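Before the full listing, here is a minimal, self-contained sketch of the producer-consumer pattern the scraper is built on (the names job_queue and worker are illustrative only, not part of the project): worker threads pull items from a queue.Queue, call task_done() after handling each item, and the main thread blocks on join() until every item has been processed.

import threading
from queue import Queue

job_queue = Queue()

def worker():
    while True:
        job = job_queue.get()      # blocks until an item is available
        print('processed', job)    # stand-in for real work
        job_queue.task_done()      # tell the queue this item is finished

for n in range(10):
    job_queue.put(n)

for _ in range(3):
    t = threading.Thread(target=worker)
    t.daemon = True                # dies together with the main thread
    t.start()

job_queue.join()                   # returns once all 10 items are done

The full scraper below applies exactly this pattern, chained across three queues.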
# coding=utf-8
import requests
from lxml import etree
import json
import threading
from queue import Queue


class XiuShi(object):
    """抓取糗事百科"""
    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}'
        self.headers = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        self.file = open('qiushi.json', 'w', encoding='utf-8')  # UTF-8 so non-ASCII JSON writes safely on any platform
        self.url_list_queue = Queue()
        self.get_data_queue = Queue()
        self.parse_page_queue = Queue()
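        # pipeline: url_list_queue (page URLs) -> get_data_queue (raw HTML)
        #           -> parse_page_queue (parsed post dicts) -> qiushi.json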

    def url_list(self):
        print('Generating URL list')
        for i in range(1, 14):
            url = self.url.format(i)
            # put each page URL on the queue
            self.url_list_queue.put(url)

    def get_data(self):
        while True:
            print('Sending request')
            # take a URL from the queue (blocks until one is available)
            url = self.url_list_queue.get()
            response = requests.get(url, headers=self.headers)
            # on a 503 response, requeue the URL so a worker retries it later
            if response.status_code == 503:
                self.url_list_queue.put(url)
            else:
                resp_data = response.content
                # hand the raw response body to the parser threads
                self.get_data_queue.put(resp_data)
            # mark this queue item as processed so queue.join() can unblock
            self.url_list_queue.task_done()

    def parse_page(self):
        while True:
            print('Parsing data')
            # take a raw response body from the queue
            data = self.get_data_queue.get()
            # build an lxml element tree for XPath queries
            html = etree.HTML(data)
            # select the container node of every post on the page
            node_list = html.xpath('//*[contains(@id,"qiushi_tag_")]')
            qiushi_list = []
            for node in node_list:
                qiu_dict = dict()
                try:
                    qiu_dict['user'] = node.xpath('./div[1]/a[2]/h2/text()')[0].strip()
                    qiu_dict['age'] = node.xpath('./div[1]/div/text()')[0]
                    qiu_dict['url'] = 'https://www.qiushibaike.com' + node.xpath('./div[1]/a[1]/@href')[0]
                    qiu_dict['gender'] = node.xpath('./div[1]/div/@class')[0].split(' ')[-1]
                except IndexError:
                    # anonymous posts have no author block, so the XPaths above match nothing
                    qiu_dict['user'] = 'Anonymous user'
                    qiu_dict['age'] = None
                    qiu_dict['url'] = None
                    qiu_dict['gender'] = None
                qiu_dict['content'] = ''.join(node.xpath('./a/div/span/text()')).strip()
                qiushi_list.append(qiu_dict)
            # put the parsed posts on the queue for the saver thread
            self.parse_page_queue.put(qiushi_list)
            # mark this queue item as processed
            self.get_data_queue.task_done()

    def save_data(self):
        while True:
            print('Saving data')
            # take a batch of parsed posts from the queue
            qiushi_list = self.parse_page_queue.get()
            for qiushi in qiushi_list:
                # serialize each post as one JSON object per line (JSON Lines)
                json_data = json.dumps(qiushi, ensure_ascii=False) + '\n'
                print(json_data)
                self.file.write(json_data)
            # mark this queue item as processed
            self.parse_page_queue.task_done()

    def __del__(self):
        """关闭文件"""
        self.file.close()

    def run(self):
        threading_list = []
        # thread that generates the page URLs
        urls = threading.Thread(target=self.url_list)
        threading_list.append(urls)

        # three threads that fetch pages
        for _ in range(3):
            data = threading.Thread(target=self.get_data)
            threading_list.append(data)
        # three threads that parse responses
        for _ in range(3):
            qiushi_list = threading.Thread(target=self.parse_page)
            threading_list.append(qiushi_list)
        # one thread that writes results to disk
        save = threading.Thread(target=self.save_data)
        threading_list.append(save)

        for t in threading_list:
            # daemon threads are killed automatically when the main thread exits
            t.daemon = True
            # start the thread
            t.start()

        # the worker loops never return, so instead of joining the threads, the
        # main thread waits until every queue has been fully drained; once it
        # exits, the daemon workers die with it
        for q in (self.url_list_queue, self.get_data_queue, self.parse_page_queue):
            q.join()

if __name__ == '__main__':
    qiu = XiuShi()
    qiu.run()
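Because save_data writes one JSON object per line, qiushi.json is a JSON Lines file rather than a single JSON document. A short snippet for loading the results back, for example:

import json

with open('qiushi.json', encoding='utf-8') as f:
    posts = [json.loads(line) for line in f]
print(len(posts), 'posts loaded')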

