使用线程 - 进程 - 线程池 - 协程池实现高效爬虫

本文详细介绍使用Python爬取糗事百科段子的过程,包括单线程、多线程、多进程及线程池的实现方式,对比不同并发模式下的性能差异。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

本次以爬取糗事百科段子为例,页面的URL是:http://www.qiushibaike.com/8hr/page/1

理论基础详见:https://blog.youkuaiyun.com/apollo_miracle/article/details/84881598

  • 普通爬虫代码实现:
import time
from pprint import pprint

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.hesders = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"

    def parse_url(self, url):
        """发送请求,获取响应"""
        resp = requests.get(url, headers=self.hesders)
        return resp.content

    def get_content_list(self, content):
        """提取数据"""
        # 转化为elements对象
        html = etree.HTML(content)
        # div分组
        div_list = html.xpath("*//article")
        content_list = []
        for div in div_list:
            term = {}
            term["author"] = div.xpath(".//a[@class='username']/text()")[0]
            term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
            term["content"] = div.xpath(".//a[@class='text']/text()")[0]
            content_list.append(term)
        return content_list

    def save_content(self, content_list):
        """保存数据"""
        pprint(content_list)

    def run(self):  # 实现主要业务逻辑
        # 1.拼接url_list
        url_list = [self.url_temp.format(i) for i in range(1, 14)]
        # 2.遍历发送请求,获取响应
        for url in url_list:
            print(url)
            content = self.parse_url(url)
            # 3.提取数据
            content_list = self.get_content_list(content)
            # 4.保存数据
            self.save_content(content_list)


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
  • 上述代码改写成多线程方式实现
import threading
import time
from queue import Queue

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.hesders = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.url_queue = Queue()  # 存放url队列
        self.content_queue = Queue()  # 存放响应内容队列
        self.save_queue = Queue()  # 存放提取(待保存)内容队列

    def get_url_list(self):
        """拼接url_list"""
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        """发送请求,获取响应"""
        while True:
            url = self.url_queue.get()
            resp = requests.get(url, headers=self.hesders)
            if resp.status_code != 200:
                self.url_queue.put(url)
            else:
                self.content_queue.put(resp.content)
            self.url_queue.task_done()  # 让队列的计数-1

    def get_content_list(self):
        """提取数据"""
        while True:
            content = self.content_queue.get()
            # 转化为elements对象
            html = etree.HTML(content)
            # div分组
            div_list = html.xpath("*//article")
            for div in div_list:
                term = {}
                term["author"] = div.xpath(".//a[@class='username']/text()")[0]
                term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
                term["content"] = div.xpath(".//a[@class='text']/text()")[0]
                self.save_queue.put(term)
            self.content_queue.task_done()

    def save_content(self):
        """保存数据"""
        while True:
            content = self.save_queue.get()
            with open("qiubai.txt", "a", encoding="utf-8") as f:
                f.write("*" * 50 + "\n" + content["author"] + ":\n")
                f.write("网址:" + content["content_url"])
                f.write(content["content"])
            self.save_queue.task_done()

    def run(self):  # 实现主要业务逻辑
        # 线程列表
        thread_list = []
        # 1.拼接url_list
        th_url = threading.Thread(target=self.get_url_list)
        thread_list.append(th_url)
        # 2.遍历发送请求,获取响应
        for _ in range(2):
            th_parse = threading.Thread(target=self.parse_url)
            thread_list.append(th_parse)
        # 3.提取数据
        th_content = threading.Thread(target=self.get_content_list)
        thread_list.append(th_content)
        # 4.保存数据
        for _ in range(3):
            th_save = threading.Thread(target=self.save_content)
            thread_list.append(th_save)

        for th in thread_list:
            th.setDaemon(True)  # 把子线程设置为守护线程
            th.start()

        for q in [self.url_queue, self.content_queue, self.save_queue]:
            q.join()  # 让主线程阻塞,等待队列计数为0


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
  • 将上述代码改写成多进程方式实现
import time
# from queue import Queue
from multiprocessing import JoinableQueue as Queue  # 与线程不一样
from multiprocessing import Process

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.hesders = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.url_queue = Queue()  # 存放url队列
        self.content_queue = Queue()  # 存放响应内容队列
        self.save_queue = Queue()  # 存放提取(待保存)内容队列

    def get_url_list(self):
        """拼接url_list"""
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        """发送请求,获取响应"""
        while True:
            url = self.url_queue.get()
            resp = requests.get(url, headers=self.hesders)
            if resp.status_code != 200:
                self.url_queue.put(url)
            else:
                self.content_queue.put(resp.content)
            self.url_queue.task_done()  # 让队列的计数-1

    def get_content_list(self):
        """提取数据"""
        while True:
            content = self.content_queue.get()
            # 转化为elements对象
            html = etree.HTML(content)
            # div分组
            div_list = html.xpath("*//article")
            for div in div_list:
                term = {}
                term["author"] = div.xpath(".//a[@class='username']/text()")[0]
                term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
                term["content"] = div.xpath(".//a[@class='text']/text()")[0]
                self.save_queue.put(term)
            self.content_queue.task_done()

    def save_content(self):
        """保存数据"""
        while True:
            content = self.save_queue.get()
            with open("qiubai.txt", "a", encoding="utf-8") as f:
                f.write("*" * 50 + "\n" + content["author"] + ":\n")
                f.write("网址:" + content["content_url"])
                f.write(content["content"])
            self.save_queue.task_done()

    def run(self):  # 实现主要业务逻辑
        # 进程列表
        process_list = []
        # 1.拼接url_list
        pr_url = Process(target=self.get_url_list)
        process_list.append(pr_url)
        # 2.遍历发送请求,获取响应
        for _ in range(2):
            pr_parse = Process(target=self.parse_url)
            process_list.append(pr_parse)
        # 3.提取数据
        pr_content = Process(target=self.get_content_list)
        process_list.append(pr_content)
        # 4.保存数据
        for _ in range(3):
            pr_save = Process(target=self.save_content)
            process_list.append(pr_save)

        for pr in process_list:
            pr.daemon = True  # 把子进程设置为守护线程
            pr.start()

        # time.sleep(1)  # 防止子进程还未开始,主进程就结束

        for p in [self.url_queue, self.content_queue, self.save_queue]:
            p.join()  # 让主进程阻塞,等待队列计数为0


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)

多线程与多进程的代码区别:

  • 队列Queue导包不同:

线程:from queue import Queue

进程:from multiprocessing import JoinableQueue as Queue

  • 把子线(进)程设置为守护线程语法不同:

线程:th.setDaemon(True)

进程:pr.daemon = True

  • 使用线程池实现更快的爬虫
import time
from multiprocessing.dummy import Pool
from pprint import pprint
from queue import Queue

import requests
from lxml import etree


class QiuBai(object):
    def __init__(self):
        self.hesders = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.queue = Queue()  # 实例化队列
        self.pool = Pool()  # 实例化线程池
        self.is_running = True  # 默认线程正在执行
        self.request_num = 0  # 发出的请求数
        self.response_num = 0  # 获得的回应数  当回应数等于请求数时结束线程

    def get_url_list(self):
        """拼接url_list"""
        for i in range(1, 14):
            self.queue.put(self.url_temp.format(i))
            self.request_num += 1  # 请求数 +1

    def parse_url(self, url):
        """发送请求,获取响应"""
        resp = requests.get(url, headers=self.hesders)
        return resp.content

    def get_content_list(self, content):
        """提取数据"""
        # 转化为elements对象
        html = etree.HTML(content)
        # div分组
        div_list = html.xpath("*//article")
        content_list = []
        for div in div_list:
            term = {}
            term["author"] = div.xpath(".//a[@class='username']/text()")[0]
            term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
            term["content"] = div.xpath(".//a[@class='text']/text()")[0]
            content_list.append(term)
        return content_list

    def save_content(self, content_list):
        """保存数据"""
        pprint(content_list)

    def _execete_request_content_save(self):
        """进行一次url地址的请求,提取,保存"""
        # 1.发送请求
        url = self.queue.get()
        # 2.获取响应
        content = self.parse_url(url)
        # 3.提取数据
        content_list = self.get_content_list(content)
        # 4.保存数据
        self.save_content(content_list)
        self.response_num += 1  # 响应数 +1

    def _callback(self, temp):  # 参数 temp 在此处没有,但不能去掉
        if self.is_running:
            self.pool.apply_async(self._execete_request_content_save, callback=self._callback)

    def run(self):  # 实现主要业务逻辑
        # 1.拼接url_list
        self.get_url_list()
        for i in range(3):  # 设置并发数为3
            self.pool.apply_async(self._execete_request_content_save, callback=self._callback)

        while True:
            time.sleep(0.0001)
            if self.response_num >= self.request_num:
                self.is_running = False
                break


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)
  • 使用协程池实现爬虫
# coding=utf-8
import gevent.monkey
gevent.monkey.patch_all()

# from multiprocessing.dummy import Pool
from gevent.pool import Pool
import requests
from lxml import etree
import time
from queue import Queue


class QiuBai(object):
    def __init__(self):
        self.hesders = {
            "User-Agent": "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13"
        }
        self.url_temp = "https://www.qiushibaike.com/text/page/{}/"
        self.content_url_temp = "https://www.qiushibaike.com"
        self.queue = Queue()  # 实例化队列
        self.pool = Pool()  # 实例化线程池
        self.is_running = True  # 默认线程正在执行
        self.request_num = 0  # 发出的请求数
        self.response_num = 0  # 获得的回应数  当回应数等于请求数时结束线程

    def get_url_list(self):
        """拼接url_list"""
        for i in range(1, 14):
            self.queue.put(self.url_temp.format(i))
            self.request_num += 1  # 请求数 +1

    def parse_url(self, url):
        """发送请求,获取响应"""
        resp = requests.get(url, headers=self.hesders)
        return resp.content

    def get_content_list(self, content):
        """提取数据"""
        # 转化为elements对象
        html = etree.HTML(content)
        # div分组
        div_list = html.xpath("*//article")
        content_list = []
        for div in div_list:
            term = {}
            term["author"] = div.xpath(".//a[@class='username']/text()")[0]
            term["content_url"] = self.content_url_temp + div.xpath(".//a[@class='text']/@href")[0]
            term["content"] = div.xpath(".//a[@class='text']/text()")[0]
            content_list.append(term)
        return content_list

    def save_content(self, content_list):
        """保存数据"""
        pprint(content_list)

    def _execete_request_content_save(self):
        """进行一次url地址的请求,提取,保存"""
        # 1.发送请求
        url = self.queue.get()
        # 2.获取响应
        content = self.parse_url(url)
        # 3.提取数据
        content_list = self.get_content_list(content)
        # 4.保存数据
        self.save_content(content_list)
        self.response_num += 1  # 响应数 +1

    def _callback(self, temp):  # 参数 temp 在此处没有,但不能去掉
        if self.is_running:
            self.pool.apply_async(self._execete_request_content_save, callback=self._callback)

    def run(self):  # 实现主要业务逻辑
        # 1.拼接url_list
        self.get_url_list()
        for i in range(3):  # 设置并发数为3
            self.pool.apply_async(self._execete_request_content_save, callback=self._callback)

        while True:
            time.sleep(0.0001)
            if self.response_num >= self.request_num:
                self.is_running = False
                break


if __name__ == '__main__':
    t1 = time.time()
    qb = QiuBai()
    qb.run()
    print("total cost:", time.time() - t1)

线程池与协程池的代码区别:只有导入的包不同,其他全部一样

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值