A Python Producer-Consumer Case Study -- Multithreaded Scraping of Zhaopin (智联招聘)

This post presents a Python crawler for the Zhaopin (智联招聘) job site: a factory class built on the singleton pattern shares a queue between threads, a producer class extracts the URLs of job-detail pages, and consumer classes scrape the actual data, demonstrating multithreading together with the Selenium and Requests libraries.

Core idea:
  • Create a factory class that uses the singleton pattern to share queue data between threads (a minimal sketch follows this list)
  • Create a producer class that extracts Zhaopin detail-page URLs and puts them into the factory's queue
  • Create a consumer class that takes URLs out of the factory's queue and scrapes the data
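
The singleton is what makes the sharing work: every Factory() call returns the same object, so producer and consumer threads transparently operate on one queue.Queue. A minimal standalone sketch of the idea, condensed from the full script below:

    from queue import Queue


    class Factory(object):
        """Singleton holding the queue shared by all threads."""
        __instance = None

        def __new__(cls, *args, **kwargs):
            # Construct the instance (and its queue) exactly once
            if cls.__instance is None:
                cls.__instance = super().__new__(cls)
                cls.__instance.queue = Queue()
            return cls.__instance


    # Every call returns the same instance, hence the same queue:
    assert Factory() is Factory()
    Factory().queue.put('hello')
    print(Factory().queue.get())  # -> hello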
Code details:
  • Usage: python zhilian_spider.py <city> <keyword> <max_page>
  • Example: python zhilian_spider.py 杭州 python 10
  • Full code:
    import sys
    import re
    import time
    import requests
    from queue import Queue
    from selenium import webdriver
    import threading
    from name_to_number import NAME_TO_NUMBER
    from headers import UserAgent
    
    
    class Factory(object):
        """
        工厂类, 利用单例模式实现数据共享
        """
        __instance = None  # 记录是否执行过构造方法
        __first_init = False  # 记录是否执行过初始化方法
    
        def __new__(cls, *args, **kwargs):
            # Singleton: run the constructor logic only once
            if not Factory.__instance:
                Factory.__instance = object.__new__(cls)
            return Factory.__instance
    
        def __init__(self):
            # Singleton: run the initialisation logic only once
            if not self.__first_init:
                self.queue = Queue()
                Factory.__first_init = True
    
        def get_queue(self):
            return self.queue
    
    
    class Producer(threading.Thread):
        """生产者"""
    
        def __init__(self, city='北京', kw='python', max_page=20):
            super().__init__()
            self.city = city
            self.kw = kw
            self.max_page = max_page
            self.production_queue = Factory().get_queue()
            self.start_url = "https://sou.zhaopin.com/?p={}&jl={}&kw={}&kt=3"
    
        def make_url(self, city_number):
            # Pages are numbered 1..max_page, so the range end must be max_page + 1
            return [self.start_url.format(i, city_number, self.kw) for i in range(1, self.max_page + 1)]
    
        def get_city_number(self):
            city_number = NAME_TO_NUMBER.get(self.city)
            if not city_number:
                print('Invalid city parameter: %s not found, falling back to the default 北京' % self.city)
                city_number = 530  # Zhaopin's numeric code for 北京
            return city_number
    
        def get_detail_url(self, content_list):
            # A results page holds at most 90 listings
            for i in range(1, 91):
                try:
                    url_str = content_list.find_element_by_xpath('./div[{}]//a'.format(i)).get_attribute('href')
                except Exception:
                    # Fewer than 90 listings on this page; skip the missing slots
                    continue
                self.production_queue.put(url_str)
    
        def get_url_list(self, url_str):
            # Open the listing page in a fresh browser instance
            bs = webdriver.Chrome()
            bs.get(url_str)
            time.sleep(3)
            # Dismiss the risk-warning dialog if it appears
            try:
                button = bs.find_element_by_xpath('//div[@class="risk-warning__content"]/button')
                button.click()
                time.sleep(3)
            except Exception as e:
                print(e)
            content_list = bs.find_element_by_id('listContent')
            self.get_detail_url(content_list)
            bs.quit()  # quit() also terminates the chromedriver process, unlike close()
    
        def run(self):
            city_number = self.get_city_number()
            url_list = self.make_url(city_number)
            for url in url_list:
                self.get_url_list(url)
    
    
    class Consumer(threading.Thread):
        """消费者"""
    
        def __init__(self):
            super().__init__()
            self.queue = Factory().get_queue()
            self.__ua = UserAgent()
            self.try_number = 3
    
        def get_html(self, headers):
            try:
                url_str = self.queue.get(timeout=30)
            except Exception:
                # Queue has been empty for 30s: assume the producer is done
                print(" ---- Queue is None, Program Exit ----")
                sys.exit()  # SystemExit ends only this consumer thread
            print(url_str)
            response = requests.get(url_str, headers=headers)
            assert response.status_code == 200
            return response.text
    
        def filter_html(self, html_str):
            item = {}
            try:
                # Detail pages come in two layout variants; try the common one first
                item['job_name'] = re.findall(r'<h1 class="l info-h3">(.*?)</h1>',
                                              html_str, re.DOTALL)[0]
                item['money'] = re.findall(r'<li class="info-money">\s+<strong>(.*?)</strong>',
                                           html_str, re.DOTALL)[0]
            except IndexError:
                # Fall back to the alternative layout
                item['job_name'] = re.findall(r'<li class="info-h3">\s+(.*?)\s+</li>\s+<li class="info-money">',
                                              html_str, re.DOTALL)[0]
                item['money'] = re.findall(r'<div class="l info-money">\s+<strong>(.*?)</strong>',
                                           html_str, re.DOTALL)[0]
            item['company'] = re.findall(r'<div class="company l">\s+<a.*?>(.*?)</a>',
                                         html_str, re.DOTALL)[0]
            item['city'] = re.findall(r'<div class="info-three l">\s+<span><a.*?>(.*?)</a>.*?</span>',
                                      html_str, re.DOTALL)[0]
            item['ex_requirement'] = re.findall(
                r'<div class="info-three l">\s+<span><a.*?</a>.*?</span>\s+<span>(.*?)</span>',
                html_str, re.DOTALL)[0]
            item['education'] = re.findall(
                r'<div class="info-three l">\s+<span><a.*?</a>.*?</span>\s+<span>.*?</span>\s+<span>(.*?)</span>',
                html_str, re.DOTALL)[0]
            item['work_address'] = re.findall(r'<p class="add-txt"><span class="icon-address"></span>(.*?)</p>',
                                              html_str, re.DOTALL)[0]
            yield item
    
        def run(self):
            while True:
                headers = self.__ua.get_headers()
                # download
                html_str = self.get_html(headers)
                # filter
                item = self.filter_html(html_str)
                for i in item:
                    print(i)
    
    
    if __name__ == '__main__':
        # e.g. python zhilian_spider.py 北京 python 20
        # defaults: city='北京', kw='python', max_page=20
        producer_info = sys.argv[1:]
        p1 = Producer()
        try:
            p1.city = producer_info[0]
            p1.kw = producer_info[1]
            p1.max_page = int(producer_info[2])
        except (IndexError, ValueError):
            print('------ Illegal parameters, using the defaults ------')
        p1.start()
    
        c1 = Consumer()
        c2 = Consumer()
        c1.start()
        c2.start()
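
The script imports two local helper modules that the post does not include: name_to_number (a NAME_TO_NUMBER dict mapping Chinese city names to Zhaopin's numeric city codes) and headers (a UserAgent class whose get_headers() returns request headers). Minimal stand-ins along the following lines make the listing runnable; apart from 530 for 北京, which the script itself uses as its default, the concrete codes and user-agent strings here are illustrative assumptions:

    # name_to_number.py -- hypothetical stand-in for the module used above
    NAME_TO_NUMBER = {
        '北京': 530,   # default code used by the script
        '杭州': 653,   # assumed value; substitute Zhaopin's real city code
    }

    # headers.py -- hypothetical stand-in returning rotating request headers
    import random


    class UserAgent(object):
        _AGENTS = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        ]

        def get_headers(self):
            # Pick a random User-Agent per request to vary the fingerprint
            return {'User-Agent': random.choice(self._AGENTS)}

One design note: the consumers stop by timing out on queue.get() after 30 seconds of inactivity. A more deterministic alternative is for the producer to put one sentinel value (e.g. None) per consumer into the queue when it finishes, and for consumers to exit when they receive it.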
    