A Python Producer-Consumer Case Study -- Multithreaded Scraping of Zhaopin (智联招聘)

This post presents a Python crawler for the Zhaopin (智联招聘) job site: a factory class built on the singleton pattern shares a queue between threads, a producer class extracts the URLs of job-detail pages, and consumer classes scrape the actual data, demonstrating multithreading together with the Selenium and Requests libraries.

Core idea:
  • Create a factory class that uses the singleton pattern to share queue data between threads (a minimal sketch follows this list)
  • Create a producer class that extracts Zhaopin detail-page URLs and puts them into the factory's queue
  • Create a consumer class that takes URLs out of the factory's queue and scrapes the data
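
The singleton is what makes the sharing work: every Factory() call returns the same object, so producer and consumer threads transparently operate on one queue.Queue. A minimal standalone sketch of the idea, condensed from the full script below:

    from queue import Queue


    class Factory(object):
        """Singleton holding the queue shared by all threads."""
        __instance = None

        def __new__(cls, *args, **kwargs):
            # Construct the instance (and its queue) exactly once
            if cls.__instance is None:
                cls.__instance = super().__new__(cls)
                cls.__instance.queue = Queue()
            return cls.__instance


    # Every call returns the same instance, hence the same queue:
    assert Factory() is Factory()
    Factory().queue.put('hello')
    print(Factory().queue.get())  # -> hello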
Code details:
  • Usage: python zhilian_spider.py <city> <keyword> <max_page>
  • Example: python zhilian_spider.py 杭州 python 10
  • Full code:
    import sys
    import re
    import time
    import requests
    from queue import Queue
    from selenium import webdriver
    import threading
    from name_to_number import NAME_TO_NUMBER
    from headers import UserAgent
    
    
    class Factory(object):
        """
        工厂类, 利用单例模式实现数据共享
        """
        __instance = None  # 记录是否执行过构造方法
        __first_init = False  # 记录是否执行过初始化方法
    
        def __new__(cls, *args, **kwargs):
            # Singleton: run the constructor logic only once
            if not Factory.__instance:
                Factory.__instance = object.__new__(cls)
            return Factory.__instance
    
        def __init__(self):
            # Singleton: run the initialisation logic only once
            if not self.__first_init:
                self.queue = Queue()
                Factory.__first_init = True
    
        def get_queue(self):
            return self.queue
    
    
    class Producer(threading.Thread):
        """生产者"""
    
        def __init__(self, city='北京', kw='python', max_page=20):
            super().__init__()
            self.city = city
            self.kw = kw
            self.max_page = max_page
            self.production_queue = Factory().get_queue()
            self.start_url = "https://sou.zhaopin.com/?p={}&jl={}&kw={}&kt=3"
    
        def make_url(self, city_number):
            # Pages are numbered 1..max_page, so the range end must be max_page + 1
            return [self.start_url.format(i, city_number, self.kw) for i in range(1, self.max_page + 1)]
    
        def get_city_number(self):
            city_number = NAME_TO_NUMBER.get(self.city)
            if not city_number:
                print('Invalid city parameter: %s not found, falling back to the default 北京' % self.city)
                city_number = 530  # Zhaopin's numeric code for 北京
            return city_number
    
        def get_detail_url(self, content_list):
            # A results page holds at most 90 listings
            for i in range(1, 91):
                try:
                    url_str = content_list.find_element_by_xpath('./div[{}]//a'.format(i)).get_attribute('href')
                except Exception:
                    # Fewer than 90 listings on this page; skip the missing slots
                    continue
                self.production_queue.put(url_str)
    
        def get_url_list(self, url_str):
            # Open the listing page in a fresh browser instance
            bs = webdriver.Chrome()
            bs.get(url_str)
            time.sleep(3)
            # Dismiss the risk-warning dialog if it appears
            try:
                button = bs.find_element_by_xpath('//div[@class="risk-warning__content"]/button')
                button.click()
                time.sleep(3)
            except Exception as e:
                print(e)
            content_list = bs.find_element_by_id('listContent')
            self.get_detail_url(content_list)
            bs.quit()  # quit() also terminates the chromedriver process, unlike close()
    
        def run(self):
            city_number = self.get_city_number()
            url_list = self.make_url(city_number)
            for url in url_list:
                self.get_url_list(url)
    
    
    class Consumer(threading.Thread):
        """消费者"""
    
        def __init__(self):
            super().__init__()
            self.queue = Factory().get_queue()
            self.__ua = UserAgent()
            self.try_number = 3
    
        def get_html(self, headers):
            try:
                url_str = self.queue.get(timeout=30)
            except Exception:
                # Queue has been empty for 30s: assume the producer is done
                print(" ---- Queue is None, Program Exit ----")
                sys.exit()  # SystemExit ends only this consumer thread
            print(url_str)
            response = requests.get(url_str, headers=headers)
            assert response.status_code == 200
            return response.text
    
        def filter_html(self, html_str):
            item = {}
            try:
                # Detail pages come in two layout variants; try the common one first
                item['job_name'] = re.findall(r'<h1 class="l info-h3">(.*?)</h1>',
                                              html_str, re.DOTALL)[0]
                item['money'] = re.findall(r'<li class="info-money">\s+<strong>(.*?)</strong>',
                                           html_str, re.DOTALL)[0]
            except IndexError:
                # Fall back to the alternative layout
                item['job_name'] = re.findall(r'<li class="info-h3">\s+(.*?)\s+</li>\s+<li class="info-money">',
                                              html_str, re.DOTALL)[0]
                item['money'] = re.findall(r'<div class="l info-money">\s+<strong>(.*?)</strong>',
                                           html_str, re.DOTALL)[0]
            item['company'] = re.findall(r'<div class="company l">\s+<a.*?>(.*?)</a>',
                                         html_str, re.DOTALL)[0]
            item['city'] = re.findall(r'<div class="info-three l">\s+<span><a.*?>(.*?)</a>.*?</span>',
                                      html_str, re.DOTALL)[0]
            item['ex_requirement'] = re.findall(
                r'<div class="info-three l">\s+<span><a.*?</a>.*?</span>\s+<span>(.*?)</span>',
                html_str, re.DOTALL)[0]
            item['education'] = re.findall(
                r'<div class="info-three l">\s+<span><a.*?</a>.*?</span>\s+<span>.*?</span>\s+<span>(.*?)</span>',
                html_str, re.DOTALL)[0]
            item['work_address'] = re.findall(r'<p class="add-txt"><span class="icon-address"></span>(.*?)</p>',
                                              html_str, re.DOTALL)[0]
            yield item
    
        def run(self):
            while True:
                headers = self.__ua.get_headers()
                # download
                html_str = self.get_html(headers)
                # filter
                item = self.filter_html(html_str)
                for i in item:
                    print(i)
    
    
    if __name__ == '__main__':
        # e.g. python zhilian_spider.py 北京 python 20
        # defaults: city='北京', kw='python', max_page=20
        producer_info = sys.argv[1:]
        p1 = Producer()
        try:
            p1.city = producer_info[0]
            p1.kw = producer_info[1]
            p1.max_page = int(producer_info[2])
        except (IndexError, ValueError):
            print('------ Illegal parameters, using the defaults ------')
        p1.start()
    
        c1 = Consumer()
        c2 = Consumer()
        c1.start()
        c2.start()
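
The script imports two local helper modules that the post does not include: name_to_number (a NAME_TO_NUMBER dict mapping Chinese city names to Zhaopin's numeric city codes) and headers (a UserAgent class whose get_headers() returns request headers). Minimal stand-ins along the following lines make the listing runnable; apart from 530 for 北京, which the script itself uses as its default, the concrete codes and user-agent strings here are illustrative assumptions:

    # name_to_number.py -- hypothetical stand-in for the module used above
    NAME_TO_NUMBER = {
        '北京': 530,   # default code used by the script
        '杭州': 653,   # assumed value; substitute Zhaopin's real city code
    }

    # headers.py -- hypothetical stand-in returning rotating request headers
    import random


    class UserAgent(object):
        _AGENTS = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        ]

        def get_headers(self):
            # Pick a random User-Agent per request to vary the fingerprint
            return {'User-Agent': random.choice(self._AGENTS)}

One design note: the consumers stop by timing out on queue.get() after 30 seconds of inactivity. A more deterministic alternative is for the producer to put one sentinel value (e.g. None) per consumer into the queue when it finishes, and for consumers to exit when they receive it.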
    