# Complete code:
import sys
import re
import threading
import time
from queue import Empty, Queue

import requests
from selenium import webdriver

from headers import UserAgent
from name_to_number import NAME_TO_NUMBER
class Factory(object):
    """
    Factory class that uses the singleton pattern to share data.

    Every ``Factory()`` call hands back the same object, and that object's
    queue is created only once, so all callers receive the same ``Queue``.
    """
    __instance = None
    __first_init = False

    def __new__(cls, *args, **kwargs):
        """Return the stored instance, allocating it on the first call."""
        instance = Factory.__instance
        if not instance:
            instance = Factory.__instance = object.__new__(cls)
        return instance

    def __init__(self):
        """Create the shared queue the first time an instance is initialised."""
        # __init__ runs on every Factory() call; bail out after the first
        # one so the existing queue is not replaced by an empty one.
        if self.__first_init:
            return
        self.queue = Queue()
        Factory.__first_init = True

    def get_queue(self):
        """Return the shared queue."""
        return self.queue
class Producer(threading.Thread):
    """Producer thread: collects job-detail URLs and puts them on the shared queue.

    For every result page from 1 to ``max_page`` a Chrome window is opened
    on the search URL, the ``href`` of each listed job is read, and the link
    is pushed onto the queue obtained from ``Factory``.
    """

    def __init__(self, city='北京', kw='python', max_page=20):
        """Store the search parameters.

        Args:
            city: City name, looked up in ``NAME_TO_NUMBER``.
            kw: Search keyword formatted into the ``kw`` URL parameter.
            max_page: Number of the last result page to visit (inclusive).
        """
        super().__init__()
        self.city = city
        self.kw = kw
        self.max_page = max_page
        self.production_queue = Factory().get_queue()
        self.start_url = "https://sou.zhaopin.com/?p={}&jl={}&kw={}&kt=3"

    def make_url(self, city_number):
        """Return the search URLs for pages ``1..max_page`` inclusive.

        Args:
            city_number: Value formatted into the ``jl`` URL parameter.

        Returns:
            A list with one URL per page; empty when ``max_page`` is below 1.
        """
        # ``+ 1`` because range() excludes its upper bound; without it the
        # page numbered ``max_page`` itself was never requested.
        return [
            self.start_url.format(page, city_number, self.kw)
            for page in range(1, self.max_page + 1)
        ]

    def get_city_number(self):
        """Return the number mapped to ``self.city``, or 530 when it is unknown."""
        city_number = NAME_TO_NUMBER.get(self.city)
        if not city_number:
            print('city参数错误, 找不到地址%s, 使用默认地址北京' % self.city)
            city_number = 530
        return city_number

    def get_detail_url(self, content_list):
        """Queue the first link found under each child ``div`` of ``content_list``.

        Args:
            content_list: Selenium element that contains the result rows.
        """
        # NOTE(review): the ``find_element_by_*`` helpers are the Selenium 3
        # API and were removed in later Selenium 4 releases; confirm the
        # installed version before upgrading.
        for index in range(1, 91):
            try:
                link = content_list.find_element_by_xpath('./div[{}]//a'.format(index))
                url_str = link.get_attribute('href')
            except Exception:
                # Best effort: positions without a link are skipped.
                continue
            self.production_queue.put(url_str)

    def get_url_list(self, url_str):
        """Open one result page in Chrome and queue the job links it lists.

        Args:
            url_str: URL of the result page.

        Raises:
            Whatever Selenium raises when the page or the ``listContent``
            element cannot be loaded; the browser is shut down first.
        """
        bs = webdriver.Chrome()
        try:
            bs.get(url_str)
            time.sleep(3)  # give the page time to render
            try:
                # Dismiss the risk-warning dialog when it is shown.
                button = bs.find_element_by_xpath('//div[@class="risk-warning__content"]/button')
                button.click()
                time.sleep(3)
            except Exception as e:
                # Best effort: report the problem and carry on with the page.
                print(e)
            content_list = bs.find_element_by_id('listContent')
            self.get_detail_url(content_list)
        finally:
            # quit() ends the whole driver session; close() only closed the
            # window and was skipped entirely whenever a step above raised.
            bs.quit()

    def run(self):
        """Thread body: visit every result page in order."""
        city_number = self.get_city_number()
        for url in self.make_url(city_number):
            self.get_url_list(url)
class Consumer(threading.Thread):
    """Consumer thread: downloads queued job pages and prints the parsed fields."""

    # Seconds to wait for a single HTTP response before treating the attempt
    # as failed, so one stalled connection cannot block the thread forever.
    REQUEST_TIMEOUT = 10

    # For each output field, the candidate regexes in priority order; the
    # first one that matches supplies the value. Raw strings keep ``\s`` from
    # being read as an (invalid) string escape.
    _FIELD_PATTERNS = (
        ('job_name', (
            r'<h1 class="l info-h3">(.*?)</h1>',
            r'<li class="info-h3">\s+(.*?)\s+</li>\s+<li class="info-money">',
        )),
        ('money', (
            r'<li class="info-money">\s+<strong>(.*?)</strong>',
            r'<div class="l info-money">\s+<strong>(.*?)</strong>',
        )),
        ('company', (
            r'<div class="company l">\s+<a.*?>(.*?)</a>',
        )),
        ('city', (
            r'<div class="info-three l">\s+<span><a.*?>(.*?)</a>.*?</span>',
        )),
        ('ex_requirement', (
            r'<div class="info-three l">\s+<span><a.*?</a>.*?</span>\s+<span>(.*?)</span>',
        )),
        ('education', (
            r'<div class="info-three l">\s+<span><a.*?</a>.*?</span>\s+<span>.*?</span>\s+<span>(.*?)</span>',
        )),
        ('work_address', (
            r'<p class="add-txt"><span class="icon-address"></span>(.*?)</p>',
        )),
    )

    def __init__(self):
        """Attach to the shared queue and create the header generator."""
        super().__init__()
        self.queue = Factory().get_queue()
        self.__ua = UserAgent()
        # Maximum number of download attempts per URL.
        self.try_number = 3

    def get_html(self, headers):
        """Take the next URL from the queue and return the page text.

        Args:
            headers: Request headers passed to ``requests.get``.

        Returns:
            The response body of the first attempt that answers with 200.

        Raises:
            SystemExit: No URL arrived within 30 seconds; raised inside a
                thread this ends that thread.
            requests.RequestException: Every attempt failed or returned a
                status other than 200.
        """
        try:
            url_str = self.queue.get(timeout=30)
        except Empty:
            print(" ---- Queue is None, Program Exit ----")
            # sys.exit() instead of quit(): quit() is only injected by the
            # ``site`` module and is not guaranteed to exist.
            sys.exit()
        print(url_str)

        last_error = None
        for _ in range(max(1, self.try_number)):
            try:
                response = requests.get(url_str, headers=headers,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                last_error = exc
                continue
            if response.status_code == 200:
                return response.text
            # Explicit check rather than ``assert``, which is removed when
            # Python runs with -O.
            last_error = requests.HTTPError(
                'unexpected status %s for %s' % (response.status_code, url_str))
        raise last_error

    @staticmethod
    def _first_match(field, patterns, html_str):
        """Return group 1 of the first pattern in ``patterns`` that matches."""
        for pattern in patterns:
            match = re.search(pattern, html_str, re.DOTALL)
            if match is not None:
                return match.group(1)
        raise IndexError('field %r not found in page' % field)

    def filter_html(self, html_str):
        """Yield one dict with the job fields extracted from ``html_str``.

        The dict has the keys ``job_name``, ``money``, ``company``, ``city``,
        ``ex_requirement``, ``education`` and ``work_address``.

        ``job_name`` and ``money`` each have two alternative markups and are
        resolved independently. The former code re-applied the primary
        ``money`` pattern after its fallback branch, which discarded the
        fallback value and raised on pages that only carry the alternative
        salary markup.

        Raises:
            IndexError: A field matches none of its patterns (raised when
                the generator is iterated).
        """
        item = {}
        for field, patterns in self._FIELD_PATTERNS:
            item[field] = self._first_match(field, patterns, html_str)
        yield item

    def run(self):
        """Thread body: download and print pages until the queue stays empty."""
        while True:
            headers = self.__ua.get_headers()
            try:
                html_str = self.get_html(headers)
                for item in self.filter_html(html_str):
                    print(item)
            except (requests.RequestException, IndexError) as exc:
                # One unreachable or unparseable page must not stop the
                # thread; report it and move on to the next URL.
                print(' ---- page skipped: %s ----' % exc)
def parse_producer_args(argv):
    """Turn the command-line arguments into ``Producer`` keyword arguments.

    Positional arguments are ``city``, ``kw`` and ``max_page``; each one is
    optional and extra arguments are ignored. Omitting arguments is not an
    error: the corresponding ``Producer`` default is used.

    Args:
        argv: Argument list without the program name.

    Returns:
        A dict containing only the options that were supplied and valid.
    """
    options = dict(zip(('city', 'kw'), argv))
    if len(argv) > 2:
        try:
            options['max_page'] = int(argv[2])
        except ValueError:
            # Only a non-integer page count is actually illegal; the
            # default page count is kept in that case.
            print('------ Illegal parameters ------')
    return options


if __name__ == '__main__':
    p1 = Producer(**parse_producer_args(sys.argv[1:]))
    p1.start()
    # Two consumers drain the queue filled by the producer.
    c1 = Consumer()
    c2 = Consumer()
    c1.start()
    c2.start()