Scraping Job Listings from BOSS直聘 (Boss Zhipin)

The script below fetches listing pages with requests, parses each job card with BeautifulSoup, and appends the results to job.csv.

```python
import urllib.parse
import requests
import time
import random
from bs4 import BeautifulSoup
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
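# Disable certificate verification for the standard library's HTTPS context
# (a common workaround for local SSL certificate errors).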

page_headers={
         'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
         'Connection':'keep-alive',
         'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Host':'www.zhipin.com',
         'Accept-Language':'zh-CN,zh;q=0.8',
         'Cache-Control':'max-age=0',
         'Referer':'https://www.zhipin.com/',
         'Upgrade-Insecure-Requests':'1',
         'Cookie':'lastCity=101010100; __c=1560175643; __g=-; JSESSIONID=5CEE199DE881108BD5A8D4335B72974B; t=GhIYKPBA1hVAPL3s; wt=GhIYKPBA1hVAPL3s; __l=l=%2Fwww.zhipin.com%2F&r=; __a=18622455.1560175643..1560175643.5.1.5.5; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1560175643; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1560175780'
         }
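# NOTE: the Cookie and JSESSIONID above are session-specific and expire quickly;
# copy fresh values from a logged-in browser session before running the script.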

#Set the job title keyword to search for
key_words = "数据分析"
key = urllib.parse.quote(key_words)
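# quote() percent-encodes the UTF-8 bytes of the keyword, e.g.
# "数据分析" (data analysis) -> "%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90"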
#url='https://www.zhipin.com/c101010100/?query='+key+'&page=1&ka=page-1'

def get_data(url):
    try:
        res=requests.get(url,headers=page_headers)
        status=res.status_code
        data=res.text
        print(status)
        soup=BeautifulSoup(data,'lxml')
        #print(soup.prettify())  # Print the prettified HTML for debugging
        return soup,status

    except Exception as e:
        print(str(e))
        return 0,0
    
def get_job(url):
    soup,status=get_data(url)
    if status==200:
        job_all=soup.find_all('div',class_="job-primary")
        for job in job_all:
            try:
                #Job title
                job_title=job.find('div',class_="job-title").string
                #Salary
                job_salary=job.find('span',class_="red").string
                #Job tags
                job_tag1=job.p.text
                #Company
                job_company=job.find('div',class_="company-text").a.text
                #Link to the job detail page
                job_url=job.find('div',class_="company-text").a.attrs['href']
                #Company tags
                job_tag2=job.find('div',class_="company-text").p.text
                #Publish time
                job_time=job.find('div',class_="info-publis").p.text
                
                with open('job.csv','a+',encoding='utf-8') as fh:
                    fh.write(job_company+","+job_title+","+job_salary+","+job_tag1+","+job_time+","+job_tag2+",https://www.zhipin.com"+job_url+"\n")
            except Exception as e:
                print(str(e))


if __name__=='__main__':
    with open('job.csv','w',encoding='utf-8') as fh:
        fh.write("Company,Job Title,Salary,Job Tags,Posted,Company Tags,Job URL\n")
    for i in range(1,11):
        print("Crawling page %s ..." % i)
        url='https://www.zhipin.com/c101010100/?query='+key+'&page='+str(i)+'&ka=page-'+str(i)
        get_job(url)
        #Random delay between pages to reduce the chance of being blocked
        span=round(random.random()*6,1)
        time.sleep(span)
```
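Because each CSV row above is built by plain string concatenation, a field that itself contains a comma (for example in a company name) will shift the columns. Below is a minimal sketch of the same write using the standard `csv` module; the `save_row` helper is added for illustration and is not part of the original script:

```python
import csv

def save_row(path, row):
    # csv.writer quotes any field containing commas, quotes, or newlines
    with open(path, 'a+', newline='', encoding='utf-8') as fh:
        csv.writer(fh).writerow(row)

# Inside get_job(), instead of manual concatenation:
# save_row('job.csv', [job_company, job_title, job_salary, job_tag1,
#                      job_time, job_tag2, 'https://www.zhipin.com' + job_url])
```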

### BOSS Zhipin Job Scraper Source Code (Scrapy + Selenium)

The following crawler combines Python's Scrapy framework with Selenium to fetch job listings from BOSS直聘 and store them in MongoDB.

#### 1. Create the Scrapy Project

Initialize a new Scrapy project `boss_zhipin` and generate the spider file:

```bash
scrapy startproject boss_zhipin
cd boss_zhipin
scrapy genspider zhipin_spider https://www.zhipin.com/
```

#### 2. Install Dependencies

Make sure the following packages are installed:

```bash
pip install scrapy selenium pymongo
```

#### 3. Edit `settings.py`

Enable the custom middleware and configure a User-Agent list:

```python
# settings.py
USER_AGENTS_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
]

DOWNLOADER_MIDDLEWARES = {
    'boss_zhipin.middlewares.BossZhipinDownloaderMiddleware': 543,
}
```

#### 4. Custom Downloader Middleware

Set a random User-Agent in `middlewares.py`:

```python
import random

from scrapy import signals

from boss_zhipin.settings import USER_AGENTS_LIST


class BossZhipinDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        user_agent = random.choice(USER_AGENTS_LIST)
        request.headers['User-Agent'] = user_agent

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```

#### 5. Write the Spider

Implement the crawling logic in `spiders/zhipin_spider.py`:

```python
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pymongo import MongoClient

from boss_zhipin.items import ZhiPinItem


class ZhipinSpider(scrapy.Spider):
    name = "zhipin_spider"
    allowed_domains = ["www.zhipin.com"]
    # query=大数据 (big data), city=101010100 (Beijing)
    start_urls = ['https://www.zhipin.com/job_detail/?query=%E5%A4%A7%E6%95%B0%E6%8D%AE&city=101010100']

    def __init__(self):
        # Headless Chrome renders the JavaScript-driven listing pages
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=chrome_options)

        # Connect to MongoDB
        client = MongoClient('mongodb://localhost:27017/')
        db = client['boss_zhipin']
        self.collection = db['jobs']

    def parse(self, response):
        self.driver.get(response.url)
        job_list = self.driver.find_elements_by_css_selector('.job-primary')

        for job in job_list:
            try:
                item = ZhiPinItem()
                title_element = job.find_element_by_class_name('job-title')
                salary_element = job.find_element_by_class_name('red')
                company_element = job.find_element_by_class_name('company-text').find_element_by_tag_name('a')
                detail_link = company_element.get_attribute('href')

                item['title'] = title_element.text.strip()
                item['salary'] = salary_element.text.strip()
                item['company'] = company_element.text.strip()
                item['link'] = detail_link

                yield item

                # Store the record in MongoDB
                self.collection.insert_one(dict(item))
            except Exception as e:
                print(f"Error processing job: {e}")

        # Follow the "next page" link until it is disabled
        next_page_button = self.driver.find_elements_by_css_selector('.next')
        if next_page_button and 'disabled' not in next_page_button[0].get_attribute('class'):
            next_url = next_page_button[0].get_attribute('href')
            yield scrapy.Request(next_url, callback=self.parse)

    def closed(self, reason):
        self.driver.quit()
```

Note that the `find_element(s)_by_*` helpers were removed in Selenium 4; with a recent Selenium, use `driver.find_element(By.CSS_SELECTOR, ...)` instead.

#### 6. Define the Item Class

Define the data structure in `items.py`:

```python
import scrapy


class ZhiPinItem(scrapy.Item):
    title = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
    link = scrapy.Field()
```
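The spider above writes to MongoDB directly inside `parse()`. The more conventional Scrapy approach is to keep persistence in an item pipeline; here is a minimal sketch, assuming a `MongoPipeline` class in `pipelines.py` (the class name and settings key are illustrative additions, not part of the original project):

```python
# pipelines.py
from pymongo import MongoClient


class MongoPipeline:
    def open_spider(self, spider):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.collection = self.client['boss_zhipin']['jobs']

    def process_item(self, item, spider):
        # Every yielded ZhiPinItem passes through here before being stored
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
```

Enable it in `settings.py` with `ITEM_PIPELINES = {'boss_zhipin.pipelines.MongoPipeline': 300}`; the explicit `insert_one` call in the spider can then be removed.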
#### 7. Visualization

The data in MongoDB can be visualized with Matplotlib or Seaborn, for example a salary distribution histogram:

```python
import matplotlib.pyplot as plt
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['boss_zhipin']
collection = db['jobs']

# Salaries are stored as strings such as "15k-25k"; take the lower bound in k RMB
salaries = [int(job['salary'].split('-')[0].lower().replace('k', ''))
            for job in collection.find()]

plt.hist(salaries, bins=20)
plt.title('Salary Distribution of Data Science Jobs')
plt.xlabel('Salary Range')
plt.ylabel('Frequency')
plt.show()
```

---

### Notes

- When using Selenium to simulate a browser, keep the site's anti-scraping measures in mind[^5].
- BOSS直聘 may load content dynamically, so analyze the page structure and adjust the XPath/CSS selector expressions accordingly[^1].
- If you need to cover several cities, switch between city-specific listings by changing the city code parameter in the URL (see the sketch after this list)[^1].
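Below is a minimal sketch of crawling more than one city by swapping the city code in the listing URL, reusing `key` and `get_job` from the requests-based script above. 101010100 is the Beijing code already used there; the other codes are assumed examples and should be verified against the site:

```python
import random
import time

# City name -> BOSS直聘 city code (values other than Beijing are assumed examples)
CITY_CODES = {
    'Beijing': '101010100',
    'Shanghai': '101020100',
    'Guangzhou': '101280100',
}

for city, code in CITY_CODES.items():
    print('Crawling city: %s' % city)
    for i in range(1, 11):
        url = ('https://www.zhipin.com/c' + code + '/?query=' + key
               + '&page=' + str(i) + '&ka=page-' + str(i))
        get_job(url)
        time.sleep(round(random.random() * 6, 1))  # random delay, as in the main loop
```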