Scraping BOSS Zhipin job listings (selected fields, popular cities) with Selenium + Chrome (2021)
Takeaways
Logging in first lets you scrape noticeably more data.
After the browser opens BOSS Zhipin, have the script sleep() for a moment and use that window to log in once by scanning the QR code (see the short sketch below).
Without any extra tricks, each city/keyword combination returns roughly 10 pages of listings.
Scraping this site with requests or Scrapy demands very fluent cookie handling, which is a real pain.
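A minimal sketch of the manual-login trick described above (assumes chromedriver is already on PATH; the full script below does the same thing inside get_page_data):

# Open the site, then pause so you can scan the QR code and log in by hand;
# scraping only starts after the pause, with the logged-in session.
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get('https://www.zhipin.com/beijing/')
time.sleep(9)  # log in during this pause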
Final result
File overview
Create the database and the data table first; MySQL is used here.
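The post does not include the table definition, so here is a minimal sketch reconstructed from the INSERT statement in the script. The database name boss_zhi_pin, table name boss_position_4 and connection parameters come from the code; the VARCHAR-only column types are my assumption, chosen because the script inserts a row of text labels as its first record.

# Sketch only: create the database/table the script below expects.
# Schema is inferred from the INSERT statement; all column types are assumed VARCHAR.
from pymysql import connect

ddl = """CREATE TABLE IF NOT EXISTS boss_position_4 (
    id INT AUTO_INCREMENT PRIMARY KEY,
    search_word VARCHAR(50), city_name VARCHAR(50), position_name VARCHAR(100),
    company_name VARCHAR(100), areas VARCHAR(50), street VARCHAR(50),
    company_style VARCHAR(50), scale VARCHAR(50), xue_li VARCHAR(50),
    work_experience VARCHAR(50), financing_situation VARCHAR(50),
    benefits VARCHAR(255), salary VARCHAR(50), salary_month VARCHAR(50),
    salary_min VARCHAR(50), salary_max VARCHAR(50), average_salary VARCHAR(50)
) DEFAULT CHARSET=utf8"""

conn = connect(host='localhost', port=3306, user='root', passwd='password', charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS boss_zhi_pin DEFAULT CHARSET utf8")
cur.execute("USE boss_zhi_pin")
cur.execute(ddl)
conn.commit()
cur.close()
conn.close()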
Define the search keywords
The full code is below; grab it and use it whenever you need.
# To run this file, first download the matching Chrome driver executable (chromedriver),
# then put this script and the driver in the same directory.
from selenium import webdriver  # the WebDriver itself
# Keys is needed for keyboard actions such as Enter and Ctrl (imported but not used below)
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# WebDriverWait handles the polling wait
from selenium.webdriver.support.ui import WebDriverWait
# expected_conditions supplies the wait conditions
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import etree
from pymysql import connect
# First visit for each city: load the listing page and hand all the job <li> nodes to the parser
def get_page_data(city_url, search_word, t):
    if t == 0:
        print('You have 9 seconds - go scan the QR code and log in; every new city also waits up to 8 seconds')
        time.sleep(9)  # on the very first visit you can log in manually, which unlocks more data
    driver.get(city_url)
    WebDriverWait(driver, 8).until(
        EC.presence_of_element_located(
            (By.XPATH, "//div[@class='job-list']//ul//li"))  # wait for the job list via XPath
    )
    con = etree.HTML(driver.page_source)
    page_parse(con, search_word)
def page_parse(con, search_word2):
    global num
    global insert_num
    content_now_page = con.xpath("//div[@class='job-list']//ul//li")
    for i in content_now_page:
        try:
            city_name = con.xpath("//dd[@class='city-wrapper']/a/text()")[0].strip()
            search_word = search_word2
            print('city name', city_name)
            position_name = i.xpath("(.//span[@class='job-name'])/a/text()")[0]  # job title
            company_name = i.xpath("(.//div[@class='company-text'])/h3/a/text()")[0]  # company name
            areas_all = i.xpath("(.//span[@class='job-area'])/text()")  # location string, e.g. 北京·海淀区·东北旺
            areas_detail = areas_all[0].split('·')
            if len(areas_detail) == 3:
                areas = areas_detail[1]    # district, e.g. 海淀区
                street = areas_detail[-1]  # street / neighbourhood, e.g. 东北旺
            elif len(areas_detail) == 2:
                areas = areas_detail[1]
                street = ''
            else:
                areas = ''
                street = ''
            company_info = i.xpath("(.//div[@class='company-text'])/p//text()")
            company_style = company_info[0]  # company type / industry
            print('position_name', position_name)
            print('company_name', company_name)
            print('areas', areas, street, company_style)
            financing_situation = company_info[1]  # financing stage
            scale = company_info[2]  # company size
            yao_qiu = i.xpath("(.//div[contains(@class,'job-limit')])//text()")  # requirements block
            w = []
            for i2 in yao_qiu:
                if len(i2.strip()) >= 2:
                    w.append(i2)
            xue_li = w[2]           # education requirement
            work_experience = w[1]  # work-experience requirement
            # keep only monthly salary ranges such as "15-25K·13薪"; skip daily rates (containing 天)
            if 'K' in w[0].split('·')[0] and '天' not in w[0].split('·')[0]:
                salary = w[0].split('·')[0].replace('K', '')  # salary range, e.g. "15-25"
                salary_min = salary.split('-')[0]  # lower bound
                salary_max = salary.split('-')[1]  # upper bound
                average_salary = round((float(salary_min) + float(salary_max)) / 2, 2)  # midpoint of the range
                try:
                    salary_month = w[0].split('·')[1]  # number of paid months, e.g. 13薪, 15薪
                except IndexError:
                    salary_month = '12薪'  # default: 12 paid months
                print(xue_li, work_experience, salary, salary_month)
                benefits = ''.join(i.xpath("(.//div[@class='info-desc'])/text()"))  # benefits / perks
                print('benefits', benefits)
                num += 1
                print('-----------------', num, 'records scraped -------------')
            else:
                continue
            # city_code = city_code_list[0]  # city code
            # url_detail_page = driver.current_url  # current page url
            # params_kong leaves out areas/street, which are allowed to be empty;
            # it is only used to check that every other field has a value
            params_kong = [search_word, city_name, position_name, company_name, company_style, scale,
                           xue_li, work_experience, financing_situation, benefits, salary, salary_month,
                           salary_min, salary_max, average_salary]
            params = [search_word, city_name, position_name, company_name, areas, street, company_style, scale,
                      xue_li, work_experience, financing_situation, benefits, salary, salary_month,
                      salary_min, salary_max, average_salary]
            none_data = 1
            for zi_duan in params_kong:
                if len(str(zi_duan)) == 0:
                    none_data = 0
                    print(num - insert_num, 'records had an empty field')
                    break
                else:
                    none_data = 1
            if none_data == 1:
                print('no empty fields, inserting into the database')
                cur.execute(sql_order, params)
                coon.commit()
                insert_num += 1
                print('---------', insert_num, 'records inserted ---------')
            else:
                print(none_data, 'skipped, not inserted')
        except:
            continue
    print('finished looping over this page')
    time.sleep(1)
    next_page(search_word2)  # move on to the next page (mutual recursion with page_parse)
def next_page(search_word2):
    global page
    page += 1
    print('==== city ====', driver.current_url, '==== page', page, 'loading ====')
    try:
        element_next = WebDriverWait(driver, 8).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='page']//a[@class='next']"))  # the "next page" link
        )
        element_next.click()
        WebDriverWait(driver, 8).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@class='job-list']//ul//li"))  # wait for the new job list
        )
        con = etree.HTML(driver.page_source)
        page_parse(con, search_word2)
    # if there is no "next page" link (or the wait times out), this city is done
    except:
        print('-----------------', driver.current_url, 'this city is finished --------------------------')
        page = 0
if __name__ == '__main__':
    # optional proxy pool; not used below, but can be plugged into --proxy-server
    ip_source = [{"ip": "27.9.47.216", "port": 4220}, {"ip": "183.166.135.43", "port": 4226},
                 {"ip": "101.74.181.221", "port": 4245}, {"ip": "175.147.100.112", "port": 4260},
                 {"ip": "115.205.77.140", "port": 4286}, {"ip": "113.237.4.211", "port": 4256},
                 {"ip": "116.115.209.201", "port": 4245}, {"ip": "175.174.190.95", "port": 4251},
                 {"ip": "106.112.124.153", "port": 4278}, {"ip": "125.79.200.156", "port": 4281}]
    coon = connect(host='localhost', port=3306, db='boss_zhi_pin',
                   user='root', passwd='password', charset='utf8')
    cur = coon.cursor()
    sql_order = """insert into boss_position_4 (search_word,city_name,position_name,company_name,areas,street,company_style,scale,
                   xue_li,work_experience,financing_situation,benefits,salary,salary_month,
                   salary_min,salary_max,average_salary) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # insert one row of (Chinese) column labels as the first record
    params1 = ['岗位', '城市', '岗位具体名称', '公司名称', '所在区域', '所在街道', '公司类型', '公司规模', '学历要求', '工作经验', '融资情况', '福利待遇', '薪资(K)',
               '发放月数', '最低薪资', '最高薪资', '平均薪资']
    cur.execute(sql_order, params1)
    coon.commit()
    chrome_options = Options()
    # chrome_options.add_argument("--headless")  # run the browser headless
    chrome_options.add_argument("log-level=3")  # silence browser console output; this site calls console.log a lot
    # chrome_options.add_argument("--proxy-server=http://223.8.208.16:4218")  # optional proxy
    # chrome_options.add_argument('blink-settings=imagesEnabled=false')  # optionally disable image loading
    driver = webdriver.Chrome(options=chrome_options)
    print('Log in now if you want; skipping the login also works, you just get less data')
    driver.get('https://www.zhipin.com/beijing/')
    # url codes of the target cities
    city_code_list = ['101010100', '101020100', '101280100', '101280600', '101210100', '101030100', '101110100',
                      '101190400', '101200100', '101230200', '101250100', '101270100', '101180100', '101040100']
    page = 0        # current page counter within a city
    num = 0         # records scraped
    insert_num = 0  # records inserted into MySQL
    for n in ['python', 'java']:  # search keywords
        for index, city in enumerate(city_code_list):
            get_page_data(f"https://www.zhipin.com/c{city}/?query={n}", n, index)
            print('city', index, city)
    cur.close()
    coon.close()
    print('scraped', num, 'records, inserted', insert_num)