python-selenium爬取51job获取求职信息

最新推荐文章于 2024-01-12 23:25:49 发布

转载 · 最新推荐文章于 2024-01-12 23:25:49 发布 · 1.4k 阅读

14 ·

CC 4.0 BY-SA版权

原文链接：https://www.cnblogs.com/python147/p/14548882.html

文章标签：

#python #爬虫

Python 同时被 2 个专栏收录

278 篇文章

订阅专栏

爬虫

20 篇文章

订阅专栏

前言

本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理。

PS：如有需要Python学习资料的小伙伴可以点击下方链接自行获取

Python免费学习资料、代码以及交流解答点击即可加入

没有赶上秋招的同学，今年的春招马上就要开始了。可是，该怎么获取想要的求职信息呢？各大求职网站信息繁多，快速获取数据，并从中筛选出我们想要的工作地点、岗位和薪资，是求职的第一步。

本次爬虫以51job为例，爬取51job上职位为“数据分析”的所有相关招聘信息。

'''
如有需要Python学习资料的小伙伴可以加群领取：1136201545
'''

from selenium import webdriver
import openpyxl

# Shared Selenium Chrome driver used by get_data() and NodeExists(); it is
# instantiated as soon as this module is loaded.
wd = webdriver.Chrome()
# Accumulates one list per scraped posting: appended to by get_data() and
# written out by save2excel().
lst = []

def get_data(first_page=1, last_page=2, rows_per_page=50):
    """Scrape 51job search results and append one record per posting to ``lst``.

    Each record is ``[position, job title, publish date, salary, basic info,
    benefits, posting URL, company, company type]``. ``position`` is the
    1-based slot of the posting on its result page, so it restarts on every
    page. Every scraped value is also printed as it is read.

    The shared driver ``wd`` is shut down when this function finishes, whether
    it returns normally or raises.

    Args:
        first_page: Number of the first result page to fetch.
        last_page: Number of the last result page to fetch (inclusive).
        rows_per_page: Maximum number of result slots to read on each page.
    """
    search_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,{0}.html?lang=c&postchannel=0000&workyear=01%252c06&cotype=99&degreefrom=04&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # Absolute XPath of the N-th result row; every field below hangs off it.
    row_xpath = '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[1]/div[{0}]'

    def text_at(xpath):
        # 'xpath' is the locator-strategy string (the value of By.XPATH).
        # find_element(by, value) is used because the find_element_by_*
        # shortcuts were removed in Selenium 4.3.
        return wd.find_element('xpath', xpath).text

    # The implicit wait is a session-wide setting, so configure it once
    # instead of on every page.
    wd.implicitly_wait(10)
    try:
        for page in range(first_page, last_page + 1):
            wd.get(search_url.format(page))
            for position in range(1, rows_per_page + 1):
                row = row_xpath.format(position)

                # The row's link doubles as an existence probe: a page with
                # fewer rows than expected ends here instead of raising and
                # discarding everything collected so far.
                links = wd.find_elements('xpath', row + '/a')
                if not links:
                    break

                # Job title
                zhiwei = text_at(row + '/a/p[1]/span[1]')
                print(zhiwei)
                # Publish date
                fabu_date = text_at(row + '/a/p[1]/span[2]')
                print(fabu_date)
                # Salary
                salary = text_at(row + '/a/p[2]/span[1]')
                print(salary)
                # Region | experience | education | headcount
                basic_info = text_at(row + '/a/p[2]/span[2]')
                print(basic_info)
                # Benefits: some postings list none, so the element may be
                # absent. find_elements returns an empty list in that case,
                # which avoids a second lookup and any exception handling.
                welfare_nodes = wd.find_elements('xpath', row + '/a/p[3]/span/i')
                welfare = welfare_nodes[0].text if welfare_nodes else ''
                print(welfare)
                # Posting URL
                zw_url = links[0].get_attribute('href')
                print(zw_url)
                # Company
                company = text_at(row + '/div[2]/a')
                print(company)
                # Company type
                company_type = text_at(row + '/div[2]/p')
                print(company_type)
                print(position)
                lst.append([position, zhiwei, fabu_date, salary, basic_info,
                            welfare, zw_url, company, company_type])
    finally:
        # quit() ends the whole driver session, not just the current window,
        # and running it in ``finally`` releases the browser on errors too.
        wd.quit()
    

# Element-existence check; the original try/except approach was taken from
# https://www.cnblogs.com/KHZ521/p/14265235.html
def NodeExists(xpath):
    """Return True if at least one element on the current page matches *xpath*.

    Uses ``find_elements``, which returns an empty list when nothing matches,
    so no exception handling is needed. Unlike a bare ``except``, this does not
    turn unrelated failures (a missing driver method, a dead session, Ctrl-C)
    into a silent ``False``; those now propagate to the caller.

    Args:
        xpath: XPath expression to look up with the shared driver ``wd``.

    Returns:
        bool: True when the lookup finds one or more elements, else False.
    """
    # 'xpath' is the locator-strategy string (the value of By.XPATH).
    return bool(wd.find_elements('xpath', xpath))

def save2excel():
    """Write a header row plus every record in ``lst`` to ``51job.xlsx``.

    A progress message is printed after the workbook is created, after the
    active sheet is obtained, and after the file has been saved.
    """
    workbook = openpyxl.Workbook()
    print('open excel')
    worksheet = workbook.active
    print('open sheet')
    header = ['序号', '职位', '发布日期', '薪资', '基本信息', '福利', 'url', '公司', '公司类型']
    # Each list becomes one spreadsheet row; its items fill successive columns.
    for record in [header, *lst]:
        worksheet.append(record)
    workbook.save('51job.xlsx')
    print('保存文件')
def main():
    """Run the scrape, then export everything collected in ``lst`` to Excel."""
    get_data()
    save2excel()


if __name__ == '__main__':
    main()