import time
import pandas as pd
from lxml import etree
from selenium import webdriver
# Note: this function reuses the global driver created below
def get_web_data(dom=None):
    # Each job card carries its postId in the id attribute of a qr-code div
    href = dom.xpath('//div[@class="qr-code"]/@id')
    names = []         # job title (岗位名称)
    category = []      # category (类别)
    place = []         # work location (工作地点)
    categories = []    # service type (服务类型)
    date = []          # publication date (发布时间)
    duty = []          # responsibilities (职责)
    requirements = []  # requirements (要求)
    for i in href:
        href_url = 'https://careers.tencent.com/jobdesc.html?postId={}'.format(i)
        driver.get(href_url)
        time.sleep(1)  # the detail page is rendered by JavaScript; give it a moment
        dom = etree.HTML(driver.page_source)
        if dom.xpath('//title/text()') == ['404 | 腾讯招聘']:
            continue  # the posting has been taken down; skip it
        name = dom.xpath('//div[@class="work-title"]/text()')                   # job title
        info = dom.xpath('//div[@class="work-wrapper"]/p/span/text()')          # job metadata
        q = dom.xpath('//div[@class="duty work-module"]//ul/li/text()')         # responsibilities
        w = dom.xpath('//div[@class="requirement work-module"]//ul/li/text()')  # requirements
        names.append(name[0])
        category.append(info[0])
        place.append(info[2])
        categories.append(info[4])
        date.append(info[-1])
        duty.append(q)
        requirements.append(w)
    # Join each posting's list of bullet points into a single string,
    # appending inside the loop so every posting keeps its own text
    dutys = []
    requirement = []
    for i in duty:
        dutys.append(''.join(i))
    for i in requirements:
        requirement.append(''.join(i))
    results = {
        '岗位名称': names,
        '类别': category,
        '工作地点': place,
        '服务类型': categories,
        '发布时间': date,
        '职责': dutys,
        '要求': requirement
    }
    data = pd.DataFrame(results)
    return data
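One caveat on timing: driver.page_source is read right after get() returns, but both the listing and the detail pages render their content with JavaScript, so a fixed sleep is a blunt instrument. A more robust alternative is Selenium's explicit waits; here is a minimal sketch keyed to the same qr-code and work-title classes the XPath queries above already rely on:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_class(driver, class_name, timeout=10):
    # Block until at least one element with the given class is present,
    # so that driver.page_source contains the rendered markup
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, class_name))
    )

# usage: wait_for_class(driver, 'qr-code') after loading a listing page,
# or wait_for_class(driver, 'work-title') after loading a detail page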
# Scrape and parse every listing page, then save the combined results
all_data = pd.DataFrame()  # accumulator for the results of every page
i = 0
driver = webdriver.Chrome()
while i <= 400:
    url = ('https://careers.tencent.com/search.html?query=ot_40001001,ot_40001002,'
           'ot_40001003,ot_40001004,ot_40001005,ot_40001006,ot_40003001,ot_40003002,'
           'ot_40003003&index={}'.format(i + 1))
    driver.get(url)
    time.sleep(1)  # the listing page is rendered by JavaScript as well
    dom = etree.HTML(driver.page_source)
    data = get_web_data(dom=dom)
    all_data = pd.concat([all_data, data], ignore_index=True)
    i += 1
    print(i, end=' ')  # simple progress indicator
all_data.to_csv('最终爬取.csv', encoding='utf-8-sig')  # utf-8-sig adds a BOM so Excel displays the Chinese headers correctly
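As a quick sanity check, the saved file can be read straight back; a minimal sketch ('最终爬取.csv' is simply the filename used in the to_csv call above):

check = pd.read_csv('最终爬取.csv', encoding='utf-8-sig', index_col=0)
print(check.shape)             # one row per job posting, 7 columns
print(check.columns.tolist())  # the seven column names defined in results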