# Imports
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # keyboard key constants
# Save the scraped rows as CSV. The file is opened in append mode so the rows
# of several runs accumulate in the same file. The handle `f` stays open for
# the lifetime of the script because `csv_write` is used by later code.
f = open('数据库维护.csv', mode='a', encoding='utf8', newline='')
csv_write = csv.DictWriter(
    f,
    fieldnames=[
        '招聘岗位', '发布时间', '工资情况', '技术要求',
        '员工福利', '公司规模',
    ],
)
# In append mode the stream is positioned at the end of the file, so a
# position of 0 means the file is new or empty. Only then is the header
# written; writing it unconditionally would insert a fresh header row in the
# middle of the data on every rerun.
if f.tell() == 0:
    csv_write.writeheader()
# Configure Chrome so that its 'enable-automation' switch is excluded.
opt = Options()
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
# Create the single browser session used by the rest of the script and pass
# the options to it. Previously a configured browser was launched and
# discarded, while `driver` itself was created without the options.
driver = webdriver.Chrome(options=opt)
# driver.find_element(By.XPATH, '//ul[@class="clearfix"]/li/a').click()
def get_product(word):
    """Choose a city on the landing page and run a job search for *word*.

    Clicks the first element matching the city-list XPath, types *word* into
    the search box, clicks the search button, then sets a 10-second implicit
    wait on the driver and maximizes the browser window.

    NOTE(review): the pasted source had lost its indentation; the implicit
    wait and maximize calls are assumed to belong to this function.

    Args:
        word: The job title / keyword to type into the search box.
    """
    # Select a city.
    driver_city = driver.find_element(By.XPATH, '//ul[@class="clearfix"]/li/a')
    driver_city.click()
    # Search box: send the argument itself. The original sent the module-level
    # `keyword` and silently ignored the `word` parameter.
    driver.find_element(By.CSS_SELECTOR, '#search_input').send_keys(word)
    # Click the search button to confirm.
    driver.find_element(By.CSS_SELECTOR, '#search_button').click()
    # Later element lookups wait up to 10 seconds for elements to appear.
    driver.implicitly_wait(10)
    driver.maximize_window()  # maximize the browser window
# Scrolling
def drop_down():
    """Scroll the page downwards in six steps, sleeping 1 s before each step.

    Each step sets ``scrollTop`` to a fraction (1/9, 3/9, ... 11/9) of the
    document's current ``scrollHeight``.
    """
    script_template = (
        'document.documentElement.scrollTop = '
        'document.documentElement.scrollHeight * {:f}'
    )
    for numerator in (1, 3, 5, 7, 9, 11):
        time.sleep(1)
        driver.execute_script(script_template.format(numerator / 9))
# Parsing
def _text_or_default(element, xpath, default='找不到信息'):
    """Return the text of the first node under *element* matching *xpath*.

    If the WebDriver lookup fails (for example because the node is missing),
    *default* is returned instead. Only WebDriver errors are trapped, so
    Ctrl-C and programming errors are no longer swallowed as they were by the
    original bare ``except:`` clauses.
    """
    try:
        return element.find_element(By.XPATH, xpath).text
    except WebDriverException:
        return default


def content():
    """Scrape every job card on the current page into the CSV file.

    Scrolls the page with ``drop_down()``, collects the child ``div`` elements
    of the job list container, extracts six fields from each card (using a
    placeholder text for any field that cannot be found), writes the row via
    ``csv_write`` and echoes it to stdout.

    A card whose row cannot be written or printed is reported and skipped so
    that one bad record does not stop the run.

    NOTE(review): the pasted source had lost its indentation; the 0.5 s pause
    is assumed to run once per card (inside the loop) -- confirm.
    """
    drop_down()
    cards = driver.find_elements(By.XPATH, '//div[@class="list__YibNq"]/div')
    for card in cards:
        try:
            # Dict keys must match the fieldnames given to the DictWriter.
            row = {
                # Job title
                '招聘岗位': _text_or_default(
                    card, './div[1]/div/div[1]/a', '找不到这个岗位'),
                # Publication time
                '发布时间': _text_or_default(card, './div[1]/div/div/span'),
                # Salary
                '工资情况': _text_or_default(card, './div/div/div[2]/span'),
                # Required skills
                '技术要求': _text_or_default(card, './div[2]/div/span'),
                # Benefits, with any literal "" pairs removed
                '员工福利': _text_or_default(
                    card, './div[2]/div[2]').replace('""', ''),
                # Company size
                '公司规模': _text_or_default(card, './div/div[2]/div[2]'),
            }
            csv_write.writerow(row)
            print(*row.values())
        except Exception as exc:
            # Best effort: keep going, but say which failure caused the skip
            # instead of discarding it silently.
            print(f'该条数据采集失败,已跳过: {exc}')
        # Short pause between cards to reduce the chance of being detected.
        time.sleep(0.5)
# Pagination
def get_next():
    """Click the "next page" link in the job list's pagination bar."""
    next_link_css = (
        '#jobList > div.pagination__1L2PP > ul > li.lg-pagination-next > a'
    )
    next_link = driver.find_element(By.CSS_SELECTOR, next_link_css)
    next_link.click()
# Script entry: ask for a job title, search for it and scrape the first pages
# of results. Everything runs inside try/finally so the browser is shut down
# and the CSV file is flushed and closed even if scraping fails or the user
# interrupts the run.
try:
    # `keyword` stays a module-level name because other code in this file
    # may read it.
    keyword = input("请输入岗位名称:")
    driver.get('https://www.lagou.com/')
    get_product(keyword)
    last_page = 5
    # Visit result pages 1..last_page.
    for page in range(1, last_page + 1):
        print(f'===============正在采集第{page}页的数据内容======================')
        # content() scrolls the page itself via drop_down(), so the extra
        # drop_down() call that used to precede it is not repeated here.
        content()
        if page == last_page:
            # Nothing left to scrape; do not click "next" after the last page.
            break
        try:
            get_next()
        except WebDriverException as exc:
            # Fewer result pages than expected (or the link is not clickable):
            # stop cleanly instead of crashing.
            print(f'无法翻到下一页,提前结束采集: {exc}')
            break
        time.sleep(0.5)
finally:
    # Shut down the driver, then close the CSV file even if quit() fails.
    try:
        driver.quit()
    finally:
        f.close()
# Full code for scraping the 某钩 job site (爬取某钩网站全代码)
# Blog page footer: latest recommended article published 2025-05-17 17:07:22