# BOSS Zhipin job scraper | Python (for learning practice only)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import os
import time
import random
import pandas as pd
# Resolve the ChromeDriver executable path.
driver_path = ChromeDriverManager().install()
# Some webdriver_manager versions return the path of a sibling file in the
# driver directory instead of the executable itself, so normalize it (Windows).
if not driver_path.endswith("chromedriver.exe"):
    driver_path = os.path.join(os.path.dirname(driver_path), "chromedriver.exe")
# Configure Chrome options.
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--headless")  # enable headless mode
chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # hide the Selenium automation flag
chrome_options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36")  # set a browser-like user agent
# Launch the browser.
driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)
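# Optional hardening (an assumption, not part of the original script): even with
# the flag above, headless Chrome still exposes navigator.webdriver, which some
# sites check. execute_cdp_cmd is available on Chromium-based drivers and can
# clear the property before any page script runs. Uncomment to try it:
# driver.execute_cdp_cmd(
#     "Page.addScriptToEvaluateOnNewDocument",
#     {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
# )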
# Collected job records.
job_data = []
# Helper: sleep for a random interval to mimic human browsing.
def random_sleep(min_time=2, max_time=5):
    sleep_time = random.uniform(min_time, max_time)
    print(f"Sleeping for {sleep_time:.2f} s...")
    time.sleep(sleep_time)
# Helper: scrape the job listings on the current page.
def scrape_page(driver):
    job_cards = driver.find_elements(By.CSS_SELECTOR, '.job-card-wrapper')
    for card in job_cards:
        try:
            company_name = card.find_element(By.CSS_SELECTOR, '.company-info .company-name a').text.strip()
            job_title = card.find_element(By.CSS_SELECTOR, '.job-title .job-name').text.strip()
            job_location = card.find_element(By.CSS_SELECTOR, '.job-area-wrapper .job-area').text.strip()
            salary = card.find_element(By.CSS_SELECTOR, '.salary').text.strip()
            experience_education = card.find_element(By.CSS_SELECTOR, '.tag-list').text.strip()
            try:
                skill_requirements = card.find_element(By.CSS_SELECTOR, '.job-card-footer .tag-list').text.strip()
            except NoSuchElementException:
                skill_requirements = 'Not specified'
            try:
                company_benefits = card.find_element(By.CSS_SELECTOR, '.info-desc').text.strip()
            except NoSuchElementException:
                company_benefits = 'None listed'
            job_data.append({
                'Company': company_name,
                'Job Title': job_title,
                'Location': job_location,
                'Salary': salary,
                'Experience/Education': experience_education,
                'Skill Requirements': skill_requirements,
                'Company Benefits': company_benefits
            })
        except Exception as e:
            print(f"Error while scraping a job card: {e}")
# Helper: read the total page count from the pagination bar.
def get_max_page(driver):
    try:
        pager = driver.find_element(By.CSS_SELECTOR, '.pager .options-pages')
        pages = pager.find_elements(By.TAG_NAME, 'a')
        max_page = int(pages[-2].text)  # the second-to-last link holds the last page number
        print(f"Total pages: {max_page}")
        return max_page
    except Exception as e:
        print(f"Error while reading the page count: {e}")
        return 1  # fall back to a single page
# Scrape every page.
try:
    url_template = "https://www.zhipin.com/web/geek/job?query=python&city=101280600&page={}"
    driver.get(url_template.format(1))
    random_sleep()
    max_page = get_max_page(driver)
    # Note: page 1 is fetched again in the loop below; one redundant request,
    # kept for simplicity.
    for page in range(1, max_page + 1):
        print(f"Scraping page {page}...")
        driver.get(url_template.format(page))
        random_sleep(2, 5)  # random delay between requests
        scrape_page(driver)
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    driver.quit()
# Save the collected data to an Excel file.
df = pd.DataFrame(job_data)
df.to_excel('python.xlsx', index=False)
print("Data scraped and saved to 'python.xlsx'")