from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import os
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
# Initialize the browser
def init_browser():
    # Path to the ChromeDriver executable
    driver_path = ChromeDriverManager().install()
    if not driver_path.endswith("chromedriver.exe"):
        driver_path = os.path.join(os.path.dirname(driver_path), "chromedriver.exe")
    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--headless")  # run in headless mode
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # hide the Selenium automation flag
    chrome_options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36")  # set a regular browser user agent
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)
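# Note (hedged aside, not part of the original script): Selenium 4.6+ ships with
# Selenium Manager, which can resolve a matching ChromeDriver automatically. On
# such versions the webdriver_manager path handling above is optional and the
# browser could be created simply as (untested sketch):
#     return webdriver.Chrome(options=chrome_options)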
# Scrape job listings from the current page
def scrape_page(driver):
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    job_cards = soup.find_all('div', class_='job-card-pc-container')
    jobs = []
    for card in job_cards:
        job_title = card.find('div', class_='job-title-box').div.text.strip()
        location = card.find('div', class_='job-dq-box').span.next_sibling.text.strip()
        salary_tag = card.find('span', class_='job-salary')
        salary = salary_tag.text.strip() if salary_tag else 'N/A'
        # Experience and education share the same labels box; guard against it being absent
        labels_box = card.find('div', class_='job-labels-box')
        labels = labels_box.find_all('span') if labels_box else []
        experience = labels[0].text.strip() if len(labels) > 0 else 'N/A'
        education = labels[1].text.strip() if len(labels) > 1 else 'N/A'
        company_tag = card.find('span', class_='company-name')
        company_name = company_tag.text.strip() if company_tag else 'N/A'
        company_tags_box = card.find('div', class_='company-tags-box')
        company_details = company_tags_box.text.strip() if company_tags_box else 'N/A'
        link_tag = card.find('a', {'data-nick': 'job-detail-job-info'})
        job_link = link_tag['href'] if link_tag else 'N/A'
        jobs.append({
            '职位名称': job_title,
            '工作地点': location,
            '薪资': salary,
            '工作经验': experience,
            '学历要求': education,
            '公司名称': company_name,
            '公司详情': company_details,
            '职位链接': job_link
        })
    return jobs
# Main program
def main():
    driver = init_browser()
    driver.get("https://www.liepin.com/zhaopin/?city=050090&dq=050090&pubTime=&currentPage=0&pageSize=40&key=后端")
    time.sleep(random.uniform(3, 5))  # random delay to let the page load
    # Read the total number of pages from the pagination widget
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    pagination = soup.find('ul', class_='ant-pagination')
    total_pages = int(pagination.find_all('li', class_='ant-pagination-item')[-1].text.strip()) if pagination else 1
    all_jobs = []
    for page in range(total_pages):
        driver.get(
            f"https://www.liepin.com/zhaopin/?city=050090&dq=050090&pubTime=&currentPage={page}&pageSize=40&key=后端")
        time.sleep(random.uniform(3, 5))  # random delay to avoid getting blocked
        jobs = scrape_page(driver)
        all_jobs.extend(jobs)
        print(f"Scraped page {page + 1} of {total_pages}.")
    # Save the results to Excel
    df = pd.DataFrame(all_jobs)
    df.to_excel('后端.xlsx', index=False)
    print("All job listings have been saved to 后端.xlsx.")
    driver.quit()
if __name__ == "__main__":
    main()
Liepin job scraper | Python (for learning and practice only)
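As a quick sanity check after a run, the generated workbook can be read back with pandas; this sketch assumes the usual dependencies are installed (e.g. pip install selenium webdriver-manager beautifulsoup4 pandas openpyxl, where openpyxl is the Excel engine pandas uses for .xlsx files):

# Sketch only: preview the scraped results
import pandas as pd

df = pd.read_excel('后端.xlsx')
print(df.head())                      # first few job records
print(f"{len(df)} rows scraped in total")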