from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from openpyxl import Workbook, load_workbook
import time
import os
import datetime
# Build the Chrome options and launch the browser driver used by the whole script.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--disable-gpu",
    "--no-sandbox",
    "--disable-extensions",
    "--disable-dev-shm-usage",
):
    # Each entry is passed to Chrome as a command-line switch, in this order.
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)
# Path of the Excel workbook that receives the scraped rows.
excel_file = "中芯国际招聘.xlsx"

# On first use the workbook does not exist yet: create it and write the
# header row (job title, work location, publish time, skill requirements).
workbook_missing = not os.path.exists(excel_file)
if workbook_missing:
    header_row = ["岗位名称", "工作地点", "发布时间", "技能需求"]
    wb = Workbook()
    ws = wb.active
    ws.append(header_row)
    wb.save(excel_file)
def write_to_excel(data):
    """Append one row to the active sheet of ``excel_file`` and save it.

    The workbook is re-opened and saved on every call, so each row is
    written to disk as soon as this function returns.

    Args:
        data: The cell values of the row to append, in column order.
    """
    workbook = load_workbook(excel_file)
    workbook.active.append(data)
    workbook.save(excel_file)
# Highest list page that is scraped; the loop stops once page_index exceeds it.
MAX_PAGES = 19
# Seconds each explicit WebDriverWait may block before raising.
WAIT_SECONDS = 10
# CSS selector matching one job entry on a list page.
JOB_ITEM_SELECTOR = ".bg1.szbg1 .zwbox li"


def _read_job_detail(browser, timeout=WAIT_SECONDS):
    """Collect the exported fields from the job-detail view currently shown.

    Args:
        browser: WebDriver whose current page displays a job detail.
        timeout: Seconds each explicit wait may block.

    Returns:
        ``[job_title, work_location, publish_time, skill_requirements]`` as
        strings. A field is ``""`` when its element is not found at the
        expected position.

    Raises:
        selenium.common.exceptions.TimeoutException: a waited-for element
            did not appear within ``timeout`` seconds.
    """
    wait = WebDriverWait(browser, timeout)

    # Job title: keep only the text before the first '分' character.
    # NOTE(review): presumably this drops a trailing "分享" (share) label
    # rendered inside the title element - confirm against the page.
    title_element = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, ".xqbox .xqtitle.pr.flex-between.align-center")
        )
    )
    job_title = title_element.text.split('分')[0].strip()

    # Work location and publish time are read from the second <ul> of the
    # detail box: first <li> is the update time, second <li> the location.
    work_location = ""
    pub_time = ""
    ul_elements = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".xqbox ul"))
    )
    if len(ul_elements) > 1:
        li_elements = ul_elements[1].find_elements(By.TAG_NAME, "li")
        # Guard each index so a shorter list yields blanks instead of an
        # IndexError that would abort the whole run.
        if li_elements:
            pub_time = li_elements[0].text.replace("更新时间:", "").strip()
        if len(li_elements) > 1:
            work_location = li_elements[1].text.replace("工作地点:", "").strip()

    # Skill requirements are the text of the second ".xqm" section.
    xqm_elements = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".xqbox .xqm"))
    )
    skill_requirements = xqm_elements[1].text.strip() if len(xqm_elements) > 1 else ""

    return [job_title, work_location, pub_time, skill_requirements]


try:
    page_index = 1
    while True:
        # Check the page limit before navigating, so no page is loaded
        # only to be discarded.
        if page_index > MAX_PAGES:
            print(f"已达到第{MAX_PAGES + 1}页,停止抓取。")
            break

        url = f"https://smics.zhiye.com/social/?PageIndex={page_index}"
        driver.get(url)
        print(f"正在访问第 {page_index} 页:{url}")
        time.sleep(2)

        # Wait until the job list is present to learn how many entries to visit.
        zw_boxes = WebDriverWait(driver, WAIT_SECONDS).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, JOB_ITEM_SELECTOR))
        )

        # Iterate by index: the script navigates away and back for every
        # entry, so the elements are looked up again on each iteration
        # instead of reusing references from before the navigation.
        for index in range(len(zw_boxes)):
            current_items = WebDriverWait(driver, WAIT_SECONDS).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, JOB_ITEM_SELECTOR))
            )
            if index >= len(current_items):
                # The re-fetched list is shorter than the first fetch;
                # nothing left to visit on this page.
                break
            item = current_items[index]

            driver.execute_script("arguments[0].scrollIntoView(true);", item)
            time.sleep(0.5)
            ActionChains(driver).move_to_element(item).click().perform()
            time.sleep(1)  # give the detail view time to load

            try:
                row = _read_job_detail(driver)
                job_title_full, work_location, pub_time_str, skill_requirements = row

                # Echo to the console and persist the row.
                print(f"岗位名称: {job_title_full}")
                print(f"工作地点: {work_location}, 发布时间: {pub_time_str}")
                print(f"技能需求: {skill_requirements}\n")
                write_to_excel(row)
            finally:
                # Always return to the list page, even if extraction failed.
                driver.back()
                time.sleep(1)

        try:
            # The pagination links live inside this container.
            container = WebDriverWait(driver, WAIT_SECONDS).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.flex-center.align-center.page.pt30")
                )
            )
            # Pick the first link whose text is exactly "下一页" (next page).
            next_page_btn = next(
                (
                    link
                    for link in container.find_elements(By.TAG_NAME, "a")
                    if link.text == "下一页"
                ),
                None,
            )
            if next_page_btn is None:
                print("找不到标记为'下一页'的按钮,可能是最后一页。")
                break

            # NOTE(review): the next loop iteration calls driver.get() with
            # the new PageIndex, so this click navigates a second time. It
            # is kept to preserve existing behaviour - confirm whether it
            # can be dropped.
            driver.execute_script("arguments[0].scrollIntoView(true);", next_page_btn)
            time.sleep(0.5)  # let the scroll settle
            ActionChains(driver).move_to_element(next_page_btn).click().perform()
            time.sleep(2)  # allow the next page to load
            page_index += 1
        except Exception as e:
            print("无法点击下一页按钮,可能是最后一页或发生其他错误:", str(e))
            break
except Exception as e:
    # Top-level boundary: report the failure; the browser is closed below.
    print(f"发生异常: {e}")
finally:
    driver.quit()
    print("浏览器已关闭,数据抓取结束。")