Code
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
def initialize_driver(driver_path):
    """Initialize the Selenium Edge driver."""
    service = Service(driver_path)
    driver = webdriver.Edge(service=service)
    return driver
def fetch_page_data(driver):
    """Scrape the ranking rows from the current page."""
    rows = driver.find_elements(By.CSS_SELECTOR, "tr")
    page_data = []
    for row in rows[1:]:  # skip the header row
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) >= 6:  # six cells are read below, so require at least six
            data = [
                cols[0].text.strip(),  # rank
                cols[1].text.strip(),  # school name
                cols[2].text.strip(),  # country/region
                cols[3].text.strip(),  # country/region rank
                cols[4].text.strip(),  # total score
                cols[5].text.strip()   # alumni award score
            ]
            page_data.append(data)
    return page_data
def get_total_pages(driver):
    """Read the total page count from the pagination widget (dynamic lookup)."""
    pagination = driver.find_elements(By.CSS_SELECTOR, "a.ant-pagination-item")
    if pagination:
        return int(pagination[-1].text.strip())  # the last pagination item shows the total page count
    return 1
def navigate_and_scrape_data(driver, base_url):
    """Navigate through every page and scrape its data."""
    driver.get(base_url)
    # CSV header row, matching the six cells read in fetch_page_data
    all_data = [["排名", "学校名称", "国家/地区", "国家/地区排名", "总分", "校友获奖"]]
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ant-pagination")))
    # Page count is hard-coded: get_total_pages() comes up empty on this page (see the Note below)
    total_pages = 34
    print(f"Total pages: {total_pages}")
    for page in range(1, total_pages + 1):
        print(f"Fetching data from page {page}")
        page_data = fetch_page_data(driver)
        all_data.extend(page_data)
        # Click the next-page button
        if page < total_pages:
            next_button = driver.find_element(By.CSS_SELECTOR, "li.ant-pagination-next")
            # Make sure the button is not disabled
            if "ant-pagination-disabled" not in next_button.get_attribute("class"):
                next_button.click()
                time.sleep(2)  # wait for the new page to load
            else:
                print("Next button is disabled.")
                break
    return all_data
def save_to_csv(data, filename):
    """Save the data to a CSV file."""
    pd.DataFrame(data).to_csv(filename, index=False, header=False, encoding='utf-8-sig')
if __name__ == "__main__":
    edge_driver_path = r'C:\Program Files (x86)\Microsoft\Edge\edgedriver_win64\msedgedriver.exe'
    driver = initialize_driver(edge_driver_path)
    base_url = "https://www.shanghairanking.cn/rankings/arwu/2024"
    all_data = navigate_and_scrape_data(driver, base_url)
    print(all_data)
    save_to_csv(all_data, "./college.csv")
    driver.quit()
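One refinement worth noting before the analysis: the fixed time.sleep(2) after each click trades speed for safety. An explicit wait can release as soon as the old rows are detached from the DOM. A minimal sketch, assuming the table body is re-rendered on every page change (click_next_and_wait is a hypothetical helper, not part of the script above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_and_wait(driver, timeout=10):
    """Click the next-page button, then block until the first old row
    goes stale, i.e. the table has been re-rendered."""
    old_row = driver.find_element(By.CSS_SELECTOR, "tr")
    driver.find_element(By.CSS_SELECTOR, "li.ant-pagination-next").click()
    WebDriverWait(driver, timeout).until(EC.staleness_of(old_row))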
Analysis
The scraper first locates every table row with the tr selector, then pulls each row's cells with the td tag. It then reads the pages one at a time, accumulating the rows and finally writing everything out to CSV; a self-contained demonstration of the tr/td pattern follows.
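To make that pattern concrete, here is a minimal sketch run against an inline data: URL page; the HTML content is made up for illustration, and only the extraction loop mirrors the script above (Selenium 4+ is assumed, so the driver binary is resolved automatically):

from selenium import webdriver
from selenium.webdriver.common.by import By

# A throwaway page with one header row and two data rows
html = ("data:text/html,<table>"
        "<tr><th>Rank</th><th>School</th></tr>"
        "<tr><td>1</td><td>Harvard</td></tr>"
        "<tr><td>2</td><td>Stanford</td></tr>"
        "</table>")

driver = webdriver.Edge()
driver.get(html)
for row in driver.find_elements(By.CSS_SELECTOR, "tr")[1:]:  # skip the header row
    cells = [td.text.strip() for td in row.find_elements(By.TAG_NAME, "td")]
    print(cells)  # ['1', 'Harvard'] then ['2', 'Stanford']
driver.quit()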
Note
The dynamic page-count lookup in this code does not actually find anything, so in practice the total page count has to be edited by hand; the overall scraping result is the same either way. A more defensive variant is sketched below.
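If one still wants the lookup to work, a plausible cause of the failure is that the Ant Design pagination renders its numbers inside li.ant-pagination-item elements, and only after the widget mounts, so the a.ant-pagination-item selector can come up empty. A hedged sketch that waits for the items, tries the li-based selector, and falls back to the manual count; both the selector and the fallback value are assumptions, not verified against the live page:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_total_pages_robust(driver, fallback=34, timeout=10):
    """Wait for the pagination items to render and read the last page number;
    fall back to the manually observed count if nothing shows up."""
    try:
        items = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "li.ant-pagination-item")))
        return int(items[-1].text.strip())  # last item carries the final page number
    except Exception:
        return fallback  # e.g. the 34 pages observed by hand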