Scraping the ShanghaiRanking (软科) rankings with Python, with a summary

Code

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time


def initialize_driver(driver_path):
    """初始化 Selenium Edge 驱动"""
    service = Service(driver_path)
    driver = webdriver.Edge(service=service)
    return driver


def fetch_page_data(driver):
    """从当前页面抓取数据"""
    rows = driver.find_elements(By.CSS_SELECTOR, "tr")
    page_data = []

    for row in rows[1:]:  # skip the header row
        cols = row.find_elements(By.TAG_NAME, "td")
        if len(cols) >= 6:  # six columns are needed, through the alumni-award indicator
            data = [
                cols[0].text.strip(),  # rank
                cols[1].text.strip(),  # university name
                cols[2].text.strip(),  # country/region
                cols[3].text.strip(),  # national/regional rank
                cols[4].text.strip(),  # total score
                cols[5].text.strip(),  # alumni award
            ]
            page_data.append(data)

    return page_data


def get_total_pages(driver):
    """动态获取总页数"""
    pagination = driver.find_elements(By.CSS_SELECTOR, "a.ant-pagination-item")
    if pagination:
        return int(pagination[-1].text.strip())  # 获取分页最后一个数字为总页数
    return 1


def navigate_and_scrape_data(driver, base_url):
    """导航至每一页并抓取数据"""
    driver.get(base_url)
    all_data = [["排名", "学校名称", "国家/地区", "国家/地区排名", "总分", "校友获奖"]]  # CSV header row, matching the ARWU columns scraped above

    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ant-pagination")))

    # get_total_pages(driver) finds nothing on this page (see the note below), so set the count manually
    total_pages = 34
    print(f"Total pages: {total_pages}")

    for page in range(1, total_pages + 1):
        print(f"Fetching data from page {page}")
        page_data = fetch_page_data(driver)
        all_data.extend(page_data)

        # click the "next page" button
        if page < total_pages:
            next_button = driver.find_element(By.CSS_SELECTOR, "li.ant-pagination-next")

            # make sure the button is not disabled
            if "ant-pagination-disabled" not in next_button.get_attribute("class"):
                next_button.click()
                time.sleep(2)  # wait for the new page to load
            else:
                print("Next button is disabled.")
                break

    return all_data


def save_to_csv(data, filename):
    """将数据保存到 CSV 文件"""
    pd.DataFrame(data).to_csv(filename, index=False, header=False, encoding='utf-8-sig')


if __name__ == "__main__":
    edge_driver_path = r'C:\Program Files (x86)\Microsoft\Edge\edgedriver_win64\msedgedriver.exe'
    driver = initialize_driver(edge_driver_path)

    base_url = "https://www.shanghairanking.cn/rankings/arwu/2024"
    all_data = navigate_and_scrape_data(driver, base_url)

    print(all_data)
    save_to_csv(all_data, "./college.csv")
    driver.quit()

Analysis

We first use the tr selector to retrieve the table rows, then td within each row to pull out the columns.
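One caveat: find_elements returns whatever rows exist at the moment of the call, so on a slow render the table can come back empty. A minimal sketch of waiting for the rows first, assuming the data rows live inside a tbody (that selector is an assumption about the page markup):

def wait_for_rows(driver, timeout=10):
    """Block until at least one table row is rendered, then return all rows."""
    wait = WebDriverWait(driver, timeout)
    # presence_of_all_elements_located resolves as soon as one match exists
    return wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tbody tr"))
    )

fetch_page_data could call this in place of the bare find_elements.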

Then we read each page in turn, collect its data, and save everything at the end.
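The fixed time.sleep(2) between pages works, but it wastes time on fast loads and may be too short on slow ones. A common alternative is to wait for an element from the old page to go stale; a sketch, under the assumption that the table is fully re-rendered on a page change (click_next_and_wait and old_row are illustrative names, not part of the code above):

def click_next_and_wait(driver, next_button, old_row, timeout=10):
    """Click 'next' and wait until a row from the previous page is detached."""
    next_button.click()
    # staleness_of fires once old_row has been removed from the DOM
    WebDriverWait(driver, timeout).until(EC.staleness_of(old_row))

Here old_row would be one of the tr elements grabbed before the click.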

Note

The dynamic page-count retrieval in this code does not actually find anything, so in practice the total page count has to be edited in by hand; the overall scraping result is the same either way.
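Two plausible causes, for what it's worth: the pagination list may not have rendered yet when get_total_pages runs, and in Ant Design the ant-pagination-item class normally sits on the li rather than the inner a, so a.ant-pagination-item may simply match nothing. A defensive sketch with a manual fallback (the li selector is an educated guess, and the fallback of 34 is the value hardcoded above):

def get_total_pages_safe(driver, fallback=34, timeout=10):
    """Try to read the last pagination number; fall back to a manual count."""
    try:
        items = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "li.ant-pagination-item")
            )
        )
        return int(items[-1].text.strip())
    except Exception:
        # rendering or parsing failed; use the manually observed page count
        return fallback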
