A colleague's kid wanted to download every paper listed at https://openreview.net/group?id=ICLR.cc/2025/Conference#tab-accept-poster,
so over the National Day holiday I put together a script with DeepSeek. It does the following:
1. Asks the user how many pages to crawl; entering 0 crawls every page (not recommended).
2. Records the title, authors, and PDF download URL of every paper on those pages into an Excel file (lunwen.xlsx).
3. Downloads each PDF into a pdfs directory.
The code is as follows:
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 9 16:41:04 2025
@author: oldhen
"""
import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
from urllib.parse import urljoin
import re

def clean_filename(filename):
    """Clean a file name by removing characters Windows does not allow."""
    # Replace characters that are illegal in Windows file names
    illegal_chars = r'[<>:"/\\|?*]'
    filename = re.sub(illegal_chars, '_', filename)
    # Keep the file name to a reasonable length
    if len(filename) > 200:
        filename = filename[:200]
    return filename

def download_pdf(pdf_url, filename, folder='pdfs'):
    """Download a single PDF file into the given folder."""
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Sanitize the file name
    filename = clean_filename(filename)
    filepath = os.path.join(folder, f"{filename}.pdf")
    try:
        # Build the full PDF URL (the link may be relative)
        full_pdf_url = urljoin("https://openreview.net", pdf_url)
        # Request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Download the PDF
        response = requests.get(full_pdf_url, headers=headers, timeout=30)
        response.raise_for_status()
        # Save the file
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}.pdf")
        return True
    except Exception as e:
        print(f"Failed to download {filename}: {str(e)}")
        return False

def crawl_openreview():
    """Crawl paper information from OpenReview."""
    # Ask the user how many pages to crawl
    while True:
        try:
            pages = input("How many pages to crawl (0 = all pages): ")
            pages = int(pages)
            if pages >= 0:
                break
            else:
                print("Please enter a non-negative integer")
        except ValueError:
            print("Please enter a valid number")

    # Start the browser (chromedriver must be on PATH)
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        # Open the target page
        url = "https://openreview.net/group?id=ICLR.cc/2025/Conference#tab-accept-poster"
        driver.get(url)
        print("Loading page...")
        # Collected paper records
        all_papers = []
        current_page = 1
        while True:
            print(f"\nCrawling page {current_page}...")
            # Wait for the paper list to appear
            try:
                wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div/div[4]/div/main/div/div/div[3]/div/div[2]/div[4]/div/div/ul")))
            except TimeoutException:
                print("Timed out waiting for the paper list")
                break
            # Scroll the page to trigger lazy loading
            for _ in range(3):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(1)
            # Collect all paper entries on the current page
            paper_elements = driver.find_elements(By.XPATH, "/html/body/div/div[4]/div/main/div/div/div[3]/div/div[2]/div[4]/div/div/ul/li")
            print(f"Found {len(paper_elements)} papers")
            if len(paper_elements) == 0:
                print("No papers found; the page structure may have changed")
                break
            # Process each paper
            for i, paper in enumerate(paper_elements):
                try:
                    # Paper title
                    title_element = paper.find_element(By.XPATH, "./div/h4/a[1]")
                    paper_title = title_element.text.strip()
                    # PDF download link
                    pdf_element = paper.find_element(By.XPATH, "./div/h4/a[2]")
                    pdf_url = pdf_element.get_attribute("href")
                    # Author list
                    author_elements = paper.find_elements(By.XPATH, "./div/div[1]/span/a")
                    authors = [author.text for author in author_elements]
                    authors_str = ", ".join(authors) if authors else "Unknown authors"
                    # Record the paper
                    all_papers.append({
                        "Title": paper_title,
                        "Authors": authors_str,
                        "PDF URL": pdf_url
                    })
                    print(f"Paper {i + 1}: {paper_title}")
                    # Download the PDF
                    download_pdf(pdf_url, paper_title)
                except Exception as e:
                    print(f"Error while processing paper {i + 1}: {str(e)}")
                    continue
            # Stop once the requested number of pages has been crawled
            if pages > 0 and current_page >= pages:
                print(f"Reached the requested {pages} page(s); stopping")
                break
            # Try to move to the next page
            try:
                # The right-arrow item in the pagination bar
                next_button = driver.find_element(By.XPATH, "/html/body/div/div[4]/div/main/div/div/div[3]/div/div[2]/div[4]/div/div/nav/ul/li[13]/a")
                # Stop if the next-page button is disabled
                if "disabled" in (next_button.get_attribute("class") or ""):
                    print("Already on the last page; stopping")
                    break
                # Click the next-page button
                driver.execute_script("arguments[0].click();", next_button)
                current_page += 1
                # Give the next page time to load
                time.sleep(3)
            except NoSuchElementException:
                print("Next-page button not found; stopping")
                break
            except Exception as e:
                print(f"Error while paging: {str(e)}")
                break
        # Save the results to an Excel file
        if all_papers:
            df = pd.DataFrame(all_papers)
            df.to_excel("lunwen.xlsx", index=False, engine='openpyxl')
            print(f"\nSaved {len(all_papers)} papers to lunwen.xlsx")
        else:
            print("No paper data collected")
    except Exception as e:
        print(f"Error during crawling: {str(e)}")
    finally:
        # Close the browser
        driver.quit()
        print("Done")


if __name__ == "__main__":
    crawl_openreview()
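
To run the script you need Chrome with a matching chromedriver on PATH, plus the selenium, requests, pandas and openpyxl packages. After a run finishes, a quick way to sanity-check the results is to reload the spreadsheet and count the downloaded files (a minimal sketch, assuming lunwen.xlsx and the pdfs directory ended up next to the script):

import os
import pandas as pd

# Load the spreadsheet written by crawl_openreview() and compare it
# against the PDF files that actually landed in the pdfs directory.
df = pd.read_excel("lunwen.xlsx", engine="openpyxl")
downloaded = [f for f in os.listdir("pdfs") if f.endswith(".pdf")] if os.path.isdir("pdfs") else []
print(f"Rows in lunwen.xlsx: {len(df)}")
print(f"PDFs in ./pdfs: {len(downloaded)}")
print(df.head())

The two counts can differ when a title produces a duplicate file name after cleaning or when an individual download fails; the script only logs such failures and keeps going.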