一键下载当天 arXiv 上的 PDF 文件
(本人是天文专业的，这里以天体物理（astro-ph）分类为例)
arXiv 网址：https://arxiv.org/list/astro-ph/new
完整代码如下：
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import pandas as np
import urllib
import re
import os
# Fetch the astro-ph "new" listing page and parse it with the lxml parser.
# BeautifulSoup reads the whole response while building the tree, so the
# HTTP connection is closed as soon as parsing finishes instead of staying
# open for the rest of the script.
with urlopen("https://arxiv.org/list/astro-ph/new") as html:
    bsObj = BeautifulSoup(html, "lxml")
def decide(title, abstract, regular):
    """Decide whether a paper's PDF should be downloaded.

    Args:
        title: Object exposing ``get_text()`` that yields the paper title.
        abstract: Object exposing ``get_text()`` that yields the abstract.
        regular: Regular expression handed to ``re.search``.

    Returns:
        bool: True when ``regular`` matches somewhere in the title or in the
        abstract text, False otherwise.
    """
    # The title is tested first; the abstract is only read when the title
    # does not already match.
    return any(
        re.search(regular, str(node.get_text())) is not None
        for node in (title, abstract)
    )
# Ask the user for the filter pattern; an empty answer falls back to the
# default pattern, which matches any of the substrings GRB, FRB or GW.
regular = input("input regular expression:")
if regular == '':
    regular = "GRB|FRB|GW"
# Save the PDFs under the `path` directory:
# Grab the first <h3> element of the parsed listing page.
dateline = bsObj.find("h3")
# NOTE(review): `dateline` is whatever bsObj.find("h3") returned (an element
# or None), not a string, so `'20' + dateline` looks like it raises TypeError.
# Presumably part of the heading text (a two-digit year?) was meant to be
# appended here — confirm against the page's heading format.
year = '20' + dateline