Searching for literature by hand was getting tedious, so I tried using scholarly to crawl publication records from Google Scholar. Unfortunately, Google's anti-scraping system detected the crawler before it had fetched even 900 papers. In the end, Publish or Perish is the better option.
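Before giving up entirely, one mitigation worth trying is scholarly's built-in proxy support, which can delay (though not prevent) the blocking. A minimal sketch using the library's documented ProxyGenerator; FreeProxies() rotates through public proxies and tends to be slow and flaky:

from scholarly import ProxyGenerator, scholarly

# Route scholarly's traffic through rotating free proxies so that
# Google sees requests coming from changing IP addresses.
pg = ProxyGenerator()
if pg.FreeProxies():
    scholarly.use_proxy(pg)
else:
    print('No working free proxy found; requests will go out directly.')

Even with a proxy, keep the request rate low: Google Scholar has no official API and blocks aggressively.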
import os
import time
import csv
import random

from scholarly import scholarly
def search_scholar(query, filename):
    """Search Google Scholar for `query` and write each result to a CSV file."""
    count = 0
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'author', 'year', 'cites', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Write the CSV header row
        writer.writeheader()
        search_query = scholarly.search_pubs(query)
        while True:
            try:
                pub = next(search_query)
                result = {
                    'title': pub['bib']['title'],
                    'author': pub['bib']['author'],
                    'year': pub['bib']['pub_year'],
                    'cites': pub['num_citations'],
                    'url': pub['url_scholarbib'],
                }
                writer.writerow(result)
                count += 1
                if count % 100 == 0:
                    print('fetched: ', count)
                # Random delay between results to look less like a bot
                sleep_time = random.uniform(15, 45)
                time.sleep(sleep_time)
            except StopIteration:
                # No more results
                break
            except Exception as e:
                # Log the error and keep going
                print(f"Error occurred: {e}")
                continue
    return count
if __name__ == '__main__':
    # The ./results/ directory must already exist
    os.chdir('./results/')
    filename = 'googleresults.csv'
    query = "your search terms"
    publications = search_scholar(query, filename)
    print(publications)
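To sanity-check the export afterwards, the CSV can be loaded back for a quick look; a small sketch assuming pandas is installed and the path matches the script above:

import pandas as pd

# Load the exported results and inspect the first few rows
df = pd.read_csv('./results/googleresults.csv')
print(len(df), 'records')
print(df.head())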