用爬虫在谷歌学术上查找文献的被引次数

最新推荐文章于 2025-07-13 00:41:24 发布

原创 · 最新推荐文章于 2025-07-13 00:41:24 发布 · 913 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#爬虫 #python #开发语言

部署运行你感兴趣的模型镜像

import urllib.request, urllib.error
import re


def get_references(title, *, timeout=10.0):
    """Look up how often an article is cited according to Google Scholar.

    The title is URL-encoded and sent as the ``q`` parameter of an
    English-language (``hl=en``) Google Scholar search. The number following
    the first ``Cited by`` text found in the returned HTML is reported.

    Args:
        title: Article title to search for.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The citation count as an ``int``; ``0`` when the page contains no
        ``Cited by N`` text; ``None`` when the request fails (the error is
        printed to stdout).
    """
    # Imported locally so this function does not rely on which urllib
    # submodules the rest of the file happens to import.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (as before) and additionally escapes
    # characters such as '&', '#', '?' and non-ASCII text, which would
    # otherwise truncate or corrupt the query string.
    query = quote_plus(title)

    search_url = f'https://scholar.google.com/scholar?hl=en&q={query}&btnG=&as_sdt=1%2C5&as_sdtp='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'https://www.google.com/'
    }
    req = urllib.request.Request(search_url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            # Only the ASCII text "Cited by N" is needed, so a stray
            # undecodable byte should not abort the lookup.
            html_content = response.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as e:
        print(f'Error: {e.code} {e.reason}')
        return None
    except OSError as e:
        # URLError (DNS failure, refused connection, ...) and socket timeouts
        # are OSError subclasses; report them the same way as HTTP errors.
        print(f'Error: {e}')
        return None

    # Raw string: '\s' and '\d' are regex escapes, not Python string escapes.
    m = re.search(r'Cited by\s(\d+)', html_content)
    return int(m.group(1)) if m else 0


# 测试代码
titles = ['Experimental Study on the Autogenic Acid Fluid System of a High-Temperature Carbonate Reservoir by Acid Fracturing',
          'Experimental study on a new type of self-propping fracturing technology'
          ]
for title in titles:
    num_citations = get_references(title)
    print(f'{title}: {num_citations} 次被引用')