使用 XPath 提取豆瓣电影 Top250 数据

最新推荐文章于 2025-04-03 16:49:51 发布

yunAike

最新推荐文章于 2025-04-03 16:49:51 发布

阅读量207

点赞数 2

文章标签：网络爬虫 python ajax

本文链接：https://blog.youkuaiyun.com/yunAike/article/details/145288402

版权

代码部分

import csv

import requests
from lxml import etree

# Script: download the Douban Movie Top 250 page, extract title, score,
# one-line review and link for every listed movie via XPath, print them,
# and save them to a CSV file.

# URL of the Douban Movie Top 250 list (only this single URL is fetched).
url = "https://movie.douban.com/top250"

# Request headers: send a browser-like User-Agent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    )
}

try:
    # Issue the GET request; raise_for_status() turns HTTP error codes into
    # requests.RequestException subclasses handled at the bottom.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    # Let requests guess the encoding from the body before decoding .text.
    response.encoding = response.apparent_encoding
    print("成功获取 HTML 内容")

    # Parse the HTML text into an lxml element tree.
    html_tree = etree.HTML(response.text)

    # One <li> element per movie entry.
    data_list = html_tree.xpath("//*[@id='content']/div/div[1]/ol/li")
    print(f"成功提取 {len(data_list)} 部电影信息：\n")

    # Save to file. newline="" is what the csv module requires so that it
    # controls line endings itself.
    file_name = "豆瓣电影Top250.csv"
    with open(file_name, "w", encoding="utf-8", newline="") as file:
        file.write("标题,评分,影评,链接\n")  # header row, kept unquoted

        # QUOTE_ALL keeps every field wrapped in double quotes as before,
        # but the writer also escapes quotes embedded in a field, which the
        # previous hand-built f-string did not (it produced broken rows).
        writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")

        for data_html_tree in data_list:
            try:
                # [0] raises IndexError when an expected node is missing.
                title = data_html_tree.xpath("./div/div[2]/div[1]/a/span[1]/text()")[0]
                score = data_html_tree.xpath("./div/div[2]/div[2]/div/span[2]/text()")[0]
                movie_link = data_html_tree.xpath("./div/div[2]/div[1]/a/@href")[0]
            except IndexError as e:
                print(f"解析某条电影信息时出错：{e}")
                continue

            # The review line is optional; fall back to a placeholder.
            movie_appraise = data_html_tree.xpath(".//span[@class='inq'][1]/text()")
            movie_appraise = movie_appraise[0] if movie_appraise else "暂无影评"

            # Console output
            print(f"{title} | {score} 分 | 影评: {movie_appraise}")
            print(f"链接: {movie_link}\n")

            # Append the row to the CSV file with proper quoting/escaping.
            writer.writerow([title, score, movie_appraise, movie_link])

    print(f"\n电影数据已保存到文件：{file_name}")

except requests.RequestException as e:
    print(f"请求失败：{e}")
except IOError as e:
    print(f"文件保存失败：{e}")