# Code section (代码部分)
import csv

import requests
from lxml import etree
# Scrape the Douban movie ranking page and save title, score, one-line
# review and detail link of every listed movie to a CSV file.

# URL of the Douban Movie Top 250 listing page.
# NOTE(review): only this single URL is requested; if the ranking is spread
# over several pages, the later pages are not fetched -- confirm intent.
url = "https://movie.douban.com/top250"

# Request headers that mimic a desktop browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    )
}

try:
    # Send the GET request; raise_for_status() turns an HTTP error status
    # into an exception handled by the RequestException branch below.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    # Decode the body with the charset detected from its content.
    response.encoding = response.apparent_encoding
    print("成功获取 HTML 内容")

    # Parse the HTML and select one <li> element per movie entry.
    html_tree = etree.HTML(response.text)
    data_list = html_tree.xpath("//*[@id='content']/div/div[1]/ol/li")
    print(f"成功提取 {len(data_list)} 部电影信息:\n")

    file_name = "豆瓣电影Top250.csv"
    # newline="" is what the csv module expects from the file object so that
    # it controls the line endings itself.
    with open(file_name, "w", encoding="utf-8", newline="") as file:
        # QUOTE_ALL keeps every field quoted, and the writer escapes any
        # embedded double quote, which hand-built rows would leave as-is
        # and thereby corrupt the CSV.
        writer = csv.writer(file, quoting=csv.QUOTE_ALL)
        writer.writerow(["标题", "评分", "影评", "链接"])  # header row

        for data_html_tree in data_list:
            try:
                # Each [0] raises IndexError when the expected node is
                # missing; that entry is then reported and skipped.
                title = data_html_tree.xpath(
                    "./div/div[2]/div[1]/a/span[1]/text()"
                )[0]
                score = data_html_tree.xpath(
                    "./div/div[2]/div[2]/div/span[2]/text()"
                )[0]
                # The one-line review is optional, so fall back to a
                # placeholder instead of failing the whole entry.
                movie_appraise = data_html_tree.xpath(
                    ".//span[@class='inq'][1]/text()"
                )
                movie_appraise = movie_appraise[0] if movie_appraise else "暂无影评"
                movie_link = data_html_tree.xpath(
                    "./div/div[2]/div[1]/a/@href"
                )[0]

                # Echo the entry to the console.
                print(f"{title} | {score} 分 | 影评: {movie_appraise}")
                print(f"链接: {movie_link}\n")

                # Append the entry to the CSV file.
                writer.writerow([title, score, movie_appraise, movie_link])
            except IndexError as e:
                print(f"解析某条电影信息时出错:{e}")

    print(f"\n电影数据已保存到文件:{file_name}")
except requests.RequestException as e:
    # Network failure, timeout, or HTTP error status.
    print(f"请求失败:{e}")
except IOError as e:
    # The output file could not be opened or written.
    print(f"文件保存失败:{e}")