import requests
from lxml import etree
import csv
import time
import random
import re
import os
# Fixed request-header values (replace with values from your own browser session).
FIXED_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0'
FIXED_COOKIE = 'll="118161"; bid=l5ki4SOlbBM; dbcl2="244638424:dLHXPIU8S0M"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.24463; _pk_id.100001.4cf6=42b01014d5c31947.1748938983.; _vwo_uuid_v2=D9E78C6D9D4E71BBB6EC73B8583864961|9da3be87da4a6d3be6203809b085d4a9; __yadk_uid=2Zr6yzTnllQxMzDhrQB82h7doa8gM4Ku; ck=ILlj; ap_v=0,6.0; frodotk_db="dd9f2f5023b9a95198dd8df06b2cfbf3"; __utma=30149280.1697373246.1748938900.1749979892.1750127075.8; __utmc=30149280; __utmz=30149280.1750127075.8.6.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.2.10.1750127075; __utma=223695111.238348316.1748938983.1749979902.1750127082.8; __utmb=223695111.0.10.1750127082; __utmc=223695111; __utmz=223695111.1750127082.8.7.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1750127082%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1' # obtain from a logged-in browser session
# Review-list URL for the target movie and the shared request headers.
# The browser-like header set (Sec-Fetch-*, Referer, etc.) reduces the chance
# of triggering Douban's anti-bot checks.
base_url = "https://movie.douban.com/subject/27181010/reviews"
headers = {
'User-Agent': FIXED_USER_AGENT,
'Cookie': FIXED_COOKIE,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Referer': 'https://movie.douban.com/subject/27181010/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
}
# Ensure the output directory exists (no error if it is already there).
os.makedirs('douban_data', exist_ok=True)
# Open the CSV output file ('utf-8-sig' adds a BOM so Excel detects the
# encoding) and write the header row.
csv_file = open('douban_data/douban_reviews_fixed.csv', 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(csv_file)
writer.writerow(['昵称', '星级评分', '评论时间', '文本评论', '点赞数', '回应数', '页面位置'])
def extract_content(element):
    """Return the review text contained in *element*.

    The candidate containers are probed in priority order: the short
    (collapsed) content div, the fully expanded review body, and finally
    the folded-content div.  The first non-empty match wins; its text
    fragments are concatenated and surrounding whitespace is stripped.
    Returns the placeholder "无内容" when none of the containers match.
    """
    candidate_xpaths = (
        # collapsed "short" preview
        './/div[contains(@class, "short-content")]/text()',
        # fully expanded review body
        './/div[@class="review-content clearfix"]/text()',
        # folded (hidden-by-default) content
        './/div[@class="folded"]/text()',
    )
    for xpath_expr in candidate_xpaths:
        fragments = element.xpath(xpath_expr)
        if fragments:
            return ''.join(fragments).strip()
    return "无内容"
# Crawl the review list page by page (Douban serves 20 reviews per page).
for page in range(0, 125):
    # Query-string parameters: paging offset, sort order, watched-status filter.
    params = {
        'start': page * 20,
        'sort': 'new_score',
        'status': 'P'
    }
    try:
        print(f"开始爬取第 {page+1} 页...")
        # Send the request. Redirects are disabled so that an anti-bot
        # verification redirect (HTTP 302) can be detected explicitly
        # instead of being followed silently.
        response = requests.get(
            url=base_url,
            params=params,
            headers=headers,
            timeout=15,
            allow_redirects=False  # disable redirects to detect verification
        )
        # A 302 means Douban redirected us to a verification page.
        if response.status_code == 302:
            location = response.headers.get('Location', '未知位置')
            print(f"⚠️ 第 {page+1} 页触发验证,重定向至: {location}")
            # Save the redirect page for offline analysis.
            with open(f'douban_data/redirect_page_{page}.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            continue
        response.encoding = 'utf-8'
        if response.status_code == 200:
            # Parse the HTML document.
            html = etree.HTML(response.text)
            # Skip (and archive) pages that demand a captcha.
            captcha = html.xpath('//input[@name="captcha-id"]')
            if captcha:
                print(f"⚠️ 第 {page+1} 页需要验证码,跳过")
                with open(f'douban_data/captcha_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
                continue
            # Check that the page contains the review container at all
            # (primary selector first, then a looser fallback).
            review_container = html.xpath('//div[@class="review-list"]')
            if not review_container:
                review_container = html.xpath('//div[contains(@id, "content")]//div[contains(@class, "review")]')
            if not review_container:
                # Archive the unexpected page layout for analysis.
                with open(f'douban_data/error_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f"❌ 第 {page+1} 页无评论容器,已保存页面供分析")
                continue
            # Collect the individual review items (with fallbacks for
            # alternative markup variants).
            comments = html.xpath('//div[contains(@class, "review-item")]')
            if not comments:
                comments = html.xpath('//div[contains(@class, "main") and contains(@class, "review-item")]')
            if not comments:
                comments = html.xpath('//div[@class="review-list"]/div[contains(@class, "review")]')
            if not comments:
                print(f"❌ 第 {page+1} 页找到0条评论,可能触发反爬")
                # Look for Douban's explicit anti-scraping notice.
                anti_spider = html.xpath('//div[contains(text(), "检测到异常请求")]')
                if anti_spider:
                    print("⚠️ 检测到反爬提示,请更换Cookie或IP")
                with open(f'douban_data/antispider_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
                continue
            print(f"✅ 第 {page+1} 页找到 {len(comments)} 条评论")
            for idx, comment in enumerate(comments):
                try:
                    # --- reviewer nickname ---
                    username = comment.xpath('.//a[contains(@class, "name")]/text()')
                    if not username:
                        username = comment.xpath('.//span[@class="author"]/a/text()')
                    username = username[0].strip() if username else "无昵称"
                    # --- star rating (the @title attribute holds the label) ---
                    rating = comment.xpath('.//span[contains(@class, "rating")]/@title')
                    if not rating:
                        rating = comment.xpath('.//span[contains(@class, "main-title-rating")]/@title')
                    rating = rating[0] if rating else "无评分"
                    # --- review timestamp ---
                    # Prefer the machine-readable @content attribute of the
                    # main-meta span (full "YYYY-MM-DD HH:MM:SS" value);
                    # fall back to the visible text nodes.
                    comment_time = comment.xpath('.//span[contains(@class, "main-meta")]/@content')
                    if not comment_time:
                        comment_time = comment.xpath('.//span[contains(@class, "main-meta")]/text()')
                    if not comment_time:
                        comment_time = comment.xpath('.//span[@class="review-date"]/text()')
                    comment_time = comment_time[0].strip() if comment_time else "无时间"
                    # --- review body text ---
                    content = extract_content(comment)
                    # --- helpful ("useful") vote count ---
                    # Fix: the old primary selector matched @data-count against
                    # the word "useful", but data-count holds a number, so it
                    # never matched. Douban puts the count in a span whose id
                    # starts with "r-useful_count"; the class-based button
                    # selector is kept as a fallback.
                    useful_count = comment.xpath('.//span[starts-with(@id, "r-useful_count")]/text()')
                    if not useful_count:
                        useful_count = comment.xpath('.//button[contains(@class, "useful")]/span/text()')
                    # Normalize: keep digits only; a blank/whitespace span
                    # means zero votes.
                    useful_count = re.sub(r'\D', '', useful_count[0]) or "0" if useful_count else "0"
                    # --- reply count ---
                    # Fix: broaden "replythis" to any class containing "reply"
                    # (still matches "replythis"), since current markup uses
                    # <a class="reply">N回应</a>.
                    reply_count = comment.xpath('.//a[contains(@class, "reply")]/text()')
                    if not reply_count:
                        reply_count = comment.xpath('.//span[contains(@class, "reply-count")]/a/text()')
                    # Strip the "回应" suffix, keeping digits only; default 0.
                    reply_count = re.sub(r'\D', '', reply_count[0]) or "0" if reply_count else "0"
                    # Append one CSV row per review.
                    writer.writerow([
                        username,
                        rating,
                        comment_time,
                        content,
                        useful_count,
                        reply_count,
                        f"第{page+1}页第{idx+1}条"
                    ])
                except Exception as e:
                    # A malformed item should not abort the whole page.
                    print(f"⚠️ 处理评论时出错: {e}")
                    continue
        else:
            print(f"❌ 请求失败,状态码: {response.status_code}")
    except Exception as e:
        # Network errors (timeouts, connection resets, ...) — log and move on.
        print(f"❌ 请求异常: {e}")
    # Random delay between pages to avoid hammering the server.
    delay = random.uniform(3, 8)
    print(f"⏳ 等待 {delay:.2f} 秒后继续...")
    time.sleep(delay)
# Flush buffered rows and release the CSV file handle.
# (Fix: removed pasted non-code text that followed the print call and made
# the file a SyntaxError.)
csv_file.close()
print("✅ 爬取完成!数据已保存到 douban_data/douban_reviews_fixed.csv")