import requests
import re
import jieba
import jieba.posseg as pseg
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
import time
from typing import List, Tuple, Optional
# Configure jieba: load a custom dictionary if one is present
# (add movie-related vocabulary there as needed)
if os.path.exists("user_dict.txt"):
    jieba.load_userdict("user_dict.txt")

# Load a stopword list, one word per line
def load_stopwords(stopwords_path):
    stopwords = set()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for line in f:
            stopwords.add(line.strip())
    return stopwords
# Data acquisition module: Douban movie-comment crawler (respects robots.txt)
class DoubanMovieCommentCrawler:
def __init__(self, movie_id, max_pages=5, delay=1):
self.movie_id = movie_id
self.max_pages = max_pages
self.base_url = f"https://movie.douban.com/subject/{movie_id}/comments"
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive"
}
        self.delay = delay  # seconds to wait between requests
def parse_comment(self, html):
soup = BeautifulSoup(html, 'lxml')
comment_list = soup.select('div.comment-item')
comments = []
for comment in comment_list:
try:
                # Extract the comment text
                content_elem = comment.select_one('span.short')
                content = content_elem.text.strip() if content_elem else ""
                # Extract the rating; Douban uses Chinese labels (力荐/推荐/还行/较差/很差)
                rating_elem = comment.select_one('span.rating')
                rating = rating_elem.get('title', "未评分") if rating_elem else "未评分"
                # Extract the comment timestamp
                time_elem = comment.select_one('span.comment-time')
                comment_time = time_elem.text.strip() if time_elem else ""
                # Extract the username
                user_elem = comment.select_one('span.comment-info a')
                username = user_elem.text.strip() if user_elem else ""
comments.append({
"username": username,
"content": content,
"rating": rating,
"time": comment_time
})
except Exception as e:
print(f"解析评论时出错: {e}")
return comments
def crawl(self):
all_comments = []
for page in range(self.max_pages):
offset = page * 20
params = {
"start": offset,
"limit": 20,
"sort": "new_score",
"status": "P"
}
            try:
                print(f"Fetching page {page + 1}: {self.base_url}?start={offset}")
                response = requests.get(self.base_url, headers=self.headers, params=params, timeout=15)
                response.raise_for_status()  # raise on HTTP errors (403/404/...)
                # Detect Douban's anti-crawling interstitial; the check string
                # must stay in Chinese because it is matched against the page body
                if "检测到有异常请求" in response.text:
                    print("Warning: blocked by Douban's anti-crawling check; retry later or lower the crawl rate")
                    break
                comments = self.parse_comment(response.text)
                all_comments.extend(comments)
                print(f"Page {page + 1} done: {len(comments)} comments collected")
                # Throttle requests to stay polite
                if page < self.max_pages - 1:
                    print(f"Waiting {self.delay}s before the next page...")
                    time.sleep(self.delay)
            except requests.RequestException as e:
                print(f"Error fetching page {page + 1}: {e}")
                # Back off a little longer after an error, then continue
                time.sleep(self.delay * 2)
                continue
return pd.DataFrame(all_comments)
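
# Usage sketch (illustrative only): crawl a few pages politely
#   crawler = DoubanMovieCommentCrawler("34780991", max_pages=3, delay=2)
#   comments_df = crawler.crawl()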
# Text preprocessing module
class TextProcessor:
def __init__(self, stopwords):
self.stopwords = stopwords
def clean_text(self, text):
        # Strip everything except CJK characters, Latin letters, and digits
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
        # Collapse repeated whitespace
        text = re.sub(r'\s+', ' ', text).strip()
return text
def tokenize(self, text):
        # Segment Chinese text into words
        words = jieba.lcut(text)
        # Drop stopwords and single-character tokens
        words = [word for word in words if word not in self.stopwords and len(word) > 1]
return words
def pos_tag(self, text):
        # Part-of-speech tagging
words = pseg.lcut(text)
return [(word, flag) for word, flag in words if word not in self.stopwords and len(word) > 1]
def filter_pos(self, pos_tags, allowed_pos=None):
"""过滤特定词性的词语"""
if allowed_pos is None:
allowed_pos = ['n', 'v', 'a', 'ad', 'an', 'vn'] # 默认保留名词、动词、形容词
return [word for word, flag in pos_tags if any(flag.startswith(p) for p in allowed_pos)]
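
# Usage sketch (illustrative only): clean, tokenize, and POS-filter one comment
#   processor = TextProcessor(stopwords)
#   clean = processor.clean_text("这部电影太精彩了!!")
#   tokens = processor.tokenize(clean)        # stopwords and single chars dropped
#   content_words = processor.filter_pos(processor.pos_tag(clean))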
# Feature extraction module
class FeatureExtractor:
def __init__(self, max_features=5000, ngram_range=(1, 2)):
self.tfidf_vectorizer = TfidfVectorizer(
max_features=max_features,
ngram_range=ngram_range,
            token_pattern=r'(?u)\b\w+\b'  # allow single-character tokens; input must be pre-segmented (space-joined)
)
def extract_features(self, texts):
        # Fit TF-IDF on the corpus and return the document-term matrix
return self.tfidf_vectorizer.fit_transform(texts)
def get_feature_names(self):
return self.tfidf_vectorizer.get_feature_names_out()
def transform(self, texts):
"""对新文本进行特征转换"""
return self.tfidf_vectorizer.transform(texts)
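
# Usage sketch (illustrative only): TF-IDF expects pre-segmented, space-joined
# Chinese text, matching the token_pattern above
#   extractor = FeatureExtractor(max_features=3000)
#   X = extractor.extract_features(["电影 精彩 推荐", "剧情 无聊 失望"])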
# Model training and evaluation module
class ModelTrainer:
def __init__(self, model_type="naive_bayes"):
if model_type == "naive_bayes":
self.model = MultinomialNB()
elif model_type == "logistic_regression":
from sklearn.linear_model import LogisticRegression
self.model = LogisticRegression(max_iter=1000)
else:
raise ValueError(f"不支持的模型类型: {model_type}")
self.label_encoder = LabelEncoder()
def train(self, X_train, y_train):
        # Encode string labels as integers
y_train_encoded = self.label_encoder.fit_transform(y_train)
        # Fit the classifier
self.model.fit(X_train, y_train_encoded)
return self.label_encoder.classes_
def evaluate(self, X_test, y_test):
        # Encode the test labels with the already-fitted encoder
        y_test_encoded = self.label_encoder.transform(y_test)
        # Predict on the test set
        y_pred = self.model.predict(X_test)
        # Compute weighted evaluation metrics
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test_encoded, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test_encoded, y_pred, average='weighted', zero_division=0)
return {
"accuracy": accuracy,
"precision": precision,
"recall": recall,
"f1": f1
}
def predict(self, X):
        # Predict labels for new data and decode to the original classes
y_pred = self.model.predict(X)
return self.label_encoder.inverse_transform(y_pred)
def predict_proba(self, X):
"""预测概率"""
return self.model.predict_proba(X)
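
# Usage sketch (illustrative only): train and score on TF-IDF features
#   trainer = ModelTrainer(model_type="naive_bayes")
#   classes = trainer.train(X_train, y_train)
#   metrics = trainer.evaluate(X_test, y_test)   # accuracy/precision/recall/F1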
# Text analysis and visualization module
class TextAnalyzer:
def __init__(self, font_path: Optional[str] = None):
        # Prefer the user-specified font when given
        self.font_path = self._get_valid_font(font_path)
        # Make sure matplotlib renders CJK labels correctly
self._setup_matplotlib_font()
def _get_valid_font(self, user_font: Optional[str]) -> str:
"""获取有效的中文字体路径"""
# 1. 用户指定的字体
if user_font and os.path.exists(user_font):
print(f"使用用户指定字体: {user_font}")
return user_font
# 2. 尝试常用字体文件
candidate_fonts = [
"simhei.ttf", "simsun.ttc", "msyh.ttc", # Windows常用字体
]
for font in candidate_fonts:
if os.path.exists(font):
print(f"使用候选字体: {font}")
return font
        # 3. Scan the system fonts for likely CJK families
        try:
            from matplotlib import font_manager
            fonts = font_manager.findSystemFonts()
            chinese_keywords = ['hei', 'song', 'yahei', 'simhei', 'heiti', 'microsoft']
            for font in fonts:
                if any(keyword in font.lower() for keyword in chinese_keywords):
                    print(f"Using system-detected font: {font}")
                    return font
        except Exception as e:
            print(f"Font detection failed: {e}")
        # 4. Fall back to matplotlib's default font
        print("Warning: no CJK font found; Chinese labels may not render")
        return ""
    def _setup_matplotlib_font(self):
        """Register the chosen font with matplotlib."""
        if self.font_path and os.path.exists(self.font_path):
            try:
                from matplotlib import font_manager
                font_manager.fontManager.addfont(self.font_path)
                # Use the real family name of the registered font rather than
                # hardcoding SimHei, and keep minus signs renderable
                font_name = font_manager.FontProperties(fname=self.font_path).get_name()
                plt.rcParams["font.family"] = [font_name]
                plt.rcParams["axes.unicode_minus"] = False
                print(f"matplotlib configured to use font: {self.font_path}")
            except Exception as e:
                print(f"Failed to configure matplotlib font: {e}")
def analyze_word_frequency(self, tokens_list, top_n=20):
        # Count token frequencies across all comments
all_tokens = [token for tokens in tokens_list for token in tokens]
word_freq = Counter(all_tokens)
return word_freq.most_common(top_n)
def visualize_wordcloud(self, word_freq, output_file="wordcloud.png", width=800, height=600):
        # Build the word cloud; a valid CJK font_path is required for Chinese,
        # otherwise words render as empty boxes
        wordcloud = WordCloud(
            font_path=self.font_path if self.font_path and os.path.exists(self.font_path) else None,
width=width,
height=height,
background_color="white",
max_words=200,
contour_width=1,
contour_color='steelblue'
).generate_from_frequencies(dict(word_freq))
        # Render the cloud and save it to disk
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout()
plt.savefig(output_file, dpi=300, bbox_inches='tight')
plt.close()
def visualize_class_distribution(self, labels, output_file="class_distribution.png"):
        # Count how often each label occurs
        class_counts = Counter(labels)
        df = pd.DataFrame(list(class_counts.items()), columns=['Class', 'Count'])
        # Bar chart; axis labels are generic because this method is reused for
        # both sentiment labels and Douban rating labels
        plt.figure(figsize=(12, 6))
        sns.barplot(x='Class', y='Count', data=df)
        plt.title('Comment class distribution')
        plt.xlabel('Class')
        plt.ylabel('Number of comments')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(output_file, dpi=300, bbox_inches='tight')
plt.close()
    def visualize_tfidf_features(self, feature_names, feature_values, class_names, top_n=10,
                                 output_file="tfidf_features.png"):
        # Plot the highest-weighted features per class. feature_values should
        # hold one row of weights per class (e.g. LogisticRegression.coef_);
        # for binary problems coef_ has a single row for the positive class.
        n_rows = feature_values.shape[0]
        names = class_names if len(class_names) == n_rows else class_names[1:]
        plt.figure(figsize=(18, 10))
        for i, class_name in enumerate(names):
            top_features_idx = feature_values[i].argsort()[-top_n:][::-1]
            top_features = [feature_names[idx] for idx in top_features_idx]
            top_values = feature_values[i][top_features_idx]
            plt.subplot(1, n_rows, i + 1)
            plt.barh(top_features, top_values)
            plt.title(f'Top features: {class_name}')
            plt.xlabel('TF-IDF weight')
        plt.tight_layout()
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
    def visualize_sentiment_trend(self, df, date_col, sentiment_col, output_file="sentiment_trend.png"):
        """Plot how the average sentiment score changes over time."""
        # Work on a copy so a filtered slice passed by the caller is not mutated
        df = df.copy()
        # Normalize timestamps to day granularity; YYYY-MM-DD strings sort
        # chronologically, so the x-axis comes out in date order
        df['date'] = pd.to_datetime(df[date_col])
        df['date'] = df['date'].dt.strftime('%Y-%m-%d')
        # Average the sentiment score per day
        sentiment_trend = df.groupby('date')[sentiment_col].mean().reset_index()
        plt.figure(figsize=(15, 7))
        sns.lineplot(x='date', y=sentiment_col, data=sentiment_trend, marker='o')
        plt.title('Sentiment trend over time')
        plt.xlabel('Date')
        plt.ylabel('Average sentiment score')
        plt.xticks(rotation=45)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
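
# Usage sketch (illustrative only): font discovery plus a word cloud
#   analyzer = TextAnalyzer()                  # or TextAnalyzer("msyh.ttc")
#   freq = analyzer.analyze_word_frequency(df['tokens'], top_n=50)
#   analyzer.visualize_wordcloud(freq, output_file="cloud.png")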
# Sentiment analysis module (simple lexicon-based approach)
class SentimentAnalyzer:
def __init__(self, positive_words_path="positive_words.txt", negative_words_path="negative_words.txt"):
self.positive_words = set()
self.negative_words = set()
        # Load sentiment lexicons from disk if available
if os.path.exists(positive_words_path):
with open(positive_words_path, 'r', encoding='utf-8') as f:
for line in f:
self.positive_words.add(line.strip())
if os.path.exists(negative_words_path):
with open(negative_words_path, 'r', encoding='utf-8') as f:
for line in f:
self.negative_words.add(line.strip())
        # Fall back to a tiny built-in lexicon when no files were found; the
        # words must stay Chinese because they are matched against tokenized
        # comments. Note: single-character entries (好, 赞, 差, 棒) can never
        # match, since tokenize() drops single-character tokens.
        if not self.positive_words:
            self.positive_words = set(["好", "优秀", "喜欢", "赞", "精彩", "推荐", "感人", "好看", "完美", "棒"])
        if not self.negative_words:
            self.negative_words = set(["差", "垃圾", "讨厌", "失望", "难看", "糟糕", "恶心", "失败", "无聊", "浪费"])
def analyze_sentiment(self, tokens):
"""基于情感词典的简单情感分析"""
positive_count = sum(1 for word in tokens if word in self.positive_words)
negative_count = sum(1 for word in tokens if word in self.negative_words)
total_words = len(tokens)
        if total_words == 0:
            return 0.5  # neutral by convention for empty token lists
        # Raw score in [-1, 1]: (positive hits - negative hits) / token count
        sentiment_score = (positive_count - negative_count) / total_words
        # Rescale to [0, 1] (0 = negative, 0.5 = neutral, 1 = positive)
        normalized_score = (sentiment_score + 1) / 2
return normalized_score
    def get_sentiment_label(self, score):
        """Map a normalized score onto a coarse sentiment label."""
        if score < 0.4:
            return "negative"
        elif score < 0.6:
            return "neutral"
        else:
            return "positive"
# Main entry point: wires all modules together
def main():
    # Create output directories
    os.makedirs("data", exist_ok=True)
    os.makedirs("results", exist_ok=True)
    # 1. Data acquisition
    print("Fetching Douban movie comments...")
    # Example: a Douban movie ID (replace with the movie you want to analyze)
    movie_id = "34780991"
    # Fetch the movie title for reporting
    try:
        movie_info_url = f"https://movie.douban.com/subject/{movie_id}/"
        movie_response = requests.get(movie_info_url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }, timeout=15)
        movie_soup = BeautifulSoup(movie_response.text, 'lxml')
        movie_title = movie_soup.select_one('span[property="v:itemreviewed"]').text
        print(f"Analyzing movie: {movie_title}")
    except Exception as e:
        print(f"Failed to fetch movie info: {e}")
        movie_title = f"movie_id_{movie_id}"
    # Crawl the comment pages (max_pages pages, 20 comments per page; note that
    # without a logged-in Cookie, Douban typically serves only the first ~10 pages)
    crawler = DoubanMovieCommentCrawler(movie_id, max_pages=30, delay=2)
    df = crawler.crawl()
    # Save the raw data
    raw_data_file = f"data/{movie_id}_raw_comments.csv"
    df.to_csv(raw_data_file, index=False, encoding='utf-8-sig')
    print(f"Saved {len(df)} raw comments to {raw_data_file}")
    # 2. Preprocessing
    print("Starting preprocessing...")
    # Load stopwords (the words themselves must stay Chinese)
    stopwords_path = "stopwords.txt"
    if not os.path.exists(stopwords_path):
        # Write a minimal default stopword list if none exists
        print(f"No stopword list found; creating a default one: {stopwords_path}")
        with open(stopwords_path, 'w', encoding='utf-8') as f:
            f.write("\n".join([
                "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一个",
                "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好",
                "这", "那", "与", "啊", "把", "被", "从", "而", "对于", "给", "关于", "过",
                "还是", "或者", "既", "即", "即使", "几", "己", "见", "将", "叫", "让", "然后", "任何",
                "如何", "什么", "时", "时候", "使", "使用", "她", "他", "它", "他们", "她们", "我们", "你们",
                "些", "向", "像", "想", "项", "这样", "这种", "这里", "那里", "自己", "只是", "知道",
                "之中", "之后", "之间", "这些", "那些", "这个", "那个", "这是", "那是", "现在"
            ]))
    stopwords = load_stopwords(stopwords_path)
    # Initialize the text processor
    text_processor = TextProcessor(stopwords)
    # Clean the raw comment text
    print("Cleaning comment text...")
    # Bail out early if the expected column is missing
    if 'content' not in df.columns:
        print(f"Error: no 'content' column in the DataFrame. Available columns: {list(df.columns)}")
        return
    # Coerce to string before cleaning
    df['content'] = df['content'].astype(str)
    # Apply the cleaning function
    df['clean_content'] = df['content'].apply(
        lambda x: text_processor.clean_text(x) if pd.notna(x) and x.strip() != "" else ""
    )
    # Drop comments that ended up empty after cleaning
    original_count = len(df)
    df = df[df['clean_content'].str.strip() != ""]
    removed_count = original_count - len(df)
    print(f"Cleaning done: {original_count} records in, {removed_count} empty comments removed, {len(df)} remaining")
    # Tokenization
    print("Segmenting Chinese text...")
    df['tokens'] = df['clean_content'].apply(text_processor.tokenize)
    # Space-join the tokens: TfidfVectorizer needs pre-segmented input for
    # Chinese, otherwise each comment is treated as one giant token
    df['segmented'] = df['tokens'].apply(' '.join)
    # Part-of-speech tagging
    print("POS tagging...")
    df['pos_tags'] = df['clean_content'].apply(text_processor.pos_tag)
    # 3. Sentiment analysis
    print("Running sentiment analysis...")
    sentiment_analyzer = SentimentAnalyzer()
    df['sentiment_score'] = df['tokens'].apply(sentiment_analyzer.analyze_sentiment)
    df['sentiment_label'] = df['sentiment_score'].apply(sentiment_analyzer.get_sentiment_label)
    # Save the preprocessed data
    preprocessed_file = f"data/{movie_id}_preprocessed.csv"
    df.to_csv(preprocessed_file, index=False, encoding='utf-8-sig')
    print(f"Preprocessing finished; saved to {preprocessed_file}")
    # 4. Feature extraction
    print("Extracting TF-IDF features...")
    feature_extractor = FeatureExtractor(max_features=3000, ngram_range=(1, 2))
    # Fit on the space-joined tokens rather than the raw cleaned text
    X = feature_extractor.extract_features(df['segmented'])
    feature_names = feature_extractor.get_feature_names()
    print(f"Number of extracted features: {X.shape[1]}")
    # 5. Model training and evaluation
    print("Training and evaluating the model...")
    # Map Douban's Chinese rating labels to numeric scores
    rating_map = {
        "力荐": 5,   # strongly recommend
        "推荐": 4,   # recommend
        "还行": 3,   # okay
        "较差": 2,   # poor
        "很差": 1,   # very poor
        "未评分": 0  # unrated
    }
    df['rating_value'] = df['rating'].map(lambda x: rating_map.get(x, 0))
    # Keep only rated comments for supervised training
    df_rated = df[df['rating_value'] > 0]
    model_trainer = None  # stays None when there is too little data to train
    class_names = None
    if len(df_rated) < 10:
        print("Warning: not enough rated comments to train a model")
    else:
        y = df_rated['rating_value']
        X_train, X_test, y_train, y_test = train_test_split(
            feature_extractor.transform(df_rated['segmented']),
            y,
            test_size=0.2,
            random_state=42
        )
        # Train the classifier
        model_trainer = ModelTrainer(model_type="logistic_regression")
        class_names = model_trainer.train(X_train, y_train)
        evaluation_results = model_trainer.evaluate(X_test, y_test)
        print("Model evaluation results:")
        for metric, value in evaluation_results.items():
            print(f"{metric}: {value:.4f}")
        # Save the evaluation results
        with open(f"results/{movie_id}_model_evaluation.txt", 'w', encoding='utf-8') as f:
            f.write(f"Movie: {movie_title}\n")
            f.write(f"Rated comments: {len(df_rated)}\n")
            f.write("Model evaluation results:\n")
            for metric, value in evaluation_results.items():
                f.write(f"{metric}: {value:.4f}\n")
    # 6. Text analysis and visualization
    print("Analyzing and visualizing...")
    # Locate and configure a CJK-capable font
    text_analyzer = TextAnalyzer()
    # Word frequencies
    word_freq = text_analyzer.analyze_word_frequency(df['tokens'])
    print("Most frequent words:")
    for word, freq in word_freq[:10]:
        print(f"{word}: {freq}")
    # Word cloud
    text_analyzer.visualize_wordcloud(
        word_freq,
        output_file=f"results/{movie_id}_wordcloud.png",
        width=1000,
        height=600
    )
    print(f"Word cloud saved to results/{movie_id}_wordcloud.png")
    # Sentiment label distribution
    text_analyzer.visualize_class_distribution(
        df['sentiment_label'],
        output_file=f"results/{movie_id}_sentiment_distribution.png"
    )
    print(f"Sentiment distribution chart saved to results/{movie_id}_sentiment_distribution.png")
    # Rating distribution
    if len(df_rated) > 0:
        text_analyzer.visualize_class_distribution(
            df_rated['rating'],
            output_file=f"results/{movie_id}_rating_distribution.png"
        )
        print(f"Rating distribution chart saved to results/{movie_id}_rating_distribution.png")
    # Sentiment trend over time
    if len(df) > 0 and 'time' in df.columns:
        # Drop comments without a timestamp
        df_with_time = df[df['time'].notna() & (df['time'] != "")]
        if len(df_with_time) > 10:
            text_analyzer.visualize_sentiment_trend(
                df_with_time,
                date_col='time',
                sentiment_col='sentiment_score',
                output_file=f"results/{movie_id}_sentiment_trend.png"
            )
            print(f"Sentiment trend chart saved to results/{movie_id}_sentiment_trend.png")
    # Per-class important features (requires the trained linear model)
    if model_trainer is not None and class_names is not None:
        # Reuse the fitted LogisticRegression's coefficients as feature weights
        feature_values = model_trainer.model.coef_
        text_analyzer.visualize_tfidf_features(
            feature_names,
            feature_values,
            class_names,
            top_n=10,
            output_file=f"results/{movie_id}_tfidf_features.png"
        )
        print(f"TF-IDF feature chart saved to results/{movie_id}_tfidf_features.png")
print(f"\n分析完成!结果已保存到results目录下")
print(f"电影: {movie_title}")
print(f"总评论数: {len(df)}")
print(f"平均情感分数: {df['sentiment_score'].mean():.4f}")
print(f"情感分布: {Counter(df['sentiment_label'])}")
if len(df_rated) > 0:
print(f"平均评分: {df_rated['rating_value'].mean():.2f}/5")
if __name__ == "__main__":
    main()