影刀RPA Auto-Extracts 抖店 Live-Stream Comments, AI Analyzes Audience Sentiment in Real Time, and Live-Stream Optimization Finally Has Data Behind It! 🚀
Worn out compiling comments by hand after every live stream? Buried under mountains of viewer feedback you never manage to analyze? Don't panic. Today I'll use 影刀RPA plus AI to build an intelligent live-comment analysis engine: audience sentiment extracted in one click, and live-stream optimization driven by data from now on!
一、Background and Pain Points: The "Data Gold Mine" of Live-Stream Comments
What hurts most in live-stream e-commerce? Not having no viewers, but having viewers and not understanding what they are telling you! Picture this: a two-hour stream pulls in 2,000+ comments. You want to analyze that feedback to improve the next stream, but compiling the comments by hand takes 3-4 hours, and by the time the analysis is done the moment has already passed...
The fatal pain points of processing live comments by hand:
- Painfully slow: sorting 1,000 comments takes 2-3 hours, about as efficient as "human OCR"
- Fragmented information: comments are scattered across time slots, making it hard to form an overall picture
- Hard to analyze: human readers miss key signals, and the feedback is difficult to quantify
- Poor timeliness: by the time the analysis is finished, the window for optimization has closed
- Wasted value: 90% of viewer feedback is buried and never drives any improvement
A sobering example: one beauty streamer failed to notice concentrated feedback that a product's "texture was too oily" ("质地太油") and kept recommending it for three consecutive streams, after which the return rate jumped 40%! With RPA-based extraction and analysis, the data compilation that used to take 3 hours now finishes automatically in 15 minutes, complete with actionable optimization suggestions.
二、The Solution: 影刀RPA as an "Intelligent Comment Miner"
影刀RPA can log into the 抖店 back end automatically, extract the live-stream comment data, run AI sentiment analysis and keyword extraction, and generate a visual analysis report. The entire pipeline, collection → cleaning → analysis → insight → reporting, is fully automated!
Core advantages of this approach:
- Fully automated collection: extract every comment from an entire live session in one click
- AI-powered analysis: built-in NLP identifies sentiment polarity and key topics automatically
- Real-time monitoring: supports in-stream analysis so you can adjust strategy on the fly
- Multi-dimensional insight: viewer feedback analyzed across product, service, and content dimensions
- Visual reports: professional analysis reports generated automatically to drive data-informed decisions
Technical architecture:
live-comment collection → data cleaning → AI sentiment analysis → keyword extraction → visual report generation
          ↓                      ↓                    ↓                     ↓                      ↓
  multi-session coverage   dedup & formatting   polarity detection    topic clustering     chart-based output
The revolutionary part of this approach: let the machine mine the data for value, and let people focus on creating content!
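Concretely, each comment travels through the pipeline as a plain dict that every stage enriches. The record below is a fabricated illustration, not real data; the fields shown are the ones the later code actually reads:

# Shape of one comment record as it moves through the pipeline (illustrative)
comment_record = {
    # Step 1 (collection) produces:
    "user_id": "u001", "user_name": "测试用户",
    "comment_text": "质地会不会太油", "timestamp": "2024-01-15 20:05:00",
    "likes_count": 12, "user_level": "活跃用户", "is_author_replied": False,
    # Step 2 (preprocessing) adds:
    "clean_text": "质地会不会太油", "keywords": ["质地"],
    "word_count": 3, "contains_question": True, "contains_emotion": "neutral",
    # Step 3 (analysis) adds:
    "sentiment_score": 0.31, "sentiment_label": "negative", "topic_id": 2,
}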
三、Implementation: Building the Comment-Analysis Pipeline Step by Step
We will use 影刀RPA to build the complete live-comment analysis flow, combined with AI techniques for deeper insight.
Step 1: Automated Collection of Live-Stream Comments
Log into 抖店 automatically and extract the comment data for a live session.
# Pseudocode: live-comment collection module.
# browser, env, log, and delay are assumed 影刀RPA built-ins; page selectors are illustrative.
from datetime import datetime, timedelta

class LiveCommentCollector:
    """Collects comments for a given live session from the 抖店 back end."""

    def __init__(self):
        self.collection_config = self.load_collection_config()

    def load_collection_config(self):
        """Load collection settings."""
        return {
            "max_comments_per_live": 5000,  # collect at most 5000 comments per session
            "scroll_interval": 2,           # wait 2 seconds between scrolls
            "retry_times": 3
        }

    def collect_live_comments(self, live_session_id):
        """Collect the comment data for the given live session."""
        # Log into the 抖店 back end
        browser.launch("chrome", "https://compass.jinritemai.com/login")
        browser.input_text("#username", env.get("douyin_username"))
        browser.input_text("#password", env.get("douyin_password"))
        browser.click(".login-btn")
        browser.wait_for_element(".dashboard", timeout=10)

        # Navigate to the live-stream management page
        browser.click("直播")
        browser.click("直播管理")
        browser.wait_for_element(".live-management", timeout=5)

        # Locate the requested live session
        self.search_live_session(live_session_id)

        # Open the comment detail panel
        browser.click(".comment-detail-btn")
        browser.wait_for_element(".comment-panel", timeout=5)

        comments_data = {
            "live_session_id": live_session_id,
            "collection_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "total_comments": 0,
            "comments": []
        }
        collected_comments = set()  # used for de-duplication
        scroll_attempts = 0
        max_scroll_attempts = 50

        while scroll_attempts < max_scroll_attempts:
            # Extract the comments currently rendered on the page
            current_comments = self.extract_page_comments()
            new_comments = 0
            for comment in current_comments:
                comment_id = f"{comment['user_id']}_{comment['timestamp']}"
                if comment_id not in collected_comments:
                    comments_data["comments"].append(comment)
                    collected_comments.add(comment_id)
                    new_comments += 1
            comments_data["total_comments"] = len(collected_comments)

            # Stop once the collection cap is reached
            if len(collected_comments) >= self.collection_config["max_comments_per_live"]:
                log.info(f"Collection cap reached: {self.collection_config['max_comments_per_live']} comments")
                break

            # Scroll to lazy-load more comments
            if not self.scroll_for_more_comments():
                log.info("No more comments to load")
                break
            scroll_attempts += 1
            delay(self.collection_config["scroll_interval"])

        log.info(f"Collected {comments_data['total_comments']} live comments")
        return comments_data

    def search_live_session(self, live_session_id):
        """Search for a live session by ID."""
        search_box = browser.find_element(".live-search-input")
        search_box.clear()
        search_box.send_keys(live_session_id)
        browser.click(".search-btn")
        browser.wait_for_element(".search-results", timeout=5)
        # Verify the search actually found the session
        if not browser.is_element_exist(f"//*[contains(text(), '{live_session_id}')]"):
            raise Exception(f"Live session not found: {live_session_id}")

    def extract_page_comments(self):
        """Extract the comments currently visible on the page."""
        comments = []
        comment_elements = browser.find_elements(".comment-item")
        for element in comment_elements:
            try:
                comment_data = {
                    "user_id": self.extract_user_id(element),
                    "user_name": self.extract_user_name(element),
                    "comment_text": self.extract_comment_text(element),
                    "timestamp": self.extract_timestamp(element),
                    "likes_count": self.extract_likes_count(element),
                    "user_level": self.extract_user_level(element),
                    "is_author_replied": self.check_author_reply(element)
                }
                # Keep only comments with non-empty text
                if comment_data["comment_text"] and len(comment_data["comment_text"].strip()) > 0:
                    comments.append(comment_data)
            except Exception as e:
                log.warning(f"Failed to extract comment: {str(e)}")
                continue
        return comments

    def extract_user_id(self, comment_element):
        """Extract the user ID."""
        try:
            user_link = comment_element.find_element(".user-link")
            return user_link.get_attribute("data-userid")
        except Exception:
            return "unknown"

    def extract_user_name(self, comment_element):
        """Extract the user name."""
        try:
            return comment_element.find_element(".user-name").text
        except Exception:
            return "匿名用户"

    def extract_comment_text(self, comment_element):
        """Extract the comment text."""
        try:
            return comment_element.find_element(".comment-text").text.strip()
        except Exception:
            return ""

    def extract_timestamp(self, comment_element):
        """Extract and normalize the comment timestamp."""
        try:
            time_text = comment_element.find_element(".comment-time").text
            return self.parse_timestamp(time_text)
        except Exception:
            return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def extract_likes_count(self, comment_element):
        """Extract the like count."""
        try:
            likes_text = comment_element.find_element(".comment-likes").text
            return int(likes_text) if likes_text.isdigit() else 0
        except Exception:
            return 0

    def extract_user_level(self, comment_element):
        """Extract the user-level label."""
        try:
            level_element = comment_element.find_element(".user-level")
            level_class = level_element.get_attribute("class")
            if "level-high" in level_class:
                return "高价值用户"
            elif "level-medium" in level_class:
                return "活跃用户"
            else:
                return "普通用户"
        except Exception:
            return "普通用户"

    def check_author_reply(self, comment_element):
        """Check whether the streamer has replied to this comment."""
        return browser.is_element_exist(".author-reply", context=comment_element)

    def parse_timestamp(self, time_text):
        """Parse relative time text such as 刚刚, X分钟前, X小时前, 昨天."""
        if "刚刚" in time_text:
            return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        elif "分钟前" in time_text:
            minutes = int(time_text.replace("分钟前", ""))
            past_time = datetime.now() - timedelta(minutes=minutes)
            return past_time.strftime("%Y-%m-%d %H:%M:%S")
        elif "小时前" in time_text:
            hours = int(time_text.replace("小时前", ""))
            past_time = datetime.now() - timedelta(hours=hours)
            return past_time.strftime("%Y-%m-%d %H:%M:%S")
        elif "昨天" in time_text:
            # e.g. "昨天 20:15": substitute yesterday's date, keep the clock time
            yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
            return f"{yesterday} {time_text.replace('昨天', '').strip()}".strip()
        else:
            # Absolute time: return as-is
            return time_text

    def scroll_for_more_comments(self):
        """Scroll down to trigger lazy-loading of more comments."""
        try:
            # Scroll to the bottom via JavaScript
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            delay(1)
            # Check whether new comments were loaded
            current_count = len(browser.find_elements(".comment-item"))
            delay(2)  # allow time for lazy loading
            new_count = len(browser.find_elements(".comment-item"))
            return new_count > current_count
        except Exception as e:
            log.warning(f"Scroll-loading failed: {str(e)}")
            return False

# Initialize the collector
comment_collector = LiveCommentCollector()
# Collect comments (assuming the live session ID is known)
live_comments_data = comment_collector.collect_live_comments("live_20240115_200000")
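Before handing the data to the cleaning step, it can be worth persisting the raw result so a failed downstream step never forces a re-crawl. A minimal sketch; the save_raw_comments helper and the comment_dumps directory are my own additions, not part of 影刀RPA:

# Hypothetical helper: dump the raw collection result to a JSON file
import json
import os
from datetime import datetime

def save_raw_comments(comments_data, output_dir="comment_dumps"):
    """Write the raw comment payload to a timestamped JSON file and return its path."""
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{comments_data['live_session_id']}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    path = os.path.join(output_dir, filename)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(comments_data, f, ensure_ascii=False, indent=2)
    return path

raw_dump_path = save_raw_comments(live_comments_data)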
Step 2: Comment Cleaning and Preprocessing
Clean and normalize the raw comment data.
# Pseudocode: data cleaning and preprocessing module (log is the assumed 影刀RPA logger, as above)
import re
from collections import Counter
from datetime import datetime

import jieba
import jieba.analyse

class CommentPreprocessor:
    """Cleans and normalizes raw comment records."""

    def __init__(self):
        self.stop_words = self.load_stop_words()
        # Keep word characters and whitespace; the CJK range is listed explicitly for clarity,
        # although \w already matches it under Python 3's Unicode semantics
        self.special_char_pattern = re.compile(r'[^\w\s\u4e00-\u9fff]')
        jieba.initialize()

    def load_stop_words(self):
        """Load the stop-word list."""
        stop_words = set()
        # General stop words
        base_stop_words = ["的", "了", "在", "是", "我", "你", "他", "她", "它", "这", "那"]
        stop_words.update(base_stop_words)
        # Stop words common in live-stream chat
        live_stop_words = ["主播", "宝宝", "亲", "小姐姐", "小哥哥", "哈哈哈", "呵呵", "啊啊"]
        stop_words.update(live_stop_words)
        return stop_words

    def preprocess_comments(self, raw_comments_data):
        """Preprocess the raw comment data."""
        processed_data = raw_comments_data.copy()
        processed_comments = []
        for comment in raw_comments_data["comments"]:
            processed_comment = self.clean_single_comment(comment)
            if processed_comment:  # keep valid comments only
                processed_comments.append(processed_comment)
        processed_data["comments"] = processed_comments
        processed_data["valid_comments"] = len(processed_comments)
        processed_data["preprocess_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log.info(f"Preprocessing done: {raw_comments_data['total_comments']} raw, {len(processed_comments)} valid")
        return processed_data

    def clean_single_comment(self, comment):
        """Clean a single comment record."""
        try:
            # Clean the text
            clean_text = self.clean_text(comment["comment_text"])
            if not clean_text or len(clean_text) < 2:  # drop empty or overly short comments
                return None
            # Tokenize
            words = self.tokenize_text(clean_text)
            # Extract keywords
            keywords = self.extract_keywords(clean_text)
            processed_comment = {
                **comment,
                "clean_text": clean_text,
                "word_tokens": words,
                "keywords": keywords,
                "word_count": len(words),
                "contains_question": self.contains_question(clean_text),
                "contains_emotion": self.contains_emotion(words)
            }
            return processed_comment
        except Exception as e:
            log.warning(f"Failed to clean comment: {str(e)}")
            return None

    def clean_text(self, text):
        """Clean comment text."""
        if not text:
            return ""
        # Remove URLs and bracketed emoji tags first, while their punctuation is still intact
        clean_text = re.sub(r'http\S+', '', text)
        clean_text = re.sub(r'\[.*?\]', '', clean_text)
        # Then strip special characters, keeping Chinese, letters, and digits
        clean_text = self.special_char_pattern.sub(' ', clean_text)
        # Collapse extra whitespace
        clean_text = re.sub(r'\s+', ' ', clean_text).strip()
        return clean_text

    def tokenize_text(self, text):
        """Tokenize the text."""
        words = jieba.cut(text)
        # Filter out stop words and single characters
        filtered_words = [
            word for word in words
            if (word not in self.stop_words and
                len(word) > 1 and
                not word.isspace())
        ]
        return filtered_words

    def extract_keywords(self, text, top_k=5):
        """Extract keywords via jieba's TF-IDF tagger."""
        try:
            keywords = jieba.analyse.extract_tags(
                text,
                topK=top_k,
                withWeight=False,
                allowPOS=('n', 'vn', 'v', 'a')  # nouns, verbal nouns, verbs, adjectives only
            )
            return keywords
        except Exception:
            return []

    def contains_question(self, text):
        """Check whether the comment asks a question."""
        question_words = ["吗", "怎么", "如何", "为什么", "什么", "哪", "多少", "什么时候"]
        return any(word in text for word in question_words)

    def contains_emotion(self, words):
        """Classify the comment by a simple emotion-word count."""
        positive_words = ["喜欢", "好看", "漂亮", "不错", "棒", "赞", "好用", "划算"]
        negative_words = ["不好", "差", "贵", "坑", "假", "失望", "垃圾", "骗人"]
        pos_count = sum(1 for word in words if word in positive_words)
        neg_count = sum(1 for word in words if word in negative_words)
        if pos_count > neg_count:
            return "positive"
        elif neg_count > pos_count:
            return "negative"
        else:
            return "neutral"

# Initialize the preprocessor
comment_preprocessor = CommentPreprocessor()
# Run preprocessing
processed_comments_data = comment_preprocessor.preprocess_comments(live_comments_data)
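To sanity-check the cleaning rules, it helps to run one fabricated comment through clean_single_comment. The sample record and the expected values in the comments below are illustrative, not real output:

# Illustrative check on a single fabricated comment record
sample = {
    "user_id": "u001", "user_name": "测试用户",
    "comment_text": "这个口红颜色好看吗?质地会不会太油 [玫瑰]",
    "timestamp": "2024-01-15 20:05:00", "likes_count": 12,
    "user_level": "活跃用户", "is_author_replied": False
}
cleaned = comment_preprocessor.clean_single_comment(sample)
print(cleaned["clean_text"])         # emoji tag and punctuation stripped
print(cleaned["keywords"])           # jieba tags, e.g. 口红 / 颜色 / 质地
print(cleaned["contains_question"])  # True: the text contains "吗"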
Step 3: AI Sentiment Analysis and Topic Mining
Use NLP to analyze comment sentiment and topics in depth.
# Pseudocode: AI sentiment analysis and topic mining
from collections import Counter
from datetime import datetime

import numpy as np
from snownlp import SnowNLP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

class CommentAnalyzer:
    """Runs sentiment, topic, and user-behavior analysis over cleaned comments."""

    def __init__(self):
        self.sentiment_analyzer = SnowNLP
        # Reuse the stop-word list from the Step 2 preprocessor instance
        self.vectorizer = TfidfVectorizer(max_features=1000, stop_words=list(comment_preprocessor.stop_words))

    def analyze_comments(self, processed_data):
        """Analyze the preprocessed comment data."""
        analysis_result = {
            "analysis_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "sentiment_analysis": {},
            "topic_analysis": {},
            "user_behavior_analysis": {},
            "key_insights": []
        }
        comments = processed_data["comments"]

        # Sentiment analysis
        sentiment_result = self.analyze_sentiment(comments)
        analysis_result["sentiment_analysis"] = sentiment_result

        # Topic analysis
        topic_result = self.analyze_topics(comments)
        analysis_result["topic_analysis"] = topic_result

        # User-behavior analysis
        user_behavior_result = self.analyze_user_behavior(comments)
        analysis_result["user_behavior_analysis"] = user_behavior_result

        # Key insight extraction
        key_insights = self.extract_key_insights(sentiment_result, topic_result, user_behavior_result)
        analysis_result["key_insights"] = key_insights
        return analysis_result

    def analyze_sentiment(self, comments):
        """Sentiment analysis via SnowNLP."""
        sentiment_scores = []
        sentiment_distribution = {"positive": 0, "neutral": 0, "negative": 0}
        high_like_sentiments = []
        for comment in comments:
            try:
                # SnowNLP returns a positivity probability in [0, 1]
                s = self.sentiment_analyzer(comment["clean_text"])
                sentiment_score = s.sentiments
                sentiment_scores.append(sentiment_score)

                # Bucket into positive / neutral / negative
                if sentiment_score > 0.6:
                    sentiment_distribution["positive"] += 1
                    sentiment_label = "positive"
                elif sentiment_score < 0.4:
                    sentiment_distribution["negative"] += 1
                    sentiment_label = "negative"
                else:
                    sentiment_distribution["neutral"] += 1
                    sentiment_label = "neutral"

                # Track the sentiment of highly liked comments
                if comment["likes_count"] > 10:  # more than 10 likes
                    high_like_sentiments.append({
                        "text": comment["clean_text"],
                        "sentiment": sentiment_label,
                        "score": sentiment_score,
                        "likes": comment["likes_count"]
                    })

                # Write the result back onto the comment record
                comment["sentiment_score"] = sentiment_score
                comment["sentiment_label"] = sentiment_label
            except Exception as e:
                log.warning(f"Sentiment analysis failed: {str(e)}")
                continue
        return {
            "average_sentiment": np.mean(sentiment_scores) if sentiment_scores else 0,
            "sentiment_distribution": sentiment_distribution,
            "sentiment_scores": sentiment_scores,
            "high_like_sentiments": sorted(high_like_sentiments, key=lambda x: x["likes"], reverse=True)[:10],
            "positive_ratio": sentiment_distribution["positive"] / len(comments) if comments else 0
        }

    def analyze_topics(self, comments, n_clusters=5):
        """Topic analysis via TF-IDF + K-means."""
        if len(comments) < n_clusters:
            n_clusters = max(1, len(comments) // 2)
        # Preprocessing guarantees non-empty clean_text, so using all comments
        # keeps cluster indices aligned with the comment list
        texts = [comment["clean_text"] for comment in comments]
        if not texts:
            return {"topics": [], "clustering_success": False}
        try:
            # TF-IDF vectorization
            tfidf_matrix = self.vectorizer.fit_transform(texts)
            # K-means clustering (n_init pinned for cross-version consistency)
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(tfidf_matrix)
            # Extract the top keywords of each cluster
            feature_names = self.vectorizer.get_feature_names_out()
            topics = []
            for i in range(n_clusters):
                # Keywords closest to the cluster center
                center = kmeans.cluster_centers_[i]
                top_keyword_indices = center.argsort()[-10:][::-1]  # top 10 keywords
                topic_keywords = [feature_names[idx] for idx in top_keyword_indices]
                # Representative comments for this topic
                cluster_comments = [comments[j] for j in range(len(comments)) if clusters[j] == i]
                representative_comments = self.get_representative_comments(cluster_comments, topic_keywords)
                topics.append({
                    "topic_id": i,
                    "keywords": topic_keywords[:5],  # keep the top 5 keywords
                    "comment_count": len(cluster_comments),
                    "representative_comments": representative_comments[:3],  # top 3 representative comments
                    "avg_sentiment": np.mean([c.get("sentiment_score", 0.5) for c in cluster_comments])
                })
            # Assign a topic to each comment
            for i, comment in enumerate(comments):
                comment["topic_id"] = int(clusters[i])
            return {
                "topics": topics,
                "clustering_success": True,
                "topic_distribution": dict(Counter(int(c) for c in clusters))
            }
        except Exception as e:
            log.error(f"Topic analysis failed: {str(e)}")
            return {"topics": [], "clustering_success": False}

    def get_representative_comments(self, cluster_comments, topic_keywords):
        """Pick representative comments for a topic."""
        scored_comments = []
        for comment in cluster_comments:
            # How many topic keywords appear in the comment
            keyword_match = sum(1 for keyword in topic_keywords if keyword in comment["clean_text"])
            # Blend keyword match with like count
            score = keyword_match * 0.6 + min(comment["likes_count"] / 10, 1) * 0.4
            scored_comments.append((score, comment))
        # Sort by score, best first
        scored_comments.sort(key=lambda x: x[0], reverse=True)
        return [comment for score, comment in scored_comments]

    def analyze_user_behavior(self, comments):
        """User-behavior analysis."""
        # Activity: comments per user
        user_comment_count = Counter(comment["user_id"] for comment in comments)
        active_users = [user_id for user_id, count in user_comment_count.items() if count >= 3]
        # When comments were posted
        time_distribution = self.analyze_time_distribution(comments)
        # What kinds of questions were asked
        question_analysis = self.analyze_questions(comments)
        return {
            "total_unique_users": len(user_comment_count),
            "active_users_count": len(active_users),
            "user_comment_distribution": dict(user_comment_count.most_common(10)),  # top 10 most active users
            "time_distribution": time_distribution,
            "question_analysis": question_analysis,
            "avg_comments_per_user": len(comments) / len(user_comment_count) if user_comment_count else 0
        }

    def analyze_time_distribution(self, comments):
        """Hourly distribution of comments."""
        hourly_distribution = Counter()
        for comment in comments:
            try:
                comment_time = datetime.strptime(comment["timestamp"], "%Y-%m-%d %H:%M:%S")
                hourly_distribution[comment_time.hour] += 1
            except (ValueError, KeyError):
                continue
        return dict(hourly_distribution)

    def analyze_questions(self, comments):
        """Categorize question-type comments."""
        question_types = {
            "product_questions": 0,    # about the product itself
            "price_questions": 0,      # about price
            "logistics_questions": 0,  # about shipping
            "usage_questions": 0,      # about how to use it
            "other_questions": 0       # everything else
        }
        question_keywords = {
            "product_questions": ["质量", "材质", "颜色", "尺寸", "效果", "适合"],
            "price_questions": ["多少钱", "价格", "优惠", "打折", "便宜", "贵"],
            "logistics_questions": ["发货", "快递", "物流", "多久到", "包邮"],
            "usage_questions": ["怎么用", "使用方法", "步骤", "教程", "注意事项"]
        }
        question_comments = [c for c in comments if c["contains_question"]]
        for comment in question_comments:
            text = comment["clean_text"]
            question_found = False
            for q_type, keywords in question_keywords.items():
                if any(keyword in text for keyword in keywords):
                    question_types[q_type] += 1
                    question_found = True
                    break
            if not question_found:
                question_types["other_questions"] += 1
        return question_types

    def extract_key_insights(self, sentiment_result, topic_result, user_behavior_result):
        """Turn the analysis results into plain-language insights."""
        insights = []
        # From sentiment
        positive_ratio = sentiment_result["positive_ratio"]
        if positive_ratio > 0.8:
            insights.append("观众情绪积极,直播内容深受欢迎")
        elif positive_ratio < 0.4:
            insights.append("观众情绪偏消极,需要优化直播内容或产品")
        # From topics
        if topic_result["clustering_success"] and topic_result["topics"]:
            main_topic = max(topic_result["topics"], key=lambda x: x["comment_count"])
            insights.append(f"观众最关注的话题: {', '.join(main_topic['keywords'][:3])}")
        # From user behavior
        if user_behavior_result["active_users_count"] > 10:
            insights.append(f"有{user_behavior_result['active_users_count']}位活跃用户持续互动")
        # From question types
        questions = user_behavior_result["question_analysis"]
        if questions["product_questions"] > questions["price_questions"]:
            insights.append("观众更关注产品本身而非价格,可加强产品介绍")
        elif questions["price_questions"] > questions["product_questions"]:
            insights.append("观众对价格敏感,可考虑优化价格策略")
        return insights

# Initialize the analyzer
comment_analyzer = CommentAnalyzer()
# Run the deep analysis
analysis_result = comment_analyzer.analyze_comments(processed_comments_data)
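Before building the full report, a quick console summary is often enough to decide whether the stream needs attention. This snippet only reads fields that analyze_comments already returns:

# Console summary of the analysis result
sentiment = analysis_result["sentiment_analysis"]
print(f"平均情感得分: {sentiment['average_sentiment']:.2f}")
print(f"积极评论占比: {sentiment['positive_ratio'] * 100:.1f}%")
print(f"独立用户数: {analysis_result['user_behavior_analysis']['total_unique_users']}")
for insight in analysis_result["key_insights"]:
    print(f"洞察: {insight}")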
Step 4: Automatic Generation of the Visual Report
Automatically generate a professional analysis report for the session's comments.
# Pseudocode: report-generation module
import re
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

class ReportGenerator:
    """Turns the analysis results into HTML/Excel deliverables."""

    def __init__(self):
        self.report_templates = self.load_report_templates()

    def load_report_templates(self):
        """Load report templates."""
        return {
            "executive_summary": {
                "sections": ["总体概览", "核心发现", "改进建议"]
            },
            "detailed_analysis": {
                "sections": ["情感分析", "主题分析", "用户行为", "关键词云"]
            }
        }

    def generate_comprehensive_report(self, processed_data, analysis_result, output_formats=("html", "excel")):
        """Generate the report in the requested formats."""
        generated_files = {}
        # Configure matplotlib for Chinese labels
        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese characters
        plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly
        for format_type in output_formats:
            if format_type == "html":
                file_path = self.generate_html_report(processed_data, analysis_result)
            elif format_type == "excel":
                file_path = self.generate_excel_report(processed_data, analysis_result)
            elif format_type == "pdf":
                file_path = self.generate_pdf_report(processed_data, analysis_result)
            else:
                continue
            if file_path:
                generated_files[format_type] = file_path
        return generated_files

    def generate_html_report(self, processed_data, analysis_result):
        """Generate the HTML report."""
        try:
            report_content = self.create_html_content(processed_data, analysis_result)
            # Render the charts
            charts = self.generate_charts(processed_data, analysis_result)
            # Embed the charts into the HTML
            for chart_name, chart_path in charts.items():
                report_content = report_content.replace(f"{{{chart_name}}}", f'<img src="{chart_path}" alt="{chart_name}">')
            # Drop placeholders for any chart that could not be generated
            report_content = re.sub(r'\{\w+_chart\}', '', report_content)
            filename = f"直播评论分析报告_{datetime.now().strftime('%Y%m%d_%H%M')}.html"
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(report_content)
            log.info(f"HTML report written: {filename}")
            return filename
        except Exception as e:
            log.error(f"Failed to generate HTML report: {str(e)}")
            return None

    def create_html_content(self, processed_data, analysis_result):
        """Build the HTML body."""
        sentiment = analysis_result["sentiment_analysis"]
        topics = analysis_result["topic_analysis"]
        user_behavior = analysis_result["user_behavior_analysis"]
        # Guard against an empty time distribution
        peak_hour = max(user_behavior['time_distribution'].items(), key=lambda x: x[1], default=("未知", 0))[0]
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="UTF-8">
            <title>直播评论分析报告</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 40px; }}
                .header {{ text-align: center; border-bottom: 2px solid #333; padding-bottom: 20px; }}
                .section {{ margin: 30px 0; }}
                .insight {{ background: #f0f8ff; padding: 15px; border-left: 4px solid #007acc; margin: 10px 0; }}
                .chart {{ text-align: center; margin: 20px 0; }}
                table {{ width: 100%; border-collapse: collapse; }}
                th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
                th {{ background-color: #f2f2f2; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>🎯 直播评论分析报告</h1>
                <p>生成时间: {analysis_result['analysis_time']} | 分析评论数: {processed_data['valid_comments']}</p>
            </div>
            <div class="section">
                <h2>📊 执行摘要</h2>
                <div class="insight">
                    <h3>核心指标</h3>
                    <p>• 平均情感得分: <strong>{sentiment['average_sentiment']:.2f}</strong></p>
                    <p>• 积极评论比例: <strong>{sentiment['positive_ratio']*100:.1f}%</strong></p>
                    <p>• 独立用户数: <strong>{user_behavior['total_unique_users']}</strong></p>
                </div>
                <div class="insight">
                    <h3>关键发现</h3>
                    {"".join(f"<p>• {insight}</p>" for insight in analysis_result['key_insights'])}
                </div>
            </div>
            <div class="section">
                <h2>😊 情感分析</h2>
                <div class="chart">
                    {{sentiment_chart}}
                </div>
                <p>情感分布: 正面 {sentiment['sentiment_distribution']['positive']}条,
                   中性 {sentiment['sentiment_distribution']['neutral']}条,
                   负面 {sentiment['sentiment_distribution']['negative']}条</p>
            </div>
            <div class="section">
                <h2>🗣️ 主题分析</h2>
                <div class="chart">
                    {{topic_chart}}
                </div>
                <table>
                    <tr><th>主题ID</th><th>关键词</th><th>评论数</th><th>情感倾向</th></tr>
                    {"".join(f"<tr><td>{topic['topic_id']}</td><td>{', '.join(topic['keywords'])}</td><td>{topic['comment_count']}</td><td>{'积极' if topic['avg_sentiment'] > 0.6 else '消极' if topic['avg_sentiment'] < 0.4 else '中性'}</td></tr>"
                             for topic in topics.get('topics', []))}
                </table>
            </div>
            <div class="section">
                <h2>👥 用户行为分析</h2>
                <div class="chart">
                    {{user_behavior_chart}}
                </div>
                <p>最活跃时段: {peak_hour}时</p>
            </div>
            <div class="section">
                <h2>🔍 关键词云</h2>
                <div class="chart">
                    {{wordcloud_chart}}
                </div>
            </div>
        </body>
        </html>
        """
        return html_content

    def generate_charts(self, processed_data, analysis_result):
        """Render the report charts as PNG files."""
        charts = {}

        # Sentiment distribution pie chart
        sentiment_dist = analysis_result["sentiment_analysis"]["sentiment_distribution"]
        plt.figure(figsize=(8, 6))
        plt.pie(sentiment_dist.values(), labels=sentiment_dist.keys(), autopct='%1.1f%%', startangle=90)
        plt.title('情感分布')
        sentiment_chart_path = "sentiment_chart.png"
        plt.savefig(sentiment_chart_path, bbox_inches='tight', dpi=300)
        plt.close()
        charts["sentiment_chart"] = sentiment_chart_path

        # Topic distribution bar chart
        if analysis_result["topic_analysis"]["clustering_success"]:
            topics = analysis_result["topic_analysis"]["topics"]
            topic_names = [f"主题{i}" for i in range(len(topics))]
            comment_counts = [topic["comment_count"] for topic in topics]
            plt.figure(figsize=(10, 6))
            plt.bar(topic_names, comment_counts)
            plt.title('主题分布')
            plt.xticks(rotation=45)
            topic_chart_path = "topic_chart.png"
            plt.savefig(topic_chart_path, bbox_inches='tight', dpi=300)
            plt.close()
            charts["topic_chart"] = topic_chart_path

        # Hourly comment-count line chart
        time_dist = analysis_result["user_behavior_analysis"]["time_distribution"]
        if time_dist:
            hours = list(range(24))
            counts = [time_dist.get(hour, 0) for hour in hours]
            plt.figure(figsize=(12, 6))
            plt.plot(hours, counts, marker='o')
            plt.title('评论时间分布')
            plt.xlabel('小时')
            plt.ylabel('评论数')
            user_behavior_chart_path = "user_behavior_chart.png"
            plt.savefig(user_behavior_chart_path, bbox_inches='tight', dpi=300)
            plt.close()
            charts["user_behavior_chart"] = user_behavior_chart_path

        # Keyword cloud built from the extracted keywords
        # (font_path assumes a local SimHei font file for Chinese; adjust to your environment)
        all_keywords = " ".join(" ".join(c.get("keywords", [])) for c in processed_data["comments"])
        if all_keywords.strip():
            wordcloud = WordCloud(font_path="simhei.ttf", width=800, height=400,
                                  background_color="white").generate(all_keywords)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            wordcloud_chart_path = "wordcloud_chart.png"
            plt.savefig(wordcloud_chart_path, bbox_inches='tight', dpi=300)
            plt.close()
            charts["wordcloud_chart"] = wordcloud_chart_path

        return charts

    def generate_excel_report(self, processed_data, analysis_result):
        """Minimal Excel export sketch: a summary sheet plus the cleaned comments."""
        try:
            filename = f"直播评论分析报告_{datetime.now().strftime('%Y%m%d_%H%M')}.xlsx"
            comments_df = pd.DataFrame(processed_data["comments"])
            summary_df = pd.DataFrame({
                "指标": ["平均情感得分", "积极评论比例", "独立用户数"],
                "数值": [
                    analysis_result["sentiment_analysis"]["average_sentiment"],
                    analysis_result["sentiment_analysis"]["positive_ratio"],
                    analysis_result["user_behavior_analysis"]["total_unique_users"]
                ]
            })
            with pd.ExcelWriter(filename) as writer:
                summary_df.to_excel(writer, sheet_name="摘要", index=False)
                comments_df.to_excel(writer, sheet_name="评论明细", index=False)
            log.info(f"Excel report written: {filename}")
            return filename
        except Exception as e:
            log.error(f"Failed to generate Excel report: {str(e)}")
            return None

    def generate_pdf_report(self, processed_data, analysis_result):
        """Placeholder: PDF export is not implemented in this sketch."""
        log.warning("PDF export not implemented")
        return None

# Initialize the report generator and produce the deliverables
report_generator = ReportGenerator()
generated_files = report_generator.generate_comprehensive_report(processed_comments_data, analysis_result)
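Putting the four steps together: a minimal end-to-end sketch that reuses the instances created above. The run_comment_analysis_pipeline wrapper is my own addition for illustration, not part of 影刀RPA:

# Hypothetical wrapper chaining Steps 1-4 for one live session
def run_comment_analysis_pipeline(live_session_id):
    """Collect -> preprocess -> analyze -> report for a single live session."""
    raw = comment_collector.collect_live_comments(live_session_id)
    processed = comment_preprocessor.preprocess_comments(raw)
    analysis = comment_analyzer.analyze_comments(processed)
    return report_generator.generate_comprehensive_report(processed, analysis)

# Example: run right after a stream ends
reports = run_comment_analysis_pipeline("live_20240115_200000")
log.info(f"Generated report files: {reports}")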