坑爹的word

写 Word 文档时,一定要不停地保存;但使用 Ctrl+Z 时一定要慎重,极有可能把你之前的工作撤销。
import os
import re
from collections import defaultdict

# ========================
# File paths
# ========================
SEGMENTED_DANMU_FILE = "segmented_danmu.txt"        # pre-segmented danmu: one comment per line, tokens separated by spaces
SENTIMENT_DICT_FILE = "merged_sentiment_dict.txt"   # sentiment lexicon: word<TAB>POS|NEG
OUTPUT_FILE = "extracted_sentiment_words.txt"       # detailed extraction output

# ========================
# Sample Bilibili custom dictionary (polarity lexicon)
# ========================
"""
# Bilibili danmu custom dictionary (polarity lexicon)
# Format: word\tpolarity (POS = positive, NEG = negative)

# Positive sentiment words
yyds\tPOS
泪目\tPOS
绝绝子\tPOS
排面\tPOS
awsl\tPOS
宝藏\tPOS
种草\tPOS
神仙\tPOS
吹爆\tPOS
慕了\tPOS
爱了\tPOS
稳了\tPOS
冲冲冲\tPOS
好活\tPOS
排面\tPOS
三连\tPOS

# Negative sentiment words
下头\tNEG
栓Q\tNEG
芭比Q\tNEG
避雷\tNEG
拔草\tNEG
踩坑\tNEG
劝退\tNEG
翻车\tNEG
坑爹\tNEG
无语\tNEG
离谱\tNEG
裂开\tNEG
寄了\tNEG
抬走\tNEG
摆烂\tNEG

# Emoji
😂\tPOS
🤣\tPOS
😭\tPOS
😍\tPOS
👍\tPOS
🤮\tNEG
👎\tNEG
💔\tNEG
😤\tNEG
"""


# ========================
# Dictionary loading
# ========================
def load_sentiment_dict(file_path):
    """Load a sentiment lexicon.

    File format: one entry per line, ``word<TAB>polarity`` (e.g. ``高兴\tPOS``).
    Lines that are empty or start with ``#`` (comments) are skipped.
    Only POS / NEG entries are kept; any other polarity tag is reported and ignored.

    Args:
        file_path: path to the UTF-8 lexicon file.

    Returns:
        dict mapping word -> "POS" or "NEG".
    """
    sentiment_dict = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            # Skip blank lines and comment lines (strip first so that an
            # indented "#" comment is also recognized as a comment).
            if not stripped or stripped.startswith("#"):
                continue
            parts = stripped.split("\t")
            if len(parts) >= 2:
                word = parts[0].strip()
                polarity = parts[1].strip().upper()  # normalize to upper case
                # Keep only POS / NEG polarities
                if polarity in ["POS", "NEG"]:
                    sentiment_dict[word] = polarity
                else:
                    print(f"警告: 忽略无效极性 '{polarity}' 对于词 '{word}'")
    print(f"加载情感词典: 共 {len(sentiment_dict)} 个情感词 (仅POS/NEG)")
    return sentiment_dict


# ========================
# Sentiment-word extraction
# ========================
def extract_sentiment_words(danmu_file, sentiment_dict):
    """Extract sentiment words from a pre-segmented danmu file.

    NOTE: matching is exact-token only — a token matches a lexicon entry only
    if it is byte-identical.  If the segmenter emits "哈哈哈哈" while the
    lexicon holds "哈哈哈", that token will NOT be counted (this is why
    entries like "哈哈哈"/"啊啊啊" may never show up in the statistics).

    Args:
        danmu_file: path to the segmented danmu file, one comment per line,
            tokens separated by whitespace.
        sentiment_dict: mapping word -> "POS"/"NEG" (see load_sentiment_dict).

    Returns:
        (results, word_stats) where
        results: list of dicts {"line": int, "text": str,
                 "sentiment_words": [(word, polarity), ...]} — only lines
                 containing at least one sentiment word are included;
        word_stats: dict word -> {"count": int, "polarity": str}.
    """
    results = []
    word_stats = defaultdict(lambda: {"count": 0, "polarity": ""})

    with open(danmu_file, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            cleaned_line = line.strip()
            if not cleaned_line:
                continue
            words = cleaned_line.split()
            danmu_sentiment_words = []
            for word in words:
                # Exact-token lookup in the lexicon
                if word in sentiment_dict:
                    polarity = sentiment_dict[word]
                    danmu_sentiment_words.append((word, polarity))
                    word_stats[word]["count"] += 1
                    word_stats[word]["polarity"] = polarity
            # Record only danmu lines that contain at least one sentiment word
            if danmu_sentiment_words:
                results.append({
                    "line": line_num,
                    "text": cleaned_line,
                    "sentiment_words": danmu_sentiment_words,
                })

    print(f"处理弹幕: 共 {len(results)} 条弹幕包含情感词")
    return results, word_stats


# ========================
# Result saving
# ========================
def save_results(results, word_stats, output_file):
    """Write per-danmu extraction details plus frequency-sorted word stats.

    Args:
        results: per-danmu records from extract_sentiment_words().
        word_stats: word -> {"count", "polarity"} from extract_sentiment_words().
        output_file: path of the UTF-8 report file to write.
    """
    with open(output_file, "w", encoding="utf-8") as f:
        # Section 1: one row per danmu that contained sentiment words
        f.write("弹幕行号\t弹幕内容\t情感词列表\n")
        for item in results:
            # Format each sentiment word as 词[极性]
            word_list = ", ".join(f"{word}[{pol}]" for word, pol in item["sentiment_words"])
            f.write(f"{item['line']}\t{item['text']}\t{word_list}\n")

        # Separator between sections
        f.write("\n" + "=" * 80 + "\n\n")

        # Section 2: per-word statistics, most frequent first
        f.write("情感词统计 (按出现频率排序):\n")
        f.write("词语\t出现次数\t极性\n")
        sorted_words = sorted(word_stats.items(), key=lambda x: x[1]["count"], reverse=True)
        for word, stats in sorted_words:
            f.write(f"{word}\t{stats['count']}\t{stats['polarity']}\n")

    print(f"结果已保存至: {output_file}")


# ========================
# Polarity report
# ========================
def generate_polarity_report(word_stats, report_file="polarity_report.txt"):
    """Generate a polarity analysis report (totals, ratios, TOP20 per polarity).

    Args:
        word_stats: word -> {"count", "polarity"} from extract_sentiment_words().
        report_file: path of the UTF-8 report file to write.
    """
    # Group words by polarity
    pos_words = []
    neg_words = []
    for word, stats in word_stats.items():
        if stats["polarity"] == "POS":
            pos_words.append((word, stats["count"]))
        elif stats["polarity"] == "NEG":
            neg_words.append((word, stats["count"]))

    # Most frequent first
    pos_words.sort(key=lambda x: x[1], reverse=True)
    neg_words.sort(key=lambda x: x[1], reverse=True)

    # Overall polarity proportions
    total_count = sum(stats["count"] for stats in word_stats.values())
    pos_count = sum(count for _, count in pos_words)
    neg_count = sum(count for _, count in neg_words)
    # Guard against an empty word_stats so the report never divides by zero
    pos_pct = pos_count / total_count * 100 if total_count else 0.0
    neg_pct = neg_count / total_count * 100 if total_count else 0.0

    with open(report_file, "w", encoding="utf-8") as f:
        f.write("=== 弹幕情感极性分析报告 ===\n\n")
        f.write(f"总情感词出现次数: {total_count}\n")
        f.write(f"正向情感词: {pos_count}次 ({pos_pct:.1f}%)\n")
        f.write(f"负向情感词: {neg_count}次 ({neg_pct:.1f}%)\n")
        f.write("\n=== 高频正向情感词 TOP20 ===\n")
        for i, (word, count) in enumerate(pos_words[:20], 1):
            f.write(f"{i}. {word}: {count}次\n")
        f.write("\n=== 高频负向情感词 TOP20 ===\n")
        for i, (word, count) in enumerate(neg_words[:20], 1):
            f.write(f"{i}. {word}: {count}次\n")

    print(f"极性分析报告已保存至: {report_file}")


# ========================
# Main
# ========================
def main():
    """Run the full pipeline: load lexicon, extract, save, report, summarize."""
    # 1. Load the sentiment lexicon
    sentiment_dict = load_sentiment_dict(SENTIMENT_DICT_FILE)

    # 2. Extract sentiment words from the segmented danmu
    results, word_stats = extract_sentiment_words(SEGMENTED_DANMU_FILE, sentiment_dict)

    # 3. Save detailed results
    save_results(results, word_stats, OUTPUT_FILE)

    # 4. Generate the polarity report
    generate_polarity_report(word_stats)

    # 5. Print a summary to stdout
    total_words = sum(stats["count"] for stats in word_stats.values())
    unique_words = len(word_stats)
    print("\n=== 摘要统计 ===")
    print(f"总情感词出现次数: {total_words}")
    print(f"独特情感词数量: {unique_words}")

    # Per-polarity totals; guard the percentages against total_words == 0
    pos_count = sum(stats["count"] for stats in word_stats.values() if stats["polarity"] == "POS")
    neg_count = sum(stats["count"] for stats in word_stats.values() if stats["polarity"] == "NEG")
    pos_pct = pos_count / total_words * 100 if total_words else 0.0
    neg_pct = neg_count / total_words * 100 if total_words else 0.0
    print(f"\n正向情感词比例: {pos_count}次 ({pos_pct:.1f}%)")
    print(f"负向情感词比例: {neg_count}次 ({neg_pct:.1f}%)")

    # TOP10 most frequent sentiment words
    sorted_words = sorted(word_stats.items(), key=lambda x: x[1]["count"], reverse=True)[:10]
    print("\nTOP10高频情感词:")
    for i, (word, stats) in enumerate(sorted_words, 1):
        print(f"{i}. {word}: {stats['count']}次 ({stats['polarity']})")


# ========================
# Entry point
# ========================
if __name__ == "__main__":
    main()

# NOTE(review) — question from the original post, translated: "I put words like
# 哈哈哈 / 啊啊啊 in my custom dictionary; why are they never counted?"
# Likely answer: matching above is exact-token only, and the word segmenter
# typically emits laughter runs of varying length ("哈哈", "哈哈哈哈", ...),
# so the token never exactly equals the dictionary entry — confirm against
# the segmenter's output and consider prefix/normalized matching if needed.
08-15
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值