Write a program that counts hot-word frequencies in the Two Sessions government work report and generates a word cloud
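The script below relies on jieba for Chinese word segmentation, wordcloud for rendering, and matplotlib for display. If they are not installed yet, a standard pip install covers all three (assuming an ordinary Python environment):

pip install jieba wordcloud matplotlib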

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re


def load_stopwords(filepath):
    """Load the stopword list (with more robust error handling)."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(stopwords)} stopwords")
        return set(stopwords)
    except Exception as e:
        print(f"Failed to load stopword list: {e}; falling back to an empty set")
        return set()


def preprocess_text(text, stopwords):
    """Preprocess the text (safer cleaning logic)."""
    # Keep Chinese characters, common punctuation, and newlines (avoid over-cleaning)
    text = re.sub(r'[^\u4e00-\u9fa5,。、;:!?()【】"\'\n]', '', text)

    # Verify the text survived cleaning
    if not text:
        raise ValueError("Text is empty after cleaning! Check the contents of the source file")

    # Tokenize, then drop stopwords and single-character tokens
    words = jieba.lcut(text)
    valid_words = [word for word in words if word not in stopwords and len(word) > 1]

    print(f"Tokens before filtering: {len(words)}, after filtering: {len(valid_words)}")
    return valid_words


def generate_wordcloud(word_freq, output_file):
    """Generate the word cloud (with added error handling)."""
    if not word_freq:
        raise ValueError("Word-frequency data is empty; cannot generate a word cloud")

    try:
        wc = WordCloud(
            font_path='simhei.ttf',  # a CJK-capable font file must exist at this path
            background_color='white',
            width=1000,
            height=800,
            max_words=200
        )
        wc.generate_from_frequencies(dict(word_freq))

        plt.figure(figsize=(12, 10))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.show()
        print(f"Word cloud saved to {output_file}")
    except Exception as e:
        print(f"Failed to generate word cloud: {e}")


def main():
    # File configuration
    report_file = 'government_report.txt'
    stopwords_file = 'stopwords.txt'

    # 1. Read the report and verify its contents
    try:
        with open(report_file, 'r', encoding='utf-8') as f:
            report_text = f.read()
            print(f"First 100 characters of the file:\n{report_text[:100]}...")  # important: verify the file contents
    except FileNotFoundError:
        print(f"File not found: {report_file}")
        return
    except UnicodeDecodeError:
        print("UTF-8 decoding failed, trying GBK...")
        with open(report_file, 'r', encoding='gbk') as f:
            report_text = f.read()

    # 2. Load the stopword list
    stopwords = load_stopwords(stopwords_file)

    # 3. Preprocess and tokenize
    try:
        words = preprocess_text(report_text, stopwords)
        word_freq = Counter(words).most_common(50)

        if not word_freq:
            raise ValueError("No valid hot words; check whether the stopword list filters too aggressively or the text is malformed")

        # 4. Print the hot words
        print("\nTop 50 hot words:")
        for i, (word, count) in enumerate(word_freq, 1):
            print(f"{i:>2}. {word}: {count}")

        # 5. Generate the word cloud
        generate_wordcloud(word_freq, 'report_wordcloud.png')

    except Exception as e:
        print(f"Processing failed: {e}")
        print("Debugging suggestions:")
        print("1. Check that government_report.txt contains valid Chinese text")
        print("2. Check whether stopwords.txt filters too aggressively (e.g. contains common content words)")
        print("3. Try relaxing the cleaning step (edit the regex in preprocess_text())")


if __name__ == '__main__':
    main()
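One caveat: jieba segments against its default dictionary, so multi-character policy terms may be split into fragments and undercounted. A minimal sketch of protecting such phrases before calling preprocess_text(), using jieba's add_word() API (the terms below are placeholder examples, not an official list):

# Optional: register domain phrases so jieba keeps them as single tokens
for term in ["高质量发展", "乡村振兴", "新质生产力"]:  # example terms; adjust to your report
    jieba.add_word(term)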

Because the stopwords file is empty, the run reports that 0 stopwords were loaded.
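Since an empty stopwords.txt simply yields an empty set, a quick fix is to seed the file with a few common function words. A minimal sketch (the word list below is illustrative; real projects usually use a published Chinese stopword list):

# Hypothetical seed for stopwords.txt
seed = ["的", "了", "和", "是", "在", "我们", "要", "将", "为", "等"]
with open('stopwords.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(seed))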
