# Setup: save the government work report as report.txt; prepare a Chinese
# font file and a Chinese stopword list (stopwords.txt).
import jieba
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
# Word-frequency analysis of report.txt plus word-cloud rendering.


def load_stopwords(path='stopwords.txt'):
    """Load one stopword per line from *path* and return them as a set."""
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f}


def is_chinese_word(word):
    """Return True if every character of *word* is a CJK Unified Ideograph.

    The previous lexicographic test (``'\\u4e00' <= word <= '\\u9fff'``) only
    effectively constrained the first character, so mixed tokens such as
    '中国2' slipped through; checking each character closes that gap.
    """
    return all('\u4e00' <= ch <= '\u9fff' for ch in word)


def filter_words(words, stopwords):
    """Keep words that are multi-character, not stopwords, and fully Chinese.

    Args:
        words: iterable of segmented tokens.
        stopwords: set of words to discard.

    Returns:
        list of the surviving tokens, in original order.
    """
    return [
        w for w in words
        if len(w) > 1 and w not in stopwords and is_chinese_word(w)
    ]


def main():
    """Read report.txt, print the top-20 word frequencies, save and show a word cloud."""
    # 1. Read the source text (assumed UTF-8).
    with open('report.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # 2. Chinese word segmentation.
    words = jieba.lcut(text)

    # 3. Load the stopword list (must be prepared/downloaded in advance).
    stopwords = load_stopwords('stopwords.txt')

    # 4. Drop stopwords, single characters, and non-Chinese tokens.
    filtered_words = filter_words(words, stopwords)

    # 5. Frequency statistics; ask for exactly the 20 entries we print
    #    (the original computed the top 50 and discarded 30 of them).
    word_counts = Counter(filtered_words)
    print("高频词汇Top20:")
    for word, count in word_counts.most_common(20):
        print(f"{word}: {count}")

    # 6. Build the word cloud; font_path must point to a font with CJK glyphs,
    #    otherwise Chinese renders as empty boxes.
    wc = WordCloud(
        font_path='msyh.ttc',
        background_color='white',
        max_words=200,
        width=800,
        height=600,
    )
    wc.generate_from_frequencies(word_counts)

    # Save BEFORE plt.show(): show() blocks, and if the window/process is
    # killed the PNG would otherwise never be written.
    wc.to_file('wordcloud.png')

    # Display the cloud.
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    main()