简单易学的文本分析1——词云分析

最新推荐文章于 2025-02-23 11:25:46 发布

爱做科研的桶

最新推荐文章于 2025-02-23 11:25:46 发布

阅读量767

点赞数 13

文章标签： python 中文分词 数据分析 自然语言处理

本文链接：https://blog.youkuaiyun.com/llthxx/article/details/141613277

版权

载入库

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
import jieba.posseg as pseg
from collections import Counter
from PIL import Image
from matplotlib import colors

文本读取与分词

从指定的文本文件中读取内容。

使用 jieba.posseg 对文本进行分词，并为每个词标注词性。

# Read the content of the text file.
# Replace this with the path to your own text file.
text_file_path = "gpt4_system2_80%.txt"
# The context manager closes the file handle as soon as the content has been
# read, instead of leaving it open until the object is garbage-collected.
with open(text_file_path, encoding="utf-8") as text_file:
    text = text_file.read()

# Segment the text with jieba, with part-of-speech tagging attached.
words = pseg.cut(text)

词汇筛选与统计

筛选出长度大于等于2，且词性为名词、动词或形容词的词。

统计筛选出的词的频率，提取前200个高频词。

# Keep only words of at least two characters whose part-of-speech tag
# starts with "n", "v" or "a" (nouns, verbs, adjectives).
report_words = []
for token, pos_tag in words:
    if len(token) < 2:
        continue
    if pos_tag.startswith(('n', 'v', 'a')):
        report_words.append(token)

# Count the occurrences and keep the 200 most frequent words.
word_counts = Counter(report_words)
result = word_counts.most_common(200)
# Word -> frequency mapping, used later to build the word cloud.
content = {token: count for token, count in result}

高频词输出

打印输出前100个高频词及其出现频率。

# Print the most frequent words and their counts (at most 100 of them).
print("前100个高频词汇及其频率：")
# Slicing instead of indexing positions 0..99 avoids an IndexError when
# fewer than 100 distinct words were counted.
for word, frequency in result[:100]:
    # Word left-aligned in 10 columns, count right-aligned in 5 columns.
    print(f"{word:<10}{frequency:>5}")

停用词设置

定义并设置停用词，即需要排除在词云之外的词汇。

# Stop words: start from a copy of the STOPWORDS set imported from wordcloud.
stopwords = {*STOPWORDS}
# More stop words can be added as needed, for example:
# stopwords.update(["的", "感谢", "我代表", "以上", "报告", "表示诚挚感谢", "战略"])

词云生成与显示

设置词云生成的字体、颜色映射、最大和最小字体大小等参数。

使用 WordCloud 类根据词频生成词云。

使用 matplotlib 显示生成的词云图。

# Font file used to draw the words; replace with the path to a real font file.
font_path = "simhei.ttf"

# Lower and upper bounds for the font size.
min_font_size, max_font_size = 5, 300

# Colour list and the colormap built from it.
color_list = ['#FF274B']
colormap = colors.ListedColormap(color_list)

def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a blue HSL colour string whose lightness depends on font size.

    Hue (240) and saturation (100%) are fixed. The lightness is 100 minus the
    font size expressed as a truncated percentage of the module-level
    ``max_font_size``, so larger fonts come out darker.

    Only ``font_size`` is used; the other parameters are accepted but ignored.
    """
    size_percent = int(100 * font_size / max_font_size)
    lightness = 100 - size_percent
    return f"hsl(240, 100%, {lightness}%)"

# Build the word cloud.
# WordCloud ignores its `stopwords` argument when the cloud is built with
# generate_from_frequencies(), so the stop words are removed from the
# frequency table explicitly before it is handed over.
wordcloud = WordCloud(
    scale=5,  # sharpness of the output image
    font_path=font_path,  # font file path
    # colormap=colormap,  # enable to use the custom colormap instead
    width=1600,  # width of the output image
    height=900,  # height of the output image
    background_color='white',  # background colour
    stopwords=stopwords,  # stop words (only honoured when generating from raw text)
    max_font_size=max_font_size,  # largest font size
    min_font_size=min_font_size,  # smallest font size
    color_func=blue_color_func  # custom colour function
).generate_from_frequencies(
    {word: count for word, count in content.items() if word not in stopwords}
)

# Display the generated word cloud with matplotlib.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes so only the image is shown
plt.show()  # show the word cloud figure

词云保存

将生成的词云图保存为指定路径的图片文件。

# Save the word cloud image to a file.
output_image_path = "gpt4_system2_80%.png"
wordcloud.to_file(output_image_path)

代码整合

最终全部代码

# 导入所需的库
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
import jieba.posseg as pseg
from collections import Counter
from PIL import Image
from matplotlib import colors

# Read the content of the text file.
# Replace this with the path to your own text file.
text_file_path = "gpt4_system2_80%.txt"
# The context manager closes the file handle as soon as the content has been
# read, instead of leaving it open until the object is garbage-collected.
with open(text_file_path, encoding="utf-8") as text_file:
    text = text_file.read()

# Segment the text with jieba, with part-of-speech tagging attached.
words = pseg.cut(text)

# Keep only words of at least two characters whose part-of-speech tag
# starts with "n", "v" or "a" (nouns, verbs, adjectives).
report_words = []
for token, pos_tag in words:
    if len(token) < 2:
        continue
    if pos_tag.startswith(('n', 'v', 'a')):
        report_words.append(token)

# Count the occurrences and keep the 200 most frequent words.
word_counts = Counter(report_words)
result = word_counts.most_common(200)
# Word -> frequency mapping, used later to build the word cloud.
content = {token: count for token, count in result}

# Print the most frequent words and their counts (at most 100 of them).
print("前100个高频词汇及其频率：")
# Slicing instead of indexing positions 0..99 avoids an IndexError when
# fewer than 100 distinct words were counted.
for word, frequency in result[:100]:
    # Word left-aligned in 10 columns, count right-aligned in 5 columns.
    print(f"{word:<10}{frequency:>5}")

# Stop words: start from a copy of the STOPWORDS set imported from wordcloud.
stopwords = {*STOPWORDS}
# More stop words can be added as needed, for example:
# stopwords.update(["的", "感谢", "我代表", "以上", "报告", "表示诚挚感谢", "战略"])

# Font file used to draw the words; replace with the path to a real font file.
font_path = "simhei.ttf"

# Lower and upper bounds for the font size.
min_font_size, max_font_size = 5, 300

# Colour list and the colormap built from it.
color_list = ['#FF274B']
colormap = colors.ListedColormap(color_list)

def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a blue HSL colour string whose lightness depends on font size.

    Hue (240) and saturation (100%) are fixed. The lightness is 100 minus the
    font size expressed as a truncated percentage of the module-level
    ``max_font_size``, so larger fonts come out darker.

    Only ``font_size`` is used; the other parameters are accepted but ignored.
    """
    size_percent = int(100 * font_size / max_font_size)
    lightness = 100 - size_percent
    return f"hsl(240, 100%, {lightness}%)"

# Build the word cloud.
# WordCloud ignores its `stopwords` argument when the cloud is built with
# generate_from_frequencies(), so the stop words are removed from the
# frequency table explicitly before it is handed over.
wordcloud = WordCloud(
    scale=5,  # sharpness of the output image
    font_path=font_path,  # font file path
    # colormap=colormap,  # enable to use the custom colormap instead
    width=1600,  # width of the output image
    height=900,  # height of the output image
    background_color='white',  # background colour
    stopwords=stopwords,  # stop words (only honoured when generating from raw text)
    max_font_size=max_font_size,  # largest font size
    min_font_size=min_font_size,  # smallest font size
    color_func=blue_color_func  # custom colour function
).generate_from_frequencies(
    {word: count for word, count in content.items() if word not in stopwords}
)

# Display the generated word cloud with matplotlib.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes so only the image is shown
plt.show()  # show the word cloud figure

# Save the word cloud image to a file.
output_image_path = "gpt4_system2_80%.png"
wordcloud.to_file(output_image_path)

输出的词云图