词云

最新推荐文章于 2021-03-18 06:38:41 发布

转载最新推荐文章于 2021-03-18 06:38:41 发布 · 147 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：http://www.cnblogs.com/hapyygril/p/9969349.html

文章标签：

#python

本文详细介绍了一种利用Python生成词云的方法：读取Excel文件中的微博评论数据，使用jieba分词并按词性和停用词进行过滤，再进行繁简转换，最终生成一张包含高频词汇的词云图（也可以选用一张图片作为词云的形状蒙版）。涉及的技术包括Pandas数据处理、jieba分词、WordCloud词云生成等。

from scipy.misc import imread  # 这是一个处理图像的函数
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from langconv import *
import jieba.posseg as pog

def tradition2simple(line):
    """Convert Traditional Chinese text to Simplified Chinese.

    Args:
        line: The text to convert.

    Returns:
        The result of running *line* through a ``Converter`` configured
        for the ``'zh-hans'`` target.
    """
    converter = Converter('zh-hans')
    return converter.convert(line)
def stop_words(texts):
    """Segment *texts* with jieba and drop unwanted tokens.

    A token is kept only when its part-of-speech flag is not in the
    discard list below AND the token itself is not listed in
    ``stopwords.txt`` (one stop word per line, UTF-8, read from the
    current working directory).

    Args:
        texts: The raw text to segment.

    Returns:
        The surviving tokens joined by single spaces, the input format
        expected by ``WordCloud.generate``.

    Raises:
        OSError: If ``stopwords.txt`` cannot be opened.
    """
    # Part-of-speech flags whose tokens are discarded.
    stop_property = {
        'b', 'c', 'd', 'e', 'f', 'm', 'o', 'p', 'q', 'r', 't', 'u', 'x',
        'y', 'z', 'uj', 'nrt', 'eng', 'zg', 'ul',
    }
    # A set gives O(1) membership tests; `with` guarantees the file is closed.
    with open('stopwords.txt', encoding='utf-8') as stop_file:
        stop_list = {line.strip() for line in stop_file}

    # The previous condition was `... and stop_words`, which only tested
    # whether the stop-word list was non-empty and never filtered any word.
    kept_words = [
        word
        for word, flag in pog.cut(texts)
        if flag not in stop_property and word not in stop_list
    ]
    return ' '.join(kept_words)  # space-separated on purpose
def word_cloud(text):
    """Render a word cloud from *text*, save it, and display it.

    The image is written to ``weibo_comment.png`` in the current working
    directory and then shown in a matplotlib window.

    Args:
        text: Space-separated words to draw.
    """
    # To shape the cloud with an image, load it here and pass it as `mask`:
    # back_color = imread('te.jpg')
    wc = WordCloud(
        # A font with Chinese glyphs is required, otherwise the words render
        # as garbage; the font file name must not contain Chinese characters.
        font_path="C:/Windows/Fonts/STFANGSO.ttf",
        background_color='white',
        # mask=back_color,  # when a mask is given, width and height are ignored
        random_state=42,  # fixed seed so the layout is reproducible
        # `set.add` returns None and mutates the shared STOPWORDS set, so
        # build a new set that holds the built-in stop words plus the extra one.
        stopwords=STOPWORDS | {'苟利国'},
        max_words=400,  # maximum number of words drawn
        max_font_size=100,
        width=600,
        height=400,
        margin=1,
        collocations=False,  # do not include two-word collocations
        prefer_horizontal=0.8,  # share of words laid out horizontally (library default 0.9)
    )
    wc.generate(text)
    # Save the image.
    wc.to_file('weibo_comment.png')

    # Show the image.
    plt.imshow(wc)
    plt.axis('off')
    plt.show()







if __name__ == '__main__':
    # Raw string: '\E' and '\w' are not valid escape sequences, so the plain
    # literal only worked by accident and triggers a warning on newer Pythons.
    data = pd.read_excel(r'C:\E\weibo.xlsx')
    # Map the Chinese column headers of the spreadsheet to ASCII names.
    data.rename(columns={'粉丝ID': 'fans_id', '粉丝': 'fans_name', '微博账户id': 'weibo_user_id', '微博名': 'weibo_name',
                         '微博id': 'weibo_id', '评论id': 'comment_id', '评论': 'comment'}, inplace=True)

    # Concatenate every comment into one string. `DataFrame.ix` was removed
    # in pandas 1.0, and a single join avoids quadratic `+=` string building.
    text = ''.join(str(comment) for comment in data['comment'])

    text = stop_words(text)  # segment and filter; returns a str
    text = tradition2simple(text)
    word_cloud(text)