from scipy.misc import imread # 这是一个处理图像的函数
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from langconv import *
import jieba.posseg as pog
def tradition2simple(line):
    """Convert Traditional Chinese text to Simplified Chinese.

    Args:
        line: Text to convert.

    Returns:
        The value returned by ``Converter('zh-hans').convert(line)``.
    """
    return Converter('zh-hans').convert(line)
def stop_words(texts, stopwords_path='stopwords.txt'):
    """Segment *texts* with jieba and filter out unwanted tokens.

    A token is dropped when its part-of-speech flag is in the excluded set
    or when the token itself is listed in the stop-word file.

    Args:
        texts: Raw text to segment.
        stopwords_path: Path to a UTF-8 text file with one stop word per
            line. Defaults to ``'stopwords.txt'`` in the working directory.

    Returns:
        The kept tokens joined by single spaces.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # Part-of-speech flags whose tokens are never kept.
    excluded_flags = {
        'b', 'c', 'd', 'e', 'f', 'm', 'o', 'p', 'q', 'r', 't', 'u', 'x',
        'y', 'z', 'uj', 'nrt', 'eng', 'zg', 'ul',
    }
    # Load the stop words into a set for O(1) membership tests; the context
    # manager guarantees the file handle is closed.
    with open(stopwords_path, encoding='utf-8') as handle:
        stop_word_set = {row.strip() for row in handle}

    # The word itself must be checked against the stop-word set; testing the
    # set's truthiness would silently disable stop-word filtering.
    kept = [
        word
        for word, flag in pog.cut(texts)
        if flag not in excluded_flags and word not in stop_word_set
    ]
    # Tokens are separated by a single space, not concatenated.
    return ' '.join(kept)
def word_cloud(text, font_path="C:/Windows/Fonts/STFANGSO.ttf",
               output_path='weibo_comment.png'):
    """Render a word cloud from *text*, save it to disk and display it.

    Args:
        text: Text to visualise (here: space-separated tokens).
        font_path: Font file used for rendering. Original author's note: a
            font must be specified or the (Chinese) text renders garbled,
            and the font file name must not contain Chinese characters.
        output_path: Image file the rendered cloud is written to.
    """
    # Build a private stop-word set: the built-in list plus one custom word.
    # ``set.add`` returns None and mutates the shared STOPWORDS in place, so
    # its result must not be passed as the ``stopwords`` argument.
    blocked_words = set(STOPWORDS) | {'苟利国'}

    # To shape the cloud with an image, additionally pass ``mask=<image
    # array>``; when a mask is given, width and height are ignored.
    wc = WordCloud(
        font_path=font_path,
        background_color='white',
        random_state=42,  # fixed seed so the layout is reproducible
        stopwords=blocked_words,
        max_words=400,  # maximum number of words drawn
        max_font_size=100,
        width=600,
        height=400,
        margin=1,
        collocations=False,  # do not include two-word collocations
        prefer_horizontal=0.8,  # ratio of horizontal placement attempts (library default 0.9)
    )
    wc.generate(text)

    # Save the image.
    wc.to_file(output_path)

    # Show the image without axes.
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
if __name__ == '__main__':
    # Raw string: '\E' and '\w' are not valid escape sequences, so the plain
    # literal only worked by accident and triggers a warning on newer Pythons.
    data = pd.read_excel(r'C:\E\weibo.xlsx')
    # Map the Chinese spreadsheet headers to ASCII column names.
    data.rename(columns={'粉丝ID': 'fans_id', '粉丝': 'fans_name', '微博账户id': 'weibo_user_id', '微博名': 'weibo_name',
                         '微博id': 'weibo_id', '评论id': 'comment_id', '评论': 'comment'}, inplace=True)
    # Concatenate every comment into one string. Plain column access replaces
    # ``DataFrame.ix`` (removed in pandas 1.0), and ``str.join`` avoids
    # repeated string concatenation in a loop.
    text = ''.join(map(str, data['comment']))
    text = stop_words(text)  # segmented, filtered, space-separated str
    text = tradition2simple(text)
    word_cloud(text)
# Reposted from: https://www.cnblogs.com/hapyygril/p/9969349.html