1. Packages and Data
(1) wordcloud package: can be installed with pip
(2) jieba package: can be installed with pip (install commands for both are shown right after this list)
(3) Stopword list: download link; unfortunately, CSDN downloads require at least 1 credit, so it is set to 1 credit
(4) Data: Sogou news data (I used the smallest package, just enough to demonstrate the effect)
Also linked, a blog post collecting Chinese corpora: 中文语料库汇总
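For reference, both Python packages install with a plain pip command (no extra flags assumed):

pip install wordcloud
pip install jieba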
2. Implementation Steps
(1) Load the data to be segmented and do some light preprocessing;
(2) Segment the text with jieba (a short jieba demo follows this list);
(3) Remove stopwords;
(4) Count word frequencies;
(5) Draw the word cloud.
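To make step (2) concrete, here is a minimal jieba example; the sample sentence is made up for illustration, and the exact split can vary slightly between jieba versions:

import jieba

print(jieba.lcut("我爱自然语言处理"))   # e.g. ['我', '爱', '自然语言', '处理']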
3. Code
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.figsize']=(10.0,5.0)
#df = pd.read_table("F:/DataSet/SouGou_News_Data/Smart_less/news_sohusite_xml.smarty.dat",encoding='utf-8')
#print(df)
# 1. Extract the news body text
data = open("F:/DataSet/SouGou_News_Data/Smart_less/news_sohusite_xml.smarty.dat", encoding='utf-8')
content = list()
for i in range(1200):                 # total number of lines in the file
    temp = data.readline()            # read one line
    if temp.startswith("<content>"):
        temp = temp[9:]               # strip the leading <content> tag
        temp = temp[:-11]             # strip the trailing </content> and newline
        if len(temp) > 0:
            content.append(temp)
data.close()
# 200 news items in total, 164 of them non-empty
#print(len(content))
# 2. Segment the text with jieba
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)
        for seg in segs:
            if len(seg) > 1 and seg != '\r\n':   # keep tokens longer than one character, skip line breaks
                segment.append(seg)
    except:
        print(line)
        continue
# Segmentation done: 29834 tokens in total
#print(len(segment))
# 3. Remove stopwords
words_df = pd.DataFrame({'segment': segment})
stopwords = pd.read_csv("E:\\A_SoftwareApplication\\A_Anacnoda\\wordcloud\\stopwords\\stopwords.txt",
                        index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
print(len(words_df))
# After removing stopwords, 24142 tokens remain
# 4. Count word frequencies
# groupby().size() replaces the dict-style agg, which newer pandas versions reject
words_stat = words_df.groupby('segment').size().reset_index(name='计数')
words_stat = words_stat.sort_values(by='计数', ascending=False)
#print(words_stat.head())
# 5. Draw the word cloud
wordcloud = WordCloud(font_path="data/simhei.ttf", background_color="white", max_font_size=80)
word_frequence = {x[0]: x[1] for x in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
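# Optionally, keep the image on disk as well: WordCloud.to_file() writes the
# rendered cloud straight to an image file (the file name here is just an example)
wordcloud.to_file("sogou_news_wordcloud.png")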
4. Results