from wordcloud import WordCloud, ImageColorGenerator
import jieba
import PIL.Image  # 'import PIL' alone does not guarantee PIL.Image is importable
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import re
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
import collections
In [2]:
# --------------get the basic data--------------------
def get_data():
    # read the bug sheet and keep only the ID and Summary columns
    bug_df = pd.read_excel('/Users/mhl/Desktop/bug.xlsx')
    bug_id = bug_df.loc[:, 'BugID']
    bug_sum = bug_df.loc[:, 'Summary']
    bug_pre = pd.concat([bug_id, bug_sum], axis=1)
    return bug_pre
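# note: pd.read_excel delegates .xlsx parsing to an engine (openpyxl with
# current pandas), so that package must be installed; the two-column
# selection above could equivalently be written as
#   bug_pre = bug_df[['BugID', 'Summary']]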
# ------------delete the text inside [] brackets------------------
def del_data_part1(bug_pre):
    # strip bracketed tags such as component or build markers from each summary
    bug_re = re.compile(r'\[.*?\]')
    for bug_item in range(len(bug_pre)):
        bug_pre.loc[bug_item, 'Summary'] = bug_re.sub('', bug_pre.loc[bug_item, 'Summary'])
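# the non-greedy '\[.*?\]' matters here: a greedy '\[.*\]' would match from
# the first '[' to the last ']' and wipe out the whole summary. A quick
# check on a made-up summary string:
print(re.sub(r'\[.*?\]', '', '[WiFi][Driver] fails after [sleep]'))  # ' fails after '
print(re.sub(r'\[.*\]', '', '[WiFi][Driver] fails after [sleep]'))   # ''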
# -------------tokenize the bug_pre-----------------
def tokenize(bug_pre):
    # lowercase each summary, then split it into word tokens
    bug_list = list(bug_pre.loc[:, 'Summary'])
    for loop in range(len(bug_list)):
        bug_list[loop] = bug_list[loop].lower()
        bug_list[loop] = nltk.word_tokenize(bug_list[loop])
    return bug_list
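# nltk.word_tokenize relies on the punkt tokenizer models, which ship
# separately from nltk itself; a one-time download is needed on a fresh install:
nltk.download('punkt', quiet=True)
print(nltk.word_tokenize('bluetooth fails to pair after reboot.'))
# -> ['bluetooth', 'fails', 'to', 'pair', 'after', 'reboot', '.']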
# ------------------delete non-important words--------------------
def del_words(bug_list):
    # run each tokenized summary through the three filters below
    for loop in range(len(bug_list)):
        bug_list[loop] = del_stopwords(bug_list[loop])
        bug_list[loop] = del_spwords(bug_list[loop])
        bug_list[loop] = del_puncwords(bug_list[loop])
# ------------delete stopwords------------------------
def del_stopwords(bug_item):
    # build the stopword set once instead of re-reading it for every token
    stop_set = set(stopwords.words('english'))
    bug_item = [word for word in bug_item if word not in stop_set]
    return bug_item
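# like punkt, the stopword list is a separate nltk corpus and needs a
# one-time download:
nltk.download('stopwords', quiet=True)
print(stopwords.words('english')[:5])  # e.g. ['i', 'me', 'my', 'myself', 'we']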
# ------------delete special words for lenovo------------------------
def del_spwords(bug_item):
    # the tokens were lowercased in tokenize(), so these filter words must be
    # lowercase as well or they would never match
    month = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    other_words = ['block', 'flex', 'io']
    spwords = month + other_words
    bug_item = [word for word in bug_item if word not in spwords]
    return bug_item
# ------------delete punctuation------------------------------
def del_puncwords(bug_item):
    puncwords = ['', '\n', '\t', ',', '.', ':', ';', '?', '(', ')',
                 '[', ']', '&', '!', '*', '@', '#', '$', '%']
    bug_item = [word for word in bug_item if word not in puncwords]
    return bug_item
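A hand-maintained punctuation list like the one above is easy to let drift out of sync with the data. A variant built from string.punctuation covers every ASCII punctuation character; this is a sketch of an alternative, not part of the original pipeline:

import string

def del_puncwords_alt(bug_item):
    # drop empty/whitespace tokens and every single ASCII punctuation character
    puncset = set(string.punctuation) | {'', '\n', '\t'}
    return [word for word in bug_item if word not in puncset]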
In [39]:
def wordcloudplot(word_freq):
    # font used to render the cloud; in Python 3 the path is passed as a plain
    # str (the old unicode()/encode('gb18030') dance was a Python 2 workaround)
    path = '/Users/mhl/Desktop/MSYH.TTF'
    yellow_mask = np.array(PIL.Image.open('/Users/mhl/Desktop/yellow.jpg'))
    wordcloud = WordCloud(font_path=path, background_color='white', mask=yellow_mask, random_state=42)
    # fit_words() expects a word -> frequency mapping (the Counter built in main())
    wordcloud = wordcloud.fit_words(word_freq)
    # recolor the cloud from the mask image and save it to disk
    image_colors = ImageColorGenerator(yellow_mask)
    wordcloud.to_file('/Users/mhl/Desktop/yellow_word.jpg')
    plt.imshow(wordcloud.recolor(color_func=image_colors))
    plt.axis("off")
    plt.show()
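A note on fit_words() versus generate(): generate() takes one raw string and tokenizes it itself, while fit_words() takes a word-to-frequency mapping, which is why main() below builds a collections.Counter first. A minimal standalone check with made-up frequencies, no mask or custom font:

freq = {'bluetooth': 30, 'crash': 22, 'wifi': 15}  # hypothetical counts
wc = WordCloud(background_color='white').fit_words(freq)
plt.imshow(wc)
plt.axis('off')
plt.show()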
In [40]:
def draw(word_freq):
    # word_freq is the word -> count mapping built in main(); an earlier
    # variant of this function read a Chinese text file and segmented it with
    # jieba.cut() before plotting, which is why jieba is imported above
    wordcloudplot(word_freq)
In [41]:
def main():
    bug_pre = get_data()
    del_data_part1(bug_pre)
    bug_list = tokenize(bug_pre)
    del_words(bug_list)
    # flatten the per-bug token lists into one list of words
    bug_word = []
    for bug in bug_list:
        bug_word.extend(bug)
    # count word frequencies and feed them to the word cloud
    word_count = collections.Counter(bug_word)
    draw(word_count)

if __name__ == '__main__':
    main()
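Before drawing, it can be worth peeking at the top of the Counter with most_common(); a tiny self-contained example of the call:

# most_common(n) returns the n highest-frequency (word, count) pairs
print(collections.Counter(['crash', 'wifi', 'crash']).most_common(2))
# -> [('crash', 2), ('wifi', 1)]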
This post walks through a text-preprocessing workflow in Python: it uses the nltk library for tokenization and for removing stopwords and special characters, counts word frequencies, and finally visualizes the processed text as a word cloud with the WordCloud library.