Wordcloud-try

本文介绍了一种使用 Python 进行文本预处理的方法，并通过词频统计生成词云图，展示了如何利用 nltk 库进行分词、去除停用词及特殊字符等步骤，最后采用 WordCloud 库将处理后的文本可视化。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

from wordcloud import WordCloud, ImageColorGenerator
import jieba
import PIL
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

import re
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords

import collections
In [2]:
# --------------get the basic data--------------------
def get_data(path='/Users/mhl/Desktop/bug.xlsx'):
    """Load the bug spreadsheet and keep only the id and summary columns.

    Args:
        path: Location of the Excel workbook to read. Defaults to the
            previously hard-coded desktop path, so callers that pass no
            argument behave exactly as before.

    Returns:
        A new DataFrame holding the ``BugID`` and ``Summary`` columns of
        the workbook, in that order.

    Raises:
        KeyError: If the workbook has no ``BugID`` or ``Summary`` column.
    """
    bug_df = pd.read_excel(path)
    bug_id = bug_df.loc[:, 'BugID']
    bug_sum = bug_df.loc[:, 'Summary']
    # concat returns a new DataFrame object rather than a slice of bug_df.
    return pd.concat([bug_id, bug_sum], axis=1)


# ------------delete the data in []------------------
def del_data_part1(bug_pre):
    """Strip every ``[...]`` tag from the ``Summary`` column, in place.

    Each shortest run of text enclosed in square brackets (brackets
    included) is removed from every string summary. Values that are not
    strings, such as NaN for an empty cell, are left untouched.

    Args:
        bug_pre: DataFrame with a ``Summary`` column. It is modified in
            place; any row index is supported, not only 0..n-1.

    Returns:
        None.
    """
    bug_re = re.compile(r'\[.*?\]')
    # Map over the column instead of addressing rows by position through
    # .loc, which only works when the index happens to be 0..n-1.
    bug_pre['Summary'] = bug_pre['Summary'].map(
        lambda text: bug_re.sub('', text) if isinstance(text, str) else text
    )


# -------------tokenize the bug_pre-----------------
def tokenize(bug_pre):
    """Lower-case every summary and split it into word tokens.

    Args:
        bug_pre: DataFrame with a ``Summary`` column of strings.

    Returns:
        A list with one entry per row, each entry being the result of
        ``nltk.word_tokenize`` applied to the lower-cased summary.
    """
    summaries = bug_pre.loc[:, 'Summary']
    return [nltk.word_tokenize(text.lower()) for text in summaries]


# ------------------delete non-important words--------------------
def del_words(bug_list):
    """Filter every token list in ``bug_list`` in place.

    Each entry is passed through the stopword filter, then the special-word
    filter, then the punctuation filter, and the result is stored back at
    the same position.

    Args:
        bug_list: List of token lists; modified in place.

    Returns:
        None.
    """
    for position, tokens in enumerate(bug_list):
        without_stopwords = del_stopwords(tokens)
        without_special = del_spwords(without_stopwords)
        bug_list[position] = del_puncwords(without_special)


# ------------delete stopwords------------------------
def del_stopwords(bug_item):
    """Return the tokens of ``bug_item`` that are not English stopwords.

    Args:
        bug_item: Iterable of string tokens.

    Returns:
        A new list with every token found in ``stopwords.words('english')``
        removed; the order of the remaining tokens is preserved.
    """
    if not bug_item:
        # Nothing to filter, so do not touch the stopword corpus at all.
        return []
    # Fetch the stopword list once per call. The previous version called
    # stopwords.words('english') again for every single token.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in bug_item if word not in english_stopwords]


# ------------delete special words for lenovo------------------------
def del_spwords(bug_item):
    """Return the tokens of ``bug_item`` that are not project-specific noise.

    The noise list is the twelve three-letter month abbreviations plus
    'block', 'Flex' and 'IO'. Matching ignores case: tokenize() lower-cases
    every summary before this filter runs, so an exact-case comparison
    against 'Jan', 'Flex' or 'IO' would never match anything.

    Args:
        bug_item: Iterable of string tokens.

    Returns:
        A new list without the noise words; order is preserved.
    """
    month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    other_words = ['block', 'Flex', 'IO']
    # Normalise the reference list once so each token needs one lookup.
    spwords = {word.lower() for word in month + other_words}
    return [word for word in bug_item if word.lower() not in spwords]


# ------------delete punctuation------------------------------
def del_puncwords(bug_item):
    """Return the tokens of ``bug_item`` that are not bare punctuation.

    A token is dropped only when it is exactly one of the listed entries
    (the empty string, newline, tab, or a single punctuation character);
    tokens that merely contain such a character are kept.

    Args:
        bug_item: Iterable of tokens.

    Returns:
        A new list of the surviving tokens, in their original order.
    """
    puncwords = ('', '\n', '\t', ',', '.', ':', ';', '?', '(', ')',
                 '[', ']', '&', '!', '*', '@', '#', '$', '%')
    kept = []
    for token in bug_item:
        if token in puncwords:
            continue
        kept.append(token)
    return kept
In [39]:
def wordcloudplot(txt):
    """Draw a word cloud from word frequencies, masked by the yellow image.

    The cloud is built with ``WordCloud.fit_words``, written to
    ``/Users/mhl/Desktop/yellow_word.jpg``, then recoloured from the mask
    image and shown with matplotlib.

    Args:
        txt: Word frequencies passed straight to ``WordCloud.fit_words``
            (``main`` supplies a ``collections.Counter`` of tokens).

    Returns:
        None.
    """
    # The font path used to be run through the Python 2 ``unicode`` builtin,
    # which raises NameError on Python 3. The path is plain ASCII, so the
    # utf8 -> gb18030 round trip left it unchanged and is dropped here.
    path = '/Users/mhl/Desktop/MSYH.TTF'
    yellow_mask = np.array(PIL.Image.open('/Users/mhl/Desktop/yellow.jpg'))
    wordcloud = WordCloud(font_path=path, background_color='white',
                          mask=yellow_mask, random_state=42)
    wordcloud = wordcloud.fit_words(txt)
    image_colors = ImageColorGenerator(yellow_mask)
    # NOTE(review): the file is written before recolor() is called, so the
    # saved image presumably keeps the default colours - confirm intended.
    wordcloud.to_file('/Users/mhl/Desktop/yellow_word.jpg')
    plt.imshow(wordcloud.recolor(color_func=image_colors))
    plt.axis("off")
    plt.show()
In [40]:
def draw(word_list):
    """Plot a word cloud for the given word frequencies.

    Thin wrapper that forwards its argument unchanged to ``wordcloudplot``.

    Args:
        word_list: Word frequencies to plot (``main`` passes a
            ``collections.Counter``).

    Returns:
        None.
    """
    wordcloudplot(word_list)
In [41]:
def main():
    """Run the whole pipeline: load, clean, tokenize, filter, count, plot."""
    bug_pre = get_data()
    del_data_part1(bug_pre)

    token_lists = tokenize(bug_pre)
    del_words(token_lists)

    # Flatten the per-bug token lists into one stream of words.
    all_words = [word for tokens in token_lists for word in tokens]
    draw(collections.Counter(all_words))
    
    
# Run the pipeline only when this file is executed as a script, not on import.
if __name__=='__main__':
    main()
<img src="" style="box-sizing: border-box; border: 0px; vertical-align: middle; max-width: 100%; height: auto;" alt="">

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值