#!/usr/bin/env python
'''
Created on Aug, 2017
@author: menghl
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
# --------------get the basic data--------------------
def get_data(path='/Users/mhl/Desktop/bug.xlsx'):
    """Load the bug spreadsheet and return the columns the pipeline needs.

    path: Excel file to read; defaults to the original hard-coded location,
          so existing callers are unaffected.
    Returns a DataFrame containing exactly the 'BugID' and 'Summary' columns.
    """
    bug_df = pd.read_excel(path)
    # Selecting both columns at once is equivalent to the old
    # per-column .loc + pd.concat dance, without the extra copies.
    return bug_df.loc[:, ['BugID', 'Summary']]
# ------------delete the data in []------------------
def del_data_part1(bug_pre):
    """Strip bracketed tags like '[ThinkSystem]' from every Summary, in place.

    bug_pre: DataFrame with a 'Summary' column; mutated in place, returns None.
    """
    # Vectorized replace: the original looped over range(len(...)) with
    # positional .loc labels, which silently breaks on any non-default index.
    # Non-greedy '.*?' so '[a] text [b]' loses both tags, not the text between.
    bug_pre['Summary'] = bug_pre['Summary'].str.replace(r'\[.*?\]', '', regex=True)
# -------------tokenize the bug_pre-----------------
def tokenize(bug_pre):
    """Lower-case each bug Summary and split it into a list of NLTK tokens.

    Returns a list of token lists, one per row of bug_pre.
    """
    return [nltk.word_tokenize(summary.lower())
            for summary in bug_pre.loc[:, 'Summary']]
# ------------------delete non-important words--------------------
def del_words(bug_list):
    """Filter every token list in place: drop stop words, Lenovo-specific
    noise words, and punctuation tokens. Returns None."""
    for idx, tokens in enumerate(bug_list):
        tokens = del_stopwords(tokens)
        tokens = del_spwords(tokens)
        bug_list[idx] = del_puncwords(tokens)
# ------------delete stopwords------------------------
def del_stopwords(bug_item):
    """Return bug_item with English stop words removed.

    The stop-word list is materialised once as a set: the original called
    stopwords.words('english') for every token, re-reading the corpus and
    doing a linear scan each time (O(n*m) per document).
    """
    stopset = set(stopwords.words('english'))
    return [word for word in bug_item if word not in stopset]
# ------------delete special words for lenovo------------------------
def del_spwords(bug_item):
    """Return bug_item with Lenovo-specific noise words removed.

    Comparison is case-insensitive: tokenize() lower-cases every token, so
    the original title-cased list ('Jan', 'Flex', 'IO') could never match.
    A set is built fresh each call, so nothing mutable leaks between calls
    (the original extended the `month` list itself).
    """
    months = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    spwords = set(months) | {'block', 'flex', 'io'}
    return [word for word in bug_item if word.lower() not in spwords]
# ------------delete punctuation------------------------------
def del_puncwords(bug_item):
    """Return bug_item with punctuation and whitespace-only tokens removed."""
    puncwords = frozenset(['', '\n', '\t', ',', '.', ':', ';', '?', '(', ')',
                           '[', ']', '&', '!', '*', '@', '#', '$', '%'])
    return [word for word in bug_item if word not in puncwords]
# ---------calculate the tf-idf value of this model--------------
def cal_tfidf(bug_list):
    """Build the gensim artefacts for similarity search.

    bug_list: list of token lists, one per bug.
    Returns (dictionary, tfidf_model, tfidf_corpus): the token dictionary,
    the fitted TF-IDF model, and the corpus re-weighted by that model.
    """
    bug_dict = corpora.Dictionary(bug_list)
    bow_corpus = [bug_dict.doc2bow(tokens) for tokens in bug_list]
    tfidf = models.TfidfModel(bow_corpus)
    return bug_dict, tfidf, tfidf[bow_corpus]
# ---------calculate the similarities of the new input-----------
def cal_sim(test_query, tfidf, corpus_tfidf):
    """Score every document against test_query and return the best five.

    test_query: bag-of-words vector for the query document.
    Returns a list of (document_index, similarity_score) pairs, highest
    score first, truncated to five entries.
    """
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[tfidf[test_query]]
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
    return ranked[:5]
# -------------display data---------------------
def display(top5_sims, bug_pre):
    """Print the (index, score) pairs, then each matching bug row."""
    print(top5_sims)
    for doc_index, _score in top5_sims:
        print(bug_pre.loc[doc_index], '\n')
# -------------test one-------------------------
def test_bug_list(bug_list):
return bug_list[1]
# -------------test two-------------------------
def test_str_input():
    """Fixed single-word test query."""
    query = 'expander'
    return query
# -------------test three-------------------------
def test3(bug_pre, bug_list):
    """Prompt for a bug ID and return that bug's token list.

    bug_pre:  DataFrame with a 'BugID' column (parallel to bug_list).
    bug_list: tokenized summaries, positionally aligned with bug_pre rows.
    Raises ValueError if the entered ID is not present at all.
    """
    bug_num = input('Please enter the bug number you check:')
    print(bug_num)
    bug_id_list = list(bug_pre['BugID'])
    try:
        index = bug_id_list.index(bug_num)
    except ValueError:
        # input() yields a str, but BugID values read from Excel are
        # typically ints (the old commented-out test used 41414), so the
        # raw-string lookup failed for every numeric ID. Retry as int.
        index = bug_id_list.index(int(bug_num))
    bug_test = bug_list[index]
    print('bug_test', bug_test)
    return bug_test
# -------------Main--------------------------------
def main():
    """End-to-end pipeline: load bugs, clean, tokenize, rank by TF-IDF
    similarity against a user-chosen bug, and print the top five matches."""
    bug_pre = get_data()
    del_data_part1(bug_pre)
    bug_list = tokenize(bug_pre)
    del_words(bug_list)
    bug_dict, tfidf, corpus_tfidf = cal_tfidf(bug_list)
    query_tokens = test3(bug_pre, bug_list)
    test_query = bug_dict.doc2bow(query_tokens)
    top5_sims = cal_sim(test_query, tfidf, corpus_tfidf)
    display(top5_sims, bug_pre)
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
'''
Created on Aug, 2017
@author: menghl
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
# --------------get the basic data--------------------
def get_data(path='/Users/mhl/Desktop/bug.xlsx'):
    """Load the bug spreadsheet and return the columns the pipeline needs.

    path: Excel file to read; defaults to the original hard-coded location,
          so existing callers are unaffected.
    Returns a DataFrame containing exactly the 'BugID' and 'Summary' columns.
    """
    bug_df = pd.read_excel(path)
    # Selecting both columns at once is equivalent to the old
    # per-column .loc + pd.concat dance, without the extra copies.
    return bug_df.loc[:, ['BugID', 'Summary']]
# ------------delete the data in []------------------
def del_data_part1(bug_pre):
    """Strip bracketed tags like '[ThinkSystem]' from every Summary, in place.

    bug_pre: DataFrame with a 'Summary' column; mutated in place, returns None.
    """
    # Vectorized replace: the original looped over range(len(...)) with
    # positional .loc labels, which silently breaks on any non-default index.
    # Non-greedy '.*?' so '[a] text [b]' loses both tags, not the text between.
    bug_pre['Summary'] = bug_pre['Summary'].str.replace(r'\[.*?\]', '', regex=True)
# -------------tokenize the bug_pre-----------------
def tokenize(bug_pre):
    """Lower-case each bug Summary and split it into a list of NLTK tokens.

    Returns a list of token lists, one per row of bug_pre.
    """
    return [nltk.word_tokenize(summary.lower())
            for summary in bug_pre.loc[:, 'Summary']]
# ------------------delete non-important words--------------------
def del_words(bug_list):
    """Filter every token list in place: drop stop words, Lenovo-specific
    noise words, and punctuation tokens. Returns None."""
    for idx, tokens in enumerate(bug_list):
        tokens = del_stopwords(tokens)
        tokens = del_spwords(tokens)
        bug_list[idx] = del_puncwords(tokens)
# ------------delete stopwords------------------------
def del_stopwords(bug_item):
    """Return bug_item with English stop words removed.

    The stop-word list is materialised once as a set: the original called
    stopwords.words('english') for every token, re-reading the corpus and
    doing a linear scan each time (O(n*m) per document).
    """
    stopset = set(stopwords.words('english'))
    return [word for word in bug_item if word not in stopset]
# ------------delete special words for lenovo------------------------
def del_spwords(bug_item):
    """Return bug_item with Lenovo-specific noise words removed.

    Comparison is case-insensitive: tokenize() lower-cases every token, so
    the original title-cased list ('Jan', 'Flex', 'IO') could never match.
    A set is built fresh each call, so nothing mutable leaks between calls
    (the original extended the `month` list itself).
    """
    months = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    spwords = set(months) | {'block', 'flex', 'io'}
    return [word for word in bug_item if word.lower() not in spwords]
# ------------delete punctuation------------------------------
def del_puncwords(bug_item):
    """Return bug_item with punctuation and whitespace-only tokens removed."""
    puncwords = frozenset(['', '\n', '\t', ',', '.', ':', ';', '?', '(', ')',
                           '[', ']', '&', '!', '*', '@', '#', '$', '%'])
    return [word for word in bug_item if word not in puncwords]
# ---------calculate the tf-idf value of this model--------------
def cal_tfidf(bug_list):
    """Build the gensim artefacts for similarity search.

    bug_list: list of token lists, one per bug.
    Returns (dictionary, tfidf_model, tfidf_corpus): the token dictionary,
    the fitted TF-IDF model, and the corpus re-weighted by that model.
    """
    bug_dict = corpora.Dictionary(bug_list)
    bow_corpus = [bug_dict.doc2bow(tokens) for tokens in bug_list]
    tfidf = models.TfidfModel(bow_corpus)
    return bug_dict, tfidf, tfidf[bow_corpus]
# ---------calculate the similarities of the new input-----------
def cal_sim(test_query, tfidf, corpus_tfidf):
    """Score every document against test_query and return the best five.

    test_query: bag-of-words vector for the query document.
    Returns a list of (document_index, similarity_score) pairs, highest
    score first, truncated to five entries.
    """
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[tfidf[test_query]]
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
    return ranked[:5]
# -------------display data---------------------
def display(top5_sims, bug_pre):
    """Print the (index, score) pairs, then each matching bug row."""
    print(top5_sims)
    for doc_index, _score in top5_sims:
        print(bug_pre.loc[doc_index], '\n')
# -------------test one-------------------------
def test_bug_list(bug_list):
return bug_list[1]
# -------------test two-------------------------
def test_str_input():
    """Fixed single-word test query."""
    query = 'expander'
    return query
# -------------test three-------------------------
def test3(bug_pre, bug_list):
    """Prompt for a bug ID and return that bug's token list.

    bug_pre:  DataFrame with a 'BugID' column (parallel to bug_list).
    bug_list: tokenized summaries, positionally aligned with bug_pre rows.
    Raises ValueError if the entered ID is not present at all.
    """
    bug_num = input('Please enter the bug number you check:')
    print(bug_num)
    bug_id_list = list(bug_pre['BugID'])
    try:
        index = bug_id_list.index(bug_num)
    except ValueError:
        # input() yields a str, but BugID values read from Excel are
        # typically ints (the old commented-out test used 41414), so the
        # raw-string lookup failed for every numeric ID. Retry as int.
        index = bug_id_list.index(int(bug_num))
    bug_test = bug_list[index]
    print('bug_test', bug_test)
    return bug_test
# -------------Main--------------------------------
def main():
    """End-to-end pipeline: load bugs, clean, tokenize, rank by TF-IDF
    similarity against a user-chosen bug, and print the top five matches."""
    bug_pre = get_data()
    del_data_part1(bug_pre)
    bug_list = tokenize(bug_pre)
    del_words(bug_list)
    bug_dict, tfidf, corpus_tfidf = cal_tfidf(bug_list)
    query_tokens = test3(bug_pre, bug_list)
    test_query = bug_dict.doc2bow(query_tokens)
    top5_sims = cal_sim(test_query, tfidf, corpus_tfidf)
    display(top5_sims, bug_pre)
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
# Article abstract (translated from Chinese; trailing blog-page widget text
# removed): This post describes a Python text-similarity pipeline — data
# pre-processing, tokenization, stop-word removal, TF-IDF weighting and
# similarity scoring — and demonstrates, with a real example, how to use it
# to find the five texts most similar to a given input.