import re
import jieba.posseg as posseg
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Path to the stop-word list
stopwords_path = './stopwords.txt'
# Part-of-speech tags to exclude (first letter of the jieba POS flag, lower-cased)
stopPOS = []
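# Illustrative values only (not part of the original configuration): jieba uses
# ICTCLAS-style tags, so entries such as 'x' (punctuation/non-word), 'u' (particle)
# or 'p' (preposition) could be listed here, e.g.
#   stopPOS = ['x', 'u', 'p']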
# Load the stop words
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]
def segment_text_to_sentence(text):
    # Split the text into sentences on Chinese/ASCII punctuation
    sentences = re.split(r'[,,。!?!?]', text)
    sentences = [sentence.strip().replace(" ", "").replace('\n', '') for sentence in sentences if sentence.strip()]
    return sentences
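# For example, segment_text_to_sentence('今天天气很好。我们出去玩!')
# returns ['今天天气很好', '我们出去玩'].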
def segment_text_to_words(text, use_stopwords):
    # Segment the text into words, optionally dropping stop words and excluded POS tags
    global stopPOS, stopwords
    stopPOS = [item.lower() for item in stopPOS]
    words = posseg.cut(text)
    if use_stopwords:
        words = [word for word, flag in words if flag[0].lower() not in stopPOS and word not in stopwords]
    else:
        words = [word for word, flag in words if flag[0].lower() not in stopPOS]
    words = set(words)
    return words
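# posseg.cut yields (word, POS-flag) pairs, e.g. roughly
#   posseg.cut('今天天气很好')  ->  ('今天', 't'), ('天气', 'n'), ('很', 'd'), ('好', 'a')
# (illustrative output; the exact segmentation depends on the jieba dictionary),
# which is why the comprehensions above can filter on the first letter of each flag.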
def original_similarity_matrix(sentences, use_stopwords):
    # Compute the pairwise sentence-similarity matrix used by the default method
    sentence_words = [set(segment_text_to_words(item, use_stopwords)) for item in sentences]
    print(sentence_words)
    size = len(sentences)
    similarity_matrix = np.zeros((size, size))
    for i in range(size):
        for j in range(i + 1, size):
            if len(sentence_words[i]) == 0 or len(sentence_words[j]) == 0:
                similarity = 0
            else:
                # Similarity from the original TextRank paper: |overlap| / (log|Si| + log|Sj|);
                # the logs damp the advantage of very long sentences
                similarity = len(sentence_words[i] & sentence_words[j]) / (
                        np.log(len(sentence_words[i])) + np.log(len(sentence_words[j])) + 1e-10)
            similarity_matrix[i][j] = similarity_matrix[j][i] = similarity
    return similarity_matrix
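# A worked example of the formula above (made-up word sets, not taken from the code):
# if sentence_words[i] = {'天气', '很', '好'} and sentence_words[j] = {'天气', '不错'},
# the overlap has size 1, so similarity = 1 / (log(3) + log(2)) ≈ 1 / 1.79 ≈ 0.56.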
def cosine_tfidf_similarity_matrix(sentences, use_stopwords):
    # Compute the cosine-similarity matrix over TF-IDF vectors of the segmented sentences
    sentence_words = [' '.join(segment_text_to_words(item, use_stopwords)) for item in sentences]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentence_words)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # Zero the diagonal so a sentence's similarity to itself does not distort the ranking
    np.fill_diagonal(similarity_matrix, 0)
    return similarity_matrix
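# A minimal sanity check (hypothetical sentences; not executed by the tool):
#   m = cosine_tfidf_similarity_matrix(['今天天气很好', '今天天气不错', '我在写代码'], True)
#   m is a symmetric 3x3 matrix with a zero diagonal, and m[0][1] should exceed m[0][2].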
def summarize_text_rank(text, d=0.85, iter_num=200, top=3, method='默认方式', use_stopwords=True):
    sentences = segment_text_to_sentence(text)  # split the text into sentences
    print('---------开始----------------------------------------')
    # Build the similarity (edge-weight) matrix
    if method == 'TF-IDF':
        edge_weight = cosine_tfidf_similarity_matrix(sentences, use_stopwords)
    else:
        # '默认方式' (the default) and any unrecognised value fall back to the overlap similarity
        edge_weight = original_similarity_matrix(sentences, use_stopwords)
    # Initialise every node (sentence) weight to 1
    node_weight = np.ones((len(sentences)))
    # Run at most iter_num iterations of the TextRank update
    converged = False
    for num in range(iter_num):
        # TextRank update: WS(i) = (1 - d) + d * sum_j WS(j) * w_ji / sum_k w_jk
        # d is the damping factor and edge_weight the sentence-similarity matrix.
        # edge_weight / (edge_weight.sum(axis=-1) + 1e-10) normalises the (symmetric) matrix
        # by each sentence's total similarity to all others; 1e-10 avoids division by zero.
        # A large d means the random-jump probability (1 - d) is small, so the scores depend
        # almost entirely on the graph structure; a small d pulls all scores towards the
        # uniform (1 - d) term, so the graph contributes less to the final ranking.
        node_weight_new = (1 - d) + d * node_weight @ (edge_weight / (edge_weight.sum(axis=-1) + 1e-10)).T
        if ((node_weight_new - node_weight) ** 2).sum() < 1e-10:
            converged = True
            break
        node_weight = node_weight_new
    if converged:
        print('迭代{}次,收敛'.format(num))
    else:
        print('迭代{}次,未收敛'.format(num))
    sorted_indices = np.argsort(node_weight)[::-1]
    # Take the indices of the top-scoring sentences, then restore their original order
    top_indices = sorted(sorted_indices[:top])
    top_values = node_weight[top_indices]
    print('最大的{}个值:'.format(top), top_values)
    print('对应的索引:', top_indices)
    print('结果:')
    result = ''
    for idx in top_indices:
        result += sentences[idx] + '。\n'
    print(result)
    return result
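# Example of calling the summariser directly, without the GUI below (the sample text is
# made up; any Chinese passage works):
#   demo_text = '今天天气很好。我们决定去公园散步。公园里的花都开了。晚上我们回家吃饭。'
#   print(summarize_text_rank(demo_text, d=0.85, iter_num=200, top=2, method='TF-IDF'))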
import tkinter as tk
from tkinter import ttk, scrolledtext
def summarize_text():
    input_text = input_text_widget.get("1.0", "end-1c")
    d = float(d_entry.get()) if d_entry.get() else 0.85
    top = int(top_entry.get()) if top_entry.get() else 3
    processing_method = processing_method_var.get()
    use_stopwords = use_stopwords_var.get()
    summary = summarize_text_rank(input_text, d=d, top=top, method=processing_method, use_stopwords=use_stopwords)
    output_text_widget.delete(1.0, tk.END)  # clear whatever summary is currently displayed
    output_text_widget.insert(tk.END, summary)  # show the newly generated summary
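# A more defensive variant of the parameter parsing above (a sketch, not part of the
# original tool): fall back to the defaults when an entry contains a non-numeric value.
#   try:
#       d = float(d_entry.get())
#   except ValueError:
#       d = 0.85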
# Create the main window
root = tk.Tk()
root.title("中文文本自动摘要工具")
# Adjust widget styling via ttk
style = ttk.Style()
style.configure('TFrame', padding=10)
style.configure('TButton', padding=(10, 5), font=('Helvetica', 10))
style.configure('TLabel', font=('Helvetica', 10))
# Create the input text box
input_label_frame = ttk.LabelFrame(root, text="输入文本")
input_label_frame.grid(row=0, column=0, padx=10, pady=10, sticky="nsew", columnspan=2)  # span both columns
input_text_widget = scrolledtext.ScrolledText(input_label_frame, wrap=tk.WORD, width=70, height=10)
input_text_widget.pack(pady=10, fill='both', expand=True)
# Create the frame holding the TextRank parameter controls
frame1 = ttk.LabelFrame(root, text="TextRank参数设置")
frame1.grid(row=1, column=0, padx=10, pady=10, sticky="nsew", columnspan=2)  # span both columns
# Create the stop-word checkbox
use_stopwords_var = tk.BooleanVar(root)
use_stopwords_var.set(True)  # use stop words by default
use_stopwords_checkbutton = ttk.Checkbutton(frame1, text="使用停用词", variable=use_stopwords_var)
use_stopwords_checkbutton.grid(row=0, column=0, pady=5)
default_d = 0.85
d_label = ttk.Label(frame1, text=f"阻尼系数:")
d_label.grid(row=1, column=0, padx=5, pady=5, sticky="w")
d_entry = ttk.Entry(frame1, width=10)
d_entry.insert(0, str(default_d))
d_entry.grid(row=1, column=1, padx=2, pady=5)
default_top = 3
top_label = ttk.Label(frame1, text=f"摘要句数:")
top_label.grid(row=2, column=0, padx=5, pady=5, sticky="w")
top_entry = ttk.Entry(frame1, width=10)
top_entry.insert(0, str(default_top))
top_entry.grid(row=2, column=1, padx=2, pady=5)
processing_method_var = tk.StringVar(root)
processing_method_var.set("默认方式")  # default similarity option
processing_method_label = ttk.Label(frame1, text="相似度度量:")
processing_method_label.grid(row=3, column=0, padx=5, pady=5, sticky="w")
processing_method_menu = ttk.Combobox(frame1, textvariable=processing_method_var, values=["默认方式", "TF-IDF"],
width=10)
processing_method_menu.grid(row=3, column=1, padx=2, pady=5)
# Create the button that triggers summarisation
summarize_button = ttk.Button(root, text="TextRank生成摘要", command=summarize_text, style='TButton')
summarize_button.grid(row=2, column=0, padx=(10, 5), pady=10)  # horizontal and vertical padding
# Create the output text box
output_label_frame = ttk.LabelFrame(root, text="TextRank输出文本")
output_label_frame.grid(row=3, column=0, padx=10, pady=10, sticky="nsew", columnspan=2)  # span both columns
output_text_widget = scrolledtext.ScrolledText(output_label_frame, wrap=tk.WORD, width=50, height=10)
output_text_widget.pack(pady=10, fill='both', expand=True)
# Give rows and columns weight so the text boxes and frames expand with the window
for i in range(4):  # every row gets weight 1
    root.grid_rowconfigure(i, weight=1)
root.grid_columnconfigure(0, weight=1)
root.grid_columnconfigure(1, weight=1)
# Run the main event loop
root.mainloop()