import os
import re
import tkinter as tk
from tkinter import ttk, filedialog, messagebox, simpledialog
import jieba
import math
import requests
import threading
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
from urllib.parse import quote
from tkinter.scrolledtext import ScrolledText
class InformationRetrievalSystem:
def __init__(self, root):
self.root = root
self.root.title("智能信息检索系统 v3.0")
self.root.geometry("1200x800")
# 初始化系统组件
self.documents = {}
self.vocabulary = set()
self.inverted_index = defaultdict(lambda: {'doc_ids': {}, 'idf': 0})
self.stop_words = self.load_stop_words()
self.doc_vectors = {}
self.synonym_dict = self.load_synonym_dict() # 同义词词典
self.domain_categories = self.load_domain_categories() # 领域分类
# 线程安全的文档计数器
self.doc_counter = 0
self.doc_counter_lock = threading.Lock()
# 界面布局
self.create_ui()
self.create_status_bar()
def create_ui(self):
"""创建主界面"""
main_frame = ttk.Frame(self.root)
main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 控制面板
control_frame = ttk.LabelFrame(main_frame, text="控制面板")
control_frame.pack(fill=tk.X, pady=5)
# 搜索组件
self.search_entry = ttk.Entry(control_frame, width=50)
self.search_entry.pack(side=tk.LEFT, padx=5)
# 搜索类型选择
search_type_frame = ttk.Frame(control_frame)
search_type_frame.pack(side=tk.LEFT, padx=5)
self.search_type = tk.StringVar(value="关键词")
ttk.Radiobutton(search_type_frame, text="关键词", variable=self.search_type, value="关键词").pack(side=tk.LEFT)
ttk.Radiobutton(search_type_frame, text="布尔", variable=self.search_type, value="布尔").pack(side=tk.LEFT)
ttk.Radiobutton(search_type_frame, text="短语", variable=self.search_type, value="短语").pack(side=tk.LEFT)
ttk.Radiobutton(search_type_frame, text="扩展", variable=self.search_type, value="扩展").pack(side=tk.LEFT)
ttk.Button(control_frame, text="搜索", command=self.search).pack(side=tk.LEFT, padx=5)
# 文档管理组件
doc_manage_frame = ttk.Frame(control_frame)
doc_manage_frame.pack(side=tk.LEFT, padx=20)
ttk.Button(doc_manage_frame, text="加载本地文档", command=self.load_local_documents).pack(side=tk.LEFT)
ttk.Button(doc_manage_frame, text="输入URL添加", command=self.add_url_document).pack(side=tk.LEFT, padx=5)
# 自动搜索组件
auto_search_frame = ttk.Frame(control_frame)
auto_search_frame.pack(side=tk.LEFT)
self.keyword_entry = ttk.Entry(auto_search_frame, width=20)
self.keyword_entry.pack(side=tk.LEFT)
self.keyword_entry.insert(0, "人工智能")
ttk.Button(auto_search_frame, text="自动网络搜索", command=self.auto_web_search).pack(side=tk.LEFT, padx=5)
# 结果显示区域
result_frame = ttk.LabelFrame(main_frame, text="搜索结果")
result_frame.pack(fill=tk.BOTH, expand=True)
columns = ('doc_id', 'source', 'score', 'content')
self.result_tree = ttk.Treeview(
result_frame,
columns=columns,
show='headings',
selectmode='browse'
)
# 配置列
self.result_tree.heading('doc_id', text='文档ID', anchor=tk.W)
self.result_tree.heading('source', text='来源', anchor=tk.W)
self.result_tree.heading('score', text='相关度', anchor=tk.CENTER)
self.result_tree.heading('content', text='内容摘要', anchor=tk.W)
self.result_tree.column('doc_id', width=150, minwidth=100)
self.result_tree.column('source', width=100, minwidth=80)
self.result_tree.column('score', width=80, minwidth=60, anchor=tk.CENTER)
self.result_tree.column('content', width=800, minwidth=400)
vsb = ttk.Scrollbar(result_frame, orient="vertical", command=self.result_tree.yview)
hsb = ttk.Scrollbar(result_frame, orient="horizontal", command=self.result_tree.xview)
self.result_tree.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set)
self.result_tree.grid(row=0, column=0, sticky=tk.NSEW)
vsb.grid(row=0, column=1, sticky=tk.NS)
hsb.grid(row=1, column=0, sticky=tk.EW)
result_frame.grid_rowconfigure(0, weight=1)
result_frame.grid_columnconfigure(0, weight=1)
# 详情面板
detail_frame = ttk.LabelFrame(main_frame, text="文档详情")
detail_frame.pack(fill=tk.BOTH, expand=False, pady=5)
self.detail_text = ScrolledText(detail_frame, wrap=tk.WORD, height=8)
self.detail_text.pack(fill=tk.BOTH, expand=True)
# 绑定事件
self.result_tree.bind('<<TreeviewSelect>>', self.show_detail)
def create_status_bar(self):
"""创建状态栏"""
self.status_var = tk.StringVar()
status_bar = ttk.Label(self.root, textvariable=self.status_var, relief=tk.SUNKEN)
status_bar.pack(side=tk.BOTTOM, fill=tk.X)
def update_status(self, message):
"""更新状态栏"""
self.status_var.set(message)
self.root.update_idletasks()
def load_stop_words(self):
"""加载停用词表"""
stop_words = set()
try:
with open('stopwords.txt', 'r', encoding='utf-8') as f:
stop_words = set(line.strip() for line in f if line.strip())
except FileNotFoundError:
messagebox.showwarning("警告", "未找到停用词文件stopwords.txt")
return stop_words
def load_synonym_dict(self):
"""加载同义词词典"""
synonyms = defaultdict(dict) # 改为存储术语-同义词-权重的字典
try:
with open('synonyms.txt', 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
parts = line.strip().split(',')
if len(parts) > 1:
term = parts[0]
for syn_with_weight in parts[1:]:
if ':' in syn_with_weight:
syn, weight = syn_with_weight.split(':')
synonyms[term][syn] = float(weight)
else:
synonyms[term][syn_with_weight] = 0.7 # 默认权重
except FileNotFoundError:
messagebox.showwarning("警告", "未找到同义词文件synonyms.txt")
return synonyms
def load_domain_categories(self):
"""加载领域分类词典"""
categories = defaultdict(list)
try:
with open('domain_categories.txt', 'r', encoding='utf-8') as f:
for line in f:
if line.strip():
parts = line.strip().split(':')
if len(parts) == 2:
domain, terms = parts
categories[domain] = terms.split(',')
except FileNotFoundError:
messagebox.showwarning("警告", "未找到领域分类文件domain_categories.txt")
return categories
# 文档管理功能
def load_local_documents(self):
"""加载本地文档"""
files = filedialog.askopenfilenames(filetypes=[("Text files", "*.txt")])
if not files:
return
for file in files:
try:
with open(file, 'r', encoding='utf-8') as f:
content = f.read()
doc_id = os.path.basename(file)
self.add_document(doc_id, content, '本地文件')
except Exception as e:
messagebox.showerror("错误", f"加载文件失败:{str(e)}")
self.update_index()
messagebox.showinfo("成功", f"已加载 {len(files)} 个本地文档")
def add_url_document(self):
"""手动添加URL文档"""
url = simpledialog.askstring("输入URL", "请输入网页地址:")
if url:
threading.Thread(target=self.fetch_web_content, args=(url, '手动添加'), daemon=True).start()
def auto_web_search(self):
"""自动网络搜索文档"""
keyword = self.keyword_entry.get().strip()
if not keyword:
messagebox.showwarning("警告", "请输入搜索关键词")
return
threading.Thread(target=self.baidu_search, args=(keyword,), daemon=True).start()
# 网络请求功能
def fetch_web_content(self, url, source):
"""获取网页内容"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# 获取真实URL
with requests.get(url, headers=headers, timeout=10, allow_redirects=True) as response:
response.raise_for_status()
final_url = response.url
# 获取页面内容
with requests.get(final_url, headers=headers, timeout=15) as response:
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# 提取正文(简单实现)
text = soup.get_text()
text = re.sub(r'\s+', ' ', text).strip()
# 生成唯一文档ID
with self.doc_counter_lock:
self.doc_counter += 1
doc_id = f"{source}_{self.doc_counter}"
self.add_document(doc_id, text, final_url)
self.root.after(0, lambda: messagebox.showinfo("成功", f"已添加文档:{doc_id}"))
except Exception as e:
self.root.after(0, lambda: messagebox.showerror("错误", f"抓取失败:{str(e)}"))
def baidu_search(self, keyword):
"""执行百度搜索"""
try:
self.update_status(f"正在搜索: {keyword}")
search_url = f"https://www.baidu.com/s?wd={quote(keyword)}"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
with requests.get(search_url, headers=headers, timeout=10) as response:
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# 修复:使用新的CSS选择器来定位搜索结果
results = soup.select('div.c-container')
max_results = min(5, len(results))
for idx, result in enumerate(results[:max_results]):
# 修复:提取正确的链接
if link := result.select_one('h3.t a'):
href = link.get('href', '')
if href.startswith('http'): # 确保是有效的URL
# 错开线程启动时间,避免请求过于密集
threading.Timer(
idx * 1, # 每个线程间隔1秒启动
self.fetch_web_content,
args=(href, f"百度结果_{idx+1}")
).start()
self.update_status(f"搜索完成: {keyword}")
except Exception as e:
self.update_status(f"搜索失败: {str(e)}")
self.root.after(0, lambda: messagebox.showerror("错误", f"搜索失败:{str(e)}"))
def add_document(self, doc_id, content, source):
"""添加文档到系统"""
if doc_id in self.documents:
return
self.documents[doc_id] = self.preprocess_text(content)
self.update_index()
# 更新UI显示
def update_treeview():
self.result_tree.insert('', 'end', values=(
doc_id,
source,
'N/A',
' '.join(self.documents[doc_id][:30]) + '...'
))
self.root.after(0, update_treeview)
def preprocess_text(self, text):
"""文本预处理"""
words = jieba.lcut(text)
return [word for word in words
if word not in self.stop_words
and re.match(r'^[\u4e00-\u9fa5]+$', word)]
def update_index(self):
"""更新索引系统"""
self.build_inverted_index()
self.calculate_tf_idf()
def build_inverted_index(self):
"""构建倒排索引"""
self.vocabulary = set()
self.inverted_index = defaultdict(lambda: {'doc_ids': {}, 'idf': 0})
# 收集词项位置
for doc_id, words in self.documents.items():
term_positions = defaultdict(list)
for pos, term in enumerate(words):
term_positions[term].append(pos)
self.vocabulary.add(term)
for term, positions in term_positions.items():
self.inverted_index[term]['doc_ids'][doc_id] = positions
# 计算IDF
total_docs = len(self.documents)
for term in self.inverted_index:
doc_count = len(self.inverted_index[term]['doc_ids'])
self.inverted_index[term]['idf'] = math.log((total_docs + 1) / (doc_count + 1e-9))
def calculate_tf_idf(self):
"""计算TF-IDF向量"""
self.doc_vectors = {}
for doc_id, words in self.documents.items():
tf = defaultdict(int)
for word in words:
tf[word] += 1
vector = {}
for term, count in tf.items():
tf_val = count / len(words)
idf_val = self.inverted_index[term]['idf']
vector[term] = tf_val * idf_val
self.doc_vectors[doc_id] = vector
# 搜索功能
def search(self):
"""根据不同搜索类型执行搜索"""
query = self.search_entry.get().strip()
search_type = self.search_type.get()
if not query:
messagebox.showwarning("警告", "请输入查询内容")
return
# 清空结果
for item in self.result_tree.get_children():
self.result_tree.delete(item)
if search_type == "关键词":
self.perform_keyword_search(query)
elif search_type == "布尔":
self.perform_boolean_search(query)
elif search_type == "短语":
self.perform_phrase_search(query)
elif search_type == "扩展":
self.perform_expanded_search(query)
def perform_keyword_search(self, query):
"""执行关键词搜索"""
query_terms = self.preprocess_text(query)
if not query_terms:
messagebox.showwarning("警告", "查询词无效或均为停用词")
return
# 构建查询向量
query_vector = defaultdict(float)
term_counts = defaultdict(int)
for term in query_terms:
term_counts[term] += 1
for term in term_counts:
if term in self.inverted_index:
tf = term_counts[term] / len(query_terms)
idf = self.inverted_index[term]['idf']
query_vector[term] = tf * idf
# 计算相似度
scores = {}
query_norm = math.sqrt(sum(v**2 for v in query_vector.values()))
for doc_id, doc_vector in self.doc_vectors.items():
dot_product = sum(query_vector[term] * doc_vector[term]
for term in query_vector if term in doc_vector)
doc_norm = math.sqrt(sum(v**2 for v in doc_vector.values()))
if query_norm * doc_norm == 0:
score = 0
else:
score = dot_product / (query_norm * doc_norm)
scores[doc_id] = score
# 显示结果
for doc_id, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
if score > 0:
source = '本地文档' if doc_id.endswith('.txt') else '网络文档'
preview = ' '.join(self.documents[doc_id][:30]) + '...'
self.result_tree.insert('', 'end', values=(
doc_id,
source,
f"{score:.4f}",
preview
))
def perform_boolean_search(self, query):
"""执行布尔搜索"""
# 解析查询字符串,支持 AND(&), OR(|), NOT(!), 括号()
tokens = re.findall(r'[\(\)!&\|]|[\u4e00-\u9fa5]+', query)
query_terms = []
operators = []
for token in tokens:
if token in {'&', '|', '!', '(', ')'}:
operators.append(token)
else:
# 直接使用原始词项(不进行分词)
if token not in self.stop_words and re.match(r'^[\u4e00-\u9fa5]+$', token):
query_terms.append(token)
if not query_terms:
messagebox.showwarning("警告", "查询词无效或均为停用词")
return
# 执行布尔查询
try:
result_docs = self.evaluate_boolean_expression(query_terms, operators)
except Exception as e:
messagebox.showerror("错误", f"布尔表达式解析错误: {str(e)}")
return
# 显示结果
for doc_id in result_docs:
source = '本地文档' if doc_id.endswith('.txt') else '网络文档'
preview = ' '.join(self.documents[doc_id][:30]) + '...'
self.result_tree.insert('', 'end', values=(
doc_id,
source,
"1.0000", # 布尔查询结果只有匹配/不匹配
preview
))
def evaluate_boolean_expression(self, terms, operators):
"""评估布尔表达式"""
# 使用栈式表达式求值
docs_stack = []
op_stack = []
for token in terms + operators:
if token in terms:
# 获取词项的文档集合
if token in self.inverted_index:
docs = set(self.inverted_index[token]['doc_ids'].keys())
else:
docs = set()
docs_stack.append(docs)
elif token == '(':
op_stack.append(token)
elif token == ')':
# 处理括号内的表达式
while op_stack and op_stack[-1] != '(':
op = op_stack.pop()
self.apply_operator(op, docs_stack)
op_stack.pop() # 弹出左括号
elif token in {'&', '|', '!'}:
# 处理操作符优先级
while (op_stack and op_stack[-1] != '(' and
self.operator_precedence(op_stack[-1]) >= self.operator_precedence(token)):
op = op_stack.pop()
self.apply_operator(op, docs_stack)
op_stack.append(token)
# 处理剩余操作符
while op_stack:
op = op_stack.pop()
self.apply_operator(op, docs_stack)
if not docs_stack:
return set()
return docs_stack[0]
def operator_precedence(self, op):
"""定义操作符优先级"""
if op == '!':
return 3
elif op == '&':
return 2
elif op == '|':
return 1
return 0
def apply_operator(self, op, docs_stack):
"""应用布尔操作符"""
if op == '!':
if len(docs_stack) < 1:
raise ValueError("操作数不足")
operand = docs_stack.pop()
result = set(self.documents.keys()) - operand
docs_stack.append(result)
elif op in {'&', '|'}:
if len(docs_stack) < 2:
raise ValueError("操作数不足")
right = docs_stack.pop()
left = docs_stack.pop()
if op == '&':
result = left & right
else: # op == '|'
result = left | right
docs_stack.append(result)
def perform_phrase_search(self, query):
"""执行短语搜索"""
# 预处理查询
query_terms = self.preprocess_text(query)
if not query_terms:
messagebox.showwarning("警告", "查询词无效或均为停用词")
return
# 查找所有包含第一个词的文档
if query_terms[0] not in self.inverted_index:
return # 没有匹配结果
candidate_docs = set(self.inverted_index[query_terms[0]]['doc_ids'].keys())
# 检查每个候选文档是否包含完整短语
result_docs = []
for doc_id in candidate_docs:
# 检查后续词是否按顺序出现在文档中
positions = self.inverted_index[query_terms[0]]['doc_ids'][doc_id]
found = False
for pos in positions:
match = True
for i in range(1, len(query_terms)):
next_term = query_terms[i]
if next_term not in self.inverted_index:
match = False
break
if doc_id not in self.inverted_index[next_term]['doc_ids']:
match = False
break
# 检查下一个词是否出现在预期位置
expected_pos = pos + i
if expected_pos not in self.inverted_index[next_term]['doc_ids'][doc_id]:
match = False
break
if match:
found = True
break
if found:
result_docs.append(doc_id)
# 显示结果
for doc_id in result_docs:
source = '本地文档' if doc_id.endswith('.txt') else '网络文档'
preview = ' '.join(self.documents[doc_id][:30]) + '...'
self.result_tree.insert('', 'end', values=(
doc_id,
source,
"1.0000", # 短语查询结果只有匹配/不匹配
preview
))
def perform_expanded_search(self, query):
"""执行查询扩展搜索"""
# 预处理查询
query_terms = self.preprocess_text(query)
if not query_terms:
messagebox.showwarning("警告", "查询词无效或均为停用词")
return
# 扩展查询词
expanded_terms = []
for term in query_terms:
# 添加原始词
expanded_terms.append(term)
# 添加同义词
if term in self.synonym_dict:
for syn, weight in self.synonym_dict[term].items():
expanded_terms.extend([(syn, weight)] * int(weight * 10)) # 按权重比例扩展
# 添加领域相关术语
for domain, terms in self.domain_categories.items():
if term in terms:
for related_term in terms:
if related_term != term and related_term not in self.synonym_dict.get(term, {}):
expanded_terms.append((related_term, 0.5)) # 领域相关术语权重较低
# 构建扩展查询向量
query_vector = defaultdict(float)
term_counts = defaultdict(float)
for item in expanded_terms:
if isinstance(item, tuple):
term, weight = item
term_counts[term] += weight
else:
term_counts[item] += 1.0 # 原始词权重为1.0
# 计算TF-IDF
total_weight = sum(term_counts.values())
for term, count in term_counts.items():
if term in self.inverted_index:
tf = count / total_weight
idf = self.inverted_index[term]['idf']
query_vector[term] = tf * idf
# 计算相似度
scores = {}
query_norm = math.sqrt(sum(v**2 for v in query_vector.values()))
for doc_id, doc_vector in self.doc_vectors.items():
dot_product = sum(query_vector[term] * doc_vector[term]
for term in query_vector if term in doc_vector)
doc_norm = math.sqrt(sum(v**2 for v in doc_vector.values()))
if query_norm * doc_norm == 0:
score = 0
else:
score = dot_product / (query_norm * doc_norm)
scores[doc_id] = score
# 显示结果
for doc_id, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
if score > 0:
source = '本地文档' if doc_id.endswith('.txt') else '网络文档'
preview = ' '.join(self.documents[doc_id][:30]) + '...'
self.result_tree.insert('', 'end', values=(
doc_id,
source,
f"{score:.4f}",
preview
))
def show_detail(self, event):
"""显示文档详情"""
selected = self.result_tree.selection()
if not selected:
return
item = self.result_tree.item(selected[0])
doc_id = item['values'][0]
content = ' '.join(self.documents.get(doc_id, []))
self.detail_text.delete(1.0, tk.END)
self.detail_text.insert(tk.END, content)
if __name__ == "__main__":
    # Load the jieba dictionary up front so that the first tokenisation does
    # not pay the initialisation cost.
    jieba.initialize()
    root = tk.Tk()
    app = InformationRetrievalSystem(root)
    root.mainloop()

# Assignment brief (originally pasted as bare text after the code, which made
# the file unparseable; kept here as a comment):
#
# Using Python, segment the documents into words, remove stop words,
# punctuation and the like, and build the document vocabulary.  From the
# vocabulary and the document contents build the postings lists of the
# inverted index, including position information.  Then, given the search
# keywords, run retrieval that satisfies a boolean condition and implement
# phrase queries, rank the result documents by cosine similarity, and
# implement query expansion.  Finally submit the program and the course
# design report.
#
# A UI is required: entering AND / OR / NOT should be assisted by buttons or
# a drop-down list, the inverted index should be displayed, and the content
# of the result documents should be shown with the matching terms
# highlighted.