# -*- coding: utf-8 -*-
# Optimized Chinese-English bilingual search engine
import re
import sqlite3
import jieba
import heapq
import langid
from bs4 import BeautifulSoup
import requests
from math import log
from collections import defaultdict, deque
from functools import lru_cache
from dataclasses import dataclass
from flask import Flask, request, render_template, redirect, g
# ========================
# Data structures
# ========================
@dataclass
class DocumentMetadata:
url: str
title: str = ""
snippet: str = ""
lang: str = ""
vector_length: float = 0.0
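# Illustrative construction (hypothetical values):
#   DocumentMetadata(url='https://example.com', title='Example',
#                    snippet='Example Domain...', lang='en')
# vector_length is filled in later, during index construction.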
# ========================
# 1. Efficient crawler module
# ========================
class OptimizedCrawler:
def __init__(self, db_path='search_engine.db'):
self.conn = sqlite3.connect(db_path, timeout=10)
self.conn.execute("PRAGMA journal_mode=WAL")
self.conn.execute("PRAGMA synchronous=NORMAL")
self.cursor = self.conn.cursor()
self._init_db()
self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) BingBot/1.0'
self.session = requests.Session()
self.session.headers.update({'User-Agent': self.user_agent})
# Precompile regular expressions
self.url_pattern = re.compile(r'https?://[^\s"\'<>]+')
self.clean_pattern = re.compile(r'[\'";]')
def _init_db(self):
"""优化数据库结构并创建索引[^1][^3]"""
self.cursor.execute('''
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY,
url TEXT UNIQUE,
title TEXT,
content TEXT,
lang TEXT,
processed_content TEXT,
vector_length REAL DEFAULT 0.0
)
''')
# Indexes to speed up lookups
self.cursor.execute('CREATE INDEX IF NOT EXISTS idx_url ON pages(url)')
self.cursor.execute('CREATE INDEX IF NOT EXISTS idx_lang ON pages(lang)')
self.conn.commit()
def sanitize_input(self, text):
"""高效文本清洗"""
return self.clean_pattern.sub('', text)
@lru_cache(maxsize=1000)
def detect_language(self, text):
"""带缓存的语言检测"""
return langid.classify(text[:1000])[0]
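    # Example (scores vary by langid version): langid.classify returns a
    # (language, score) pair, e.g. langid.classify('你好，世界') -> ('zh', ...),
    # so detect_language('你好，世界') yields 'zh'.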
def extract_text(self, html):
"""优化HTML文本提取"""
soup = BeautifulSoup(html, 'html.parser')
# Remove tags that carry no indexable text
for tag in soup(['script', 'style', 'noscript', 'footer', 'header']):
tag.decompose()
# Extract the title and body text
title = soup.title.string if soup.title else ""
body = soup.body.get_text(separator=' ', strip=True) if soup.body else ""
return title, body
    def crawl(self, start_urls, max_pages=100):
        """Breadth-first crawl, committing pages to SQLite in batches."""
        visited = set()
        seen = set(start_urls)  # every URL ever queued, to avoid re-queueing
        queue = deque(start_urls)
        page_count = 0
        bulk_data = []
        while queue and page_count < max_pages:
            url = queue.popleft()
            try:
                response = self.session.get(url, timeout=5)
                response.encoding = response.apparent_encoding
                if response.status_code == 200:
                    title, text_content = self.extract_text(response.text)
                    sanitized_content = self.sanitize_input(text_content)
                    lang = self.detect_language(text_content[:500])
                    # Buffer the row for batched insertion
                    bulk_data.append((
                        url,
                        title,
                        sanitized_content,
                        lang,
                        sanitized_content[:200] + '...'
                    ))
                    visited.add(url)
                    page_count += 1
                    # Commit every 10 pages
                    if page_count % 10 == 0:
                        self._bulk_insert(bulk_data)
                        bulk_data = []
                    # Discover outbound links with the precompiled regex
                    for new_url in self.url_pattern.findall(response.text):
                        if new_url not in seen:
                            seen.add(new_url)
                            queue.append(new_url)
            except Exception as e:
                print(f"Error crawling {url}: {e}")
        # Flush any remaining buffered rows
        if bulk_data:
            self._bulk_insert(bulk_data)
        return list(visited)
def _bulk_insert(self, data):
"""批量插入优化性能[^1]"""
self.cursor.executemany('''
INSERT OR IGNORE INTO pages (url, title, content, lang, processed_content)
VALUES (?, ?, ?, ?, ?)
''', data)
self.conn.commit()
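# Illustrative usage sketch (assumes the start URL is reachable):
#
#   crawler = OptimizedCrawler('search_engine.db')
#   crawled = crawler.crawl(['https://en.wikipedia.org/wiki/Search_engine'],
#                           max_pages=20)
#   print(f"Crawled {len(crawled)} pages")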
# ========================
# 2. Indexing and search module
# ========================
class OptimizedIndexer:
def __init__(self, db_path='search_engine.db'):
self.conn = sqlite3.connect(db_path, timeout=10)
self.conn.execute("PRAGMA journal_mode=WAL")
self.cursor = self.conn.cursor()
# Inverted index data structures
self.index = defaultdict(dict) # term -> {doc_id: tf}
self.documents = {}
self.total_docs = 0
self.doc_freq = defaultdict(int)
self.doc_vector_lengths = {}
# Preload stopwords
self.stopwords = self._load_stopwords()
def _load_stopwords(self):
"""预加载停用词表"""
base_stopwords = {'的', '是', '在', '和', 'the', 'and', 'is', 'of', 'to'}
        try:
            # Extend with jieba's bundled stopword list, if this jieba
            # version exposes it; otherwise fall back to the base set
            from jieba.analyse import STOP_WORDS
            return base_stopwords | STOP_WORDS
        except ImportError:
            return base_stopwords
    @lru_cache(maxsize=10000)
    def preprocess_text(self, text, lang):
        """Tokenize text and drop stopwords, with LRU caching.
        Returns a tuple so the cached value stays immutable."""
        if lang == 'zh':
            # Chinese: jieba word segmentation
            words = jieba.cut(text)
        else:
            # Other languages: lowercase words of 3+ characters
            words = re.findall(r'\b\w{3,}\b', text.lower())
        return tuple(word for word in words if word not in self.stopwords)
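    # Illustrative behaviour (exact jieba segmentation may vary by version
    # and dictionary):
    #   preprocess_text('The quick brown fox', 'en')
    #       -> ('quick', 'brown', 'fox')    # 'the' is a stopword
    #   preprocess_text('搜索引擎的原理', 'zh')
    #       -> e.g. ('搜索引擎', '原理')      # '的' is a stopword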
    def build_index(self):
        """Build the inverted index and precompute document vector lengths."""
        self.cursor.execute("SELECT id, url, title, content, lang, processed_content FROM pages")
        rows = self.cursor.fetchall()
        self.total_docs = len(rows)
        # First pass: tokenize each document, build the inverted index,
        # and accumulate document frequencies
        doc_term_freqs = {}
        for row in rows:
            doc_id, url, title, content, lang, snippet = row
            self.documents[doc_id] = DocumentMetadata(
                url=url,
                title=title,
                snippet=snippet,
                lang=lang
            )
            term_freq = defaultdict(int)
            for token in self.preprocess_text(content, lang):
                term_freq[token] += 1
            doc_term_freqs[doc_id] = term_freq
            for term, freq in term_freq.items():
                self.index[term][doc_id] = freq
                self.doc_freq[term] += 1
        # Second pass: compute vector lengths with the final document
        # frequencies (computing IDF inside the first pass would use
        # incomplete doc_freq counts and skew the lengths)
        for doc_id, term_freq in doc_term_freqs.items():
            vector_length_sq = 0.0
            for term, freq in term_freq.items():
                idf = log(self.total_docs / max(self.doc_freq[term], 1))
                vector_length_sq += (freq * idf) ** 2
            self.doc_vector_lengths[doc_id] = vector_length_sq ** 0.5
        # Persist vector lengths to the database
        self._update_vector_lengths()
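    # Worked example (hypothetical numbers): a document whose only terms
    # carry tf-idf weights 3.0 and 4.0 gets
    # vector_length = sqrt(3.0**2 + 4.0**2) = 5.0.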
def _update_vector_lengths(self):
"""批量更新向量长度"""
update_data = [(self.doc_vector_lengths[doc_id], doc_id)
for doc_id in self.doc_vector_lengths]
self.cursor.executemany(
"UPDATE pages SET vector_length = ? WHERE id = ?",
update_data
)
self.conn.commit()
def tf_idf(self, term, doc_id):
"""高效TF-IDF计算"""
if doc_id not in self.doc_vector_lengths or term not in self.index:
return 0
tf = self.index[term].get(doc_id, 0)
if tf == 0:
return 0
idf = log(self.total_docs / max(self.doc_freq[term], 1))
return tf * idf
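    # Worked example (hypothetical counts): with total_docs = 100 and a term
    # found in 10 documents, idf = log(100 / 10) ≈ 2.303 (natural log); a
    # document containing that term 3 times scores 3 * 2.303 ≈ 6.908.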
    def cosine_similarity(self, query_terms, doc_id):
        """Cosine-style similarity between the query and a document."""
        # Document vector length, precomputed at index time
        doc_vector_length = self.doc_vector_lengths.get(doc_id, 0)
        if doc_vector_length == 0:
            return 0
        # Dot product between the query terms and the document vector
        dot_product = 0
        for term in query_terms:
            if term in self.index and doc_id in self.index[term]:
                dot_product += self.tf_idf(term, doc_id)
        return dot_product / doc_vector_length
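    # Note: the query vector is implicitly binary (each term weighted 1) and
    # the query norm is omitted; since that norm is constant for a given
    # query, it scales all scores equally and leaves the ranking unchanged.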
    def search(self, query, top_n=10):
        """Search and return the top_n (doc_id, score) pairs."""
        # Detect the query language
        lang, _ = langid.classify(query)
        # Preprocess the query
        query_terms = self.preprocess_text(query, lang)
        # Gather candidate documents containing at least one query term
        candidate_docs = set()
        for term in query_terms:
            if term in self.index:
                candidate_docs.update(self.index[term].keys())
        # Keep only the top_n results using a min-heap
        results = []
        for doc_id in candidate_docs:
            score = self.cosine_similarity(query_terms, doc_id)
            if score > 0:
                if len(results) < top_n:
                    heapq.heappush(results, (score, doc_id))
                else:
                    heapq.heappushpop(results, (score, doc_id))
        # Sort by score, descending
        results.sort(reverse=True)
        return [(doc_id, score) for score, doc_id in results]
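# Illustrative usage sketch (assumes pages have already been crawled):
#
#   indexer = OptimizedIndexer()
#   indexer.build_index()
#   for doc_id, score in indexer.search('search engine', top_n=5):
#       print(indexer.documents[doc_id].url, round(score, 4))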
# ========================
# 3. Web service
# ========================
app = Flask(__name__)
# Cache the indexer at module level: Flask's g is reset on every request,
# so storing it only on g would rebuild the whole index per request.
_indexer = None
def get_indexer():
    """Lazily build and cache a single process-wide indexer."""
    global _indexer
    if _indexer is None:
        indexer = OptimizedIndexer()
        try:
            indexer.build_index()
        except Exception as e:
            print(f"Index build error: {e}")
        _indexer = indexer
    return _indexer
@app.before_request
def before_request():
    """Attach the shared indexer to the request context."""
    g.indexer = get_indexer()
@app.route('/')
def home():
return render_template('index.html')
@app.route('/search')
def search():
query = request.args.get('q', '')
if not query.strip():
return redirect('/')
results = []
indexer = g.get('indexer', None)
if indexer and indexer.documents:
search_results = indexer.search(query)
for doc_id, score in search_results:
doc = indexer.documents[doc_id]
results.append({
'url': doc.url,
'title': doc.title,
'snippet': doc.snippet,
'score': round(score, 4)
})
return render_template('results.html', query=query, results=results)
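# Example request: GET /search?q=search+engine renders results.html with a
# list of {url, title, snippet, score} dicts, ranked by score.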
# ========================
# Main entry point
# ========================
if __name__ == '__main__':
    # Startup sequence
    print("Initializing database...")
crawler = OptimizedCrawler()
# Crawl only when the database is nearly empty
crawler.cursor.execute("SELECT COUNT(*) FROM pages")
if crawler.cursor.fetchone()[0] < 10:
start_urls = [
'https://en.wikipedia.org/wiki/Search_engine',
'https://baike.baidu.com/item/%E6%90%9C%E7%B4%A2%E5%BC%95%E6%93%8E'
]
print("开始爬取网页...")
crawler.crawl(start_urls, max_pages=50)
print("启动搜索引擎服务: http://localhost:5000")
app.run(debug=False, threaded=True)