import os
import re
import math
import http.server
import socketserver
import urllib.parse
from collections import defaultdict
from lxml import etree, html
# Configuration
PORT = 8000
DOCUMENTS_DIR = "documents"  # directory that holds the HTML files to index
INDEX_FILE = "search_index.json"

# Create the documents directory if it does not exist yet.
os.makedirs(DOCUMENTS_DIR, exist_ok=True)

# Sample documents written to DOCUMENTS_DIR below so there is something to
# index (a real deployment would load documents from the file system instead).
sample_docs = [
    {
        "id": 1,
        "title": "Python编程语言",
        "content": "Python是一种高级编程语言,由Guido van Rossum创建。它强调代码可读性和简洁的语法。",
        "url": "doc1.html",
    },
    {
        "id": 2,
        "title": "lxml库简介",
        "content": "lxml是Python中处理XML和HTML的强大库,基于libxml2和libxslt库构建。",
        "url": "doc2.html",
    },
    {
        "id": 3,
        "title": "构建搜索网站",
        "content": "使用Python和lxml可以构建高效的搜索系统,解析HTML并提取关键信息。",
        "url": "doc3.html",
    },
]

# Skeleton of a generated sample page; {title} and {content} are filled in
# per document by the loop below.
_SAMPLE_PAGE_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
<title>{title}</title>
</head>
<body>
<h1>{title}</h1>
<p>{content}</p>
<p>相关主题: Python, 编程, lxml, 搜索系统</p>
</body>
</html>
"""

# The stdlib escaper is imported under an alias because the file-level name
# `html` is bound to lxml.html.
from html import escape as _escape_html

# Write one HTML file per sample document. Title and content are escaped so
# that sample text containing "<" or "&" cannot break the generated markup.
for doc in sample_docs:
    doc_path = os.path.join(DOCUMENTS_DIR, doc["url"])
    with open(doc_path, "w", encoding="utf-8") as f:
        f.write(
            _SAMPLE_PAGE_TEMPLATE.format(
                title=_escape_html(doc["title"]),
                content=_escape_html(doc["content"]),
            )
        )
class SearchEngine:
    """In-memory TF-IDF search engine over a directory of HTML files.

    Attributes:
        index: inverted index, ``{term: {doc_id: term_frequency}}``.
        documents: per-document metadata, ``{doc_id: {"title", "url", "content"}}``.
        doc_count: number of indexed documents.
        doc_lengths: number of tokens per document, ``{doc_id: length}``.
    """

    # One token per CJK ideograph, or per run of other word characters.
    # A plain ``\w+`` would glue an entire Chinese sentence (and any Latin
    # words embedded in it) into a single token that no query could match.
    _TOKEN_RE = re.compile(r"[\u4e00-\u9fff]|[^\W\u4e00-\u9fff]+")

    def __init__(self):
        self.index = defaultdict(dict)  # inverted index {term: {doc_id: count}}
        self.documents = {}  # document metadata {doc_id: {title, url, content}}
        self.doc_count = 0
        self.doc_lengths = {}  # document length in tokens

    def tokenize(self, text):
        """Split *text* into lowercase tokens.

        Runs of non-CJK word characters become one token each; every CJK
        ideograph becomes its own token.
        """
        return self._TOKEN_RE.findall(text.lower())

    def build_index(self, documents_dir):
        """Rebuild the index from every ``.html`` file in *documents_dir*.

        Any previously indexed state is discarded. Files are visited in
        sorted order so document ids are stable between runs. Files that
        lxml rejects as empty are skipped.
        """
        self.index.clear()
        self.documents.clear()
        self.doc_lengths.clear()
        self.doc_count = 0

        for filename in sorted(os.listdir(documents_dir)):
            if not filename.endswith(".html"):
                continue
            filepath = os.path.join(documents_dir, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()

            # Parse the HTML with lxml; an empty document raises ParserError
            # and must not abort indexing of the remaining files.
            try:
                tree = html.fromstring(content)
            except etree.ParserError:
                continue

            title = tree.findtext(".//title") or filename
            body_text = " ".join(tree.xpath("//body//text()"))

            # Allocate the id only once the document parsed successfully.
            self.doc_count += 1
            doc_id = self.doc_count

            # Snippet: first 200 characters, with an ellipsis only when the
            # text was actually truncated.
            snippet = body_text.strip()
            if len(snippet) > 200:
                snippet = snippet[:200] + "..."
            self.documents[doc_id] = {
                "title": title.strip(),
                "url": filename,
                "content": snippet,
            }

            words = self.tokenize(title + " " + body_text)
            self.doc_lengths[doc_id] = len(words)

            # Single pass over the tokens instead of list.count() per word.
            for word in words:
                postings = self.index[word]
                postings[doc_id] = postings.get(doc_id, 0) + 1

    def tf_idf(self, term, doc_id):
        """Return the TF-IDF score of *term* in document *doc_id*.

        TF is the term count divided by the document length. IDF is the
        smoothed form ``log((1 + N) / (1 + df)) + 1``, which stays positive
        even for a term that occurs in every document. Unknown terms and
        unknown or empty documents score 0.0.
        """
        # .get() avoids inserting an empty posting list into the defaultdict.
        postings = self.index.get(term)
        doc_length = self.doc_lengths.get(doc_id, 0)
        if not postings or not doc_length:
            return 0.0
        tf = postings.get(doc_id, 0) / doc_length
        idf = math.log((1 + self.doc_count) / (1 + len(postings))) + 1
        return tf * idf

    def search(self, query):
        """Return the documents matching *query*, best match first.

        Builds the index from ``DOCUMENTS_DIR`` when it is still empty. Each
        result is a copy of the document metadata with an added ``"score"``
        key (rounded to 4 decimals). Ties are broken by ascending doc id.
        """
        if not self.index:
            self.build_index(DOCUMENTS_DIR)

        # Sum the per-term scores for every document containing the term.
        scores = defaultdict(float)
        for term in self.tokenize(query):
            for doc_id in self.index.get(term, ()):
                scores[doc_id] += self.tf_idf(term, doc_id)

        ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))

        results = []
        for doc_id, score in ranked:
            if score > 0:  # only return documents with some relevance
                doc_info = self.documents[doc_id].copy()
                doc_info["score"] = round(score, 4)
                results.append(doc_info)
        return results
# Module-level SearchEngine instance; SearchHandler.do_GET runs queries against it.
search_engine = SearchEngine()
class SearchHandler(http.server.SimpleHTTPRequestHandler):
    """HTTP handler serving the search form, result pages, and static files.

    ``/`` renders the search form, ``/search?q=...`` renders the results for
    the query, and every other path is delegated to
    ``SimpleHTTPRequestHandler.do_GET``.
    """

    def do_GET(self):
        """Route a GET request to the search form, results page, or static files."""
        # Route on the parsed path so "/search" without a query string and
        # "/?..." are handled too, instead of falling through to file serving.
        parsed = urllib.parse.urlparse(self.path)
        if parsed.path == "/":
            self._send_html(self.generate_search_page())
        elif parsed.path == "/search":
            query = urllib.parse.parse_qs(parsed.query).get("q", [""])[0]
            results = search_engine.search(query)
            self._send_html(self.generate_results_page(query, results))
        else:
            # Everything else is treated as a static file request.
            super().do_GET()

    def _send_html(self, page):
        """Send *page* as a complete ``200 OK`` UTF-8 HTML response."""
        # The body is produced before any header is written, so a failure
        # while rendering cannot leave a half-sent 200 response behind.
        body = page.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def generate_search_page(self):
        """Return the HTML of the landing page containing the search form."""
        return """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Python搜索网站</title>
<style>
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #2c3e50; text-align: center; }
.search-box { display: flex; margin: 20px 0; }
input[type="text"] { flex: 1; padding: 12px; font-size: 16px; border: 1px solid #ddd; border-radius: 4px 0 0 4px; }
button { padding: 12px 20px; background: #3498db; color: white; border: none; border-radius: 0 4px 4px 0; cursor: pointer; font-size: 16px; }
button:hover { background: #2980b9; }
.result-item { margin: 15px 0; padding: 15px; border: 1px solid #eee; border-radius: 4px; }
.result-title { font-size: 18px; font-weight: bold; margin-bottom: 5px; color: #3498db; }
.result-url { color: #006621; font-size: 14px; margin-bottom: 5px; }
.result-snippet { color: #545454; }
.no-results { text-align: center; padding: 20px; color: #777; }
.footer { margin-top: 30px; text-align: center; color: #777; font-size: 14px; }
</style>
</head>
<body>
<div class="container">
<h1>Python搜索网站</h1>
<form action="/search" method="get">
<div class="search-box">
<input type="text" name="q" placeholder="输入搜索关键词...">
<button type="submit">搜索</button>
</div>
</form>
<div class="info">
<p>这个搜索网站使用Python内置库和lxml构建,可以索引和搜索本地HTML文档。</p>
<p>示例文档已包含在系统中,尝试搜索: "Python", "lxml", "搜索"等关键词。</p>
</div>
<div class="footer">
使用Python + lxml构建 | 本地搜索系统
</div>
</div>
</body>
</html>
"""

    def generate_results_page(self, query, results):
        """Return the HTML results page for *query*.

        Args:
            query: the raw search string typed by the user.
            results: list of dicts with ``"url"``, ``"title"``, ``"content"``
                and ``"score"`` keys, as returned by ``SearchEngine.search``.

        The query and every document-derived field are HTML-escaped before
        being placed in the page, and result links are URL-quoted.
        """
        # Imported locally: the file-level name `html` refers to lxml.html.
        from html import escape

        if results:
            items = []
            for result in results:
                doc_path = f"/{DOCUMENTS_DIR}/{result['url']}"
                # Percent-encode for the link target, then escape for the
                # attribute context.
                href = escape(urllib.parse.quote(doc_path), quote=True)
                shown_path = escape(doc_path)
                title = escape(result["title"])
                snippet = escape(result["content"])
                score = result["score"]
                items.append(f"""
<div class="result-item">
<div class="result-title"><a href="{href}">{title}</a></div>
<div class="result-url">{shown_path}</div>
<div class="result-snippet">{snippet}</div>
<div class="result-info">相关性: {score}</div>
</div>
""")
            results_html = "".join(items)
        else:
            results_html = '<div class="no-results">没有找到相关结果</div>'

        # The query is echoed both in <title> and inside an attribute value,
        # so quotes must be escaped as well.
        safe_query = escape(query, quote=True)
        return f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>搜索: {safe_query} - Python搜索网站</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 0; padding: 20px; background-color: #f5f5f5; }}
.container {{ max-width: 800px; margin: 0 auto; background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }}
h1 {{ color: #2c3e50; }}
.search-box {{ display: flex; margin: 20px 0; }}
input[type="text"] {{ flex: 1; padding: 12px; font-size: 16px; border: 1px solid #ddd; border-radius: 4px 0 0 4px; }}
button {{ padding: 12px 20px; background: #3498db; color: white; border: none; border-radius: 0 4px 4px 0; cursor: pointer; font-size: 16px; }}
button:hover {{ background: #2980b9; }}
.results-count {{ color: #777; margin-bottom: 20px; }}
.result-item {{ margin: 15px 0; padding: 15px; border: 1px solid #eee; border-radius: 4px; }}
.result-title {{ font-size: 18px; font-weight: bold; margin-bottom: 5px; color: #3498db; }}
.result-url {{ color: #006621; font-size: 14px; margin-bottom: 5px; }}
.result-snippet {{ color: #545454; }}
.result-info {{ color: #777; font-size: 14px; margin-top: 5px; }}
.no-results {{ text-align: center; padding: 20px; color: #777; }}
.footer {{ margin-top: 30px; text-align: center; color: #777; font-size: 14px; }}
</style>
</head>
<body>
<div class="container">
<h1>Python搜索网站</h1>
<form action="/search" method="get">
<div class="search-box">
<input type="text" name="q" value="{safe_query}">
<button type="submit">搜索</button>
</div>
</form>
<div class="results-count">找到 {len(results)} 条结果</div>
{results_html}
<div class="footer">
<a href="/">返回首页</a> | 使用Python + lxml构建
</div>
</div>
</body>
</html>
"""
class _ReusableTCPServer(socketserver.TCPServer):
    """TCPServer that may rebind its port immediately after a restart."""

    # Without this, restarting shortly after Ctrl+C can fail with
    # "Address already in use" while the old socket lingers.
    allow_reuse_address = True


# Start the server on PORT and serve until interrupted with Ctrl+C.
with _ReusableTCPServer(("", PORT), SearchHandler) as httpd:
    print(f"服务器运行在端口 {PORT}")
    print(f"访问地址: http://localhost:{PORT}/")
    print("按Ctrl+C停止服务器")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        print("\n服务器已停止")