A problem with tuckey urlrewrite 3.x

While configuring the rewrite rules for myblog I found that URLs carrying a query string would never match. Setting the use-query-string attribute of the <urlrewrite> element to true solved the problem.
Last night I was setting up the rewrite rules for myblog and hit an odd problem. My current blog runs on pjblog, and I want the links that search engines have already indexed to stay valid, so I need to rewrite the old asp links onto the new application. That led to a rule like this:

<rule>
    <from>^/article.asp\?id=(.*)$</from>
    <to type="redirect">/entry/$1.jhtml</to>
</rule>
But a link like that never matched; it only worked once I removed the ?. The regular expression itself is fine: tested on its own, it does match /article.asp?id=64.
Later I read the 3.0 manual (http://tuckey.org/urlrewrite/manual/3.0/) and found the cause.

The <urlrewrite> element

The top level element.

default-match-type (optional)
    regex (default): All rules and their conditions will be processed using the Java Regular Expression engine (unless match-type is specified on a rule).
    wildcard: All rules and their conditions will be processed using the Wildcard Expression engine (unless match-type is specified on a rule).

decode-using (optional)
    utf8 (default): When the URL is decoded, UTF-8 will be used.
    null: Do not decode.
    [encoding]: Any string representing a supported character encoding, e.g. ISO-8859-1. See the Java Charset object for more info.

use-query-string (optional)
    false (default): The query string will not be appended to the URL that the "from" element matches against.
    true: The query string will be appended to the URL that the "from" element matches against.

use-context (optional)
    false (default): The context path will not be added to the URL that the "from" element matches against.
    true: The context path will be added to the URL that the "from" element matches against.
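For context, all four of these attributes sit on the root element of WEB-INF/urlrewrite.xml. Below is a minimal sketch of that file with each top-level attribute written out at its default value; the DOCTYPE shown is the one the 3.0 manual uses, so adjust it if you run a different version.

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE urlrewrite PUBLIC "-//tuckey.org//DTD UrlRewrite 3.0//EN"
        "http://tuckey.org/res/dtds/urlrewrite3.0.dtd">

<!-- All four attributes are optional; the values shown are the defaults. -->
<urlrewrite default-match-type="regex"
            decode-using="utf8"
            use-query-string="false"
            use-context="false">
    <!-- rules go here -->
</urlrewrite>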

So the culprit is use-query-string. By default the query string is not used, meaning everything after the ? is ignored before matching, so my rule could never match. All it takes is adding one attribute to <urlrewrite>:
<urlrewrite use-query-string="true">
    ...
</urlrewrite>
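Putting it together, the working configuration is sketched below; it simply combines that attribute with the rule from above (the dot before asp is escaped here for strictness, though the unescaped pattern also matches since . matches any character). This assumes the filter is already registered in web.xml with the filter class org.tuckey.web.filters.urlrewrite.UrlRewriteFilter.

<urlrewrite use-query-string="true">
    <rule>
        <!-- With use-query-string="true" the query string is part of the
             string the "from" pattern is matched against, so the id can
             be captured and reused as $1. -->
        <from>^/article\.asp\?id=(.*)$</from>
        <to type="redirect">/entry/$1.jhtml</to>
    </rule>
</urlrewrite>

type="redirect" sends the client to the new /entry/ URL with an ordinary redirect; the manual also lists permanent-redirect if you would rather return a 301 so search engines update their index.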
