A first web-scraping attempt: the soup=BeautifulSoup(html_doc,'html.parser',from_coding='utf-8') problem

On my first try at a scraping exercise I hit a problem caused by a bad keyword argument. After reading the error message carefully and deleting the unexpected parameter, the scraper finally ran.

Today I tried my first scraping example from a video tutorial, and ran into trouble right away: the very first example failed.
Traceback (most recent call last):
  File "C:\Users\Administrator\eclipse-workspace\imooc\test\test_bs4.py", line 17, in <module>
    soup=BeautifulSoup(html_doc,'html.parser',from_coding='utf-8')
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36-32\lib\site-packages\bs4\__init__.py", line 152, in __init__
    "__init__() got an unexpected keyword argument '%s'" % arg)
TypeError: __init__() got an unexpected keyword argument 'from_coding'
I searched online for quite a while but couldn't find a solution, and was about to give up. Then I decided to puzzle over it a little longer and read the error message again: it says 'from_coding' is not an expected keyword argument. So I thought, why not just delete it? To my surprise, after deleting it the script really did run.
Honestly, though, I have no idea why that works. Maybe someone more experienced will see this post and explain it to me. This is also my first blog post, and I hope other beginners who run into the same problem will find it here, so we can learn and improve together.
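
For anyone who lands here with the same error, here is my best guess at what is going on, sketched with a made-up html_doc (an illustration, not the tutorial's exact code): the BeautifulSoup constructor does accept an encoding hint, but the keyword is spelled from_encoding, not from_coding, which is why __init__() rejected it; and since html_doc is already a decoded Python string, the hint is not needed at all, so simply deleting the argument works.

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# Made-up sample document; the tutorial's html_doc should behave the same way.
html_doc = "<html><head><title>demo</title></head><body><p>hello</p></body></html>"

# html_doc is already a decoded str, so no encoding hint is needed.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.title.string)

# If you parse raw bytes instead, the keyword is from_encoding (with "en"):
raw = html_doc.encode('utf-8')
soup2 = BeautifulSoup(raw, 'html.parser', from_encoding='utf-8')
print(soup2.p.string)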