
# -*- coding: utf-8 -*- """ 网站爬取脚本(优化版): - 读取 data_cn_en.csv(URL列表) - 爬取网站文本并清理 - 每爬取1条记录实时追加保存到CSV - 遇到错误网站直接跳过,不中断整体流程 """ import pandas as pd import requests from bs4 import BeautifulSoup, element import spacy as sp from spacy.language import Language import jieba import os from tqdm import tqdm import traceback from urllib.parse import urlparse # 提前导入URL解析模块 # 打印当前模块导入情况,帮助排查命名冲突 print("=" * 50) print("当前导入模块检查:") print(f"pandas: {pd.__name__} v{pd.__version__}") print(f"requests: {requests.__name__}") print(f"BeautifulSoup: {BeautifulSoup.__name__}") print(f"spacy: {sp.__name__} v{sp.__version__}") print(f"jieba: {jieba.__name__}") print(f"tqdm: {tqdm.__name__}") print("=" * 50) # -------------------------- 1. 初始化文本处理工具 -------------------------- print("\n[初始化] 开始初始化文本处理工具...") try: print("[初始化] 尝试加载spaCy英文模型...") nlp = sp.load("en_core_web_sm") print("[初始化] spaCy模型加载成功") except Exception as e: print(f"[初始化错误] spaCy模型加载失败: {str(e)}") print("请先安装spaCy英文模型:python -m spacy download en_core_web_sm") exit(1) @Language.component("jieba_tokenizer") def jieba_tokenizer(doc): """中文文本分词组件(spaCy管道)""" try: text = doc.text has_chinese = any('\u4e00' <= char <= '\u9fff' for char in text) if has_chinese: words = jieba.lcut(text) spaces = [False] * len(words) return sp.tokens.Doc(doc.vocab, words=words, spaces=spaces) else: return doc except Exception as e: print(f"[分词错误] jieba_tokenizer出错: {str(e)}") traceback.print_exc() return doc try: nlp.add_pipe("jieba_tokenizer", before="parser") print("[初始化] jieba分词管道添加成功") except Exception as e: print(f"[初始化错误] 添加jieba分词管道失败: {str(e)}") traceback.print_exc() exit(1) # -------------------------- 2. 网站爬取工具 -------------------------- class ScrapTool: def visit_url(self, website_url): """ 爬取单个URL的核心方法 返回:包含URL、网站名、原始文本的字典(失败时文本为空) """ try: print(f"\n[爬取] 处理URL: {website_url}") # 补全URL前缀(若缺失http/https) if not website_url.startswith(("http://", "https://")): website_url = f"https://{website_url}" print(f"[爬取] 补全URL为: {website_url}") # 模拟浏览器请求头(降低反爬概率) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1' } # 发送请求(超时30秒,避免卡死) print(f"[爬取] 发送请求到: {website_url}") response = requests.get(website_url, headers=headers, timeout=30) response.raise_for_status() # 触发HTTP错误(如404、500) # 解析网页内容 print(f"[爬取] 成功获取响应,状态码: {response.status_code}") soup = BeautifulSoup(response.content, "lxml") # 整合网页关键文本(标题+Meta+标题标签+正文) print("[爬取] 提取网页内容...") website_text = ( self.get_html_title_tag(soup) + " " + self.get_html_meta_tags(soup) + " " + self.get_html_heading_tags(soup) + " " + self.get_text_content(soup) ).strip() # 返回成功结果 result = { "website_url": website_url, "website_name": self.get_website_name(website_url), "website_text": website_text } print(f"[爬取] 成功处理URL: {website_url}(文本长度:{len(website_text)}字符)") return result except Exception as e: # 捕获所有异常,返回失败结果(文本为空) error_msg = str(e)[:100] + "..." 
if len(str(e)) > 100 else str(e) print(f"[爬取错误] 处理 {website_url} 失败:{error_msg}") traceback.print_exc() # 打印详细错误堆栈(便于调试) return { "website_url": website_url, "website_name": self.get_website_name(website_url) if website_url else "", "website_text": "" # 错误时文本为空 } def get_website_name(self, website_url): """从URL中提取网站名(如:www.baidu.com → baidu)""" try: parsed = urlparse(website_url) netloc = parsed.netloc # 获取域名(如:www.baidu.com) name_parts = netloc.split(".") # 处理二级/三级域名(如:blog.youkuaiyun.com → csdn,www.taobao.com → taobao) return name_parts[-2] if len(name_parts) >= 2 else netloc except Exception as e: print(f"[提取网站名错误] {str(e)}") return "" def get_html_title_tag(self, soup): """提取网页<title>标签内容""" try: if soup.title and soup.title.contents: title = '. '.join([str(c).strip() for c in soup.title.contents]) return title[:200] # 限制长度,避免异常长标题 return "" except Exception as e: print(f"[提取标题错误] {str(e)}") return "" def get_html_meta_tags(self, soup): """提取网页Meta标签(keywords/description)""" try: meta_tags = soup.find_all( lambda tag: tag.name == "meta" and tag.has_attr('name') and tag.has_attr('content') ) meta_content = [str(tag["content"]).strip() for tag in meta_tags if tag["name"].lower() in ['keywords', 'description']] return ' '.join(meta_content)[:500] # 限制长度 except Exception as e: print(f"[提取Meta错误] {str(e)}") return "" def get_html_heading_tags(self, soup): """提取网页H1-H6标题标签""" try: heading_tags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]) heading_content = [" ".join(tag.stripped_strings) for tag in heading_tags] return ' '.join(heading_content)[:300] # 限制长度 except Exception as e: print(f"[提取标题标签错误] {str(e)}") return "" def get_text_content(self, soup): """提取网页正文文本(过滤无用标签)""" try: # 忽略的无用标签(避免爬取样式、脚本等内容) ignore_tags = ['style', 'script', 'head', 'title', 'meta', '[document]', "noscript", "iframe", "nav", "footer", "aside"] text_tags = soup.find_all(string=True) valid_text = [] for tag in text_tags: stripped_text = tag.strip() # 过滤条件:非忽略标签、非注释、非纯数字、非空文本 if (tag.parent.name not in ignore_tags and not isinstance(tag, element.Comment) and not stripped_text.isnumeric() and len(stripped_text) > 1): # 过滤单个字符(如标点、空格) valid_text.append(stripped_text) return ' '.join(valid_text)[:2000] # 限制长度,避免异常长文本 except Exception as e: print(f"[提取正文错误] {str(e)}") return "" # -------------------------- 3. 文本清理函数 -------------------------- def clean_text(raw_text): """ 清理原始网页文本: 1. 过滤停用词、标点、纯数字 2. 英文词形还原(如:running → run) 3. 统一小写 """ try: if not raw_text or raw_text.strip() == "": return "" # 空文本直接返回 # 使用spaCy处理文本 doc = nlp(raw_text) tokens = [] exclusion_list = ["nan", "none", "null"] # 额外排除的无效文本 for token in doc: # 过滤条件:非停用词、非标点、非纯数字、非特殊字符、不在排除列表 if (not token.is_stop and not token.is_punct and not token.text.isnumeric() and (token.text.isalnum() or '\u4e00' <= token.text <= '\u9fff') and token.text.lower() not in exclusion_list): # 中文直接取文本,英文取词形还原 cleaned_token = token.text.lower().strip() if '\u4e00' <= token.text <= '\u9fff' else str( token.lemma_).lower().strip() tokens.append(cleaned_token) return " ".join(tokens) # 拼接清理后的词 except Exception as e: print(f"[文本清理错误] {str(e)}") traceback.print_exc() return "" # -------------------------- 4. 
核心爬取流程(实时保存+错误跳过) -------------------------- def main(): # 配置文件路径 input_csv_path = "data_cn_en.csv" output_csv_path = "data_with_cleaned_text.csv" # 检查输入文件是否存在 if not os.path.exists(input_csv_path): print(f"[错误] 输入文件 {input_csv_path} 不存在!") exit(1) # 读取原始CSV(假设格式:id, url, category) print("\n[主流程] 读取原始数据集...") try: df = pd.read_csv( input_csv_path, header=None, names=["id", "url", "category"], encoding="utf-8" ) print(f"[主流程] 读取完成,共 {len(df)} 条URL记录") except Exception as e: print(f"[读取CSV错误] {str(e)}") exit(1) # 初始化爬虫工具 scrap_tool = ScrapTool() print("[主流程] 爬虫工具初始化完成,开始爬取...") # 初始化进度条 pbar = tqdm(total=len(df), desc="整体爬取进度") # 遍历每条URL(逐个处理+实时保存) for idx, row in df.iterrows(): try: # 获取当前行数据 record_id = row["id"] url = row["url"] category = row["category"] print(f"\n[主流程] 处理第 {idx + 1}/{len(df)} 条记录(ID:{record_id})") # 1. 爬取URL内容 crawl_result = scrap_tool.visit_url(url) # 2. 清理文本 cleaned_text = clean_text(crawl_result["website_text"]) print(f"[主流程] 文本清理完成(清理后长度:{len(cleaned_text)}字符)") # 3. 构造最终记录 final_record = pd.DataFrame({ "id": [record_id], "url": [url], # 保留原始URL(未补全前) "website_name": [crawl_result["website_name"]], "website_text": [crawl_result["website_text"]], "cleaned_website_text": [cleaned_text], "category": [category], "status": ["成功" if cleaned_text else "失败"] # 标记爬取状态 }) # 4. 实时追加保存到CSV(不存在则创建,存在则追加) if not os.path.exists(output_csv_path): # 首次保存:写入表头 final_record.to_csv(output_csv_path, index=False, encoding="utf-8", mode="w") else: # 后续保存:追加(不写表头) final_record.to_csv(output_csv_path, index=False, encoding="utf-8", mode="a", header=False) print(f"[主流程] 第 {idx + 1} 条记录已保存到 {output_csv_path}") except Exception as e: # 捕获单条记录的所有异常,直接跳过(不中断循环) print(f"[主流程警告] 第 {idx + 1} 条记录处理失败,已跳过:{str(e)}") traceback.print_exc() # 更新进度条 pbar.update(1) # 爬取完成,打印统计信息 pbar.close() print("\n" + "=" * 50) print("[主流程] 所有URL处理完成!") # 读取输出文件,统计成功/失败数 if os.path.exists(output_csv_path): output_df = pd.read_csv(output_csv_path, encoding="utf-8") total = len(output_df) success = len(output_df[output_df["cleaned_website_text"] != ""]) fail = total - success print(f"\n[统计信息]") print(f" - 总处理数:{total}") print(f" - 成功数(有效文本):{success}") print(f" - 失败数(无有效文本):{fail}") print(f" - 成功率:{success / total * 100:.1f}%" if total > 0 else " - 成功率:0%") print(f"\n输出文件路径:{os.path.abspath(output_csv_path)}") else: print(f"[警告] 未生成输出文件 {output_csv_path}(可能所有记录处理失败)") if __name__ == "__main__": try: print("=" * 50) print("程序开始执行(优化版:实时保存+错误跳过)") print("=" * 50) # 检查必要依赖(提前捕获缺失依赖) required_libs = ["pandas", "requests", "bs4", "spacy", "jieba", "tqdm"] for lib in required_libs: __import__(lib) print("[依赖检查] 所有必要依赖均已安装") # 执行主流程 main() except ImportError as e: # 处理缺失依赖的错误 missing_lib = str(e).split("'")[1] print(f"\n[依赖错误] 缺少必要库:{missing_lib}") print("请执行以下命令安装所有依赖:") print("pip install pandas requests beautifulsoup4 spacy jieba tqdm") if "spacy" in missing_lib or "en_core_web_sm" in str(e): print("并安装spaCy英文模型:python -m spacy download en_core_web_sm") except Exception as e: # 处理程序启动时的致命错误 print(f"\n[程序致命错误] 启动失败:{str(e)}") traceback.print_exc() 这段代码清理爬取的文本时忘记给中文设置停用词了,请写一个程序重新处理一下data_with_cleaned_text.csv文件中的cleaned_website_text
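A minimal sketch of such a re-processing pass is below. It assumes the data_with_cleaned_text.csv produced by the script above, a local Chinese stop-word list saved one word per line as `cn_stopwords.txt` (a hypothetical file name; use whichever list you prefer), and an assumed output name `data_with_cleaned_text_v2.csv`. Since the jieba pipeline already space-separates Chinese tokens in `cleaned_website_text`, the filter can work token by token without re-segmenting.

```python
# -*- coding: utf-8 -*-
"""Re-clean cleaned_website_text by removing Chinese stop words.

A sketch under these assumptions:
- data_with_cleaned_text.csv is the file written by the scraping script above
- cn_stopwords.txt is a local stop-word list, one word per line (hypothetical name)
- data_with_cleaned_text_v2.csv is the assumed output file name
"""
import os
import pandas as pd

INPUT_CSV = "data_with_cleaned_text.csv"
OUTPUT_CSV = "data_with_cleaned_text_v2.csv"   # assumed output name
STOPWORDS_FILE = "cn_stopwords.txt"            # assumed stop-word list


def load_chinese_stopwords(path):
    """Load a one-word-per-line stop-word file into a set."""
    with open(path, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}


def contains_chinese(token):
    """True if the token contains at least one CJK character."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in token)


def remove_chinese_stopwords(text, stopwords):
    """Drop Chinese stop words from an already tokenized, space-joined string."""
    if not isinstance(text, str) or not text.strip():
        return ""
    kept = []
    for tok in text.split():
        # Only Chinese tokens are checked against the Chinese stop-word list;
        # English tokens were already filtered by spaCy's is_stop upstream.
        if contains_chinese(tok) and tok in stopwords:
            continue
        kept.append(tok)
    return " ".join(kept)


def main():
    if not os.path.exists(INPUT_CSV):
        raise SystemExit(f"{INPUT_CSV} not found")
    stopwords = load_chinese_stopwords(STOPWORDS_FILE)
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")
    df["cleaned_website_text"] = df["cleaned_website_text"].apply(
        lambda t: remove_chinese_stopwords(t, stopwords)
    )
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"Re-cleaned {len(df)} rows -> {OUTPUT_CSV}")


if __name__ == "__main__":
    main()
```

Writing to a separate file rather than overwriting data_with_cleaned_text.csv keeps the original column available for comparison; point `OUTPUT_CSV` back at the input path if an in-place rewrite is preferred.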