How to use get_headers and get_meta_tags

Muyangcheng (www.muyangcheng.com) is a local Guangzhou community site that offers consumer tips, local living information, and similar content, focused on the everyday life of Guangzhou residents. The site is built with phpwind 8.7, served by nginx, and runs on PHP/5.3.8.
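get_meta_tags() parses a page's <meta name="..."> tags into an associative array, while get_headers() returns the HTTP response headers; passing 1 as the second argument makes get_headers() key the result by header name instead of returning a plain numbered list: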
print_r(get_meta_tags('http://www.muyangcheng.com'));
print_r(get_headers('http://www.muyangcheng.com',1));
print_r(get_headers('http://www.muyangcheng.com'));

The output is as follows:

Array
(
    [generator] => phpwind 8.7
    [description] => 牧羊城(www.muyangcheng.com),广州论坛是一个集广州消费经验、广州生活信息、广州超市商场打折优惠信息、广州人脉关系为一体的城市社区、生活门户、广州论坛,牧羊城(广州论坛)专注于广州城市百姓生活,无论你是老广,还是新广人,这是你我共同的网上生活家园!
    [keywords] => 牧羊城网站,广州新闻,广州城市,广州优惠,牧羊城,牧羊人,muyangcheng,广州超市商场打折优惠,广州消费,广州生活信息,广州生活社区,广州生活门户,广州论坛生活,广州论坛分类信息,广州团购,广州论坛优惠信息,广州论坛,羊城美食,广州网,广州BBS,广州论坛网
    [msapplication-task] => name=我的设置; action-uri=profile.php; icon-uri=images/ico/edit.ico
)
Array
(
    [0] => HTTP/1.1 200 OK
    [Server] => nginx/1.0.8
    [Date] => Tue, 19 Jun 2012 03:11:25 GMT
    [Content-Type] => text/html
    [Connection] => close
    [Vary] => Accept-Encoding
    [X-Powered-By] => PHP/5.3.8
    [Set-Cookie] => Array
        (
            [0] => b9971_c_stamp=1340075484; expires=Wed, 19-Jun-2013 03:11:24 GMT; path=/
            [1] => b9971_lastvisit=0%091340075484%09%2Findex.php; expires=Wed, 19-Jun-2013 03:11:24 GMT; path=/
            [2] => b9971_lastpos=index; expires=Wed, 19-Jun-2013 03:11:24 GMT; path=/
            [3] => b9971_lastvisit=0%091340075484%09%2Findex.php; expires=Wed, 19-Jun-2013 03:11:24 GMT; path=/
        )
 
)
Array
(
    [0] => HTTP/1.1 200 OK
    [1] => Server: nginx/1.0.8
    [2] => Date: Tue, 19 Jun 2012 03:11:26 GMT
    [3] => Content-Type: text/html
    [4] => Connection: close
    [5] => Vary: Accept-Encoding
    [6] => X-Powered-By: PHP/5.3.8
    [7] => Set-Cookie: b9971_c_stamp=1340075486; expires=Wed, 19-Jun-2013 03:11:26 GMT; path=/
    [8] => Set-Cookie: b9971_lastvisit=0%091340075486%09%2Findex.php; expires=Wed, 19-Jun-2013 03:11:26 GMT; path=/
    [9] => Set-Cookie: b9971_lastpos=index; expires=Wed, 19-Jun-2013 03:11:26 GMT; path=/
    [10] => Set-Cookie: b9971_lastvisit=0%091340075486%09%2Findex.php; expires=Wed, 19-Jun-2013 03:11:26 GMT; path=/
)
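The associative form (second argument set to 1) is usually the most convenient to work with, because repeated headers such as Set-Cookie are collected into sub-arrays, as the second dump above shows. Below is a minimal sketch that builds on the calls above: it checks the status line and reads a couple of fields. Keys such as Server and description are only present when the remote site actually sends them, so they are guarded with isset().

$url = 'http://www.muyangcheng.com';

// Response headers as an associative array (second argument = 1)
$headers = get_headers($url, 1);
// <meta name="..."> tags as an associative array
$meta = get_meta_tags($url);

// $headers[0] holds the status line, e.g. "HTTP/1.1 200 OK"
if ($headers !== false && strpos($headers[0], '200') !== false) {
    echo 'Server: ' . (isset($headers['Server']) ? $headers['Server'] : 'unknown') . "\n";
}

if ($meta !== false && isset($meta['description'])) {
    echo 'Description: ' . $meta['description'] . "\n";
}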