from flask import Flask, render_template, request, send_file
from bs4 import BeautifulSoup
import bs4
from urllib.parse import urlparse
import requests
from collections import Counter
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
import psutil
from sklearn.calibration import CalibratedClassifierCV
import joblib
from sklearn.svm import LinearSVC
import en_core_web_sm
# Load the spaCy English NLP model
nlp = en_core_web_sm.load()
# Load the pre-trained classifier (a LinearSVC, possibly wrapped in CalibratedClassifierCV)
m1 = joblib.load('linear_svc_model.joblib')
# Initialise the TF-IDF vectorizer (refit on the training split per request; see submit())
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english'
)
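# A more typical deployment (sketch, not the original pipeline) would persist
# the vectorizer fitted at training time and load it here alongside the model,
# so the prediction-time feature space is guaranteed to match, e.g.:
#   tfidf = joblib.load('tfidf_vectorizer.joblib')  # hypothetical file name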
class ScrapTool:
    def visit_url(self, website_url):
        '''Fetch the URL, build a BeautifulSoup object, run the extraction
        helpers, and return the result as a pandas Series'''
        try:
            # Send browser-like headers to reduce the chance of being blocked
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
            # Request the page with the headers above and a generous timeout
content = requests.get(website_url, headers=headers, timeout=60).content
soup = BeautifulSoup(content, "lxml")
            result = {
                "website_url": website_url,
                "website_name": self.get_website_name(website_url),
                # Join with spaces so the last word of one fragment does not
                # fuse with the first word of the next
                "website_text": " ".join([
                    self.get_html_title_tag(soup),
                    self.get_html_meta_tags(soup),
                    self.get_html_heading_tags(soup),
                    self.get_text_content(soup),
                ])
            }
return pd.Series(result)
except Exception as e:
print(f"访问URL时出错: {str(e)}")
return None
    def get_website_name(self, website_url):
        '''Extract the site name from the URL, e.g. "www.google.com" -> "google".
        Assumes the host has at least two dot-separated labels.'''
        return urlparse(website_url).netloc.split(".")[-2]
def get_html_title_tag(self, soup):
        '''Return the text content of the page's <title> tag'''
if soup.title and soup.title.contents:
return '. '.join(soup.title.contents)
return ""
    def get_html_meta_tags(self, soup):
        '''Return the content of <meta> tags carrying keywords or a description'''
        tags = soup.find_all(
            lambda tag: tag.name == "meta" and
            tag.has_attr('name') and tag.has_attr('content')
        )
content = [str(tag["content"]) for tag in tags
if tag["name"] in ['keywords', 'description']]
return ' '.join(content)
def get_html_heading_tags(self, soup):
        '''Return the text of heading tags, assuming headings carry relatively important text'''
tags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
content = [" ".join(tag.stripped_strings) for tag in tags]
return ' '.join(content)
def get_text_content(self, soup):
        '''Return the visible text of the whole page, ignoring certain tags'''
tags_to_ignore = ['style', 'script', 'head', 'title', 'meta',
'[document]', "h1", "h2", "h3", "h4", "h5", "h6", "noscript"]
tags = soup.find_all(string=True)
result = []
for tag in tags:
stripped_tag = tag.strip()
if (tag.parent.name not in tags_to_ignore and
not isinstance(tag, bs4.element.Comment) and
not stripped_tag.isnumeric() and
len(stripped_tag) > 0):
result.append(stripped_tag)
return ' '.join(result)
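# Illustrative usage of ScrapTool (assumes network access):
#   series = ScrapTool().visit_url("https://www.example.com")
#   series["website_name"]  ->  "example"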
# Instantiate the scraper
scrapTool = ScrapTool()
# Initialise the Flask application
app = Flask(__name__)
def clean_text(doc):
    '''Clean a document: drop stop words, punctuation, numeric and
    non-alphanumeric tokens, then lemmatise and lowercase'''
    if not doc:  # guard against empty documents
        return ""
    doc = nlp(doc)
    tokens = []
    exclusion_list = ["nan"]
    for token in doc:
        if (token.is_stop or token.is_punct or
                token.text.isnumeric() or
                not token.text.isalnum() or
                token.text in exclusion_list):
            continue
        tokens.append(token.lemma_.lower().strip())
    return " ".join(tokens)
@app.route("/")
def hello_world():
return render_template('index.html')
@app.route('/submit', methods=['POST'])
def submit():
site = request.form['site']
print(f"处理网站: {site}")
    try:
        # Load the labelled training data shipped alongside the app
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(dir_path, 'data.csv')
        df = pd.read_csv(file_path, low_memory=True)
        df['category_id'] = df['Category'].factorize()[0]
        # Recreate the training split (the fixed random_state keeps it deterministic across requests)
        X_train, _ = train_test_split(
            df['cleaned_website_text'],
            test_size=0.20,
            random_state=0
        )
        # Fit the TF-IDF vectorizer on the training split; this must mirror the
        # vectorizer used when the saved model was trained, or the feature
        # columns will not line up
        tfidf_matrix = tfidf.fit_transform(X_train)
        # Scrape the website
        web_data = scrapTool.visit_url(site)
        if web_data is None:
            return render_template('error.html', message="Unable to scrape the website content")
        web = dict(web_data)
        # Show and clean the scraped text
        raw_text = web['website_text']
        print("\n===== Raw scraped text =====")
        print(raw_text[:500] + "...")  # first 500 characters only, to keep the output short
        cleaned_text = clean_text(raw_text)
        print("\n===== Cleaned text =====")
        print(cleaned_text[:500] + "...")  # first 500 characters only
        print("\n========================")
        # Classify the cleaned text
        t = tfidf.transform([cleaned_text])
        # NOTE: predict_proba columns follow m1.classes_; labelling them with
        # df['Category'].unique() assumes the model was trained on the
        # factorized category ids, whose order matches unique()
        data = pd.DataFrame(
            m1.predict_proba(t) * 100,
            columns=df['Category'].unique()
        ).T
        data.columns = ['Probability']
        data.index.name = 'Category'
        data = data.sort_values('Probability', ascending=False)
        print(data, type(data["Probability"]))
        # Identify the predicted category
        predicted_category = data.index[0]
        category_index = df[df['Category'] == predicted_category]['category_id'].iloc[0]
        # Recover the underlying LinearSVC for feature importances. On a fitted
        # CalibratedClassifierCV the fitted estimators live in
        # calibrated_classifiers_ ('estimator' in sklearn >= 1.2,
        # 'base_estimator' earlier); m1.base_estimator would be the unfitted
        # template and has no coef_.
        if hasattr(m1, 'calibrated_classifiers_'):
            inner = m1.calibrated_classifiers_[0]
            model = getattr(inner, 'estimator', None) or getattr(inner, 'base_estimator', None)
        else:
            model = m1
        # TF-IDF feature values for the current document
        feature_names = np.array(tfidf.get_feature_names_out())
        doc_tfidf = t.toarray()[0]
        # Feature importance = |class coefficient * TF-IDF value| for the predicted class
        if hasattr(model, 'coef_'):
            # For multiclass problems coef_ has shape (n_classes, n_features);
            # category_index is assumed to match the model's class ordering
            feature_importance = np.abs(model.coef_[category_index] * doc_tfidf)
            # Keep the 10 most important features
            top_n = 10
            top_indices = np.argsort(feature_importance)[-top_n:][::-1]
            top_features = feature_names[top_indices]
            top_importance = feature_importance[top_indices]
            # Print the top 10 features
            print(f"\n===== Top 10 features for predicted category {predicted_category} =====")
            for i, (feature, importance) in enumerate(zip(top_features, top_importance), 1):
                print(f"{i}. {feature}: {importance:.4f}")
            print("==============================================")
        else:
            print("\nThis model does not expose coefficients for feature-importance computation")
        # Report memory usage
        process = psutil.Process()
        memory_info = process.memory_info()
        print(f"Memory usage: {memory_info.rss / (1024 * 1024):.2f} MB")
        # Pass the cleaned text and feature importances to the template
        top_features_data = None
        if hasattr(model, 'coef_'):
            top_features_data = list(zip(top_features, top_importance))
return render_template(
'predict.html',
data=data,
original_url=site,
raw_text=raw_text[:1000],
cleaned_text=cleaned_text[:1000],
top_features=top_features_data,
predicted_category=predicted_category
)
except Exception as e:
print(f"处理过程出错: {str(e)}")
return render_template('error.html', message=str(e))
if __name__ == "__main__":
app.run(port=5000, debug=True)
# Outline of how features are extracted:
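# 1. Scraping: concatenate the <title> text, keyword/description <meta> tags,
#    headings, and visible body text into one string (ScrapTool.visit_url).
# 2. Cleaning: drop stop words, punctuation, and numeric or non-alphanumeric
#    tokens, then lemmatise and lowercase with spaCy (clean_text).
# 3. Vectorisation: TF-IDF with sublinear tf, min_df=5, 1-2 grams, and English
#    stop words, fitted on the training split of data.csv.
# 4. Importance: for the predicted class, |LinearSVC coefficient * document
#    TF-IDF value| per feature; the top 10 are reported.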