# -*- coding: utf-8 -*-
"""
人民网科技频道新闻分类系统(最终稳定版)
核心修复:自动检测单类别/小众类别数据,确保模型可训练
功能:爬虫采集 → 数据清洗(类别过滤)→ 数据探索 → 文本预处理 → SVM分类 → 模型评价
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import jieba
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
roc_curve, auc, f1_score
)
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from wordcloud import WordCloud
import time
from typing import List, Dict, Optional
import warnings
warnings.filterwarnings('ignore')
# ====================== Global configuration ======================
# Crawler settings
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Referer': 'http://scitech.people.com.cn/',
'Connection': 'keep-alive'
}
BASE_URL = "http://scitech.people.com.cn"
RETRY_TIMES = 3  # number of retries per request
DELAY_RANGE = (1.5, 3.0)  # random delay range (seconds) to reduce anti-crawling pressure
# Text-processing settings
STOPWORDS = set([
'的', '了', '是', '在', '有', '就', '不', '和', '也', '都', '这', '那',
'我', '你', '他', '我们', '你们', '他们', '来', '去', '上', '下', '里', '外',
'一个', '一些', '关于', '对于', '随着', '通过', '根据', '显示', '表明', '指出'
])
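# The list above is deliberately small. A fuller general-purpose Chinese stopword list can be
# merged in from a local file if one is available; the file name below is only an illustrative
# assumption, not a file shipped with this script:
# try:
#     with open('chinese_stopwords.txt', encoding='utf-8') as f:
#         STOPWORDS |= {line.strip() for line in f if line.strip()}
# except FileNotFoundError:
#     pass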
MAX_FEATURES = 5000  # maximum number of TF-IDF features
TOP_N_FEATURES = 20  # number of top features to visualize
# Model settings
TEST_SIZE = 0.25
RANDOM_STATE = 42
CV_FOLDS = 5  # number of cross-validation folds
MIN_CATEGORY_COUNT = 5  # minimum samples per category (smaller categories are filtered out)
# Font for plot text (SimHei, the default Chinese font on Windows)
FONT_PATH = 'C:/Windows/Fonts/simhei.ttf'
# ====================== Utility functions ======================
def retry_request(url: str, headers: dict, timeout: int = 15) -> Optional[requests.Response]:
    """HTTP GET with a retry mechanism."""
for i in range(RETRY_TIMES):
try:
response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # raise on HTTP error status codes
return response
except Exception as e:
print(f"请求失败(第{i+1}次重试): {e}")
time.sleep(np.random.uniform(*DELAY_RANGE))
return None
def complete_url(relative_url: str) -> str:
"""补全相对URL为绝对URL"""
if relative_url.startswith('http'):
return relative_url
return f"{BASE_URL}{relative_url.lstrip('/')}"
def filter_single_category(df: pd.DataFrame) -> pd.DataFrame:
"""
过滤单类别/小众类别数据(核心修复函数)
:param df: 原始数据DataFrame
:return: 至少包含2个有效类别的DataFrame
"""
print("\n" + "="*50)
print("开始类别数据过滤...")
    # Count the number of samples in each category
category_counts = df['category'].value_counts()
print(f"原始类别分布:\n{category_counts}")
    # Keep only categories with at least MIN_CATEGORY_COUNT samples
valid_categories = category_counts[category_counts >= MIN_CATEGORY_COUNT].index
df_filtered = df[df['category'].isin(valid_categories)].reset_index(drop=True)
    # Check that at least two categories remain after filtering
remaining_categories = df_filtered['category'].nunique()
if remaining_categories < 2:
        # Fewer than two categories: lower the threshold to 3 and try again
print(f"有效类别数不足2个(当前{remaining_categories}个),降低筛选阈值...")
valid_categories = category_counts[category_counts >= 3].index
df_filtered = df[df['category'].isin(valid_categories)].reset_index(drop=True)
remaining_categories = df_filtered['category'].nunique()
        # If there are still fewer than two categories, fall back to manual splitting (emergency plan)
if remaining_categories < 2:
print("启动应急方案:根据标题关键词手动拆分类别...")
df_filtered = manual_split_category(df_filtered)
remaining_categories = df_filtered['category'].nunique()
print(f"过滤后类别分布:\n{df_filtered['category'].value_counts()}")
print(f"过滤后数据量:{len(df_filtered)}条,有效类别数:{remaining_categories}个")
    # Final check: raise a clear error if fewer than two categories remain
if remaining_categories < 2:
raise ValueError(f"数据类别不足!当前仅{remaining_categories}个类别,无法进行分类训练。请增加爬取页数或检查栏目提取逻辑。")
return df_filtered
def manual_split_category(df: pd.DataFrame) -> pd.DataFrame:
"""
应急方案:根据标题关键词手动拆分类别(避免单类别)
可根据实际新闻内容调整关键词
"""
def get_category_from_title(title: str) -> str:
        # Keyword-to-category mapping (extendable)
category_map = {
'人工智能': ['AI', '人工智能', '大模型', '机器学习', '深度学习', '机器人'],
'航天科技': ['航天', '太空', '卫星', '火箭', '空间站', '探月', '火星'],
'电子科技': ['芯片', '半导体', '5G', '通信', '手机', '电脑', '处理器'],
'生物医疗': ['生物', '医疗', '疫苗', '基因', '药物', '医院', '健康'],
'新能源': ['新能源', '电池', '光伏', '风电', '电动车', '充电'],
'互联网': ['互联网', 'APP', '软件', '平台', '直播', '电商']
}
for cat, keywords in category_map.items():
if any(keyword in title for keyword in keywords):
return cat
return '综合科技'
    # Apply the title-based classification logic
df['category'] = df['title'].apply(get_category_from_title)
return df
# ====================== 1. Data crawling (enhanced) ======================
def crawl_news_detail(link: str) -> str:
    """Fetch the body text of a single news article."""
response = retry_request(link, HEADERS)
if not response:
return ""
soup = BeautifulSoup(response.content, 'html.parser')
    # Handle the several article-body layouts used on people.com.cn
content_tags = soup.select('.rm_txt_con, .article-content, #rwb_article, .content')
if content_tags:
content = ' '.join([tag.text.strip() for tag in content_tags])
        return re.sub(r'\s+', ' ', content)[:500]  # keep only the first 500 characters
return ""
def crawl_scitech_news(pages: int = 15) -> pd.DataFrame:
"""
爬取人民网科技频道新闻数据(增加爬取页数,确保多类别)
:param pages: 爬取页数(默认15页,确保覆盖足够栏目)
:return: 包含标题、正文、时间、分类的DataFrame
"""
news_list = []
print(f"开始爬取人民网科技频道({pages}页)...")
for page in range(1, pages + 1):
        # Build the URL for this page
if page == 1:
url = f"{BASE_URL}/index.html"
else:
url = f"{BASE_URL}/index{page}.html"
response = retry_request(url, HEADERS)
if not response:
continue
soup = BeautifulSoup(response.content, 'html.parser')
articles = soup.select('.ej_list_box li')
if not articles:
print(f"第{page}页未找到新闻条目")
continue
for idx, article in enumerate(articles, 1):
try:
                # Extract the core fields
title_tag = article.select_one('a')
if not title_tag:
continue
title = title_tag.text.strip()
link = complete_url(title_tag['href'])
content_summary = article.select_one('.ej_content').text.strip() if article.select_one('.ej_content') else ""
publish_time = article.select_one('.ej_time').text.strip() if article.select_one('.ej_time') else ""
                # Column extraction with fallback selectors
category_tag = article.select_one('.ej_key a, .category, .column, .tags a')
if category_tag:
category = category_tag.text.strip()
else:
category = "综合科技" # 默认类别
                # Fetch the article body
full_content = crawl_news_detail(link)
                # Combine title, summary and body as the feature text
combined_content = f"{title} {content_summary} {full_content}"
news_list.append({
'title': title,
'content_summary': content_summary,
'full_content': full_content,
'combined_content': combined_content,
'publish_time': publish_time,
'category': category,
'link': link
})
                # Pause briefly after every 5 articles (anti-crawling courtesy)
if idx % 5 == 0:
time.sleep(np.random.uniform(*DELAY_RANGE) / 2)
except Exception as e:
print(f"第{page}页第{idx}条解析失败: {str(e)[:50]}")
print(f"第{page}页爬取完成,累计{len(news_list)}条")
time.sleep(np.random.uniform(*DELAY_RANGE))
df = pd.DataFrame(news_list)
    # Deduplicate by title
df = df.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)
print(f"爬取完成,去重后共{len(df)}条新闻")
return df
# ====================== 2. Exploratory data analysis (optimized) ======================
def analyze_data_distribution(df: pd.DataFrame):
    """Plot the category distribution of the filtered data."""
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(12, 6))
ax = sns.countplot(x='category', data=df, palette='viridis')
plt.title('科技频道新闻栏目分布(有效类别)', fontsize=14, pad=20)
plt.xlabel('栏目类别', fontsize=12)
plt.ylabel('新闻数量', fontsize=12)
plt.xticks(rotation=45, ha='right')
    # Add count labels on top of the bars
for p in ax.patches:
ax.annotate(f'{p.get_height()}',
(p.get_x() + p.get_width()/2, p.get_height() + 0.5),
ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.savefig('scitech_category_distribution.png', dpi=300, bbox_inches='tight')
plt.close()
def analyze_text_similarity(df: pd.DataFrame, sample_size: int = 50):
"""文本相似度分析"""
if len(df) < sample_size:
sample_size = len(df)
texts = df['combined_content'].fillna("").tolist()[:sample_size]
vectorizer = TfidfVectorizer(max_features=500, stop_words=list(STOPWORDS))
vectors = vectorizer.fit_transform(texts)
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(vectors)
plt.figure(figsize=(12, 10))
sns.heatmap(sim_matrix, cmap='coolwarm', vmin=0, vmax=1, annot=False)
plt.title(f'新闻内容相似度矩阵(前{sample_size}条)', fontsize=14, pad=20)
plt.xlabel('新闻序号', fontsize=12)
plt.ylabel('新闻序号', fontsize=12)
plt.tight_layout()
plt.savefig('scitech_content_similarity.png', dpi=300, bbox_inches='tight')
plt.close()
def generate_category_wordcloud(df: pd.DataFrame):
"""生成各栏目词云(修复字体问题)"""
for category in df['category'].unique():
        # Concatenate all texts belonging to this category
texts = df[df['category'] == category]['combined_content'].fillna("").tolist()
full_text = ' '.join(texts)
        # Tokenize and drop stopwords
words = jieba.lcut(full_text)
filtered_words = [word for word in words if word not in STOPWORDS and len(word) > 1]
word_text = ' '.join(filtered_words)
if not word_text:
print(f"{category}栏目无有效文本,跳过词云生成")
continue
        # Build the word cloud
wordcloud = WordCloud(
font_path=FONT_PATH,
width=800, height=600,
background_color='white',
max_words=100,
collocations=False,
contour_width=3
).generate(word_text)
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(f'{category}栏目词云', fontsize=14, pad=20)
plt.tight_layout()
plt.savefig(f'scitech_wordcloud_{category}.png', dpi=300, bbox_inches='tight')
plt.close()
def explore_data(df: pd.DataFrame):
"""数据探索主函数"""
print("\n" + "="*50)
print("开始数据探索分析...")
    # Print basic information
print(f"\n数据基本信息:")
print(f"总新闻数:{len(df)}")
print(f"有效栏目数:{df['category'].nunique()}")
print(f"栏目分布:\n{df['category'].value_counts()}")
    # Run the individual analyses
analyze_data_distribution(df)
analyze_text_similarity(df)
generate_category_wordcloud(df)
print("数据探索完成,可视化文件已保存")
# ====================== 3. Text preprocessing (enhanced) ======================
def clean_text(text: Optional[str]) -> str:
    """Clean a raw text string."""
if not isinstance(text, str) or pd.isna(text):
return ""
    # Keep Chinese characters, whitespace and common Chinese punctuation (including “”‘’ quotes)
    text = re.sub(r'[^\u4e00-\u9fa5\s,。!?;:“”‘’()【】《》、]', '', text)
    # Collapse consecutive spaces and line breaks
text = re.sub(r'\s+', ' ', text)
return text.strip()
def chinese_tokenizer(text: str) -> str:
"""中文分词(结合停用词过滤)"""
words = jieba.lcut(text)
filtered_words = [word for word in words if word not in STOPWORDS and len(word) > 1]
return " ".join(filtered_words)
def preprocess_text(df: pd.DataFrame) -> tuple:
"""文本预处理主函数"""
print("\n" + "="*50)
print("开始文本预处理...")
    # 1. Clean the text
    df['cleaned_content'] = df['combined_content'].apply(clean_text)
    # 2. Drop (near-)empty texts
    df = df[df['cleaned_content'].str.len() > 5].reset_index(drop=True)
    print(f"过滤空文本后剩余:{len(df)}条")
    # 3. Word segmentation
    df['tokenized_content'] = df['cleaned_content'].apply(chinese_tokenizer)
    # 4. TF-IDF vectorization
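    # ngram_range=(1, 2) adds bigrams of the segmented words; min_df=3 drops terms seen in fewer
    # than 3 documents; the custom token_pattern also keeps single-character tokens that the
    # default pattern would discard.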
vectorizer = TfidfVectorizer(
max_features=MAX_FEATURES,
ngram_range=(1, 2),
token_pattern=r'\b\w+\b',
min_df=3,
stop_words=list(STOPWORDS)
)
X = vectorizer.fit_transform(df['tokenized_content'])
print(f"TF-IDF特征矩阵维度:{X.shape}")
    # 5. Univariate feature selection (ANOVA F-test)
selector = SelectKBest(f_classif, k=min(3000, X.shape[1]))
X_selected = selector.fit_transform(X, df['category'])
print(f"特征选择后维度:{X_selected.shape}")
    # 6. Encode category labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])
print(f"标签编码映射:{dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))}")
return X_selected, y, label_encoder, vectorizer, selector, df
# ====================== 4. Model building and tuning ======================
def train_optimized_svm(X_train: np.ndarray, y_train: np.ndarray) -> tuple:
    """Train an SVM with grid-search hyperparameter tuning (multi-class ready)."""
print("\n" + "="*50)
print("开始模型训练与超参数优化...")
print(f"训练集类别分布:{pd.Series(y_train).value_counts().to_dict()}")
    # Compact parameter grid (keeps training fast and avoids unnecessary fitting)
param_grid = {
'C': [1, 10, 100],
'kernel': ['linear', 'rbf'],
'gamma': ['scale', 'auto'],
        'class_weight': [None, 'balanced']  # handle class imbalance
}
    # Grid search with cross-validation
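    # probability=True is needed so that predict_proba is available later in evaluate_model;
    # weighted F1 is used as the scoring metric because the categories are typically imbalanced.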
grid_search = GridSearchCV(
estimator=SVC(probability=True, random_state=RANDOM_STATE),
param_grid=param_grid,
cv=CV_FOLDS,
        scoring='f1_weighted',  # suitable metric for multi-class problems
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
    # Report the best parameters
print(f"最佳参数组合:{grid_search.best_params_}")
print(f"交叉验证最佳F1分数:{grid_search.best_score_:.4f}")
return grid_search.best_estimator_, grid_search
# ====================== 5. Model evaluation (refined) ======================
def evaluate_model(model, X_test: np.ndarray, y_test: np.ndarray, label_encoder: LabelEncoder):
    """Evaluate the model on the held-out test set (multi-class aware)."""
print("\n" + "="*50)
print("模型评价结果:")
    # Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)
    # 1. Basic metrics
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print(f"准确率:{accuracy:.4f}")
print(f"加权F1分数:{weighted_f1:.4f}")
print("\n详细分类报告:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0))
    # 2. Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(
cm, annot=True, fmt='d', cmap='Blues',
xticklabels=label_encoder.classes_,
yticklabels=label_encoder.classes_,
annot_kws={'size': 10}
)
plt.title('混淆矩阵', fontsize=14, pad=20)
plt.xlabel('预测标签', fontsize=12)
plt.ylabel('真实标签', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('scitech_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.close()
    # 3. Cross-validation score distribution
    # (Note: for simplicity these folds are drawn from the test split only; computing them on the
    # training split would be the more conventional choice.)
    cv_scores = cross_val_score(model, X_test, y_test, cv=CV_FOLDS, scoring='f1_weighted')
plt.figure(figsize=(8, 6))
sns.boxplot(x=cv_scores, palette='viridis')
plt.title(f'交叉验证F1分数分布({CV_FOLDS}折)', fontsize=14, pad=20)
plt.xlabel('F1分数', fontsize=12)
plt.axvline(x=cv_scores.mean(), color='red', linestyle='--', label=f'平均值: {cv_scores.mean():.4f}')
plt.legend()
plt.tight_layout()
plt.savefig('scitech_cv_scores.png', dpi=300, bbox_inches='tight')
plt.close()
print("模型评价完成,所有可视化文件已保存")
return {
'accuracy': accuracy,
'weighted_f1': weighted_f1,
'cv_scores': cv_scores.mean()
}
# ====================== 6. Feature-importance analysis (optimized) ======================
def plot_feature_importance(model, vectorizer, selector, top_n: int = TOP_N_FEATURES):
    """Visualize the feature weights of a linear-kernel SVM."""
print("\n" + "="*50)
print("生成特征重要性可视化...")
if not hasattr(model, 'coef_'):
print("当前核函数不支持特征重要性计算(建议使用linear核)")
return
    # Feature names, taking the feature-selection step into account
all_features = vectorizer.get_feature_names_out()
selected_mask = selector.get_support()
selected_features = all_features[selected_mask]
    # Coefficients: convert to a dense array first (SVC stores them sparse when fitted on sparse
    # input), then take each feature's largest absolute weight across the one-vs-one classifiers
    coef = model.coef_.toarray() if hasattr(model.coef_, 'toarray') else np.asarray(model.coef_)
    if coef.shape[0] > 1:
        coefs = np.max(np.abs(coef), axis=0)
    else:
        coefs = np.abs(coef[0])
if len(coefs) != len(selected_features):
print("特征系数与特征名称长度不匹配,跳过可视化")
return
    # Sort and keep the top N
sorted_idx = np.argsort(coefs)[::-1][:top_n]
top_features = [selected_features[i] for i in sorted_idx]
top_coefs = coefs[sorted_idx]
    # Plot
plt.figure(figsize=(12, 8))
bars = plt.barh(top_features, top_coefs, color='darkgreen', alpha=0.7)
plt.xlabel('特征权重(绝对值)', fontsize=12)
plt.title(f'TOP {top_n} 重要特征', fontsize=14, pad=20)
plt.gca().invert_yaxis()
    # Add value labels
for bar, coef in zip(bars, top_coefs):
plt.text(bar.get_width() + 0.01,
bar.get_y() + bar.get_height()/2,
f'{coef:.3f}',
ha='left', va='center', fontsize=10)
plt.tight_layout()
plt.savefig('scitech_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()
# ====================== 7. Improvement suggestions (concrete) ======================
def get_improvement_suggestions(evaluation_results: dict):
    """Print improvement suggestions based on the model's performance."""
print("\n" + "="*50)
print("模型改进建议:")
suggestions = [
(
"数据层面",
[
"扩展爬取范围:增加爬取页数(建议≥20页)和其他科技网站数据,提升样本多样性",
"完善正文爬取:优化正文解析规则,确保更多新闻能获取完整内容",
"数据平衡:对样本量较少的类别进行数据增强(如同义词替换)或合并相似栏目"
]
),
(
"特征工程",
[
"增加自定义特征:提取新闻长度、关键词密度、发布时间差等辅助特征",
"尝试词嵌入:使用Word2Vec/GloVe/BERT等预训练模型替换TF-IDF,提升语义理解",
"特征融合:结合标题特征和正文特征,使用加权融合策略"
]
),
(
"模型优化",
[
"尝试其他模型:对比随机森林、XGBoost、LightGBM等集成学习模型",
"深度学习:使用CNN/RNN/Transformer处理文本,适合大规模数据场景",
"超参数细化:扩大参数搜索范围,使用贝叶斯优化替代网格搜索"
]
),
(
"工程落地",
[
"构建实时爬取管道:使用定时任务(如Celery)定期更新训练数据",
"模型部署:封装为API服务(如FastAPI),支持实时新闻分类请求",
"监控迭代:建立模型性能监控机制,定期重新训练适应数据分布变化"
]
)
]
    # Prioritize the suggestions according to model performance
if evaluation_results['weighted_f1'] < 0.7:
print("\n⚠️ 当前模型性能一般,建议优先优化:")
print("- 扩展数据量和数据质量")
print("- 优化文本预处理和特征工程")
elif 0.7 <= evaluation_results['weighted_f1'] < 0.85:
print("\n✅ 当前模型性能良好,建议进一步优化:")
print("- 尝试深度学习模型或特征融合")
print("- 细化超参数搜索")
else:
print("\n🎉 当前模型性能优秀,建议关注:")
print("- 工程化落地和实时更新机制")
print("- 细粒度分类扩展")
    # Print the detailed suggestions
for category, tips in suggestions:
print(f"\n【{category}】")
for i, tip in enumerate(tips, 1):
print(f"{i}. {tip}")
# ====================== Main (pipeline orchestration) ======================
if __name__ == "__main__":
try:
        # 1. Crawl the data (default 15 pages to ensure multiple categories)
tech_df = crawl_scitech_news(pages=15)
tech_df.to_csv('scitech_news_dataset.csv', index=False, encoding='utf-8-sig')
print(f"\n数据已保存至:scitech_news_dataset.csv")
        # 2. Filter out single-category / rare-category data (core fix step)
        filtered_df = filter_single_category(tech_df)
        # 3. Data exploration
        explore_data(filtered_df)
        # 4. Text preprocessing
X, y, label_encoder, vectorizer, selector, processed_df = preprocess_text(filtered_df)
        # 5. Train/test split (stratified sampling to preserve class ratios)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
print(f"\n训练集规模:{X_train.shape[0]}条,测试集规模:{X_test.shape[0]}条")
print(f"训练集类别数:{len(np.unique(y_train))}个,测试集类别数:{len(np.unique(y_test))}个")
        # 6. Model training and tuning
        best_svm_model, grid_search = train_optimized_svm(X_train, y_train)
        # 7. Model evaluation
        eval_results = evaluate_model(best_svm_model, X_test, y_test, label_encoder)
        # 8. Feature-importance visualization
        plot_feature_importance(best_svm_model, vectorizer, selector)
        # 9. Improvement suggestions
        get_improvement_suggestions(eval_results)
        # 10. Save the trained model and preprocessing artifacts
import joblib
joblib.dump({
'model': best_svm_model,
'vectorizer': vectorizer,
'selector': selector,
'label_encoder': label_encoder,
'eval_results': eval_results
}, 'scitech_news_classifier.pkl')
print("\n模型已保存至:scitech_news_classifier.pkl")
print("\n" + "="*50)
print("科技频道新闻分类全流程完成!")
except Exception as e:
print(f"\n程序执行失败:{str(e)}")
print("建议检查:1. 爬取页数是否足够 2. 网络连接 3. 类别提取逻辑")