import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # set TensorFlow log level to suppress non-essential warnings
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
# 1. Load the datasets
fake_data = pd.read_csv('gossipcop_fake.csv')  # fake-news articles
real_data = pd.read_csv('gossipcop_real.csv')  # real-news articles
# 2. Add a label column ('label')
fake_data['label'] = 0  # fake news -> 0
real_data['label'] = 1  # real news -> 1
# 3. Concatenate the two datasets
data = pd.concat([fake_data, real_data], ignore_index=True)  # merge and reset the index
# 4. Data cleaning: lowercase the text
def clean_data(text):
    text = str(text).lower()
    return text
data['cleaned_text'] = data['title'].apply(clean_data)  # clean the 'title' column
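# A possible extension of the cleaning step (not applied in this script): also
# strip punctuation before tokenizing, e.g.
# import re
# data['cleaned_text'] = data['cleaned_text'].apply(lambda t: re.sub(r'[^a-z0-9\s]', ' ', t))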
# 5. Tokenize the news titles (simple whitespace split)
data['token_text'] = data['cleaned_text'].apply(lambda x: x.split())
# 6. Feature extraction with Word2Vec
w2v_model = Word2Vec(sentences=data['token_text'], vector_size=100, window=5, min_count=1, workers=4)
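# Optional sanity check (a quick sketch, safe to delete): inspect the nearest
# neighbors of the most frequent token to confirm the embeddings look sensible.
# print(w2v_model.wv.most_similar(w2v_model.wv.index_to_key[0], topn=5))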
# 7. Sentence vectors (mean of word vectors), used by KNN and XGBoost
def get_sentence_vector(sentence, model, vector_size=100):
    # average the vectors of all in-vocabulary words in the sentence
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        # return a zero vector if no word is in the vocabulary
        return np.zeros(vector_size)
# Apply get_sentence_vector to every tokenized title
data['sentence_vector'] = data['token_text'].apply(lambda x: get_sentence_vector(x, w2v_model))
# Assemble features and labels
X_vectors = np.vstack(data['sentence_vector'])  # stack sentence vectors into a matrix
y = data['label'].values  # labels
# 8. Prepare sequence data for the LSTM
# Map each word to an integer index
tokenizer = w2v_model.wv  # gensim KeyedVectors, used here as the vocabulary
# Build the vocabulary
vocab_size = len(tokenizer)
word_index = {word: index + 1 for index, word in enumerate(tokenizer.index_to_key)}  # indices start at 1; 0 is reserved for padding
# Convert token lists to index sequences
def text_to_sequence(tokens, word_index):
    return [word_index[word] for word in tokens if word in word_index]
data['text_seq'] = data['token_text'].apply(lambda x: text_to_sequence(x, word_index))
# Pad the sequences to a common length
max_seq_length = max(len(seq) for seq in data['text_seq'])
X_sequences = pad_sequences(data['text_seq'], maxlen=max_seq_length, padding='post')
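# Note: max_seq_length above is the longest title in the corpus; for long-tailed
# length distributions, a capped length (e.g. the 95th-percentile length) would
# save memory:
# max_seq_length = int(np.percentile([len(s) for s in data['text_seq']], 95))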
# Build the embedding matrix used to initialize the Embedding layer
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))  # +1 because indices start at 1; row 0 stays zero for padding
for word, i in word_index.items():
    # every word in word_index comes from the Word2Vec vocabulary, so the lookup cannot fail
    embedding_matrix[i] = tokenizer[word]
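# Optional sanity check (a minimal sketch, assuming a non-empty vocabulary):
# the embedding-matrix row of a sampled word should equal its Word2Vec vector.
sample_word = tokenizer.index_to_key[0]
assert np.allclose(embedding_matrix[word_index[sample_word]], tokenizer[sample_word])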
# 9. Split the data
X_train_vec, X_test_vec, y_train, y_test = train_test_split(
    X_vectors, y, test_size=0.2, random_state=42)  # for KNN and XGBoost
X_train_seq, X_test_seq, _, _ = train_test_split(
    X_sequences, y, test_size=0.2, random_state=42)  # for the LSTM; the same random_state keeps rows aligned with the vector split
# 10. Train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_vec, y_train)
# 11. Train the XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    verbosity=1
)
xgb_model.fit(X_train_vec, y_train)
# 12. Build and train the LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim,
                         weights=[embedding_matrix], input_length=max_seq_length, trainable=False))
lstm_model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the LSTM
lstm_model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_split=0.1, verbose=1)
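# A possible refinement (a sketch, not used in the run above): train with an
# EarlyStopping callback instead of a fixed 5 epochs, stopping once validation
# loss stalls and restoring the best weights.
# from tensorflow.keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# lstm_model.fit(X_train_seq, y_train, epochs=20, batch_size=64,
#                validation_split=0.1, callbacks=[early_stop], verbose=1)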
# 13. Evaluate each model on the test set
# KNN predictions
knn_test_preds = knn_model.predict(X_test_vec)
print("KNN model")
print("Accuracy:", accuracy_score(y_test, knn_test_preds))
# print(classification_report(y_test, knn_test_preds))
# XGBoost predictions
xgb_test_preds = xgb_model.predict(X_test_vec)
print("XGBoost model")
print("Accuracy:", accuracy_score(y_test, xgb_test_preds))
# print(classification_report(y_test, xgb_test_preds))
# LSTM predictions
lstm_test_probs = lstm_model.predict(X_test_seq).reshape(-1, 1)
lstm_test_preds = (lstm_test_probs >= 0.5).astype(int).reshape(-1)
print("LSTM model")
print("Accuracy:", accuracy_score(y_test, lstm_test_preds))
# print(classification_report(y_test, lstm_test_preds))
# 14. Prepare data for stacking (avoiding data leakage)
# Split the training set into a new training set and a validation set
X_train_meta_vec, X_valid_meta_vec, y_train_meta, y_valid_meta = train_test_split(
    X_train_vec, y_train, test_size=0.2, random_state=42)
X_train_meta_seq, X_valid_meta_seq, _, _ = train_test_split(
    X_train_seq, y_train, test_size=0.2, random_state=42)
# Retrain the base models on the new training set
# Retrain KNN
knn_model_meta = KNeighborsClassifier(n_neighbors=5)
knn_model_meta.fit(X_train_meta_vec, y_train_meta)
# Retrain XGBoost
xgb_model_meta = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    verbosity=1
)
xgb_model_meta.fit(X_train_meta_vec, y_train_meta)
# Retrain the LSTM
lstm_model_meta = Sequential()
lstm_model_meta.add(Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim,
                              weights=[embedding_matrix], input_length=max_seq_length, trainable=False))
lstm_model_meta.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_meta.add(Dense(1, activation='sigmoid'))
lstm_model_meta.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model_meta.fit(X_train_meta_seq, y_train_meta, epochs=5, batch_size=64, validation_split=0.1, verbose=1)
# 15. Collect base-model predictions on the validation set as meta-model training features
knn_valid_probs = knn_model_meta.predict_proba(X_valid_meta_vec)[:, 1].reshape(-1, 1)
xgb_valid_probs = xgb_model_meta.predict_proba(X_valid_meta_vec)[:, 1].reshape(-1, 1)
lstm_valid_probs = lstm_model_meta.predict(X_valid_meta_seq).reshape(-1, 1)
# Assemble the meta-model training data
meta_train_features = np.hstack((knn_valid_probs, xgb_valid_probs, lstm_valid_probs))
# 16. Train the meta-model (other estimators can be swapped in)
meta_model = LogisticRegression()
# meta_model = SVC(probability=True)
meta_model.fit(meta_train_features, y_valid_meta)
# 17. Predict on the test set
# Base-model predictions on the test set
knn_test_probs = knn_model_meta.predict_proba(X_test_vec)[:, 1].reshape(-1, 1)
xgb_test_probs = xgb_model_meta.predict_proba(X_test_vec)[:, 1].reshape(-1, 1)
lstm_test_probs = lstm_model_meta.predict(X_test_seq).reshape(-1, 1)
# Assemble the meta-model test data
meta_test_features = np.hstack((knn_test_probs, xgb_test_probs, lstm_test_probs))
# Final predictions from the meta-model
final_probs = meta_model.predict_proba(meta_test_features)[:, 1]
final_preds = (final_probs >= 0.5).astype(int)
# 18. Report the stacked ensemble's results
print("Stacking model")
print("Accuracy:", accuracy_score(y_test, final_preds))
print(classification_report(y_test, final_preds))
# Visualize the confusion matrix
cm = confusion_matrix(y_test, final_preds)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Stacking Ensemble')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# TODO:
# 1. Use k-fold cross-validation (see the OOF sketch below)
# 2. Try different stacking meta-models
# 3. Summary table + plots of all model results (see the sketch below)
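# --- Sketches for the TODOs above. These are illustrative, were not run as
# --- part of this script, and only reuse names already defined above.
#
# TODO 1/2 sketch: k-fold out-of-fold (OOF) stacking for the sklearn-style base
# models, with an easily swapped meta-model. The LSTM is omitted here; as a
# non-sklearn estimator it would need a manual KFold loop producing its own
# OOF column.
# from sklearn.model_selection import cross_val_predict
# knn_oof = cross_val_predict(KNeighborsClassifier(n_neighbors=5),
#                             X_train_vec, y_train, cv=5, method='predict_proba')[:, 1]
# xgb_oof = cross_val_predict(xgb.XGBClassifier(n_estimators=100, max_depth=6,
#                                               learning_rate=0.1, eval_metric='logloss'),
#                             X_train_vec, y_train, cv=5, method='predict_proba')[:, 1]
# oof_features = np.column_stack((knn_oof, xgb_oof))
# meta = LogisticRegression()  # swap in SVC(probability=True), etc., for TODO 2
# meta.fit(oof_features, y_train)
# # Scoring on the test set would then refit both base models on the full
# # X_train_vec before building the meta test features.
#
# TODO 3 sketch: gather the accuracies printed above into one table and a bar
# chart for side-by-side comparison.
# results = pd.DataFrame({
#     'model': ['KNN', 'XGBoost', 'LSTM', 'Stacking'],
#     'accuracy': [accuracy_score(y_test, knn_test_preds),
#                  accuracy_score(y_test, xgb_test_preds),
#                  accuracy_score(y_test, lstm_test_preds),
#                  accuracy_score(y_test, final_preds)],
# })
# print(results.to_string(index=False))
# results.plot.bar(x='model', y='accuracy', legend=False, ylim=(0, 1))
# plt.ylabel('Accuracy')
# plt.tight_layout()
# plt.show()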