import tensorflow as tf
from tensorflow.keras import datasets, layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import gc
# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

print("Loading data...")
try:
    # Load the training set and test set A (TSV format; nrows limits rows for a quick run)
    train_df = pd.read_csv('../SDX/train_set.csv', sep='\t', nrows=1000)
    test_a_df = pd.read_csv('../SDX/test_a.csv', sep='\t', nrows=1000)
    print(f"Train set shape: {train_df.shape}, test set A shape: {test_a_df.shape}")
    print("Train set preview:")
    print(train_df.head())
except Exception as e:
    print(f"Failed to load data: {str(e)}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Directory contents: {os.listdir('../SDX') if os.path.exists('../SDX') else 'SDX directory does not exist'}")
    raise
# Label-to-category mapping (14 classes; values kept in the original Chinese:
# tech, stocks, sports, entertainment, politics, society, education, finance,
# home, games, real estate, fashion, lottery, horoscope)
class_mapping = {
0: "科技", 1: "股票", 2: "体育", 3: "娱乐",
4: "时政", 5: "社会", 6: "教育", 7: "财经",
8: "家居", 9: "游戏", 10: "房产", 11: "时尚",
12: "彩票", 13: "星座"
}
# Sanity-check the number of classes (with nrows=1000 not all 14 may appear)
if train_df['label'].nunique() != 14:
    print(f"Warning: train set has {train_df['label'].nunique()} distinct labels; expected 14")
# 2. Text preprocessing
print("\nProcessing text data...")

def text_to_sequence(text):
    """Convert a space-separated string of token IDs into a list of ints."""
    try:
        return [int(x) for x in text.split()]
    except Exception as e:
        print(f"Text conversion error: {str(e)}")
        return []
# Convert texts to integer sequences
train_df['sequence'] = train_df['text'].apply(text_to_sequence)
test_a_df['sequence'] = test_a_df['text'].apply(text_to_sequence)

# Sequence-length statistics
lengths = train_df['sequence'].apply(len)
print(f"Sequence length stats: min={min(lengths)}, max={max(lengths)}, mean={np.mean(lengths):.1f}")

# Cap the length at the 95th percentile to limit padding
max_len = int(np.percentile(lengths, 95))
print(f"Max sequence length (95th percentile): {max_len}")  # bug fix: was a hardcoded "15,082"
# Pad (or truncate) every sequence to max_len
X = pad_sequences(train_df['sequence'].values, maxlen=max_len, padding='post')
X_test = pad_sequences(test_a_df['sequence'].values, maxlen=max_len, padding='post')

# Free memory
del lengths
gc.collect()

# Encode labels as consecutive integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['label'])
num_classes = len(np.unique(y))
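# Hedged sketch (an addition, not in the original script): everything trained below
# is the CIFAR-10 CNN, yet the final section calls model.predict(X_test) on these
# padded text sequences. A minimal text classifier compatible with X / y would look
# like this; vocab_size, the embedding width, and the fit settings are assumptions.
vocab_size = int(train_df['sequence'].apply(lambda s: max(s) if s else 0).max()) + 1
text_model = models.Sequential([
    layers.Embedding(vocab_size, 128),           # token ID -> dense vector
    layers.GlobalAveragePooling1D(),             # average over the sequence axis
    layers.Dense(num_classes, activation='softmax')
])
text_model.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])
# text_model.fit(X, y, validation_split=0.1, epochs=3, batch_size=64)  # uncomment to train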
# 1. Data preparation (separate CIFAR-10 CNN experiment)
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
train_images, test_images = train_images / 255.0, test_images / 255.0
# 2. Model builder
def build_model(optimizer='adam', l2_rate=0.001, dropout_rate=0.5):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', padding='same',
                      kernel_regularizer=regularizers.l2(l2_rate),
                      input_shape=(32, 32, 3)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same',
                      kernel_regularizer=regularizers.l2(l2_rate)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same',
                      kernel_regularizer=regularizers.l2(l2_rate)),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling2D(),
        layers.Dense(256, activation='relu',
                     kernel_regularizer=regularizers.l2(l2_rate)),
        layers.Dropout(dropout_rate),
        layers.Dense(10)  # raw logits; the loss below uses from_logits=True
    ])
    if optimizer.lower() == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
    else:
        opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=opt,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    return model
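# Optional sanity check (an addition, not part of the original script): confirm the
# model builds and that a dummy batch produces logits of the expected shape.
sanity_model = build_model()
sanity_model.summary()
print(sanity_model(tf.zeros((1, 32, 32, 3))).shape)  # expect (1, 10)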
# 3. Training configuration
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)
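# ReduceLROnPlateau is imported above but never used; a typical configuration
# (the factor/patience values here are assumptions, not from the original) would be:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)
# To enable it, pass callbacks=[early_stopping, reduce_lr] to model.fit() below.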
# Hyperparameter combinations to compare
params_grid = [
    {'optimizer': 'adam', 'batch_size': 128, 'epochs': 2},
    {'optimizer': 'sgd', 'batch_size': 256, 'epochs': 2}
]
# 4. Training and evaluation
results = []
for params in params_grid:
    print(f"\nCurrent parameter combination: {params}")
    model = build_model(optimizer=params['optimizer'])
    history = model.fit(
        train_images, train_labels,
        validation_data=(test_images, test_labels),
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        callbacks=[early_stopping],
        verbose=1
    )
    # Record the epoch with the lowest validation loss
    best_epoch = np.argmin(history.history['val_loss'])
    results.append({
        'params': params,
        'best_val_acc': history.history['val_accuracy'][best_epoch],
        'best_val_loss': history.history['val_loss'][best_epoch],
        'history': history
    })
# 5. Visualization
plt.figure(figsize=(18, 6))

# Accuracy curves
plt.subplot(1, 2, 1)
for i, result in enumerate(results):
    plt.plot(result['history'].history['accuracy'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (train)")
    plt.plot(result['history'].history['val_accuracy'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (val)")
plt.title('Model Accuracy Comparison', fontsize=14)
plt.ylabel('Accuracy', fontsize=12)
plt.xlabel('Epoch', fontsize=12)
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
# Loss curves
plt.subplot(1, 2, 2)
for i, result in enumerate(results):
    plt.plot(result['history'].history['loss'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (train)")
    plt.plot(result['history'].history['val_loss'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (val)")
plt.title('Model Loss Comparison', fontsize=14)
plt.ylabel('Loss', fontsize=12)
plt.xlabel('Epoch', fontsize=12)
plt.legend(loc='upper right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300)
plt.show()
# 6. Evaluate the best model
best_model_idx = np.argmax([r['best_val_acc'] for r in results])
best_result = results[best_model_idx]
print(f"\nBest parameter combination: {best_result['params']}")
print(f"Highest validation accuracy: {best_result['best_val_acc']*100:.2f}%")

# Confusion-matrix visualization (confusion_matrix itself is already imported above)
from sklearn.metrics import ConfusionMatrixDisplay
predictions = np.argmax(best_result['history'].model.predict(test_images), axis=1)
cm = confusion_matrix(test_labels, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png', dpi=300)
plt.show()
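# classification_report is imported at the top but never used; a per-class summary
# of the same predictions (an addition, not in the original) is a single call:
print(classification_report(test_labels, predictions, digits=3))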
# 7. Generate predictions on test set A
# Task (from the original note): find the specific reason the CSV file could not be
# generated and present the corrected code without changing the original logic.
# Root causes in the original code:
#   1. to_csv() was called before the 'predictions' directory existed, so pandas
#      raised OSError (cannot save into a non-existent directory);
#   2. os.makedirs('prediction_dir', exist_ok=True) and os.path.exists('prediction_path')
#      used string literals instead of the prediction_dir / prediction_path variables.
# The fix below creates the directories first, saves once, then verifies the file.
print("\nGenerating predictions...")
# NOTE: `model` here is the last-trained CIFAR-10 CNN; predicting on the padded
# text sequences X_test only works if a text model (e.g. the sketch above) was
# trained and assigned to `model` instead.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Create output directories BEFORE saving
base_dir = os.path.dirname(os.path.abspath(__file__))
prediction_dir = os.path.join(base_dir, 'predictions')
os.makedirs(prediction_dir, exist_ok=True)  # bug fix: was os.makedirs('prediction_dir', ...)
for dir_name in ['models', 'logs', 'predictions']:
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print(f"Created directory: {dir_name}")

# Save the predictions
prediction_path = os.path.join(prediction_dir, 'test_a_predictions.csv')
test_a_df['prediction'] = y_pred_classes
test_a_df['prediction_class'] = test_a_df['prediction'].map(class_mapping)
try:
    test_a_df.to_csv(prediction_path, index=False)
except Exception as e:
    print(f"Failed to save predictions: {str(e)}")
    raise

# Verify that the file was actually written
if os.path.exists(prediction_path):  # bug fix: was os.path.exists('prediction_path')
    print(f"Predictions saved to: {prediction_path}")
    print(f"File size: {os.path.getsize(prediction_path) / 1024:.2f} KB")
else:
    print(f"Error: file was not created! Current working directory: {os.getcwd()}")
    print(f"Predictions directory contents: {os.listdir(prediction_dir) if os.path.exists(prediction_dir) else 'predictions directory does not exist'}")