预处理文件中的除了原始是第一列为序号,第二列为碱解氮,其他六个文件都是MSC这样预处理文件中,第一列为序号第二列为碱解氮倒数第一列为intercept_b 倒数第二列为slope_a 。而我们需要对这七个文件进行CNN处理。给出完整代码 。原先这个代码生成的数据不好看 。import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# TensorFlow/Keras
try:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
TF_AVAILABLE = True
except ImportError:
print("警告: TensorFlow未安装,跳过CNN分析")
TF_AVAILABLE = False
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
# 设置路径
input_folder = r"C:\Users\yelanke\Desktop\大创\预处理数据"
output_folder = r"C:\Users\yelanke\Desktop\大创\CNN"
# 确保输出文件夹存在
os.makedirs(output_folder, exist_ok=True)
if not TF_AVAILABLE:
print("TensorFlow未安装,无法进行CNN分析")
exit()
def calculate_rpd(y_true, y_pred):
"""计算RPD"""
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
std_y = np.std(y_true)
return std_y / rmse if rmse != 0 else 0
def format_metrics(r2, mae, rmse, rpd):
"""格式化评估指标,保留4位小数"""
# 确保R²在合理范围内
if r2 > 0.9999:
r2 = 0.9999
elif r2 < 0:
r2 = 0.0
return {
'R2': round(r2, 4) if not np.isnan(r2) else 0.0000,
'MAE': round(mae, 4),
'RMSE': round(rmse, 4),
'RPD': round(rpd, 4)
}
def build_cnn_model(input_shape):
"""构建CNN模型"""
model = keras.Sequential([
layers.Input(shape=input_shape),
# 卷积层1
layers.Conv1D(filters=32, kernel_size=3, activation='relu', padding='same'),
layers.BatchNormalization(),
layers.MaxPooling1D(pool_size=2),
layers.Dropout(0.2),
# 卷积层2
layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
layers.BatchNormalization(),
layers.MaxPooling1D(pool_size=2),
layers.Dropout(0.2),
# 卷积层3
layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
layers.BatchNormalization(),
layers.MaxPooling1D(pool_size=2),
layers.Dropout(0.3),
# 展平层
layers.Flatten(),
# 全连接层
layers.Dense(128, activation='relu'),
layers.Dropout(0.3),
layers.Dense(64, activation='relu'),
layers.Dropout(0.3),
# 输出层
layers.Dense(1)
])
# 编译模型
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=0.001),
loss='mse',
metrics=['mae']
)
return model
def reshape_for_cnn(X):
"""将数据reshape为CNN需要的格式"""
return X.reshape(X.shape[0], X.shape[1], 1)
def evaluate_cnn_model(X_train, y_train, X_test, y_test):
"""评估CNN模型并返回预测结果"""
try:
# 标准化
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
# 为CNN reshape数据
X_train_cnn = reshape_for_cnn(X_train_scaled)
X_test_cnn = reshape_for_cnn(X_test_scaled)
# 构建模型
model = build_cnn_model((X_train_cnn.shape[1], X_train_cnn.shape[2]))
# 设置早停
early_stopping = callbacks.EarlyStopping(
monitor='val_loss',
patience=20,
restore_best_weights=True,
verbose=0
)
reduce_lr = callbacks.ReduceLROnPlateau(
monitor='val_loss',
factor=0.5,
patience=10,
min_lr=0.00001,
verbose=0
)
# 训练模型
history = model.fit(
X_train_cnn, y_train_scaled,
validation_split=0.2,
epochs=100,
batch_size=16,
verbose=0,
callbacks=[early_stopping, reduce_lr]
)
# 预测
y_train_pred_scaled = model.predict(X_train_cnn, verbose=0).ravel()
y_test_pred_scaled = model.predict(X_test_cnn, verbose=0).ravel()
# 反标准化
y_train_pred = scaler_y.inverse_transform(y_train_pred_scaled.reshape(-1, 1)).ravel()
y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).ravel()
# 计算评估指标
train_metrics = format_metrics(
r2_score(y_train, y_train_pred),
mean_absolute_error(y_train, y_train_pred),
np.sqrt(mean_squared_error(y_train, y_train_pred)),
calculate_rpd(y_train, y_train_pred)
)
test_metrics = format_metrics(
r2_score(y_test, y_test_pred),
mean_absolute_error(y_test, y_test_pred),
np.sqrt(mean_squared_error(y_test, y_test_pred)),
calculate_rpd(y_test, y_test_pred)
)
# 准备散点图数据
scatter_data = pd.DataFrame({
'实际值': np.round(y_test, 4),
'预测值': np.round(y_test_pred, 4),
'残差': np.round(y_test - y_test_pred, 4),
'样本编号': range(1, len(y_test) + 1)
})
return {
'训练轮次': len(history.history['loss']),
'最佳验证损失': round(min(history.history['val_loss']), 6),
'最终验证损失': round(history.history['val_loss'][-1], 6),
'最终训练损失': round(history.history['loss'][-1], 6),
'建模集': train_metrics,
'预测集': test_metrics,
'建模集样本数': len(y_train),
'预测集样本数': len(y_test),
'散点图数据': scatter_data,
'训练集预测值': y_train_pred,
'测试集预测值': y_test_pred
}
except Exception as e:
print(f" 模型评估错误: {str(e)}")
return None
def save_beautiful_excel(results, output_path):
"""保存美观的Excel报告"""
if not results:
print(" 警告: 没有评估结果可保存")
return False
# 创建DataFrame
data = []
for result in results:
row = {
'文件名': result['文件名'],
'总样本数': result['总样本数'],
'训练轮次': result['训练轮次'],
'最佳验证损失': result['最佳验证损失'],
'最终验证损失': result['最终验证损失'],
'最终训练损失': result['最终训练损失'],
# 建模集指标
'建模集样本数': result['建模集样本数'],
'建模集_R2': result['建模集']['R2'],
'建模集_MAE': result['建模集']['MAE'],
'建模集_RMSE': result['建模集']['RMSE'],
'建模集_RPD': result['建模集']['RPD'],
# 预测集指标
'预测集样本数': result['预测集样本数'],
'预测集_R2': result['预测集']['R2'],
'预测集_MAE': result['预测集']['MAE'],
'预测集_RMSE': result['预测集']['RMSE'],
'预测集_RPD': result['预测集']['RPD'],
# 性能差异
'R2差异': round(result['建模集']['R2'] - result['预测集']['R2'], 4),
'RPD差异': round(result['建模集']['RPD'] - result['预测集']['RPD'], 4)
}
data.append(row)
df = pd.DataFrame(data)
# 使用openpyxl引擎创建Excel
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
# 写入主要数据
df.to_excel(writer, sheet_name='评估结果', index=False)
# 获取工作簿和工作表
workbook = writer.book
worksheet = writer.sheets['评估结果']
# 定义样式
from openpyxl.styles import PatternFill, Font, Alignment, Border, Side
from openpyxl.formatting.rule import ColorScaleRule, CellIsRule
# 表头样式
header_fill = PatternFill(start_color="C0504D", end_color="C0504D", fill_type="solid")
header_font = Font(color="FFFFFF", bold=True, size=11)
header_alignment = Alignment(horizontal="center", vertical="center")
# 应用表头样式
for cell in worksheet[1]:
cell.fill = header_fill
cell.font = header_font
cell.alignment = header_alignment
# 设置列宽
column_widths = {
'A': 35, # 文件名
'B': 12, # 总样本数
'C': 12, # 训练轮次
'D': 14, # 最佳验证损失
'E': 14, # 最终验证损失
'F': 14, # 最终训练损失
'G': 14, # 建模集样本数
'H': 12, # 建模集_R2
'I': 12, # 建模集_MAE
'J': 12, # 建模集_RMSE
'K': 12, # 建模集_RPD
'L': 14, # 预测集样本数
'M': 12, # 预测集_R2
'N': 12, # 预测集_MAE
'O': 12, # 预测集_RMSE
'P': 12, # 预测集_RPD
'Q': 12, # R2差异
'R': 12, # RPD差异
}
for col, width in column_widths.items():
worksheet.column_dimensions[col].width = width
# 应用条件格式
# R2的条件格式
r2_rule = ColorScaleRule(start_type='min', start_color='FF0000',
mid_type='percentile', mid_value=50, mid_color='FFFF00',
end_type='max', end_color='00FF00')
worksheet.conditional_formatting.add('H2:H1000', r2_rule)
worksheet.conditional_formatting.add('M2:M1000', r2_rule)
# RPD的条件格式
rpd_rule = ColorScaleRule(start_type='min', start_color='FF0000',
mid_type='percentile', mid_value=50, mid_color='FFFF00',
end_type='max', end_color='00FF00')
worksheet.conditional_formatting.add('K2:K1000', rpd_rule)
worksheet.conditional_formatting.add('P2:P1000', rpd_rule)
# 验证损失的条件格式(越低越好)
loss_rule = ColorScaleRule(start_type='min', start_color='00FF00',
mid_type='percentile', mid_value=50, mid_color='FFFF00',
end_type='max', end_color='FF0000')
worksheet.conditional_formatting.add('D2:D1000', loss_rule)
worksheet.conditional_formatting.add('E2:E1000', loss_rule)
worksheet.conditional_formatting.add('F2:F1000', loss_rule)
# 添加边框
thin_border = Border(left=Side(style='thin'),
right=Side(style='thin'),
top=Side(style='thin'),
bottom=Side(style='thin'))
for row in worksheet.iter_rows(min_row=1, max_row=len(df)+1, max_col=18):
for cell in row:
cell.border = thin_border
# 创建统计汇总sheet
summary_data = {
'统计项': ['平均值', '中位数', '最大值', '最小值', '标准差'],
'建模集_R2': [
round(df['建模集_R2'].mean(), 4),
round(df['建模集_R2'].median(), 4),
round(df['建模集_R2'].max(), 4),
round(df['建模集_R2'].min(), 4),
round(df['建模集_R2'].std(), 4)
],
'预测集_R2': [
round(df['预测集_R2'].mean(), 4),
round(df['预测集_R2'].median(), 4),
round(df['预测集_R2'].max(), 4),
round(df['预测集_R2'].min(), 4),
round(df['预测集_R2'].std(), 4)
],
'建模集_RPD': [
round(df['建模集_RPD'].mean(), 4),
round(df['建模集_RPD'].median(), 4),
round(df['建模集_RPD'].max(), 4),
round(df['建模集_RPD'].min(), 4),
round(df['建模集_RPD'].std(), 4)
],
'预测集_RPD': [
round(df['预测集_RPD'].mean(), 4),
round(df['预测集_RPD'].median(), 4),
round(df['预测集_RPD'].max(), 4),
round(df['预测集_RPD'].min(), 4),
round(df['预测集_RPD'].std(), 4)
],
'最佳验证损失': [
round(df['最佳验证损失'].mean(), 6),
round(df['最佳验证损失'].median(), 6),
round(df['最佳验证损失'].max(), 6),
round(df['最佳验证损失'].min(), 6),
round(df['最佳验证损失'].std(), 6)
]
}
summary_df = pd.DataFrame(summary_data)
summary_df.to_excel(writer, sheet_name='统计汇总', index=False)
# 格式化汇总表
summary_ws = writer.sheets['统计汇总']
for cell in summary_ws[1]:
cell.fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid")
cell.font = Font(color="FFFFFF", bold=True)
cell.alignment = Alignment(horizontal="center")
for col in ['A', 'B', 'C', 'D', 'E', 'F']:
summary_ws.column_dimensions[col].width = 15
# 创建模型性能分类sheet
performance_data = []
for idx, row in df.iterrows():
# 根据R2分类
if row['预测集_R2'] >= 0.9:
r2_class = '优秀'
elif row['预测集_R2'] >= 0.8:
r2_class = '良好'
elif row['预测集_R2'] >= 0.6:
r2_class = '一般'
else:
r2_class = '较差'
# 根据RPD分类
if row['预测集_RPD'] >= 3:
rpd_class = '优秀'
elif row['预测集_RPD'] >= 2:
rpd_class = '良好'
elif row['预测集_RPD'] >= 1.5:
rpd_class = '一般'
else:
rpd_class = '较差'
# 训练效果分类
if row['最佳验证损失'] < 0.01:
loss_class = '很好'
elif row['最佳验证损失'] < 0.05:
loss_class = '良好'
elif row['最佳验证损失'] < 0.1:
loss_class = '一般'
else:
loss_class = '较差'
performance_data.append({
'文件名': row['文件名'],
'R2分类': r2_class,
'RPD分类': rpd_class,
'训练效果': loss_class,
'训练轮次': row['训练轮次'],
'最佳验证损失': row['最佳验证损失'],
'预测集_R2': row['预测集_R2'],
'预测集_RPD': row['预测集_RPD']
})
performance_df = pd.DataFrame(performance_data)
performance_df.to_excel(writer, sheet_name='性能分类', index=False)
# 格式化性能分类表
perf_ws = writer.sheets['性能分类']
for cell in perf_ws[1]:
cell.fill = PatternFill(start_color="8064A2", end_color="8064A2", fill_type="solid")
cell.font = Font(color="FFFFFF", bold=True)
cell.alignment = Alignment(horizontal="center")
for col in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']:
perf_ws.column_dimensions[col].width = 15
# 保存成功
return True
def save_scatter_data(results, output_path):
"""保存散点图数据"""
if not results:
print(" 警告: 没有散点图数据可保存")
return False
# 使用openpyxl引擎创建Excel
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
for result in results:
if '散点图数据' in result and result['散点图数据'] is not None:
# 创建sheet名称
sheet_name = result['文件名']
# 移除扩展名
for ext in ['.xlsx', '.xls', '.csv', '.XLSX', '.XLS', '.CSV']:
sheet_name = sheet_name.replace(ext, '')
# 限制sheet名长度
sheet_name = sheet_name[:31]
# 替换无效字符
invalid_chars = ['/', '\\', '?', '*', ':', '[', ']']
for char in invalid_chars:
sheet_name = sheet_name.replace(char, '_')
# 保存散点图数据
scatter_df = result['散点图数据']
scatter_df.to_excel(writer, sheet_name=sheet_name, index=False)
# 获取工作表并设置样式
worksheet = writer.sheets[sheet_name]
# 定义样式
from openpyxl.styles import PatternFill, Font, Alignment, Border, Side
# 表头样式
header_fill = PatternFill(start_color="FF6600", end_color="FF6600", fill_type="solid")
header_font = Font(color="FFFFFF", bold=True, size=11)
header_alignment = Alignment(horizontal="center", vertical="center")
# 应用表头样式
for cell in worksheet[1]:
cell.fill = header_fill
cell.font = header_font
cell.alignment = header_alignment
# 设置列宽
column_widths = {
'A': 15, # 实际值
'B': 15, # 预测值
'C': 15, # 残差
'D': 15 # 样本编号
}
for col, width in column_widths.items():
worksheet.column_dimensions[col].width = width
# 添加边框
thin_border = Border(left=Side(style='thin'),
right=Side(style='thin'),
top=Side(style='thin'),
bottom=Side(style='thin'))
for row in worksheet.iter_rows(min_row=1, max_row=len(scatter_df)+1, max_col=4):
for cell in row:
cell.border = thin_border
# 在顶部添加模型信息
worksheet.insert_rows(1, 6)
worksheet.merge_cells('A1:D1')
worksheet['A1'] = f"CNN模型预测结果 - {result['文件名']}"
worksheet['A1'].font = Font(bold=True, size=14)
worksheet['A1'].alignment = Alignment(horizontal="center")
worksheet['A2'] = f"训练轮次: {result['训练轮次']}"
worksheet['B2'] = f"最佳验证损失: {result['最佳验证损失']:.6f}"
worksheet['C2'] = f"最终验证损失: {result['最终验证损失']:.6f}"
worksheet['A3'] = f"最终训练损失: {result['最终训练损失']:.6f}"
worksheet['B3'] = f"建模集样本数: {result['建模集样本数']}"
worksheet['C3'] = f"预测集样本数: {result['预测集样本数']}"
worksheet['A4'] = f"建模集R²: {result['建模集']['R2']:.4f}"
worksheet['B4'] = f"建模集RPD: {result['建模集']['RPD']:.4f}"
worksheet['C4'] = f"建模集RMSE: {result['建模集']['RMSE']:.4f}"
worksheet['A5'] = f"预测集R²: {result['预测集']['R2']:.4f}"
worksheet['B5'] = f"预测集RPD: {result['预测集']['RPD']:.4f}"
worksheet['C5'] = f"预测集RMSE: {result['预测集']['RMSE']:.4f}"
# 设置信息行的样式
for row in range(2, 6):
for col in range(1, 5):
cell = worksheet.cell(row=row, column=col)
cell.font = Font(bold=True)
return True
print("开始CNN分析...")
print("=" * 60)
# 获取所有文件
files = [f for f in os.listdir(input_folder) if f.lower().endswith(('.xlsx', '.xls', '.csv'))]
print(f"找到 {len(files)} 个文件需要处理")
all_results = []
for file_idx, filename in enumerate(files, 1):
print(f"正在处理文件 {file_idx}/{len(files)}: {filename}")
try:
# 读取文件
file_path = os.path.join(input_folder, filename)
if filename.lower().endswith('.csv'):
df = pd.read_csv(file_path)
else:
df = pd.read_excel(file_path)
print(f" 数据形状: {df.shape}")
# 数据清洗
df = df.dropna()
if df.shape[1] < 2:
print(f" 跳过:数据列数不足,至少需要2列")
continue
if len(df) < 20:
print(f" 跳过:样本量不足,至少需要20个样本")
continue
# 假设最后一列是目标变量
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
# 划分建模集和预测集
X_modeling, X_prediction, y_modeling, y_prediction = train_test_split(
X, y, test_size=0.3, random_state=42
)
print(f" 建模集大小: {X_modeling.shape[0]}, 预测集大小: {X_prediction.shape[0]}")
# 评估模型
model_results = evaluate_cnn_model(X_modeling, y_modeling, X_prediction, y_prediction)
if model_results is None:
print(f" 模型评估失败")
continue
# 添加文件信息
model_results['文件名'] = filename
model_results['总样本数'] = len(y)
all_results.append(model_results)
print(f" 完成: 建模集R2={model_results['建模集']['R2']:.4f}, "
f"预测集R2={model_results['预测集']['R2']:.4f}, "
f"训练轮次={model_results['训练轮次']}")
except Exception as e:
print(f" 文件处理错误: {str(e)}")
continue
print("\n" + "=" * 60)
if all_results:
# 保存评估报告
report_path = os.path.join(output_folder, 'CNN模型评估报告.xlsx')
print(f"正在保存评估报告...")
save_success = save_beautiful_excel(all_results, report_path)
if save_success:
print(f"✓ 评估报告已保存到: {report_path}")
# 保存散点图数据
scatter_path = os.path.join(output_folder, 'CNN预测集散点图数据.xlsx')
print(f"正在保存散点图数据...")
scatter_success = save_scatter_data(all_results, scatter_path)
if scatter_success:
print(f"✓ 散点图数据已保存到: {scatter_path}")
# 打印汇总统计
print("\n汇总统计:")
print("-" * 40)
# 创建临时DataFrame用于统计
temp_data = []
for r in all_results:
temp_data.append({
'文件名': r['文件名'],
'建模集_R2': r['建模集']['R2'],
'预测集_R2': r['预测集']['R2'],
'建模集_RPD': r['建模集']['RPD'],
'预测集_RPD': r['预测集']['RPD'],
'最佳验证损失': r['最佳验证损失']
})
df_summary = pd.DataFrame(temp_data)
print(f"平均建模集R²: {df_summary['建模集_R2'].mean():.4f}")
print(f"平均预测集R²: {df_summary['预测集_R2'].mean():.4f}")
print(f"平均建模集RPD: {df_summary['建模集_RPD'].mean():.4f}")
print(f"平均预测集RPD: {df_summary['预测集_RPD'].mean():.4f}")
print(f"平均验证损失: {df_summary['最佳验证损失'].mean():.6f}")
# 统计优秀模型
excellent_models = df_summary[df_summary['预测集_R2'] >= 0.9]
print(f"优秀模型数量 (R²≥0.9): {len(excellent_models)}")
# 显示每个文件的简要结果
print("\n各文件评估结果:")
print("-" * 40)
for r in all_results:
print(f" {r['文件名']}: 建模集R²={r['建模集']['R2']:.4f}, "
f"预测集R²={r['预测集']['R2']:.4f}, "
f"训练轮次={r['训练轮次']}")
else:
print("警告: 没有成功处理任何文件")
print("=" * 60)
print("CNN分析完成!")
根据问题描述,我们需要对七个文件进行CNN处理。但是,文件分为两种类型:
原始文件(第一列为序号,第二列为碱解氮,后面的列为光谱数据)
预处理文件(MSC等预处理方法,第一列为序号,第二列为碱解氮,倒数第一列为intercept_b,倒数第二列为slope_a,中间为光谱数据)
注意:原始文件中,除了第一列序号和第二列碱解氮,其余列都是光谱数据。而预处理文件中,除了第一列序号、第二列碱解氮、倒数两列(intercept_b和slope_a),中间列是光谱数据。
因此,在读取数据时,我们需要根据文件名判断是否是原始文件,然后提取相应的特征列和目标列。
目标变量:碱解氮(第二列)
特征变量:
对于原始文件:从第三列开始到最后一列(光谱数据)
对于预处理文件:从第三列开始到倒数第三列(因为倒数两列是intercept_b和slope_a,不需要作为特征)
另外,注意第一列是序号,我们不需要。
修改思路:
读取文件后,先判断文件名是否包含“原始”二字(或者根据预处理方法名判断,但题目说除了原始文件其他都是MSC等预处理,所以我们可以通过文件名是否包含“原始”来区分)
如果不包含“原始”,则按照预处理文件处理:目标列为第二列,特征列为第三列到倒数第三列(包含)
如果包含“原始”,则按照原始文件处理:目标列为第二列,特征列为第三列到最后一列
但是,题目中提到了七个文件,一个原始,六个预处理(MSC等)。所以,我们假设文件名中如果包含“原始”则为原始文件,否则为预处理文件。
注意:预处理文件也可能有其他预处理方法,但倒数两列都是intercept_b和slope_a。
因此,我们修改读取数据部分的代码:
原始文件:特征列索引为2:(第三列到最后),目标列索引为1(第二列)
预处理文件:特征列索引为2:-2(第三列到倒数第三列),目标列索引为1(第二列)
另外,注意:有些文件可能有多余的列,比如序号列我们已经不考虑,但是确保读取正确的列。
步骤:
读取数据框
删除所有空值的行
根据文件名判断文件类型
提取对应的特征列和目标列