# coding=utf-8
# Date: June 16, 2024
# Filename: Ensemble_Models.py
# Ensemble models: improve prediction accuracy with model fusion and stacking strategies
import time
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
try:
    from catboost import CatBoostRegressor
    CATBOOST_AVAILABLE = True
except ImportError:
    CATBOOST_AVAILABLE = False
    print("Warning: CatBoost is not installed; the CatBoost model will be skipped")
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
start_time = time.time()
# Load the datasets
train_dataSet = pd.read_csv(r'modified_数据集Time_Series661_detail.dat')
test_dataSet = pd.read_csv(r'modified_数据集Time_Series662_detail.dat')
# columns are the original (clean) target columns; noise_columns are their noise-corrupted counterparts used as inputs
columns = ['T_SONIC', 'CO2_density', 'CO2_density_fast_tmpr', 'H2O_density', 'H2O_sig_strgth', 'CO2_sig_strgth']
noise_columns = ['Error_T_SONIC', 'Error_CO2_density', 'Error_CO2_density_fast_tmpr', 'Error_H2O_density',
'Error_H2O_sig_strgth', 'Error_CO2_sig_strgth']
# Extract features (noisy columns) and targets (clean columns) for train and test
X_train = train_dataSet[noise_columns].copy()
y_train = train_dataSet[columns].copy()
X_test = test_dataSet[noise_columns].copy()
y_test = test_dataSet[columns].copy()
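# Optional sanity check (a suggested addition, not in the original script):
# features and targets must be row-aligned before training
assert X_train.shape[0] == y_train.shape[0], "train features/targets are misaligned"
assert X_test.shape[0] == y_test.shape[0], "test features/targets are misaligned"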
print("="*60)
print("集成模型训练")
print("="*60)
print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")
# Preprocessing: scale features in a way that is robust to outliers
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
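# For reference: with its defaults, RobustScaler transforms each feature as
# (x - median) / IQR, where IQR is the interquartile range (75th minus 25th
# percentile), so a few extreme outliers barely affect the scaling.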
# Define the base models
# Note: XGBoost and RandomForest support multi-output regression natively;
# the other models are wrapped in MultiOutputRegressor
base_models = {
'XGBoost': XGBRegressor(
n_estimators=300,
max_depth=6,
learning_rate=0.05,
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=5,
reg_lambda=10,
min_child_weight=3,
random_state=217,
n_jobs=-1,
tree_method='hist'
),
'LightGBM': MultiOutputRegressor(LGBMRegressor(
n_estimators=300,
max_depth=6,
learning_rate=0.05,
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=5,
reg_lambda=10,
min_child_samples=20,
random_state=217,
        n_jobs=1,  # one job per output; MultiOutputRegressor parallelizes across outputs
verbose=-1
    ), n_jobs=-1),  # MultiOutputRegressor uses all CPU cores
'RandomForest': RandomForestRegressor(
n_estimators=200,
max_depth=10,
min_samples_split=5,
min_samples_leaf=2,
random_state=217,
n_jobs=-1
),
'GradientBoosting': MultiOutputRegressor(GradientBoostingRegressor(
n_estimators=200,
max_depth=6,
learning_rate=0.05,
subsample=0.8,
random_state=217
), n_jobs=-1)
}
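# Note (version assumption): fitting XGBRegressor directly on a multi-column y
# relies on XGBoost's native multi-output support, which is only available in
# recent releases; older versions would need a MultiOutputRegressor wrapper too.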
if CATBOOST_AVAILABLE:
base_models['CatBoost'] = MultiOutputRegressor(CatBoostRegressor(
iterations=300,
depth=6,
learning_rate=0.05,
l2_leaf_reg=10,
random_seed=217,
verbose=False,
        thread_count=1  # one thread per output; MultiOutputRegressor handles the parallelism
), n_jobs=-1)
# Add a multilayer perceptron (MLP) model
base_models['MLP'] = MultiOutputRegressor(MLPRegressor(
    hidden_layer_sizes=(128, 64, 32),  # three hidden layers with 128, 64, and 32 units
    activation='relu',                 # ReLU activation
    solver='adam',                     # Adam optimizer
    alpha=0.001,                       # L2 regularization strength
    batch_size='auto',                 # automatic batch size
    learning_rate='adaptive',          # adaptive learning-rate schedule
    learning_rate_init=0.001,          # initial learning rate
    max_iter=500,                      # maximum number of training iterations (epochs)
    shuffle=True,                      # shuffle samples each iteration
    random_state=217,                  # random seed
    tol=1e-4,                          # convergence tolerance
    early_stopping=True,               # stop early when validation stops improving
    validation_fraction=0.1,           # fraction of training data held out for early stopping
    n_iter_no_change=20,               # patience (iterations) for early stopping
    verbose=False                      # suppress training output
), n_jobs=-1)
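# Unlike the tree ensembles above, the MLP is sensitive to feature scale, which
# is one reason the inputs are passed through RobustScaler before training.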
def create_model_instance(name, fold_idx=None):
"""
创建模型实例的辅助函数
"""
random_seed = 217 if fold_idx is None else 217 + fold_idx
if name == 'XGBoost':
return XGBRegressor(
n_estimators=300, max_depth=6, learning_rate=0.05,
subsample=0.8, colsample_bytree=0.8, reg_alpha=5,
reg_lambda=10, min_child_weight=3, random_state=random_seed,
n_jobs=-1, tree_method='hist'
)
elif name == 'LightGBM':
return MultiOutputRegressor(LGBMRegressor(
n_estimators=300, max_depth=6, learning_rate=0.05,
subsample=0.8, colsample_bytree=0.8, reg_alpha=5,
reg_lambda=10, min_child_samples=20, random_state=random_seed,
n_jobs=1, verbose=-1
), n_jobs=-1)
elif name == 'RandomForest':
return RandomForestRegressor(
n_estimators=200, max_depth=10, min_samples_split=5,
min_samples_leaf=2, random_state=random_seed, n_jobs=-1
)
elif name == 'GradientBoosting':
return MultiOutputRegressor(GradientBoostingRegressor(
n_estimators=200, max_depth=6, learning_rate=0.05,
subsample=0.8, random_state=random_seed
), n_jobs=-1)
elif name == 'CatBoost' and CATBOOST_AVAILABLE:
return MultiOutputRegressor(CatBoostRegressor(
iterations=300, depth=6, learning_rate=0.05,
l2_leaf_reg=10, random_seed=random_seed,
verbose=False, thread_count=1
), n_jobs=-1)
elif name == 'MLP':
return MultiOutputRegressor(MLPRegressor(
hidden_layer_sizes=(128, 64, 32),
activation='relu',
solver='adam',
alpha=0.001,
batch_size='auto',
learning_rate='adaptive',
learning_rate_init=0.001,
max_iter=500,
shuffle=True,
random_state=random_seed,
tol=1e-4,
early_stopping=True,
validation_fraction=0.1,
n_iter_no_change=20,
verbose=False
), n_jobs=-1)
    else:
        # Fall back to cloning the configuration from base_models
        if name in base_models:
            original = base_models[name]
            if isinstance(original, MultiOutputRegressor):
                inner = original.estimator
                inner_params = inner.get_params()
                # Propagate the fold-specific seed to the cloned inner estimator
                if 'random_state' in inner_params:
                    inner_params['random_state'] = random_seed
                elif 'random_seed' in inner_params:
                    inner_params['random_seed'] = random_seed
                return MultiOutputRegressor(
                    type(inner)(**inner_params),
                    n_jobs=-1
                )
            else:
                params = original.get_params()
                if 'random_state' in params:
                    params['random_state'] = random_seed
                elif 'random_seed' in params:
                    params['random_seed'] = random_seed
                return type(original)(**params)
        else:
            raise ValueError(f"Unknown model name: {name}")
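# Illustrative usage of the helper (hypothetical calls, not executed here):
#   create_model_instance('XGBoost')              # base seed 217
#   create_model_instance('XGBoost', fold_idx=3)  # seed 217 + 3 = 220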
def stacking_predictions(X_train, y_train, X_test, base_models, n_folds=5):
    """
    Stacking ensemble: out-of-fold predictions from the base models become
    meta-features on which a Ridge meta-model is trained.
    """
    n_samples = X_train.shape[0]
    n_features = y_train.shape[1]
    n_models = len(base_models)
    # Out-of-fold predictions of each base model on the training set
    # (these become the inputs of the meta-model)
    train_meta_features = np.zeros((n_samples, n_features * n_models))
    # Fold-averaged predictions of each base model on the test set
    test_meta_features = np.zeros((X_test.shape[0], n_features * n_models))
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=217)
    print(f"\nRunning stacking with {n_folds}-fold cross-validation...")
    # Train each base model and collect its predictions
    for model_idx, (name, model) in enumerate(base_models.items()):
        print(f"  Training base model: {name}")
        test_preds = np.zeros((X_test.shape[0], n_features))
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            # Split this fold into training and validation parts
            X_tr = X_train.iloc[train_idx]
            y_tr = y_train.iloc[train_idx]
            X_val = X_train.iloc[val_idx]
            # Fit a fresh instance (the helper creates a new model per fold)
            model_copy = create_model_instance(name, fold_idx)
            model_copy.fit(X_tr, y_tr)
            # Out-of-fold predictions on the validation part
            val_preds = model_copy.predict(X_val)
            train_meta_features[val_idx, model_idx*n_features:(model_idx+1)*n_features] = val_preds
            # Predict on the test set and average over the folds
            test_preds += model_copy.predict(X_test) / n_folds
        test_meta_features[:, model_idx*n_features:(model_idx+1)*n_features] = test_preds
    # Ridge regression as the meta-model
    print("  Training the meta-model (Ridge regression)...")
    meta_model = Ridge(alpha=1.0)
    meta_model.fit(train_meta_features, y_train)
    # Final predictions on the test set
    final_predictions = meta_model.predict(test_meta_features)
    return final_predictions
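# Shape example for this script (assuming CatBoost is installed, i.e. 6 base
# models and 6 targets): the meta-feature matrices have 6 * 6 = 36 columns,
# laid out as one contiguous block of 6 target predictions per base model.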
def weighted_average_predictions(X_train, y_train, X_test, base_models, weights=None):
    """
    Weighted-average ensemble: unless explicit weights are supplied, each model's
    weight is inversely proportional to its cross-validated MAE.
    """
    if weights is None:
        # No weights given: derive them from cross-validated performance
        print("\nComputing model weights...")
        kf = KFold(n_splits=5, shuffle=True, random_state=217)
        model_scores = {}
        for name, model in base_models.items():
            print(f"  Evaluating model: {name}")
            scores = []
            for train_idx, val_idx in kf.split(X_train):
                X_tr = X_train.iloc[train_idx]
                y_tr = y_train.iloc[train_idx]
                X_val = X_train.iloc[val_idx]
                y_val = y_train.iloc[val_idx]
                # Fit a fresh model instance created by the helper
                model_copy = create_model_instance(name)
                model_copy.fit(X_tr, y_tr)
                preds = model_copy.predict(X_val)
                mae = mean_absolute_error(y_val, preds)
                scores.append(mae)
            model_scores[name] = np.mean(scores)
            print(f"    {name} mean MAE: {model_scores[name]:.6f}")
        # Turn MAEs into weights (smaller MAE -> larger weight):
        # take reciprocals, then normalize so the weights sum to 1
        inv_scores = {name: 1.0 / score for name, score in model_scores.items()}
        total_inv = sum(inv_scores.values())
        weights = {name: inv_scores[name] / total_inv for name in model_scores.keys()}
        print("\nModel weights:")
        for name, weight in weights.items():
            print(f"  {name}: {weight:.4f}")
    # Refit each base model on the full training set and blend the predictions
    print("\nPredicting with the weighted average...")
    predictions = None
    for name, model in base_models.items():
        print(f"  Training and predicting: {name}")
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        if predictions is None:
            predictions = weights[name] * pred
        else:
            predictions += weights[name] * pred
    return predictions
# Method 1: stacking ensemble
print("\n" + "="*60)
print("Method 1: Stacking ensemble")
print("="*60)
stacking_pred = stacking_predictions(X_train_scaled, y_train, X_test_scaled, base_models, n_folds=5)
stacking_mae = mean_absolute_error(y_test, stacking_pred)
print(f"\nStacking ensemble MAE: {stacking_mae:.6f}")
# Method 2: weighted-average ensemble
print("\n" + "="*60)
print("Method 2: Weighted-average ensemble")
print("="*60)
weighted_pred = weighted_average_predictions(X_train_scaled, y_train, X_test_scaled, base_models)
weighted_mae = mean_absolute_error(y_test, weighted_pred)
print(f"\nWeighted-average ensemble MAE: {weighted_mae:.6f}")
# Method 3: simple (unweighted) average ensemble
print("\n" + "="*60)
print("Method 3: Simple average ensemble")
print("="*60)
simple_pred = None
for name, model in base_models.items():
    print(f"  Training: {name}")
    model.fit(X_train_scaled, y_train)
    pred = model.predict(X_test_scaled)
    if simple_pred is None:
        simple_pred = pred
    else:
        simple_pred += pred
simple_pred /= len(base_models)
simple_mae = mean_absolute_error(y_test, simple_pred)
print(f"\nSimple average ensemble MAE: {simple_mae:.6f}")
# Compare the ensembles and pick the best one by test MAE
ensemble_results = {
    'Stacking': (stacking_pred, stacking_mae),
    'Weighted_Average': (weighted_pred, weighted_mae),
    'Simple_Average': (simple_pred, simple_mae)
}
best_method = min(ensemble_results.keys(), key=lambda x: ensemble_results[x][1])
best_pred, best_mae = ensemble_results[best_method]
print("\n" + "="*60)
print("Ensemble method comparison:")
print("="*60)
for method in sorted(ensemble_results.keys(), key=lambda x: ensemble_results[x][1]):
    mae = ensemble_results[method][1]
    print(f"{method:20s} - MAE: {mae:.6f}")
print("\n" + "="*60)
print(f"Best ensemble method: {best_method}")
print(f"Best MAE: {best_mae:.6f}")
print("="*60)
# Save the results: per row, space-separated true values, predictions, and absolute errors
results_list = []
for true_value, predicted_value in zip(y_test.values, best_pred):
    error = np.abs(true_value - predicted_value)
    formatted_true_value = ' '.join(map(str, true_value))
    formatted_predicted_value = ' '.join(map(str, predicted_value))
    formatted_error = ' '.join(map(str, error))
    results_list.append([formatted_true_value, formatted_predicted_value, formatted_error])
result_df = pd.DataFrame(results_list, columns=['True_Value', 'Predicted_Value', 'Error'])
result_file = "result_Ensemble.csv"
result_df.to_csv(result_file, index=False)
print(f"\nPredictions saved to: {result_file}")
# Read the saved file back and report the mean error per target
print("\n" + "<*>"*30)
data = pd.read_csv(result_file)
error_column = data.iloc[:, 2]
numbers = error_column.str.split(' ', expand=True).apply(pd.to_numeric)
means = numbers.mean()
print("Mean error for each of the 6 targets:")
for col, mean_err in zip(columns, means):
    print(f"  {col}: {mean_err:.6f}")
print(f"\nOverall mean error: {means.mean():.6f}")
print("<*>"*30)
end_time = time.time()
print(f"\nTotal runtime: {end_time - start_time:.3f} seconds")