import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from bayes_opt import BayesianOptimization
import warnings
# Suppress warnings
warnings.filterwarnings('ignore')
# Configure fonts (with CJK-capable fallbacks) and minus-sign rendering
plt.rcParams['font.sans-serif'] = ['Arial', 'Arial Unicode MS', 'Microsoft YaHei', 'sans-serif']
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")
# Load the dataset
df = pd.read_csv('/Jupyter/BCC_HEA/alloy_properties.csv')
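# NOTE: the path above is environment-specific; the CSV is assumed to contain
# the six feature columns and two target columns listed below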
features = ['mean_r', 'mean_electronegativity', 'mean_G', 'mean_vec', 'mean_delta','mean_delta_G']
targets = ['deta_E_mono','deta_E_di']
# Model names, in evaluation order
model_names = ['GBR', 'SVR', 'DTR', 'RFR', 'GPR']
# Run the full workflow for each target variable
for target in targets:
print(f"\n\n{'='*50}")
print(f"开始处理目标变量: {target}")
print(f"{'='*50}\n")
# Split the data: training + validation + test sets
X = df[features]
y = df[target]
# Hold out the test set (30%)
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Split a validation set off the training data (20%, used for tuning)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)
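# Net split of the full dataset: 56% train (0.7 * 0.8), 14% validation
# (0.7 * 0.2), 30% test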
# Containers for model performance
model_performance_before = {}
model_performance_after = {}
best_params_dict = {}
# 1. Baseline model training and evaluation (test-set metrics are primary)
print(f"\n{'='*30} {target} - baseline model performance {'='*30}")
# Define the baseline models
models = {
'GBR': GradientBoostingRegressor(random_state=42),
'SVR': make_pipeline(StandardScaler(), SVR(kernel='rbf')),
'DTR': DecisionTreeRegressor(random_state=42),
'RFR': RandomForestRegressor(random_state=42, n_jobs=-1),
'GPR': make_pipeline(StandardScaler(), GaussianProcessRegressor(random_state=42))
}
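# SVR and GPR rely on distance-based kernels that are sensitive to feature
# scale, hence the StandardScaler pipelines; the tree-based models
# (GBR, DTR, RFR) are scale-invariant and need no standardization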
# Print the default hyperparameters of the baseline models
print(f'\n\n===== {target} - baseline model default hyperparameters =====')
for model_name, model in models.items():
print(f"\n{model_name} default hyperparameters:")
default_params = model.get_params()
for param, value in default_params.items():
print(f" {param}: {value}")
# Train and evaluate the baseline models (test-set metrics)
for model_name in model_names:
model = models[model_name]
model.fit(X_train_full, y_train_full) # fit on the full training set (train + validation)
# Predict
y_train_pred = model.predict(X_train_full) # training-set predictions
y_test_pred = model.predict(X_test) # test-set predictions
# Compute metrics
train_mae = mean_absolute_error(y_train_full, y_train_pred)
train_r2 = r2_score(y_train_full, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
# Cross-validation (optional, for reference)
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
cv_score = cross_val_score(model, X_train_full, y_train_full, cv=cv, scoring='neg_mean_absolute_error').mean()
cv_mae = -cv_score
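# cross_val_score returns the negated MAE (scikit-learn treats higher scores
# as better), hence the sign flip; 5 splits x 3 repeats = 15 fits per model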
# Store performance
model_performance_before[model_name] = {
'Train MAE': train_mae,
'Train R2': train_r2,
'Test MAE': test_mae,
'Test R2': test_r2,
'CV MAE': cv_mae
}
# Print results
print(f"\n{model_name} baseline model performance:")
print(f" Training set: MAE = {train_mae:.4f}, R² = {train_r2:.4f}")
print(f" Test set: MAE = {test_mae:.4f}, R² = {test_r2:.4f}")
print(f" Cross-validation MAE = {cv_mae:.4f}")
# Plot the baseline fit (training + test sets)
plt.figure(figsize=(8, 6))
plt.scatter(y_train_full, y_train_pred, alpha=0.7, label='training set', c='blue')
plt.scatter(y_test, y_test_pred, alpha=0.7, label='test set', c='red')
# Trend line
all_y = np.concatenate([y_train_full, y_test])
all_pred = np.concatenate([y_train_pred, y_test_pred])
z = np.polyfit(all_y, all_pred, 1)
p = np.poly1d(z)
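# A degree-1 polyfit is the least-squares line through the combined points;
# a perfect model would give slope 1 and intercept 0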
plt.plot(all_y, p(all_y), c='black', lw=2, linestyle='--')
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
plt.title(f'{target} - {model_name} (before optimization)', fontsize=14)
# Annotate metrics
plt.text(0.05, 0.9, f'training MAE = {train_mae:.3f}, $R^2$ = {train_r2:.3f}',
transform=plt.gca().transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
plt.text(0.05, 0.8, f'test set MAE = {test_mae:.3f}, $R^2$ = {test_r2:.3f}',
transform=plt.gca().transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
plt.legend(fontsize=10)
plt.tight_layout()
plt.savefig(f'{target}_{model_name}_before_optimization.png', dpi=300)
plt.show()
# 2. Bayesian optimization (objective: validation-set MAE)
print(f"\n{'='*30} {target} - Bayesian hyperparameter optimization {'='*30}")
# Hyperparameter search spaces (original ranges kept)
pbounds_dict = {
'GBR': {
'n_estimators': (50, 500),
'learning_rate': (0.001, 0.3),
'max_depth': (2, 10),
'min_samples_split': (2, 20),
'min_samples_leaf': (1, 10)
},
'SVR': {
'logC': (0, 3), # C: 1-1000
'logGamma': (-3, 1), # gamma: 0.001-10
'epsilon': (0.01, 0.1) # epsilon range
},
'DTR': {
'max_depth': (2, 15),
'min_samples_split': (2, 20),
'min_samples_leaf': (1, 15),
'max_features': (0.3, 1.0)
},
'RFR': {
'n_estimators': (50, 300),
'max_depth': (3, 10),
'min_samples_split': (3, 8),
'min_samples_leaf': (1, 5),
'max_features': (0.5, 1.0)
},
'GPR': {
'kernel_type': (0, 1), # 0 = RBF, 1 = Matern
'log_length_scale': (-1, 2), # length_scale: 0.1-100
'log_alpha': (-3.5, -1.5), # alpha: ~3e-4 to ~3e-2
'nu': (0.5, 2) # Matern kernel smoothness
}
}
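# C, gamma, length_scale and alpha are searched as base-10 exponents so the
# optimizer covers each order of magnitude evenly instead of spending most
# of its budget on the upper end of a linear range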
# Objective function: validation-set MAE (minimizing MAE == maximizing its negative)
def get_objective_function(model_name):
def objective_function(**params):
# Build the model
if model_name == 'GBR':
model = GradientBoostingRegressor(
n_estimators=int(params['n_estimators']),
learning_rate=params['learning_rate'],
max_depth=int(params['max_depth']),
min_samples_split=int(params['min_samples_split']),
min_samples_leaf=int(params['min_samples_leaf']),
random_state=42
)
elif model_name == 'SVR':
C_val = 10 ** params['logC']
gamma_val = 10 ** params['logGamma']
model = make_pipeline(
StandardScaler(),
SVR(kernel='rbf', C=C_val, gamma=gamma_val, epsilon=params['epsilon'])
)
elif model_name == 'DTR':
model = DecisionTreeRegressor(
max_depth=int(params['max_depth']),
min_samples_split=int(params['min_samples_split']),
min_samples_leaf=int(params['min_samples_leaf']),
max_features=params['max_features'],
random_state=42
)
elif model_name == 'RFR':
model = RandomForestRegressor(
n_estimators=int(params['n_estimators']),
max_depth=int(params['max_depth']),
min_samples_split=int(params['min_samples_split']),
min_samples_leaf=int(params['min_samples_leaf']),
max_features=params['max_features'],
random_state=42,
n_jobs=-1
)
elif model_name == 'GPR':
kernel_type = int(round(params['kernel_type']))
length_scale = 10 ** params['log_length_scale']
alpha_val = 10 ** params['log_alpha']
if kernel_type == 0:
kernel = RBF(length_scale=length_scale)
else:
kernel = Matern(
length_scale=length_scale,
nu=params['nu']
)
model = make_pipeline(
StandardScaler(),
GaussianProcessRegressor(
kernel=kernel,
alpha=alpha_val,
n_restarts_optimizer=5,
random_state=42
)
)
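# GPR's alpha adds noise to the diagonal of the kernel matrix (a regularizer),
# and n_restarts_optimizer refits the kernel hyperparameters from 5 extra
# random starts to avoid poor local optima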
# Fit on the training split, evaluate on the validation split
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val)
val_mae = mean_absolute_error(y_val, y_val_pred)
return -val_mae # bayes_opt maximizes, so return the negative MAE
return objective_function
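# The closure captures model_name because BayesianOptimization passes only
# the sampled hyperparameters (as keyword arguments) to the objective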
# Optimize each model
for model_name in model_names:
print(f"\n>>> 正在优化 {model_name} 模型...")
optimizer = BayesianOptimization(
f=get_objective_function(model_name),
pbounds=pbounds_dict[model_name],
random_state=42,
allow_duplicate_points=True
)
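# allow_duplicate_points keeps bayes_opt from raising when rounding or
# integer casting maps two suggested points onto the same configuration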
# Run the optimization (larger budget for a more thorough search)
optimizer.maximize(
init_points=15, # random initial points, raised from 10 to 15
n_iter=30 # guided iterations, raised from 20 to 30
)
# Retrieve the best parameters
best_params = optimizer.max['params']
best_val_mae = -optimizer.max['target'] # flip the sign back to MAE
best_params_dict[model_name] = best_params
print(f"\n{model_name} 最优超参数:")
# 特殊参数转换显示
if model_name == 'SVR':
print(f" C: {10**best_params['logC']:.4f} (logC: {best_params['logC']:.4f})")
print(f" gamma: {10**best_params['logGamma']:.6f} (logGamma: {best_params['logGamma']:.4f})")
print(f" epsilon: {best_params['epsilon']:.4f}")
elif model_name == 'GPR':
kernel_type = "RBF" if int(round(best_params['kernel_type'])) == 0 else "Matern"
print(f" 核类型: {kernel_type}")
print(f" length_scale: {10**best_params['log_length_scale']:.4f} (log: {best_params['log_length_scale']:.4f})")
print(f" alpha: {10**best_params['log_alpha']:.6f} (log: {best_params['log_alpha']:.4f})")
if kernel_type == "Matern":
print(f" nu: {best_params['nu']:.4f}")
else:
for param, value in best_params.items():
if param in ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']:
print(f" {param}: {int(value)}")
else:
print(f" {param}: {value:.6f}")
print(f"最优验证集 MAE: {best_val_mae:.4f}")
# 3. 优化后模型评估(测试集指标为主)
print(f"\n{'='*30} {target} - 优化后模型性能 {'='*30}")
for model_name in model_names:
best_params = best_params_dict[model_name]
# Rebuild the model with the best hyperparameters (same construction logic as the objective function)
if model_name == 'GBR':
optimized_model = GradientBoostingRegressor(
n_estimators=int(best_params['n_estimators']),
learning_rate=best_params['learning_rate'],
max_depth=int(best_params['max_depth']),
min_samples_split=int(best_params['min_samples_split']),
min_samples_leaf=int(best_params['min_samples_leaf']),
random_state=42
)
elif model_name == 'SVR':
C_val = 10 ** best_params['logC']
gamma_val = 10 ** best_params['logGamma']
optimized_model = make_pipeline(
StandardScaler(),
SVR(kernel='rbf', C=C_val, gamma=gamma_val, epsilon=best_params['epsilon'])
)
elif model_name == 'DTR':
optimized_model = DecisionTreeRegressor(
max_depth=int(best_params['max_depth']),
min_samples_split=int(best_params['min_samples_split']),
min_samples_leaf=int(best_params['min_samples_leaf']),
max_features=best_params['max_features'],
random_state=42
)
elif model_name == 'RFR':
optimized_model = RandomForestRegressor(
n_estimators=int(best_params['n_estimators']),
max_depth=int(best_params['max_depth']),
min_samples_split=int(best_params['min_samples_split']),
min_samples_leaf=int(best_params['min_samples_leaf']),
max_features=best_params['max_features'],
random_state=42,
n_jobs=-1
)
elif model_name == 'GPR':
kernel_type = int(round(best_params['kernel_type']))
length_scale = 10 ** best_params['log_length_scale']
alpha_val = 10 ** best_params['log_alpha']
if kernel_type == 0:
kernel = RBF(length_scale=length_scale)
else:
kernel = Matern(
length_scale=length_scale,
nu=best_params['nu']
)
optimized_model = make_pipeline(
StandardScaler(),
GaussianProcessRegressor(
kernel=kernel,
alpha=alpha_val,
n_restarts_optimizer=5,
random_state=42
)
)
# Train on the full training set (train + validation); the validation split
# has served its purpose for hyperparameter selection
optimized_model.fit(X_train_full, y_train_full)
# Predict on the test set
y_test_pred = optimized_model.predict(X_test)
# Predict on the training set (optional)
y_train_full_pred = optimized_model.predict(X_train_full)
# Compute metrics
train_mae = mean_absolute_error(y_train_full, y_train_full_pred)
train_r2 = r2_score(y_train_full, y_train_full_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
# Cross-validation (optional)
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
cv_score = cross_val_score(optimized_model, X_train_full, y_train_full, cv=cv, scoring='neg_mean_absolute_error').mean()
cv_mae = -cv_score
# Store performance
model_performance_after[model_name] = {
'Train MAE': train_mae,
'Train R2': train_r2,
'Test MAE': test_mae,
'Test R2': test_r2,
'CV MAE': cv_mae
}
# Print results
print(f"\n{model_name} optimized model performance:")
print(f" Training set: MAE = {train_mae:.4f}, R² = {train_r2:.4f}")
print(f" Test set: MAE = {test_mae:.4f}, R² = {test_r2:.4f}")
print(f" Cross-validation MAE = {cv_mae:.4f}")
# Plot the optimized fit (training + test sets)
plt.figure(figsize=(8, 6))
plt.scatter(y_train_full, y_train_full_pred, alpha=0.7, label='training set', c='blue')
plt.scatter(y_test, y_test_pred, alpha=0.7, label='test set', c='red')
# Trend line
all_y = np.concatenate([y_train_full, y_test])
all_pred = np.concatenate([y_train_full_pred, y_test_pred])
z = np.polyfit(all_y, all_pred, 1)
p = np.poly1d(z)
plt.plot(all_y, p(all_y), c='black', lw=2, linestyle='--')
plt.xlabel('actual', fontsize=12)
plt.ylabel('predicted', fontsize=12)
plt.title(f'{target} - {model_name} (after optimization)', fontsize=14)
# Annotate metrics
plt.text(0.05, 0.9, f'training MAE = {train_mae:.3f}, $R^2$ = {train_r2:.3f}',
transform=plt.gca().transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
plt.text(0.05, 0.8, f'test set MAE = {test_mae:.3f}, $R^2$ = {test_r2:.3f}',
transform=plt.gca().transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
plt.legend(fontsize=10)
plt.tight_layout()
plt.savefig(f'{target}_{model_name}_after_optimization.png', dpi=300)
plt.show()
# 4. Performance comparison (test-set metrics are primary)
print(f"\n{'='*30} {target} - model performance before vs. after optimization {'='*30}")
# Build the comparison data
comparison_data = []
for model_name in model_names:
before = model_performance_before[model_name]
after = model_performance_after[model_name]
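# Worked example with hypothetical numbers: Test MAE 0.050 -> 0.040 is a 20%
# relative MAE improvement; Test R² 0.85 -> 0.90 is a +5 pp gain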
# Improvement rates
mae_test_improve = (before['Test MAE'] - after['Test MAE']) / before['Test MAE'] * 100 if before['Test MAE'] != 0 else 0
r2_test_improve = (after['Test R2'] - before['Test R2']) * 100 # percentage points
comparison_data.append({
'Model': model_name,
'Test MAE (before)': before['Test MAE'],
'Test MAE (after)': after['Test MAE'],
'MAE improvement (%)': mae_test_improve,
'Test R² (before)': before['Test R2'],
'Test R² (after)': after['Test R2'],
'R² improvement (pp)': r2_test_improve,
'CV MAE (before)': before['CV MAE'],
'CV MAE (after)': after['CV MAE']
})
comparison_df = pd.DataFrame(comparison_data)
print("\n模型性能对比表:")
print(comparison_df.round(4))
# 保存对比结果
comparison_df.to_csv(f'model_comparison_{target}.csv', index=False)
# 绘制测试集MAE对比
plt.figure(figsize=(12, 8))
width = 0.35
x = np.arange(len(model_names))
plt.bar(x - width/2, comparison_df['Test MAE (before)'], width, label='before optimization', color='skyblue')
plt.bar(x + width/2, comparison_df['Test MAE (after)'], width, label='after optimization', color='lightcoral')
# Data labels
for i, (before, after) in enumerate(zip(comparison_df['Test MAE (before)'], comparison_df['Test MAE (after)'])):
plt.text(i - width/2, before + 0.005, f'{before:.4f}', ha='center')
plt.text(i + width/2, after + 0.005, f'{after:.4f}', ha='center')
# Improvement percentage
improvement = (before - after) / before * 100 if before != 0 else 0
plt.text(i, max(before, after) + 0.01, f'{improvement:.1f}%',
ha='center', fontsize=10, color='green' if improvement > 0 else 'red')
plt.xlabel('Model', fontsize=12)
plt.ylabel('test set MAE', fontsize=12)
plt.title(f'{target} - before/after optimization test set MAE comparison', fontsize=14)
plt.xticks(x, model_names)
plt.legend()
plt.tight_layout()
plt.savefig(f'{target}_MAE_comparison.png', dpi=300)
plt.show()
# Plot the test-set R² comparison
plt.figure(figsize=(12, 8))
plt.bar(x - width/2, comparison_df['Test R² (before)'], width, label='before optimization', color='skyblue')
plt.bar(x + width/2, comparison_df['Test R² (after)'], width, label='after optimization', color='lightcoral')
# Data labels
for i, (before, after) in enumerate(zip(comparison_df['Test R² (before)'], comparison_df['Test R² (after)'])):
plt.text(i - width/2, before + 0.01, f'{before:.4f}', ha='center')
plt.text(i + width/2, after + 0.01, f'{after:.4f}', ha='center')
# Improvement in percentage points
improvement = (after - before) * 100
plt.text(i, max(before, after) + 0.02, f'+{improvement:.1f}pp' if improvement > 0 else f'{improvement:.1f}pp',
ha='center', fontsize=10, color='green' if improvement > 0 else 'red')
plt.xlabel('Model', fontsize=12)
plt.ylabel('test set R²', fontsize=12)
plt.title(f'{target} - before/after optimization test set R² comparison', fontsize=14)
plt.xticks(x, model_names)
plt.legend()
plt.tight_layout()
plt.savefig(f'{target}_R2_comparison.png', dpi=300)
plt.show()
# CV MAE comparison (optional)
all_cv_scores = []
for model_name in model_names:
before_score = model_performance_before[model_name]['CV MAE']
after_score = model_performance_after[model_name]['CV MAE']
all_cv_scores.append([model_name + '_before', before_score])
all_cv_scores.append([model_name + '_after', after_score])
cv_df = pd.DataFrame(all_cv_scores, columns=['Model', 'CV MAE'])
plt.figure(figsize=(12, 6))
# Each entry carries a single CV-MAE value, so a bar plot is the right chart
# (a box plot would degenerate with one observation per group)
sns.barplot(x='Model', y='CV MAE', data=cv_df)
plt.xticks(rotation=45, ha='right')
plt.title('Cross-Validation MAE Comparison Before and After Optimization')
plt.ylabel('MAE')
plt.tight_layout()
plt.savefig(f'{target}_cv_comparison.png', dpi=300)
plt.show()
print("\n所有模型优化完成!")修改上述贝叶斯优化模型前后的结果比较,不适用交叉验证