关于sklearn中GridSearchCV从sklearn.grid_search中移除的问题

在 sklearn 0.20 及之后的版本中(sklearn.grid_search 模块自 0.18 起弃用,0.20 起被移除),使用网格搜索(grid search)寻找最优参数模型时,

模块加载从原来的 from sklearn.grid_search import GridSearchCV

改为了 from sklearn.model_selection import GridSearchCV

``` import pandas as pd from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.ensemble import GradientBoostingRegressor from sklearn.metrics import mean_squared_error, r2_score import numpy as np # 加载数据 df = pd.read_excel('集胞藻-Cd.xlsx') # 确保文件路径正确 # 清理列名,移除列名中的空格 df.columns = df.columns.str.strip() # 定义特征列和目标列 features =['T','Ph','Biomass','Time','Initial'] target_column = 'Removal' # 提取特征和目标数据 X = df[features] y = df[target_column] # 分割数据为训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # # 保存训练集 # train_dataset = pd.concat([X_train, y_train], axis=1) # train_dataset.to_csv('train_dataset.csv', index=False) # # # 保存测试集 # test_dataset = pd.concat([X_test, y_test], axis=1) # test_dataset.to_csv('test_dataset.csv', index=False) # 定义要搜索的参数网格 param_grid = { 'n_estimators': [80, 100, 120], 'learning_rate': [0.1, 0.2], # 降低学习率 'max_depth': [3,4], # 减少树深度 'min_samples_split': [5,10], # 新增分裂最小样本数 'subsample': [0.8,0.9], # 新增样本子采样 'max_features': [0.8,0.9] # 新增特征子采样 } # 创建梯度提升回归模型实例 gbr = GradientBoostingRegressor(random_state=42) # 使用 GridSearchCV 进行参数调优 grid_search = GridSearchCV( estimator=gbr, param_grid=param_grid, cv=10, # $\Delta$ 将cv从80改为10折交叉验证 scoring='neg_mean_squared_error', n_jobs=-1, # $\Delta$ 启用并行计算 verbose=2 ) grid_search.fit(X_train, y_train) gbr = GradientBoostingRegressor( random_state=42, n_iter_no_change=5 # $\Delta$ 添加早停条件 ) # 找到最佳参数组合 print("Best parameters:", grid_search.best_params_) # 使用最佳参数的模型在训练集和测试集上进行预测 y_train_pred = grid_search.predict(X_train) y_test_pred = grid_search.predict(X_test) # 计算并打印训练集和测试集的均方误差 (MSE)、均方根误差 (RMSE) 和 R^2 值 mse_train = mean_squared_error(y_train, y_train_pred) rmse_train = np.sqrt(mse_train) r2_train = r2_score(y_train, y_train_pred) mse_test = mean_squared_error(y_test, y_test_pred) rmse_test = np.sqrt(mse_test) r2_test = r2_score(y_test, y_test_pred) print(f"训练集 MSE: {mse_train}") print(f"训练集 RMSE: {rmse_train}") print(f"训练集 R^2: 
{r2_train}") print(f"测试集 MSE: {mse_test}") print(f"测试集 RMSE: {rmse_test}") print(f"测试集 R^2: {r2_test}") # 保存结果 results_df = pd.DataFrame({ '数据集': ['训练集', '测试集'], 'MSE': [mse_train, mse_test], 'RMSE': [rmse_train, rmse_test], 'R²': [r2_train, r2_test] }) results_df.to_excel('结果/集胞藻-Cd模型评估结果.xlsx', index=False)```分析上述代码
03-09
import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from pathlib import Path excel_path = "C:/Users/Administrator/Desktop/data2.xlsx" data = pd.read_excel(excel_path, sheet_name='Sheet1') x = data[['掺氨比', '总热输入', '分级比', '主燃区温度']] y = data['NOx排放浓度'] cat_cols = data.select_dtypes(include=['object']).columns for col in cat_cols: data[col] = le.fit_transform(data[col]) X = data.drop('NOx排放浓度', axis=1) y = data['NOx排放浓度'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) import xgboost as xgb dtrain = xgb.DMatrix(X_train, label=y_train) dtest = xgb.DMatrix(X_test, label=y_test) params = { 'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'eta': 0.1, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8 } model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train'), (dtest, 'test')], early_stopping_rounds=10) y_pred = model.predict(dtest) from sklearn.metrics import mean_squared_error, r2_score from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}") print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.2f}") print(f"R²: {r2_score(y_test, y_pred):.2%}") import matplotlib.pyplot as plt plt.rcParams['font.sans-serif'] = ['SimHei'] # 指定默认字体为黑体 plt.rcParams['axes.unicode_minus'] = False xgb.plot_importance(model) plt.show() 评估结果
03-08
import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error, r2_score from scipy.optimize import minimize import matplotlib.pyplot as plt from pathlib import Path ------------------ 数据预处理 ------------------ 设置文件路径 excel_path = Path(r"C:\Users\Administrator\Desktop\掺氨比、燃尽风位置、主燃区温度\扩展数据_KDE.xlsx") 读取数据 data = pd.read_excel(excel_path, sheet_name=‘Sheet1’) 特征工程处理 检查并转换分类变量(仅对真正需要编码的列进行处理) cat_cols = [] le = LabelEncoder() 假设’燃尽风位置’是分类变量,进行编码 if data[‘燃尽风位置’].dtype == ‘object’: data[‘燃尽风位置’] = le.fit_transform(data[‘燃尽风位置’]) cat_cols.append(‘燃尽风位置’) 确保温度保持为连续数值(移除之前的字符串转换) X = data[[‘掺氨比’, ‘燃尽风位置’, ‘主燃区温度’]] y = data[‘NO排放浓度’] ------------------ 模型训练 ------------------ 划分训练测试集 X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) 新增导入 from sklearn.model_selection import GridSearchCV # 导入网格搜索模块 ------------------ 模型训练(修改部分)------------------ 定义参数搜索空间 param_grid = { ‘n_estimators’: [100, 200, 300, 400], # 树的数量候选值 ‘max_depth’: [5, 10, 15, None], # 树的最大深度 ‘min_samples_split’: [2, 5, 10], # 节点分裂最小样本数 ‘min_samples_leaf’: [1, 2, 4] # 叶节点最小样本数 } 创建基础模型 rf_base = RandomForestRegressor(random_state=42) 创建网格搜索对象 grid_search = GridSearchCV( estimator=rf_base, param_grid=param_grid, scoring=‘neg_mean_squared_error’, # 使用负均方误差作为评估指标 cv=5, # 5折交叉验证 n_jobs=-1, # 使用全部CPU核心 verbose=2 # 显示详细日志 ) 执行网格搜索 grid_search.fit(X_train, y_train) 获取最优模型 rf = grid_search.best_estimator_ # 替换原有模型 print(f"交叉验证平均MSE: {-grid_search.best_score_:.2f}") ------------------ 模型评估 ------------------ y_pred = rf.predict(X_test) print(“模型性能评估:”) print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}“) print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.2f}”) print(f"R²: {r2_score(y_test, y_pred):.2%}") 可视化特征重要性(排序后) plt.rcParams[‘font.sans-serif’] = [‘SimHei’] 
plt.rcParams[‘axes.unicode_minus’] = False importances = rf.feature_importances_ sorted_idx = importances.argsort() plt.figure(figsize=(10,6)) plt.barh(X.columns[sorted_idx], importances[sorted_idx]) plt.title(“随机森林特征重要性排序”) plt.show() ------------------ 优化参数配置 ------------------ param_constraints = { ‘掺氨比’: (0.0, 100.0), # 氨燃料掺混比例 ‘燃尽风位置’: (min(X[‘燃尽风位置’]), max(X[‘燃尽风位置’])), # 编码后的范围 ‘主燃区温度’: (800, 1600) # 温度范围(℃) } ------------------ 优化函数定义 ------------------ def predict_no_emission(params): “”“包装预测函数,确保输入格式正确”“” input_df = pd.DataFrame([params], columns=X.columns) return rf.predict(input_df)[0] 初始猜测值(取训练数据中位数,更鲁棒) initial_guess = X.median().values 参数边界设置 bounds = [ param_constraints[‘掺氨比’], param_constraints[‘燃尽风位置’], param_constraints[‘主燃区温度’] ] ------------------ 执行优化 ------------------ result = minimize( fun=predict_no_emission, x0=initial_guess, method=‘SLSQP’, bounds=bounds, options={‘maxiter’: 500, ‘ftol’: 1e-8} # 增加迭代次数和精度 ) ------------------ 结果处理 ------------------ optimized_params = result.x.copy() 对分类变量进行逆向解码 if ‘燃尽风位置’ in cat_cols: position_idx = X.columns.get_loc(‘燃尽风位置’) # 四舍五入到最近的整数编码值 encoded_value = int(round(optimized_params[position_idx])) # 确保超出原始编码范围 encoded_value = np.clip(encoded_value, min(data[‘燃尽风位置’]), max(data[‘燃尽风位置’])) optimized_params[position_idx] = le.inverse_transform([encoded_value])[0] ------------------ 结果展示 ------------------ print(“\n最优参数组合:”) print(f"最小NO排放浓度预测值:{result.fun:.2f} mg/m³") print(f"掺氨比:{optimized_params[0]:.3f}“) print(f"燃尽风位置:{optimized_params[1]}”) print(f"主燃区温度:{optimized_params[2]:.0f}℃")改为线性回归算法并展示完整代码
最新发布
03-20
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值