After installing XGBoost, calling its functions raises AttributeError: module 'xgboost' has no attribute 'feature_importance_'

This post works through a problem encountered when calling XGBoost's feature-importance function: it shows the error message and explains the likely cause, which should be a useful reference for readers using XGBoost for feature selection.

After installing XGBoost, you call its functions, for example:

import xgboost as xgb
from xgboost import *
import sklearn as skl
from sklearn import *
from sklearn.preprocessing import LabelEncoder

booster = xgb.feature_importance_()

and the following error appears:

AttributeError: module 'xgboost' has no attribute 'feature_importance_'

Working through the problem: the cause and the fix are as follows.
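The error is expected: feature_importance_ is not an attribute of the xgboost module at all. Feature importance in XGBoost is exposed on a trained model, not on the package itself: the scikit-learn wrappers (XGBClassifier / XGBRegressor) provide a feature_importances_ attribute (note the trailing "s") after fit(), and the native Booster object provides get_score() / get_fscore(). Calling xgb.feature_importance_() at module level therefore raises the AttributeError above. The sketch below shows the intended usage; the synthetic dataset and hyperparameters are placeholders chosen only for illustration.

```python
import numpy as np
import xgboost as xgb

# Placeholder data purely for illustration (not from the original post)
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))                  # 200 samples, 5 features
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)  # binary target

# scikit-learn style API: importance lives on the *fitted* estimator,
# under the attribute name feature_importances_ (with a trailing "s")
model = xgb.XGBClassifier(n_estimators=50, max_depth=3)
model.fit(X, y)
print(model.feature_importances_)              # one score per feature, in column order

# Native Booster API: get_score() returns {feature_name: importance}
booster = model.get_booster()
print(booster.get_score(importance_type="gain"))
```

xgboost.plot_importance(model) plots the same scores directly if matplotlib is installed. If you have only imported the package and have not trained anything yet, there is no importance to query, which is why the attribute does not exist at module level.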
