def preprocess_data(self):
    """Open the selective data-preprocessing dialog for ``self.df``.

    The dialog offers four optional steps: missing-value handling, outlier
    handling, time-column conversion and lag-feature creation.  Every step
    has its own "enable" checkbox; only enabled steps are executed when the
    user presses the execute button.  Section bodies can be collapsed or
    expanded by clicking the section border/title or with the two global
    buttons.

    All steps run on a copy of ``self.df``; the result is assigned back to
    ``self.df`` only after the whole pipeline has finished, so an error in
    an early step never leaves a half-processed frame behind.  Afterwards
    ``self.show_preview()`` is called and ``self.status_var`` is updated.

    Shows a warning and returns immediately when ``self.df`` is ``None``.
    """
    if self.df is None:
        messagebox.showwarning("警告", "请先选择Excel文件")
        return

    preprocess_window = tk.Toplevel(self.root)
    preprocess_window.title("数据预处理中心(可选模式)")
    preprocess_window.geometry("650x750")

    # ------------------------------------------------------------------
    # Scrollable main container
    # ------------------------------------------------------------------
    # The scrollbar is packed first so it keeps its width when the window
    # is made narrower.
    scrollbar = ttk.Scrollbar(preprocess_window, orient=tk.VERTICAL)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    main_canvas = tk.Canvas(preprocess_window)
    main_canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    scrollbar.configure(command=main_canvas.yview)
    main_canvas.configure(yscrollcommand=scrollbar.set)

    main_frame = ttk.Frame(main_canvas)
    window_id = main_canvas.create_window((0, 0), window=main_frame, anchor="nw")
    # The scroll region depends on the size of the *inner* frame, so it has
    # to be refreshed whenever that frame is resized (e.g. on collapse).
    main_frame.bind(
        "<Configure>",
        lambda _e: main_canvas.configure(scrollregion=main_canvas.bbox("all")),
    )
    # Keep the inner frame as wide as the canvas so fill=tk.X has an effect.
    main_canvas.bind(
        "<Configure>",
        lambda e: main_canvas.itemconfigure(window_id, width=e.width),
    )

    # ------------------------------------------------------------------
    # Widget helpers
    # ------------------------------------------------------------------
    section_bodies = []  # body frame of every collapsible section

    def set_children_enabled(container, enabled):
        """Recursively enable/disable every widget below *container*.

        ttk frames have no ``state`` option, so the state is applied to the
        individual child widgets instead of to the frame itself.
        """
        for child in container.winfo_children():
            if isinstance(child, tk.Text):
                # The statistics box is read-only and must stay disabled.
                continue
            if isinstance(child, ttk.Widget):
                child.state(["!disabled"] if enabled else ["disabled"])
            else:
                try:
                    child.configure(state=tk.NORMAL if enabled else tk.DISABLED)
                except tk.TclError:
                    # Classic containers/scrollbars have no "state" option.
                    pass
            set_children_enabled(child, enabled)

    def set_collapsed(body, collapsed):
        """Hide (*collapsed* true) or show the body frame of a section."""
        if collapsed:
            body.pack_forget()
        else:
            body.pack(fill=tk.X)

    def toggle_collapsed(body):
        """Collapse a visible section body, expand a hidden one."""
        # winfo_manager() is "" while the widget is not packed.
        set_collapsed(body, collapsed=bool(body.winfo_manager()))

    def add_section(title, checkbox_text):
        """Create one collapsible section; return (body frame, enable var)."""
        frame = ttk.LabelFrame(main_frame, text=title, padding=10)
        frame.pack(fill=tk.X, pady=5)
        enabled_var = tk.BooleanVar(preprocess_window, value=False)
        body = ttk.Frame(frame)
        ttk.Checkbutton(
            frame,
            text=checkbox_text,
            variable=enabled_var,
            command=lambda: set_children_enabled(body, enabled_var.get()),
        ).pack(anchor=tk.W)
        body.pack(fill=tk.X)
        # Fires for clicks on the label frame itself (title/border/padding).
        frame.bind("<Button-1>", lambda _e: toggle_collapsed(body))
        section_bodies.append(body)
        return body, enabled_var

    # ------------------------------------------------------------------
    # Global controls
    # ------------------------------------------------------------------
    global_control_frame = ttk.LabelFrame(main_frame, text="全局控制", padding=10)
    global_control_frame.pack(fill=tk.X, pady=5)

    def expand_all():
        for body in section_bodies:
            set_collapsed(body, False)

    def collapse_all():
        for body in section_bodies:
            set_collapsed(body, True)

    ttk.Button(global_control_frame, text="展开所有选项", command=expand_all).pack(side=tk.LEFT)
    ttk.Button(global_control_frame, text="折叠所有选项", command=collapse_all).pack(side=tk.LEFT)

    # ------------------------------------------------------------------
    # 1. Missing values
    # ------------------------------------------------------------------
    missing_subframe, missing_enable = add_section("1. 缺失值处理 (点击展开)", "启用缺失值处理")

    # Per-column count of missing values (only columns that have any).
    missing_stats = self.df.isnull().sum()
    missing_text = scrolledtext.ScrolledText(missing_subframe, height=4)
    missing_text.pack(fill=tk.X)
    for col, count in missing_stats[missing_stats > 0].items():
        missing_text.insert(tk.END, f"{col}: {count}个缺失值\n")
    missing_text.config(state=tk.DISABLED)

    ttk.Label(missing_subframe, text="处理方法:").pack(anchor=tk.W)
    missing_method_var = tk.StringVar(preprocess_window, value="fill")
    for text, value in (
        ("删除缺失行", "drop"),
        ("填充缺失值", "fill"),
        ("插值法处理", "interpolate"),
    ):
        ttk.Radiobutton(
            missing_subframe, text=text, variable=missing_method_var, value=value
        ).pack(anchor=tk.W)

    fill_frame = ttk.LabelFrame(missing_subframe, text="填充选项", padding=5)
    fill_frame.pack(fill=tk.X, pady=5)
    fill_type_var = tk.StringVar(preprocess_window, value="fixed")
    ttk.Radiobutton(fill_frame, text="固定值:", variable=fill_type_var, value="fixed").pack(side=tk.LEFT)
    fill_value_entry = ttk.Entry(fill_frame, width=10)
    fill_value_entry.pack(side=tk.LEFT, padx=5)
    fill_value_entry.insert(0, "0")
    for text, value in (("前向填充", "ffill"), ("后向填充", "bfill"), ("均值填充", "mean")):
        ttk.Radiobutton(
            fill_frame, text=text, variable=fill_type_var, value=value
        ).pack(side=tk.LEFT, padx=5)

    # ------------------------------------------------------------------
    # 2. Outliers
    # ------------------------------------------------------------------
    outlier_subframe, outlier_enable = add_section("2. 异常值处理 (点击展开)", "启用异常值处理")

    ttk.Label(outlier_subframe, text="检测方法:").pack(anchor=tk.W)
    outlier_method_var = tk.StringVar(preprocess_window, value="3sigma")
    for text, value in (("3σ原则", "3sigma"), ("IQR方法", "iqr")):
        ttk.Radiobutton(
            outlier_subframe, text=text, variable=outlier_method_var, value=value
        ).pack(anchor=tk.W)

    ttk.Label(outlier_subframe, text="处理方式:").pack(anchor=tk.W)
    outlier_action_var = tk.StringVar(preprocess_window, value="remove")
    for text, value in (
        ("删除异常值", "remove"),
        ("用中位数替换", "median"),
        ("用前后均值替换", "neighbor"),
    ):
        ttk.Radiobutton(
            outlier_subframe, text=text, variable=outlier_action_var, value=value
        ).pack(anchor=tk.W)

    # ------------------------------------------------------------------
    # 3. Time column conversion
    # ------------------------------------------------------------------
    time_subframe, time_enable = add_section("3. 时间列转换 (点击展开)", "启用时间列转换")

    ttk.Label(time_subframe, text="选择时间列:").pack(anchor=tk.W)
    time_col_var = tk.StringVar(preprocess_window)
    # Read-only: only existing columns can be chosen.
    time_col_combo = ttk.Combobox(
        time_subframe, textvariable=time_col_var, width=25, state="readonly"
    )
    time_col_combo['values'] = tuple(str(c) for c in self.df.columns)
    time_col_combo.pack(anchor=tk.W, pady=5)

    # ------------------------------------------------------------------
    # 4. Feature engineering
    # ------------------------------------------------------------------
    feature_subframe, feature_enable = add_section("4. 特征工程 (点击展开)", "启用特征工程")

    lag_frame = ttk.LabelFrame(feature_subframe, text="滞后特征", padding=5)
    lag_frame.pack(fill=tk.X, pady=5)
    ttk.Label(lag_frame, text="选择列:").pack(side=tk.LEFT)
    lag_col_var = tk.StringVar(preprocess_window)
    lag_col_combo = ttk.Combobox(
        lag_frame, textvariable=lag_col_var, width=15, state="readonly"
    )
    lag_col_combo['values'] = tuple(
        str(c) for c in self.df.select_dtypes(include=['number']).columns
    )
    lag_col_combo.pack(side=tk.LEFT, padx=5)
    ttk.Label(lag_frame, text="滞后步数:").pack(side=tk.LEFT)
    lag_steps_entry = ttk.Entry(lag_frame, width=5)
    lag_steps_entry.pack(side=tk.LEFT)
    lag_steps_entry.insert(0, "1")

    # Every section starts disabled until its checkbox is ticked.
    for body in section_bodies:
        set_children_enabled(body, False)

    # ------------------------------------------------------------------
    # Data helpers (operate on a working copy, never on self.df directly)
    # ------------------------------------------------------------------
    def resolve_column(df, label):
        """Map the combobox text back to the real column label.

        Tk hands back strings, while column labels may be ints or other
        objects, so the lookup goes through ``str(label)``.
        """
        by_text = {str(c): c for c in df.columns}
        if label not in by_text:
            raise KeyError(label)
        return by_text[label]

    def handle_missing(df, method, fill_type, raw_fill_value):
        """Apply the selected missing-value strategy and return the frame."""
        if method == "drop":
            return df.dropna()
        if method == "interpolate":
            # Interpolation is only defined for numeric data.
            numeric_cols = df.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 0:
                df[numeric_cols] = df[numeric_cols].interpolate()
            return df
        if method == "fill":
            if fill_type == "fixed":
                has_numeric = df.select_dtypes(include=['number']).shape[1] > 0
                fill_value = float(raw_fill_value) if has_numeric else raw_fill_value
                return df.fillna(fill_value)
            if fill_type == "ffill":
                return df.ffill()
            if fill_type == "bfill":
                return df.bfill()
            if fill_type == "mean":
                # numeric_only avoids a TypeError on text/datetime columns.
                return df.fillna(df.mean(numeric_only=True))
        return df

    def handle_outliers(df, method, action):
        """Detect outliers per numeric column and remove or replace them.

        Bounds for every column are computed on the data as it was before
        any row is removed, so the result does not depend on column order.
        Missing values are never treated as outliers.
        """
        keep_rows = pd.Series(True, index=df.index)
        for col in df.select_dtypes(include=['number']).columns:
            series = df[col]
            if method == "3sigma":
                mean, std = series.mean(), series.std()
                lower, upper = mean - 3 * std, mean + 3 * std
            else:  # iqr
                q1, q3 = series.quantile(0.25), series.quantile(0.75)
                iqr = q3 - q1
                lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            # Comparisons with NaN (values or bounds) are False, i.e. "keep".
            is_outlier = (series < lower) | (series > upper)
            if not is_outlier.any():
                continue
            if action == "remove":
                keep_rows &= ~is_outlier
            elif action == "median":
                df[col] = series.mask(is_outlier, series.median())
            elif action == "neighbor":
                # Blank the outliers, then interpolate linearly: an isolated
                # outlier becomes the mean of its previous and next value;
                # outliers at the ends take the nearest valid value.
                repaired = series.mask(is_outlier).interpolate(limit_direction="both")
                df[col] = series.mask(is_outlier, repaired)
        if keep_rows.all():
            return df
        return df.loc[keep_rows.to_numpy()].copy()

    def add_time_parts(df, label):
        """Convert the chosen column to datetime and add year/month/day."""
        col = resolve_column(df, label)
        # Parse first so a failed conversion leaves the frame untouched.
        parsed = pd.to_datetime(df[col])
        df[col] = parsed
        df['year'] = parsed.dt.year
        df['month'] = parsed.dt.month
        df['day'] = parsed.dt.day
        return df

    def add_lag_feature(df, label, raw_steps):
        """Add ``<label>_lag<steps>`` holding the column shifted by *steps*."""
        col = resolve_column(df, label)
        lag_steps = int(raw_steps)
        df[f'{label}_lag{lag_steps}'] = df[col].shift(lag_steps)
        return df

    # ------------------------------------------------------------------
    # Execute / cancel buttons
    # ------------------------------------------------------------------
    button_frame = ttk.Frame(main_frame)
    button_frame.pack(fill=tk.X, pady=10)

    def apply_preprocessing():
        """Run every enabled step and publish the result to ``self.df``."""
        # Broad excepts are deliberate here: this is the UI boundary and
        # every failure is reported to the user in a message box.
        try:
            original_shape = self.df.shape
            df = self.df.copy()

            # 1. Missing values
            if missing_enable.get():
                df = handle_missing(
                    df,
                    missing_method_var.get(),
                    fill_type_var.get(),
                    fill_value_entry.get(),
                )

            # 2. Outliers
            if outlier_enable.get():
                df = handle_outliers(df, outlier_method_var.get(), outlier_action_var.get())

            # 3. Time column (a failure only skips this step)
            if time_enable.get() and time_col_var.get():
                try:
                    df = add_time_parts(df, time_col_var.get())
                except Exception as e:
                    messagebox.showwarning("时间转换警告", f"时间列转换失败: {str(e)}")

            # 4. Lag feature (a failure only skips this step)
            if feature_enable.get() and lag_col_var.get():
                try:
                    df = add_lag_feature(df, lag_col_var.get(), lag_steps_entry.get())
                except Exception as e:
                    messagebox.showwarning("滞后特征警告", f"创建滞后特征失败: {str(e)}")

            # Commit and refresh the display.
            self.df = df
            self.show_preview()
            new_shape = self.df.shape
            self.status_var.set(f"预处理完成 | 原形状: {original_shape} | 新形状: {new_shape}")
        except Exception as e:
            messagebox.showerror("预处理错误", f"预处理过程中发生错误:\n{str(e)}")

    ttk.Button(
        button_frame,
        text="执行选中的预处理",
        command=apply_preprocessing,
        style='Accent.TButton',
    ).pack(side=tk.LEFT, padx=5)
    ttk.Button(button_frame, text="取消", command=preprocess_window.destroy).pack(side=tk.LEFT, padx=5)
最新发布