“order”运算符后缺少操作数。dt.Select order by排序

修复DataTable排序错误
本文解决了一个关于使用DataTable进行数据排序时出现的“order”运算符后缺少操作数的问题。通过调整select方法的参数顺序,成功实现了基于指定字段的升序排列。
部署运行你感兴趣的模型镜像

在使用DataTable.select进行排序时,直接写DataRow[] drr = dt.Select("POSTPARID='" + node.Value.ToString() + "' order by  DISPINDEX ASC");

这样写会报如题错误:“order”运算符后缺少操作数。

Select("过滤条件","排序列")

所以应该改成DataRow[] drr = dt.Select("POSTPARID='" + node.Value.ToString() + "'", " DISPINDEX ASC")即可



您可能感兴趣的与本文相关的镜像

Stable-Diffusion-3.5

Stable-Diffusion-3.5

图片生成
Stable-Diffusion

Stable Diffusion 3.5 (SD 3.5) 是由 Stability AI 推出的新一代文本到图像生成模型,相比 3.0 版本,它提升了图像质量、运行速度和硬件效率

import tkinter as tk from tkinter import ttk, filedialog, messagebox, scrolledtext import pandas as pd import numpy as np import os import re class ExcelProcessor: def __init__(self, root): self.root = root self.root.title("Excel数据处理器") self.root.geometry("900x600") self.df = None self.create_widgets() def create_widgets(self): # 顶部控制区域 control_frame = ttk.Frame(self.root, padding=10) control_frame.pack(fill=tk.X) # 文件选择按钮 ttk.Button(control_frame, text="选择Excel文件", command=self.load_excel).grid(row=0, column=0, padx=5) # 处理方式选择 self.process_var = tk.StringVar(value="点击选择") ttk.Label(control_frame, text="处理方式:").grid(row=0, column=1, padx=5) process_combo = ttk.Combobox(control_frame, textvariable=self.process_var, width=15) process_combo['values'] = ( '统计', '预处理','数据排序', '数据切分','保存结果') process_combo.grid(row=0, column=2, padx=5) # 执行按钮 ttk.Button(control_frame, text="执行", command=self.process_data).grid(row=0, column=3, padx=5) # 数据展示区域 self.notebook = ttk.Notebook(self.root) self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 预览标签页 self.preview_frame = ttk.Frame(self.notebook) self.notebook.add(self.preview_frame, text="数据展示") # 统计标签页 self.stats_frame = ttk.Frame(self.notebook) self.notebook.add(self.stats_frame, text="统计信息") # 状态栏 self.status_var = tk.StringVar(value="就绪") ttk.Label(self.root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W).pack(fill=tk.X, side=tk.BOTTOM) def load_excel(self): """加载Excel文件""" file_path = filedialog.askopenfilename( title="选择Excel文件", filetypes=[("Excel文件", "*.xlsx *.xls"), ("所有文件", "*.*")] ) if not file_path: return try: self.status_var.set(f"正在加载: {os.path.basename(file_path)}...") self.root.update() # 更新界面显示状态 # 读取Excel文件 self.df = pd.read_excel(file_path) # 显示预览数据 self.show_preview() self.status_var.set( f"已加载: {os.path.basename(file_path)} | 行数: {len(self.df)} | 列数: {len(self.df.columns)}") except Exception as e: messagebox.showerror("加载错误", f"无法读取Excel文件:\n{str(e)}") self.status_var.set("加载失败") def preprocess_data(self): """数据预处理对话框 - 整合全部预处理功能""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return preprocess_window = tk.Toplevel(self.root) preprocess_window.title("数据预处理") preprocess_window.geometry("650x800") main_frame = ttk.Frame(preprocess_window) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # ================ 新增:功能启用复选框 ================ enable_frame = ttk.LabelFrame(main_frame, text="启用功能", padding=10) enable_frame.pack(fill=tk.X, pady=5) # 创建启用变量 self.enable_missing = tk.BooleanVar(value=True) self.enable_outlier = tk.BooleanVar(value=True) self.enable_datetime = tk.BooleanVar(value=True) self.enable_lag = tk.BooleanVar(value=True) ttk.Checkbutton(enable_frame, text="执行缺失值处理", variable=self.enable_missing).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行异常值处理", variable=self.enable_outlier).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行时间列转换", variable=self.enable_datetime).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="添加滞后特征", variable=self.enable_lag).pack(anchor=tk.W) # ================================================= # 1. 缺失值处理部分 missing_frame = ttk.LabelFrame(main_frame, text="缺失值处理", padding=10) missing_frame.pack(fill=tk.X, pady=5) # 缺失值统计显示 missing_stats = self.df.isnull().sum() missing_text = scrolledtext.ScrolledText(missing_frame, height=4) missing_text.pack(fill=tk.X) for col, count in missing_stats.items(): if count > 0: missing_text.insert(tk.END, f"{col}: {count}个缺失值\n") missing_text.config(state=tk.DISABLED) # 缺失值处理方法选择 ttk.Label(missing_frame, text="处理方法:").pack(anchor=tk.W) missing_method_var = tk.StringVar(value="fill") missing_method_frame = ttk.Frame(missing_frame) missing_method_frame.pack(fill=tk.X, pady=5) ttk.Radiobutton(missing_method_frame, text="删除缺失行", variable=missing_method_var, value="drop").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="固定值填充", variable=missing_method_var, value="fill").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="插值法", variable=missing_method_var, value="interpolate").pack( side=tk.LEFT) # 填充选项 fill_options_frame = ttk.Frame(missing_frame) fill_options_frame.pack(fill=tk.X, pady=5) ttk.Label(fill_options_frame, text="填充值:").pack(side=tk.LEFT) fill_value_entry = ttk.Entry(fill_options_frame, width=10) fill_value_entry.pack(side=tk.LEFT, padx=5) fill_value_entry.insert(0, "0") ttk.Label(fill_options_frame, text="或选择:").pack(side=tk.LEFT, padx=5) fill_type_var = tk.StringVar(value="fixed") ttk.Radiobutton(fill_options_frame, text="前值填充", variable=fill_type_var, value="ffill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="后值填充", variable=fill_type_var, value="bfill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="均值填充", variable=fill_type_var, value="mean").pack(side=tk.LEFT) # 2. 异常值处理部分 outlier_frame = ttk.LabelFrame(main_frame, text="异常值处理", padding=10) outlier_frame.pack(fill=tk.X, pady=5) # 异常值检测方法 ttk.Label(outlier_frame, text="检测方法:").pack(anchor=tk.W) outlier_method_var = tk.StringVar(value="3sigma") outlier_method_frame = ttk.Frame(outlier_frame) outlier_method_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_method_frame, text="3σ原则", variable=outlier_method_var, value="3sigma").pack( side=tk.LEFT) ttk.Radiobutton(outlier_method_frame, text="IQR方法", variable=outlier_method_var, value="iqr").pack( side=tk.LEFT) # 异常值处理方式 ttk.Label(outlier_frame, text="处理方式:").pack(anchor=tk.W) outlier_action_var = tk.StringVar(value="remove") outlier_action_frame = ttk.Frame(outlier_frame) outlier_action_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_action_frame, text="删除", variable=outlier_action_var, value="remove").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用中位数替换", variable=outlier_action_var, value="median").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用前后均值替换", variable=outlier_action_var, value="neighbor").pack(side=tk.LEFT) # 3. 数据类型转换部分 type_frame = ttk.LabelFrame(main_frame, text="数据类型转换", padding=10) type_frame.pack(fill=tk.X, pady=5) # 时间列转换 ttk.Label(type_frame, text="时间列转换:").pack(anchor=tk.W) time_col_var = tk.StringVar() time_col_combo = ttk.Combobox(type_frame, textvariable=time_col_var, width=20) time_col_combo['values'] = tuple(self.df.columns) time_col_combo.pack(anchor=tk.W, pady=5) # === 新增:时间单位选择 === time_units_frame = ttk.Frame(type_frame) time_units_frame.pack(fill=tk.X, pady=5) ttk.Label(time_units_frame, text="提取时间单位:").pack(side=tk.LEFT) # 创建时间单位变量 self.extract_year = tk.BooleanVar(value=True) self.extract_month = tk.BooleanVar(value=True) self.extract_day = tk.BooleanVar(value=True) self.extract_hour = tk.BooleanVar(value=False) self.extract_minute = tk.BooleanVar(value=False) self.extract_second = tk.BooleanVar(value=False) # 添加复选框 ttk.Checkbutton(time_units_frame, text="年", variable=self.extract_year).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="月", variable=self.extract_month).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="日", variable=self.extract_day).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="时", variable=self.extract_hour).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="分", variable=self.extract_minute).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="秒", variable=self.extract_second).pack(side=tk.LEFT, padx=5) # === 修改时间转换逻辑 === if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: # 统一处理不同日期格式 self.df[time_col] = self.df[time_col].apply( lambda x: pd.to_datetime(x, errors='coerce', format='mixed') ) # 强制显示完整时间格式 pd.set_option('display.datetime_format', '%Y-%m-%d %H:%M:%S') # 根据用户选择提取时间单位 if self.extract_year.get(): self.df['year'] = self.df[time_col].dt.year if self.extract_month.get(): self.df['month'] = self.df[time_col].dt.month if self.extract_day.get(): self.df['day'] = self.df[time_col].dt.day if self.extract_hour.get(): self.df['hour'] = self.df[time_col].dt.hour if self.extract_minute.get(): self.df['minute'] = self.df[time_col].dt.minute if self.extract_second.get(): self.df['second'] = self.df[time_col].dt.second # 新增:确保时间部分显示 self.df['full_datetime'] = self.df[time_col].dt.strftime('%Y-%m-%d %H:%M:%S') # 时间周期特征 if self.extract_hour.get() or self.extract_minute.get(): self.df['time_of_day'] = self.df[time_col].dt.hour + self.df[time_col].dt.minute / 60.0 if self.extract_second.get(): self.df['time_of_day'] += self.df[time_col].dt.second / 3600.0 except Exception as e: messagebox.showerror("时间转换错误", f"时间列转换失败: {str(e)}") # 4. 特征工程部分 feature_frame = ttk.LabelFrame(main_frame, text="特征工程", padding=10) feature_frame.pack(fill=tk.X, pady=5) # 添加滞后特征 ttk.Label(feature_frame, text="滞后特征:").pack(anchor=tk.W) lag_frame = ttk.Frame(feature_frame) lag_frame.pack(fill=tk.X) ttk.Label(lag_frame, text="选择列:").pack(side=tk.LEFT) lag_col_var = tk.StringVar() lag_col_combo = ttk.Combobox(lag_frame, textvariable=lag_col_var, width=15) lag_col_combo['values'] = tuple(self.df.select_dtypes(include=['number']).columns) lag_col_combo.pack(side=tk.LEFT, padx=5) ttk.Label(lag_frame, text="滞后步数:").pack(side=tk.LEFT) lag_steps_entry = ttk.Entry(lag_frame, width=5) lag_steps_entry.pack(side=tk.LEFT) lag_steps_entry.insert(0, "1") # 执行预处理按钮 def apply_preprocessing(): try: original_shape = self.df.shape # 1. 处理缺失值 (如果启用) if self.enable_missing.get(): missing_method = missing_method_var.get() if missing_method == "drop": self.df = self.df.dropna() elif missing_method == "fill": fill_type = fill_type_var.get() if fill_type == "fixed": fill_value = fill_value_entry.get() self.df = self.df.fillna( float(fill_value) if self.df.select_dtypes(include=['number']).shape[ 1] > 0 else fill_value) elif fill_type == "ffill": self.df = self.df.ffill() elif fill_type == "bfill": self.df = self.df.bfill() elif fill_type == "mean": self.df = self.df.fillna(self.df.mean()) elif missing_method == "interpolate": self.df = self.df.interpolate() # 2. 处理异常值 (如果启用) if self.enable_outlier.get(): outlier_method = outlier_method_var.get() outlier_action = outlier_action_var.get() numeric_cols = self.df.select_dtypes(include=['number']).columns for col in numeric_cols: if outlier_method == "3sigma": mean, std = self.df[col].mean(), self.df[col].std() lower, upper = mean - 3 * std, mean + 3 * std else: # iqr q1, q3 = self.df[col].quantile(0.25), self.df[col].quantile(0.75) iqr = q3 - q1 lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr if outlier_action == "remove": self.df = self.df[(self.df[col] >= lower) & (self.df[col] <= upper)] elif outlier_action == "median": self.df.loc[(self.df[col] < lower) | (self.df[col] > upper), col] = self.df[col].median() elif outlier_action == "neighbor": mask = (self.df[col] < lower) | (self.df[col] > upper) self.df.loc[mask, col] = self.df[col].rolling(2, min_periods=1).mean()[mask] # 3. 时间列转换 (如果启用) if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: self.df[time_col] = pd.to_datetime(self.df[time_col]) self.df['year'] = self.df[time_col].dt.year self.df['month'] = self.df[time_col].dt.month self.df['day'] = self.df[time_col].dt.day except Exception as e: messagebox.showwarning("时间转换警告", f"时间列转换失败: {str(e)}") # 4. 添加滞后特征 (如果启用) if self.enable_lag.get(): lag_col = lag_col_var.get() if lag_col and lag_col in self.df.columns: try: lag_steps = int(lag_steps_entry.get()) self.df[f'{lag_col}_lag{lag_steps}'] = self.df[lag_col].shift(lag_steps) except Exception as e: messagebox.showwarning("滞后特征警告", f"创建滞后特征失败: {str(e)}") # ========================================================= # 更新显示 self.show_preview() preprocess_window.destroy() new_shape = self.df.shape self.status_var.set(f"预处理完成 | 原形状: {original_shape} | 新形状: {new_shape}") except Exception as e: messagebox.showerror("预处理错误", f"预处理过程中发生错误:\n{str(e)}") ttk.Button(main_frame, text="执行预处理", command=apply_preprocessing).pack(pady=10) def show_preview(self): """在表格中分页显示数据预览""" # 清除旧内容 for widget in self.preview_frame.winfo_children(): widget.destroy() # 创建主容器 container = ttk.Frame(self.preview_frame) container.pack(fill=tk.BOTH, expand=True) # 创建表格 columns = list(self.df.columns) self.tree = ttk.Treeview(container, columns=columns, show="headings") # 添加列标题 for col in columns: self.tree.heading(col, text=col) self.tree.column(col, width=100, anchor=tk.W) # 添加滚动条 scrollbar = ttk.Scrollbar(container, orient=tk.VERTICAL, command=self.tree.yview) self.tree.configure(yscroll=scrollbar.set) # 布局表格和滚动条 self.tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) # 创建分页控制面板 pagination_frame = ttk.Frame(self.preview_frame) pagination_frame.pack(fill=tk.X, pady=5) # 分页参数 self.current_page = 1 self.rows_per_page = 1000 # 每页显示的行数 self.total_pages = max(1, (len(self.df) + self.rows_per_page - 1) // self.rows_per_page) # 分页标签 self.page_label = ttk.Label(pagination_frame, text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_label.pack(side=tk.LEFT, padx=10) # 分页按钮 ttk.Button(pagination_frame, text="首页", command=lambda: self.change_page(1)).pack(side=tk.LEFT) ttk.Button(pagination_frame, text="上一页", command=lambda: self.change_page(self.current_page - 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="下一页", command=lambda: self.change_page(self.current_page + 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="末页", command=lambda: self.change_page(self.total_pages)).pack(side=tk.LEFT) # 跳转输入框 ttk.Label(pagination_frame, text="跳转到:").pack(side=tk.LEFT, padx=(10, 0)) self.page_entry = ttk.Entry(pagination_frame, width=5) self.page_entry.pack(side=tk.LEFT) ttk.Button(pagination_frame, text="跳转", command=self.jump_to_page).pack(side=tk.LEFT, padx=(5, 10)) # 显示第一页数据 self.load_page_data() def load_page_data(self): """加载当前页的数据""" # 清空现有数据 for item in self.tree.get_children(): self.tree.delete(item) # 计算起始和结束索引 start_idx = (self.current_page - 1) * self.rows_per_page end_idx = start_idx + self.rows_per_page # 添加当前页的数据行 for i, row in self.df.iloc[start_idx:end_idx].iterrows(): self.tree.insert("", tk.END, values=list(row)) # 更新分页标签 self.page_label.config(text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page)) def change_page(self, new_page): """切换页面""" # 确保新页码在有效范围内 new_page = max(1, min(new_page, self.total_pages)) if new_page != self.current_page: self.current_page = new_page self.load_page_data() def jump_to_page(self): """跳转到指定页码""" try: page_num = int(self.page_entry.get()) self.change_page(page_num) except ValueError: messagebox.showerror("错误", "请输入有效的页码数字") def process_data(self): """根据选择的处理方式处理数据""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return process_type = self.process_var.get() if process_type == "统计": self.show_statistics() self.notebook.select(1) elif process_type == "数据排序": self.sort_data() elif process_type == "保存结果": self.save_data() elif process_type == "预处理": self.preprocess_data() elif process_type == "数据切分": self.divide_data() def show_statistics(self): """显示数据统计信息""" # 清除旧内容 for widget in self.stats_frame.winfo_children(): widget.destroy() # 计算统计信息 stats = self.df.describe(include='all').fillna('-') # 创建表格显示统计信息 columns = ['统计项'] + list(stats.columns) tree = ttk.Treeview(self.stats_frame, columns=columns, show="headings") # 添加列标题 for col in columns: tree.heading(col, text=col) tree.column(col, width=100, anchor=tk.W) # 添加数据行 for index, row in stats.iterrows(): tree.insert("", tk.END, values=[index] + list(row)) # 添加滚动条 scrollbar = ttk.Scrollbar(self.stats_frame, orient=tk.VERTICAL, command=tree.yview) tree.configure(yscroll=scrollbar.set) # 布局 tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) # 添加数据类型信息 type_frame = ttk.LabelFrame(self.stats_frame, text="数据类型") type_frame.pack(fill=tk.X, padx=5, pady=5) type_text = scrolledtext.ScrolledText(type_frame, height=5) type_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) dtypes = self.df.dtypes.apply(lambda x: x.name).to_dict() type_info = "\n".join([f"{col}: {dtype}" for col, dtype in dtypes.items()]) type_text.insert(tk.END, type_info) type_text.config(state=tk.DISABLED) def sort_data(self): """数据排序对话框""" sort_window = tk.Toplevel(self.root) sort_window.title("数据排序") sort_window.geometry("400x300") ttk.Label(sort_window, text="选择排序:").pack(pady=10) # 列选择 col_var = tk.StringVar() col_combo = ttk.Combobox(sort_window, textvariable=col_var, width=20) col_combo['values'] = tuple(self.df.columns) col_combo.pack(pady=5) # 排序方式 ttk.Label(sort_window, text="排序方式:").pack(pady=10) order_var = tk.StringVar(value="ascending") ttk.Radiobutton(sort_window, text="升序", variable=order_var, value="ascending").pack() ttk.Radiobutton(sort_window, text="降序", variable=order_var, value="descending").pack() def apply_sort(): if not col_var.get(): messagebox.showwarning("警告", "请选择排序列") return try: ascending = (order_var.get() == "ascending") self.df = self.df.sort_values(by=col_var.get(), ascending=ascending) self.show_preview() sort_window.destroy() self.status_var.set(f"数据已按 {col_var.get()} {'升序' if ascending else '降序'} 排序") except Exception as e: messagebox.showerror("排序错误", f"排序失败:\n{str(e)}") ttk.Button(sort_window, text="应用排序", command=apply_sort).pack(pady=20) def save_data(self): """保存处理结果""" if self.df is None or self.df.empty: messagebox.showwarning("警告", "没有可保存的数据") return save_path = filedialog.asksaveasfilename( defaultextension=".xlsx", filetypes=[("Excel文件", "*.xlsx"), ("CSV文件", "*.csv")] ) if not save_path: return try: if save_path.endswith('.xlsx'): self.df.to_excel(save_path, index=False) else: self.df.to_csv(save_path, index=False) self.status_var.set(f"文件已保存至: {os.path.basename(save_path)}") messagebox.showinfo("保存成功", f"文件已成功保存至:\n{save_path}") except Exception as e: messagebox.showerror("保存错误", f"保存文件失败:\n{str(e)}") # 创建并运行程序 if __name__ == "__main__": root = tk.Tk() app = ExcelProcessor(root) root.mainloop() 请你用相同的风格也一个excel处理器类的一个方法:可以对数据进行筛选
07-16
import tkinter as tk from tkinter import ttk, filedialog, messagebox, scrolledtext import pandas as pd import numpy as np import os import re class ExcelProcessor: def __init__(self, root): self.root = root self.root.title("Excel数据处理器") self.root.geometry("900x600") self.df = None self.create_widgets() def create_widgets(self): # 顶部控制区域 control_frame = ttk.Frame(self.root, padding=10) control_frame.pack(fill=tk.X) # 文件选择按钮 ttk.Button(control_frame, text="选择Excel文件", command=self.load_excel).grid(row=0, column=0, padx=5) # 处理方式选择 self.process_var = tk.StringVar(value="点击选择") ttk.Label(control_frame, text="处理方式:").grid(row=0, column=1, padx=5) process_combo = ttk.Combobox(control_frame, textvariable=self.process_var, width=15) process_combo['values'] = ( '统计', '预处理','数据排序','数据筛选','数据切分','保存结果') process_combo.grid(row=0, column=2, padx=5) # 执行按钮 ttk.Button(control_frame, text="执行", command=self.process_data).grid(row=0, column=3, padx=5) # 数据展示区域 self.notebook = ttk.Notebook(self.root) self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 预览标签页 self.preview_frame = ttk.Frame(self.notebook) self.notebook.add(self.preview_frame, text="数据展示") # 统计标签页 self.stats_frame = ttk.Frame(self.notebook) self.notebook.add(self.stats_frame, text="统计信息") # 状态栏 self.status_var = tk.StringVar(value="就绪") ttk.Label(self.root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W).pack(fill=tk.X, side=tk.BOTTOM) def load_excel(self): """加载Excel文件""" file_path = filedialog.askopenfilename( title="选择Excel文件", filetypes=[("Excel文件", "*.xlsx *.xls"), ("所有文件", "*.*")] ) if not file_path: return try: self.status_var.set(f"正在加载: {os.path.basename(file_path)}...") self.root.update() # 更新界面显示状态 # 读取Excel文件 self.df = pd.read_excel(file_path) # 显示预览数据 self.show_preview() self.status_var.set( f"已加载: {os.path.basename(file_path)} | 行数: {len(self.df)} | 列数: {len(self.df.columns)}") except Exception as e: messagebox.showerror("加载错误", f"无法读取Excel文件:\n{str(e)}") self.status_var.set("加载失败") def preprocess_data(self): """数据预处理对话框 - 整合全部预处理功能""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return preprocess_window = tk.Toplevel(self.root) preprocess_window.title("数据预处理") preprocess_window.geometry("650x800") main_frame = ttk.Frame(preprocess_window) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # ================ 新增:功能启用复选框 ================ enable_frame = ttk.LabelFrame(main_frame, text="启用功能", padding=10) enable_frame.pack(fill=tk.X, pady=5) # 创建启用变量 self.enable_missing = tk.BooleanVar(value=True) self.enable_outlier = tk.BooleanVar(value=True) self.enable_datetime = tk.BooleanVar(value=True) self.enable_lag = tk.BooleanVar(value=True) ttk.Checkbutton(enable_frame, text="执行缺失值处理", variable=self.enable_missing).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行异常值处理", variable=self.enable_outlier).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行时间列转换", variable=self.enable_datetime).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="添加滞后特征", variable=self.enable_lag).pack(anchor=tk.W) # ================================================= # 1. 缺失值处理部分 missing_frame = ttk.LabelFrame(main_frame, text="缺失值处理", padding=10) missing_frame.pack(fill=tk.X, pady=5) # 缺失值统计显示 missing_stats = self.df.isnull().sum() missing_text = scrolledtext.ScrolledText(missing_frame, height=4) missing_text.pack(fill=tk.X) for col, count in missing_stats.items(): if count > 0: missing_text.insert(tk.END, f"{col}: {count}个缺失值\n") missing_text.config(state=tk.DISABLED) # 缺失值处理方法选择 ttk.Label(missing_frame, text="处理方法:").pack(anchor=tk.W) missing_method_var = tk.StringVar(value="fill") missing_method_frame = ttk.Frame(missing_frame) missing_method_frame.pack(fill=tk.X, pady=5) ttk.Radiobutton(missing_method_frame, text="删除缺失行", variable=missing_method_var, value="drop").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="固定值填充", variable=missing_method_var, value="fill").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="插值法", variable=missing_method_var, value="interpolate").pack( side=tk.LEFT) # 填充选项 fill_options_frame = ttk.Frame(missing_frame) fill_options_frame.pack(fill=tk.X, pady=5) ttk.Label(fill_options_frame, text="填充值:").pack(side=tk.LEFT) fill_value_entry = ttk.Entry(fill_options_frame, width=10) fill_value_entry.pack(side=tk.LEFT, padx=5) fill_value_entry.insert(0, "0") ttk.Label(fill_options_frame, text="或选择:").pack(side=tk.LEFT, padx=5) fill_type_var = tk.StringVar(value="fixed") ttk.Radiobutton(fill_options_frame, text="前值填充", variable=fill_type_var, value="ffill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="后值填充", variable=fill_type_var, value="bfill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="均值填充", variable=fill_type_var, value="mean").pack(side=tk.LEFT) # 2. 异常值处理部分 outlier_frame = ttk.LabelFrame(main_frame, text="异常值处理", padding=10) outlier_frame.pack(fill=tk.X, pady=5) # 异常值检测方法 ttk.Label(outlier_frame, text="检测方法:").pack(anchor=tk.W) outlier_method_var = tk.StringVar(value="3sigma") outlier_method_frame = ttk.Frame(outlier_frame) outlier_method_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_method_frame, text="3σ原则", variable=outlier_method_var, value="3sigma").pack( side=tk.LEFT) ttk.Radiobutton(outlier_method_frame, text="IQR方法", variable=outlier_method_var, value="iqr").pack( side=tk.LEFT) # 异常值处理方式 ttk.Label(outlier_frame, text="处理方式:").pack(anchor=tk.W) outlier_action_var = tk.StringVar(value="remove") outlier_action_frame = ttk.Frame(outlier_frame) outlier_action_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_action_frame, text="删除", variable=outlier_action_var, value="remove").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用中位数替换", variable=outlier_action_var, value="median").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用前后均值替换", variable=outlier_action_var, value="neighbor").pack(side=tk.LEFT) # 3. 数据类型转换部分 type_frame = ttk.LabelFrame(main_frame, text="数据类型转换", padding=10) type_frame.pack(fill=tk.X, pady=5) # 时间列转换 ttk.Label(type_frame, text="时间列转换:").pack(anchor=tk.W) time_col_var = tk.StringVar() time_col_combo = ttk.Combobox(type_frame, textvariable=time_col_var, width=20) time_col_combo['values'] = tuple(self.df.columns) time_col_combo.pack(anchor=tk.W, pady=5) # === 新增:时间单位选择 === time_units_frame = ttk.Frame(type_frame) time_units_frame.pack(fill=tk.X, pady=5) ttk.Label(time_units_frame, text="提取时间单位:").pack(side=tk.LEFT) # 创建时间单位变量 self.extract_year = tk.BooleanVar(value=True) self.extract_month = tk.BooleanVar(value=True) self.extract_day = tk.BooleanVar(value=True) self.extract_hour = tk.BooleanVar(value=False) self.extract_minute = tk.BooleanVar(value=False) self.extract_second = tk.BooleanVar(value=False) # 添加复选框 ttk.Checkbutton(time_units_frame, text="年", variable=self.extract_year).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="月", variable=self.extract_month).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="日", variable=self.extract_day).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="时", variable=self.extract_hour).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="分", variable=self.extract_minute).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="秒", variable=self.extract_second).pack(side=tk.LEFT, padx=5) # === 修改时间转换逻辑 === if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: # 统一处理不同日期格式 self.df[time_col] = self.df[time_col].apply( lambda x: pd.to_datetime(x, errors='coerce', format='mixed') ) # 强制显示完整时间格式 pd.set_option('display.datetime_format', '%Y-%m-%d %H:%M:%S') # 根据用户选择提取时间单位 if self.extract_year.get(): self.df['year'] = self.df[time_col].dt.year if self.extract_month.get(): self.df['month'] = self.df[time_col].dt.month if self.extract_day.get(): self.df['day'] = self.df[time_col].dt.day if self.extract_hour.get(): self.df['hour'] = self.df[time_col].dt.hour if self.extract_minute.get(): self.df['minute'] = self.df[time_col].dt.minute if self.extract_second.get(): self.df['second'] = self.df[time_col].dt.second # 新增:确保时间部分显示 self.df['full_datetime'] = self.df[time_col].dt.strftime('%Y-%m-%d %H:%M:%S') # 时间周期特征 if self.extract_hour.get() or self.extract_minute.get(): self.df['time_of_day'] = self.df[time_col].dt.hour + self.df[time_col].dt.minute / 60.0 if self.extract_second.get(): self.df['time_of_day'] += self.df[time_col].dt.second / 3600.0 except Exception as e: messagebox.showerror("时间转换错误", f"时间列转换失败: {str(e)}") # 4. 特征工程部分 feature_frame = ttk.LabelFrame(main_frame, text="特征工程", padding=10) feature_frame.pack(fill=tk.X, pady=5) # 添加滞后特征 ttk.Label(feature_frame, text="滞后特征:").pack(anchor=tk.W) lag_frame = ttk.Frame(feature_frame) lag_frame.pack(fill=tk.X) ttk.Label(lag_frame, text="选择列:").pack(side=tk.LEFT) lag_col_var = tk.StringVar() lag_col_combo = ttk.Combobox(lag_frame, textvariable=lag_col_var, width=15) lag_col_combo['values'] = tuple(self.df.select_dtypes(include=['number']).columns) lag_col_combo.pack(side=tk.LEFT, padx=5) ttk.Label(lag_frame, text="滞后步数:").pack(side=tk.LEFT) lag_steps_entry = ttk.Entry(lag_frame, width=5) lag_steps_entry.pack(side=tk.LEFT) lag_steps_entry.insert(0, "1") # 执行预处理按钮 def apply_preprocessing(): try: original_shape = self.df.shape # 1. 处理缺失值 (如果启用) if self.enable_missing.get(): missing_method = missing_method_var.get() if missing_method == "drop": self.df = self.df.dropna() elif missing_method == "fill": fill_type = fill_type_var.get() if fill_type == "fixed": fill_value = fill_value_entry.get() self.df = self.df.fillna( float(fill_value) if self.df.select_dtypes(include=['number']).shape[ 1] > 0 else fill_value) elif fill_type == "ffill": self.df = self.df.ffill() elif fill_type == "bfill": self.df = self.df.bfill() elif fill_type == "mean": self.df = self.df.fillna(self.df.mean()) elif missing_method == "interpolate": self.df = self.df.interpolate() # 2. 处理异常值 (如果启用) if self.enable_outlier.get(): outlier_method = outlier_method_var.get() outlier_action = outlier_action_var.get() numeric_cols = self.df.select_dtypes(include=['number']).columns for col in numeric_cols: if outlier_method == "3sigma": mean, std = self.df[col].mean(), self.df[col].std() lower, upper = mean - 3 * std, mean + 3 * std else: # iqr q1, q3 = self.df[col].quantile(0.25), self.df[col].quantile(0.75) iqr = q3 - q1 lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr if outlier_action == "remove": self.df = self.df[(self.df[col] >= lower) & (self.df[col] <= upper)] elif outlier_action == "median": self.df.loc[(self.df[col] < lower) | (self.df[col] > upper), col] = self.df[col].median() elif outlier_action == "neighbor": mask = (self.df[col] < lower) | (self.df[col] > upper) self.df.loc[mask, col] = self.df[col].rolling(2, min_periods=1).mean()[mask] # 3. 时间列转换 (如果启用) if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: self.df[time_col] = pd.to_datetime(self.df[time_col]) self.df['year'] = self.df[time_col].dt.year self.df['month'] = self.df[time_col].dt.month self.df['day'] = self.df[time_col].dt.day except Exception as e: messagebox.showwarning("时间转换警告", f"时间列转换失败: {str(e)}") # 4. 添加滞后特征 (如果启用) if self.enable_lag.get(): lag_col = lag_col_var.get() if lag_col and lag_col in self.df.columns: try: lag_steps = int(lag_steps_entry.get()) self.df[f'{lag_col}_lag{lag_steps}'] = self.df[lag_col].shift(lag_steps) except Exception as e: messagebox.showwarning("滞后特征警告", f"创建滞后特征失败: {str(e)}") # ========================================================= # 更新显示 self.show_preview() preprocess_window.destroy() new_shape = self.df.shape self.status_var.set(f"预处理完成 | 原形状: {original_shape} | 新形状: {new_shape}") except Exception as e: messagebox.showerror("预处理错误", f"预处理过程中发生错误:\n{str(e)}") ttk.Button(main_frame, text="执行预处理", command=apply_preprocessing).pack(pady=10) def show_preview(self): """在表格中分页显示数据预览""" # 清除旧内容 for widget in self.preview_frame.winfo_children(): widget.destroy() # 创建主容器 container = ttk.Frame(self.preview_frame) container.pack(fill=tk.BOTH, expand=True) # 创建表格 columns = list(self.df.columns) self.tree = ttk.Treeview(container, columns=columns, show="headings") # 添加列标题 for col in columns: self.tree.heading(col, text=col) self.tree.column(col, width=100, anchor=tk.W) # 添加滚动条 scrollbar = ttk.Scrollbar(container, orient=tk.VERTICAL, command=self.tree.yview) self.tree.configure(yscroll=scrollbar.set) # 布局表格和滚动条 self.tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) # 创建分页控制面板 pagination_frame = ttk.Frame(self.preview_frame) pagination_frame.pack(fill=tk.X, pady=5) # 分页参数 self.current_page = 1 self.rows_per_page = 1000 # 每页显示的行数 self.total_pages = max(1, (len(self.df) + self.rows_per_page - 1) // self.rows_per_page) # 分页标签 self.page_label = ttk.Label(pagination_frame, text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_label.pack(side=tk.LEFT, padx=10) # 分页按钮 ttk.Button(pagination_frame, text="首页", command=lambda: self.change_page(1)).pack(side=tk.LEFT) ttk.Button(pagination_frame, text="上一页", command=lambda: self.change_page(self.current_page - 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="下一页", command=lambda: self.change_page(self.current_page + 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="末页", command=lambda: self.change_page(self.total_pages)).pack(side=tk.LEFT) # 跳转输入框 ttk.Label(pagination_frame, text="跳转到:").pack(side=tk.LEFT, padx=(10, 0)) self.page_entry = ttk.Entry(pagination_frame, width=5) self.page_entry.pack(side=tk.LEFT) ttk.Button(pagination_frame, text="跳转", command=self.jump_to_page).pack(side=tk.LEFT, padx=(5, 10)) # 显示第一页数据 self.load_page_data() def load_page_data(self): """加载当前页的数据""" # 清空现有数据 for item in self.tree.get_children(): self.tree.delete(item) # 计算起始和结束索引 start_idx = (self.current_page - 1) * self.rows_per_page end_idx = start_idx + self.rows_per_page # 添加当前页的数据行 for i, row in self.df.iloc[start_idx:end_idx].iterrows(): self.tree.insert("", tk.END, values=list(row)) # 更新分页标签 self.page_label.config(text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page)) def change_page(self, new_page): """切换页面""" # 确保新页码在有效范围内 new_page = max(1, min(new_page, self.total_pages)) if new_page != self.current_page: self.current_page = new_page self.load_page_data() def jump_to_page(self): """跳转到指定页码""" try: page_num = int(self.page_entry.get()) self.change_page(page_num) except ValueError: messagebox.showerror("错误", "请输入有效的页码数字") def process_data(self): """根据选择的处理方式处理数据""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return process_type = self.process_var.get() if process_type == "统计": self.show_statistics() self.notebook.select(1) elif process_type == "数据排序": self.sort_data() elif process_type == "保存结果": self.save_data() elif process_type == "预处理": self.preprocess_data() elif process_type == "数据切分": self.divide_data() elif process_type == "数据筛选": self.filter_data() def show_statistics(self): """显示数据统计信息""" # 清除旧内容 for widget in self.stats_frame.winfo_children(): widget.destroy() # 计算统计信息 stats = self.df.describe(include='all').fillna('-') # 创建表格显示统计信息 columns = ['统计项'] + list(stats.columns) tree = ttk.Treeview(self.stats_frame, columns=columns, show="headings") # 添加列标题 for col in columns: tree.heading(col, text=col) tree.column(col, width=100, anchor=tk.W) # 添加数据行 for index, row in stats.iterrows(): tree.insert("", tk.END, values=[index] + list(row)) # 添加滚动条 scrollbar = ttk.Scrollbar(self.stats_frame, orient=tk.VERTICAL, command=tree.yview) tree.configure(yscroll=scrollbar.set) # 布局 tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) # 添加数据类型信息 type_frame = ttk.LabelFrame(self.stats_frame, text="数据类型") type_frame.pack(fill=tk.X, padx=5, pady=5) type_text = scrolledtext.ScrolledText(type_frame, height=5) type_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) dtypes = self.df.dtypes.apply(lambda x: x.name).to_dict() type_info = "\n".join([f"{col}: {dtype}" for col, dtype in dtypes.items()]) type_text.insert(tk.END, type_info) type_text.config(state=tk.DISABLED) def sort_data(self): """数据排序对话框""" sort_window = tk.Toplevel(self.root) sort_window.title("数据排序") sort_window.geometry("400x300") ttk.Label(sort_window, text="选择排序:").pack(pady=10) # 列选择 col_var = tk.StringVar() col_combo = ttk.Combobox(sort_window, textvariable=col_var, width=20) col_combo['values'] = tuple(self.df.columns) col_combo.pack(pady=5) # 排序方式 ttk.Label(sort_window, text="排序方式:").pack(pady=10) order_var = tk.StringVar(value="ascending") ttk.Radiobutton(sort_window, text="升序", variable=order_var, value="ascending").pack() ttk.Radiobutton(sort_window, text="降序", variable=order_var, value="descending").pack() def apply_sort(): if not col_var.get(): messagebox.showwarning("警告", "请选择排序列") return try: ascending = (order_var.get() == "ascending") self.df = self.df.sort_values(by=col_var.get(), ascending=ascending) self.show_preview() sort_window.destroy() self.status_var.set(f"数据已按 {col_var.get()} {'升序' if ascending else '降序'} 排序") except Exception as e: messagebox.showerror("排序错误", f"排序失败:\n{str(e)}") ttk.Button(sort_window, text="应用排序", command=apply_sort).pack(pady=20) def filter_data(self): """数据筛选对话框 - 支持多条件筛选""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return filter_window = tk.Toplevel(self.root) filter_window.title("数据筛选") filter_window.geometry("700x600") main_frame = ttk.Frame(filter_window, padding=10) main_frame.pack(fill=tk.BOTH, expand=True) # 条件容器 conditions_frame = ttk.LabelFrame(main_frame, text="筛选条件", padding=10) conditions_frame.pack(fill=tk.BOTH, expand=True, pady=5) # 条件列表 self.filter_conditions = [] # 添加初始条件行 def add_condition_row(parent_frame, index=0): """添加单行筛选条件""" condition_frame = ttk.Frame(parent_frame) condition_frame.pack(fill=tk.X, pady=5) # 列选择 col_var = tk.StringVar() col_combo = ttk.Combobox(condition_frame, textvariable=col_var, width=15) col_combo['values'] = tuple(self.df.columns) col_combo.pack(side=tk.LEFT, padx=5) # 运算符选择 operator_var = tk.StringVar(value="==") operator_combo = ttk.Combobox(condition_frame, textvariable=operator_var, width=5) operator_combo['values'] = ("==", "!=", ">", ">=", "<", "<=", "包含", "不包含", "开头是", "结尾是") operator_combo.pack(side=tk.LEFT, padx=5) # 值输入 value_var = tk.StringVar() value_entry = ttk.Entry(condition_frame, textvariable=value_var, width=20) value_entry.pack(side=tk.LEFT, padx=5) # 删除按钮 def remove_condition(): condition_frame.destroy() self.filter_conditions.remove((col_var, operator_var, value_var)) remove_btn = ttk.Button(condition_frame, text="×", width=2, command=remove_condition) remove_btn.pack(side=tk.RIGHT, padx=5) # 逻辑关系选择(从第二行开始) if index > 0: logic_var = tk.StringVar(value="AND") logic_combo = ttk.Combobox(condition_frame, textvariable=logic_var, width=5) logic_combo['values'] = ("AND", "OR") logic_combo.pack(side=tk.LEFT, padx=5) return (col_var, operator_var, value_var, logic_var) else: return (col_var, operator_var, value_var, None) # 添加条件按钮 def add_condition(): new_condition = add_condition_row(conditions_frame, len(self.filter_conditions)) self.filter_conditions.append(new_condition) # 初始添加两行条件 self.filter_conditions.append(add_condition_row(conditions_frame)) self.filter_conditions.append(add_condition_row(conditions_frame, 1)) add_btn = ttk.Button(main_frame, text="添加条件", command=add_condition) add_btn.pack(anchor=tk.W, pady=5) # 预览区域 preview_frame = ttk.LabelFrame(main_frame, text="筛选结果预览", padding=10) preview_frame.pack(fill=tk.BOTH, expand=True, pady=5) preview_tree = ttk.Treeview(preview_frame, show="headings") preview_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) vsb = ttk.Scrollbar(preview_frame, orient="vertical", command=preview_tree.yview) vsb.pack(side=tk.RIGHT, fill=tk.Y) preview_tree.configure(yscrollcommand=vsb.set) # 更新预览 def update_preview(): """更新筛选结果预览""" try: # 清空现有预览 for item in preview_tree.get_children(): preview_tree.delete(item) # 如果没有条件,显示所有数据 if not any(cond[0].get() for cond in self.filter_conditions): filtered_df = self.df.head(100) # 限制预览行数 else: # 构建查询字符串 query_parts = [] for i, (col_var, operator_var, value_var, logic_var) in enumerate(self.filter_conditions): if not col_var.get() or not value_var.get(): continue col = col_var.get() operator = operator_var.get() value = value_var.get() # 处理数值类型 if self.df[col].dtype in (int, float): try: value = float(value) except ValueError: messagebox.showwarning("类型错误", f"列 '{col}' 是数值类型,请输入数字") return # 构建条件表达式 if operator in ("==", "!=", ">", ">=", "<", "<="): expr = f"`{col}` {operator} {repr(value)}" elif operator == "包含": expr = f"`{col}`.str.contains({repr(value)}, na=False)" elif operator == "不包含": expr = f"~`{col}`.str.contains({repr(value)}, na=False)" elif operator == "开头是": expr = f"`{col}`.str.startswith({repr(value)}, na=False)" elif operator == "结尾是": expr = f"`{col}`.str.endswith({repr(value)}, na=False)" # 添加逻辑关系(从第二个条件开始) if i > 0 and logic_var and logic_var.get(): expr = f" {logic_var.get()} {expr}" query_parts.append(expr) # 执行查询 query_str = "".join(query_parts) if query_str: filtered_df = self.df.query(query_str, engine='python').head(100) # 限制预览行数 else: filtered_df = self.df.head(100) # 更新预览表格 preview_tree["columns"] = list(filtered_df.columns) for col in filtered_df.columns: preview_tree.heading(col, text=col) preview_tree.column(col, width=100, anchor=tk.W) # 添加数据行 for _, row in filtered_df.iterrows(): preview_tree.insert("", tk.END, values=list(row)) # 更新状态 self.status_var.set(f"预览: {len(filtered_df)} 行 (显示前100条)") except Exception as e: messagebox.showerror("筛选错误", f"筛选条件错误:\n{str(e)}") # 应用筛选按钮 def apply_filter(): """应用筛选条件到主数据集""" try: # 构建查询字符串 query_parts = [] for i, (col_var, operator_var, value_var, logic_var) in enumerate(self.filter_conditions): if not col_var.get() or not value_var.get(): continue col = col_var.get() operator = operator_var.get() value = value_var.get() # 处理数值类型 if self.df[col].dtype in (int, float): try: value = float(value) except ValueError: messagebox.showwarning("类型错误", f"列 '{col}' 是数值类型,请输入数字") return # 构建条件表达式 if operator in ("==", "!=", ">", ">=", "<", "<="): expr = f"`{col}` {operator} {repr(value)}" elif operator == "包含": expr = f"`{col}`.str.contains({repr(value)}, na=False)" elif operator == "不包含": expr = f"~`{col}`.str.contains({repr(value)}, na=False)" elif operator == "开头是": expr = f"`{col}`.str.startswith({repr(value)}, na=False)" elif operator == "结尾是": expr = f"`{col}`.str.endswith({repr(value)}, na=False)" # 添加逻辑关系(从第二个条件开始) if i > 0 and logic_var and logic_var.get(): expr = f" {logic_var.get()} {expr}" query_parts.append(expr) # 执行查询 query_str = "".join(query_parts) if query_str: original_count = len(self.df) self.df = self.df.query(query_str, engine='python') new_count = len(self.df) # 更新显示 self.show_preview() filter_window.destroy() self.status_var.set(f"筛选完成 | 原始行数: {original_count} | 筛选后: {new_count}") messagebox.showinfo("筛选完成", f"数据筛选完成,保留 {new_count} 行数据") else: messagebox.showwarning("警告", "未设置有效筛选条件") except Exception as e: messagebox.showerror("筛选错误", f"应用筛选条件失败:\n{str(e)}") # 按钮区域 btn_frame = ttk.Frame(main_frame) btn_frame.pack(fill=tk.X, pady=10) ttk.Button(btn_frame, text="预览", command=update_preview).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="应用筛选", command=apply_filter).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="取消", command=filter_window.destroy).pack(side=tk.RIGHT, padx=5) # 初始预览 update_preview() def save_data(self): """保存处理结果""" if self.df is None or self.df.empty: messagebox.showwarning("警告", "没有可保存的数据") return save_path = filedialog.asksaveasfilename( defaultextension=".xlsx", filetypes=[("Excel文件", "*.xlsx"), ("CSV文件", "*.csv")] ) if not save_path: return try: if save_path.endswith('.xlsx'): self.df.to_excel(save_path, index=False) else: self.df.to_csv(save_path, index=False) self.status_var.set(f"文件已保存至: {os.path.basename(save_path)}") messagebox.showinfo("保存成功", f"文件已成功保存至:\n{save_path}") except Exception as e: messagebox.showerror("保存错误", f"保存文件失败:\n{str(e)}") # 创建并运行程序 if __name__ == "__main__": root = tk.Tk() app = ExcelProcessor(root) root.mainloop() 帮我用相同的风格写一个类的方法:divide_data,我的需求是对一个已经按照时间顺序排列好的数据进行切分,来满足对大坝渗流问题模型的训练和测试,也就是实现数据切分功能,然后我可以选择获取测试集或者训练集
07-16
import tkinter as tk from tkinter import ttk, filedialog, messagebox, scrolledtext import pandas as pd import numpy as np import os import re class ExcelProcessor: def init(self, root): self.root = root self.root.title(“Excel数据预处理和切分”) self.root.geometry(“1000x600”) self.df = None self.create_widgets() def create_widgets(self): # 顶部控制区域 control_frame = ttk.Frame(self.root, padding=10) control_frame.pack(fill=tk.X) # 文件选择按钮 ttk.Button(control_frame, text="选择Excel文件", command=self.load_excel).grid(row=0, column=0, padx=5) # 处理方式选择 self.process_var = tk.StringVar(value="点击选择") ttk.Label(control_frame, text="处理方式:").grid(row=0, column=1, padx=5) process_combo = ttk.Combobox(control_frame, textvariable=self.process_var, width=15) process_combo['values'] = ( '预处理','数据排序','数据筛选','数据切分','保存结果') process_combo.grid(row=0, column=2, padx=5) # 执行按钮 ttk.Button(control_frame, text="执行", command=self.process_data).grid(row=0, column=3, padx=5) # 数据展示区域 self.notebook = ttk.Notebook(self.root) self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 预览标签页 self.preview_frame = ttk.Frame(self.notebook) self.notebook.add(self.preview_frame, text="数据展示") # 状态栏 self.status_var = tk.StringVar(value="就绪") ttk.Label(self.root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W).pack(fill=tk.X, side=tk.BOTTOM) def load_excel(self): """加载Excel文件""" file_path = filedialog.askopenfilename( title="选择Excel文件", filetypes=[("Excel文件", "*.xlsx *.xls"), ("所有文件", "*.*")] ) if not file_path: return try: self.status_var.set(f"正在加载: {os.path.basename(file_path)}...") self.root.update() # 更新界面显示状态 # 读取Excel文件 self.df = pd.read_excel(file_path) # 显示预览数据 self.show_preview() self.status_var.set( f"已加载: {os.path.basename(file_path)} | 行数: {len(self.df)} | 列数: {len(self.df.columns)}") except Exception as e: messagebox.showerror("加载错误", f"无法读取Excel文件:\n{str(e)}") self.status_var.set("加载失败") def preprocess_data(self): """数据预处理对话框 - 整合全部预处理功能""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return preprocess_window = tk.Toplevel(self.root) preprocess_window.title("数据预处理") preprocess_window.geometry("650x800") main_frame = ttk.Frame(preprocess_window) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # ================ 新增:功能启用复选框 ================ enable_frame = ttk.LabelFrame(main_frame, text="启用功能", padding=10) enable_frame.pack(fill=tk.X, pady=5) # 创建启用变量 self.enable_missing = tk.BooleanVar(value=True) self.enable_outlier = tk.BooleanVar(value=True) self.enable_datetime = tk.BooleanVar(value=True) self.enable_lag = tk.BooleanVar(value=True) ttk.Checkbutton(enable_frame, text="执行缺失值处理", variable=self.enable_missing).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行异常值处理", variable=self.enable_outlier).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行时间列转换", variable=self.enable_datetime).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="添加滞后特征", variable=self.enable_lag).pack(anchor=tk.W) # ================================================= # 1. 缺失值处理部分 missing_frame = ttk.LabelFrame(main_frame, text="缺失值处理", padding=10) missing_frame.pack(fill=tk.X, pady=5) # 缺失值统计显示 missing_stats = self.df.isnull().sum() missing_text = scrolledtext.ScrolledText(missing_frame, height=4) missing_text.pack(fill=tk.X) for col, count in missing_stats.items(): if count > 0: missing_text.insert(tk.END, f"{col}: {count}个缺失值\n") missing_text.config(state=tk.DISABLED) # 缺失值处理方法选择 ttk.Label(missing_frame, text="处理方法:").pack(anchor=tk.W) missing_method_var = tk.StringVar(value="fill") missing_method_frame = ttk.Frame(missing_frame) missing_method_frame.pack(fill=tk.X, pady=5) ttk.Radiobutton(missing_method_frame, text="删除缺失行", variable=missing_method_var, value="drop").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="固定值填充", variable=missing_method_var, value="fill").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="插值法", variable=missing_method_var, value="interpolate").pack( side=tk.LEFT) # 填充选项 fill_options_frame = ttk.Frame(missing_frame) fill_options_frame.pack(fill=tk.X, pady=5) ttk.Label(fill_options_frame, text="填充值:").pack(side=tk.LEFT) fill_value_entry = ttk.Entry(fill_options_frame, width=10) fill_value_entry.pack(side=tk.LEFT, padx=5) fill_value_entry.insert(0, "0") ttk.Label(fill_options_frame, text="或选择:").pack(side=tk.LEFT, padx=5) fill_type_var = tk.StringVar(value="fixed") ttk.Radiobutton(fill_options_frame, text="前值填充", variable=fill_type_var, value="ffill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="后值填充", variable=fill_type_var, value="bfill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="均值填充", variable=fill_type_var, value="mean").pack(side=tk.LEFT) # 2. 异常值处理部分 outlier_frame = ttk.LabelFrame(main_frame, text="异常值处理", padding=10) outlier_frame.pack(fill=tk.X, pady=5) # 异常值检测方法 ttk.Label(outlier_frame, text="检测方法:").pack(anchor=tk.W) outlier_method_var = tk.StringVar(value="3sigma") outlier_method_frame = ttk.Frame(outlier_frame) outlier_method_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_method_frame, text="3σ原则", variable=outlier_method_var, value="3sigma").pack( side=tk.LEFT) ttk.Radiobutton(outlier_method_frame, text="IQR方法", variable=outlier_method_var, value="iqr").pack( side=tk.LEFT) # 异常值处理方式 ttk.Label(outlier_frame, text="处理方式:").pack(anchor=tk.W) outlier_action_var = tk.StringVar(value="remove") outlier_action_frame = ttk.Frame(outlier_frame) outlier_action_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_action_frame, text="删除", variable=outlier_action_var, value="remove").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用中位数替换", variable=outlier_action_var, value="median").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用前后均值替换", variable=outlier_action_var, value="neighbor").pack(side=tk.LEFT) # 3. 数据类型转换部分 type_frame = ttk.LabelFrame(main_frame, text="数据类型转换", padding=10) type_frame.pack(fill=tk.X, pady=5) # 时间列转换 ttk.Label(type_frame, text="时间列转换:").pack(anchor=tk.W) time_col_var = tk.StringVar() time_col_combo = ttk.Combobox(type_frame, textvariable=time_col_var, width=20) time_col_combo['values'] = tuple(self.df.columns) time_col_combo.pack(anchor=tk.W, pady=5) # === 新增:时间单位选择 === time_units_frame = ttk.Frame(type_frame) time_units_frame.pack(fill=tk.X, pady=5) ttk.Label(time_units_frame, text="提取时间单位:").pack(side=tk.LEFT) # 创建时间单位变量 self.extract_year = tk.BooleanVar(value=True) self.extract_month = tk.BooleanVar(value=True) self.extract_day = tk.BooleanVar(value=True) self.extract_hour = tk.BooleanVar(value=False) self.extract_minute = tk.BooleanVar(value=False) self.extract_second = tk.BooleanVar(value=False) # 添加复选框 ttk.Checkbutton(time_units_frame, text="年", variable=self.extract_year).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="月", variable=self.extract_month).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="日", variable=self.extract_day).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="时", variable=self.extract_hour).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="分", variable=self.extract_minute).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="秒", variable=self.extract_second).pack(side=tk.LEFT, padx=5) # === 修改时间转换逻辑 === if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: # 统一处理不同日期格式 self.df[time_col] = self.df[time_col].apply( lambda x: pd.to_datetime(x, errors='coerce', format='mixed') ) # 强制显示完整时间格式 pd.set_option('display.datetime_format', '%Y-%m-%d %H:%M:%S') # 根据用户选择提取时间单位 if self.extract_year.get(): self.df['year'] = self.df[time_col].dt.year if self.extract_month.get(): self.df['month'] = self.df[time_col].dt.month if self.extract_day.get(): self.df['day'] = self.df[time_col].dt.day if self.extract_hour.get(): self.df['hour'] = self.df[time_col].dt.hour if self.extract_minute.get(): self.df['minute'] = self.df[time_col].dt.minute if self.extract_second.get(): self.df['second'] = self.df[time_col].dt.second # 新增:确保时间部分显示 self.df['full_datetime'] = self.df[time_col].dt.strftime('%Y-%m-%d %H:%M:%S') # 时间周期特征 if self.extract_hour.get() or self.extract_minute.get(): self.df['time_of_day'] = self.df[time_col].dt.hour + self.df[time_col].dt.minute / 60.0 if self.extract_second.get(): self.df['time_of_day'] += self.df[time_col].dt.second / 3600.0 except Exception as e: messagebox.showerror("时间转换错误", f"时间列转换失败: {str(e)}") # 4. 特征工程部分 feature_frame = ttk.LabelFrame(main_frame, text="特征工程", padding=10) feature_frame.pack(fill=tk.X, pady=5) # 添加滞后特征 ttk.Label(feature_frame, text="滞后特征:").pack(anchor=tk.W) lag_frame = ttk.Frame(feature_frame) lag_frame.pack(fill=tk.X) ttk.Label(lag_frame, text="选择列:").pack(side=tk.LEFT) lag_col_var = tk.StringVar() lag_col_combo = ttk.Combobox(lag_frame, textvariable=lag_col_var, width=15) lag_col_combo['values'] = tuple(self.df.select_dtypes(include=['number']).columns) lag_col_combo.pack(side=tk.LEFT, padx=5) ttk.Label(lag_frame, text="滞后步数:").pack(side=tk.LEFT) lag_steps_entry = ttk.Entry(lag_frame, width=5) lag_steps_entry.pack(side=tk.LEFT) lag_steps_entry.insert(0, "1") # 执行预处理按钮 def apply_preprocessing(): try: original_shape = self.df.shape # 1. 处理缺失值 (如果启用) if self.enable_missing.get(): missing_method = missing_method_var.get() if missing_method == "drop": self.df = self.df.dropna() elif missing_method == "fill": fill_type = fill_type_var.get() if fill_type == "fixed": fill_value = fill_value_entry.get() self.df = self.df.fillna( float(fill_value) if self.df.select_dtypes(include=['number']).shape[ 1] > 0 else fill_value) elif fill_type == "ffill": self.df = self.df.ffill() elif fill_type == "bfill": self.df = self.df.bfill() elif fill_type == "mean": self.df = self.df.fillna(self.df.mean()) elif missing_method == "interpolate": self.df = self.df.interpolate() # 2. 处理异常值 (如果启用) if self.enable_outlier.get(): outlier_method = outlier_method_var.get() outlier_action = outlier_action_var.get() numeric_cols = self.df.select_dtypes(include=['number']).columns for col in numeric_cols: if outlier_method == "3sigma": mean, std = self.df[col].mean(), self.df[col].std() lower, upper = mean - 3 * std, mean + 3 * std else: # iqr q1, q3 = self.df[col].quantile(0.25), self.df[col].quantile(0.75) iqr = q3 - q1 lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr if outlier_action == "remove": self.df = self.df[(self.df[col] >= lower) & (self.df[col] <= upper)] elif outlier_action == "median": self.df.loc[(self.df[col] < lower) | (self.df[col] > upper), col] = self.df[col].median() elif outlier_action == "neighbor": mask = (self.df[col] < lower) | (self.df[col] > upper) self.df.loc[mask, col] = self.df[col].rolling(2, min_periods=1).mean()[mask] # 3. 时间列转换 (如果启用) if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: self.df[time_col] = pd.to_datetime(self.df[time_col]) self.df['year'] = self.df[time_col].dt.year self.df['month'] = self.df[time_col].dt.month self.df['day'] = self.df[time_col].dt.day except Exception as e: messagebox.showwarning("时间转换警告", f"时间列转换失败: {str(e)}") # 4. 添加滞后特征 (如果启用) if self.enable_lag.get(): lag_col = lag_col_var.get() if lag_col and lag_col in self.df.columns: try: lag_steps = int(lag_steps_entry.get()) self.df[f'{lag_col}_lag{lag_steps}'] = self.df[lag_col].shift(lag_steps) except Exception as e: messagebox.showwarning("滞后特征警告", f"创建滞后特征失败: {str(e)}") # ========================================================= # 更新显示 self.show_preview() preprocess_window.destroy() new_shape = self.df.shape self.status_var.set(f"预处理完成 | 原形状: {original_shape} | 新形状: {new_shape}") except Exception as e: messagebox.showerror("预处理错误", f"预处理过程中发生错误:\n{str(e)}") ttk.Button(main_frame, text="执行预处理", command=apply_preprocessing).pack(pady=10) def show_preview(self): """在表格中分页显示数据预览""" # 清除旧内容 for widget in self.preview_frame.winfo_children(): widget.destroy() # 创建主容器 container = ttk.Frame(self.preview_frame) container.pack(fill=tk.BOTH, expand=True) # 创建表格 columns = list(self.df.columns) self.tree = ttk.Treeview(container, columns=columns, show="headings") # 添加列标题 for col in columns: self.tree.heading(col, text=col) self.tree.column(col, width=100, anchor=tk.W) # 添加滚动条 scrollbar = ttk.Scrollbar(container, orient=tk.VERTICAL, command=self.tree.yview) self.tree.configure(yscroll=scrollbar.set) # 布局表格和滚动条 self.tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) # 创建分页控制面板 pagination_frame = ttk.Frame(self.preview_frame) pagination_frame.pack(fill=tk.X, pady=5) # 分页参数 self.current_page = 1 self.rows_per_page = 1000 # 每页显示的行数 self.total_pages = max(1, (len(self.df) + self.rows_per_page - 1) // self.rows_per_page) # 分页标签 self.page_label = ttk.Label(pagination_frame, text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_label.pack(side=tk.LEFT, padx=10) # 分页按钮 ttk.Button(pagination_frame, text="首页", command=lambda: self.change_page(1)).pack(side=tk.LEFT) ttk.Button(pagination_frame, text="上一页", command=lambda: self.change_page(self.current_page - 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="下一页", command=lambda: self.change_page(self.current_page + 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="末页", command=lambda: self.change_page(self.total_pages)).pack(side=tk.LEFT) # 跳转输入框 ttk.Label(pagination_frame, text="跳转到:").pack(side=tk.LEFT, padx=(10, 0)) self.page_entry = ttk.Entry(pagination_frame, width=5) self.page_entry.pack(side=tk.LEFT) ttk.Button(pagination_frame, text="跳转", command=self.jump_to_page).pack(side=tk.LEFT, padx=(5, 10)) # 显示第一页数据 self.load_page_data() def load_page_data(self): """加载当前页的数据""" # 清空现有数据 for item in self.tree.get_children(): self.tree.delete(item) # 计算起始和结束索引 start_idx = (self.current_page - 1) * self.rows_per_page end_idx = start_idx + self.rows_per_page # 添加当前页的数据行 for i, row in self.df.iloc[start_idx:end_idx].iterrows(): self.tree.insert("", tk.END, values=list(row)) # 更新分页标签 self.page_label.config(text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page)) def change_page(self, new_page): """切换页面""" # 确保新页码在有效范围内 new_page = max(1, min(new_page, self.total_pages)) if new_page != self.current_page: self.current_page = new_page self.load_page_data() def jump_to_page(self): """跳转到指定页码""" try: page_num = int(self.page_entry.get()) self.change_page(page_num) except ValueError: messagebox.showerror("错误", "请输入有效的页码数字") def process_data(self): """根据选择的处理方式处理数据""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return process_type = self.process_var.get() if process_type == "数据排序": self.sort_data() elif process_type == "保存结果": self.save_data() elif process_type == "预处理": self.preprocess_data() elif process_type == "数据切分": self.divide_data() elif process_type == "数据筛选": self.filter_data() def sort_data(self): """数据排序对话框""" sort_window = tk.Toplevel(self.root) sort_window.title("数据排序") sort_window.geometry("400x300") ttk.Label(sort_window, text="选择排序:").pack(pady=10) # 列选择 col_var = tk.StringVar() col_combo = ttk.Combobox(sort_window, textvariable=col_var, width=20) col_combo['values'] = tuple(self.df.columns) col_combo.pack(pady=5) # 排序方式 ttk.Label(sort_window, text="排序方式:").pack(pady=10) order_var = tk.StringVar(value="ascending") ttk.Radiobutton(sort_window, text="升序", variable=order_var, value="ascending").pack() ttk.Radiobutton(sort_window, text="降序", variable=order_var, value="descending").pack() def apply_sort(): if not col_var.get(): messagebox.showwarning("警告", "请选择排序列") return try: ascending = (order_var.get() == "ascending") self.df = self.df.sort_values(by=col_var.get(), ascending=ascending) self.show_preview() sort_window.destroy() self.status_var.set(f"数据已按 {col_var.get()} {'升序' if ascending else '降序'} 排序") except Exception as e: messagebox.showerror("排序错误", f"排序失败:\n{str(e)}") ttk.Button(sort_window, text="应用排序", command=apply_sort).pack(pady=20) def filter_data(self): """数据筛选对话框 - 支持多条件筛选""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return filter_window = tk.Toplevel(self.root) filter_window.title("数据筛选") filter_window.geometry("700x600") main_frame = ttk.Frame(filter_window, padding=10) main_frame.pack(fill=tk.BOTH, expand=True) # 条件容器 conditions_frame = ttk.LabelFrame(main_frame, text="筛选条件", padding=10) conditions_frame.pack(fill=tk.BOTH, expand=True, pady=5) # 条件列表 self.filter_conditions = [] # 添加初始条件行 def add_condition_row(parent_frame, index=0): """添加单行筛选条件""" condition_frame = ttk.Frame(parent_frame) condition_frame.pack(fill=tk.X, pady=5) # 列选择 col_var = tk.StringVar() col_combo = ttk.Combobox(condition_frame, textvariable=col_var, width=15) col_combo['values'] = tuple(self.df.columns) col_combo.pack(side=tk.LEFT, padx=5) # 运算符选择 operator_var = tk.StringVar(value="==") operator_combo = ttk.Combobox(condition_frame, textvariable=operator_var, width=5) operator_combo['values'] = ("==", "!=", ">", ">=", "<", "<=", "包含", "不包含", "开头是", "结尾是") operator_combo.pack(side=tk.LEFT, padx=5) # 值输入 value_var = tk.StringVar() value_entry = ttk.Entry(condition_frame, textvariable=value_var, width=20) value_entry.pack(side=tk.LEFT, padx=5) # 删除按钮 def remove_condition(): condition_frame.destroy() self.filter_conditions.remove((col_var, operator_var, value_var)) remove_btn = ttk.Button(condition_frame, text="×", width=2, command=remove_condition) remove_btn.pack(side=tk.RIGHT, padx=5) # 逻辑关系选择(从第二行开始) if index > 0: logic_var = tk.StringVar(value="AND") logic_combo = ttk.Combobox(condition_frame, textvariable=logic_var, width=5) logic_combo['values'] = ("AND", "OR") logic_combo.pack(side=tk.LEFT, padx=5) return (col_var, operator_var, value_var, logic_var) else: return (col_var, operator_var, value_var, None) # 添加条件按钮 def add_condition(): new_condition = add_condition_row(conditions_frame, len(self.filter_conditions)) self.filter_conditions.append(new_condition) # 初始添加两行条件 self.filter_conditions.append(add_condition_row(conditions_frame)) self.filter_conditions.append(add_condition_row(conditions_frame, 1)) add_btn = ttk.Button(main_frame, text="添加条件", command=add_condition) add_btn.pack(anchor=tk.W, pady=5) # 预览区域 preview_frame = ttk.LabelFrame(main_frame, text="筛选结果预览", padding=10) preview_frame.pack(fill=tk.BOTH, expand=True, pady=5) preview_tree = ttk.Treeview(preview_frame, show="headings") preview_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) vsb = ttk.Scrollbar(preview_frame, orient="vertical", command=preview_tree.yview) vsb.pack(side=tk.RIGHT, fill=tk.Y) preview_tree.configure(yscrollcommand=vsb.set) # 更新预览 def update_preview(): """更新筛选结果预览""" try: # 清空现有预览 for item in preview_tree.get_children(): preview_tree.delete(item) # 如果没有条件,显示所有数据 if not any(cond[0].get() for cond in self.filter_conditions): filtered_df = self.df.head(100) # 限制预览行数 else: # 构建查询字符串 query_parts = [] for i, (col_var, operator_var, value_var, logic_var) in enumerate(self.filter_conditions): if not col_var.get() or not value_var.get(): continue col = col_var.get() operator = operator_var.get() value = value_var.get() # 处理数值类型 if self.df[col].dtype in (int, float): try: value = float(value) except ValueError: messagebox.showwarning("类型错误", f"列 '{col}' 是数值类型,请输入数字") return # 构建条件表达式 if operator in ("==", "!=", ">", ">=", "<", "<="): expr = f"`{col}` {operator} {repr(value)}" elif operator == "包含": expr = f"`{col}`.str.contains({repr(value)}, na=False)" elif operator == "不包含": expr = f"~`{col}`.str.contains({repr(value)}, na=False)" elif operator == "开头是": expr = f"`{col}`.str.startswith({repr(value)}, na=False)" elif operator == "结尾是": expr = f"`{col}`.str.endswith({repr(value)}, na=False)" # 添加逻辑关系(从第二个条件开始) if i > 0 and logic_var and logic_var.get(): expr = f" {logic_var.get()} {expr}" query_parts.append(expr) # 执行查询 query_str = "".join(query_parts) if query_str: filtered_df = self.df.query(query_str, engine='python').head(100) # 限制预览行数 else: filtered_df = self.df.head(100) # 更新预览表格 preview_tree["columns"] = list(filtered_df.columns) for col in filtered_df.columns: preview_tree.heading(col, text=col) preview_tree.column(col, width=100, anchor=tk.W) # 添加数据行 for _, row in filtered_df.iterrows(): preview_tree.insert("", tk.END, values=list(row)) # 更新状态 self.status_var.set(f"预览: {len(filtered_df)} 行 (显示前100条)") except Exception as e: messagebox.showerror("筛选错误", f"筛选条件错误:\n{str(e)}") # 应用筛选按钮 def apply_filter(): """应用筛选条件到主数据集""" try: # 构建查询字符串 query_parts = [] for i, (col_var, operator_var, value_var, logic_var) in enumerate(self.filter_conditions): if not col_var.get() or not value_var.get(): continue col = col_var.get() operator = operator_var.get() value = value_var.get() # 处理数值类型 if self.df[col].dtype in (int, float): try: value = float(value) except ValueError: messagebox.showwarning("类型错误", f"列 '{col}' 是数值类型,请输入数字") return # 构建条件表达式 if operator in ("==", "!=", ">", ">=", "<", "<="): expr = f"`{col}` {operator} {repr(value)}" elif operator == "包含": expr = f"`{col}`.str.contains({repr(value)}, na=False)" elif operator == "不包含": expr = f"~`{col}`.str.contains({repr(value)}, na=False)" elif operator == "开头是": expr = f"`{col}`.str.startswith({repr(value)}, na=False)" elif operator == "结尾是": expr = f"`{col}`.str.endswith({repr(value)}, na=False)" # 添加逻辑关系(从第二个条件开始) if i > 0 and logic_var and logic_var.get(): expr = f" {logic_var.get()} {expr}" query_parts.append(expr) # 执行查询 query_str = "".join(query_parts) if query_str: original_count = len(self.df) self.df = self.df.query(query_str, engine='python') new_count = len(self.df) # 更新显示 self.show_preview() filter_window.destroy() self.status_var.set(f"筛选完成 | 原始行数: {original_count} | 筛选后: {new_count}") messagebox.showinfo("筛选完成", f"数据筛选完成,保留 {new_count} 行数据") else: messagebox.showwarning("警告", "未设置有效筛选条件") except Exception as e: messagebox.showerror("筛选错误", f"应用筛选条件失败:\n{str(e)}") # 按钮区域 btn_frame = ttk.Frame(main_frame) btn_frame.pack(fill=tk.X, pady=10) ttk.Button(btn_frame, text="预览", command=update_preview).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="应用筛选", command=apply_filter).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="取消", command=filter_window.destroy).pack(side=tk.RIGHT, padx=5) # 初始预览 update_preview() def divide_data(self): """数据切分对话框 - 专门用于大坝渗流问题模型训练和测试""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return # 检查时间列是否存在 datetime_cols = [col for col in self.df.columns if pd.api.types.is_datetime64_any_dtype(self.df[col])] if not datetime_cols: messagebox.showwarning("警告", "未找到时间列,请先进行时间转换") return divide_window = tk.Toplevel(self.root) divide_window.title("大坝渗流数据切分") divide_window.geometry("600x550") main_frame = ttk.Frame(divide_window, padding=15) main_frame.pack(fill=tk.BOTH, expand=True) # 1. 时间列选择 time_frame = ttk.LabelFrame(main_frame, text="时间列选择", padding=10) time_frame.pack(fill=tk.X, pady=5) ttk.Label(time_frame, text="选择时间列:").pack(anchor=tk.W, pady=3) self.time_col_var = tk.StringVar(value=datetime_cols[0]) time_combo = ttk.Combobox(time_frame, textvariable=self.time_col_var, width=25) time_combo['values'] = tuple(datetime_cols) time_combo.pack(anchor=tk.W, fill=tk.X, pady=3) # 显示时间范围 time_range_frame = ttk.Frame(time_frame) time_range_frame.pack(fill=tk.X, pady=5) # 获取最小和最大时间 time_col = self.time_col_var.get() min_time = self.df[time_col].min().strftime('%Y-%m-%d %H:%M') max_time = self.df[time_col].max().strftime('%Y-%m-%d %H:%M') ttk.Label(time_range_frame, text="时间范围:").pack(side=tk.LEFT) ttk.Label(time_range_frame, text=f"{min_time} 至 {max_time}", font=("Arial", 9, "bold")).pack(side=tk.LEFT, padx=5) # 2. 切分方式选择 method_frame = ttk.LabelFrame(main_frame, text="切分方式", padding=10) method_frame.pack(fill=tk.X, pady=5) self.divide_method = tk.StringVar(value="ratio") # 按比例切分 ratio_frame = ttk.Frame(method_frame) ratio_frame.pack(fill=tk.X, pady=3) ttk.Radiobutton(ratio_frame, text="按比例切分", variable=self.divide_method, value="ratio").pack(side=tk.LEFT) ratio_subframe = ttk.Frame(ratio_frame) ratio_subframe.pack(side=tk.LEFT, padx=10) ttk.Label(ratio_subframe, text="训练集比例:").pack(side=tk.LEFT) self.train_ratio_var = tk.DoubleVar(value=0.8) ratio_spin = ttk.Spinbox(ratio_subframe, from_=0.1, to=0.9, increment=0.05, width=5, textvariable=self.train_ratio_var) ratio_spin.pack(side=tk.LEFT, padx=5) # 按时间点切分 date_frame = ttk.Frame(method_frame) date_frame.pack(fill=tk.X, pady=3) ttk.Radiobutton(date_frame, text="按时间点切分", variable=self.divide_method, value="date").pack(side=tk.LEFT) date_subframe = ttk.Frame(date_frame) date_subframe.pack(side=tk.LEFT, padx=10) ttk.Label(date_subframe, text="切分时间:").pack(side=tk.LEFT) # 使用日历控件选择日期 self.divide_date_var = tk.StringVar(value=min_time) date_entry = ttk.Entry(date_subframe, textvariable=self.divide_date_var, width=16) date_entry.pack(side=tk.LEFT, padx=5) # 3. 数据集选择 dataset_frame = ttk.LabelFrame(main_frame, text="选择数据集", padding=10) dataset_frame.pack(fill=tk.X, pady=5) self.dataset_var = tk.StringVar(value="train") ttk.Radiobutton(dataset_frame, text="训练集 (切分点之前)", variable=self.dataset_var, value="train").pack(anchor=tk.W) ttk.Radiobutton(dataset_frame, text="测试集 (切分点之后)", variable=self.dataset_var, value="test").pack(anchor=tk.W) ttk.Radiobutton(dataset_frame, text="全部数据 (仅划分,不切分)", variable=self.dataset_var, value="all").pack(anchor=tk.W) # 4. 添加切分标记选项 mark_frame = ttk.LabelFrame(main_frame, text="切分标记", padding=10) mark_frame.pack(fill=tk.X, pady=5) self.mark_division = tk.BooleanVar(value=True) ttk.Checkbutton(mark_frame, text="添加数据集标记列 (train/test)", variable=self.mark_division).pack(anchor=tk.W) # 5. 执行按钮 btn_frame = ttk.Frame(main_frame) btn_frame.pack(fill=tk.X, pady=10) def apply_division(): """应用数据切分""" try: time_col = self.time_col_var.get() # 确保数据按时间排序 self.df = self.df.sort_values(by=time_col) # 计算切分点 if self.divide_method.get() == "ratio": ratio = self.train_ratio_var.get() split_index = int(len(self.df) * ratio) split_time = self.df.iloc[split_index][time_col] else: # 按时间点切分 split_time = pd.to_datetime(self.divide_date_var.get()) split_index = self.df[self.df[time_col] <= split_time].index.max() + 1 # 添加数据集标记列 if self.mark_division.get(): self.df['dataset'] = 'train' self.df.loc[split_index:, 'dataset'] = 'test' # 根据选择获取数据集 dataset_choice = self.dataset_var.get() if dataset_choice == "train": self.df = self.df.iloc[:split_index] result_type = "训练集" elif dataset_choice == "test": self.df = self.df.iloc[split_index:] result_type = "测试集" else: result_type = "全部数据 (已添加标记)" # 更新显示 self.show_preview() divide_window.destroy() self.status_var.set(f"数据切分完成 | {result_type} | 切分点: {split_time.strftime('%Y-%m-%d %H:%M')}") messagebox.showinfo("切分完成", f"大坝渗流数据切分完成\n当前数据集: {result_type}") except Exception as e: messagebox.showerror("切分错误", f"数据切分失败:\n{str(e)}") def preview_division(): """预览切分点""" try: time_col = self.time_col_var.get() if self.divide_method.get() == "ratio": ratio = self.train_ratio_var.get() split_index = int(len(self.df) * ratio) split_time = self.df.iloc[split_index][time_col] else: split_time = pd.to_datetime(self.divide_date_var.get()) split_index = self.df[self.df[time_col] <= split_time].index.max() + 1 train_count = split_index test_count = len(self.df) - split_index messagebox.showinfo("切分预览", f"切分点时间: {split_time.strftime('%Y-%m-%d %H:%M')}\n" f"训练集数据量: {train_count} 行 ({train_count / len(self.df) * 100:.1f}%)\n" f"测试集数据量: {test_count} 行 ({test_count / len(self.df) * 100:.1f}%)") except Exception as e: messagebox.showerror("预览错误", f"切分点预览失败:\n{str(e)}") ttk.Button(btn_frame, text="预览切分点", command=preview_division).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="应用切分", command=apply_division).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="取消", command=divide_window.destroy).pack(side=tk.RIGHT, padx=5) def save_data(self): """保存处理结果""" if self.df is None or self.df.empty: messagebox.showwarning("警告", "没有可保存的数据") return save_path = filedialog.asksaveasfilename( defaultextension=".xlsx", filetypes=[("Excel文件", "*.xlsx"), ("CSV文件", "*.csv")] ) if not save_path: return try: if save_path.endswith('.xlsx'): self.df.to_excel(save_path, index=False) else: self.df.to_csv(save_path, index=False) self.status_var.set(f"文件已保存至: {os.path.basename(save_path)}") messagebox.showinfo("保存成功", f"文件已成功保存至:\n{save_path}") except Exception as e: messagebox.showerror("保存错误", f"保存文件失败:\n{str(e)}") 创建并运行程序 if name == “main”: root = tk.Tk() app = ExcelProcessor(root) root.mainloop() 我是想让你设计成这样的一个窗口格式可以选择excel文件当作训练集和测试集
07-17
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值