import tkinter as tk
from tkinter import ttk, filedialog, messagebox, scrolledtext
import pandas as pd
import numpy as np
import os
import re
class ExcelProcessor:
def init(self, root):
self.root = root
self.root.title(“Excel数据预处理和切分”)
self.root.geometry(“1000x600”)
self.df = None
self.create_widgets()
def create_widgets(self): # 顶部控制区域 control_frame = ttk.Frame(self.root, padding=10) control_frame.pack(fill=tk.X) # 文件选择按钮 ttk.Button(control_frame, text="选择Excel文件", command=self.load_excel).grid(row=0, column=0, padx=5) # 处理方式选择 self.process_var = tk.StringVar(value="点击选择") ttk.Label(control_frame, text="处理方式:").grid(row=0, column=1, padx=5) process_combo = ttk.Combobox(control_frame, textvariable=self.process_var, width=15) process_combo['values'] = ( '预处理','数据排序','数据筛选','数据切分','保存结果') process_combo.grid(row=0, column=2, padx=5) # 执行按钮 ttk.Button(control_frame, text="执行", command=self.process_data).grid(row=0, column=3, padx=5) # 数据展示区域 self.notebook = ttk.Notebook(self.root) self.notebook.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 预览标签页 self.preview_frame = ttk.Frame(self.notebook) self.notebook.add(self.preview_frame, text="数据展示") # 状态栏 self.status_var = tk.StringVar(value="就绪") ttk.Label(self.root, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W).pack(fill=tk.X, side=tk.BOTTOM) def load_excel(self): """加载Excel文件""" file_path = filedialog.askopenfilename( title="选择Excel文件", filetypes=[("Excel文件", "*.xlsx *.xls"), ("所有文件", "*.*")] ) if not file_path: return try: self.status_var.set(f"正在加载: {os.path.basename(file_path)}...") self.root.update() # 更新界面显示状态 # 读取Excel文件 self.df = pd.read_excel(file_path) # 显示预览数据 self.show_preview() self.status_var.set( f"已加载: {os.path.basename(file_path)} | 行数: {len(self.df)} | 列数: {len(self.df.columns)}") except Exception as e: messagebox.showerror("加载错误", f"无法读取Excel文件:\n{str(e)}") self.status_var.set("加载失败") def preprocess_data(self): """数据预处理对话框 - 整合全部预处理功能""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return preprocess_window = tk.Toplevel(self.root) preprocess_window.title("数据预处理") preprocess_window.geometry("650x800") main_frame = ttk.Frame(preprocess_window) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # ================ 新增:功能启用复选框 ================ enable_frame = ttk.LabelFrame(main_frame, text="启用功能", padding=10) enable_frame.pack(fill=tk.X, pady=5) # 创建启用变量 self.enable_missing = tk.BooleanVar(value=True) self.enable_outlier = tk.BooleanVar(value=True) self.enable_datetime = tk.BooleanVar(value=True) self.enable_lag = tk.BooleanVar(value=True) ttk.Checkbutton(enable_frame, text="执行缺失值处理", variable=self.enable_missing).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行异常值处理", variable=self.enable_outlier).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="执行时间列转换", variable=self.enable_datetime).pack(anchor=tk.W) ttk.Checkbutton(enable_frame, text="添加滞后特征", variable=self.enable_lag).pack(anchor=tk.W) # ================================================= # 1. 缺失值处理部分 missing_frame = ttk.LabelFrame(main_frame, text="缺失值处理", padding=10) missing_frame.pack(fill=tk.X, pady=5) # 缺失值统计显示 missing_stats = self.df.isnull().sum() missing_text = scrolledtext.ScrolledText(missing_frame, height=4) missing_text.pack(fill=tk.X) for col, count in missing_stats.items(): if count > 0: missing_text.insert(tk.END, f"{col}: {count}个缺失值\n") missing_text.config(state=tk.DISABLED) # 缺失值处理方法选择 ttk.Label(missing_frame, text="处理方法:").pack(anchor=tk.W) missing_method_var = tk.StringVar(value="fill") missing_method_frame = ttk.Frame(missing_frame) missing_method_frame.pack(fill=tk.X, pady=5) ttk.Radiobutton(missing_method_frame, text="删除缺失行", variable=missing_method_var, value="drop").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="固定值填充", variable=missing_method_var, value="fill").pack( side=tk.LEFT) ttk.Radiobutton(missing_method_frame, text="插值法", variable=missing_method_var, value="interpolate").pack( side=tk.LEFT) # 填充选项 fill_options_frame = ttk.Frame(missing_frame) fill_options_frame.pack(fill=tk.X, pady=5) ttk.Label(fill_options_frame, text="填充值:").pack(side=tk.LEFT) fill_value_entry = ttk.Entry(fill_options_frame, width=10) fill_value_entry.pack(side=tk.LEFT, padx=5) fill_value_entry.insert(0, "0") ttk.Label(fill_options_frame, text="或选择:").pack(side=tk.LEFT, padx=5) fill_type_var = tk.StringVar(value="fixed") ttk.Radiobutton(fill_options_frame, text="前值填充", variable=fill_type_var, value="ffill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="后值填充", variable=fill_type_var, value="bfill").pack(side=tk.LEFT) ttk.Radiobutton(fill_options_frame, text="均值填充", variable=fill_type_var, value="mean").pack(side=tk.LEFT) # 2. 异常值处理部分 outlier_frame = ttk.LabelFrame(main_frame, text="异常值处理", padding=10) outlier_frame.pack(fill=tk.X, pady=5) # 异常值检测方法 ttk.Label(outlier_frame, text="检测方法:").pack(anchor=tk.W) outlier_method_var = tk.StringVar(value="3sigma") outlier_method_frame = ttk.Frame(outlier_frame) outlier_method_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_method_frame, text="3σ原则", variable=outlier_method_var, value="3sigma").pack( side=tk.LEFT) ttk.Radiobutton(outlier_method_frame, text="IQR方法", variable=outlier_method_var, value="iqr").pack( side=tk.LEFT) # 异常值处理方式 ttk.Label(outlier_frame, text="处理方式:").pack(anchor=tk.W) outlier_action_var = tk.StringVar(value="remove") outlier_action_frame = ttk.Frame(outlier_frame) outlier_action_frame.pack(fill=tk.X) ttk.Radiobutton(outlier_action_frame, text="删除", variable=outlier_action_var, value="remove").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用中位数替换", variable=outlier_action_var, value="median").pack( side=tk.LEFT) ttk.Radiobutton(outlier_action_frame, text="用前后均值替换", variable=outlier_action_var, value="neighbor").pack(side=tk.LEFT) # 3. 数据类型转换部分 type_frame = ttk.LabelFrame(main_frame, text="数据类型转换", padding=10) type_frame.pack(fill=tk.X, pady=5) # 时间列转换 ttk.Label(type_frame, text="时间列转换:").pack(anchor=tk.W) time_col_var = tk.StringVar() time_col_combo = ttk.Combobox(type_frame, textvariable=time_col_var, width=20) time_col_combo['values'] = tuple(self.df.columns) time_col_combo.pack(anchor=tk.W, pady=5) # === 新增:时间单位选择 === time_units_frame = ttk.Frame(type_frame) time_units_frame.pack(fill=tk.X, pady=5) ttk.Label(time_units_frame, text="提取时间单位:").pack(side=tk.LEFT) # 创建时间单位变量 self.extract_year = tk.BooleanVar(value=True) self.extract_month = tk.BooleanVar(value=True) self.extract_day = tk.BooleanVar(value=True) self.extract_hour = tk.BooleanVar(value=False) self.extract_minute = tk.BooleanVar(value=False) self.extract_second = tk.BooleanVar(value=False) # 添加复选框 ttk.Checkbutton(time_units_frame, text="年", variable=self.extract_year).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="月", variable=self.extract_month).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="日", variable=self.extract_day).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="时", variable=self.extract_hour).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="分", variable=self.extract_minute).pack(side=tk.LEFT, padx=5) ttk.Checkbutton(time_units_frame, text="秒", variable=self.extract_second).pack(side=tk.LEFT, padx=5) # === 修改时间转换逻辑 === if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: # 统一处理不同日期格式 self.df[time_col] = self.df[time_col].apply( lambda x: pd.to_datetime(x, errors='coerce', format='mixed') ) # 强制显示完整时间格式 pd.set_option('display.datetime_format', '%Y-%m-%d %H:%M:%S') # 根据用户选择提取时间单位 if self.extract_year.get(): self.df['year'] = self.df[time_col].dt.year if self.extract_month.get(): self.df['month'] = self.df[time_col].dt.month if self.extract_day.get(): self.df['day'] = self.df[time_col].dt.day if self.extract_hour.get(): self.df['hour'] = self.df[time_col].dt.hour if self.extract_minute.get(): self.df['minute'] = self.df[time_col].dt.minute if self.extract_second.get(): self.df['second'] = self.df[time_col].dt.second # 新增:确保时间部分显示 self.df['full_datetime'] = self.df[time_col].dt.strftime('%Y-%m-%d %H:%M:%S') # 时间周期特征 if self.extract_hour.get() or self.extract_minute.get(): self.df['time_of_day'] = self.df[time_col].dt.hour + self.df[time_col].dt.minute / 60.0 if self.extract_second.get(): self.df['time_of_day'] += self.df[time_col].dt.second / 3600.0 except Exception as e: messagebox.showerror("时间转换错误", f"时间列转换失败: {str(e)}") # 4. 特征工程部分 feature_frame = ttk.LabelFrame(main_frame, text="特征工程", padding=10) feature_frame.pack(fill=tk.X, pady=5) # 添加滞后特征 ttk.Label(feature_frame, text="滞后特征:").pack(anchor=tk.W) lag_frame = ttk.Frame(feature_frame) lag_frame.pack(fill=tk.X) ttk.Label(lag_frame, text="选择列:").pack(side=tk.LEFT) lag_col_var = tk.StringVar() lag_col_combo = ttk.Combobox(lag_frame, textvariable=lag_col_var, width=15) lag_col_combo['values'] = tuple(self.df.select_dtypes(include=['number']).columns) lag_col_combo.pack(side=tk.LEFT, padx=5) ttk.Label(lag_frame, text="滞后步数:").pack(side=tk.LEFT) lag_steps_entry = ttk.Entry(lag_frame, width=5) lag_steps_entry.pack(side=tk.LEFT) lag_steps_entry.insert(0, "1") # 执行预处理按钮 def apply_preprocessing(): try: original_shape = self.df.shape # 1. 处理缺失值 (如果启用) if self.enable_missing.get(): missing_method = missing_method_var.get() if missing_method == "drop": self.df = self.df.dropna() elif missing_method == "fill": fill_type = fill_type_var.get() if fill_type == "fixed": fill_value = fill_value_entry.get() self.df = self.df.fillna( float(fill_value) if self.df.select_dtypes(include=['number']).shape[ 1] > 0 else fill_value) elif fill_type == "ffill": self.df = self.df.ffill() elif fill_type == "bfill": self.df = self.df.bfill() elif fill_type == "mean": self.df = self.df.fillna(self.df.mean()) elif missing_method == "interpolate": self.df = self.df.interpolate() # 2. 处理异常值 (如果启用) if self.enable_outlier.get(): outlier_method = outlier_method_var.get() outlier_action = outlier_action_var.get() numeric_cols = self.df.select_dtypes(include=['number']).columns for col in numeric_cols: if outlier_method == "3sigma": mean, std = self.df[col].mean(), self.df[col].std() lower, upper = mean - 3 * std, mean + 3 * std else: # iqr q1, q3 = self.df[col].quantile(0.25), self.df[col].quantile(0.75) iqr = q3 - q1 lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr if outlier_action == "remove": self.df = self.df[(self.df[col] >= lower) & (self.df[col] <= upper)] elif outlier_action == "median": self.df.loc[(self.df[col] < lower) | (self.df[col] > upper), col] = self.df[col].median() elif outlier_action == "neighbor": mask = (self.df[col] < lower) | (self.df[col] > upper) self.df.loc[mask, col] = self.df[col].rolling(2, min_periods=1).mean()[mask] # 3. 时间列转换 (如果启用) if self.enable_datetime.get(): time_col = time_col_var.get() if time_col and time_col in self.df.columns: try: self.df[time_col] = pd.to_datetime(self.df[time_col]) self.df['year'] = self.df[time_col].dt.year self.df['month'] = self.df[time_col].dt.month self.df['day'] = self.df[time_col].dt.day except Exception as e: messagebox.showwarning("时间转换警告", f"时间列转换失败: {str(e)}") # 4. 添加滞后特征 (如果启用) if self.enable_lag.get(): lag_col = lag_col_var.get() if lag_col and lag_col in self.df.columns: try: lag_steps = int(lag_steps_entry.get()) self.df[f'{lag_col}_lag{lag_steps}'] = self.df[lag_col].shift(lag_steps) except Exception as e: messagebox.showwarning("滞后特征警告", f"创建滞后特征失败: {str(e)}") # ========================================================= # 更新显示 self.show_preview() preprocess_window.destroy() new_shape = self.df.shape self.status_var.set(f"预处理完成 | 原形状: {original_shape} | 新形状: {new_shape}") except Exception as e: messagebox.showerror("预处理错误", f"预处理过程中发生错误:\n{str(e)}") ttk.Button(main_frame, text="执行预处理", command=apply_preprocessing).pack(pady=10) def show_preview(self): """在表格中分页显示数据预览""" # 清除旧内容 for widget in self.preview_frame.winfo_children(): widget.destroy() # 创建主容器 container = ttk.Frame(self.preview_frame) container.pack(fill=tk.BOTH, expand=True) # 创建表格 columns = list(self.df.columns) self.tree = ttk.Treeview(container, columns=columns, show="headings") # 添加列标题 for col in columns: self.tree.heading(col, text=col) self.tree.column(col, width=100, anchor=tk.W) # 添加滚动条 scrollbar = ttk.Scrollbar(container, orient=tk.VERTICAL, command=self.tree.yview) self.tree.configure(yscroll=scrollbar.set) # 布局表格和滚动条 self.tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) # 创建分页控制面板 pagination_frame = ttk.Frame(self.preview_frame) pagination_frame.pack(fill=tk.X, pady=5) # 分页参数 self.current_page = 1 self.rows_per_page = 1000 # 每页显示的行数 self.total_pages = max(1, (len(self.df) + self.rows_per_page - 1) // self.rows_per_page) # 分页标签 self.page_label = ttk.Label(pagination_frame, text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_label.pack(side=tk.LEFT, padx=10) # 分页按钮 ttk.Button(pagination_frame, text="首页", command=lambda: self.change_page(1)).pack(side=tk.LEFT) ttk.Button(pagination_frame, text="上一页", command=lambda: self.change_page(self.current_page - 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="下一页", command=lambda: self.change_page(self.current_page + 1)).pack( side=tk.LEFT) ttk.Button(pagination_frame, text="末页", command=lambda: self.change_page(self.total_pages)).pack(side=tk.LEFT) # 跳转输入框 ttk.Label(pagination_frame, text="跳转到:").pack(side=tk.LEFT, padx=(10, 0)) self.page_entry = ttk.Entry(pagination_frame, width=5) self.page_entry.pack(side=tk.LEFT) ttk.Button(pagination_frame, text="跳转", command=self.jump_to_page).pack(side=tk.LEFT, padx=(5, 10)) # 显示第一页数据 self.load_page_data() def load_page_data(self): """加载当前页的数据""" # 清空现有数据 for item in self.tree.get_children(): self.tree.delete(item) # 计算起始和结束索引 start_idx = (self.current_page - 1) * self.rows_per_page end_idx = start_idx + self.rows_per_page # 添加当前页的数据行 for i, row in self.df.iloc[start_idx:end_idx].iterrows(): self.tree.insert("", tk.END, values=list(row)) # 更新分页标签 self.page_label.config(text=f"第 {self.current_page} 页 / 共 {self.total_pages} 页") self.page_entry.delete(0, tk.END) self.page_entry.insert(0, str(self.current_page)) def change_page(self, new_page): """切换页面""" # 确保新页码在有效范围内 new_page = max(1, min(new_page, self.total_pages)) if new_page != self.current_page: self.current_page = new_page self.load_page_data() def jump_to_page(self): """跳转到指定页码""" try: page_num = int(self.page_entry.get()) self.change_page(page_num) except ValueError: messagebox.showerror("错误", "请输入有效的页码数字") def process_data(self): """根据选择的处理方式处理数据""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return process_type = self.process_var.get() if process_type == "数据排序": self.sort_data() elif process_type == "保存结果": self.save_data() elif process_type == "预处理": self.preprocess_data() elif process_type == "数据切分": self.divide_data() elif process_type == "数据筛选": self.filter_data() def sort_data(self): """数据排序对话框""" sort_window = tk.Toplevel(self.root) sort_window.title("数据排序") sort_window.geometry("400x300") ttk.Label(sort_window, text="选择排序列:").pack(pady=10) # 列选择 col_var = tk.StringVar() col_combo = ttk.Combobox(sort_window, textvariable=col_var, width=20) col_combo['values'] = tuple(self.df.columns) col_combo.pack(pady=5) # 排序方式 ttk.Label(sort_window, text="排序方式:").pack(pady=10) order_var = tk.StringVar(value="ascending") ttk.Radiobutton(sort_window, text="升序", variable=order_var, value="ascending").pack() ttk.Radiobutton(sort_window, text="降序", variable=order_var, value="descending").pack() def apply_sort(): if not col_var.get(): messagebox.showwarning("警告", "请选择排序列") return try: ascending = (order_var.get() == "ascending") self.df = self.df.sort_values(by=col_var.get(), ascending=ascending) self.show_preview() sort_window.destroy() self.status_var.set(f"数据已按 {col_var.get()} {'升序' if ascending else '降序'} 排序") except Exception as e: messagebox.showerror("排序错误", f"排序失败:\n{str(e)}") ttk.Button(sort_window, text="应用排序", command=apply_sort).pack(pady=20) def filter_data(self): """数据筛选对话框 - 支持多条件筛选""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return filter_window = tk.Toplevel(self.root) filter_window.title("数据筛选") filter_window.geometry("700x600") main_frame = ttk.Frame(filter_window, padding=10) main_frame.pack(fill=tk.BOTH, expand=True) # 条件容器 conditions_frame = ttk.LabelFrame(main_frame, text="筛选条件", padding=10) conditions_frame.pack(fill=tk.BOTH, expand=True, pady=5) # 条件列表 self.filter_conditions = [] # 添加初始条件行 def add_condition_row(parent_frame, index=0): """添加单行筛选条件""" condition_frame = ttk.Frame(parent_frame) condition_frame.pack(fill=tk.X, pady=5) # 列选择 col_var = tk.StringVar() col_combo = ttk.Combobox(condition_frame, textvariable=col_var, width=15) col_combo['values'] = tuple(self.df.columns) col_combo.pack(side=tk.LEFT, padx=5) # 运算符选择 operator_var = tk.StringVar(value="==") operator_combo = ttk.Combobox(condition_frame, textvariable=operator_var, width=5) operator_combo['values'] = ("==", "!=", ">", ">=", "<", "<=", "包含", "不包含", "开头是", "结尾是") operator_combo.pack(side=tk.LEFT, padx=5) # 值输入 value_var = tk.StringVar() value_entry = ttk.Entry(condition_frame, textvariable=value_var, width=20) value_entry.pack(side=tk.LEFT, padx=5) # 删除按钮 def remove_condition(): condition_frame.destroy() self.filter_conditions.remove((col_var, operator_var, value_var)) remove_btn = ttk.Button(condition_frame, text="×", width=2, command=remove_condition) remove_btn.pack(side=tk.RIGHT, padx=5) # 逻辑关系选择(从第二行开始) if index > 0: logic_var = tk.StringVar(value="AND") logic_combo = ttk.Combobox(condition_frame, textvariable=logic_var, width=5) logic_combo['values'] = ("AND", "OR") logic_combo.pack(side=tk.LEFT, padx=5) return (col_var, operator_var, value_var, logic_var) else: return (col_var, operator_var, value_var, None) # 添加条件按钮 def add_condition(): new_condition = add_condition_row(conditions_frame, len(self.filter_conditions)) self.filter_conditions.append(new_condition) # 初始添加两行条件 self.filter_conditions.append(add_condition_row(conditions_frame)) self.filter_conditions.append(add_condition_row(conditions_frame, 1)) add_btn = ttk.Button(main_frame, text="添加条件", command=add_condition) add_btn.pack(anchor=tk.W, pady=5) # 预览区域 preview_frame = ttk.LabelFrame(main_frame, text="筛选结果预览", padding=10) preview_frame.pack(fill=tk.BOTH, expand=True, pady=5) preview_tree = ttk.Treeview(preview_frame, show="headings") preview_tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) vsb = ttk.Scrollbar(preview_frame, orient="vertical", command=preview_tree.yview) vsb.pack(side=tk.RIGHT, fill=tk.Y) preview_tree.configure(yscrollcommand=vsb.set) # 更新预览 def update_preview(): """更新筛选结果预览""" try: # 清空现有预览 for item in preview_tree.get_children(): preview_tree.delete(item) # 如果没有条件,显示所有数据 if not any(cond[0].get() for cond in self.filter_conditions): filtered_df = self.df.head(100) # 限制预览行数 else: # 构建查询字符串 query_parts = [] for i, (col_var, operator_var, value_var, logic_var) in enumerate(self.filter_conditions): if not col_var.get() or not value_var.get(): continue col = col_var.get() operator = operator_var.get() value = value_var.get() # 处理数值类型 if self.df[col].dtype in (int, float): try: value = float(value) except ValueError: messagebox.showwarning("类型错误", f"列 '{col}' 是数值类型,请输入数字") return # 构建条件表达式 if operator in ("==", "!=", ">", ">=", "<", "<="): expr = f"`{col}` {operator} {repr(value)}" elif operator == "包含": expr = f"`{col}`.str.contains({repr(value)}, na=False)" elif operator == "不包含": expr = f"~`{col}`.str.contains({repr(value)}, na=False)" elif operator == "开头是": expr = f"`{col}`.str.startswith({repr(value)}, na=False)" elif operator == "结尾是": expr = f"`{col}`.str.endswith({repr(value)}, na=False)" # 添加逻辑关系(从第二个条件开始) if i > 0 and logic_var and logic_var.get(): expr = f" {logic_var.get()} {expr}" query_parts.append(expr) # 执行查询 query_str = "".join(query_parts) if query_str: filtered_df = self.df.query(query_str, engine='python').head(100) # 限制预览行数 else: filtered_df = self.df.head(100) # 更新预览表格 preview_tree["columns"] = list(filtered_df.columns) for col in filtered_df.columns: preview_tree.heading(col, text=col) preview_tree.column(col, width=100, anchor=tk.W) # 添加数据行 for _, row in filtered_df.iterrows(): preview_tree.insert("", tk.END, values=list(row)) # 更新状态 self.status_var.set(f"预览: {len(filtered_df)} 行 (显示前100条)") except Exception as e: messagebox.showerror("筛选错误", f"筛选条件错误:\n{str(e)}") # 应用筛选按钮 def apply_filter(): """应用筛选条件到主数据集""" try: # 构建查询字符串 query_parts = [] for i, (col_var, operator_var, value_var, logic_var) in enumerate(self.filter_conditions): if not col_var.get() or not value_var.get(): continue col = col_var.get() operator = operator_var.get() value = value_var.get() # 处理数值类型 if self.df[col].dtype in (int, float): try: value = float(value) except ValueError: messagebox.showwarning("类型错误", f"列 '{col}' 是数值类型,请输入数字") return # 构建条件表达式 if operator in ("==", "!=", ">", ">=", "<", "<="): expr = f"`{col}` {operator} {repr(value)}" elif operator == "包含": expr = f"`{col}`.str.contains({repr(value)}, na=False)" elif operator == "不包含": expr = f"~`{col}`.str.contains({repr(value)}, na=False)" elif operator == "开头是": expr = f"`{col}`.str.startswith({repr(value)}, na=False)" elif operator == "结尾是": expr = f"`{col}`.str.endswith({repr(value)}, na=False)" # 添加逻辑关系(从第二个条件开始) if i > 0 and logic_var and logic_var.get(): expr = f" {logic_var.get()} {expr}" query_parts.append(expr) # 执行查询 query_str = "".join(query_parts) if query_str: original_count = len(self.df) self.df = self.df.query(query_str, engine='python') new_count = len(self.df) # 更新显示 self.show_preview() filter_window.destroy() self.status_var.set(f"筛选完成 | 原始行数: {original_count} | 筛选后: {new_count}") messagebox.showinfo("筛选完成", f"数据筛选完成,保留 {new_count} 行数据") else: messagebox.showwarning("警告", "未设置有效筛选条件") except Exception as e: messagebox.showerror("筛选错误", f"应用筛选条件失败:\n{str(e)}") # 按钮区域 btn_frame = ttk.Frame(main_frame) btn_frame.pack(fill=tk.X, pady=10) ttk.Button(btn_frame, text="预览", command=update_preview).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="应用筛选", command=apply_filter).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="取消", command=filter_window.destroy).pack(side=tk.RIGHT, padx=5) # 初始预览 update_preview() def divide_data(self): """数据切分对话框 - 专门用于大坝渗流问题模型训练和测试""" if self.df is None: messagebox.showwarning("警告", "请先选择Excel文件") return # 检查时间列是否存在 datetime_cols = [col for col in self.df.columns if pd.api.types.is_datetime64_any_dtype(self.df[col])] if not datetime_cols: messagebox.showwarning("警告", "未找到时间列,请先进行时间转换") return divide_window = tk.Toplevel(self.root) divide_window.title("大坝渗流数据切分") divide_window.geometry("600x550") main_frame = ttk.Frame(divide_window, padding=15) main_frame.pack(fill=tk.BOTH, expand=True) # 1. 时间列选择 time_frame = ttk.LabelFrame(main_frame, text="时间列选择", padding=10) time_frame.pack(fill=tk.X, pady=5) ttk.Label(time_frame, text="选择时间列:").pack(anchor=tk.W, pady=3) self.time_col_var = tk.StringVar(value=datetime_cols[0]) time_combo = ttk.Combobox(time_frame, textvariable=self.time_col_var, width=25) time_combo['values'] = tuple(datetime_cols) time_combo.pack(anchor=tk.W, fill=tk.X, pady=3) # 显示时间范围 time_range_frame = ttk.Frame(time_frame) time_range_frame.pack(fill=tk.X, pady=5) # 获取最小和最大时间 time_col = self.time_col_var.get() min_time = self.df[time_col].min().strftime('%Y-%m-%d %H:%M') max_time = self.df[time_col].max().strftime('%Y-%m-%d %H:%M') ttk.Label(time_range_frame, text="时间范围:").pack(side=tk.LEFT) ttk.Label(time_range_frame, text=f"{min_time} 至 {max_time}", font=("Arial", 9, "bold")).pack(side=tk.LEFT, padx=5) # 2. 切分方式选择 method_frame = ttk.LabelFrame(main_frame, text="切分方式", padding=10) method_frame.pack(fill=tk.X, pady=5) self.divide_method = tk.StringVar(value="ratio") # 按比例切分 ratio_frame = ttk.Frame(method_frame) ratio_frame.pack(fill=tk.X, pady=3) ttk.Radiobutton(ratio_frame, text="按比例切分", variable=self.divide_method, value="ratio").pack(side=tk.LEFT) ratio_subframe = ttk.Frame(ratio_frame) ratio_subframe.pack(side=tk.LEFT, padx=10) ttk.Label(ratio_subframe, text="训练集比例:").pack(side=tk.LEFT) self.train_ratio_var = tk.DoubleVar(value=0.8) ratio_spin = ttk.Spinbox(ratio_subframe, from_=0.1, to=0.9, increment=0.05, width=5, textvariable=self.train_ratio_var) ratio_spin.pack(side=tk.LEFT, padx=5) # 按时间点切分 date_frame = ttk.Frame(method_frame) date_frame.pack(fill=tk.X, pady=3) ttk.Radiobutton(date_frame, text="按时间点切分", variable=self.divide_method, value="date").pack(side=tk.LEFT) date_subframe = ttk.Frame(date_frame) date_subframe.pack(side=tk.LEFT, padx=10) ttk.Label(date_subframe, text="切分时间:").pack(side=tk.LEFT) # 使用日历控件选择日期 self.divide_date_var = tk.StringVar(value=min_time) date_entry = ttk.Entry(date_subframe, textvariable=self.divide_date_var, width=16) date_entry.pack(side=tk.LEFT, padx=5) # 3. 数据集选择 dataset_frame = ttk.LabelFrame(main_frame, text="选择数据集", padding=10) dataset_frame.pack(fill=tk.X, pady=5) self.dataset_var = tk.StringVar(value="train") ttk.Radiobutton(dataset_frame, text="训练集 (切分点之前)", variable=self.dataset_var, value="train").pack(anchor=tk.W) ttk.Radiobutton(dataset_frame, text="测试集 (切分点之后)", variable=self.dataset_var, value="test").pack(anchor=tk.W) ttk.Radiobutton(dataset_frame, text="全部数据 (仅划分,不切分)", variable=self.dataset_var, value="all").pack(anchor=tk.W) # 4. 添加切分标记选项 mark_frame = ttk.LabelFrame(main_frame, text="切分标记", padding=10) mark_frame.pack(fill=tk.X, pady=5) self.mark_division = tk.BooleanVar(value=True) ttk.Checkbutton(mark_frame, text="添加数据集标记列 (train/test)", variable=self.mark_division).pack(anchor=tk.W) # 5. 执行按钮 btn_frame = ttk.Frame(main_frame) btn_frame.pack(fill=tk.X, pady=10) def apply_division(): """应用数据切分""" try: time_col = self.time_col_var.get() # 确保数据按时间排序 self.df = self.df.sort_values(by=time_col) # 计算切分点 if self.divide_method.get() == "ratio": ratio = self.train_ratio_var.get() split_index = int(len(self.df) * ratio) split_time = self.df.iloc[split_index][time_col] else: # 按时间点切分 split_time = pd.to_datetime(self.divide_date_var.get()) split_index = self.df[self.df[time_col] <= split_time].index.max() + 1 # 添加数据集标记列 if self.mark_division.get(): self.df['dataset'] = 'train' self.df.loc[split_index:, 'dataset'] = 'test' # 根据选择获取数据集 dataset_choice = self.dataset_var.get() if dataset_choice == "train": self.df = self.df.iloc[:split_index] result_type = "训练集" elif dataset_choice == "test": self.df = self.df.iloc[split_index:] result_type = "测试集" else: result_type = "全部数据 (已添加标记)" # 更新显示 self.show_preview() divide_window.destroy() self.status_var.set(f"数据切分完成 | {result_type} | 切分点: {split_time.strftime('%Y-%m-%d %H:%M')}") messagebox.showinfo("切分完成", f"大坝渗流数据切分完成\n当前数据集: {result_type}") except Exception as e: messagebox.showerror("切分错误", f"数据切分失败:\n{str(e)}") def preview_division(): """预览切分点""" try: time_col = self.time_col_var.get() if self.divide_method.get() == "ratio": ratio = self.train_ratio_var.get() split_index = int(len(self.df) * ratio) split_time = self.df.iloc[split_index][time_col] else: split_time = pd.to_datetime(self.divide_date_var.get()) split_index = self.df[self.df[time_col] <= split_time].index.max() + 1 train_count = split_index test_count = len(self.df) - split_index messagebox.showinfo("切分预览", f"切分点时间: {split_time.strftime('%Y-%m-%d %H:%M')}\n" f"训练集数据量: {train_count} 行 ({train_count / len(self.df) * 100:.1f}%)\n" f"测试集数据量: {test_count} 行 ({test_count / len(self.df) * 100:.1f}%)") except Exception as e: messagebox.showerror("预览错误", f"切分点预览失败:\n{str(e)}") ttk.Button(btn_frame, text="预览切分点", command=preview_division).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="应用切分", command=apply_division).pack(side=tk.LEFT, padx=5) ttk.Button(btn_frame, text="取消", command=divide_window.destroy).pack(side=tk.RIGHT, padx=5) def save_data(self): """保存处理结果""" if self.df is None or self.df.empty: messagebox.showwarning("警告", "没有可保存的数据") return save_path = filedialog.asksaveasfilename( defaultextension=".xlsx", filetypes=[("Excel文件", "*.xlsx"), ("CSV文件", "*.csv")] ) if not save_path: return try: if save_path.endswith('.xlsx'): self.df.to_excel(save_path, index=False) else: self.df.to_csv(save_path, index=False) self.status_var.set(f"文件已保存至: {os.path.basename(save_path)}") messagebox.showinfo("保存成功", f"文件已成功保存至:\n{save_path}") except Exception as e: messagebox.showerror("保存错误", f"保存文件失败:\n{str(e)}")
创建并运行程序
if name == “main”:
root = tk.Tk()
app = ExcelProcessor(root)
root.mainloop()
我是想让你设计成这样的一个窗口格式可以选择excel文件当作训练集和测试集
最新发布