comboBox1.SelectedText为何是空值?

本文介绍了一种从ComboBox中获取选中项的方法,并提供了两种有效方式:通过 comboBox1.Text 获取文本内容,或者使用 comboBox1.SelectedItem as string 来获取选中项。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

这种方式总是空   
MessageBox.Show(comboBox1.SelectedText);  

 可采用下面这两种
            string s = comboBox1.Text;
            string s = comboBox1.SelectedItem as string;
import os import pandas as pd import tkinter as tk from tkinter import ttk, filedialog, messagebox, scrolledtext from openpyxl import load_workbook from openpyxl.styles import PatternFill import matplotlib.pyplot as plt from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg import numpy as np from PIL import Image, ImageTk import seaborn as sns import csv import chardet class TableAnalyzer: def __init__(self, root): self.root = root self.root.title("智能表格分析工具") self.root.geometry("1300x850") self.root.configure(bg='#f0f2f5') # 设置图标 try: self.root.iconbitmap('table_icon.ico') except: pass # 创建主框架 self.main_frame = ttk.Frame(root, padding=10) self.main_frame.pack(fill=tk.BOTH, expand=True) # 创建标题 title_label = ttk.Label(self.main_frame, text="智能表格分析工具", font=("Arial", 16, "bold"), foreground="#2c3e50") title_label.pack(pady=(0, 15)) # 创建文件选择区域 self.create_file_selection_section() # 创建表头检测区域 self.create_header_detection_section() # 创建分析结果区域 self.create_analysis_section() # 创建图表区域 self.create_chart_section() # 初始化变量 self.file_path = "" self.df = None self.workbook = None self.sheet = None self.header_row = 0 # 默认表头在第0行 self.raw_data = None # 存储原始数据用于预览 def create_file_selection_section(self): # 文件选择框架 file_frame = ttk.LabelFrame(self.main_frame, text="文件选择") file_frame.pack(fill=tk.X, padx=5, pady=5) # 文件路径输入框 path_frame = ttk.Frame(file_frame) path_frame.pack(fill=tk.X, padx=10, pady=10) self.path_var = tk.StringVar() path_entry = ttk.Entry(path_frame, textvariable=self.path_var, width=70, state='readonly') path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10)) # 浏览按钮 browse_btn = ttk.Button(path_frame, text="浏览文件", command=self.browse_file) browse_btn.pack(side=tk.RIGHT) # 文件类型提示 type_label = ttk.Label(file_frame, text="支持文件类型: Excel (.xlsx, .xls), CSV (.csv)", foreground="#7f8c8d", font=("Arial", 9)) type_label.pack(side=tk.LEFT, padx=10, pady=(0, 5)) def create_header_detection_section(self): # 表头检测框架 header_frame = ttk.LabelFrame(self.main_frame, text="表头检测与设置") header_frame.pack(fill=tk.X, padx=5, pady=5) # 表头行设置 row_frame = ttk.Frame(header_frame) row_frame.pack(fill=tk.X, padx=10, pady=10) ttk.Label(row_frame, text="表头所在行:").pack(side=tk.LEFT, padx=(0, 5)) self.header_row_var = tk.IntVar(value=0) self.header_row_spin = ttk.Spinbox(row_frame, from_=0, to=100, width=5, textvariable=self.header_row_var) self.header_row_spin.pack(side=tk.LEFT, padx=(0, 10)) # 自动检测按钮 detect_btn = ttk.Button(row_frame, text="自动检测表头", command=self.auto_detect_header) detect_btn.pack(side=tk.LEFT, padx=(0, 10)) # 分析按钮 analyze_btn = ttk.Button(row_frame, text="分析表格", command=self.analyze_table, style="Accent.TButton") analyze_btn.pack(side=tk.RIGHT) # 数据预览 preview_frame = ttk.Frame(header_frame) preview_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=(0, 10)) ttk.Label(preview_frame, text="数据预览:").pack(anchor=tk.W) # 预览文本框 self.preview_text = scrolledtext.ScrolledText(preview_frame, height=8, wrap=tk.NONE, font=('Consolas', 10)) self.preview_text.pack(fill=tk.BOTH, expand=True) self.preview_text.config(state=tk.DISABLED) def create_analysis_section(self): # 分析结果框架 analysis_frame = ttk.LabelFrame(self.main_frame, text="分析结果") analysis_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 创建分页控件 self.notebook = ttk.Notebook(analysis_frame) self.notebook.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 表头识别标签页 self.headers_tab = ttk.Frame(self.notebook) self.notebook.add(self.headers_tab, text="表头识别") # 列统计标签页 self.stats_tab = ttk.Frame(self.notebook) self.notebook.add(self.stats_tab, text="列统计") # 格式分析标签页 self.format_tab = ttk.Frame(self.notebook) self.notebook.add(self.format_tab, text="格式分析") # 初始化标签页内容 self.initialize_tabs() def initialize_tabs(self): # 表头识别标签页内容 headers_frame = ttk.Frame(self.headers_tab) headers_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 表头表格 self.headers_tree = ttk.Treeview(headers_frame, columns=("index", "header", "data_type", "sample"), show="headings", height=8) # 设置列 self.headers_tree.column("index", width=50, anchor=tk.CENTER) self.headers_tree.column("header", width=200, anchor=tk.W) self.headers_tree.column("data_type", width=150, anchor=tk.W) self.headers_tree.column("sample", width=300, anchor=tk.W) # 设置列标题 self.headers_tree.heading("index", text="序号") self.headers_tree.heading("header", text="表头名称") self.headers_tree.heading("data_type", text="数据类型") self.headers_tree.heading("sample", text="示例") # 添加滚动条 scrollbar = ttk.Scrollbar(headers_frame, orient="vertical", command=self.headers_tree.yview) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.headers_tree.configure(yscrollcommand=scrollbar.set) self.headers_tree.pack(fill=tk.BOTH, expand=True) # 列统计标签页内容 stats_frame = ttk.Frame(self.stats_tab) stats_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 列选择框 col_frame = ttk.Frame(stats_frame) col_frame.pack(fill=tk.X, pady=(0, 10)) ttk.Label(col_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 5)) self.col_var = tk.StringVar() self.col_combobox = ttk.Combobox(col_frame, textvariable=self.col_var, state="readonly") self.col_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10)) self.col_combobox.bind("<<ComboboxSelected>>", self.update_stats) # 统计信息表格 self.stats_tree = ttk.Treeview(stats_frame, columns=("metric", "value"), show="headings", height=10) # 设置列 self.stats_tree.column("metric", width=200, anchor=tk.W) self.stats_tree.column("value", width=300, anchor=tk.W) # 设置列标题 self.stats_tree.heading("metric", text="统计指标") self.stats_tree.heading("value", text="") # 添加滚动条 scrollbar2 = ttk.Scrollbar(stats_frame, orient="vertical", command=self.stats_tree.yview) scrollbar2.pack(side=tk.RIGHT, fill=tk.Y) self.stats_tree.configure(yscrollcommand=scrollbar2.set) self.stats_tree.pack(fill=tk.BOTH, expand=True) # 格式分析标签页内容 format_frame = ttk.Frame(self.format_tab) format_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 格式信息表格 self.format_tree = ttk.Treeview(format_frame, columns=("column", "bg_colors", "has_bg", "unique_formats"), show="headings", height=12) # 设置列 self.format_tree.column("column", width=150, anchor=tk.W) self.format_tree.column("bg_colors", width=200, anchor=tk.W) self.format_tree.column("has_bg", width=100, anchor=tk.CENTER) self.format_tree.column("unique_formats", width=150, anchor=tk.CENTER) # 设置列标题 self.format_tree.heading("column", text="列名") self.format_tree.heading("bg_colors", text="背景颜色") self.format_tree.heading("has_bg", text="有底色单元格") self.format_tree.heading("unique_formats", text="唯一格式数") # 添加滚动条 scrollbar3 = ttk.Scrollbar(format_frame, orient="vertical", command=self.format_tree.yview) scrollbar3.pack(side=tk.RIGHT, fill=tk.Y) self.format_tree.configure(yscrollcommand=scrollbar3.set) self.format_tree.pack(fill=tk.BOTH, expand=True) def create_chart_section(self): # 图表框架 chart_frame = ttk.LabelFrame(self.main_frame, text="数据可视化") chart_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 图表容器 self.chart_canvas_frame = ttk.Frame(chart_frame) self.chart_canvas_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 默认图表占位符 self.default_chart_label = ttk.Label(self.chart_canvas_frame, text="选择列并分析表格后显示图表", foreground="#7f8c8d", font=("Arial", 11)) self.default_chart_label.pack(expand=True) self.chart_canvas = None def browse_file(self): filetypes = [ ("Excel 文件", "*.xlsx *.xls"), ("CSV 文件", "*.csv"), ("所有文件", "*.*") ] file_path = filedialog.askopenfilename(title="选择表格文件", filetypes=filetypes) if file_path: self.file_path = file_path self.path_var.set(file_path) self.reset_analysis() self.load_and_preview() def load_and_preview(self): """加载文件并显示预览""" try: file_ext = os.path.splitext(self.file_path)[1].lower() # 清空预览 self.preview_text.config(state=tk.NORMAL) self.preview_text.delete(1.0, tk.END) if file_ext in ['.xlsx', '.xls']: # 加载Excel文件 self.raw_data = pd.read_excel(self.file_path, header=None) elif file_ext == '.csv': # 检测CSV文件编码 with open(self.file_path, 'rb') as f: result = chardet.detect(f.read()) encoding = result['encoding'] or 'utf-8' # 加载CSV文件 self.raw_data = pd.read_csv(self.file_path, header=None, encoding=encoding, on_bad_lines='skip', engine='python') else: messagebox.showerror("错误", "不支持的文件类型") return # 显示前10行预览 preview_content = self.raw_data.head(10).to_string(index=False, header=False) self.preview_text.insert(tk.END, preview_content) self.preview_text.config(state=tk.DISABLED) # 尝试自动检测表头 self.auto_detect_header() except Exception as e: messagebox.showerror("错误", f"加载文件时出错: {str(e)}") def auto_detect_header(self): """自动检测表头所在行""" if self.raw_data is None: return # 自动检测策略: # 1. 查找第一个非空行 # 2. 检查每行的数据类型混合情况(表头行通常包含更多字符串) # 3. 检查每行的唯一比例(表头行通常有较高的唯一比例) best_row = 0 best_score = -1 # 只检查前20行 for i in range(min(20, len(self.raw_data))): row = self.raw_data.iloc[i] # 计算得分 score = 0 # 1.空值比例 non_null_ratio = row.notnull().mean() score += non_null_ratio * 20 # 2. 字符串比例 str_ratio = (row.apply(lambda x: isinstance(x, str))).mean() score += str_ratio * 30 # 3. 唯一比例 unique_ratio = row.nunique() / len(row) if len(row) > 0 else 0 score += unique_ratio * 50 if score > best_score: best_score = score best_row = i # 设置检测到的表头行 self.header_row_var.set(best_row) # 在预览中高亮显示表头行 self.highlight_header_row(best_row) def highlight_header_row(self, header_row): """在预览中高亮显示表头行""" self.preview_text.config(state=tk.NORMAL) self.preview_text.tag_configure("header", background="#3498db", foreground="white") # 清除之前的高亮 self.preview_text.tag_remove("header", "1.0", tk.END) # 设置新行的高亮 start_index = f"{header_row + 1}.0" end_index = f"{header_row + 2}.0" self.preview_text.tag_add("header", start_index, end_index) self.preview_text.config(state=tk.DISABLED) def reset_analysis(self): # 清除之前的分析结果 for item in self.headers_tree.get_children(): self.headers_tree.delete(item) for item in self.stats_tree.get_children(): self.stats_tree.delete(item) for item in self.format_tree.get_children(): self.format_tree.delete(item) self.col_combobox.set('') self.col_combobox['values'] = [] # 清除图表 if self.chart_canvas: self.chart_canvas.get_tk_widget().destroy() self.chart_canvas = None self.default_chart_label.pack(expand=True) def analyze_table(self): if not self.file_path: messagebox.showwarning("警告", "请先选择表格文件") return try: self.header_row = self.header_row_var.get() file_ext = os.path.splitext(self.file_path)[1].lower() if file_ext in ['.xlsx', '.xls']: # 读Excel文件,指定表头行 self.df = pd.read_excel(self.file_path, header=self.header_row) self.workbook = load_workbook(self.file_path) self.sheet = self.workbook.active elif file_ext == '.csv': # 读CSV文件,指定表头行 with open(self.file_path, 'rb') as f: result = chardet.detect(f.read()) encoding = result['encoding'] or 'utf-8' self.df = pd.read_csv(self.file_path, header=self.header_row, encoding=encoding, engine='python') else: messagebox.showerror("错误", "不支持的文件类型") return # 清理列名 self.df.columns = [str(col).strip() for col in self.df.columns] # 分析表头 self.analyze_headers() # 分析列格式 if file_ext in ['.xlsx', '.xls']: self.analyze_format() else: messagebox.showinfo("信息", "CSV文件不支持格式分析") # 更新列选择框 self.col_combobox['values'] = list(self.df.columns) if self.df.columns.size > 0: self.col_combobox.current(0) self.update_stats() messagebox.showinfo("成功", "表格分析完成") except Exception as e: messagebox.showerror("错误", f"分析表格时出错: {str(e)}") def analyze_headers(self): # 清空表头表格 for item in self.headers_tree.get_children(): self.headers_tree.delete(item) # 添加表头信息 for i, col in enumerate(self.df.columns): sample = "" # 获前3个非空值作为示例 non_empty = self.df[col].dropna() if len(non_empty) > 0: sample = ", ".join(str(x) for x in non_empty.head(3).tolist()) # 推断数据类型 dtype = str(self.df[col].dtype) dtype_map = { 'object': '文本', 'int64': '整数', 'float64': '小数', 'bool': '布尔', 'datetime64': '日期时间', 'category': '分类数据' } for k, v in dtype_map.items(): if k in dtype: dtype = v break # 检查列名是否有效 col_name = col if col and not str(col).startswith('Unnamed:') else f"列_{i+1}" self.headers_tree.insert("", "end", values=(i+1, col_name, dtype, sample)) def update_stats(self, event=None): # 清空统计表格 for item in self.stats_tree.get_children(): self.stats_tree.delete(item) selected_col = self.col_var.get() if not selected_col or selected_col not in self.df.columns: return col_data = self.df[selected_col] # 基本统计信息 self.stats_tree.insert("", "end", values=("列名", selected_col)) self.stats_tree.insert("", "end", values=("数据类型", str(col_data.dtype))) self.stats_tree.insert("", "end", values=("总行数", len(col_data))) non_null_count = col_data.count() null_count = len(col_data) - non_null_count self.stats_tree.insert("", "end", values=("非空值数量", non_null_count)) self.stats_tree.insert("", "end", values=("空值数量", null_count)) self.stats_tree.insert("", "end", values=("空值比例", f"{null_count/len(col_data):.2%}")) # 统计 if col_data.dtype in ['int64', 'float64']: try: self.stats_tree.insert("", "end", values=("平均", f"{col_data.mean():.2f}")) self.stats_tree.insert("", "end", values=("中位数", f"{col_data.median():.2f}")) self.stats_tree.insert("", "end", values=("最小", f"{col_data.min():.2f}")) self.stats_tree.insert("", "end", values=("最大", f"{col_data.max():.2f}")) self.stats_tree.insert("", "end", values=("标准差", f"{col_data.std():.2f}")) self.stats_tree.insert("", "end", values=("总和", f"{col_data.sum():.2f}")) except: pass # 唯一统计 unique_count = col_data.nunique() self.stats_tree.insert("", "end", values=("唯一数量", unique_count)) self.stats_tree.insert("", "end", values=("唯一比例", f"{unique_count/non_null_count:.2%}")) # 最常出现的 if non_null_count > 0: try: top_values = col_data.value_counts().head(3) top_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()]) self.stats_tree.insert("", "end", values=("最常见", top_str)) except: pass # 更新图表 self.update_chart(selected_col) def analyze_format(self): # 清空格式表格 for item in self.format_tree.get_children(): self.format_tree.delete(item) if not self.workbook or not self.sheet: return # 分析每一列的格式 for col_idx, col_name in enumerate(self.df.columns, start=1): col_letter = self.get_column_letter(col_idx) # 收集背景色信息 bg_colors = {} has_bg_count = 0 unique_colors = set() # 从表头行+1开始(数据行) start_row = self.header_row + 2 if self.header_row > 0 else 2 for row_idx in range(start_row, len(self.df) + start_row): if row_idx > self.sheet.max_row: break cell = self.sheet[f"{col_letter}{row_idx}"] # 获背景色 bg_color = "无" if cell.fill and isinstance(cell.fill, PatternFill): if cell.fill.fgColor and cell.fill.fgColor.rgb: bg_color = cell.fill.fgColor.rgb unique_colors.add(bg_color) # 统计颜色出现次数 if bg_color in bg_colors: bg_colors[bg_color] += 1 else: bg_colors[bg_color] = 1 if bg_color != "无": has_bg_count += 1 # 格式化背景色信息 bg_info = "" if bg_colors: top_colors = sorted(bg_colors.items(), key=lambda x: x[1], reverse=True)[:3] bg_info = ", ".join([f"{color[0]}:{color[1]}" for color in top_colors]) # 添加到表格 self.format_tree.insert("", "end", values=( col_name, bg_info, f"{has_bg_count} ({has_bg_count/len(self.df):.1%})", len(unique_colors) )) def get_column_letter(self, col_idx): """将列索引转换为Excel列字母(如1->A, 27->AA)""" letters = [] while col_idx: col_idx, remainder = divmod(col_idx - 1, 26) letters.append(chr(65 + remainder)) return ''.join(reversed(letters)) def update_chart(self, col_name): # 清除现有图表 if self.chart_canvas: self.chart_canvas.get_tk_widget().destroy() self.chart_canvas = None self.default_chart_label.pack_forget() col_data = self.df[col_name] # 创建图表框架 fig, ax = plt.subplots(figsize=(10, 5)) fig.patch.set_facecolor('#f0f2f5') ax.set_facecolor('#ffffff') # 根据数据类型选择合适的图表 if col_data.dtype in ['int64', 'float64']: # 数数据 - 直方图 try: sns.histplot(col_data.dropna(), kde=True, ax=ax, color='#3498db') ax.set_title(f"{col_name} 数分布", fontsize=12) ax.set_xlabel(col_name) ax.set_ylabel("频率") except: ax.text(0.5, 0.5, "无法绘制图表", ha='center', va='center', fontsize=12) ax.set_title(f"{col_name} 数分布", fontsize=12) elif col_data.dtype == 'object' or col_data.dtype.name == 'category': # 分类数据 - 条形图 try: top_values = col_data.value_counts().head(10) if len(top_values) > 0: # 处理长标签 labels = [str(val)[:20] + ('...' if len(str(val)) > 20 else '') for val in top_values.index] ax.bar(labels, top_values.values, color='#2ecc71') ax.set_title(f"{col_name} 分类分布", fontsize=12) ax.set_xlabel(col_name) ax.set_ylabel("数量") plt.xticks(rotation=45, ha='right') else: ax.text(0.5, 0.5, "无足够数据", ha='center', va='center', fontsize=12) ax.set_title(f"{col_name} 分类分布", fontsize=12) except: ax.text(0.5, 0.5, "无法绘制图表", ha='center', va='center', fontsize=12) ax.set_title(f"{col_name} 分类分布", fontsize=12) else: # 其他数据类型 ax.text(0.5, 0.5, "不支持的数据类型", ha='center', va='center', fontsize=12) ax.set_title(f"{col_name} 数据分布", fontsize=12) plt.tight_layout() # 将图表嵌入Tkinter self.chart_canvas = FigureCanvasTkAgg(fig, master=self.chart_canvas_frame) self.chart_canvas.draw() self.chart_canvas.get_tk_widget().pack(fill=tk.BOTH, expand=True) # 自定义样式 def configure_styles(): style = ttk.Style() style.theme_use('clam') # 主框架样式 style.configure('TFrame', background='#f0f2f5') # 标签框架样式 style.configure('TLabelframe', background='#ffffff', borderwidth=2) style.configure('TLabelframe.Label', background='#ffffff', foreground='#2c3e50') # 按钮样式 style.configure('TButton', background='#3498db', foreground='white', font=('Arial', 10), borderwidth=1) style.map('TButton', background=[('active', '#2980b9')]) # 强调按钮样式 style.configure('Accent.TButton', background='#2ecc71', foreground='white') style.map('Accent.TButton', background=[('active', '#27ae60')]) # 树状视图样式 style.configure('Treeview', background='#ffffff', fieldbackground='#ffffff', foreground='#2c3e50', rowheight=25) style.configure('Treeview.Heading', background='#3498db', foreground='white', font=('Arial', 10, 'bold')) style.map('Treeview', background=[('selected', '#3498db')]) # 旋转框样式 style.configure('TSpinbox', background='white') # 组合框样式 style.configure('TCombobox', background='white') if __name__ == "__main__": root = tk.Tk() configure_styles() app = TableAnalyzer(root) root.mainloop()
08-05
import os import pandas as pd import numpy as np import tkinter as tk from tkinter import ttk, filedialog, messagebox from openpyxl import load_workbook from openpyxl.styles import PatternFill, Font, colors import chardet import threading import time from pandastable import Table, TableModel from sklearn.impute import SimpleImputer from sklearn.ensemble import IsolationForest class TableAnalyzer: def __init__(self, root): self.root = root self.root.title("智能表格分析工具") self.root.geometry("1200x800") self.root.configure(bg='#f5f7fa') # 设置窗口图标 try: self.root.iconbitmap('table_icon.ico') except: pass # 创建主框架 self.main_frame = ttk.Frame(root, padding=15) self.main_frame.pack(fill=tk.BOTH, expand=True) # 创建标题 title_frame = ttk.Frame(self.main_frame) title_frame.pack(fill=tk.X, pady=(0, 15)) title_label = ttk.Label( title_frame, text="智能表格分析工具", font=("Arial", 18, "bold"), foreground="#2c3e50" ) title_label.pack(side=tk.LEFT) # 状态指示器 self.status_var = tk.StringVar(value="就绪") status_label = ttk.Label( title_frame, textvariable=self.status_var, font=("Arial", 10), foreground="#7f8c8d" ) status_label.pack(side=tk.RIGHT, padx=10) # 创建左右分栏 paned_window = ttk.PanedWindow(self.main_frame, orient=tk.HORIZONTAL) paned_window.pack(fill=tk.BOTH, expand=True) # 左侧面板 (控制和预览) left_panel = ttk.Frame(paned_window, padding=10) paned_window.add(left_panel, weight=1) # 右侧面板 (分析和筛选) right_panel = ttk.Frame(paned_window, padding=10) paned_window.add(right_panel, weight=1) # 左侧面板内容 self.create_file_selection_section(left_panel) self.create_header_detection_section(left_panel) self.create_preview_section(left_panel) # 右侧面板内容 self.create_filter_section(right_panel) self.create_linkage_section(right_panel) self.create_analysis_section(right_panel) self.create_advanced_analysis_section(right_panel) # 初始化变量 self.file_path = "" self.df = None self.workbook = None self.sheet = None self.header_row = 0 self.raw_data = None self.analysis_thread = None self.stop_analysis = False self.progress = 0 #self.column_has_color = {} # 存储列是否有底色 # 设置样式 self.configure_styles() self.cell_bg_status = None # 存储单元格底色状态的DataFrame def configure_styles(self): style = ttk.Style() style.theme_use('clam') # 主框架样式 style.configure('TFrame', background='#f5f7fa') # 标签框架样式 style.configure('TLabelframe', background='#ffffff', borderwidth=2) style.configure('TLabelframe.Label', background='#ffffff', foreground='#2c3e50', font=('Arial', 10, 'bold')) # 按钮样式 style.configure('TButton', background='#3498db', foreground='white', font=('Arial', 10), borderwidth=1, padding=5) style.map('TButton', background=[('active', '#2980b9')]) # 强调按钮样式 style.configure('Accent.TButton', background='#2ecc71', foreground='white') style.map('Accent.TButton', background=[('active', '#27ae60')]) # 危险按钮样式 style.configure('Danger.TButton', background='#e74c3c', foreground='white') style.map('Danger.TButton', background=[('active', '#c0392b')]) # 树状视图样式 style.configure('Treeview', background='#ffffff', fieldbackground='#ffffff', foreground='#2c3e50', rowheight=25, font=('Arial', 9)) style.configure('Treeview.Heading', background='#3498db', foreground='white', font=('Arial', 10, 'bold')) style.map('Treeview', background=[('selected', '#3498db')]) # 进度条样式 style.configure('Custom.Horizontal.TProgressbar', background='#3498db', thickness=20) # 标签样式 style.configure('TLabel', background='#f5f7fa', font=('Arial', 10)) style.configure('Title.TLabel', font=('Arial', 12, 'bold'), foreground='#2c3e50') # 组合框样式 style.configure('TCombobox', fieldbackground='white', background='white') def create_file_selection_section(self, parent): # 文件选择框架 file_frame = ttk.LabelFrame(parent, text="文件选择") file_frame.pack(fill=tk.X, padx=5, pady=5) # 文件路径输入框 path_frame = ttk.Frame(file_frame) path_frame.pack(fill=tk.X, padx=10, pady=10) self.path_var = tk.StringVar() path_entry = ttk.Entry(path_frame, textvariable=self.path_var, state='readonly') path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10)) # 浏览按钮 browse_btn = ttk.Button( path_frame, text="浏览文件", command=self.browse_file, width=10 ) browse_btn.pack(side=tk.RIGHT) # 进度条 self.progress_var = tk.DoubleVar() progress_bar = ttk.Progressbar( file_frame, variable=self.progress_var, maximum=100, style='Custom.Horizontal.TProgressbar' ) progress_bar.pack(fill=tk.X, padx=10, pady=(0, 10)) # 文件类型提示 type_label = ttk.Label( file_frame, text="支持文件类型: Excel (.xlsx, .xls), CSV (.csv), Parquet (.parquet)", foreground="#7f8c8d", font=("Arial", 9) ) type_label.pack(side=tk.LEFT, padx=10, pady=(0, 5)) def create_header_detection_section(self, parent): # 表头检测框架 header_frame = ttk.LabelFrame(parent, text="表头检测与设置") header_frame.pack(fill=tk.X, padx=5, pady=5) # 表头行设置 row_frame = ttk.Frame(header_frame) row_frame.pack(fill=tk.X, padx=10, pady=10) ttk.Label(row_frame, text="表头所在行:").pack(side=tk.LEFT, padx=(0, 5)) self.header_row_var = tk.IntVar(value=0) self.header_row_spin = ttk.Spinbox( row_frame, from_=0, to=100, width=5, textvariable=self.header_row_var ) self.header_row_spin.pack(side=tk.LEFT, padx=(0, 10)) # 自动检测按钮 detect_btn = ttk.Button( row_frame, text="自动检测表头", command=self.auto_detect_header ) detect_btn.pack(side=tk.LEFT, padx=(0, 10)) # 分析按钮 analyze_btn = ttk.Button( row_frame, text="分析表格", command=self.start_analysis_thread, style="Accent.TButton", width=10 ) analyze_btn.pack(side=tk.RIGHT) # 停止按钮 stop_btn = ttk.Button( row_frame, text="停止分析", command=self.stop_analysis_process, style="Danger.TButton", width=10 ) stop_btn.pack(side=tk.RIGHT, padx=(0, 10)) def create_preview_section(self, parent): # 数据预览框架 preview_frame = ttk.LabelFrame(parent, text="数据预览") preview_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 创建表格预览 self.preview_table = ttk.Frame(preview_frame) self.preview_table.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 初始占位符 self.preview_placeholder = ttk.Label( preview_frame, text="选择文件后显示数据预览", foreground="#7f8c8d", font=("Arial", 11) ) self.preview_placeholder.pack(expand=True) def create_filter_section(self, parent): """创建数据筛选区域""" # 筛选框架 filter_frame = ttk.LabelFrame(parent, text="数据筛选") filter_frame.pack(fill=tk.X, padx=5, pady=5) # 筛选控制区域 control_frame = ttk.Frame(filter_frame) control_frame.pack(fill=tk.X, padx=10, pady=10) # 列选择 ttk.Label(control_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 5)) self.filter_col_var = tk.StringVar() self.filter_col_combobox = ttk.Combobox( control_frame, textvariable=self.filter_col_var, state="readonly", width=20 ) self.filter_col_combobox.pack(side=tk.LEFT, padx=(0, 10)) self.filter_col_combobox.bind("<<ComboboxSelected>>", self.update_filter_values) # 选择 ttk.Label(control_frame, text="选择:").pack(side=tk.LEFT, padx=(0, 5)) self.filter_value_var = tk.StringVar() self.filter_value_combobox = ttk.Combobox( control_frame, textvariable=self.filter_value_var, state="readonly", width=20 ) self.filter_value_combobox.pack(side=tk.LEFT, padx=(0, 10)) # 筛选按钮 filter_btn = ttk.Button( control_frame, text="应用筛选", command=self.apply_filter, style="Accent.TButton" ) filter_btn.pack(side=tk.LEFT, padx=(0, 10)) # 清除筛选按钮 clear_btn = ttk.Button( control_frame, text="清除筛选", command=self.clear_filter, style="Danger.TButton" ) clear_btn.pack(side=tk.LEFT) # 筛选结果信息 self.filter_info_var = tk.StringVar(value="未应用筛选") filter_info_label = ttk.Label( filter_frame, textvariable=self.filter_info_var, foreground="#3498db", font=("Arial", 10, "bold") ) filter_info_label.pack(side=tk.RIGHT, padx=10, pady=(0, 5)) def create_linkage_section(self, parent): """创建联动统计区域""" # 联动统计框架 linkage_frame = ttk.LabelFrame(parent, text="联动统计") linkage_frame.pack(fill=tk.X, padx=5, pady=5) # 控制区域 control_frame = ttk.Frame(linkage_frame) control_frame.pack(fill=tk.X, padx=10, pady=10) # 目标列选择 ttk.Label(control_frame, text="统计列:").pack(side=tk.LEFT, padx=(0, 5)) self.linkage_col_var = tk.StringVar() self.linkage_col_combobox = ttk.Combobox( control_frame, textvariable=self.linkage_col_var, state="readonly", width=20 ) self.linkage_col_combobox.pack(side=tk.LEFT, padx=(0, 10)) # 统计按钮 stats_btn = ttk.Button( control_frame, text="统计分布", command=self.update_linkage_stats, style="Accent.TButton" ) stats_btn.pack(side=tk.LEFT, padx=(0, 10)) # 统计结果框架 stats_frame = ttk.Frame(linkage_frame) stats_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 创建树状视图显示统计结果 columns = ("value", "count", "percent") self.linkage_tree = ttk.Treeview( stats_frame, columns=columns, show="headings", height=8 ) # 设置列 col_widths = [200, 100, 100] col_headings = ["", "出现次数", "占比"] for col, width, heading in zip(columns, col_widths, col_headings): self.linkage_tree.column(col, width=width, anchor=tk.CENTER) self.linkage_tree.heading(col, text=heading) # 添加滚动条 scrollbar = ttk.Scrollbar( stats_frame, orient="vertical", command=self.linkage_tree.yview ) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.linkage_tree.configure(yscrollcommand=scrollbar.set) self.linkage_tree.pack(fill=tk.BOTH, expand=True) # 统计信息 self.linkage_info_var = tk.StringVar(value="未统计") linkage_info_label = ttk.Label( linkage_frame, textvariable=self.linkage_info_var, foreground="#3498db", font=("Arial", 10, "bold") ) linkage_info_label.pack(side=tk.RIGHT, padx=10, pady=(0, 5)) def create_analysis_section(self, parent): # 分析结果框架 analysis_frame = ttk.LabelFrame(parent, text="数据分析结果") analysis_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 创建分页控件 self.notebook = ttk.Notebook(analysis_frame) self.notebook.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 表头识别标签页 self.headers_tab = ttk.Frame(self.notebook) self.notebook.add(self.headers_tab, text="表头识别") # 列统计标签页 self.stats_tab = ttk.Frame(self.notebook) self.notebook.add(self.stats_tab, text="列统计") # 格式分析标签页 self.format_tab = ttk.Frame(self.notebook) self.notebook.add(self.format_tab, text="格式分析") # 数据质量标签页 self.quality_tab = ttk.Frame(self.notebook) self.notebook.add(self.quality_tab, text="数据质量") # 初始化标签页内容 self.initialize_tabs() def initialize_tabs(self): # 表头识别标签页内容 headers_frame = ttk.Frame(self.headers_tab) headers_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 表头表格 columns = ("index", "header", "data_type", "sample", "unique", "null_percent") self.headers_tree = ttk.Treeview( headers_frame, columns=columns, show="headings", height=8 ) # 设置列 col_widths = [50, 150, 100, 200, 80, 80] col_anchors = [tk.CENTER, tk.W, tk.W, tk.W, tk.CENTER, tk.CENTER] col_headings = ["序号", "表头名称", "数据类型", "示例", "唯一", "空值率"] for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings): self.headers_tree.column(col, width=width, anchor=anchor) self.headers_tree.heading(col, text=heading) # 添加滚动条 scrollbar = ttk.Scrollbar( headers_frame, orient="vertical", command=self.headers_tree.yview ) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.headers_tree.configure(yscrollcommand=scrollbar.set) self.headers_tree.pack(fill=tk.BOTH, expand=True) # 列统计标签页内容 stats_frame = ttk.Frame(self.stats_tab) stats_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 列选择框 col_frame = ttk.Frame(stats_frame) col_frame.pack(fill=tk.X, pady=(0, 10)) ttk.Label(col_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 5)) self.col_var = tk.StringVar() self.col_combobox = ttk.Combobox( col_frame, textvariable=self.col_var, state="readonly", width=30 ) self.col_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10)) self.col_combobox.bind("<<ComboboxSelected>>", self.update_stats) # 统计信息表格 self.stats_tree = ttk.Treeview( stats_frame, columns=("metric", "value"), show="headings", height=10 ) # 设置列 self.stats_tree.column("metric", width=200, anchor=tk.W) self.stats_tree.column("value", width=300, anchor=tk.W) # 设置列标题 self.stats_tree.heading("metric", text="统计指标") self.stats_tree.heading("value", text="") # 添加滚动条 scrollbar2 = ttk.Scrollbar( stats_frame, orient="vertical", command=self.stats_tree.yview ) scrollbar2.pack(side=tk.RIGHT, fill=tk.Y) self.stats_tree.configure(yscrollcommand=scrollbar2.set) self.stats_tree.pack(fill=tk.BOTH, expand=True) # 格式分析标签页内容 format_frame = ttk.Frame(self.format_tab) format_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 格式信息表格 columns = ("column", "has_bg", "bg_status") self.format_tree = ttk.Treeview( format_frame, columns=columns, show="headings", height=10 ) # 设置列 col_widths = [150, 120, 120] col_anchors = [tk.W, tk.CENTER, tk.CENTER] col_headings = ["列名", "有底色", "底色状态"] for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings): self.format_tree.column(col, width=width, anchor=anchor) self.format_tree.heading(col, text=heading) # 添加滚动条 scrollbar3 = ttk.Scrollbar( format_frame, orient="vertical", command=self.format_tree.yview ) scrollbar3.pack(side=tk.RIGHT, fill=tk.Y) self.format_tree.configure(yscrollcommand=scrollbar3.set) self.format_tree.pack(fill=tk.BOTH, expand=True) # 数据质量标签页内容 quality_frame = ttk.Frame(self.quality_tab) quality_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 数据质量表格 columns = ("issue", "count", "severity", "columns", "suggestion") self.quality_tree = ttk.Treeview( quality_frame, columns=columns, show="headings", height=8 ) # 设置列 col_widths = [150, 80, 100, 150, 250] col_anchors = [tk.W, tk.CENTER, tk.CENTER, tk.W, tk.W] col_headings = ["问题类型", "数量", "严重程度", "相关列", "解决建议"] for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings): self.quality_tree.column(col, width=width, anchor=anchor) self.quality_tree.heading(col, text=heading) # 添加滚动条 scrollbar4 = ttk.Scrollbar( quality_frame, orient="vertical", command=self.quality_tree.yview ) scrollbar4.pack(side=tk.RIGHT, fill=tk.Y) self.quality_tree.configure(yscrollcommand=scrollbar4.set) self.quality_tree.pack(fill=tk.BOTH, expand=True) # 添加缺失的 update_stats 方法 def update_stats(self, event=None): """更新列统计信息""" # 清空统计表格 for item in self.stats_tree.get_children(): self.stats_tree.delete(item) selected_col_display = self.col_var.get() if not selected_col_display: return # 从显示列名中提原始列名(移除底色标记) if " (有底色)" in selected_col_display: selected_col = selected_col_display.replace(" (有底色)", "") elif " (无底色)" in selected_col_display: selected_col = selected_col_display.replace(" (无底色)", "") else: selected_col = selected_col_display if not selected_col or selected_col not in self.df.columns: return col_data = self.df[selected_col] # 基本统计信息 self.stats_tree.insert("", "end", values=("列名", selected_col)) self.stats_tree.insert("", "end", values=("数据类型", str(col_data.dtype))) self.stats_tree.insert("", "end", values=("总行数", len(col_data))) non_null_count = col_data.count() null_count = len(col_data) - non_null_count self.stats_tree.insert("", "end", values=("非空值数量", non_null_count)) self.stats_tree.insert("", "end", values=("空值数量", null_count)) self.stats_tree.insert("", "end", values=("空值比例", f"{null_count/len(col_data):.2%}" if len(col_data) > 0 else "0%")) # 统计 if pd.api.types.is_numeric_dtype(col_data): try: self.stats_tree.insert("", "end", values=("平均", f"{col_data.mean():.4f}")) self.stats_tree.insert("", "end", values=("中位数", f"{col_data.median():.4f}")) self.stats_tree.insert("", "end", values=("最小", f"{col_data.min():.4f}")) self.stats_tree.insert("", "end", values=("最大", f"{col_data.max():.4f}")) self.stats_tree.insert("", "end", values=("标准差", f"{col_data.std():.4f}")) self.stats_tree.insert("", "end", values=("偏度", f"{col_data.skew():.4f}")) self.stats_tree.insert("", "end", values=("峰度", f"{col_data.kurtosis():.4f}")) self.stats_tree.insert("", "end", values=("总和", f"{col_data.sum():.4f}")) except: pass # 唯一统计 unique_count = col_data.nunique() self.stats_tree.insert("", "end", values=("唯一数量", unique_count)) unique_ratio = unique_count/non_null_count if non_null_count > 0 else 0 self.stats_tree.insert("", "end", values=("唯一比例", f"{unique_ratio:.2%}")) # 最常出现的 if non_null_count > 0: try: top_values = col_data.value_counts().head(3) top_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()]) self.stats_tree.insert("", "end", values=("最常见", top_str)) except: pass def create_advanced_analysis_section(self, parent): # 高级分析框架 analysis_frame = ttk.LabelFrame(parent, text="高级分析") analysis_frame.pack(fill=tk.X, padx=5, pady=5) # 分析选项 options_frame = ttk.Frame(analysis_frame) options_frame.pack(fill=tk.X, padx=10, pady=10) # 异常检测 self.outlier_var = tk.BooleanVar(value=True) outlier_cb = ttk.Checkbutton( options_frame, text="检测异常", variable=self.outlier_var ) outlier_cb.pack(side=tk.LEFT, padx=(0, 20)) # 数据标准化 self.scale_var = tk.BooleanVar(value=False) scale_cb = ttk.Checkbutton( options_frame, text="数据标准化", variable=self.scale_var ) scale_cb.pack(side=tk.LEFT, padx=(0, 20)) # 缺失处理 ttk.Label(options_frame, text="缺失处理:").pack(side=tk.LEFT, padx=(0, 5)) self.missing_var = tk.StringVar(value="mean") missing_options = ["不处理", "平均填充", "中位数填充", "众数填充", "删除行"] missing_cb = ttk.Combobox( options_frame, textvariable=self.missing_var, values=missing_options, state="readonly", width=15 ) missing_cb.pack(side=tk.LEFT, padx=(0, 20)) # 执行高级分析按钮 adv_analyze_btn = ttk.Button( options_frame, text="执行高级分析", command=self.perform_advanced_analysis, style="Accent.TButton" ) adv_analyze_btn.pack(side=tk.RIGHT) def browse_file(self): filetypes = [ ("Excel 文件", "*.xlsx *.xls"), ("CSV 文件", "*.csv"), ("Parquet 文件", "*.parquet"), ("所有文件", "*.*") ] file_path = filedialog.askopenfilename( title="选择表格文件", filetypes=filetypes ) if file_path: self.file_path = file_path self.path_var.set(file_path) self.reset_analysis() self.load_and_preview() def load_and_preview(self): """加载文件并显示预览""" try: self.status_var.set("加载文件中...") self.progress_var.set(0) self.root.update() file_ext = os.path.splitext(self.file_path)[1].lower() # 移除预览占位符 self.preview_placeholder.pack_forget() # 创建预览表格 if hasattr(self, 'pt'): self.pt.destroy() self.pt_frame = ttk.Frame(self.preview_table) self.pt_frame.pack(fill=tk.BOTH, expand=True) # 使用pandastable进行高效预览 self.pt = Table( self.pt_frame, showtoolbar=False, showstatusbar=True, width=400, height=300 ) self.pt.show() # 加载数据 if file_ext in ['.xlsx', '.xls']: # 分块读Excel文件 self.load_excel_preview() elif file_ext == '.csv': # 分块读CSV文件 self.load_csv_preview() elif file_ext == '.parquet': # 读Parquet文件 self.load_parquet_preview() else: messagebox.showerror("错误", "不支持的文件类型") return # 尝试自动检测表头 self.auto_detect_header() self.status_var.set("文件加载完成") except Exception as e: messagebox.showerror("错误", f"加载文件时出错: {str(e)}") self.status_var.set(f"错误: {str(e)}") def load_excel_preview(self): """分块读Excel文件预览""" # 仅读1000行进行预览 self.raw_data = pd.read_excel(self.file_path, header=None, nrows=1000) self.pt.model = TableModel(dataframe=self.raw_data) self.pt.redraw() def load_csv_preview(self): """分块读CSV文件预览""" # 检测编码 with open(self.file_path, 'rb') as f: result = chardet.detect(f.read(10000)) encoding = result['encoding'] or 'utf-8' # 仅读1000行进行预览 self.raw_data = pd.read_csv( self.file_path, header=None, encoding=encoding, engine='c', nrows=1000 ) self.pt.model = TableModel(dataframe=self.raw_data) self.pt.redraw() def load_parquet_preview(self): """读Parquet文件预览""" # 仅读1000行进行预览 self.raw_data = pd.read_parquet(self.file_path) self.pt.model = TableModel(dataframe=self.raw_data.head(1000)) self.pt.redraw() def auto_detect_header(self): """自动检测表头所在行""" if self.raw_data is None or self.raw_data.empty: return # 使用更高效的检测算法 best_row = 0 best_score = -1 # 只检查前20行 for i in range(min(20, len(self.raw_data))): row = self.raw_data.iloc[i] # 计算得分 score = 0 # 1.空值比例 non_null_ratio = row.notnull().mean() score += non_null_ratio * 20 # 2. 字符串比例 str_count = sum(isinstance(x, str) for x in row) str_ratio = str_count / len(row) if len(row) > 0 else 0 score += str_ratio * 30 # 3. 唯一比例 unique_count = len(set(row)) unique_ratio = unique_count / len(row) if len(row) > 0 else 0 score += unique_ratio * 50 if score > best_score: best_score = score best_row = i # 设置检测到的表头行 self.header_row_var.set(best_row) def reset_analysis(self): # 清除之前的分析结果 for tree in [self.headers_tree, self.stats_tree, self.format_tree, self.quality_tree, self.linkage_tree]: for item in tree.get_children(): tree.delete(item) self.col_combobox.set('') self.col_combobox['values'] = [] self.filter_col_var.set('') self.filter_value_var.set('') self.filter_value_combobox['values'] = [] self.linkage_col_var.set('') self.filter_info_var.set("未应用筛选") self.linkage_info_var.set("未统计") self.column_has_color = {} def start_analysis_thread(self): """启动分析线程""" if not self.file_path: messagebox.showwarning("警告", "请先选择表格文件") return # 禁用分析按钮 for widget in self.root.winfo_children(): if isinstance(widget, ttk.Button) and widget.cget("text") == "分析表格": widget.config(state=tk.DISABLED) self.status_var.set("分析中...") self.progress_var.set(0) self.stop_analysis = False # 创建并启动分析线程 self.analysis_thread = threading.Thread(target=self.analyze_table) self.analysis_thread.daemon = True self.analysis_thread.start() # 启动进度更新 self.root.after(100, self.update_progress) def stop_analysis_process(self): """停止分析过程""" self.stop_analysis = True self.status_var.set("分析已停止") def update_progress(self): """更新进度条""" if self.analysis_thread and self.analysis_thread.is_alive(): self.progress_var.set(self.progress) self.root.after(100, self.update_progress) else: # 重新启用分析按钮 for widget in self.root.winfo_children(): if isinstance(widget, ttk.Button) and widget.cget("text") == "分析表格": widget.config(state=tk.NORMAL) self.progress_var.set(100) def analyze_table(self): """执行表格分析(在后台线程中运行)""" try: self.header_row = self.header_row_var.get() file_ext = os.path.splitext(self.file_path)[1].lower() # 更新进度 self.progress = 10 time.sleep(0.1) if self.stop_analysis: return if file_ext in ['.xlsx', '.xls']: # 读Excel文件,指定表头行 self.df = pd.read_excel(self.file_path, header=self.header_row) self.workbook = load_workbook(self.file_path) self.sheet = self.workbook.active elif file_ext == '.csv': # 读CSV文件,指定表头行 with open(self.file_path, 'rb') as f: result = chardet.detect(f.read()) encoding = result['encoding'] or 'utf-8' self.df = pd.read_csv( self.file_path, header=self.header_row, encoding=encoding, engine='c' ) elif file_ext == '.parquet': # 读Parquet文件 self.df = pd.read_parquet(self.file_path) else: messagebox.showerror("错误", "不支持的文件类型") return # 清理列名 self.df.columns = [str(col).strip() for col in self.df.columns] # 初始化列底色信息 self.column_has_color = {col: False for col in self.df.columns} # 更新进度 self.progress = 30 time.sleep(0.1) if self.stop_analysis: return # 分析表头 self.analyze_headers() # 更新进度 self.progress = 50 time.sleep(0.1) if self.stop_analysis: return # 分析列格式 if file_ext in ['.xlsx', '.xls']: self.analyze_format() # 更新进度 self.progress = 70 time.sleep(0.1) if self.stop_analysis: return # 分析数据质量 self.analyze_data_quality() # 更新列选择框 self.col_combobox['values'] = list(self.df.columns) if self.df.columns.size > 0: self.col_combobox.current(0) self.update_stats() # 更新筛选控件(添加底色标记) display_columns = [] for col in self.df.columns: has_color = self.column_has_color.get(col, False) display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)") self.filter_col_combobox['values'] = display_columns # 更新联动统计控件(添加底色标记) display_columns = [] for col in self.df.columns: has_color = self.column_has_color.get(col, False) display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)") self.linkage_col_combobox['values'] = display_columns self.status_var.set("分析完成") # 更新进度 self.progress = 100 except Exception as e: messagebox.showerror("错误", f"分析表格时出错: {str(e)}") self.status_var.set(f"错误: {str(e)}") def analyze_headers(self): # 清空表头表格 for item in self.headers_tree.get_children(): self.headers_tree.delete(item) # 添加表头信息 for i, col in enumerate(self.df.columns): if self.stop_analysis: return # 检查列是否有底色(只针对数据区域) has_color = self.column_has_color.get(col, False) col_display = f"{col} (有底色)" if has_color else f"{col} (无底色)" sample = "" # 获前3个非空值作为示例 non_empty = self.df[col].dropna() if len(non_empty) > 0: sample = ", ".join(str(x) for x in non_empty.head(3).tolist()) # 推断数据类型 dtype = str(self.df[col].dtype) dtype_map = { 'object': '文本', 'int64': '整数', 'float64': '小数', 'bool': '布尔', 'datetime64': '日期时间', 'category': '分类数据' } for k, v in dtype_map.items(): if k in dtype: dtype = v break # 唯一统计 unique_count = self.df[col].nunique() # 空值率 null_count = self.df[col].isnull().sum() null_percent = f"{null_count / len(self.df):.1%}" if len(self.df) > 0 else "0%" self.headers_tree.insert("", "end", values=( i+1, col_display, # 使用带底色标记的列名 dtype, sample, unique_count, null_percent )) def analyze_format(self): """分析每个单元格的底色状态""" if not self.workbook or not self.sheet: return # 初始化底色状态DataFrame(与数据表相同形状) self.cell_bg_status = pd.DataFrame( "无底色", index=self.df.index, columns=self.df.columns ) # 分析每个单元格的底色 for row_idx in range(len(self.df)): for col_idx, col_name in enumerate(self.df.columns): # 转换为Excel坐标(行号从1开始) excel_row = self.header_row + 2 + row_idx col_letter = self.get_column_letter(col_idx + 1) cell = self.sheet[f"{col_letter}{excel_row}"] # 检测底色 has_bg = False if cell.fill and isinstance(cell.fill, PatternFill): if cell.fill.fgColor and cell.fill.fgColor.rgb: if cell.fill.fgColor.rgb != "00000000": # 排除透明 has_bg = True # 存储底色状态 self.cell_bg_status.iloc[row_idx, col_idx] = "有底色" if has_bg else "无底色" def analyze_data_quality(self): """分析数据质量问题""" # 清空质量表格 for item in self.quality_tree.get_children(): self.quality_tree.delete(item) issues = [] # 1. 检查空值 null_counts = self.df.isnull().sum() total_rows = len(self.df) for col, count in null_counts.items(): if count > 0: severity = "高" if count / total_rows > 0.3 else "中" if count / total_rows > 0.1 else "低" issues.append({ "issue": "空值", "count": count, "severity": severity, "columns": col, "suggestion": f"考虑使用平均/中位数填充或删除空值行" }) # 2. 检查重复行 duplicate_rows = self.df.duplicated().sum() if duplicate_rows > 0: severity = "高" if duplicate_rows / total_rows > 0.1 else "中" issues.append({ "issue": "重复行", "count": duplicate_rows, "severity": severity, "columns": "所有列", "suggestion": "删除重复行或分析重复原因" }) # 3. 检查数据类型不一致 for col in self.df.columns: if self.df[col].dtype == 'object': # 检查混合数据类型 type_counts = self.df[col].apply(type).value_counts() if len(type_counts) > 1: issues.append({ "issue": "数据类型不一致", "count": len(self.df[col]), "severity": "中", "columns": col, "suggestion": "统一数据类型或转换格式" }) # 4. 检查异常(仅数列) numeric_cols = self.df.select_dtypes(include=np.number).columns for col in numeric_cols: q1 = self.df[col].quantile(0.25) q3 = self.df[col].quantile(0.75) iqr = q3 - q1 lower_bound = q1 - 1.5 * iqr upper_bound = q3 + 1.5 * iqr outliers = self.df[(self.df[col] < lower_bound) | (self.df[col] > upper_bound)] outlier_count = len(outliers) if outlier_count > 0: severity = "高" if outlier_count / total_rows > 0.05 else "中" issues.append({ "issue": "异常", "count": outlier_count, "severity": severity, "columns": col, "suggestion": "检查数据准确性或进行转换" }) # 添加到表格 for issue in issues: self.quality_tree.insert("", "end", values=( issue["issue"], issue["count"], issue["severity"], issue["columns"], issue["suggestion"] )) def get_column_letter(self, col_idx): """将列索引转换为Excel列字母""" letters = [] while col_idx: col_idx, remainder = divmod(col_idx - 1, 26) letters.append(chr(65 + remainder)) return ''.join(reversed(letters)) def update_filter_values(self, event=None): """更新筛选列表(带底色标记)""" selected_col = self.filter_col_var.get() if not selected_col or selected_col not in self.df.columns: return # 获列的和对应的底色状态 values = self.df[selected_col].astype(str) bg_status = self.cell_bg_status[selected_col] # 创建带底色标记的列表 unique_values = set() for val, bg in zip(values, bg_status): unique_values.add(f"{val} ({bg})") # 如果唯一太多,只显示前50个 if len(unique_values) > 50: unique_values = sorted(unique_values)[:50] messagebox.showinfo("提示", f"该列有超过50个唯一,只显示前50个") # 更新选择框 self.filter_value_combobox['values'] = sorted(unique_values) self.filter_value_combobox.set('') def apply_filter(self): """应用筛选条件(考虑底色状态)""" if self.df is None: messagebox.showwarning("警告", "请先分析表格") return selected_col = self.filter_col_var.get() selected_value_with_bg = self.filter_value_var.get() # 检查输入有效性 if not selected_col or not selected_value_with_bg: messagebox.showwarning("警告", "请选择列和") return try: # 从带底色标记的中提原始和底色状态 if " (有底色)" in selected_value_with_bg: selected_value = selected_value_with_bg.replace(" (有底色)", "") selected_bg = "有底色" elif " (无底色)" in selected_value_with_bg: selected_value = selected_value_with_bg.replace(" (无底色)", "") selected_bg = "无底色" else: selected_value = selected_value_with_bg selected_bg = None # 应用筛选 if pd.api.types.is_numeric_dtype(self.df[selected_col]): try: selected_value = float(selected_value) value_condition = (self.df[selected_col] == selected_value) except: value_condition = (self.df[selected_col].astype(str) == selected_value) else: value_condition = (self.df[selected_col].astype(str) == selected_value) # 应用底色状态筛选(如果有) if selected_bg and hasattr(self, 'cell_bg_status') and selected_col in self.cell_bg_status: bg_condition = (self.cell_bg_status[selected_col] == selected_bg) condition = value_condition & bg_condition else: condition = value_condition # 应用组合筛选条件 self.filtered_df = self.df[condition] # 更新预览 self.update_preview(self.filtered_df) # 更新筛选信息 match_count = len(self.filtered_df) total_count = len(self.df) self.filter_info_var.set( f"筛选结果: {match_count} 行 (共 {total_count} 行, {match_count/total_count:.1%})" ) # 更新统计信息 self.update_stats(self.col_var.get()) # 自动更新联动统计 if self.linkage_col_var.get() and self.linkage_col_var.get() in self.df.columns: self.update_linkage_stats() except Exception as e: messagebox.showerror("错误", f"筛选数据时出错: {str(e)}") def clear_filter(self): """清除筛选条件""" if hasattr(self, 'filtered_df'): del self.filtered_df # 重置筛选控件 self.filter_col_var.set('') self.filter_value_var.set('') self.filter_value_combobox['values'] = [] # 恢复原始数据预览 if self.df is not None: self.update_preview(self.df) # 更新筛选信息 self.filter_info_var.set("筛选已清除") # 更新统计信息 if self.col_var.get(): self.update_stats(self.col_var.get()) # 清除联动统计 for item in self.linkage_tree.get_children(): self.linkage_tree.delete(item) self.linkage_info_var.set("未统计") def update_preview(self, df): """更新数据预览(可选:添加底色状态)""" if hasattr(self, 'pt'): # 创建带底色状态的预览副本 preview_df = df.copy().head(1000) # 添加底色状态列(如果存在) if hasattr(self, 'cell_bg_status') and not self.cell_bg_status.empty: for col in df.columns: if col in self.cell_bg_status: preview_df[f"{col}_底色状态"] = self.cell_bg_status[col].loc[preview_df.index] # 更新现有表格 self.pt.model = TableModel(dataframe=preview_df) self.pt.redraw() else: # 创建新表格 self.pt_frame = ttk.Frame(self.preview_table) self.pt_frame.pack(fill=tk.BOTH, expand=True) self.pt = Table( self.pt_frame, showtoolbar=False, showstatusbar=True, width=400, height=300 ) # 创建带底色状态的预览 preview_df = df.head(1000).copy() if hasattr(self, 'cell_bg_status') and not self.cell_bg_status.empty: for col in df.columns: if col in self.cell_bg_status: preview_df[f"{col}_底色状态"] = self.cell_bg_status[col].loc[preview_df.index] self.pt.model = TableModel(dataframe=preview_df) self.pt.show() def update_linkage_stats(self): """更新联动统计结果(考虑底色状态)""" if not hasattr(self, 'filtered_df') or self.filtered_df.empty: messagebox.showwarning("警告", "请先应用筛选") return target_col = self.linkage_col_var.get() if not target_col: return # 清空现有结果 for item in self.linkage_tree.get_children(): self.linkage_tree.delete(item) # 获筛选后的数据 filtered_data = self.filtered_df # 添加底色状态列(如果存在) if hasattr(self, 'cell_bg_status') and target_col in self.cell_bg_status: # 合并底色状态到数据中 filtered_data = filtered_data.copy() filtered_data['底色状态'] = self.cell_bg_status[target_col].loc[filtered_data.index] # 统计分布(包含底色状态) value_counts = filtered_data.groupby([target_col, '底色状态']).size().reset_index(name='计数') total_count = len(filtered_data) # 添加到树状视图 for _, row in value_counts.iterrows(): percent = f"{row['计数'] / total_count:.1%}" self.linkage_tree.insert("", "end", values=( row[target_col], row['底色状态'], row['计数'], percent )) # 更新统计信息 self.linkage_info_var.set( f"共 {len(value_counts)} 个组合 (总计 {total_count} 行)" ) else: # 没有底色信息时的处理 value_counts = filtered_data[target_col].value_counts() total_count = len(filtered_data) for value, count in value_counts.items(): percent = f"{count / total_count:.1%}" self.linkage_tree.insert("", "end", values=(value, "", count, percent)) self.linkage_info_var.set( f"共 {len(value_counts)} 个唯一 (总计 {total_count} 行)" ) def perform_advanced_analysis(self): """执行高级分析""" if self.df is None or self.df.empty: messagebox.showwarning("警告", "请先分析表格") return self.status_var.set("执行高级分析...") self.progress_var.set(0) # 备份原始底色状态 original_bg_status = self.cell_bg_status.copy() if hasattr(self, 'cell_bg_status') else None try: # 缺失处理 missing_strategy = self.missing_var.get() if missing_strategy != "不处理": numeric_cols = self.df.select_dtypes(include=np.number).columns categorical_cols = self.df.select_dtypes(include='object').columns if missing_strategy == "删除行": self.df = self.df.dropna() else: if missing_strategy == "平均填充": strategy = 'mean' elif missing_strategy == "中位数填充": strategy = 'median' else: # 众数填充 strategy = 'most_frequent' # 数列填充 if len(numeric_cols) > 0: num_imputer = SimpleImputer(strategy=strategy) self.df[numeric_cols] = num_imputer.fit_transform(self.df[numeric_cols]) # 分类列填充 if len(categorical_cols) > 0: cat_imputer = SimpleImputer(strategy='most_frequent') self.df[categorical_cols] = cat_imputer.fit_transform(self.df[categorical_cols]) # 异常检测 if self.outlier_var.get(): numeric_cols = self.df.select_dtypes(include=np.number).columns if len(numeric_cols) > 0: clf = IsolationForest(contamination=0.05, random_state=42) outliers = clf.fit_predict(self.df[numeric_cols]) self.df['is_outlier'] = outliers == -1 # 在界面上显示异常数量 outlier_count = self.df['is_outlier'].sum() messagebox.showinfo("异常检测", f"检测到 {outlier_count} 个异常\n已添加 'is_outlier' 列标记") # 数据标准化 if self.scale_var.get(): numeric_cols = self.df.select_dtypes(include=np.number).columns if len(numeric_cols) > 0: for col in numeric_cols: min_val = self.df[col].min() max_val = self.df[col].max() if max_val > min_val: self.df[col] = (self.df[col] - min_val) / (max_val - min_val) # 更新分析结果 self.analyze_headers() self.analyze_data_quality() # 更新列选择框 self.col_combobox['values'] = list(self.df.columns) if self.df.columns.size > 0: self.col_combobox.current(0) self.update_stats() # 高级分析后恢复底色状态 if original_bg_status is not None: # 只保留现有行的底色状态 self.cell_bg_status = original_bg_status.loc[self.df.index] # 处理新增列(如果没有底色状态,设为"无底色") for col in self.df.columns: if col not in self.cell_bg_status: self.cell_bg_status[col] = "无底色" self.status_var.set("高级分析完成") self.progress_var.set(100) except Exception as e: if original_bg_status is not None: self.cell_bg_status = original_bg_status messagebox.showerror("错误", f"高级分析时出错: {str(e)}") self.status_var.set(f"错误: {str(e)}") if __name__ == "__main__": root = tk.Tk() app = TableAnalyzer(root) root.mainloop() TableAnalyzer’ object has no attribute ‘apply_filter’ File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 281, in create_filter_section command=self.apply_filter, File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 72, in init self.create_filter_section(right_panel) File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 1445, in app = TableAnalyzer(root) AttributeError: ‘TableAnalyzer’ object has no attribute ‘apply_filter’
最新发布
08-06
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值