import os
import pandas as pd
import numpy as np
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font, colors
import chardet
import threading
import time
from pandastable import Table, TableModel
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
class TableAnalyzer:
def __init__(self, root):
self.root = root
self.root.title("智能表格分析工具")
self.root.geometry("1200x800")
self.root.configure(bg='#f5f7fa')
# 设置窗口图标
try:
self.root.iconbitmap('table_icon.ico')
except:
pass
# 创建主框架
self.main_frame = ttk.Frame(root, padding=15)
self.main_frame.pack(fill=tk.BOTH, expand=True)
# 创建标题
title_frame = ttk.Frame(self.main_frame)
title_frame.pack(fill=tk.X, pady=(0, 15))
title_label = ttk.Label(
title_frame,
text="智能表格分析工具",
font=("Arial", 18, "bold"),
foreground="#2c3e50"
)
title_label.pack(side=tk.LEFT)
# 状态指示器
self.status_var = tk.StringVar(value="就绪")
status_label = ttk.Label(
title_frame,
textvariable=self.status_var,
font=("Arial", 10),
foreground="#7f8c8d"
)
status_label.pack(side=tk.RIGHT, padx=10)
# 创建左右分栏
paned_window = ttk.PanedWindow(self.main_frame, orient=tk.HORIZONTAL)
paned_window.pack(fill=tk.BOTH, expand=True)
# 左侧面板 (控制和预览)
left_panel = ttk.Frame(paned_window, padding=10)
paned_window.add(left_panel, weight=1)
# 右侧面板 (分析和筛选)
right_panel = ttk.Frame(paned_window, padding=10)
paned_window.add(right_panel, weight=1)
# 左侧面板内容
self.create_file_selection_section(left_panel)
self.create_header_detection_section(left_panel)
self.create_preview_section(left_panel)
# 右侧面板内容
self.create_filter_section(right_panel)
self.create_linkage_section(right_panel)
self.create_analysis_section(right_panel)
self.create_advanced_analysis_section(right_panel)
# 初始化变量
self.file_path = ""
self.df = None
self.workbook = None
self.sheet = None
self.header_row = 0
self.raw_data = None
self.analysis_thread = None
self.stop_analysis = False
self.progress = 0
#self.column_has_color = {} # 存储列是否有底色
# 设置样式
self.configure_styles()
self.cell_bg_status = None # 存储单元格底色状态的DataFrame
def configure_styles(self):
style = ttk.Style()
style.theme_use('clam')
# 主框架样式
style.configure('TFrame', background='#f5f7fa')
# 标签框架样式
style.configure('TLabelframe', background='#ffffff', borderwidth=2)
style.configure('TLabelframe.Label', background='#ffffff', foreground='#2c3e50', font=('Arial', 10, 'bold'))
# 按钮样式
style.configure('TButton', background='#3498db', foreground='white',
font=('Arial', 10), borderwidth=1, padding=5)
style.map('TButton', background=[('active', '#2980b9')])
# 强调按钮样式
style.configure('Accent.TButton', background='#2ecc71', foreground='white')
style.map('Accent.TButton', background=[('active', '#27ae60')])
# 危险按钮样式
style.configure('Danger.TButton', background='#e74c3c', foreground='white')
style.map('Danger.TButton', background=[('active', '#c0392b')])
# 树状视图样式
style.configure('Treeview', background='#ffffff', fieldbackground='#ffffff',
foreground='#2c3e50', rowheight=25, font=('Arial', 9))
style.configure('Treeview.Heading', background='#3498db', foreground='white',
font=('Arial', 10, 'bold'))
style.map('Treeview', background=[('selected', '#3498db')])
# 进度条样式
style.configure('Custom.Horizontal.TProgressbar', background='#3498db', thickness=20)
# 标签样式
style.configure('TLabel', background='#f5f7fa', font=('Arial', 10))
style.configure('Title.TLabel', font=('Arial', 12, 'bold'), foreground='#2c3e50')
# 组合框样式
style.configure('TCombobox', fieldbackground='white', background='white')
def create_file_selection_section(self, parent):
# 文件选择框架
file_frame = ttk.LabelFrame(parent, text="文件选择")
file_frame.pack(fill=tk.X, padx=5, pady=5)
# 文件路径输入框
path_frame = ttk.Frame(file_frame)
path_frame.pack(fill=tk.X, padx=10, pady=10)
self.path_var = tk.StringVar()
path_entry = ttk.Entry(path_frame, textvariable=self.path_var, state='readonly')
path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10))
# 浏览按钮
browse_btn = ttk.Button(
path_frame,
text="浏览文件",
command=self.browse_file,
width=10
)
browse_btn.pack(side=tk.RIGHT)
# 进度条
self.progress_var = tk.DoubleVar()
progress_bar = ttk.Progressbar(
file_frame,
variable=self.progress_var,
maximum=100,
style='Custom.Horizontal.TProgressbar'
)
progress_bar.pack(fill=tk.X, padx=10, pady=(0, 10))
# 文件类型提示
type_label = ttk.Label(
file_frame,
text="支持文件类型: Excel (.xlsx, .xls), CSV (.csv), Parquet (.parquet)",
foreground="#7f8c8d",
font=("Arial", 9)
)
type_label.pack(side=tk.LEFT, padx=10, pady=(0, 5))
def create_header_detection_section(self, parent):
# 表头检测框架
header_frame = ttk.LabelFrame(parent, text="表头检测与设置")
header_frame.pack(fill=tk.X, padx=5, pady=5)
# 表头行设置
row_frame = ttk.Frame(header_frame)
row_frame.pack(fill=tk.X, padx=10, pady=10)
ttk.Label(row_frame, text="表头所在行:").pack(side=tk.LEFT, padx=(0, 5))
self.header_row_var = tk.IntVar(value=0)
self.header_row_spin = ttk.Spinbox(
row_frame,
from_=0,
to=100,
width=5,
textvariable=self.header_row_var
)
self.header_row_spin.pack(side=tk.LEFT, padx=(0, 10))
# 自动检测按钮
detect_btn = ttk.Button(
row_frame,
text="自动检测表头",
command=self.auto_detect_header
)
detect_btn.pack(side=tk.LEFT, padx=(0, 10))
# 分析按钮
analyze_btn = ttk.Button(
row_frame,
text="分析表格",
command=self.start_analysis_thread,
style="Accent.TButton",
width=10
)
analyze_btn.pack(side=tk.RIGHT)
# 停止按钮
stop_btn = ttk.Button(
row_frame,
text="停止分析",
command=self.stop_analysis_process,
style="Danger.TButton",
width=10
)
stop_btn.pack(side=tk.RIGHT, padx=(0, 10))
def create_preview_section(self, parent):
# 数据预览框架
preview_frame = ttk.LabelFrame(parent, text="数据预览")
preview_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
# 创建表格预览
self.preview_table = ttk.Frame(preview_frame)
self.preview_table.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 初始占位符
self.preview_placeholder = ttk.Label(
preview_frame,
text="选择文件后显示数据预览",
foreground="#7f8c8d",
font=("Arial", 11)
)
self.preview_placeholder.pack(expand=True)
def create_filter_section(self, parent):
"""创建数据筛选区域"""
# 筛选框架
filter_frame = ttk.LabelFrame(parent, text="数据筛选")
filter_frame.pack(fill=tk.X, padx=5, pady=5)
# 筛选控制区域
control_frame = ttk.Frame(filter_frame)
control_frame.pack(fill=tk.X, padx=10, pady=10)
# 列选择
ttk.Label(control_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 5))
self.filter_col_var = tk.StringVar()
self.filter_col_combobox = ttk.Combobox(
control_frame,
textvariable=self.filter_col_var,
state="readonly",
width=20
)
self.filter_col_combobox.pack(side=tk.LEFT, padx=(0, 10))
self.filter_col_combobox.bind("<<ComboboxSelected>>", self.update_filter_values)
# 值选择
ttk.Label(control_frame, text="选择值:").pack(side=tk.LEFT, padx=(0, 5))
self.filter_value_var = tk.StringVar()
self.filter_value_combobox = ttk.Combobox(
control_frame,
textvariable=self.filter_value_var,
state="readonly",
width=20
)
self.filter_value_combobox.pack(side=tk.LEFT, padx=(0, 10))
# 筛选按钮
filter_btn = ttk.Button(
control_frame,
text="应用筛选",
command=self.apply_filter,
style="Accent.TButton"
)
filter_btn.pack(side=tk.LEFT, padx=(0, 10))
# 清除筛选按钮
clear_btn = ttk.Button(
control_frame,
text="清除筛选",
command=self.clear_filter,
style="Danger.TButton"
)
clear_btn.pack(side=tk.LEFT)
# 筛选结果信息
self.filter_info_var = tk.StringVar(value="未应用筛选")
filter_info_label = ttk.Label(
filter_frame,
textvariable=self.filter_info_var,
foreground="#3498db",
font=("Arial", 10, "bold")
)
filter_info_label.pack(side=tk.RIGHT, padx=10, pady=(0, 5))
def create_linkage_section(self, parent):
"""创建联动统计区域"""
# 联动统计框架
linkage_frame = ttk.LabelFrame(parent, text="联动统计")
linkage_frame.pack(fill=tk.X, padx=5, pady=5)
# 控制区域
control_frame = ttk.Frame(linkage_frame)
control_frame.pack(fill=tk.X, padx=10, pady=10)
# 目标列选择
ttk.Label(control_frame, text="统计列:").pack(side=tk.LEFT, padx=(0, 5))
self.linkage_col_var = tk.StringVar()
self.linkage_col_combobox = ttk.Combobox(
control_frame,
textvariable=self.linkage_col_var,
state="readonly",
width=20
)
self.linkage_col_combobox.pack(side=tk.LEFT, padx=(0, 10))
# 统计按钮
stats_btn = ttk.Button(
control_frame,
text="统计分布",
command=self.update_linkage_stats,
style="Accent.TButton"
)
stats_btn.pack(side=tk.LEFT, padx=(0, 10))
# 统计结果框架
stats_frame = ttk.Frame(linkage_frame)
stats_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 创建树状视图显示统计结果
columns = ("value", "count", "percent")
self.linkage_tree = ttk.Treeview(
stats_frame,
columns=columns,
show="headings",
height=8
)
# 设置列
col_widths = [200, 100, 100]
col_headings = ["值", "出现次数", "占比"]
for col, width, heading in zip(columns, col_widths, col_headings):
self.linkage_tree.column(col, width=width, anchor=tk.CENTER)
self.linkage_tree.heading(col, text=heading)
# 添加滚动条
scrollbar = ttk.Scrollbar(
stats_frame,
orient="vertical",
command=self.linkage_tree.yview
)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
self.linkage_tree.configure(yscrollcommand=scrollbar.set)
self.linkage_tree.pack(fill=tk.BOTH, expand=True)
# 统计信息
self.linkage_info_var = tk.StringVar(value="未统计")
linkage_info_label = ttk.Label(
linkage_frame,
textvariable=self.linkage_info_var,
foreground="#3498db",
font=("Arial", 10, "bold")
)
linkage_info_label.pack(side=tk.RIGHT, padx=10, pady=(0, 5))
def create_analysis_section(self, parent):
# 分析结果框架
analysis_frame = ttk.LabelFrame(parent, text="数据分析结果")
analysis_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
# 创建分页控件
self.notebook = ttk.Notebook(analysis_frame)
self.notebook.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
# 表头识别标签页
self.headers_tab = ttk.Frame(self.notebook)
self.notebook.add(self.headers_tab, text="表头识别")
# 列统计标签页
self.stats_tab = ttk.Frame(self.notebook)
self.notebook.add(self.stats_tab, text="列统计")
# 格式分析标签页
self.format_tab = ttk.Frame(self.notebook)
self.notebook.add(self.format_tab, text="格式分析")
# 数据质量标签页
self.quality_tab = ttk.Frame(self.notebook)
self.notebook.add(self.quality_tab, text="数据质量")
# 初始化标签页内容
self.initialize_tabs()
def initialize_tabs(self):
# 表头识别标签页内容
headers_frame = ttk.Frame(self.headers_tab)
headers_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 表头表格
columns = ("index", "header", "data_type", "sample", "unique", "null_percent")
self.headers_tree = ttk.Treeview(
headers_frame,
columns=columns,
show="headings",
height=8
)
# 设置列
col_widths = [50, 150, 100, 200, 80, 80]
col_anchors = [tk.CENTER, tk.W, tk.W, tk.W, tk.CENTER, tk.CENTER]
col_headings = ["序号", "表头名称", "数据类型", "示例值", "唯一值", "空值率"]
for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings):
self.headers_tree.column(col, width=width, anchor=anchor)
self.headers_tree.heading(col, text=heading)
# 添加滚动条
scrollbar = ttk.Scrollbar(
headers_frame,
orient="vertical",
command=self.headers_tree.yview
)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
self.headers_tree.configure(yscrollcommand=scrollbar.set)
self.headers_tree.pack(fill=tk.BOTH, expand=True)
# 列统计标签页内容
stats_frame = ttk.Frame(self.stats_tab)
stats_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 列选择框
col_frame = ttk.Frame(stats_frame)
col_frame.pack(fill=tk.X, pady=(0, 10))
ttk.Label(col_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 5))
self.col_var = tk.StringVar()
self.col_combobox = ttk.Combobox(
col_frame,
textvariable=self.col_var,
state="readonly",
width=30
)
self.col_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10))
self.col_combobox.bind("<<ComboboxSelected>>", self.update_stats)
# 统计信息表格
self.stats_tree = ttk.Treeview(
stats_frame,
columns=("metric", "value"),
show="headings",
height=10
)
# 设置列
self.stats_tree.column("metric", width=200, anchor=tk.W)
self.stats_tree.column("value", width=300, anchor=tk.W)
# 设置列标题
self.stats_tree.heading("metric", text="统计指标")
self.stats_tree.heading("value", text="值")
# 添加滚动条
scrollbar2 = ttk.Scrollbar(
stats_frame,
orient="vertical",
command=self.stats_tree.yview
)
scrollbar2.pack(side=tk.RIGHT, fill=tk.Y)
self.stats_tree.configure(yscrollcommand=scrollbar2.set)
self.stats_tree.pack(fill=tk.BOTH, expand=True)
# 格式分析标签页内容
format_frame = ttk.Frame(self.format_tab)
format_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 格式信息表格
columns = ("column", "has_bg", "bg_status")
self.format_tree = ttk.Treeview(
format_frame,
columns=columns,
show="headings",
height=10
)
# 设置列
col_widths = [150, 120, 120]
col_anchors = [tk.W, tk.CENTER, tk.CENTER]
col_headings = ["列名", "有底色", "底色状态"]
for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings):
self.format_tree.column(col, width=width, anchor=anchor)
self.format_tree.heading(col, text=heading)
# 添加滚动条
scrollbar3 = ttk.Scrollbar(
format_frame,
orient="vertical",
command=self.format_tree.yview
)
scrollbar3.pack(side=tk.RIGHT, fill=tk.Y)
self.format_tree.configure(yscrollcommand=scrollbar3.set)
self.format_tree.pack(fill=tk.BOTH, expand=True)
# 数据质量标签页内容
quality_frame = ttk.Frame(self.quality_tab)
quality_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
# 数据质量表格
columns = ("issue", "count", "severity", "columns", "suggestion")
self.quality_tree = ttk.Treeview(
quality_frame,
columns=columns,
show="headings",
height=8
)
# 设置列
col_widths = [150, 80, 100, 150, 250]
col_anchors = [tk.W, tk.CENTER, tk.CENTER, tk.W, tk.W]
col_headings = ["问题类型", "数量", "严重程度", "相关列", "解决建议"]
for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings):
self.quality_tree.column(col, width=width, anchor=anchor)
self.quality_tree.heading(col, text=heading)
# 添加滚动条
scrollbar4 = ttk.Scrollbar(
quality_frame,
orient="vertical",
command=self.quality_tree.yview
)
scrollbar4.pack(side=tk.RIGHT, fill=tk.Y)
self.quality_tree.configure(yscrollcommand=scrollbar4.set)
self.quality_tree.pack(fill=tk.BOTH, expand=True)
# 添加缺失的 update_stats 方法
def update_stats(self, event=None):
"""更新列统计信息"""
# 清空统计表格
for item in self.stats_tree.get_children():
self.stats_tree.delete(item)
selected_col_display = self.col_var.get()
if not selected_col_display:
return
# 从显示列名中提取原始列名(移除底色标记)
if " (有底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (有底色)", "")
elif " (无底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (无底色)", "")
else:
selected_col = selected_col_display
if not selected_col or selected_col not in self.df.columns:
return
col_data = self.df[selected_col]
# 基本统计信息
self.stats_tree.insert("", "end", values=("列名", selected_col))
self.stats_tree.insert("", "end", values=("数据类型", str(col_data.dtype)))
self.stats_tree.insert("", "end", values=("总行数", len(col_data)))
non_null_count = col_data.count()
null_count = len(col_data) - non_null_count
self.stats_tree.insert("", "end", values=("非空值数量", non_null_count))
self.stats_tree.insert("", "end", values=("空值数量", null_count))
self.stats_tree.insert("", "end", values=("空值比例", f"{null_count/len(col_data):.2%}" if len(col_data) > 0 else "0%"))
# 值统计
if pd.api.types.is_numeric_dtype(col_data):
try:
self.stats_tree.insert("", "end", values=("平均值", f"{col_data.mean():.4f}"))
self.stats_tree.insert("", "end", values=("中位数", f"{col_data.median():.4f}"))
self.stats_tree.insert("", "end", values=("最小值", f"{col_data.min():.4f}"))
self.stats_tree.insert("", "end", values=("最大值", f"{col_data.max():.4f}"))
self.stats_tree.insert("", "end", values=("标准差", f"{col_data.std():.4f}"))
self.stats_tree.insert("", "end", values=("偏度", f"{col_data.skew():.4f}"))
self.stats_tree.insert("", "end", values=("峰度", f"{col_data.kurtosis():.4f}"))
self.stats_tree.insert("", "end", values=("总和", f"{col_data.sum():.4f}"))
except:
pass
# 唯一值统计
unique_count = col_data.nunique()
self.stats_tree.insert("", "end", values=("唯一值数量", unique_count))
unique_ratio = unique_count/non_null_count if non_null_count > 0 else 0
self.stats_tree.insert("", "end", values=("唯一值比例", f"{unique_ratio:.2%}"))
# 最常出现的值
if non_null_count > 0:
try:
top_values = col_data.value_counts().head(3)
top_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()])
self.stats_tree.insert("", "end", values=("最常见值", top_str))
except:
pass
def create_advanced_analysis_section(self, parent):
# 高级分析框架
analysis_frame = ttk.LabelFrame(parent, text="高级分析")
analysis_frame.pack(fill=tk.X, padx=5, pady=5)
# 分析选项
options_frame = ttk.Frame(analysis_frame)
options_frame.pack(fill=tk.X, padx=10, pady=10)
# 异常值检测
self.outlier_var = tk.BooleanVar(value=True)
outlier_cb = ttk.Checkbutton(
options_frame,
text="检测异常值",
variable=self.outlier_var
)
outlier_cb.pack(side=tk.LEFT, padx=(0, 20))
# 数据标准化
self.scale_var = tk.BooleanVar(value=False)
scale_cb = ttk.Checkbutton(
options_frame,
text="数据标准化",
variable=self.scale_var
)
scale_cb.pack(side=tk.LEFT, padx=(0, 20))
# 缺失值处理
ttk.Label(options_frame, text="缺失值处理:").pack(side=tk.LEFT, padx=(0, 5))
self.missing_var = tk.StringVar(value="mean")
missing_options = ["不处理", "平均值填充", "中位数填充", "众数填充", "删除行"]
missing_cb = ttk.Combobox(
options_frame,
textvariable=self.missing_var,
values=missing_options,
state="readonly",
width=15
)
missing_cb.pack(side=tk.LEFT, padx=(0, 20))
# 执行高级分析按钮
adv_analyze_btn = ttk.Button(
options_frame,
text="执行高级分析",
command=self.perform_advanced_analysis,
style="Accent.TButton"
)
adv_analyze_btn.pack(side=tk.RIGHT)
def browse_file(self):
filetypes = [
("Excel 文件", "*.xlsx *.xls"),
("CSV 文件", "*.csv"),
("Parquet 文件", "*.parquet"),
("所有文件", "*.*")
]
file_path = filedialog.askopenfilename(
title="选择表格文件",
filetypes=filetypes
)
if file_path:
self.file_path = file_path
self.path_var.set(file_path)
self.reset_analysis()
self.load_and_preview()
def load_and_preview(self):
"""加载文件并显示预览"""
try:
self.status_var.set("加载文件中...")
self.progress_var.set(0)
self.root.update()
file_ext = os.path.splitext(self.file_path)[1].lower()
# 移除预览占位符
self.preview_placeholder.pack_forget()
# 创建预览表格
if hasattr(self, 'pt'):
self.pt.destroy()
self.pt_frame = ttk.Frame(self.preview_table)
self.pt_frame.pack(fill=tk.BOTH, expand=True)
# 使用pandastable进行高效预览
self.pt = Table(
self.pt_frame,
showtoolbar=False,
showstatusbar=True,
width=400,
height=300
)
self.pt.show()
# 加载数据
if file_ext in ['.xlsx', '.xls']:
# 分块读取Excel文件
self.load_excel_preview()
elif file_ext == '.csv':
# 分块读取CSV文件
self.load_csv_preview()
elif file_ext == '.parquet':
# 读取Parquet文件
self.load_parquet_preview()
else:
messagebox.showerror("错误", "不支持的文件类型")
return
# 尝试自动检测表头
self.auto_detect_header()
self.status_var.set("文件加载完成")
except Exception as e:
messagebox.showerror("错误", f"加载文件时出错: {str(e)}")
self.status_var.set(f"错误: {str(e)}")
def load_excel_preview(self):
"""分块读取Excel文件预览"""
# 仅读取前1000行进行预览
self.raw_data = pd.read_excel(self.file_path, header=None, nrows=1000)
self.pt.model = TableModel(dataframe=self.raw_data)
self.pt.redraw()
def load_csv_preview(self):
"""分块读取CSV文件预览"""
# 检测编码
with open(self.file_path, 'rb') as f:
result = chardet.detect(f.read(10000))
encoding = result['encoding'] or 'utf-8'
# 仅读取前1000行进行预览
self.raw_data = pd.read_csv(
self.file_path,
header=None,
encoding=encoding,
engine='c',
nrows=1000
)
self.pt.model = TableModel(dataframe=self.raw_data)
self.pt.redraw()
def load_parquet_preview(self):
"""读取Parquet文件预览"""
# 仅读取前1000行进行预览
self.raw_data = pd.read_parquet(self.file_path)
self.pt.model = TableModel(dataframe=self.raw_data.head(1000))
self.pt.redraw()
def auto_detect_header(self):
"""自动检测表头所在行"""
if self.raw_data is None or self.raw_data.empty:
return
# 使用更高效的检测算法
best_row = 0
best_score = -1
# 只检查前20行
for i in range(min(20, len(self.raw_data))):
row = self.raw_data.iloc[i]
# 计算得分
score = 0
# 1. 非空值比例
non_null_ratio = row.notnull().mean()
score += non_null_ratio * 20
# 2. 字符串比例
str_count = sum(isinstance(x, str) for x in row)
str_ratio = str_count / len(row) if len(row) > 0 else 0
score += str_ratio * 30
# 3. 唯一值比例
unique_count = len(set(row))
unique_ratio = unique_count / len(row) if len(row) > 0 else 0
score += unique_ratio * 50
if score > best_score:
best_score = score
best_row = i
# 设置检测到的表头行
self.header_row_var.set(best_row)
def reset_analysis(self):
# 清除之前的分析结果
for tree in [self.headers_tree, self.stats_tree, self.format_tree, self.quality_tree, self.linkage_tree]:
for item in tree.get_children():
tree.delete(item)
self.col_combobox.set('')
self.col_combobox['values'] = []
self.filter_col_var.set('')
self.filter_value_var.set('')
self.filter_value_combobox['values'] = []
self.linkage_col_var.set('')
self.filter_info_var.set("未应用筛选")
self.linkage_info_var.set("未统计")
self.column_has_color = {}
def start_analysis_thread(self):
"""启动分析线程"""
if not self.file_path:
messagebox.showwarning("警告", "请先选择表格文件")
return
# 禁用分析按钮
for widget in self.root.winfo_children():
if isinstance(widget, ttk.Button) and widget.cget("text") == "分析表格":
widget.config(state=tk.DISABLED)
self.status_var.set("分析中...")
self.progress_var.set(0)
self.stop_analysis = False
# 创建并启动分析线程
self.analysis_thread = threading.Thread(target=self.analyze_table)
self.analysis_thread.daemon = True
self.analysis_thread.start()
# 启动进度更新
self.root.after(100, self.update_progress)
def stop_analysis_process(self):
"""停止分析过程"""
self.stop_analysis = True
self.status_var.set("分析已停止")
def update_progress(self):
"""更新进度条"""
if self.analysis_thread and self.analysis_thread.is_alive():
self.progress_var.set(self.progress)
self.root.after(100, self.update_progress)
else:
# 重新启用分析按钮
for widget in self.root.winfo_children():
if isinstance(widget, ttk.Button) and widget.cget("text") == "分析表格":
widget.config(state=tk.NORMAL)
self.progress_var.set(100)
def analyze_table(self):
"""执行表格分析(在后台线程中运行)"""
try:
self.header_row = self.header_row_var.get()
file_ext = os.path.splitext(self.file_path)[1].lower()
# 更新进度
self.progress = 10
time.sleep(0.1)
if self.stop_analysis:
return
if file_ext in ['.xlsx', '.xls']:
# 读取Excel文件,指定表头行
self.df = pd.read_excel(self.file_path, header=self.header_row)
self.workbook = load_workbook(self.file_path)
self.sheet = self.workbook.active
elif file_ext == '.csv':
# 读取CSV文件,指定表头行
with open(self.file_path, 'rb') as f:
result = chardet.detect(f.read())
encoding = result['encoding'] or 'utf-8'
self.df = pd.read_csv(
self.file_path,
header=self.header_row,
encoding=encoding,
engine='c'
)
elif file_ext == '.parquet':
# 读取Parquet文件
self.df = pd.read_parquet(self.file_path)
else:
messagebox.showerror("错误", "不支持的文件类型")
return
# 清理列名
self.df.columns = [str(col).strip() for col in self.df.columns]
# 初始化列底色信息
self.column_has_color = {col: False for col in self.df.columns}
# 更新进度
self.progress = 30
time.sleep(0.1)
if self.stop_analysis:
return
# 分析表头
self.analyze_headers()
# 更新进度
self.progress = 50
time.sleep(0.1)
if self.stop_analysis:
return
# 分析列格式
if file_ext in ['.xlsx', '.xls']:
self.analyze_format()
# 更新进度
self.progress = 70
time.sleep(0.1)
if self.stop_analysis:
return
# 分析数据质量
self.analyze_data_quality()
# 更新列选择框
self.col_combobox['values'] = list(self.df.columns)
if self.df.columns.size > 0:
self.col_combobox.current(0)
self.update_stats()
# 更新筛选控件(添加底色标记)
display_columns = []
for col in self.df.columns:
has_color = self.column_has_color.get(col, False)
display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)")
self.filter_col_combobox['values'] = display_columns
# 更新联动统计控件(添加底色标记)
display_columns = []
for col in self.df.columns:
has_color = self.column_has_color.get(col, False)
display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)")
self.linkage_col_combobox['values'] = display_columns
self.status_var.set("分析完成")
# 更新进度
self.progress = 100
except Exception as e:
messagebox.showerror("错误", f"分析表格时出错: {str(e)}")
self.status_var.set(f"错误: {str(e)}")
def analyze_headers(self):
# 清空表头表格
for item in self.headers_tree.get_children():
self.headers_tree.delete(item)
# 添加表头信息
for i, col in enumerate(self.df.columns):
if self.stop_analysis:
return
# 检查列是否有底色(只针对数据区域)
has_color = self.column_has_color.get(col, False)
col_display = f"{col} (有底色)" if has_color else f"{col} (无底色)"
sample = ""
# 获取前3个非空值作为示例
non_empty = self.df[col].dropna()
if len(non_empty) > 0:
sample = ", ".join(str(x) for x in non_empty.head(3).tolist())
# 推断数据类型
dtype = str(self.df[col].dtype)
dtype_map = {
'object': '文本',
'int64': '整数',
'float64': '小数',
'bool': '布尔值',
'datetime64': '日期时间',
'category': '分类数据'
}
for k, v in dtype_map.items():
if k in dtype:
dtype = v
break
# 唯一值统计
unique_count = self.df[col].nunique()
# 空值率
null_count = self.df[col].isnull().sum()
null_percent = f"{null_count / len(self.df):.1%}" if len(self.df) > 0 else "0%"
self.headers_tree.insert("", "end", values=(
i+1,
col_display, # 使用带底色标记的列名
dtype,
sample,
unique_count,
null_percent
))
def analyze_format(self):
"""分析每个单元格的底色状态"""
if not self.workbook or not self.sheet:
return
# 初始化底色状态DataFrame(与数据表相同形状)
self.cell_bg_status = pd.DataFrame(
"无底色",
index=self.df.index,
columns=self.df.columns
)
# 分析每个单元格的底色
for row_idx in range(len(self.df)):
for col_idx, col_name in enumerate(self.df.columns):
# 转换为Excel坐标(行号从1开始)
excel_row = self.header_row + 2 + row_idx
col_letter = self.get_column_letter(col_idx + 1)
cell = self.sheet[f"{col_letter}{excel_row}"]
# 检测底色
has_bg = False
if cell.fill and isinstance(cell.fill, PatternFill):
if cell.fill.fgColor and cell.fill.fgColor.rgb:
if cell.fill.fgColor.rgb != "00000000": # 排除透明
has_bg = True
# 存储底色状态
self.cell_bg_status.iloc[row_idx, col_idx] = "有底色" if has_bg else "无底色"
def analyze_data_quality(self):
"""分析数据质量问题"""
# 清空质量表格
for item in self.quality_tree.get_children():
self.quality_tree.delete(item)
issues = []
# 1. 检查空值
null_counts = self.df.isnull().sum()
total_rows = len(self.df)
for col, count in null_counts.items():
if count > 0:
severity = "高" if count / total_rows > 0.3 else "中" if count / total_rows > 0.1 else "低"
issues.append({
"issue": "空值",
"count": count,
"severity": severity,
"columns": col,
"suggestion": f"考虑使用平均值/中位数填充或删除空值行"
})
# 2. 检查重复行
duplicate_rows = self.df.duplicated().sum()
if duplicate_rows > 0:
severity = "高" if duplicate_rows / total_rows > 0.1 else "中"
issues.append({
"issue": "重复行",
"count": duplicate_rows,
"severity": severity,
"columns": "所有列",
"suggestion": "删除重复行或分析重复原因"
})
# 3. 检查数据类型不一致
for col in self.df.columns:
if self.df[col].dtype == 'object':
# 检查混合数据类型
type_counts = self.df[col].apply(type).value_counts()
if len(type_counts) > 1:
issues.append({
"issue": "数据类型不一致",
"count": len(self.df[col]),
"severity": "中",
"columns": col,
"suggestion": "统一数据类型或转换格式"
})
# 4. 检查异常值(仅数值列)
numeric_cols = self.df.select_dtypes(include=np.number).columns
for col in numeric_cols:
q1 = self.df[col].quantile(0.25)
q3 = self.df[col].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = self.df[(self.df[col] < lower_bound) | (self.df[col] > upper_bound)]
outlier_count = len(outliers)
if outlier_count > 0:
severity = "高" if outlier_count / total_rows > 0.05 else "中"
issues.append({
"issue": "异常值",
"count": outlier_count,
"severity": severity,
"columns": col,
"suggestion": "检查数据准确性或进行转换"
})
# 添加到表格
for issue in issues:
self.quality_tree.insert("", "end", values=(
issue["issue"],
issue["count"],
issue["severity"],
issue["columns"],
issue["suggestion"]
))
def get_column_letter(self, col_idx):
"""将列索引转换为Excel列字母"""
letters = []
while col_idx:
col_idx, remainder = divmod(col_idx - 1, 26)
letters.append(chr(65 + remainder))
return ''.join(reversed(letters))
def update_filter_values(self, event=None):
"""更新筛选值列表(带底色标记)"""
selected_col = self.filter_col_var.get()
if not selected_col or selected_col not in self.df.columns:
return
# 获取列的值和对应的底色状态
values = self.df[selected_col].astype(str)
bg_status = self.cell_bg_status[selected_col]
# 创建带底色标记的值列表
unique_values = set()
for val, bg in zip(values, bg_status):
unique_values.add(f"{val} ({bg})")
# 如果唯一值太多,只显示前50个
if len(unique_values) > 50:
unique_values = sorted(unique_values)[:50]
messagebox.showinfo("提示", f"该列有超过50个唯一值,只显示前50个")
# 更新值选择框
self.filter_value_combobox['values'] = sorted(unique_values)
self.filter_value_combobox.set('')
def apply_filter(self):
"""应用筛选条件(考虑底色状态)"""
if self.df is None:
messagebox.showwarning("警告", "请先分析表格")
return
selected_col = self.filter_col_var.get()
selected_value_with_bg = self.filter_value_var.get()
# 检查输入有效性
if not selected_col or not selected_value_with_bg:
messagebox.showwarning("警告", "请选择列和值")
return
try:
# 从带底色标记的值中提取原始值和底色状态
if " (有底色)" in selected_value_with_bg:
selected_value = selected_value_with_bg.replace(" (有底色)", "")
selected_bg = "有底色"
elif " (无底色)" in selected_value_with_bg:
selected_value = selected_value_with_bg.replace(" (无底色)", "")
selected_bg = "无底色"
else:
selected_value = selected_value_with_bg
selected_bg = None
# 应用值筛选
if pd.api.types.is_numeric_dtype(self.df[selected_col]):
try:
selected_value = float(selected_value)
value_condition = (self.df[selected_col] == selected_value)
except:
value_condition = (self.df[selected_col].astype(str) == selected_value)
else:
value_condition = (self.df[selected_col].astype(str) == selected_value)
# 应用底色状态筛选(如果有)
if selected_bg and hasattr(self, 'cell_bg_status') and selected_col in self.cell_bg_status:
bg_condition = (self.cell_bg_status[selected_col] == selected_bg)
condition = value_condition & bg_condition
else:
condition = value_condition
# 应用组合筛选条件
self.filtered_df = self.df[condition]
# 更新预览
self.update_preview(self.filtered_df)
# 更新筛选信息
match_count = len(self.filtered_df)
total_count = len(self.df)
self.filter_info_var.set(
f"筛选结果: {match_count} 行 (共 {total_count} 行, {match_count/total_count:.1%})"
)
# 更新统计信息
self.update_stats(self.col_var.get())
# 自动更新联动统计
if self.linkage_col_var.get() and self.linkage_col_var.get() in self.df.columns:
self.update_linkage_stats()
except Exception as e:
messagebox.showerror("错误", f"筛选数据时出错: {str(e)}")
def clear_filter(self):
"""清除筛选条件"""
if hasattr(self, 'filtered_df'):
del self.filtered_df
# 重置筛选控件
self.filter_col_var.set('')
self.filter_value_var.set('')
self.filter_value_combobox['values'] = []
# 恢复原始数据预览
if self.df is not None:
self.update_preview(self.df)
# 更新筛选信息
self.filter_info_var.set("筛选已清除")
# 更新统计信息
if self.col_var.get():
self.update_stats(self.col_var.get())
# 清除联动统计
for item in self.linkage_tree.get_children():
self.linkage_tree.delete(item)
self.linkage_info_var.set("未统计")
def update_preview(self, df):
"""更新数据预览(可选:添加底色状态)"""
if hasattr(self, 'pt'):
# 创建带底色状态的预览副本
preview_df = df.copy().head(1000)
# 添加底色状态列(如果存在)
if hasattr(self, 'cell_bg_status') and not self.cell_bg_status.empty:
for col in df.columns:
if col in self.cell_bg_status:
preview_df[f"{col}_底色状态"] = self.cell_bg_status[col].loc[preview_df.index]
# 更新现有表格
self.pt.model = TableModel(dataframe=preview_df)
self.pt.redraw()
else:
# 创建新表格
self.pt_frame = ttk.Frame(self.preview_table)
self.pt_frame.pack(fill=tk.BOTH, expand=True)
self.pt = Table(
self.pt_frame,
showtoolbar=False,
showstatusbar=True,
width=400,
height=300
)
# 创建带底色状态的预览
preview_df = df.head(1000).copy()
if hasattr(self, 'cell_bg_status') and not self.cell_bg_status.empty:
for col in df.columns:
if col in self.cell_bg_status:
preview_df[f"{col}_底色状态"] = self.cell_bg_status[col].loc[preview_df.index]
self.pt.model = TableModel(dataframe=preview_df)
self.pt.show()
def update_linkage_stats(self):
"""更新联动统计结果(考虑底色状态)"""
if not hasattr(self, 'filtered_df') or self.filtered_df.empty:
messagebox.showwarning("警告", "请先应用筛选")
return
target_col = self.linkage_col_var.get()
if not target_col:
return
# 清空现有结果
for item in self.linkage_tree.get_children():
self.linkage_tree.delete(item)
# 获取筛选后的数据
filtered_data = self.filtered_df
# 添加底色状态列(如果存在)
if hasattr(self, 'cell_bg_status') and target_col in self.cell_bg_status:
# 合并底色状态到数据中
filtered_data = filtered_data.copy()
filtered_data['底色状态'] = self.cell_bg_status[target_col].loc[filtered_data.index]
# 统计值分布(包含底色状态)
value_counts = filtered_data.groupby([target_col, '底色状态']).size().reset_index(name='计数')
total_count = len(filtered_data)
# 添加到树状视图
for _, row in value_counts.iterrows():
percent = f"{row['计数'] / total_count:.1%}"
self.linkage_tree.insert("", "end", values=(
row[target_col],
row['底色状态'],
row['计数'],
percent
))
# 更新统计信息
self.linkage_info_var.set(
f"共 {len(value_counts)} 个组合 (总计 {total_count} 行)"
)
else:
# 没有底色信息时的处理
value_counts = filtered_data[target_col].value_counts()
total_count = len(filtered_data)
for value, count in value_counts.items():
percent = f"{count / total_count:.1%}"
self.linkage_tree.insert("", "end", values=(value, "", count, percent))
self.linkage_info_var.set(
f"共 {len(value_counts)} 个唯一值 (总计 {total_count} 行)"
)
def perform_advanced_analysis(self):
"""执行高级分析"""
if self.df is None or self.df.empty:
messagebox.showwarning("警告", "请先分析表格")
return
self.status_var.set("执行高级分析...")
self.progress_var.set(0)
# 备份原始底色状态
original_bg_status = self.cell_bg_status.copy() if hasattr(self, 'cell_bg_status') else None
try:
# 缺失值处理
missing_strategy = self.missing_var.get()
if missing_strategy != "不处理":
numeric_cols = self.df.select_dtypes(include=np.number).columns
categorical_cols = self.df.select_dtypes(include='object').columns
if missing_strategy == "删除行":
self.df = self.df.dropna()
else:
if missing_strategy == "平均值填充":
strategy = 'mean'
elif missing_strategy == "中位数填充":
strategy = 'median'
else: # 众数填充
strategy = 'most_frequent'
# 数值列填充
if len(numeric_cols) > 0:
num_imputer = SimpleImputer(strategy=strategy)
self.df[numeric_cols] = num_imputer.fit_transform(self.df[numeric_cols])
# 分类列填充
if len(categorical_cols) > 0:
cat_imputer = SimpleImputer(strategy='most_frequent')
self.df[categorical_cols] = cat_imputer.fit_transform(self.df[categorical_cols])
# 异常值检测
if self.outlier_var.get():
numeric_cols = self.df.select_dtypes(include=np.number).columns
if len(numeric_cols) > 0:
clf = IsolationForest(contamination=0.05, random_state=42)
outliers = clf.fit_predict(self.df[numeric_cols])
self.df['is_outlier'] = outliers == -1
# 在界面上显示异常值数量
outlier_count = self.df['is_outlier'].sum()
messagebox.showinfo("异常值检测", f"检测到 {outlier_count} 个异常值\n已添加 'is_outlier' 列标记")
# 数据标准化
if self.scale_var.get():
numeric_cols = self.df.select_dtypes(include=np.number).columns
if len(numeric_cols) > 0:
for col in numeric_cols:
min_val = self.df[col].min()
max_val = self.df[col].max()
if max_val > min_val:
self.df[col] = (self.df[col] - min_val) / (max_val - min_val)
# 更新分析结果
self.analyze_headers()
self.analyze_data_quality()
# 更新列选择框
self.col_combobox['values'] = list(self.df.columns)
if self.df.columns.size > 0:
self.col_combobox.current(0)
self.update_stats()
# 高级分析后恢复底色状态
if original_bg_status is not None:
# 只保留现有行的底色状态
self.cell_bg_status = original_bg_status.loc[self.df.index]
# 处理新增列(如果没有底色状态,设为"无底色")
for col in self.df.columns:
if col not in self.cell_bg_status:
self.cell_bg_status[col] = "无底色"
self.status_var.set("高级分析完成")
self.progress_var.set(100)
except Exception as e:
if original_bg_status is not None:
self.cell_bg_status = original_bg_status
messagebox.showerror("错误", f"高级分析时出错: {str(e)}")
self.status_var.set(f"错误: {str(e)}")
if __name__ == "__main__":
root = tk.Tk()
app = TableAnalyzer(root)
root.mainloop()
TableAnalyzer’ object has no attribute ‘apply_filter’
File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 281, in create_filter_section
command=self.apply_filter,
File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 72, in init
self.create_filter_section(right_panel)
File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 1445, in
app = TableAnalyzer(root)
AttributeError: ‘TableAnalyzer’ object has no attribute ‘apply_filter’
最新发布