def analyze_table(self):
"""执行表格分析(在后台线程中运行)"""
try:
# 重置分析状态
self.reset_analysis()
self.header_row = self.header_row_var.get()
file_ext = os.path.splitext(self.file_path)[1].lower()
# 更新进度
self.progress = 10
time.sleep(0.1)
if self.stop_analysis:
return
# 确保有原始数据
if self.raw_data is None:
self.load_and_preview()
if file_ext in ['.xlsx', '.xls']:
# 读取Excel文件,指定表头行
self.df = pd.read_excel(self.file_path, header=self.header_row)
self.workbook = load_workbook(self.file_path)
self.sheet = self.workbook.active
elif file_ext == '.csv':
# 读取CSV文件,指定表头行
with open(self.file_path, 'rb') as f:
result = chardet.detect(f.read())
encoding = result['encoding'] or 'utf-8'
self.df = pd.read_csv(
self.file_path,
header=self.header_row,
encoding=encoding,
engine='c'
)
elif file_ext == '.parquet':
# 读取Parquet文件
self.df = pd.read_parquet(self.file_path)
else:
messagebox.showerror("错误", "不支持的文件类型")
return
# 确保DataFrame不为空
if self.df is None or self.df.empty:
raise ValueError("加载的数据为空")
# 清理列名
self.df.columns = [str(col).strip() for col in self.df.columns]
# 初始化列底色信息
self.column_has_color = {col: False for col in self.df.columns}
# 更新进度
self.progress = 30
time.sleep(0.1)
if self.stop_analysis:
return
# 分析表头
self.analyze_headers()
# 更新进度
self.progress = 50
time.sleep(0.1)
if self.stop_analysis:
return
# 分析列格式
if file_ext in ['.xlsx', '.xls']:
self.analyze_format()
# 更新进度
self.progress = 70
time.sleep(0.1)
if self.stop_analysis:
return
# 分析数据质量
self.analyze_data_quality()
# 更新列选择框
self.col_combobox['values'] = list(self.df.columns)
if self.df.columns.size > 0:
self.col_combobox.current(0)
self.update_stats()
# 更新筛选控件(添加底色标记)
display_columns = []
for col in self.df.columns:
has_color = self.column_has_color.get(col, False)
display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)")
self.filter_col_combobox['values'] = display_columns
# 更新联动统计控件(添加底色标记)
display_columns = []
for col in self.df.columns:
has_color = self.column_has_color.get(col, False)
display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)")
self.linkage_col_combobox['values'] = display_columns
self.status_var.set("分析完成")
# 更新进度
self.progress = 100
except Exception as e:
error_msg = f"分析表格时出错: {str(e)}"
logging.error(error_msg)
logging.error(traceback.format_exc())
messagebox.showerror("分析错误", error_msg)
self.status_var.set(f"错误: {error_msg}")
def analyze_headers(self):
# 清空表头表格
for item in self.headers_tree.get_children():
self.headers_tree.delete(item)
# 添加表头信息
for i, col in enumerate(self.df.columns):
if self.stop_analysis:
return
# 检查列是否有底色(只针对数据区域)
has_color = self.column_has_color.get(col, False)
col_display = f"{col} (有底色)" if has_color else f"{col} (无底色)"
sample = ""
# 获取前3个非空值作为示例
non_empty = self.df[col].dropna()
if len(non_empty) > 0:
sample = ", ".join(str(x) for x in non_empty.head(3).tolist())
# 推断数据类型
dtype = str(self.df[col].dtype)
dtype_map = {
'object': '文本',
'int64': '整数',
'float64': '小数',
'bool': '布尔值',
'datetime64': '日期时间',
'category': '分类数据'
}
for k, v in dtype_map.items():
if k in dtype:
dtype = v
break
# 唯一值统计
unique_count = self.df[col].nunique()
# 空值率
null_count = self.df[col].isnull().sum()
null_percent = f"{null_count / len(self.df):.1%}" if len(self.df) > 0 else "0%"
self.headers_tree.insert("", "end", values=(
i+1,
col_display, # 使用带底色标记的列名
dtype,
sample,
unique_count,
null_percent
))
def analyze_format(self):
"""分析每个单元格的底色状态"""
if not self.workbook or not self.sheet:
return
# 初始化底色状态DataFrame(与数据表相同形状)
self.cell_bg_status = pd.DataFrame(
"无底色",
index=self.df.index,
columns=self.df.columns
)
# 分析每个单元格的底色
for row_idx in range(len(self.df)):
for col_idx, col_name in enumerate(self.df.columns):
# 转换为Excel坐标(行号从1开始)
excel_row = self.header_row + 2 + row_idx
col_letter = self.get_column_letter(col_idx + 1)
cell = self.sheet[f"{col_letter}{excel_row}"]
# 检测底色
has_bg = False
if cell.fill and isinstance(cell.fill, PatternFill):
if cell.fill.fgColor and cell.fill.fgColor.rgb:
if cell.fill.fgColor.rgb != "00000000": # 排除透明
has_bg = True
# 存储底色状态
self.cell_bg_status.iloc[row_idx, col_idx] = "有底色" if has_bg else "无底色"
# 标记列是否有底色
if has_bg:
self.column_has_color[col_name] = True
def analyze_data_quality(self):
"""分析数据质量问题"""
# 清空质量表格
for item in self.quality_tree.get_children():
self.quality_tree.delete(item)
issues = []
total_rows = len(self.df)
# 1. 检查空值
null_counts = self.df.isnull().sum()
for col, count in null_counts.items():
if count > 0:
severity = "高" if count / total_rows > 0.3 else "中" if count / total_rows > 0.1 else "低"
issues.append({
"issue": "空值",
"count": count,
"severity": severity,
"columns": col,
"suggestion": f"考虑使用平均值/中位数填充或删除空值行"
})
# 2. 检查重复行
duplicate_rows = self.df.duplicated().sum()
if duplicate_rows > 0:
severity = "高" if duplicate_rows / total_rows > 0.1 else "中"
issues.append({
"issue": "重复行",
"count": duplicate_rows,
"severity": severity,
"columns": "所有列",
"suggestion": "删除重复行或分析重复原因"
})
# 3. 检查数据类型不一致
for col in self.df.columns:
if self.df[col].dtype == 'object':
# 检查混合数据类型
type_counts = self.df[col].apply(type).value_counts()
if len(type_counts) > 1:
issues.append({
"issue": "数据类型不一致",
"count": len(self.df[col]),
"severity": "中",
"columns": col,
"suggestion": "统一数据类型或转换格式"
})
# 4. 检查异常值(仅数值列)
numeric_cols = self.df.select_dtypes(include=np.number).columns
for col in numeric_cols:
q1 = self.df[col].quantile(0.25)
q3 = self.df[col].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = self.df[(self.df[col] < lower_bound) | (self.df[col] > upper_bound)]
outlier_count = len(outliers)
if outlier_count > 0:
severity = "高" if outlier_count / total_rows > 0.05 else "中"
issues.append({
"issue": "异常值",
"count": outlier_count,
"severity": severity,
"columns": col,
"suggestion": "检查数据准确性或进行转换"
})
# 添加到表格
for issue in issues:
self.quality_tree.insert("", "end", values=(
issue["issue"],
issue["count"],
issue["severity"],
issue["columns"],
issue["suggestion"]
))
def get_column_letter(self, col_idx):
"""将列索引转换为Excel列字母"""
letters = []
while col_idx:
col_idx, remainder = divmod(col_idx - 1, 26)
letters.append(chr(65 + remainder))
return ''.join(reversed(letters))
def update_filter_values(self, event=None):
"""更新筛选值列表(带底色标记)"""
selected_col_display = self.filter_col_var.get()
if not selected_col_display:
return
# 从显示列名中提取原始列名(移除底色标记)
if " (有底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (有底色)", "")
elif " (无底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (无底色)", "")
else:
selected_col = selected_col_display
if not selected_col or selected_col not in self.df.columns:
return
# 获取列的值和对应的底色状态
values = self.df[selected_col].astype(str)
# 创建带底色标记的值列表
unique_values = set()
if not self.cell_bg_status.empty and selected_col in self.cell_bg_status:
bg_status = self.cell_bg_status[selected_col]
for val, bg in zip(values, bg_status):
unique_values.add(f"{val} ({bg})")
else:
unique_values = set(values)
# 如果唯一值太多,只显示前50个
if len(unique_values) > 50:
unique_values = sorted(unique_values)[:50]
messagebox.showinfo("提示", f"该列有超过50个唯一值,只显示前50个")
# 更新值选择框
self.filter_value_combobox['values'] = sorted(unique_values)
self.filter_value_combobox.set('')
def apply_filter(self):
"""应用筛选条件(考虑底色状态)"""
if self.df is None:
messagebox.showwarning("警告", "请先分析表格")
return
selected_col_display = self.filter_col_var.get()
selected_value_with_bg = self.filter_value_var.get()
# 检查输入有效性
if not selected_col_display or not selected_value_with_bg:
messagebox.showwarning("警告", "请选择列和值")
return
try:
# 从显示列名中提取原始列名(移除底色标记)
if " (有底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (有底色)", "")
elif " (无底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (无底色)", "")
else:
selected_col = selected_col_display
# 从带底色标记的值中提取原始值和底色状态
if " (有底色)" in selected_value_with_bg:
selected_value = selected_value_with_bg.replace(" (有底色)", "")
selected_bg = "有底色"
elif " (无底色)" in selected_value_with_bg:
selected_value = selected_value_with_bg.replace(" (无底色)", "")
selected_bg = "无底色"
else:
selected_value = selected_value_with_bg
selected_bg = None
# 应用值筛选
if pd.api.types.is_numeric_dtype(self.df[selected_col]):
try:
selected_value = float(selected_value)
value_condition = (self.df[selected_col] == selected_value)
except:
value_condition = (self.df[selected_col].astype(str) == selected_value)
else:
value_condition = (self.df[selected_col].astype(str) == selected_value)
# 应用底色状态筛选(如果有)
if selected_bg and not self.cell_bg_status.empty and selected_col in self.cell_bg_status:
bg_condition = (self.cell_bg_status[selected_col] == selected_bg)
condition = value_condition & bg_condition
else:
condition = value_condition
# 应用组合筛选条件
self.filtered_df = self.df[condition]
# 更新预览
self.update_preview(self.filtered_df)
# 更新筛选信息
match_count = len(self.filtered_df)
total_count = len(self.df)
self.filter_info_var.set(
f"筛选结果: {match_count} 行 (共 {total_count} 行, {match_count/total_count:.1%})"
)
# 更新统计信息
if self.col_var.get():
self.update_stats()
except Exception as e:
messagebox.showerror("错误", f"筛选数据时出错: {str(e)}")
def clear_filter(self):
"""清除筛选条件"""
self.filtered_df = None
# 重置筛选控件
self.filter_col_var.set('')
self.filter_value_var.set('')
self.filter_value_combobox['values'] = []
# 恢复原始数据预览
if self.df is not None:
self.update_preview(self.df)
# 更新筛选信息
self.filter_info_var.set("筛选已清除")
# 更新统计信息
if self.col_var.get():
self.update_stats()
# 清除联动统计
for item in self.linkage_tree.get_children():
self.linkage_tree.delete(item)
self.linkage_info_var.set("未统计")
def update_preview(self, df):
"""更新数据预览 - 增强健壮性"""
try:
# 确保有数据
if df is None or df.empty:
logging.warning("尝试更新预览时数据为空")
return
# 创建带底色状态的预览副本
preview_df = df.copy().head(1000)
# 添加底色状态列(如果存在)
if not self.cell_bg_status.empty:
for col in df.columns:
if col in self.cell_bg_status:
preview_df[f"{col}_底色状态"] = self.cell_bg_status[col].loc[preview_df.index]
# 更新或创建表格
if self.pt is not None and self.pt_frame is not None:
try:
# 更新现有表格
self.pt.model = TableModel(dataframe=preview_df)
self.pt.redraw()
except Exception as e:
logging.error(f"更新预览时出错: {str(e)}")
# 尝试重新创建表格
self.safe_destroy_preview()
self.create_preview_table(preview_df)
else:
# 创建新表格
self.create_preview_table(preview_df)
except Exception as e:
logging.error(f"更新预览失败: {str(e)}")
messagebox.showerror("预览错误", f"无法更新预览: {str(e)}")
def create_preview_table(self, df):
"""创建新的预览表格"""
try:
# 安全销毁旧组件
self.safe_destroy_preview()
# 创建新框架
self.pt_frame = ttk.Frame(self.preview_table)
self.pt_frame.pack(fill=tk.BOTH, expand=True)
# 创建新表格
self.pt = Table(
self.pt_frame,
showtoolbar=False,
showstatusbar=True,
width=400,
height=300
)
self.pt.model = TableModel(dataframe=df)
self.pt.show()
except Exception as e:
logging.error(f"创建预览表格失败: {str(e)}")
messagebox.showerror("预览错误", f"无法创建预览表格: {str(e)}")
def update_linkage_stats(self, event=None):
"""更新联动统计结果(考虑底色状态)"""
if not hasattr(self, 'df') or self.df.empty:
messagebox.showwarning("警告", "请先分析表格")
return
target_col_display = self.linkage_col_var.get()
if not target_col_display:
return
# 从显示列名中提取原始列名(移除底色标记)
if " (有底色)" in target_col_display:
target_col = target_col_display.replace(" (有底色)", "")
elif " (无底色)" in target_col_display:
target_col = target_col_display.replace(" (无底色)", "")
else:
target_col = target_col_display
# 清空现有结果
for item in self.linkage_tree.get_children():
self.linkage_tree.delete(item)
# 确定使用筛选数据还是完整数据
data_source = self.filtered_df if self.filtered_df is not None else self.df
# 添加底色状态列(如果存在)
if not self.cell_bg_status.empty and target_col in self.cell_bg_status:
# 合并底色状态到数据中
temp_df = data_source.copy()
temp_df['底色状态'] = self.cell_bg_status[target_col].loc[temp_df.index]
# 统计值分布(包含底色状态)
value_counts = temp_df.groupby([target_col, '底色状态']).size().reset_index(name='计数')
total_count = len(data_source)
# 添加到树状视图
for _, row in value_counts.iterrows():
percent = f"{row['计数'] / total_count:.1%}"
self.linkage_tree.insert("", "end", values=(
row[target_col],
row['底色状态'],
row['计数'],
percent
))
# 更新统计信息
self.linkage_info_var.set(
f"共 {len(value_counts)} 个组合 (总计 {total_count} 行)"
)
else:
# 没有底色信息时的处理
value_counts = data_source[target_col].value_counts()
total_count = len(data_source)
for value, count in value_counts.items():
percent = f"{count / total_count:.1%}"
self.linkage_tree.insert("", "end", values=(value, "", count, percent))
self.linkage_info_var.set(
f"共 {len(value_counts)} 个唯一值 (总计 {total_count} 行)"
)
def update_stats(self, event=None):
"""更新列统计信息"""
# 清空统计表格
for item in self.stats_tree.get_children():
self.stats_tree.delete(item)
selected_col_display = self.col_var.get()
if not selected_col_display:
return
# 从显示列名中提取原始列名(移除底色标记)
if " (有底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (有底色)", "")
elif " (无底色)" in selected_col_display:
selected_col = selected_col_display.replace(" (无底色)", "")
else:
selected_col = selected_col_display
if not selected_col or selected_col not in self.df.columns:
return
col_data = self.df[selected_col]
# 基本统计信息
self.stats_tree.insert("", "end", values=("列名", selected_col))
self.stats_tree.insert("", "end", values=("数据类型", str(col_data.dtype)))
self.stats_tree.insert("", "end", values=("总行数", len(col_data)))
non_null_count = col_data.count()
null_count = len(col_data) - non_null_count
self.stats_tree.insert("", "end", values=("非空值数量", non_null_count))
self.stats_tree.insert("", "end", values=("空值数量", null_count))
self.stats_tree.insert("", "end", values=("空值比例", f"{null_count/len(col_data):.2%}" if len(col_data) > 0 else "0%"))
# 值统计
if pd.api.types.is_numeric_dtype(col_data):
try:
self.stats_tree.insert("", "end", values=("平均值", f"{col_data.mean():.4f}"))
self.stats_tree.insert("", "end", values=("中位数", f"{col_data.median():.4f}"))
self.stats_tree.insert("", "end", values=("最小值", f"{col_data.min():.4f}"))
self.stats_tree.insert("", "end", values=("最大值", f"{col_data.max():.4f}"))
self.stats_tree.insert("", "end", values=("标准差", f"{col_data.std():.4f}"))
self.stats_tree.insert("", "end", values=("偏度", f"{col_data.skew():.4f}"))
self.stats_tree.insert("", "end", values=("峰度", f"{col_data.kurtosis():.4f}"))
self.stats_tree.insert("", "end", values=("总和", f"{col_data.sum():.4f}"))
except:
pass
# 唯一值统计
unique_count = col_data.nunique()
self.stats_tree.insert("", "end", values=("唯一值数量", unique_count))
unique_ratio = unique_count/non_null_count if non_null_count > 0 else 0
self.stats_tree.insert("", "end", values=("唯一值比例", f"{unique_ratio:.2%}"))
# 最常出现的值
if non_null_count > 0:
try:
top_values = col_data.value_counts().head(3)
top_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()])
self.stats_tree.insert("", "end", values=("最常见值", top_str))
except:
pass
def perform_advanced_analysis(self):
"""执行高级分析"""
if self.df is None or self.df.empty:
messagebox.showwarning("警告", "请先分析表格")
return
self.status_var.set("执行高级分析...")
self.progress_var.set(0)
# 备份原始底色状态
original_bg_status = self.cell_bg_status.copy() if not self.cell_bg_status.empty else None
try:
# 缺失值处理
missing_strategy = self.missing_var.get()
if missing_strategy != "不处理":
numeric_cols = self.df.select_dtypes(include=np.number).columns
categorical_cols = self.df.select_dtypes(include='object').columns
if missing_strategy == "删除行":
self.df = self.df.dropna()
else:
if missing_strategy == "平均值填充":
strategy = 'mean'
elif missing_strategy == "中位数填充":
strategy = 'median'
else: # 众数填充
strategy = 'most_frequent'
# 数值列填充
if len(numeric_cols) > 0:
num_imputer = SimpleImputer(strategy=strategy)
self.df[numeric_cols] = num_imputer.fit_transform(self.df[numeric_cols])
# 分类列填充
if len(categorical_cols) > 0:
cat_imputer = SimpleImputer(strategy='most_frequent')
self.df[categorical_cols] = cat_imputer.fit_transform(self.df[categorical_cols])
# 异常值检测
if self.outlier_var.get():
numeric_cols = self.df.select_dtypes(include=np.number).columns
if len(numeric_cols) > 0:
clf = IsolationForest(contamination=0.05, random_state=42)
outliers = clf.fit_predict(self.df[numeric_cols])
self.df['is_outlier'] = outliers == -1
# 在界面上显示异常值数量
outlier_count = self.df['is_outlier'].sum()
messagebox.showinfo("异常值检测", f"检测到 {outlier_count} 个异常值\n已添加 'is_outlier' 列标记")
# 数据标准化
if self.scale_var.get():
numeric_cols = self.df.select_dtypes(include=np.number).columns
if len(numeric_cols) > 0:
for col in numeric_cols:
min_val = self.df[col].min()
max_val = self.df[col].max()
if max_val > min_val:
self.df[col] = (self.df[col] - min_val) / (max_val - min_val)
# 更新分析结果
self.analyze_headers()
self.analyze_data_quality()
# 更新列选择框
self.col_combobox['values'] = list(self.df.columns)
if self.df.columns.size > 0:
self.col_combobox.current(0)
self.update_stats()
# 高级分析后恢复底色状态
if original_bg_status is not None:
# 只保留现有行的底色状态
self.cell_bg_status = original_bg_status.loc[self.df.index]
# 处理新增列(如果没有底色状态,设为"无底色")
for col in self.df.columns:
if col not in self.cell_bg_status:
self.cell_bg_status[col] = "无底色"
self.status_var.set("高级分析完成")
self.progress_var.set(100)
except Exception as e:
if original_bg_status is not None:
self.cell_bg_status = original_bg_status
messagebox.showerror("错误", f"高级分析时出错: {str(e)}")
self.status_var.set(f"错误: {str(e)}")
if __name__ == "__main__":
# 设置未捕获异常的全局处理
def handle_exception(exc_type, exc_value, exc_traceback):
"""处理未捕获的异常"""
error_msg = f"未捕获异常:\n{exc_type.__name__}: {exc_value}"
logging.critical(error_msg)
logging.critical(traceback.format_exc())
# 在GUI中显示错误
root = tk.Tk()
root.withdraw() # 隐藏主窗口
messagebox.showerror("未捕获异常", error_msg)
root.destroy()
sys.exit(1)
sys.excepthook = handle_exception
try:
root = tk.Tk()
app = TableAnalyzer(root)
root.mainloop()
except Exception as e:
error_msg = f"初始化应用失败: {str(e)}"
logging.critical(error_msg)
logging.critical(traceback.format_exc())
messagebox.showerror("致命错误", error_msg)
sys.exit(1)
后面要怎么改
最新发布