pd.value_counts(db)

博客提及使用pd.value_counts对数据db进行处理,涉及pandas库中的value_counts方法,这是信息技术领域数据处理的常用操作。
pd.value_counts(db)
import os import pandas as pd import numpy as np import tkinter as tk from tkinter import ttk, filedialog, messagebox from openpyxl import load_workbook from openpyxl.styles import PatternFill, Font, colors import chardet import threading import time from pandastable import Table, TableModel from sklearn.impute import SimpleImputer from sklearn.ensemble import IsolationForest class TableAnalyzer: def __init__(self, root): self.root = root self.root.title("智能表格分析工具") self.root.geometry("1200x800") self.root.configure(bg='#f5f7fa') # 设置窗口图标 try: self.root.iconbitmap('table_icon.ico') except: pass # 创建主框架 self.main_frame = ttk.Frame(root, padding=15) self.main_frame.pack(fill=tk.BOTH, expand=True) # 创建标题 title_frame = ttk.Frame(self.main_frame) title_frame.pack(fill=tk.X, pady=(0, 15)) title_label = ttk.Label( title_frame, text="智能表格分析工具", font=("Arial", 18, "bold"), foreground="#2c3e50" ) title_label.pack(side=tk.LEFT) # 状态指示器 self.status_var = tk.StringVar(value="就绪") status_label = ttk.Label( title_frame, textvariable=self.status_var, font=("Arial", 10), foreground="#7f8c8d" ) status_label.pack(side=tk.RIGHT, padx=10) # 创建左右分栏 paned_window = ttk.PanedWindow(self.main_frame, orient=tk.HORIZONTAL) paned_window.pack(fill=tk.BOTH, expand=True) # 左侧面板 (控制和预览) left_panel = ttk.Frame(paned_window, padding=10) paned_window.add(left_panel, weight=1) # 右侧面板 (分析和筛选) right_panel = ttk.Frame(paned_window, padding=10) paned_window.add(right_panel, weight=1) # 左侧面板内容 self.create_file_selection_section(left_panel) self.create_header_detection_section(left_panel) self.create_preview_section(left_panel) # 右侧面板内容 self.create_filter_section(right_panel) self.create_linkage_section(right_panel) self.create_analysis_section(right_panel) self.create_advanced_analysis_section(right_panel) # 初始化变量 self.file_path = "" self.df = None self.workbook = None self.sheet = None self.header_row = 0 self.raw_data = None self.analysis_thread = None self.stop_analysis = False self.progress = 0 #self.column_has_color = {} # 存储列是否有底色 # 设置样式 self.configure_styles() self.cell_bg_status = None # 存储单元格底色状态的DataFrame def configure_styles(self): style = ttk.Style() style.theme_use('clam') # 主框架样式 style.configure('TFrame', background='#f5f7fa') # 标签框架样式 style.configure('TLabelframe', background='#ffffff', borderwidth=2) style.configure('TLabelframe.Label', background='#ffffff', foreground='#2c3e50', font=('Arial', 10, 'bold')) # 按钮样式 style.configure('TButton', background='#3498db', foreground='white', font=('Arial', 10), borderwidth=1, padding=5) style.map('TButton', background=[('active', '#2980b9')]) # 强调按钮样式 style.configure('Accent.TButton', background='#2ecc71', foreground='white') style.map('Accent.TButton', background=[('active', '#27ae60')]) # 危险按钮样式 style.configure('Danger.TButton', background='#e74c3c', foreground='white') style.map('Danger.TButton', background=[('active', '#c0392b')]) # 树状视图样式 style.configure('Treeview', background='#ffffff', fieldbackground='#ffffff', foreground='#2c3e50', rowheight=25, font=('Arial', 9)) style.configure('Treeview.Heading', background='#3498db', foreground='white', font=('Arial', 10, 'bold')) style.map('Treeview', background=[('selected', '#3498db')]) # 进度条样式 style.configure('Custom.Horizontal.TProgressbar', background='#3498db', thickness=20) # 标签样式 style.configure('TLabel', background='#f5f7fa', font=('Arial', 10)) style.configure('Title.TLabel', font=('Arial', 12, 'bold'), foreground='#2c3e50') # 组合框样式 style.configure('TCombobox', fieldbackground='white', background='white') def create_file_selection_section(self, parent): # 文件选择框架 file_frame = ttk.LabelFrame(parent, text="文件选择") file_frame.pack(fill=tk.X, padx=5, pady=5) # 文件路径输入框 path_frame = ttk.Frame(file_frame) path_frame.pack(fill=tk.X, padx=10, pady=10) self.path_var = tk.StringVar() path_entry = ttk.Entry(path_frame, textvariable=self.path_var, state='readonly') path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10)) # 浏览按钮 browse_btn = ttk.Button( path_frame, text="浏览文件", command=self.browse_file, width=10 ) browse_btn.pack(side=tk.RIGHT) # 进度条 self.progress_var = tk.DoubleVar() progress_bar = ttk.Progressbar( file_frame, variable=self.progress_var, maximum=100, style='Custom.Horizontal.TProgressbar' ) progress_bar.pack(fill=tk.X, padx=10, pady=(0, 10)) # 文件类型提示 type_label = ttk.Label( file_frame, text="支持文件类型: Excel (.xlsx, .xls), CSV (.csv), Parquet (.parquet)", foreground="#7f8c8d", font=("Arial", 9) ) type_label.pack(side=tk.LEFT, padx=10, pady=(0, 5)) def create_header_detection_section(self, parent): # 表头检测框架 header_frame = ttk.LabelFrame(parent, text="表头检测与设置") header_frame.pack(fill=tk.X, padx=5, pady=5) # 表头行设置 row_frame = ttk.Frame(header_frame) row_frame.pack(fill=tk.X, padx=10, pady=10) ttk.Label(row_frame, text="表头所在行:").pack(side=tk.LEFT, padx=(0, 5)) self.header_row_var = tk.IntVar(value=0) self.header_row_spin = ttk.Spinbox( row_frame, from_=0, to=100, width=5, textvariable=self.header_row_var ) self.header_row_spin.pack(side=tk.LEFT, padx=(0, 10)) # 自动检测按钮 detect_btn = ttk.Button( row_frame, text="自动检测表头", command=self.auto_detect_header ) detect_btn.pack(side=tk.LEFT, padx=(0, 10)) # 分析按钮 analyze_btn = ttk.Button( row_frame, text="分析表格", command=self.start_analysis_thread, style="Accent.TButton", width=10 ) analyze_btn.pack(side=tk.RIGHT) # 停止按钮 stop_btn = ttk.Button( row_frame, text="停止分析", command=self.stop_analysis_process, style="Danger.TButton", width=10 ) stop_btn.pack(side=tk.RIGHT, padx=(0, 10)) def create_preview_section(self, parent): # 数据预览框架 preview_frame = ttk.LabelFrame(parent, text="数据预览") preview_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 创建表格预览 self.preview_table = ttk.Frame(preview_frame) self.preview_table.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 初始占位符 self.preview_placeholder = ttk.Label( preview_frame, text="选择文件后显示数据预览", foreground="#7f8c8d", font=("Arial", 11) ) self.preview_placeholder.pack(expand=True) def create_filter_section(self, parent): """创建数据筛选区域""" # 筛选框架 filter_frame = ttk.LabelFrame(parent, text="数据筛选") filter_frame.pack(fill=tk.X, padx=5, pady=5) # 筛选控制区域 control_frame = ttk.Frame(filter_frame) control_frame.pack(fill=tk.X, padx=10, pady=10) # 列选择 ttk.Label(control_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 5)) self.filter_col_var = tk.StringVar() self.filter_col_combobox = ttk.Combobox( control_frame, textvariable=self.filter_col_var, state="readonly", width=20 ) self.filter_col_combobox.pack(side=tk.LEFT, padx=(0, 10)) self.filter_col_combobox.bind("<<ComboboxSelected>>", self.update_filter_values) # 值选择 ttk.Label(control_frame, text="选择值:").pack(side=tk.LEFT, padx=(0, 5)) self.filter_value_var = tk.StringVar() self.filter_value_combobox = ttk.Combobox( control_frame, textvariable=self.filter_value_var, state="readonly", width=20 ) self.filter_value_combobox.pack(side=tk.LEFT, padx=(0, 10)) # 筛选按钮 filter_btn = ttk.Button( control_frame, text="应用筛选", command=self.apply_filter, style="Accent.TButton" ) filter_btn.pack(side=tk.LEFT, padx=(0, 10)) # 清除筛选按钮 clear_btn = ttk.Button( control_frame, text="清除筛选", command=self.clear_filter, style="Danger.TButton" ) clear_btn.pack(side=tk.LEFT) # 筛选结果信息 self.filter_info_var = tk.StringVar(value="未应用筛选") filter_info_label = ttk.Label( filter_frame, textvariable=self.filter_info_var, foreground="#3498db", font=("Arial", 10, "bold") ) filter_info_label.pack(side=tk.RIGHT, padx=10, pady=(0, 5)) def create_linkage_section(self, parent): """创建联动统计区域""" # 联动统计框架 linkage_frame = ttk.LabelFrame(parent, text="联动统计") linkage_frame.pack(fill=tk.X, padx=5, pady=5) # 控制区域 control_frame = ttk.Frame(linkage_frame) control_frame.pack(fill=tk.X, padx=10, pady=10) # 目标列选择 ttk.Label(control_frame, text="统计列:").pack(side=tk.LEFT, padx=(0, 5)) self.linkage_col_var = tk.StringVar() self.linkage_col_combobox = ttk.Combobox( control_frame, textvariable=self.linkage_col_var, state="readonly", width=20 ) self.linkage_col_combobox.pack(side=tk.LEFT, padx=(0, 10)) # 统计按钮 stats_btn = ttk.Button( control_frame, text="统计分布", command=self.update_linkage_stats, style="Accent.TButton" ) stats_btn.pack(side=tk.LEFT, padx=(0, 10)) # 统计结果框架 stats_frame = ttk.Frame(linkage_frame) stats_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 创建树状视图显示统计结果 columns = ("value", "count", "percent") self.linkage_tree = ttk.Treeview( stats_frame, columns=columns, show="headings", height=8 ) # 设置列 col_widths = [200, 100, 100] col_headings = ["值", "出现次数", "占比"] for col, width, heading in zip(columns, col_widths, col_headings): self.linkage_tree.column(col, width=width, anchor=tk.CENTER) self.linkage_tree.heading(col, text=heading) # 添加滚动条 scrollbar = ttk.Scrollbar( stats_frame, orient="vertical", command=self.linkage_tree.yview ) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.linkage_tree.configure(yscrollcommand=scrollbar.set) self.linkage_tree.pack(fill=tk.BOTH, expand=True) # 统计信息 self.linkage_info_var = tk.StringVar(value="未统计") linkage_info_label = ttk.Label( linkage_frame, textvariable=self.linkage_info_var, foreground="#3498db", font=("Arial", 10, "bold") ) linkage_info_label.pack(side=tk.RIGHT, padx=10, pady=(0, 5)) def create_analysis_section(self, parent): # 分析结果框架 analysis_frame = ttk.LabelFrame(parent, text="数据分析结果") analysis_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 创建分页控件 self.notebook = ttk.Notebook(analysis_frame) self.notebook.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # 表头识别标签页 self.headers_tab = ttk.Frame(self.notebook) self.notebook.add(self.headers_tab, text="表头识别") # 列统计标签页 self.stats_tab = ttk.Frame(self.notebook) self.notebook.add(self.stats_tab, text="列统计") # 格式分析标签页 self.format_tab = ttk.Frame(self.notebook) self.notebook.add(self.format_tab, text="格式分析") # 数据质量标签页 self.quality_tab = ttk.Frame(self.notebook) self.notebook.add(self.quality_tab, text="数据质量") # 初始化标签页内容 self.initialize_tabs() def initialize_tabs(self): # 表头识别标签页内容 headers_frame = ttk.Frame(self.headers_tab) headers_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 表头表格 columns = ("index", "header", "data_type", "sample", "unique", "null_percent") self.headers_tree = ttk.Treeview( headers_frame, columns=columns, show="headings", height=8 ) # 设置列 col_widths = [50, 150, 100, 200, 80, 80] col_anchors = [tk.CENTER, tk.W, tk.W, tk.W, tk.CENTER, tk.CENTER] col_headings = ["序号", "表头名称", "数据类型", "示例值", "唯一值", "空值率"] for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings): self.headers_tree.column(col, width=width, anchor=anchor) self.headers_tree.heading(col, text=heading) # 添加滚动条 scrollbar = ttk.Scrollbar( headers_frame, orient="vertical", command=self.headers_tree.yview ) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.headers_tree.configure(yscrollcommand=scrollbar.set) self.headers_tree.pack(fill=tk.BOTH, expand=True) # 列统计标签页内容 stats_frame = ttk.Frame(self.stats_tab) stats_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 列选择框 col_frame = ttk.Frame(stats_frame) col_frame.pack(fill=tk.X, pady=(0, 10)) ttk.Label(col_frame, text="选择列:").pack(side=tk.LEFT, padx=(0, 5)) self.col_var = tk.StringVar() self.col_combobox = ttk.Combobox( col_frame, textvariable=self.col_var, state="readonly", width=30 ) self.col_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=(0, 10)) self.col_combobox.bind("<<ComboboxSelected>>", self.update_stats) # 统计信息表格 self.stats_tree = ttk.Treeview( stats_frame, columns=("metric", "value"), show="headings", height=10 ) # 设置列 self.stats_tree.column("metric", width=200, anchor=tk.W) self.stats_tree.column("value", width=300, anchor=tk.W) # 设置列标题 self.stats_tree.heading("metric", text="统计指标") self.stats_tree.heading("value", text="值") # 添加滚动条 scrollbar2 = ttk.Scrollbar( stats_frame, orient="vertical", command=self.stats_tree.yview ) scrollbar2.pack(side=tk.RIGHT, fill=tk.Y) self.stats_tree.configure(yscrollcommand=scrollbar2.set) self.stats_tree.pack(fill=tk.BOTH, expand=True) # 格式分析标签页内容 format_frame = ttk.Frame(self.format_tab) format_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 格式信息表格 columns = ("column", "has_bg", "bg_status") self.format_tree = ttk.Treeview( format_frame, columns=columns, show="headings", height=10 ) # 设置列 col_widths = [150, 120, 120] col_anchors = [tk.W, tk.CENTER, tk.CENTER] col_headings = ["列名", "有底色", "底色状态"] for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings): self.format_tree.column(col, width=width, anchor=anchor) self.format_tree.heading(col, text=heading) # 添加滚动条 scrollbar3 = ttk.Scrollbar( format_frame, orient="vertical", command=self.format_tree.yview ) scrollbar3.pack(side=tk.RIGHT, fill=tk.Y) self.format_tree.configure(yscrollcommand=scrollbar3.set) self.format_tree.pack(fill=tk.BOTH, expand=True) # 数据质量标签页内容 quality_frame = ttk.Frame(self.quality_tab) quality_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 数据质量表格 columns = ("issue", "count", "severity", "columns", "suggestion") self.quality_tree = ttk.Treeview( quality_frame, columns=columns, show="headings", height=8 ) # 设置列 col_widths = [150, 80, 100, 150, 250] col_anchors = [tk.W, tk.CENTER, tk.CENTER, tk.W, tk.W] col_headings = ["问题类型", "数量", "严重程度", "相关列", "解决建议"] for col, width, anchor, heading in zip(columns, col_widths, col_anchors, col_headings): self.quality_tree.column(col, width=width, anchor=anchor) self.quality_tree.heading(col, text=heading) # 添加滚动条 scrollbar4 = ttk.Scrollbar( quality_frame, orient="vertical", command=self.quality_tree.yview ) scrollbar4.pack(side=tk.RIGHT, fill=tk.Y) self.quality_tree.configure(yscrollcommand=scrollbar4.set) self.quality_tree.pack(fill=tk.BOTH, expand=True) # 添加缺失的 update_stats 方法 def update_stats(self, event=None): """更新列统计信息""" # 清空统计表格 for item in self.stats_tree.get_children(): self.stats_tree.delete(item) selected_col_display = self.col_var.get() if not selected_col_display: return # 从显示列名中提取原始列名(移除底色标记) if " (有底色)" in selected_col_display: selected_col = selected_col_display.replace(" (有底色)", "") elif " (无底色)" in selected_col_display: selected_col = selected_col_display.replace(" (无底色)", "") else: selected_col = selected_col_display if not selected_col or selected_col not in self.df.columns: return col_data = self.df[selected_col] # 基本统计信息 self.stats_tree.insert("", "end", values=("列名", selected_col)) self.stats_tree.insert("", "end", values=("数据类型", str(col_data.dtype))) self.stats_tree.insert("", "end", values=("总行数", len(col_data))) non_null_count = col_data.count() null_count = len(col_data) - non_null_count self.stats_tree.insert("", "end", values=("非空值数量", non_null_count)) self.stats_tree.insert("", "end", values=("空值数量", null_count)) self.stats_tree.insert("", "end", values=("空值比例", f"{null_count/len(col_data):.2%}" if len(col_data) > 0 else "0%")) # 值统计 if pd.api.types.is_numeric_dtype(col_data): try: self.stats_tree.insert("", "end", values=("平均值", f"{col_data.mean():.4f}")) self.stats_tree.insert("", "end", values=("中位数", f"{col_data.median():.4f}")) self.stats_tree.insert("", "end", values=("最小值", f"{col_data.min():.4f}")) self.stats_tree.insert("", "end", values=("最大值", f"{col_data.max():.4f}")) self.stats_tree.insert("", "end", values=("标准差", f"{col_data.std():.4f}")) self.stats_tree.insert("", "end", values=("偏度", f"{col_data.skew():.4f}")) self.stats_tree.insert("", "end", values=("峰度", f"{col_data.kurtosis():.4f}")) self.stats_tree.insert("", "end", values=("总和", f"{col_data.sum():.4f}")) except: pass # 唯一值统计 unique_count = col_data.nunique() self.stats_tree.insert("", "end", values=("唯一值数量", unique_count)) unique_ratio = unique_count/non_null_count if non_null_count > 0 else 0 self.stats_tree.insert("", "end", values=("唯一值比例", f"{unique_ratio:.2%}")) # 最常出现的值 if non_null_count > 0: try: top_values = col_data.value_counts().head(3) top_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()]) self.stats_tree.insert("", "end", values=("最常见值", top_str)) except: pass def create_advanced_analysis_section(self, parent): # 高级分析框架 analysis_frame = ttk.LabelFrame(parent, text="高级分析") analysis_frame.pack(fill=tk.X, padx=5, pady=5) # 分析选项 options_frame = ttk.Frame(analysis_frame) options_frame.pack(fill=tk.X, padx=10, pady=10) # 异常值检测 self.outlier_var = tk.BooleanVar(value=True) outlier_cb = ttk.Checkbutton( options_frame, text="检测异常值", variable=self.outlier_var ) outlier_cb.pack(side=tk.LEFT, padx=(0, 20)) # 数据标准化 self.scale_var = tk.BooleanVar(value=False) scale_cb = ttk.Checkbutton( options_frame, text="数据标准化", variable=self.scale_var ) scale_cb.pack(side=tk.LEFT, padx=(0, 20)) # 缺失值处理 ttk.Label(options_frame, text="缺失值处理:").pack(side=tk.LEFT, padx=(0, 5)) self.missing_var = tk.StringVar(value="mean") missing_options = ["不处理", "平均值填充", "中位数填充", "众数填充", "删除行"] missing_cb = ttk.Combobox( options_frame, textvariable=self.missing_var, values=missing_options, state="readonly", width=15 ) missing_cb.pack(side=tk.LEFT, padx=(0, 20)) # 执行高级分析按钮 adv_analyze_btn = ttk.Button( options_frame, text="执行高级分析", command=self.perform_advanced_analysis, style="Accent.TButton" ) adv_analyze_btn.pack(side=tk.RIGHT) def browse_file(self): filetypes = [ ("Excel 文件", "*.xlsx *.xls"), ("CSV 文件", "*.csv"), ("Parquet 文件", "*.parquet"), ("所有文件", "*.*") ] file_path = filedialog.askopenfilename( title="选择表格文件", filetypes=filetypes ) if file_path: self.file_path = file_path self.path_var.set(file_path) self.reset_analysis() self.load_and_preview() def load_and_preview(self): """加载文件并显示预览""" try: self.status_var.set("加载文件中...") self.progress_var.set(0) self.root.update() file_ext = os.path.splitext(self.file_path)[1].lower() # 移除预览占位符 self.preview_placeholder.pack_forget() # 创建预览表格 if hasattr(self, 'pt'): self.pt.destroy() self.pt_frame = ttk.Frame(self.preview_table) self.pt_frame.pack(fill=tk.BOTH, expand=True) # 使用pandastable进行高效预览 self.pt = Table( self.pt_frame, showtoolbar=False, showstatusbar=True, width=400, height=300 ) self.pt.show() # 加载数据 if file_ext in ['.xlsx', '.xls']: # 分块读取Excel文件 self.load_excel_preview() elif file_ext == '.csv': # 分块读取CSV文件 self.load_csv_preview() elif file_ext == '.parquet': # 读取Parquet文件 self.load_parquet_preview() else: messagebox.showerror("错误", "不支持的文件类型") return # 尝试自动检测表头 self.auto_detect_header() self.status_var.set("文件加载完成") except Exception as e: messagebox.showerror("错误", f"加载文件时出错: {str(e)}") self.status_var.set(f"错误: {str(e)}") def load_excel_preview(self): """分块读取Excel文件预览""" # 仅读取前1000行进行预览 self.raw_data = pd.read_excel(self.file_path, header=None, nrows=1000) self.pt.model = TableModel(dataframe=self.raw_data) self.pt.redraw() def load_csv_preview(self): """分块读取CSV文件预览""" # 检测编码 with open(self.file_path, 'rb') as f: result = chardet.detect(f.read(10000)) encoding = result['encoding'] or 'utf-8' # 仅读取前1000行进行预览 self.raw_data = pd.read_csv( self.file_path, header=None, encoding=encoding, engine='c', nrows=1000 ) self.pt.model = TableModel(dataframe=self.raw_data) self.pt.redraw() def load_parquet_preview(self): """读取Parquet文件预览""" # 仅读取前1000行进行预览 self.raw_data = pd.read_parquet(self.file_path) self.pt.model = TableModel(dataframe=self.raw_data.head(1000)) self.pt.redraw() def auto_detect_header(self): """自动检测表头所在行""" if self.raw_data is None or self.raw_data.empty: return # 使用更高效的检测算法 best_row = 0 best_score = -1 # 只检查前20行 for i in range(min(20, len(self.raw_data))): row = self.raw_data.iloc[i] # 计算得分 score = 0 # 1. 非空值比例 non_null_ratio = row.notnull().mean() score += non_null_ratio * 20 # 2. 字符串比例 str_count = sum(isinstance(x, str) for x in row) str_ratio = str_count / len(row) if len(row) > 0 else 0 score += str_ratio * 30 # 3. 唯一值比例 unique_count = len(set(row)) unique_ratio = unique_count / len(row) if len(row) > 0 else 0 score += unique_ratio * 50 if score > best_score: best_score = score best_row = i # 设置检测到的表头行 self.header_row_var.set(best_row) def reset_analysis(self): # 清除之前的分析结果 for tree in [self.headers_tree, self.stats_tree, self.format_tree, self.quality_tree, self.linkage_tree]: for item in tree.get_children(): tree.delete(item) self.col_combobox.set('') self.col_combobox['values'] = [] self.filter_col_var.set('') self.filter_value_var.set('') self.filter_value_combobox['values'] = [] self.linkage_col_var.set('') self.filter_info_var.set("未应用筛选") self.linkage_info_var.set("未统计") self.column_has_color = {} def start_analysis_thread(self): """启动分析线程""" if not self.file_path: messagebox.showwarning("警告", "请先选择表格文件") return # 禁用分析按钮 for widget in self.root.winfo_children(): if isinstance(widget, ttk.Button) and widget.cget("text") == "分析表格": widget.config(state=tk.DISABLED) self.status_var.set("分析中...") self.progress_var.set(0) self.stop_analysis = False # 创建并启动分析线程 self.analysis_thread = threading.Thread(target=self.analyze_table) self.analysis_thread.daemon = True self.analysis_thread.start() # 启动进度更新 self.root.after(100, self.update_progress) def stop_analysis_process(self): """停止分析过程""" self.stop_analysis = True self.status_var.set("分析已停止") def update_progress(self): """更新进度条""" if self.analysis_thread and self.analysis_thread.is_alive(): self.progress_var.set(self.progress) self.root.after(100, self.update_progress) else: # 重新启用分析按钮 for widget in self.root.winfo_children(): if isinstance(widget, ttk.Button) and widget.cget("text") == "分析表格": widget.config(state=tk.NORMAL) self.progress_var.set(100) def analyze_table(self): """执行表格分析(在后台线程中运行)""" try: self.header_row = self.header_row_var.get() file_ext = os.path.splitext(self.file_path)[1].lower() # 更新进度 self.progress = 10 time.sleep(0.1) if self.stop_analysis: return if file_ext in ['.xlsx', '.xls']: # 读取Excel文件,指定表头行 self.df = pd.read_excel(self.file_path, header=self.header_row) self.workbook = load_workbook(self.file_path) self.sheet = self.workbook.active elif file_ext == '.csv': # 读取CSV文件,指定表头行 with open(self.file_path, 'rb') as f: result = chardet.detect(f.read()) encoding = result['encoding'] or 'utf-8' self.df = pd.read_csv( self.file_path, header=self.header_row, encoding=encoding, engine='c' ) elif file_ext == '.parquet': # 读取Parquet文件 self.df = pd.read_parquet(self.file_path) else: messagebox.showerror("错误", "不支持的文件类型") return # 清理列名 self.df.columns = [str(col).strip() for col in self.df.columns] # 初始化列底色信息 self.column_has_color = {col: False for col in self.df.columns} # 更新进度 self.progress = 30 time.sleep(0.1) if self.stop_analysis: return # 分析表头 self.analyze_headers() # 更新进度 self.progress = 50 time.sleep(0.1) if self.stop_analysis: return # 分析列格式 if file_ext in ['.xlsx', '.xls']: self.analyze_format() # 更新进度 self.progress = 70 time.sleep(0.1) if self.stop_analysis: return # 分析数据质量 self.analyze_data_quality() # 更新列选择框 self.col_combobox['values'] = list(self.df.columns) if self.df.columns.size > 0: self.col_combobox.current(0) self.update_stats() # 更新筛选控件(添加底色标记) display_columns = [] for col in self.df.columns: has_color = self.column_has_color.get(col, False) display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)") self.filter_col_combobox['values'] = display_columns # 更新联动统计控件(添加底色标记) display_columns = [] for col in self.df.columns: has_color = self.column_has_color.get(col, False) display_columns.append(f"{col} (有底色)" if has_color else f"{col} (无底色)") self.linkage_col_combobox['values'] = display_columns self.status_var.set("分析完成") # 更新进度 self.progress = 100 except Exception as e: messagebox.showerror("错误", f"分析表格时出错: {str(e)}") self.status_var.set(f"错误: {str(e)}") def analyze_headers(self): # 清空表头表格 for item in self.headers_tree.get_children(): self.headers_tree.delete(item) # 添加表头信息 for i, col in enumerate(self.df.columns): if self.stop_analysis: return # 检查列是否有底色(只针对数据区域) has_color = self.column_has_color.get(col, False) col_display = f"{col} (有底色)" if has_color else f"{col} (无底色)" sample = "" # 获取前3个非空值作为示例 non_empty = self.df[col].dropna() if len(non_empty) > 0: sample = ", ".join(str(x) for x in non_empty.head(3).tolist()) # 推断数据类型 dtype = str(self.df[col].dtype) dtype_map = { 'object': '文本', 'int64': '整数', 'float64': '小数', 'bool': '布尔值', 'datetime64': '日期时间', 'category': '分类数据' } for k, v in dtype_map.items(): if k in dtype: dtype = v break # 唯一值统计 unique_count = self.df[col].nunique() # 空值率 null_count = self.df[col].isnull().sum() null_percent = f"{null_count / len(self.df):.1%}" if len(self.df) > 0 else "0%" self.headers_tree.insert("", "end", values=( i+1, col_display, # 使用带底色标记的列名 dtype, sample, unique_count, null_percent )) def analyze_format(self): """分析每个单元格的底色状态""" if not self.workbook or not self.sheet: return # 初始化底色状态DataFrame(与数据表相同形状) self.cell_bg_status = pd.DataFrame( "无底色", index=self.df.index, columns=self.df.columns ) # 分析每个单元格的底色 for row_idx in range(len(self.df)): for col_idx, col_name in enumerate(self.df.columns): # 转换为Excel坐标(行号从1开始) excel_row = self.header_row + 2 + row_idx col_letter = self.get_column_letter(col_idx + 1) cell = self.sheet[f"{col_letter}{excel_row}"] # 检测底色 has_bg = False if cell.fill and isinstance(cell.fill, PatternFill): if cell.fill.fgColor and cell.fill.fgColor.rgb: if cell.fill.fgColor.rgb != "00000000": # 排除透明 has_bg = True # 存储底色状态 self.cell_bg_status.iloc[row_idx, col_idx] = "有底色" if has_bg else "无底色" def analyze_data_quality(self): """分析数据质量问题""" # 清空质量表格 for item in self.quality_tree.get_children(): self.quality_tree.delete(item) issues = [] # 1. 检查空值 null_counts = self.df.isnull().sum() total_rows = len(self.df) for col, count in null_counts.items(): if count > 0: severity = "高" if count / total_rows > 0.3 else "中" if count / total_rows > 0.1 else "低" issues.append({ "issue": "空值", "count": count, "severity": severity, "columns": col, "suggestion": f"考虑使用平均值/中位数填充或删除空值行" }) # 2. 检查重复行 duplicate_rows = self.df.duplicated().sum() if duplicate_rows > 0: severity = "高" if duplicate_rows / total_rows > 0.1 else "中" issues.append({ "issue": "重复行", "count": duplicate_rows, "severity": severity, "columns": "所有列", "suggestion": "删除重复行或分析重复原因" }) # 3. 检查数据类型不一致 for col in self.df.columns: if self.df[col].dtype == 'object': # 检查混合数据类型 type_counts = self.df[col].apply(type).value_counts() if len(type_counts) > 1: issues.append({ "issue": "数据类型不一致", "count": len(self.df[col]), "severity": "中", "columns": col, "suggestion": "统一数据类型或转换格式" }) # 4. 检查异常值(仅数值列) numeric_cols = self.df.select_dtypes(include=np.number).columns for col in numeric_cols: q1 = self.df[col].quantile(0.25) q3 = self.df[col].quantile(0.75) iqr = q3 - q1 lower_bound = q1 - 1.5 * iqr upper_bound = q3 + 1.5 * iqr outliers = self.df[(self.df[col] < lower_bound) | (self.df[col] > upper_bound)] outlier_count = len(outliers) if outlier_count > 0: severity = "高" if outlier_count / total_rows > 0.05 else "中" issues.append({ "issue": "异常值", "count": outlier_count, "severity": severity, "columns": col, "suggestion": "检查数据准确性或进行转换" }) # 添加到表格 for issue in issues: self.quality_tree.insert("", "end", values=( issue["issue"], issue["count"], issue["severity"], issue["columns"], issue["suggestion"] )) def get_column_letter(self, col_idx): """将列索引转换为Excel列字母""" letters = [] while col_idx: col_idx, remainder = divmod(col_idx - 1, 26) letters.append(chr(65 + remainder)) return ''.join(reversed(letters)) def update_filter_values(self, event=None): """更新筛选值列表(带底色标记)""" selected_col = self.filter_col_var.get() if not selected_col or selected_col not in self.df.columns: return # 获取列的值和对应的底色状态 values = self.df[selected_col].astype(str) bg_status = self.cell_bg_status[selected_col] # 创建带底色标记的值列表 unique_values = set() for val, bg in zip(values, bg_status): unique_values.add(f"{val} ({bg})") # 如果唯一值太多,只显示前50个 if len(unique_values) > 50: unique_values = sorted(unique_values)[:50] messagebox.showinfo("提示", f"该列有超过50个唯一值,只显示前50个") # 更新值选择框 self.filter_value_combobox['values'] = sorted(unique_values) self.filter_value_combobox.set('') def apply_filter(self): """应用筛选条件(考虑底色状态)""" if self.df is None: messagebox.showwarning("警告", "请先分析表格") return selected_col = self.filter_col_var.get() selected_value_with_bg = self.filter_value_var.get() # 检查输入有效性 if not selected_col or not selected_value_with_bg: messagebox.showwarning("警告", "请选择列和值") return try: # 从带底色标记的值中提取原始值和底色状态 if " (有底色)" in selected_value_with_bg: selected_value = selected_value_with_bg.replace(" (有底色)", "") selected_bg = "有底色" elif " (无底色)" in selected_value_with_bg: selected_value = selected_value_with_bg.replace(" (无底色)", "") selected_bg = "无底色" else: selected_value = selected_value_with_bg selected_bg = None # 应用值筛选 if pd.api.types.is_numeric_dtype(self.df[selected_col]): try: selected_value = float(selected_value) value_condition = (self.df[selected_col] == selected_value) except: value_condition = (self.df[selected_col].astype(str) == selected_value) else: value_condition = (self.df[selected_col].astype(str) == selected_value) # 应用底色状态筛选(如果有) if selected_bg and hasattr(self, 'cell_bg_status') and selected_col in self.cell_bg_status: bg_condition = (self.cell_bg_status[selected_col] == selected_bg) condition = value_condition & bg_condition else: condition = value_condition # 应用组合筛选条件 self.filtered_df = self.df[condition] # 更新预览 self.update_preview(self.filtered_df) # 更新筛选信息 match_count = len(self.filtered_df) total_count = len(self.df) self.filter_info_var.set( f"筛选结果: {match_count} 行 (共 {total_count} 行, {match_count/total_count:.1%})" ) # 更新统计信息 self.update_stats(self.col_var.get()) # 自动更新联动统计 if self.linkage_col_var.get() and self.linkage_col_var.get() in self.df.columns: self.update_linkage_stats() except Exception as e: messagebox.showerror("错误", f"筛选数据时出错: {str(e)}") def clear_filter(self): """清除筛选条件""" if hasattr(self, 'filtered_df'): del self.filtered_df # 重置筛选控件 self.filter_col_var.set('') self.filter_value_var.set('') self.filter_value_combobox['values'] = [] # 恢复原始数据预览 if self.df is not None: self.update_preview(self.df) # 更新筛选信息 self.filter_info_var.set("筛选已清除") # 更新统计信息 if self.col_var.get(): self.update_stats(self.col_var.get()) # 清除联动统计 for item in self.linkage_tree.get_children(): self.linkage_tree.delete(item) self.linkage_info_var.set("未统计") def update_preview(self, df): """更新数据预览(可选:添加底色状态)""" if hasattr(self, 'pt'): # 创建带底色状态的预览副本 preview_df = df.copy().head(1000) # 添加底色状态列(如果存在) if hasattr(self, 'cell_bg_status') and not self.cell_bg_status.empty: for col in df.columns: if col in self.cell_bg_status: preview_df[f"{col}_底色状态"] = self.cell_bg_status[col].loc[preview_df.index] # 更新现有表格 self.pt.model = TableModel(dataframe=preview_df) self.pt.redraw() else: # 创建新表格 self.pt_frame = ttk.Frame(self.preview_table) self.pt_frame.pack(fill=tk.BOTH, expand=True) self.pt = Table( self.pt_frame, showtoolbar=False, showstatusbar=True, width=400, height=300 ) # 创建带底色状态的预览 preview_df = df.head(1000).copy() if hasattr(self, 'cell_bg_status') and not self.cell_bg_status.empty: for col in df.columns: if col in self.cell_bg_status: preview_df[f"{col}_底色状态"] = self.cell_bg_status[col].loc[preview_df.index] self.pt.model = TableModel(dataframe=preview_df) self.pt.show() def update_linkage_stats(self): """更新联动统计结果(考虑底色状态)""" if not hasattr(self, 'filtered_df') or self.filtered_df.empty: messagebox.showwarning("警告", "请先应用筛选") return target_col = self.linkage_col_var.get() if not target_col: return # 清空现有结果 for item in self.linkage_tree.get_children(): self.linkage_tree.delete(item) # 获取筛选后的数据 filtered_data = self.filtered_df # 添加底色状态列(如果存在) if hasattr(self, 'cell_bg_status') and target_col in self.cell_bg_status: # 合并底色状态到数据中 filtered_data = filtered_data.copy() filtered_data['底色状态'] = self.cell_bg_status[target_col].loc[filtered_data.index] # 统计值分布(包含底色状态) value_counts = filtered_data.groupby([target_col, '底色状态']).size().reset_index(name='计数') total_count = len(filtered_data) # 添加到树状视图 for _, row in value_counts.iterrows(): percent = f"{row['计数'] / total_count:.1%}" self.linkage_tree.insert("", "end", values=( row[target_col], row['底色状态'], row['计数'], percent )) # 更新统计信息 self.linkage_info_var.set( f"共 {len(value_counts)} 个组合 (总计 {total_count} 行)" ) else: # 没有底色信息时的处理 value_counts = filtered_data[target_col].value_counts() total_count = len(filtered_data) for value, count in value_counts.items(): percent = f"{count / total_count:.1%}" self.linkage_tree.insert("", "end", values=(value, "", count, percent)) self.linkage_info_var.set( f"共 {len(value_counts)} 个唯一值 (总计 {total_count} 行)" ) def perform_advanced_analysis(self): """执行高级分析""" if self.df is None or self.df.empty: messagebox.showwarning("警告", "请先分析表格") return self.status_var.set("执行高级分析...") self.progress_var.set(0) # 备份原始底色状态 original_bg_status = self.cell_bg_status.copy() if hasattr(self, 'cell_bg_status') else None try: # 缺失值处理 missing_strategy = self.missing_var.get() if missing_strategy != "不处理": numeric_cols = self.df.select_dtypes(include=np.number).columns categorical_cols = self.df.select_dtypes(include='object').columns if missing_strategy == "删除行": self.df = self.df.dropna() else: if missing_strategy == "平均值填充": strategy = 'mean' elif missing_strategy == "中位数填充": strategy = 'median' else: # 众数填充 strategy = 'most_frequent' # 数值列填充 if len(numeric_cols) > 0: num_imputer = SimpleImputer(strategy=strategy) self.df[numeric_cols] = num_imputer.fit_transform(self.df[numeric_cols]) # 分类列填充 if len(categorical_cols) > 0: cat_imputer = SimpleImputer(strategy='most_frequent') self.df[categorical_cols] = cat_imputer.fit_transform(self.df[categorical_cols]) # 异常值检测 if self.outlier_var.get(): numeric_cols = self.df.select_dtypes(include=np.number).columns if len(numeric_cols) > 0: clf = IsolationForest(contamination=0.05, random_state=42) outliers = clf.fit_predict(self.df[numeric_cols]) self.df['is_outlier'] = outliers == -1 # 在界面上显示异常值数量 outlier_count = self.df['is_outlier'].sum() messagebox.showinfo("异常值检测", f"检测到 {outlier_count} 个异常值\n已添加 'is_outlier' 列标记") # 数据标准化 if self.scale_var.get(): numeric_cols = self.df.select_dtypes(include=np.number).columns if len(numeric_cols) > 0: for col in numeric_cols: min_val = self.df[col].min() max_val = self.df[col].max() if max_val > min_val: self.df[col] = (self.df[col] - min_val) / (max_val - min_val) # 更新分析结果 self.analyze_headers() self.analyze_data_quality() # 更新列选择框 self.col_combobox['values'] = list(self.df.columns) if self.df.columns.size > 0: self.col_combobox.current(0) self.update_stats() # 高级分析后恢复底色状态 if original_bg_status is not None: # 只保留现有行的底色状态 self.cell_bg_status = original_bg_status.loc[self.df.index] # 处理新增列(如果没有底色状态,设为"无底色") for col in self.df.columns: if col not in self.cell_bg_status: self.cell_bg_status[col] = "无底色" self.status_var.set("高级分析完成") self.progress_var.set(100) except Exception as e: if original_bg_status is not None: self.cell_bg_status = original_bg_status messagebox.showerror("错误", f"高级分析时出错: {str(e)}") self.status_var.set(f"错误: {str(e)}") if __name__ == "__main__": root = tk.Tk() app = TableAnalyzer(root) root.mainloop() TableAnalyzer’ object has no attribute ‘apply_filter’ File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 281, in create_filter_section command=self.apply_filter, File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 72, in init self.create_filter_section(right_panel) File “C:\Users\siyuanll2102\Desktop\classify\classify.py”, line 1445, in app = TableAnalyzer(root) AttributeError: ‘TableAnalyzer’ object has no attribute ‘apply_filter’
08-06
检查代码并优化: import sys import os import json import time import wave import numpy as np import pandas as pd import matplotlib.pyplot as plt import soundfile as sf # 新增依赖,用于音频片段保存 from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas from PyQt5.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLabel, QLineEdit, QTextEdit, QFileDialog, QProgressBar, QGroupBox, QComboBox, QCheckBox) from PyQt5.QtCore import QThread, pyqtSignal from pydub import AudioSegment from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification import whisper from pyannote.audio import Pipeline from docx import Document from docx.shared import Inches import librosa import tempfile from collections import defaultdict import re # 新增用于中文字符计数 class AnalysisThread(QThread): progress = pyqtSignal(int) message = pyqtSignal(str) analysis_complete = pyqtSignal(dict) error = pyqtSignal(str) def __init__(self, audio_files, keyword_file, whisper_model_path, pyannote_model_path, emotion_model_path): super().__init__() self.audio_files = audio_files self.keyword_file = keyword_file self.whisper_model_path = whisper_model_path self.pyannote_model_path = pyannote_model_path self.emotion_model_path = emotion_model_path self.running = True self.cached_models = {} def run(self): try: # 加载关键词 self.message.emit("正在加载关键词...") keywords = self.load_keywords() # 预加载模型 self.message.emit("正在预加载模型...") self.preload_models() results = [] total_files = len(self.audio_files) for idx, audio_file in enumerate(self.audio_files): if not self.running: self.message.emit("分析已停止") return self.message.emit(f"正在处理文件: {os.path.basename(audio_file)} ({idx+1}/{total_files})") file_result = self.analyze_file(audio_file, keywords) if file_result: results.append(file_result) self.progress.emit(int((idx + 1) / total_files * 100)) self.analysis_complete.emit({"results": results, "keywords": keywords}) self.message.emit("分析完成!") except Exception as e: import traceback self.error.emit(f"分析过程中发生错误: {str(e)}\n{traceback.format_exc()}") def preload_models(self): """预加载所有模型到缓存""" # 检查是否已加载模型 if hasattr(self, 'cached_models') and self.cached_models: return self.cached_models = {} # 加载语音识别模型 if 'whisper' not in self.cached_models: self.message.emit("正在加载语音识别模型...") self.cached_models['whisper'] = whisper.load_model(self.whisper_model_path) # 加载说话人分离模型 if 'pyannote' not in self.cached_models: self.message.emit("正在加载说话人分离模型...") self.cached_models['pyannote'] = Pipeline.from_pretrained(self.pyannote_model_path) # 加载情感分析模型 if 'emotion_classifier' not in self.cached_models: self.message.emit("正在加载情感分析模型...") tokenizer = AutoTokenizer.from_pretrained(self.emotion_model_path) model = AutoModelForSequenceClassification.from_pretrained(self.emotion_model_path) self.cached_models['emotion_classifier'] = pipeline( "text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1 # 使用GPU如果可用 ) def analyze_file(self, audio_file, keywords): """分析单个音频文件""" try: # 确保音频为WAV格式 wav_file = self.convert_to_wav(audio_file) # 获取音频信息 duration, sample_rate, channels = self.get_audio_info(wav_file) # 说话人分离 diarization = self.cached_models['pyannote'](wav_file) # 识别客服和客户(使用改进的方法) agent_segments, customer_segments = self.identify_speakers(wav_file, diarization, keywords['opening']) # 语音识别(使用优化后的方法) whisper_model = self.cached_models['whisper'] agent_text = self.transcribe_audio(wav_file, agent_segments, whisper_model) customer_text = self.transcribe_audio(wav_file, customer_segments, whisper_model) # 情感分析 emotion_classifier = self.cached_models['emotion_classifier'] agent_emotion = self.analyze_emotion(agent_text, emotion_classifier) customer_emotion = self.analyze_emotion(customer_text, emotion_classifier) # 服务规范检查 opening_check = self.check_opening(agent_text, keywords['opening']) closing_check = self.check_closing(agent_text, keywords['closing']) forbidden_check = self.check_forbidden(agent_text, keywords['forbidden']) # 沟通技巧分析(使用改进的方法) speech_rate = self.analyze_speech_rate(wav_file, agent_segments) volume_analysis = self.analyze_volume(wav_file, agent_segments) # 问题解决率分析 resolution_rate = self.analyze_resolution(agent_text, customer_text, keywords['resolution']) # 构建结果 return { "file_name": os.path.basename(audio_file), "duration": duration, "agent_text": agent_text, "customer_text": customer_text, "opening_check": opening_check, "closing_check": closing_check, "forbidden_check": forbidden_check, "agent_emotion": agent_emotion, "customer_emotion": customer_emotion, "speech_rate": speech_rate, "volume_mean": volume_analysis['mean'], "volume_std": volume_analysis['std'], "resolution_rate": resolution_rate } except Exception as e: self.error.emit(f"处理文件 {os.path.basename(audio_file)} 时出错: {str(e)}") return None def load_keywords(self): """从Excel文件加载关键词""" try: df = pd.read_excel(self.keyword_file) keywords = { "opening": [str(k).strip() for k in df['opening'].dropna().tolist()], "closing": [str(k).strip() for k in df['closing'].dropna().tolist()], "forbidden": [str(k).strip() for k in df['forbidden'].dropna().tolist()], "resolution": [str(k).strip() for k in df['resolution'].dropna().tolist()] } return keywords except Exception as e: raise Exception(f"加载关键词文件失败: {str(e)}") def convert_to_wav(self, audio_file): """将音频文件转换为WAV格式(如果需要)""" try: if not audio_file.lower().endswith('.wav'): # 使用临时文件避免磁盘IO with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmpfile: output_file = tmpfile.name audio = AudioSegment.from_file(audio_file) audio.export(output_file, format='wav') return output_file return audio_file except Exception as e: raise Exception(f"音频转换失败: {str(e)}") def get_audio_info(self, wav_file): """获取音频文件信息""" try: with wave.open(wav_file, 'rb') as wf: frames = wf.getnframes() rate = wf.getframerate() channels = wf.getnchannels() duration = frames / float(rate) return duration, rate, channels except Exception as e: raise Exception(f"获取音频信息失败: {str(e)}") def identify_speakers(self, wav_file, diarization, opening_keywords): """改进的客服识别方法 - 检查前三个片段是否有开场白关键词""" speaker_segments = defaultdict(list) for segment, _, speaker in diarization.itertracks(yield_label=True): speaker_segments[speaker].append((segment.start, segment.end)) # 如果没有说话人 if not speaker_segments: return [], [] # 如果只有一个说话人 if len(speaker_segments) == 1: speaker = list(speaker_segments.keys())[0] return speaker_segments[speaker], [] # 检查每个说话人的前三个片段是否有开场白 speaker_scores = {} whisper_model = self.cached_models['whisper'] for speaker, segments in speaker_segments.items(): score = 0 # 取前三个片段(或所有片段如果少于3个) check_segments = segments[:3] for start, end in check_segments: # 转录片段 text = self.transcribe_audio_segment(wav_file, [(start, end)], whisper_model) # 检查开场白关键词 for keyword in opening_keywords: if keyword and keyword in text: score += 1 break speaker_scores[speaker] = score # 找到得分最高的说话人作为客服 agent_speaker = max(speaker_scores, key=speaker_scores.get) agent_segments = [] customer_segments = [] for speaker, segments in speaker_segments.items(): if speaker == agent_speaker: agent_segments = segments else: customer_segments.extend(segments) return agent_segments, customer_segments def transcribe_audio_segment(self, wav_file, segments, model): """转录单个音频片段 - 用于客服识别""" if not segments: return "" # 使用pydub加载音频 audio = AudioSegment.from_wav(wav_file) start, end = segments[0] # 转换为毫秒 start_ms = int(start * 1000) end_ms = int(end * 1000) segment_audio = audio[start_ms:end_ms] # 使用临时文件 with tempfile.NamedTemporaryFile(suffix='.wav') as tmpfile: segment_audio.export(tmpfile.name, format="wav") result = model.transcribe(tmpfile.name) return result['text'] def transcribe_audio(self, wav_file, segments, model): """优化后的转录方法 - 按片段转录""" if not segments: return "" # 使用pydub加载音频 audio = AudioSegment.from_wav(wav_file) full_text = "" # 只处理指定片段 for start, end in segments: # 转换为毫秒 start_ms = int(start * 1000) end_ms = int(end * 1000) segment_audio = audio[start_ms:end_ms] # 使用临时文件避免内存占用 with tempfile.NamedTemporaryFile(suffix='.wav') as tmpfile: segment_audio.export(tmpfile.name, format="wav") result = model.transcribe(tmpfile.name) full_text += result['text'] + " " return full_text.strip() def analyze_emotion(self, text, classifier): """分析文本情感""" if not text.strip(): return {"label": "中性", "score": 0.0} # 截断长文本以提高性能 if len(text) > 500: text = text[:500] result = classifier(text, truncation=True, max_length=512) return { "label": result[0]['label'], "score": result[0]['score'] } def check_opening(self, text, opening_keywords): """检查开场白""" return any(keyword in text for keyword in opening_keywords if keyword) def check_closing(self, text, closing_keywords): """检查结束语""" return any(keyword in text for keyword in closing_keywords if keyword) def check_forbidden(self, text, forbidden_keywords): """检查服务禁语""" return any(keyword in text for keyword in forbidden_keywords if keyword) def analyze_speech_rate(self, wav_file, segments): """改进的语速分析 - 基于实际识别文本""" if not segments: return 0 # 加载音频 y, sr = librosa.load(wav_file, sr=None) total_chars = 0 total_duration = 0 whisper_model = self.cached_models['whisper'] for start, end in segments: # 计算片段时长(秒) duration = end - start total_duration += duration # 转录片段 text = self.transcribe_audio_segment(wav_file, [(start, end)], whisper_model) # 计算中文字符数(去除标点和空格) chinese_chars = sum(1 for char in text if '\u4e00' <= char <= '\u9fff') total_chars += chinese_chars if total_duration == 0: return 0 # 语速 = 总字数 / 总时长(分钟) return total_chars / (total_duration / 60) def analyze_volume(self, wav_file, segments): """改进的音量分析 - 使用librosa计算RMS分贝值""" if not segments: return {"mean": -60, "std": 0} # 加载音频 y, sr = librosa.load(wav_file, sr=None) all_dB = [] for start, end in segments: start_sample = int(start * sr) end_sample = int(end * sr) segment_audio = y[start_sample:end_sample] # 计算RMS并转换为dB rms = librosa.feature.rms(y=segment_audio)[0] dB = librosa.amplitude_to_db(rms, ref=np.max) all_dB.extend(dB) if not all_dB: return {"mean": -60, "std": 0} return { "mean": float(np.mean(all_dB)), "std": float(np.std(all_dB)) } def analyze_resolution(self, agent_text, customer_text, resolution_keywords): """分析问题解决率""" return any(keyword in agent_text for keyword in resolution_keywords if keyword) def stop(self): """停止分析""" self.running = False class MainWindow(QMainWindow): def __init__(self): super().__init__() self.setWindowTitle("外呼电话录音包质检分析系统") self.setGeometry(100, 100, 1000, 700) # 初始化变量 self.audio_files = [] self.keyword_file = "" self.whisper_model_path = "./models/whisper-small" self.pyannote_model_path = "./models/pyannote-speaker-diarization" self.emotion_model_path = "./models/Erlangshen-Roberta-110M-Sentiment" self.output_dir = "./reports" # 创建主控件 central_widget = QWidget() self.setCentralWidget(central_widget) main_layout = QVBoxLayout(central_widget) # 文件选择区域 file_group = QGroupBox("文件选择") file_layout = QVBoxLayout(file_group) # 音频文件选择 audio_layout = QHBoxLayout() self.audio_label = QLabel("音频文件/文件夹:") audio_layout.addWidget(self.audio_label) self.audio_path_edit = QLineEdit() audio_layout.addWidget(self.audio_path_edit) self.audio_browse_btn = QPushButton("浏览...") self.audio_browse_btn.clicked.connect(self.browse_audio) audio_layout.addWidget(self.audio_browse_btn) file_layout.addLayout(audio_layout) # 关键词文件选择 keyword_layout = QHBoxLayout() self.keyword_label = QLabel("关键词文件:") keyword_layout.addWidget(self.keyword_label) self.keyword_path_edit = QLineEdit() keyword_layout.addWidget(self.keyword_path_edit) self.keyword_browse_btn = QPushButton("浏览...") self.keyword_browse_btn.clicked.connect(self.browse_keyword) keyword_layout.addWidget(self.keyword_browse_btn) file_layout.addLayout(keyword_layout) main_layout.addWidget(file_group) # 模型设置区域 model_group = QGroupBox("模型设置") model_layout = QVBoxLayout(model_group) # Whisper模型路径 whisper_layout = QHBoxLayout() whisper_layout.addWidget(QLabel("Whisper模型路径:")) self.whisper_edit = QLineEdit(self.whisper_model_path) whisper_layout.addWidget(self.whisper_edit) model_layout.addLayout(whisper_layout) # Pyannote模型路径 pyannote_layout = QHBoxLayout() pyannote_layout.addWidget(QLabel("Pyannote模型路径:")) self.pyannote_edit = QLineEdit(self.pyannote_model_path) pyannote_layout.addWidget(self.pyannote_edit) model_layout.addLayout(pyannote_layout) # 情感分析模型路径 emotion_layout = QHBoxLayout() emotion_layout.addWidget(QLabel("情感分析模型路径:")) self.emotion_edit = QLineEdit(self.emotion_model_path) emotion_layout.addWidget(self.emotion_edit) model_layout.addLayout(emotion_layout) # 输出目录 output_layout = QHBoxLayout() output_layout.addWidget(QLabel("输出目录:")) self.output_edit = QLineEdit(self.output_dir) output_layout.addWidget(self.output_edit) self.output_browse_btn = QPushButton("浏览...") self.output_browse_btn.clicked.connect(self.browse_output) output_layout.addWidget(self.output_browse_btn) model_layout.addLayout(output_layout) main_layout.addWidget(model_group) # 控制按钮区域 control_layout = QHBoxLayout() self.start_btn = QPushButton("开始分析") self.start_btn.clicked.connect(self.start_analysis) control_layout.addWidget(self.start_btn) self.stop_btn = QPushButton("停止分析") self.stop_btn.clicked.connect(self.stop_analysis) self.stop_btn.setEnabled(False) control_layout.addWidget(self.stop_btn) self.clear_btn = QPushButton("清空") self.clear_btn.clicked.connect(self.clear_all) control_layout.addWidget(self.clear_btn) main_layout.addLayout(control_layout) # 进度条 self.progress_bar = QProgressBar() self.progress_bar.setValue(0) main_layout.addWidget(self.progress_bar) # 日志输出区域 log_group = QGroupBox("分析日志") log_layout = QVBoxLayout(log_group) self.log_text = QTextEdit() self.log_text.setReadOnly(True) log_layout.addWidget(self.log_text) main_layout.addWidget(log_group) # 状态区域 status_layout = QHBoxLayout() self.status_label = QLabel("就绪") status_layout.addWidget(self.status_label) self.file_count_label = QLabel("已选择0个音频文件") status_layout.addWidget(self.file_count_label) main_layout.addLayout(status_layout) # 初始化分析线程 self.analysis_thread = None def browse_audio(self): """浏览音频文件或文件夹""" options = QFileDialog.Options() files, _ = QFileDialog.getOpenFileNames( self, "选择音频文件", "", "音频文件 (*.mp3 *.wav *.amr *.ogg *.flac);;所有文件 (*)", options=options ) if files: self.audio_files = files self.audio_path_edit.setText("; ".join(files)) self.file_count_label.setText(f"已选择{len(files)}个音频文件") self.log_text.append(f"已选择{len(files)}个音频文件") def browse_keyword(self): """浏览关键词文件""" options = QFileDialog.Options() file, _ = QFileDialog.getOpenFileName( self, "选择关键词文件", "", "Excel文件 (*.xlsx *.xls);;所有文件 (*)", options=options ) if file: self.keyword_file = file self.keyword_path_edit.setText(file) self.log_text.append(f"已选择关键词文件: {file}") def browse_output(self): """浏览输出目录""" options = QFileDialog.Options() directory = QFileDialog.getExistingDirectory( self, "选择输出目录", "", options=options ) if directory: self.output_dir = directory self.output_edit.setText(directory) self.log_text.append(f"输出目录设置为: {directory}") def start_analysis(self): """开始分析""" if not self.audio_files: self.log_text.append("错误: 请先选择音频文件") return if not self.keyword_file: self.log_text.append("错误: 请先选择关键词文件") return # 更新模型路径 self.whisper_model_path = self.whisper_edit.text() self.pyannote_model_path = self.pyannote_edit.text() self.emotion_model_path = self.emotion_edit.text() self.output_dir = self.output_edit.text() # 创建输出目录 os.makedirs(self.output_dir, exist_ok=True) self.log_text.append("开始分析...") self.start_btn.setEnabled(False) self.stop_btn.setEnabled(True) self.status_label.setText("分析中...") self.progress_bar.setValue(0) # 创建并启动分析线程 self.analysis_thread = AnalysisThread( self.audio_files, self.keyword_file, self.whisper_model_path, self.pyannote_model_path, self.emotion_model_path ) self.analysis_thread.progress.connect(self.progress_bar.setValue) self.analysis_thread.message.connect(self.log_text.append) self.analysis_thread.analysis_complete.connect(self.on_analysis_complete) self.analysis_thread.error.connect(self.on_analysis_error) self.analysis_thread.finished.connect(self.on_analysis_finished) self.analysis_thread.start() def stop_analysis(self): """停止分析""" if self.analysis_thread and self.analysis_thread.isRunning(): self.analysis_thread.stop() self.log_text.append("正在停止分析...") self.stop_btn.setEnabled(False) def clear_all(self): """清空所有内容""" self.audio_files = [] self.keyword_file = "" self.audio_path_edit.clear() self.keyword_path_edit.clear() self.log_text.clear() self.progress_bar.setValue(0) self.status_label.setText("就绪") self.file_count_label.setText("已选择0个音频文件") self.log_text.append("已清空所有内容") def on_analysis_complete(self, result): """分析完成处理""" try: self.log_text.append("正在生成报告...") if not result.get("results"): self.log_text.append("警告: 没有生成任何分析结果") return # 生成Excel报告 excel_path = os.path.join(self.output_dir, "质检分析报告.xlsx") self.generate_excel_report(result, excel_path) # 生成Word报告 word_path = os.path.join(self.output_dir, "质检分析报告.docx") self.generate_word_report(result, word_path) self.log_text.append(f"分析报告已保存至: {excel_path}") self.log_text.append(f"可视化报告已保存至: {word_path}") self.log_text.append("分析完成!") self.status_label.setText(f"分析完成!报告保存至: {self.output_dir}") except Exception as e: import traceback self.log_text.append(f"生成报告时出错: {str(e)}\n{traceback.format_exc()}") def on_analysis_error(self, message): """分析错误处理""" self.log_text.append(f"错误: {message}") self.status_label.setText("发生错误") def on_analysis_finished(self): """分析线程结束处理""" self.start_btn.setEnabled(True) self.stop_btn.setEnabled(False) def generate_excel_report(self, result, output_path): """生成Excel报告""" # 从结果中提取数据 data = [] for res in result['results']: data.append({ "文件名": res['file_name'], "音频时长()": res['duration'], "开场白检查": "通过" if res['opening_check'] else "未通过", "结束语检查": "通过" if res['closing_check'] else "未通过", "服务禁语检查": "通过" if not res['forbidden_check'] else "未通过", "客服情感": res['agent_emotion']['label'], "客服情感得分": res['agent_emotion']['score'], "客户情感": res['customer_emotion']['label'], "客户情感得分": res['customer_emotion']['score'], "语速(字/分)": res['speech_rate'], "平均音量(dB)": res['volume_mean'], "音量标准差": res['volume_std'], "问题解决率": "是" if res['resolution_rate'] else "否" }) # 创建DataFrame并保存 df = pd.DataFrame(data) df.to_excel(output_path, index=False) # 添加汇总统计 try: with pd.ExcelWriter(output_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer: summary_data = { "统计项": ["总文件数", "开场白通过率", "结束语通过率", "服务禁语通过率", "问题解决率"], "数值": [ len(result['results']), df['开场白检查'].value_counts().get('通过', 0) / len(df), df['结束语检查'].value_counts().get('通过', 0) / len(df), df['服务禁语检查'].value_counts().get('通过', 0) / len(df), df['问题解决率'].value_counts().get('是', 0) / len(df) ] } summary_df = pd.DataFrame(summary_data) summary_df.to_excel(writer, sheet_name='汇总统计', index=False) except Exception as e: self.log_text.append(f"添加汇总统计时出错: {str(e)}") def generate_word_report(self, result, output_path): """生成Word报告""" doc = Document() # 添加标题 doc.add_heading('外呼电话录音质检分析报告', 0) # 添加基本信息 doc.add_heading('分析概况', level=1) doc.add_paragraph(f"分析时间: {time.strftime('%Y-%m-%d %H:%M:%S')}") doc.add_paragraph(f"分析文件数量: {len(result['results'])}") doc.add_paragraph(f"关键词文件: {os.path.basename(self.keyword_file)}") # 添加汇总统计 doc.add_heading('汇总统计', level=1) # 创建汇总表格 table = doc.add_table(rows=5, cols=2) table.style = 'Table Grid' # 表头 hdr_cells = table.rows[0].cells hdr_cells[0].text = '统计项' hdr_cells[1].text = '数值' # 计算统计数据 df = pd.DataFrame(result['results']) pass_rates = { "开场白通过率": df['opening_check'].mean() if not df.empty else 0, "结束语通过率": df['closing_check'].mean() if not df.empty else 0, "服务禁语通过率": (1 - df['forbidden_check']).mean() if not df.empty else 0, "问题解决率": df['resolution_rate'].mean() if not df.empty else 0 } # 填充表格 rows = [ ("总文件数", len(result['results'])), ("开场白通过率", f"{pass_rates['开场白通过率']:.2%}"), ("结束语通过率", f"{pass_rates['结束语通过率']:.2%}"), ("服务禁语通过率", f"{pass_rates['服务禁语通过率']:.2%}"), ("问题解决率", f"{pass_rates['问题解决率']:.2%}") ] for i, row_data in enumerate(rows): if i < len(table.rows): row_cells = table.rows[i].cells row_cells[0].text = row_data[0] row_cells[1].text = str(row_data[1]) # 添加情感分析图表 if result['results']: doc.add_heading('情感分析', level=1) # 客服情感分布 agent_emotions = [res['agent_emotion']['label'] for res in result['results']] agent_emotion_counts = pd.Series(agent_emotions).value_counts() if not agent_emotion_counts.empty: fig, ax = plt.subplots(figsize=(6, 4)) agent_emotion_counts.plot.pie(autopct='%1.1f%%', ax=ax) ax.set_title('客服情感分布') plt.tight_layout() # 保存图表到临时文件 chart_path = os.path.join(self.output_dir, "agent_emotion_chart.png") plt.savefig(chart_path, dpi=100) plt.close() doc.add_picture(chart_path, width=Inches(4)) doc.add_paragraph('图1: 客服情感分布') # 客户情感分布 customer_emotions = [res['customer_emotion']['label'] for res in result['results']] customer_emotion_counts = pd.Series(customer_emotions).value_counts() if not customer_emotion_counts.empty: fig, ax = plt.subplots(figsize=(6, 4)) customer_emotion_counts.plot.pie(autopct='%1.1f%%', ax=ax) ax.set_title('客户情感分布') plt.tight_layout() chart_path = os.path.join(self.output_dir, "customer_emotion_chart.png") plt.savefig(chart_path, dpi=100) plt.close() doc.add_picture(chart_path, width=Inches(4)) doc.add_paragraph('图2: 客户情感分布') # 添加详细分析结果 doc.add_heading('详细分析结果', level=1) # 创建详细表格 table = doc.add_table(rows=1, cols=6) table.style = 'Table Grid' # 表头 hdr_cells = table.rows[0].cells headers = ['文件名', '开场白', '结束语', '禁语', '客服情感', '问题解决'] for i, header in enumerate(headers): hdr_cells[i].text = header # 填充数据 for res in result['results']: row_cells = table.add_row().cells row_cells[0].text = res['file_name'] row_cells[1].text = "✓" if res['opening_check'] else "✗" row_cells[2].text = "✓" if res['closing_check'] else "✗" row_cells[3].text = "✗" if res['forbidden_check'] else "✓" row_cells[4].text = res['agent_emotion']['label'] row_cells[5].text = "✓" if res['resolution_rate'] else "✗" # 保存文档 doc.save(output_path) if __name__ == "__main__": # 检查是否安装了torch try: import torch except ImportError: print("警告: PyTorch 未安装,情感分析可能无法使用GPU加速") app = QApplication(sys.argv) window = MainWindow()
07-15
import tkinter as tk from tkinter import ttk, filedialog, messagebox, scrolledtext import pandas as pd import matplotlib.pyplot as plt from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg import os from datetime import datetime class ExcelQuerySystem: def __init__(self, root): self.root = root self.root.title("Excel表格查询系统") self.root.geometry("1200x800") self.root.minsize(1000, 600) # 初始化变量 self.df = None self.filtered_df = None self.current_page = 1 self.rows_per_page = 20 self.sort_column = None self.sort_order = "asc" # 设置样式 self.style = ttk.Style() self.style.theme_use("clam") self.setup_styles() # 创建主布局 self.create_main_layout() # 创建菜单 self.create_menus() # 创建工具栏 self.create_toolbar() # 创建主内容区 self.create_main_content() # 初始状态禁用某些控件 self.update_widget_states() def setup_styles(self): """设置界面样式""" # 主色调:蓝色 primary_color = "#3498db" secondary_color = "#2980b9" accent_color = "#f39c12" # 配置样式 self.style.configure("Main.TFrame", background="white") self.style.configure("ToolBar.TFrame", background="#f5f5f5") self.style.configure("SidePanel.TFrame", background="#f8f9fa") self.style.configure("MainContent.TFrame", background="white") # 按钮样式 self.style.configure("Primary.TButton", background=primary_color, foreground="white", borderwidth=0, padding=6) self.style.map("Primary.TButton", background=[("active", secondary_color)]) self.style.configure("Accent.TButton", background=accent_color, foreground="white", borderwidth=0, padding=6) self.style.map("Accent.TButton", background=[("active", "#e67e22")]) # 标签样式 self.style.configure("Header.TLabel", font=("Arial", 14, "bold"), foreground="#2c3e50", padding=10) self.style.configure("Section.TLabel", font=("Arial", 12, "bold"), foreground="#34495e", padding=5) # 表格样式 self.style.configure("DataTable.Treeview", font=("Arial", 10), rowheight=25) self.style.configure("DataTable.Treeview.Heading", font=("Arial", 11, "bold"), background=primary_color, foreground="white") # 输入框样式 self.style.configure("Entry.TEntry", padding=5, bordercolor="#bdc3c7", relief="flat") # 组合框样式 self.style.configure("TCombobox", padding=5, bordercolor="#bdc3c7", relief="flat") def create_main_layout(self): """创建主布局""" # 主框架 self.main_frame = ttk.Frame(self.root, style="Main.TFrame") self.main_frame.pack(fill=tk.BOTH, expand=True) # 工具栏框架 self.toolbar_frame = ttk.Frame(self.main_frame, style="ToolBar.TFrame", height=40) self.toolbar_frame.pack(fill=tk.X, side=tk.TOP) # 内容区框架 self.content_frame = ttk.Frame(self.main_frame, style="Main.TFrame") self.content_frame.pack(fill=tk.BOTH, expand=True) # 左侧面板 self.side_panel = ttk.Frame(self.content_frame, style="SidePanel.TFrame", width=300) self.side_panel.pack(fill=tk.Y, side=tk.LEFT, padx=5, pady=5) # 右侧主内容 self.main_content = ttk.Frame(self.content_frame, style="MainContent.TFrame") self.main_content.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) def create_menus(self): """创建菜单""" menubar = tk.Menu(self.root) # 文件菜单 file_menu = tk.Menu(menubar, tearoff=0) file_menu.add_command(label="导入Excel", command=self.import_excel) file_menu.add_command(label="导出结果", command=self.export_results) file_menu.add_separator() file_menu.add_command(label="退出", command=self.root.quit) menubar.add_cascade(label="文件", menu=file_menu) # 帮助菜单 help_menu = tk.Menu(menubar, tearoff=0) help_menu.add_command(label="使用说明", command=self.show_help) help_menu.add_command(label="关于", command=self.show_about) menubar.add_cascade(label="帮助", menu=help_menu) self.root.config(menu=menubar) def create_toolbar(self): """创建工具栏""" # 导入按钮 self.import_btn = ttk.Button(self.toolbar_frame, text="导入Excel", command=self.import_excel, style="Primary.TButton") self.import_btn.pack(side=tk.LEFT, padx=5, pady=5) # 导出按钮 self.export_btn = ttk.Button(self.toolbar_frame, text="导出结果", command=self.export_results, style="Accent.TButton") self.export_btn.pack(side=tk.LEFT, padx=5, pady=5) # 统计按钮 self.stats_btn = ttk.Button(self.toolbar_frame, text="数据统计", command=self.show_statistics, style="Primary.TButton") self.stats_btn.pack(side=tk.LEFT, padx=5, pady=5) # 分隔符 separator = ttk.Separator(self.toolbar_frame, orient=tk.VERTICAL) separator.pack(side=tk.LEFT, fill=tk.Y, padx=5, pady=5) # 状态标签 self.status_label = ttk.Label(self.toolbar_frame, text="就绪", font=("Arial", 10)) self.status_label.pack(side=tk.RIGHT, padx=10, pady=5) def create_main_content(self): """创建主内容区""" # 表格数据展示区 self.table_frame = ttk.Frame(self.main_content) self.table_frame.pack(fill=tk.BOTH, expand=True) # 表格 self.tree = ttk.Treeview(self.table_frame, style="DataTable.Treeview") self.tree.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) # 滚动条 self.scrollbar = ttk.Scrollbar(self.table_frame, orient=tk.VERTICAL, command=self.tree.yview) self.scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.tree.configure(yscrollcommand=self.scrollbar.set) # 分页控件 self.pagination_frame = ttk.Frame(self.main_content) self.pagination_frame.pack(fill=tk.X, side=tk.BOTTOM, pady=5) self.prev_btn = ttk.Button(self.pagination_frame, text="上一页", command=self.prev_page) self.prev_btn.pack(side=tk.LEFT, padx=5) self.page_label = ttk.Label(self.pagination_frame, text="第 1 页") self.page_label.pack(side=tk.LEFT, padx=5) self.next_btn = ttk.Button(self.pagination_frame, text="下一页", command=self.next_page) self.next_btn.pack(side=tk.LEFT, padx=5) # 图表展示区 self.chart_frame = ttk.Frame(self.main_content) self.chart_frame.pack(fill=tk.BOTH, expand=True, pady=5) # 初始隐藏图表区 self.chart_frame.pack_forget() def create_side_panel_content(self): """创建侧边面板内容""" # 清空现有内容 for widget in self.side_panel.winfo_children(): widget.destroy() # 标题 title_label = ttk.Label(self.side_panel, text="查询条件", style="Section.TLabel") title_label.pack(fill=tk.X, pady=5) # 查询条件区域 self.query_frame = ttk.Frame(self.side_panel) self.query_frame.pack(fill=tk.X, padx=5, pady=5) # 添加查询条件行 self.add_query_row() # 添加条件按钮 self.add_condition_btn = ttk.Button(self.side_panel, text="+ 添加条件", command=self.add_query_row, style="Primary.TButton") self.add_condition_btn.pack(fill=tk.X, padx=5, pady=2) # 查询按钮 self.query_btn = ttk.Button(self.side_panel, text="执行查询", command=self.execute_query, style="Accent.TButton") self.query_btn.pack(fill=tk.X, padx=5, pady=10) # 重置按钮 self.reset_btn = ttk.Button(self.side_panel, text="重置查询", command=self.reset_query, style="Primary.TButton") self.reset_btn.pack(fill=tk.X, padx=5, pady=2) # 数据信息区域 info_label = ttk.Label(self.side_panel, text="数据信息", style="Section.TLabel") info_label.pack(fill=tk.X, padx=5, pady=10) self.info_text = scrolledtext.ScrolledText(self.side_panel, height=8, wrap=tk.WORD) self.info_text.pack(fill=tk.BOTH, padx=5, pady=5) self.info_text.config(state=tk.DISABLED) def add_query_row(self): """添加查询条件行""" row_frame = ttk.Frame(self.query_frame) row_frame.pack(fill=tk.X, pady=3) # 列选择 columns = list(self.df.columns) if self.df is not None else [] column_combo = ttk.Combobox(row_frame, values=columns, state="readonly", width=15) column_combo.pack(side=tk.LEFT, padx=2) # 操作符选择 operators = ["包含", "等于", "不等于", "大于", "小于", "大于等于", "小于等于"] operator_combo = ttk.Combobox(row_frame, values=operators, state="readonly", width=10) operator_combo.pack(side=tk.LEFT, padx=2) operator_combo.current(0) # 值输入 value_entry = ttk.Entry(row_frame) value_entry.pack(side=tk.LEFT, padx=2, fill=tk.X, expand=True) # 删除按钮 delete_btn = ttk.Button(row_frame, text="×", command=lambda: row_frame.destroy(), width=3, style="Accent.TButton") delete_btn.pack(side=tk.LEFT, padx=2) def update_widget_states(self): """更新控件状态""" if self.df is None: # 禁用导出、统计按钮 self.export_btn.config(state=tk.DISABLED) self.stats_btn.config(state=tk.DISABLED) self.query_btn.config(state=tk.DISABLED) self.reset_btn.config(state=tk.DISABLED) self.add_condition_btn.config(state=tk.DISABLED) self.prev_btn.config(state=tk.DISABLED) self.next_btn.config(state=tk.DISABLED) else: # 启用导出、统计按钮 self.export_btn.config(state=tk.NORMAL) self.stats_btn.config(state=tk.NORMAL) self.query_btn.config(state=tk.NORMAL) self.reset_btn_btn.config(state=tk.NORMAL) self.add_condition_btn.config(state=tk.NORMAL) # 更新根据数据量启用/禁用分页按钮 total_pages = self.get_total_pages() if total_pages <= 1: self.prev_btn.config(state=tk.DISABLED) self.next_btn.config(state=tk.DISABLED) else: self.prev_btn.config(state=tk.NORMAL if self.current_page > 1 else tk.DISABLED) self.next_btn.config(state=tk.NORMAL if self.current_page < total_pages_pages else tk.DISABLED) def import_excel(self): """导入Excel文件""" file_path = filedialog.askopenfilename( title="选择Excel文件", filetypes=[("Excel文件", "*.xlsx *.xls"), ("所有文件", "*.*")] ) if not file_path: return try: # 读取Excel文件 self.df = pd.read_excel(file_path) self.filtered_df = self.df.copy() self.current_page = 1 # 更新状态 self.status_label.config(text=f"已导入: {os.path.basename(file_path)}") # 显示数据信息 self.update_info_text() # 创建侧边面板内容 self.create_side_panel_content() # 显示表格数据 self.display_table_data() # 更新控件状态 self.update_widget_states() messagebox.showinfo("导入成功", f"成功导入Excel文件,包含 {len(self.df)} 行数据") except Exception as e: messagebox.showerror("导入失败", f"导入Excel文件时出错: {str(e)}") self.status_label.config(text="导入失败") def update_info_text(self): """更新信息文本框""" if self.df is None: return info = f"数据文件: {self.status_label.cget('text').replace('已导入: ', '')}\n" info += f"总行数: {len(self.df)}\n" info += f"总列数: {len(self.df.columns)}\n\n" info += "列名及数据类型:\n" for col in self.df.columns: dtype = str(self.df[col].dtype) info += f"- {col}: {dtype}\n" self.info_text.config(state=tk.NORMAL) self.info_text.delete(1.0, tk.END) self.info_text.insert(tk.END, info) self.info_text.config(state=tk.DISABLED) def display_table_data(self): """显示表格数据""" # 清空现有表格 self.tree.delete(*self.tree.get_children()) # 如果没有数据,显示提示信息 if self.filtered_df is None or len(self.filtered_df) == 0: self.tree["columns"] = ["提示"] self.tree.column("#0", width=0, stretch=tk.NO) self.tree.column("提示", anchor=tk.CENTER, width=800) self.tree.heading("提示", text="没有符合条件的数据") self.tree.insert("", tk.END, values=["没有符合条件的数据"]) return # 获取当前页数据 start_idx = (self.current_page - 1) * self.rows_per_page end_idx = start_idx + self.rows_per_page page_data = self.filtered_df.iloc[start_idx:end_idx] # 设置表格列 columns = list(page_data.columns) self.tree["columns"] = columns # 隐藏第一列(默认的#0列) self.tree.column("#0", width=0, stretch=tk.NO) # 设置列属性 for col in columns: self.tree.column(col, anchor=tk.CENTER, width=100, minwidth=80) self.tree.heading(col, text=col, command=lambda c=col: self.sort_data(c)) # 添加数据行 for _, row in page_data.iterrows(): values = [str(val) if pd.notna(val) else "" for val in row.values] self.tree.insert("", tk.END, values=values) # 更新分页信息 total_pages = self.get_total_pages() self.page_label.config(text=f"第 {self.current_page} 页 / 共 {total_pages} 页") def get_total_pages(self): """计算总页数""" if self.filtered_df is None or len(self.filtered_df) == 0: return 1 return (len(self.filtered_df) + self.rows_per_page - 1) // self.rows_per_page def prev_page(self): """上一页""" if self.current_page > 1: self.current_page -= 1 self.display_table_data() self.update_widget_states() def next_page(self): """下一页""" total_pages = self.get_total_pages() if self.current_page < total_pages: self.current_page += 1 self.display_table_data() self.update_widget_states() def sort_data(self, column): """排序数据""" if self.filtered_df is None: return # 如果点击的是当前排序列,则切换排序顺序 if self.sort_column == column: self.sort_order = "desc" if self.sort_order == "asc" else "asc" else: self.sort_column = column self.sort_order = "asc" # 执行排序 self.filtered_df = self.filtered_df.sort_values( by=column, ascending=(self.sort_order == "asc") ) # 重置到第一页 self.current_page = 1 # 更新表格显示 self.display_table_data() def execute_query(self): """执行查询""" if self.df is None: return # 获取所有查询条件 conditions = [] valid = True for row_frame in self.query_frame.winfo_children(): widgets = row_frame.winfo_children() if len(widgets) < 3: continue column_combo = widgets[0] operator_combo = widgets[1] value_entry = widgets[2] column = column_combo.get() operator = operator_combo.get() value = value_entry.get().strip() if not column or not value: continue conditions.append((column, operator, value)) # 应用查询条件 try: self.filtered_df = self.df.copy() for column, operator, value in conditions: if column not in self.filtered_df.columns: continue # 根据数据类型处理查询值 dtype = self.filtered_df[column].dtype try: if "int" in str(dtype): value = int(value) elif "float" in str(dtype): value = float(value) elif "datetime" in str(dtype): value = pd.to_datetime(value) except ValueError: # 如果无法转换,则作为字符串处理 pass # 应用过滤条件 if operator == "包含": if isinstance(value, str): self.filtered_df = self.filtered_df[ self.filtered_df[column].astype(str).str.contains(value, na=False) ] elif operator == "等于": self.filtered_df = self.filtered_df[ self.filtered_df[column] == value ] elif operator == "不等于": self.filtered_df = self.filtered_df[ self.filtered_df[column] != value ] elif operator == "大于": self.filtered_df = self.filtered_df[ self.filtered_df[column] > value ] elif operator == "小于": self.filtered_df = self.filtered_df[ self.filtered_df[column] < value ] elif operator == "大于等于": self.filtered_df = self.filtered_df[ self.filtered_df[column] >= value ] elif operator == "小于等于": self.filtered_df = self.filtered_df[ self.filtered_df[column] <= value ] # 重置到第一页 self.current_page = 1 # 更新表格显示 self.display_table_data() # 更新状态 self.status_label.config(text=f"查询完成,找到 {len(self.filtered_df)} 条结果") # 更新控件状态 self.update_widget_states() except Exception as e: messagebox.showerror("查询失败", f"执行查询时出错: {str(e)}") def reset_query(self): """重置查询""" if self.df is None: return # 重置过滤后的数据 self.filtered_df = self.df.copy() self.current_page = 1 # 清空查询条件 for widget in self.query_frame.winfo_children(): widget.destroy() # 添加一行空的查询条件 self.add_query_row() # 更新表格显示 self.display_table_data() # 更新状态 self.status_label.config(text="查询已重置") # 更新控件状态 self.update_widget_states() def export_results(self): """导出查询结果""" if self.filtered_df is None or len(self.filtered_df) == 0: messagebox.showwarning("导出警告", "没有可导出的数据") return # 选择导出文件路径 file_path = filedialog.asksaveasfilename( title="导出结果", defaultextension=".xlsx", filetypes=[("Excel文件", "*.xlsx"), ("CSV文件", "*.csv"), ("所有文件", "*.*")] ) if not file_path: return try: # 根据文件扩展名选择导出格式 if file_path.endswith(".csv"): self.filtered_df.to_csv(file_path, index=False, encoding="utf-8-sig") else: self.filtered_df.to_excel(file_path, index=False) messagebox.showinfo("导出成功", f"成功导出 {len(self.filtered_df)} 行数据到 {file_path}") self.status_label.config(text=f"已导出: {os.path.basename(file_path)}") except Exception as e: messagebox.showerror("导出失败", f"导出数据时出错: {str(e)}") def show_statistics(self): """显示数据统计""" if self.filtered_df is None or len(self.filtered_df) == 0: messagebox.showwarning("统计警告", "没有可统计的数据") return # 显示图表区域 self.chart_frame.pack(fill=tk.BOTH, expand=True, pady=5) # 清空现有图表 for widget in self.chart_frame.winfo_children(): widget.destroy() # 创建统计信息标签 stats_label = ttk.Label(self.chart_frame, text="数据统计与可视化", style="Header.TLabel") stats_label.pack(fill=tk.X, pady=5) # 创建统计内容框架 stats_content_frame = ttk.Frame(self.chart_frame) stats_content_frame.pack(fill=tk.BOTH, expand=True) # 左侧统计信息 stats_text_frame = ttk.Frame(stats_content_frame, width=300) stats_text_frame.pack(fill=tk.Y, side=tk.LEFT, padx=5) # 生成统计信息 stats_text = self.generate_statistics_text() # 显示统计信息 stats_text_widget = scrolledtext.ScrolledText(stats_text_frame, wrap=tk.WORD) stats_text_widget.pack(fill=tk.BOTH, expand=True) stats_text_widget.insert(tk.END, stats_text) stats_text_widget.config(state=tk.DISABLED) # 右侧图表 chart_canvas_frame = ttk.Frame(stats_content_frame) chart_canvas_frame.pack(fill=tk.BOTH, expand=True, padx=5) # 创建图表 self.create_chart(chart_canvas_frame) def generate_statistics_text(self): """生成统计信息文本""" if self.filtered_df is None: return "" stats_text = f"数据概览:\n" stats_text += f"- 总记录数: {len(self.filtered_df)}\n" stats_text += f"- 总列数: {len(self.filtered_df.columns)}\n\n" # 数值型列的统计信息 numeric_columns = self.filtered_df.select_dtypes(include=["int64", "float64"]).columns if len(numeric_columns) > 0: stats_text += "数值型列统计:\n" for col in numeric_columns: stats_text += f"\n{col}:\n" stats_text += f" - 平均值: {self.filtered_df[col].mean():.2f}\n" stats_text += f" - 中位数: {self.filtered_df[col].median():.2f}\n" stats_text += f" - 最小值: {self.filtered_df[col].min():.2f}\n" stats_text += f" - 最大值: {self.filtered_df[col].max():.2f}\n" stats_text += f" - 标准差: {self.filtered_df[col].std():.2f}\n" # 字符串列的统计信息 string_columns = self.filtered_df.select_dtypes(include=["object"]).columns if len(string_columns) > 0: stats_text += "\n字符串列统计:\n" for col in string_columns: non_null_count = self.filtered_df[col].notna().sum() unique_count = self.filtered_df[col].nunique() stats_text += f"\n{col}:\n" stats_text += f" - 非空值数量: {non_null_count}\n" stats_text += f" - 唯一值数量: {unique_count}\n" # 如果唯一值数量不多,显示前5个最常见的值 if unique_count > 0 and unique_count <= 20: top_values = self.filtered_df[col].value_counts().head(5) stats_text += " - 常见值: " for val, count in top_values.items(): stats_text += f"{val}({count}), " stats_text = stats_text.rstrip(", ") + "\n" # 日期时间列的统计信息 datetime_columns = self.filtered_df.select_dtypes(include=["datetime64"]).columns if len(datetime_columns) > 0: stats_text += "\n日期时间列统计:\n" for col in datetime_columns: stats_text += f"\n{col}:\n" stats_text += f" - 最早日期: {self.filtered_df[col].min()}\n" stats_text += f" - 最晚日期: {self.filtered_df[col].max()}\n" stats_text += f" - 时间跨度: {(self.filtered_df[col].max() - self.filtered_df[col].min()).days} 天\n" return stats_text def create_chart(self, parent_frame): """创建数据可视化图表""" if self.filtered_df is None: return # 选择第一个数值型列进行可视化 numeric_columns = self.filtered_df.select_dtypes(include=["int64", "float64"]).columns if len(numeric_columns) == 0: # 如果没有数值型列,选择第一个字符串列进行柱状图 string_columns = self.filtered_df.select_dtypes(include=["object"]).columns if len(string_columns) == 0: return col = string_columns[0] value_counts = self.filtered_df[col].value_counts().head(10) # 创建柱状图 fig, ax = plt.subplots(figsize=(8, 6)) ax.bar(value_counts.index.astype(str), value_counts.values) ax.set_title(f"{col} 分布") ax.set_xlabel(col) ax.set_ylabel("数量") plt.xticks(rotation=45, ha="right") else: col = numeric_columns[0] # 创建直方图 fig, ax = plt.subplots(figsize=(8, 6)) ax.hist(self.filtered_df[col].dropna(), bins=20, edgecolor="black") ax.set_title(f"{col} 分布") ax.set_xlabel(col) ax.set_ylabel("频数") # 调整布局 plt.tight_layout() # 创建画布并显示 canvas = FigureCanvasTkAgg(fig, master=parent_frame) canvas.draw() canvas.get_tk_widget().pack(fill=tk.BOTH, expand=True) def show_help(self): """显示使用说明""" help_text = """ Excel表格查询系统使用说明 1. 导入Excel文件 - 点击"导入Excel"按钮或通过"文件"菜单选择Excel文件 - 支持.xlsx和.xls格式 2. 查询数据 - 在左侧面板设置查询条件 - 点击"+ 添加条件"可以增加多个查询条件 - 点击"执行查询"按钮应用筛选 - 点击"重置查询"按钮清除所有条件 3. 数据操作 - 点击表头可以按该列排序 - 使用分页控件浏览大量数据 - 点击"导出结果"按钮可以将查询结果导出为Excel或CSV文件 4. 数据统计 - 点击"数据统计"按钮查看数据统计信息和可视化图表 - 统计信息包括基本统计量、分布情况等 - 可视化图表根据数据类型自动生成 5. 其他功能 - 通过"帮助"菜单可以查看使用说明和关于信息 - 通过"文件"菜单可以退出系统 """ messagebox.showinfo("使用说明", help_text) def show_about(self): """显示关于信息""" about_text = """ Excel表格查询系统 v1.0 一个简单易用的Excel数据查询和分析工具, 无需编程知识即可快速查询、筛选和分析Excel表格数据。 主要功能: - Excel文件导入与预览 - 多条件组合查询 - 查询结果导出 - 数据统计与可视化 技术支持:Python + Tkinter + Pandas + Matplotlib © 2025 Excel表格查询系统 """ messagebox.showinfo("关于", about_text) def main(): """主函数""" root = tk.Tk() app = ExcelQuerySystem(root) root.mainloop() if __name__ == "__main__": main()
最新发布
10-24
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值