Rewriting UPDATE as MERGE (max + decode)

Date recorded: 2014-07-30 14:25:27

 

------------- Optimization approach: reduce the number of scans of the large table by pivoting with the max + decode technique (here, SUM + DECODE)
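The idea, on a deliberately simplified pair of tables (t_main and t_detail are made-up names for illustration only): instead of running one correlated scalar subquery per target column, scan the detail table once and pivot the per-ORG sums with SUM + DECODE:

-- Before: t_detail is probed once per target row and per column
UPDATE t_main m
   SET col_a = (SELECT NVL(SUM(amt), 0) FROM t_detail d
                 WHERE d.id = m.id AND d.org = 242),
       col_b = (SELECT NVL(SUM(amt), 0) FROM t_detail d
                 WHERE d.id = m.id AND d.org = 241);

-- After: one pass over t_detail, pivoted into two columns
MERGE INTO t_main m
USING (SELECT d.id,
              NVL(SUM(DECODE(d.org, 242, d.amt)), 0) AS sum_242,
              NVL(SUM(DECODE(d.org, 241, d.amt)), 0) AS sum_241
         FROM t_detail d
        WHERE d.org IN (242, 241)
        GROUP BY d.id) s
ON (m.id = s.id)
WHEN MATCHED THEN
  UPDATE SET m.col_a = s.sum_242, m.col_b = s.sum_241;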

 

Original SQL statement:

UPDATE RKO_ACCT_STATUS A SET RMB_PAYMENT = (SELECT NVL(SUM(POSTING_AMT), 0) FROM RKOT_ACCT_PMT_PRIOR B WHERE B.ACCT = A.ACCT AND ORG = '242' AND POSTING_DATE BETWEEN A.PRIOR_BILLING_DATE + 1 AND ADD_MONTHS(A.PRIOR_BILLING_DATE, 1) AND TXN_DATE <= (SELECT CASE WHEN USER_DATE_10=0 THEN NULL ELSE TO_DATE(USER_DATE_10, 'yyyyddd') END FROM RKOH_HAPS_AMBS_KD WHERE ACCT = A.ACCT AND ORG = 242 AND BATCH_DATE = TO_CHAR(LAST_DAY(A.PRIOR_BILLING_DATE),'yyyymmdd')) + 0.99999), USD_PAYMENT = (SELECT NVL(SUM(POSTING_AMT), 0) FROM RKOT_ACCT_PMT_PRIOR B WHERE B.ACCT = A.ACCT AND ORG = '241' AND POSTING_DATE BETWEEN A.PRIOR_BILLING_DATE + 1 AND ADD_MONTHS(A.PRIOR_BILLING_DATE, 1) AND TXN_DATE <= (SELECT CASE WHEN USER_DATE_10=0 THEN NULL ELSE TO_DATE(USER_DATE_10, 'yyyyddd') END FROM RKOH_HAPS_AMBS_KD WHERE ACCT = A.ACCT AND ORG = 241 AND BATCH_DATE = TO_CHAR(LAST_DAY(A.PRIOR_BILLING_DATE),'yyyymmdd')) + 0.99999) WHERE TO_CHAR(A.PRIOR_BILLING_DATE, 'yyyymm') = :B1;

 

Formatted for readability:

UPDATE RKO_ACCT_STATUS A
   SET RMB_PAYMENT =
       (SELECT NVL(SUM(POSTING_AMT), 0)
          FROM RKOT_ACCT_PMT_PRIOR B
         WHERE B.ACCT = A.ACCT
           AND ORG = '242'
           AND POSTING_DATE BETWEEN A.PRIOR_BILLING_DATE + 1
                                AND ADD_MONTHS(A.PRIOR_BILLING_DATE, 1)
           AND TXN_DATE <=
               (SELECT CASE
                         WHEN USER_DATE_10 = 0 THEN NULL
                         ELSE TO_DATE(USER_DATE_10, 'yyyyddd')
                       END
                  FROM RKOH_HAPS_AMBS_KD
                 WHERE ACCT = A.ACCT
                   AND ORG = 242
                   AND BATCH_DATE = TO_CHAR(LAST_DAY(A.PRIOR_BILLING_DATE), 'yyyymmdd')) + 0.99999),
       USD_PAYMENT =
       (SELECT NVL(SUM(POSTING_AMT), 0)
          FROM RKOT_ACCT_PMT_PRIOR B
         WHERE B.ACCT = A.ACCT
           AND ORG = '241'
           AND POSTING_DATE BETWEEN A.PRIOR_BILLING_DATE + 1
                                AND ADD_MONTHS(A.PRIOR_BILLING_DATE, 1)
           AND TXN_DATE <=
               (SELECT CASE
                         WHEN USER_DATE_10 = 0 THEN NULL
                         ELSE TO_DATE(USER_DATE_10, 'yyyyddd')
                       END
                  FROM RKOH_HAPS_AMBS_KD
                 WHERE ACCT = A.ACCT
                   AND ORG = 241
                   AND BATCH_DATE = TO_CHAR(LAST_DAY(A.PRIOR_BILLING_DATE), 'yyyymmdd')) + 0.99999)
 WHERE TO_CHAR(A.PRIOR_BILLING_DATE, 'yyyymm') = :B1;

The pain point: the correlated scalar subqueries run once per target row and per column, so the large table RKOT_ACCT_PMT_PRIOR is probed twice for every qualifying row of RKO_ACCT_STATUS, and RKOH_HAPS_AMBS_KD twice more.

 

 

Execution plan of the original SQL:

(The plan screenshot from the original post is not preserved.)
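To re-capture the plan on a live system, one standard option (assuming the UPDATE is still in the shared pool and you know its SQL_ID; '<sql_id>' below is a placeholder) is DBMS_XPLAN.DISPLAY_CURSOR. The steps to look for are the FILTER / scalar-subquery operations driven once per row of RKO_ACCT_STATUS, which are exactly what this rewrite removes:

-- '<sql_id>' is a placeholder for the statement's real SQL_ID;
-- rowsource statistics need statistics_level = ALL or the
-- gather_plan_statistics hint on the original statement
SELECT * FROM TABLE(DBMS_XPLAN.DISPLAY_CURSOR('<sql_id>', NULL, 'ALLSTATS LAST'));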

------------------------------------- Optimization

 

-- Build the supporting index quickly (NOLOGGING + parallel), then reset the
-- degree so ordinary queries are not pushed toward parallel plans
create index ind_RKO_ACCT_date on RKO_ACCT_STATUS(PRIOR_BILLING_DATE) NOLOGGING parallel 20;

alter index ind_RKO_ACCT_date NOPARALLEL;
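A quick sanity check after the build (a hedged example; the index name is taken from the DDL above) to confirm the index is valid and its parallel degree really went back to 1:

select index_name, status, degree
  from user_indexes
 where index_name = 'IND_RKO_ACCT_DATE';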


------------------------------------------------------------------------------------------- Optimized SQL

 

MERGE INTO RKO_ACCT_STATUS t
USING (SELECT /*+ USE_HASH(a,b,c) */
              a.rowid AS rowids,
              -- NVL around SUM so an account with payments in only one ORG
              -- still gets 0 (not NULL) in the other column, as in the UPDATE
              NVL(SUM(DECODE(b.org, 242, NVL(b.POSTING_AMT, 0))), 0) AS counts,
              NVL(SUM(DECODE(b.org, 241, NVL(b.POSTING_AMT, 0))), 0) AS counts1
         FROM RKOT_ACCT_PMT_PRIOR b,
              RKOH_HAPS_AMBS_KD   c,
              RKO_ACCT_STATUS     a
        WHERE b.ACCT = a.ACCT
          AND c.ACCT = a.ACCT
          AND b.ORG = c.ORG
          AND b.ORG IN (242, 241)
          AND b.POSTING_DATE BETWEEN a.PRIOR_BILLING_DATE + 1
                                 AND ADD_MONTHS(a.PRIOR_BILLING_DATE, 1)
          AND c.BATCH_DATE = TO_CHAR(LAST_DAY(a.PRIOR_BILLING_DATE), 'yyyymmdd')
          -- the 0.99999 day fraction belongs on the TXN_DATE cutoff (end of
          -- day), as in the original UPDATE, not on BATCH_DATE
          AND b.TXN_DATE <= CASE
                              WHEN c.USER_DATE_10 = 0 THEN NULL
                              ELSE TO_DATE(c.USER_DATE_10, 'yyyyddd')
                            END + 0.99999
          AND a.PRIOR_BILLING_DATE BETWEEN to_date('2014-04-01', 'YYYY-MM-dd')
                                       AND to_date('2014-05-31', 'YYYY-MM-dd')
        GROUP BY a.rowid) t1
ON (t.rowid = t1.rowids)
WHEN MATCHED THEN
  UPDATE SET t.RMB_PAYMENT = t1.counts,
             t.USD_PAYMENT = t1.counts1;
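One behavioral difference worth verifying before adopting the MERGE: the original UPDATE rewrites every row of the target month, setting the payment columns to 0 when no qualifying payments exist, whereas the MERGE leaves such rows untouched. A hedged check for rows the UPDATE would reset but the MERGE would skip (a sketch only: it ignores the TXN_DATE/BATCH_DATE cutoffs, so it yields a lower bound, and :B1 is the same yyyymm bind as in the original):

-- Rows the UPDATE would set to 0 that the MERGE never touches
SELECT COUNT(*)
  FROM RKO_ACCT_STATUS a
 WHERE TO_CHAR(a.PRIOR_BILLING_DATE, 'yyyymm') = :B1
   AND NOT EXISTS
       (SELECT 1
          FROM RKOT_ACCT_PMT_PRIOR b
         WHERE b.ACCT = a.ACCT
           AND b.ORG IN (242, 241)
           AND b.POSTING_DATE BETWEEN a.PRIOR_BILLING_DATE + 1
                                  AND ADD_MONTHS(a.PRIOR_BILLING_DATE, 1));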

    

 

Source: the ITPUB blog, http://blog.itpub.net/26736162/viewspace-1244055/ (please credit the source when reposting).

