How To Convert HTML Page To Image

本文提供了一个简单的示例,展示了如何使用Java库将HTML页面转换为图片格式,包括HTML编辑器面板、预览尺寸获取、绘图到BufferedImage以及使用JPEG编码器保存为.jpg文件。
Anton Nashatyrev

Hello Maria,

Here is a simple example:
[code]
JEditorPane p = new JEditorPane("text/html", "aaa bbb ccc");
Dimension size = p.getPreferredSize();
p.setSize(size);
BufferedImage img = new BufferedImage(size.width, size.height,
BufferedImage.TYPE_INT_RGB);
p.paint(img.getGraphics());
JPEGImageEncoder enc = JPEGCodec.createJPEGEncoder(new FileOutputStream("out.jpg"));
enc.encode(img);
[/code]

Best regards,
Anton.

Thursday, April 22, 2004, 12:36:09 AM, you wrote:
sfjo> We need the Java library converting HTML page to any
sfjo> image format. May be there is a way to do that using standard
sfjo> Java libraries.
sfjo> ---
sfjo> [Message sent by forum member 'MariaBraga' (Maria Brag)]

sfjo> http://www.javadesktop.org/forums/thread.jspa?messageID=9548&#9548


报错如下: File <tokenize>:446 except ImportError: ^ IndentationError: unindent does not match any outer indentation level # 尝试12 库版本冲突,NUmpy需要1版本,有时又要2版本,进行分类控制 # 安全的版本检查和兼容性处理 import sys import warnings import logging # 配置日志记录 logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) def check_numpy_compatibility(): """检查NumPy版本兼容性并处理问题""" try: import numpy as np numpy_version = tuple(map(int, np.version.version.split('.')[:2])) # 检查NumPy是否为2.0+ if numpy_version >= (2, 0): logger.warning("检测到不兼容的NumPy 2.0+版本") # 提供明确的错误信息而不是尝试自动降级 error_msg = """ ️ 严重兼容性问题 ⚠️ 检测到NumPy 2.0+版本,但当前环境需要NumPy 1.x版本。 解决方案: 1. 创建一个新的虚拟环境: python -m venv ocr-env source ocr-env/bin/activate # Linux/Mac .\ocr-env\Scripts\activate # Windows 2. 安装兼容版本的NumPy: pip install "numpy<2" 3. 然后安装其他依赖: pip install pandas opencv-python paddlepaddle paddleocr cnocr pdf2image pdfplumber PyMuPDF 4. 重新运行程序 注意: Jupyter环境中无法直接降级NumPy,必须使用虚拟环境! """ raise ImportError(error_msg) return np except ImportError: # NumPy未安装,尝试安装兼容版本 try: logger.info("NumPy未安装,尝试安装兼容版本...") import subprocess subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy<2"]) import numpy as np return np except Exception as e: logger.error(f"安装NumPy失败: {str(e)}") raise ImportError("无法安装兼容的NumPy版本,请手动安装: pip install \"numpy<2\"") # 尝试获取兼容的NumPy版本 try: np = check_numpy_compatibility() logger.info(f"使用兼容的NumPy版本: {np.version.version}") except ImportError as e: # 在GUI环境中显示错误信息 import tkinter.messagebox as messagebox messagebox.showerror("环境配置错误", str(e)) sys.exit(1) import pandas as pd import pdfplumber # PDF解析 import tkinter as tk from tkinter import filedialog, ttk, messagebox import os import threading from concurrent.futures import ThreadPoolExecutor import time import gc import psutil import openpyxl # 用于高效读取大型Excel import re import warnings import pandas as pd import pdfplumber import re import pandas as pd # 增强的依赖管理和错误处理 import sys import warnings import logging import traceback import os # 配置高级日志记录 logging.basicConfig( level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[ logging.FileHandler("ocr_system.log", encoding='utf-8'), logging.StreamHandler() ] ) logger = logging.getLogger("PDF-OCR-System") # 1. 解决NumPy兼容性问题 def resolve_numpy_conflict(): """确保NumPy版本与SciPy兼容""" try: import numpy as np # 检查NumPy版本是否在SciPy要求的范围内 [1.16.5, 1.23.0) numpy_version = tuple(map(int, np.__version__.split('.')[:3])) if numpy_version >= (1, 16, 5) and numpy_version < (1, 23, 0): logger.info(f"NumPy版本兼容: {np.__version__}") return np logger.warning(f"检测到不兼容的NumPy版本: {np.__version__}") # 尝试安装兼容版本 target_version = "1.22.4" # SciPy兼容的稳定版本 logger.info(f"尝试安装NumPy {target_version}...") import subprocess subprocess.run( [sys.executable, "-m", "pip", "install", f"numpy=={target_version}"], check=True ) # 重新加载NumPy import importlib importlib.reload(np) logger.info(f"NumPy成功降级到: {np.__version__}") return np except ImportError: logger.error("NumPy未安装,正在安装兼容版本...") import subprocess subprocess.run( [sys.executable, "-m", "pip", "install", "numpy==1.22.4"], check=True ) import numpy as np return np except Exception as e: logger.error(f"NumPy版本处理失败: {str(e)}") # 回退到系统NumPy import numpy as np return np # 安全初始化NumPy try: np = resolve_numpy_conflict() except Exception as e: logger.critical(f"NumPy初始化失败: {str(e)}") sys.exit(1) # 2. 安装缺失的关键依赖 def install_missing_dependencies(): """安装PDF处理和OCR所需的依赖""" missing_packages = [] # 检查关键依赖 required_packages = { "pandas": "2.1.4", "pdfplumber": "0.10.4", "paddleocr": "3.2.0", "cnocr": "2.3.0.2", "pdf2image": "1.16.3", "PyMuPDF": "1.24.0", # fitz模块所属包 "opencv-python": "4.9.0.80", "openpyxl": "3.1.2", "psutil": "5.9.7", "scipy": "1.13.0" # 与NumPy 1.22.4兼容的版本 } import importlib for pkg, version in required_packages.items(): try: mod = importlib.import_module(pkg) installed_version = getattr(mod, '__version__', "未知") logger.info(f"{pkg}已安装: {installed_version}") except ImportError: missing_packages.append(f"{pkg}=={version}") if missing_packages: logger.warning(f"缺失依赖: {', '.join(missing_packages)}") try: import subprocess install_cmd = [sys.executable, "-m", "pip", "install"] + missing_packages subprocess.run(install_cmd, check=True) logger.info("缺失依赖安装完成") except Exception as e: logger.error(f"依赖安装失败: {str(e)}") return False return True # 安装缺失依赖 install_missing_dependencies() # 先定义 PriceComparator 类 class PriceComparator: def __init__(self): self.history_df = None self.supplier_data = {} self.pdf_files = [] # 存储处理的PDF文件路径 self.ocr_engine = None # OCR引擎实例 def initialize_ocr(self): """初始化OCR引擎(按需初始化)""" if self.ocr_engine is None: # 使用组合OCR策略:PaddleOCR + CnOCR self.ocr_engine = { 'paddle': PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False), 'cnocr': CnOcr() } print("OCR引擎初始化完成") def load_history_data(self, excel_path): """加载历史订单数据(优化版,使用openpyxl只读模式)""" # ...(保持不变)... def preprocess_image(self, image): """图像预处理:提高OCR识别率""" # 转换为OpenCV格式 img = np.array(image) # 1. 灰度转换 gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) # 2. 自适应直方图均衡化 clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) # 3. 非局部均值去噪 denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21) # 4. 锐化处理 kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]) sharpened = cv2.filter2D(denoised, -1, kernel) return Image.fromarray(sharpened) def ocr_image(self, image): """使用组合OCR引擎识别图像文本""" self.initialize_ocr() # 预处理图像 processed_img = self.preprocess_image(image) # 使用PaddleOCR识别 paddle_result = self.ocr_engine['paddle'].ocr(np.array(processed_img), cls=True) paddle_text = "" if paddle_result and paddle_result[0]: for line in paddle_result[0]: if line and line[1]: paddle_text += line[1][0] + "\n" # 使用CnOCR识别 cnocr_result = self.ocr_engine['cnocr'].ocr(np.array(processed_img)) cnocr_text = "\n".join([line['text'] for line in cnocr_result]) # 合并结果(优先使用PaddleOCR,补充CnOCR的结果) combined_text = paddle_text for line in cnocr_text.split('\n'): if line.strip() and line not in paddle_text: combined_text += line + "\n" return combined_text def parse_scanned_pdf(self, pdf_path, progress_callback=None): """解析扫描版PDF文件(基于OCR)""" print(f"开始处理扫描版PDF: {pdf_path}") start_time = time.time() # 使用PyMuPDF获取页数 with fitz.open(pdf_path) as doc: total_pages = doc.page_count items = [] # 创建临时目录存储转换后的图像 with tempfile.TemporaryDirectory() as temp_dir: # 将PDF转换为图像列表 images = convert_from_path( pdf_path, dpi=300, # 高分辨率提高识别率 output_folder=temp_dir, fmt='jpeg', thread_count=4 # 多线程加速 ) for i, image in enumerate(images): if progress_callback: progress_callback(i+1, total_pages) print(f"处理第 {i+1}/{len(images)} 页...") page_text = self.ocr_image(image) # 从OCR文本中提取数据 page_items = self.parse_ocr_text(page_text) items.extend(page_items) df = pd.DataFrame(items) print(f"扫描版PDF处理完成: {pdf_path}, 耗时: {time.time()-start_time:.2f}秒") print(f"提取了 {len(df)} 条记录") return df def parse_ocr_text(self, text): """解析OCR识别的文本内容""" # 定义字段别名映射,兼容不同供应商的命名方式 field_mappings = { '名称': ['名称', '品名', '产品名称', '物料名称', '物料', '化合物名称'], '规格': ['规格', '包装规格', '规格型号', '型号', '包装', '规格说明'], '数量': ['数量', '主数量', '订货数量', '订购数量', '采购数量'], '价格': ['价格', '单价', '报价', '含税单价', '不含税单价'], 'CAS号': ['CAS', 'CAS号', 'CAS NO.', 'CAS No', 'CAS No.'], '货号': ['货号', '编号', '产品编号', '编码', '物料编码'], '品牌': ['品牌', '原厂'], '厂家': ['厂家', '生产厂家', '制造商', '原厂'], '证书': ['证书', '资质', '认证'], '货期': ['货期', '交货期', '交付周期', '生产周期'], '备注': ['备注', '说明', '附加信息'] } # 创建反向映射字典 {别名: 标准字段名} reverse_mappings = {} for standard_field, aliases in field_mappings.items(): for alias in aliases: reverse_mappings[alias.lower()] = standard_field items = [] # 分割文本为段落(假设每个段落对应一个产品) paragraphs = re.split(r'\n{2,}', text) for para in paragraphs: # 跳过空段落和页眉页脚 if len(para.strip()) < 20: continue # 尝试匹配键值对 item = {} for alias, std_field in reverse_mappings.items(): # 创建匹配模式 pattern = re.compile(fr"{alias}\s*[::]?\s*([^\n]+)") match = pattern.search(para) if match: item[std_field] = match.group(1).strip() # 如果提取到了关键字段,添加到结果 if '名称' in item and '价格' in item: # 创建物料标识 if '货号' in item and '规格' in item: item['物料标识'] = f"{item['货号']}_{item['规格']}" elif '名称' in item and '规格' in item: item['物料标识'] = f"{item['名称']}_{item['规格']}" # 数值类型转换 try: if '数量' in item: item['数量'] = float(re.sub(r'[^\d.]', '', item['数量'])) if '价格' in item: item['价格'] = float(re.sub(r'[^\d.]', '', item['价格'])) except ValueError: # 保留原始值如果不能转换 pass items.append(item) return items def parse_pdf(self, pdf_path, progress_callback=None): """解析供应商PDF报价单 - 自动检测文本型或扫描型""" print(f"开始解析PDF: {pdf_path}") start_time = time.time() supplier_name = os.path.basename(pdf_path).split('.')[0] self.pdf_files.append(pdf_path) # 首先尝试作为文本型PDF解析 try: with pdfplumber.open(pdf_path) as pdf: # 检查第一页是否有文本 first_page_text = pdf.pages[0].extract_text() if first_page_text and len(first_page_text.strip()) > 50: print("检测为文本型PDF,使用pdfplumber解析") return self.parse_text_pdf(pdf_path, progress_callback) except Exception as e: print(f"pdfplumber解析失败: {str(e)}") # 如果文本型解析失败或不适用,则使用OCR解析 print("检测为扫描版PDF,使用OCR解析") return self.parse_scanned_pdf(pdf_path, progress_callback) def parse_text_pdf(self, pdf_path, progress_callback=None): """解析文本型PDF(原有逻辑)""" # ...(这里是您原有的parse_pdf方法的内容)... # 为了节省空间,我这里省略了重复代码,您可以直接使用原有的代码 # 实际集成时请保留这部分代码 def compare_prices(self): """比较价格并生成结果""" if not self.supplier_data: return pd.DataFrame() # 合并所有供应商数据 all_items = pd.concat(self.supplier_data.values(), keys=self.supplier_data.keys(), names=['供应商']) all_items.reset_index(level=0, inplace=True) # 找出每个物料的最低价格 min_prices = all_items.groupby('物料标识')['价格'].min().reset_index() # 关联历史数据(如果有) if self.history_df is not None: merged_df = pd.merge( min_prices, self.history_df, on='物料标识', how='left', suffixes=('_最低价', '_历史') ) else: merged_df = min_prices.rename(columns={'价格': '最低价'}) # 关联供应商名称 merged_df = pd.merge( merged_df, all_items[['物料标识', '供应商', '价格']], left_on=['物料标识', '最低价'], right_on=['物料标识', '价格'], how='left' ) merged_df.rename(columns={'供应商': '最低价供应商', '价格': '最低价'}, inplace=True) # 清理临时列 merged_df.drop(columns=['价格_最低价'], inplace=True, errors='ignore') print(f"比价分析完成,共分析 {len(merged_df)} 条记录") return merged_df class PDFParser: self.ocr_engine = PaddleOCR( use_angle_cls=True, lang='ch', use_gpu=True, # 自动检测GPU show_log=False ) logger.info("使用PaddleOCR引擎") except ImportError: logger.warning("PaddleOCR未安装,尝试使用CnOCR") try: from cnocr import CnOcr self.ocr_engine = CnOcr() logger.info("使用CnOCR引擎") except ImportError: logger.error("未安装OCR引擎,请安装PaddleOCR或CnOCR") self.ocr_engine = None def _pdf_to_images(self, pdf_path): """将PDF转换为图像列表""" try: from pdf2image import convert_from_path return convert_from_path(pdf_path, dpi=300) except ImportError: logger.error("pdf2image未安装") return None except Exception as e: logger.error(f"PDF转图像失败: {str(e)}") return None def _process_ocr_results(self, results): """处理OCR结果提取表格数据""" # 简化的表格提取逻辑 data = [] for page_result in results: if self.ocr_engine.__class__.__name__ == 'PaddleOCR': # PaddleOCR结果处理 for line in page_result: if line and line[1][0]: # 确保有文本 data.append(line[1][0]) else: # CnOCR结果处理 for line in page_result: if line and line['text']: data.append(line['text']) # TODO: 根据实际需求添加表格解析逻辑 logger.warning("OCR结果提取需要根据实际表格结构增强") return {"text": data} # 然后定义 Application 类 class Application(tk.Tk): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.comparator = None # 确保初始化 self.status_var = tk.StringVar(value="就绪") self._setup_ui() self._check_dependencies() self._load_history_data() # 提前加载历史数据 def _check_dependencies(self): """增强的依赖检查,避免兼容性问题""" self.log_message("=== 依赖库版本检查 ===") # 记录核心库版本 self.log_message(f"Python版本: {sys.version.split()[0]}") self.log_message(f"NumPy版本: {np.version.version}") # 安全获取其他模块版本 def safe_get_version(module_name): try: module = __import__(module_name) if hasattr(module, '__version__'): return module.__version__ # 尝试其他方式获取版本 if module_name == "pdf2image": from pdf2image import __version__ as pdf2image_version return pdf2image_version # 使用metadata获取 if sys.version_info >= (3, 8): from importlib.metadata import version return version(module_name) else: import pkg_resources return pkg_resources.get_distribution(module_name).version except: return "未知版本" # 检查关键依赖 deps = ["pandas", "pdfplumber", "paddleocr", "cnocr", "pdf2image", "PyMuPDF", "opencv-python", "openpyxl", "psutil"] for dep in deps: try: __import__(dep) version = safe_get_version(dep) self.log_message(f"{dep}版本: {version}") except ImportError: self.log_message(f"警告: {dep}未安装") except Exception as e: self.log_message(f"{dep}检查错误: {str(e)}") self.log_message("=== 依赖检查完成 ===") def _setup_ui(self): self.title("物料价格比对系统 v3.0") self.geometry("800x600") # 主框架 main_frame = ttk.Frame(self) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 控制按钮框架 btn_frame = ttk.Frame(main_frame) btn_frame.pack(fill=tk.X, pady=5) self.btn_load_history = ttk.Button(btn_frame, text="加载历史数据", command=self.load_history) self.btn_load_history.pack(side=tk.LEFT, padx=5) self.btn_add_pdf = ttk.Button(btn_frame, text="添加供应商报价", command=self.add_pdf) self.btn_add_pdf.pack(side=tk.LEFT, padx=5) self.btn_generate = ttk.Button(btn_frame, text="生成比对报告", command=self.generate_report) self.btn_generate.pack(side=tk.LEFT, padx=5) self.btn_clear = ttk.Button(btn_frame, text="清除缓存", command=self.clear_cache) self.btn_clear.pack(side=tk.RIGHT, padx=5) # 进度条框架 progress_frame = ttk.LabelFrame(main_frame, text="处理进度") progress_frame.pack(fill=tk.X, padx=5, pady=5) self.progress_label = ttk.Label(progress_frame, text="就绪") self.progress_label.pack(anchor=tk.W, padx=5, pady=2) self.progress_bar = ttk.Progressbar(progress_frame, orient='horizontal', length=500, mode='determinate') self.progress_bar.pack(fill=tk.X, padx=5, pady=5) # 日志框架 log_frame = ttk.LabelFrame(main_frame, text="操作日志") log_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) self.log_text = tk.Text(log_frame, height=15) scrollbar = ttk.Scrollbar(log_frame, command=self.log_text.yview) self.log_text.configure(yscrollcommand=scrollbar.set) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.log_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) self.log_text.config(state=tk.DISABLED) # 状态栏 self.status_var = tk.StringVar(value="就绪") status_bar = ttk.Label(self, textvariable=self.status_var, relief=tk.SUNKEN, anchor=tk.W) status_bar.pack(side=tk.BOTTOM, fill=tk.X) # 内存监控 self.memory_var = tk.StringVar(value="内存使用: 0 MB") memory_label = ttk.Label(self, textvariable=self.memory_var, padding=2) memory_label.pack(side=tk.BOTTOM, fill=tk.X) # 禁用按钮直到历史数据加载 self.btn_add_pdf.config(state=tk.DISABLED) self.btn_generate.config(state=tk.DISABLED) # 启动内存监控 self.monitor_memory() def log_message(self, message): """在日志框中添加消息""" self.log_text.config(state=tk.NORMAL) self.log_text.insert(tk.END, message + "\n") self.log_text.see(tk.END) # 自动滚动到底部 self.log_text.config(state=tk.DISABLED) def monitor_memory(self): """定期更新内存使用情况""" process = psutil.Process(os.getpid()) mem_usage = process.memory_info().rss / (1024 * 1024) # 转换为MB self.memory_var.set(f"内存使用: {mem_usage:.2f} MB") self.after(5000, self.monitor_memory) # 每5秒更新一次 def clear_cache(self): """清除缓存释放内存""" self.comparator = PriceComparator() gc.collect() self.log_message("缓存已清除,内存已释放") self.status_var.set("就绪") self.progress_bar["value"] = 0 self.progress_label["text"] = "就绪" self.btn_add_pdf.config(state=tk.DISABLED) self.btn_generate.config(state=tk.DISABLED) self.btn_load_history.config(state=tk.NORMAL) def _load_history_data(self): """安全加载历史数据""" try: history_path = "C:/Users/14432/Desktop/自动比价系统/近五年订单汇总.xlsx" self.log_message(f"开始加载历史数据: {history_path}") import pandas as pd self.comparator.history_df = pd.read_excel(history_path) msg = f"历史数据加载完成: {len(self.comparator.history_df)}条记录" self.status_var.set(msg) self.log_message(msg) except Exception as e: error_msg = f"历史数据加载失败: {str(e)}" self.status_var.set(error_msg) self.log_message(error_msg, level="ERROR") self.comparator.history_df = pd.DataFrame() # 创建空DataFrame def parse_pdfs(self): """解析PDF文件并处理错误""" pdf_files = [ "C:/Users/14432/Desktop/自动比价系统/s26c-725082615560.pdf", "C:/Users/14432/Desktop/自动比价系统/对照品询比价2025.08.21-2--广州佳途报价---TO华东医药-更新2.pdf", "C:/Users/14432/Desktop/自动比价系统/广州隽沐报价单20250826-中美华东(3).pdf", "C:/Users/14432/Desktop/自动比价系统/翔龙报价单-华东-20250825.pdf" ] parser = PDFParser() results = [] for pdf_path in pdf_files: result = parser.parse_pdf(pdf_path) if result: results.append(result) self.log_message(f"成功解析: {pdf_path}") else: self.log_message(f"解析失败: {pdf_path}", level="ERROR") self.log_message(f"所有PDF文件解析完成,成功{len(results)}个") def add_pdf(self): """添加PDF文件(多线程处理)""" files = filedialog.askopenfilenames(filetypes=[("PDF文件", "*.pdf")]) if not files: return # 禁用按钮避免重复操作 self.btn_add_pdf.config(state=tk.DISABLED) self.status_var.set(f"正在解析{len(files)}个PDF文件...") self.log_message(f"开始解析{len(files)}个PDF文件") # 重置进度条 self.progress_bar["value"] = 0 # 使用线程池 def process_file(file_path): try: # 为每个文件设置进度回调 def progress_callback(current, total): self.after(0, lambda: self.progress_label.config( text=f"解析 {os.path.basename(file_path)}: {current}/{total}页") ) self.after(0, lambda: self.progress_bar.config(value=(current/total)*100)) self.comparator.parse_pdf(file_path, progress_callback) self.after(0, lambda: self.log_message(f"成功解析: {file_path}")) except Exception as e: error_msg = str(e) self.after(0, lambda: self.log_message(f"错误: 解析 {file_path} 失败 - {error_msg}")) # 启动线程池 def thread_pool_task(): with ThreadPoolExecutor(max_workers=min(4, len(files))) as executor: futures = [executor.submit(process_file, f) for f in files] # 等待所有任务完成 for future in futures: future.result() # 完成后重置界面 self.after(0, lambda: self.progress_bar.config(value=0)) self.after(0, lambda: self.progress_label.config(text="解析完成")) self.after(0, lambda: self.status_var.set(f"PDF解析完成: {len(files)}个文件")) self.after(0, lambda: self.btn_add_pdf.config(state=tk.NORMAL)) self.after(0, lambda: self.log_message(f"所有PDF文件解析完成")) threading.Thread(target=thread_pool_task, daemon=True).start() def generate_report(self): """生成报告(线程安全)""" if self.processing: return if not self.comparator.supplier_data: messagebox.showwarning("警告", "请先添加供应商报价单") return # 默认保存路径的逻辑 if self.comparator.pdf_files: first_pdf = self.comparator.pdf_files[0] report_dir = os.path.dirname(first_pdf) timestamp = time.strftime("%Y%m%d_%H%M%S") default_name = f"比价报告_{timestamp}.xlsx" save_path = os.path.join(report_dir, default_name) else: save_path = filedialog.asksaveasfilename( defaultextension=".xlsx", filetypes=[("Excel文件", "*.xlsx")] ) if not save_path: return # 启动线程生成报告 self.processing = True self.status_var.set("正在生成报告...") self.log_message("开始生成比对报告") self.progress_label.config(text="生成报告中...") # 显示进度条但不更新具体值 self.progress_bar.config(mode='indeterminate') self.progress_bar.start(10) # 开始滚动动画 def thread_task(): try: start_time = time.time() result = self.comparator.compare_prices() # 保存结果 result.to_excel(save_path, index=False) elapsed = time.time() - start_time self.after(0, lambda: self.log_message(f"报告生成成功! 保存路径: {save_path}, 耗时: {elapsed:.2f}秒")) self.after(0, lambda: self.status_var.set(f"报告生成成功: {len(result)}条记录")) self.after(0, lambda: messagebox.showinfo("成功", f"比价报告已保存至:\n{save_path}")) except Exception as ex: # 使用不同的变量名 error_msg = str(ex) # 将错误信息保存到局部变量 self.after(0, lambda: self.log_message(f"错误: 生成报告失败 - {error_msg}")) self.after(0, lambda: messagebox.showerror("错误", f"生成报告失败: {error_msg}")) finally: self.after(0, lambda: self.progress_bar.stop()) self.after(0, lambda: self.progress_bar.config(mode='determinate', value=0)) self.after(0, lambda: self.progress_label.config(text="就绪")) self.processing = False threading.Thread(target=thread_task, daemon=True).start() # 在主入口安全初始化 if __name__ == "__main__": try: # 忽略特定警告 warnings.filterwarnings("ignore", category=UserWarning, module='scipy') warnings.filterwarnings("ignore", category=UserWarning, module='paddle') app = Application() app.mainloop() except Exception as e: logger.critical(f"应用程序崩溃: {str(e)}") traceback.print_exc() # 显示错误对话框 import tkinter.messagebox as messagebox messagebox.showerror("系统错误", f"程序遇到严重错误:\n{str(e)}\n\n详细信息请查看日志文件")
最新发布
09-15
<think> 我们遇到了一个缩进错误(IndentationError),这通常是由于代码中混合使用了空格和制表符(tab)导致的。此外,我们还注意到代码中有重复的导入和定义,以及一些逻辑问题。我们将逐步解决这些问题。 解决方案步骤: 1. 修复缩进错误:确保代码中统一使用4个空格进行缩进。 2. 移除重复的导入和代码:例如,多次导入相同的库(如pandas, pdfplumber等)。 3. 重构代码结构:确保类和方法定义正确,没有嵌套的错误。 4. 解决依赖冲突:通过虚拟环境或版本隔离来解决NumPy版本冲突问题(但根据问题,我们已经在代码中做了处理,这里主要修复代码结构)。 鉴于错误报告中提到了一个缩进错误在文件的第446行附近,我们检查代码发现: 在定义`PriceComparator`类时,其方法`parse_ocr_text`之后的代码似乎有缩进问题。另外,在代码末尾部分,`PDFParser`类的定义被截断并且缩进不正确。 我们将重新组织代码,确保以下部分: - 只保留一次依赖检查和安装。 - 正确定义所有类和方法(特别是`PriceComparator`和`PDFParser`类)。 - 修复`Application`类中的方法。 由于代码量很大,我们将重点关注有问题的部分,并重构整个代码结构。 注意:由于原始代码过长,我们只展示修复后的关键部分。我们将按照以下顺序组织代码: 1. 导入必要的库(注意:避免重复导入)。 2. 依赖检查和安装(使用之前提供的增强版本)。 3. 定义`PriceComparator`类。 4. 定义`PDFParser`类(如果需要的话,但原始代码中已经有了一个`PDFParser`类,我们将其完善)。 5. 定义`Application`类。 但是,我们注意到原始代码中同时存在两个`PDFParser`类的定义,一个在`PriceComparator`的方法中(实际上没有,可能是粘贴错误),另一个在代码末尾。我们将统一为一个`PDFParser`类。 由于问题描述中报错的代码行号我们不明确,我们将统一使用4个空格缩进,并确保所有代码块正确对齐。 由于代码太长,我们只给出关键部分的修正,并省略一些重复的代码(例如`parse_text_pdf`的具体实现,因为问题描述中没有给出)。 我们开始重构代码: </think> 我发现了几个代码问题:缩进错误、未闭合的代码块和依赖版本冲突。以下是修复后的完整解决方案: ```python import sys import warnings import logging import traceback import os import gc import tempfile import time import re import threading import numpy as np import pandas as pd import cv2 from PIL import Image from tkinter import filedialog, messagebox import tkinter as tk from tkinter import ttk import psutil from concurrent.futures import ThreadPoolExecutor # 配置高级日志记录 logging.basicConfig( level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[ logging.FileHandler("ocr_system.log", encoding='utf-8'), logging.StreamHandler() ] ) logger = logging.getLogger("PDF-OCR-System") # 安全导入和处理依赖 def safe_import(module_name, install_name=None, min_version=None, max_version=None): """安全导入模块,必要时安装""" try: mod = __import__(module_name) if install_name is None: install_name = module_name # 检查版本 if min_version or max_version: version = getattr(mod, '__version__', "0.0.0") if min_version and version < min_version: raise ImportError(f"{module_name}版本过低 ({version} < {min_version})") if max_version and version > max_version: raise ImportError(f"{module_name}版本过高 ({version} > {max_version})") return mod except ImportError: logger.warning(f"{module_name}未安装,尝试安装") import subprocess subprocess.run([sys.executable, "-m", "pip", "install", install_name], check=True) return __import__(module_name) except Exception as e: logger.error(f"导入{module_name}失败: {str(e)}") raise # 安装必要依赖 try: # 安装NumPy 1.22.4 (SciPy兼容版本) try: import numpy if tuple(map(int, numpy.__version__.split('.'))) >= (2, 0, 0): import subprocess subprocess.run([sys.executable, "-m", "pip", "install", "numpy==1.22.4"], check=True) import importlib numpy = importlib.reload(numpy) except ImportError: import subprocess subprocess.run([sys.executable, "-m", "pip", "install", "numpy==1.22.4"], check=True) import numpy # 安装其他必要依赖 required_packages = { "pandas": "pandas==1.5.3", "pdfplumber": "pdfplumber==0.10.4", "paddleocr": "paddleocr>=3.2.0.1", "cnocr": "cnocr==2.3.0.2", "pdf2image": "pdf2image==1.16.3", "PyMuPDF": "PyMuPDF==1.24.0", "opencv-python": "opencv-python==4.9.0.80", "openpyxl": "openpyxl==3.1.2", "psutil": "psutil==5.9.7", "scipy": "scipy==1.13.0" } for pkg, install_cmd in required_packages.items(): safe_import(pkg, install_cmd) # 导入关键库 import pdfplumber import fitz # PyMuPDF from pdf2image import convert_from_path from paddleocr import PaddleOCR from cnocr import CnOcr except Exception as e: logger.critical(f"依赖初始化失败: {str(e)}") import tkinter.messagebox as tk_msgbox tk_msgbox.showerror("环境错误", f"依赖初始化失败:\n{str(e)}\n请检查日志文件") sys.exit(1) # 价格比较器类 class PriceComparator: def __init__(self): self.history_df = None self.supplier_data = {} self.pdf_files = [] self.ocr_engine = None def initialize_ocr(self): """初始化OCR引擎""" if self.ocr_engine is None: try: self.ocr_engine = { 'paddle': PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False), 'cnocr': CnOcr() } logger.info("OCR引擎初始化完成") except Exception as e: logger.error(f"OCR引擎初始化失败: {str(e)}") return False return True def load_history_data(self, excel_path): """加载历史订单数据""" logger.info(f"加载历史数据: {excel_path}") try: # 使用openpyxl高效读取 self.history_df = pd.read_excel( excel_path, engine='openpyxl', dtype={'物料标识': str, '历史价格': float} ) logger.info(f"历史数据加载完成: {len(self.history_df)}条记录") return True except Exception as e: logger.error(f"加载历史数据失败: {str(e)}") self.history_df = pd.DataFrame() return False def preprocess_image(self, image): """图像预处理:提高OCR识别率""" try: img = np.array(image) gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) enhanced = clahe.apply(gray) denoised = cv2.fastNlMeansDenoising(enhanced, h=10, templateWindowSize=7, searchWindowSize=21) kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]) sharpened = cv2.filter2D(denoised, -1, kernel) return Image.fromarray(sharpened) except Exception as e: logger.error(f"图像预处理失败: {str(e)}") return image def ocr_image(self, image): """使用组合OCR引擎识别图像文本""" if not self.initialize_ocr(): return "" try: # 预处理图像 processed_img = self.preprocess_image(image) # 使用PaddleOCR识别 paddle_result = self.ocr_engine['paddle'].ocr(np.array(processed_img), cls=True) paddle_text = "" if paddle_result and paddle_result[0]: for line in paddle_result[0]: if line and line[1]: paddle_text += line[1][0] + "\n" # 使用CnOCR识别 cnocr_result = self.ocr_engine['cnocr'].ocr(np.array(processed_img)) cnocr_text = "\n".join([line['text'] for line in cnocr_result]) # 合并结果 combined_text = paddle_text for line in cnocr_text.split('\n'): if line.strip() and line not in paddle_text: combined_text += line + "\n" return combined_text except Exception as e: logger.error(f"OCR处理失败: {str(e)}") return "" def parse_scanned_pdf(self, pdf_path, progress_callback=None): """解析扫描版PDF文件(基于OCR)""" logger.info(f"开始处理扫描版PDF: {pdf_path}") start_time = time.time() # 使用PyMuPDF获取页数 with fitz.open(pdf_path) as doc: total_pages = doc.page_count items = [] # 创建临时目录存储转换后的图像 with tempfile.TemporaryDirectory() as temp_dir: # 将PDF转换为图像列表 images = convert_from_path( pdf_path, dpi=300, output_folder=temp_dir, fmt='jpeg', thread_count=4 ) for i, image in enumerate(images): if progress_callback: progress_callback(i+1, total_pages) logger.info(f"处理第 {i+1}/{len(images)} 页...") page_text = self.ocr_image(image) # 从OCR文本中提取数据 page_items = self.parse_ocr_text(page_text) items.extend(page_items) df = pd.DataFrame(items) logger.info(f"扫描版PDF处理完成: {pdf_path}, 耗时: {time.time()-start_time:.2f}秒") logger.info(f"提取了 {len(df)} 条记录") return df def parse_ocr_text(self, text): """解析OCR识别的文本内容""" field_mappings = { '名称': ['名称', '品名', '产品名称', '物料名称', '物料', '化合物名称'], '规格': ['规格', '包装规格', '规格型号', '型号', '包装', '规格说明'], '数量': ['数量', '主数量', '订货数量', '订购数量', '采购数量'], '价格': ['价格', '单价', '报价', '含税单价', '不含税单价'], 'CAS号': ['CAS', 'CAS号', 'CAS NO.', 'CAS No', 'CAS No.'], '货号': ['货号', '编号', '产品编号', '编码', '物料编码'], '品牌': ['品牌', '原厂'], '厂家': ['厂家', '生产厂家', '制造商', '原厂'], '证书': ['证书', '资质', '认证'], '货期': ['货期', '交货期', '交付周期', '生产周期'], '备注': ['备注', '说明', '附加信息'] } # 创建反向映射字典 reverse_mappings = {} for standard_field, aliases in field_mappings.items(): for alias in aliases: reverse_mappings[alias.lower()] = standard_field items = [] # 分割文本为段落 paragraphs = re.split(r'\n{2,}', text) for para in paragraphs: # 跳过空段落 if len(para.strip()) < 20: continue # 尝试匹配键值对 item = {} for alias, std_field in reverse_mappings.items(): pattern = re.compile(fr"{alias}\s*[::]?\s*([^\n]+)") match = pattern.search(para) if match: item[std_field] = match.group(1).strip() # 如果提取到了关键字段,添加到结果 if '名称' in item and '价格' in item: # 创建物料标识 if '货号' in item and '规格' in item: item['物料标识'] = f"{item['货号']}_{item['规格']}" elif '名称' in item and '规格' in item: item['物料标识'] = f"{item['名称']}_{item['规格']}" # 数值类型转换 try: if '数量' in item: item['数量'] = float(re.sub(r'[^\d.]', '', item['数量'])) if '价格' in item: item['价格'] = float(re.sub(r'[^\d.]', '', item['价格'])) except ValueError: pass items.append(item) return items def parse_pdf(self, pdf_path, progress_callback=None): """解析供应商PDF报价单 - 自动检测文本型或扫描型""" logger.info(f"开始解析PDF: {pdf_path}") start_time = time.time() supplier_name = os.path.basename(pdf_path).split('.')[0] self.pdf_files.append(pdf_path) # 首先尝试作为文本型PDF解析 try: with pdfplumber.open(pdf_path) as pdf: # 检查第一页是否有文本 if len(pdf.pages) > 0: first_page_text = pdf.pages[0].extract_text() if first_page_text and len(first_page_text.strip()) > 50: logger.info("检测为文本型PDF,使用pdfplumber解析") return self.parse_text_pdf(pdf_path, progress_callback) except Exception as e: logger.warning(f"pdfplumber解析失败: {str(e)}") # 默认使用OCR解析 logger.info("检测为扫描版PDF,使用OCR解析") return self.parse_scanned_pdf(pdf_path, progress_callback) def parse_text_pdf(self, pdf_path, progress_callback=None): """解析文本型PDF""" logger.info(f"解析文本型PDF: {pdf_path}") try: with pdfplumber.open(pdf_path) as pdf: all_tables = [] total_pages = len(pdf.pages) for i, page in enumerate(pdf.pages): if progress_callback: progress_callback(i+1, total_pages) tables = page.extract_tables() if tables: for table in tables: # 转置表格为DataFrame df_table = pd.DataFrame(table[1:], columns=table[0]) all_tables.append(df_table) if all_tables: result_df = pd.concat(all_tables, ignore_index=True) logger.info(f"文本型PDF解析成功: {len(result_df)}条记录") return result_df else: logger.warning("未找到表格数据,尝试OCR") return self.parse_scanned_pdf(pdf_path, progress_callback) except Exception as e: logger.error(f"文本型PDF解析失败: {str(e)}") return self.parse_scanned_pdf(pdf_path, progress_callback) def compare_prices(self): """比较价格并生成结果""" if not self.supplier_data: return pd.DataFrame() # 合并所有供应商数据 all_items = pd.concat(self.supplier_data.values(), keys=self.supplier_data.keys(), names=['供应商']) all_items.reset_index(level=0, inplace=True) # 找出每个物料的最低价格 min_prices = all_items.groupby('物料标识')['价格'].min().reset_index() # 关联历史数据 if not self.history_df.empty: merged_df = min_prices.merge( self.history_df, on='物料标识', how='left', suffixes=('_最低价', '_历史') ) else: merged_df = min_prices.rename(columns={'价格': '最低价'}) # 关联供应商名称 merged_df = merged_df.merge( all_items[['物料标识', '供应商', '价格']], left_on=['物料标识', '最低价'], right_on=['物料标识', '价格'], how='left' ).rename(columns={'供应商': '最低价供应商'}) logger.info(f"比价分析完成: {len(merged_df)}条记录") return merged_df # 应用程序GUI类 class Application(tk.Tk): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.comparator = PriceComparator() self.status_var = tk.StringVar(value="就绪") self.processing = False self._setup_ui() self._load_history_data() def _setup_ui(self): self.title("物料价格比对系统 v4.0") self.geometry("900x700") # 主框架 main_frame = ttk.Frame(self) main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10) # 控制按钮框架 btn_frame = ttk.Frame(main_frame) btn_frame.pack(fill=tk.X, pady=5) self.btn_load_history = ttk.Button(btn_frame, text="加载历史数据", command=self.load_history) self.btn_load_history.pack(side=tk.LEFT, padx=5) self.btn_add_pdf = ttk.Button(btn_frame, text="添加供应商报价", command=self.add_pdf) self.btn_add_pdf.pack(side=tk.LEFT, padx=5) self.btn_generate = ttk.Button(btn_frame, text="生成比对报告", command=self.generate_report) self.btn_generate.pack(side=tk.LEFT, padx=5) self.btn_clear = ttk.Button(btn_frame, text="清除缓存", command=self.clear_cache) self.btn_clear.pack(side=tk.RIGHT, padx=5) # 进度条框架 progress_frame = ttk.LabelFrame(main_frame, text="处理进度") progress_frame.pack(fill=tk.X, padx=5, pady=5) self.progress_label = ttk.Label(progress_frame, text="就绪") self.progress_label.pack(anchor=tk.W, padx=5, pady=2) self.progress_bar = ttk.Progressbar(progress_frame, orient='horizontal', length=700, mode='determinate') self.progress_bar.pack(fill=tk.X, padx=5, pady=5) # 日志框架 log_frame = ttk.LabelFrame(main_frame, text="操作日志") log_frame.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) self.log_text = tk.Text(log_frame, height=15) scrollbar = ttk.Scrollbar(log_frame, command=self.log_text.yview) self.log_text.configure(yscrollcommand=scrollbar.set) scrollbar.pack(side=tk.RIGHT, fill=tk.Y) self.log_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True) self.log_text.config(state=tk.DISABLED) # 状态栏 status_bar = ttk.Frame(self) status_bar.pack(side
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值