[Repost] Exception in thread "main" java.lang.NoClassDefFoundError: welcome

This article examines the NoClassDefFoundError that appears when compiling and running a Java program under Red Hat Linux AS4 with JDK 1.5.0. By walking through the compilation and execution of the sample program welcome.java, it traces the root cause to a misconfigured CLASSPATH environment variable and gives the correct way to set it.

Why does the error "Exception in thread "main" java.lang.NoClassDefFoundError: welcome" occur?
Saturday, June 9, 2007, 21:26

Environment: Red Hat Linux AS4, J2SDK 1.5.0
The contents of welcome.java are as follows:

public class welcome
{
         public static void main(String [] args)
         {
                 System.out.print("welcome.\n");
                 return;
         }
}

Compile: javac welcome.java
This succeeds.

Run: java welcome
This fails with: Exception in thread "main" java.lang.NoClassDefFoundError: welcome

I eventually found the cause. Unlike C++, where class definitions are compiled and linked into the binary itself, Java resolves classes at run time: when you run java welcome, the launcher does not simply read welcome.class from wherever it happens to sit; it searches the entries of the CLASSPATH environment variable for a class named welcome. My CLASSPATH at the time was:
CLASSPATH=/usr/java/jdk1.5.0_11/lib:/usr/java/jdk1.5.0_11/jre/lib
Neither entry covers the current directory, so the launcher could not find the definition of the welcome class and reported the error.
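
To see exactly what the launcher searches, a minimal sketch can print the effective class path (ShowClasspath.java is my own illustrative file name, not part of the original post):

public class ShowClasspath
{
        public static void main(String [] args)
        {
                // ShowClasspath.java: illustrative helper, not from the original post.
                // java.class.path is the class path the launcher actually uses;
                // when CLASSPATH is unset, it defaults to "." (the current directory).
                System.out.println(System.getProperty("java.class.path"));
        }
}

If the printed path contains no "." entry, classes compiled into the current directory will never be found, which is precisely the situation above.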

Having worked this out, I immediately set about verifying it. First I reset the environment variable:
CLASSPATH=/usr/java/jdk1.5.0_11/lib:/usr/java/jdk1.5.0_11/jre/lib:.
Note the colon and dot appended at the end: the colon is the separator between class path entries, and the dot tells the java launcher to also look for welcome.class in the current directory.

Everything works!
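
Two practical notes, offered as general usage rather than anything from the original post: a plain assignment is only visible to programs started by the current shell if it is exported, and the -cp flag can supply a class path for a single run, overriding CLASSPATH entirely:

# Export so that child processes such as java see the new value
# (add the line to ~/.bash_profile to make it permanent):
export CLASSPATH=/usr/java/jdk1.5.0_11/lib:/usr/java/jdk1.5.0_11/jre/lib:.
java welcome

# Or bypass CLASSPATH for one invocation:
java -cp . welcome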

class VisionParser(RAGFlowPdfParser):
    def __init__(self, vision_model, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.vision_model = vision_model

    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
        try:
            with sys.modules[LOCK_KEY_pdfplumber]:
                self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
                self.page_images = [p.to_image(resolution=72 * zoomin).annotated
                                    for i, p in enumerate(self.pdf.pages[page_from:page_to])]
                self.total_page = len(self.pdf.pages)
        except Exception:
            self.page_images = None
            self.total_page = 0
            logging.exception("VisionParser __images__")

    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        callback = kwargs.get("callback", lambda prog, msg: None)
        self.__images__(fnm=filename, zoomin=3, page_from=from_page, page_to=to_page, **kwargs)

        total_pdf_pages = self.total_page
        start_page = max(0, from_page)
        end_page = min(to_page, total_pdf_pages)

        all_docs = []
        for idx, img_binary in enumerate(self.page_images or []):
            pdf_page_num = idx  # 0-based
            if pdf_page_num < start_page or pdf_page_num >= end_page:
                continue
            docs = picture_vision_llm_chunk(
                binary=img_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_describe_prompt(page=pdf_page_num + 1),
                callback=callback,
            )
            if docs:
                all_docs.append(docs)
        return [(doc, "") for doc in all_docs], []


if __name__ == "__main__":
    pass

What about changing it to this approach instead:

import fitz  # PyMuPDF

def extract_vector_images(pdf_path):
    doc = fitz.open(pdf_path)
    image_data = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            if base_image["ext"] == "svg":  # vector image detection
                svg_data = base_image["image"]
                image_data.append({
                    "page": page_num,
                    "type": "vector",
                    "data": svg_data
                })
    return image_data
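One caveat on that proposal, to the best of my knowledge of the PyMuPDF API: page.get_images() only lists embedded raster image XObjects, and doc.extract_image() returns raster formats (typical "ext" values are "png", "jpeg", or "jpx"), never "svg", so the vector branch above would never trigger and vector graphics would be silently missed. If the goal really is to capture vector drawings, page.get_drawings() is the more likely tool. Below is a minimal sketch under that assumption; the function name extract_vector_drawings and the zoom parameter are mine, not part of the original code:

import fitz  # PyMuPDF

def extract_vector_drawings(pdf_path, zoom=3):
    # Collect vector path drawings page by page and rasterize each
    # drawing's bounding box to PNG bytes for downstream use.
    doc = fitz.open(pdf_path)
    drawings = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        for d in page.get_drawings():  # vector paths: lines, curves, fills
            rect = d["rect"]           # bounding box of this drawing
            if rect.is_empty:
                continue
            # Render only this region, scaled up by `zoom` for legibility
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=rect)
            drawings.append({
                "page": page_num,
                "type": "vector",
                "bbox": tuple(rect),
                "png": pix.tobytes("png"),
            })
    doc.close()
    return drawings

Note that get_drawings() also reports decorative strokes such as table rules and underlines, so in practice you would probably want to filter by the area of rect before keeping a drawing.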