WIN32_FIND_DATA 循环获取文件大小BUG

在使用WIN32_FIND_DATA遍历文件时遇到一个问题,部分正常大小的图片文件被错误地读为0KB。通过检查nFileSizeLow和nFileSizeHigh字段,发现在某些情况下这两个值为0,导致文件误判。解决方案是在判断文件大小时,对nFileSizeLow为0的情况单独处理,避免将正常图片移到错误目录。

今天在调试程序时发现一个 WIN32_FIND_DATA 的BUG,在循环读取一个文件夹下的图片文件时,发现结构体中 nFileSizeLow 和 nFileSizeHigh 值 == 0的情况,即能获取到文件名,但读出的文件大小为0的尴尬情况。但事实上,图片文件是正常的,大小从80K~170K的都有,比较诡异,与各位分享一下。

   typedef struct _WIN32_FIND_DATAW {
    DWORD dwFileAttributes;
    FILETIME ftCreationTime;
    FILETIME ftLastAccessTime;
    FILETIME ftLastWriteTime;
   
DWORD nFileSizeHigh;
    DWORD nFileSizeLow;

    DWORD dwReserved0;
    DWORD dwReserved1;
    WCHAR  cFileName[ MAX_PATH ];
    WCHAR  cAlternateFileName[ 14 ];
#ifdef _MAC
    DWORD dwFileType;
    DWORD dwCreatorType;
    WORD  wFinderFlags;
#endif
} WIN32_FIND_DATAW, *PWIN32_FIND_DATAW, *LPWIN32_FIND_DATAW;

""" @File : excel.py @Author : 董根虎 @Date : 2024-10-29 @Desc : 文档对象模型的excel格式的实现 """ import logging import os import traceback from base64 import b64encode from copy import deepcopy from io import BytesIO from typing import List, Dict, Tuple, Any import webcolors import re import pythoncom from openpyxl.cell import Cell from openpyxl.cell.rich_text import CellRichText, TextBlock from openpyxl.cell.text import InlineFont from openpyxl.utils import coordinate_to_tuple from openpyxl.workbook import Workbook from kotei_omp.common.base import WIN32_LOCK, get_cell_value, get_start_end_of_table, get_cell_info from kotei_omp.common.excel import get_excel_shapes_win32com, \ handle_table_cell, calculate_overlap_percentage, handle_shape_in_merged_cell, \ ensure_merge_content, get_end_desc from kotei_omp.data import PictureObject, TextObject, GraphicObject, LayoutObject, RunObject from kotei_omp.data.position import Position from kotei_omp.data.style import FontStyleObject, StyleObject # 执行patch脚本, 修复三方库openpyxl的问题 from kotei_omp.utils.python_patch import fix_openpyxl_descriptors_base_bug fix_openpyxl_descriptors_base_bug() from openpyxl.styles.colors import COLOR_INDEX, WHITE, BLACK from kotei_omp.common.theme_color import ThemeColor from kotei_omp.data.block import DocumentBlockObject, CHANGE_RESUME from kotei_omp.data.border import CellBorderObject, BorderObject from kotei_omp.data.table import TableObject, RowObject, CellObject from kotei_omp.interface_parser import ParserInterface from zipfile import ZipFile from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing from openpyxl.xml.functions import fromstring from openpyxl.worksheet.worksheet import Worksheet from openpyxl.packaging.relationship import get_rels_path, get_dependents, RelationshipList import base64 from PIL import Image, ImageGrab import imagehash # Excel 索引颜色到十六进制颜色值的映射表 INDEXED_COLORS = { 0: "#000000", # 黑色 1: "#FFFFFF", # 白色 2: "#FF0000", # 红色 3: "#00FF00", # 绿色 4: "#0000FF", # 蓝色 5: "#FFFF00", # 黄色 6: "#FF00FF", # 洋红色 7: "#00FFFF", # 青色 8: "#800000", # 深红色 9: "#008000", # 深绿色 10: "#000080", # 深蓝色 11: "#808000", # 深黄色 12: "#800080", # 深洋红色 13: "#008080", # 深青色 14: "#C0C0C0", # 浅灰色 15: "#808080", # 深灰色 64: "#FFFF00", # 黄色(索引 64 是 Excel 的特殊值) } def parse_sheet_images(file_path: str, sheet: Worksheet): """ 从 Excel 工作表中解析嵌入的图片对象。 该函数读取指定 Excel 文件中的特定工作表,解析其中的图片对象, 提取图片的元数据和二进制数据,并返回以图片名称和ID为键的字典。 Args: file_path (str): Excel 文件的路径。 sheet (Worksheet): openpyxl 的工作表对象,需要从中提取图片。 Returns: dict: 包含所有图片信息的字典,键为 "图片名###图片ID" 格式,值为包含以下字段的字典: - "type": 图片类型(固定为 "picture_shape") - "sheet_name": 所在工作表名称 - "id": 图片ID - "name": 图片名称 - "data": 图片的base64编码数据 - "hash": 图片的哈希值 异常处理: 如果在解析过程中发生异常,会记录警告和错误日志,并返回空字典。 """ archive = ZipFile(file_path, 'r') pictures = {} try: drawings = sheet._rels.find(SpreadsheetDrawing._rel_type) for rel in drawings: path = rel.target src = archive.read(path) tree = fromstring(src) drawing_rel_path = get_rels_path(rel.target) # 检查drawing_rel_path是否存在 if drawing_rel_path not in archive.namelist(): logging.warning(f"Drawing relationship file {drawing_rel_path} not found in archive") continue drawing_rels = get_dependents(archive, drawing_rel_path) # 定义命名空间 ns = { 'xdr': 'http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing', 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' } images = {} for item in drawing_rels: images[item.Id] = item.Target # 遍历所有子元素 for anchor in tree: # 处理 twoCellAnchor, oneCellAnchor 等锚点元素 if anchor.tag.endswith('}twoCellAnchor') or anchor.tag.endswith( '}oneCellAnchor') or anchor.tag.endswith('}absoluteAnchor'): # 使用命名空间查找元素 picture = { "type": "picture_shape", "sheet_name": sheet.title } # 查找图片元素 pic_elem = anchor.find('xdr:pic', ns) if pic_elem is not None: # 获取图片的 id 和 name cNvPr_elem = pic_elem.find('xdr:nvPicPr/xdr:cNvPr', ns) if cNvPr_elem is not None: pic_id = cNvPr_elem.get('id') pic_name = cNvPr_elem.get('name') picture["id"] = pic_id picture["name"] = pic_name # 获取 r:embed 属性值 blip_elem = pic_elem.find('xdr:blipFill/a:blip', ns) if blip_elem is not None: r_embed = blip_elem.get( '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') image_path = images[r_embed] image_res = read_image_info(archive, image_path) if image_res is None: continue picture["data"] = image_res["data"] picture["hash"] = image_res["hash"] pictures[pic_name + '###' + pic_id] = picture return pictures except Exception as e: logging.warning(traceback.format_exc()) logging.error(f"Error reading drawing relationships for sheet {sheet.title}: {e}") def get_image_hash(img): """ 计算图片的哈希值,用于图片去重和比较。 使用 imagehash 库的 average_hash 算法计算图片的感知哈希值, 可用于比较两张图片是否相似或相同。 Args: img (PIL.Image): PIL 图片对象。 Returns: tuple: 包含两个元素的元组: - str: 图片哈希值的字符串表示 - list: 图片哈希值的二维数组表示(用于详细比较) """ img_hash = imagehash.average_hash(img) return str(img_hash), img_hash.hash.tolist() def read_image_info(archive, image_path): """ 从 Excel 文件中读取图片并返回详细信息 :param archive: ZipFile 对象 :param image_path: 图片在 Excel 中的路径 :return: 包含 base64、哈希值等信息的字典 """ try: # 从 archive 中读取图片数据 image_data = archive.read(image_path) # 通过文件头进一步验证格式 if len(image_data) >= 4: header = image_data[:10] is_jpg = header.startswith(b'\xff\xd8') is_png = header.startswith(b'\x89PNG\r\n\x1a\n') if not (is_jpg or is_png): logging.info(f"根据文件头检测跳过非 JPG/PNG 格式图片: {image_path}") return None # 转换为 base64 image_base64 = base64.b64encode(image_data).decode('utf-8') # 使用 PIL 打开图片获取尺寸信息 image = Image.open(BytesIO(image_data)) # 最终验证格式 if image.format not in ['JPEG', 'PNG']: logging.info(f"图片实际格式不是 JPEG/PNG: {image_path}, 实际格式: {image.format}") return None image_hash = get_image_hash(image) width, height = image.size return { "data": image_base64, "hash": image_hash[0], "width": width, "height": height, "size": len(image_data) } except Exception as e: logging.error(f"读取图片失败: {image_path}, 错误: {e}") return None class ExcelStandardParser(ParserInterface): """excel文档解析器""" def __init__(self): self._WHITE = "#" + WHITE[2:] self._BLACK = "#" + BLACK[2:] self._THEME_COLORS = [] self._HEAD_BORDER_STYLE = ['medium', 'double'] # 粗线,双横线 self._file_path = "" self._cell_bg_colors = {} self._merged_cells_cache: Dict[Tuple[int, int], CellObject] = {} self._openpyxl_wb: Workbook = None self._win32com_wb = None self._excel_app = None # cell 原始的border信息,没有修改 self._cell_border: Dict[Tuple[int, int], CellBorderObject] = {} def _read_document(self, file_path): """读取文档,将文件数据加载到内存。构建对象模型的接口函数 :param file_path: 文档的路径 :return: 返回通用文档“句柄”self """ # 判断文件是否存在 self._THEME_COLORS = ThemeColor().get_theme_colors(self._openpyxl_wb) self._file_path = file_path def parse_document(self, file_path) -> (list[DocumentBlockObject]): """ 解析文档 """ workbook = self._openpyxl_wb self._file_path = file_path sheet_pictures = {} for name in workbook.sheetnames: pictures = parse_sheet_images(self._file_path, workbook[name]) if pictures: sheet_pictures[name] = pictures logging.info(f"sheet {name} has {len(pictures)} pictures") # 使用win32com解析图形+图片 shapes = self._parse_shapes(self._file_path, exclude_sheet_pictures=sheet_pictures) # 多线程异步处理多个sheet # with ThreadPoolExecutor() as executor: # # 遍历excel中所有的sheet # futures = [executor.submit(self._parse_sheet, workbook[n], shapes) for n in workbook.sheetnames # if (not demo_config_sheet or n in demo_config_sheet)] # blocks = [future.result() for future in futures if future.result()] blocks = [] for name in workbook.sheetnames: if name in CHANGE_RESUME: continue # Sheet 隐藏或者特别隐藏 if workbook[name].sheet_state in ['hidden', 'veryHidden']: continue sheet = workbook[name] # 跳过空的sheet if sheet.max_row == 1 and sheet.max_column == 1: block = DocumentBlockObject() block._name = sheet.title block.file_name = os.path.basename(self._file_path) blocks.append(block) continue # TODO 找到 shape 里面和 cell 重叠的部分 sheet_shapes = [shape for shape in shapes if shape["sheet_name"] == name] block = self._parse_sheet(sheet, sheet_shapes) # handle_positions(block, total_width, total_height) blocks.append(block) # print(block) # print() # print(blocks) # print() return blocks def _get_table_cell_info(self, sheet, row_index, col_index, sheet_max_row, sheet_max_col, ws_with_pywin32) -> CellObject: """ 构建 单元格对象 :param sheet: 当前解析的Worksheet对象 :param row_index: excel单元格行坐标 :param col_index: excel单元格列坐标 :return: """ def get_border(cell: Cell) -> CellBorderObject: cell_border = deepcopy(self.get_cell_border(cell_)) if not cell_border.border_top.border_style and row_index > 1: new_border = self.get_cell_border(sheet.cell(row_index - 1, col_index)) cell_border.border_top.border_style = new_border.border_bottom.border_style if not cell_border.border_bottom.border_style and row_index < sheet_max_row: new_border = self.get_cell_border(sheet.cell(row_index + 1, col_index)) cell_border.border_bottom.border_style = new_border.border_top.border_style if not cell_border.border_left and col_index > 1: new_border = self.get_cell_border(sheet.cell(row_index + 1, col_index)) cell_border.border_left.border_style = new_border.border_right.border_style if not cell_border.border_right and col_index < sheet_max_col: new_border = self.get_cell_border(sheet.cell(row_index + 1, col_index)) cell_border.border_right.border_style = new_border.border_left.border_style return cell_border cell_ = sheet.cell(row_index, col_index) is_merged = False if (row_index, col_index) in self._merged_cells_cache: is_merged = True # 获取单元格的信息 v = get_cell_info(cell_, sheet, in_table=False, is_merged=is_merged) # 单元格边框对象 b = get_border(cell_) # 通用模型对象 c = CellObject() c.text = v.get("content", "") c.comment.text = v.get("comment", "") if hasattr(cell_, "number_format") and '0' in cell_.number_format: c.number_format = cell_.number_format c._border = b c.coordinate.desc = cell_.coordinate current_text = TextObject() cell_index = (row_index, col_index) # 合并单元格的border取的是合并范围后的边框,不是单个单元格的边框 if cell_index in self._merged_cells_cache: cache_cell = self._merged_cells_cache[cell_index] # 开始的cell存储merge的border信息 if cache_cell._is_merged_start: cache_cell.border = deepcopy(b) c.style = cache_cell.style c.merged_ranges = cache_cell.merged_ranges run_objs = cache_cell.content[0] current_text._style = run_objs[0].style if run_objs else None current_text._runs = run_objs else: c.style = self._get_text_style(cell_.font) bg_color = self._get_cell_bg_color(cell_, ws_with_pywin32) if bg_color: c.style.background_color = bg_color if cell_.fill.patternType: c.style.background_style = cell_.fill.patternType else: c.style.background_style = 'solid' # 构建一个文本对象, 放在单元格对象中 run_objs = self._get_text_cell(cell_, ws_with_pywin32) current_text._style = run_objs[0].style if run_objs else None current_text._runs = run_objs current_text._text = c.text current_text.coordinate.desc = cell_.coordinate c.content.append(current_text) # print(f"Old Text: {c.text} Old style: {c.style}") return c # 合并单元格的边框 def _get_merged_table_cell_info(self, sheet, ws_with_pywin32): merged_cells_dict = {} for merged_range in sheet.merged_cells.ranges: # 获取合并范围的边界 start_col, start_row, end_col, end_row = list(merged_range.bounds) # 获取起始单元格对象 start_cell = sheet.cell(row=start_row, column=start_col) # 构造起始单元格的 CellObject start_cell_obj = CellObject() start_cell_obj.merged_ranges = [start_row, start_col, end_row, end_col] run_objs = self._get_text_cell(start_cell, ws_with_pywin32) # 暂时将runs存到 content 中 start_cell_obj.content.append(run_objs) # 提取样式信息 start_cell_obj.style = self._get_text_style(start_cell.font) bg_color = self._get_cell_bg_color(start_cell, ws_with_pywin32) if bg_color: start_cell_obj.style.background_color = bg_color start_cell_obj.style.background_style = start_cell.fill.patternType or 'solid' start_cell_obj._is_merged = True start_cell_obj._is_merged_start = True start_cell_border = self.get_cell_border(start_cell) # 将起始单元格的 CellObject 放入缓存 merged_cells_dict[(start_row, start_col)] = start_cell_obj # 将范围内其他单元格也存入缓存,直接使用起始单元格的样式 for row in range(start_row, end_row + 1): for col in range(start_col, end_col + 1): top = deepcopy(start_cell_border.border_top) bottom = deepcopy(start_cell_border.border_bottom) left = deepcopy(start_cell_border.border_left) right = deepcopy(start_cell_border.border_right) empty_border = BorderObject() if (row, col) == (start_row, start_col): start_cell_obj.border.border_top = empty_border start_cell_obj.border.border_right = empty_border start_cell_obj.border.border_left = empty_border start_cell_obj.border.border_bottom = empty_border if row == start_row: start_cell_obj.border.border_top = top if row == end_row: start_cell_obj.border.border_bottom = bottom if col == start_col: start_cell_obj.border.border_left = left if col == end_col: start_cell_obj.border.border_right = right self._cell_border[(row, col)] = start_cell_obj.border continue # 跳过起始单元格 cell_obj = CellObject() cell_obj._is_merged = True cell_obj._is_merged_start = False cell_obj.style = start_cell_obj.style # 使用起始单元格的样式 cell_obj.content = start_cell_obj.content cell_obj.merged_ranges = start_cell_obj.merged_ranges merged_cells_dict[(row, col)] = cell_obj # print(merged_cells_dict) self._merged_cells_cache = merged_cells_dict # 更新类的缓存 def _parse_table(self, sheet, block, table_range_list, ws_with_pywin32, shapes) -> (List[TableObject]): """ 解析文档对象的表格,组织成通用结构的对象结构。 :param sheet: Worksheet对象 :param block: DocumentBlockObject :param table_range_list: 表格的对角坐标范围 [((3, 2), (15, 13))] :return: """ # 统计合并列,只需要保留每行的第一列 # col_merge_dic: Dict[int, List[int]] = {} # for merged_range in sheet.merged_cells.ranges: # min_col = merged_range.min_col # max_col = merged_range.max_col # min_row = merged_range.min_row # max_row = merged_range.max_row # if max_col > min_col: # for row in range(min_row, max_row + 1): # if row not in col_merge_dic: # col_merge_dic[row] = [] # ignore_col = list(range(min_col+1, max_col+1)) # col_merge_dic[row].extend(ignore_col) # print(f"old: {table_range_list}") tables = [] sheet_max_row = sheet.max_row sheet_max_col = sheet.max_column self._get_merged_table_cell_info(sheet, ws_with_pywin32) for i in range(len(table_range_list)): logging.info(f"parsing table index: {i}") table_range = table_range_list[i] # 提取一个表格的坐标范围 (min_row, min_col), (max_row, max_col) = table_range # 构建表格对象 t = TableObject() for row_index in range(min_row, max_row + 1): cells_ = [] # 构建表格行对象 r = RowObject() r.data_id = row_index # real_col_index = 0 for col_index in range(min_col, max_col + 1): # ignore_col = col_merge_dic.get(row_index, []) # if col_index in ignore_col: # continue # 构建表格的单元格对象 c = self._get_table_cell_info(sheet, row_index, col_index, sheet_max_row, sheet_max_col, ws_with_pywin32) # 这里图片会放到合并单元格左上角cell的content中 for shape in shapes: row = shape["from_row"] col = shape["from_col"] if row == row_index and col == col_index and shape["sheet_name"] == sheet.title and shape["in_cell"]: obj = PictureObject() self._shape_to_object(shape, obj) shape["in_table"] = True c.content.append(obj) c.layout = self._get_layout(r) # 修正单元格的坐标为当前表格的行、列索引 c.row_index = row_index - min_row c.col_index = col_index - min_col c.data_id = row_index * 10 + col_index if hasattr(c, "merged_ranges") and c.merged_ranges: c.merged_ranges = [c.merged_ranges[0] - min_row, c.merged_ranges[1] - min_col, c.merged_ranges[2] - min_row, c.merged_ranges[3] - min_col, ] # logging.info( # f"merged_ranges1 {c.coordinate.desc} ranges: {c.merged_ranges} {c.relative_merged_ranges} ") cells_.append(c) r._layout = self._get_layout(t) r.row_index = row_index - min_row r.cells = cells_ t.rows.append(r) logging.debug(f"parse_table parse_row sheet_name: {sheet.title} row_index: {row_index}") t.layout = self._get_layout(block) if t.rows: t.coordinate.desc = t.rows[0].cells[0].coordinate.desc # 表格的最后一个单元格坐标 last_row = t.rows[len(t.rows) - 1] t._last_coordinate = last_row.cells[len(last_row.cells) - 1].coordinate.desc logging.debug(f"parse_table end sheet_name: {sheet.title} table_index: {i}") t = handle_table_cell(t) t = ensure_merge_content(t) tables.append(t) # 处理表格的表头 # with ThreadPoolExecutor() as executor: # futures = [executor.submit(parse_table_head, t) for t in tables] # concurrent.futures.wait(futures) # print(tables) return tables @staticmethod def _get_layout(parent_ref): """ 构建 layout对象 :param parent_ref: :return: """ layout = LayoutObject() layout._parent_ref = parent_ref return layout def _parse_picture(self, sheet_name, block, shapes) -> (List[PictureObject]): """ 解析文档对象的图片,组织成通用结构的对象结构 :param sheet_name: sheet名称 :param block: :param shapes: 图形列表 :return: 图片对象 """ results = [] for shape in shapes: if shape["sheet_name"] == sheet_name and shape["shape_type"] == 13 and not 'in_table' in shape: if shape["position"].width < 1 or shape["position"].height < 1: continue # 构建图形对象 g = PictureObject() self._shape_to_object(shape, g) g._layout = self._get_layout(block) results.append(g) return results def _parse_graphic(self, sheet_name, block, shapes) -> (List[GraphicObject]): """ 解析文档对象的图形,组织成通用结构的对象结构 :param sheet_name: sheet名称 :param block: :param shapes: 图形列表 :return: 图形对象 """ results = [] for shape in shapes: if (shape["sheet_name"] == sheet_name and not 'in_table' in shape # 图片对象 and shape["shape_type"] != 13 # 文本框对象 # and shape["shape_type"] != 17 ): if shape["position"].width < 1 or shape["position"].height < 1: continue # 构建图形对象 g = GraphicObject() self._shape_to_object(shape, g) g._layout = self._get_layout(block) # 赋值shape_type g.shape_type = int(shape["shape_type"]) # 填充色 fill_color = shape.get("fillcolor").split()[0] if shape.get("fillcolor") else None if fill_color is not None and re.match(r'^#[0-9A-Fa-f]{6}$', fill_color): g.style.background_color = str(fill_color) elif fill_color is not None: try: fill_color = webcolors.name_to_hex(fill_color) g.style.background_color = str(fill_color) except Exception as e: logging.error(f"webcolors name to hex error-{fill_color}, {e}") g._text = shape.get("content", "") g._graphic_type = shape["type"] results.append(g) return results def _parse_text(self, sheet, block, table_range_list, out_table_cells, shapes, ws_with_pywin32) -> (List[TextObject]): """ 解析文档对象的文本,连续单元格作为一个TextObject对象 :param sheet: :param block: :param table_range_list: 表格对象的范围 :param out_table_cells: 表格外对象,跟表格的范围可能存在重复,需要先进行清晰 :param shapes: shapes集合, 从shapes中查找文本框对象 :return: 文本对象 """ # return self._parse_text_single_cell(sheet, table_range_list, out_table_cells) # 按照excel的自然行列排序 sorted_out_table_cells = sorted(out_table_cells, key=lambda cell: (cell[0], cell[1])) texts: List[TextObject] = [] visited = set() for text_cell in sorted_out_table_cells: row, col = text_cell if text_cell in visited: continue visited.add(text_cell) # 当前单元格在表格范围内,则跳过 if self._cell_in_table(*text_cell, table_range_list): continue cell_ = sheet.cell(*text_cell) # 获取单元格中的内容 v = get_cell_value(cell_, sheet, False) # 单元格内容为空, 则跳过 if v is None or (isinstance(v, str) and v.strip() == ""): continue # 从单元格中解析出RunObject run_objs = self._get_text_cell(cell_, ws_with_pywin32) # 创建一个新的 TextObject current_text = TextObject() current_text._layout = self._get_layout(block) current_text._text = str(v) current_text._style = run_objs[0].style if run_objs else None current_text._runs = run_objs current_text.coordinate.desc = cell_.coordinate neighbors = self._get_neighbors(text_cell, sorted_out_table_cells, table_range_list) queue = neighbors while len(queue) > 0: current_index = queue.pop(0) if current_index in visited: continue visited.add(current_index) cell_ = sheet.cell(*current_index) v = get_cell_value(cell_, sheet, False) if v is None or (isinstance(v, str) and v.strip() == ""): continue run_objs = self._get_text_cell(cell_, ws_with_pywin32) current_text.text += "\n" + str(v) current_text.runs.extend(run_objs) neighbors = self._get_neighbors(current_index, sorted_out_table_cells, table_range_list) queue.extend(neighbors) texts.append(current_text) return texts def _get_neighbors(self, cell_index, sorted_out_table_cells, table_range_list): current_row, current_col = cell_index directions = [(-1, 0), (1, 0), (0, -1), (0, 1)] neighbors = [] for dr, dc in directions: neighbor_row = current_row + dr neighbor_col = current_col + dc neighbor_cell = (neighbor_row, neighbor_col) if neighbor_cell in table_range_list or neighbor_cell not in sorted_out_table_cells: continue neighbors.append(neighbor_cell) return neighbors def _parse_text_single_cell(self, sheet, block, table_range_list, out_table_cells, ws_with_pywin32) -> (List[TextObject]): """ 解析文档对象的文本,以单元格为单位, 一个单元格一个文本对象 :param table_range_list: 表格对象的范围 :param out_table_cells: 表格外对象,跟表格的范围可能存在重复,需要先进行清晰 :return: 文本对象 """ texts = [] sorted_out_table_cells = sorted(out_table_cells, key=lambda cell: (cell[0], cell[1])) for text_cell in sorted_out_table_cells: (row, col) = text_cell # 当前单元格在表格范围内,则跳过 if self._cell_in_table(row, col, table_range_list): continue cell_ = sheet.cell(row, col) v = get_cell_value(cell_, sheet, False) # 单元格内容为空, 则跳过 if not v: continue # excel中的单元格文本组成 run_objs = self._get_text_cell(cell_, ws_with_pywin32) # 构建文本对象 t = TextObject() t._text = v t._style = run_objs[0].style if run_objs else None t._runs = run_objs t.coordinate.desc = cell_.coordinate t.layout.parent_ref = block texts.append(t) return texts def _cell_in_table(self, row, col, table_range_list) -> bool: """ 判断单元格是否在表格范围内 :param row: 当前单元格坐标 :param col: :param table_range_list: 表格范围 :return: True: 在表格范围内, False: 不在表格范围内 """ for table_range in table_range_list: # 提取一个表格的坐标范围 (min_row, min_col), (max_row, max_col) = table_range if min_row <= row <= max_row and min_col <= col <= max_col: return True return False def _parse_shapes(self, file_path, exclude_type_list=[4], exclude_sheet_pictures={}): """ 使用win32com解析图形+图片 :param file_path: :return: """ with WIN32_LOCK: shapes_json = get_excel_shapes_win32com(self._win32com_wb, exclude_type_list, exclude_sheet_pictures) return shapes_json def _get_text_cell(self, cell_, ws_with_pywin32) -> (List[RunObject]): """ 解析excel中一个单元格的文本,解析为RunObject :param cell_: openpyxl.cell.cell.Cell :return: 返回 RunObject 列表 """ # excel中cell_.value 类型常见取值: str TextBlock CellRichText # 获取字体信息 run_objs = [] font = cell_.font value = cell_.value bg_color = self._get_cell_bg_color(cell_, ws_with_pywin32) if isinstance(value, TextBlock): style_obj = self._get_text_style(cell_.value.font) style_obj.background_color = bg_color r = RunObject() r.coordinate.desc = cell_.coordinate if cell_.value.text: r._text = str(cell_.value.text) r._style = style_obj run_objs.append(r) elif isinstance(value, CellRichText): # CellRichText 文本片段循环处理 for item in value: item_font = font if isinstance(item, str) else item.font style_obj = self._get_text_style(item_font) style_obj.background_color = bg_color r = RunObject() r.coordinate.desc = cell_.coordinate if item and isinstance(item, str): r._text = item elif item.text: r._text = str(item.text) r._style = style_obj run_objs.append(r) else: style_obj = self._get_text_style(font) style_obj.background_color = bg_color r = RunObject() if value or isinstance(value, int): r._text = str(value) r.coordinate.desc = cell_.coordinate r._style = style_obj run_objs.append(r) return run_objs def _get_text_style(self, font): """ 解析文本样式 :param font: :return: """ style_obj = StyleObject() style_obj._font_size = font.size if font.color: style_obj._font_color = self._get_rgb_color(font.color) elif font.color is None: style_obj._font_color = "#000000" style_obj._background_color = '' # 字体样式,如粗体、斜体、下划线、删除线 font_style_ = FontStyleObject() if font.b: font_style_.bold = True # 粗体 if font.i: font_style_.italic = True # 斜体 if font.u: font_style_.underline = True # 下划线 if font.strike: font_style_.strikeout = True # 删除线 style_obj.font_style = font_style_ # excel中font行内文本样式: InlineFont if isinstance(font, InlineFont): style_obj._font_family = font.rFont else: style_obj._font_family = font.name return style_obj @staticmethod def _find_text_object(current_cell, texts): """检查当前单元格是否连续, 并返回TextObject对象 """ if not texts: return None (curr_row, curr_col) = current_cell return next( (text_ for text_ in texts for prev_row, prev_col in text_._cell_list if (curr_row == prev_row and curr_col == prev_col + 1) or (curr_row == prev_row + 1 and curr_col == prev_col)), None ) @staticmethod def _shape_to_object(shape, obj): """ shape json对象转换为对象模型 :param shape: :param obj: :return: """ obj._id = shape["id"] obj._name = shape["name"] obj._width = shape["width"] obj._height = shape["height"] obj.px_width = obj._width obj.px_height = obj._height obj._in_cell = shape.get("in_cell", True) obj._data = shape.get("data", "") obj._digest = shape.get("hash", "") obj.coordinate.desc = shape["index"] obj.to_coordinate.desc = shape["to_index"] obj.coordinate.top = str(shape["top"]) # 坐标:距离顶边距离 obj.coordinate.left = str(shape["left"]) # 坐标:距离左边距离 obj._position = shape["position"] @staticmethod def index_color_to_hex(index: int) -> str: """ 索引颜色值转为十六进制颜色值 """ return INDEXED_COLORS.get(index, "#FFFFFF") @staticmethod def int_color_to_hex(val: int) -> str: """ 整形颜色值转为十六进制 """ assert type(val) is int, "function 'int_color_to_hex' need int param." r = val & 0xFF g = (val >> 8) & 0xFF b = (val >> 16) & 0xFF return "#{:02X}{:02X}{:02X}".format(r, g, b) def _get_cell_bg_color(self, cell_: Cell, ws_with_pywin32): """ 获取单元格背景色 https://openpyxl.readthedocs.io/en/latest/_modules/openpyxl/styles/colors.html :param cell_: openpyxl.cell.cell.Cell对象 :return: 十六进制的颜色值 """ index = ws_with_pywin32.Name + cell_.coordinate if index in self._cell_bg_colors: return self._cell_bg_colors[index] cell = ws_with_pywin32.Range(cell_.coordinate) color = self.int_color_to_hex(int(cell.Interior.Color)) self._cell_bg_colors[index] = color return color # 判定是否有显式设置背景色 # if cell_.fill.patternType is not None: # # 有背景色 # fg_color_idx = cell_.fill.fgColor.index # if isinstance(fg_color_idx, int) and fg_color_idx == 0: # # 定制白色背景色 # return "#FFFFFF" # elif isinstance(fg_color_idx, int): # return self.index_color_to_hex(fg_color_idx) # elif isinstance(fg_color_idx, str): # return "#" + fg_color_idx[2:] # # # 未显式设置背景色 # return "#FFFFFF" # color_ = self._get_rgb_color(cell_.fill.bgColor) # if color_ in ("", self._BLACK, self._WHITE): # # 如果为默认颜色,则再从cell_.fill.bgColor读取一次 # color_ = self._get_rgb_color(cell_.fill.fgColor) # return color_ def _get_rgb_color(self, color): """ openpyxl的颜色对象解析对于的rgb颜色值 :param color: :return: """ if not color: return "" type_ = color.type value_ = color.value if type_ == 'rgb': return "#" + value_[2:] if len(value_) == 8 else "#" + value_ if type_ == 'indexed': # 索引颜色 return self._get_color_by_index(value_) if type_ == 'theme': # 主题颜色 if color.theme >= len(self._THEME_COLORS): return "" return "#" + ThemeColor().theme_and_tint_to_rgb(self._THEME_COLORS, color.theme, color.tint) return "" def _get_color_by_index(self, value): """ 拼接颜色属性 :param value: :return: """ return self._BLACK if value >= len(COLOR_INDEX) else "#" + COLOR_INDEX[value][2:] def _get_openpyxl_cell_bg_color(self, cell: Cell): index = cell.coordinate if index in self._cell_bg_colors: return self._cell_bg_colors[index] # 无填充 color_str = "#FFFFFF" if cell.fill.patternType is not None: color = cell.fill.bgColor # 64 是系统保留的前景色 if color.type == "indexed" and color.value == 64: color_str = self._get_rgb_color(cell.fill.fgColor) else: # openpyxl 不支持 lightUp 的解析方式,背景色是黑色,但是实际应该是白色,所以单独处理这种情况 if cell.fill.patternType=="lightUp" and color.type == "rgb" and color.value == "00000000": color_str = "#FFFFFF" else: color_str = self._get_rgb_color(color) # 这里背景颜色可能错误的提取为黑色 000000,导致背景颜色提取出错,如果提取为黑色,则尝试使用 fgCorlor # 如果后面还是出现背景颜色提取错误的问题,建议直接使用 _get_cell_bg_color 方法,这个方法准确度很高,确定是会牺牲性能 if color_str == self._BLACK: color_str = self._get_rgb_color(cell.fill.fgColor) self._cell_bg_colors[index] = color_str return color_str def _assign_ref(self, block): """ 为每个sheet 元素prev_ref, next_ref 属性赋值 :param block: :return: """ # 按照excel的自然行列排序 sorted_elements = sorted(block.elements, key=lambda ele_: self._get_cell_index(ele_, False)) # 排序完成后, 按照自然顺序为对象的prev_ref, next_ref 属性赋值 for i in range(len(sorted_elements)): ele = sorted_elements[i] if i > 0: ele.layout.prev_ref = sorted_elements[i - 1] if ele._type != 'table' and i < len(sorted_elements) - 1: # table的next_ref 需要单独计算 ele.layout.next_ref = sorted_elements[i + 1] # table类型元素的next_ref需要按照最后一个单元格坐标进行匹配 sorted_elements_2 = sorted(block.elements, key=lambda ele_: self._get_cell_index(ele_, True)) for i in range(len(sorted_elements_2)): ele = sorted_elements_2[i] if ele._type == 'table' and i < len(sorted_elements_2) - 1: ele.layout.next_ref = sorted_elements[i + 1] @staticmethod def _get_cell_index(ele, table_last_coordinate=True): """ 获取单元格行列数字坐标 :param ele: :return: (1,2) """ coordinate = ele.coordinate.desc if table_last_coordinate: coordinate = ele._last_coordinate if hasattr(ele, "_last_coordinate") else ele.coordinate.desc return coordinate_to_tuple(coordinate) @staticmethod def _get_start_end_of_table(sheet): return get_start_end_of_table(sheet) @staticmethod def remove_verbose_border(img, border_color=(0, 0, 0, 255)): """ 处理图片去除边框像素 """ # 转为RGBA格式 if img.mode != "RGBA": img = img.convert("RGBA") img_data = img.load() width, height = img.size # 去除上下黑色边框 for r in [0, height - 1]: for c in range(width): if img_data[c, r] == border_color: # 设置为透明 img_data[c, r] = (0, 0, 0, 0) # 去除左右的黑色边框 for c in [0, width - 1]: for r in range(height): if img_data[c, r] == border_color: img_data[c, r] = (0, 0, 0, 0) # 去除其他的横线边框 for r in range(height): # 边框占用了1px,已改为透明,所以从第二个像素开始 cur_pixel = img_data[1, r] if cur_pixel != (0, 0, 0, 0): for c in range(width): # 暂时简化处理 if img_data[c, r] == cur_pixel: img_data[c, r] = (0, 0, 0, 0) return img def _get_merged_graphic(self, ws_with_pywin32, start_pos, end_pos, graphic_obj): """ 从当前worksheet中对应的pos坐标位置截取图片,图片数据存入graphic_obj中 """ try: # 隐藏excel单元格的网格 ws_with_pywin32.Application.ActiveWindow.DisplayGridlines = False # 获取当前范围的单元格 cell_range = ws_with_pywin32.Range(f"{start_pos}:{end_pos}") cell_num = cell_range.Cells.Count temp_list = [] for i in range(cell_num): cur_cell = cell_range.Cells(i+1) # 记录单元格内容 temp_list.append((cur_cell, cur_cell.Value)) cur_cell.Value = None # ws_with_pywin32.Range(f"{start_pos}:{end_pos}").CopyPicture() cell_range.CopyPicture() img = ImageGrab.grabclipboard() # 单元格内容恢复 for c, v in temp_list: c.Value = v except Exception: img = None if img is not None: if start_pos != end_pos: # 去除图片中可能存在的边框 img = self.remove_verbose_border(img) bio = BytesIO() img.save(bio, format="png") graphic_obj.data = b64encode(bio.getvalue()).decode() @staticmethod def is_virtual_cell_in(vcell, vlist): """ 判定当前虚拟单元格是否在虚拟列表中 :param vcell: 虚拟的单元格对象VirtualCell :param vlist: 虚拟的单元格对象的列表 :return: -1未找到 or 列表正向索引 """ for idx, cell in enumerate(vlist): if vcell == cell: return idx return -1 # def _merge_graphic_pic_cross_multi_cell(self, block, ws_with_pywin32, ws_with_openpyxl): # """ 跨越多个单元格的图片、图形的合并,即虚拟单元格内图片、图形的合并 """ # # 存储虚拟单元格对象 # virtual_cell_list = [] # cur_idx = -1 # # 虚拟单元格在列表的索引为key, 同一个虚拟单元格内的图形、图片放入列表,以该列表为value # virtual_cell_dict = defaultdict(list) # # # 统计虚拟单元格内的图形 # for graphic in block.graphics: # coord = graphic.coordinate.desc # if not coord: # continue # # 判定该图形是否在虚拟单元格内 # vcell = has_virtual_cell(coord, ws_with_openpyxl) # if vcell is None: # 不在虚拟单元格内则跳过 # continue # # 虚拟单元格不在列表中则存入 # r = self.is_virtual_cell_in(vcell, virtual_cell_list) # if r == -1: # cur_idx += 1 # virtual_cell_list.append(vcell) # # 记录当前虚拟单元格中的图形对象 # virtual_cell_dict[cur_idx].append(graphic) # else: # # 当虚拟单元格已在列表中,则仅记录图形对象 # virtual_cell_dict[r].append(graphic) # # # 统计虚拟单元格内的图片 # for pic in block.pictures: # coord = pic.coordinate.desc # if not coord: # continue # # 判定该图片是否在虚拟单元格内 # vcell = has_virtual_cell(coord, ws_with_openpyxl) # if vcell is None: # 不在虚拟单元格内则跳过 # continue # # 虚拟单元格不在列表中则存入 # r = self.is_virtual_cell_in(vcell, virtual_cell_list) # if r == -1: # cur_idx += 1 # virtual_cell_list.append(vcell) # # 记录当前虚拟单元格中的图形对象 # virtual_cell_dict[cur_idx].append(pic) # else: # # 当虚拟单元格已在列表中,则仅记录图形对象 # virtual_cell_dict[r].append(pic) # # # 逆序处理每个虚拟单元格的图片、图形合并 # vnum = len(virtual_cell_list) # vidx = vnum - 1 # while vidx >= 0: # g_list = virtual_cell_dict.get(vidx) # 获取当前虚拟单元格中的图形、图片列表 # # 以图片对象作为基准 # leader = g_list[0] # for cur_g in g_list: # if isinstance(cur_g, PictureObject): # leader = cur_g # break # leader_label = "graphics" if isinstance(leader, GraphicObject) else "pictures" # try: # # 记录elements的插入位置 # ele_insert_idx = block.elements.index(leader) # # 记录graphics/pictures的插入位置 # gp_insert_idx = getattr(block, leader_label, []).index(leader) # # 开始处理合并 # for g in g_list: # # 从列表中删除待合并的图形/图片对象 # block.elements.remove(g) # if isinstance(g, GraphicObject): # block.graphics.remove(g) # else: # block.pictures.remove(g) # # 获取合并的图形数据 # current_vcell = virtual_cell_list[vidx] # start_pos = current_vcell.start_cell_coord # end_pos = current_vcell.end_cell_coord # self._get_merged_graphic(ws_with_pywin32, start_pos, end_pos, leader) # # 修正大图片的坐标 # leader.coordinate.desc = start_pos # # 在指定的位置插入合并的图片 # block.elements.insert(ele_insert_idx, leader) # if isinstance(leader, GraphicObject): # block.graphics.insert(gp_insert_idx, leader) # else: # block.pictures.insert(gp_insert_idx, leader) # vidx -= 1 # except (ValueError, IndexError, pythoncom.com_error): # logging.error("merge graphics&pictures inside virtual cell failed.") # vidx -= 1 def _merge_graphics(self, block, ws_with_pywin32): """ 合并同一个单元格内的图形 """ pos_dict = {} pos_list = [] # 保持位置有序性 # 统计每个位置的图形个数 for graphic in block.graphics: pos = graphic.coordinate.desc if not pos: continue if pos not in pos_dict: pos_dict[pos] = [graphic] pos_list.append(pos) else: pos_dict[pos].append(graphic) # 逆序处理(便于删除、插入)有图形的坐标位置 for pos in reversed(pos_list): g_list = pos_dict.get(pos) # 获取当前位置的图形 if len(g_list) == 1: # 当前位置只有一个图形,不做处理 continue try: # 当前位置有多个图形,则进行图形的合并 # 记录elements的插入位置 ele_insert_idx = block.elements.index(g_list[0]) # 记录graphics的插入位置 graphic_insert_idx = block.graphics.index(g_list[0]) # 开始处理 merged_graphic = g_list[0] for g in g_list: # 从列表中删除待合并的图形对象 block.elements.remove(g) block.graphics.remove(g) # 获取合并的图形数据 self._get_merged_graphic(ws_with_pywin32, pos, pos, merged_graphic) # 在指定的位置插入合并的图形 block.elements.insert(ele_insert_idx, merged_graphic) block.graphics.insert(graphic_insert_idx, merged_graphic) except (ValueError, IndexError, pythoncom.com_error): logging.error("merge graphics inside same cell failed.") def _merge_pictures(self, block, ws_with_pywin32): """ 合并同一个单元格内的图片 """ pos_dict = {} pos_list = [] # 保持位置有序性 # 统计每个pos位置的图片个数 for pic in block.pictures: pos = pic.coordinate.desc if not pos: continue if pos not in pos_dict: pos_dict[pos] = [pic] pos_list.append(pos) else: pos_dict[pos].append(pic) # 逆序处理(便于删除、插入)有图片的坐标位置 for pos in reversed(pos_list): pic_list = pos_dict.get(pos) # 获取当前位置的图片列表 if len(pic_list) == 1: # 当前位置只有一个图片,不做处理 continue try: # 当前位置有多个图片,则进行图片的合并 # 记录elements的插入位置 ele_insert_idx = block.elements.index(pic_list[0]) # 记录pictures的插入位置 picture_insert_idx = block.pictures.index(pic_list[0]) # 开始处理 merged_pic = pic_list[0] for cur_pic in pic_list: # 从列表中删除待合并的图片对象 block.elements.remove(cur_pic) block.pictures.remove(cur_pic) # 获取合并的图片数据 self._get_merged_graphic(ws_with_pywin32, pos, pos, merged_pic) # 在指定的位置插入合并的图形 block.elements.insert(ele_insert_idx, merged_pic) block.pictures.insert(picture_insert_idx, merged_pic) except (ValueError, IndexError, pythoncom.com_error): logging.error("merge pictures inside same cell failed.") def _merge_graphics_and_pictures(self, block, ws_with_pywin32): """ 合并同一个单元格的图形 + 图片 """ pos_dict = {} pos_list = [] # 保持位置有序性 # 统计每个pos位置的图片个数 for pic in block.pictures: pos = pic.coordinate.desc if not pos: continue if pos not in pos_dict: pos_dict[pos] = [pic] pos_list.append(pos) else: pos_dict[pos].append(pic) # 统计每个pos位置的图形个数 for graphic in block.graphics: pos = graphic.coordinate.desc if not pos or pos not in pos_dict: continue else: # 同一个cell中有图片 + 图形 pos_dict[pos].append(graphic) # 逆序处理(便于删除、插入)有图片的坐标位置 for pos in reversed(pos_list): pic_list = pos_dict.get(pos) # 获取当前位置的图片列表 if len(pic_list) == 1: # 当前位置只有一个图片,不做处理 continue try: end_pos = get_end_desc(pic_list) # 当前位置有图片+图形,则进行图片+图形的合并 # 记录elements的插入位置 ele_insert_idx = block.elements.index(pic_list[0]) # 记录pictures的插入位置 picture_insert_idx = block.pictures.index(pic_list[0]) # 开始处理 merged_pic = pic_list[0] for cur_pic in pic_list: # 从列表中删除待合并的图片、图形对象 block.elements.remove(cur_pic) if isinstance(cur_pic, PictureObject): block.pictures.remove(cur_pic) elif isinstance(cur_pic, GraphicObject): block.graphics.remove(cur_pic) # 获取合并的图片数据 self._get_merged_graphic(ws_with_pywin32, pos, end_pos, merged_pic) # 在指定的位置插入合并的图形 block.elements.insert(ele_insert_idx, merged_pic) block.pictures.insert(picture_insert_idx, merged_pic) except (ValueError, IndexError, pythoncom.com_error): logging.error("merge pictures inside same cell failed.") def get_cell_border(self, cell: Cell) -> CellBorderObject: index = (cell.row, cell.column) if index in self._cell_border: return self._cell_border[index] cell_border = CellBorderObject() border_style = getattr(cell.border, "top").style border_color = self._get_rgb_color(getattr(cell.border, "top").color) cell_border.border_top = BorderObject(border_color, border_style) border_style = getattr(cell.border, "left").style border_color = self._get_rgb_color(getattr(cell.border, "left").color) cell_border.border_left = BorderObject(border_color, border_style) border_style = getattr(cell.border, "bottom").style border_color = self._get_rgb_color(getattr(cell.border, "bottom").color) cell_border.border_bottom = BorderObject(border_color, border_style) border_style = getattr(cell.border, "right").style border_color = self._get_rgb_color(getattr(cell.border, "right").color) cell_border.border_right = BorderObject(border_color, border_style) self._cell_border[index] = cell_border return cell_border def _parse_sheet(self, sheet, shapes): """ 解析一个worksheet中的数据 :params sheet: Worksheet对象 :params shapes: shape字典列表 """ wb_with_pywin32 = self._excel_app ws_with_pywin32 = wb_with_pywin32.Sheets[sheet.title] sheet_name = sheet.title # 重置sheet背景颜色的缓存 self._cell_bg_colors = {} self._cell_border = {} self._merged_cells_cache = {} logging.debug(f"parse_excel start sheet_name: {sheet_name}") # 表格的范围, 表格外的内容的范围 table_range_list, out_table_cells = self._get_start_end_of_table(sheet) table_range_list = sorted(table_range_list) # sort the table range list logging.debug(f"parse_excel sheet_name: {sheet_name} get_table_range_list {table_range_list}") # 实例化一个sheet对象 block = DocumentBlockObject() block.file_name = os.path.basename(self._file_path) shapes = handle_shape_in_merged_cell(sheet, shapes) # 解析当前sheet的表格信息 tables = self._parse_table(sheet, block, table_range_list, ws_with_pywin32, shapes) logging.info(f"parse_excel sheet_name: {sheet_name} _parse_table {len(tables)}") # 解析当前sheet的图片信息 pictures = self._parse_picture(sheet_name, block, shapes) logging.info(f"parse_excel sheet_name: {sheet_name} _parse_picture {len(pictures)}") # 解析当前sheet的图形信息 graphics = self._parse_graphic(sheet_name, block, shapes) logging.info(f"parse_excel sheet_name: {sheet_name} _parse_graphic {len(graphics)}") # 解析当前sheet的表格外的对象信息 texts = self._parse_text(sheet, block, table_range_list, out_table_cells, shapes, ws_with_pywin32) logging.info(f"parse_excel sheet_name: {sheet_name} _parse_text {len(texts)}") block.name = sheet_name block.add_table(tables) block.add_text(texts) block.add_graphic(graphics) block.add_picture(pictures) self._assign_ref(block) logging.debug(f"parse_excel end sheet_name: {sheet_name}") # 合并虚拟单元格中的图形、图片 # self._merge_graphic_pic_cross_multi_cell(block, ws_with_pywin32, sheet) # 合并同一个单元格中的图形 self._merge_graphics(block, ws_with_pywin32) # 合并同一个单元格图片 self._merge_pictures(block, ws_with_pywin32) # 合并同一个单元格中的图形 + 图片 self._merge_graphics_and_pictures(block, ws_with_pywin32) # 关闭 return block # 这个方法暂时没有启用,由于位置信息有问题,无法计算覆盖率 def _handle_shapes(self, shapes: List[Dict[str, Any]], total_width, total_height) -> List[Dict[str, Any]]: """通过覆盖比例判断shape是否属某个的单元格""" for shape in shapes: from_row = shape["from_row"] from_col = shape["from_col"] to_row = shape["to_row"] to_col = shape["to_col"] # 已经属于某个单元格,则跳过 if shape["in_cell"]: continue # 如果图片的大小超过6个单元格,则认为当前图片不属于某个单元格 if (to_col - from_col + 1) * (to_row - from_row + 1) > 6: continue found = False for row in range(from_row, to_row + 1): for col in range(from_col, to_col + 1): cell_pos = self._cell_positions[(row, col)] ## cell_pos 是 百分比,需要转换 cell_pos = Position(cell_pos.x * total_width, cell_pos.y * total_height, cell_pos.width * total_width, cell_pos.height * total_height) shape_pos = shape["position"] overlap_percentage = calculate_overlap_percentage(cell_pos, shape_pos) if hasattr(shape, 'content') and hasattr(shape, 'index') and shape["content"] == "△X": print(f"{shape['index']}: {overlap_percentage}") print(f"{shape['content']}: {overlap_percentage}") if overlap_percentage > 0.8: shape["in_cell"] = True shape["cell_row"] = row shape["cell_col"] = col found = True break if found: break return shapes 详细说一下解析的逻辑
最新发布
10-28
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值