Tuple in a table:表格的行和列有哪些别称

表格行列的别称介绍

In the context of databases and data structures, a tuple is an ordered collection of values; in a relational table it corresponds to one row. In a table that satisfies first normal form, each element within a tuple is an atomic value, which means it cannot be further divided into smaller meaningful parts. In a relational database table, a tuple corresponds to a single record, which contains one value for each column of the table.

For example, consider a simple table with three columns: “Name,” “Age,” and “City.” A tuple in this table might look like this:

(Name: “Alice”, Age: 30, City: “New York”)

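This tuple represents one entry in the table, with “Alice” being the value for the “Name” column, 30 for the “Age” column, and “New York” for the “City” column. In the relational model, each tuple in a relation is unique and represents a single entity or observation; in practice, an SQL table only guarantees this uniqueness when a primary key or unique constraint is defined.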

表格里的行列又叫作什么

Row: tuple, record, observation
Column: attribute, feature, field
Table: relation

Traceback (most recent call last): File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\compare_parser.py”, line 78, in compare plugin_dr.extend_result(plugin_instance.compare_each_block(CustomPreDiffStrategyMiddleware)) File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\comparers\base_comparer.py”, line 88, in compare_each_block block_result = self.compare(block_name, ls_base, ls_target, belong_to) File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\comparers\table_comparer.py”, line 49, in compare part_delete, part_add, part_update = self.compare_table(block_name, old_table, new_table,belong_to=belong_to) File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\comparers\table_comparer.py”, line 193, in compare_table col_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_cols, add_cols,is_col=True, File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\middlewares\table_middlewares.py”, line 605, in match_row return self.strategy.match_rows(base_rows, target_rows, is_col, head_indexes) File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\middlewares\table_strategy.py”, line 201, in match_rows matched_strategies = self.registry.get_matched_row_strategies(base_rows, table) File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\middlewares\table_strategy.py”, line 87, in get_matched_row_strategies if table and condition(table): File “D:\Venv\RRM_env\lib\site-packages\kotei_omc\middlewares\table_middlewares.py”, line 862, in _is_header_rules_valid heads = base_table.get_heads() File “D:\Venv\RRM_env\lib\site-packages\kotei_omp\data\table.py”, line 322, in get_heads heads.append(self.rows[index].cells) TypeError: list indices must be integers or slices, not str def compare_table(self, block_name, old_table, new_table, belong_to): logger.info(f"start compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}") 确保表头属性存在 for table in [old_table, new_table]: if not hasattr(table, ‘head_type’): table.head_type = ‘horizontal’ if not hasattr(table, 
‘head_list’): table.head_list = [] if not hasattr(table, ‘header_row_idx’): table.header_row_idx = 0 # === 表头有效性检查函数 === def is_valid_header(row): if not row or not row.cells: return False total_cells = len(row.cells) empty_cells = sum(1 for cell in row.cells if (cell.text is None or str(cell.text).strip() == ‘’) and not cell.content) return (empty_cells / total_cells) < 0.1 # === 表头检测与设置 === def detect_and_set_header(table): # 确保header_row_idx是整数 table.header_row_idx = int(getattr(table, ‘header_row_idx’, 0)) # 水平表头处理 if table.head_type == ‘horizontal’ and table.rows: # 检查当前表头行是否有效 current_idx = table.header_row_idx if 0 <= current_idx < len(table.rows) and is_valid_header(table.rows[current_idx]): return # 当前表头有效 # 查找有效表头行 for idx in range(0, min(3, len(table.rows))): if is_valid_header(table.rows[idx]): table.header_row_idx = idx logger.info(f"设置第{idx + 1}行为表头") return # 未找到有效表头 table.head_type = None logger.warning(“未找到有效表头”) # === 表头内容提取 === def extract_header_content(table): if table.head_type == ‘horizontal’ and table.rows: # 确保header_row_idx是有效整数 idx = int(table.header_row_idx) if 0 <= idx < len(table.rows): header_row = table.rows[idx] return [cell.text for cell in header_row.cells] return [] # 无效表头返回空列表 # === 执行表头处理 === for table in [old_table, new_table]: detect_and_set_header(table) table.head_list = extract_header_content(table) # 使表格内列数一致 self.align_table_col(old_table, new_table) # 表格中存在大量视觉上merge但是实际未合并的空格,需要将空格赋值为正确的文本,防止影响相似度匹配 self.fill_visual_merged_cells(old_table) self.fill_visual_merged_cells(new_table) if old_table.head_type == new_table.head_type == ‘horizontal’: old_col_table, new_col_table = self.transpose_table(old_table, new_table) else: if old_table.head_type == ‘vertical’: new_table.head_list = old_table.head_list new_table.head_type = ‘vertical’ elif new_table.head_type == ‘vertical’: old_table.head_list = new_table.head_list old_table.head_type = ‘vertical’ old_col_table, new_col_table = old_table, new_table # 列匹配 del_cols, add_cols 
= old_col_table.rows, new_col_table.rows col_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_cols, add_cols,is_col=True, head_indexes=[old_table.head_list,new_table.head_list]) if col_matched: matched_old_cols, matched_new_cols = list(zip(*list(col_matched))) del_cols = [old_col for old_col in old_col_table.rows if old_col not in matched_old_cols] add_cols = [new_col for new_col in new_col_table.rows if new_col not in matched_new_cols] sub_type = ‘col’ if old_table.head_type == ‘horizontal’ else ‘row’ ls_col_delete, ls_col_add = self.process_delete_add_diff(block_name, sub_type, del_cols, add_cols, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的列组合新的表,得到列一致的两个表 if col_matched: old_col_indexes,new_col_indexes =[],[] for old_col, new_col in col_matched: old_col_indexes.append(old_col_table.rows.index(old_col)) new_col_indexes.append(new_col_table.rows.index(new_col)) old_equal_col_table = self.choice_cols(old_table, old_col_indexes) new_equal_col_table = self.choice_cols(new_table, new_col_indexes) else: return ls_col_delete, ls_col_add, [] # 行匹配 del_rows, add_rows = old_equal_col_table.rows, new_equal_col_table.rows row_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_rows, add_rows, is_col=False) if row_matched: matched_old_rows, matched_new_rows = list(zip(*list(row_matched))) del_rows_indexes = [idx for idx, old_row in enumerate(old_equal_col_table.rows) if old_row not in matched_old_rows] add_rows_indexes = [idx for idx, new_row in enumerate(new_equal_col_table.rows) if new_row not in matched_new_rows] # 使用没有重组前的表,横表头直接处理,竖表头需要转置 if old_table.head_type == new_table.head_type == ‘horizontal’: del_rows = [old_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_table.rows[idx] for idx in add_rows_indexes] else: old_transpose_table = self.choice_cols(old_table, list(range(len(old_table.rows)))) new_transpose_table = self.choice_cols(new_table, list(range(len(new_table.rows)))) del_rows = 
[old_transpose_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_transpose_table.rows[idx] for idx in add_rows_indexes] sub_type = ‘row’ if old_table.head_type == ‘horizontal’ else ‘col’ ls_row_delete, ls_row_add = self.process_delete_add_diff(block_name, sub_type, del_rows, add_rows, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的行组合新的表,得到行一致的两个表 if row_matched: old_equal_row_table, new_equal_row_table = TableObject(), TableObject() old_equal_row_table.rows = list(matched_old_rows) old_equal_row_table.head_type = old_table.head_type self.copy_table_attrs(old_equal_row_table, old_table) new_equal_row_table.rows = list(matched_new_rows) new_equal_row_table.head_type = new_table.head_type self.copy_table_attrs(new_equal_row_table, new_table) # 查找行变更、列变更、单元格变更 ls_row_update, ls_col_update, ls_cell_update = self.compare_ordered_tables(block_name,old_equal_row_table, new_equal_row_table,belong_to=belong_to) else: ls_row_update, ls_col_update, ls_cell_update = [], [], [] part_delete = ls_row_delete + ls_col_delete part_add = ls_row_add + ls_col_add part_update = ls_row_update + ls_col_update + ls_cell_update logger.info(f"finish compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}") return part_delete, part_add, part_update 不用填充表头内容进去,如果第一行是表头的话,headlist就是1,我应该没有理解错: 结合下面的代码再理解一下: if old_table.head_type == new_table.head_type == ‘horizontal’: old_col_table, new_col_table = self.transpose_table(old_table, new_table) else: if old_table.head_type == ‘vertical’: new_table.head_list = old_table.head_list new_table.head_type = ‘vertical’ elif new_table.head_type == ‘vertical’: old_table.head_list = new_table.head_list old_table.head_type = ‘vertical’ old_col_table, new_col_table = old_table, new_table 列匹配 del_cols, add_cols = old_col_table.rows, new_col_table.rows col_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_cols, add_cols,is_col=True, head_indexes=[old_table.head_list,new_table.head_list]) 
def match_row(self, base_rows: List[RowObject], target_rows: List[RowObject], is_col: bool = False, head_indexes=None) -> List[Tuple[RowObject, RowObject]]: “”" 匹配行 调用自定义匹配策略 Args: base_rows: 基准行列表 target_rows: 目标行列表 is_col: 是否为列匹配 head_indexes: 表头索引 Returns: 匹配结果列表 [(base_row, target_row), …] “”" return self.strategy.match_rows(base_rows, target_rows, is_col, head_indexes) def match_rows(self, base_rows: List[RowObject], target_rows: List[RowObject], is_col: bool = False, head_indexes=None) -> List[Tuple[RowObject, RowObject]]: “”" 匹配行 先应用定制匹配策略,再应用默认匹配策略 Args: base_rows: 基准行列表 target_rows: 目标行列表 is_col: 是否为列匹配 head_indexes: 表头索引 Returns: 匹配结果列表 [(base_row, target_row), …] “”" # 最终匹配结果 matched_pairs = [] # 跟踪已匹配的行 matched_base_rows = [] matched_target_rows = [] # 尝试获取所属表格(如果可以) table = None if base_rows and base_rows[0].cells: cell = base_rows[0].cells[0] block = cell while block and not isinstance(block, TableObject) and hasattr(block, ‘layout’): block = block.layout.parent_ref if isinstance(block, TableObject): table = block # 获取匹配的策略 matched_strategies = self.registry.get_matched_row_strategies(base_rows, table) logger.info(f"table row match 使用定制策略进行匹配, base_num: {len(base_rows)}, target_num: {len(target_rows)}") # 应用定制匹配策略 for handler in matched_strategies: # 过滤出未匹配的行 applicable_base_rows = [row for row in base_rows if row not in matched_base_rows] if not applicable_base_rows: continue applicable_target_rows = [row for row in target_rows if row not in matched_target_rows] if not applicable_target_rows: continue # 应用处理函数进行匹配 custom_matches = handler(applicable_base_rows, applicable_target_rows, is_col, head_indexes) # 更新匹配结果和已匹配行 for base_row, target_row in custom_matches: matched_pairs.append((base_row, target_row)) matched_base_rows.append(base_row) matched_target_rows.append(target_row) # 应用默认匹配策略处理未匹配的行 if self.default_strategy: logger.info(“table rows 使用默认策略匹配”) remaining_base_rows = [row for row in base_rows if row not in matched_base_rows] 
remaining_target_rows = [row for row in target_rows if row not in matched_target_rows] if remaining_base_rows and remaining_target_rows: default_matches = self.default_strategy.match_row( remaining_base_rows, remaining_target_rows, is_col, head_indexes) matched_pairs.extend(default_matches) logger.info(“table rows match finish”) return matched_pairs 我的目标是设置默认表头,我觉得header_row_idx很有可能是列表,但是我们默认表头我只希望有一行就行,检测第一行空格,小于0.1就是第一行,
最新发布
12-05
import logging,time,re import numpy as np from collections import defaultdict from itertools import zip_longest from kotei_omp.data import DocumentBlockObject from kotei_omc.comparers.picture_comparer import PictureComparer, GraphicComparer from kotei_omc.comparers.base_comparer import BaseComparer from kotei_omc.comparers.plugins import register_plugin from kotei_omc.data.diff import DiffItem from kotei_omp.data import TextObject, GraphicObject, PictureObject, StyleObject, RunObject from kotei_omp.data.table import CellObject, RowObject, TableObject from kotei_omc.settings import settings from kotei_omc.utils.type_checker import is_instance_of from kotei_omc.middlewares.table_middlewares import CustomTableStrategyMiddleware logger = logging.getLogger("req_diff") @register_plugin("table") class TableComparer(BaseComparer): def get_block_resource(self, block, belong_to='block'): return self.do_get_block_resource(block, belong_to, 'tables', TableObject) def compare(self, block_name, base, target, belong_to=None): t0 = time.time() # 表格匹配 logger.info(f'start match table, block_name: {block_name}, base_num: {len(base)}, target_num: {len(target)}') match_func = CustomTableStrategyMiddleware(self._path_base).match if settings.MATCH_WITH_CHAPTER: tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_with_chapter(base, target,match_func) else: tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_normal(base, target,match_func) logger.info('finish match table') # 表格新增删除 ls_tb_delete, ls_tb_add = self.process_delete_add_diff(block_name, 'table', tb_delete_list, tb_add_list, belong_to=belong_to) # 表格差分 ls_tb_update = [] for old_table, new_table in old_new_tb_matched: # 要求废止特殊处理 old_table, new_table = self.pre_process_require(old_table, new_table) # 表格位置差分 if not old_table.is_same_pos(new_table): ls_tb_update.append(DiffItem('update', 'table', sub_type='table', block_name=block_name, old=old_table, new=new_table, 
belong_to=belong_to,diff_point='coordinate_desc')) # 对匹配的每个表格进行对比 part_delete, part_add, part_update = self.compare_table(block_name, old_table, new_table,belong_to=belong_to) ls_tb_delete.extend(self.row_del_add_after(part_delete,category='delete')) ls_tb_add.extend(self.row_del_add_after(part_add,category='add')) ls_tb_update.extend(self.cell_update_after(part_update)) t1 = time.time() logger.info(f'Time Cost:table diff {block_name} {t1 - t0}') return {'add': ls_tb_add, 'delete': ls_tb_delete, 'update': ls_tb_update} @staticmethod def copy_table_attrs(to_table, from_table): for attr_name in ('layout', 'style', 'border', 'coordinate', 'data_id'): setattr(to_table, attr_name, getattr(from_table, attr_name)) @staticmethod def fill_visual_merged_cells(table): num_rows = len(table.rows) if num_rows == 0: return num_cols = max([len(row.cells) for row in table.rows]) if num_cols == 0: return # 判断是否有边界 def is_bordered(side): return side.border_style is not None for col in range(num_cols): row_ptr = 0 while row_ptr < num_rows: cell = table.rows[row_ptr].cells[col] top_border_exists = is_bordered(cell.border.border_top) if row_ptr == 0 or top_border_exists: start_row = row_ptr end_row = start_row while end_row < num_rows: current_cell = table.rows[end_row].cells[col] bottom_border_exists = is_bordered(current_cell.border.border_bottom) # import ipdb;ipdb.set_trace() if bottom_border_exists or end_row == num_rows - 1: break else: end_row += 1 block_text = None block_content = None for r in range(start_row, end_row + 1): val = table.rows[r].cells[col].text if val is not None and str(val).strip() != "": block_text = val block_content = table.rows[r].cells[col].content break if block_text is not None: merged_ranges = [start_row, col, end_row, col] for r in range(start_row, end_row + 1): val = table.rows[r].cells[col].text if val is None or str(val).strip() == "": table.rows[r].cells[col].content = block_content table.rows[r].cells[col].text = block_text # 添加 merged_ranges 属性 
if not table.rows[r].cells[col].merged_ranges: table.rows[r].cells[col].merged_ranges = merged_ranges row_ptr = end_row + 1 else: row_ptr += 1 def compare_table(self, block_name, old_table, new_table, belong_to): logger.info(f"start compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}") # === 设置默认表头类型 === DEFAULT_HEADER = 'horizontal' if not hasattr(old_table, 'head_type') or old_table.head_type is None: old_table.head_type = DEFAULT_HEADER if not hasattr(new_table, 'head_type') or new_table.head_type is None: new_table.head_type = DEFAULT_HEADER # === 表头有效性检查函数 === def is_valid_header(row): if not row or not row.cells: return False total_cells = len(row.cells) empty_cells = sum(1 for cell in row.cells if (cell.text is None or str(cell.text).strip() == '') and not cell.content) return (empty_cells / total_cells) < 0.1 # === 关键修复:设置实际表头行 === # 设置默认表头行为第一行 old_table.header_row_idx = 0 new_table.header_row_idx = 0 # 检查并确认表头行 if old_table.head_type == 'horizontal' and old_table.rows: # 检查第一行是否有效 if not is_valid_header(old_table.rows[0]): logger.warning(f"第一行无效表头 in old table {old_table.data_id}") # 尝试查找后续有效行作为表头 for idx in range(1, min(3, len(old_table.rows))): # 最多检查前3行 if is_valid_header(old_table.rows[idx]): old_table.header_row_idx = idx logger.info(f"设置第{idx + 1}行为表头 in old table") break else: old_table.head_type = None # 未找到有效表头 logger.warning(f"未找到有效表头 in old table {old_table.data_id}") # 新表同样处理 if new_table.head_type == 'horizontal' and new_table.rows: if not is_valid_header(new_table.rows[0]): logger.warning(f"第一行无效表头 in new table {new_table.data_id}") for idx in range(1, min(3, len(new_table.rows))): if is_valid_header(new_table.rows[idx]): new_table.header_row_idx = idx logger.info(f"设置第{idx + 1}行为表头 in new table") break else: new_table.head_type = None logger.warning(f"未找到有效表头 in new table {new_table.data_id}") # === 设置表头内容 === # 从确定的表头行提取表头内容 if old_table.head_type == 'horizontal' and old_table.rows: header_row = 
old_table.rows[old_table.header_row_idx] old_table.head_list = [cell.text for cell in header_row.cells] if new_table.head_type == 'horizontal' and new_table.rows: header_row = new_table.rows[new_table.header_row_idx] new_table.head_list = [cell.text for cell in header_row.cells] # 使表格内列数一致 self.align_table_col(old_table, new_table) # 表格中存在大量视觉上merge但是实际未合并的空格,需要将空格赋值为正确的文本,防止影响相似度匹配 self.fill_visual_merged_cells(old_table) self.fill_visual_merged_cells(new_table) if old_table.head_type == new_table.head_type == 'horizontal': old_col_table, new_col_table = self.transpose_table(old_table, new_table) else: if old_table.head_type == 'vertical': new_table.head_list = old_table.head_list new_table.head_type = 'vertical' elif new_table.head_type == 'vertical': old_table.head_list = new_table.head_list old_table.head_type = 'vertical' old_col_table, new_col_table = old_table, new_table # 列匹配 del_cols, add_cols = old_col_table.rows, new_col_table.rows col_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_cols, add_cols,is_col=True, head_indexes=[old_table.head_list,new_table.head_list]) if col_matched: matched_old_cols, matched_new_cols = list(zip(*list(col_matched))) del_cols = [old_col for old_col in old_col_table.rows if old_col not in matched_old_cols] add_cols = [new_col for new_col in new_col_table.rows if new_col not in matched_new_cols] sub_type = 'col' if old_table.head_type == 'horizontal' else 'row' ls_col_delete, ls_col_add = self.process_delete_add_diff(block_name, sub_type, del_cols, add_cols, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的列组合新的表,得到列一致的两个表 if col_matched: old_col_indexes,new_col_indexes =[],[] for old_col, new_col in col_matched: old_col_indexes.append(old_col_table.rows.index(old_col)) new_col_indexes.append(new_col_table.rows.index(new_col)) old_equal_col_table = self.choice_cols(old_table, old_col_indexes) new_equal_col_table = self.choice_cols(new_table, new_col_indexes) else: return ls_col_delete, 
ls_col_add, [] # 行匹配 del_rows, add_rows = old_equal_col_table.rows, new_equal_col_table.rows row_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_rows, add_rows, is_col=False) if row_matched: matched_old_rows, matched_new_rows = list(zip(*list(row_matched))) del_rows_indexes = [idx for idx, old_row in enumerate(old_equal_col_table.rows) if old_row not in matched_old_rows] add_rows_indexes = [idx for idx, new_row in enumerate(new_equal_col_table.rows) if new_row not in matched_new_rows] # 使用没有重组前的表,横表头直接处理,竖表头需要转置 if old_table.head_type == new_table.head_type == 'horizontal': del_rows = [old_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_table.rows[idx] for idx in add_rows_indexes] else: old_transpose_table = self.choice_cols(old_table, list(range(len(old_table.rows)))) new_transpose_table = self.choice_cols(new_table, list(range(len(new_table.rows)))) del_rows = [old_transpose_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_transpose_table.rows[idx] for idx in add_rows_indexes] sub_type = 'row' if old_table.head_type == 'horizontal' else 'col' ls_row_delete, ls_row_add = self.process_delete_add_diff(block_name, sub_type, del_rows, add_rows, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的行组合新的表,得到行一致的两个表 if row_matched: old_equal_row_table, new_equal_row_table = TableObject(), TableObject() old_equal_row_table.rows = list(matched_old_rows) old_equal_row_table.head_type = old_table.head_type self.copy_table_attrs(old_equal_row_table, old_table) new_equal_row_table.rows = list(matched_new_rows) new_equal_row_table.head_type = new_table.head_type self.copy_table_attrs(new_equal_row_table, new_table) # 查找行变更、列变更、单元格变更 ls_row_update, ls_col_update, ls_cell_update = self.compare_ordered_tables(block_name,old_equal_row_table, new_equal_row_table,belong_to=belong_to) else: ls_row_update, ls_col_update, ls_cell_update = [], [], [] part_delete = ls_row_delete + ls_col_delete part_add = ls_row_add + ls_col_add 
part_update = ls_row_update + ls_col_update + ls_cell_update logger.info(f"finish compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}") return part_delete, part_add, part_update def transpose_table(self, old_table, new_table): """ 将表格进行转置操作,即将行转换为列,列转换为行。 Args: old_table (TableObject): 原始表格对象 new_table (TableObject): 目标表格对象 Returns: tuple: 返回转置后的两个表格对象 (old_col_table, new_col_table) """ # 创建新的表格对象用于存储转置后的数据 old_col_table, new_col_table = TableObject(), TableObject() # 对原始表格的行进行转置操作 old_col_table.rows = self.transpose_table_rows(old_table.rows) # 根据原始表格的表头类型,设置转置后的表头类型 old_col_table.head_type = 'vertical' if old_table.head_type == 'horizontal' else 'horizontal' # 复制原始表格的属性到转置后的表格 self.copy_table_attrs(old_col_table, old_table) # 对目标表格的行进行转置操作 new_col_table.rows = self.transpose_table_rows(new_table.rows) # 根据目标表格的表头类型,设置转置后的表头类型 new_col_table.head_type = 'vertical' if new_table.head_type == 'horizontal' else 'horizontal' # 复制目标表格的属性到转置后的表格 self.copy_table_attrs(new_col_table, new_table) # 返回转置后的两个表格对象 return old_col_table, new_col_table def compare_ordered_tables(self, block_name, old_table_obj, new_table_obj, belong_to): row_updates, col_updates, cell_updates = [], [], [] # 获取新旧行数据 old_rows = getattr(old_table_obj, 'rows', []) new_rows = getattr(new_table_obj, 'rows', []) old_cells_list = [row.cells for row in old_rows] new_cells_list = [row.cells for row in new_rows] # 获取内容用于对比 old_content_cells_list = self.get_cell_content_list(old_cells_list, settings.DIFF_ATTR) new_content_cells_list = self.get_cell_content_list(new_cells_list, settings.DIFF_ATTR) # 删除完全一样的匹配 for row_index in range(len(old_content_cells_list) - 1, -1, -1): # 如果新旧行内容相同,则删除该行 # 之后可以在这里增加原子操作逻辑,避免删除不同步 if old_content_cells_list[row_index] == new_content_cells_list[row_index]: old_content_cells_list.pop(row_index) new_content_cells_list.pop(row_index) old_cells_list.pop(row_index) new_cells_list.pop(row_index) #原子一致性检查 flag = False if len(old_content_cells_list) 
==len(new_content_cells_list)==len(old_cells_list) == len(new_cells_list): flag = True if not flag: logger.warning(f"{block_name} old_table_obj: {old_table_obj}, new_table_obj: {new_table_obj}; delete operator is not atomic; all the cells list will involved in finding differences computation") if not old_content_cells_list: return [], [], [] # 查找差异 diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, \ row_diff_idx, col_diff_idx, graphic_diff, picture_diff = self.find_differences( old_content_cells_list, new_content_cells_list, old_cells_list, new_cells_list) # 抽取单元格内的图形图像差分 for item in graphic_diff + picture_diff: if item: cell_updates.extend(item) # 处理单元格差分 for idx, (cell_diff_idx, diff_point, diff_value) in enumerate( zip(cell_diffs, cell_diff_points, cell_diff_values)): try: # old = self.get_element_by_index(old_cells_list, cell_diff_idx) # new = self.get_element_by_index(new_cells_list, cell_diff_idx) old, new = old_cells_list, new_cells_list for cell_idx in cell_diff_idx: old = old[cell_idx] new = new[cell_idx] except IndexError: continue # 忽略非法索引 cell_diff_obj = DiffItem( 'update', 'table', 'cell', block_name=block_name, old=old, new=new, belong_to=belong_to, diff_point=diff_point, diff_values=diff_value ) cell_updates.append(cell_diff_obj) # 处理行差分 # if diff_type == 'row': # for row_idx, row_diff_col_idx in zip(row_diffs, row_diff_idx): # try: # old_row = [old_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx] # new_row = [new_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx] # except IndexError: # continue # # row_diff_item = DiffItem( # 'update', 'table', 'row', # block_name=block_name, # old=self.merge_cells_to_row(old_row), # new=self.merge_cells_to_row(new_row), # belong_to=belong_to) # row_updates.append(row_diff_item) # 处理列差分 # elif diff_type == 'col': # for col_idx, col_diff_col_idx in zip(col_diffs, col_diff_idx): # try: # old_col = [old_cells_list[cell_idx][col_idx] for cell_idx in 
col_diff_col_idx] # new_col = [new_cells_list[cell_idx][col_idx] for cell_idx in col_diff_col_idx] # except IndexError: # continue # # col_diff_item = DiffItem( # 'update', 'table', 'col', # block_name=block_name, # old=self.merge_cells_to_row(old_col), # new=self.merge_cells_to_row(new_col), # belong_to=belong_to # ) # col_updates.append(col_diff_item) return row_updates, col_updates, cell_updates def choice_cols(self, table_obj, col_indexes): if table_obj.head_type == 'horizontal': rows = [] for row_obj in table_obj.rows: cells = [] for cel_idx in col_indexes: cells.append(row_obj.cells[cel_idx]) rows.append(cells) else: rows = [[] for _ in range(len(table_obj.rows[0].cells))] for cel_idx in col_indexes: for idx, cell in enumerate(table_obj.rows[cel_idx].cells): rows[idx].append(cell) res_table_obj = TableObject() for cell_list in rows: row_obj = RowObject() if cell_list: row_obj.cells = cell_list row_obj.coordinate = cell_list[0].coordinate # 对cell_obj的layout.parent_ref进行判断,有值在进行赋值 if cell_list[0].layout.parent_ref: row_obj.layout = cell_list[0].layout.parent_ref.layout row_obj.style = cell_list[0].layout.parent_ref.style row_obj.border = cell_list[0].layout.parent_ref.border row_obj.row_index = cell_list[0].row_index row_obj.data_id = cell_list[0].data_id # res_table_obj.rows.append(self.merge_cells_to_row(cell_list)) res_table_obj.rows.append(row_obj) self.copy_table_attrs(res_table_obj, table_obj) return res_table_obj @staticmethod def process_delete_add_diff(block_name, sub_type, delete_tables, add_tables, belong_to, head_type=None): def process_graphic_objects(action, cell_list): """ 辅助函数:处理单元格中的图形对象和图片对象。 action: 操作类型('delete' 或 'add') cells_list: 单元格列表 """ diff_items = [] all_merged_ranges = [] for cell_obj in cell_list: if cell_obj.merged_ranges: # 合并单元格只处理一次 if cell_obj.merged_ranges not in all_merged_ranges: all_merged_ranges.append(cell_obj.merged_ranges) else: continue for item_obj in cell_obj.content: if is_instance_of(item_obj, GraphicObject) or 
is_instance_of(item_obj, PictureObject): # 检查是否是图形或图片对象 diff_items.append( DiffItem(action, item_obj._type, sub_type=item_obj._type, block_name=block_name, old=item_obj if action == 'delete' else None, new=None if action == 'delete' else item_obj, belong_to=belong_to) ) return diff_items # filter_duplicate_cells 过滤在一行或者一列中因合并单元格引起的重复 # 相关代码暂时先不启用,可以在后续使用者启用查看是否会引起漏差分的问题在决定是否启用 # 如果在解析端可以处理合并单元格,则不需要过滤,避免冗余处理而降低效率 def filter_duplicate_cells(item,sub_type): """ 根据text和merged_ranges过滤掉cells_list中的合并单元格 Args: item: RowObject or TableObject """ if sub_type != 'table': seen_contents = defaultdict(list) for i in range(len(item.cells) - 1, -1, -1): cell = item.cells[i] cell_merged_ranges = cell.merged_ranges if not cell_merged_ranges: continue cell_text = cell.text if cell_merged_ranges == seen_contents[cell_text]: del item.cells[i] continue seen_contents[cell_text] = cell_merged_ranges else: for row in item.rows: seen_contents = defaultdict(list) for i in range(len(row.cells) - 1, -1, -1): cell = row.cells[i] cell_merged_ranges = cell.merged_ranges if not cell_merged_ranges: continue cell_text = cell.text if cell_merged_ranges == seen_contents[cell_text]: del row.cells[i] continue seen_contents[cell_text] = cell_merged_ranges return item ls_tb_add, ls_tb_delete = [], [] for tb_base_item in delete_tables: # 过滤(行、列)合并单元格 tb_base_item = filter_duplicate_cells(tb_base_item,sub_type) diff_obj = DiffItem('delete', 'table', sub_type=sub_type, block_name=block_name, old=tb_base_item, new=None, belong_to=belong_to) setattr(diff_obj, 'head_type', head_type) ls_tb_delete.append(diff_obj) # 如果是表格、行或列,处理单元格中的内容 if sub_type in ('row', 'col', 'table'): cells_list = tb_base_item.cells if sub_type != 'table' else [ cell for row in tb_base_item.rows for cell in row.cells] ls_tb_delete.extend(process_graphic_objects('delete', cells_list)) for tb_target_item in add_tables: # 过滤(行、列)合并单元格 tb_target_item = filter_duplicate_cells(tb_target_item,sub_type) diff_obj = DiffItem('add', 'table', 
sub_type=sub_type, block_name=block_name, old=None, new=tb_target_item, belong_to=belong_to) setattr(diff_obj, 'head_type', head_type) ls_tb_add.append(diff_obj) # 如果是表格、行或列,处理单元格中的内容 if sub_type in ('row', 'col', 'table'): cells_list = tb_target_item.cells if sub_type != 'table' else [ cell for row in tb_target_item.rows for cell in row.cells] ls_tb_add.extend(process_graphic_objects('add', cells_list)) return ls_tb_delete, ls_tb_add def transpose_table_rows(self, rows): """ 将表格的行进行转置操作,即将行转换为列,列转换为行。 Args: rows (list): 原始表格的行列表,每个元素是一个RowObject对象 Returns: list: 返回转置后的行列表,每个元素是一个RowObject对象 """ # 创建新的行对象列表,数量等于原始表格的最大列数 max_cell_count = 0 for row in rows: if len(row.cells) > max_cell_count: max_cell_count = len(row.cells) t_rows = [RowObject() for _ in range(max_cell_count)] # 遍历原始表格的每一行 for row in rows: # 遍历每一行的单元格 for idx, cell in enumerate(row.cells): # 将单元格添加到转置后的对应行中 t_rows[idx].cells.append(cell) # 为转置后的每一行设置属性 for row in t_rows: # 设置行的坐标为第一个单元格的坐标 row.coordinate = row.cells[0].coordinate # 设置行的数据ID为第一个单元格的数据ID row.data_id = row.cells[0].data_id # 设置行的布局为第一个单元格的布局 row.layout = row.cells[0].layout # 如果第一个单元格有列索引,则设置行的列索引 if isinstance(row.cells[0].col_index, int): row.col_index = row.cells[0].col_index # 如果第一个单元格有行索引,则设置行的行索引 if isinstance(row.cells[0].row_index, int): row.row_index = row.cells[0].row_index # 返回转置后的行列表 return t_rows def find_differences(self, array1: list, array2: list, old_items, new_items, diff_mode='normal'): if isinstance(array1, list): array1 = np.array(array1) if isinstance(array2, list): array2 = np.array(array2) # 确保两个ndarray的shape相同 if array1.shape != array2.shape: raise ValueError("两个ndarray的shape必须相同") diff_type = diff_mode if diff_type == 'normal': # 计算行差异数 row_diff_count = np.sum(~np.all(array1 == array2, axis=1)) # 计算列差异数 col_diff_count = np.sum(~np.all(array1 == array2, axis=0)) # 根据差异数选择差异类型 diff_type = 'col' if col_diff_count < row_diff_count else 'row' # 找出所有行和列的差异项 row_diffs, col_diffs, cell_diffs, cell_diff_points, 
cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs = [ [] for _ in range(9)] if diff_type == 'row': for i in range(array1.shape[0]): if not np.all(array1[i] == array2[i]): # if np.sum(array1[i] != array2[i]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT: # 如果该行只有一个数值不一致,则将这个差异项改为单元格的差异项 for j in range(array1.shape[1]): if array1[i, j] == array2[i, j]: continue diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs( old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS) if diff_point: cell_diffs.append((i, j)) cell_diff_points.append(' '.join(diff_point)) cell_diff_values.append(diff_values) if graphic_diff: graphic_diffs.append(graphic_diff) if picture_diff: picture_diffs.append(picture_diff) # else: # row_diffs.append(i) # row_diff_idx.append(np.where(array1[i] != array2[i])[0].tolist()) else: for j in range(array1.shape[1]): if not np.all(array1[:, j] == array2[:, j]): # if np.sum(array1[:, j] != array2[:,j]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT: # 如果该列只有一个数值不一致,则将这个差异项改为单元格的差异项 for i in range(array1.shape[0]): if array1[i, j] == array2[i, j]: continue diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs( old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS) if diff_point: cell_diffs.append((i, j)) cell_diff_points.append(' '.join(diff_point)) cell_diff_values.append(diff_values) if graphic_diff: graphic_diffs.append(graphic_diff) if picture_diff: picture_diffs.append(picture_diff) # else: # col_diffs.append(j) # col_diff_idx.append(np.where(array1[:, j] != array2[:, j])[0].tolist()) # 返回所有差异类型对应的单元格索引 return diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs @staticmethod def get_cell_chars(cell_obj): chars = [] for text_obj in cell_obj.content: if not is_instance_of(text_obj, TextObject): continue 
def _get_cell_graphic_picture_diff(self, old_cell, new_cell):
    """Diff the graphics and pictures embedded in two matched cells.

    The name of the enclosing document block is looked up by following
    ``layout.parent_ref`` upwards from ``old_cell`` until a
    ``DocumentBlockObject`` is reached. If the chain ends early (no
    ``layout``, ``layout`` is ``None``, or the last object has no
    ``name``) an empty block name is used instead of raising.

    Args:
        old_cell: Cell from the base table.
        new_cell: Cell from the target table.

    Returns:
        tuple: ``(graphic_diff, picture_diff)``, each a list of diff items
        (empty when neither side contains objects of that kind).
    """
    block = old_cell
    while block and not isinstance(block, DocumentBlockObject):
        layout = getattr(block, 'layout', None)
        if layout is None:
            # Parent chain is broken; stop instead of dereferencing None.
            break
        block = getattr(layout, 'parent_ref', None)
    block_name = getattr(block, 'name', '') if block else ''

    (old_graphics, new_graphics), (old_pictures, new_pictures) = \
        self._get_graphic_picture_obj(old_cell, new_cell)

    graphic_diff = []
    picture_diff = []
    if old_graphics or new_graphics:
        graphic_diff = self._compare_cell_diff(old_graphics, new_graphics, 'graphic', block_name)
    if old_pictures or new_pictures:
        picture_diff = self._compare_cell_diff(old_pictures, new_pictures, 'picture', block_name)
    return graphic_diff, picture_diff
def get_cell_content_list(self, cell_obj_lists, with_attr=False):
    """Serialise a grid of cells into comparable strings.

    Each cell becomes ``'text:<text>'``; with ``with_attr`` the non-empty
    values of ``settings.CELL_COMPARE_ATTRS`` (except ``text``) are
    appended as ``'<attr>:<value>'`` parts. A merged cell is serialised
    only the first time its ``(merged_ranges, text)`` pair is met; every
    later occurrence in the whole grid becomes an empty string.

    Args:
        cell_obj_lists: Iterable of rows, each an iterable of cell objects.
        with_attr: Also encode the configured compare attributes.

    Returns:
        list[list[str]]: One string per cell, in the input layout.
    """
    seen_merged = set()

    def render(cell):
        span = getattr(cell, 'merged_ranges', None)
        if span:
            fingerprint = (tuple(span), str(getattr(cell, 'text', '')))
            if fingerprint in seen_merged:
                # Repeated slice of an already emitted merged cell.
                return ''
            seen_merged.add(fingerprint)

        parts = [f'text:{cell.text}']
        if with_attr:
            for attr in settings.CELL_COMPARE_ATTRS:
                if attr == "text":
                    continue
                value = self.get_nest_attr(cell, attr)
                if value not in (None, ''):
                    parts.append(f'{attr}:{str(value)}')
        return '🙉'.join(parts)

    return [[render(cell) for cell in row] for row in cell_obj_lists]
def cell_update_after(self, update_cells):
    """Post-process cell update diffs produced for one table.

    Items that are not merged-cell updates pass through untouched. For
    merged-cell updates:

    * "custom" cells (a head cell whose text equals
      ``settings.SPECIAL_CELL_CONTENT3``) that share a merged range are
      folded into the first item seen for that range by concatenating
      ``text`` and extending ``content``;
    * every other item is dropped when an earlier item has the same
      normalised old/new text and exactly the same old/new merged ranges.

    Args:
        update_cells: List of diff items (``type``, ``data_type``,
            ``sub_type``, ``old``, ``new``).

    Returns:
        list: The filtered items, in their original relative order. A
        falsy input is returned as is.
    """
    if not update_cells:
        return update_cells

    result = []
    # (merged_range, owning item) pairs for custom cells. The owner itself
    # is stored because its position in ``result`` is unrelated to its
    # position in this list once other items are present.
    custom_owners = []
    seen_cells = defaultdict(list)

    def merged_range_of(cell):
        return getattr(cell, 'merged_ranges', []) if cell else []

    def has_merged_range(cell):
        return bool(cell) and bool(getattr(cell, 'merged_ranges', None))

    def normalize_content(cell):
        """Return the cell text stripped and with unified newlines."""
        if not cell:
            return ""
        content_text = str(cell.text) if hasattr(cell, 'text') else ""
        return content_text.strip().replace('\r\n', '\n').replace('\r', '\n')

    def get_cell_key(item):
        """Build the grouping key from both texts and both merged ranges."""
        old_cell = getattr(item, 'old', None)
        new_cell = getattr(item, 'new', None)
        content_key = f"{normalize_content(old_cell)}|{normalize_content(new_cell)}"
        old_range = tuple(old_cell.merged_ranges) if has_merged_range(old_cell) else ()
        new_range = tuple(new_cell.merged_ranges) if has_merged_range(new_cell) else ()
        return f"{content_key}||{old_range}|{new_range}"

    def get_is_custom_cell(cell_obj):
        return any(head.text == settings.SPECIAL_CELL_CONTENT3 for head in cell_obj.get_heads())

    def find_owner(merged_range):
        return next((owner for rng, owner in custom_owners if rng == merged_range), None)

    for item in update_cells:
        is_cell_update = (item.type == 'update' and item.data_type == 'table'
                          and item.sub_type == 'cell')
        if not is_cell_update or not (has_merged_range(item.old) or has_merged_range(item.new)):
            result.append(item)
            continue

        current_old_range = merged_range_of(item.old)
        current_new_range = merged_range_of(item.new)

        # Custom tables: accumulate every slice of one merged cell.
        if item.old and get_is_custom_cell(item.old):
            side, merged_range = 'old', current_old_range
        elif item.new and get_is_custom_cell(item.new):
            side, merged_range = 'new', current_new_range
        else:
            side = None

        if side is not None:
            owner = find_owner(merged_range)
            owner_cell = getattr(owner, side, None) if owner is not None else None
            if owner is None:
                custom_owners.append((merged_range, item))
                result.append(item)
            elif owner_cell is None:
                # The owner has nothing on this side to accumulate into.
                result.append(item)
            else:
                incoming = getattr(item, side)
                owner_cell.text += incoming.text
                owner_cell.content.extend(incoming.content)
            continue

        # Ordinary merged cells: keep only the first of identical updates.
        cell_key = get_cell_key(item)
        is_duplicate = any(
            current_old_range == merged_range_of(result[idx].old)
            and current_new_range == merged_range_of(result[idx].new)
            for idx in seen_cells[cell_key]
        )
        if not is_duplicate:
            seen_cells[cell_key].append(len(result))
            result.append(item)

    return result
merged_ranges 属性 has_merged_ranges = any(hasattr(cell, 'merged_ranges') for cell in cells) # 如果行中的任意一个单元格没有 merged_ranges 属性,则添加到结果中 if not has_merged_ranges: result.append(row) else: merged_rows.append(row) # 处理具有 merged_ranges 的行,过滤重复项 if merged_rows: seen_contents = defaultdict(list) def remove_timestamp(text): return re.sub(r'\d{4}[-/]\d{2}[-/]\d{2}.*?(?=\t|\n|$)', '', text) for index, row in enumerate(merged_rows): # 获取当前行的内容 content = getattr(row, 'new_content' if category == 'add' else 'old_content', None) if content: #确保在处理可能包含非UTF-8编码字符的文本时不会出现解码错误 if isinstance(content, bytes): content = content.decode('utf-8', errors='replace') elif not isinstance(content, str): content = str(content) # 标准化 content cleaned_content = remove_timestamp(content) normalized_content = cleaned_content.strip().replace('\r\n', '\n').replace('\r', '\n') seen_contents[normalized_content].append(index) duplicates_row = {item: indices for item, indices in seen_contents.items() if len(indices) > 1} removed_rows_indices = [] for _, indices in duplicates_row.items(): seen_merged_ranges = set() for i in indices: # 获取要检查的单元格列表 cells = merged_rows[i].new.cells if category == 'add' else merged_rows[i].old.cells for cell in cells: if cell.merged_ranges: merged_range_tuple = tuple(cell.merged_ranges) if merged_range_tuple not in seen_merged_ranges: seen_merged_ranges.add(merged_range_tuple) break else: removed_rows_indices.append(i) break # 添加未被移除的行到结果中 for index, row in enumerate(merged_rows): if index not in removed_rows_indices: result.append(row) return result def pre_process_require(self, old_table, new_table): base_resources = [old_table] target_resources = [new_table] changes_dict = {} # 存储变更信息的字典 # 合并表格并编号(0=变更前,1=变更后) for table_idx, table in enumerate(base_resources + target_resources): col_list = table.get_col_list(col_name=settings.SPECIAL_COLUMN) #'要求廃止' # 在循环外部初始化计数器 be_counter = 1 af_counter = 1 if col_list: for row_index, cell in enumerate(col_list): cell_text = getattr(cell, 
@staticmethod
def _tbl_find_unique(base_tbl, target_tbl, row_idx, col_idx):
    """Tell whether the text at ``target_tbl[row_idx][col_idx]`` occurs elsewhere.

    The text of the addressed target cell is looked for
    1. anywhere in column ``col_idx`` of ``base_tbl`` (only when the first
       base row has that column), and
    2. in column ``col_idx`` of every other row of ``target_tbl``.

    Args:
        base_tbl: Table searched by condition 1.
        target_tbl: Table holding the addressed cell; searched by condition 2.
        row_idx: Row of the addressed cell in ``target_tbl``.
        col_idx: Column of the addressed cell.

    Returns:
        bool: ``True`` when either search finds the text (the caller then
        clears the row); ``False`` otherwise, or when a table is falsy or
        the address is out of range.
    """
    if not target_tbl or not base_tbl:
        return False

    rows = target_tbl.rows
    if row_idx >= len(rows) or col_idx >= len(rows[row_idx].cells):
        return False

    needle = str(rows[row_idx].cells[col_idx].text)

    def holds_needle(row):
        cells = row.cells
        return col_idx < len(cells) and str(cells[col_idx].text) == needle

    base_rows = base_tbl.rows
    if base_rows and col_idx < len(base_rows[0].cells):
        if any(holds_needle(row) for row in base_rows):
            return True

    return any(holds_needle(row) for pos, row in enumerate(rows) if pos != row_idx)
# === Header defaults ===
DEFAULT_HEADER = 'horizontal'
HEADER_SCAN_LIMIT = 3    # only the first three rows are header candidates
MAX_EMPTY_RATIO = 0.1    # a header row must have fewer than 10% empty cells


def is_valid_header(row):
    """Return True when ``row`` has cells and fewer than 10% of them are empty.

    A cell counts as empty when its text is ``None`` or blank and it has
    no content objects.
    """
    if not row or not row.cells:
        return False
    empty_cells = sum(
        1 for cell in row.cells
        if (cell.text is None or str(cell.text).strip() == '') and not cell.content
    )
    return (empty_cells / len(row.cells)) < MAX_EMPTY_RATIO


def _resolve_header(table, label):
    """Set ``head_type``, ``header_row_idx`` and ``head_list`` on one table.

    ``label`` ('old' / 'new') is only used in log messages.
    """
    if getattr(table, 'head_type', None) is None:
        table.head_type = DEFAULT_HEADER

    # The first row is the header unless it turns out to be mostly empty.
    table.header_row_idx = 0
    if table.head_type != 'horizontal' or not table.rows:
        return

    if not is_valid_header(table.rows[0]):
        logger.warning(f"第一行无效表头 in {label} table {table.data_id}")
        for idx in range(1, min(HEADER_SCAN_LIMIT, len(table.rows))):
            if is_valid_header(table.rows[idx]):
                table.header_row_idx = idx
                logger.info(f"设置第{idx + 1}行为表头 in {label} table")
                break
        else:
            # No usable header among the candidates.
            table.head_type = None
            logger.warning(f"未找到有效表头 in {label} table {table.data_id}")
            return

    # Store the header ROW INDEX, not the cell texts. The reported
    # traceback fails in get_heads() at ``self.rows[index].cells`` with
    # "list indices must be integers or slices, not str", i.e. a string
    # reached that index.
    # NOTE(review): this assumes get_heads() iterates ``head_list`` as row
    # indexes -- confirm against kotei_omp.data.table before merging.
    table.head_list = [table.header_row_idx]


for _table, _label in ((old_table, 'old'), (new_table, 'new')):
    _resolve_header(_table, _label)
12-05
import logging,time,re import numpy as np from collections import defaultdict from itertools import zip_longest from kotei_omp.data import DocumentBlockObject from kotei_omc.comparers.picture_comparer import PictureComparer, GraphicComparer from kotei_omc.comparers.base_comparer import BaseComparer from kotei_omc.comparers.plugins import register_plugin from kotei_omc.data.diff import DiffItem from kotei_omp.data import TextObject, GraphicObject, PictureObject, StyleObject, RunObject from kotei_omp.data.table import CellObject, RowObject, TableObject from kotei_omc.settings import settings from kotei_omc.utils.type_checker import is_instance_of from kotei_omc.middlewares.table_middlewares import CustomTableStrategyMiddleware logger = logging.getLogger("req_diff") @register_plugin("table") class TableComparer(BaseComparer): def get_block_resource(self, block, belong_to='block'): return self.do_get_block_resource(block, belong_to, 'tables', TableObject) def compare(self, block_name, base, target, belong_to=None): t0 = time.time() # 表格匹配 logger.info(f'start match table, block_name: {block_name}, base_num: {len(base)}, target_num: {len(target)}') match_func = CustomTableStrategyMiddleware(self._path_base).match if settings.MATCH_WITH_CHAPTER: tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_with_chapter(base, target,match_func) else: tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_normal(base, target,match_func) logger.info('finish match table') # 表格新增删除 ls_tb_delete, ls_tb_add = self.process_delete_add_diff(block_name, 'table', tb_delete_list, tb_add_list, belong_to=belong_to) # 表格差分 ls_tb_update = [] for old_table, new_table in old_new_tb_matched: # 要求废止特殊处理 old_table, new_table = self.pre_process_require(old_table, new_table) # 表格位置差分 if not old_table.is_same_pos(new_table): ls_tb_update.append(DiffItem('update', 'table', sub_type='table', block_name=block_name, old=old_table, new=new_table, 
belong_to=belong_to,diff_point='coordinate_desc')) # 对匹配的每个表格进行对比 part_delete, part_add, part_update = self.compare_table(block_name, old_table, new_table,belong_to=belong_to) ls_tb_delete.extend(self.row_del_add_after(part_delete,category='delete')) ls_tb_add.extend(self.row_del_add_after(part_add,category='add')) ls_tb_update.extend(self.cell_update_after(part_update)) t1 = time.time() logger.info(f'Time Cost:table diff {block_name} {t1 - t0}') return {'add': ls_tb_add, 'delete': ls_tb_delete, 'update': ls_tb_update} @staticmethod def copy_table_attrs(to_table, from_table): for attr_name in ('layout', 'style', 'border', 'coordinate', 'data_id'): setattr(to_table, attr_name, getattr(from_table, attr_name)) @staticmethod def fill_visual_merged_cells(table): num_rows = len(table.rows) if num_rows == 0: return num_cols = max([len(row.cells) for row in table.rows]) if num_cols == 0: return # 判断是否有边界 def is_bordered(side): return side.border_style is not None for col in range(num_cols): row_ptr = 0 while row_ptr < num_rows: cell = table.rows[row_ptr].cells[col] top_border_exists = is_bordered(cell.border.border_top) if row_ptr == 0 or top_border_exists: start_row = row_ptr end_row = start_row while end_row < num_rows: current_cell = table.rows[end_row].cells[col] bottom_border_exists = is_bordered(current_cell.border.border_bottom) # import ipdb;ipdb.set_trace() if bottom_border_exists or end_row == num_rows - 1: break else: end_row += 1 block_text = None block_content = None for r in range(start_row, end_row + 1): val = table.rows[r].cells[col].text if val is not None and str(val).strip() != "": block_text = val block_content = table.rows[r].cells[col].content break if block_text is not None: merged_ranges = [start_row, col, end_row, col] for r in range(start_row, end_row + 1): val = table.rows[r].cells[col].text if val is None or str(val).strip() == "": table.rows[r].cells[col].content = block_content table.rows[r].cells[col].text = block_text # 添加 merged_ranges 属性 
if not table.rows[r].cells[col].merged_ranges: table.rows[r].cells[col].merged_ranges = merged_ranges row_ptr = end_row + 1 else: row_ptr += 1 def compare_table(self, block_name, old_table, new_table, belong_to): logger.info(f"start compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}") # === 设置默认表头 === DEFAULT_HEADER = 'horizontal' # 默认水平表头 if not hasattr(old_table, 'head_type') or old_table.head_type is None: old_table.head_type = DEFAULT_HEADER logger.debug(f"Set default head_type for old_table: {DEFAULT_HEADER}") if not hasattr(new_table, 'head_type') or new_table.head_type is None: new_table.head_type = DEFAULT_HEADER logger.debug(f"Set default head_type for new_table: {DEFAULT_HEADER}") # === 新增:检查第一行是否有效表头(空格数小于10%)=== def is_valid_header(row): """检查行是否有效表头(空格数小于10%)""" if not row or not row.cells: return False total_cells = len(row.cells) empty_cells = 0 for cell in row.cells: # 检查单元格是否为空:文本为空且没有其他内容(如图形) if (cell.text is None or str(cell.text).strip() == '') and not cell.content: empty_cells += 1 # 计算空格比例 empty_ratio = empty_cells / total_cells return empty_ratio < 0.1 # 空格比例小于10% # 检查旧表的第一行 if old_table.head_type == 'horizontal' and old_table.rows: if not is_valid_header(old_table.rows[0]): logger.warning(f"Invalid header in old table {old_table.data_id}: too many empty cells") old_table.head_type = None # 标记为无效表头 # 检查新表的第一行 if new_table.head_type == 'horizontal' and new_table.rows: if not is_valid_header(new_table.rows[0]): logger.warning(f"Invalid header in new table {new_table.data_id}: too many empty cells") new_table.head_type = None # 标记为无效表头 # 使表格内列数一致 self.align_table_col(old_table, new_table) # 使表格内列数一致 self.align_table_col(old_table, new_table) # 表格中存在大量视觉上merge但是实际未合并的空格,需要将空格赋值为正确的文本,防止影响相似度匹配 self.fill_visual_merged_cells(old_table) self.fill_visual_merged_cells(new_table) if old_table.head_type == new_table.head_type == 'horizontal': old_col_table, new_col_table = self.transpose_table(old_table, new_table) else: 
if old_table.head_type == 'vertical': new_table.head_list = old_table.head_list new_table.head_type = 'vertical' elif new_table.head_type == 'vertical': old_table.head_list = new_table.head_list old_table.head_type = 'vertical' old_col_table, new_col_table = old_table, new_table # 列匹配 del_cols, add_cols = old_col_table.rows, new_col_table.rows col_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_cols, add_cols,is_col=True, head_indexes=[old_table.head_list,new_table.head_list]) if col_matched: matched_old_cols, matched_new_cols = list(zip(*list(col_matched))) del_cols = [old_col for old_col in old_col_table.rows if old_col not in matched_old_cols] add_cols = [new_col for new_col in new_col_table.rows if new_col not in matched_new_cols] sub_type = 'col' if old_table.head_type == 'horizontal' else 'row' ls_col_delete, ls_col_add = self.process_delete_add_diff(block_name, sub_type, del_cols, add_cols, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的列组合新的表,得到列一致的两个表 if col_matched: old_col_indexes,new_col_indexes =[],[] for old_col, new_col in col_matched: old_col_indexes.append(old_col_table.rows.index(old_col)) new_col_indexes.append(new_col_table.rows.index(new_col)) old_equal_col_table = self.choice_cols(old_table, old_col_indexes) new_equal_col_table = self.choice_cols(new_table, new_col_indexes) else: return ls_col_delete, ls_col_add, [] # 行匹配 del_rows, add_rows = old_equal_col_table.rows, new_equal_col_table.rows row_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_rows, add_rows, is_col=False) if row_matched: matched_old_rows, matched_new_rows = list(zip(*list(row_matched))) del_rows_indexes = [idx for idx, old_row in enumerate(old_equal_col_table.rows) if old_row not in matched_old_rows] add_rows_indexes = [idx for idx, new_row in enumerate(new_equal_col_table.rows) if new_row not in matched_new_rows] # 使用没有重组前的表,横表头直接处理,竖表头需要转置 if old_table.head_type == new_table.head_type == 'horizontal': del_rows = 
[old_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_table.rows[idx] for idx in add_rows_indexes] else: old_transpose_table = self.choice_cols(old_table, list(range(len(old_table.rows)))) new_transpose_table = self.choice_cols(new_table, list(range(len(new_table.rows)))) del_rows = [old_transpose_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_transpose_table.rows[idx] for idx in add_rows_indexes] sub_type = 'row' if old_table.head_type == 'horizontal' else 'col' ls_row_delete, ls_row_add = self.process_delete_add_diff(block_name, sub_type, del_rows, add_rows, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的行组合新的表,得到行一致的两个表 if row_matched: old_equal_row_table, new_equal_row_table = TableObject(), TableObject() old_equal_row_table.rows = list(matched_old_rows) old_equal_row_table.head_type = old_table.head_type self.copy_table_attrs(old_equal_row_table, old_table) new_equal_row_table.rows = list(matched_new_rows) new_equal_row_table.head_type = new_table.head_type self.copy_table_attrs(new_equal_row_table, new_table) # 查找行变更、列变更、单元格变更 ls_row_update, ls_col_update, ls_cell_update = self.compare_ordered_tables(block_name,old_equal_row_table, new_equal_row_table,belong_to=belong_to) else: ls_row_update, ls_col_update, ls_cell_update = [], [], [] part_delete = ls_row_delete + ls_col_delete part_add = ls_row_add + ls_col_add part_update = ls_row_update + ls_col_update + ls_cell_update logger.info(f"finish compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}") return part_delete, part_add, part_update def transpose_table(self, old_table, new_table): """ 将表格进行转置操作,即将行转换为列,列转换为行。 Args: old_table (TableObject): 原始表格对象 new_table (TableObject): 目标表格对象 Returns: tuple: 返回转置后的两个表格对象 (old_col_table, new_col_table) """ # 创建新的表格对象用于存储转置后的数据 old_col_table, new_col_table = TableObject(), TableObject() # 对原始表格的行进行转置操作 old_col_table.rows = self.transpose_table_rows(old_table.rows) # 根据原始表格的表头类型,设置转置后的表头类型 
old_col_table.head_type = 'vertical' if old_table.head_type == 'horizontal' else 'horizontal' # 复制原始表格的属性到转置后的表格 self.copy_table_attrs(old_col_table, old_table) # 对目标表格的行进行转置操作 new_col_table.rows = self.transpose_table_rows(new_table.rows) # 根据目标表格的表头类型,设置转置后的表头类型 new_col_table.head_type = 'vertical' if new_table.head_type == 'horizontal' else 'horizontal' # 复制目标表格的属性到转置后的表格 self.copy_table_attrs(new_col_table, new_table) # 返回转置后的两个表格对象 return old_col_table, new_col_table def compare_ordered_tables(self, block_name, old_table_obj, new_table_obj, belong_to): row_updates, col_updates, cell_updates = [], [], [] # 获取新旧行数据 old_rows = getattr(old_table_obj, 'rows', []) new_rows = getattr(new_table_obj, 'rows', []) old_cells_list = [row.cells for row in old_rows] new_cells_list = [row.cells for row in new_rows] # 获取内容用于对比 old_content_cells_list = self.get_cell_content_list(old_cells_list, settings.DIFF_ATTR) new_content_cells_list = self.get_cell_content_list(new_cells_list, settings.DIFF_ATTR) # 删除完全一样的匹配 for row_index in range(len(old_content_cells_list) - 1, -1, -1): # 如果新旧行内容相同,则删除该行 # 之后可以在这里增加原子操作逻辑,避免删除不同步 if old_content_cells_list[row_index] == new_content_cells_list[row_index]: old_content_cells_list.pop(row_index) new_content_cells_list.pop(row_index) old_cells_list.pop(row_index) new_cells_list.pop(row_index) #原子一致性检查 flag = False if len(old_content_cells_list) ==len(new_content_cells_list)==len(old_cells_list) == len(new_cells_list): flag = True if not flag: logger.warning(f"{block_name} old_table_obj: {old_table_obj}, new_table_obj: {new_table_obj}; delete operator is not atomic; all the cells list will involved in finding differences computation") if not old_content_cells_list: return [], [], [] # 查找差异 diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, \ row_diff_idx, col_diff_idx, graphic_diff, picture_diff = self.find_differences( old_content_cells_list, new_content_cells_list, old_cells_list, new_cells_list) # 抽取单元格内的图形图像差分 
for item in graphic_diff + picture_diff: if item: cell_updates.extend(item) # 处理单元格差分 for idx, (cell_diff_idx, diff_point, diff_value) in enumerate( zip(cell_diffs, cell_diff_points, cell_diff_values)): try: # old = self.get_element_by_index(old_cells_list, cell_diff_idx) # new = self.get_element_by_index(new_cells_list, cell_diff_idx) old, new = old_cells_list, new_cells_list for cell_idx in cell_diff_idx: old = old[cell_idx] new = new[cell_idx] except IndexError: continue # 忽略非法索引 cell_diff_obj = DiffItem( 'update', 'table', 'cell', block_name=block_name, old=old, new=new, belong_to=belong_to, diff_point=diff_point, diff_values=diff_value ) cell_updates.append(cell_diff_obj) # 处理行差分 # if diff_type == 'row': # for row_idx, row_diff_col_idx in zip(row_diffs, row_diff_idx): # try: # old_row = [old_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx] # new_row = [new_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx] # except IndexError: # continue # # row_diff_item = DiffItem( # 'update', 'table', 'row', # block_name=block_name, # old=self.merge_cells_to_row(old_row), # new=self.merge_cells_to_row(new_row), # belong_to=belong_to) # row_updates.append(row_diff_item) # 处理列差分 # elif diff_type == 'col': # for col_idx, col_diff_col_idx in zip(col_diffs, col_diff_idx): # try: # old_col = [old_cells_list[cell_idx][col_idx] for cell_idx in col_diff_col_idx] # new_col = [new_cells_list[cell_idx][col_idx] for cell_idx in col_diff_col_idx] # except IndexError: # continue # # col_diff_item = DiffItem( # 'update', 'table', 'col', # block_name=block_name, # old=self.merge_cells_to_row(old_col), # new=self.merge_cells_to_row(new_col), # belong_to=belong_to # ) # col_updates.append(col_diff_item) return row_updates, col_updates, cell_updates def choice_cols(self, table_obj, col_indexes): if table_obj.head_type == 'horizontal': rows = [] for row_obj in table_obj.rows: cells = [] for cel_idx in col_indexes: cells.append(row_obj.cells[cel_idx]) 
rows.append(cells) else: rows = [[] for _ in range(len(table_obj.rows[0].cells))] for cel_idx in col_indexes: for idx, cell in enumerate(table_obj.rows[cel_idx].cells): rows[idx].append(cell) res_table_obj = TableObject() for cell_list in rows: row_obj = RowObject() if cell_list: row_obj.cells = cell_list row_obj.coordinate = cell_list[0].coordinate # 对cell_obj的layout.parent_ref进行判断,有值在进行赋值 if cell_list[0].layout.parent_ref: row_obj.layout = cell_list[0].layout.parent_ref.layout row_obj.style = cell_list[0].layout.parent_ref.style row_obj.border = cell_list[0].layout.parent_ref.border row_obj.row_index = cell_list[0].row_index row_obj.data_id = cell_list[0].data_id # res_table_obj.rows.append(self.merge_cells_to_row(cell_list)) res_table_obj.rows.append(row_obj) self.copy_table_attrs(res_table_obj, table_obj) return res_table_obj @staticmethod def process_delete_add_diff(block_name, sub_type, delete_tables, add_tables, belong_to, head_type=None): def process_graphic_objects(action, cell_list): """ 辅助函数:处理单元格中的图形对象和图片对象。 action: 操作类型('delete' 或 'add') cells_list: 单元格列表 """ diff_items = [] all_merged_ranges = [] for cell_obj in cell_list: if cell_obj.merged_ranges: # 合并单元格只处理一次 if cell_obj.merged_ranges not in all_merged_ranges: all_merged_ranges.append(cell_obj.merged_ranges) else: continue for item_obj in cell_obj.content: if is_instance_of(item_obj, GraphicObject) or is_instance_of(item_obj, PictureObject): # 检查是否是图形或图片对象 diff_items.append( DiffItem(action, item_obj._type, sub_type=item_obj._type, block_name=block_name, old=item_obj if action == 'delete' else None, new=None if action == 'delete' else item_obj, belong_to=belong_to) ) return diff_items # filter_duplicate_cells 过滤在一行或者一列中因合并单元格引起的重复 # 相关代码暂时先不启用,可以在后续使用者启用查看是否会引起漏差分的问题在决定是否启用 # 如果在解析端可以处理合并单元格,则不需要过滤,避免冗余处理而降低效率 def filter_duplicate_cells(item,sub_type): """ 根据text和merged_ranges过滤掉cells_list中的合并单元格 Args: item: RowObject or TableObject """ if sub_type != 'table': seen_contents = defaultdict(list) for 
i in range(len(item.cells) - 1, -1, -1): cell = item.cells[i] cell_merged_ranges = cell.merged_ranges if not cell_merged_ranges: continue cell_text = cell.text if cell_merged_ranges == seen_contents[cell_text]: del item.cells[i] continue seen_contents[cell_text] = cell_merged_ranges else: for row in item.rows: seen_contents = defaultdict(list) for i in range(len(row.cells) - 1, -1, -1): cell = row.cells[i] cell_merged_ranges = cell.merged_ranges if not cell_merged_ranges: continue cell_text = cell.text if cell_merged_ranges == seen_contents[cell_text]: del row.cells[i] continue seen_contents[cell_text] = cell_merged_ranges return item ls_tb_add, ls_tb_delete = [], [] for tb_base_item in delete_tables: # 过滤(行、列)合并单元格 tb_base_item = filter_duplicate_cells(tb_base_item,sub_type) diff_obj = DiffItem('delete', 'table', sub_type=sub_type, block_name=block_name, old=tb_base_item, new=None, belong_to=belong_to) setattr(diff_obj, 'head_type', head_type) ls_tb_delete.append(diff_obj) # 如果是表格、行或列,处理单元格中的内容 if sub_type in ('row', 'col', 'table'): cells_list = tb_base_item.cells if sub_type != 'table' else [ cell for row in tb_base_item.rows for cell in row.cells] ls_tb_delete.extend(process_graphic_objects('delete', cells_list)) for tb_target_item in add_tables: # 过滤(行、列)合并单元格 tb_target_item = filter_duplicate_cells(tb_target_item,sub_type) diff_obj = DiffItem('add', 'table', sub_type=sub_type, block_name=block_name, old=None, new=tb_target_item, belong_to=belong_to) setattr(diff_obj, 'head_type', head_type) ls_tb_add.append(diff_obj) # 如果是表格、行或列,处理单元格中的内容 if sub_type in ('row', 'col', 'table'): cells_list = tb_target_item.cells if sub_type != 'table' else [ cell for row in tb_target_item.rows for cell in row.cells] ls_tb_add.extend(process_graphic_objects('add', cells_list)) return ls_tb_delete, ls_tb_add def transpose_table_rows(self, rows): """ 将表格的行进行转置操作,即将行转换为列,列转换为行。 Args: rows (list): 原始表格的行列表,每个元素是一个RowObject对象 Returns: list: 返回转置后的行列表,每个元素是一个RowObject对象 """ # 
创建新的行对象列表,数量等于原始表格的最大列数 max_cell_count = 0 for row in rows: if len(row.cells) > max_cell_count: max_cell_count = len(row.cells) t_rows = [RowObject() for _ in range(max_cell_count)] # 遍历原始表格的每一行 for row in rows: # 遍历每一行的单元格 for idx, cell in enumerate(row.cells): # 将单元格添加到转置后的对应行中 t_rows[idx].cells.append(cell) # 为转置后的每一行设置属性 for row in t_rows: # 设置行的坐标为第一个单元格的坐标 row.coordinate = row.cells[0].coordinate # 设置行的数据ID为第一个单元格的数据ID row.data_id = row.cells[0].data_id # 设置行的布局为第一个单元格的布局 row.layout = row.cells[0].layout # 如果第一个单元格有列索引,则设置行的列索引 if isinstance(row.cells[0].col_index, int): row.col_index = row.cells[0].col_index # 如果第一个单元格有行索引,则设置行的行索引 if isinstance(row.cells[0].row_index, int): row.row_index = row.cells[0].row_index # 返回转置后的行列表 return t_rows def find_differences(self, array1: list, array2: list, old_items, new_items, diff_mode='normal'): if isinstance(array1, list): array1 = np.array(array1) if isinstance(array2, list): array2 = np.array(array2) # 确保两个ndarray的shape相同 if array1.shape != array2.shape: raise ValueError("两个ndarray的shape必须相同") diff_type = diff_mode if diff_type == 'normal': # 计算行差异数 row_diff_count = np.sum(~np.all(array1 == array2, axis=1)) # 计算列差异数 col_diff_count = np.sum(~np.all(array1 == array2, axis=0)) # 根据差异数选择差异类型 diff_type = 'col' if col_diff_count < row_diff_count else 'row' # 找出所有行和列的差异项 row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs = [ [] for _ in range(9)] if diff_type == 'row': for i in range(array1.shape[0]): if not np.all(array1[i] == array2[i]): # if np.sum(array1[i] != array2[i]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT: # 如果该行只有一个数值不一致,则将这个差异项改为单元格的差异项 for j in range(array1.shape[1]): if array1[i, j] == array2[i, j]: continue diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs( old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS) if diff_point: cell_diffs.append((i, j)) 
cell_diff_points.append(' '.join(diff_point)) cell_diff_values.append(diff_values) if graphic_diff: graphic_diffs.append(graphic_diff) if picture_diff: picture_diffs.append(picture_diff) # else: # row_diffs.append(i) # row_diff_idx.append(np.where(array1[i] != array2[i])[0].tolist()) else: for j in range(array1.shape[1]): if not np.all(array1[:, j] == array2[:, j]): # if np.sum(array1[:, j] != array2[:,j]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT: # 如果该列只有一个数值不一致,则将这个差异项改为单元格的差异项 for i in range(array1.shape[0]): if array1[i, j] == array2[i, j]: continue diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs( old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS) if diff_point: cell_diffs.append((i, j)) cell_diff_points.append(' '.join(diff_point)) cell_diff_values.append(diff_values) if graphic_diff: graphic_diffs.append(graphic_diff) if picture_diff: picture_diffs.append(picture_diff) # else: # col_diffs.append(j) # col_diff_idx.append(np.where(array1[:, j] != array2[:, j])[0].tolist()) # 返回所有差异类型对应的单元格索引 return diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs @staticmethod def get_cell_chars(cell_obj): chars = [] for text_obj in cell_obj.content: if not is_instance_of(text_obj, TextObject): continue chars.extend(text_obj.get_chars()) return chars def _compare_cell_diff(self, base, target, data_type, block_name=''): """ 对比单元格图像的方法 """ result = [] if data_type == 'graphic': cp_obj = GraphicComparer(self._base_block_mapping, self._target_block_mapping, self._path_base, self._path_target) else: cp_obj = PictureComparer(self._base_block_mapping, self._target_block_mapping, self._path_base, self._path_target) item_result = cp_obj.compare(block_name, base, target, 'cell', True) result.extend(item_result['add']) result.extend(item_result['delete']) result.extend(item_result['update']) return result @staticmethod def 
_get_graphic_picture_obj(old_cell, new_cell): base_graphic = [] target_graphic = [] base_picture = [] target_picture = [] for base_item in old_cell.content: if is_instance_of(base_item, GraphicObject): base_graphic.append(base_item) elif is_instance_of(base_item, PictureObject): base_picture.append(base_item) for new_item in new_cell.content: if is_instance_of(new_item, GraphicObject): target_graphic.append(new_item) elif is_instance_of(new_item, PictureObject): target_picture.append(new_item) return [(base_graphic, target_graphic), (base_picture, target_picture)] def _get_cell_graphic_picture_diff(self, old_cell, new_cell): """ 对比单元格图形图像的方法 """ graphic_diff = [] picture_diff = [] block = old_cell while not isinstance(block, DocumentBlockObject) and block and hasattr(block, 'layout'): block = block.layout.parent_ref block_name = block.name if block else '' graphic_obj, picture_obj = self._get_graphic_picture_obj(old_cell, new_cell) if graphic_obj[0] or graphic_obj[1]: graphic_diff = self._compare_cell_diff(graphic_obj[0], graphic_obj[1], 'graphic', block_name) if picture_obj[0] or picture_obj[1]: picture_diff = self._compare_cell_diff(picture_obj[0], picture_obj[1], 'picture', block_name) return graphic_diff, picture_diff def get_cell_not_equal_attrs(self, old_cell, new_cell, compare_attrs): diff_attrs = [] diff_values = [] if getattr(old_cell, 'auto_number', None) and getattr(new_cell, 'auto_number', None): return [], [], [], [] if old_cell.text != new_cell.text: diff_attrs.append('text') diff_values.append((old_cell.text, new_cell.text)) else: # 直接在对象上取值的属性 # direct_attr = ['style.background_color', 'border.border_top.border_style', 'style.background_color', # 'border.border_bottom.border_style', 'border.border_left.border_style', # 'border.border_right.border_style', 'style.background_style'] direct_attr = ['style.background_color', 'style.background_style'] attrs, values = self.get_not_equal_attrs(old_cell, new_cell, direct_attr) diff_attrs.extend(attrs) 
diff_values.extend(values) for old_char, new_char in zip_longest(self.get_cell_chars(old_cell), self.get_cell_chars(new_cell), fillvalue=None): if old_char is None or new_char is None: diff_attrs.append('text') diff_values.append((str(old_char), str(new_char))) else: attrs, values = self.get_not_equal_attrs(old_char, new_char, compare_attrs) diff_attrs.extend(attrs) diff_values.extend(values) # 单元格增加图形图像的比较 graphic_diff, picture_diff = self._get_cell_graphic_picture_diff(old_cell, new_cell) unique_diff_attrs = list(set(diff_attrs)) unique_not_equal_values = [diff_values[diff_attrs.index(v)] for v in unique_diff_attrs] return unique_diff_attrs, unique_not_equal_values, graphic_diff, picture_diff def get_cell_content_list(self, cell_obj_lists, with_attr=False): content_lists = [] processed_merged_ranges = set() for cell_obj_list in cell_obj_lists: row_content_list = [] for cell_obj in cell_obj_list: # 检查是否是合并单元格且已经处理过 if hasattr(cell_obj, 'merged_ranges') and cell_obj.merged_ranges: # 创建一个基于合并范围和内容的唯一键 merged_key = (tuple(cell_obj.merged_ranges), str(getattr(cell_obj, 'text', ''))) if merged_key in processed_merged_ranges: # 如果已经处理过,设置为空字符串 row_content_list.append('') # 直接添加空字符串到结果中 continue else: # 如果是合并单元格但未处理过,标记为已处理 processed_merged_ranges.add(merged_key) cell_contents = [f'text:{cell_obj.text}'] if with_attr: attr_list = settings.CELL_COMPARE_ATTRS for attr in attr_list: if attr == "text": continue attr_value = self.get_nest_attr(cell_obj, attr) if attr_value not in (None, ''): cell_contents.append(f'{attr}:{str(attr_value)}') row_content_list.append('🙉'.join(cell_contents)) content_lists.append(row_content_list) return content_lists def get_nest_attr(self, obj, nest_attr): if is_instance_of(obj, CellObject): result_attr = [] # 特殊处理单元格背景色 # if nest_attr in ('style.background_color', 'border.border_top.border_style', # 'border.border_bottom.border_style', 'border.border_left.border_style', # 'border.border_right.border_style', 'style.background_style'): if 
nest_attr in ('style.background_color', 'style.background_style'): return self.get_target_attr(obj, nest_attr) if nest_attr == 'font_background_color': nest_attr = 'style.background_color' for item_obj in obj.content: if is_instance_of(item_obj, GraphicObject) and nest_attr == 'graphic': for item_attr in settings.PICTURE_COMPARE_ATTRS: run_attr = self.get_target_attr(item_obj, item_attr) if run_attr and str(run_attr) not in result_attr: result_attr.append(str(run_attr)) graphic_text_obj = getattr(item_obj, 'text_obj', None) if graphic_text_obj and graphic_text_obj.text: text_attr_list = [] for text_attr in settings.TEXT_COMPARE_ATTRS: for run_obj in graphic_text_obj.runs: attr_val = self.get_target_attr(run_obj, text_attr) if attr_val and str(attr_val) not in text_attr_list: text_attr_list.append(str(attr_val)) if text_attr_list: result_attr.extend(text_attr_list) elif is_instance_of(item_obj, PictureObject) and nest_attr == 'picture': for item_attr in settings.PICTURE_COMPARE_ATTRS: run_attr = self.get_target_attr(item_obj, item_attr) if run_attr and str(run_attr) not in result_attr: result_attr.append(str(run_attr)) else: if is_instance_of(item_obj, TextObject): for run_obj in item_obj.runs: run_attr = self.get_target_attr(run_obj, nest_attr) if run_attr and str(run_attr) not in result_attr: result_attr.append(str(run_attr)) elif is_instance_of(item_obj, RunObject): run_attr = self.get_target_attr(item_obj, nest_attr) if run_attr and str(run_attr) not in result_attr: result_attr.append(str(run_attr)) return "".join(result_attr) elif is_instance_of(obj, TextObject): result_attr = [] for run_obj in obj.runs: run_attr = self.get_target_attr(run_obj, nest_attr) if run_attr and str(run_attr) not in result_attr: result_attr.append(str(run_attr)) else: return self.get_target_attr(obj, nest_attr) @staticmethod def get_target_attr(obj, nest_attr): nest_attrs = nest_attr.split('.') attr_str = nest_attrs.pop(0) base_attr = getattr(obj, attr_str, None) while base_attr and 
nest_attrs: attr_str = nest_attrs.pop(0) base_attr = getattr(base_attr, attr_str, None) return base_attr # @staticmethod # def merge_cells_to_row(cell_list): # row_obj = RowObject() # # if cell_list: # row_obj.cells = cell_list # row_obj.coordinate = cell_list[0].coordinate # # 对cell_obj的layout.parent_ref进行判断,有值在进行赋值 # if cell_list[0].layout.parent_ref: # row_obj.layout = cell_list[0].layout.parent_ref.layout # row_obj.style = cell_list[0].layout.parent_ref.style # row_obj.border = cell_list[0].layout.parent_ref.border # row_obj.row_index = cell_list[0].row_index # row_obj.data_id = cell_list[0].data_id # return row_obj @staticmethod def align_table_col(base_table, target_table): base_max_col_count = max([len(row.cells) for row in base_table.rows]) target_max_col_count = max([len(row.cells) for row in target_table.rows]) for base_row in base_table.rows: if len(base_row.cells) != base_max_col_count: # 匹配行的列数不一致,补齐缺失的cell add_col_count = abs(len(base_row.cells) - base_max_col_count) base_row.cells.extend([CellObject() for _ in range(add_col_count)]) for target_row in target_table.rows: if len(target_row.cells) != target_max_col_count: # 匹配行的列数不一致,补齐缺失的cell add_col_count = abs(len(target_row.cells) - target_max_col_count) target_row.cells.extend([CellObject() for _ in range(add_col_count)]) def cell_update_after(self, update_cells): """ 单元格的变更后处理, 只有在content和merged_ranges都一样的情况下才过滤重复项 :return: """ if not update_cells: return update_cells result = [] custom_cells_merged_ranges = list() seen_cells = defaultdict(list) def normalize_content(cell): """标准化单元格内容用于比较""" if not cell: return "" # 获取文本内容并标准化 content_text = str(cell.text) if hasattr(cell, 'text') else "" # 标准化换行符 normalized = content_text.strip().replace('\r\n', '\n').replace('\r', '\n') return normalized def get_cell_key(item): """生成用于比较的键""" old_cell = getattr(item, 'old', None) new_cell = getattr(item, 'new', None) # 获取内容键 old_content = normalize_content(old_cell) new_content = normalize_content(new_cell) 
content_key = f"{old_content}|{new_content}" # 获取合并范围键 old_range = tuple(old_cell.merged_ranges) if old_cell and hasattr(old_cell, 'merged_ranges') and old_cell.merged_ranges else () new_range = tuple(new_cell.merged_ranges) if new_cell and hasattr(new_cell, 'merged_ranges') and new_cell.merged_ranges else () range_key = f"{old_range}|{new_range}" return f"{content_key}||{range_key}" def get_is_custom_cell(cell_obj): for c_obj in cell_obj.get_heads(): if c_obj.text == settings.SPECIAL_CELL_CONTENT3: return True for item in update_cells: # 如果不是单元格更新或者没有old/new对象,直接添加到结果中 if (item.type != 'update' or item.data_type != 'table' or item.sub_type != 'cell' or (not item.old or not item.old.merged_ranges) and (not item.new or not item.new.merged_ranges)): result.append(item) continue current_old_range = getattr(item.old, 'merged_ranges', []) if item.old else [] current_new_range = getattr(item.new, 'merged_ranges', []) if item.new else [] # 特殊定制的表格累加处理 if item.old and get_is_custom_cell(item.old): if current_old_range not in custom_cells_merged_ranges: custom_cells_merged_ranges.add(current_old_range) result.append(item) else: existing_idx = custom_cells_merged_ranges.index(current_old_range) result[existing_idx].old.text += item.old.text result[existing_idx].old.content.extend(item.old.content) elif item.new and get_is_custom_cell(item.new): if current_new_range not in custom_cells_merged_ranges: custom_cells_merged_ranges.append(current_new_range) result.append(item) else: existing_idx = custom_cells_merged_ranges.index(current_new_range) result[existing_idx].new.text += item.new.text result[existing_idx].new.content.extend(item.new.content) # 检查是否只有单侧有合并范围, 如果只有单侧有合并范围,则不视为重复 # elif len(current_old_range) <4 or len(current_new_range)<4: # result.append(item) else: # 处理普通单元格 - 进行去重检查 # 生成用于比较的键 cell_key = get_cell_key(item) # 检查是否已经存在相同的键 is_duplicate = False for existing_idx in seen_cells[cell_key]: existing_item = result[existing_idx] # 获取当前和已存在项目的合并范围 
def row_del_add_after(self, part, category='add'):
    """
    Drop added/deleted row diffs that are duplicates caused by merged cells.

    Items whose payload is not a RowObject (picture and graphic diffs travel
    in the same list) and rows in which no cell exposes a ``merged_ranges``
    attribute are passed through untouched. The remaining rows are grouped by
    their normalised ``new_content`` / ``old_content`` text: a date
    (YYYY-MM-DD or YYYY/MM/DD) and whatever follows it up to the next tab,
    newline or end of text is removed, surrounding whitespace is stripped and
    line endings are unified. Rows with empty content are never grouped.
    Inside a group of two or more rows, a row is discarded when the first
    cell with a non-empty ``merged_ranges`` repeats a range already seen in
    that group, so only the first such row survives.

    Known limitation: two rows with identical content that merely share one
    merged cell can still be discarded by mistake. A complete fix needs the
    full range of every cell so that whole-row redundancy can be checked.

    :param part: list of diff items to filter
    :param category: 'add' (inspect ``item.new``) or 'delete' (inspect ``item.old``)
    :return: ``part`` itself when it is empty or ``category`` is unsupported,
             otherwise a new list holding the pass-through items followed by
             the surviving merged rows
    """
    if not part or category not in ('add', 'delete'):
        return part

    inspect_new = category == 'add'
    content_attr = 'new_content' if inspect_new else 'old_content'

    def payload_of(diff_item):
        # The row object sits on the side that matches the operation.
        return diff_item.new if inspect_new else diff_item.old

    passthrough, candidates = [], []
    for diff_item in part:
        payload = payload_of(diff_item)
        if not is_instance_of(payload, RowObject):
            passthrough.append(diff_item)
        elif any(hasattr(cell, 'merged_ranges') for cell in payload.cells):
            candidates.append(diff_item)
        else:
            passthrough.append(diff_item)

    if not candidates:
        return passthrough

    # Group candidate positions by normalised row content.
    stamp_pattern = r'\d{4}[-/]\d{2}[-/]\d{2}.*?(?=\t|\n|$)'
    groups = defaultdict(list)
    for position, diff_item in enumerate(candidates):
        raw = getattr(diff_item, content_attr, None)
        if not raw:
            continue
        # Content may arrive as bytes or as a non-string object.
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8', errors='replace')
        elif not isinstance(raw, str):
            raw = str(raw)
        text = re.sub(stamp_pattern, '', raw).strip()
        groups[text.replace('\r\n', '\n').replace('\r', '\n')].append(position)

    dropped = set()
    for positions in groups.values():
        if len(positions) < 2:
            continue
        ranges_in_group = set()
        for position in positions:
            cells = payload_of(candidates[position]).cells
            # Only the first cell with a merged range decides the row's fate.
            anchor = next((cell for cell in cells if cell.merged_ranges), None)
            if anchor is None:
                continue
            range_key = tuple(anchor.merged_ranges)
            if range_key in ranges_in_group:
                dropped.add(position)
            else:
                ranges_in_group.add(range_key)

    passthrough.extend(
        diff_item for position, diff_item in enumerate(candidates)
        if position not in dropped
    )
    return passthrough
@staticmethod
def _tbl_find_unique(base_tbl, target_tbl, row_idx, col_idx):
    """
    Report whether the text at (row_idx, col_idx) of ``target_tbl`` occurs elsewhere.

    The reference text is read from ``target_tbl``. The result is True when
    either of the following holds:

    1. some row of ``base_tbl`` has the same text in column ``col_idx``
       (this scan is skipped when the first row of ``base_tbl`` does not
       have that column);
    2. another row of ``target_tbl`` (any row except ``row_idx``) has the
       same text in column ``col_idx``.

    Texts are compared after ``str()`` conversion. ``pre_process_require``
    treats a True result as "clear this row".

    :return: True when a matching text was found, False otherwise, including
             when either table is missing or the position is out of range
    """
    if not (target_tbl and base_tbl):
        return False

    candidate_rows = target_tbl.rows
    if row_idx >= len(candidate_rows):
        return False
    if col_idx >= len(candidate_rows[row_idx].cells):
        return False

    wanted = str(candidate_rows[row_idx].cells[col_idx].text)

    def holds_wanted(row):
        # Rows too short to have the column can never match.
        return col_idx < len(row.cells) and str(row.cells[col_idx].text) == wanted

    # Condition 1: the same value exists in the other table's column.
    reference_rows = base_tbl.rows
    if reference_rows and col_idx < len(reference_rows[0].cells):
        if any(holds_wanted(row) for row in reference_rows):
            return True

    # Condition 2: the value is repeated in its own column (self excluded).
    return any(
        holds_wanted(row)
        for position, row in enumerate(candidate_rows)
        if position != row_idx
    )
12-05
import logging,time,re import numpy as np from collections import defaultdict from itertools import zip_longest from kotei_omp.data import DocumentBlockObject from kotei_omc.comparers.picture_comparer import PictureComparer, GraphicComparer from kotei_omc.comparers.base_comparer import BaseComparer from kotei_omc.comparers.plugins import register_plugin from kotei_omc.data.diff import DiffItem from kotei_omp.data import TextObject, GraphicObject, PictureObject, StyleObject, RunObject from kotei_omp.data.table import CellObject, RowObject, TableObject from kotei_omc.settings import settings from kotei_omc.utils.type_checker import is_instance_of from kotei_omc.middlewares.table_middlewares import CustomTableStrategyMiddleware logger = logging.getLogger("req_diff") @register_plugin("table") class TableComparer(BaseComparer): def get_block_resource(self, block, belong_to='block'): return self.do_get_block_resource(block, belong_to, 'tables', TableObject) def compare(self, block_name, base, target, belong_to=None): t0 = time.time() # 表格匹配 logger.info(f'start match table, block_name: {block_name}, base_num: {len(base)}, target_num: {len(target)}') match_func = CustomTableStrategyMiddleware(self._path_base).match if settings.MATCH_WITH_CHAPTER: tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_with_chapter(base, target,match_func) else: tb_delete_list, tb_add_list, old_new_tb_matched = self.do_match_normal(base, target,match_func) logger.info('finish match table') # 表格新增删除 ls_tb_delete, ls_tb_add = self.process_delete_add_diff(block_name, 'table', tb_delete_list, tb_add_list, belong_to=belong_to) # 表格差分 ls_tb_update = [] for old_table, new_table in old_new_tb_matched: # 要求废止特殊处理 old_table, new_table = self.pre_process_require(old_table, new_table) # 表格位置差分 if not old_table.is_same_pos(new_table): ls_tb_update.append(DiffItem('update', 'table', sub_type='table', block_name=block_name, old=old_table, new=new_table, 
@staticmethod
def copy_table_attrs(to_table, from_table):
    """
    Copy the table-level metadata of ``from_table`` onto ``to_table``.

    The attributes ``layout``, ``style``, ``border``, ``coordinate`` and
    ``data_id`` are assigned by reference, in that order; rows are not
    touched. A missing attribute on ``from_table`` raises AttributeError.
    """
    to_table.layout = from_table.layout
    to_table.style = from_table.style
    to_table.border = from_table.border
    to_table.coordinate = from_table.coordinate
    to_table.data_id = from_table.data_id
if not table.rows[r].cells[col].merged_ranges: table.rows[r].cells[col].merged_ranges = merged_ranges row_ptr = end_row + 1 else: row_ptr += 1 def compare_table(self, block_name, old_table, new_table, belong_to): logger.info(f"start compare table, old_data_id: {old_table.data_id}, new_data_id: {new_table.data_id}") # 使表格内列数一致 self.align_table_col(old_table, new_table) # 表格中存在大量视觉上merge但是实际未合并的空格,需要将空格赋值为正确的文本,防止影响相似度匹配 self.fill_visual_merged_cells(old_table) self.fill_visual_merged_cells(new_table) if old_table.head_type == new_table.head_type == 'horizontal': old_col_table, new_col_table = self.transpose_table(old_table, new_table) else: if old_table.head_type == 'vertical': new_table.head_list = old_table.head_list new_table.head_type = 'vertical' elif new_table.head_type == 'vertical': old_table.head_list = new_table.head_list old_table.head_type = 'vertical' old_col_table, new_col_table = old_table, new_table # 列匹配 del_cols, add_cols = old_col_table.rows, new_col_table.rows col_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_cols, add_cols,is_col=True, head_indexes=[old_table.head_list,new_table.head_list]) if col_matched: matched_old_cols, matched_new_cols = list(zip(*list(col_matched))) del_cols = [old_col for old_col in old_col_table.rows if old_col not in matched_old_cols] add_cols = [new_col for new_col in new_col_table.rows if new_col not in matched_new_cols] sub_type = 'col' if old_table.head_type == 'horizontal' else 'row' ls_col_delete, ls_col_add = self.process_delete_add_diff(block_name, sub_type, del_cols, add_cols, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的列组合新的表,得到列一致的两个表 if col_matched: old_col_indexes,new_col_indexes =[],[] for old_col, new_col in col_matched: old_col_indexes.append(old_col_table.rows.index(old_col)) new_col_indexes.append(new_col_table.rows.index(new_col)) old_equal_col_table = self.choice_cols(old_table, old_col_indexes) new_equal_col_table = self.choice_cols(new_table, 
new_col_indexes) else: return ls_col_delete, ls_col_add, [] # 行匹配 del_rows, add_rows = old_equal_col_table.rows, new_equal_col_table.rows row_matched = CustomTableStrategyMiddleware(self._path_base).match_row(del_rows, add_rows, is_col=False) if row_matched: matched_old_rows, matched_new_rows = list(zip(*list(row_matched))) del_rows_indexes = [idx for idx, old_row in enumerate(old_equal_col_table.rows) if old_row not in matched_old_rows] add_rows_indexes = [idx for idx, new_row in enumerate(new_equal_col_table.rows) if new_row not in matched_new_rows] # 使用没有重组前的表,横表头直接处理,竖表头需要转置 if old_table.head_type == new_table.head_type == 'horizontal': del_rows = [old_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_table.rows[idx] for idx in add_rows_indexes] else: old_transpose_table = self.choice_cols(old_table, list(range(len(old_table.rows)))) new_transpose_table = self.choice_cols(new_table, list(range(len(new_table.rows)))) del_rows = [old_transpose_table.rows[idx] for idx in del_rows_indexes] add_rows = [new_transpose_table.rows[idx] for idx in add_rows_indexes] sub_type = 'row' if old_table.head_type == 'horizontal' else 'col' ls_row_delete, ls_row_add = self.process_delete_add_diff(block_name, sub_type, del_rows, add_rows, belong_to=belong_to, head_type=old_table.head_type) # 根据matched的行组合新的表,得到行一致的两个表 if row_matched: old_equal_row_table, new_equal_row_table = TableObject(), TableObject() old_equal_row_table.rows = list(matched_old_rows) old_equal_row_table.head_type = old_table.head_type self.copy_table_attrs(old_equal_row_table, old_table) new_equal_row_table.rows = list(matched_new_rows) new_equal_row_table.head_type = new_table.head_type self.copy_table_attrs(new_equal_row_table, new_table) # 查找行变更、列变更、单元格变更 ls_row_update, ls_col_update, ls_cell_update = self.compare_ordered_tables(block_name,old_equal_row_table, new_equal_row_table,belong_to=belong_to) else: ls_row_update, ls_col_update, ls_cell_update = [], [], [] part_delete = ls_row_delete + 
def transpose_table(self, old_table, new_table):
    """
    Build transposed copies of two tables (rows become columns).

    Each copy is a fresh TableObject whose rows come from
    ``transpose_table_rows``, whose ``head_type`` is flipped
    ('horizontal' becomes 'vertical', anything else becomes 'horizontal')
    and whose table-level attributes are copied from its source table.

    Args:
        old_table (TableObject): the original table
        new_table (TableObject): the target table

    Returns:
        tuple: (old_col_table, new_col_table)
    """
    def column_view(source):
        flipped = TableObject()
        flipped.rows = self.transpose_table_rows(source.rows)
        # The header orientation flips together with the data.
        flipped.head_type = 'vertical' if source.head_type == 'horizontal' else 'horizontal'
        self.copy_table_attrs(flipped, source)
        return flipped

    return column_view(old_table), column_view(new_table)
flag = False if len(old_content_cells_list) ==len(new_content_cells_list)==len(old_cells_list) == len(new_cells_list): flag = True if not flag: logger.warning(f"{block_name} old_table_obj: {old_table_obj}, new_table_obj: {new_table_obj}; delete operator is not atomic; all the cells list will involved in finding differences computation") if not old_content_cells_list: return [], [], [] # 查找差异 diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, \ row_diff_idx, col_diff_idx, graphic_diff, picture_diff = self.find_differences( old_content_cells_list, new_content_cells_list, old_cells_list, new_cells_list) # 抽取单元格内的图形图像差分 for item in graphic_diff + picture_diff: if item: cell_updates.extend(item) # 处理单元格差分 for idx, (cell_diff_idx, diff_point, diff_value) in enumerate( zip(cell_diffs, cell_diff_points, cell_diff_values)): try: # old = self.get_element_by_index(old_cells_list, cell_diff_idx) # new = self.get_element_by_index(new_cells_list, cell_diff_idx) old, new = old_cells_list, new_cells_list for cell_idx in cell_diff_idx: old = old[cell_idx] new = new[cell_idx] except IndexError: continue # 忽略非法索引 cell_diff_obj = DiffItem( 'update', 'table', 'cell', block_name=block_name, old=old, new=new, belong_to=belong_to, diff_point=diff_point, diff_values=diff_value ) cell_updates.append(cell_diff_obj) # 处理行差分 # if diff_type == 'row': # for row_idx, row_diff_col_idx in zip(row_diffs, row_diff_idx): # try: # old_row = [old_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx] # new_row = [new_cells_list[row_idx][cell_idx] for cell_idx in row_diff_col_idx] # except IndexError: # continue # # row_diff_item = DiffItem( # 'update', 'table', 'row', # block_name=block_name, # old=self.merge_cells_to_row(old_row), # new=self.merge_cells_to_row(new_row), # belong_to=belong_to) # row_updates.append(row_diff_item) # 处理列差分 # elif diff_type == 'col': # for col_idx, col_diff_col_idx in zip(col_diffs, col_diff_idx): # try: # old_col = 
def choice_cols(self, table_obj, col_indexes):
    """
    Build a new table that contains only the selected columns of ``table_obj``.

    For a 'horizontal' head type every source row is reduced to the cells at
    ``col_indexes`` (in the given order). For any other head type
    ``col_indexes`` select source rows, and the result is transposed: output
    row k collects the k-th cell of each selected row, with the number of
    output rows taken from the cell count of the first source row.

    Every output row takes coordinate, row index and data id from its first
    cell, and layout/style/border from that cell's ``layout.parent_ref`` when
    one is set. Table-level attributes are copied from ``table_obj``.

    :param table_obj: source table
    :param col_indexes: indexes of the columns to keep
    :return: a new TableObject
    """
    source_rows = table_obj.rows
    if table_obj.head_type == 'horizontal':
        picked = [[row.cells[index] for index in col_indexes] for row in source_rows]
    else:
        picked = [[] for _ in source_rows[0].cells]
        for index in col_indexes:
            for position, cell in enumerate(source_rows[index].cells):
                picked[position].append(cell)

    selected = TableObject()
    for cells in picked:
        row = RowObject()
        if cells:
            first = cells[0]
            row.cells = cells
            row.coordinate = first.coordinate
            # Row-level presentation is inherited only when the cell knows its parent.
            parent = first.layout.parent_ref
            if parent:
                row.layout = parent.layout
                row.style = parent.style
                row.border = parent.border
            row.row_index = first.row_index
            row.data_id = first.data_id
        selected.rows.append(row)

    self.copy_table_attrs(selected, table_obj)
    return selected
cell_obj.content: if is_instance_of(item_obj, GraphicObject) or is_instance_of(item_obj, PictureObject): # 检查是否是图形或图片对象 diff_items.append( DiffItem(action, item_obj._type, sub_type=item_obj._type, block_name=block_name, old=item_obj if action == 'delete' else None, new=None if action == 'delete' else item_obj, belong_to=belong_to) ) return diff_items # filter_duplicate_cells 过滤在一行或者一列中因合并单元格引起的重复 # 相关代码暂时先不启用,可以在后续使用者启用查看是否会引起漏差分的问题在决定是否启用 # 如果在解析端可以处理合并单元格,则不需要过滤,避免冗余处理而降低效率 def filter_duplicate_cells(item,sub_type): """ 根据text和merged_ranges过滤掉cells_list中的合并单元格 Args: item: RowObject or TableObject """ if sub_type != 'table': seen_contents = defaultdict(list) for i in range(len(item.cells) - 1, -1, -1): cell = item.cells[i] cell_merged_ranges = cell.merged_ranges if not cell_merged_ranges: continue cell_text = cell.text if cell_merged_ranges == seen_contents[cell_text]: del item.cells[i] continue seen_contents[cell_text] = cell_merged_ranges else: for row in item.rows: seen_contents = defaultdict(list) for i in range(len(row.cells) - 1, -1, -1): cell = row.cells[i] cell_merged_ranges = cell.merged_ranges if not cell_merged_ranges: continue cell_text = cell.text if cell_merged_ranges == seen_contents[cell_text]: del row.cells[i] continue seen_contents[cell_text] = cell_merged_ranges return item ls_tb_add, ls_tb_delete = [], [] for tb_base_item in delete_tables: # 过滤(行、列)合并单元格 tb_base_item = filter_duplicate_cells(tb_base_item,sub_type) diff_obj = DiffItem('delete', 'table', sub_type=sub_type, block_name=block_name, old=tb_base_item, new=None, belong_to=belong_to) setattr(diff_obj, 'head_type', head_type) ls_tb_delete.append(diff_obj) # 如果是表格、行或列,处理单元格中的内容 if sub_type in ('row', 'col', 'table'): cells_list = tb_base_item.cells if sub_type != 'table' else [ cell for row in tb_base_item.rows for cell in row.cells] ls_tb_delete.extend(process_graphic_objects('delete', cells_list)) for tb_target_item in add_tables: # 过滤(行、列)合并单元格 tb_target_item = 
def transpose_table_rows(self, rows):
    """
    Transpose a list of rows so that each output row holds one column.

    The number of output rows equals the largest cell count among ``rows``.
    Cells are appended by their position inside their own row, so a short
    row contributes only to the leading columns. Each output row takes
    coordinate, data id and layout from its first cell, plus ``col_index``
    and ``row_index`` when the first cell holds them as integers.

    Args:
        rows (list): rows of the source table, each exposing ``cells``

    Returns:
        list: the transposed RowObject list (empty when ``rows`` is empty)
    """
    width = max((len(row.cells) for row in rows), default=0)
    # Assumes RowObject() starts with its own empty ``cells`` list -- TODO confirm.
    columns = [RowObject() for _ in range(width)]

    for row in rows:
        for position, cell in enumerate(row.cells):
            columns[position].cells.append(cell)

    for column in columns:
        head = column.cells[0]
        column.coordinate = head.coordinate
        column.data_id = head.data_id
        column.layout = head.layout
        if isinstance(head.col_index, int):
            column.col_index = head.col_index
        if isinstance(head.row_index, int):
            column.row_index = head.row_index

    return columns
else 'row' # 找出所有行和列的差异项 row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs = [ [] for _ in range(9)] if diff_type == 'row': for i in range(array1.shape[0]): if not np.all(array1[i] == array2[i]): # if np.sum(array1[i] != array2[i]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT: # 如果该行只有一个数值不一致,则将这个差异项改为单元格的差异项 for j in range(array1.shape[1]): if array1[i, j] == array2[i, j]: continue diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs( old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS) if diff_point: cell_diffs.append((i, j)) cell_diff_points.append(' '.join(diff_point)) cell_diff_values.append(diff_values) if graphic_diff: graphic_diffs.append(graphic_diff) if picture_diff: picture_diffs.append(picture_diff) # else: # row_diffs.append(i) # row_diff_idx.append(np.where(array1[i] != array2[i])[0].tolist()) else: for j in range(array1.shape[1]): if not np.all(array1[:, j] == array2[:, j]): # if np.sum(array1[:, j] != array2[:,j]) < settings.T_MERGE_MULTI_CELL_UPDATE_TO_ROW_UPDATE_MIN_CELL_COUNT: # 如果该列只有一个数值不一致,则将这个差异项改为单元格的差异项 for i in range(array1.shape[0]): if array1[i, j] == array2[i, j]: continue diff_point, diff_values, graphic_diff, picture_diff = self.get_cell_not_equal_attrs( old_items[i][j], new_items[i][j], settings.CELL_COMPARE_ATTRS) if diff_point: cell_diffs.append((i, j)) cell_diff_points.append(' '.join(diff_point)) cell_diff_values.append(diff_values) if graphic_diff: graphic_diffs.append(graphic_diff) if picture_diff: picture_diffs.append(picture_diff) # else: # col_diffs.append(j) # col_diff_idx.append(np.where(array1[:, j] != array2[:, j])[0].tolist()) # 返回所有差异类型对应的单元格索引 return diff_type, row_diffs, col_diffs, cell_diffs, cell_diff_points, cell_diff_values, row_diff_idx, col_diff_idx, graphic_diffs, picture_diffs @staticmethod def get_cell_chars(cell_obj): chars = [] for text_obj in cell_obj.content: if not 
@staticmethod
def _get_graphic_picture_obj(old_cell, new_cell):
    """
    Split the contents of two cells into graphic and picture objects.

    An element is classified as a graphic first; only elements that are not
    graphics are tested as pictures. Everything else is ignored.

    :param old_cell: cell from the base table (its ``content`` is scanned)
    :param new_cell: cell from the target table (its ``content`` is scanned)
    :return: ``[(base_graphics, target_graphics), (base_pictures, target_pictures)]``
    """
    def split_by_kind(content):
        graphics, pictures = [], []
        for element in content:
            if is_instance_of(element, GraphicObject):
                graphics.append(element)
            elif is_instance_of(element, PictureObject):
                pictures.append(element)
        return graphics, pictures

    base_graphic, base_picture = split_by_kind(old_cell.content)
    target_graphic, target_picture = split_by_kind(new_cell.content)
    return [(base_graphic, target_graphic), (base_picture, target_picture)]
diff_values = [] if getattr(old_cell, 'auto_number', None) and getattr(new_cell, 'auto_number', None): return [], [], [], [] if old_cell.text != new_cell.text: diff_attrs.append('text') diff_values.append((old_cell.text, new_cell.text)) else: # 直接在对象上取值的属性 # direct_attr = ['style.background_color', 'border.border_top.border_style', 'style.background_color', # 'border.border_bottom.border_style', 'border.border_left.border_style', # 'border.border_right.border_style', 'style.background_style'] direct_attr = ['style.background_color', 'style.background_style'] attrs, values = self.get_not_equal_attrs(old_cell, new_cell, direct_attr) diff_attrs.extend(attrs) diff_values.extend(values) for old_char, new_char in zip_longest(self.get_cell_chars(old_cell), self.get_cell_chars(new_cell), fillvalue=None): if old_char is None or new_char is None: diff_attrs.append('text') diff_values.append((str(old_char), str(new_char))) else: attrs, values = self.get_not_equal_attrs(old_char, new_char, compare_attrs) diff_attrs.extend(attrs) diff_values.extend(values) # 单元格增加图形图像的比较 graphic_diff, picture_diff = self._get_cell_graphic_picture_diff(old_cell, new_cell) unique_diff_attrs = list(set(diff_attrs)) unique_not_equal_values = [diff_values[diff_attrs.index(v)] for v in unique_diff_attrs] return unique_diff_attrs, unique_not_equal_values, graphic_diff, picture_diff def get_cell_content_list(self, cell_obj_lists, with_attr=False): content_lists = [] processed_merged_ranges = set() for cell_obj_list in cell_obj_lists: row_content_list = [] for cell_obj in cell_obj_list: # 检查是否是合并单元格且已经处理过 if hasattr(cell_obj, 'merged_ranges') and cell_obj.merged_ranges: # 创建一个基于合并范围和内容的唯一键 merged_key = (tuple(cell_obj.merged_ranges), str(getattr(cell_obj, 'text', ''))) if merged_key in processed_merged_ranges: # 如果已经处理过,设置为空字符串 row_content_list.append('') # 直接添加空字符串到结果中 continue else: # 如果是合并单元格但未处理过,标记为已处理 processed_merged_ranges.add(merged_key) cell_contents = [f'text:{cell_obj.text}'] if with_attr: 
def get_nest_attr(self, obj, nest_attr):
    """
    Resolve a (possibly dotted) attribute for comparison purposes.

    * CellObject: ``style.background_color`` and ``style.background_style``
      are read from the cell itself. Any other attribute is collected from
      the cell's content and the distinct truthy values are concatenated
      into one string:
        - 'graphic': the ``settings.PICTURE_COMPARE_ATTRS`` of every
          graphic, plus the ``settings.TEXT_COMPARE_ATTRS`` of the runs of
          the graphic's text object when that text is non-empty;
        - 'picture': the ``settings.PICTURE_COMPARE_ATTRS`` of every picture;
        - otherwise: the attribute of every run of each TextObject and of
          each bare RunObject. 'font_background_color' is looked up as
          ``style.background_color`` on those runs.
    * TextObject: the distinct truthy values of the attribute across its
      runs, concatenated into one string.
    * anything else: the attribute read directly from ``obj``.

    :param obj: object to inspect
    :param nest_attr: attribute name, dots separate nesting levels
    :return: a string for cell content and text objects, otherwise whatever
             ``get_target_attr`` returns
    """
    def collect(values, value):
        # Keep the first occurrence of every truthy value, as text.
        if value:
            text = str(value)
            if text not in values:
                values.append(text)

    if is_instance_of(obj, CellObject):
        # Cell background is a property of the cell, not of its content.
        if nest_attr in ('style.background_color', 'style.background_style'):
            return self.get_target_attr(obj, nest_attr)
        if nest_attr == 'font_background_color':
            nest_attr = 'style.background_color'

        collected = []
        for item_obj in obj.content:
            if is_instance_of(item_obj, GraphicObject) and nest_attr == 'graphic':
                for item_attr in settings.PICTURE_COMPARE_ATTRS:
                    collect(collected, self.get_target_attr(item_obj, item_attr))
                graphic_text_obj = getattr(item_obj, 'text_obj', None)
                if graphic_text_obj and graphic_text_obj.text:
                    # Text attributes are de-duplicated among themselves only.
                    text_values = []
                    for text_attr in settings.TEXT_COMPARE_ATTRS:
                        for run_obj in graphic_text_obj.runs:
                            collect(text_values, self.get_target_attr(run_obj, text_attr))
                    collected.extend(text_values)
            elif is_instance_of(item_obj, PictureObject) and nest_attr == 'picture':
                for item_attr in settings.PICTURE_COMPARE_ATTRS:
                    collect(collected, self.get_target_attr(item_obj, item_attr))
            elif is_instance_of(item_obj, TextObject):
                for run_obj in item_obj.runs:
                    collect(collected, self.get_target_attr(run_obj, nest_attr))
            elif is_instance_of(item_obj, RunObject):
                collect(collected, self.get_target_attr(item_obj, nest_attr))
        return "".join(collected)

    if is_instance_of(obj, TextObject):
        collected = []
        for run_obj in obj.runs:
            collect(collected, self.get_target_attr(run_obj, nest_attr))
        # Previously the collected values were dropped and None was returned.
        return "".join(collected)

    return self.get_target_attr(obj, nest_attr)
def cell_update_after(self, update_cells):
    """
    Post-process cell update diffs so that a merged cell is reported once.

    * Items that are not ``update`` / ``table`` / ``cell`` diffs, and cell
      diffs where neither side has ``merged_ranges``, are forwarded untouched.
    * "Custom" cells (one of the cell's heads has the text
      ``settings.SPECIAL_CELL_CONTENT3``) are accumulated: the first item per
      side and merged range is kept, and the text and content of later items
      with the same range are appended to that kept item. The old side is
      checked before the new side.
    * All other merged cells are de-duplicated: only the first item per
      (normalised old text, normalised new text, old range, new range) is kept.

    :param update_cells: list of diff items from the table comparison
    :return: ``update_cells`` itself when it is empty, otherwise a new list
    """
    if not update_cells:
        return update_cells

    result = []
    # (side, merged range) -> position in ``result`` of the accumulating item.
    custom_slots = {}
    seen_keys = set()

    def has_range(cell):
        return bool(cell and cell.merged_ranges)

    def range_key(cell):
        """Return the cell's merged range as a hashable tuple (empty when absent)."""
        ranges = getattr(cell, 'merged_ranges', None) if cell else None
        if not ranges:
            return ()
        return tuple(tuple(part) if isinstance(part, list) else part for part in ranges)

    def normalize_content(cell):
        """Return the cell text stripped and with unified line endings."""
        if not cell:
            return ""
        text = str(cell.text) if hasattr(cell, 'text') else ""
        return text.strip().replace('\r\n', '\n').replace('\r', '\n')

    def get_is_custom_cell(cell_obj):
        return any(
            head.text == settings.SPECIAL_CELL_CONTENT3
            for head in cell_obj.get_heads()
        )

    def accumulate(item, side):
        """Keep the first custom item per range; fold later ones into it."""
        cell = getattr(item, side)
        slot = (side, range_key(cell))
        position = custom_slots.get(slot)
        if position is None:
            # Remember where the item lands in ``result``; other items may
            # already be in front of it.
            custom_slots[slot] = len(result)
            result.append(item)
            return
        kept = getattr(result[position], side)
        kept.text += cell.text
        kept.content.extend(cell.content)

    for item in update_cells:
        is_cell_update = (
            item.type == 'update'
            and item.data_type == 'table'
            and item.sub_type == 'cell'
        )
        # ``old`` / ``new`` are only inspected for genuine cell updates.
        if not is_cell_update or not (has_range(item.old) or has_range(item.new)):
            result.append(item)
            continue

        if item.old and get_is_custom_cell(item.old):
            accumulate(item, 'old')
        elif item.new and get_is_custom_cell(item.new):
            accumulate(item, 'new')
        else:
            # A tuple key keeps texts containing separators from colliding.
            key = (
                normalize_content(item.old),
                normalize_content(item.new),
                range_key(item.old),
                range_key(item.new),
            )
            if key not in seen_keys:
                seen_keys.add(key)
                result.append(item)

    return result
== 'add' else row.old.cells # 检查行中的每个单元格是否有 merged_ranges 属性 has_merged_ranges = any(hasattr(cell, 'merged_ranges') for cell in cells) # 如果行中的任意一个单元格没有 merged_ranges 属性,则添加到结果中 if not has_merged_ranges: result.append(row) else: merged_rows.append(row) # 处理具有 merged_ranges 的行,过滤重复项 if merged_rows: seen_contents = defaultdict(list) def remove_timestamp(text): return re.sub(r'\d{4}[-/]\d{2}[-/]\d{2}.*?(?=\t|\n|$)', '', text) for index, row in enumerate(merged_rows): # 获取当前行的内容 content = getattr(row, 'new_content' if category == 'add' else 'old_content', None) if content: #确保在处理可能包含非UTF-8编码字符的文本时不会出现解码错误 if isinstance(content, bytes): content = content.decode('utf-8', errors='replace') elif not isinstance(content, str): content = str(content) # 标准化 content cleaned_content = remove_timestamp(content) normalized_content = cleaned_content.strip().replace('\r\n', '\n').replace('\r', '\n') seen_contents[normalized_content].append(index) duplicates_row = {item: indices for item, indices in seen_contents.items() if len(indices) > 1} removed_rows_indices = [] for _, indices in duplicates_row.items(): seen_merged_ranges = set() for i in indices: # 获取要检查的单元格列表 cells = merged_rows[i].new.cells if category == 'add' else merged_rows[i].old.cells for cell in cells: if cell.merged_ranges: merged_range_tuple = tuple(cell.merged_ranges) if merged_range_tuple not in seen_merged_ranges: seen_merged_ranges.add(merged_range_tuple) break else: removed_rows_indices.append(i) break # 添加未被移除的行到结果中 for index, row in enumerate(merged_rows): if index not in removed_rows_indices: result.append(row) return result def pre_process_require(self, old_table, new_table): base_resources = [old_table] target_resources = [new_table] changes_dict = {} # 存储变更信息的字典 # 合并表格并编号(0=变更前,1=变更后) for table_idx, table in enumerate(base_resources + target_resources): col_list = table.get_col_list(col_name=settings.SPECIAL_COLUMN) #'要求廃止' # 在循环外部初始化计数器 be_counter = 1 af_counter = 1 if col_list: for row_index, cell in 
enumerate(col_list): cell_text = getattr(cell, 'text', '') if cell_text == settings.SPECIAL_CELL_CONTENT2: #'レ' if table_idx < len(base_resources): table_key = f"be_{be_counter:02d}" be_counter += 1 else: table_key = f"af_{af_counter:02d}" af_counter += 1 # 获取列索引 col_index = cell.col_index # 存储变更位置信息 changes_dict[table_key] = (row_index, col_index) if any(changes_dict): # 分离变更前和变更后的数据 be_changes = {k: v for k, v in changes_dict.items() if k.startswith('be_')} af_changes = {k: v for k, v in changes_dict.items() if k.startswith('af_')} # 处理变更前的数据 # 记录需要清理的key(分前后表) be_clear_keys = [] af_clear_keys = [] if be_changes: for table_key, (row_idx, col_idx) in be_changes.items(): next_col_idx = col_idx + 1 if next_col_idx < len(base_resources[0].rows[row_idx].cells): if self._tbl_find_unique(base_resources[0], target_resources[0], row_idx, col_idx + 1): be_clear_keys.append(table_key) # 处理变更后的数据 if af_changes: for table_key, (row_idx, col_idx) in af_changes.items(): next_col_idx = col_idx + 1 if next_col_idx < len(target_resources[0].rows[row_idx].cells): if self._tbl_find_unique(target_resources[0], base_resources[0], row_idx, col_idx + 1): af_clear_keys.append(table_key) # 统一清理(分表操作) for table_key in be_clear_keys: row_idx, col_idx = changes_dict[table_key] # 清理前表(base_resources) for cell in base_resources[0].rows[row_idx].cells: cell.text = '' cell.content = [] cell.style = StyleObject() for table_key in af_clear_keys: row_idx, col_idx = changes_dict[table_key] # 清理后表(target_resources) for cell in target_resources[0].rows[row_idx].cells: cell.text = '' cell.content = [] cell.style = StyleObject() return base_resources[0], target_resources[0] @staticmethod def _tbl_find_unique(base_tbl, target_tbl, row_idx, col_idx): """校验指定单元格是否满足唯一性条件: 1. 在 base_tbl 对应列中不存在相同值 2. 
在 target_tbl 当前列中唯一(排除自己) 返回:是否需要清理(True/False) """ if not target_tbl or not base_tbl: return False target_rows = target_tbl.rows if row_idx >= len(target_rows) or col_idx >= len(target_rows[row_idx].cells): return False compare_text = str(target_rows[row_idx].cells[col_idx].text) # 条件1:检查 base_tbl 对应列是否存在相同值 if base_tbl.rows and col_idx < len(base_tbl.rows[0].cells): for row in base_tbl.rows: if col_idx < len(row.cells) and str(row.cells[col_idx].text) == compare_text: return True # 需要清理 # 条件2:检查 target_tbl 当前列是否有重复(排除自己) for i, row in enumerate(target_rows): if i == row_idx: continue if col_idx < len(row.cells) and str(row.cells[col_idx].text) == compare_text: return True # 需要清理 return False # 无需清理 详细讲解表格匹配的代码以及流程
11-05
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

「已注销」

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值