import difflib, logging, os, re, time
import colorspacious as cs
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import contextmanager
from itertools import zip_longest
from multiprocessing import cpu_count
from kotei_omp.data import TextObject #, GraphicObject, PictureObject, CellObject, RunObject
from kotei_omp.data.text import CharObject
from kotei_omc.settings import settings
from kotei_omc.data.diff import DiffItem
from kotei_omc.utils.type_checker import is_instance_of
# Module-level logger used by every function and method in this file;
# records are emitted on the "req_diff" logger.
logger = logging.getLogger("req_diff")
@contextmanager
def process_pool_executor(max_workers=None):
    """Yield a pooled executor and shut it down when the ``with`` block exits.

    Despite its name, the executor is a ``ThreadPoolExecutor``; the name is
    kept so existing callers keep working.

    :param max_workers: upper bound on worker threads, forwarded unchanged
        to ``ThreadPoolExecutor``.
    """
    # Leaving the executor's own context calls shutdown(wait=True), whether
    # the caller's block finished normally or raised.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        yield pool
class BaseComparer:
    """Shared machinery for comparing the parsed blocks of two documents.

    The constructor filters the base/target block mappings according to the
    project settings. Subclasses must implement ``get_block_resource`` and
    ``compare``; everything else (block add/delete detection, attribute
    comparison, similarity-based pairing, chapter-aware matching) is
    provided here.
    """

    def __init__(self, base_block_mapping, target_block_mapping, base_path, target_path):
        """Store the document paths and the settings-filtered block mappings.

        :param base_block_mapping: mapping of block name -> parsed block (old document).
        :param target_block_mapping: mapping of block name -> parsed block (new document).
        :param base_path: path of the old document.
        :param target_path: path of the new document.
        """
        # filter_mapping reads self._path_base, so the paths are set first.
        self._path_base = base_path
        self._path_target = target_path
        self._base_block_mapping, self._target_block_mapping = self.filter_mapping(
            base_block_mapping, target_block_mapping)

    def compare_block(self):
        """Detect blocks (e.g. sheets) that exist in only one of the documents.

        :return: dict with keys ``'update'``, ``'add'`` and ``'delete'``;
            ``'update'`` is always empty here, the other two hold one
            ``DiffItem`` per block present only in the target / only in the base.
        """
        logger.info('start compare block')
        compare_result = {'update': [], 'add': [], 'delete': []}
        base_mapping = self._base_block_mapping
        target_mapping = self._target_block_mapping

        # Walk the mappings in insertion order rather than over a set
        # difference, so the result order is stable from run to run.
        for block_name, block in target_mapping.items():
            if block_name in base_mapping:
                continue
            if block.elements:
                # The block takes the coordinate of its first element.
                block.coordinate = block.elements[0].coordinate
            compare_result['add'].append(
                DiffItem('add', block._type, sub_type='block', block_name=block_name,
                         old=None, new=block))

        for block_name, block in base_mapping.items():
            if block_name in target_mapping:
                continue
            if block.elements:
                block.coordinate = block.elements[0].coordinate
            compare_result['delete'].append(
                DiffItem('delete', block._type, sub_type='block', block_name=block_name,
                         old=block, new=None))
        return compare_result

    def compare_each_block(self, pre_fun):
        """Compare the resources of every block present in both documents.

        For each shared block the ``'block'``, ``'header'`` and ``'footer'``
        resources are fetched through ``get_block_resource``, pre-processed
        and passed to ``compare``.

        :param pre_fun: pre-processing strategy; it is called with the base
            path and the returned object's ``process(base, target)`` must
            return the two (possibly modified) resource lists.
        :return: dict with keys ``'update'``, ``'add'`` and ``'delete'``
            accumulating the items reported by ``compare``.
        """
        compare_result = {'update': [], 'add': [], 'delete': []}
        for block_name, _base in self._base_block_mapping.items():
            _target = self._target_block_mapping.get(block_name)
            if not _target:
                continue
            for belong_to in ('block', 'header', 'footer'):
                # Fetch the resource lists for this section.
                ls_base = self.get_block_resource(_base, belong_to)
                ls_target = self.get_block_resource(_target, belong_to)
                logger.info(f'get {belong_to} resource finish')
                # Nothing on either side: skip this section.
                if not ls_base and not ls_target:
                    logger.info(f'No {belong_to} resource found.')
                    continue
                # Resources may arrive as a list of lists or as one flat
                # list; a flat list is wrapped so both shapes share one loop.
                is_nested = ((ls_base and isinstance(ls_base[0], list))
                             or (ls_target and isinstance(ls_target[0], list)))
                if is_nested:
                    ls_base_list, ls_target_list = ls_base, ls_target
                else:
                    ls_base_list, ls_target_list = [ls_base], [ls_target]
                for ls_base, ls_target in zip_longest(ls_base_list, ls_target_list, fillvalue=[]):
                    if not ls_base and not ls_target:
                        continue
                    # Run the pre-processing strategy.
                    ls_base, ls_target = pre_fun(self._path_base).process(ls_base, ls_target)
                    logger.info(
                        f'compare {belong_to}, block_name: {block_name}, base_num: {len(ls_base)}, target_num: {len(ls_target)}')
                    block_result = self.compare(block_name, ls_base, ls_target, belong_to)
                    # Merge only the change types this result tracks.
                    for change_type, change_items in block_result.items():
                        if change_type in compare_result:
                            compare_result[change_type].extend(change_items)
                logger.info(f'compare {belong_to} finish')
        # NOTE: merged-cell de-duplication for added/deleted rows and columns
        # used to be done here (handle_cell_merge); per the original note it
        # is now handled in table_compare.py, taking both the merge range and
        # the cell content into account.
        return compare_result

    @staticmethod
    def _expand_hex_color(hex_color):
        """Return the six lowercase hex digits of ``'#rgb'`` or ``'#rrggbb'``."""
        digits = hex_color.lstrip('#').lower()
        if len(digits) == 3:
            # CSS-style shorthand: each digit is doubled ('abc' -> 'aabbcc').
            digits = ''.join(ch * 2 for ch in digits)
        return digits

    def is_color_equal(self, base_attr, target_attr):
        """Tell whether two attribute values are the same colour.

        Returns ``True`` when both values are hex colour strings (``#rgb`` or
        ``#rrggbb``) that are either identical ignoring case and shorthand
        form, or visually indistinguishable. Returns ``False`` for anything
        that is not a pair of hex colour strings.
        """
        if not isinstance(base_attr, str) or not isinstance(target_attr, str):
            return False
        hex_color_pattern = r"#([0-9a-fA-F]{3}|[0-9a-fA-F]{6})"
        # Both values must be valid hex colours.
        if not (re.fullmatch(hex_color_pattern, base_attr) and re.fullmatch(hex_color_pattern, target_attr)):
            return False
        # Same colour once case and the 3-digit shorthand are normalised.
        if self._expand_hex_color(base_attr) == self._expand_hex_color(target_attr):
            return True
        # Otherwise tolerate differences too small to see.
        return bool(self.are_colors_visually_identical(base_attr, target_attr))

    @staticmethod
    def are_colors_visually_identical(color1_hex, color2_hex, threshold=5):
        """Tell whether two hex colours are visually indistinguishable.

        :param color1_hex: first colour, e.g. ``'#0D0D0D'`` (``'#rgb'`` shorthand is accepted).
        :param color2_hex: second colour, e.g. ``'#000000'``.
        :param threshold: the colours count as identical when their
            Euclidean distance in CIELab is strictly below this value.
        :return: ``True`` when the distance is below ``threshold``.
        """
        def hex_to_rgb(hex_color):
            digits = BaseComparer._expand_hex_color(hex_color)
            return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))

        # Channels are normalised to [0, 1], as the "sRGB1" space expects.
        color1_rgb_normalized = tuple(c / 255.0 for c in hex_to_rgb(color1_hex))
        color2_rgb_normalized = tuple(c / 255.0 for c in hex_to_rgb(color2_hex))
        color1_lab = cs.cspace_convert(color1_rgb_normalized, "sRGB1", "CIELab")
        color2_lab = cs.cspace_convert(color2_rgb_normalized, "sRGB1", "CIELab")
        # Delta E as the plain Euclidean distance between the Lab triples.
        delta_e = sum((a - b) ** 2 for a, b in zip(color1_lab, color2_lab)) ** 0.5
        return bool(delta_e < threshold)

    def get_not_equal_attrs(self, base_item, target_item, compare_attrs):
        """Collect the attributes whose values differ between two items.

        :param base_item: object from the base document.
        :param target_item: object from the target document.
        :param compare_attrs: iterable of attribute paths; dots denote nested
            attributes, e.g. ``'style.background_color'``.
        :return: ``(names, values)`` where ``names`` lists the differing
            attribute paths and ``values`` the matching ``(base, target)`` pairs.
        """
        not_equal_attrs = []
        not_equal_values = []
        for compare_attr in compare_attrs:
            nest_attrs = compare_attr.split('.')
            attr_str = nest_attrs.pop(0)
            base_attr = getattr(base_item, attr_str, None)
            target_attr = getattr(target_item, attr_str, None)
            # Descend the dotted path while both sides still hold a truthy value.
            while base_attr and target_attr and nest_attrs:
                attr_str = nest_attrs.pop(0)
                base_attr = getattr(base_attr, attr_str, None)
                target_attr = getattr(target_attr, attr_str, None)
            # Colours that are equal (or visually equal) are not differences.
            if self.is_color_equal(base_attr, target_attr):
                continue
            if base_attr == target_attr:
                continue
            # Cell background and character shading share the attribute path
            # 'style.background_color'; for character objects the difference
            # is reported as 'font_background_color' (value read the same way).
            if is_instance_of(base_item, CharObject) and compare_attr == 'style.background_color':
                reported_name = 'font_background_color'
            else:
                reported_name = compare_attr
            not_equal_attrs.append(reported_name)
            not_equal_values.append((base_attr, target_attr))
        return not_equal_attrs, not_equal_values

    @staticmethod
    def cal_str_sim(text1, text2, i_j):
        """Return the ``difflib`` similarity ratio of the two values as strings.

        ``i_j`` is accepted for signature compatibility with other similarity
        functions and is not used.
        """
        return difflib.SequenceMatcher(None, str(text1), str(text2)).ratio()

    @staticmethod
    def get_dynamic_thread_count():
        """Return the worker count for parallel similarity scoring.

        This is the CPU core count reported by ``cpu_count()``, but never
        fewer than 2.
        """
        total_cores = cpu_count()
        return max(2, total_cores)

    @staticmethod
    def compute_similarity_chunk(chunk, cal_sim_func, min_similarity):
        """Score one chunk of candidate pairs, keeping those at or above the threshold.

        :param chunk: iterable of ``(line_a, line_b, position, index_pair)`` tuples.
        :param cal_sim_func: callable ``(line_a, line_b, position) -> similarity``.
        :param min_similarity: minimum similarity for a pair to be kept.
        :return: list of ``(similarity, index_pair)``.
        """
        results = []
        for line_a, line_b, i_j, tuple_index in chunk:
            similarity = cal_sim_func(line_a, line_b, i_j)
            if similarity >= min_similarity:
                results.append((similarity, tuple_index))
        return results

    def find_best_matched_indexes(self, lines1, lines2, min_similarity=0.6, cal_sim_func=None, data_type=None):
        """Pair up the most similar entries of two sequences, one-to-one.

        Every ``(i, j)`` combination is scored; pairs scoring at least
        ``min_similarity`` are then taken greedily, highest similarity first,
        ties broken by the smaller index distance ``|i - j|`` and then by
        ``i`` and ``j``. Each index is used at most once.

        :param lines1: base sequence.
        :param lines2: target sequence.
        :param min_similarity: minimum score for a pair to be considered.
        :param cal_sim_func: callable ``(a, b, (rel_i, rel_j)) -> similarity``,
            where ``rel_i``/``rel_j`` are the relative positions in ``[0, 1]``;
            defaults to ``cal_str_sim``.
        :param data_type: ``'table'`` lowers the size limit below which the
            scoring runs serially (50 instead of 500 entries per side).
        :return: list of ``(i, j)`` index pairs sorted ascending.
        """
        start_time = time.time()
        if cal_sim_func is None:
            cal_sim_func = self.cal_str_sim
        count1, count2 = len(lines1), len(lines2)
        logger.info(f"lines1 num :{count1}, lines2 num :{count2}")

        def relative_position(i, j):
            # Position of each index within its sequence, scaled to [0, 1].
            return (i / (count1 - 1) if count1 > 1 else 0,
                    j / (count2 - 1) if count2 > 1 else 0)

        serial_limit = 50 if data_type == 'table' else 500
        if count1 < serial_limit and count2 < serial_limit:
            similarity_index_pairs = []
            for j, line_b in enumerate(lines2):
                for i, line_a in enumerate(lines1):
                    similarity = cal_sim_func(line_a, line_b, relative_position(i, j))
                    if similarity >= min_similarity:
                        similarity_index_pairs.append((similarity, (i, j)))
        else:
            # Build the full task list: one entry per (i, j) combination.
            tasks = [
                (line_a, line_b, relative_position(i, j), (i, j))
                for j, line_b in enumerate(lines2)
                for i, line_a in enumerate(lines1)
            ]
            if not tasks:
                return []
            # Split the tasks into roughly one chunk per worker.
            process_count = min(self.get_dynamic_thread_count(), len(tasks))
            chunk_size = max(1, len(tasks) // process_count)
            chunks = [tasks[k:k + chunk_size] for k in range(0, len(tasks), chunk_size)]
            # Threads are used instead of processes because, per the original
            # note, ProcessPoolExecutor causes problems when packaged as an exe.
            with process_pool_executor(max_workers=process_count) as executor:
                futures = [
                    executor.submit(self.compute_similarity_chunk, chunk, cal_sim_func, min_similarity)
                    for chunk in chunks
                ]
                results = []
                for future in as_completed(futures):
                    try:
                        results.append(future.result())
                    except Exception as e:
                        # Best effort: a failed chunk is logged and its pairs
                        # are simply missing from the candidates.
                        logger.error(f"An error occurred during chunk processing: {e}", exc_info=True)
            similarity_index_pairs = [pair for result in results for pair in result]

        # Greedy one-to-one selection. The sort key is total, so the outcome
        # does not depend on the order in which chunks completed.
        ordered_pairs = sorted(
            similarity_index_pairs,
            key=lambda x: (-x[0], abs(x[1][0] - x[1][1]), x[1][0], x[1][1]))
        best_matches = {}     # i -> j
        used_targets = set()  # every j already taken, for O(1) lookups
        for _similarity, (i, j) in ordered_pairs:
            if i in best_matches or j in used_targets:
                continue
            best_matches[i] = j
            used_targets.add(j)

        # A one-to-one matching can never exceed min(count1, count2) pairs,
        # so no extra truncation is needed.
        matches = sorted(best_matches.items())
        end_time = time.time()
        logger.info(f"find_best_matched_indexes cost time: {end_time - start_time}")
        return matches

    def get_block_resource(self, block, belong_to='block'):
        """Return the resources of ``block`` for the given section.

        Must be implemented by subclasses.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @staticmethod
    def do_get_block_resource(block, belong_to, resource_attr, resource_obj):
        """Fetch resources of one kind from a block and tag them with their section.

        Args:
            block: block object holding the resources.
            belong_to: section to read: 'block', 'footer' or 'header'.
            resource_attr: attribute of ``block`` that holds the resources
                (used for the 'block' section).
            resource_obj: resource type used to filter header/footer entries.
        Returns:
            list: a flat list for 'block'; for 'footer'/'header' one filtered
            list per entry of ``block.footer`` / ``block.header``. Every
            returned object gets its ``belong_to`` attribute set.
        """
        logger.info(f'Start getting block resource for {belong_to} with resource attribute {resource_attr}')
        if not block:
            logger.info('Block is empty, returning empty list.')
            return []
        target_objs = []
        if belong_to == 'block':
            logger.info('Processing block resources.')
            target_objs.extend(getattr(block, resource_attr))
            # When pictures and graphics are compared together, graphics are
            # appended to the picture list.
            if settings.IS_COMPARE_PICTURE_GRAPHIC and resource_attr == 'pictures':
                logger.info('Adding graphics to block resources.')
                target_objs.extend(getattr(block, 'graphics'))
        elif belong_to in ('footer', 'header'):
            logger.info(f'Processing {belong_to} resources.')
            # Keep only objects of the requested resource type.
            target_objs = [
                [obj for obj in obj_list if is_instance_of(obj, resource_obj)]
                for obj_list in getattr(block, belong_to)
            ]
        # Tag every resource (flat or nested) with the section it came from.
        for target_obj in target_objs:
            if isinstance(target_obj, list):
                for b_target_obj in target_obj:
                    b_target_obj.belong_to = belong_to
            else:
                target_obj.belong_to = belong_to
        logger.info(f'Finished getting block resource for {belong_to}, found {len(target_objs)} resources.')
        return target_objs

    def compare(self, block_name, base, target, belong_to='block'):
        """Compare two resource lists of one block.

        Must be implemented by subclasses.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def do_match_normal(self, base, target, match_functions):
        """Split two lists into matched pairs and unmatched leftovers.

        :param base: items from the base document.
        :param target: items from the target document.
        :param match_functions: a single callable ``(base, target) -> pairs``
            returning ``(old, new)`` pairs; when falsy nothing is matched.
        :return: ``(delete_list, add_list, old_new_matched)``: base items
            without a match, target items without a match, and the pairs.
        """
        if not match_functions:
            return base, target, []
        matched = match_functions(base, target)
        if not matched:
            return base, target, []
        old_new_matched = list(matched)
        matched_olds, matched_news = zip(*old_new_matched)
        delete_list = [base_item for base_item in base if base_item not in matched_olds]
        add_list = [target_item for target_item in target if target_item not in matched_news]
        return delete_list, add_list, old_new_matched

    def do_match_with_chapter(self, base, target, func):
        """Match items chapter by chapter.

        Items are grouped by the text of their chapter title. Chapters found
        on one side only are reported wholesale as deleted / added; chapters
        found on both sides are matched with ``do_match_normal``.

        :param base: items from the base document.
        :param target: items from the target document.
        :param func: match callable forwarded to ``do_match_normal``.
        :return: ``(delete_list, add_list, old_new_matched)``.
        """
        no_parent_key = '__no_parent_ref__'

        def group_by_parent_ref(objects):
            """Group objects by the text of their ``layout.parent_ref``.

            The group key is the parent title's text (chapter ids are not
            considered). A chapter title opens its own group and is stored
            in it.
            """
            groups = {no_parent_key: []}
            for obj in objects:
                # Chapter titles are assumed to be TextObject instances.
                if (is_instance_of(obj, TextObject)
                        and getattr(obj.layout, 'is_chapter_title', None)
                        and obj.text not in groups):
                    groups[obj.text] = [obj]
                    continue
                parent_ref = getattr(obj.layout, 'parent_ref', None)
                # No parent, or a parent that is not a text object.
                if parent_ref is None or not is_instance_of(parent_ref, TextObject):
                    groups[no_parent_key].append(obj)
                    continue
                key = parent_ref.text
                if key in groups:
                    groups[key].append(obj)
                elif key:
                    groups[key] = [obj]
                else:
                    # An empty parent title cannot identify a chapter; the
                    # original indexed a group it had never created here and
                    # raised KeyError.
                    groups[no_parent_key].append(obj)
            return groups

        classified_bases = group_by_parent_ref(base)
        classified_targets = group_by_parent_ref(target)
        delete_list, add_list, old_new_matched = [], [], []
        # The groups are walked in insertion order so results are stable.
        for chapter, chapter_items in classified_bases.items():
            if chapter not in classified_targets:
                delete_list.extend(chapter_items)
        for chapter, chapter_items in classified_targets.items():
            if chapter not in classified_bases:
                add_list.extend(chapter_items)
        for chapter, chapter_bases in classified_bases.items():
            if chapter not in classified_targets:
                continue
            chapter_delete_list, chapter_add_list, chapter_old_new_matched = self.do_match_normal(
                chapter_bases, classified_targets[chapter], func)
            delete_list.extend(chapter_delete_list)
            add_list.extend(chapter_add_list)
            old_new_matched.extend(chapter_old_new_matched)
        return delete_list, add_list, old_new_matched

    @staticmethod
    def filter_mapping_sheet(mapping, first_sheet=None, is_old=None):
        """Drop the sheets that must not be compared.

        With ``settings.COMPARE_FIRST_SHEET`` only the first sheet is kept
        and renamed to ``'<old sheet>-<new sheet>'``, so both documents end
        up with the same key. Otherwise sheets whose name matches any of the
        ``;``-separated regular expressions in ``settings.NOT_COMPARE_SHEET``
        are removed.

        :param mapping: mapping of sheet name -> parsed sheet.
        :param first_sheet: name of the other document's first sheet.
        :param is_old: truthy when ``mapping`` belongs to the old document.
        """
        if settings.COMPARE_FIRST_SHEET:
            sheet_name, sheet_val = next(iter(mapping.items()))
            if is_old:
                return {f"{sheet_name}-{first_sheet}": sheet_val}
            return {f"{first_sheet}-{sheet_name}": sheet_val}
        if settings.NOT_COMPARE_SHEET:
            # Compile each non-blank pattern once.
            compiled_patterns = [
                re.compile(pattern)
                for pattern in settings.NOT_COMPARE_SHEET.split(';')
                if pattern.strip()
            ]
            return {
                name: sheet for name, sheet in mapping.items()
                if not any(pattern.search(name) for pattern in compiled_patterns)
            }
        return mapping

    @staticmethod
    def filter_header_footer(base_mapping, target_mapping):
        """Apply the header/footer settings to the first block of each mapping.

        A section that is not compared is emptied on both sides; a section
        compared with page differences ignored is truncated on both sides to
        the shorter of the two lengths.
        """
        _, base_val = next(iter(base_mapping.items()))
        _, target_val = next(iter(target_mapping.items()))

        def process_section(section_name, is_compare, is_ignore_page):
            """Shared handling for the header and the footer."""
            base_section = getattr(base_val, section_name)
            target_section = getattr(target_val, section_name)
            if not is_compare:
                setattr(base_val, section_name, [])
                setattr(target_val, section_name, [])
            elif is_ignore_page:
                min_count = min(len(base_section), len(target_section))
                setattr(base_val, section_name, base_section[:min_count])
                setattr(target_val, section_name, target_section[:min_count])

        process_section('header', settings.IS_COMPARE_HEADER, settings.IS_IGNORE_HEADER_PAGE)
        process_section('footer', settings.IS_COMPARE_FOOTER, settings.IS_IGNORE_FOOTER_PAGE)
        return base_mapping, target_mapping

    @staticmethod
    def filter_chapter(base_mapping, target_mapping):
        """Remove the content of chapters excluded from the comparison.

        ``settings.NOT_COMPARE_CHAPTER`` is a ``;``-separated list; objects
        whose ``layout.parent_content`` is in that list are dropped from the
        texts, tables, pictures and graphics of the first block of each mapping.
        """
        if not settings.NOT_COMPARE_CHAPTER:
            return base_mapping, target_mapping
        not_compare_list = settings.NOT_COMPARE_CHAPTER.split(';')

        def filter_objects(objects):
            """Keep only the objects outside the excluded chapters."""
            return [obj for obj in objects if obj.layout.parent_content not in not_compare_list]

        def process_mapping(mapping):
            """Filter the resource lists of the mapping's first block in place."""
            _, val = next(iter(mapping.items()))
            val.texts = filter_objects(val.texts)
            val.tables = filter_objects(val.tables)
            # Write back to `pictures`, the attribute do_get_block_resource
            # reads. The original assigned `pictures_obj`, so the filter
            # never took effect for pictures.
            val.pictures = filter_objects(val.pictures)
            val.graphics = filter_objects(val.graphics)

        process_mapping(base_mapping)
        process_mapping(target_mapping)
        return base_mapping, target_mapping

    def filter_mapping(self, base_mapping, target_mapping):
        """Filter the parsed data of both documents according to the settings.

        Sheet filtering always applies; header/footer and chapter filtering
        apply only when the base path has a Word extension.

        :return: ``(base_block_mapping, target_block_mapping)``.
        """
        base_block_mapping = self.filter_mapping_sheet(base_mapping, next(iter(target_mapping)), True)
        target_block_mapping = self.filter_mapping_sheet(target_mapping, next(iter(base_mapping)))
        # Compare the extension case-insensitively so '.DOCX' is treated
        # like '.docx'.
        if os.path.splitext(self._path_base)[1].lower() in ('.doc', '.docx'):
            base_block_mapping, target_block_mapping = self.filter_header_footer(
                base_block_mapping, target_block_mapping)
            base_block_mapping, target_block_mapping = self.filter_chapter(
                base_block_mapping, target_block_mapping)
        return base_block_mapping, target_block_mapping
if __name__ == "__main__":
    # Intentionally a no-op: running this module as a script does nothing.
    pass
这些是配对算法吗
最新发布