Adding mappings with NEST

This article shows how to configure mappings for an Elasticsearch index with NEST, the .NET client library, covering nested objects and GeoPoint geo-coordinate fields.

When building an Elasticsearch index with the .NET NEST client, some property types need an explicit mapping, for example nested objects and geo_point fields.


Nested mapping

 // Create the index and map the ClusterInfo property as a nested object named "clusterInfo".
 var response = Client.CreateIndex(d => d
     .Index(_indexName)
     .AddMapping<DishDoc>(m => m
         .Type(Constants.TypeName.Dish)
         .Properties(p => p
             .NestedObject<DishCluster>(n => n
                 .Name("clusterInfo")
             )
         )
     )
 );
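
The call above should produce a mapping along these lines (a sketch; the exact JSON depends on the Elasticsearch and NEST versions, and it assumes Constants.TypeName.Dish resolves to "dish"):

 {
   "dish": {
     "properties": {
       "clusterInfo": { "type": "nested" }
     }
   }
 }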


 class DishDoc
 {
     public DishCluster[] ClusterInfo { get; set; }
 }

 class DishCluster
 {
     // Stored with the document but not indexed, so not directly searchable.
     [ElasticProperty(Index = FieldIndexOption.No)]
     public int Id { get; set; }

     [ElasticProperty(Index = FieldIndexOption.No)]
     public string Name { get; set; }
 }
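
A minimal indexing sketch to go with this mapping (assumes NEST 1.x; the document values are illustrative). Note that because Id and Name are mapped with FieldIndexOption.No, the nested values are stored with the document but not searchable:

 var dish = new DishDoc
 {
     ClusterInfo = new[]
     {
         new DishCluster { Id = 1, Name = "Sichuan" }
     }
 };

 // Index into the same index/type the mapping was created for.
 var indexResponse = Client.Index(dish, i => i
     .Index(_indexName)
     .Type(Constants.TypeName.Dish));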


GeoPoint mapping

 // Create the index and map RegionInfo.Location as a geo_point with
 // lat/lon indexing enabled (lat_lon: true), on top of attribute-based mapping.
 var response = Client.CreateIndex(d => d
     .Index(Constants.IndexName.Instagram)
     .AddMapping<PostDoc>(m => m
         .Type(Constants.TypeName.Post)
         .MapFromAttributes()
         .Properties(p => p
             .GeoPoint(g => g
                 .Name(n => n.RegionInfo.Location)
                 .IndexLatLon()))));

 if (response.ServerError != null)
 {
     Console.WriteLine(response.ServerError.Error);
 }
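
With lat/lon indexing enabled, the field can be used in geo queries. A hedged search sketch follows (it assumes NEST 1.x filter syntax; the coordinates and distance are placeholders):

 var searchResponse = Client.Search<PostDoc>(s => s
     .Index(Constants.IndexName.Instagram)
     .Type(Constants.TypeName.Post)
     .Filter(f => f
         .GeoDistance(p => p.RegionInfo.Location, g => g
             .Location(40.7128, -74.0060) // lat, lon of the search center
             .Distance("5km"))));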
