Perform a preliminary semantic compression of the text using the bert-base-chinese model and its vocabulary
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

vocab_file = 'model/vocab.txt'
bert_tokenizer = BertTokenizer(vocab_file)
bert = BertModel.from_pretrained("model/bert-base-chinese/")
# Vectorize a query: encode the text with BERT and mean-pool the token embeddings
def get_embedding(text, model, tokenizer, device='cpu'):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # move inputs to the target device
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # mean-pool the token embeddings over the sequence
    return embeddings.cpu().detach().numpy()  # keep the output on CPU for downstream processing
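The matching functions below assume that an embedding has already been precomputed for every entity name in the knowledge base. A minimal sketch of that step, assuming an illustrative entity_names list (not part of the original code):

# Hypothetical example: precompute embeddings for a small list of entity names
entity_names = ['高血压', '糖尿病', '冠心病']  # illustrative; replace with the real knowledge-base entities
entity_embeddings = np.vstack(
    [get_embedding(name, bert, bert_tokenizer) for name in entity_names]
)  # shape: (num_entities, hidden_size), ready to be passed to cosine_similarity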
Split the question with a custom sliding-window scheme, then recognize and match sub-entities
def get_subsequences(text, min_tokens, max_tokens):
    # Slide windows of every length from min_tokens to max_tokens over the tokenized text
    tokens = bert_tokenizer.tokenize(text)
    subsequences = []
    for i in range(min_tokens, max_tokens + 1):
        for j in range(len(tokens) - i + 1):
            subsequence = tokens[j:j + i]
            subsequences.append(bert_tokenizer.convert_tokens_to_string(subsequence))
    return subsequences
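For intuition, a short query produces overlapping fragments of each window size. A hypothetical call (the sample query is illustrative, and the exact spacing of the output depends on the tokenizer):

# Hypothetical usage: enumerate sliding-window fragments of a sample query
fragments = get_subsequences('高血压有什么症状', min_tokens=3, max_tokens=5)
print(fragments[:5])  # e.g. 3-token windows such as '高 血 压'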
Use the entity-recognition windows to find the k sub-entities nearest to the sentence in the embedding space
def get_similar_entities(query, model, tokenizer, entity_embeddings, entity_names, min_tokens=3, max_tokens=5, top_k=5,
                         temperature=0.85):
    # 'temperature' acts as a cosine-similarity threshold below which candidate matches are discarded
    subsequences = get_subsequences(query, min_tokens, max_tokens)
    best_matches = []
    for subsequence in subsequences:
        subseq_embedding = get_embedding(subsequence, model, tokenizer)
        cos_sim = cosine_similarity(subseq_embedding, entity_embeddings)
        top_k_indices = np.argsort(cos_sim[0])[::-1][:top_k]
        for idx in top_k_indices:
            if cos_sim[0][idx] >= temperature:
                best_matches.append((entity_names[idx], cos_sim[0][idx]))
    best_matches.sort(key=lambda x: x[1], reverse=True)
    # Deduplicate: keep only the highest-scoring match for each entity_name
    unique_best_matches = {}
    for name, score in best_matches:
        if name not in unique_best_matches:
            unique_best_matches[name] = score
    final_matches = [(name, score) for name, score in unique_best_matches.items()]
    final_matches.sort(key=lambda x: x[1], reverse=True)
    return final_matches[:top_k]
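A hypothetical usage sketch, reusing the entity_names / entity_embeddings example above (the query is illustrative):

# Match a question against the precomputed entity embeddings
matches = get_similar_entities('高血压有什么症状', bert, bert_tokenizer,
                               entity_embeddings, entity_names, top_k=3)
for name, score in matches:
    print(f'{name}: {score:.3f}')  # candidate entities with their best cosine similarity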
Find the k entities nearest to a given entity in the embedding space
def get_similar_entities_single_entity(entity_name, entity_names, model, tokenizer, entity_embeddings, temperature=0.5):
    # Embed the single entity name and rank all known entities by cosine similarity
    embedding = get_embedding(entity_name, model, tokenizer)
    best_matches = []
    cos_sim = cosine_similarity(embedding, entity_embeddings)
    indices = np.argsort(cos_sim[0])[::-1]
    for idx in indices:
        if cos_sim[0][idx] >= temperature:
            best_matches.append((entity_names[idx], cos_sim[0][idx]))
    best_matches.sort(key=lambda x: x[1], reverse=True)
    # Deduplicate: keep only the highest-scoring match per entity name
    unique_best_matches = {}
    for name, score in best_matches:
        if name not in unique_best_matches:
            unique_best_matches[name] = score
    final_matches = [(name, score) for name, score in unique_best_matches.items()]
    final_matches.sort(key=lambda x: x[1], reverse=True)
    return final_matches
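A hypothetical usage sketch, again assuming the illustrative entity_names / entity_embeddings from earlier:

# Find the neighbours of a single entity within the precomputed embedding matrix
neighbours = get_similar_entities_single_entity('高血压', entity_names, bert, bert_tokenizer, entity_embeddings)
print(neighbours)  # all entities whose similarity to '高血压' meets the threshold, best first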
A unified deduplication helper for lists whose items may be dictionaries or nested lists
def make_list_unique(input_list):
    def hashable(item):
        if isinstance(item, dict):
            # Convert dicts to frozensets so they can be hashed and compared
            return frozenset((key, hashable(value)) for key, value in item.items())
        elif isinstance(item, list):
            # Recursively convert each list item to a hashable form, then wrap in a tuple
            return tuple(hashable(sub_item) for sub_item in item)
        else:
            # Leave other types unchanged
            return item

    seen = set()
    result = []
    for item in input_list:
        hashed_item = hashable(item)
        if hashed_item not in seen:
            seen.add(hashed_item)
            result.append(item)
    return result
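A hypothetical usage sketch with nested records of the kind this helper is meant to deduplicate (the sample rows are illustrative):

# Deduplicate query results that may contain nested dicts or lists
rows = [{'name': '高血压', 'links': ['症状']}, {'name': '高血压', 'links': ['症状']}, {'name': '糖尿病', 'links': []}]
print(make_list_unique(rows))  # the duplicate '高血压' record is kept only once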