Perform a preliminary semantic compression of the text using the bert-base-chinese model and its vocabulary
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel

vocab_file = 'model/vocab.txt'
bert_tokenizer = BertTokenizer(vocab_file)
bert = BertModel.from_pretrained("model/bert-base-chinese/")
# Vectorize a query: encode the text with BERT and mean-pool the token embeddings
def get_embedding(text, model, tokenizer, device='cpu'):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # move inputs to the target device
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # mean-pool the token embeddings over the sequence
    return embeddings.cpu().detach().numpy()  # keep the output on CPU for downstream processing
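The matching functions below assume that an embedding has already been precomputed for every entity name in the knowledge base. A minimal sketch of that step, assuming an illustrative entity_names list (not part of the original code):

# Hypothetical example: precompute embeddings for a small list of entity names
entity_names = ['高血压', '糖尿病', '冠心病']  # illustrative; replace with the real knowledge-base entities
entity_embeddings = np.vstack(
    [get_embedding(name, bert, bert_tokenizer) for name in entity_names]
)  # shape: (num_entities, hidden_size), ready to be passed to cosine_similarity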
Split the question with a custom sliding-window scheme, then recognize and match sub-entities
def get_subsequences(text, min_tokens, max_tokens):
    # Slide windows of every length from min_tokens to max_tokens over the tokenized text
    tokens = bert_tokenizer.tokenize(text)
    subsequences = []
    for i in range(min_tokens, max_tokens + 1):
        for j in range(len(tokens) - i + 1):
            subsequence = tokens[j:j + i]
            subsequences.append(bert_tokenizer.convert_tokens_to_string(subsequence))
    return subsequences
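For intuition, a short query produces overlapping fragments of each window size. A hypothetical call (the sample query is illustrative, and the exact spacing of the output depends on the tokenizer):

# Hypothetical usage: enumerate sliding-window fragments of a sample query
fragments = get_subsequences('高血压有什么症状', min_tokens=3, max_tokens=5)
print(fragments[:5])  # e.g. 3-token windows such as '高 血 压'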
Use the entity-recognition windows to find the k sub-entities nearest to the sentence in the embedding space
def get_similar_entities(query, model, tokenizer, entity_embeddings, entity_names, min_tokens=3, max_tokens=5, top_k=5,
                         temperature=0.85):
    # 'temperature' acts as a cosine-similarity threshold below which candidate matches are discarded
    subsequences = get_subsequences(query, min_tokens, max_tokens)
    best_matches = []
    for subsequence in subsequences:
        subseq_embedding = get_embedding(subsequence, model, tokenizer)
        cos_sim = cosine_similarity(subseq_embedding, entity_embeddings)
        top_k_indices = np.argsort(cos_sim[0])[::-1][:top_k]
        for idx in top_k_indices:
            if cos_sim[0][idx] >= temperature:
                best_matches.append((entity_names[idx], cos_sim[0][idx]))
    best_matches.sort(key=lambda x: x[1], reverse=True)
    # Deduplicate: keep only the highest-scoring match for each entity_name
    unique_best_matches = {}
    for name, score in best_matches:
        if name not in unique_best_matches:
            unique_best_matches[name] = score
    final_matches = [(name, score) for name, score in unique_best_matches.items()]
    final_matches.sort(key=lambda x: x[1], reverse=True)
    return final_matches[:top_k]
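A hypothetical usage sketch, reusing the entity_names / entity_embeddings example above (the query is illustrative):

# Match a question against the precomputed entity embeddings
matches = get_similar_entities('高血压有什么症状', bert, bert_tokenizer,
                               entity_embeddings, entity_names, top_k=3)
for name, score in matches:
    print(f'{name}: {score:.3f}')  # candidate entities with their best cosine similarity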
Find the k entities nearest to a given entity in the embedding space
def get_similar_entities_single_entity(entity_name, entity_names, model, tokenizer, entity_embeddings, temperature=0.5):
    # Embed the single entity name and rank all known entities by cosine similarity
    embedding = get_embedding(entity_name, model, tokenizer)
    best_matches = []
    cos_sim = cosine_similarity(embedding, entity_embeddings)
    indices = np.argsort(cos_sim[0])[::-1]
    for idx in indices:
        if cos_sim[0][idx] >= temperature:
            best_matches.append((entity_names[idx], cos_sim[0][idx]))
    best_matches.sort(key=lambda x: x[1], reverse=True)
    # Deduplicate: keep only the highest-scoring match per entity name
    unique_best_matches = {}
    for name, score in best_matches:
        if name not in unique_best_matches:
            unique_best_matches[name] = score
    final_matches = [(name, score) for name, score in unique_best_matches.items()]
    final_matches.sort(key=lambda x: x[1], reverse=True)
    return final_matches
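A hypothetical usage sketch, again assuming the illustrative entity_names / entity_embeddings from earlier:

# Find the neighbours of a single entity within the precomputed embedding matrix
neighbours = get_similar_entities_single_entity('高血压', entity_names, bert, bert_tokenizer, entity_embeddings)
print(neighbours)  # all entities whose similarity to '高血压' meets the threshold, best first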
A unified deduplication helper for lists whose items may be dictionaries or nested lists
def make_list_unique(input_list):
    def hashable(item):
        if isinstance(item, dict):
            # Convert dicts to frozensets so they can be hashed and compared
            return frozenset((key, hashable(value)) for key, value in item.items())
        elif isinstance(item, list):
            # Recursively convert each list item to a hashable form, then wrap in a tuple
            return tuple(hashable(sub_item) for sub_item in item)
        else:
            # Leave other types unchanged
            return item

    seen = set()
    result = []
    for item in input_list:
        hashed_item = hashable(item)
        if hashed_item not in seen:
            seen.add(hashed_item)
            result.append(item)
    return result
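A hypothetical usage sketch with nested records of the kind this helper is meant to deduplicate (the sample rows are illustrative):

# Deduplicate query results that may contain nested dicts or lists
rows = [{'name': '高血压', 'links': ['症状']}, {'name': '高血压', 'links': ['症状']}, {'name': '糖尿病', 'links': []}]
print(make_list_unique(rows))  # the duplicate '高血压' record is kept only once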