Semantic Vector Search with Sentence-BERT

Contents

Chinese language model pre-training based on PyTorch: https://github.com/zhusleep/pytorch_chinese_lm_pretrain/tree/master

sentence_emb.py

search_faiss_robert768.py

faiss_index.py

gen_vec_save2_faiss.py


Chinese language model pre-training based on PyTorch: https://github.com/zhusleep/pytorch_chinese_lm_pretrain/tree/master

sentence_emb.py

#from transformers import BertTokenizer, BertModel
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#
## First we initialize our model and tokenizer:
#tokenizer = BertTokenizer.from_pretrained('./result')
#model = BertModel.from_pretrained('./result')


def split_batch(init_list, batch_size):
    """Split init_list into consecutive batches of batch_size; the last batch may be shorter."""
    groups = zip(*(iter(init_list),) * batch_size)
    end_list = [list(i) for i in groups]
    count = len(init_list) % batch_size
    if count != 0:
        end_list.append(init_list[-count:])
    return end_list



"""
param: sentence list
return: embeddings
"""
def encode(sentences, tokenizer, model):
    tokens = {'input_ids': [], 'attention_mask': []}
    data_num = len(sentences)

    for sentence in sentences:
        # encode each sentence and append it to the dict
        new_tokens = tokenizer.encode_plus(str(sentence), max_length=128,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])

    # stack the list of tensors into a single batched tensor
    tokens['input_ids'] = torch.stack(tokens['input_ids']).to(device)
    tokens['attention_mask'] = torch.stack(tokens['attention_mask']).to(device)
    model.eval()

    # We process these tokens through our model:
    with torch.no_grad():  # no gradient tracking needed at inference time
        outputs = model(**tokens)

    # odict_keys(['last_hidden_state', 'pooler_output'])

    # The dense vector representations of our text are contained within the outputs 'last_hidden_state' tensor, which we access like so:

    embeddings = outputs[0]

    # To perform this operation, we first resize our attention_mask tensor:

    attention_mask = tokens['attention_mask']
    # attention_mask.shape

    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    # mask.shape

    # Each vector above is the mask for a single token: every token now has a 768-dim vector describing its attention_mask state. Multiply the two tensors:

    masked_embeddings = embeddings * mask
    # masked_embeddings.shape

    # masked_embeddings.shape -> torch.Size([data_num, 128, 768])

    # sum the token embeddings, then divide by the number of real (non-padding) tokens
    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask  # mean pooling -> [data_num, 768]
    return mean_pooled
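
A minimal usage sketch for the module above. The './result' checkpoint path comes from the commented-out lines at the top of the file; it is an assumption that a BERT-compatible Chinese checkpoint (e.g. one produced by the pre-training repo linked above) has been saved there, and the example sentences are purely illustrative:

from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('./result')
model = BertModel.from_pretrained('./result').to(device)

sentences = ['今天天气不错', '我想去公园散步', '语义向量搜索很有用']
batches = split_batch(sentences, batch_size=2)
vecs = torch.cat([encode(batch, tokenizer, model) for batch in batches], dim=0)
print(vecs.shape)  # torch.Size([3, 768])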
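
The table of contents also lists gen_vec_save2_faiss.py, faiss_index.py, and search_faiss_robert768.py, which presumably write the vectors into a FAISS index and query it. As a rough, assumed sketch of that flow (index type, file name, query string, and top-k value are illustrative choices, not the original scripts), continuing from the usage sketch above:

import faiss

dim = 768
index = faiss.IndexFlatIP(dim)  # inner-product index; with L2-normalized vectors this equals cosine similarity

doc_vecs = vecs.cpu().numpy().astype('float32')
faiss.normalize_L2(doc_vecs)                 # normalize in place so inner product == cosine
index.add(doc_vecs)                          # add all document vectors
faiss.write_index(index, 'robert768.index')  # persist the index to disk

# At query time: embed the query the same way, then retrieve the top-k neighbours
index = faiss.read_index('robert768.index')
query_vec = encode(['去公园走走'], tokenizer, model).cpu().numpy().astype('float32')
faiss.normalize_L2(query_vec)
scores, ids = index.search(query_vec, 3)     # similarity scores and row ids of the nearest sentences
print(scores, ids)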