bert调用相关知识点_bert ids to token-CSDN博客

本文链接：https://blog.csdn.net/qq_40664172/article/details/117534985

https://www.cnblogs.com/cxq1126/p/13517394.html

# Encode a single sentence into vocabulary ids; the expected output is shown
# on the right (101 and 102 are the ids of the [CLS] and [SEP] markers).
print(tokenizer.encode('我不喜欢你'))                    # [101, 2769, 679, 1599, 3614, 872, 102]

# Encode a sentence pair. Besides the ids, encode_plus also returns the
# segment ids (0 for the first sentence, 1 for the second) and the attention mask.
sen_code = tokenizer.encode_plus('我不喜欢这世界', '我只喜欢你')
print(sen_code)
# {'input_ids': [101, 2769, 679, 1599, 3614, 6821, 686, 4518, 102, 2769, 1372, 1599, 3614, 872, 102],
#  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
#  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Map the encoded ids back to their token strings; expected output below.
print(tokenizer.convert_ids_to_tokens(sen_code['input_ids']))
# ['[CLS]', '我', '不', '喜', '欢', '这', '世', '界', '[SEP]', '我', '只', '喜', '欢', '你', '[SEP]']

# Convert the encodings into tensors so they can be fed to the model.
# unsqueeze(0) prepends the batch dimension (a batch holding one sequence).
tokens_tensor = torch.tensor(sen_code['input_ids']).unsqueeze(0)
segments_tensors = torch.tensor(sen_code['token_type_ids']).unsqueeze(0)

# Call eval() on the model before inference.
bert_model.eval()

# Run the encoder with gradient tracking disabled.
with torch.no_grad():
    # NOTE(review): the original author states that the model output is a
    # tuple; the exact return type depends on the model class/version - confirm.
    outputs = encoded_layers = bert_model(tokens_tensor, token_type_ids=segments_tensors)