Getting Sentence Embeddings from a Pretrained BERT Model with the transformers Package
1. Loading the pretrained BERT tokenizer with the transformers package
# 1. Imports
import torch
from transformers import BertTokenizer

# 2. Name of the pretrained model to use
model_name = 'bert-base-uncased'

# 3. Load the tokenizer from the pretrained vocabulary
tokenizer = BertTokenizer.from_pretrained(model_name)
sentence = "A very clean and well decorated empty bathroom."
2. Getting the sentence Embedding
(1) The encode() method: returns only input_ids
def encode(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs,
) -> List[int]
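Note the return_tensors parameter: by default encode() returns a plain Python list of ids, while return_tensors='pt' wraps them in a [1, seq_len] PyTorch tensor, which is the form the model consumes. A small sketch:

ids_list = tokenizer.encode(sentence)                         # plain Python list of ids
ids_tensor = tokenizer.encode(sentence, return_tensors='pt')  # LongTensor of shape [1, seq_len]
print(type(ids_list), ids_tensor.shape)
# <class 'list'> torch.Size([1, 11])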
(2) The encode_plus() method: returns all the encoding information
def encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs,
) -> BatchEncoding
The commonly returned fields are:
- input_ids: the ids of the tokens in the model's vocabulary
- token_type_ids: segment ids distinguishing two sentences (all 0 for the first sentence, all 1 for the second; a sentence-pair sketch follows the examples below)
- attention_mask: marks which positions self-attention should attend to (1 for real tokens, 0 for padding)
- Examples:
- (1) The encode() method
# 4. Get the Embedding
# (1) The encode() method
print(tokenizer.encode(sentence))  # encode() returns only input_ids
# [101, 1037, 2200, 4550, 1998, 2092, 7429, 4064, 5723, 1012, 102]
- (2) The encode_plus() method
# (2) The encode_plus() method
print(tokenizer.encode_plus(sentence))  # encode_plus() returns all the encoding information
- (3) Customizing the encode_plus() parameters
# (3) Customizing the encode_plus() parameters: pad/truncate to max_length=15
print(tokenizer.encode_plus(sentence, max_length=15, padding='max_length', return_attention_mask=True, return_token_type_ids=True, truncation=True))
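In the single-sentence examples above, token_type_ids stay all zero. Passing a second sentence via text_pair shows the 0/1 segmentation described earlier; a minimal sketch (the second sentence is an arbitrary illustrative choice):

# token_type_ids mark segments: 0 for [CLS] + sentence 1 + [SEP], 1 for sentence 2 + final [SEP]
pair = tokenizer.encode_plus(sentence, text_pair="It is very clean.")
print(pair['token_type_ids'])
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]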
3. Example: the code above, consolidated into a runnable script
import torch
from transformers import BertTokenizer

model_name = 'bert-base-uncased'
# a. Load the tokenizer from the pretrained vocabulary
tokenizer = BertTokenizer.from_pretrained(model_name)
sentence = "A very clean and well decorated empty bathroom."

# encode() returns only input_ids
print(tokenizer.encode(sentence))
# [101, 1037, 2200, 4550, 1998, 2092, 7429, 4064, 5723, 1012, 102]

# encode_plus() returns all the encoding information
print(tokenizer.encode_plus(sentence))
# {'input_ids': [101, 1037, 2200, 4550, 1998, 2092, 7429, 4064, 5723, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# encode_plus() with custom parameters: pad/truncate to max_length=15
print(tokenizer.encode_plus(sentence, max_length=15, padding='max_length', return_attention_mask=True, return_token_type_ids=True, truncation=True))
# {'input_ids': [101, 1037, 2200, 4550, 1998, 2092, 7429, 4064, 5723, 1012, 102, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]}
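The tokenizer only produces ids; to obtain the actual sentence embedding promised in the title, the encoded inputs have to be run through the BERT model itself. A minimal sketch, assuming a recent transformers version where the model returns a ModelOutput; taking the [CLS] vector (or mean-pooling the token vectors) is a common convention rather than the only way to define a sentence embedding:

import torch
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.eval()  # inference mode: disables dropout

sentence = "A very clean and well decorated empty bathroom."
inputs = tokenizer.encode_plus(sentence, return_tensors='pt')  # tensors of shape [1, seq_len]

with torch.no_grad():  # no gradients needed for feature extraction
    outputs = model(**inputs)

token_embeddings = outputs.last_hidden_state      # [1, seq_len, 768]: one vector per token
cls_embedding = token_embeddings[:, 0, :]         # [CLS] vector, a common sentence embedding
mean_embedding = token_embeddings.mean(dim=1)     # mean-pooled alternative
print(cls_embedding.shape, mean_embedding.shape)  # torch.Size([1, 768]) torch.Size([1, 768])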