Earlier we covered how to load pretrained models with AutoModel (and the task-specific AutoModel* variants with a task head). This section mainly covers:
- how the model is used in different tasks
- how the data is processed for each task
- what the model's inputs and outputs look like for each task
1 Text feature extraction task
- Text can be turned into tensors with encode, encode_plus, or batch_encode_plus (a sketch of batch_encode_plus follows the example output below)
- encode
my_tokenizer.encode(text, return_tensors, truncation, pad_to_max_length=True, max_length=30)
- encode tokenizes the sentences in the list, converts them to ids, adds the separator tokens, and packs everything into a single list
- encode returns a plain Python list of ids; it has to be converted to a tensor manually before being fed to the model
- encode parameters
- return_tensors = "pt"  return PyTorch tensors
- max_length  the maximum sequence length; the Chinese BERT tokenizer splits text character by character. If the input is a list of several sentences, encode/encode_plus joins them into one sequence, and max_length caps the length of that combined sequence
- truncation = True  truncate anything longer than max_length
- pad_to_max_length = True  pad shorter sequences with zeros up to max_length (newer transformers versions use padding='max_length' instead)
- encode_plus
my_tokenizer.encode_plus(text, return_tensors, truncation, pad_to_max_length=True, max_length=30)
- takes the same parameters as encode: return_tensors, max_length, truncation, pad_to_max_length
Return value
- encode_plus returns a dict with three keys
- input_ids - the text converted to ids / an id tensor; as with encode, the sentences are joined into one sequence with separator tokens
- token_type_ids - marks which sentence each token belongs to
'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
(1) the first five 0s mean the first five tokens belong to the first sentence
(2) the next eight 1s mark the tokens of the second sentence (standard BERT only distinguishes two segments, 0 and 1)
(3) the trailing 0s correspond to padding
- attention_mask - real token positions are marked 1, padding positions are marked 0
Example output:
{'input_ids': tensor([[ 101, 872, 3221, 6443, 102, 782, 4495, 6421, 1963, 862, 6629, 1928,102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]),
'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0]])}
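batch_encode_plus is mentioned above but not used in the code below. A minimal sketch of how it differs, assuming the same local ./bert-base-chinese checkpoint: unlike encode/encode_plus it keeps each sentence as its own row instead of joining them into one sequence.
from transformers import AutoTokenizer

my_tokenizer = AutoTokenizer.from_pretrained('./bert-base-chinese')
# batch_encode_plus treats the list as a batch of independent sentences:
# one row per sentence, padded to a common length
batch = my_tokenizer.batch_encode_plus(['你是谁', '人生该如何起头'],
                                       return_tensors='pt',
                                       padding=True,      # pad to the longest sentence in the batch
                                       truncation=True,
                                       max_length=30)
print(batch['input_ids'].shape)    # (2, longest_len) - one row per sentence
print(batch['attention_mask'])     # 1 = real token, 0 = padding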
import torch
import numpy as np
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import (AutoModelForSequenceClassification, AutoModelForMaskedLM, AutoModelForQuestionAnswering,
                          AutoModelForSeq2SeqLM, AutoModelForTokenClassification)
# Feature extraction task - model without a task head  bert-base-chinese
# 1 Load my_tokenizer  AutoTokenizer.from_pretrained(...)
# 2 Load the model my_model  AutoModel.from_pretrained()
# 3 Text to tensor  my_tokenizer.encode_plus(text, return_tensors, truncation, pad_to_max_length=True, max_length=30)
# 4 Feed data to the model  my_model.eval()  my_model(**msgs_tensor)
def dm02_test_feature_extraction():
    # 1 Load my_tokenizer
    my_tokenizer = AutoTokenizer.from_pretrained('./bert-base-chinese')
    # 2 Load the headless model my_model
    my_model = AutoModel.from_pretrained('./bert-base-chinese')
    # 3 Text to tensor  my_tokenizer.encode_plus(text, return_tensors, truncation, pad_to_max_length=True, max_length=30)
    message = ['你是谁', '人生该如何起头']
    # [101, 872, 3221, 6443, 102, 782, 4495, 6421, 1963, 862, 6629, 1928, 102]
    # 101 - start-of-sequence token [CLS]
    # 102 - separator token [SEP]
    # encode tokenizes the two sentences in the list, converts them to ids, adds separators and packs them into one list
    # encode returns a plain list of ids, so it must be converted to a tensor manually before being fed to the model
    input1 = my_tokenizer.encode(message)
    print(input1)
    print("shape of input1", np.array(input1).shape)  # (13,)
    input1 = torch.tensor([input1])
    # Using encode_plus
    my_input = my_tokenizer.encode_plus(text=message, return_tensors='pt',
                                        truncation=True, pad_to_max_length=True, max_length=30)
    print('my_input-->', my_input)
    # 4 Feed data to the model  my_model.eval()  my_model(**msgs_tensor)
    my_model.eval()
    with torch.no_grad():
        output1, _ = my_model(input1, return_dict=False)
        print("output1 --> ", output1)
        print("output1 --> ", output1.shape)  # output1.shape = (1, 13, 768)
        print("--------------------------------------")
        # my_output is an output class; the token features are in its last_hidden_state attribute
        my_output = my_model(**my_input)
        print('my_output--->', my_output.last_hidden_state.shape)  # (1, 30, 768)
        print('my_output--->', my_output)  # my_output is an output class, not a plain tensor
    '''
    # input_ids: the text converted to ids   token_type_ids: segment info   attention_mask: padding mask
    my_input--> {'input_ids': tensor([[ 101,  872, 3221, 6443,  102,  782, 4495, 6421, 1963,  862, 6629, 1928,
              102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
                0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0]])}
    BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor(
    '''
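last_hidden_state above holds one 768-dim vector per token. A minimal sketch, my own addition rather than part of the original notes, of pooling those vectors into a single sentence embedding; it assumes the same local ./bert-base-chinese checkpoint.
# Hedged sketch: masked mean pooling of the token features into one sentence embedding.
def sentence_embedding(text):
    my_tokenizer = AutoTokenizer.from_pretrained('./bert-base-chinese')
    my_model = AutoModel.from_pretrained('./bert-base-chinese')
    enc = my_tokenizer.encode_plus(text, return_tensors='pt',
                                   truncation=True, padding='max_length', max_length=30)
    my_model.eval()
    with torch.no_grad():
        out = my_model(**enc)                             # last_hidden_state: (1, 30, 768)
    mask = enc['attention_mask'].unsqueeze(-1).float()    # (1, 30, 1): 1 for real tokens, 0 for padding
    summed = (out.last_hidden_state * mask).sum(dim=1)    # sum token vectors, ignoring padding
    return summed / mask.sum(dim=1)                       # (1, 768) sentence vector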
2 Text classification task
- The model returns a SequenceClassifierOutput object; the classification scores are stored in its logits attribute (a follow-up sketch after the function below turns them into a label)
# Learning goals
# 1 Which kind of model each task uses (e.g. classification model, reading-comprehension model)
# 2 For each task model (e.g. classification), what the input my_input and the output data formats look like
# 3 How the tokenizer controls the data format for each task
# Sentiment classification task - outline  chinese_sentiment
# 1 Load my_tokenizer  AutoTokenizer.from_pretrained('')
# 2 Load the model my_model  AutoModelForSequenceClassification.from_pretrained('./chinese_sentiment')
# 3 Text to tensor  my_tokenizer.encode(text, return_tensors, padding, truncation, max_length)
# 4 Feed data to the model  my_model.eval()  my_model(my_input1)
def dm01_test_classification():
    # 1 Load my_tokenizer  AutoTokenizer.from_pretrained('')
    my_tokenizer = AutoTokenizer.from_pretrained('./bert-base-chinese')
    # print('my_tokenizer-->', my_tokenizer)
    # 2 Load the model my_model  AutoModelForSequenceClassification.from_pretrained('./chinese_sentiment')
    # Note: with './bert-base-chinese' the classification head is randomly initialized; the
    # sentiment-finetuned checkpoint from the outline ('./chinese_sentiment') gives meaningful logits
    my_model = AutoModelForSequenceClassification.from_pretrained('./bert-base-chinese')
    # print('my_model-->', my_model)
    # 3 Text to tensor  my_tokenizer.encode(text, return_tensors, padding, truncation, max_length)
    # Use encode to convert the text to ids and then to a tensor
    message = '人生该如何起头'
    my_input1 = my_tokenizer.encode(text=message, return_tensors='pt', padding=True, truncation=True, max_length=20)
    print('my_input1-->', my_input1)
    # 4 Feed data to the model  my_model.eval()  my_model(my_input1)
    my_model.eval()  # switch the model to evaluation mode
    with torch.no_grad():
        # The result is a SequenceClassifierOutput object
        # The classification scores are in its logits attribute
        # SequenceClassifierOutput(loss=None, logits=tensor([[-0.3093, 0.4708]]), hidden_states=None, attentions=None)
        my_output1 = my_model(my_input1)
        print('my_output1-->', my_output1.logits, my_output1.logits.shape)
    pass
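As noted above, the scores live in logits. A minimal sketch, my own addition, of mapping them to a probability and a label name; the id2label mapping depends on the checkpoint, so the label names mentioned in the comments are assumptions.
# Hedged sketch: softmax over the logits, then map the best index to a label name.
# Assumes my_tokenizer / my_model are loaded as in dm01_test_classification above.
def predict_sentiment(text, my_tokenizer, my_model):
    ids = my_tokenizer.encode(text=text, return_tensors='pt', truncation=True, max_length=20)
    my_model.eval()
    with torch.no_grad():
        logits = my_model(ids).logits              # shape (1, num_labels)
    probs = torch.softmax(logits, dim=-1)          # class probabilities
    pred_id = torch.argmax(probs, dim=-1).item()
    # id2label comes from the checkpoint config; a 2-class sentiment checkpoint typically maps
    # it to something like negative/positive, while bert-base-chinese only has generic LABEL_0/LABEL_1
    return my_model.config.id2label[pred_id], probs[0][pred_id].item()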
3 Fill-mask (cloze) task
# Fill-mask task - outline  ./chinese-bert-wwm  bert-base-chinese
# 1 Load my_tokenizer  AutoTokenizer.from_pretrained()
# 2 Load the model my_model  AutoModelForMaskedLM.from_pretrained()
# 3 Text to tensor  input  my_tokenizer.encode_plus('xx[MASK]xx', return_tensors='pt')
# 4 Feed data to the model  my_model.eval()  my_model(**input)
# 5 Take the highest-probability prediction at the [MASK] position
#   mask_pred_idx = torch.argmax(output.logits[0][6]).item()
#   my_tokenizer.convert_ids_to_tokens([mask_pred_idx])
def dm03_test_fill_mask():
    # 1 Load my_tokenizer  AutoTokenizer.from_pretrained()
    my_tokenizer = AutoTokenizer.from_pretrained('./bert-base-chinese')
    # 2 Load the model my_model  AutoModelForMaskedLM.from_pretrained()
    my_model = AutoModelForMaskedLM.from_pretrained('./bert-base-chinese')
    # 3 Text to tensor  input  my_tokenizer.encode_plus('xx[MASK]xx', return_tensors='pt')
    my_input = my_tokenizer.encode_plus('我想明天去[MASK]家吃饭.', return_tensors='pt')
    # 4 Feed data to the model  my_model.eval()  my_model(**input)
    my_model.eval()
    with torch.no_grad():
        my_output = my_model(**my_input)
    # 5 Take the highest-probability prediction
    print("my_output.logits.shape", my_output.logits.shape)  # torch.Size([1, 12, 21128])
    # Index 6 is the [MASK] position in this particular sentence ([CLS] 我 想 明 天 去 [MASK] ...)
    mask_pred_idx = torch.argmax(my_output.logits[0][6]).item()  # id with the highest score at the [MASK] position
    # equivalent: torch.argmax(my_output.logits[:, 6, :]).item()
    myword = my_tokenizer.convert_ids_to_tokens([mask_pred_idx])  # look the id up in the vocabulary
    print('myword-->', myword)
    pass
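The index 6 above is hardcoded for that one sentence. A minimal sketch, my own addition, that locates [MASK] from the input ids via my_tokenizer.mask_token_id and shows the top-k candidate fillers.
# Hedged sketch: find the [MASK] position dynamically and return the k most likely fillers.
# Assumes my_tokenizer / my_model are loaded as in dm03_test_fill_mask above.
def fill_mask_topk(text, my_tokenizer, my_model, k=5):
    enc = my_tokenizer.encode_plus(text, return_tensors='pt')
    mask_pos = (enc['input_ids'][0] == my_tokenizer.mask_token_id).nonzero()[0].item()
    my_model.eval()
    with torch.no_grad():
        logits = my_model(**enc).logits               # (1, seq_len, vocab_size)
    topk = torch.topk(logits[0, mask_pos], k)         # best k vocabulary ids at the [MASK] position
    return my_tokenizer.convert_ids_to_tokens(topk.indices.tolist())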
4 Text summarization
# Text summarization - outline  path = "./distilbart-cnn-12-6"
# 1 Load my_tokenizer  AutoTokenizer.from_pretrained(...)
# 2 Load the model my_model  AutoModelForSeq2SeqLM.from_pretrained()
# 3 Text to tensor  my_input  my_tokenizer([text], return_tensors='pt')
# 4 Feed data to the model  my_model.generate(my_input.input_ids)  # summarization is generative: for a single unpadded input only input_ids are needed, no attention mask or segment ids
# 5 Decode with the tokenizer  [my_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in my_output]
#   skip_special_tokens: skip special tokens when mapping ids back to words   clean_up_tokenization_spaces=False: do not strip the spaces between tokens
def dm05_test_summarization():
    text = "BERT is a transformers model pretrained on a large corpus of English data " \
"in a self-supervised fashion. This means it was pretrained on the raw texts " \
"only, with no humans labelling them in any way (which is why it can use lots " \
"of publicly available data) with an automatic process to generate inputs and " \
"labels from those texts. More precisely, it was pretrained with two objectives:Masked " \
"language modeling (MLM): taking a sentence, the model randomly masks 15% of the " \
"words in the input then run the entire masked sentence through the model and has " \
"to predict the masked words. This is different from traditional recurrent neural " \
"networks (RNNs) that usually see the words one after the other, or from autoregressive " \
"models like GPT which internally mask the future tokens. It allows the model to learn " \
"a bidirectional representation of the sentence.Next sentence prediction (NSP): the models" \
" concatenates two masked sentences as inputs during pretraining. Sometimes they correspond to " \
"sentences that were next to each other in the original text, sometimes not. The model then " \
"has to predict if the two sentences were following each other or not."
    # 1 Load the tokenizer
    my_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="./distilbart-cnn-12-6")
    # 2 Load the model
    my_model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path='./distilbart-cnn-12-6')
    # 3 Text to tensor
    my_input = my_tokenizer([text], return_tensors='pt')
    print('my_input--->', my_input)
    # 4 Feed the input to the model to generate the summary
    my_model.eval()
    my_output = my_model.generate(my_input.input_ids)
    print('my_output--->', my_output)
    # 5 Post-process the summary
    # 5-1 skip_special_tokens=True  skip the special control tokens when decoding
    #     clean_up_tokenization_spaces=False  do not strip the spaces introduced by tokenization
    print([my_tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in my_output])
    pass
    # 5-2 convert_ids_to_tokens only maps ids back to tokens; it cannot remove the special control tokens
    # print(my_tokenizer.convert_ids_to_tokens(my_output[0]))
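generate is called above with its default settings. A minimal sketch, reusing the my_tokenizer / my_model / my_input objects from dm05_test_summarization, of the usual beam-search knobs; the concrete numbers are illustrative assumptions, not values from the original notes.
# Hedged sketch: common generation parameters for summarization (values are illustrative).
my_output = my_model.generate(my_input.input_ids,
                              num_beams=4,          # beam search width
                              max_length=80,        # cap the summary length in tokens
                              min_length=20,        # force a minimum summary length
                              length_penalty=2.0,   # >1.0 favours longer candidates under beam search
                              early_stopping=True)  # stop once all beams are finished
print(my_tokenizer.decode(my_output[0], skip_special_tokens=True))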
5 Reading comprehension (extractive QA) task
# Reading comprehension (extractive QA) - outline  './chinese_pretrain_mrc_roberta_wwm_ext_large'
# 1 Load my_tokenizer  AutoTokenizer.from_pretrained(...)
# 2 Load the model my_model  AutoModelForQuestionAnswering.from_pretrained()
# 3 Text to tensor  for q in qs: input = my_tokenizer.encode_plus(question, context, return_tensors='pt')
# 4 Feed data to the model  my_model(**input)
# 5 Take the positions with the highest probability
#   start, end = torch.argmax(output.start_logits), torch.argmax(output.end_logits) + 1
#   answer = my_tokenizer.convert_ids_to_tokens(input['input_ids'][0][start:end])
def dm04_test_question_answering():
    # path = 'bert-base-chinese'
    path = './bert-base-chinese'
    # Note: with bert-base-chinese the QA head is randomly initialized; the MRC-finetuned checkpoint
    # from the outline ('./chinese_pretrain_mrc_roberta_wwm_ext_large') is the one intended for
    # meaningful answers such as the sample output below
    # 1 Load the tokenizer
    my_tokenizer = AutoTokenizer.from_pretrained(path)
    # 2 Load the model
    my_model = AutoModelForQuestionAnswering.from_pretrained(path)
    # 3 Text to tensor
    # Chinese punctuation in the text can affect the prediction; the punctuation can also simply be removed
    context = '我叫张三 我是一个程序员 我的喜好是打篮球'  # the answers are extracted from this context
    questions = ['我是谁?', '我是做什么的?', '我的爱好是什么?']
    # questions = ['你是男孩还是女孩?', '我是做什么的?', '我的爱好是什么?']
    # 4 Feed data to the model to do extractive question answering
    my_model.eval()
    for question in questions:
        my_input = my_tokenizer.encode_plus(question, context, return_tensors='pt')
        print('my_input--->', my_input)
        my_output = my_model(**my_input)
        print('my_output--->', my_output)
        print('output.start_logits.shape--->', my_output.start_logits.shape)
        start, end = torch.argmax(my_output.start_logits), torch.argmax(my_output.end_logits) + 1
        answer = my_tokenizer.convert_ids_to_tokens(my_input['input_ids'][0][start:end])
        print('question:', question, 'answer:', answer)
        # break
'''
input---> {'input_ids': tensor([[ 101, 2769, 3221, 6443, 8043, 102, 2769, 1373, 2476, 676, 2769, 3221,
671, 702, 4923, 2415, 1447, 2769, 4638, 1599, 1962, 3221, 2802, 5074,
4413, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1]])}
output---> QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ -1.9978, -11.4788, -12.6324, -11.8324, -12.4148, -11.9371, -2.7246,
-6.6402, 3.9131, -2.9533, -7.0866, -9.5696, -4.2775, -8.9042,
0.5753, -6.9468, -7.0469, -8.5334, -11.3796, -9.3905, -11.0242,
-11.1047, -5.7124, -2.7293, -7.5896, -12.6013]],
grad_fn=<SqueezeBackward1>), end_logits=tensor([[ -1.3483, -12.0141, -11.6312, -11.6629, -11.9607, -12.0039, -4.6118,
-7.4034, -2.3499, 4.7159, -7.2880, -9.5317, -6.6742, -6.0915,
-7.0023, -4.9691, 1.4515, -7.8329, -9.0895, -10.3742, -8.7482,
-9.8567, -7.2930, -5.8163, -1.7323, -12.2525]],
grad_fn=<SqueezeBackward1>), hidden_states=None, attentions=None)
output.start_logits.shape---> torch.Size([1, 26])
question: 我是谁? answer: ['张', '三']
'''
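A small sketch, my own addition, that wraps the start/end handling above into a helper, joins the answer tokens into a string, and guards against the end index landing before the start.
# Hedged sketch: extract the answer span as a string and handle the degenerate case
# where argmax(end_logits) falls before argmax(start_logits).
# Assumes my_tokenizer / my_model are loaded as in dm04_test_question_answering.
def extract_answer(question, context, my_tokenizer, my_model):
    enc = my_tokenizer.encode_plus(question, context, return_tensors='pt')
    my_model.eval()
    with torch.no_grad():
        out = my_model(**enc)
    start = torch.argmax(out.start_logits).item()
    end = torch.argmax(out.end_logits).item() + 1
    if end <= start:                      # degenerate span: no usable answer
        return ''
    tokens = my_tokenizer.convert_ids_to_tokens(enc['input_ids'][0][start:end])
    return ''.join(tokens)                # Chinese characters join without spaces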
6 Named entity recognition (NER) task
# NER task - outline  './roberta-base-finetuned-cluener2020-chinese'
# 1-1 Load my_tokenizer  AutoTokenizer.from_pretrained(...)
# 1-2 Load the model my_model  AutoModelForTokenClassification.from_pretrained()
# 1-3 Load the NER label config  my_config = AutoConfig.from_pretrained(my_path)
# 2 Text to tensor  my_input  my_tokenizer.encode_plus(msg, return_tensors='pt')
# 3 Feed data to the model  my_model(my_input.input_ids)
# 4 Two id-to-text mappings: the input tokens and the predicted labels
# 4-1 input_tokens = my_tokenizer.convert_ids_to_tokens(my_input.input_ids[0])
# 4-2 zip the [input token, predicted label] pairs together
#     for token, value in zip(input_tokens, my_output.logits[0]):
def dm06_test_ner():
    # 1-1 Load the tokenizer, the model and the config
    # https://huggingface.co/uer/roberta-base-finetuned-cluener2020-chinese
    my_path = './roberta-base-finetuned-cluener2020-chinese'
    my_tokenizer = AutoTokenizer.from_pretrained(my_path)
    my_model = AutoModelForTokenClassification.from_pretrained(my_path)
    my_config = AutoConfig.from_pretrained(my_path)
    # 2 Text to tensor
    my_input = my_tokenizer.encode_plus('我爱北京天安门,天安门上太阳升', return_tensors='pt')
    print('my_input--->', my_input)
    print('my_input.input_ids.shape--->', my_input.input_ids.shape, my_input.input_ids)  # torch.Size([1, 17])
    # 3 Feed the model: predict the label probabilities for every character
    my_model.eval()
    my_output = my_model(my_input.input_ids)
    # print('my_output--->', my_output)  # TokenClassifierOutput object (loss, logits, hidden_states, attentions)
    print('my_output.logits.shape--->', my_output.logits.shape)  # torch.Size([1, 17, 32])
    # 4 Two id-to-text mappings: the input tokens and the predicted labels
    # 4-1 Map the input ids back to tokens (id2token)
    input_tokens = my_tokenizer.convert_ids_to_tokens(my_input.input_ids[0])
    print('input_tokens--->', input_tokens)
    # 4-2 zip the [input token, predicted label] pairs together
    ner_result = []
    # my_output.logits has shape [1, 17, 32]; logits[0] is [17, 32] - 32 label scores per token
    for token, value in zip(input_tokens, my_output.logits[0]):
        if token in my_tokenizer.all_special_tokens:
            continue
        # index of the label with the highest score for this character
        idx = torch.argmax(value).item()
        # map the index back to its label name and collect (token, label) pairs for output
        ner_result.append((token, my_config.id2label[idx]))
    print('final NER result ner_result--->\n', ner_result)
    '''
    # There is no code you cannot read, only business logic you have not learned yet
    [('我', 'O'), ('爱', 'O'), ('北', 'B-address'), ('京', 'I-address'), ('天', 'I-address'), ('安', 'I-address'),
    ('门', 'I-address'), (',', 'O'), ('天', 'B-address'), ('安', 'I-address'), ('门', 'I-address'), ('上', 'O'), ('太', 'O'),
    ('阳', 'O'), ('升', 'O')]
    '''
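The (token, label) pairs above can still be grouped into entities. A minimal sketch, my own addition, that merges consecutive B-/I- tags of the same type into entity spans.
# Hedged sketch: collapse BIO-tagged (token, label) pairs such as ner_result above
# into (entity_text, entity_type) spans.
def merge_entities(ner_result):
    entities, cur_text, cur_type = [], '', None
    for token, label in ner_result:
        if label.startswith('B-'):                  # a new entity starts here
            if cur_text:
                entities.append((cur_text, cur_type))
            cur_text, cur_type = token, label[2:]
        elif label.startswith('I-') and cur_type == label[2:]:
            cur_text += token                       # continue the current entity
        else:                                       # 'O' or an inconsistent tag ends the entity
            if cur_text:
                entities.append((cur_text, cur_type))
            cur_text, cur_type = '', None
    if cur_text:
        entities.append((cur_text, cur_type))
    return entities

# e.g. merge_entities(ner_result) -> [('北京天安门', 'address'), ('天安门', 'address')]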