SimCSE can be used for similar-text matching, but the TensorFlow side of the transformers library does not yet ship a built-in SimCSE model that can be trained for this downstream task, so a feasibility check was carried out for it.
The relevant code is below; the complete version is at bert_classification/4_bert_sentence_similarity.py · sparkle_code_guy/bert_related_task - Gitee (gitee.com):
import tensorflow as tf
import numpy as np
from transformers import TFBertPreTrainedModel, BertConfig, TFBertMainLayer, BertTokenizer
from transformers.modeling_tf_outputs import TFSemanticSegmenterOutput
from typing import Optional, Tuple, Union
import pandas as pd
from transformers.models.bert.modeling_tf_bert import (
    TFModelInputType,
    TFSequenceClassificationLoss,
    unpack_inputs,
    BERT_INPUTS_DOCSTRING,
)
from transformers.utils import add_start_docstrings_to_model_forward
class TFSimCSE(TFBertPreTrainedModel, TFSequenceClassificationLoss):
    # names with a '.' represent the authorized unexpected/missing layers when a TF model is loaded from a PT model
    _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config: BertConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def call(
        self,
        input_ids: Optional[TFModelInputType] = None,
        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
        head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = None,
        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
        training: Optional[bool] = False,
    ) -> Union[TFSemanticSegmenterOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Accepted for API compatibility but ignored here; the SimCSE loss is computed outside the model
            from the pooled embeddings.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Note: models used with the transformers library must return a tuple or a dict (a ModelOutput subclass).
        # The pooled [CLS] embedding is exposed as `logits` so Keras can feed it straight to the loss function.
        return TFSemanticSegmenterOutput(logits=outputs.pooler_output, loss=None)

    def serving_output(self, output: dict) -> dict:
        return output
def simcse_loss(y_true, y_pred):
    """
    Unsupervised SimCSE loss.
    For a write-up see: https://blog.youkuaiyun.com/sslfk/article/details/123210756
    """
    # build the targets: each embedding's positive is its adjacent paired sentence (0<->1, 2<->3, ...)
    idxs = tf.range(0, tf.shape(y_pred)[0])
    idxs_1 = idxs[None, :]
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
    y_true = tf.equal(idxs_1, idxs_2)
    y_true = tf.cast(y_true, tf.keras.backend.floatx())
    # cosine similarity matrix with the diagonal masked out, scaled by a temperature of 0.05
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
    similarities = similarities - tf.eye(tf.shape(y_pred)[0]) * 1e12
    similarities = similarities / 0.05
    loss = tf.keras.losses.categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)
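# Worked example of the label construction in simcse_loss (illustrative, not from the original script):
# for a batch of 4 embeddings laid out as [x1_a, x2_a, x1_b, x2_b],
#   idxs                    = [0, 1, 2, 3]
#   idxs + 1 - idxs % 2 * 2 = [1, 0, 3, 2]
# so y_true marks (0, 1) and (2, 3) as mutual positives; the masked-out diagonal removes self-similarity
# and every other row of the batch serves as an in-batch negative.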
def simcse_hard_neg_loss(y_true, y_pred):
    """
    SimCSE loss for hard negatives (or random negatives).
    """
    # rows: anchor embeddings; cols: their positives and (hard) negatives
    row = tf.range(0, tf.shape(y_pred)[0], 3)
    col = tf.range(tf.shape(y_pred)[0])
    col = tf.squeeze(tf.where(col % 3 != 0), axis=1)
    y_true = tf.range(0, tf.shape(col)[0], 2)  # tf.shape instead of len() so this also works in graph mode
    y_pred = tf.math.l2_normalize(y_pred, axis=1)
    similarities = tf.matmul(y_pred, y_pred, transpose_b=True)
    similarities = tf.gather(similarities, row, axis=0)
    similarities = tf.gather(similarities, col, axis=1)
    similarities = similarities / 0.05
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, similarities, from_logits=True)
    return tf.reduce_mean(loss)
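# Worked example of the index construction in simcse_hard_neg_loss (illustrative): embeddings are assumed
# to arrive in triples [anchor, positive, hard_negative, anchor, positive, hard_negative, ...].
# For a batch of 6:
#   row    = [0, 3]          (anchor rows)
#   col    = [1, 2, 4, 5]    (positive and negative columns)
#   y_true = [0, 2]          (each anchor's positive sits at gathered column 0 and 2 respectively)
# so the sparse cross-entropy pulls each anchor towards its own positive and pushes it away from the
# hard negatives and the other pairs in the batch.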
max_length = 60
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
def simcse_generater():
    df_raw = pd.read_csv("data/sts_data/senteval_cn/ATEC/ATEC.train.data", sep="\t", header=None,
                         names=["x1", "x2", "y"])

    def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
        return {
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_masks,
        }, label

    def encode_examples(ds, limit=-1):
        # prepare lists so that the final TensorFlow dataset can be built from slices
        input_ids_list = []
        token_type_ids_list = []
        attention_mask_list = []
        label_list = []
        if limit > 0:
            ds = ds.head(limit)
        for index, row in ds.iterrows():
            x1 = row["x1"]
            x2 = row["x2"]
            # encode the two sentences of each pair back to back so they stay adjacent in the dataset
            for each in (x1, x2):
                bert_input = tokenizer.encode_plus(each,
                                                   add_special_tokens=True,  # add [CLS], [SEP]
                                                   padding='max_length',
                                                   truncation=True,
                                                   max_length=max_length,  # max length of the text that can go to BERT
                                                   return_attention_mask=True)  # mask out the padding tokens
                input_ids_list.append(bert_input['input_ids'])
                token_type_ids_list.append(bert_input['token_type_ids'])
                attention_mask_list.append(bert_input['attention_mask'])
                label_list.append([0])  # dummy label; simcse_loss rebuilds the targets itself
        return tf.data.Dataset.from_tensor_slices(
            (input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

    # train dataset; do not shuffle element-wise here: simcse_loss relies on each (x1, x2) pair staying
    # adjacent inside a batch (shuffle the DataFrame rows beforehand if shuffling is needed)
    batch_size = 100
    ds_train_encoded = encode_examples(df_raw).batch(batch_size)
    return ds_train_encoded
learning_rate = 2e-5
my_model = TFSimCSE.from_pretrained('bert-base-chinese')
# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1)
# compile with the custom SimCSE contrastive loss (the dataset's dummy labels are ignored inside simcse_loss)
my_model.compile(optimizer=optimizer, loss=simcse_loss)
# fit model
bert_history = my_model.fit(simcse_generater(), epochs=1)
# save the trained encoder as a TensorFlow SavedModel
tf.keras.models.save_model(my_model, filepath="my_model1")
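After training, the encoder can be used directly for similar-text matching. A minimal sketch (the two sentences and the cosine-similarity comparison are illustrative, not part of the original script):

inputs = tokenizer(["花呗如何还款", "花呗怎么还钱"], padding="max_length", truncation=True,
                   max_length=max_length, return_tensors="tf")
embeddings = my_model(inputs).logits                    # pooled [CLS] vectors, shape (2, hidden_size)
embeddings = tf.math.l2_normalize(embeddings, axis=1)
cosine_similarity = tf.reduce_sum(embeddings[0] * embeddings[1])  # in [-1, 1]; higher means more similar
print(float(cosine_similarity))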
Notes:
Models trained with the transformers library must return a tuple or a dict, where the dict can be a concrete ModelOutput subclass.
The STS datasets used above can be downloaded from: https://pan.baidu.com/s/1JzzDVjaBRrDjYGgPJ6D4hQ?pwd=cxa6 (extraction code: cxa6).
The TensorFlow BERT model adds a pooler layer on top of the original transformer encoder: it takes the encoding of the first token of the encoder's last hidden state and runs it through dense + tanh; see TFBertPooler (a minimal sketch is given after these notes).
The original authors' code: https://github.com/princeton-nlp/SimCSE#model-list. The released SimCSE defaults target English; for Chinese, swap in a Chinese pretrained BERT and a Chinese training set.
A TF2 version: https://github.com/jifei/simcse-tf2.git. It depends on bert4keras; note that it is not compatible with newer TF2 releases.
For background on the transformer, BERT, and related branches, see the Feishu document: transformer以及各个领域方向相关技术及分支走势 - 飞书云文档 (feishu.cn).
For an introduction to the transformer with a code walkthrough, see the TensorFlow Core tutorial 理解语言的 Transformer 模型 (google.cn).
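For reference, a minimal sketch of what the pooler layer does (it mirrors the behaviour described above; it is not the library code itself):

import tensorflow as tf

class SimplePooler(tf.keras.layers.Layer):
    # take the first ([CLS]) token of the last hidden state and apply Dense + tanh
    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(hidden_size, activation="tanh")

    def call(self, hidden_states):         # hidden_states: (batch, seq_len, hidden_size)
        first_token = hidden_states[:, 0]  # (batch, hidden_size)
        return self.dense(first_token)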