How the keras_bert and transformers.BertTokenizer packages split tokens into word pieces

The code below is adapted from the tokenizer in keras-bert:

def word_piece_tokenize(word, token_dict):
    """Greedily split a single word into word pieces, longest match first."""
    if word in token_dict:
        return [word]
    tokens = []
    start, stop = 0, 0
    while start < len(word):
        stop = len(word)
        while stop > start:
            sub = word[start:stop]
            if start > 0:  # prepend "##" to non-initial sub-words
                sub = '##' + sub
            if sub in token_dict:
                break
            stop -= 1
        if start == stop:  # nothing matched: emit the single character as-is (no [UNK])
            stop += 1
        tokens.append(sub)
        start = stop
    return tokens
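
A minimal usage sketch of the function above; the tiny token_dict here is hypothetical and only stands in for a real BERT vocabulary, which holds roughly 30,000 entries:

token_dict = {'un': 0, '##aff': 1, '##able': 2}   # toy vocab: token -> id

print(word_piece_tokenize('unaffable', token_dict))
# -> ['un', '##aff', '##able']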

The code below is adapted from BertTokenizer in the transformers package:

class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue
            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    print(chars)  # debug: show the characters that could not be matched
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
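
The excerpt above relies on whitespace_tokenize, a small helper from the same transformers module that simply strips the text and splits it on whitespace. A minimal stand-in plus a usage sketch (the toy vocabulary is again hypothetical) might look like this:

def whitespace_tokenize(text):
    # Minimal stand-in for the transformers helper: strip, then split on whitespace.
    text = text.strip()
    return text.split() if text else []

vocab = {'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3}
wordpiece = WordpieceTokenizer(vocab=vocab, unk_token='[UNK]')
print(wordpiece.tokenize('unaffable'))
# -> ['un', '##aff', '##able']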

The difference between the two: in transformers.BertTokenizer, the WordpieceTokenizer maps any word it cannot fully match against the vocabulary to the unknown token ([UNK]), whereas the keras_bert tokenizer has no such fallback and instead emits the unmatched characters as individual pieces.
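
A quick side-by-side run makes that difference concrete. Assuming both implementations and the toy vocabulary above are in scope, feeding each one a word whose characters are not in the vocabulary gives:

print(word_piece_tokenize('xyz', vocab))
# keras_bert style -> ['x', '##y', '##z']  (unmatched characters kept as single-character pieces)

print(wordpiece.tokenize('xyz'))
# transformers style -> ['[UNK]']  (the debug print in the excerpt also echoes ['x', 'y', 'z'])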
