How tokens are split in the keras_bert package vs. transformers.BertTokenizer

Code adapted from the tokenizer in keras-bert:

def word_piece_tokenize(word, token_dict):
    if word in token_dict:
        return [word]
    tokens = []
    start, stop = 0, 0
    while start < len(word):
        stop = len(word)
        while stop > start:
            sub = word[start:stop]
            if start > 0:  # prefix non-initial sub-words with ##
                sub = '##' + sub
            if sub in token_dict:
                break
            stop -= 1
        if start == stop:  # no piece matched: keep the single character as-is and move on
            stop += 1
        tokens.append(sub)
        start = stop
    return tokens
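
As a quick illustration, here is a call with a small hypothetical vocabulary (the token set below is made up just to split the example word):

token_dict = {'un', '##aff', '##able'}                # hypothetical vocabulary
print(word_piece_tokenize('unaffable', token_dict))   # ['un', '##aff', '##able']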

Code adapted from BertTokenizer in the transformers package:

class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue
            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    print(chars)  # debug: show the characters that could not be matched
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
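
A minimal usage sketch: whitespace_tokenize below is a stand-in for the helper of the same name in transformers (it just splits on whitespace), and the vocabulary is made up for the example:

def whitespace_tokenize(text):
    """Splits text on whitespace, mimicking the helper used by WordpieceTokenizer."""
    text = text.strip()
    return text.split() if text else []

vocab = {'un', '##aff', '##able', '[UNK]'}            # hypothetical vocabulary
wp = WordpieceTokenizer(vocab=vocab, unk_token='[UNK]')
print(wp.tokenize('unaffable'))                       # ['un', '##aff', '##able']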

The difference between the two:
in the WordpieceTokenizer used by transformers.BertTokenizer, a word that cannot be matched against the vocabulary is replaced with the UNK token;
keras_bert's tokenizer does not do this, and instead emits the unmatched pieces as-is.
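
The divergence shows up on an out-of-vocabulary word; continuing with the hypothetical vocabulary from the examples above:

vocab = {'un', '##aff', '##able', '[UNK]'}
print(word_piece_tokenize('xyz', vocab))              # ['x', '##y', '##z']  -- unmatched pieces kept
print(WordpieceTokenizer(vocab, unk_token='[UNK]').tokenize('xyz'))
# ['[UNK]']  -- the whole word collapses to UNK (the debug print also echoes ['x', 'y', 'z'])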
