Python functions — sequence preprocessing with pad_sequences()
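Neural-network layers expect input of a fixed shape, but tokenized text yields sequences of different lengths. `pad_sequences()` resolves this by padding every sequence with zeros (at the front by default) and truncating anything longer than `maxlen`, returning a single 2D array. A minimal sketch of its behavior (the sequences below are illustrative):

```
from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5], [6]]

print(pad_sequences(seqs, maxlen=4))                  # pre-padding (default)
# [[0 1 2 3]
#  [0 0 4 5]
#  [0 0 0 6]]

print(pad_sequences(seqs, maxlen=4, padding='post'))  # zeros appended instead
# [[1 2 3 0]
#  [4 5 0 0]
#  [6 0 0 0]]

print(pad_sequences(seqs, maxlen=2))                  # too-long sequences are
# [[2 3]                                              # truncated from the front
#  [4 5]                                              # by default
#  [0 6]]
```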

A fuller example: fine-tuning a pretrained BERT model on the IMDB review dataset.

```
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.datasets import imdb
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Load the pretrained BERT model and its tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile the model (3e-5 is a typical fine-tuning learning rate for BERT)
model.compile(optimizer=Adam(learning_rate=3e-5),
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Load the IMDB dataset; this yields sequences of word indices, not text
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# BERT's tokenizer expects raw strings, so decode the word indices back to
# text first (load_data reserves indices 0-2 for <pad>/<start>/<unk>)
word_index = imdb.get_word_index()
index_word = {i + 3: w for w, i in word_index.items()}
def to_text(seq):
    return ' '.join(index_word.get(i, '?') for i in seq if i > 2)

maxlen = 100

# Convert the data into BERT's input format (input_ids + attention_mask);
# the tokenizer itself pads and truncates every review to maxlen
def encode_data(seqs, labels):
    texts = [to_text(s) for s in seqs]
    encoded = tokenizer(texts,
                        add_special_tokens=True,
                        max_length=maxlen,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='tf')
    return ({'input_ids': encoded['input_ids'],
             'attention_mask': encoded['attention_mask']},
            tf.convert_to_tensor(labels))

train_data, train_labels = encode_data(x_train, y_train)
test_data, test_labels = encode_data(x_test, y_test)

# Fine-tune the model
model.fit(train_data, train_labels, epochs=3, batch_size=32,
          validation_data=(test_data, test_labels))

# Evaluate the model
test_loss, test_acc = model.evaluate(test_data, test_labels)
print(f'Transfer-learning model accuracy on the IMDB test set: {test_acc}')
```

What does this code do, step by step?
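In short: the script loads the `bert-base-uncased` checkpoint with a two-class classification head, compiles it with a small fine-tuning learning rate, converts each IMDB review into the `input_ids`/`attention_mask` pair BERT expects (padded or truncated to 100 tokens), fine-tunes for 3 epochs, and reports accuracy on the test set. One detail worth stressing: `imdb.load_data()` returns sequences of word indices, not text, so the indices must be decoded back to strings before BERT's tokenizer can re-tokenize them with its own vocabulary. Unlike the plain-Keras pipeline below, `pad_sequences()` is not needed here, because the tokenizer performs its own padding and truncation via `padding='max_length'` and `truncation=True`.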
A related example: a GRU sentiment classifier on IMDB that uses `pad_sequences()` with post-padding.

```
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
import numpy as np
import matplotlib.pyplot as plt

# Load the IMDB dataset (vocabulary capped at 4000 words)
num_words = 4000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

# Pad every sequence to a uniform length of 400, appending zeros at the end
maxlen = 400
x_train = pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, padding='post')

# Build a Sequential model
model = Sequential()
# Embedding layer (vocabulary 4000, 32-dimensional vectors, input length 400)
model.add(Embedding(input_dim=num_words, output_dim=32, input_length=maxlen))
# Dropout layer (drop rate 0.3)
model.add(Dropout(0.3))
# GRU layer (64 units)
model.add(GRU(units=64))
# Dropout layer (drop rate 0.3)
model.add(Dropout(0.3))
# Output layer (binary classification, sigmoid activation)
model.add(Dense(1, activation='sigmoid'))

# Show the model structure
model.summary()

# Compile the model (RMSprop optimizer, binary cross-entropy loss)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model (batch_size=64, 10 epochs, 20% of the data held out for validation)
history = model.fit(x_train, y_train,
                    batch_size=64,
                    epochs=10,
                    validation_split=0.2)

# Evaluate on the test set (batch_size=64, verbose=2)
test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=64, verbose=2)
print(f"Test-set accuracy: {test_acc:.4f}")

# Plot the training curves
plt.figure(figsize=(12, 5))

# Subplot 1: loss
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='training loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.title('Loss curves')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Subplot 2: accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='training accuracy')
plt.plot(history.history['val_accuracy'], label='validation accuracy')
plt.title('Accuracy curves')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()
```

How can the accuracy be improved?
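Several standard levers tend to help on IMDB: a bidirectional or stacked recurrent layer, a larger vocabulary (`num_words`) and embedding dimension, and early stopping so training halts at the best validation epoch instead of overfitting through all 10. A minimal sketch of such a variant, reusing the data pipeline above (the layer widths and patience value are illustrative, untuned choices):

```
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional

# Same data pipeline as above; only the model and training loop change.
model = Sequential([
    Embedding(input_dim=num_words, output_dim=64, input_length=maxlen),
    Dropout(0.3),
    Bidirectional(GRU(units=64)),   # reads each review in both directions
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Stop when validation loss stops improving and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2,
                           restore_best_weights=True)
history = model.fit(x_train, y_train,
                    batch_size=64, epochs=20,
                    validation_split=0.2,
                    callbacks=[early_stop])
```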