Sequence padding: keras.preprocessing.sequence.pad_sequences()

This post introduces the sequence-padding utility in Keras, specifically the keras.preprocessing.sequence.pad_sequences() function, which pads variable-length sequences to a common length before they are fed to a deep-learning model.

Sequence padding: keras.preprocessing.sequence.pad_sequences()
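The function lives at tf.keras.preprocessing.sequence.pad_sequences (recent TensorFlow releases also expose it as tf.keras.utils.pad_sequences). Below is a minimal sketch of typical usage with made-up toy sequences: by default the function pads and truncates at the front ('pre') with 0, and the maxlen, padding, truncating, value, and dtype arguments change that behaviour.

```python
from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5], [6]]

# Default behaviour: pad to the length of the longest sequence, at the front, with 0.
print(pad_sequences(seqs))
# [[1 2 3]
#  [0 4 5]
#  [0 0 6]]

# Fixed length, pad/truncate at the end, custom padding value.
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post', value=-1))
# [[ 1  2  3 -1]
#  [ 4  5 -1 -1]
#  [ 6 -1 -1 -1]]
```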

Related question (posted 07-24): modify the code below so that it reads a CSV file and extracts the Molecular Formula, Total Exact Mass, Spectrum, and SELFIES columns; apply formula_to_dense to Molecular Formula to obtain an input of shape (batch, 18); apply prepro_specs_train and encoding to Total Exact Mass and Spectrum to obtain an input of shape (batch, 501, 257); and encode SELFIES with the tokenizer, using cls_token and sep_token as the start and end markers and pad_token as the padding token.

```python
import numpy as np
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
import torch
import re
from sklearn.model_selection import train_test_split

# Regex that splits a molecular formula into (element symbol, count) pairs
CHEM_FORMULA_SIZE = "([A-Z][a-z]*)([0-9]*)"

VALID_ELEMENTS = [
    "C", "N", "P", "O", "S", "Si", "I", "H", "Cl", "F",
    "Br", "B", "Se", "Fe", "Co", "As", "K", "Na",
]
ELEMENT_VECTORS = np.eye(len(VALID_ELEMENTS))
element_to_position = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS))


def formula_to_dense(chem_formula: str) -> np.ndarray:
    """Sum one-hot element vectors into an element-count vector of length 18."""
    total_onehot = []
    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
        num = 1 if num == "" else int(num)
        one_hot = element_to_position[chem_symbol].reshape(1, -1)
        one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
        total_onehot.append(one_hot_repeats)
    if len(total_onehot) == 0:
        dense_vec = np.zeros(len(element_to_position))
    else:
        dense_vec = np.vstack(total_onehot).sum(0)
    return dense_vec


def sine_embed(v, max_count=256):
    """Sinusoidal embedding of the element counts."""
    num_freqs = int(np.ceil(np.log2(max_count)))
    freqs = 0.5 ** torch.arange(num_freqs, dtype=torch.float32) * np.pi
    v_tensor = torch.tensor(v, dtype=torch.float32)[:, None]
    embedded = torch.sin(v_tensor * freqs[None, :])
    return torch.abs(embedded).numpy()


def encode_formula(formula: str):
    candidate_features = formula_to_dense(formula)  # turn a single chemical formula into a feature vector
    sine_embeddings = sine_embed(candidate_features)
    return sine_embeddings.flatten()


def positional_encoding(max_position, d_model, min_freq=1e-6):
    """Sinusoidal positional-encoding table of shape (max_position, d_model)."""
    position = np.arange(max_position)
    freqs = min_freq ** (2 * (np.arange(d_model) // 2) / d_model)
    pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
    pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
    pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
    return pos_enc


P = positional_encoding(2000000, 256, min_freq=1e2)
dimn = 256


def encoding(rag_tensor, P, dimn):
    """Pad every per-sample row to length 501, stack, and swap axes to (batch, 501, dimn + 1)."""
    to_pad = []
    for sample in rag_tensor:
        all_dim = [sample[0].numpy().tolist()]
        pos_enc = [P[int(i) - 1] for i in sample[1].numpy().tolist()]
        for dim in range(dimn):
            dim_n = [i[dim] for i in pos_enc]
            all_dim.append(dim_n)
        to_pad.append(all_dim)
    to_pad = [tf.keras.preprocessing.sequence.pad_sequences(
        i, maxlen=501, dtype='float32', padding='post', truncating='post', value=10)
        for i in to_pad]
    to_pad = np.stack(to_pad)
    to_pad = np.swapaxes(to_pad, 1, -1)
    return to_pad


def trun_n_d(n, d):
    """Truncate the decimal part of a numeric string to d digits."""
    return (n if not n.find('.') + 1 else n[:n.find('.') + d + 1])


def prepro_specs_train(df):
    """Parse 'mz:intensity' pairs from the Spectrum column and return a ragged tensor."""
    df = df.reset_index(drop=True)
    valid = []
    mz_intensity = df['Spectrum'].to_list()

    def process_line(line):
        pairs = line.split()
        mz_list = []
        intensity_list = []
        for pair in pairs:
            mz, intensity = pair.split(':')
            mz_list.append(float(mz))
            intensity_list.append(float(intensity))
        return mz_list, intensity_list

    for idx, intensities in tqdm(enumerate(mz_intensity)):
        mz_list, intensity_list = process_line(intensities)
        mz_list.append(float(df.at[idx, 'Total Exact Mass']))
        round_mz_list = [round(float(mz), 2) for mz in mz_list]
        round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list]
        valid.append([round_mz_list, round_intensity_list])
    return tf.ragged.constant(valid)


import json
import torch
from typing import Dict, List
from torch.utils.data import Dataset
import transformers
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader, SequentialSampler
from transformers import Trainer, TrainingArguments
from lora_plus import LoraPlusTrainer
from torch.utils.data import RandomSampler


def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> tuple[int, int]:
    """Decide how many source/target tokens to keep under a total cutoff length."""
    if target_len * 2 < cutoff_len:  # truncate source
        max_target_len = cutoff_len
    elif source_len * 2 < cutoff_len:  # truncate target
        max_target_len = cutoff_len - source_len
    else:  # truncate both
        max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
    new_target_len = min(max_target_len, target_len)
    max_source_len = max(cutoff_len - new_target_len, 0)
    new_source_len = min(max_source_len, source_len)
    return new_source_len, new_target_len


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(
        self,
        data_path,
        tokenizer,
        model_max_length,
        user_tokens=[151644],
        assistant_tokens=[151645],
    ):
        super(SupervisedDataset, self).__init__()
        self.data = json.load(open(data_path))
        self.tokenizer = tokenizer
        self.model_max_length = model_max_length
        self.user_tokens = user_tokens
        self.assistant_tokens = assistant_tokens
        self.ignore_index = -100
        # Sanity-check that the first record is processed correctly
        item = self.preprocessing(self.data[0])
        print("input:", self.tokenizer.decode(item["input_ids"]))
        labels = [id_ for id_ in item["labels"] if id_ != -100]  # drop the -100 labels

    def __len__(self):
        return len(self.data)

    def preprocessing(self, example):
        input_ids = []
        labels = []
        # Pair up the user and assistant messages
        messages = example["conversations"]
        pairs = []
        current_user_encoded = None
        # Pair each user turn with the following assistant turn as encoded (source, target) pairs
        for message in messages:
            if message["role"] == "user":
                # Encode the user message
                current_user_encoded = [self.tokenizer.bos_token_id] + self.user_tokens + self.tokenizer.encode(
                    message["content"], add_special_tokens=False
                )
            elif message["role"] == "assistant" and current_user_encoded is not None:
                # Encode the assistant message
                assistant_encoded = self.assistant_tokens + self.tokenizer.encode(
                    message["content"], add_special_tokens=False
                )
                # Form one (source_ids, target_ids) pair
                pairs.append((current_user_encoded, assistant_encoded))
                current_user_encoded = None

        total_length = 0  # running total length
        # Process each encoded (source_ids, target_ids) pair in turn
        for turn_idx, (source_ids, target_ids) in enumerate(pairs):
            # Stop once the maximum length has been reached
            if total_length >= self.model_max_length:
                print("Exceeded max length, stopping processing further turns.")
                break
            # Dynamically truncate source and target
            source_len, target_len = infer_seqlen(
                len(source_ids), len(target_ids), self.model_max_length - total_length
            )
            source_ids = source_ids[:source_len]
            target_ids = target_ids[:target_len]
            # Update the running length
            total_length += source_len + target_len
            source_label = [self.tokenizer.bos_token_id] + [self.ignore_index] * (source_len - 1)
            target_label = target_ids
            # Concatenate
            input_ids += source_ids + target_ids
            labels += source_label + target_label

        # Append the EOS token, then pad to model_max_length
        input_ids += [self.tokenizer.eos_token_id]
        labels += [self.tokenizer.eos_token_id]
        input_ids += [self.tokenizer.pad_token_id] * (self.model_max_length - len(input_ids))
        labels += [self.ignore_index] * (self.model_max_length - len(labels))
        # Convert to tensors
        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        # Build the attention mask
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        return self.preprocessing(self.data[idx])
```
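Not part of the original post: a minimal sketch of the CSV-based loading the question describes, assuming the helpers defined above (formula_to_dense, prepro_specs_train, encoding, P, dimn) are in scope and that tokenizer is a Hugging Face-style tokenizer whose cls_token, sep_token, and pad_token are set. The column names come from the question; max_selfies_len is a hypothetical cap.

```python
import numpy as np
import pandas as pd
import tensorflow as tf


def load_csv_inputs(csv_path, tokenizer, max_selfies_len=128):
    df = pd.read_csv(csv_path)

    # (batch, 18): one element-count vector per "Molecular Formula" entry
    formula_feats = np.stack([formula_to_dense(f) for f in df["Molecular Formula"]])

    # (batch, 501, 257): ragged spectra ("Spectrum" + "Total Exact Mass") -> padded tensor
    ragged = prepro_specs_train(df)
    spectra = encoding(ragged, P, dimn)

    # SELFIES strings: cls_token ... sep_token, padded with pad_token
    selfies_ids = [
        [tokenizer.cls_token_id]
        + tokenizer.encode(s, add_special_tokens=False)
        + [tokenizer.sep_token_id]
        for s in df["SELFIES"]
    ]
    selfies_ids = tf.keras.preprocessing.sequence.pad_sequences(
        selfies_ids, maxlen=max_selfies_len, padding="post", truncating="post",
        value=tokenizer.pad_token_id,
    )
    return formula_feats, spectra, selfies_ids
```

The SupervisedDataset above would still need its JSON-based __init__ and preprocessing adapted to consume these three arrays; that part is beyond this sketch.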
Related question (posted 06-18): the script below is supposed to save its predictions to a CSV file but never produces one; find the specific reason why the file is not generated and, without changing the original code, present the complete corrected version.

```python
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import gc

# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

print("Loading data...")
try:
    # Read the training and test sets
    train_df = pd.read_csv('../SDX/train_set.csv', sep='\t', nrows=1000)
    test_a_df = pd.read_csv('../SDX/test_a.csv', sep='\t', nrows=1000)
    print(f"Training set size: {train_df.shape}, test set A size: {test_a_df.shape}")
    print("Training set preview:")
    print(train_df.head())
except Exception as e:
    print(f"Failed to load data: {str(e)}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Directory contents: {os.listdir('../SDX') if os.path.exists('../SDX') else 'SDX directory does not exist'}")
    raise

# Class mapping (14 classes)
class_mapping = {
    0: "科技", 1: "股票", 2: "体育", 3: "娱乐", 4: "时政",
    5: "社会", 6: "教育", 7: "财经", 8: "家居", 9: "游戏",
    10: "房产", 11: "时尚", 12: "彩票", 13: "星座"
}

# Verify the number of classes
if train_df['label'].nunique() != 14:
    print(f"Warning: the training set has {train_df['label'].nunique()} label values, expected 14 classes")

# 2. Data preprocessing
print("\nProcessing text data...")

def text_to_sequence(text):
    """Convert a text string into a list of integer ids."""
    try:
        return [int(x) for x in text.split()]
    except Exception as e:
        print(f"Text conversion error: {str(e)}")
        return []

# Convert the text column to sequences
train_df['sequence'] = train_df['text'].apply(text_to_sequence)
test_a_df['sequence'] = test_a_df['text'].apply(text_to_sequence)

# Sequence-length statistics
lengths = train_df['sequence'].apply(len)
print(f"Sequence length stats: min={min(lengths)}, max={max(lengths)}, mean={np.mean(lengths):.1f}")

# Maximum sequence length (95th percentile, to limit padding)
max_len = int(np.percentile(lengths, 95))
print(f"Maximum sequence length (95th percentile): 29,732")

# Pad the sequences
X = pad_sequences(train_df['sequence'].values, maxlen=max_len, padding='post')
X_test = pad_sequences(test_a_df['sequence'].values, maxlen=max_len, padding='post')

# Free memory
del lengths
gc.collect()

# Label encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['label'])
num_classes = len(np.unique(y))

# 1. Data preparation
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
train_images, test_images = train_images / 255.0, test_images / 255.0

# 2. Model-building function
def build_model(optimizer='adam', l2_rate=0.001, dropout_rate=0.5):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', padding='same',
                      kernel_regularizer=regularizers.l2(l2_rate),
                      input_shape=(32, 32, 3)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same',
                      kernel_regularizer=regularizers.l2(l2_rate)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same',
                      kernel_regularizer=regularizers.l2(l2_rate)),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling2D(),
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(l2_rate)),
        layers.Dropout(dropout_rate),
        layers.Dense(10)
    ])
    if optimizer.lower() == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
    else:
        opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=opt,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    return model

# 3. Training configuration
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Hyperparameter combinations to test
params_grid = [
    {'optimizer': 'adam', 'batch_size': 128, 'epochs': 2},
    {'optimizer': 'sgd', 'batch_size': 256, 'epochs': 2}
]

# 4. Training and evaluation
results = []
for params in params_grid:
    print(f"\nCurrent parameter combination: {params}")
    model = build_model(optimizer=params['optimizer'])
    history = model.fit(
        train_images, train_labels,
        validation_data=(test_images, test_labels),
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        callbacks=[early_stopping],
        verbose=1
    )
    # Record the epoch with the lowest validation loss
    best_epoch = np.argmin(history.history['val_loss'])
    results.append({
        'params': params,
        'best_val_acc': history.history['val_accuracy'][best_epoch],
        'best_val_loss': history.history['val_loss'][best_epoch],
        'history': history
    })

# 5. Visualisation
plt.figure(figsize=(18, 6))

# Accuracy curves
plt.subplot(1, 2, 1)
for i, result in enumerate(results):
    plt.plot(result['history'].history['accuracy'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (train)")
    plt.plot(result['history'].history['val_accuracy'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (val)")
plt.title('Model Accuracy Comparison', fontsize=14)
plt.ylabel('Accuracy', fontsize=12)
plt.xlabel('Epoch', fontsize=12)
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)

# Loss curves
plt.subplot(1, 2, 2)
for i, result in enumerate(results):
    plt.plot(result['history'].history['loss'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (train)")
    plt.plot(result['history'].history['val_loss'],
             linestyle='--' if i == 0 else '-',
             label=f"{result['params']['optimizer']} (val)")
plt.title('Model Loss Comparison', fontsize=14)
plt.ylabel('Loss', fontsize=12)
plt.xlabel('Epoch', fontsize=12)
plt.legend(loc='upper right')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300)
plt.show()

# 6. Best-model evaluation
best_model_idx = np.argmax([r['best_val_acc'] for r in results])
best_result = results[best_model_idx]
print(f"\nBest parameter combination: {best_result['params']}")
print(f"Best validation accuracy: {best_result['best_val_acc']*100:.2f}%")

# Confusion-matrix visualisation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
predictions = np.argmax(best_result['history'].model.predict(test_images), axis=1)
cm = confusion_matrix(test_labels, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png', dpi=300)
plt.show()

# Generate predictions
print("\nGenerating predictions...")
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # define the y_pred_classes variable

# Save predictions
prediction_path = os.path.join('predictions', 'test_a_predictions.csv')
test_a_df['prediction'] = y_pred_classes
test_a_df['prediction_class'] = test_a_df['prediction'].map(class_mapping)
test_a_df.to_csv(prediction_path, index=False)
print(f"Predictions saved to: {prediction_path}")

base_dir = os.path.dirname(os.path.abspath(__file__))
prediction_dir = os.path.join(base_dir, 'predictions')
os.makedirs('prediction_dir', exist_ok=True)
prediction_path = os.path.join(prediction_dir, 'test_a_predictions.csv')

directories = ['models', 'logs', 'predictions']
for dir_name in directories:
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print(f"Created directory: {dir_name}")

# Check the output directory before saving
print(f"Checking prediction output directory: {'predictions' if os.path.exists('predictions') else 'directory missing, will try to create it'}")
prediction_path = os.path.join('predictions', 'test_a_predictions.csv')
test_a_df['prediction'] = y_pred_classes
test_a_df['prediction_class'] = test_a_df['prediction'].map(class_mapping)
test_a_df.to_csv(prediction_path, index=False)

# Verify the file exists after saving
if os.path.exists('prediction_path'):
    print(f"Predictions successfully saved to: {prediction_path}")
    print(f"File size: {os.path.getsize(prediction_path) / 1024:.2f} KB")
else:
    print(f"Error: file was not generated! Current working directory: {os.getcwd()}")
    print(f"Prediction directory contents: {os.listdir('predictions') if os.path.exists('predictions') else 'prediction directory does not exist'}")
    try:
        test_a_df.to_csv(prediction_path, index=False)
        print(f"Predictions saved to: {prediction_path}")
    except Exception as e:
        print(f"Failed to save predictions: {str(e)}")
```
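An observation on the script above rather than a full answer: several of the os calls receive quoted names instead of the variables of the same name ('prediction_dir', 'prediction_path'), so the directory being created and the path being checked are not the ones the CSV is written to. A small self-contained illustration of the difference, using hypothetical paths:

```python
import os

prediction_dir = os.path.join(os.getcwd(), "predictions")   # hypothetical location
prediction_path = os.path.join(prediction_dir, "test_a_predictions.csv")

os.makedirs(prediction_dir, exist_ok=True)      # creates .../predictions
os.makedirs("prediction_dir", exist_ok=True)    # creates a folder literally named "prediction_dir"

print(os.path.exists(prediction_path))          # checks the real CSV path
print(os.path.exists("prediction_path"))        # checks a file literally named "prediction_path"
```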