python-numpy.repeat numpy.row_stack

import numpy as np
help(np.repeat)
# Repeat the elements of an array
# repeat(a, repeats, axis=None)
# repeats - how many times each element is repeated. Can be an int or an array of ints
#           (the array length must match the size of a along the chosen axis; when an array is used, axis must be given).
# Repeats along axis: axis=0 repeats along the row direction, i.e. duplicates rows;
# with axis=None the array is flattened first, so the result is a 1-D array.
a = np.array([[1,2],[3,4]])
np.repeat(a,2)  # array([1,1,2,2,3,3,4,4])
np.repeat(a,[1,2],1)  # array([[1,2,2],[3,4,4]])
np.repeat(a,[1,2],0)  # array([[1,2],[3,4],[3,4]])
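
# A minimal extra sketch (not in the original post): when repeats is an array,
# its length must match a.shape[axis]; otherwise NumPy raises a ValueError.
b = np.array([[1,2],[3,4],[5,6]])
np.repeat(b,[2,1,3],0)  # array([[1,2],[1,2],[3,4],[5,6],[5,6],[5,6]])
# np.repeat(b,[1,2],0) would fail: 2 repeat counts for 3 rows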

# Add a row (or column) to a matrix
new_row = [1,2]
np.row_stack((a,new_row))  # array([[1,2],[3,4],[1,2]])
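
# A minimal extra sketch (assumption, not in the original post): np.column_stack adds a column,
# and np.vstack((a, new_row)) gives the same result as row_stack (row_stack is an alias of
# vstack and is deprecated in newer NumPy releases in favor of vstack).
new_col = [5,6]
np.column_stack((a,new_col))  # array([[1,2,5],[3,4,6]])
np.vstack((a,new_row))        # array([[1,2],[3,4],[1,2]])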