tensor.permute(*dims)

This article shows how to rearrange a tensor's dimensions with PyTorch's permute method, using an example that turns a 3-D tensor of shape (2, 3, 5) into one of shape (5, 2, 3).


Rearranging tensor dimensions

permute takes the new ordering of dimension indices and returns a tensor whose dimensions are arranged in that order. In the example below, dimension 2 (size 5) moves to the front, followed by dimensions 0 and 1:

>>> x = torch.randn(2, 3, 5)
>>> x.size()
torch.Size([2, 3, 5])
>>> x.permute(2, 0, 1).size()
torch.Size([5, 2, 3])
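
Two details are worth knowing here. permute returns a view, so no data is copied, but the view is generally non-contiguous; a later call to .view() on it will fail until the tensor is made contiguous (or .reshape() is used instead). Continuing the session above:

>>> y = x.permute(2, 0, 1)
>>> y.is_contiguous()
False
>>> y.contiguous().view(5, 6).size()  # .view() requires a contiguous tensor
torch.Size([5, 6])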

 

The source code is as follows:

from typing import List, Optional

import torch
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss

# The mode constants and the helpers to_tensor / soft_dice_score come from the
# surrounding losses package (see the reference in the docstring).
BINARY_MODE, MULTICLASS_MODE, MULTILABEL_MODE = "binary", "multiclass", "multilabel"


class DiceLoss(_Loss):
    def __init__(
        self,
        mode: str,
        classes: Optional[List[int]] = None,
        log_loss: bool = False,
        from_logits: bool = True,
        smooth: float = 0.0,
        ignore_index: Optional[int] = None,
        eps: float = 1e-7,
    ):
        """Dice loss for image segmentation tasks.
        It supports binary, multiclass and multilabel cases.

        Args:
            mode: Loss mode, 'binary', 'multiclass' or 'multilabel'
            classes: List of classes that contribute to the loss computation. By default, all channels are included.
            log_loss: If True, the loss is computed as `-log(dice_coeff)`, otherwise `1 - dice_coeff`
            from_logits: If True, assumes the input is raw logits
            smooth: Smoothness constant for the dice coefficient
            ignore_index: Label that indicates ignored pixels (does not contribute to the loss)
            eps: A small epsilon for numerical stability to avoid division by zero
                (the denominator will always be greater than or equal to eps)

        Shape:
            - **y_pred** - torch.Tensor of shape (N, C, H, W)
            - **y_true** - torch.Tensor of shape (N, H, W) or (N, C, H, W)

        Reference:
            https://github.com/BloodAxe/pytorch-toolbelt
        """
        assert mode in {BINARY_MODE, MULTILABEL_MODE, MULTICLASS_MODE}
        super(DiceLoss, self).__init__()
        self.mode = mode
        if classes is not None:
            assert mode != BINARY_MODE, "Masking classes is not supported with mode=binary"
            classes = to_tensor(classes, dtype=torch.long)
        self.classes = classes
        self.from_logits = from_logits
        self.smooth = smooth
        self.eps = eps
        self.log_loss = log_loss
        self.ignore_index = ignore_index

    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
        assert y_true.size(0) == y_pred.size(0)

        if self.from_logits:
            # Apply activations to get [0..1] class probabilities.
            # Using log-exp gives a more numerically stable result and does not cause
            # vanishing gradients at the extreme values 0 and 1.
            if self.mode == MULTICLASS_MODE:
                y_pred = y_pred.log_softmax(dim=1).exp()
            else:
                y_pred = F.logsigmoid(y_pred).exp()

        bs = y_true.size(0)
        num_classes = y_pred.size(1)
        dims = (0, 2)

        if self.mode == BINARY_MODE:
            y_true = y_true.view(bs, 1, -1)
            y_pred = y_pred.view(bs, 1, -1)
            if self.ignore_index is not None:
                mask = y_true != self.ignore_index
                y_pred = y_pred * mask
                y_true = y_true * mask

        if self.mode == MULTICLASS_MODE:
            y_true = y_true.view(bs, -1)
            y_pred = y_pred.view(bs, num_classes, -1)
            if self.ignore_index is not None:
                mask = y_true != self.ignore_index
                y_pred = y_pred * mask.unsqueeze(1)
                y_true = F.one_hot((y_true * mask).to(torch.long), num_classes)  # N, H*W -> N, H*W, C
                y_true = y_true.permute(0, 2, 1) * mask.unsqueeze(1)  # N, C, H*W
            else:
                y_true = F.one_hot(y_true, num_classes)  # N, H*W -> N, H*W, C
                y_true = y_true.permute(0, 2, 1)  # N, C, H*W

        if self.mode == MULTILABEL_MODE:
            y_true = y_true.view(bs, num_classes, -1)
            y_pred = y_pred.view(bs, num_classes, -1)
            if self.ignore_index is not None:
                mask = y_true != self.ignore_index
                y_pred = y_pred * mask
                y_true = y_true * mask

        scores = self.compute_score(
            y_pred, y_true.type_as(y_pred), smooth=self.smooth, eps=self.eps, dims=dims
        )

        if self.log_loss:
            loss = -torch.log(scores.clamp_min(self.eps))
        else:
            loss = 1.0 - scores

        # Dice loss is undefined for empty classes, so we zero the contribution
        # of channels that have no true pixels.
        # NOTE: A better workaround would be to use the loss term `mean(y_pred)`
        # for this case; however, that would be a modified Jaccard loss.
        mask = y_true.sum(dims) > 0
        loss *= mask.to(loss.dtype)

        if self.classes is not None:
            loss = loss[self.classes]

        return self.aggregate_loss(loss)

    def aggregate_loss(self, loss):
        return loss.mean()

    def compute_score(self, output, target, smooth=0.0, eps=1e-7, dims=None) -> torch.Tensor:
        return soft_dice_score(output, target, smooth, eps, dims)
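
The one_hot-plus-permute step in forward is a common pattern for matching channels-first predictions; a small standalone sketch with made-up sizes (not from the original code):

import torch
import torch.nn.functional as F

y_true = torch.randint(0, 3, (2, 16))        # (N, H*W) integer labels: N=2, 3 classes, 4x4 image
one_hot = F.one_hot(y_true, num_classes=3)   # (N, H*W, C)
one_hot = one_hot.permute(0, 2, 1)           # (N, C, H*W), channels-first like y_pred
print(one_hot.shape)                         # torch.Size([2, 3, 16])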
Please check the following code for errors:

import torch
import torch.nn as nn
from einops import rearrange


class AttentionLePE(nn.Module):
    """Vanilla attention with a local enhanced positional encoding (LePE) branch."""

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None,
                 attn_drop=0., proj_drop=0., side_dwconv=5):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE: the scale factor was wrong in my original version; it can be set
        # manually to stay compatible with previous weights.
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.lepe = nn.Conv2d(dim, dim, kernel_size=side_dwconv, stride=1,
                              padding=side_dwconv // 2, groups=dim) if side_dwconv > 0 else \
            lambda x: torch.zeros_like(x)

    def forward(self, x):
        """
        args:
            x: NHWC tensor
        return:
            NHWC tensor
        """
        _, H, W, _ = x.size()
        x = rearrange(x, 'n h w c -> n (h w) c')

        #######################################
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        lepe = self.lepe(rearrange(x, 'n (h w) c -> n c h w', h=H, w=W))
        lepe = rearrange(lepe, 'n c h w -> n (h w) c')

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = x + lepe

        x = self.proj(x)
        x = self.proj_drop(x)
        #######################################

        x = rearrange(x, 'n (h w) c -> n h w c', h=H, w=W)
        return x
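
The reshape-then-permute in forward is the standard multi-head split; a standalone shape check with illustrative sizes (not from the original post):

import torch

B, N, C, num_heads = 2, 16, 64, 8
qkv = torch.randn(B, N, 3 * C)                         # output of nn.Linear(dim, dim * 3)
qkv = qkv.reshape(B, N, 3, num_heads, C // num_heads)  # (B, N, 3, heads, head_dim)
qkv = qkv.permute(2, 0, 3, 1, 4)                       # (3, B, heads, N, head_dim)
q, k, v = qkv[0], qkv[1], qkv[2]
print(q.shape)                                         # torch.Size([2, 8, 16, 8])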
import json
import re
from typing import Dict, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, Trainer, TrainingArguments

from lora_plus import LoraPlusTrainer
from swanlab.integration.transformers import SwanLabCallback
import swanlab

swanlab.init("Finetune-Llama3.2-with-Encoder")
swanlab_callback = SwanLabCallback(
    project="Finetune-Llama3.2-with-Encoder",
    experiment_name="Finetune-Llama3.2-with-Encoder"
)

# Constants
CHEM_FORMULA_SIZE = "([A-Z][a-z]*)([0-9]*)"
VALID_ELEMENTS = ["C", "N", "P", "O", "S", "Si", "I", "H", "Cl", "F", "Br", "B",
                  "Se", "Fe", "Co", "As", "K", "Na"]
ELEMENT_VECTORS = np.eye(len(VALID_ELEMENTS))
element_to_position = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS))


# Convert a chemical formula into a dense vector
def formula_to_dense(chem_formula: str) -> np.ndarray:
    total_onehot = []
    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
        num = 1 if num == "" else int(num)
        one_hot = element_to_position[chem_symbol].reshape(1, -1)
        one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
        total_onehot.append(one_hot_repeats)
    if len(total_onehot) == 0:
        dense_vec = np.zeros(len(VALID_ELEMENTS))
    else:
        dense_vec = np.vstack(total_onehot).sum(0)
    return dense_vec


# Sinusoidal embedding
def sine_embed(v, max_count=256):
    num_freqs = int(np.ceil(np.log2(max_count)))
    freqs = 0.5 ** torch.arange(num_freqs, dtype=torch.float32) * np.pi
    v_tensor = torch.tensor(v, dtype=torch.float32)[:, None]
    embedded = torch.sin(v_tensor * freqs[None, :])
    return torch.abs(embedded).numpy()


def positional_encoding(max_position, d_model, min_freq=1e-6):
    position = np.arange(max_position)
    freqs = min_freq ** (2 * (np.arange(d_model) // 2) / d_model)
    pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
    pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
    pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
    return pos_enc


# Build the positional-encoding table
P = positional_encoding(2000000, 256, min_freq=1e2)
# Convert to a PyTorch tensor for later use
P = torch.tensor(P, dtype=torch.float32)
dimn = 255


# Mass-spectrum encoding (fixed version)
def encoding(rag_tensor, P, dimn):
    to_pad = []
    for sample in rag_tensor:
        # sample[0] and sample[1] are already Python lists, so use them directly
        all_dim = [sample[0]]
        # Look up the positional encoding for each m/z value
        pos_enc = [P[int(i) - 1] for i in sample[1]]
        for dim_idx in range(dimn):
            dim_vals = [i[dim_idx].item() for i in pos_enc]
            all_dim.append(dim_vals)
        to_pad.append(all_dim)
    # Pad the sequences with PyTorch
    padded = []
    for i in to_pad:
        # Convert to a tensor
        tensor = torch.tensor(i, dtype=torch.float32)
        # How much padding is needed to reach length 501
        pad_length = max(0, 501 - tensor.size(1))
        # Pad at the end
        padded_tensor = torch.nn.functional.pad(tensor, (0, pad_length), mode='constant', value=0)
        # Truncate if longer than 501
        if padded_tensor.size(1) > 501:
            padded_tensor = padded_tensor[:, :501]
        padded.append(padded_tensor)
    # Stack and swap axes
    to_pad = torch.stack(padded)
    to_pad = to_pad.permute(0, 2, 1)  # equivalent to numpy's swapaxes(to_pad, 1, -1)
    return to_pad


# Mass-spectrum preprocessing (PyTorch implementation)
def prepro_specs_train(df):
    df = df.reset_index(drop=True)
    valid = []
    mz_intensity = df['Spectrum'].to_list()

    def process_line(line):
        pairs = line.split()
        mz_list = []
        intensity_list = []
        for pair in pairs:
            mz, intensity = pair.split(':')
            mz_list.append(float(mz))
            intensity_list.append(float(intensity))
        return mz_list, intensity_list

    for idx, intensities in tqdm(enumerate(mz_intensity)):
        mz_list, intensity_list = process_line(intensities)
        # Append the total exact mass with a zero intensity
        mz_list.append(float(df.at[idx, 'Total Exact Mass']))
        intensity_list.append(0.0)
        # Round the values
        round_mz_list = [round(float(mz), 2) for mz in mz_list]
        round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list]
        valid.append([round_mz_list, round_intensity_list])
    return valid  # returns a list of lists


# Custom dataset
class CSVDataset(torch.utils.data.Dataset):
    def __init__(self, csv_path, tokenizer: PreTrainedTokenizer, max_selfies_len=512):
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_selfies_len = max_selfies_len
        # Preprocess the mass-spectrum data
        spec_df = self.df[['Total Exact Mass', 'Spectrum']].copy()
        self.rag_tensor = prepro_specs_train(spec_df)
        self.spec_encoded = encoding(self.rag_tensor, P, dimn)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        # 1. Molecular formula
        formula = self.df.iloc[idx]['Molecular Formula']
        formula_vec = formula_to_dense(formula)  # shape: (18,)

        # 2. Mass-spectrum data
        spec_matrix = self.spec_encoded[idx]  # shape: (501, 256)

        # 3. SELFIES string, with attention_mask
        selfies_str = self.df.iloc[idx]['SELFIES']
        # Get input_ids and attention_mask in one call
        encoding_result = self.tokenizer.encode_plus(
            selfies_str,
            add_special_tokens=True,  # add [CLS] and [SEP]
            max_length=self.max_selfies_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding_result['input_ids'].squeeze(0)
        attention_mask = encoding_result['attention_mask'].squeeze(0)

        return {
            'formula_vec': torch.tensor(formula_vec, dtype=torch.float32),
            'spec_matrix': spec_matrix,  # already a tensor, no conversion needed
            'selfies_ids': input_ids,
            'attention_mask': attention_mask
        }


# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('/root/workspace/checkpoint-2500')

# Build the dataset
dataset = CSVDataset('/root/workspace/SELFIES-SFT.csv', tokenizer)

data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer)


# Custom model with extra encoders
class LlamaWithEncoder(nn.Module):
    def __init__(self, base_model, encoder1_dim=18, encoder2_dim=256, hidden_dim=512):
        super().__init__()
        self.base_model = base_model

        # First Transformer encoder
        encoder1_layer = nn.TransformerEncoderLayer(
            d_model=encoder1_dim,
            nhead=3,
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.encoder1 = nn.TransformerEncoder(encoder1_layer, num_layers=2)

        # Second Transformer encoder
        encoder2_layer = nn.TransformerEncoderLayer(
            d_model=encoder2_dim,
            nhead=8,
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.encoder2 = nn.TransformerEncoder(encoder2_layer, num_layers=2)

        # Projection layers
        self.proj1 = nn.Linear(encoder1_dim, base_model.config.hidden_size)
        self.proj2 = nn.Linear(encoder2_dim, base_model.config.hidden_size)

        # Fusion layer
        self.fusion = nn.Linear(2 * base_model.config.hidden_size, base_model.config.hidden_size)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        return self.base_model.prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, **kwargs
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder1_inputs=None,
        encoder2_inputs=None,
        labels=None,
        past_key_values=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        # Run the extra encoders
        enc1_out = self.encoder1(encoder1_inputs)
        enc1_out = enc1_out.mean(dim=1)
        enc1_proj = self.proj1(enc1_out)

        enc2_out = self.encoder2(encoder2_inputs)
        enc2_out = enc2_out.mean(dim=1)
        enc2_proj = self.proj2(enc2_out)

        # Fuse the encoder outputs
        fused = self.fusion(torch.cat([enc1_proj, enc2_proj], dim=1))
        fused = fused.unsqueeze(1)

        # Get the embedding-layer output
        embeddings = self.base_model.get_input_embeddings()(input_ids)
        # Blend the fused vector into the first token's embedding
        if embeddings.size(1) > 0:
            embeddings[:, 0, :] = (embeddings[:, 0, :] + fused[:, 0, :]) / 2

        # Call the base model with the modified embeddings
        return self.base_model(
            inputs_embeds=embeddings,
            attention_mask=attention_mask,
            labels=labels,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs
        )


# Load the pretrained model
base_model = transformers.AutoModelForCausalLM.from_pretrained(
    "/root/workspace/checkpoint-2500",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = LlamaWithEncoder(base_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",  # target all linear layers
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # example output: ~0.3% of parameters trainable

training_args = TrainingArguments(
    output_dir="./llama3.2-SELFIES-SFT",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=16,
    num_train_epochs=10,
    learning_rate=5.0e-05,
    optim="adamw_torch",
    logging_steps=10,
    bf16=True,
    save_strategy="steps",
    lr_scheduler_type='cosine',
    max_grad_norm=1.0,
    save_steps=2000,
    warmup_steps=0
)


class CustomTrainer(LoraPlusTrainer):
    def get_train_dataloader(self) -> DataLoader:
        """Returns the training dataloader, using a random shuffle of the dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            shuffle=True,
            collate_fn=self.data_collator,
            drop_last=False,
        )


# Use the modified CustomTrainer
lp_trainer = CustomTrainer(
    model,
    training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[swanlab_callback],
)
lp_trainer.train()
lp_trainer.save_model(output_dir='./llama3.2-SELFIES-SFT')

Modify this code so that the added encoders can be fine-tuned with LoRA without issues.
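
One common way to keep the newly added encoder and projection modules fully trainable alongside the LoRA adapters is PEFT's modules_to_save option; a minimal sketch, assuming the attribute names defined in the class above:

from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",
    modules_to_save=["encoder1", "encoder2", "proj1", "proj2", "fusion"],  # train these modules in full
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)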
import json
import re
from typing import Dict, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, Trainer, TrainingArguments

from lora_plus import LoraPlusTrainer
from swanlab.integration.transformers import SwanLabCallback
import swanlab

swanlab.init("Finetune-Llama3.2-with-Encoder")
swanlab_callback = SwanLabCallback(
    project="Finetune-Llama3.2-with-Encoder",
    experiment_name="Finetune-Llama3.2-with-Encoder"
)

# Constants
CHEM_FORMULA_SIZE = "([A-Z][a-z]*)([0-9]*)"
VALID_ELEMENTS = ["C", "N", "P", "O", "S", "Si", "I", "H", "Cl", "F", "Br", "B",
                  "Se", "Fe", "Co", "As", "K", "Na"]
ELEMENT_VECTORS = np.eye(len(VALID_ELEMENTS))
element_to_position = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS))


# Convert a chemical formula into a dense vector
def formula_to_dense(chem_formula: str) -> np.ndarray:
    total_onehot = []
    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
        num = 1 if num == "" else int(num)
        one_hot = element_to_position[chem_symbol].reshape(1, -1)
        one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
        total_onehot.append(one_hot_repeats)
    if len(total_onehot) == 0:
        dense_vec = np.zeros(len(VALID_ELEMENTS))
    else:
        dense_vec = np.vstack(total_onehot).sum(0)
    return dense_vec


# Sinusoidal embedding
def sine_embed(v, max_count=256):
    num_freqs = int(np.ceil(np.log2(max_count)))
    freqs = 0.5 ** torch.arange(num_freqs, dtype=torch.float32) * np.pi
    v_tensor = torch.tensor(v, dtype=torch.float32)[:, None]
    embedded = torch.sin(v_tensor * freqs[None, :])
    return torch.abs(embedded).numpy()


def positional_encoding(max_position, d_model, min_freq=1e-6):
    position = np.arange(max_position)
    freqs = min_freq ** (2 * (np.arange(d_model) // 2) / d_model)
    pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
    pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
    pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
    return pos_enc


# Build the positional-encoding table
P = positional_encoding(2000000, 256, min_freq=1e2)
# Convert to a PyTorch tensor for later use
P = torch.tensor(P, dtype=torch.float32)
dimn = 255


# Mass-spectrum encoding (fixed version)
def encoding(rag_tensor, P, dimn):
    to_pad = []
    for sample in rag_tensor:
        # sample[0] and sample[1] are already Python lists, so use them directly
        all_dim = [sample[0]]
        # Look up the positional encoding for each m/z value
        pos_enc = [P[int(i) - 1] for i in sample[1]]
        for dim_idx in range(dimn):
            dim_vals = [i[dim_idx].item() for i in pos_enc]
            all_dim.append(dim_vals)
        to_pad.append(all_dim)
    # Pad the sequences with PyTorch
    padded = []
    for i in to_pad:
        # Convert to a tensor
        tensor = torch.tensor(i, dtype=torch.float32)
        # How much padding is needed to reach length 501
        pad_length = max(0, 501 - tensor.size(1))
        # Pad at the end
        padded_tensor = torch.nn.functional.pad(tensor, (0, pad_length), mode='constant', value=0)
        # Truncate if longer than 501
        if padded_tensor.size(1) > 501:
            padded_tensor = padded_tensor[:, :501]
        padded.append(padded_tensor)
    # Stack and swap axes
    to_pad = torch.stack(padded)
    to_pad = to_pad.permute(0, 2, 1)  # equivalent to numpy's swapaxes(to_pad, 1, -1)
    return to_pad


# Mass-spectrum preprocessing (PyTorch implementation)
def prepro_specs_train(df):
    df = df.reset_index(drop=True)
    valid = []
    mz_intensity = df['Spectrum'].to_list()

    def process_line(line):
        pairs = line.split()
        mz_list = []
        intensity_list = []
        for pair in pairs:
            mz, intensity = pair.split(':')
            mz_list.append(float(mz))
            intensity_list.append(float(intensity))
        return mz_list, intensity_list

    for idx, intensities in tqdm(enumerate(mz_intensity)):
        mz_list, intensity_list = process_line(intensities)
        # Append the total exact mass with a zero intensity
        mz_list.append(float(df.at[idx, 'Total Exact Mass']))
        intensity_list.append(0.0)
        # Round the values
        round_mz_list = [round(float(mz), 2) for mz in mz_list]
        round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list]
        valid.append([round_mz_list, round_intensity_list])
    return valid  # returns a list of lists


# Custom dataset
class CSVDataset(torch.utils.data.Dataset):
    def __init__(self, csv_path, tokenizer: PreTrainedTokenizer, max_selfies_len=512):
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_selfies_len = max_selfies_len
        # Preprocess the mass-spectrum data
        spec_df = self.df[['Total Exact Mass', 'Spectrum']].copy()
        self.rag_tensor = prepro_specs_train(spec_df)
        self.spec_encoded = encoding(self.rag_tensor, P, dimn)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        # 1. Molecular formula
        formula = self.df.iloc[idx]['Molecular Formula']
        formula_vec = formula_to_dense(formula)  # shape: (18,)

        # 2. Mass-spectrum data
        spec_matrix = self.spec_encoded[idx]  # shape: (501, 256)

        # 3. SELFIES string, with attention_mask
        selfies_str = self.df.iloc[idx]['SELFIES']
        # Get input_ids and attention_mask in one call
        encoding_result = self.tokenizer.encode_plus(
            selfies_str,
            add_special_tokens=True,  # add [CLS] and [SEP]
            max_length=self.max_selfies_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding_result['input_ids'].squeeze(0)
        attention_mask = encoding_result['attention_mask'].squeeze(0)

        return {
            'formula_vec': torch.tensor(formula_vec, dtype=torch.float32),
            'spec_matrix': spec_matrix,  # already a tensor, no conversion needed
            'selfies_ids': input_ids,
            'attention_mask': attention_mask
        }


# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('/root/workspace/checkpoint-2500')

# Build the dataset
dataset = CSVDataset('/root/workspace/SELFIES-SFT.csv', tokenizer)

data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer)


# Custom model with extra encoders
class LlamaWithEncoder(nn.Module):
    def __init__(self, base_model, encoder1_dim=18, encoder2_dim=256, hidden_dim=512):
        super().__init__()
        # Base Llama model
        self.base_model = base_model

        # First Transformer encoder (processes input of shape (batch, 18))
        encoder1_layer = nn.TransformerEncoderLayer(
            d_model=encoder1_dim,
            nhead=3,  # 18 is divisible by 3
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.encoder1 = nn.TransformerEncoder(encoder1_layer, num_layers=2)

        # Second Transformer encoder (processes input of shape (batch, 501, 256))
        encoder2_layer = nn.TransformerEncoderLayer(
            d_model=encoder2_dim,
            nhead=8,  # 256 is divisible by 8
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.encoder2 = nn.TransformerEncoder(encoder2_layer, num_layers=2)

        # Projection layers mapping the encoder outputs to Llama's hidden size
        self.proj1 = nn.Linear(encoder1_dim, base_model.config.hidden_size)
        self.proj2 = nn.Linear(encoder2_dim, base_model.config.hidden_size)

        # Fusion layer
        self.fusion = nn.Linear(2 * base_model.config.hidden_size, base_model.config.hidden_size)

    def forward(self, input_ids=None, attention_mask=None,
                encoder1_inputs=None, encoder2_inputs=None, labels=None):
        # Run the extra encoders
        enc1_out = self.encoder1(encoder1_inputs)  # (batch, 18, 18)
        enc1_out = enc1_out.mean(dim=1)            # (batch, 18)
        enc1_proj = self.proj1(enc1_out)           # (batch, hidden_size)

        enc2_out = self.encoder2(encoder2_inputs)  # (batch, 501, 256)
        enc2_out = enc2_out.mean(dim=1)            # (batch, 256)
        enc2_proj = self.proj2(enc2_out)           # (batch, hidden_size)

        # Fuse the encoder outputs
        fused = self.fusion(torch.cat([enc1_proj, enc2_proj], dim=1))  # (batch, hidden_size)

        # Use the fused vector as the decoder's initial state
        batch_size = fused.size(0)
        fused = fused.unsqueeze(1)  # (batch, 1, hidden_size)

        # Prepare the Llama inputs
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        # Here we fuse the encoder output with Llama's embedding-layer output
        # Get the embedding-layer output
        embeddings = self.base_model.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden_size)

        # Blend the encoder output into the first position's embedding
        if embeddings.size(1) > 0:
            embeddings[:, 0, :] = (embeddings[:, 0, :] + fused[:, 0, :]) / 2

        # Recompute the model output with the modified embeddings
        outputs = self.base_model(
            inputs_embeds=embeddings,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs


# Load the pretrained model
base_model = transformers.AutoModelForCausalLM.from_pretrained(
    "/root/workspace/checkpoint-2500",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = LlamaWithEncoder(base_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",  # target all linear layers
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # example output: ~0.3% of parameters trainable

training_args = TrainingArguments(
    output_dir="./llama3.2-SELFIES-SFT",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=16,
    num_train_epochs=10,
    learning_rate=5.0e-05,
    optim="adamw_torch",
    logging_steps=10,
    bf16=True,
    save_strategy="steps",
    lr_scheduler_type='cosine',
    max_grad_norm=1.0,
    save_steps=2000,
    warmup_steps=0
)


class CustomTrainer(LoraPlusTrainer):
    def get_train_dataloader(self) -> DataLoader:
        """Returns the training dataloader, using a random shuffle of the dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            shuffle=True,
            collate_fn=self.data_collator,
            drop_last=False,
        )


# Use the modified CustomTrainer
lp_trainer = CustomTrainer(
    model,
    training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[swanlab_callback],
)
lp_trainer.train()
lp_trainer.save_model(output_dir='./llama3.2-SELFIES-SFT')

Running this raises the following error; how can it be fixed?

  File "/root/workspace/sft.py", line 278, in <module>
    model = get_peft_model(model, lora_config)
  File "/opt/conda/lib/python3.10/site-packages/peft/mapping_func.py", line 125, in get_peft_model
    return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
  File "/opt/conda/lib/python3.10/site-packages/peft/peft_model.py", line 1811, in __init__
    self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/lora/model.py", line 367, in __getattr__
    return getattr(self.model, name)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
AttributeError: 'LlamaWithEncoder' object has no attribute 'prepare_inputs_for_generation'
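
For context: during get_peft_model, PEFT's causal-LM wrapper reads prepare_inputs_for_generation from the wrapped model, and a plain nn.Module does not define it. A minimal sketch of the usual fix (the version in the previous post already includes it) is to delegate the method to the wrapped model:

class LlamaWithEncoder(nn.Module):
    ...  # __init__ and forward as above

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
        # Delegate to the wrapped causal LM so PEFT (and generate()) can find this method
        return self.base_model.prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, **kwargs
        )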
# Method from a BEVFormer-style deformable-attention module; MultiScaleDeformableAttnFunction_fp32
# and multi_scale_deformable_attn_pytorch are the custom CUDA kernel and its PyTorch fallback.
def attention(self, query, value, reference_points_cam, weights, proj_mask,
              spatial_shapes, level_start_index):
    num_query = query.size(1)
    bs, num_cam, num_value = value.shape[:3]
    num_all_points = weights.size(-1)
    slots = torch.zeros_like(query)

    # (bs, num_query, num_head, num_cam, num_level, num_p)
    # --> (bs, num_cam, num_query, num_head, num_level, num_p)
    weights = weights.permute(0, 3, 1, 2, 4, 5).contiguous()

    # save-memory trick, similar to bevformer_occ
    indexes = [[] for _ in range(bs)]
    max_len = 0
    for i in range(bs):
        for j in range(num_cam):
            index_query_per_img = proj_mask[i, j].flatten(1).sum(-1).nonzero().squeeze(-1)
            indexes[i].append(index_query_per_img)
            max_len = max(max_len, index_query_per_img.numel())

    queries_rebatch = query.new_zeros(
        [bs, self.num_cams, max_len, self.embed_dims])
    reference_points_cam_rebatch = reference_points_cam.new_zeros(
        [bs, self.num_cams, max_len, self.num_heads, self.num_levels, num_all_points, 2])
    weights_rebatch = weights.new_zeros(
        [bs, self.num_cams, max_len, self.num_heads, self.num_levels, num_all_points])

    for i in range(bs):
        for j in range(num_cam):
            index_query_per_img = indexes[i][j]
            curr_numel = index_query_per_img.numel()
            queries_rebatch[i, j, :curr_numel] = query[i, index_query_per_img]
            reference_points_cam_rebatch[i, j, :curr_numel] = reference_points_cam[i, j, index_query_per_img]
            weights_rebatch[i, j, :curr_numel] = weights[i, j, index_query_per_img]

    value = value.view(bs * num_cam, num_value, self.num_heads, -1)
    sampling_locations = reference_points_cam_rebatch.view(
        bs * num_cam, max_len, self.num_heads, self.num_levels, num_all_points, 2)
    attention_weights = weights_rebatch.reshape(
        bs * num_cam, max_len, self.num_heads, self.num_levels, num_all_points)

    if torch.cuda.is_available() and value.is_cuda:
        output = MultiScaleDeformableAttnFunction_fp32.apply(
            value, spatial_shapes, level_start_index, sampling_locations,
            attention_weights, self.im2col_step)
    else:
        output = multi_scale_deformable_attn_pytorch(
            value, spatial_shapes, sampling_locations, attention_weights)

    output = output.view(bs, num_cam, max_len, -1)
    for i in range(bs):
        for j in range(num_cam):
            index_query_per_img = indexes[i][j]
            slots[i, index_query_per_img] += output[i, j, :len(index_query_per_img)]

    return slots
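
The six-axis permute at the top of attention only reorders the tensor's axes (it brings the camera axis forward); a quick standalone shape check with made-up sizes:

import torch

bs, num_query, num_head, num_cam, num_level, num_p = 1, 100, 8, 6, 4, 2
weights = torch.randn(bs, num_query, num_head, num_cam, num_level, num_p)
weights = weights.permute(0, 3, 1, 2, 4, 5).contiguous()  # camera axis moves to position 1
print(weights.shape)  # torch.Size([1, 6, 100, 8, 4, 2])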