import re
from typing import Dict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

import transformers
from transformers import AutoTokenizer, PreTrainedTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from lora_plus import LoraPlusTrainer

import swanlab
from swanlab.integration.transformers import SwanLabCallback
swanlab.init("Finetune-Llama3.2-with-Encoder")
swanlab_callback = SwanLabCallback(
    project="Finetune-Llama3.2-with-Encoder",
    experiment_name="Finetune-Llama3.2-with-Encoder"
)
# Constants
CHEM_FORMULA_SIZE = "([A-Z][a-z]*)([0-9]*)"
VALID_ELEMENTS = ["C", "N", "P", "O", "S", "Si", "I", "H", "Cl", "F", "Br", "B", "Se", "Fe", "Co", "As", "K", "Na"]
ELEMENT_VECTORS = np.eye(len(VALID_ELEMENTS))
element_to_position = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS))

# Convert a chemical formula string into a dense element-count vector
def formula_to_dense(chem_formula: str) -> np.ndarray:
    total_onehot = []
    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
        num = 1 if num == "" else int(num)
        one_hot = element_to_position[chem_symbol].reshape(1, -1)
        one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
        total_onehot.append(one_hot_repeats)
    if len(total_onehot) == 0:
        dense_vec = np.zeros(len(VALID_ELEMENTS))
    else:
        dense_vec = np.vstack(total_onehot).sum(0)
    return dense_vec
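# Example (hypothetical input): formula_to_dense("C2H6O") returns an 18-dim count
# vector with 2 at the "C" position, 6 at "H", and 1 at "O"; a symbol outside
# VALID_ELEMENTS would raise a KeyError in element_to_position.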
# Sinusoidal embedding
def sine_embed(v, max_count=256):
    num_freqs = int(np.ceil(np.log2(max_count)))
    freqs = 0.5 ** torch.arange(num_freqs, dtype=torch.float32) * np.pi
    v_tensor = torch.tensor(v, dtype=torch.float32)[:, None]
    embedded = torch.sin(v_tensor * freqs[None, :])
    return torch.abs(embedded).numpy()
def positional_encoding(max_position, d_model, min_freq=1e-6):
    position = np.arange(max_position)
    freqs = min_freq ** (2 * (np.arange(d_model) // 2) / d_model)
    pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
    pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
    pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
    return pos_enc
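# positional_encoding returns a (max_position, d_model) array; row k encodes
# integer position k, with cosine values in the even columns and sine values in
# the odd columns at the chosen frequencies.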
# Precompute the positional encodings
P = positional_encoding(2000000, 256, min_freq=1e2)
# Convert to a PyTorch tensor for later use
P = torch.tensor(P, dtype=torch.float32)
dimn = 255
# Mass-spectrum encoding
def encoding(rag_tensor, P, dimn):
    to_pad = []
    for sample in rag_tensor:
        # sample[0] (m/z values) and sample[1] (intensities) are already Python lists
        all_dim = [sample[0]]
        # Look up a positional encoding for each value in sample[1]
        pos_enc = [P[int(i) - 1] for i in sample[1]]
        for dim_idx in range(dimn):
            dim_vals = [i[dim_idx].item() for i in pos_enc]
            all_dim.append(dim_vals)
        to_pad.append(all_dim)
    # Pad each sample's sequence with PyTorch
    padded = []
    for i in to_pad:
        # Convert to a tensor
        tensor = torch.tensor(i, dtype=torch.float32)
        # Amount of right-padding needed
        pad_length = max(0, 501 - tensor.size(1))
        # Pad at the end with zeros
        padded_tensor = torch.nn.functional.pad(tensor, (0, pad_length), mode='constant', value=0)
        # Truncate if longer than 501
        if padded_tensor.size(1) > 501:
            padded_tensor = padded_tensor[:, :501]
        padded.append(padded_tensor)
    # Stack and swap axes
    to_pad = torch.stack(padded)
    to_pad = to_pad.permute(0, 2, 1)  # equivalent to numpy's swapaxes(to_pad, 1, -1)
    return to_pad
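# Shape note: for N spectra, encoding() returns a tensor of shape (N, 501, 256):
# 501 padded/truncated peaks, each described by its raw m/z value plus dimn = 255
# positional-encoding dimensions looked up from the intensity values.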
# Mass-spectrum preprocessing (PyTorch implementation)
def prepro_specs_train(df):
    df = df.reset_index(drop=True)
    valid = []
    mz_intensity = df['Spectrum'].to_list()

    def process_line(line):
        pairs = line.split()
        mz_list = []
        intensity_list = []
        for pair in pairs:
            mz, intensity = pair.split(':')
            mz_list.append(float(mz))
            intensity_list.append(float(intensity))
        return mz_list, intensity_list

    for idx, line in tqdm(enumerate(mz_intensity)):
        mz_list, intensity_list = process_line(line)
        # Append the total exact mass as an extra peak with zero intensity
        mz_list.append(float(df.at[idx, 'Total Exact Mass']))
        intensity_list.append(0.0)
        # Round to two decimals
        round_mz_list = [round(float(mz), 2) for mz in mz_list]
        round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list]
        valid.append([round_mz_list, round_intensity_list])
    return valid  # list of [mz_list, intensity_list] pairs
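# Each 'Spectrum' cell is expected to look like "mz1:int1 mz2:int2 ..."; the row's
# 'Total Exact Mass' is appended by prepro_specs_train as one extra peak with
# intensity 0.0.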
# Custom dataset
class CSVDataset(torch.utils.data.Dataset):
    def __init__(self, csv_path, tokenizer: PreTrainedTokenizer, max_selfies_len=512):
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_selfies_len = max_selfies_len
        # Preprocess the mass-spectrum data
        spec_df = self.df[['Total Exact Mass', 'Spectrum']].copy()
        self.rag_tensor = prepro_specs_train(spec_df)
        self.spec_encoded = encoding(self.rag_tensor, P, dimn)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        # 1. Molecular formula
        formula = self.df.iloc[idx]['Molecular Formula']
        formula_vec = formula_to_dense(formula)  # shape: (18,)
        # 2. Mass spectrum
        spec_matrix = self.spec_encoded[idx]  # shape: (501, 256)
        # 3. SELFIES string, tokenized with an attention mask
        selfies_str = self.df.iloc[idx]['SELFIES']
        # Encode to get both input_ids and attention_mask
        encoding_result = self.tokenizer.encode_plus(
            selfies_str,
            add_special_tokens=True,
            max_length=self.max_selfies_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding_result['input_ids'].squeeze(0)
        attention_mask = encoding_result['attention_mask'].squeeze(0)
        return {
            'formula_vec': torch.tensor(formula_vec, dtype=torch.float32),
            'spec_matrix': spec_matrix,  # already a tensor
            'selfies_ids': input_ids,
            'attention_mask': attention_mask
        }
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('/root/workspace/checkpoint-2500')
# Build the dataset
dataset = CSVDataset('/root/workspace/SELFIES-SFT.csv', tokenizer)
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer)
# Custom model: a Llama decoder with two extra Transformer encoders
class LlamaWithEncoder(nn.Module):
    def __init__(self, base_model, encoder1_dim=18, encoder2_dim=256, hidden_dim=512):
        super().__init__()
        # Base Llama model
        self.base_model = base_model
        # First Transformer encoder (for the formula vectors, feature dim 18)
        encoder1_layer = nn.TransformerEncoderLayer(
            d_model=encoder1_dim,
            nhead=3,  # 18 is divisible by 3
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.encoder1 = nn.TransformerEncoder(encoder1_layer, num_layers=2)
        # Second Transformer encoder (for the spectrum matrices of shape (batch, 501, 256))
        encoder2_layer = nn.TransformerEncoderLayer(
            d_model=encoder2_dim,
            nhead=8,  # 256 is divisible by 8
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.encoder2 = nn.TransformerEncoder(encoder2_layer, num_layers=2)
        # Projection layers mapping both encoder outputs to Llama's hidden size
        self.proj1 = nn.Linear(encoder1_dim, base_model.config.hidden_size)
        self.proj2 = nn.Linear(encoder2_dim, base_model.config.hidden_size)
        # Fusion layer
        self.fusion = nn.Linear(2 * base_model.config.hidden_size, base_model.config.hidden_size)

    def forward(self, input_ids=None, attention_mask=None, encoder1_inputs=None, encoder2_inputs=None, labels=None):
        # Run the extra encoders
        enc1_out = self.encoder1(encoder1_inputs)   # (batch, seq, 18)
        enc1_out = enc1_out.mean(dim=1)             # (batch, 18)
        enc1_proj = self.proj1(enc1_out)            # (batch, hidden_size)
        enc2_out = self.encoder2(encoder2_inputs)   # (batch, 501, 256)
        enc2_out = enc2_out.mean(dim=1)             # (batch, 256)
        enc2_proj = self.proj2(enc2_out)            # (batch, hidden_size)
        # Fuse the two encoder outputs into a single conditioning vector
        fused = self.fusion(torch.cat([enc1_proj, enc2_proj], dim=1))  # (batch, hidden_size)
        fused = fused.unsqueeze(1)  # (batch, 1, hidden_size)
        # Blend the fused vector into the embedding of the first token,
        # then run the base model on the modified embeddings
        embeddings = self.base_model.get_input_embeddings()(input_ids)  # (batch, seq_len, hidden_size)
        if embeddings.size(1) > 0:
            embeddings[:, 0, :] = (embeddings[:, 0, :] + fused[:, 0, :]) / 2
        outputs = self.base_model(
            inputs_embeds=embeddings,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs
# Load the pretrained base model
base_model = transformers.AutoModelForCausalLM.from_pretrained(
    "/root/workspace/checkpoint-2500",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = LlamaWithEncoder(base_model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",  # apply LoRA to every linear layer
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # e.g. reports roughly 0.3% of parameters as trainable
training_args = TrainingArguments(
    output_dir="./llama3.2-SELFIES-SFT",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=16,
    num_train_epochs=10,
    learning_rate=5.0e-05,
    optim="adamw_torch",
    logging_steps=10,
    bf16=True,
    save_strategy="steps",
    lr_scheduler_type='cosine',
    max_grad_norm=1.0,
    save_steps=2000,
    warmup_steps=0
)
class CustomTrainer(LoraPlusTrainer):
    def get_train_dataloader(self) -> DataLoader:
        """Return the training dataloader, shuffling the dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            shuffle=True,
            collate_fn=self.data_collator,
            drop_last=False,
        )
# Use the modified CustomTrainer
lp_trainer = CustomTrainer(
    model,
    training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[swanlab_callback],
)
lp_trainer.train()
lp_trainer.save_model(output_dir='./llama3.2-SELFIES-SFT')

Running this script raises the following error; how can it be resolved?

  File "/root/workspace/sft.py", line 278, in <module>
    model = get_peft_model(model, lora_config)
  File "/opt/conda/lib/python3.10/site-packages/peft/mapping_func.py", line 125, in get_peft_model
    return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
  File "/opt/conda/lib/python3.10/site-packages/peft/peft_model.py", line 1811, in __init__
    self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/lora/model.py", line 367, in __getattr__
    return getattr(self.model, name)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
AttributeError: 'LlamaWithEncoder' object has no attribute 'prepare_inputs_for_generation'
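The traceback comes from get_peft_model: with task_type="CAUSAL_LM", PEFT wraps the model in PeftModelForCausalLM, whose __init__ looks up prepare_inputs_for_generation on the wrapped model. LlamaWithEncoder is a plain nn.Module, so the lookup falls through to an AttributeError. Below is a minimal sketch of one possible workaround, assuming the generation hooks can simply be delegated to the wrapped Llama model (the delegated names follow the standard Hugging Face model API; this is not the only way to address it):

class LlamaWithEncoder(nn.Module):
    # ... __init__ and forward exactly as above ...

    # Delegate the hooks that PEFT and the generation utilities look up on the
    # wrapped model to the underlying Llama model.
    def prepare_inputs_for_generation(self, *args, **kwargs):
        return self.base_model.prepare_inputs_for_generation(*args, **kwargs)

    def get_input_embeddings(self):
        return self.base_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.base_model.set_input_embeddings(value)

    @property
    def config(self):
        return self.base_model.config

    @property
    def generation_config(self):
        return self.base_model.generation_config

An alternative is to make LlamaWithEncoder subclass transformers.PreTrainedModel rather than nn.Module so that PEFT sees a regular Hugging Face model; in either case, clearing this AttributeError does not by itself guarantee that training or generation actually receives the extra encoder inputs as intended.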