import re
from collections import defaultdict
from typing import Dict, List

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import transformers
from transformers import AutoTokenizer, PreTrainedModel, TrainingArguments
from peft import LoraConfig, get_peft_model
from lora_plus import LoraPlusTrainer  # requires the lora_plus package to be installed

import swanlab
from swanlab.integration.transformers import SwanLabCallback
# Molecular-formula parsing: count the atoms of each element
def parse_chem_formula(formula):
    pattern = r'([A-Z][a-z]?)(\d*)'
    matches = re.findall(pattern, formula)
    element_counts = defaultdict(int)
    for (element, count) in matches:
        count = int(count) if count else 1
        element_counts[element] += count
    return element_counts
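# e.g. parse_chem_formula("C6H12O6") -> {"C": 6, "H": 12, "O": 6}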
def generate_element_list(formula):
    element_counts = parse_chem_formula(formula)
    elements = []
    for element, count in element_counts.items():
        # Skip hydrogen atoms
        if element != "H":
            elements.extend([element] * count)
    return ''.join(elements)
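# e.g. generate_element_list("C6H12O6") -> "CCCCCCOOOOOO" (hydrogens are omitted)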
# Initialize SwanLab experiment tracking
swanlab.init("Finetune-Llama3.2-with-Encoder")
swanlab_callback = SwanLabCallback(
    project="Finetune-Llama3.2-with-Encoder",
    experiment_name="Finetune-Llama3.2-with-Encoder"
)
# Constants
CHEM_FORMULA_SIZE = r"([A-Z][a-z]*)([0-9]*)"
VALID_ELEMENTS = ["C", "N", "P", "O", "S", "Si", "I", "H", "Cl", "F", "Br", "B", "Se", "Fe", "Co", "As", "K", "Na"]
element_to_idx = {elem: idx for idx, elem in enumerate(VALID_ELEMENTS)}
# Convert a chemical formula into a dense count vector over VALID_ELEMENTS
def formula_to_dense(chem_formula: str) -> torch.Tensor:
    dense_vec = torch.zeros(len(VALID_ELEMENTS), dtype=torch.float32)
    matches = re.findall(CHEM_FORMULA_SIZE, chem_formula)
    for chem_symbol, num_str in matches:
        num = 1 if num_str == "" else int(num_str)
        if chem_symbol in element_to_idx:
            idx = element_to_idx[chem_symbol]
            dense_vec[idx] += num
    return dense_vec
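# e.g. formula_to_dense("C2H6O") -> an 18-dim tensor with 2 at the "C" slot, 6 at "H", 1 at "O", zeros elsewhere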
# Sinusoidal positional encoding (PyTorch implementation)
def positional_encoding(max_position: int, d_model: int, min_freq: float = 1e-4) -> torch.Tensor:
    position = torch.arange(max_position).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(min_freq)) / d_model))
    pos_enc = torch.zeros(max_position, d_model)
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)
    return pos_enc
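# Column 2i holds sin(position * (1/min_freq)**(2i/d_model)) and column 2i+1 the matching cosine.
# Note that with this sign the frequency grows with the column index, the reverse of the common
# convention where div_term = exp(arange(0, d, 2) * (-log(10000.0) / d)) makes frequencies decay.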
# Precompute the positional-encoding table; rows are later indexed by integer m/z values
P = positional_encoding(2000000, 254)
dimn = 254  # must match the positional-encoding dimension
# Spectrum encoding - optimized for short spectra: only truncate overly long samples, never pad short ones
def encode_spectra(rag_tensor: list, P: torch.Tensor, dimn: int) -> list:  # returns a list rather than a stacked tensor
    encoded_list = []
    max_len = 501  # truncate overly long samples only; short samples keep their original length
    for sample in rag_tensor:
        mz_list, intensity_list = sample
        # Base feature matrix: [m/z, intensity]
        base_features = torch.tensor([mz_list, intensity_list], dtype=torch.float32).T
        # Positional-encoding features (preserve the positional information of the raw m/z values)
        pos_enc = torch.stack([P[min(int(mz), P.size(0) - 1)] for mz in mz_list])
        # Concatenate all features: [m/z, intensity, pos_enc...]
        features = torch.cat([base_features, pos_enc], dim=1)
        # Truncate long samples only; short samples keep their original length (no padding)
        if features.size(0) > max_len:
            features = features[:max_len]
        encoded_list.append(features)  # keep the variable-length features
    return encoded_list
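# Each sample becomes a tensor of shape (num_peaks, 2 + 254) = (num_peaks, 256), capped at 501 rows.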
# Spectrum preprocessing - make sure short spectra are kept in full
def preprocess_spectra(df: pd.DataFrame) -> list:
    spectra_list = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        spectrum_str = row['Spectrum']
        total_mass = row['Total Exact Mass']
        # Parse the spectrum string
        pairs = spectrum_str.split()
        mz_list, intensity_list = [], []
        for pair in pairs:
            mz, intensity = pair.split(':')
            mz_list.append(float(mz))
            intensity_list.append(float(intensity))
        # For single-peak spectra, re-parse to keep the original precision (no rounding)
        if len(pairs) == 1:
            mz_list = [float(mz) for mz, _ in [pair.split(':') for pair in pairs]]
            intensity_list = [float(intensity) for _, intensity in [pair.split(':') for pair in pairs]]
        # Append the total exact mass as a supplementary feature
        mz_list.append(total_mass)
        intensity_list.append(0.0)
        # Round only the longer spectra; short spectra keep more precision
        if len(mz_list) > 5:  # simplify only when the spectrum is long enough
            mz_list = [round(mz, 2) for mz in mz_list]
            intensity_list = [round(intensity, 2) for intensity in intensity_list]
        spectra_list.append([mz_list, intensity_list])
    return spectra_list
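# Expects the 'Spectrum' column to hold space-separated "mz:intensity" pairs; each row becomes
# [[mz..., total_mass], [intensity..., 0.0]].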
class MolecularDataset(Dataset):
    def __init__(self, csv_path: str, tokenizer: AutoTokenizer, max_seq_len: int = 512):
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.pad_token_id = tokenizer.pad_token_id
        self.mask_token_id = tokenizer.mask_token_id if tokenizer.mask_token_id is not None else tokenizer.convert_tokens_to_ids("<mask>")
        # Preprocess the spectra (short spectra keep their original length)
        spectra_data = preprocess_spectra(self.df)
        self.spec_encoded = encode_spectra(spectra_data, P, dimn)  # a list of variable-length tensors
        # Convert each molecular formula into an element list
        self.element_lists = [generate_element_list(formula) for formula in self.df['Molecular Formula']]
        # Precompute the token length of each element list
        self.element_lengths = []
        for elem_list in self.element_lists:
            elem_tokens = self.tokenizer(elem_list, add_special_tokens=False)['input_ids']
            self.element_lengths.append(len(elem_tokens))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx) -> dict:
        # Formula vector and spectrum matrix (original length preserved)
        formula = self.df.iloc[idx]['Molecular Formula']
        formula_vec = formula_to_dense(formula).unsqueeze(0)
        spec_matrix = self.spec_encoded[idx]  # variable-length feature matrix
        # Element list wrapped in role/special tokens
        element_list = self.element_lists[idx]
        element_text = f"<|User|><s><|Spectrum|>{element_list}</s>"
        # SELFIES target sequence wrapped in role/special tokens
        selfies_str = self.df.iloc[idx]['SELFIES']
        selfies_text = f"<|Assistant|><s>{selfies_str}</s>"
        # Combined input: element list + SELFIES sequence
        input_text = f"{element_text}{selfies_text}"
        # Key point: padding='max_length' forces every sequence to max_seq_len
        encoding = self.tokenizer(
            input_text,
            add_special_tokens=False,
            max_length=self.max_seq_len,
            padding='max_length',   # pad up to max_seq_len
            truncation=True,        # truncate anything longer than max_seq_len
            return_tensors='pt'
        )
        # Input sequence (now always max_seq_len long)
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        # Labels are the full target sequence, with padding replaced by -100
        labels = input_ids.clone()
        labels[labels == self.pad_token_id] = -100
        # Mask out the element-list portion of the labels (3 leading special tokens + element tokens)
        element_len = self.element_lengths[idx]
        element_end = 3 + element_len
        if element_end < len(labels):
            labels[:element_end] = -100  # supervise only from the end of the element list onward
        return {
            'encoder1_inputs': formula_vec,
            'encoder2_inputs': spec_matrix,  # variable-length features
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
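# Each dataset item pairs three views of one molecule:
#   encoder1_inputs: (1, 18) element-count vector; encoder2_inputs: (num_peaks, 256) spectrum features;
#   input_ids/attention_mask/labels: the tokenized "<|User|><s><|Spectrum|>elements</s><|Assistant|><s>SELFIES</s>"
#   sequence, with labels set to -100 for padding and for the leading element-list portion.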
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('/root/workspace/d21lv5s7v38s73b4ddlg/SELFIES/checkpoint-1280')
# Make sure a mask token exists
if tokenizer.mask_token is None:
    tokenizer.add_special_tokens({"mask_token": "<mask>"})
# Make sure a pad token exists (add one if missing)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # reuse eos_token as pad_token
# Build the dataset
dataset = MolecularDataset('/root/workspace/d21lv5s7v38s73b4ddlg/SELFIES-SFT.csv', tokenizer)
# Custom collate function: dynamically pad the spectrum features within each batch (only up to the batch maximum)
def custom_collator(features: List[Dict]) -> Dict:
    # encoder1_inputs have a fixed shape, so they can be stacked directly
    encoder1_inputs = torch.stack([f['encoder1_inputs'] for f in features])
    # encoder2_inputs are variable-length, so pad them to the longest sample in the batch
    encoder2_inputs = [f['encoder2_inputs'] for f in features]
    encoder2_padded = torch.nn.utils.rnn.pad_sequence(
        encoder2_inputs,
        batch_first=True,
        padding_value=0.0  # pad with zeros (carries no information)
    )
    # Text fields are already max_seq_len long, so they can be stacked directly
    input_ids = torch.stack([f['input_ids'] for f in features])
    attention_mask = torch.stack([f['attention_mask'] for f in features])
    labels = torch.stack([f['labels'] for f in features])
    return {
        'encoder1_inputs': encoder1_inputs,
        'encoder2_inputs': encoder2_padded,
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }
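# After collation: encoder1_inputs is (batch, 1, 18), encoder2_inputs is (batch, longest_num_peaks, 256),
# and input_ids/attention_mask/labels are each (batch, max_seq_len).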
class LlamaWithEncoder(PreTrainedModel):
    def __init__(self, base_model, encoder1_dim=18, encoder2_dim=256, hidden_dim=256):
        # Reuse the base model's config for PreTrainedModel bookkeeping
        super().__init__(base_model.config)
        # Keep a reference to the base causal-LM model
        self.model = base_model
        # Encoder 1: CNN + lightweight Transformer for the molecular-formula vector
        # A 1x1 convolution lifts the element-count vector to hidden_dim
        self.encoder1_cnn = nn.Conv1d(
            in_channels=encoder1_dim,
            out_channels=hidden_dim,
            kernel_size=1,
            stride=1
        )
        # Single-layer Transformer encoder
        encoder1_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=4,                         # few attention heads
            dim_feedforward=hidden_dim * 2,  # small feed-forward network
            batch_first=True
        )
        self.encoder1_transformer = nn.TransformerEncoder(encoder1_layer, num_layers=1)  # one layer only
        # Encoder 2: CNN + lightweight Transformer for the spectrum matrix
        # The CNN extracts local features and downsamples along the peak axis
        self.encoder2_cnn = nn.Sequential(
            nn.Conv1d(
                in_channels=encoder2_dim,
                out_channels=hidden_dim,
                kernel_size=3,
                stride=1,
                padding=1
            ),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)  # downsampling
        )
        # Single-layer Transformer encoder
        encoder2_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=4,
            dim_feedforward=hidden_dim * 2,
            batch_first=True
        )
        self.encoder2_transformer = nn.TransformerEncoder(encoder2_layer, num_layers=1)  # one layer only
        # Projection layers: map encoder outputs to the LM hidden size
        self.proj1 = nn.Linear(hidden_dim, base_model.config.hidden_size)
        self.proj2 = nn.Linear(hidden_dim, base_model.config.hidden_size)
        # Token embedding layer (copies the base model weights but does not share them)
        self.embed_tokens = nn.Embedding(
            num_embeddings=base_model.config.vocab_size,
            embedding_dim=base_model.config.hidden_size,
            padding_idx=base_model.config.pad_token_id
        )
        self.embed_tokens.weight.data = base_model.get_input_embeddings().weight.data.clone()
    # Methods required by PEFT
    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def get_output_embeddings(self):
        return self.model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.model.set_output_embeddings(new_embeddings)

    def get_base_model(self):
        return self.model
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder1_inputs=None,
        encoder2_inputs=None,
        labels=None,
        past_key_values=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        # 1. Run the encoders (variable-length inputs are supported)
        # Formula encoder: CNN + Transformer
        enc1 = encoder1_inputs.permute(0, 2, 1)          # (batch_size, encoder1_dim, seq_len)
        enc1 = self.encoder1_cnn(enc1)                   # (batch_size, hidden_dim, seq_len)
        enc1 = enc1.permute(0, 2, 1)                     # (batch_size, seq_len, hidden_dim)
        enc1_out = self.encoder1_transformer(enc1)       # (batch_size, seq_len, hidden_dim)
        enc1_out = enc1_out.mean(dim=1)                  # (batch_size, hidden_dim)
        enc1_proj = self.proj1(enc1_out)                 # (batch_size, hidden_size)
        # Spectrum encoder: CNN + Transformer
        enc2 = encoder2_inputs.permute(0, 2, 1)          # (batch_size, encoder2_dim, seq_len)
        enc2 = self.encoder2_cnn(enc2)                   # (batch_size, hidden_dim, seq_len/2)
        enc2 = enc2.permute(0, 2, 1)                     # (batch_size, seq_len/2, hidden_dim)
        enc2_out = self.encoder2_transformer(enc2)       # (batch_size, seq_len/2, hidden_dim)
        enc2_out = enc2_out.mean(dim=1)                  # (batch_size, hidden_dim)
        enc2_proj = self.proj2(enc2_out)                 # (batch_size, hidden_size)
        # Fuse the two encoder outputs (this vector replaces the <mask> embedding)
        mask_replacement = (enc1_proj + enc2_proj) / 2   # (batch_size, hidden_size)
        # 2. Look up the original token embeddings
        embeddings = self.embed_tokens(input_ids)        # (batch_size, seq_len, hidden_size)
        batch_size, seq_len, hidden_size = embeddings.size()
        # 3. Replace the <mask> token (the third token, index 2)
        if seq_len > 2:
            mask_embed = mask_replacement.unsqueeze(1)   # (batch_size, 1, hidden_size)
            # Split and re-concatenate to avoid an in-place update
            part1 = embeddings[:, :2, :]                 # (batch_size, 2, hidden_size)
            part2 = mask_embed                           # (batch_size, 1, hidden_size)
            part3 = embeddings[:, 3:, :]                 # (batch_size, seq_len-3, hidden_size)
            new_embeddings = torch.cat([part1, part2, part3], dim=1)  # (batch_size, seq_len, hidden_size)
        else:
            new_embeddings = embeddings  # fall back to the original embeddings for very short sequences
        # 4. Delegate to the base model with the modified embeddings
        return self.model(
            inputs_embeds=new_embeddings,
            attention_mask=attention_mask,
            labels=labels,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
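# Summary: each encoder produces one hidden_size vector; their average overwrites the embedding of the
# third token (index 2, which the code treats as the <mask> slot), and the Llama base model then runs
# on the modified inputs_embeds exactly as in ordinary causal-LM training.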
# Load the pretrained base model
base_model = transformers.AutoModelForCausalLM.from_pretrained(
    "/root/workspace/d21lv5s7v38s73b4ddlg/SELFIES/checkpoint-1280",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = LlamaWithEncoder(base_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",  # attach LoRA to every linear layer
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # report the fraction of trainable parameters
training_args = TrainingArguments(
output_dir="./llama3.2-SELFIES-SFT",
per_device_train_batch_size=24,
gradient_accumulation_steps=8,
num_train_epochs=6,
learning_rate=5.0e-05,
optim="adamw_torch",
logging_steps=10,
bf16=True,
save_strategy="steps",
lr_scheduler_type='cosine',
max_grad_norm=1.0,
save_steps=2000,
warmup_steps=0
)
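# Effective batch size per device per optimizer step: 24 x 8 (gradient accumulation) = 192 sequences.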
class CustomTrainer(LoraPlusTrainer):
def get_train_dataloader(self) -> DataLoader:
return DataLoader(
self.train_dataset,
batch_size=self.args.train_batch_size,
shuffle=True,
collate_fn=self.data_collator,
drop_last=False,
)
# Train with the customized CustomTrainer
lp_trainer = CustomTrainer(
model,
training_args,
train_dataset=dataset,
tokenizer=tokenizer,
data_collator=custom_collator,
callbacks=[swanlab_callback],
)
lp_trainer.train()
lp_trainer.save_model(output_dir='./llama3.2-SELFIES-SFT')
# Merge the LoRA weights into the base weights
model = model.merge_and_unload()
# Save the full model (including the custom encoders and fusion layers) in safetensors format
save_directory = './llama3.2-SELFIES'
model.save_pretrained(save_directory, safe_serialization=True)
# Save the tokenizer alongside it
tokenizer.save_pretrained(save_directory)

Request: modify the code so that it uses the resulting model for batch inference over a CSV file, and writes each row's original SELFIES together with the generated SELFIES on the same row.
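# A minimal inference sketch for the request above, not a definitive implementation. It assumes the helper
# functions and the merged `model`/`tokenizer` from this script are still in scope, that `generate` called
# with inputs_embeds (and no input_ids) returns only the newly generated tokens (recent transformers
# behavior), and that the output path './SELFIES-inference-results.csv' and the 'Generated SELFIES' column
# name are placeholders chosen here, not part of the original pipeline.
@torch.no_grad()
def batch_generate_selfies(model, tokenizer, csv_path, out_path, max_new_tokens=256, device='cuda'):
    model.eval().to(device)
    df = pd.read_csv(csv_path)
    spec_encoded = encode_spectra(preprocess_spectra(df), P, dimn)
    records = []
    for idx in tqdm(range(len(df))):
        formula = df.iloc[idx]['Molecular Formula']
        # Same prompt prefix as in training, stopping right before the SELFIES answer
        prompt = f"<|User|><s><|Spectrum|>{generate_element_list(formula)}</s><|Assistant|><s>"
        enc = tokenizer(prompt, add_special_tokens=False, return_tensors='pt').to(device)
        # Encoder 1: molecular-formula count vector -> hidden_size vector
        f_vec = formula_to_dense(formula).unsqueeze(0).unsqueeze(0).to(device)   # (1, 1, 18)
        e1 = model.encoder1_cnn(f_vec.permute(0, 2, 1)).permute(0, 2, 1)
        e1 = model.proj1(model.encoder1_transformer(e1).mean(dim=1))             # (1, hidden_size)
        # Encoder 2: spectrum feature matrix -> hidden_size vector
        s_mat = spec_encoded[idx].unsqueeze(0).to(device)                        # (1, num_peaks, 256)
        e2 = model.encoder2_cnn(s_mat.permute(0, 2, 1)).permute(0, 2, 1)
        e2 = model.proj2(model.encoder2_transformer(e2).mean(dim=1))             # (1, hidden_size)
        # Same injection as in training: the fused vector replaces the embedding of token index 2
        embeds = model.embed_tokens(enc['input_ids'])
        embeds[:, 2, :] = ((e1 + e2) / 2).to(embeds.dtype)
        out_ids = model.model.generate(
            inputs_embeds=embeds,
            attention_mask=enc['attention_mask'],
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
        )
        generated = tokenizer.decode(out_ids[0], skip_special_tokens=True)
        # Keep the reference SELFIES and the generated SELFIES on the same output row
        records.append({'SELFIES': df.iloc[idx]['SELFIES'], 'Generated SELFIES': generated})
    pd.DataFrame(records).to_csv(out_path, index=False)

batch_generate_selfies(model, tokenizer,
                       '/root/workspace/d21lv5s7v38s73b4ddlg/SELFIES-SFT.csv',
                       './SELFIES-inference-results.csv')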