import json
import torch
from typing import Dict
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import TrainingArguments
from peft import LoraConfig, get_peft_model
from lora_plus import LoraPlusTrainer
def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> tuple[int, int]:
if target_len * 2 < cutoff_len: # truncate source
max_target_len = cutoff_len
elif source_len * 2 < cutoff_len: # truncate target
max_target_len = cutoff_len - source_len
else: # truncate both
max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
    new_target_len = min(max_target_len, target_len)
max_source_len = max(cutoff_len - new_target_len, 0)
new_source_len = min(max_source_len, source_len)
return new_source_len, new_target_len
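# Quick sanity check of the truncation logic (illustrative numbers, not from the
# training data): a short side keeps its full length and the other side gets the
# remaining budget; when both sides are long, the budget is split proportionally.
assert infer_seqlen(source_len=200, target_len=10, cutoff_len=64) == (54, 10)
assert infer_seqlen(source_len=10, target_len=200, cutoff_len=64) == (10, 54)
assert infer_seqlen(source_len=100, target_len=100, cutoff_len=64) == (32, 32)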
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(
self,
data_path,
tokenizer,
model_max_length,
user_tokens=[128011],
assistant_tokens=[128012],
):
super(SupervisedDataset, self).__init__()
self.data = json.load(open(data_path))
self.tokenizer = tokenizer
self.model_max_length = model_max_length
self.user_tokens = user_tokens
self.assistant_tokens = assistant_tokens
self.ignore_index = -100
        # Sanity-check that one sample (index 200 here) is processed correctly
        item = self.preprocessing(self.data[200])
        print("input:", self.tokenizer.decode(item["input_ids"]))
        labels = [id_ for id_ in item["labels"] if id_ != -100]  # drop ignored (-100) positions
        print("label:", self.tokenizer.decode(labels))
def __len__(self):
return len(self.data)
    def preprocessing(self, example):
        """Encode one conversation into input_ids, labels and attention_mask."""
        input_ids = []
        labels = []
        # Pair up the user and assistant messages
messages = example["conversations"]
pairs = []
current_user_encoded = None
        # Encode each user/assistant turn and group them into (source, target) pairs
for message in messages:
if message["role"] == "user":
                # Encode the user message: BOS + user special tokens + content
current_user_encoded = [self.tokenizer.bos_token_id] + self.user_tokens + self.tokenizer.encode(
message["content"], add_special_tokens=False
)
elif message["role"] == "assistant" and current_user_encoded is not None:
                # Encode the assistant message: assistant special tokens + content
assistant_encoded = self.assistant_tokens + self.tokenizer.encode(
message["content"], add_special_tokens=False
)
                # Form one (source_ids, target_ids) pair
pairs.append((current_user_encoded, assistant_encoded))
current_user_encoded = None
        total_length = 0  # running count of tokens used so far
        # Process the encoded (source_ids, target_ids) pairs turn by turn
for turn_idx, (source_ids, target_ids) in enumerate(pairs):
            # Stop once the maximum sequence length has been reached
if total_length >= self.model_max_length:
print("Exceeded max length, stopping processing further turns.")
break
            # Dynamically split the remaining length budget between source and target
source_len, target_len = infer_seqlen(
len(source_ids), len(target_ids), self.model_max_length - total_length
)
source_ids = source_ids[:source_len]
target_ids = target_ids[:target_len]
            # Update the running total
            total_length += source_len + target_len
            # Keep BOS as a label and mask the rest of the prompt, so the loss is
            # only computed on the assistant reply
            source_label = [self.tokenizer.bos_token_id] + [self.ignore_index] * (source_len - 1)
            target_label = target_ids
            # Append this turn to the full sequence
            input_ids += source_ids + target_ids
            labels += source_label + target_label
        # Append the EOS token
        input_ids += [self.tokenizer.eos_token_id]
        labels += [self.tokenizer.eos_token_id]
        # Convert to tensors
        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        # Build the attention mask (attend to every non-pad position)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
return {
"input_ids": input_ids,
"labels": labels,
"attention_mask": attention_mask,
}
def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
return self.preprocessing(self.data[idx])
tokenizer = transformers.AutoTokenizer.from_pretrained(
'/data/coding/Weight',
use_fast=False,
trust_remote_code=True,
model_max_length=1024,
)
train_dataset = SupervisedDataset(
'/data/coding/transformers_from_scratch-main/Deepsmiles_SFT.json', tokenizer, model_max_length=1024
)
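# Optional sanity check (assumes the JSON file has at least one record): input_ids
# and labels of a sample should have equal length, since the collator pads both
# together at batch time.
_sample = train_dataset[0]
assert _sample["input_ids"].shape == _sample["labels"].shape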
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer)
model = transformers.AutoModelForCausalLM.from_pretrained(
"/data/coding/Weight",trust_remote_code=True,
torch_dtype="auto")
lora_config = LoraConfig(
r=8,
lora_alpha=16,
target_modules=[
"up_proj",
"gate_proj",
"o_proj",
"q_proj",
"v_proj",
"down_proj",
"k_proj"
    ],  # target attention and MLP projection layers
lora_dropout=0.0,
bias="none",
task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # e.g. reports roughly 0.3% of parameters as trainable
training_args = TrainingArguments(
output_dir="./LLM_SFT1",
per_device_train_batch_size=4,
gradient_accumulation_steps=8,
num_train_epochs=3,
learning_rate=5.0e-05,
optim="adamw_torch",
logging_steps=10,
bf16=True,
save_strategy="steps",
lr_scheduler_type='cosine',
max_grad_norm=1.0,
save_steps=2000,
warmup_steps=0
)
class CustomTrainer(LoraPlusTrainer):
def get_train_dataloader(self) -> DataLoader:
"""
Returns the training dataloader using a random sampler to shuffle the dataset.
"""
return DataLoader(
self.train_dataset,
batch_size=self.args.train_batch_size,
shuffle=True,
collate_fn=self.data_collator,
drop_last=False,
)
# Use the customized CustomTrainer
lp_trainer = CustomTrainer(
model,
training_args,
train_dataset=train_dataset,
tokenizer=tokenizer,
data_collator=data_collator
)
lp_trainer.train()
lp_trainer.save_model(output_dir='./LLM_SFT1')

This is my code for supervised fine-tuning of the model. Here is one record from the dataset:

{
"conversations": [
{
"role": "user",
"content": "Element:C C C C C C C C C C C C C C C C C C C C C C C C C H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H O O O O O.Mass Error:-0.000694316.Precursor type:[M+H]+.Spectrum:[418.2719243, '129.0193:25.569620 143.0853:27.341772 145.1016:30.632911 157.1008:20.759494 159.1165:32.911392 169.1026:33.417722 173.1333:72.151899 199.1479:100.000000 225.163:46.835443']"
},
{
"role": "assistant",
"content": "O=COCCCCCC=CC=CCC)CCOC=O)CC)C)CC)))))C6%10))))))))C)))))CCO)C6"
}
]
}

This is one record from the data. I want every decimal number inside the Spectrum field, i.e. in Spectrum:[418.2719243, '129.0193:25.569620 143.0853:27.341772 145.1016:30.632911 157.1008:20.759494 159.1165:32.911392 169.1026:33.417722 173.1333:72.151899 199.1479:100.000000 225.163:46.835443'], to be handled as a single token, rather than having the digits and the decimal point split apart.
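One possible way to get that behavior, sketched under the assumption that the spectrum values can be enumerated from the training file: collect every decimal number that appears in the user messages, register each one as an additional token with tokenizer.add_tokens, and resize the model embeddings to match. The helper collect_spectrum_numbers and the regex below are illustrative, not part of the original code, and this step should run before the dataset is built and before get_peft_model is applied.

import re

def collect_spectrum_numbers(data_path):
    # Gather every decimal number (precursor mass, m/z, intensity) appearing in
    # the user messages so that each one can become a single vocabulary entry.
    numbers = set()
    for record in json.load(open(data_path)):
        for message in record["conversations"]:
            if message["role"] == "user":
                numbers.update(re.findall(r"\d+\.\d+", message["content"]))
    return sorted(numbers)

new_tokens = collect_spectrum_numbers('/data/coding/transformers_from_scratch-main/Deepsmiles_SFT.json')
num_added = tokenizer.add_tokens(new_tokens)   # each distinct number becomes exactly one token
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to the new vocab size
print(f"added {num_added} spectrum tokens")

Two caveats: the vocabulary grows by one entry per distinct value, so rounding or binning the numbers first keeps the added vocabulary bounded; and because the new embeddings start randomly initialized, with LoRA you would typically also pass something like modules_to_save=["embed_tokens", "lm_head"] in LoraConfig so that they are actually trained.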