import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
DataCollatorForLanguageModeling,
BitsAndBytesConfig,
Trainer # 使用原生 Trainer 替代 SFTTrainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import numpy as np
# === Configuration ===
# Local path to the base Yi-6B checkpoint.
MODEL_NAME = "/home/vipuser/ai_writer_project_final_with_fixed_output_ui/models/Yi-6B"
# JSONL training data; each record is expected to carry a "text" field
# (consumed by the filter and tokenize steps below).
DATASET_PATH = "./data/train_lora_formatted.jsonl"
OUTPUT_DIR = "./yi6b-lora-bf16"
# Map the entire model onto GPU 0.
DEVICE_MAP = {"": 0}

# === Quantization config ===
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# === Load base model (4-bit quantized, trust_remote_code for Yi custom code) ===
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=DEVICE_MAP,
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

# === Tokenizer setup ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Right-side padding for causal-LM training.
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token when the checkpoint defines none.
    # NOTE(review): with pad == eos, DataCollatorForLanguageModeling masks pad
    # ids to -100, so EOS positions also receive no loss — confirm the model is
    # still expected to learn to emit EOS.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
# === Prepare the quantized model for training ===
# Casts norm/output layers and enables gradient checkpointing for k-bit training.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# === LoRA config ===
lora_config = LoraConfig(
    r=64,                                  # adapter rank
    lora_alpha=32,                         # scaling factor
    target_modules=["q_proj", "v_proj"],   # attach adapters to attention q/v projections
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
# Log the trainable-parameter count to confirm only adapters are trainable.
model.print_trainable_parameters()
# === Load and preprocess the dataset ===
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

# Text filter predicate (drops short/empty records)
def is_valid_text(example):
    """Return True when the record's "text" field has > 100 chars after stripping.

    Records with a missing or null "text" field are rejected.
    """
    content = example.get("text", "")
    if content is None:
        return False
    return len(content.strip()) > 100
dataset = dataset.filter(is_valid_text)
# Tokenization - returns a dict that includes labels
def tokenize_function(examples):
    """Tokenize a batch of "text" strings to fixed-length (1024) padded ids.

    Also attaches a "labels" column mirroring input_ids, as causal language
    modeling computes loss against the inputs themselves.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024,
        padding="max_length",
    )
    # Causal LM: labels start as a copy of input_ids (the collator re-derives
    # and pad-masks them at batch time).
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded
# Tokenize the whole dataset; drop the raw "text" column afterwards.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

# === Data collator ===
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal language modeling (no masked-LM objective)
)
# === Training arguments ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,     # effective batch size 4 * 4 = 16
    learning_rate=1e-5,
    num_train_epochs=2,
    logging_steps=20,
    save_strategy="epoch",
    bf16=True,                         # train in bfloat16 (fp16 explicitly disabled below)
    optim="paged_adamw_8bit",          # paged 8-bit AdamW to limit optimizer VRAM
    report_to=["tensorboard"],
    warmup_ratio=0.03,
    gradient_checkpointing=True,
    fp16=False,
    max_grad_norm=0.3,
    remove_unused_columns=False,       # keep tokenized columns for the collator
    dataloader_num_workers=4,
    # No validation set.
    # NOTE(review): this kwarg was renamed to `eval_strategy` in newer
    # transformers releases — confirm against the installed version.
    evaluation_strategy="no",
    save_total_limit=2,                # keep at most 2 checkpoints
    logging_dir=f"{OUTPUT_DIR}/logs",  # TensorBoard log directory
    load_best_model_at_end=False,
    ddp_find_unused_parameters=False
)
# === Build the trainer (plain Trainer instead of SFTTrainer) ===
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
# === Pre-training validation ===
def validate_data_and_model():
    """Smoke-test the pipeline before training.

    Inspects one tokenized sample, collates a two-sample batch, then runs a
    forward and backward pass through the model so failures surface early.
    """
    print("\n=== 训练前验证 ===")
    # Inspect the first tokenized sample's layout.
    first = tokenized_dataset[0]
    print("样本键:", list(first.keys()))
    print("input_ids 长度:", len(first["input_ids"]))
    print("labels 长度:", len(first["labels"]))
    # Collate a two-sample test batch and move it onto the model's device.
    batch = data_collator([first, tokenized_dataset[1]])
    batch = {name: tensor.to(model.device) for name, tensor in batch.items()}
    # Forward pass in training mode.
    model.train()
    result = model(**batch)
    print(f"测试批次损失: {result.loss.item():.4f}")
    # Backward pass.
    result.loss.backward()
    print("反向传播成功!")
    # Discard the test gradients so training starts clean.
    model.zero_grad()
    print("验证完成,准备开始训练\n")


validate_data_and_model()
# === Run training ===
try:
    train_result = trainer.train()
    # Persist training metrics alongside the run.
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
except Exception as e:
    print(f"训练出错: {e}")
    # Detailed error diagnostics.
    import traceback
    traceback.print_exc()
    # Fallback: retry on a 10-sample subset to help localize the failure.
    # NOTE(review): if this retry also raises, the exception propagates and the
    # save below never runs; if it succeeds, the model saved below was trained
    # only on 10 samples — confirm this fallback is intentional.
    print("\n尝试小批量训练...")
    small_dataset = tokenized_dataset.select(range(10))
    trainer.train_dataset = small_dataset
    trainer.train()

# === Save artifacts ===
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"训练完成! 模型保存在: {OUTPUT_DIR}")
# === Post-training validation ===
def validate_final_model():
    """Reload the saved adapter on a fresh base model and run a sample generation."""
    print("\n=== 训练后验证 ===")
    # Reload the quantized base model and apply the saved LoRA adapter.
    from peft import PeftModel
    loaded_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map=DEVICE_MAP,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )
    loaded_model = PeftModel.from_pretrained(loaded_model, OUTPUT_DIR)
    # Merge LoRA weights into the base model.
    # NOTE(review): merging adapters into a 4-bit-quantized base can be lossy
    # or unsupported depending on the peft version — confirm before relying on
    # the merged weights for evaluation.
    loaded_model = loaded_model.merge_and_unload()
    # Sample generation test.
    prompt = "中国的首都是"
    inputs = tokenizer(prompt, return_tensors="pt").to(loaded_model.device)
    outputs = loaded_model.generate(
        **inputs,
        max_new_tokens=50,
        temperature=0.7,
        do_sample=True
    )
    # Decodes prompt + continuation together (prompt tokens are included in outputs[0]).
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"提示: {prompt}")
    print(f"生成结果: {generated}")
    print("验证完成")


validate_final_model()
# (removed) Two stray lines previously lived here: a duplicate
# `import numpy as np` (already imported at the top of the file) and a pasted
# linter message ("未使用的 import 语句 'import numpy as np'"). The raw message
# text was not valid Python and made the whole script fail with a SyntaxError
# at parse time, before any training could run.