# Large Language Model Fine-Tuning
import json
import os

import torch
from datasets import Dataset, DatasetDict
# ---- Tokenization ----
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, BitsAndBytesConfig
# ---- Part 2: model loading and LoRA configuration ----
# (AutoPeftModelForCausalLM is needed later to reload and merge the adapter)
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
# ---- Part 3: training configuration ----
from trl import SFTTrainer

# Use only the specified GPUs (must be set before CUDA is first initialized)
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,6"
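Because `CUDA_VISIBLE_DEVICES` only takes effect when it is set before PyTorch initializes CUDA, a quick sanity check (a minimal sketch, not part of the original run) is to count the devices the process can actually see:

# Should print 5 if the five GPUs listed above are visible to this process
print(torch.cuda.device_count())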
In [1305]:
file_path = "/idas/users/licong/deepseek/data.json"
In [1306]:
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
In [1307]:
train_data = data.get("train", [])
# Convert the list of records into a Dataset object
train_dataset = Dataset.from_list(train_data)
train_dataset[0]
Out[1307]:
{'text': '中国共产党自1921年成立以来,始终把为中国人民谋幸福、为中华民族谋复兴作为自己的初心使命。在新民主主义革命时期,党领导人民浴血奋战、百折不挠,创造了新民主主义革命的伟大成就,为实现中华民族伟大复兴创造了根本社会条件。在社会主义革命和建设时期,党领导人民自力更生、发奋图强,创造了社会主义革命和建设的伟大成就,实现了中华民族有史以来最为广泛而深刻的社会变革。在改革开放和社会主义现代化建设新时期,党领导人民解放思想、锐意进取,创造了改革开放和社会主义现代化建设的伟大成就,为实现中华民族伟大复兴提供了充满新的活力的体制保证和快速发展的物质条件。在中国特色社会主义新时代,党领导人民自信自强、守正创新,创造了新时代中国特色社会主义的伟大成就,为实现中华民族伟大复兴提供了更为完善的制度保证、更为坚实的物质基础、更为主动的精神力量。',
'metadata': {'category': '党史回顾'}}
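From the code and the sample above, `data.json` is assumed to be a JSON object with a `train` key holding a list of such records (this layout is inferred, not taken from the file itself). A quick structural check against that assumption:

# Assumed layout: {"train": [{"text": ..., "metadata": {...}}, ...]}
assert isinstance(data, dict) and isinstance(data.get("train"), list)
assert all("text" in record for record in data["train"])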
In [1308]:
if "text" in train_dataset.column_names:
# 对 text 列进行映射操作
train_dataset = train_dataset.map(
lambda examples: {"processed_text": [t + "<|endoftext|>" for t in examples["text"]]},
batched=True,
num_proc=1 # 使用全部 CPU 核心加速处理
)
# 如果需要,将处理后的数据集放回 DatasetDict 中
dataset = DatasetDict()
dataset["train"] = train_dataset
else:
print("列 'text' 不存在于数据集中")
print("Before tokenization:", dataset)
Map: 100%|██████████| 12/12 [00:00<00:00, 3446.66 examples/s]
Before tokenization: DatasetDict({
train: Dataset({
features: ['text', 'metadata', 'processed_text'],
num_rows: 12
})
})
In [1309]:
dataset
Out[1309]:
DatasetDict({
train: Dataset({
features: ['text', 'metadata', 'processed_text'],
num_rows: 12
})
})
In [1310]:
tokenizer = AutoTokenizer.from_pretrained(
    "/idas/users/licong/deepseek7b/",
    use_fast=True,
    pad_token="<|endoftext|>",
)
max_seq_length = 512  # adjust to the available GPU memory
tokenizer.pad_token = tokenizer.eos_token  # overrides the pad token above with the model's own EOS token

def tokenize_function(examples):
    return tokenizer(
        examples["processed_text"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
        add_special_tokens=True,  # explicitly add special tokens
    )

# Tokenize the processed text; use tokenize_function so truncation and padding are actually applied
dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,    # single process; increase to parallelize across CPU cores
    batch_size=1,
)
dataset = dataset.shuffle(seed=42)
# Rebuild the DatasetDict here if the original data is not already in this structure
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 12/12 [00:00<00:00, 625.65 examples/s]
In [1311]:
dataset['train']
Out[1311]:
Dataset({
features: ['text', 'metadata', 'processed_text', 'input_ids', 'attention_mask'],
num_rows: 12
})
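As a quick check (an added sketch, not part of the original notebook) that the appended marker survives tokenization, decode the tail of one tokenized example; the decoded text should end with the `<|endoftext|>` string:

sample = dataset["train"][0]
# Keep only non-padding positions, then decode the last few real tokens
tail = [i for i, m in zip(sample["input_ids"], sample["attention_mask"]) if m == 1][-8:]
print(tokenizer.decode(tail))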
In [1312]:
model = AutoModelForCausalLM.from_pretrained(
    "/idas/users/licong/deepseek7b/",
    device_map="auto",          # automatically shard the model across the visible GPUs
    torch_dtype=torch.float16,  # load in FP16 to reduce GPU memory usage
)
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,  # 8-bit quantization (load_in_4bit=True is the 4-bit alternative)
    # other quantization parameters can go here
)
# Note: this config only takes effect if it is passed to from_pretrained via quantization_config=...
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00, 4.96s/it]
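If 8-bit loading is actually wanted, the config above has to be handed to `from_pretrained`. A minimal sketch, replacing the FP16 load above (same local model path):

model = AutoModelForCausalLM.from_pretrained(
    "/idas/users/licong/deepseek7b/",
    device_map="auto",
    quantization_config=quantization_config,  # applies the 8-bit BitsAndBytesConfig
)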
In [1313]:
# Configure LoRA
config = LoraConfig(
    r=8,                                  # LoRA rank
    lora_alpha=16,                        # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],  # modules to apply LoRA to
    lora_dropout=0.1,                     # LoRA dropout rate
    bias="none",                          # do not train bias terms
    task_type="CAUSAL_LM"                 # task type
)
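Before wrapping the model, it is worth confirming that `q_proj` and `v_proj` are real module names in this checkpoint. A small added check (not part of the original notebook) over the base model's modules:

# Collect the leaf names of all projection modules in the base model
proj_names = sorted({name.split(".")[-1] for name, _ in model.named_modules() if name.endswith("proj")})
print(proj_names)  # should include 'q_proj' and 'v_proj' for a LLaMA-style architecture such as DeepSeek 7B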
In [1314]:
model = get_peft_model(model, config)
model.print_trainable_parameters()
trainable params: 2,523,136 || all params: 7,618,139,648 || trainable%: 0.033120106962890895
In [1315]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    dataset_text_field="processed_text",  # must match the field produced in preprocessing
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        num_train_epochs=1,
        learning_rate=2e-5,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        output_dir="./outputs",
        report_to="none",
        save_strategy="epoch",
    ),
    packing=False,  # disable sequence packing; each full text is trained on separately
)
/idas/users/licong/.conda/envs/lc/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length. Will not be supported from version '1.0.0'. Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  warnings.warn(message, FutureWarning)
/idas/users/licong/.conda/envs/lc/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:280: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
  warnings.warn(
/idas/users/licong/.conda/envs/lc/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:318: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.
  warnings.warn(
/idas/users/licong/.conda/envs/lc/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:408: UserWarning: You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code.
  warnings.warn(
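The deprecation warnings recommend moving `dataset_text_field` and `max_seq_length` into `SFTConfig` and setting `tokenizer.padding_side = 'right'`. A hedged sketch of the newer-style construction (assuming a trl release that ships `SFTConfig` with these fields; names can differ across versions):

from trl import SFTConfig, SFTTrainer

tokenizer.padding_side = "right"  # avoid the half-precision overflow warning above
sft_config = SFTConfig(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-5,
    dataset_text_field="processed_text",
    max_seq_length=max_seq_length,
    packing=False,
    report_to="none",
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    args=sft_config,
)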
In [1316]:
trainer.train()
model.save_pretrained("./lora_adapter")
[3/3 00:01, Epoch 1/1]
| Step | Training Loss |
|------|---------------|

(No loss rows are logged because the run takes only 3 optimizer steps while `logging_steps=10`.)
In [1317]:
merged_model = AutoPeftModelForCausalLM.from_pretrained(
    "./lora_adapter",
    device_map="auto",
    torch_dtype=torch.float16
)
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained("./merged_model")
tokenizer.save_pretrained("./merged_model")
print("Training complete! Model saved to ./merged_model")
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 3.00s/it]
Training complete! Model saved to ./merged_model
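To verify the merged checkpoint, a quick generation test can be run against it. This is an added sketch: the prompt is arbitrary and the decoding settings are illustrative only.

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("./merged_model")
lm = AutoModelForCausalLM.from_pretrained("./merged_model", device_map="auto", torch_dtype=torch.float16)

prompt = "中国共产党自1921年成立以来,"
inputs = tok(prompt, return_tensors="pt").to(lm.device)
output_ids = lm.generate(**inputs, max_new_tokens=128, do_sample=False)
print(tok.decode(output_ids[0], skip_special_tokens=True))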