
It is recommended to run this experiment on the Colab platform.
Give it a try; I have verified that it works. Feel free to leave a comment if you run into any problems.
Training objective: Learn how to align a model's behavior using labelled preference data, i.e., align the model with human preferences.
🌊 RLHF Theory ~


- RLHF requires training a separate reward model, and the RL stage is unstable and hard to tune, so this assignment uses DPO instead (see the loss sketch below).
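For intuition, DPO optimizes the policy directly on preference pairs: it pushes up the log-probability ratio of the chosen answer relative to a frozen reference model and pushes down that of the rejected answer. A minimal sketch of the loss with illustrative tensor names (not the trl implementation):

import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    # implicit "rewards": scaled log-ratios of the policy vs. the frozen reference model
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # maximize the margin between the preferred and the non-preferred answer
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()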
🌊 RLHF Hands-on Code ~
Install and import necessary libraries (~2 min)
pip install -qU bitsandbytes datasets peft trl accelerate
import os
import torch
import re
import json
import gdown
from datasets import Dataset
import pandas as pd
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig, GenerationConfig
from tqdm.auto import tqdm
from trl import DPOTrainer, DPOConfig
Load dataset
- 50 training examples and 10 test examples, all automatically generated with a large language model.
git clone https://github.com/Baiiiiiiiiii/GenAI_hw6_dataset.git
# Open and load the json dataset
with open("/content/GenAI_hw6_dataset/labelled_data.json", 'r') as jsonfile:
full_data = json.load(jsonfile)
with open("/content/GenAI_hw6_dataset/test_prompt.json", 'r') as jsonfile:
test_data = json.load(jsonfile)
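Optionally, print one record from each file to see the fields used later ('prompt', 'support', 'oppose' in the labelled data; 'id' and 'prompt' in the test prompts); the field names here are simply those accessed further down in the code:

# Quick look at the data layout
print(json.dumps(full_data[0], ensure_ascii=False, indent=2))
print(json.dumps(test_data[0], ensure_ascii=False, indent=2))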


Load model
model = AutoModelForCausalLM.from_pretrained(
    'MediaTek-Research/Breeze-7B-Instruct-v0_1',
    device_map='auto',
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4'
    )
)
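If you want to confirm that the 4-bit NF4 quantization keeps the 7B model within Colab's GPU memory, a quick check (get_memory_footprint is provided by transformers):

# Optional: report the memory taken by the quantized weights
print(f"Model footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"CUDA memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")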
【⭐BEFORE⭐】Get response from the original model
- As the results below show, before fine-tuning the model does not exhibit a particular preference.
tokenizer = AutoTokenizer.from_pretrained('MediaTek-Research/Breeze-7B-Instruct-v0_1')
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
def data_formulate(data):
    messages = [
        {"role": "system", "content": '回覆請少於20字'},  # system prompt: "Reply in fewer than 20 characters"
        {"role": "user", "content": data['prompt']},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return prompt
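To see what the Breeze chat template actually produces (the exact markers come from the model's tokenizer configuration), you can print one formatted prompt:

# Inspect the prompt string produced by the chat template for the first test question
print(data_formulate(test_data[0]))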
original_model_response = []
for data in tqdm(test_data):
    id = data['id']
    print(f'Question {id}:\n'+data['prompt'])
    inputs = tokenizer(data_formulate(data), return_tensors="pt").to('cuda')
    generation_config = GenerationConfig(
        do_sample=False,
        max_new_tokens=200,
        pad_token_id=tokenizer.pad_token_id
    )
    output = model.generate(**inputs, generation_config=generation_config)
    # keep only the text generated after the chat template's instruction tag
    output = tokenizer.batch_decode(output, skip_special_tokens=True)[0].split('[/INST] ')[1]
    original_model_response.append(output)
    print('Response from original model:\n'+output+'\n')
【⭐PREPARE - 1⭐】Set parameters
num_epoch = 2       ## how many full passes over the training set
data_size = 50      ## how many examples to train on; 50 are provided, so 50 is the maximum
support_ratio = 0   ## preference ratio: fraction of examples whose "support" answer is treated as the preferred one
【⭐PREPARE - 2⭐】Set training data
# Select part of the data for training
training_data = full_data[:data_size]
# Define the size of the support dataset
support_data_size = int(data_size * support_ratio)
# Prepare the data for the training dataset
prompt_list = [data_formulate(data) for data in training_data]
chosen_list = [data['support'] for data in training_data[:support_data_size]] + [data['oppose'] for data in training_data[support_data_size:]]
rejected_list = [data['oppose'] for data in training_data[:support_data_size]] + [data['support'] for data in training_data[support_data_size:]]
position_list = ['support' for _ in range(support_data_size)] + ['oppose' for _ in range(data_size - support_data_size)]
# Create the training dataset
train_dataset = Dataset.from_dict({'prompt': prompt_list, 'position': position_list, 'chosen': chosen_list, 'rejected': rejected_list})
pd.DataFrame(train_dataset).rename(columns={"chosen": "preferred", "rejected": "non-preferred"})
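A quick sanity check (under the default support_ratio = 0) confirms that every preferred answer comes from the 'oppose' side, which is exactly what pushes the model toward an opposing stance:

# With support_ratio = 0, support_data_size is 0, so every pair uses 'oppose' as the chosen answer
assert all(p == 'oppose' for p in train_dataset['position'])
print(train_dataset)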
【⭐TRAINING⭐】Run the training
training_args = DPOConfig(
    output_dir='./',
    per_device_train_batch_size=1,
    num_train_epochs=num_epoch,
    gradient_accumulation_steps=8,
    gradient_checkpointing=False,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=1,
    warmup_ratio=0.1,
    report_to='none',
    beta=0.1  # temperature of the DPO loss; smaller values let the policy drift further from the reference model
)
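With these settings each optimizer step sees per_device_train_batch_size × gradient_accumulation_steps = 8 preference pairs, so the whole run is only a handful of updates; a rough count (the trailing partial batch may add one more step per epoch):

effective_batch = 1 * 8                            # per_device_train_batch_size * gradient_accumulation_steps
approx_steps = (data_size // effective_batch) * num_epoch
print(f'effective batch size: {effective_batch}, ~{approx_steps} optimizer steps')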
peft_config = LoraConfig(  ## LoRA fine-tuning configuration
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)
dpo_trainer = DPOTrainer(  ## DPO trainer
    model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)
dpo_trainer.train()  # run training
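If you want to reuse the result later, you can save the trained LoRA adapter (the output path below is just an example):

# Optional: save the LoRA adapter weights so they can be reloaded later
dpo_trainer.save_model('./dpo_lora_adapter')  # example path, choose your own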
【⭐AFTER⭐】Get response from the trained model
- As the results below show, after fine-tuning the model exhibits a strong "oppose" preference (since support_ratio = 0, every chosen answer in the training pairs was the opposing one).
trained_model_response = []
for data in tqdm(test_data):
    id = data['id']
    print(f'Question {id}:\n'+data['prompt'])
    inputs = tokenizer(data_formulate(data), return_tensors="pt").to('cuda')
    generation_config = GenerationConfig(
        do_sample=False,
        max_new_tokens=200,
        pad_token_id=tokenizer.pad_token_id
    )
    output = model.generate(**inputs, generation_config=generation_config)
    output = tokenizer.batch_decode(output, skip_special_tokens=True)[0].split('[/INST] ')[1]
    trained_model_response.append(output)
    print('Response from trained model:\n'+output+'\n')
【⭐BEFORE vs. AFTER⭐】Compare the results and summarize
model_response = []
print(f'num_epoch: {num_epoch}\ndata_size: {data_size}\nsupport_ratio: {support_ratio}')
print()
for data in test_data:
    id = data['id']
    ref_output = original_model_response[id-1]
    output = trained_model_response[id-1]
    print(f'Question {id}:\n'+data['prompt'])
    print('Response from original model:\n'+ref_output)
    print('Response from trained model:\n'+output)
    print()
    model_response.append({'id': data['id'], 'prompt': data['prompt'], 'response_from_original_model': ref_output, 'response_from_trained_model': output})
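To keep the before/after comparison around for later inspection, you might dump it to a JSON file (the filename is just an example):

# Optional: persist the comparison results
with open('model_response.json', 'w', encoding='utf-8') as f:
    json.dump(model_response, f, ensure_ascii=False, indent=2)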