Environment setup
# Create a virtual environment
conda create -n xtuner0812 python=3.10 -y
# Activate the virtual environment (note: all subsequent steps must be run inside it)
conda activate xtuner0812
# Install PyTorch (CUDA 12.1 build), torchvision, and torchaudio
conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia -y
# Install the remaining dependencies
pip install transformers==4.39.3
pip install streamlit==1.36.0
# Install XTuner
mkdir -p /root/demo
cd /root/demo
git clone -b v0.1.21 https://github.com/InternLM/XTuner /root/demo/XTuner
cd /root/demo/XTuner
# Install XTuner in editable mode with the DeepSpeed extra (via the Aliyun PyPI mirror)
pip install -e '.[deepspeed]' -i https://mirrors.aliyun.com/pypi/simple/
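Before moving on, it is worth confirming that the environment works. The snippet below is a minimal sanity check (illustrative, not part of the original steps), assuming the xtuner0812 environment is active:
# Minimal environment check; run inside the activated xtuner0812 environment
import torch
from importlib.metadata import version
print('torch:', torch.__version__)                  # expected: 2.1.2
print('CUDA available:', torch.cuda.is_available())
print('transformers:', version('transformers'))     # expected: 4.39.3
print('xtuner:', version('xtuner'))                 # expected: 0.1.21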
Model preview
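# Launch the Streamlit chat demo; Streamlit serves on port 8501 by default, so forward that port if you are working on a remote machine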
python -m streamlit run /root/Tutorial/tools/xtuner_streamlit_demo.py
Fine-tuning
Data preparation
Generate the fine-tuning data with the following script:
import json
import os

# Set the user's name
name = '靓仔'
# Number of times the seed data will be repeated
n = 1875

# Initialize the seed conversations
data = [
    {"conversation": [{"input": "请介绍一下你自己", "output": "我是{}的小助手,内在是上海AI实验室书生·浦语的1.8B大模型哦".format(name)}]},
    {"conversation": [{"input": "你是谁", "output": "我是{}的小助手,内在是上海AI实验室书生·浦语的1.8B大模型哦".format(name)}]},
    {"conversation": [{"input": "你在实战营做什么", "output": "我在这里帮助{}完成XTuner微调个人小助手的任务".format(name)}]},
    {"conversation": [{"input": "你可以做什么", "output": "我在这里帮助{}完成XTuner微调个人小助手的任务".format(name)}]}
]

# Repeatedly append the first two seed conversations so the identity data dominates the dataset
for i in range(n):
    data.append(data[0])
    data.append(data[1])

# Make sure the output directory exists before writing
os.makedirs('datas', exist_ok=True)

# Write the data to 'datas/assistant.json'
with open('datas/assistant.json', 'w', encoding='utf-8') as f:
    # json.dump writes the list as JSON
    # ensure_ascii=False keeps the Chinese characters readable
    # indent=4 pretty-prints the file
    json.dump(data, f, ensure_ascii=False, indent=4)
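To confirm the file was written as expected, you can load it back and inspect it (a quick sketch, not part of the tutorial script; it assumes you run it from the same directory):
# Quick check of the generated dataset
import json
with open('datas/assistant.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)
print(len(dataset))    # 4 seed records + 2 * 1875 repeats = 3754
print(dataset[0])      # inspect one conversation record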
Fine-tuning configuration
# List all built-in config files
xtuner list-cfg
# Filter configs related to internlm2
xtuner list-cfg -p internlm2
# Copy the chosen config into the working directory
# (the name encodes model, tuning method, data format, and epochs: internlm2_chat_1_8b + QLoRA + alpaca-style data + e3 = 3 epochs)
xtuner copy-cfg internlm2_chat_1_8b_qlora_alpaca_e3 /root/demo/xtuner_demo/
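xtuner copy-cfg names the copy internlm2_chat_1_8b_qlora_alpaca_e3_copy.py in the target directory.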
The modified config file is shown below:
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.parallel.sequence import SequenceParallelSampler
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
pretrained_model_name_or_path = '/root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b'  # modified: path to the local pretrained model
use_varlen_attn = False
# Data
alpaca_en_path = '/root/demo/xtuner_demo/datas/assistant.json'  # modified: path to the generated fine-tuning data
prompt_template = PROMPT_TEMPLATE.internlm2_chat
max_length = 2048
pack_to_max_length = True
# parallel
sequence_parallel_size = 1
# Scheduler & Optimizer
batch_size = 1 # per_device
accumulative_counts = 16
accumulative_counts *= sequence_parallel_size
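# With batch_size = 1 and accumulative_counts = 16, each optimizer step accumulates 16 samples per device;
# multiplying by sequence_parallel_size keeps the global batch size unchanged when sequence parallelism is enabled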
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
# Save
save_steps = 500
save_total_limit