1. Preface
After fine-tuning the Qwen large model, the model needs to be post-training quantized. A new image is built on top of Qwen's prebuilt Docker image to serve as the quantization environment. Once the image is running as a container, the required files are copied into it with the docker cp command. The whole quantization process is documented in detail in the Qwen GitHub repository; the image-building process is covered in the earlier CSDN post 【深度学习】微调Qwen1.8B (on fine-tuning Qwen-1.8B).
2. Quantization Process
The Qwen documentation provides a detailed quantization tutorial; see the "Quantization of Fine-tuned Models" section in the Qwen GitHub repository.
2.1 Install auto_gptq
docker run --gpus all -p 9519:9519 -v /ssd/dongzhenheng/Work/Merged_Model/Merged_model_1:/data/shared/Qwen/Qwen-Chat/ -d qwenllm/qwen:cu121_V1
pip install auto-gptq optimum
Note: quantizing with the stock auto-gptq package raises an error. The workaround is as follows:
# Clone the patched fork
git clone https://github.com/wangitu/unpadded-AutoGPTQ.git && cd unpadded-AutoGPTQ
# Install from source in editable mode
pip install -v -e .
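After installing the patched fork, a quick import check confirms the environment is usable before starting the much longer quantization run. This is a minimal sketch and only uses the two classes that run_gptq.py imports later:

# Minimal sanity check: the patched auto_gptq should import and build a quantize config.
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

cfg = BaseQuantizeConfig(bits=4, group_size=128)
print("auto_gptq OK:", cfg.bits, "bits, group size", cfg.group_size)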
2.2 Prepare the Data
Prepare a calibration dataset. You can reuse your fine-tuning data, or prepare other data in the same format as the fine-tuning data. In practice, the fine-tuning dataset itself was used. The expected format is shown below; a quick sanity check of the file is sketched after the example.
[
  {
    "id": "identity_0",
    "conversations": [
      {
        "from": "user",
        "value": "你好"
      },
      {
        "from": "assistant",
        "value": "我是一个语言模型,我叫通义千问。"
      }
    ]
  }
]
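Before launching the (fairly slow) quantization run, it is worth checking that the calibration file parses and matches this structure. Below is a minimal sketch of such a check; the filename calibration_data.json is illustrative:

import json

# Hypothetical filename; replace with your actual calibration data path.
with open("calibration_data.json", encoding="utf-8") as f:
    samples = json.load(f)

for sample in samples:
    # Each record must contain a "conversations" list of {"from", "value"} turns,
    # matching what preprocess() in run_gptq.py expects.
    assert "conversations" in sample, f"missing 'conversations' in {sample.get('id')}"
    for turn in sample["conversations"]:
        assert turn["from"] in ("user", "assistant")
        assert isinstance(turn["value"], str)

print(f"{len(samples)} calibration samples look well-formed")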
2.3 Quantization
The quantization code is provided in run_gptq.py in the Qwen GitHub repository:
import argparse
import json
from typing import Dict
import logging

import torch
import transformers
from transformers import AutoTokenizer
from transformers.trainer_pt_utils import LabelSmoother
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

IGNORE_TOKEN_ID = LabelSmoother.ignore_index


def preprocess(
    sources,
    tokenizer: transformers.PreTrainedTokenizer,
    max_len: int,
    system_message: str = "You are a helpful assistant."
) -> Dict:
    roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}

    im_start = tokenizer.im_start_id
    im_end = tokenizer.im_end_id
    nl_tokens = tokenizer('\n').input_ids
    _system = tokenizer('system').input_ids + nl_tokens
    _user = tokenizer('user').input_ids + nl_tokens
    _assistant = tokenizer('assistant').input_ids + nl_tokens

    # Apply prompt templates
    data = []
    # input_ids, targets = [], []
    for i, source in enumerate(sources):
        source = source["conversations"]
        if roles[source[0]["from"]] != roles["user"]:
            source = source[1:]

        input_id, target = [], []
        system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
        input_id += system
        target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
        assert len(input_id) == len(target)
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            _input_id = tokenizer(role).input_ids + nl_tokens + \
                tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
            input_id += _input_id
            if role == '<|im_start|>user':
                _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
            elif role == '<|im_start|>assistant':
                _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
                    _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
            else:
                raise NotImplementedError
            target += _target
        assert len(input_id) == len(target)

        input_id = torch.tensor(input_id[:max_len], dtype=torch.int)
        target = torch.tensor(target[:max_len], dtype=torch.int)
        data.append(dict(input_ids=input_id, attention_mask=input_id.ne(tokenizer.pad_token_id)))

    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Model Quantization using AutoGPTQ")
    parser.add_argument("--model_name_or_path", type=str, help="model path")
    parser.add_argument("--data_path", type=str, help="calibration data path")
    parser.add_argument("--out_path", type=str, help="output path of the quantized model")
    parser.add_argument("--max_len", type=int, default=8192, help="max length of calibration data")
    parser.add_argument("--bits", type=int, default=4, help="the bits of quantized model. 4 indicates int4 models.")
    parser.add_argument("--group-size", type=int, default=128, help="the group size of quantized model")
    args = parser.parse_args()

    quantize_config = BaseQuantizeConfig(
        bits=args.bits,
        group_size=args.group_size,
        damp_percent=0.01,
        desc_act=False,  # setting this to False significantly speeds up inference, but perplexity may be slightly worse
        static_groups=False,
        sym=True,
        true_sequential=True,
        model_name_or_path=None,
        model_file_base_name="model"
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    tokenizer.pad_token_id = tokenizer.eod_id
    data = preprocess(json.load(open(args.data_path)), tokenizer, args.max_len)

    model = AutoGPTQForCausalLM.from_pretrained(args.model_name_or_path, quantize_config, device_map="auto", trust_remote_code=True)

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
    )
    model.quantize(data, cache_examples_on_gpu=False)

    model.save_quantized(args.out_path, use_safetensors=True)
    tokenizer.save_pretrained(args.out_path)
The launch script is as follows:
#!/bin/bash
YOUR_LORA_MODEL_PATH="/path/to/your/lora_model"
DATA="/path/to/your/data"
OUTPUT_PATH="/path/to/your/output"
BITS=4  # Set the desired value (4 for int4; 8 for int8)

python run_gptq.py \
    --model_name_or_path "$YOUR_LORA_MODEL_PATH" \
    --data_path "$DATA" \
    --out_path "$OUTPUT_PATH" \
    --bits "$BITS"
Running this script starts the quantization run.
2.4 Assemble the Output Files
Next, copy all *.py, *.cu, and *.cpp files, plus generation_config.json, from the original model directory into the output model directory. Also overwrite config.json in the output directory with the config.json from the corresponding official quantized model (for example, if you fine-tuned Qwen-7B-Chat with --bits 4, you can take config.json from the Qwen-7B-Chat-Int4 repository). Finally, rename gptq.safetensors to model.safetensors.
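These file operations can be scripted so the step is repeatable. Below is a minimal sketch; the three paths are placeholders you would replace with your own original model directory, the run_gptq.py output directory, and the config.json taken from the matching official quantized model:

import shutil
from pathlib import Path

ORIG_DIR = Path("/path/to/original/model")      # the fine-tuned (merged) model directory
OUT_DIR = Path("/path/to/your/output")          # the directory written by run_gptq.py
REF_CONFIG = Path("/path/to/Qwen-7B-Chat-Int4/config.json")  # config.json of the matching official quantized model

# Copy code files and the generation config from the original model.
for pattern in ("*.py", "*.cu", "*.cpp", "generation_config.json"):
    for src in ORIG_DIR.glob(pattern):
        shutil.copy2(src, OUT_DIR / src.name)

# Overwrite config.json with the one from the official quantized model.
shutil.copy2(REF_CONFIG, OUT_DIR / "config.json")

# Rename the quantized weights so transformers can find them.
(OUT_DIR / "gptq.safetensors").rename(OUT_DIR / "model.safetensors")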
2.5 Test the Model
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("/path/to/your/model", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "/path/to/your/model",
    device_map="auto",
    trust_remote_code=True
).eval()
# Optionally load the generation config saved alongside the model:
# model.generation_config = GenerationConfig.from_pretrained("/path/to/your/model", trust_remote_code=True)

response, history = model.chat(tokenizer, "你好", history=None)
print(response)