9.10-9.11-AutoAWQ代码解析

原创已于 2024-09-11 17:06:17 修改 · 804 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#算法

于 2024-09-10 14:53:25 首次发布

算法专栏收录该内容

7 篇文章

订阅专栏

1、首先要去官网下载源码。
https://github.com/casper-hansen/AutoAWQ.git

2、git clone 后，进入仓库目录，安装 AutoAWQ 及其依赖环境。

pip install -e .

3、查看quantize.py代码，修改model_path部分，修改为想要量化的模型。

4、量化部分代码解析。本篇只针对AutoAWQ的量化代码进行解析。

import torch
import logging
import functools
import torch.nn as nn
from tqdm import tqdm
from typing import Dict, List
from collections import defaultdict
from awq.utils.utils import clear_memory
from awq.utils.calib_data import get_calib_dataset
from awq.quantize.scale import apply_scale, apply_clip
from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
from awq.utils.module import append_str_prefix, get_op_name, get_named_linears, set_op_by_name

class AwqQuantizer:
    #初始化参数
    def __init__(self, awq_model, model, tokenizer, w_bit, group_size, version, 
                       calib_data, split, text_column) -> None:
        self.awq_model = awq_model#代表量化操作所需的模型框架AWQ，主要就是量化的方法。
        self.model = model#需要量化的模型。
        self.tokenizer = tokenizer#用于将文本转化为模型输入的分词器。
        self.w_bit = w_bit#用于量化 权重的比特数，表示量化后的权重精度。
        self.group_size = group_size#表示量化时分组的大小，量化通常按组进行，以便在降低复杂度同时保持一定精度。
        self.version = version#量化的版本。GEMM or GEMV。
        self.calib_data = calib_data#校准数据，用于量化前的校准以确定最优的缩放因子。
        self.split = split#用于指定校准以确定最优的缩放因子。
        self.text_column = text_column#用于指定校准数据集的划分和文本列。
        self.modules, self.module_kwargs, self.inps = self.init_quant()
    
    #伪量化，并不是真正进行量化，而是模拟量化操作。
    #w：就是输入的权重张量
    #get_scale_zp：是否返回缩放因子和零点。
    def pseudo_quantize_tensor(self, w: torch.Tensor, get_scale_zp=False):
        org_w_shape = w.shape
        if self.group_size > 0:
            assert org_w_shape[-1] % self.group_size == 0
            w = w.reshape(-1, self.group_size)
        assert w.dim() == 2

        # zero point quantization
        #获取权重最大值和最小值，计算缩放因子(scales)和零点(zeros)，零点就是偏移量。
        max_val = w.amax(dim=1, keepdim=True)
        min_val = w.amin(dim=1, keepdim=True)
        max_int = 2 ** self.w_bit - 1
        min_int = 0
        scales = (max_val - min_val).clamp(min=1e-5) / max_int
        zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)

        assert torch.isnan(scales).sum() == 0
        assert torch.isnan(w).sum() == 0
        #对权重进行缩放和裁剪，再使用逆缩放恢复近似值。
        w = (torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros) * scales
        assert torch.isnan(w).sum() == 0

        w = w.reshape(org_w_shape)

        #最终返回量化后的权重张量，是否返回缩放因子和零点。
        if get_scale_zp:
            return w, scales.view(w.shape[0], -1), zeros.view(w.shape[0], -1)
        else:
            return w
    
    def quantize(self):
        """Quantize every layer in ``self.modules`` in place, one at a time.

        For each layer the method captures the inputs of its linear
        sub-modules, searches and applies scales, searches and applies
        clipping thresholds, and finally swaps the linear sub-modules for
        their quantized replacements.
        """
        layer_count = len(self.modules)
        for idx in tqdm(range(layer_count), desc="AWQ"):
            # [STEP 1]: Get layer, extract linear modules, extract input features
            self.modules[idx] = self.modules[idx].cuda()
            layer = self.modules[idx]
            linears = get_named_linears(layer)
            feats = self._get_input_feat(layer, linears)
            clear_memory()

            # [STEP 2]: Compute and apply scale list
            module_config: List[Dict] = self.awq_model.get_layers_for_scaling(
                layer, feats, self.module_kwargs
            )
            scales_list = []
            for group in module_config:
                scales_list.append(self._search_best_scale(layer, **group))
            apply_scale(layer, scales_list, input_feat_dict=feats)
            # The prefixed list is built but not used further in this method.
            scales_list = append_str_prefix(
                scales_list, get_op_name(self.model, layer) + "."
            )

            # [STEP 3]: Compute and apply clipping list
            clip_list = self._search_best_clip(layer, linears, feats)
            apply_clip(layer, clip_list)
            # As above, the prefixed list is not used further in this method.
            clip_list = append_str_prefix(
                clip_list, get_op_name(self.model, layer) + "."
            )

            # [STEP 4]: Quantize weights
            self._apply_quant(layer, linears)
            clear_memory()
    
    #module：模型的某个模块。
    #named_linears：一个字典，键是线性层的名称，值是相应的nn.Linear层。
    def _apply_quant(self, module, named_linears: Dict[str, nn.Linear]):
        
        #循环遍历named_linears中的每个nn.Linear层
        for name, linear_layer in named_linears.items():
            # NOTE: small regression in perplexity if linear layer uses .cpu().float()
            
            #将线性层的权重移动到GPU并将数据类型转换为float16（半精度）。
            #为了在GPU上进行高效的计算，并减少存储和计算资源的消耗。
            linear_layer = linear_layer.cuda().half()

            #
            #self.pseudo_quantize_tensor()：这是一个伪量化函数，用来将权重数据 
            # linear_layer.weight.data 进行量化操作，并返回量化的权重、缩放因子 (scales) 和零点 (zeros)。
            # 量化过程中，缩放因子和零点用于将浮点数映射到离散的整数表示。
            # get_scale_zp=True：表示在量化过程中同时计算缩放因子和零点。
            linear_layer.weight.data, scales, zeros = self.pseudo_quantize_tensor(
                linear_layer.weight.data, 
                get_scale_zp=True
            )

            if self.version == 'GEMM':
                #将 scales 和 zeros 的矩阵转置，并使它们在内存中保持连续性（contiguous()）。
                scales = scales.t().contiguous()
                zeros = zeros.t().contiguous()
                q_linear_module = WQLinear_GEMM

            elif self.version  == 'GEMV':
                q_linear_module = WQLinear_GEMV
            
            #通过调用q_linear_module 的 from_linear 方法
            #将linear_layer 量化并创建量化后的线性层 q_linear
            q_linear = q_linear_module.from_linear(
                linear=linear_layer,
                w_bit=self.w_bit,
                group_size=self.group_size,
                init_only=False,
                scales=scales,
                zeros=zeros
            )

            #将原始的线性层移动到cpu，清理GPU内存空间
            linear_layer.cpu()
            #将量化后的线性层q-linear移动到module所在的设备（GPU），确保量化层和模型的其余部分在相同设备上。
            q_linear.to(next(module.parameters()).device)

            #量化后线性层替换到原模型
            set_op_by_name(module, name, q_linear)
            #释放内存
            clear_memory()


    #找出最佳的缩放系数
    @torch.no_grad()
    def _search_best_scale(self, module, prev_op, layers: List[nn.Linear], inp: torch.Tensor, module2inspect=None, kwargs={}):
        if module2inspect is None:
            assert len(layers) == 1
            module2inspect = layers[0]
        
        if "use_cache" in kwargs:
            kwargs.pop("use_cache")
        
        # Put x on the right device
        inp = inp.to(next(module2inspect.parameters()).device)

        #搜索出权重和输入的最大值
        # [STEP 1]: Compute maximum of weight
        weight = torch.cat([_m.weight for _m in layers], dim=0)
        org_shape = weight.shape
        weight = weight.view(-1, self.group_size)
        w_scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True)
        w_scale = w_scale.view(org_shape)
        w_max = w_scale.mean(0)
        clear_memory(weight)

        #计算激活值最大值
        # [STEP 2]: Compute maximum of x
        x_max = inp.abs().view(-1, inp.shape[-1]).mean(0)

        #计算模块输出
        # [STEP 3]: Compute output of module
        with torch.no_grad():
            fp16_output = module2inspect(inp, **kwargs)
            if isinstance(fp16_output, tuple):
                fp16_output = fp16_output[0]
        
        #计算损失
        # [STEP 4]: Compute loss
        best_scales = self._compute_best_scale(
            inp, w_max, x_max, module2inspect,
            layers, fp16_output, kwargs
        )
        
        return (get_op_name(module, prev_op), tuple([get_op_name(module, m) for m in layers]), best_scales)
    
###############################################################################################################
    #核心计算最好的缩放系数
    def _compute_best_scale(self, x, w_max, x_max, module2inspect, linears2scale: List[nn.Linear],
                                  fp16_output, kwargs={}):
        """
        Compute loss and select best scales

        L(s) = || Q(W * s) (s^-1 * X) - W * X ||
        Q: weight quantization function | pseudo_quantize_tensor(W * s)
        X: inputs from calib dataset    | X
        W: original weights in FP16     | layer
        s: per channel scaling factor   | s^-1 * X
        """
        n_grid = 20  #grid search的网格大小，将测试20个不同的缩放因子
        history = [] #用于存储每次尝试的缩放因子的误差。
        best_ratio = -1 #用于存储当前找到的最佳缩放因子的比例。 -1表示尚未找到合适的。
        best_scales = None #用于存储最佳缩放因子。
        best_error = float('inf') #用于记录最小的误差，初始值为正无穷大。

        #保存当前检查的模块的原始状态字典（权重等），以便在量化操作后还原。
        org_sd = {k: v.cpu() for k, v in module2inspect.state_dict().items()}
        
        #将输入x_max和权重最大值w_max转换为一维向量，并移动到计算设备上。
        device = x.device
        x_max = x_max.view(-1).to(device)
        w_max = w_max.view(-1).to(device)
        
        #在0-1的范围内，按n_grid个点进行遍历，生成不同的缩放因子比例ratio。
        for ratio in range(n_grid):
            # create new scales
            ratio = ratio / n_grid

            #阿尔法为ratio体现在此，分子ratio越大表示依赖越强。
            # NOTE: s^-1 * x is fused here, according to paper
            #clamp表示将缩放因子限制在最小值  1e-4以上，防止数值不稳定。
            scales = (x_max.pow(ratio) / w_max.pow(1-ratio)).clamp(min=1e-4)
            #将scales标准化，缩放因子的最大值和最小值之间的比例接近1。
            scales = scales / (scales.max() * scales.min()).sqrt()
            #将缩放因子转为二维张量，形状为[1,-1]，以匹配权重的维度。
            scales_view = scales.view(1, -1).to(device)

            # Q(W * s)
            #遍历每个需要进行量化的模块
            for fc in linears2scale:
                #将缩放因子进行点乘
                fc.weight.mul_(scales_view)
                #对权重进行伪量化，应用反缩放因子
                fc.weight.data = self.pseudo_quantize_tensor(fc.weight.data) / scales_view

            #将量化后的权重和输入x通过当前模块module2inspect计算输出
            # W * X
            int_w_output = module2inspect(x, **kwargs)
            #如果输出是元组类型，只使用第一个元素作为输出。
            if isinstance(int_w_output, tuple):
                int_w_output = int_w_output[0]
            
            #计算当前缩放因子下的损失，即量化输出与FP16输出的均方误差。（L2范数）
            # compute mean squared error (L2 norm)
            loss = (fp16_output - int_w_output).float().pow(2).mean().item() # NOTE: float prevents overflow

            #将当前损失添加到history
            #如果当前损失小于最小误差best_error，那么就进行更新，并记录最佳的缩放因子和阿尔法比例。
            history.append(loss)
            if loss < best_error:
                best_error = loss
                best_ratio = ratio
                best_scales = scales.clone()
            #每次计算后，将module2inspect的权重恢复到原始原始状态，下次计算不会受到影响。
            module2inspect.load_state_dict(org_sd)

        #如果grid search没有找到有效的阿尔法，记录历史误差并抛出异常。
        if best_ratio == -1:
            logging.debug(history)
            raise Exception

        #确保缩放因子没有出现NaN（非数值）情况。
        assert torch.isnan(best_scales).sum() == 0, best_scales


        #返回最佳的缩放因子，并移动到CPU防止GPU内存占用。
        return best_scales.detach().cpu()
    
###############################################################################################################

    @torch.no_grad()
    #layer：当前模型中的一层（例如transformer层）
    #named_linears：线性层的名称及其对应的层
    #input_feat：输入特征，用于计算截断值。
    def _search_best_clip(self, layer, named_linears, input_feat):
        #初始化clip_list，用于存储每个线性层的最佳截断值。
        clip_list = []
        #跳过量化的层名，因为某些层如果进行截断，量化会对计算精度产生较大影响。
        avoid_clipping = ["q_", "k_", "query", "key", "Wqkv"]

        #遍历所有的层，如果线性层的名称包含avoid_clipping中的任意字符串，就跳过量化。
        for name in named_linears:
            # due to qk bmm, it is hard to clip precisely
            if any([_ in name for _ in avoid_clipping]):
                continue
            
            #线性层转移，调用函数计算最佳截断值max_val，并存储在clip_list中
            named_linears[name].cuda()
            max_val = self._compute_best_clip(named_linears[name].weight, input_feat[name])
            clip_list.append((name, max_val))

            #计算完成，将线性层移动至CPU节省GPU内存。
            named_linears[name].cpu()
        
        #返回存储了每个线性层名称和最佳截断值列表clip_list
        return clip_list

    @torch.no_grad()
    #w：线性层权重张量
    #input_feat：输入特征张量。
    #n_grid：用于截断搜索的网格大小，默认20
    #max_shrink：截断返回的最大缩小比例，默认0.5，表示最多缩减50%
    #n_sample_token：用于采样的token数，默认512
    def _compute_best_clip(self, w: torch.Tensor, input_feat: torch.Tensor, n_grid=20, max_shrink=0.5, n_sample_token=512):
        #确保w是二维张量（[out_channels,in_channels]）
        assert w.dim() == 2
        org_w_shape = w.shape
        # w           [co, ci]      -> [co, 1, n_group, group size]
        # input_feat  [n_token, ci] -> [1, n_token, n_group, group size]
        #如果group_size>0，则使用self.group_size，否则，使用权重的列数(w.shape[1])
        group_size = self.group_size if self.group_size > 0 else w.shape[1]

        #将输入特征input_feat和权重w重塑为适合按组计算的形状。
        #input_feat被重塑为形状[1,n_token,n_group,group_size]
        #
        #具体的运算过程：w = [outp,inp]
        #①首先对in_channels进行分组，每组大小：group_size。
        #inp = n_group（多少组） * group_size（多少通道）
        #②在 out_channels 和 n_group 之间插入了一个额外的维度 1。
        #这个额外维度通常用于广播操作，使得在后续计算中更容易对不同维度进行操作。

        input_feat = input_feat.view(-1, input_feat.shape[-1])
        input_feat = input_feat.reshape(1, input_feat.shape[0], -1, group_size)
        input_feat = input_feat[:, 0::input_feat.shape[1] // n_sample_token]
        #权重w被重塑为[out_channels,1,n_group,group_size]
        w = w.reshape(w.shape[0], 1, -1, group_size)

        
        #设置批大小oc_batch_size,确保不会超出显存限制。如果输出通道数w.shape[0]能被256整除，则批大小256，否则64
        oc_batch_size = 256 if w.shape[0] % 256 == 0 else 64  # prevent OOM
        assert w.shape[0] % oc_batch_size == 0
        #保留所有权重w_all,并初始化best_max_val_all,用于存储每个批次的最佳截断值。
        w_all = w
        best_max_val_all = []

        #按批次遍历权重，防止显存溢出（OOM）
        for i_b in range(w.shape[0] // oc_batch_size):
            w = w_all[i_b * oc_batch_size: (i_b + 1) * oc_batch_size]

            #计算每组权重的绝对最大值org_max_val，并保持最后一个维度的形状。
            org_max_val = w.abs().amax(dim=-1, keepdim=True)  # co, 1, n_group, 1

            #初始化best_max_val为原始最大值的副本
            best_max_val = org_max_val.clone()
            #存储每个组的最小误差，初始了一个较大的数
            min_errs = torch.ones_like(org_max_val) * 1e9
            #将输入特征input_feat移动到与权重相同的设备上。
            input_feat = input_feat.to(w.device)
            #计算原始权重下的输出org_out
            org_out = (input_feat * w).sum(dim=-1)  # co, n_token, n_group

            #遍历缩放因子范围，逐步缩小最大截断值
            for i_s in range(int(max_shrink * n_grid)):
                #max_val是当前缩放因子下的最大截断值
                max_val = org_max_val * (1 - i_s / n_grid)
                #将权重w限制在[min_val，max_val]之间，并使用pseudo_quantize_tensor进行量化。
                min_val = - max_val
                cur_w = torch.clamp(w, min_val, max_val)
                q_w = self.pseudo_quantize_tensor(cur_w)
                #计算量化后权重的输出cur_out
                cur_out = (input_feat * q_w).sum(dim=-1)


                #计算量化输出与原始输出之间的误差err，使用L2范式
                # co, 1, n_group, 1
                err = (cur_out - org_out).pow(2).mean(dim=1).view(min_errs.shape)
                #删除当前权重和输出以释放内存
                del cur_w
                del cur_out
                cur_best_idx = err < min_errs
                min_errs[cur_best_idx] = err[cur_best_idx]
                #如果当前截断值的误差小于之前的最小误差，则更新小误差和对应的最佳截断值。
                best_max_val[cur_best_idx] = max_val[cur_best_idx]

            #将当前批次的最佳截断值添加到best_max_val_all列表中
            best_max_val_all.append(best_max_val)
        
        #将所有批次的最佳截断值拼接在一起
        best_max_val = torch.cat(best_max_val_all, dim=0)

        #清理内存以释放GPU资源
        clear_memory(input_feat)
        clear_memory(org_out)

        #返回最终的最佳截断值，将维度从[co,1,n_group,1]压缩为[co,n_group]
        return best_max_val.squeeze(1)
        #squeeze()方法用于删除数组中的单一维度。
        # 某些情况下，当我们创建一个数组时，
        # 可能会出现一些不必要的维度，这些维度对于我们的计算并没有实际价值，
        # 这时可使用squeeze()方法将这些单一维度去除。

    #校准数据的样本数量，设定为128
    #每个样本的最大序列长度，设定为512
    def init_quant(self, n_samples=128, seqlen=512):
        """Capture the inputs that reach layer 0 for the calibration batch.

        The calibration samples are run through the model with layer 0
        temporarily replaced by a wrapper that records its positional input
        and keyword arguments and then aborts the forward pass.

        Args:
            n_samples: Number of calibration samples requested from
                ``get_calib_dataset``.
            seqlen: Passed to ``get_calib_dataset`` as ``block_size``.

        Returns:
            A tuple ``(modules, layer_kwargs, inps)``: the layers returned by
            ``awq_model.get_model_layers``, the keyword arguments layer 0 was
            called with, and its positional input.

        Raises:
            RuntimeError: If the forward pass ended without ever reaching
                layer 0.
        """
        modules = self.awq_model.get_model_layers(self.model)

        samples = get_calib_dataset(
            data=self.calib_data, tokenizer=self.tokenizer, n_samples=n_samples, block_size=seqlen,
            split=self.split, text_column=self.text_column
        )
        # Concatenate all samples along dim 0 into one batch.
        samples = torch.cat(samples, dim=0)

        inps = []
        layer_kwargs = {}

        # Only layer 0 and the embeddings are needed on the GPU for this pass.
        modules[0] = modules[0].cuda()
        self.awq_model.move_embed(self.model, "cuda")

        # get input and kwargs to layer 0
        # with_kwargs is only supported in PyTorch 2.0
        # use this Catcher hack for now
        class Catcher(nn.Module):
            """Records what layer 0 is called with, then aborts the forward."""

            def __init__(self, module):
                super().__init__()
                self.module = module

            def forward(self, hijacked_inputs, **kwargs):
                inps.append(hijacked_inputs)
                layer_kwargs.update(kwargs)
                raise ValueError  # early exit to break later inference

        # patch layer 0 to catch input and kwargs
        modules[0] = Catcher(modules[0])
        try:
            self.model(samples.to(next(self.model.parameters()).device))
        except ValueError:  # work with early exit
            pass
        finally:
            # Always un-patch layer 0, even when the forward pass fails with
            # an unexpected exception.
            modules[0] = modules[0].module  # restore
        del samples

        if not inps:
            # A ValueError raised before layer 0 is swallowed above; report
            # that clearly instead of failing with a bare IndexError.
            raise RuntimeError(
                "Calibration forward pass finished without reaching layer 0; "
                "no inputs were captured."
            )
        inps = inps[0]

        # Move layer 0 and the embeddings back to the CPU.
        modules[0] = modules[0].cpu()
        self.awq_model.move_embed(self.model, "cpu")

        clear_memory()

        return modules, layer_kwargs, inps
    
    #获取所有线性层的输入特征。
    #layer：当前需要处理的模型层
    #named_linears：线性层的字典，包含线性层名称和层对象。
    def _get_input_feat(self, layer, named_linears):
        # firstly, get input features of all linear layers
        #钩子函数，用于在前向传播时缓存每个线性层的输入特征

        #m：对应的线性层模块
        #x：该层的输入，通常是一个元组，这里取第一个元素。
        #y：该层的输出（未使用）。
        #name：线性层的名称。
        #feat_dict：用于存储特征的字典
        def cache_input_hook(m, x, y, name, feat_dict):
            x = x[0]
            x = x.detach().cpu()
            feat_dict[name].append(x)

        #初始化input_feat为一个字典，用于存储每个线性层的输入特征。
        #handles用于存储钩子函数句柄，便于后续移除。
        input_feat = defaultdict(list)
        handles = []



        #遍历每个线性层named_linears
        #为每个创建前向传播钩子函数，钩子会在每次前向传播时调用cache_input_hook捕获输入特征
        #使用functools.partial绑定钩子的参数name和feat_dict
        for name in named_linears:
            handles.append(named_linears[name].register_forward_hook(
                functools.partial(cache_input_hook, name=name,
                                feat_dict=input_feat)))
        
        #捕获第0层输入特征self.inps移动到当前层的设备上，以支持多GPU运行。
        self.inps = self.inps.to(next(layer.parameters()).device)  # in case multi-gpu
        # get output as next layer's input
        #将输入传入当前层进行前向传播，计算结果作为下一层的输入。self.module_kwargs 是传递给该层的额外参数，前向传播的结果是一个元组，取第一个元素。
        self.inps = layer(self.inps, **self.module_kwargs)[0]

        #移除所有注册的钩子，防止影响后续推理。
        for h in handles:
            h.remove()
        
        #将字典input_feat中每个线性层的输入特征列表拼接成一个大张量。
        # now solve for scaling and clipping
        input_feat = {k: torch.cat(v, dim=0) for k, v in input_feat.items()}
        
        #返回拼接后的输入特征字典input_feat，每个键是线性层名称，对应的值时该层的输入特征张量。
        return input_feat