onnx模型和tensorrt模型的量化标定

最新推荐文章于 2025-10-22 23:03:50 发布

原创最新推荐文章于 2025-10-22 23:03:50 发布 · 210 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#onnx #tensorrt

deep learning 同时被 2 个专栏收录

148 篇文章

订阅专栏

model deployment

70 篇文章

订阅专栏

部署运行你感兴趣的模型镜像

onnx模型的量化标定（此处使用动态量化，无需标定数据集）

import os
import onnx
import numpy as np
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType

def dynamic_quantize_onnx_model(
    input_model_path,
    output_model_path=None,
    weight_type=QuantType.QUInt8,
    optimize_model=True,
    verbose=False
):
    """
    Apply INT8 dynamic quantization to an ONNX model.

    Args:
        input_model_path: Path of the ONNX model to quantize.
        output_model_path: Destination path of the quantized model. When
            ``None``, "_int8" is inserted before the extension of the input
            file name and the result is placed in the input's directory.
        weight_type: Quantization type used for the weights,
            ``QuantType.QUInt8`` (default) or ``QuantType.QInt8``.
        optimize_model: Accepted for backward compatibility only; it is not
            forwarded to ``quantize_dynamic`` and has no effect.
        verbose: Print progress messages when true.

    Returns:
        The path the quantized model was written to.

    Raises:
        ValueError: If the input model cannot be loaded or fails the ONNX
            checker. The underlying error is chained as ``__cause__``.
    """
    # Derive the default output path from the input path
    if output_model_path is None:
        dir_name, file_name = os.path.split(input_model_path)
        base_name, ext = os.path.splitext(file_name)
        output_model_path = os.path.join(dir_name, f"{base_name}_int8{ext}")

    # Validate the input model before spending time on quantization
    try:
        model = onnx.load(input_model_path)
        onnx.checker.check_model(model)
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback
        raise ValueError(f"输入模型无效: {str(e)}") from e
    else:
        if verbose:
            print(f"模型验证成功: {input_model_path}")

    if verbose:
        print("开始动态量化...")
        print(f"输入模型: {input_model_path}")
        print(f"输出模型: {output_model_path}")
        print(f"权重量化类型: {weight_type}")

    # Run dynamic quantization
    quantize_dynamic(
        model_input=input_model_path,
        model_output=output_model_path,
        weight_type=weight_type,
        per_channel=False,  # per-tensor quantization of the weights
        reduce_range=False  # keep the full 8-bit range (True would use 7 bits)
    )

    if verbose:
        print("动态量化完成!")

    return output_model_path

def compare_quantized_model(original_model_path, quantized_model_path, test_input=None):
    """
    Run the original and the quantized model on one input and compare outputs.

    Only the first input and the first output of the original model are used;
    the quantized model is queried with the same input/output names.

    Args:
        original_model_path: Path of the original model.
        quantized_model_path: Path of the quantized model.
        test_input: Input array fed to both models. When ``None``, a random
            float32 array is generated from the first input's declared shape,
            with every dynamic dimension (symbolic name, ``None`` or a
            non-positive value) replaced by 1.

    Returns:
        A dict with the keys ``max_diff``, ``mean_diff`` and ``std_diff``
        (statistics of the absolute output difference) plus
        ``original_range`` and ``quantized_range`` (``(min, max)`` tuples).
    """
    # Create the inference sessions (CPU only)
    providers = ['CPUExecutionProvider']
    sess_original = ort.InferenceSession(original_model_path, providers=providers)
    sess_quantized = ort.InferenceSession(quantized_model_path, providers=providers)

    # Look up input/output metadata
    input_meta = sess_original.get_inputs()[0]
    input_name = input_meta.name
    output_name = sess_original.get_outputs()[0].name

    # Generate a test input only when the caller did not supply one
    if test_input is None:
        # Dynamic axes are reported as strings or None; comparing those with
        # ``> 0`` raises TypeError, so only positive ints are kept as-is.
        input_shape = [
            dim if isinstance(dim, int) and dim > 0 else 1
            for dim in input_meta.shape
        ]
        test_input = np.random.rand(*input_shape).astype(np.float32)

    # Run both models on the same input
    output_original = sess_original.run([output_name], {input_name: test_input})[0]
    output_quantized = sess_quantized.run([output_name], {input_name: test_input})[0]

    # Element-wise absolute difference and summary statistics
    abs_diff = np.abs(output_original - output_quantized)
    stats = {
        'max_diff': np.max(abs_diff),
        'mean_diff': np.mean(abs_diff),
        'std_diff': np.std(abs_diff),
        'original_range': (np.min(output_original), np.max(output_original)),
        'quantized_range': (np.min(output_quantized), np.max(output_quantized))
    }

    # Report the result
    print("\n模型对比结果:")
    print(f"  最大差异: {stats['max_diff']:.6f}")
    print(f"  平均差异: {stats['mean_diff']:.6f}")
    print(f"  差异标准差: {stats['std_diff']:.6f}")

    return stats

# Example usage
if __name__ == "__main__":
    # Source FP32 model and the INT8 model to produce
    INPUT_MODEL = "yolov6n_fp32.onnx"  # replace with the path of your ONNX model
    OUTPUT_MODEL = "yolov6n_int8.onnx"

    # Quantize, then compare the outputs of both models
    quantized_model = dynamic_quantize_onnx_model(
        INPUT_MODEL,
        output_model_path=OUTPUT_MODEL,
        weight_type=QuantType.QUInt8,
        verbose=True,
    )
    compare_quantized_model(INPUT_MODEL, quantized_model)

    # Report file sizes in MB; the last figure is quantized size divided by
    # original size, so values below 1 mean the model shrank.
    bytes_per_mb = 1024 * 1024
    original_size = os.path.getsize(INPUT_MODEL) / bytes_per_mb
    quantized_size = os.path.getsize(quantized_model) / bytes_per_mb
    size_ratio = quantized_size / original_size
    print("\n模型大小变化:")
    print(f"  原始模型: {original_size:.2f} MB")
    print(f"  量化模型: {quantized_size:.2f} MB")
    print(f"  压缩比例: {size_ratio:.2f}x")

tensorrt模型的量化标定

calibrator.py

import os
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import logging
logger = logging.getLogger(__name__)


# calibrator
class Calibrator(trt.IInt8EntropyCalibrator2):
    """INT8 entropy calibrator that copies host batches from a stream to the GPU.

    The ``stream`` object must provide:
      * ``batch_size``: value returned by :meth:`get_batch_size`;
      * ``calibration_data``: array whose ``nbytes`` sizes the device buffer;
      * ``next_batch()``: returns the next batch array, or an array with
        ``size == 0`` once the data is exhausted.

    ``cache_file`` is the path of the calibration cache. When it is empty
    (the default) no cache is read or written.
    """

    def __init__(self, stream, cache_file=""):
        # The TensorRT base-class constructor is called explicitly.
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.stream = stream
        # One device buffer, sized for a full batch, reused for every batch.
        self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes)
        self.cache_file = cache_file

    def get_batch_size(self):
        """Return the batch size of the underlying stream."""
        return self.stream.batch_size

    def get_batch(self, names):
        """Upload the next batch and return its device pointer in a list.

        Returns ``None`` when the stream yields an empty batch, which ends
        calibration. ``names`` is not used.
        """
        batch = self.stream.next_batch()
        if not batch.size:
            return None
        cuda.memcpy_htod(self.d_input, batch)
        return [int(self.d_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if there is no cache."""
        if self.cache_file and os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                logger.info("Using calibration cache to save time: {:}".format(self.cache_file))
                return f.read()
        return None

    def write_calibration_cache(self, cache):
        """Persist ``cache`` to ``cache_file``; do nothing when no path is set."""
        if not self.cache_file:
            # open("") would raise at the very end of calibration; an unset
            # path simply means the caller does not want a cache file.
            return
        with open(self.cache_file, "wb") as f:
            logger.info("Caching calibration data for future use: {:}".format(self.cache_file))
            f.write(cache)

main.py

import glob, os, cv2
import numpy as np
import tensorrt as trt
from calibrator import Calibrator


# Target height/width in pixels: preprocess() letterboxes images to this size
# and DataLoader allocates its batch buffer as (batch, 3, height, width).
height = 640
width = 640
# Directory searched by DataLoader for "*.jpg" calibration images
image_path = 'images'
# ONNX model passed to get_engine() as the file to parse
model_path = "./yolov6n_fp32.onnx"
# File the serialized engine is written to
engine_model_path = "yolov6n_int8.engine"
# Calibration cache file handed to the Calibrator
calibration_table = 'yolov6n_int8_calibration.cache'

# Logger shared by the TensorRT builder, parser and runtime (WARNING severity)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 


def preprocess(image):
    """Letterbox an HWC image to ``width`` x ``height`` and return CHW float32.

    Steps: resize with the aspect ratio preserved, pad the remainder with
    gray (128, 128, 128) so the result is exactly the module-level
    ``width`` x ``height``, scale values by 1/255, reverse the channel axis
    (BGR -> RGB if the input came from ``cv2.imread``), and move channels
    first. No mean/std normalization is applied.

    Args:
        image: Array of shape (H, W, C).

    Returns:
        A float32 array of shape (C, height, width).
    """
    h, w = image.shape[:2]
    r_w = width / w
    r_h = height / h
    if r_h > r_w:
        # Width is the limiting side: fill it and pad top/bottom.
        tw = width
        th = int(r_w * h)
        tx1 = tx2 = 0
        ty1 = int((height - th) / 2)
        ty2 = height - th - ty1
    else:
        # Height is the limiting side: fill it and pad left/right.
        tw = int(r_h * w)
        th = height
        tx1 = int((width - tw) / 2)
        tx2 = width - tw - tx1
        ty1 = ty2 = 0
    image = cv2.resize(image, (tw, th))
    # The colour must be passed by keyword: the 7th positional parameter of
    # cv2.copyMakeBorder is ``dst``, not ``value``.
    image = cv2.copyMakeBorder(
        image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
    )
    image = image / 255.0
    image = image[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)
    return image


class DataLoader:
    """Serves fixed-size batches of preprocessed calibration images.

    Images are the "*.jpg" files found in the module-level ``image_path``
    directory, taken in sorted order so repeated runs see the same data.

    Args:
        batch_size: Number of images per batch (default 16).
        length: Number of batches to serve (default 8).

    Raises:
        ValueError: If fewer than ``batch_size * length`` images are found.
    """

    def __init__(self, batch_size=16, length=8):
        self.index = 0  # number of batches already served
        self.length = length
        self.batch_size = batch_size
        self.img_list = sorted(glob.glob(os.path.join(image_path, "*.jpg")))
        required = self.batch_size * self.length
        if len(self.img_list) < required:
            raise ValueError(
                f"need at least {required} calibration images in "
                f"{image_path!r}, found {len(self.img_list)}"
            )
        # Host buffer reused for every batch
        self.calibration_data = np.zeros((self.batch_size, 3, height, width), dtype=np.float32)

    def next_batch(self):
        """Return the next batch as a contiguous float32 array.

        Returns an empty array once ``length`` batches have been served.

        Raises:
            OSError: If an image is missing or cannot be decoded.
        """
        if self.index >= self.length:
            return np.array([])

        start = self.index * self.batch_size
        batch_paths = self.img_list[start:start + self.batch_size]
        for slot, path in enumerate(batch_paths):
            img = cv2.imread(path)
            if img is None:
                # cv2.imread signals failure by returning None.
                raise OSError(f"calibration image missing or unreadable: {path}")
            self.calibration_data[slot] = preprocess(img)
        self.index += 1
        return np.ascontiguousarray(self.calibration_data, dtype=np.float32)

    def __len__(self):
        """Return the number of batches this loader serves."""
        return self.length


def get_engine(onnx_file_path="", engine_file_path="", calibration_stream=None, calibration_table_path=""):
    """Build an INT8 TensorRT engine from an ONNX file and write it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        engine_file_path: Path the serialized engine is written to.
        calibration_stream: Batch source handed to ``Calibrator``; must be
            truthy.
        calibration_table_path: Calibration cache path handed to
            ``Calibrator``.

    Raises:
        SystemExit: If ``onnx_file_path`` does not exist.
        ValueError: If no calibration stream is provided.
        RuntimeError: If the ONNX model cannot be parsed or the engine
            cannot be built or deserialized.
    """
    # ``1`` is the network-creation flag bitmask used by the original code
    # (presumably the explicit-batch bit; confirm for the TensorRT version in use).
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        if not os.path.exists(onnx_file_path):
            # Same effect as quit(msg), without depending on the site module.
            raise SystemExit('ONNX file {} not found'.format(onnx_file_path))

        with open(onnx_file_path, 'rb') as model:
            parsed = parser.parse(model.read())
        if not parsed or network.num_layers <= 0:
            # Surface the parser's own diagnostics instead of a bare failure.
            details = '; '.join(str(parser.get_error(i)) for i in range(parser.num_errors))
            raise RuntimeError(
                'Failed to parse ONNX model. Please check if the ONNX model is compatible ' + details
            )

        config = builder.create_builder_config()
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32)  # 4 GiB
        config.set_flag(trt.BuilderFlag.INT8)

        if not calibration_stream:
            raise ValueError('Error: a calibration_stream should be provided for int8 mode')
        # Hold a local reference so the calibrator outlives the build call.
        calibrator = Calibrator(calibration_stream, calibration_table_path)
        config.int8_calibrator = calibrator

        plan = builder.build_serialized_network(network, config)
        if plan is None:
            raise RuntimeError('Failed to create the engine')

        # Deserialize once to confirm the plan is loadable before saving it.
        runtime = trt.Runtime(TRT_LOGGER)
        engine = runtime.deserialize_cuda_engine(plan)
        if engine is None:
            raise RuntimeError('Failed to create the engine')

        with open(engine_file_path, "wb") as f:
            f.write(engine.serialize())


if __name__ == '__main__':
    # Build the INT8 engine, calibrating on the batches served by DataLoader.
    calibration_loader = DataLoader()
    get_engine(
        onnx_file_path=model_path,
        engine_file_path=engine_model_path,
        calibration_stream=calibration_loader,
        calibration_table_path=calibration_table,
    )