Quantization calibration for ONNX models

The script below applies INT8 dynamic quantization to an ONNX model with onnxruntime, then compares the outputs and file sizes before and after quantization.
import os
import onnx
import numpy as np
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType


def dynamic_quantize_onnx_model(
    input_model_path,
    output_model_path=None,
    weight_type=QuantType.QUInt8,
    optimize_model=True,
    verbose=False
):
    """
    Apply INT8 dynamic quantization to an ONNX model.

    Args:
        input_model_path: path to the input ONNX model
        output_model_path: path for the quantized model; defaults to the input path with an "_int8" suffix
        weight_type: weight quantization type, QUInt8 (default) or QInt8
        optimize_model: whether to optimize the model before quantization
                        (kept for compatibility; newer onnxruntime versions no longer accept this argument)
        verbose: whether to print detailed logs

    Returns:
        Path where the quantized model was saved.
    """
    # Build the default output path
    if output_model_path is None:
        dir_name, file_name = os.path.split(input_model_path)
        base_name, ext = os.path.splitext(file_name)
        output_model_path = os.path.join(dir_name, f"{base_name}_int8{ext}")

    # Validate the input model
    try:
        model = onnx.load(input_model_path)
        onnx.checker.check_model(model)
        if verbose:
            print(f"Model check passed: {input_model_path}")
    except Exception as e:
        raise ValueError(f"Invalid input model: {str(e)}")

    # Run dynamic quantization
    if verbose:
        print("Starting dynamic quantization...")
        print(f"Input model: {input_model_path}")
        print(f"Output model: {output_model_path}")
        print(f"Weight quantization type: {weight_type}")
    quantize_dynamic(
        model_input=input_model_path,
        model_output=output_model_path,
        weight_type=weight_type,
        # activation_type=QuantType.QInt8,  # dynamic quantization usually uses QInt8 for activations
        # optimize_model=optimize_model,    # not forwarded: newer onnxruntime versions removed this argument
        per_channel=False,   # dynamic quantization usually does not use per-channel quantization
        reduce_range=False   # whether to quantize weights to a reduced 7-bit range
    )

    if verbose:
        print("Dynamic quantization finished!")
    return output_model_path
def compare_quantized_model(original_model_path, quantized_model_path, test_input=None):
    """
    Compare the outputs of the original model and the quantized model.

    Args:
        original_model_path: path to the original model
        quantized_model_path: path to the quantized model
        test_input: test input data; a random input is generated if None

    Returns:
        Statistics describing the output differences.
    """
    # Create inference sessions
    sess_original = ort.InferenceSession(original_model_path, providers=['CPUExecutionProvider'])
    sess_quantized = ort.InferenceSession(quantized_model_path, providers=['CPUExecutionProvider'])

    # Get input/output info; dynamic dimensions (None or symbolic names) fall back to 1
    input_name = sess_original.get_inputs()[0].name
    output_name = sess_original.get_outputs()[0].name
    input_shape = [dim if isinstance(dim, int) and dim > 0 else 1
                   for dim in sess_original.get_inputs()[0].shape]

    # Generate a test input
    if test_input is None:
        test_input = np.random.rand(*input_shape).astype(np.float32)

    # Run inference
    output_original = sess_original.run([output_name], {input_name: test_input})[0]
    output_quantized = sess_quantized.run([output_name], {input_name: test_input})[0]

    # Compute the differences
    abs_diff = np.abs(output_original - output_quantized)
    stats = {
        'max_diff': np.max(abs_diff),
        'mean_diff': np.mean(abs_diff),
        'std_diff': np.std(abs_diff),
        'original_range': (np.min(output_original), np.max(output_original)),
        'quantized_range': (np.min(output_quantized), np.max(output_quantized))
    }

    # Print the results
    print("\nModel comparison:")
    print(f"  Max difference: {stats['max_diff']:.6f}")
    print(f"  Mean difference: {stats['mean_diff']:.6f}")
    print(f"  Std of difference: {stats['std_diff']:.6f}")
    return stats
# Usage example
if __name__ == "__main__":
    # Configuration
    INPUT_MODEL = "yolov6n_fp32.onnx"   # replace with your own ONNX model path
    OUTPUT_MODEL = "yolov6n_int8.onnx"

    # Run dynamic quantization
    quantized_model = dynamic_quantize_onnx_model(
        input_model_path=INPUT_MODEL,
        output_model_path=OUTPUT_MODEL,
        weight_type=QuantType.QUInt8,
        verbose=True
    )

    # Compare the models before and after quantization
    compare_quantized_model(INPUT_MODEL, quantized_model)

    # Report the change in model size
    original_size = os.path.getsize(INPUT_MODEL) / (1024 * 1024)       # MB
    quantized_size = os.path.getsize(quantized_model) / (1024 * 1024)  # MB
    print("\nModel size change:")
    print(f"  Original model: {original_size:.2f} MB")
    print(f"  Quantized model: {quantized_size:.2f} MB")
    print(f"  Compression ratio: {original_size/quantized_size:.2f}x")
Quantization calibration for TensorRT models

For TensorRT, INT8 calibration is driven by a calibrator class (calibrator.py) that implements IInt8EntropyCalibrator2, while main.py builds an INT8 engine from the same YOLOv6 ONNX model using a folder of calibration images.
calibrator.py
import os
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import logging

logger = logging.getLogger(__name__)


# INT8 entropy calibrator: feeds batches from a data stream to TensorRT during calibration
class Calibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, stream, cache_file=""):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.stream = stream
        # Device buffer large enough to hold one calibration batch
        self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes)
        self.cache_file = cache_file

    def get_batch_size(self):
        return self.stream.batch_size

    def get_batch(self, names):
        batch = self.stream.next_batch()
        if not batch.size:
            return None  # empty batch signals that calibration is finished
        # Copy the batch host -> device and hand the device pointer to TensorRT
        cuda.memcpy_htod(self.d_input, batch)
        return [int(self.d_input)]

    def read_calibration_cache(self):
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                logger.info("Using calibration cache to save time: {:}".format(self.cache_file))
                return f.read()

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            logger.info("Caching calibration data for future use: {:}".format(self.cache_file))
            f.write(cache)
main.py
import glob, os, cv2
import numpy as np
import tensorrt as trt
from calibrator import Calibrator
height = 640
width = 640
image_path = 'images'
model_path = "./yolov6n_fp32.onnx"
engine_model_path = "yolov6n_int8.engine"
calibration_table = 'yolov6n_int8_calibration.cache'
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def preprocess(image):
    # Letterbox resize: keep aspect ratio and pad the shorter side with gray (128) borders
    h, w, c = image.shape
    r_w = width / w
    r_h = height / h
    if r_h > r_w:
        tw = width
        th = int(r_w * h)
        tx1 = tx2 = 0
        ty1 = int((height - th) / 2)
        ty2 = height - th - ty1
    else:
        tw = int(r_h * w)
        th = height
        tx1 = int((width - tw) / 2)
        tx2 = width - tw - tx1
        ty1 = ty2 = 0
    image = cv2.resize(image, (tw, th))
    image = cv2.copyMakeBorder(image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128))
    image = image / 255.0
    #image = image - np.array([0.406, 0.456, 0.485])
    #image = image / np.array([0.225, 0.224, 0.229])
    # BGR -> RGB, HWC -> CHW
    image = image[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)
    return image
class DataLoader:
    def __init__(self):
        self.index = 0
        self.length = 8       # number of calibration batches
        self.batch_size = 16
        self.img_list = glob.glob(os.path.join(image_path, "*.jpg"))
        assert len(self.img_list) >= self.batch_size * self.length, \
            'not enough calibration images in {}'.format(image_path)
        self.calibration_data = np.zeros((self.batch_size, 3, height, width), dtype=np.float32)

    def next_batch(self):
        if self.index < self.length:
            for i in range(self.batch_size):
                assert os.path.exists(self.img_list[i + self.index * self.batch_size]), 'image not found!'
                img = cv2.imread(self.img_list[i + self.index * self.batch_size])
                img = preprocess(img)
                self.calibration_data[i] = img
            self.index += 1
            return np.ascontiguousarray(self.calibration_data, dtype=np.float32)
        else:
            # An empty array tells the calibrator that calibration is finished
            return np.array([])

    def __len__(self):
        return self.length
def get_engine(onnx_file_path="", engine_file_path="", calibration_stream=None, calibration_table_path=""):
    # The ONNX parser requires an explicit-batch network
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(network_flags) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        if not os.path.exists(onnx_file_path):
            quit('ONNX file {} not found'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:
            if not parser.parse(model.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
        assert network.num_layers > 0, 'Failed to parse ONNX model. Please check if the ONNX model is compatible.'

        config = builder.create_builder_config()
        # 4 GB workspace (replaces the deprecated config.max_workspace_size)
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 32)
        config.set_flag(trt.BuilderFlag.INT8)
        assert calibration_stream, 'Error: a calibration_stream should be provided for int8 mode'
        config.int8_calibrator = Calibrator(calibration_stream, calibration_table_path)

        # Building the serialized network runs INT8 calibration with the calibrator above
        plan = builder.build_serialized_network(network, config)
        if plan is None:
            print('Failed to create the engine')
            return None
        with open(engine_file_path, "wb") as f:
            f.write(plan)

        # Optionally deserialize to verify the engine is valid
        runtime = trt.Runtime(TRT_LOGGER)
        engine = runtime.deserialize_cuda_engine(plan)
        return engine
if __name__ == '__main__':
    get_engine(model_path, engine_model_path, calibration_stream=DataLoader(), calibration_table_path=calibration_table)
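Once main.py finishes, you should have yolov6n_int8.engine and the calibration cache on disk. A minimal sanity check, assuming TensorRT 8.5 or newer (the tensor-based I/O API), is to deserialize the engine and list its input and output tensors before running real inference; this is only a sketch, not part of the original script.

import tensorrt as trt

# Load the serialized INT8 engine built by main.py and inspect its I/O tensors.
logger = trt.Logger(trt.Logger.WARNING)
with open("yolov6n_int8.engine", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

assert engine is not None, "engine failed to deserialize"
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    print(name, engine.get_tensor_mode(name), engine.get_tensor_shape(name), engine.get_tensor_dtype(name))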