import tensorrt as trt
import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit
# 自定义INT8校准器类
class MyCalibrator(trt.IInt8EntropyCalibrator2):
def __init__(self, batch_size=1, input_shape=(1, 3, 224, 224), cache_file="calibration.cache"):
trt.IInt8EntropyCalibrator2.__init__(self)
self.batch_size = batch_size
self.input_shape = input_shape
self.cache_file = cache_file
# 生成随机校准数据(实际应用中应使用真实数据集)
self.data = np.random.randn(100, *input_shape).astype(np.float32)
self.current_index = 0
# 分配设备内存
self.device_input = cuda.mem_alloc(self.data.nbytes * self.batch_size)
def get_batch_size(self):
return self.batch_size
def get_batch(self, names):
if self.current_index + self.batch_size > len(self.data):
return None
batch = self.data[self.current_index:self.current_index+self.batch_size]
self.current_index += self.batch_size
# 将数据复制到设备
cuda.memcpy_htod(self.device_input, batch.tobytes())
return [int(self.device_input)]
def read_calibration_cache(self):
# 如果存在缓存,跳校准
try:
with open(self.cache_file, "rb") as f:
return f.read()
except FileNotFoundError:
return None
def write_calibration_cache(self, cache):
with open(self.cache_file, "wb") as f:
f.write(cache)
# 创建简单网络(使用 add_pooling_nd 替代 add_pooling)
def create_network():
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
# 定义输入
input_tensor = network.add_input("input", trt.DataType.FLOAT, (1, 3, 224, 224))
# 添加卷积层(模拟ResNet-18第一层)
conv1 = network.add_convolution_nd(
input=input_tensor,
num_output_maps=64,
kernel_shape=(7, 7),
kernel=np.random.randn(64, 3, 7, 7).astype(np.float32).ravel(),
bias=np.random.randn(64).astype(np.float32)
)
# 设置步长和填充(直接赋值属性)
conv1.stride_nd = [2, 2]
conv1.padding_nd = [3, 3]
# 添加池化层(使用 add_pooling_nd)
window_shape = trt.Dims([3, 3]) # 修改:使用 trt.Dims
pool1 = network.add_pooling_nd(
input=conv1.get_output(0),
type=trt.PoolingType.MAX,
window_size=window_shape
)
pool1.stride_nd = [2, 2] # 设置池化步长
# 展平输出为二维张量(N, C*H*W)
flatten = network.add_shuffle(pool1.get_output(0))
output_shape = (1, 64 * 56 * 56) # 假设输入是 (1, 64, 56, 56)
flatten.reshape_dims = tuple(output_shape)
# 定义权重和偏置
kernel = np.random.randn(1000, 64 * 56 * 56).astype(np.float32).ravel()
bias = np.random.randn(1000).astype(np.float32)
# 添加常量张量作为权重和偏置
w = network.add_constant((1000, 64 * 56 * 56), kernel)
b = network.add_constant((1000,), bias)
# 矩阵乘法:[input] @ [weights.T] + [bias]
matmul = network.add_matrix_multiply(flatten.get_output(0), trt.MatrixOperation.NONE,
w.get_output(0), trt.MatrixOperation.TRANSPOSE)
bias_add = network.add_elementwise(matmul.get_output(0), b.get_output(0), trt.ElementWiseOperation.SUM)
# 标记输出
bias_add.get_output(0).name = "output"
network.mark_output(bias_add.get_output(0))
return builder, network, logger
# 主测试函数
def int8_perf_test():
# 创建网络
builder, network, logger = create_network()
# 配置INT8量化
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.INT8)
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30) # 1GB工作空间
# 创建校准器并设置给配置
calib = MyCalibrator(batch_size=1, input_shape=(1, 3, 224, 224), cache_file="calibration.cache")
config.int8_calibrator = calib # ✅ 正确方式:直接赋值属性
# 构建引擎(新API)
engine = builder.build_engine(network, config)
if not engine:
print("Engine构建失败!")
return
# 创建执行上下文
context = engine.create_execution_context()
# 准备输入输出缓冲区
input_shape = (1, 3, 224, 224)
output_shape = (1, 1000)
host_input = np.random.randn(*input_shape).astype(np.float32)
host_output = np.empty(output_shape, dtype=np.float32)
# 分配设备内存
d_input = cuda.mem_alloc(host_input.nbytes)
d_output = cuda.mem_alloc(host_output.nbytes)
bindings = [int(d_input), int(d_output)]
# 创建CUDA流
stream = cuda.Stream()
# 预热
for _ in range(5):
cuda.memcpy_htod_async(d_input, host_input, stream)
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
cuda.memcpy_dtoh_async(host_output, d_output, stream)
stream.synchronize()
# 正式测试
start = time.time()
for _ in range(100):
cuda.memcpy_htod_async(d_input, host_input, stream)
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
cuda.memcpy_dtoh_async(host_output, d_output, stream)
stream.synchronize()
total_time = time.time() - start
# 计算INT8 TOPS(基于ResNet-18的3.9G MACs)
# 注意:INT8操作数 = MACs × 2(乘加各算一次操作)
ops = 3.9e9 * 2 # ResNet-18的INT8操作数
tops = (ops * 100) / (total_time * 1e12) # 100次推理
print(f"INT8 TOPS: {tops:.2f} TOPS")
# 清理资源
del context
del engine
if __name__ == "__main__":
int8_perf_test()
报错
jp@jp-Super-Server:~/test$ python3 TensorRT_int8.py
/home/jp/test/TensorRT_int8.py:113: DeprecationWarning: Use Deprecated in TensorRT 10.1. Superseded by explicit quantization. instead.
config.int8_calibrator = calib # ✅ 正确方式:直接赋值属性
Traceback (most recent call last):
File "/home/jp/test/TensorRT_int8.py", line 166, in <module>
int8_perf_test()
File "/home/jp/test/TensorRT_int8.py", line 116, in int8_perf_test
engine = builder.build_engine(network, config)
AttributeError: 'tensorrt_bindings.tensorrt.Builder' object has no attribute 'build_engine'
给出修改后的完整代码