I've recently been working on upgrading TensorRT, so here are a few small takeaways from the process.
Everything below is illustrated with the demo code officially provided by TensorRT.
1. Engine Build
1.1 Replacing Fully Connected Layers with Convolutions
The TensorRT 10.9 version (excerpt):
def attention_layer_opt(prefix, config, init_dict, network, input_tensor, mask_idx, cu_seqlens, max_seqlen):
    """
    Add the attention layer
    """
    hidden_size = config.hidden_size
    num_heads = config.num_attention_heads
    head_size = int(hidden_size / num_heads)

    Wall = init_dict[prefix + WQKV]
    Ball = init_dict[prefix + BQKV]

    # FC_attention
    mult_all = network.add_convolution_nd(input_tensor, 3 * hidden_size, (1, 1), Wall, Ball)

    if config.use_qat:
        dr_qkv = max(
            init_dict[prefix + 'self_qv_a_input_quantizer_amax'],
            init_dict[prefix + 'self_qv_b_input_quantizer_amax'],
            init_dict[prefix + 'self_av_b_input_quantizer_amax'],
        )
        set_output_range(mult_all, dr_qkv)
    set_output_name(mult_all, prefix, "qkv_mult")

    # QKV2CTX
    dtype = config.get_trt_dtype()

    pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32)
    pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
    pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32)
    pf_has_mask = trt.PluginField("has_mask", np.array([1], np.int32), trt.PluginFieldType.INT32)
    pf_var_seqlen = trt.PluginField("var_seqlen", np.array([int(1)], np.int32), trt.PluginFieldType.FLOAT32)

    if config.use_qat:
        dr_probs = init_dict[prefix + 'self_av_a_input_quantizer_amax']
        dq_probs = dr_probs / 127.0
        pf_dq_probs = trt.PluginField("dq_probs", np.array([dq_probs], np.float32), trt.PluginFieldType.FLOAT32)
        fields = [pf_hidden_size, pf_num_heads, pf_dq_probs]
    else:
        fields = [pf_hidden_size, pf_num_heads]

    if config.use_int8 and config.interleaved:
        pfc = trt.PluginFieldCollection(fields)
        qkv2ctx_plug = create_plugin(
            "qkv_to_context_interleaved",
            plg_registry,
            pfc,
            use_deprecated_plugins=config.use_deprecated_plugins,
        )
        qkv_in = [mult_all.get_output(0), cu_seqlens, max_seqlen]
    else:
        fields.append(pf_has_mask)
        fields.append(pf_type)
        fields.append(pf_var_seqlen)
        pfc = trt.PluginFieldCollection(fields)
        qkv2ctx_plug = create_plugin(
            "qkv_to_context_varseqlen",
            plg_registry,
            pfc,
            use_deprecated_plugins=config.use_deprecated_plugins,
        )
        qkv_in = [mult_all.get_output(0), mask_idx, cu_seqlens, max_seqlen]

    qkv2ctx = add_plugin_to_network(
        network, qkv2ctx_plug, qkv_in, use_deprecated_plugins=config.use_deprecated_plugins
    )
    qkv2ctx.name = prefix + "qkv_to_ctx"

    if config.use_qat:
        dr_ctx = init_dict[prefix + 'output_dense_input_amax']
        set_output_range(qkv2ctx, dr_ctx)
    set_output_name(qkv2ctx, prefix, "context_layer")
    return qkv2ctx
The TensorRT 8.4 version (excerpt):
def attention_layer_opt(prefix, config, init_dict, network, input_tensor, mask_idx, cu_seqlens, max_seqlen):
    """
    Add the attention layer
    """
    hidden_size = config.hidden_size
    num_heads = config.num_attention_heads
    head_size = int(hidden_size / num_heads)

    Wall = init_dict[prefix + WQKV]
    Ball = init_dict[prefix + BQKV]

    # FC_attention
    if config.use_int8:
        mult_all = network.add_convolution_nd(input_tensor, 3 * hidden_size, (1, 1), Wall, Ball)
    else:
        mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall, Ball)

    if config.use_qat:
        dr_qkv = max(
            init_dict[prefix + 'self_qv_a_input_quantizer_amax'],
            init_dict[prefix + 'self_qv_b_input_quantizer_amax'],
            init_dict[prefix + 'self_av_b_input_quantizer_amax'],
        )
        set_output_range(mult_all, dr_qkv)
    set_output_name(mult_all, prefix, "qkv_mult")

    # QKV2CTX
    dtype = config.get_trt_dtype()

    pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32)
    pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
    pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32)
    pf_has_mask = trt.PluginField("has_mask", np.array([1], np.int32), trt.PluginFieldType.INT32)
    pf_var_seqlen = trt.PluginField("var_seqlen", np.array([int(1)], np.int32), trt.PluginFieldType.FLOAT32)

    if config.use_qat:
        dr_probs = init_dict[prefix + 'self_av_a_input_quantizer_amax']
        dq_probs = dr_probs / 127.0
        pf_dq_probs = trt.PluginField("dq_probs", np.array([dq_probs], np.float32), trt.PluginFieldType.FLOAT32)
        fields = [pf_hidden_size, pf_num_heads, pf_dq_probs]
    else:
        fields = [pf_hidden_size, pf_num_heads]

    if config.use_int8 and config.interleaved:
        pfc = trt.PluginFieldCollection(fields)
        qkv2ctx_plug = mha_plg_creator3.create_plugin("qkv2ctx", pfc)
        qkv_in = [mult_all.get_output(0), cu_seqlens, max_seqlen]
    else:
        fields.append(pf_has_mask)
        fields.append(pf_type)
        fields.append(pf_var_seqlen)
        pfc = trt.PluginFieldCollection(fields)
        qkv2ctx_plug = mha_plg_creator2.create_plugin("qkv2ctx", pfc)
        qkv_in = [mult_all.get_output(0), mask_idx, cu_seqlens, max_seqlen]

    qkv2ctx = network.add_plugin_v2(qkv_in, qkv2ctx_plug)
    qkv2ctx.name = prefix + 'qkv_to_ctx'

    if config.use_qat:
        dr_ctx = init_dict[prefix + 'output_dense_input_amax']
        set_output_range(qkv2ctx, dr_ctx)
    set_output_name(qkv2ctx, prefix, "context_layer")
    return qkv2ctx
One major change is that add_fully_connected is no longer available; it is replaced by a convolution with a 1×1 kernel.

Take the first convolution in the code above (the multi-head attention input has to go through three fully connected transforms to produce Q, K and V). Its input shape is [SxB, E, 1, 1] (i.e. [-1, hidden_size, 1, 1]) and its output shape is [-1, 3 * hidden_size, 1, 1].

Pull a single vector out of that input: it is a vector $x$ of length hidden_size. It passes through three fully connected transforms whose results are concatenated, i.e.

$$q = W_q x + b_q, \quad k = W_k x + b_k, \quad v = W_v x + b_v,$$

which can be folded into a single transform

$$\begin{bmatrix} q \\ k \\ v \end{bmatrix} = W_{qkv}\, x + b_{qkv}, \qquad W_{qkv} = \begin{bmatrix} W_q \\ W_k \\ W_v \end{bmatrix},$$

where $W_{qkv}$ is a weight matrix of shape [3 * hidden_size, hidden_size]. Expressed as a 1×1 convolution, this makes better use of the GPU's parallelism than the old fully connected layer, and Tensor Cores accelerate the matrix multiply through FMA (fused multiply-add) operations, each of which typically completes in a single cycle.
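If you want to convince yourself of the equivalence, it is easy to check offline with plain numpy. A minimal sketch (toy sizes and random data; nothing here comes from the demo itself):

import numpy as np

hidden_size, batch = 4, 2
x = np.random.randn(batch, hidden_size, 1, 1).astype(np.float32)       # [N, E, 1, 1]
W = np.random.randn(3 * hidden_size, hidden_size).astype(np.float32)   # W_qkv
b = np.random.randn(3 * hidden_size).astype(np.float32)                # b_qkv

# fully connected: y = x @ W^T + b, row by row
fc = x.reshape(batch, hidden_size) @ W.T + b                           # [N, 3E]

# 1x1 convolution: the kernel is just W viewed as [3E, E, 1, 1]
conv = np.einsum('nchw,oc->nohw', x, W) + b.reshape(1, -1, 1, 1)       # [N, 3E, 1, 1]

print(np.allclose(fc, conv.reshape(batch, -1), atol=1e-5))             # True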
To make this easier to follow, below I unpack the add_fully_connected operation into an explicit matrix multiply plus bias add (the FMA view):
# cast the (possibly fp16) input to fp32 through an identity layer
em_32_layer = network.add_identity(input_tensor)
em_32_layer.set_output_type(0, trt.float32)
input_32_tensor = em_32_layer.get_output(0)

# flatten [N, C, 1, 1] -> [N, C]
input_32_tensor_shape = input_32_tensor.shape
N, C, H, W = input_32_tensor_shape
reshaped_input = network.add_shuffle(input_32_tensor)
reshaped_input.reshape_dims = (N, C)

# weights as a [3*hidden_size, C] constant, bias as a broadcastable [1, 3*hidden_size, 1, 1] constant
reshaped_W_mid = network.add_constant((3 * hidden_size, C), Wall)
reshaped_B_mid = network.add_constant((1, 3 * hidden_size, 1, 1), Ball)

# y = x @ W^T, then restore [N, 3*hidden_size, 1, 1] and add the bias
matmul_layer = network.add_matrix_multiply(
    reshaped_input.get_output(0), trt.MatrixOperation.NONE,
    reshaped_W_mid.get_output(0), trt.MatrixOperation.TRANSPOSE)
matmul_output = matmul_layer.get_output(0)
reshaped_mm_output = network.add_shuffle(matmul_output)
reshaped_mm_output.reshape_dims = (N, 3 * hidden_size, H, W)
mult_all = network.add_elementwise(
    reshaped_mm_output.get_output(0), reshaped_B_mid.get_output(0),
    trt.ElementWiseOperation.SUM)

output_tensor = mult_all.get_output(0)
set_output_name(mult_all, prefix, "qkv_mult")
You may be wondering why I add identity layers just to cast data types; that is explained in the next subsection.
1.2 float16 Quantization
Let's start with a demo snippet:
def add_gelu(network, input_tensor):
    """
    Adds elementwise GELU, and will trigger FC+GELU fusion in TRT
    """
    shape = (1, ) * len(input_tensor.shape)
    POW = network.add_constant(shape, trt.Weights(np.ascontiguousarray([3.0], dtype=np.float32)))
    MULTIPLY = network.add_constant(shape, trt.Weights(np.ascontiguousarray([0.044715], dtype=np.float32)))
    SQRT = network.add_constant(shape, trt.Weights((np.ascontiguousarray([0.79788456080286535587989211986876], dtype=np.float32))))
    ONE = network.add_constant(shape, trt.Weights((np.ascontiguousarray([1.0], dtype=np.float32))))
    HALF = network.add_constant(shape, trt.Weights((np.ascontiguousarray([0.5], dtype=np.float32))))
    X_pow = network.add_elementwise(input_tensor, POW.get_output(0), trt.ElementWiseOperation.POW)
    X_pow_t = X_pow.get_output(0)
    X_mul = network.add_elementwise(X_pow_t, MULTIPLY.get_output(0), trt.ElementWiseOperation.PROD)
    X_add = network.add_elementwise(input_tensor, X_mul.get_output(0), trt.ElementWiseOperation.SUM)
    X_sqrt = network.add_elementwise(X_add.get_output(0), SQRT.get_output(0), trt.ElementWiseOperation.PROD)
    X_sqrt_tensor = X_sqrt.get_output(0)
    X_tanh = network.add_activation(X_sqrt_tensor, trt.ActivationType.TANH)
    X_tanh_tensor = X_tanh.get_output(0)
    X_one = network.add_elementwise(X_tanh_tensor, ONE.get_output(0), trt.ElementWiseOperation.SUM)
    CDF = network.add_elementwise(X_one.get_output(0), HALF.get_output(0), trt.ElementWiseOperation.PROD)
    gelu_layer = network.add_elementwise(CDF.get_output(0), input_tensor, trt.ElementWiseOperation.PROD)

    # enable elementwise fusing for int8 && fp16
    POW.precision = trt.DataType.FLOAT
    MULTIPLY.precision = trt.DataType.FLOAT
    SQRT.precision = trt.DataType.FLOAT
    ONE.precision = trt.DataType.FLOAT
    HALF.precision = trt.DataType.FLOAT
    X_pow.precision = trt.DataType.FLOAT
    X_mul.precision = trt.DataType.FLOAT
    X_add.precision = trt.DataType.FLOAT
    X_sqrt.precision = trt.DataType.FLOAT
    X_tanh.precision = trt.DataType.FLOAT
    X_one.precision = trt.DataType.FLOAT
    CDF.precision = trt.DataType.FLOAT
    gelu_layer.precision = trt.DataType.FLOAT
    return gelu_layer
Here input_tensor is HALF (float16), while the constant layers such as SQRT are FLOAT (float32). The add_elementwise layers do accept mixed-precision inputs, so this works: TensorRT converts the precision internally, but it still prints warnings. They are harmless, yet an explicit cast of the input makes them go away, and a clean build log simply looks nicer. (This is also why the snippet in 1.1 starts with an add_identity cast to float32.) The explicit cast looks like this:
mid_dense = network.add_convolution_nd(attention_ln, config.intermediate_size, (1, 1), W_mid, B_mid)
mid_dense_out = mid_dense.get_output(0)

# explicit cast: route the tensor through an identity layer whose output type is float32
cast_layer = network.add_identity(mid_dense_out)
cast_layer.set_output_type(0, trt.float32)
mid_dense_out = cast_layer.get_output(0)
1.3 Pitfalls I Ran Into
While building the engine I hit a case where the outputs jumped around and the mismatch ratio against the reference was far too high (the TensorRT outputs deviated noticeably from the PyTorch/TensorFlow outputs).

So I debugged it layer by layer: give each output tensor a name, mark it as a network output, and inspect the per-layer outputs one at a time. They can then be compared, layer by layer, against a previously working engine build or against the PyTorch/TensorFlow code:
# give the tensor a name
def set_tensor_name(tensor, prefix, name):
    tensor.name = prefix + name

# mark it as a network output so it can be inspected
network.mark_output(tensor)
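For the actual comparison I simply dump the reference outputs and diff them tensor by tensor. A minimal sketch, assuming the PyTorch/TensorFlow outputs were saved as .npy files named after the tensors and that trt_outputs maps tensor names to the arrays fetched from the marked outputs (both are placeholders for illustration):

import numpy as np

for name, trt_out in trt_outputs.items():
    ref = np.load(f"reference/{name}.npy")   # hypothetical dump from PyTorch/TensorFlow
    diff = np.abs(trt_out.astype(np.float32) - ref.astype(np.float32))
    print(f"{name}: max_abs_diff={diff.max():.6f}, mismatch_ratio={(diff > 1e-3).mean():.4%}")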
The debugging showed that the embedding layer's output matched (which, in hindsight, is the strange part), but the output diverged at the first convolution of the multi-head attention block. My first suspicion was the add_fully_connected replacement, so I wrote the explicit FMA version shown above, and it still didn't help (at this point I was rather baffled, since I was fairly confident in that code).

I then dropped the FMA code into the old, previously working engine build, and there it was fine. So I dug deeper and found that the constant (weight) layers were producing wrong outputs: every element was the same value, namely the first value of the weight tensor.

Looking at the weight-loading code, the numpy arrays themselves were correct, and the code constructing trt.Weights looked fine too; but converting the trt.Weights back to a numpy array with .numpy() and printing it reproduced exactly the pattern described above.
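The symptom can be reproduced with a simple round-trip check; a minimal sketch (the array contents are arbitrary):

import numpy as np
import tensorrt as trt

arr = np.arange(6, dtype=np.float32)
w = trt.Weights(arr)
print(arr)         # [0. 1. 2. 3. 4. 5.]
print(w.numpy())   # with the problematic numpy version, this printed the first
                   # element repeated six times instead of the original array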

At that point the problem was finally pinned down, but why did it happen? I was stumped all over again.
In the end the cause turned out to be that the numpy version in the conda virtual environment was too new; downgrading numpy made the problem disappear. A truly exasperating issue!
One puzzle remains, though: the embedding layer's weights were also read incorrectly (the debug prints confirmed as much), yet its output still matched the reference. For now I can only chalk that up to coincidence; I'll have to find time to read the embedding plugin's CUDA code before I can explain it.
2. Engine Inference
I ran into hardly any problems in this part; reading the docs and making a few small changes was enough. I'll just pick the two slightly more important changes to cover.
2.1 Dynamic Inputs
The way the optimization profile is selected has changed:
# old API
context.active_optimization_profile = 0
# new API
context.set_optimization_profile_async(0, stream.handle)
Input shapes are also no longer bound by binding index with set_binding_shape; instead, set_input_shape binds them by tensor name, which is indeed more intuitive.
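For illustration, the new style looks roughly like this, assuming the network's inputs are named "input_ids" and "segment_ids" (the names and shape variables are placeholders; use whatever your network actually defines):

# new style: bind runtime shapes by tensor name
context.set_input_shape("input_ids", (batch_size, max_seq_len))
context.set_input_shape("segment_ids", (batch_size, max_seq_len))

# old style, by binding index, for comparison:
# context.set_binding_shape(0, (batch_size, max_seq_len))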

2.2 Inference
execute_async_v3 now replaces execute_async_v2:
# new API
context.execute_async_v3(stream_handle=stream.handle)
# old API
context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
This is more concise: the device buffers no longer need to be passed in on every inference call. Instead there is one extra step, done only once, that binds the device memory to the execution context, effectively attaching the buffers to it as attributes:
bindings = [int(d_inputs[i]) for i in range(4)] + [int(d_output)]
for i in range(engine.num_io_tensors):
    context.set_tensor_address(engine.get_tensor_name(i), bindings[i])
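Putting the pieces together, one inference call with the v3 API might look like the sketch below. The buffer names (h_inputs, d_inputs, h_output, d_output, bindings) and the single-output layout are assumptions carried over from the demo-style code above, and a CUDA context is assumed to be initialized already (e.g. via pycuda.autoinit):

import pycuda.driver as cuda

stream = cuda.Stream()

# one-time setup: select the profile, set input shapes, bind device addresses
context.set_optimization_profile_async(0, stream.handle)
for name, shape in input_shapes.items():   # input_shapes: hypothetical {tensor name: shape} dict
    context.set_input_shape(name, shape)
for i in range(engine.num_io_tensors):
    context.set_tensor_address(engine.get_tensor_name(i), bindings[i])

# per request: copy inputs to the device, run, copy the output back
for h_inp, d_inp in zip(h_inputs, d_inputs):
    cuda.memcpy_htod_async(d_inp, h_inp, stream)
context.execute_async_v3(stream_handle=stream.handle)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()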
With each version upgrade the interfaces really have been refined, and they are noticeably more convenient to use now.