TensorRT Inference Deployment of the Point-Transformer Model


First, download the Point-Transformer code: https://github.com/POSTECH-CVLab/point-transformer
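For example:

git clone https://github.com/POSTECH-CVLab/point-transformer.git
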
Because the repository targets older CUDA and PyTorch versions, a few changes to the source are required. In
lib/pointops/src/aggregation/aggregation_cuda.cpp,
lib/pointops/src/grouping/grouping_cuda.cpp,
lib/pointops/src/interpolation/interpolation_cuda.cpp, lib/pointops/src/knnquery/knnquery_cuda.cpp, lib/pointops/src/sampling/sampling_cuda.cpp, and
lib/pointops/src/subtraction/subtraction_cuda.cpp, replace

#include <THC/THC.h>

with

#include <ATen/ATen.h>
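
After these edits, rebuild and reinstall the pointops extension, following the repository's own setup step:

cd lib/pointops
python setup.py install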

Exporting the ONNX Model

To export a suitable ONNX model, change lib/pointops/src/sampling/sampling_cuda_kernel.h to:

#ifndef _SAMPLING_CUDA_KERNEL
#define _SAMPLING_CUDA_KERNEL
#include <vector>
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>

void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, int new_offset, at::Tensor tmp_tensor, at::Tensor idx_tensor); 

#ifdef __cplusplus
extern "C" {
#endif

void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, int new_offset, float *tmp, int *idx);

#ifdef __cplusplus
}
#endif
#endif

Change lib/pointops/src/sampling/sampling_cuda.cpp to:

#include <ATen/ATen.h>
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include "sampling_cuda_kernel.h"

void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, int new_offset, at::Tensor tmp_tensor, at::Tensor idx_tensor)
{
    const float *xyz = xyz_tensor.data_ptr<float>();
    const int *offset = offset_tensor.data_ptr<int>();
    float *tmp = tmp_tensor.data_ptr<float>();
    int *idx = idx_tensor.data_ptr<int>();

    furthestsampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx);
}

Change lib/pointops/src/sampling/sampling_cuda_kernel.cu to:

#include "../cuda_utils.h"
#include "sampling_cuda_kernel.h"


__device__ void __update(float *dists, int *dists_i, int idx1, int idx2) {
    const float v1 = dists[idx1], v2 = dists[idx2];
    const int i1 = dists_i[idx1], i2 = dists_i[idx2];
    dists[idx1] = max(v1, v2);
    dists_i[idx1] = v2 > v1 ? i2 : i1;
}

// input xyz: (n, 3), tmp: (b, n_max)
// output idx (m)
template <unsigned int block_size>
__global__ void furthestsampling_cuda_kernel(const float *xyz, const int *offset,  int new_offset, float *tmp, int *idx)
{
    __shared__ float dists[block_size];
    __shared__ int dists_i[block_size];

    int bid = blockIdx.x;
    // single-batch version: the per-batch loop of the original kernel is removed,
    // so point and sample ranges come directly from the scalar offsets
    int start_n = 0;
    int end_n = offset[0];
    int start_m = 0;
    int end_m = new_offset;
    int old = 0;

    const int stride = block_size;
    int tid = threadIdx.x;
    if (tid == 0) idx[start_m] = start_n;

    __syncthreads();
    for (int j = start_m + 1; j < end_m; j++)
    {
        int besti = start_n;
        float best = -1;
        float x1 = xyz[old * 3 + 0];
        float y1 = xyz[old * 3 + 1];
        float z1 = xyz[old * 3 + 2];
        for (int k = start_n + tid; k < end_n; k += stride)
        {
            float x2 = xyz[k * 3 + 0];
            float y2 = xyz[k * 3 + 1];
            float z2 = xyz[k * 3 + 2];
            float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
            float d2 = min(d, tmp[k]);
            tmp[k] = d2;
            besti = d2 > best ? k : besti;
            best = d2 > best ? d2 : best;
        }
        dists[tid] = best;
        dists_i[tid] = besti;
        __syncthreads();

        if (block_size >= 1024) {
            if (tid < 512) {
            __update(dists, dists_i, tid, tid + 512);
            }
            __syncthreads();
        }
        if (block_size >= 512) {
            if (tid < 256) {
            __update(dists, dists_i, tid, tid + 256);
            }
            __syncthreads();
        }
        if (block_size >= 256) {
            if (tid < 128) {
            __update(dists, dists_i, tid, tid + 128);
            }
            __syncthreads();
        }
        if (block_size >= 128) {
            if (tid < 64) {
            __update(dists, dists_i, tid, tid + 64);
            }
            __syncthreads();
        }
        if (block_size >= 64) {
            if (tid < 32) {
            __update(dists, dists_i, tid, tid + 32);
            }
            __syncthreads();
        }
        if (block_size >= 32) {
            if (tid < 16) {
            __update(dists, dists_i, tid, tid + 16);
            }
            __syncthreads();
        }
        if (block_size >= 16) {
            if (tid < 8) {
            __update(dists, dists_i, tid, tid + 8);
            }
            __syncthreads();
        }
        if (block_size >= 8) {
            if (tid < 4) {
            __update(dists, dists_i, tid, tid + 4);
            }
            __syncthreads();
        }
        if (block_size >= 4) {
            if (tid < 2) {
            __update(dists, dists_i, tid, tid + 2);
            }
            __syncthreads();
        }
        if (block_size >= 2) {
            if (tid < 1) {
            __update(dists, dists_i, tid, tid + 1);
            }
            __syncthreads();
        }

        old = dists_i[0];
        if (tid == 0)
            idx[j] = old;
    }
}

void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, int new_offset, float *tmp, int *idx)
{
    unsigned int n_threads = opt_n_threads(n);
    switch (n_threads) {
        case 1024:
            furthestsampling_cuda_kernel<1024><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 512:
            furthestsampling_cuda_kernel<512><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 256:
            furthestsampling_cuda_kernel<256><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 128:
            furthestsampling_cuda_kernel<128><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 64:
            furthestsampling_cuda_kernel<64><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 32:
            furthestsampling_cuda_kernel<32><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 16:
            furthestsampling_cuda_kernel<16><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 8:
            furthestsampling_cuda_kernel<8><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 4:
            furthestsampling_cuda_kernel<4><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 2:
            furthestsampling_cuda_kernel<2><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        case 1:
            furthestsampling_cuda_kernel<1><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
            break;
        default:
            furthestsampling_cuda_kernel<512><<<b, n_threads, 0>>>(xyz, offset, new_offset, tmp, idx);
    }
}

Change lib/pointops/src/knnquery/knnquery_cuda_kernel.h to:

#ifndef _KNNQUERY_CUDA_KERNEL
#define _KNNQUERY_CUDA_KERNEL
#include <vector>
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>


void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, int new_offset_, at::Tensor idx_tensor, at::Tensor dist2_tensor);

#ifdef __cplusplus
extern "C" {
#endif

void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, int new_offset, int *idx, float *dist2);

#ifdef __cplusplus
}
#endif
#endif

Change lib/pointops/src/knnquery/knnquery_cuda.cpp to:

#include <ATen/ATen.h>
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include "knnquery_cuda_kernel.h"


void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, int new_offset, at::Tensor idx_tensor, at::Tensor dist2_tensor)
{
    const float *xyz = xyz_tensor.data_ptr<float>();
    const float *new_xyz = new_xyz_tensor.data_ptr<float>();
    const int *offset = offset_tensor.data_ptr<int>();
    //const int *new_offset = new_offset_tensor.data_ptr<int>();

    int *idx = idx_tensor.data_ptr<int>();
    float *dist2 = dist2_tensor.data_ptr<float>();
    knnquery_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
}

Change lib/pointops/src/knnquery/knnquery_cuda_kernel.cu to:

#include "../cuda_utils.h"
#include "knnquery_cuda_kernel.h"


__device__ void swap_float(float *x, float *y)
{
    float tmp = *x;
    *x = *y;
    *y = tmp;
}


__device__ void swap_int(int *x, int *y)
{
    int tmp = *x;
    *x = *y;
    *y = tmp;
}


__device__ void reheap(float *dist, int *idx, int k)
{
    int root = 0;
    int child = root * 2 + 1;
    while (child < k)
    {
        if(child + 1 < k && dist[child+1] > dist[child])
            child++;
        if(dist[root] > dist[child])
            return;
        swap_float(&dist[root], &dist[child]);
        swap_int(&idx[root], &idx[child]);
        root = child;
        child = root * 2 + 1;
    }
}


__device__ void heap_sort(float *dist, int *idx, int k)
{
    int i;
    for (i = k - 1; i > 0; i--)
    {
        swap_float(&dist[0], &dist[i]);
        swap_int(&idx[0], &idx[i]);
        reheap(dist, idx, i);
    }
}


__device__ int get_bt_idx(int idx, const int *offset)
{
    int i = 0;
    while (1)
    {
        if (idx < offset[i])
            break;
        else
            i++;
    }
    return i;
}


__global__ void knnquery_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, int new_offset, int *__restrict__ idx, float *__restrict__ dist2) {
    // input: xyz (n, 3) new_xyz (m, 3)
    // output: idx (m, nsample) dist2 (m, nsample)
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= m) return;

    new_xyz += pt_idx * 3;
    idx += pt_idx * nsample;
    dist2 += pt_idx * nsample;
    int bt_idx = 0;  // single-batch export: every query point belongs to batch 0
    int start;
    if (bt_idx == 0)
        start = 0;
    else
        start = offset[bt_idx - 1];
    int end = offset[bt_idx];

    float new_x = new_xyz[0];
    float new_y = new_xyz[1];
    float new_z = new_xyz[2];

    float best_dist[100];
    int best_idx[100];
    for(int i = 0; i < nsample; i++){
        best_dist[i] = 1e10;
        best_idx[i] = start;
    }
    for(int i = start; i < end; i++){
        float x = xyz[i * 3 + 0];
        float y = xyz[i * 3 + 1];
        float z = xyz[i * 3 + 2];
        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
        if (d2 < best_dist[0]){
            best_dist[0] = d2;
            best_idx[0] = i;
            reheap(best_dist, best_idx, nsample);
        }
    }
    heap_sort(best_dist, best_idx, nsample);
    for(int i = 0; i < nsample; i++){
        idx[i] = best_idx[i];
        dist2[i] = best_dist[i];
    }
}


void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, int new_offset, int *idx, float *dist2) {
    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    knnquery_cuda_kernel<<<blocks, threads, 0>>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
}

Change lib/pointops/functions/pointops.py to:

from typing import Tuple

import torch
from torch.autograd import Function
import torch.nn as nn

import pointops_cuda

namespace = "pointtransformer"


class FurthestSampling(Function):
    @staticmethod
    def symbolic(g, xyz, offset, new_offset):
        new_offset = torch.tensor(new_offset, dtype=torch.int32)
        return g.op(f"{namespace}::furthestsampling", xyz, offset, new_offset, attr_i=new_offset)

    @staticmethod
    def forward(ctx, xyz, offset, new_offset):
        """
        input: xyz: (n, 3), offset: (b), new_offset: (b)    torch.Size([47358, 3]) tensor([47358], dtype=torch.int32) tensor([11839], dtype=torch.int32)
        output: idx: (m)    torch.Size([11839])
        """
        assert xyz.is_contiguous()
        n, b, n_max = xyz.shape[0], offset.shape[0], int(offset[0])
        for i in range(1, b):
            n_max = max(offset[i] - offset[i-1], n_max)
        # idx = torch.cuda.IntTensor(new_offset[b-1].item()).zero_()
        # tmp = torch.cuda.FloatTensor(n).fill_(1e10)
        idx = torch.zeros(new_offset[b-1], dtype=torch.int32, device="cuda")
        tmp = torch.full((n,), 1e10, dtype=torch.float32, device="cuda")
        pointops_cuda.furthestsampling_cuda(b, n_max, xyz, offset, new_offset[0], tmp, idx)
        del tmp
        return idx
    
furthestsampling = FurthestSampling.apply


class KNNQuery(Function):
    @staticmethod
    def symbolic(g, nsample, xyz, new_xyz, offset, new_offset):
        nsample = torch.tensor(nsample, dtype=torch.int32)
        return g.op(f"{namespace}::knnquery", nsample, xyz, new_xyz, offset, new_offset, outputs=2, attr_i=nsample)

    @staticmethod
    def forward(ctx, nsample, xyz, new_xyz, offset, new_offset):
        """
        input: xyz: (n, 3), new_xyz: (m, 3), offset: (b), new_offset: (b) torch.Size([47358, 3]) torch.Size([47358, 3]) torch.int32 torch.int32
        output: idx: (m, nsample), dist2: (m, nsample)
        """

        if new_xyz is None: new_xyz = xyz
        assert xyz.is_contiguous() and new_xyz.is_contiguous()
        m = new_xyz.shape[0]
        idx = torch.cuda.IntTensor(m, nsample).zero_()
        dist2 = torch.cuda.FloatTensor(m, nsample).zero_()
        pointops_cuda.knnquery_cuda(m, nsample, xyz, new_xyz, offset, new_offset[0], idx, dist2)
        return idx, torch.sqrt(dist2)

knnquery = KNNQuery.apply


class Grouping(Function):
    @staticmethod
    def forward(ctx, input, idx):
        """
        input: input: (n, c), idx : (m, nsample)
        output: (m, nsample, c)
        """
        assert input.is_contiguous() and idx.is_contiguous()
        m, nsample, n, c = idx.shape[0], idx.shape[1], input.shape[0], input.shape[1]
        output = torch.cuda.FloatTensor(m, nsample, c)
        pointops_cuda.grouping_forward_cuda(m, nsample, c, input, idx, output)
        ctx.n = n
        ctx.save_for_backward(idx)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: grad_out: (m, c, nsample)
        output: (n, c), None
        """
        n = ctx.n
        idx, = ctx.saved_tensors
        m, nsample, c = grad_output.shape
        grad_input = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.grouping_backward_cuda(m, nsample, c, grad_output, idx, grad_input)
        return grad_input, None

grouping = Grouping.apply


def queryandgroup(nsample, xyz, new_xyz, feat, idx, offset, new_offset, use_xyz=True):
    """
    input: xyz: (n, 3), new_xyz: (m, 3), feat: (n, c), idx: (m, nsample), offset: (b), new_offset: (b)
    output: new_feat: (m, c+3, nsample), grouped_idx: (m, nsample)
    """
    assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous()
    if new_xyz is None:
        new_xyz = xyz
    if idx is None:
        idx, _ = knnquery(nsample, xyz, new_xyz, offset, new_offset) # (m, nsample) torch.Size([47358, 8])

    n, m, c = xyz.shape[0], new_xyz.shape[0], feat.shape[1]
    grouped_xyz = xyz[idx.view(-1).long(), :].view(m, nsample, 3) # (m, nsample, 3) torch.Size([47358, 8, 3])
    grouped_xyz -= new_xyz.unsqueeze(1) # (m, nsample, 3)  
    grouped_feat = feat[idx.view(-1).long(), :].view(m, nsample, c) # (m, nsample, c)

    if use_xyz:
        return torch.cat((grouped_xyz, grouped_feat), -1) # (m, nsample, 3+c)
    else:
        return grouped_feat


class Subtraction(Function):
    @staticmethod
    def forward(ctx, input1, input2, idx):
        """
        input: input1: (n, c), input2: (n, c), idx: (n, nsample)
        output:  (n, nsample, c)
        """
        assert input1.is_contiguous() and input2.is_contiguous()
        n, c = input1.shape; nsample = idx.shape[-1]
        output = torch.cuda.FloatTensor(n, nsample, c).zero_()
        pointops_cuda.subtraction_forward_cuda(n, nsample, c, input1, input2, idx, output)
        ctx.save_for_backward(idx)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: grad_out: (n, nsample, c)
        output: grad_input1: (n, c), grad_input2: (n, c)
        """
        idx, = ctx.saved_tensors
        n, nsample, c = grad_output.shape
        grad_input1 = torch.cuda.FloatTensor(n, c).zero_()
        grad_input2 = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.subtraction_backward_cuda(n, nsample, c, idx, grad_output, grad_input1, grad_input2)
        return grad_input1, grad_input2, None

subtraction = Subtraction.apply


class Aggregation(Function):
    @staticmethod
    def forward(ctx, input, position, weight, idx):
        """
        input: input: (n, c), position: (n, nsample, c), weight : (n, nsample, c'), idx: (n, nsample)
        output: (n, c)
        """
        assert input.is_contiguous() and position.is_contiguous() and weight.is_contiguous()
        n, nsample, c = position.shape; w_c = weight.shape[-1]
        output = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.aggregation_forward_cuda(n, nsample, c, w_c, input, position, weight, idx, output)
        ctx.save_for_backward(input, position, weight, idx)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: grad_out: (n, c)
        output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight : (n, nsample, c')
        """
        input, position, weight, idx = ctx.saved_tensors
        n, nsample, c = position.shape; w_c = weight.shape[-1]
        grad_input = torch.cuda.FloatTensor(n, c).zero_()
        grad_position = torch.cuda.FloatTensor(n, nsample, c).zero_()
        grad_weight = torch.cuda.FloatTensor(n, nsample, w_c).zero_()
        pointops_cuda.aggregation_backward_cuda(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight)
        return grad_input, grad_position, grad_weight, None

aggregation = Aggregation.apply


def interpolation(xyz, new_xyz, feat, offset, new_offset, k=3):
    """
    input: xyz: (m, 3), new_xyz: (n, 3), feat: (m, c), offset: (b), new_offset: (b)
    output: (n, c)
    """
    assert xyz.is_contiguous() and new_xyz.is_contiguous() and feat.is_contiguous()
    idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) 
    dist_recip = 1.0 / (dist + 1e-8) # (n, 3)
    norm = torch.sum(dist_recip, dim=1, keepdim=True)
    weight = dist_recip / norm # (n, 3)

    new_feat = torch.cuda.FloatTensor(new_xyz.shape[0], feat.shape[1]).zero_()
    for i in range(k):
        new_feat += feat[idx[:, i].long(), :] * weight[:, i].unsqueeze(-1)

    return new_feat  # torch.Size([739, 256])


class Interpolation(Function):
    @staticmethod
    def symbolic(g, xyz, new_xyz, input, offset, new_offset, k=3):
        return g.op(f"{namespace}::interpolation2", xyz, new_xyz, input, offset, new_offset, k_i=3)

    @staticmethod
    def forward(ctx, xyz, new_xyz, input, offset, new_offset, k=3):
        """
        input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b)
        output: (n, c)
        """
        assert xyz.is_contiguous() and new_xyz.is_contiguous() and input.is_contiguous()
        idx, dist = knnquery(k, xyz, new_xyz, offset, new_offset) # (n, k), (n, k)
        dist_recip = 1.0 / (dist + 1e-8) # (n, k)
        norm = torch.sum(dist_recip, dim=1, keepdim=True)
        weight = dist_recip / norm # (n, k)

        n, c, m = new_xyz.shape[0], input.shape[1], input.shape[0]
        output = torch.cuda.FloatTensor(n, c).zero_()
        pointops_cuda.interpolation_forward_cuda(n, c, k, input, idx, weight, output)
        ctx.m, ctx.k = m, k
        ctx.save_for_backward(idx, weight)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        input: xyz: (m, 3), new_xyz: (n, 3), input: (m, c), offset: (b), new_offset: (b)
        output: (n, c)
        """
        m, k = ctx.m, ctx.k
        idx, weight = ctx.saved_tensors
        n, c = grad_output.shape
        grad_input = torch.cuda.FloatTensor(m, c).zero_()
        pointops_cuda.interpolation_backward_cuda(n, c, k, grad_output, idx, weight, grad_input)
        return None, None, grad_input, None, None, None

interpolation2 = Interpolation.apply
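
With the rewritten ops installed, a quick sanity check (the sizes below are illustrative, not from the original post) confirms the Python wrappers still behave as their docstrings describe:

import torch
from lib.pointops.functions import pointops

xyz = torch.rand(1024, 3, device="cuda").contiguous()
offset = torch.tensor([1024], dtype=torch.int32, device="cuda")
new_offset = torch.tensor([256], dtype=torch.int32, device="cuda")

idx = pointops.furthestsampling(xyz, offset, new_offset)        # expected shape: (256,)
knn_idx, dist = pointops.knnquery(8, xyz, xyz, offset, offset)  # expected shapes: (1024, 8) each
print(idx.shape, knn_idx.shape, dist.shape)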

Change model/pointtransformer/pointtransformer_seg.py to:

import torch
import torch.nn as nn

from lib.pointops.functions import pointops


class PointTransformerLayer(nn.Module):
    def __init__(self, in_planes, out_planes, share_planes=8, nsample=16):
        super().__init__()
        self.mid_planes = mid_planes = out_planes // 1
        self.out_planes = out_planes
        self.share_planes = share_planes
        self.nsample = nsample
        self.linear_q = nn.Linear(in_planes, mid_planes)
        self.linear_k = nn.Linear(in_planes, mid_planes)
        self.linear_v = nn.Linear(in_planes, out_planes)
        self.linear_p = nn.Sequential(nn.Linear(3, 3), nn.BatchNorm1d(3), nn.ReLU(inplace=True), nn.Linear(3, out_planes))
        self.linear_w = nn.Sequential(nn.BatchNorm1d(mid_planes), nn.ReLU(inplace=True),
                                    nn.Linear(mid_planes, mid_planes // share_planes),
                                    nn.BatchNorm1d(mid_planes // share_planes), nn.ReLU(inplace=True),
                                    nn.Linear(out_planes // share_planes, out_planes // share_planes))
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, pxo) -> torch.Tensor:
        p, x, o = pxo  # (n, 3), (n, c), (b)
        x_q, x_k, x_v = self.linear_q(x), self.linear_k(x), self.linear_v(x)  # (n, c)
        x_k = pointops.queryandgroup(self.nsample, p, p, x_k, None, o, o, use_xyz=True)  # (n, nsample, 3+c)
        x_v = pointops.queryandgroup(self.nsample, p, p, x_v, None, o, o, use_xyz=False)  # (n, nsample, c)
        p_r, x_k = x_k[:, :, 0:3], x_k[:, :, 3:]
        for i, layer in enumerate(self.linear_p): p_r = layer(p_r.transpose(1, 2).contiguous()).transpose(1, 2).contiguous() if i == 1 else layer(p_r)    # (n, nsample, c)
        w = x_k - x_q.unsqueeze(1) + p_r.view(p_r.shape[0], p_r.shape[1], self.out_planes // self.mid_planes, self.mid_planes).sum(2)  # (n, nsample, c)
        for i, layer in enumerate(self.linear_w): w = layer(w.transpose(1, 2).contiguous()).transpose(1, 2).contiguous() if i % 3 == 0 else layer(w)
        w = self.softmax(w)  # (n, nsample, c)
        n, nsample, c = x_v.shape; s = self.share_planes
        x = ((x_v + p_r).view(n, nsample, s, c // s) * w.unsqueeze(2)).sum(1).view(n, c)
        return x


class TransitionDown(nn.Module):
    def __init__(self, in_planes, out_planes, stride=1, nsample=16):
        super().__init__()
        self.stride, self.nsample = stride, nsample
        if stride != 1:
            self.linear = nn.Linear(3+in_planes, out_planes, bias=False)
            self.pool = nn.MaxPool1d(nsample)
        else:
            self.linear = nn.Linear(in_planes, out_planes, bias=False)
        self.bn = nn.BatchNorm1d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        
    def forward(self, pxo):
        p, x, o = pxo  # (n, 3), (n, c), (b)
        if self.stride != 1:
            n_o, count = [o[0].item() // self.stride], o[0].item() // self.stride
            for i in range(1, o.shape[0]):
                count += (o[i].item() - o[i-1].item()) // self.stride
                n_o.append(count)
            #n_o = torch.cuda.IntTensor(n_o)
            idx = pointops.furthestsampling(p, o, n_o)  # (m)
            n_p = p[idx.long(), :]  # (m, 3)
            n_o = torch.cuda.IntTensor(n_o)
            x = pointops.queryandgroup(self.nsample, p, n_p, x, None, o, n_o, use_xyz=True)  # (m, 3+c, nsample)
            x = self.relu(self.bn(self.linear(x).transpose(1, 2).contiguous()))  # (m, c, nsample)
            x = self.pool(x).squeeze(-1)  # (m, c)
            p, o = n_p, n_o
        else:
            x = self.relu(self.bn(self.linear(x)))  # (n, c)
        return [p, x, o]


class TransitionUp(nn.Module):
    def __init__(self, in_planes, out_planes=None):
        super().__init__()
        if out_planes is None:
            self.linear1 = nn.Sequential(nn.Linear(2*in_planes, in_planes), nn.BatchNorm1d(in_planes), nn.ReLU(inplace=True))
            self.linear2 = nn.Sequential(nn.Linear(in_planes, in_planes), nn.ReLU(inplace=True))
        else:
            self.linear1 = nn.Sequential(nn.Linear(out_planes, out_planes), nn.BatchNorm1d(out_planes), nn.ReLU(inplace=True))
            self.linear2 = nn.Sequential(nn.Linear(in_planes, out_planes), nn.BatchNorm1d(out_planes), nn.ReLU(inplace=True))
        
    def forward(self, pxo1, pxo2=None):
        if pxo2 is None:
            _, x, o = pxo1  # (n, 3), (n, c), (b)
            x_tmp = []
            for i in range(o.shape[0]):
                if i == 0:
                    s_i, e_i, cnt = 0, o[0], o[0]
                else:
                    s_i, e_i, cnt = o[i-1], o[i], o[i] - o[i-1]
                x_b = x[s_i:e_i, :]
                x_b = torch.cat((x_b, self.linear2(x_b.sum(0, True) / cnt).repeat(cnt, 1)), 1)
                x_tmp.append(x_b)
            x = torch.cat(x_tmp, 0)
            x = self.linear1(x)
        else:
            p1, x1, o1 = pxo1; p2, x2, o2 = pxo2
            x = self.linear1(x1) + pointops.interpolation(p2, p1, self.linear2(x2), o2, o1)
        return x


class PointTransformerBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, share_planes=8, nsample=16):
        super(PointTransformerBlock, self).__init__()
        self.linear1 = nn.Linear(in_planes, planes, bias=False)
        self.bn1 = nn.BatchNorm1d(planes)
        self.transformer2 = PointTransformerLayer(planes, planes, share_planes, nsample)
        self.bn2 = nn.BatchNorm1d(planes)
        self.linear3 = nn.Linear(planes, planes * self.expansion, bias=False)
        self.bn3 = nn.BatchNorm1d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, pxo):
        p, x, o = pxo  # (n, 3), (n, c), (b)
        identity = x
        x = self.relu(self.bn1(self.linear1(x)))
        x = self.relu(self.bn2(self.transformer2([p, x, o])))
        x = self.bn3(self.linear3(x))
        x += identity
        x = self.relu(x)
        return p, x, o
    

class PointTransformerSeg(nn.Module):
    def __init__(self, block, blocks, c=6, k=13):
        super().__init__()
        self.c = c
        self.in_planes, planes = c, [32, 64, 128, 256, 512]
        fpn_planes, fpnhead_planes, share_planes = 128, 64, 8
        stride, nsample = [1, 4, 4, 4, 4], [8, 16, 16, 16, 16]
        self.enc1 = self._make_enc(block, planes[0], blocks[0], share_planes, stride=stride[0], nsample=nsample[0])  # N/1
        self.enc2 = self._make_enc(block, planes[1], blocks[1], share_planes, stride=stride[1], nsample=nsample[1])  # N/4
        self.enc3 = self._make_enc(block, planes[2], blocks[2], share_planes, stride=stride[2], nsample=nsample[2])  # N/16
        self.enc4 = self._make_enc(block, planes[3], blocks[3], share_planes, stride=stride[3], nsample=nsample[3])  # N/64
        self.enc5 = self._make_enc(block, planes[4], blocks[4], share_planes, stride=stride[4], nsample=nsample[4])  # N/256
        self.dec5 = self._make_dec(block, planes[4], 2, share_planes, nsample=nsample[4], is_head=True)  # transform p5
        self.dec4 = self._make_dec(block, planes[3], 2, share_planes, nsample=nsample[3])  # fusion p5 and p4
        self.dec3 = self._make_dec(block, planes[2], 2, share_planes, nsample=nsample[2])  # fusion p4 and p3
        self.dec2 = self._make_dec(block, planes[1], 2, share_planes, nsample=nsample[1])  # fusion p3 and p2
        self.dec1 = self._make_dec(block, planes[0], 2, share_planes, nsample=nsample[0])  # fusion p2 and p1
        self.cls = nn.Sequential(nn.Linear(planes[0], planes[0]), nn.BatchNorm1d(planes[0]), nn.ReLU(inplace=True), nn.Linear(planes[0], k))

    def _make_enc(self, block, planes, blocks, share_planes=8, stride=1, nsample=16):
        layers = []
        layers.append(TransitionDown(self.in_planes, planes * block.expansion, stride, nsample))
        self.in_planes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_planes, self.in_planes, share_planes, nsample=nsample))
        return nn.Sequential(*layers)

    def _make_dec(self, block, planes, blocks, share_planes=8, nsample=16, is_head=False):
        layers = []
        layers.append(TransitionUp(self.in_planes, None if is_head else planes * block.expansion))
        self.in_planes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_planes, self.in_planes, share_planes, nsample=nsample))
        return nn.Sequential(*layers)

    def forward(self, pxo):
        p0, x0, o0 = pxo  # (n, 3), (n, c), (b)
        x0 = p0 if self.c == 3 else torch.cat((p0, x0), 1)
        p1, x1, o1 = self.enc1([p0, x0, o0])
        p2, x2, o2 = self.enc2([p1, x1, o1])
        p3, x3, o3 = self.enc3([p2, x2, o2])
        p4, x4, o4 = self.enc4([p3, x3, o3])
        p5, x5, o5 = self.enc5([p4, x4, o4])
        x5 = self.dec5[1:]([p5, self.dec5[0]([p5, x5, o5]), o5])[1]
        x4 = self.dec4[1:]([p4, self.dec4[0]([p4, x4, o4], [p5, x5, o5]), o4])[1]
        x3 = self.dec3[1:]([p3, self.dec3[0]([p3, x3, o3], [p4, x4, o4]), o3])[1]
        x2 = self.dec2[1:]([p2, self.dec2[0]([p2, x2, o2], [p3, x3, o3]), o2])[1]
        x1 = self.dec1[1:]([p1, self.dec1[0]([p1, x1, o1], [p2, x2, o2]), o1])[1]
        x = self.cls(x1)
        return x


def pointtransformer_seg_repro(**kwargs):
    model = PointTransformerSeg(PointTransformerBlock, [2, 3, 4, 6, 3], **kwargs)
    return model

Write the script that exports the ONNX model:

import argparse
import collections
import torch
from util import config
from model.pointtransformer.pointtransformer_seg import pointtransformer_seg_repro as Model


def get_parser():
    parser = argparse.ArgumentParser(description='PyTorch Point Cloud Semantic Segmentation')
    parser.add_argument('--config', type=str, default='/home/tfy/document/point-transformer-master/config/s3dis/s3dis_pointtransformer_repro.yaml', help='config file')
    parser.add_argument('opts', help='see config/s3dis/s3dis_pointtransformer_repro.yaml for all options', default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()
    cfg = config.load_cfg_from_cfg_file(args.config)
    cfg = config.merge_cfg_from_list(cfg, args.opts)
    return cfg


if __name__ == '__main__':
    args = get_parser()
    model = Model(c=args.fea_dim, k=args.classes).cuda()
    names = [line.rstrip('\n') for line in open(args.names_path)]
    checkpoint = torch.load(args.model_path, weights_only=False)
    state_dict = checkpoint['state_dict']
    new_state_dict = collections.OrderedDict()
    for k, v in state_dict.items():
        name = k[7:]  # strip the "module." prefix left by nn.DataParallel
        new_state_dict[name] = v
    model.load_state_dict(new_state_dict, strict=True)
    model.eval()

    coord_part = torch.rand(47358, 3).to(torch.float32).to("cuda")    # (n, 3) coordinates
    feat_part = torch.rand(47358, 3).to(torch.float32).to("cuda")     # (n, 3) features
    offset_part = torch.tensor([47358], device='cuda', dtype=torch.int32)  # batch offsets

    with torch.no_grad():
        torch.onnx.export(model, ([coord_part, feat_part, offset_part],), "pointtransformer.onnx", opset_version=15)

The structure of the exported ONNX model is as follows:
(figure: exported ONNX graph, omitted)
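
Because furthestsampling and knnquery live in a custom ONNX domain, onnxruntime cannot execute this model directly. A quick structural check with the onnx package (a minimal sketch) lists the custom nodes that the TensorRT plugins below must implement:

import onnx

model = onnx.load("pointtransformer.onnx")
for node in model.graph.node:
    # nodes exported by the symbolic functions above carry the custom domain
    if node.domain == "pointtransformer":
        print(node.op_type, list(node.input), "->", list(node.output))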

Writing Custom TensorRT Plugins

Writing the furthestSampling custom TensorRT plugin
Writing the knnQuery custom TensorRT plugin
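
The plugins themselves are ordinary TensorRT C++ plugins compiled into a shared library. As a minimal sketch (assuming a hypothetical library name, libpointops_plugins.so), the following snippet loads that library from Python and checks that both plugin creators are registered:

import ctypes
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
# loading the .so triggers its static plugin registration
ctypes.CDLL("./libpointops_plugins.so")   # hypothetical plugin library
trt.init_libnvinfer_plugins(logger, "")

creators = trt.get_plugin_registry().plugin_creator_list
print([c.name for c in creators])   # should contain "furthestsampling" and "knnquery"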

Exporting the TensorRT Engine

First, simplify the model with onnxslim (installable via pip install onnxslim):

onnxslim pointtransformer.onnx pointtransformer.onnx

(figure: onnxslim simplification output, omitted)

Then run:

TensorRT-10.6.0.26/bin/trtexec --onnx=pointtransformer.onnx --saveEngine=pointtransformer.engine
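
Note that trtexec can only build the engine if the two custom plugins are visible to it. With TensorRT 10 this is typically done by pointing trtexec at the plugin library (again assuming the hypothetical libpointops_plugins.so; preloading the library with LD_PRELOAD also works):

TensorRT-10.6.0.26/bin/trtexec --onnx=pointtransformer.onnx --saveEngine=pointtransformer.engine --staticPlugins=./libpointops_plugins.so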

Testing the TensorRT Engine

The common module used below is the buffer-management helper from TensorRT's Python samples (it provides allocate_buffers and do_inference).

import numpy as np
import tensorrt as trt
import common


logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, "")
with open("pointtransformer.engine", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
inputs, outputs, bindings, stream = common.allocate_buffers(engine)

coord_part = np.loadtxt("coord_part.txt").reshape(47358, 3).astype(np.float32)    #(47358, 3)
feat_part = np.loadtxt("feat_part.txt").reshape(47358, 3).astype(np.float32)      #(47358, 3)
offset_part = np.array([47358]).astype(np.int32)
         
np.copyto(inputs[0].host, coord_part.ravel())
np.copyto(inputs[1].host, feat_part.ravel())
np.copyto(inputs[2].host, offset_part)  

output = common.do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
print(output)  
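
The returned host buffer is flat. Assuming the (47358, 13) output shape used throughout this post, per-point labels can be recovered with:

pred = output[0].reshape(47358, 13)   # (num_points, num_classes)
labels = np.argmax(pred, axis=1)
print(labels[:10])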

TensorRT Deployment

from util.common_util import intersectionAndUnion
from util.voxelize import voxelize

import numpy as np
import tensorrt as trt
import common


logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, "")
with open("pointtransformer.engine", "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
inputs, outputs, bindings, stream = common.allocate_buffers(engine)


def input_normalize(coord, feat):
    coord_min = np.min(coord, 0)
    coord -= coord_min
    feat = feat / 255.
    return coord, feat

classes = 13
voxel_max = 80000
voxel_size = 0.04
data = np.load("/home/tfy/document/point-transformer-master/dataset/s3dis/trainval_fullarea/Area_5_WC_1.npy")  # xyzrgbl, N*7
coord, feat, label = data[:, :3], data[:, 3:6], data[:, 6]

idx_data = []
coord_min = np.min(coord, 0)
coord -= coord_min
idx_sort, count = voxelize(coord, voxel_size, mode=1)
for i in range(count.max()):
    idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1]) + i % count
    idx_part = idx_sort[idx_select]
    idx_data.append(idx_part)

pred = np.zeros((label.size, classes))
idx_size = len(idx_data)
idx_list, coord_list, feat_list, offset_list  = [], [], [], []
for i in range(idx_size):
    idx_part = idx_data[i]
    coord_part, feat_part = coord[idx_part], feat[idx_part]
    if voxel_max and coord_part.shape[0] > voxel_max:
        coord_p, idx_uni, cnt = np.random.rand(coord_part.shape[0]) * 1e-3, np.array([]), 0
        while idx_uni.size != idx_part.shape[0]:
            init_idx = np.argmin(coord_p)
            dist = np.sum(np.power(coord_part - coord_part[init_idx], 2), 1)
            idx_crop = np.argsort(dist)[:voxel_max]
            coord_sub, feat_sub, idx_sub = coord_part[idx_crop], feat_part[idx_crop], idx_part[idx_crop]
            dist = dist[idx_crop]
            delta = np.square(1 - dist / np.max(dist))
            coord_p[idx_crop] += delta
            coord_sub, feat_sub = input_normalize(coord_sub, feat_sub)
            idx_list.append(idx_sub), coord_list.append(coord_sub), feat_list.append(feat_sub), offset_list.append(idx_sub.size)
            idx_uni = np.unique(np.concatenate((idx_uni, idx_sub)))
    else:
        coord_part, feat_part = input_normalize(coord_part, feat_part)
        idx_list.append(idx_part), coord_list.append(coord_part), feat_list.append(feat_part), offset_list.append(idx_part.size)

batch_num = len(idx_list)  # one voxel chunk per engine execution
for i in range(batch_num):
    s_i, e_i = i, min(i + 1, len(idx_list))
    idx_part, coord_part, feat_part, offset_part = idx_list[s_i:e_i], coord_list[s_i:e_i], feat_list[s_i:e_i], offset_list[s_i:e_i]
    idx_part = np.concatenate(idx_part)
    coord_part = np.concatenate(coord_part)
    feat_part = np.concatenate(feat_part)
    offset_part = np.cumsum(offset_part)
       
    np.copyto(inputs[0].host, coord_part.ravel())
    np.copyto(inputs[1].host, feat_part.ravel())
    np.copyto(inputs[2].host, offset_part)  
    output = common.do_inference(context, engine=engine, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

    pred_part = output[0].reshape((offset_part[-1], classes))
    pred[idx_part, :] += pred_part
pred = np.argmax(pred, axis=1)

intersection, union, target = intersectionAndUnion(pred, label, classes)
accuracy = sum(intersection) / (sum(target) + 1e-10)
print('Accuracy {accuracy:.4f}.'.format(accuracy=accuracy))
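
intersectionAndUnion returns per-class intersection, union, and target counts, so mIoU can be reported alongside the overall accuracy:

iou = intersection / (union + 1e-10)
print('mIoU {:.4f}.'.format(np.mean(iou)))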
