一. Multi-precision element-wise computation with a template function
1. The boolean vector_sum decides whether vectorized memory access is used;
2. The type T selects between float4 and int4 vectorized access:
   using Vec4 = typename std::conditional<std::is_same<T, float>::value, float4, int4>::type;
3. reinterpret_cast<Vec4*>(&...) reinterprets the element pointer as a vector pointer;
4. The leading * in *reinterpret_cast<Vec4*>(&A[idx]) dereferences that pointer, loading four elements at once;
5. x, y, z, w are the four lanes of the vectorized load;
6. The computed vector vec_c is written back to the result matrix C.
#include <cuda_fp16.h>
#include <type_traits>
#include <cstdio>
#include <cstdlib>

// Generic kernel: scalar path, or 4-wide vectorized path (float4 / int4).
template <typename T>
__global__ void elementwise(T* A, T* B, T* C, int total_elements, bool vector_sum) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (vector_sum) {
        // Vectorized path: each thread handles 4 consecutive elements.
        idx *= 4;
        if (idx + 3 < total_elements) {  // keep all four accesses in bounds
            using Vec4 = typename std::conditional<std::is_same<T, float>::value, float4, int4>::type;
            Vec4 vec_a = *reinterpret_cast<Vec4*>(&A[idx]);
            Vec4 vec_b = *reinterpret_cast<Vec4*>(&B[idx]);
            Vec4 vec_c;
            vec_c.x = vec_a.x + vec_b.x;
            vec_c.y = vec_a.y + vec_b.y;
            vec_c.z = vec_a.z + vec_b.z;
            vec_c.w = vec_a.w + vec_b.w;
            *reinterpret_cast<Vec4*>(&C[idx]) = vec_c;
        }
    } else {
        // Scalar path: one element per thread.
        if (idx < total_elements) {
            C[idx] = A[idx] + B[idx];
        }
    }
}
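Two assumptions behind the Vec4 path are worth making explicit: the cast to float4/int4 needs 16-byte-aligned addresses, which holds because cudaMalloc returns well-aligned pointers and idx is always a multiple of 4; and total_elements should be a multiple of 4, otherwise the idx + 3 bound check leaves a small tail unprocessed. A minimal host-side sanity-check sketch (the helper name is hypothetical, not part of the original code):

#include <cassert>
#include <cstdint>

// Hypothetical helper: verify the preconditions the vectorized kernel relies on.
template <typename T>
void check_vec4_assumptions(const T* dev_ptr, int total_elements) {
    // cudaMalloc guarantees generous alignment, so this should always hold.
    assert(reinterpret_cast<std::uintptr_t>(dev_ptr) % (4 * sizeof(T)) == 0);
    // Otherwise the last (total_elements % 4) elements are never written.
    assert(total_elements % 4 == 0);
}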
二. Vectorized access and computation with half2
1. Unlike int and float, half has no half4 type, only half2; the load is therefore done with __ldg(reinterpret_cast<const half2*>(&A[idx]));
2. __ldg() reads through the GPU's read-only data cache (the L1 read-only path), improving memory-access efficiency.
3. Fewer DRAM accesses: if the data is reused, __ldg() can serve it from that cache instead of repeatedly reading global memory.
4. Do not use __ldg() if A[idx] may be modified inside the kernel.
// Specialization for half: scalar path, or 2-wide vectorized path (half2).
template <>
__global__ void elementwise(half* A, half* B, half* C, int total_elements, bool vector_sum) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (vector_sum) {
        // Vectorized path: each thread handles 2 consecutive elements via half2.
        idx *= 2;
        if (idx + 1 < total_elements) {  // keep both accesses in bounds
            half2 vec_a = __ldg(reinterpret_cast<const half2*>(&A[idx]));
            half2 vec_b = __ldg(reinterpret_cast<const half2*>(&B[idx]));
            half2 vec_c = __hadd2(vec_a, vec_b);
            *reinterpret_cast<half2*>(&C[idx]) = vec_c;
        }
    } else {
        // Scalar path: one element per thread.
        if (idx < total_elements) {
            C[idx] = __hadd(A[idx], B[idx]);
        }
    }
}
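For reference, __hadd2 adds the two half lanes of its operands in one packed instruction. A minimal sketch of a hypothetical device helper that computes the same thing with the scalar intrinsics (useful only to illustrate what the packed call does):

// Hypothetical helper: lane-wise equivalent of __hadd2(a, b).
__device__ half2 hadd2_unpacked(half2 a, half2 b) {
    return __halves2half2(__hadd(__low2half(a),  __low2half(b)),
                          __hadd(__high2half(a), __high2half(b)));
}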
三. Data initialization with template functions
// Fill a host buffer with 0, 1, 2, ... converted to T.
template <typename T>
void ptr_data_init(T* p, int total_num) {
    for (int i = 0; i < total_num; ++i) {
        p[i] = static_cast<T>(i);
    }
}

// For half, convert explicitly through float with __float2half.
template <>
void ptr_data_init(half* p, int total_num) {
    for (int i = 0; i < total_num; ++i) {
        p[i] = __float2half((float)i);
    }
}
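One caveat for the half initializer: half represents integers exactly only up to 2048, so for large total_num the values 0, 1, 2, ... get rounded (the comparison later still passes because the CPU reference rounds the same way). If exactly representable inputs are wanted, a variant like the following sketch could be used (the name and the modulus are illustrative, not from the original):

// Hypothetical variant: keep every initial value exactly representable in half.
void ptr_data_init_exact(half* p, int total_num) {
    for (int i = 0; i < total_num; ++i) {
        p[i] = __float2half(static_cast<float>(i % 1024));
    }
}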
四. CUDA kernel launch function
template <typename T>
void launch_cuda_add(T* a, T* b, T* c, int total_nums, int blocksize, bool vector_load) {
    // Elements handled per thread: 4 for the float/int vectorized path,
    // 2 for the half2 path, 1 for the scalar path.
    int elems_per_thread = 1;
    if (vector_load) {
        elems_per_thread = std::is_same<T, half>::value ? 2 : 4;
    }
    int gridsize = (total_nums + blocksize * elems_per_thread - 1) / (blocksize * elems_per_thread);
    elementwise<T><<<gridsize, blocksize>>>(a, b, c, total_nums, vector_load);
}
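A worked example of the grid-size formula (the numbers and the float device buffers d_a, d_b, d_c are illustrative, not from the original): with 2^20 float elements, blocksize = 256 and vector_load = true, each thread covers 4 elements, so gridsize = (1048576 + 1024 - 1) / 1024 = 1024 blocks.

// Illustrative call: 1M floats, 256 threads per block, vectorized loads -> 1024 blocks.
launch_cuda_add<float>(d_a, d_b, d_c, 1 << 20, 256, /*vector_load=*/true);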
五. CPU reference launch function
// CPU reference: plain element-wise addition.
template <typename T>
void launch_cpu_add(T* a, T* b, T* c, int total_nums) {
    for (int j = 0; j < total_nums; ++j) {
        c[j] = a[j] + b[j];
    }
}

// For half, do the arithmetic in float on the host and convert back.
template <>
void launch_cpu_add(half* a, half* b, half* c, int total_nums) {
    for (int j = 0; j < total_nums; ++j) {
        c[j] = __float2half(__half2float(a[j]) + __half2float(b[j]));
    }
}
六. Result comparison function
// Compare GPU and CPU results element by element.
template <typename T>
void check_cuda_cpu_result(T* a, T* b, int total_nums) {
    bool same = true;
    for (int k = 0; k < total_nums; ++k) {
        if (fabs((float)a[k] - (float)b[k]) > 1e-5) {
            printf("a: %f b: %f diff: %f\n", (float)a[k], (float)b[k], fabs((float)a[k] - (float)b[k]));
            same = false;
        }
    }
    if (same) {
        printf("cuda and cpu get the same result!\n");
    }
}

// For half, convert to float on the host before comparing.
template <>
void check_cuda_cpu_result(half* a, half* b, int total_nums) {
    bool same = true;
    for (int k = 0; k < total_nums; ++k) {
        float diff = fabs(__half2float(a[k]) - __half2float(b[k]));
        if (diff > 1e-5) {
            printf("a: %f b: %f diff: %f\n", __half2float(a[k]), __half2float(b[k]), diff);
            same = false;
        }
    }
    if (same) {
        printf("cuda and cpu get the same result!\n");
    }
}
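The 1e-5 absolute tolerance only works here because both the CPU and GPU paths round through half in the same way; if one side computed in float throughout, a relative tolerance suited to half precision would be safer. A sketch of such a check (the function name and threshold are illustrative choices):

// Hypothetical relative comparison for half results.
bool close_half(half x, half y, float rel_tol = 1e-2f) {
    float fx = __half2float(x);
    float fy = __half2float(y);
    return fabsf(fx - fy) <= rel_tol * fmaxf(fabsf(fx), fabsf(fy)) + 1e-6f;
}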
root@7666a2ca87d3:/datas/xk/02code/hpc/interview# cd rewrite/
root@7666a2ca87d3:/datas/xk/02code/hpc/interview/rewrite# nvcc -arch=sm_86 elementwise.cu -o elementwise && ./elementwise
cuda and cpu get the same result!
root@7666a2ca87d3:/datas/xk/02code/hpc/interview/rewrite#
七. main function: pointer declarations, memory allocation, data copies, kernel launch, result check
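main relies on the macros DTYPE, VECTOR_LOAD, ROWS, COLS and BLOCKSIZE; since the nvcc command in the transcript above passes no -D flags, they are assumed to be #defined near the top of elementwise.cu. A minimal sketch of possible definitions (the concrete values are illustrative, not from the original):

// Illustrative configuration; the actual values are not given in the original.
#define DTYPE        float
#define VECTOR_LOAD  true
#define ROWS         1024
#define COLS         1024
#define BLOCKSIZE    256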
int main() {
    using dtype = DTYPE;             // element type, selected at compile time
    bool vector_load = VECTOR_LOAD;  // enable the vectorized path
    int rows = ROWS;
    int cols = COLS;
    int blocksize = BLOCKSIZE;
    int total_nums = rows * cols;
    int total_bytes = rows * cols * sizeof(dtype);

    // Host buffers: two inputs, GPU result, CPU reference result.
    dtype* ha = (dtype*)malloc(total_bytes);
    dtype* hb = (dtype*)malloc(total_bytes);
    dtype* hc = (dtype*)malloc(total_bytes);
    dtype* hc_ = (dtype*)malloc(total_bytes);
    ptr_data_init(ha, total_nums);
    ptr_data_init(hb, total_nums);
    ptr_data_init(hc, total_nums);

    // Device buffers and host-to-device copies.
    dtype* da;
    dtype* db;
    dtype* dc;
    Check_Cuda_Runtime(cudaMalloc((void**)&da, total_bytes));
    Check_Cuda_Runtime(cudaMalloc((void**)&db, total_bytes));
    Check_Cuda_Runtime(cudaMalloc((void**)&dc, total_bytes));
    Check_Cuda_Runtime(cudaMemcpy(da, ha, total_bytes, cudaMemcpyHostToDevice));
    Check_Cuda_Runtime(cudaMemcpy(db, hb, total_bytes, cudaMemcpyHostToDevice));

    // GPU add, copy back, CPU reference, then compare.
    launch_cuda_add<dtype>(da, db, dc, total_nums, blocksize, vector_load);
    Check_Cuda_Runtime(cudaMemcpy(hc, dc, total_bytes, cudaMemcpyDeviceToHost));
    launch_cpu_add<dtype>(ha, hb, hc_, total_nums);
    check_cuda_cpu_result<dtype>(hc, hc_, total_nums);

    free(ha);
    free(hb);
    free(hc);
    free(hc_);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return 0;
}