手写cuda算子：实现GPU+CPU FP32+FP16+INT8精度矩阵乘法Y=X@W（三）

本文链接：https://blog.youkuaiyun.com/weixin_55083979/article/details/146167457

一.在我的cublas专栏里面有关于几种形式矩阵乘法运算的知识点

二. cuda 模板核函数头文件声明与源文件实现

三.cuda launch 函数声明与源程序实现

四.cpu 函数声明与源程序实现

五.模板函数实例化

六.静态库编译与可执行文件编译

七.test_matmul.cpp 测试源程序代码

一.在我的cublas专栏里面有关于几种形式矩阵乘法运算的知识点

cublas 专栏：https://blog.youkuaiyun.com/weixin_55083979/category_12912641.html?spm=1001.2014.3001.5482 。后续还会继续更新相关算法，欢迎订阅！

二. cuda 模板核函数头文件声明与源文件实现

// Matrix-multiply kernel, one output element per thread.
// Each thread derives a flat index from blockIdx.x/blockDim.x/threadIdx.x,
// does nothing when that index is >= total_num, and otherwise reduces over
// k in [0, cols) to produce one element of C.
// A and C are addressed with `cols` elements per row; see the definition for
// how B is indexed.
template<typename T>
__global__ void cuda_matmul(T* A,T* B,T* C,int rows,int cols,int total_num);


// Explicit specialization of cuda_matmul for half. Its definition performs
// the multiply/accumulate with the __hmul/__hadd intrinsics instead of the
// built-in operators used by the generic template.
template<>
__global__ void cuda_matmul(half* A,half* B,half* C,int rows,int cols,int total_num);

// Computes C = A @ B, one output element per thread.
//
// Layout implied by the indexing below (all buffers row-major):
//   A : rows x cols, element (r, k) at A[r*cols + k]
//   B : cols x cols, element (k, c) at B[k*cols + c]
//   C : rows x cols, element (r, c) at C[r*cols + c]
//
// Parameters:
//   A, B, C    - buffers dereferenced by the kernel (must be device-accessible)
//   rows, cols - dimensions as described above
//   total_num  - number of output elements to compute (normally rows*cols)
//
// Launch layout: only the x dimension of the grid/block is used. Threads whose
// flat index is >= total_num do nothing, so the grid may be rounded up.
// The accumulator has type T, so narrow integer types wrap on overflow.
template<typename T>
__global__ void cuda_matmul(T* A,T* B,T* C,int rows,int cols,int total_num){
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx<total_num){
        int row_idx = idx/cols;
        int col_idx = idx%cols;
        // Defensive: never touch rows outside A/C even if total_num > rows*cols.
        if(row_idx>=rows){
            return;
        }
        T temp = 0;
        for(int k=0;k<cols;++k){
            // B has `cols` columns, so its row stride is cols, not rows;
            // the two only coincide for square inputs.
            temp += A[row_idx*cols+k] * B[k*cols + col_idx];
        }
        C[row_idx*cols+col_idx] = temp;
    }
}


// half specialization of cuda_matmul: C = A @ B, one output element per thread.
//
// Layout implied by the indexing below (all buffers row-major):
//   A : rows x cols, element (r, k) at A[r*cols + k]
//   B : cols x cols, element (k, c) at B[k*cols + c]
//   C : rows x cols, element (r, c) at C[r*cols + c]
//
// Products and the running sum are both kept in half via __hmul/__hadd, so
// every step rounds to half precision.
// NOTE(review): native half arithmetic is documented for compute capability
// 5.3+ - confirm against the toolkit/architecture in use.
//
// Launch layout: only the x dimension of the grid/block is used. Threads whose
// flat index is >= total_num do nothing.
template<>
__global__ void cuda_matmul(half* A,half* B,half* C,int rows,int cols,int total_num){
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx<total_num){
        int row_idx = idx/cols;
        int col_idx = idx%cols;
        // Defensive: never touch rows outside A/C even if total_num > rows*cols.
        if(row_idx>=rows){
            return;
        }
        // Explicit conversion instead of relying on an int -> half constructor.
        half temp = __float2half(0.0f);
        for(int k=0;k<cols;++k){
            // B has `cols` columns, so its row stride is cols, not rows;
            // the two only coincide for square inputs.
            temp = __hadd(temp,__hmul(A[row_idx*cols+k] , B[k*cols + col_idx]));
        }
        C[row_idx*cols+col_idx] = temp;
    }
}

三.cuda launch 函数声明与源程序实现

// Host-side launcher for the cuda_matmul<T> kernel.
// Derives the grid size from total_nums and blocksize (ceil-division) and
// forwards A, B, C, rows, cols and total_nums to the kernel unchanged.
template<typename T>
void launch_cuda_matmul(T* A,T* B,T* C,int rows,int cols,int blocksize,int total_nums);


// Host-side launcher for the cuda_matmul<T> kernel.
//
// Parameters:
//   A, B, C     - forwarded to the kernel unchanged (must be device-accessible)
//   rows, cols  - matrix dimensions forwarded to the kernel
//   blocksize   - threads per block; a non-positive value falls back to 256
//   total_nums  - number of output elements; a non-positive value is a no-op
//
// The launch uses a 1-D grid with ceil(total_nums / blocksize) blocks.
// Launch errors are not checked here; callers can query cudaGetLastError()
// after this function returns.
template<typename T>
void launch_cuda_matmul(T* A,T* B,T* C,int rows,int cols,int blocksize,int total_nums){
    // Nothing to compute; also avoids launching with a zero-sized grid.
    if(total_nums <= 0){
        return;
    }
    // A non-positive block size would divide by zero below. The kernel result
    // does not depend on the block size, so a common default is safe.
    if(blocksize <= 0){
        blocksize = 256;
    }
    // Ceil-division in 64-bit so total_nums + blocksize - 1 cannot overflow int.
    // The quotient is at most total_nums, so it fits back into an int.
    const long long blocks = ((long long)total_nums + blocksize - 1) / blocksize;
    const int gridsize = (int)blocks;
    cuda_matmul<T><<<gridsize,blocksize>>>(A,B,C,rows,cols,total_nums);
}

四.cpu 函数声明与源程序实现

// CPU counterpart of launch_cuda_matmul. The test program passes host buffers
// to it and compares its output in C against the GPU result.
// NOTE(review): the definition is not shown in this section; presumably it
// uses the same buffer layout as the CUDA kernel - confirm.
template<typename T>
void launch_cpu_matmul(T* A,T* B,T* C,int rows,int cols);

// Explicit specialization of launch_cpu_matmul for half.
// NOTE(review): the definition is not shown in this section; presumably half
// needs dedicated host-side arithmetic/conversions - confirm.
template<>
void launch_cpu_matmul(half* A,half* B,half* C,int rows,int cols);

五.模板函数实例化

// Explicit instantiations of the GPU launcher for float, half and uint8_t, so
// that code which only sees the declarations can link against these types.
template void launch_cuda_matmul<float>(float*,float*,float*,int,int,int,int);
template void launch_cuda_matmul<half>(half*,half*,half*,int,int,int,int);
template void launch_cuda_matmul<uint8_t>(uint8_t*,uint8_t*,uint8_t*,int,int,int,int);

// Explicit instantiations of the CPU reference for the same three types.
// NOTE(review): launch_cpu_matmul<half> is also declared as an explicit
// specialization, so its instantiation line may be redundant - confirm.
template void launch_cpu_matmul<float>(float*,float*,float*,int,int);
template void launch_cpu_matmul<half>(half*,half*,half*,int,int);
template void launch_cpu_matmul<uint8_t>(uint8_t*,uint8_t*,uint8_t*,int,int);

六.静态库编译与可执行文件编译

# Static library holding the matmul kernels and launchers.
add_library(matmul STATIC matmul.cu)

# Same three target properties as before, set in a single call:
#   - separable compilation of CUDA device code
#   - position-independent code
#   - device symbols resolved when this library is built
set_target_properties(matmul PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON
    POSITION_INDEPENDENT_CODE ON
    CUDA_RESOLVE_DEVICE_SYMBOLS ON
)

# Link each test executable against its operator library, utils and the CUDA runtime.
# Visibility keywords of target_link_libraries():
#   PRIVATE - applies only to the current target (executable or library) and is
#             not propagated to targets that depend on it.
#   PUBLIC  - applies to the current target and is also propagated to targets
#             that depend on it.
# These targets are executables that no other target links against, so PRIVATE
# is used uniformly (the previous mix of an empty PRIVATE group and PUBLIC was
# inconsistent between the three targets).
add_executable(test_add test_add.cpp)
target_link_libraries(test_add PRIVATE add utils -lcudart -lcudadevrt)


add_executable(test_mul test_mul.cpp)
target_link_libraries(test_mul PRIVATE mul utils -lcudart -lcudadevrt)


add_executable(test_matmul test_matmul.cpp)
target_link_libraries(test_matmul PRIVATE matmul utils -lcudart -lcudadevrt)

七.test_matmul.cpp 测试源程序代码

#include "src/utils/utils.h"
#include "src/matmul/matmul.h"

// Test configuration: matrix dimensions, element type and CUDA block size.
#define ROWS 1024
#define COLS 1024
// Element type under test; keep exactly one DTYPE definition active.
// #define DTYPE float
#define DTYPE half
// #define DTYPE uint8_t
// Threads per block passed to launch_cuda_matmul.
// NOTE(review): the macro name is spelled with a digit zero ("BL0CKSIZE"),
// not the letter O; main() uses the same spelling.
#define BL0CKSIZE 32



int main(){
    int rows = ROWS;
    int cols = COLS;
    int blocksize = BL0CKSIZE;
    using dtype = DTYPE;
    dtype* ha;
    dtype* hb;
    dtype* hc;
    dtype* hc_;
    dtype* da;
    dtype* db;
    dtype* dc;
    ha = (dtype*)malloc(rows*cols*sizeof(dtype));
    hb = (dtype*)malloc(rows*cols*sizeof(dtype));
    hc = (dtype*)malloc(rows*cols*sizeof(dtype));
    hc_ = (dtype*)malloc(rows*cols*sizeof(dtype));
    ptr_data_init(ha,rows*cols);
    ptr_data_init(hb,rows*cols);
    cudaMalloc((void**)&da,rows*cols*sizeof(dtype));
    cudaMalloc((void**)&db,rows*cols*sizeof(dtype));
    cudaMalloc((void**)&dc,rows*cols*sizeof(dtype));
    launch_cuda_matmul<dtype>(da,db,dc,rows,cols,blocksize,rows*cols);
    cudaMemcpy(hc,dc,rows*cols*sizeof(dtype),cudaMemcpyDeviceToHost);
    launch_cpu_matmul<dtype>(ha,hb,hc_,rows,cols);
    bool success  = check_cpu_cuda_cal_result<dtype>(hc,hc_,rows,cols);
    if(success){
        printf("%s CUDA && CPU get the same result!\n",typeid(dtype).name());
    }else{
        printf("%s CUDA && CPU get wrong result!\n",typeid(dtype).name());
    }
    free(ha);
    free(hb);
    free(hc);
    free(hc_);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return 0;
    
    
}

八.运行结果对齐

root@ubuntu:/datas/xk/02code/hpc/cuda/build# make -j32
[ 20%] Built target utils
[ 30%] Built target add
[ 40%] Built target mul
[ 55%] Built target matmul
[ 85%] Built target test_mul
[ 85%] Built target test_add
[ 90%] Building CXX object test/CMakeFiles/test_matmul.dir/test_matmul.cpp.o
[ 95%] Linking CUDA device code CMakeFiles/test_matmul.dir/cmake_device_link.o
[100%] Linking CXX executable ../bin/test_matmul
[100%] Built target test_matmul
root@ubuntu:/datas/xk/02code/hpc/cuda/build# ./bin/test_matmul 
h CUDA && CPU get the same result!
root@ubuntu:/datas/xk/02code/hpc/cuda/build#