1. My cuBLAS column covers matrix multiplication in several forms
- cuBLAS column:
https://blog.youkuaiyun.com/weixin_55083979/category_12912641.html?spm=1001.2014.3001.5482
More matrix-multiplication algorithms will be added to the column over time; feel free to subscribe!
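For reference alongside the hand-written kernels below, a single-precision GEMM through cuBLAS looks roughly like this. This is a minimal sketch, not code from the column; cuBLAS assumes column-major storage, and the square-matrix leading dimensions here are chosen to match the 1024 x 1024 test case in section 7.

#include <cublas_v2.h>

// C = alpha * A * B + beta * C, all matrices n x n, column-major (cuBLAS convention)
void cublas_sgemm_square(cublasHandle_t handle, const float* dA, const float* dB, float* dC, int n) {
    const float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, n, n,
                &alpha, dA, n,
                dB, n,
                &beta, dC, n);
}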
2. CUDA template kernel: header declarations and source implementations
// Declarations (matmul.h); the half type requires cuda_fp16.h
#include <cuda_fp16.h>

// Generic kernel: one thread computes one element of C
template<typename T>
__global__ void cuda_matmul(T* A, T* B, T* C, int rows, int cols, int total_num);

// Full specialization for half: the arithmetic goes through the __hmul/__hadd intrinsics
template<>
__global__ void cuda_matmul(half* A, half* B, half* C, int rows, int cols, int total_num);
// Generic implementation: each thread computes one output element C[row_idx][col_idx].
// Example: with cols = 1024, global thread idx = 2050 maps to row_idx = 2, col_idx = 2.
// Note: B is indexed with a row stride of `rows`, so the indexing is consistent for the
// square matrices (rows == cols) used in the test below.
template<typename T>
__global__ void cuda_matmul(T* A, T* B, T* C, int rows, int cols, int total_num) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < total_num) {
        int row_idx = idx / cols;   // output row
        int col_idx = idx % cols;   // output column
        T temp = 0;
        for (int k = 0; k < cols; ++k) {
            temp += A[row_idx * cols + k] * B[col_idx + k * rows];
        }
        C[row_idx * cols + col_idx] = temp;
    }
}
// Specialization for half: the plain arithmetic operators for __half are not available on
// all architectures, so the accumulation uses the __hmul/__hadd intrinsics explicitly.
template<>
__global__ void cuda_matmul(half* A, half* B, half* C, int rows, int cols, int total_num) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < total_num) {
        int row_idx = idx / cols;
        int col_idx = idx % cols;
        half temp = __float2half(0.0f);
        for (int k = 0; k < cols; ++k) {
            temp = __hadd(temp, __hmul(A[row_idx * cols + k], B[col_idx + k * rows]));
        }
        C[row_idx * cols + col_idx] = temp;
    }
}
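Incidentally, the multiply and add in the half path can be fused with the __hfma intrinsic. This is an optional variant of the inner loop above, not what the code in this post uses:

// Optional variant: __hfma(a, b, c) computes a * b + c in half precision
for (int k = 0; k < cols; ++k) {
    temp = __hfma(A[row_idx * cols + k], B[col_idx + k * rows], temp);
}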
3. CUDA launch function: declaration and implementation
template<typename T>
void launch_cuda_matmul(T* A, T* B, T* C, int rows, int cols, int blocksize, int total_nums);

template<typename T>
void launch_cuda_matmul(T* A, T* B, T* C, int rows, int cols, int blocksize, int total_nums) {
    // Round up so that every output element gets a thread
    int gridsize = (total_nums + blocksize - 1) / blocksize;
    cuda_matmul<T><<<gridsize, blocksize>>>(A, B, C, rows, cols, total_nums);
}
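For the 1024 x 1024 case used in section 7 this works out to (1024 * 1024 + 32 - 1) / 32 = 32768 blocks of 32 threads, one thread per element of C. A hypothetical call with device pointers dA, dB, and dC would look like:

// Illustrative only: dA, dB, dC are device pointers allocated with cudaMalloc
launch_cuda_matmul<float>(dA, dB, dC, 1024, 1024, /*blocksize=*/32, /*total_nums=*/1024 * 1024);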
4. CPU reference function: declaration and implementation
// Generic CPU reference implementation
template<typename T>
void launch_cpu_matmul(T* A, T* B, T* C, int rows, int cols);

// Full specialization for half
template<>
void launch_cpu_matmul(half* A, half* B, half* C, int rows, int cols);
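The bodies of the CPU reference functions are not shown in this section. A minimal sketch of what they might look like, mirroring the indexing used by the CUDA kernels (square matrices; the half path accumulates in float via the cuda_fp16.h conversion helpers), is:

// Sketch only: not the original implementation from the matmul library
template<typename T>
void launch_cpu_matmul(T* A, T* B, T* C, int rows, int cols) {
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            T temp = 0;
            for (int k = 0; k < cols; ++k) {
                temp += A[i * cols + k] * B[j + k * rows];
            }
            C[i * cols + j] = temp;
        }
    }
}

template<>
void launch_cpu_matmul(half* A, half* B, half* C, int rows, int cols) {
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            float temp = 0.0f;  // accumulate in float on the host
            for (int k = 0; k < cols; ++k) {
                temp += __half2float(A[i * cols + k]) * __half2float(B[j + k * rows]);
            }
            C[i * cols + j] = __float2half(temp);
        }
    }
}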
5. Explicit template instantiation
// Explicit instantiations in matmul.cu, so that test_matmul.cpp (compiled as plain C++)
// can link against these templates without seeing their definitions.
template void launch_cuda_matmul<float>(float*, float*, float*, int, int, int, int);
template void launch_cuda_matmul<half>(half*, half*, half*, int, int, int, int);
template void launch_cuda_matmul<uint8_t>(uint8_t*, uint8_t*, uint8_t*, int, int, int, int);
template void launch_cpu_matmul<float>(float*, float*, float*, int, int);
template void launch_cpu_matmul<half>(half*, half*, half*, int, int);
template void launch_cpu_matmul<uint8_t>(uint8_t*, uint8_t*, uint8_t*, int, int);
6. Building the static library and the test executables
add_library(matmul STATIC matmul.cu)
set_property(TARGET matmul PROPERTY CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET matmul PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET matmul PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# Link the add and utils libraries
# In CMake's target_link_libraries(), PRIVATE and PUBLIC control the visibility of a dependency:
# PRIVATE: applies only to the current target (executable or library) and is not propagated to targets that depend on it.
# PUBLIC: applies to the current target and is also propagated to targets that depend on it.
add_executable(test_add test_add.cpp)
target_link_libraries(test_add PRIVATE add utils PUBLIC -lcudart -lcudadevrt)
add_executable(test_mul test_mul.cpp)
target_link_libraries(test_mul PRIVATE mul utils PUBLIC -lcudart -lcudadevrt)
add_executable(test_matmul test_matmul.cpp)
target_link_libraries(test_matmul PRIVATE matmul utils PUBLIC -lcudart -lcudadevrt)
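These snippets assume a top-level CMakeLists.txt that enables the CUDA language and defines the utils/add/mul libraries. A minimal sketch of that surrounding setup (the project name, architecture values, and directory layout are assumptions, not taken from the original project) might be:

cmake_minimum_required(VERSION 3.18)
set(CMAKE_CUDA_ARCHITECTURES 70 75 80)                        # assumption: pick values matching your GPUs
project(hpc_cuda LANGUAGES CXX CUDA)                          # project name is hypothetical
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)   # executables land in build/bin, as in section 8
add_subdirectory(src)    # libraries: utils, add, mul, matmul
add_subdirectory(test)   # executables: test_add, test_mul, test_matmul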
7. test_matmul.cpp test program
#include "src/utils/utils.h"
#include "src/matmul/matmul.h"
#define ROWS 1024
#define COLS 1024
// #define DTYPE float
#define DTYPE half
// #define DTYPE uint8_t
#define BL0CKSIZE 32
int main(){
int rows = ROWS;
int cols = COLS;
int blocksize = BL0CKSIZE;
using dtype = DTYPE;
dtype* ha;
dtype* hb;
dtype* hc;
dtype* hc_;
dtype* da;
dtype* db;
dtype* dc;
ha = (dtype*)malloc(rows*cols*sizeof(dtype));
hb = (dtype*)malloc(rows*cols*sizeof(dtype));
hc = (dtype*)malloc(rows*cols*sizeof(dtype));
hc_ = (dtype*)malloc(rows*cols*sizeof(dtype));
ptr_data_init(ha,rows*cols);
ptr_data_init(hb,rows*cols);
cudaMalloc((void**)&da,rows*cols*sizeof(dtype));
cudaMalloc((void**)&db,rows*cols*sizeof(dtype));
cudaMalloc((void**)&dc,rows*cols*sizeof(dtype));
launch_cuda_matmul<dtype>(da,db,dc,rows,cols,blocksize,rows*cols);
cudaMemcpy(hc,dc,rows*cols*sizeof(dtype),cudaMemcpyDeviceToHost);
launch_cpu_matmul<dtype>(ha,hb,hc_,rows,cols);
bool success = check_cpu_cuda_cal_result<dtype>(hc,hc_,rows,cols);
if(success){
printf("%s CUDA && CPU get the same result!\n",typeid(dtype).name());
}else{
printf("%s CUDA && CPU get wrong result!\n",typeid(dtype).name());
}
free(ha);
free(hb);
free(hc);
free(hc_);
cudaFree(da);
cudaFree(db);
cudaFree(dc);
return 0;
}
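ptr_data_init and check_cpu_cuda_cal_result come from the utils library, whose source is not reproduced in this post. A minimal sketch of what such helpers could look like (the signatures are inferred from the calls above; the random range and tolerance are assumptions):

// Sketch only: the real helpers live in src/utils
#include <cstdlib>
#include <cmath>
#include <cuda_fp16.h>

// Fill a buffer with small values (conversion through float so half works too)
template<typename T>
void ptr_data_init(T* ptr, int n) {
    for (int i = 0; i < n; ++i) {
        ptr[i] = static_cast<T>(static_cast<float>(rand() % 4));
    }
}

// Compare CUDA and CPU results element by element
template<typename T>
bool check_cpu_cuda_cal_result(T* cuda_res, T* cpu_res, int rows, int cols) {
    for (int i = 0; i < rows * cols; ++i) {
        float a = static_cast<float>(cuda_res[i]);
        float b = static_cast<float>(cpu_res[i]);
        if (std::fabs(a - b) > 1e-1f) {  // tolerance is an assumption; half accumulation needs a loose threshold
            return false;
        }
    }
    return true;
}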
8. Run results: the CUDA and CPU outputs match
root@ubuntu:/datas/xk/02code/hpc/cuda/build# make -j32
[ 20%] Built target utils
[ 30%] Built target add
[ 40%] Built target mul
[ 55%] Built target matmul
[ 85%] Built target test_mul
[ 85%] Built target test_add
[ 90%] Building CXX object test/CMakeFiles/test_matmul.dir/test_matmul.cpp.o
[ 95%] Linking CUDA device code CMakeFiles/test_matmul.dir/cmake_device_link.o
[100%] Linking CXX executable ../bin/test_matmul
[100%] Built target test_matmul
root@ubuntu:/datas/xk/02code/hpc/cuda/build# ./bin/test_matmul
h CUDA && CPU get the same result!
root@ubuntu:/datas/xk/02code/hpc/cuda/build#