一. Multi-precision element-wise computation with a template function
1. The boolean vector_sum decides whether vectorized memory access is used;
2. The type T selects between float4 and int4 vectorized access:
   using Vec4 = typename std::conditional<std::is_same<T, float>::value, float4, int4>::type;
3. reinterpret_cast<Vec4*>(&...) reinterprets the element pointer as a vector pointer;
4. The leading * in *reinterpret_cast<Vec4*>(&A[idx]) dereferences that pointer, loading four elements at once;
5. x, y, z, w are the four lanes of the vectorized load;
6. The computed vector vec_c is written back to the result matrix C.
#include <cuda_fp16.h>
#include <type_traits>
#include <cstdio>
#include <cstdlib>

// Generic kernel: scalar path, or 4-wide vectorized path (float4 / int4).
template <typename T>
__global__ void elementwise(T* A, T* B, T* C, int total_elements, bool vector_sum) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (vector_sum) {
        // Vectorized path: each thread handles 4 consecutive elements.
        idx *= 4;
        if (idx + 3 < total_elements) {  // keep all four accesses in bounds
            using Vec4 = typename std::conditional<std::is_same<T, float>::value, float4, int4>::type;
            Vec4 vec_a = *reinterpret_cast<Vec4*>(&A[idx]);
            Vec4 vec_b = *reinterpret_cast<Vec4*>(&B[idx]);
            Vec4 vec_c;
            vec_c.x = vec_a.x + vec_b.x;
            vec_c.y = vec_a.y + vec_b.y;
            vec_c.z = vec_a.z + vec_b.z;
            vec_c.w = vec_a.w + vec_b.w;
            *reinterpret_cast<Vec4*>(&C[idx]) = vec_c;
        }
    } else {
        // Scalar path: one element per thread.
        if (idx < total_elements) {
            C[idx] = A[idx] + B[idx];
        }
    }
}
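Two assumptions behind the Vec4 path are worth making explicit: the cast to float4/int4 needs 16-byte-aligned addresses, which holds because cudaMalloc returns well-aligned pointers and idx is always a multiple of 4; and total_elements should be a multiple of 4, otherwise the idx + 3 bound check leaves a small tail unprocessed. A minimal host-side sanity-check sketch (the helper name is hypothetical, not part of the original code):

#include <cassert>
#include <cstdint>

// Hypothetical helper: verify the preconditions the vectorized kernel relies on.
template <typename T>
void check_vec4_assumptions(const T* dev_ptr, int total_elements) {
    // cudaMalloc guarantees generous alignment, so this should always hold.
    assert(reinterpret_cast<std::uintptr_t>(dev_ptr) % (4 * sizeof(T)) == 0);
    // Otherwise the last (total_elements % 4) elements are never written.
    assert(total_elements % 4 == 0);
}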
二. Vectorized access and computation with half2
1. Unlike int and float, half has no half4 type, only half2; the load is therefore done with __ldg(reinterpret_cast<const half2*>(&A[idx]));
2. __ldg() reads through the GPU's read-only data cache (the L1 read-only path), improving memory-access efficiency.
3. Fewer DRAM accesses: if the data is reused, __ldg() can serve it from that cache instead of repeatedly reading global memory.
4. Do not use __ldg() if A[idx] may be modified inside the kernel.
// Specialization for half: scalar path, or 2-wide vectorized path (half2).
template <>
__global__ void elementwise(half* A, half* B, half* C, int total_elements, bool vector_sum) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (vector_sum) {
        // Vectorized path: each thread handles 2 consecutive elements via half2.
        idx *= 2;
        if (idx + 1 < total_elements) {  // keep both accesses in bounds
            half2 vec_a = __ldg(reinterpret_cast<const half2*>(&A[idx]));
            half2 vec_b = __ldg(reinterpret_cast<const half2*>(&B[idx]));
            half2 vec_c = __hadd2(vec_a, vec_b);
            *reinterpret_cast<half2*>(&C[idx]) = vec_c;
        }
    } else {
        // Scalar path: one element per thread.
        if (idx < total_elements) {
            C[idx] = __hadd(A[idx], B[idx]);
        }
    }
}
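For reference, __hadd2 adds the two half lanes of its operands in one packed instruction. A minimal sketch of a hypothetical device helper that computes the same thing with the scalar intrinsics (useful only to illustrate what the packed call does):

// Hypothetical helper: lane-wise equivalent of __hadd2(a, b).
__device__ half2 hadd2_unpacked(half2 a, half2 b) {
    return __halves2half2(__hadd(__low2half(a),  __low2half(b)),
                          __hadd(__high2half(a), __high2half(b)));
}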
三. Data initialization with template functions
// Fill a host buffer with 0, 1, 2, ... converted to T.
template <typename T>
void ptr_data_init(T* p, int total_num) {
    for (int i = 0; i < total_num; ++i) {
        p[i] = static_cast<T>(i);
    }
}

// For half, convert explicitly through float with __float2half.
template <>
void ptr_data_init(half* p, int total_num) {
    for (int i = 0; i < total_num; ++i) {
        p[i] = __float2half((float)i);
    }
}
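One caveat for the half initializer: half represents integers exactly only up to 2048, so for large total_num the values 0, 1, 2, ... get rounded (the comparison later still passes because the CPU reference rounds the same way). If exactly representable inputs are wanted, a variant like the following sketch could be used (the name and the modulus are illustrative, not from the original):

// Hypothetical variant: keep every initial value exactly representable in half.
void ptr_data_init_exact(half* p, int total_num) {
    for (int i = 0; i < total_num; ++i) {
        p[i] = __float2half(static_cast<float>(i % 1024));
    }
}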
四. CUDA kernel launch function
template <typename T>
void launch_cuda_add(T* a, T* b, T* c, int total_nums, int blocksize, bool vector_load) {
    // Elements handled per thread: 4 for the float/int vectorized path,
    // 2 for the half2 path, 1 for the scalar path.
    int elems_per_thread = 1;
    if (vector_load) {
        elems_per_thread = std::is_same<T, half>::value ? 2 : 4;
    }
    int gridsize = (total_nums + blocksize * elems_per_thread - 1) / (blocksize * elems_per_thread);
    elementwise<T><<<gridsize, blocksize>>>(a, b, c, total_nums, vector_load);
}
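A worked example of the grid-size formula (the numbers and the float device buffers d_a, d_b, d_c are illustrative, not from the original): with 2^20 float elements, blocksize = 256 and vector_load = true, each thread covers 4 elements, so gridsize = (1048576 + 1024 - 1) / 1024 = 1024 blocks.

// Illustrative call: 1M floats, 256 threads per block, vectorized loads -> 1024 blocks.
launch_cuda_add<float>(d_a, d_b, d_c, 1 << 20, 256, /*vector_load=*/true);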
五. CPU reference launch function
// CPU reference: plain element-wise addition.
template <typename T>
void launch_cpu_add(T* a, T* b, T* c, int total_nums) {
    for (int j = 0; j < total_nums; ++j) {
        c[j] = a[j] + b[j];
    }
}

// For half, do the arithmetic in float on the host and convert back.
template <>
void launch_cpu_add(half* a, half* b, half* c, int total_nums) {
    for (int j = 0; j < total_nums; ++j) {
        c[j] = __float2half(__half2float(a[j]) + __half2float(b[j]));
    }
}
六. Result comparison function
// Compare GPU and CPU results element by element.
template <typename T>
void check_cuda_cpu_result(T* a, T* b, int total_nums) {
    bool same = true;
    for (int k = 0; k < total_nums; ++k) {
        if (fabs((float)a[k] - (float)b[k]) > 1e-5) {
            printf("a: %f b: %f diff: %f\n", (float)a[k], (float)b[k], fabs((float)a[k] - (float)b[k]));
            same = false;
        }
    }
    if (same) {
        printf("cuda and cpu get the same result!\n");
    }
}

// For half, convert to float on the host before comparing.
template <>
void check_cuda_cpu_result(half* a, half* b, int total_nums) {
    bool same = true;
    for (int k = 0; k < total_nums; ++k) {
        float diff = fabs(__half2float(a[k]) - __half2float(b[k]));
        if (diff > 1e-5) {
            printf("a: %f b: %f diff: %f\n", __half2float(a[k]), __half2float(b[k]), diff);
            same = false;
        }
    }
    if (same) {
        printf("cuda and cpu get the same result!\n");
    }
}
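The 1e-5 absolute tolerance only works here because both the CPU and GPU paths round through half in the same way; if one side computed in float throughout, a relative tolerance suited to half precision would be safer. A sketch of such a check (the function name and threshold are illustrative choices):

// Hypothetical relative comparison for half results.
bool close_half(half x, half y, float rel_tol = 1e-2f) {
    float fx = __half2float(x);
    float fy = __half2float(y);
    return fabsf(fx - fy) <= rel_tol * fmaxf(fabsf(fx), fabsf(fy)) + 1e-6f;
}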
root@7666a2ca87d3:/datas/xk/02code/hpc/interview# cd rewrite/
root@7666a2ca87d3:/datas/xk/02code/hpc/interview/rewrite# nvcc -arch=sm_86 elementwise.cu -o elementwise && ./elementwise
cuda and cpu get the same result!
root@7666a2ca87d3:/datas/xk/02code/hpc/interview/rewrite#
七. main function: pointer declarations, memory allocation, data copies, kernel launch, result check
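main relies on the macros DTYPE, VECTOR_LOAD, ROWS, COLS and BLOCKSIZE; since the nvcc command in the transcript above passes no -D flags, they are assumed to be #defined near the top of elementwise.cu. A minimal sketch of possible definitions (the concrete values are illustrative, not from the original):

// Illustrative configuration; the actual values are not given in the original.
#define DTYPE        float
#define VECTOR_LOAD  true
#define ROWS         1024
#define COLS         1024
#define BLOCKSIZE    256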
int main() {
    using dtype = DTYPE;             // element type, selected at compile time
    bool vector_load = VECTOR_LOAD;  // enable the vectorized path
    int rows = ROWS;
    int cols = COLS;
    int blocksize = BLOCKSIZE;
    int total_nums = rows * cols;
    int total_bytes = rows * cols * sizeof(dtype);

    // Host buffers: two inputs, GPU result, CPU reference result.
    dtype* ha = (dtype*)malloc(total_bytes);
    dtype* hb = (dtype*)malloc(total_bytes);
    dtype* hc = (dtype*)malloc(total_bytes);
    dtype* hc_ = (dtype*)malloc(total_bytes);
    ptr_data_init(ha, total_nums);
    ptr_data_init(hb, total_nums);
    ptr_data_init(hc, total_nums);

    // Device buffers and host-to-device copies.
    dtype* da;
    dtype* db;
    dtype* dc;
    Check_Cuda_Runtime(cudaMalloc((void**)&da, total_bytes));
    Check_Cuda_Runtime(cudaMalloc((void**)&db, total_bytes));
    Check_Cuda_Runtime(cudaMalloc((void**)&dc, total_bytes));
    Check_Cuda_Runtime(cudaMemcpy(da, ha, total_bytes, cudaMemcpyHostToDevice));
    Check_Cuda_Runtime(cudaMemcpy(db, hb, total_bytes, cudaMemcpyHostToDevice));

    // GPU add, copy back, CPU reference, then compare.
    launch_cuda_add<dtype>(da, db, dc, total_nums, blocksize, vector_load);
    Check_Cuda_Runtime(cudaMemcpy(hc, dc, total_bytes, cudaMemcpyDeviceToHost));
    launch_cpu_add<dtype>(ha, hb, hc_, total_nums);
    check_cuda_cpu_result<dtype>(hc, hc_, total_nums);

    free(ha);
    free(hb);
    free(hc);
    free(hc_);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    return 0;
}