Method 1: Write it yourself
Create a .cu file.
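The file can be compiled with nvcc; a typical command line, with the file name matmul.cu assumed:

nvcc -O2 matmul.cu -o matmul

(Add -lcublas when linking the cuBLAS version discussed further below.)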
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"
template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float* C, float* A, float* B, int wA, int wB)
{
    int bx = blockIdx.x;  // Block index
    int by = blockIdx.y;  // Block index
    int tx = threadIdx.x; // Thread index (global x would be threadIdx.x + blockDim.x * blockIdx.x)
    int ty = threadIdx.y; // Thread index (global y would be threadIdx.y + blockDim.y * blockIdx.y)
    int aBegin = wA * BLOCK_SIZE * by; // first tile of A for this block; A's rows split into tile-rows (e.g. 320*32*by, 10 tile-rows)
    int aEnd = aBegin + wA - 1;        // end of that tile-row (320*32*by + 319, 320 values per row)
    int aStep = BLOCK_SIZE;            // A advances 32 columns per tile
    int bBegin = BLOCK_SIZE * bx;      // first tile of B for this block; B's columns split into 20 tile-columns of 320*32 values
    int bStep = BLOCK_SIZE * wB;       // B advances 32 rows per tile (32*640)
    float Csub = 0;
    // a = wA*32*by : 32 : wA*32*by+319    b = 32*bx : 32*wB : end
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; // shared-memory tile, 32*32
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; // shared-memory tile, 32*32
        As[ty][tx] = A[a + wA * ty + tx]; // load the tiles from device memory to shared memory
        Bs[ty][tx] = B[b + wB * ty + tx];
        __syncthreads(); // make sure the tiles are fully loaded
#pragma unroll // unroll the inner loop
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx]; // C[ty][tx] = sum_k A[ty][k] * B[k][tx]
        }
        __syncthreads(); // make sure the computation is done before loading the next tiles
    }
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; // write the block's result to device memory
    C[c + wB * ty + tx] = Csub;
}
void ConstantInit(float* data, int size, float val) {
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}
int MatrixMultiply(int block_size, const dim3& dimsA, const dim3& dimsB)
{
    // 1. Allocate host memory
    float* h_A, * h_B, * h_C;
    unsigned int size_A = dimsA.x * dimsA.y;          // 320*320
    unsigned int mem_size_A = sizeof(float) * size_A; // memory size in bytes
    cudaMallocHost(&h_A, mem_size_A);                 // allocate pinned host memory
    unsigned int size_B = dimsB.x * dimsB.y;          // 640*320
    unsigned int mem_size_B = sizeof(float) * size_B; // memory size in bytes
    cudaMallocHost(&h_B, mem_size_B);                 // allocate pinned host memory
    ConstantInit(h_A, size_A, 1.0f);  // fill A with a constant
    ConstantInit(h_B, size_B, 0.01f); // fill B with a constant
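    // The listing above stops at host initialization; what follows is a minimal
    // sketch of the remaining steps (device allocation, transfer, launch,
    // copy-back), with the init values 1.0/0.01 and the names d_A/d_B/d_C
    // assumed for illustration.
    unsigned int size_C = dimsB.x * dimsA.y;          // 640*320
    unsigned int mem_size_C = sizeof(float) * size_C;
    cudaMallocHost(&h_C, mem_size_C);
    // 2. Allocate device memory
    float* d_A, * d_B, * d_C;
    cudaMalloc(&d_A, mem_size_A);
    cudaMalloc(&d_B, mem_size_B);
    cudaMalloc(&d_C, mem_size_C);
    // 3. Copy the inputs from host to device
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    // 4. Set execution parameters and launch the kernel
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); // 20 x 10 blocks for the dims above
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    } else {
        MatrixMulCUDA<32><<<grid, threads>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    cudaDeviceSynchronize(); // wait for the kernel to finish
    // 5. Copy the result back to the host and clean up
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    // With A = 1.0 and B = 0.01, every element of C should equal wA * 0.01
    printf("C[0] = %f (expected %f)\n", h_C[0], dimsA.x * 0.01f);
    cudaFreeHost(h_A); cudaFreeHost(h_B); cudaFreeHost(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return EXIT_SUCCESS;
}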

The article details two ways to do matrix multiplication with CUDA: first, a hand-written templated kernel, covering the CUDA kernel function, device memory allocation, synchronization, and performance testing; second, matrix multiplication through the cuBLAS library, covering the setup of execution parameters, performance measurement, and result verification. Both methods include host-to-device data transfer, the computation itself, and the device-to-host copy of the result.
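The cuBLAS listing itself is not included in the excerpt above; what follows is a minimal sketch of the equivalent call, assuming the same device buffers and dims as in MatrixMultiply (the function name MatrixMultiplyCublas is illustrative). cuBLAS expects column-major storage, so the row-major product C = A*B is issued as the column-major product B*A:

#include <cublas_v2.h>

void MatrixMultiplyCublas(float* d_C, const float* d_A, const float* d_B,
                          const dim3& dimsA, const dim3& dimsB)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // Column-major GEMM: C' (wB x hA) = B' (wB x wA) * A' (wA x hA),
    // which is exactly the row-major C (hA x wB) = A * B
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                dimsB.x, dimsA.y, dimsA.x, // m = wB, n = hA, k = wA
                &alpha, d_B, dimsB.x,      // leading dimension of B is wB
                d_A, dimsA.x,              // leading dimension of A is wA
                &beta, d_C, dimsB.x);      // leading dimension of C is wB
    cudaDeviceSynchronize(); // the GEMM is asynchronous; wait before reading d_C
    cublasDestroy(handle);
}

A single handle can be created once and reused across calls; creating and destroying it per multiplication, as sketched here, is only for brevity.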