Two ways to implement matrix multiplication with CUDA

Method 1: Write the kernel yourself

Create a .cu file. The kernel below tiles A and B into BLOCK_SIZE x BLOCK_SIZE sub-matrices, stages each tile in shared memory, and has each thread accumulate one element of C.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"

template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float* C, float* A, float* B, int wA, int wB)
{
    int bx = blockIdx.x;   // Block index x
    int by = blockIdx.y;   // Block index y
    int tx = threadIdx.x;  // Thread index x within the block (global x = threadIdx.x + blockDim.x * blockIdx.x)
    int ty = threadIdx.y;  // Thread index y within the block (global y = threadIdx.y + blockDim.y * blockIdx.y)
    
    int aBegin = wA * BLOCK_SIZE * by;  // first index of A's tile row for this block (320*32*by; A's rows split into 10 tile rows)
    int aEnd = aBegin + wA - 1;         // last index of that tile row (320*32*by + 319)
    int aStep = BLOCK_SIZE;             // step to A's next tile: 32 columns to the right
    int bBegin = BLOCK_SIZE * bx;       // first index of B's tile column for this block (32*bx; B's columns split into 20 tile columns)
    int bStep = BLOCK_SIZE * wB;        // step to B's next tile: 32 rows down (32*640)
    float Csub = 0;
    // Loop over all tile pairs of A and B: a = aBegin:aStep:aEnd, b = bBegin:bStep:end
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];  // shared-memory tile of A (32*32)
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];  // shared-memory tile of B (32*32)
        As[ty][tx] = A[a + wA * ty + tx];   // load matrices from device to shared
        Bs[ty][tx] = B[b + wB * ty + tx];   // load matrices from device to shared
        __syncthreads();                    // make sure the matrices are loaded
#pragma unroll  // ask the compiler to unroll the loop
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];  // C_ty_tx = sum_k(A_ty_k * B_k_tx)
        }
        __syncthreads();                    // make sure computation is done
    }

    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;  // first index of this block's tile in C
    C[c + wB * ty + tx] = Csub;                      // write the result to device memory
}
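
/*
 * Tiling scheme for the example sizes used in main below (wA = 320,
 * wB = 640, BLOCK_SIZE = 32): each thread block computes one 32x32 tile
 * of C, so the grid is 20 x 10 blocks. Block (bx, by) walks wA / 32 = 10
 * tile pairs, a 32x32 tile from A's block-row `by` and one from B's
 * block-column `bx`, multiplying them out of shared memory and
 * accumulating the per-thread partial sum in Csub.
 */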

void ConstantInit(float* data, int size, float val) {
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

int MatrixMultiply(int block_size, const dim3& dimsA, const dim3& dimsB) 
{
    // 1. Allocate host memory
    float* h_A, * h_B, * h_C;
    unsigned int size_A = dimsA.x * dimsA.y;           // 320*320
    unsigned int mem_size_A = sizeof(float) * size_A;  // memory size
    cudaMallocHost(&h_A, mem_size_A);                  // Allocate host memory
    unsigned int size_B = dimsB.x * dimsB.y;           // 640*320
    unsigned int mem_size_B = sizeof(float) * size_B;  // memory size
    cudaMallocHost(&h_B, mem_size_B);                  // Allocate host memory

    ConstantInit(h_A, size_A, 1.0f);  // initialize host memory
    ConstantInit(h_B, size_B, 0.1f);  // initialize host memory

    dim3 dimsC(dimsB.x, dimsA.y, 1);                   // 640*320*1
    unsigned int size_C = dimsC.x * dimsC.y;           // 640*320
    unsigned int mem_size_C = sizeof(float) * size_C;  // memory size
    cudaMallocHost(&h_C, mem_size_C);                  // Allocate host memory
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    // 2. Allocate device memory
    float* d_A, * d_B, * d_C;
    cudaMalloc(reinterpret_cast<void**>(&d_A), mem_size_A);
    cudaMalloc(reinterpret_cast<void**>(&d_B), mem_size_B);
    cudaMalloc(reinterpret_cast<void**>(&d_C), mem_size_C);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);  // Allocate CUDA events for timing
    cudaEventCreate(&stop);   // Allocate CUDA events for timing

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);  // non-blocking stream

    // 3. Copy host memory to device memory
    cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream);  // copy
    cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream);  // copy

    dim3 threads(block_size, block_size);                 // 32*32
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);  // 20*10

    // 4. Compute on GPU
    printf("Computing result using CUDA Kernel...\n");
    MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
    printf("done\n");
    cudaStreamSynchronize(stream);

    // 5. Compute 300 times
    cudaEventRecord(start, stream);
    int nIter = 300;
    for (int j = 0; j < nIter; j++) {
        if (block_size == 32) {  // only the BLOCK_SIZE = 32 instantiation is compiled in this example
            MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);

    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
        static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
        " WorkgroupSize= %u threads/block\n",
        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

    // 6. Copy result from device to host
    cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    printf("Checking computed result for correctness: ");
    bool correct = true;
    double eps = 1.e-6;
    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err = fabs(h_C[i] - (dimsA.x * 0.1f));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;
        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
                h_C[i], dimsA.x * 0.1f, eps);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // 7. Clean up memory
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}

int main() 
{
    int num_devices;
    cudaGetDeviceCount(&num_devices);            // device count (queried but not used further here)

    cudaDeviceProp device_0_prop;
    cudaGetDeviceProperties(&device_0_prop, 0);  // device 0 properties (queried but not used further here)

    printf("[Matrix Multiply Using CUDA] - Starting...\n");
    int block_size = 32;
    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);  // 320*320
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);  // 640*320
    if (dimsA.x != dimsB.y) 
    {
        printf("Error: outer matrix dim must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    int matrix_result = MatrixMultiply(block_size, dimsA, dimsB);  // 320*320*1  640*320*1
    exit(matrix_result);
}
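
None of the CUDA calls above check their return codes, so a failure (for example, an out-of-memory cudaMalloc) would pass silently. Below is a minimal sketch of the kind of wrapper you could add; the macro name CUDA_CHECK is our own, not part of the CUDA API:

#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"

// Hypothetical helper: print file/line and abort when a CUDA call fails.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Example usage inside MatrixMultiply:
//   CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A), mem_size_A));
//   MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
//   CUDA_CHECK(cudaGetLastError());  // kernel launches report errors asynchronously

Compile and run with nvcc, e.g. nvcc -O2 matrixMul.cu -o matrixMul (the file name is assumed).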

Method 2: Use the cuBLAS library

Create a .cpp file. This version calls cublasSgemm for the GPU product and checks it against a CPU reference solution.

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

typedef struct _matrixSize
{
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;

void matrixMulCPU(float* C, const float* A, const float* B, unsigned int hA, unsigned int wA, unsigned int wB)
{
    for (unsigned int i = 0; i < hA; ++i)
        for (unsigned int j = 0; j < wB; ++j)
        {
            double sum = 0;

            for (unsigned int k = 0; k < wA; ++k)
            {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
            }

            C[i * wB + j] = (float)sum;
        }
}

void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void initializeCUDA(int& devID, int& iSizeMultiple, sMatrixSize& matrix_size)
{
    cudaDeviceProp deviceProp;
    cudaError_t error = cudaGetDeviceProperties(&deviceProp, devID);
    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d\n", error);
        exit(EXIT_FAILURE);
    }
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    int block_size = 32;
    matrix_size.uiWA = 3 * block_size * iSizeMultiple;
    matrix_size.uiHA = 4 * block_size * iSizeMultiple;
    matrix_size.uiWB = 2 * block_size * iSizeMultiple;
    matrix_size.uiHB = 3 * block_size * iSizeMultiple;
    matrix_size.uiWC = 2 * block_size * iSizeMultiple;
    matrix_size.uiHC = 4 * block_size * iSizeMultiple;

    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
        matrix_size.uiHA, matrix_size.uiWA,
        matrix_size.uiHB, matrix_size.uiWB,
        matrix_size.uiHC, matrix_size.uiWC);
}

int matrixMultiply(int devID, sMatrixSize& matrix_size)
{
    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*)malloc(mem_size_B);

    srand(2006);
    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate host memory for the result
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*)malloc(mem_size_C);

    // allocate device memory
    float* d_A, * d_B, * d_C;
    cudaMalloc((void**)&d_A, mem_size_A);
    cudaMalloc((void**)&d_B, mem_size_B);
    cudaMalloc((void**)&d_C, mem_size_C);

    // copy host to device
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    // setup execution parameters (threads/grid are left over from the hand-written
    // kernel version; cuBLAS chooses its own launch configuration internally)
    int block_size = 32;
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    printf("Computing result using CUBLAS...");
    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasHandle_t handle;
    cudaEvent_t start, stop;

    cublasCreate(&handle);

    // Perform a warmup operation with cuBLAS
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 
        &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);

    // Allocate CUDA events that we'll use for timing
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, NULL);  // Record the start event
    for (int j = 0; j < nIter; j++)
    {
        // note: cuBLAS is column-major, so the operand order is swapped (see the note after this listing)
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 
            &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
    }
    printf("done.\n");
    cudaEventRecord(stop, NULL);  // Record the stop event
    cudaEventSynchronize(stop);  // Wait for the stop event to complete

    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);

    // copy result from device to host
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    // Destroy the handle
    cublasDestroy(handle);

    // compute reference solution on the host CPU
    printf("Computing result using host CPU...");
    float* reference = (float*)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check the GPU result against the CPU reference
    bool correct = true;
    for (unsigned int i = 0; i < size_C; i++) {
        if (fabsf(h_C[i] - reference[i]) > 1.0e-4f * fabsf(reference[i])) {
            correct = false;
            break;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return correct ? EXIT_SUCCESS : EXIT_FAILURE;
}

int main(int argc, char** argv)
{
    printf("[Matrix Multiply CUBLAS] - Starting...\n");

    int devID = 0, sizeMult = 5;
    sMatrixSize matrix_size;
    initializeCUDA(devID, sizeMult, matrix_size);

    int matrix_result = matrixMultiply(devID, matrix_size);

    return matrix_result;
}
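
A note on the operand order in the cublasSgemm calls: cuBLAS assumes column-major storage, while the host arrays here are row-major. A row-major matrix reinterpreted as column-major is its transpose, so requesting B x A with swapped dimensions computes C^T = B^T x A^T = (A x B)^T, which, read back row-major, is exactly A x B. A minimal sketch of this convention as a wrapper (the function name sgemm_row_major is our own, not a cuBLAS API):

#include <cublas_v2.h>

// Sketch: C = A * B for row-major A (m x k), B (k x n), C (m x n).
// Column-major cuBLAS sees each row-major buffer as its transpose, so
// we ask for B * A with swapped dimensions; C comes back row-major.
cublasStatus_t sgemm_row_major(cublasHandle_t handle, int m, int n, int k,
                               const float* d_A, const float* d_B, float* d_C)
{
    const float alpha = 1.0f, beta = 0.0f;
    return cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                       n, m, k,   // dimensions of the transposed product
                       &alpha,
                       d_B, n,    // leading dimension = row length of B
                       d_A, k,    // leading dimension = row length of A
                       &beta,
                       d_C, n);   // leading dimension = row length of C
}

With the sizes above (m = uiHA, n = uiWB, k = uiWA) this reproduces the calls in matrixMultiply. Link against cuBLAS when building, e.g. nvcc matrixMulCUBLAS.cpp -lcublas -o matrixMulCUBLAS (the file name is assumed).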
