Two ways to implement matrix multiplication with CUDA

Method 1: Write the kernel yourself

Create a .cu file. The kernel below tiles A and B into BLOCK_SIZE x BLOCK_SIZE sub-matrices, stages each tile in shared memory, and has each thread accumulate one element of C.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"

template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float* C, float* A, float* B, int wA, int wB)
{
    int bx = blockIdx.x;   // Block index x
    int by = blockIdx.y;   // Block index y
    int tx = threadIdx.x;  // Thread index x within the block (global x = threadIdx.x + blockDim.x * blockIdx.x)
    int ty = threadIdx.y;  // Thread index y within the block (global y = threadIdx.y + blockDim.y * blockIdx.y)
    
    int aBegin = wA * BLOCK_SIZE * by;  // first index of A's tile row for this block (320*32*by; A's rows split into 10 tile rows)
    int aEnd = aBegin + wA - 1;         // last index of that tile row (320*32*by + 319)
    int aStep = BLOCK_SIZE;             // step to A's next tile: 32 columns to the right
    int bBegin = BLOCK_SIZE * bx;       // first index of B's tile column for this block (32*bx; B's columns split into 20 tile columns)
    int bStep = BLOCK_SIZE * wB;        // step to B's next tile: 32 rows down (32*640)
    float Csub = 0;
    // Loop over all tile pairs of A and B: a = aBegin:aStep:aEnd, b = bBegin:bStep:end
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];  // shared-memory tile of A (32*32)
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];  // shared-memory tile of B (32*32)
        As[ty][tx] = A[a + wA * ty + tx];   // load matrices from device to shared
        Bs[ty][tx] = B[b + wB * ty + tx];   // load matrices from device to shared
        __syncthreads();                    // make sure the matrices are loaded
#pragma unroll  // ask the compiler to unroll the loop
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];  // C_ty_tx = sum_k(A_ty_k * B_k_tx)
        }
        __syncthreads();                    // make sure computation is done
    }

    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;  // first index of this block's tile in C
    C[c + wB * ty + tx] = Csub;                      // write the result to device memory
}
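
/*
 * Tiling scheme for the example sizes used in main below (wA = 320,
 * wB = 640, BLOCK_SIZE = 32): each thread block computes one 32x32 tile
 * of C, so the grid is 20 x 10 blocks. Block (bx, by) walks wA / 32 = 10
 * tile pairs, a 32x32 tile from A's block-row `by` and one from B's
 * block-column `bx`, multiplying them out of shared memory and
 * accumulating the per-thread partial sum in Csub.
 */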

void ConstantInit(float* data, int size, float val) {
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

int MatrixMultiply(int block_size, const dim3& dimsA, const dim3& dimsB) 
{
    // 1. Allocate host memory
    float* h_A, * h_B, * h_C;
    unsigned int size_A = dimsA.x * dimsA.y;           // 320*320
    unsigned int mem_size_A = sizeof(float) * size_A;  // memory size
    cudaMallocHost(&h_A, mem_size_A);                  // Allocate host memory
    unsigned int size_B = dimsB.x * dimsB.y;           // 640*320
    unsigned int mem_size_B = sizeof(float) * size_B;  // memory size
    cudaMallocHost(&h_B, mem_size_B);                  // Allocate host memory

    ConstantInit(h_A, size_A, 1.0f);  // initialize host memory
    ConstantInit(h_B, size_B, 0.1f);  // initialize host memory

    dim3 dimsC(dimsB.x, dimsA.y, 1);                   // 640*320*1
    unsigned int size_C = dimsC.x * dimsC.y;           // 640*320
    unsigned int mem_size_C = sizeof(float) * size_C;  // memory size
    cudaMallocHost(&h_C, mem_size_C);                  // Allocate host memory
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    // 2. Allocate device memory
    float* d_A, * d_B, * d_C;
    cudaMalloc(reinterpret_cast<void**>(&d_A), mem_size_A);
    cudaMalloc(reinterpret_cast<void**>(&d_B), mem_size_B);
    cudaMalloc(reinterpret_cast<void**>(&d_C), mem_size_C);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);  // Allocate CUDA events for timing
    cudaEventCreate(&stop);   // Allocate CUDA events for timing

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);  // non-blocking stream

    // 3. Copy host memory to device memory
    cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream);  // copy
    cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream);  // copy

    dim3 threads(block_size, block_size);                 // 32*32
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);  // 20*10

    // 4. Compute on GPU
    printf("Computing result using CUDA Kernel...\n");
    MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
    printf("done\n");
    cudaStreamSynchronize(stream);

    // 5. Compute 300 times
    cudaEventRecord(start, stream);
    int nIter = 300;
    for (int j = 0; j < nIter; j++) {
        if (block_size == 32) {  // only the BLOCK_SIZE = 32 instantiation is compiled in this example
            MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }
    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);
    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);

    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
        static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
        " WorkgroupSize= %u threads/block\n",
        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

    // 6. Copy result from device to host
    cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    printf("Checking computed result for correctness: ");
    bool correct = true;
    double eps = 1.e-6;
    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err = fabs(h_C[i] - (dimsA.x * 0.1f));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;
        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
                h_C[i], dimsA.x * 0.1f, eps);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // 7. Clean up memory
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}

int main() 
{
    int num_devices;
    cudaGetDeviceCount(&num_devices);            // device count (queried but not used further here)

    cudaDeviceProp device_0_prop;
    cudaGetDeviceProperties(&device_0_prop, 0);  // device 0 properties (queried but not used further here)

    printf("[Matrix Multiply Using CUDA] - Starting...\n");
    int block_size = 32;
    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);  // 320*320
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);  // 640*320
    if (dimsA.x != dimsB.y) 
    {
        printf("Error: outer matrix dim must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    int matrix_result = MatrixMultiply(block_size, dimsA, dimsB);  // 320*320*1  640*320*1
    exit(matrix_result);
}
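
None of the CUDA calls above check their return codes, so a failure (for example, an out-of-memory cudaMalloc) would pass silently. Below is a minimal sketch of the kind of wrapper you could add; the macro name CUDA_CHECK is our own, not part of the CUDA API:

#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"

// Hypothetical helper: print file/line and abort when a CUDA call fails.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Example usage inside MatrixMultiply:
//   CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A), mem_size_A));
//   MatrixMulCUDA<32> <<< grid, threads, 0, stream >>> (d_C, d_A, d_B, dimsA.x, dimsB.x);
//   CUDA_CHECK(cudaGetLastError());  // kernel launches report errors asynchronously

Compile and run with nvcc, e.g. nvcc -O2 matrixMul.cu -o matrixMul (the file name is assumed).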

Method 2: Use the cuBLAS library

Create a .cpp file. This version calls cublasSgemm for the GPU product and checks it against a CPU reference solution.

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

typedef struct _matrixSize
{
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;

void matrixMulCPU(float* C, const float* A, const float* B, unsigned int hA, unsigned int wA, unsigned int wB)
{
    for (unsigned int i = 0; i < hA; ++i)
        for (unsigned int j = 0; j < wB; ++j)
        {
            double sum = 0;

            for (unsigned int k = 0; k < wA; ++k)
            {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
            }

            C[i * wB + j] = (float)sum;
        }
}

void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void initializeCUDA(int& devID, int& iSizeMultiple, sMatrixSize& matrix_size)
{
    cudaDeviceProp deviceProp;
    cudaError_t error = cudaGetDeviceProperties(&deviceProp, devID);
    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d\n", error);
        exit(EXIT_FAILURE);
    }
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    int block_size = 32;
    matrix_size.uiWA = 3 * block_size * iSizeMultiple;
    matrix_size.uiHA = 4 * block_size * iSizeMultiple;
    matrix_size.uiWB = 2 * block_size * iSizeMultiple;
    matrix_size.uiHB = 3 * block_size * iSizeMultiple;
    matrix_size.uiWC = 2 * block_size * iSizeMultiple;
    matrix_size.uiHC = 4 * block_size * iSizeMultiple;

    printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
        matrix_size.uiHA, matrix_size.uiWA,
        matrix_size.uiHB, matrix_size.uiWB,
        matrix_size.uiHC, matrix_size.uiWC);
}

int matrixMultiply(int devID, sMatrixSize& matrix_size)
{
    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*)malloc(mem_size_B);

    srand(2006);
    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate host memory for the result
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*)malloc(mem_size_C);

    // allocate device memory
    float* d_A, * d_B, * d_C;
    cudaMalloc((void**)&d_A, mem_size_A);
    cudaMalloc((void**)&d_B, mem_size_B);
    cudaMalloc((void**)&d_C, mem_size_C);

    // copy host to device
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    // setup execution parameters (threads/grid are left over from the hand-written
    // kernel version; cuBLAS chooses its own launch configuration internally)
    int block_size = 32;
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    printf("Computing result using CUBLAS...");
    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasHandle_t handle;
    cudaEvent_t start, stop;

    cublasCreate(&handle);

    // Perform a warmup operation with cuBLAS
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 
        &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);

    // Allocate CUDA events that we'll use for timing
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, NULL);  // Record the start event
    for (int j = 0; j < nIter; j++)
    {
        // note: cuBLAS is column-major, so the operand order is swapped (see the note after this listing)
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 
            &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
    }
    printf("done.\n");
    cudaEventRecord(stop, NULL);  // Record the stop event
    cudaEventSynchronize(stop);  // Wait for the stop event to complete

    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, start, stop);

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul);

    // copy result from device to host
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    // Destroy the handle
    cublasDestroy(handle);

    // compute reference solution on the host CPU
    printf("Computing result using host CPU...");
    float* reference = (float*)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check the GPU result against the CPU reference
    bool correct = true;
    for (unsigned int i = 0; i < size_C; i++) {
        if (fabsf(h_C[i] - reference[i]) > 1.0e-4f * fabsf(reference[i])) {
            correct = false;
            break;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return correct ? EXIT_SUCCESS : EXIT_FAILURE;
}

int main(int argc, char** argv)
{
    printf("[Matrix Multiply CUBLAS] - Starting...\n");

    int devID = 0, sizeMult = 5;
    sMatrixSize matrix_size;
    initializeCUDA(devID, sizeMult, matrix_size);

    int matrix_result = matrixMultiply(devID, matrix_size);

    return matrix_result;
}
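
A note on the operand order in the cublasSgemm calls: cuBLAS assumes column-major storage, while the host arrays here are row-major. A row-major matrix reinterpreted as column-major is its transpose, so requesting B x A with swapped dimensions computes C^T = B^T x A^T = (A x B)^T, which, read back row-major, is exactly A x B. A minimal sketch of this convention as a wrapper (the function name sgemm_row_major is our own, not a cuBLAS API):

#include <cublas_v2.h>

// Sketch: C = A * B for row-major A (m x k), B (k x n), C (m x n).
// Column-major cuBLAS sees each row-major buffer as its transpose, so
// we ask for B * A with swapped dimensions; C comes back row-major.
cublasStatus_t sgemm_row_major(cublasHandle_t handle, int m, int n, int k,
                               const float* d_A, const float* d_B, float* d_C)
{
    const float alpha = 1.0f, beta = 0.0f;
    return cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                       n, m, k,   // dimensions of the transposed product
                       &alpha,
                       d_B, n,    // leading dimension = row length of B
                       d_A, k,    // leading dimension = row length of A
                       &beta,
                       d_C, n);   // leading dimension = row length of C
}

With the sizes above (m = uiHA, n = uiWB, k = uiWA) this reproduces the calls in matrixMultiply. Link against cuBLAS when building, e.g. nvcc matrixMulCUBLAS.cpp -lcublas -o matrixMulCUBLAS (the file name is assumed).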
