Compute the product of matrix $\mathbf{A}$ and matrix $\mathbf{B}$, where $\mathbf{A} \in \mathbb{R}^{1000 \times 800}$ and $\mathbf{B} \in \mathbb{R}^{800 \times 1200}$.
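Concretely, every entry of the product $\mathbf{C} = \mathbf{A}\mathbf{B}$ is an inner product of a row of $\mathbf{A}$ with a column of $\mathbf{B}$; this standard definition is exactly what all three implementations below compute:

$$
C_{ij} = \sum_{k=1}^{800} A_{ik} B_{kj}, \qquad \mathbf{C} \in \mathbb{R}^{1000 \times 1200}.
$$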
In the code below, the verify_result function checks whether the result computed on the GPU matches the one computed on the CPU. MatrixMulCPU performs the multiplication on the CPU. MatrixMul_gm performs it on the GPU using only global memory, and MatrixMul_sm performs it on the GPU using shared memory.
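One caveat before the listing: the CUDA API calls below are never checked for errors. A common pattern is to wrap each call in a checking macro; the sketch below is my addition (the CUDA_CHECK name is hypothetical, not from the original code) and relies only on stdio.h, stdlib.h, and the CUDA runtime, all of which the program already includes.

// Hypothetical helper, not part of the original program: wraps a CUDA runtime
// call and aborts with a readable message instead of failing silently.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err__ = (call);                                   \
        if (err__ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err__), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

Each cudaMalloc or cudaMemcpy in the program could then be written as CUDA_CHECK(cudaMalloc(...)) and so on.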
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define TILE_WIDTH 20 // tile size for the shared-memory kernel; evenly divides A_row, A_col and B_col

const int A_row = 1000;
const int A_col = 800;
const int B_row = 800; // must equal A_col for the product to be defined
const int B_col = 1200;

int verify_result(int *input, int *ref, int row, int col);
void MatrixMulCPU(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col);
__global__ void MatrixMul_gm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col);
__global__ void MatrixMul_sm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col);

int main()
{
    int *A = (int*)malloc(A_row * A_col * sizeof(int));
    int *B = (int*)malloc(B_row * B_col * sizeof(int));
    int *C_ref = (int*)malloc(A_row * B_col * sizeof(int));
    int *C_gm = (int*)malloc(A_row * B_col * sizeof(int));
    int *C_sm = (int*)malloc(A_row * B_col * sizeof(int));

    // Fill each row of A with 6, 7, 8, ... and each row of B with 5, 6, 7, ...
    for (int i = 0; i < A_row; i++)
    {
        A[i * A_col] = 6;
        for (int j = 1; j < A_col; j++)
            A[i * A_col + j] = A[i * A_col + j - 1] + 1;
    }
    for (int i = 0; i < B_row; i++)
    {
        B[i * B_col] = 5;
        for (int j = 1; j < B_col; j++)
            B[i * B_col + j] = B[i * B_col + j - 1] + 1;
    }

    clock_t start, end;
    start = clock();
    MatrixMulCPU(A, B, C_ref, A_row, A_col, B_col);
    end = clock();
    float t1 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("Matrix Multiplication on CPU took %.8f seconds.\n\n", t1);

    int *device_A, *device_B, *device_C_gm, *device_C_sm;
    cudaMalloc((void**)&device_A, sizeof(int) * A_row * A_col); // allocate device memory
    cudaMalloc((void**)&device_B, sizeof(int) * B_row * B_col);
    cudaMalloc((void**)&device_C_gm, sizeof(int) * A_row * B_col);
    cudaMalloc((void**)&device_C_sm, sizeof(int) * A_row * B_col);
    cudaMemcpy(device_A, A, sizeof(int) * A_row * A_col, cudaMemcpyHostToDevice); // copy host data to the device
    cudaMemcpy(device_B, B, sizeof(int) * B_row * B_col, cudaMemcpyHostToDevice);

    start = clock();
    int length = 4; // each block is length x length threads
    dim3 grid_gm(B_col / length, A_row / length); // B_col and A_row are divisible by length
    dim3 block_gm(length, length);
    MatrixMul_gm<<<grid_gm, block_gm>>>(device_A, device_B, device_C_gm, A_row, A_col, B_col); // run the kernel
    cudaMemcpy(C_gm, device_C_gm, sizeof(int) * A_row * B_col, cudaMemcpyDeviceToHost); // copy the result back to the host; this call also waits for the kernel to finish
    end = clock();
    float t2 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("Matrix Multiplication on GPU (Global Memory) took %.8f seconds.\n", t2);
    printf("Verify result is %s.\n", verify_result(C_gm, C_ref, A_row, B_col) == 1 ? "true" : "false");
    printf("The speedup is %.8f.\n\n", t1 / t2);

    start = clock();
    dim3 grid_sm(B_col / TILE_WIDTH, A_row / TILE_WIDTH);
    dim3 block_sm(TILE_WIDTH, TILE_WIDTH);
    MatrixMul_sm<<<grid_sm, block_sm>>>(device_A, device_B, device_C_sm, A_row, A_col, B_col); // run the kernel
    cudaMemcpy(C_sm, device_C_sm, sizeof(int) * A_row * B_col, cudaMemcpyDeviceToHost); // copy the result back to the host
    end = clock();
    float t3 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("Matrix Multiplication on GPU (Shared Memory) took %.8f seconds.\n", t3);
    printf("Verify result is %s.\n", verify_result(C_sm, C_ref, A_row, B_col) == 1 ? "true" : "false");
    printf("The speedup is %.8f.\n", t1 / t3);

    cudaFree(device_A); // free device memory
    cudaFree(device_B);
    cudaFree(device_C_gm);
    cudaFree(device_C_sm);
    free(A); free(B); free(C_ref); free(C_gm); free(C_sm); // free host memory
    return 0;
}

int verify_result(int *input, int *ref, int row, int col)
{
    for (int i = 0; i < row * col; i++)
        if (input[i] != ref[i]) return 0;
    return 1;
}

void MatrixMulCPU(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col)
{
    for (int i = 0; i < A_row; i++)
        for (int j = 0; j < B_col; j++)
        {
            int sum = 0;
            for (int k = 0; k < A_col; k++)
                sum += p_A[i * A_col + k] * p_B[k * B_col + j]; // p_A[i][k] * p_B[k][j]
            p_C[i * B_col + j] = sum; // p_C[i][j]
        }
}

__global__ void MatrixMul_gm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col)
{
    int sum = 0;
    // Locate the element this thread computes: y indexes the row, x indexes the column.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= A_row || col >= B_col) return;
    // Thread (row, col) computes C[row][col] straight from global memory.
    for (int i = 0; i < A_col; i++)
        sum += p_A[row * A_col + i] * p_B[i * B_col + col];
    p_C[row * B_col + col] = sum;
}

__global__ void MatrixMul_sm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col)
{
    __shared__ int p_As[TILE_WIDTH][TILE_WIDTH]; // tile of A staged in shared memory
    __shared__ int p_Bs[TILE_WIDTH][TILE_WIDTH]; // tile of B staged in shared memory
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    int sum = 0;
    // Walk across the shared dimension one tile at a time
    // (this loop assumes A_col is a multiple of TILE_WIDTH, which holds here).
    for (int k = 0; k < A_col / TILE_WIDTH; k++)
    {
        // Each thread loads one element of the current A tile and one of the B tile.
        p_As[threadIdx.y][threadIdx.x] = p_A[row * A_col + (k * TILE_WIDTH + threadIdx.x)];
        p_Bs[threadIdx.y][threadIdx.x] = p_B[col + (k * TILE_WIDTH + threadIdx.y) * B_col];
        __syncthreads(); // wait until the whole tile is loaded
        for (int m = 0; m < TILE_WIDTH; m++)
            sum += p_As[threadIdx.y][m] * p_Bs[m][threadIdx.x];
        __syncthreads(); // make sure the tile is no longer needed before it is overwritten
    }
    p_C[row * B_col + col] = sum;
}
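To try the program, it can be compiled with nvcc; the file name matmul.cu below is my choice, not from the original:

nvcc -O2 -o matmul matmul.cu
./matmul

A note on the timings: clock() measures CPU time rather than wall-clock time on POSIX systems, and a kernel launch is asynchronous; it is the cudaMemcpy back to the host that makes each timed region wait for the GPU to finish. cudaEvent-based timing would isolate kernel time more precisely, but the clock()-based numbers are still sufficient to compare the three implementations.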