Compute the product of matrix $\mathbf{A}$ and matrix $\mathbf{B}$, where $\mathbf{A} \in \mathbb{R}^{1000 \times 800}$ and $\mathbf{B} \in \mathbb{R}^{800 \times 1200}$.
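Concretely, every entry of the product $\mathbf{C} = \mathbf{A}\mathbf{B}$ is an inner product of a row of $\mathbf{A}$ with a column of $\mathbf{B}$; this standard definition is exactly what all three implementations below compute:

$$
C_{ij} = \sum_{k=1}^{800} A_{ik} B_{kj}, \qquad \mathbf{C} \in \mathbb{R}^{1000 \times 1200}.
$$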
In the code below, the verify_result function checks whether the result computed on the GPU matches the one computed on the CPU. MatrixMulCPU performs the multiplication on the CPU. MatrixMul_gm performs it on the GPU using only global memory, and MatrixMul_sm performs it on the GPU using shared memory.
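One caveat before the listing: the CUDA API calls below are never checked for errors. A common pattern is to wrap each call in a checking macro; the sketch below is my addition (the CUDA_CHECK name is hypothetical, not from the original code) and relies only on stdio.h, stdlib.h, and the CUDA runtime, all of which the program already includes.

// Hypothetical helper, not part of the original program: wraps a CUDA runtime
// call and aborts with a readable message instead of failing silently.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err__ = (call);                                   \
        if (err__ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err__), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

Each cudaMalloc or cudaMemcpy in the program could then be written as CUDA_CHECK(cudaMalloc(...)) and so on.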
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define TILE_WIDTH 20 // tile size for the shared-memory kernel; evenly divides A_row, A_col and B_col

const int A_row = 1000;
const int A_col = 800;
const int B_row = 800; // must equal A_col for the product to be defined
const int B_col = 1200;

int verify_result(int *input, int *ref, int row, int col);
void MatrixMulCPU(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col);
__global__ void MatrixMul_gm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col);
__global__ void MatrixMul_sm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col);

int main()
{
    int *A = (int*)malloc(A_row * A_col * sizeof(int));
    int *B = (int*)malloc(B_row * B_col * sizeof(int));
    int *C_ref = (int*)malloc(A_row * B_col * sizeof(int));
    int *C_gm = (int*)malloc(A_row * B_col * sizeof(int));
    int *C_sm = (int*)malloc(A_row * B_col * sizeof(int));

    // Fill each row of A with 6, 7, 8, ... and each row of B with 5, 6, 7, ...
    for (int i = 0; i < A_row; i++)
    {
        A[i * A_col] = 6;
        for (int j = 1; j < A_col; j++)
            A[i * A_col + j] = A[i * A_col + j - 1] + 1;
    }
    for (int i = 0; i < B_row; i++)
    {
        B[i * B_col] = 5;
        for (int j = 1; j < B_col; j++)
            B[i * B_col + j] = B[i * B_col + j - 1] + 1;
    }

    clock_t start, end;
    start = clock();
    MatrixMulCPU(A, B, C_ref, A_row, A_col, B_col);
    end = clock();
    float t1 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("Matrix Multiplication on CPU took %.8f seconds.\n\n", t1);

    int *device_A, *device_B, *device_C_gm, *device_C_sm;
    cudaMalloc((void**)&device_A, sizeof(int) * A_row * A_col); // allocate device memory
    cudaMalloc((void**)&device_B, sizeof(int) * B_row * B_col);
    cudaMalloc((void**)&device_C_gm, sizeof(int) * A_row * B_col);
    cudaMalloc((void**)&device_C_sm, sizeof(int) * A_row * B_col);
    cudaMemcpy(device_A, A, sizeof(int) * A_row * A_col, cudaMemcpyHostToDevice); // copy host data to the device
    cudaMemcpy(device_B, B, sizeof(int) * B_row * B_col, cudaMemcpyHostToDevice);

    start = clock();
    int length = 4; // each block is length x length threads
    dim3 grid_gm(B_col / length, A_row / length); // B_col and A_row are divisible by length
    dim3 block_gm(length, length);
    MatrixMul_gm<<<grid_gm, block_gm>>>(device_A, device_B, device_C_gm, A_row, A_col, B_col); // run the kernel
    cudaMemcpy(C_gm, device_C_gm, sizeof(int) * A_row * B_col, cudaMemcpyDeviceToHost); // copy the result back to the host; this call also waits for the kernel to finish
    end = clock();
    float t2 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("Matrix Multiplication on GPU (Global Memory) took %.8f seconds.\n", t2);
    printf("Verify result is %s.\n", verify_result(C_gm, C_ref, A_row, B_col) == 1 ? "true" : "false");
    printf("The speedup is %.8f.\n\n", t1 / t2);

    start = clock();
    dim3 grid_sm(B_col / TILE_WIDTH, A_row / TILE_WIDTH);
    dim3 block_sm(TILE_WIDTH, TILE_WIDTH);
    MatrixMul_sm<<<grid_sm, block_sm>>>(device_A, device_B, device_C_sm, A_row, A_col, B_col); // run the kernel
    cudaMemcpy(C_sm, device_C_sm, sizeof(int) * A_row * B_col, cudaMemcpyDeviceToHost); // copy the result back to the host
    end = clock();
    float t3 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("Matrix Multiplication on GPU (Shared Memory) took %.8f seconds.\n", t3);
    printf("Verify result is %s.\n", verify_result(C_sm, C_ref, A_row, B_col) == 1 ? "true" : "false");
    printf("The speedup is %.8f.\n", t1 / t3);

    cudaFree(device_A); // free device memory
    cudaFree(device_B);
    cudaFree(device_C_gm);
    cudaFree(device_C_sm);
    free(A); free(B); free(C_ref); free(C_gm); free(C_sm); // free host memory
    return 0;
}

int verify_result(int *input, int *ref, int row, int col)
{
    for (int i = 0; i < row * col; i++)
        if (input[i] != ref[i]) return 0;
    return 1;
}

void MatrixMulCPU(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col)
{
    for (int i = 0; i < A_row; i++)
        for (int j = 0; j < B_col; j++)
        {
            int sum = 0;
            for (int k = 0; k < A_col; k++)
                sum += p_A[i * A_col + k] * p_B[k * B_col + j]; // p_A[i][k] * p_B[k][j]
            p_C[i * B_col + j] = sum; // p_C[i][j]
        }
}

__global__ void MatrixMul_gm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col)
{
    int sum = 0;
    // Locate the element this thread computes: y indexes the row, x indexes the column.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= A_row || col >= B_col) return;
    // Thread (row, col) computes C[row][col] straight from global memory.
    for (int i = 0; i < A_col; i++)
        sum += p_A[row * A_col + i] * p_B[i * B_col + col];
    p_C[row * B_col + col] = sum;
}

__global__ void MatrixMul_sm(int *p_A, int *p_B, int *p_C, int A_row, int A_col, int B_col)
{
    __shared__ int p_As[TILE_WIDTH][TILE_WIDTH]; // tile of A staged in shared memory
    __shared__ int p_Bs[TILE_WIDTH][TILE_WIDTH]; // tile of B staged in shared memory
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    int sum = 0;
    // Walk across the shared dimension one tile at a time
    // (this loop assumes A_col is a multiple of TILE_WIDTH, which holds here).
    for (int k = 0; k < A_col / TILE_WIDTH; k++)
    {
        // Each thread loads one element of the current A tile and one of the B tile.
        p_As[threadIdx.y][threadIdx.x] = p_A[row * A_col + (k * TILE_WIDTH + threadIdx.x)];
        p_Bs[threadIdx.y][threadIdx.x] = p_B[col + (k * TILE_WIDTH + threadIdx.y) * B_col];
        __syncthreads(); // wait until the whole tile is loaded
        for (int m = 0; m < TILE_WIDTH; m++)
            sum += p_As[threadIdx.y][m] * p_Bs[m][threadIdx.x];
        __syncthreads(); // make sure the tile is no longer needed before it is overwritten
    }
    p_C[row * B_col + col] = sum;
}
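To try the program, it can be compiled with nvcc; the file name matmul.cu below is my choice, not from the original:

nvcc -O2 -o matmul matmul.cu
./matmul

A note on the timings: clock() measures CPU time rather than wall-clock time on POSIX systems, and a kernel launch is asynchronous; it is the cudaMemcpy back to the host that makes each timed region wait for the GPU to finish. cudaEvent-based timing would isolate kernel time more precisely, but the clock()-based numbers are still sufficient to compare the three implementations.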