CUDA, Day 9: Large-Scale Matrix Multiplication

This post walks through a parallel matrix-multiplication implementation in CUDA, covering device memory allocation, host-to-device copies, the kernel launch, and copying the result back to the host, and shows how matrix operations can be executed efficiently on the GPU.
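
Each GPU thread computes exactly one element of the result matrix: element (row, col) is the dot product of row row of input1 with column col of input2, i.e. result[row][col] = sum over k of input1[row][k] * input2[k][col]. With 2x2 threads per block and an (8/2) x (8/2) = 4x4 grid, the launch below creates 8x8 = 64 threads, one for each element of the 8x8 result.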

#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
using namespace std;
typedef unsigned int u32;
#define ARRAY_SIZE_X 8
#define ARRAY_SIZE_Y 8
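// Host-side matrices: input1 is X x Y, input2 is Y x X, so their product is X x X.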
u32 cpu_mat_input1[ARRAY_SIZE_X][ARRAY_SIZE_Y];
u32 cpu_mat_input2[ARRAY_SIZE_Y][ARRAY_SIZE_X];
u32 cpu_mat_result[ARRAY_SIZE_X][ARRAY_SIZE_X];
__global__ void func2(
	u32 * gpu_mat_input1,
	u32 * gpu_mat_input2,
	u32 * gpu_mat_result,
	u32 Width,
	u32 Width1,
	u32 Width2);
int main(void)
{
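	// Fill both inputs with random 0/1 values so the product stays small and is easy to verify.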
	for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
	{
		for (u32 x = 0; x < ARRAY_SIZE_X; x++)
		{
			cpu_mat_input1[x][y] = rand() % 2;
			cpu_mat_input2[y][x] = rand() % 2;
		}
	}
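	// 2x2 threads per block and a (8/2) x (8/2) = 4x4 grid: one thread per element of the
	// 8x8 result (this coverage relies on ARRAY_SIZE_X == ARRAY_SIZE_Y).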
	dim3 threads_rect(2, 2);
	dim3 blocks_rect(ARRAY_SIZE_Y / 2, ARRAY_SIZE_X / 2);
	u32 * gpu_mat_input1;
	u32 * gpu_mat_input2;
	u32 * gpu_mat_result;
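	// Allocate device buffers for both inputs and the result (CUDA error checking is omitted here).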
	cudaMalloc((void **)& gpu_mat_input1, (ARRAY_SIZE_X)*(ARRAY_SIZE_Y)*(sizeof(u32)));
	cudaMalloc((void **)& gpu_mat_input2, (ARRAY_SIZE_Y)*(ARRAY_SIZE_X)*(sizeof(u32)));
	cudaMalloc((void **)& gpu_mat_result, (ARRAY_SIZE_X)*(ARRAY_SIZE_X)*(sizeof(u32)));
	cudaMemcpy(gpu_mat_input1, cpu_mat_input1, (ARRAY_SIZE_X)*(ARRAY_SIZE_Y)*(sizeof(u32)), cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_mat_input2, cpu_mat_input2, (ARRAY_SIZE_Y)*(ARRAY_SIZE_X)*(sizeof(u32)), cudaMemcpyHostToDevice);
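	// Launch the kernel: blockIdx/threadIdx map each thread to one result element.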
	func2<<<blocks_rect, threads_rect>>>(
		gpu_mat_input1,
		gpu_mat_input2,
		gpu_mat_result,
		ARRAY_SIZE_X,
		ARRAY_SIZE_Y,
		ARRAY_SIZE_X);
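	// Copy the result back to the host; this synchronous cudaMemcpy on the
	// default stream waits for the kernel above to finish first.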
	cudaMemcpy(cpu_mat_result, gpu_mat_result, (ARRAY_SIZE_X)*(ARRAY_SIZE_X)*(sizeof(u32)), cudaMemcpyDeviceToHost);
	cudaFree(gpu_mat_input1);
	cudaFree(gpu_mat_input2);
	cudaFree(gpu_mat_result);
	cout << "input1:" << endl;
	for (u32 x = 0; x < ARRAY_SIZE_X; x++)
	{
		for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
		{
			cout << cpu_mat_input1[x][y] << " ";
		}
		cout << endl;
	}
	cout << "input2:" << endl;
	for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
	{
		for (u32 x = 0; x < ARRAY_SIZE_X; x++)
		{
			cout << cpu_mat_input2[y][x] << " ";
		}
		cout << endl;
	}
	cout << "result:" << endl;
	for (u32 x = 0; x < ARRAY_SIZE_X; x++)
	{
		for (u32 y = 0; y < ARRAY_SIZE_X; y++)
		{
			cout << cpu_mat_result[x][y] << " ";
		}
		cout << endl;
	}
	printf("press any key to continue\n");
	cin.get();
	return 0;
}
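// One thread computes one element of the result matrix.
// Width  = number of columns of the result,
// Width1 = inner (shared) dimension, i.e. columns of input1,
// Width2 = number of columns of input2.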
__global__ void func2(
	u32 * gpu_mat_input1,
	u32 * gpu_mat_input2,
	u32 * gpu_mat_result,
	u32 Width,
	u32 Width1,
	u32 Width2)
{
	// Global column (idx) and row (idy) of the result element this thread computes.
	u32 idx = blockIdx.x*blockDim.x + threadIdx.x;
	u32 idy = blockIdx.y*blockDim.y + threadIdx.y;
	u32 Pvalue = 0;
	// Dot product of row idy of input1 with column idx of input2.
	for (u32 k = 0; k < Width1; ++k)
	{
		u32 a = gpu_mat_input1[idy*Width1 + k];
		u32 b = gpu_mat_input2[k*Width2 + idx];
		Pvalue = Pvalue + a*b;
	}
	gpu_mat_result[idy*Width + idx] = Pvalue;
}
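
For 8x8 matrices of 0/1 entries the output can be checked by eye, but a host-side reference multiplication makes the check automatic. Below is a minimal sketch of such a check (the function verify_against_cpu is not part of the original program); it reuses the global arrays defined above, recomputes the product on the CPU with the same row-major layout, and compares it element by element against cpu_mat_result. It could be called from main right after the device-to-host copy.

// Hypothetical host-side check (not in the original code): recompute the
// product on the CPU and compare it with the result copied back from the GPU.
bool verify_against_cpu(void)
{
	for (u32 row = 0; row < ARRAY_SIZE_X; row++)
	{
		for (u32 col = 0; col < ARRAY_SIZE_X; col++)
		{
			u32 expected = 0;
			for (u32 k = 0; k < ARRAY_SIZE_Y; k++)
			{
				expected += cpu_mat_input1[row][k] * cpu_mat_input2[k][col];
			}
			if (expected != cpu_mat_result[row][col])
			{
				cout << "mismatch at (" << row << "," << col << ")" << endl;
				return false;
			}
		}
	}
	return true;
}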
