CUDA, Day 9: Large-Scale Matrix Multiplication

This post walks through a parallel matrix-multiplication implementation in CUDA, covering device memory allocation, host-to-device copies, the kernel launch, and copying the result back to the host, and shows how matrix operations can be executed efficiently on the GPU.
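
Each GPU thread computes exactly one element of the result matrix: element (row, col) is the dot product of row row of input1 with column col of input2, i.e. result[row][col] = sum over k of input1[row][k] * input2[k][col]. With 2x2 threads per block and an (8/2) x (8/2) = 4x4 grid, the launch below creates 8x8 = 64 threads, one for each element of the 8x8 result.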

#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
using namespace std;
typedef unsigned int u32;
#define ARRAY_SIZE_X 8
#define ARRAY_SIZE_Y 8
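// Host-side matrices: input1 is X x Y, input2 is Y x X, so their product is X x X.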
u32 cpu_mat_input1[ARRAY_SIZE_X][ARRAY_SIZE_Y];
u32 cpu_mat_input2[ARRAY_SIZE_Y][ARRAY_SIZE_X];
u32 cpu_mat_result[ARRAY_SIZE_X][ARRAY_SIZE_X];
__global__ void func2(
	u32 * gpu_mat_input1,
	u32 * gpu_mat_input2,
	u32 * gpu_mat_result,
	u32 Width,
	u32 Width1,
	u32 Width2);
int main(void)
{
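	// Fill both inputs with random 0/1 values so the product stays small and is easy to verify.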
	for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
	{
		for (u32 x = 0; x < ARRAY_SIZE_X; x++)
		{
			cpu_mat_input1[x][y] = rand() % 2;
			cpu_mat_input2[y][x] = rand() % 2;
		}
	}
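	// 2x2 threads per block and a (8/2) x (8/2) = 4x4 grid: one thread per element of the
	// 8x8 result (this coverage relies on ARRAY_SIZE_X == ARRAY_SIZE_Y).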
	dim3 threads_rect(2, 2);
	dim3 blocks_rect(ARRAY_SIZE_Y / 2, ARRAY_SIZE_X / 2);
	u32 * gpu_mat_input1;
	u32 * gpu_mat_input2;
	u32 * gpu_mat_result;
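	// Allocate device buffers for both inputs and the result (CUDA error checking is omitted here).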
	cudaMalloc((void **)& gpu_mat_input1, (ARRAY_SIZE_X)*(ARRAY_SIZE_Y)*(sizeof(u32)));
	cudaMalloc((void **)& gpu_mat_input2, (ARRAY_SIZE_Y)*(ARRAY_SIZE_X)*(sizeof(u32)));
	cudaMalloc((void **)& gpu_mat_result, (ARRAY_SIZE_X)*(ARRAY_SIZE_X)*(sizeof(u32)));
	cudaMemcpy(gpu_mat_input1, cpu_mat_input1, (ARRAY_SIZE_X)*(ARRAY_SIZE_Y)*(sizeof(u32)), cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_mat_input2, cpu_mat_input2, (ARRAY_SIZE_Y)*(ARRAY_SIZE_X)*(sizeof(u32)), cudaMemcpyHostToDevice);
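	// Launch the kernel: blockIdx/threadIdx map each thread to one result element.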
	func2<<<blocks_rect, threads_rect>>>(
		gpu_mat_input1,
		gpu_mat_input2,
		gpu_mat_result,
		ARRAY_SIZE_X,
		ARRAY_SIZE_Y,
		ARRAY_SIZE_X);
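	// Copy the result back to the host; this synchronous cudaMemcpy on the
	// default stream waits for the kernel above to finish first.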
	cudaMemcpy(cpu_mat_result, gpu_mat_result, (ARRAY_SIZE_X)*(ARRAY_SIZE_X)*(sizeof(u32)), cudaMemcpyDeviceToHost);
	cudaFree(gpu_mat_input1);
	cudaFree(gpu_mat_input2);
	cudaFree(gpu_mat_result);
	cout << "input1:" << endl;
	for (u32 x = 0; x < ARRAY_SIZE_X; x++)
	{
		for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
		{
			cout << cpu_mat_input1[x][y] << " ";
		}
		cout << endl;
	}
	cout << "input2:" << endl;
	for (u32 y = 0; y < ARRAY_SIZE_Y; y++)
	{
		for (u32 x = 0; x < ARRAY_SIZE_X; x++)
		{
			cout << cpu_mat_input2[y][x] << " ";
		}
		cout << endl;
	}
	cout << "result:" << endl;
	for (u32 x = 0; x < ARRAY_SIZE_X; x++)
	{
		for (u32 y = 0; y < ARRAY_SIZE_X; y++)
		{
			cout << cpu_mat_result[x][y] << " ";
		}
		cout << endl;
	}
	printf("press any key to continue\n");
	cin.get();
	return 0;
}
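// One thread computes one element of the result matrix.
// Width  = number of columns of the result,
// Width1 = inner (shared) dimension, i.e. columns of input1,
// Width2 = number of columns of input2.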
__global__ void func2(
	u32 * gpu_mat_input1,
	u32 * gpu_mat_input2,
	u32 * gpu_mat_result,
	u32 Width,
	u32 Width1,
	u32 Width2)
{
	// Global column (idx) and row (idy) of the result element this thread computes.
	u32 idx = blockIdx.x*blockDim.x + threadIdx.x;
	u32 idy = blockIdx.y*blockDim.y + threadIdx.y;
	u32 Pvalue = 0;
	// Dot product of row idy of input1 with column idx of input2.
	for (u32 k = 0; k < Width1; ++k)
	{
		u32 a = gpu_mat_input1[idy*Width1 + k];
		u32 b = gpu_mat_input2[k*Width2 + idx];
		Pvalue = Pvalue + a*b;
	}
	gpu_mat_result[idy*Width + idx] = Pvalue;
}
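
For 8x8 matrices of 0/1 entries the output can be checked by eye, but a host-side reference multiplication makes the check automatic. Below is a minimal sketch of such a check (the function verify_against_cpu is not part of the original program); it reuses the global arrays defined above, recomputes the product on the CPU with the same row-major layout, and compares it element by element against cpu_mat_result. It could be called from main right after the device-to-host copy.

// Hypothetical host-side check (not in the original code): recompute the
// product on the CPU and compare it with the result copied back from the GPU.
bool verify_against_cpu(void)
{
	for (u32 row = 0; row < ARRAY_SIZE_X; row++)
	{
		for (u32 col = 0; col < ARRAY_SIZE_X; col++)
		{
			u32 expected = 0;
			for (u32 k = 0; k < ARRAY_SIZE_Y; k++)
			{
				expected += cpu_mat_input1[row][k] * cpu_mat_input2[k][col];
			}
			if (expected != cpu_mat_result[row][col])
			{
				cout << "mismatch at (" << row << "," << col << ")" << endl;
				return false;
			}
		}
	}
	return true;
}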
