CUDA 对比三种在GPU上建立三维数组的方式的时间效率(更新)

对比三两种在GPU上建立三维数组的方式
直接在GPU建立全局变量三维数组
在CPU端为GPU上的三维数组开辟空间
在CPU端为GPU上的一维数组开辟空间,然后通过寻址的方式

代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <time.h>
using namespace std;

__device__ const int n = 6000;
__device__ const int m = 100;
__device__ double d_static[n][m][10];


__global__ void dynamic_cpu(double ***f_3);
__global__ void static_gpu();
__global__ void dynamic_one_gpu(double *a);


const int N = 6000;
const int M = 100;

clock_t a, b;
double t;
int main()
{
	double *arr;
	cudaMalloc((void**)(&arr), N*M * 10 * sizeof(double));
	
	double ***f_3 = (double***)malloc(N * sizeof(double***));
	double **f_2 = (double**)malloc(N * M * sizeof(double**));
	double *f_1 = (double*)malloc(N * M * 10 * sizeof(double*));

	double ***d_3;
	cudaMalloc((void**)(&d_3), N * sizeof(double***));
	double **d_2;
	cudaMalloc((void**)(&d_2), N*M * sizeof(double**));
	double *d_1;
	cudaMalloc((void**)(&d_1), N*M * 10 * sizeof(double));

	for (int i = 0; i < N*M * 10; i++)
	{
		f_1[i] = 0;
	}
	cudaMemcpy(d_1, f_1, N*M * 10 * sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(arr, f_1, N*M * 10 * sizeof(double), cudaMemcpyHostToDevice);
	for (int i = 0; i < N*M; i++)
	{
		f_2[i] = d_1 + i * 10;
	}
	cudaMemcpy(d_2, f_2, N*M * sizeof(double**), cudaMemcpyHostToDevice);
	for (int i = 0; i < N; i++)
	{
		f_3[i] = d_2 + M * i;
	}
	cudaMemcpy(d_3, f_3, N * sizeof(double***), cudaMemcpyHostToDevice);

	int dimx = 6;
	int dimy = 10;
	dim3 block(dimx, dimy);
	dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);

	a = clock();
	for (int i = 0; i < 1000; i++)
	{
		static_gpu << <grid, block >> > ();

	}
	cudaDeviceSynchronize();
	b = clock();
	t = (double)(b - a) / CLOCKS_PER_SEC;
	cout << "static_gpu=" << t << endl;

	a = clock();
	for (int i = 0; i < 1000; i++)
	{
		dynamic_cpu << <grid, block >> > (d_3);

	}
	cudaDeviceSynchronize();
	b = clock();
	t = (double)(b - a) / CLOCKS_PER_SEC;
	cout << "dynamic_cpu=" << t << endl;

	a = clock();
	for (int i = 0; i < 1000; i++)
	{
		dynamic_one_gpu << <grid, block >> > (arr);

	}
	cudaDeviceSynchronize();
	b = clock();
	t = (double)(b - a) / CLOCKS_PER_SEC;
	cout << "dynamic_one_gpu=" << t << endl;

	return 0;
}

__global__ void static_gpu()
{
	int X = threadIdx.x + blockIdx.x * blockDim.x;
	int Y = threadIdx.y + blockIdx.y * blockDim.y;
	if (X < n&&Y < m)
	{
		for (int i = 0; i < 10; i++)
		{
			d_static[X][Y][i] = (d_static[X][Y][i] + X * 0.2 + Y * 0.3 + i * 0.4) * 0.01;
		}
	}
}
__global__ void dynamic_cpu(double ***f_3)
{
	int X = threadIdx.x + blockIdx.x * blockDim.x;
	int Y = threadIdx.y + blockIdx.y * blockDim.y;
	if (X < n&&Y < m)
	{
		for (int i = 0; i < 10; i++)
		{
			f_3[X][Y][i] = (f_3[X][Y][i] + X * 0.2 + Y * 0.3 + i * 0.4) * 0.01;
		}
	}
}

__global__ void dynamic_one_gpu(double *a)
{
	int X = threadIdx.x + blockIdx.x * blockDim.x;
	int Y = threadIdx.y + blockIdx.y * blockDim.y;
	if (X < n&&Y < m)
	{
		for (int i = 0; i < 10; i++)
		{
			a[10 * (X + Y * N) + i] = (a[10 * (X + Y * N) + i] + X * 0.2 + Y * 0.3 + i * 0.4) * 0.01;
		}
	}
}

执行时间:

static_gpu=21.447
dynamic_cpu=11.669
dynamic_one_gpu=5.52

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值