//--------------------------
//Tensor calculate
//--------------------------
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include "iostream"
using namespace std;
// Forward declaration of the tensor-contraction kernel (defined at the
// bottom of this file).
__global__ void tensorcalc(
int * gpu_A,
int * gpu_B,
int * gpu_M);
// Tensor extents: A is [Az][Ay][Ax], B is [Bz][By][Bx]. The contracted index
// pair must match on both operands, hence By = Az and Bx = Ay.
const int Ax = 4;
const int Bz = 8;
const int Az = 2, Ay = 2, By = Az, Bx = Ay;
// Output matrix M is My rows by Mx columns (Bz x Ax).
const int Mx = Ax, My = Bz;
// Launch geometry: a num_block_x * num_block_y grid of blocks, each block
// num_thread_x * num_thread_y threads (one thread per contracted element).
const int num_block_x = 2;
const int num_block_y = 2;
const int num_thread_x = Ay;
const int num_thread_y = Az;
// Device constant-memory mirrors of the geometry above, read by the kernel.
// Each block is responsible for block_length_A columns and block_length_B
// rows of M. NOTE(review): the kernel's reduction assumes Ay*Az is a power
// of two (it uses shift arithmetic) — holds for the sizes above.
__constant__ const int block_length_A = Ax / num_block_x;
__constant__ const int block_length_B = Bz / num_block_y;
__constant__ const int blockAx = Ax;
__constant__ const int blockdimx = Ay;
__constant__ const int blockdimy = Az;
// CUDA_CHECK: abort with file/line and the runtime's error string on any
// failing CUDA API call. Kernel launches do not return an error directly:
// configuration errors surface via cudaGetLastError() and in-kernel faults
// surface at the next synchronizing call, so both are checked after launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Host driver: fills A[Az][Ay][Ax] and B[Bz][By][Bx] with random values in
// {-1, 0, 1}, contracts them over the shared (Ay, Az) index pair on both the
// CPU and the GPU, and prints both My x Mx results for visual comparison.
int main()
{
    // declare tensors and the two result matrices (CPU reference vs GPU)
    int A[Az][Ay][Ax];
    int B[Bz][By][Bx];
    int M_cpu[My][Mx];
    int M_gpu[My][Mx];

    // fill both tensors with entries in {-1, 0, 1}
    // (loop nesting order kept so the rand() sequence matches the original)
    for (int i = 0; i < Ax; i++)
        for (int j = 0; j < Ay; j++)
            for (int k = 0; k < Az; k++)
                A[k][j][i] = rand() % 3 - 1;
    for (int i = 0; i < Bx; i++)
        for (int j = 0; j < By; j++)
            for (int k = 0; k < Bz; k++)
                B[k][j][i] = rand() % 3 - 1;

    // show A, one Az x Ay slice per x index
    for (int i = 0; i < Ax; i++)
    {
        printf("Ax=%d:\n", i);
        for (int k = 0; k < Az; k++)
        {
            for (int j = 0; j < Ay; j++)
                printf("%4d", A[k][j][i]);
            printf("\n");
        }
    }
    // show B, one By x Bx slice per z index
    for (int i = 0; i < Bz; i++)
    {
        printf("Bz=%d:\n", i);
        for (int k = 0; k < By; k++)
        {
            for (int j = 0; j < Bx; j++)
                printf("%4d", B[i][k][j]);
            printf("\n");
        }
    }

    // CPU reference: M[j][i] = sum over (i2, j2) of A[j2][i2][i] * B[j][j2][i2]
    for (int i = 0; i < Mx; i++)
    {
        for (int j = 0; j < My; j++)
        {
            int tmp = 0;
            for (int i2 = 0; i2 < Ay; i2++)
                for (int j2 = 0; j2 < Az; j2++)
                    tmp = tmp + A[j2][i2][i] * B[j][j2][i2];
            M_cpu[j][i] = tmp;
        }
    }

    // GPU path: allocate, copy inputs, launch, copy result back, free.
    int * gpu_A;
    int * gpu_B;
    int * gpu_M;
    CUDA_CHECK(cudaMalloc((void **)&gpu_A, sizeof(int) * Ax * Ay * Az));
    CUDA_CHECK(cudaMalloc((void **)&gpu_B, sizeof(int) * Bx * By * Bz));
    CUDA_CHECK(cudaMalloc((void **)&gpu_M, sizeof(int) * Mx * My));
    dim3 block(num_block_x, num_block_y);
    dim3 thread(num_thread_x, num_thread_y);
    CUDA_CHECK(cudaMemcpy(gpu_A, A, sizeof(int) * Ax * Ay * Az, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(gpu_B, B, sizeof(int) * Bx * By * Bz, cudaMemcpyHostToDevice));
    tensorcalc<<<block, thread>>>(gpu_A, gpu_B, gpu_M);
    CUDA_CHECK(cudaGetLastError());      // catch launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize()); // catch in-kernel execution errors
    CUDA_CHECK(cudaMemcpy(M_gpu, gpu_M, sizeof(int) * Mx * My, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(gpu_A));
    CUDA_CHECK(cudaFree(gpu_B));
    CUDA_CHECK(cudaFree(gpu_M));

    // show CPU result
    cout << "cpu:" << endl;
    for (int i = 0; i < Mx; i++)
    {
        for (int j = 0; j < My; j++)
            printf("%4d", M_cpu[j][i]);
        printf("\n");
    }
    // show GPU result (should match the CPU reference exactly: integer math)
    cout << "gpu:" << endl;
    for (int i = 0; i < Mx; i++)
    {
        for (int j = 0; j < My; j++)
            printf("%4d", M_gpu[j][i]);
        printf("\n");
    }
    cin.get(); // keep the console window open
    return 0;
}
// Tensor-contraction kernel. Expects the 2D launch configured in main():
// grid (num_block_x, num_block_y), block (Ay, Az) threads. Each block
// computes a block_length_B x block_length_A tile of the output matrix
// gpu_M (My x Mx, row-major): it stages its slices of gpu_A and gpu_B in
// shared memory, then for every (i, j) element of the tile forms the
// tlx*tly partial products and sums them with a shift-based tree reduction.
// NOTE(review): the reduction arithmetic assumes tlx*tly is a power of two,
// which holds for the constants in this file (Ay*Az = 4) — confirm before
// changing the extents.
__global__ void tensorcalc(
int * gpu_A,
int * gpu_B,
int * gpu_M)
{
//declare: block/thread coordinates and compile-time tile dimensions
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
const int tAx = blockAx;   // row stride of the output / x-extent of A
const int tlx = blockdimx; // threads per block in x (= Ay)
const int tly = blockdimy; // threads per block in y (= Az)
// Shared staging buffers: A_tmp/B_tmp hold this block's input slices,
// M_tmp holds one partial product per thread during the reduction.
__shared__ int A_tmp[block_length_A][tly][tlx];
__shared__ int B_tmp[block_length_B][tly][tlx];
__shared__ int M_tmp[tly*tlx];
//Assignment: each thread loads one (ty, tx) element of every slice it needs.
// A is laid out [Az][Ay][Ax] row-major, so element (ty, tx, column) sits at
// ty*Ay*Ax + tx*Ax + column; this block owns columns bx*block_length_A ...
for (int i = 0; i < block_length_A; i++)
{
A_tmp[i][ty][tx] = gpu_A[ty *tAx*tlx + tx*tAx + i+bx*block_length_A];
}
// B is laid out [Bz][By][Bx]; this block owns rows by*block_length_B ...
for (int j = 0; j < block_length_B; j++)
{
B_tmp[j][ty][tx] = gpu_B[(by*block_length_B + j)*tly*tlx + ty*tlx + tx];
}
// barrier: all loads must land before any thread reads the shared tiles
__syncthreads();
//calculate one output element per (i, j) pair in this block's tile
for (int i = 0; i < block_length_A; i++)
{
for (int j = 0; j < block_length_B; j++)
{
int p = ty*tlx + tx; // this thread's flat slot in M_tmp
//Assignment M_tmp: one partial product per thread
M_tmp[p] = A_tmp[i][ty][tx] * B_tmp[j][ty][tx];
__syncthreads();
//calculate turn: ktmp = floor(log2(tlx*tly)), number of reduction rounds
int ktmp = -1;
int ktmp2 = tlx*tly;
while (ktmp2)
{
ktmp2 = ktmp2 >> 1;
ktmp++;
}
//calculate: tree reduction; round k halves the active range, each active
// thread accumulating its partner from the upper half. The barrier is
// outside the `if`, so every thread in the block reaches it.
for (int k = 1; k < ktmp + 1; k++)
{
int tmp1 = ((tlx*tly) >> k);
if (p < tmp1)
{
int tmp2 = 1 << (ktmp - k);
M_tmp[p] = M_tmp[p] + M_tmp[p + tmp2];
}
__syncthreads();
}
//return: write the reduced sum to M[row ktmpy][col ktmpx]. Every thread
// stores the same M_tmp[0] value — redundant but benign.
int ktmpy = (block_length_B*blockIdx.y + j);
int ktmpx = block_length_A*blockIdx.x + i;
int ktmp3 = ktmpy*blockAx + ktmpx;
gpu_M[ktmp3] = M_tmp[0];
//return: barrier before the next iteration overwrites M_tmp
__syncthreads();
}
}
}
// cuda, day-13, tensorcalc — tensor calculation
// (source: blog post; latest recommended article published 2024-10-23 01:07:03)