//--------------------------
//Tensor calculate
//--------------------------
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include "iostream"
using namespace std;
// Forward declaration of the tensor-contraction kernel (defined at the
// bottom of this file).
__global__ void tensorcalc(
int * gpu_A,
int * gpu_B,
int * gpu_M);
// Tensor extents: A is [Az][Ay][Ax], B is [Bz][By][Bx]. The contracted index
// pair must match on both operands, hence By = Az and Bx = Ay.
const int Ax = 4;
const int Bz = 8;
const int Az = 2, Ay = 2, By = Az, Bx = Ay;
// Output matrix M is My rows by Mx columns (Bz x Ax).
const int Mx = Ax, My = Bz;
// Launch geometry: a num_block_x * num_block_y grid of blocks, each block
// num_thread_x * num_thread_y threads (one thread per contracted element).
const int num_block_x = 2;
const int num_block_y = 2;
const int num_thread_x = Ay;
const int num_thread_y = Az;
// Device constant-memory mirrors of the geometry above, read by the kernel.
// Each block is responsible for block_length_A columns and block_length_B
// rows of M. NOTE(review): the kernel's reduction assumes Ay*Az is a power
// of two (it uses shift arithmetic) — holds for the sizes above.
__constant__ const int block_length_A = Ax / num_block_x;
__constant__ const int block_length_B = Bz / num_block_y;
__constant__ const int blockAx = Ax;
__constant__ const int blockdimx = Ay;
__constant__ const int blockdimy = Az;
// CUDA_CHECK: abort with file/line and the runtime's error string on any
// failing CUDA API call. Kernel launches do not return an error directly:
// configuration errors surface via cudaGetLastError() and in-kernel faults
// surface at the next synchronizing call, so both are checked after launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Host driver: fills A[Az][Ay][Ax] and B[Bz][By][Bx] with random values in
// {-1, 0, 1}, contracts them over the shared (Ay, Az) index pair on both the
// CPU and the GPU, and prints both My x Mx results for visual comparison.
int main()
{
    // declare tensors and the two result matrices (CPU reference vs GPU)
    int A[Az][Ay][Ax];
    int B[Bz][By][Bx];
    int M_cpu[My][Mx];
    int M_gpu[My][Mx];

    // fill both tensors with entries in {-1, 0, 1}
    // (loop nesting order kept so the rand() sequence matches the original)
    for (int i = 0; i < Ax; i++)
        for (int j = 0; j < Ay; j++)
            for (int k = 0; k < Az; k++)
                A[k][j][i] = rand() % 3 - 1;
    for (int i = 0; i < Bx; i++)
        for (int j = 0; j < By; j++)
            for (int k = 0; k < Bz; k++)
                B[k][j][i] = rand() % 3 - 1;

    // show A, one Az x Ay slice per x index
    for (int i = 0; i < Ax; i++)
    {
        printf("Ax=%d:\n", i);
        for (int k = 0; k < Az; k++)
        {
            for (int j = 0; j < Ay; j++)
                printf("%4d", A[k][j][i]);
            printf("\n");
        }
    }
    // show B, one By x Bx slice per z index
    for (int i = 0; i < Bz; i++)
    {
        printf("Bz=%d:\n", i);
        for (int k = 0; k < By; k++)
        {
            for (int j = 0; j < Bx; j++)
                printf("%4d", B[i][k][j]);
            printf("\n");
        }
    }

    // CPU reference: M[j][i] = sum over (i2, j2) of A[j2][i2][i] * B[j][j2][i2]
    for (int i = 0; i < Mx; i++)
    {
        for (int j = 0; j < My; j++)
        {
            int tmp = 0;
            for (int i2 = 0; i2 < Ay; i2++)
                for (int j2 = 0; j2 < Az; j2++)
                    tmp = tmp + A[j2][i2][i] * B[j][j2][i2];
            M_cpu[j][i] = tmp;
        }
    }

    // GPU path: allocate, copy inputs, launch, copy result back, free.
    int * gpu_A;
    int * gpu_B;
    int * gpu_M;
    CUDA_CHECK(cudaMalloc((void **)&gpu_A, sizeof(int) * Ax * Ay * Az));
    CUDA_CHECK(cudaMalloc((void **)&gpu_B, sizeof(int) * Bx * By * Bz));
    CUDA_CHECK(cudaMalloc((void **)&gpu_M, sizeof(int) * Mx * My));
    dim3 block(num_block_x, num_block_y);
    dim3 thread(num_thread_x, num_thread_y);
    CUDA_CHECK(cudaMemcpy(gpu_A, A, sizeof(int) * Ax * Ay * Az, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(gpu_B, B, sizeof(int) * Bx * By * Bz, cudaMemcpyHostToDevice));
    tensorcalc<<<block, thread>>>(gpu_A, gpu_B, gpu_M);
    CUDA_CHECK(cudaGetLastError());      // catch launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize()); // catch in-kernel execution errors
    CUDA_CHECK(cudaMemcpy(M_gpu, gpu_M, sizeof(int) * Mx * My, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(gpu_A));
    CUDA_CHECK(cudaFree(gpu_B));
    CUDA_CHECK(cudaFree(gpu_M));

    // show CPU result
    cout << "cpu:" << endl;
    for (int i = 0; i < Mx; i++)
    {
        for (int j = 0; j < My; j++)
            printf("%4d", M_cpu[j][i]);
        printf("\n");
    }
    // show GPU result (should match the CPU reference exactly: integer math)
    cout << "gpu:" << endl;
    for (int i = 0; i < Mx; i++)
    {
        for (int j = 0; j < My; j++)
            printf("%4d", M_gpu[j][i]);
        printf("\n");
    }
    cin.get(); // keep the console window open
    return 0;
}
// Tensor-contraction kernel. Expects the 2D launch configured in main():
// grid (num_block_x, num_block_y), block (Ay, Az) threads. Each block
// computes a block_length_B x block_length_A tile of the output matrix
// gpu_M (My x Mx, row-major): it stages its slices of gpu_A and gpu_B in
// shared memory, then for every (i, j) element of the tile forms the
// tlx*tly partial products and sums them with a shift-based tree reduction.
// NOTE(review): the reduction arithmetic assumes tlx*tly is a power of two,
// which holds for the constants in this file (Ay*Az = 4) — confirm before
// changing the extents.
__global__ void tensorcalc(
int * gpu_A,
int * gpu_B,
int * gpu_M)
{
//declare: block/thread coordinates and compile-time tile dimensions
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
const int tAx = blockAx;   // row stride of the output / x-extent of A
const int tlx = blockdimx; // threads per block in x (= Ay)
const int tly = blockdimy; // threads per block in y (= Az)
// Shared staging buffers: A_tmp/B_tmp hold this block's input slices,
// M_tmp holds one partial product per thread during the reduction.
__shared__ int A_tmp[block_length_A][tly][tlx];
__shared__ int B_tmp[block_length_B][tly][tlx];
__shared__ int M_tmp[tly*tlx];
//Assignment: each thread loads one (ty, tx) element of every slice it needs.
// A is laid out [Az][Ay][Ax] row-major, so element (ty, tx, column) sits at
// ty*Ay*Ax + tx*Ax + column; this block owns columns bx*block_length_A ...
for (int i = 0; i < block_length_A; i++)
{
A_tmp[i][ty][tx] = gpu_A[ty *tAx*tlx + tx*tAx + i+bx*block_length_A];
}
// B is laid out [Bz][By][Bx]; this block owns rows by*block_length_B ...
for (int j = 0; j < block_length_B; j++)
{
B_tmp[j][ty][tx] = gpu_B[(by*block_length_B + j)*tly*tlx + ty*tlx + tx];
}
// barrier: all loads must land before any thread reads the shared tiles
__syncthreads();
//calculate one output element per (i, j) pair in this block's tile
for (int i = 0; i < block_length_A; i++)
{
for (int j = 0; j < block_length_B; j++)
{
int p = ty*tlx + tx; // this thread's flat slot in M_tmp
//Assignment M_tmp: one partial product per thread
M_tmp[p] = A_tmp[i][ty][tx] * B_tmp[j][ty][tx];
__syncthreads();
//calculate turn: ktmp = floor(log2(tlx*tly)), number of reduction rounds
int ktmp = -1;
int ktmp2 = tlx*tly;
while (ktmp2)
{
ktmp2 = ktmp2 >> 1;
ktmp++;
}
//calculate: tree reduction; round k halves the active range, each active
// thread accumulating its partner from the upper half. The barrier is
// outside the `if`, so every thread in the block reaches it.
for (int k = 1; k < ktmp + 1; k++)
{
int tmp1 = ((tlx*tly) >> k);
if (p < tmp1)
{
int tmp2 = 1 << (ktmp - k);
M_tmp[p] = M_tmp[p] + M_tmp[p + tmp2];
}
__syncthreads();
}
//return: write the reduced sum to M[row ktmpy][col ktmpx]. Every thread
// stores the same M_tmp[0] value — redundant but benign.
int ktmpy = (block_length_B*blockIdx.y + j);
int ktmpx = block_length_A*blockIdx.x + i;
int ktmp3 = ktmpy*blockAx + ktmpx;
gpu_M[ktmp3] = M_tmp[0];
//return: barrier before the next iteration overwrites M_tmp
__syncthreads();
}
}
}
// cuda, day-13, tensorcalc — tensor calculation
// (source: blog post; latest recommended article published 2024-10-23 01:07:03)