Matrix addition and matrix multiplication with CUDA.
Code:
#include <iostream>
#include <cuda.h>
using namespace std;
#define N 2
// Element-wise matrix addition: C = A + B for square N x N matrices.
// Expects a 2-D launch in which the (x, y) thread coordinates map to
// (row, column); out-of-range threads return without touching memory.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: the grid may be larger than the matrix.
    if (row >= N || col >= N)
        return;

    C[row][col] = A[row][col] + B[row][col];
}
// Matrix multiplication: C = A * B for square N x N matrices.
// Expects a 2-D launch in which the (x, y) thread coordinates map to
// (row, column) of the output element this thread computes.
__global__ void MatMultiple(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
    {
        // Dot product of row i of A with column j of B.
        float sum = 0.0f;
        for (int k = 0; k < N; k++)
        {
            sum += A[i][k] * B[k][j];
        }
        // Store moved inside the bounds guard: the original wrote C[i][j]
        // unconditionally, an out-of-bounds global store for any thread
        // whose (i, j) falls outside the matrix.
        C[i][j] = sum;
    }
}
// Abort main with a diagnostic on any CUDA runtime failure; kernel and API
// errors are otherwise silent and surface as garbage output later.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            cout << "CUDA error at " << __FILE__ << ":" << __LINE__          \
                 << ": " << cudaGetErrorString(err_) << endl;                \
            return 1;                                                        \
        }                                                                    \
    } while (0)

// Host driver: uploads two N x N matrices, multiplies them on the device,
// copies the product back, and prints it. Returns 0 on success, 1 on any
// CUDA error.
int main()
{
    // Host matrices (N x N).
    float A[N][N] = {{ 1, 1}, {2, 2}};
    float B[N][N] = {{ 3, 3}, {4, 4}};
    float C[N][N] = {0};

    // Device buffers; row type uses N instead of the original hard-coded 2.
    float (*dev_a)[N], (*dev_b)[N], (*dev_c)[N];
    const size_t bytes = N * N * sizeof(float);
    CUDA_CHECK(cudaMalloc((void**) &dev_a, bytes));
    CUDA_CHECK(cudaMalloc((void**) &dev_b, bytes));
    CUDA_CHECK(cudaMalloc((void**) &dev_c, bytes));
    CUDA_CHECK(cudaMemcpy(dev_a, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, B, bytes, cudaMemcpyHostToDevice));

    // Ceiling division so the grid covers all of N even when the block
    // dimensions do not divide it evenly (the original truncated).
    dim3 threadsPerBlock(1, 1);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    //MatAdd<<<numBlocks,threadsPerBlock>>>(dev_a,dev_b,dev_c);  // addition demo, enable as needed

    MatMultiple<<<numBlocks, threadsPerBlock>>>(dev_a, dev_b, dev_c);
    CUDA_CHECK(cudaGetLastError());  // catches bad launch configuration
    // Blocking copy: also synchronizes with the kernel before reading C.
    CUDA_CHECK(cudaMemcpy(C, dev_c, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            cout << C[i][j] << " ";
        }
        cout << endl;
    }

    // Release device memory (the original leaked all three buffers).
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    return 0;
}
References:
https://troore.blog.ustc.edu.cn/?p=208
http://blog.youkuaiyun.com/freeboy1015/article/details/7335392