#include<cuda_runtime_api.h>
#include<device_launch_parameters.h>
#include<stdio.h>
// Element-wise sum of two matrices whose rows hold 2 ints each: c = a + b.
//
// Every thread derives one (row, col) pair from its block and thread
// coordinates: the x coordinate becomes the first subscript (row) and the
// y coordinate becomes the second subscript (col). Only pairs with
// row < 2 and col < 2 are processed, so at most the leading 2x2 elements
// of c are written; threads that map outside that range return without
// touching memory.
//
// a, b: input matrices (read at [row][col]).
// c:    output matrix (written at [row][col]).
__global__ void add(int a[][2], int b[][2], int c[][2])
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: surplus threads of the launch grid have nothing to do.
    if (row >= 2 || col >= 2)
    {
        return;
    }

    c[row][col] = a[row][col] + b[row][col];
}
int main()
{
int (*a)[2] = new int[2][2];
int (*b)[2] = new int[2][2];
int (*c)[2] = new int[2][2];
a[0][0] = 1; a[0][1] = 2; a[1][0] = 3; a[1][1] = 4;
b[0][0] = 1; b[0][1] = 2; b[1][0] = 3; b[1][1] =4;
cudaError_t error = cudaSuccess;
int (*device_a)[2],(*device_b)[2],(*device_c)[2];
error = cudaMalloc((void **)&device_a, sizeof(int)* 4);
error = cudaMalloc((void **)&device_b, sizeof(int)* 4);
error = cudaMalloc((void **)&device_c, sizeof(int)* 4);
cudaMemcpy(device_a, a, sizeof(int)* 4, cudaMemcpyHostToDevice);
cudaMemcpy(device_b, b, sizeof(int)* 4, cudaMemcpyHostToDevice);
// dim3 threadsPerB
// CUDA 2D matrix addition
// (Web page residue: "latest recommended article published on 2025-01-18 20:54:40")