Kernels
Kernels are defined with __global__ and launched with the <<<...>>> execution configuration, which specifies the number of CUDA threads to use. Each thread that executes the kernel gets a unique thread ID, accessible inside the kernel through built-in variables.
// Kernel definition
__global__ void VecAdd(float* A, float* B, float* C)
{
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

int main()
{
    ...
    // Kernel invocation with N threads
    VecAdd<<<1, N>>>(A, B, C);
    ...
}
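The elided host code typically allocates device memory, copies the inputs over, launches the kernel, and copies the result back. Below is a minimal host-side sketch for the VecAdd kernel defined above; the value of N, the host arrays, and the d_* device pointer names are assumptions made for illustration, not part of the original example.

// Host-side sketch for launching the VecAdd kernel defined above.
// N, the host arrays, and the d_* pointer names are illustrative assumptions.
#include <cuda_runtime.h>

int main()
{
    const int N = 256;
    size_t size = N * sizeof(float);

    // Host input/output arrays
    float h_A[N], h_B[N], h_C[N];
    for (int i = 0; i < N; ++i) { h_A[i] = 1.0f; h_B[i] = 2.0f; }

    // Allocate device memory and copy the inputs to the device
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // One block of N threads, one thread per element
    VecAdd<<<1, N>>>(d_A, d_B, d_C);

    // Copy the result back and release device memory
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}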
Thread Hierarchy
Thread index vs. thread ID
1D block: the two are equal.
2D block: size = (Dx, Dy), thread index = (x, y), thread ID = x + y*Dx
3D block: size = (Dx, Dy, Dz), thread index = (x, y, z), thread ID = x + y*Dx + z*Dx*Dy
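A minimal sketch of the 3D formula written out inside a kernel (the kernel name ShowThreadId and the launch shown in the comment are assumptions for illustration):

// Illustrative kernel: flattens the 3D thread index into a linear thread ID
__global__ void ShowThreadId(int* ids)
{
    int tid = threadIdx.x
            + threadIdx.y * blockDim.x                  // y * Dx
            + threadIdx.z * blockDim.x * blockDim.y;    // z * Dx * Dy
    ids[tid] = tid;
}

// Example launch: one block of size (Dx, Dy, Dz) = (4, 2, 2), i.e. 16 threads
// ShowThreadId<<<1, dim3(4, 2, 2)>>>(d_ids);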
Grid of Thread Blocks
For a grid of blocks, a thread's global index along each dimension is
index.x = blockIdx.x * blockDim.x + threadIdx.x
index.y = blockIdx.y * blockDim.y + threadIdx.y
// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N],
                       float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    ...
    // Kernel invocation
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}
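Note that the launch above assumes N is a multiple of the block size in each dimension; otherwise the integer division drops the partial blocks at the edges. A common variant (my own sketch, not part of the guide's example) rounds the block count up with ceiling division; the boundary check if (i < N && j < N) in the kernel then masks out the extra threads.

// Grid sizing for arbitrary N: round the number of blocks up (ceiling division)
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
               (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);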
Memory Hierarchy