CUDA —— 2.1、cuda - 核函数（附完整代码演示）-CSDN博客

核函数（Kernel Function）

CUDA核函数（Kernel Function）是在GPU上并行执行的函数，是CUDA编程的核心。

核函数定义方式：具体书写形式如下，void与__global__先后书写均可。

__global__ void kernelName function(argument arg)
{
	printf("Hello World from the GPU!\n");
}

void __global__ kernelName function(argument arg)
{
	printf("Hello World from the GPU!\n");
}

核函数调用方式

/*
	gridDim：		网格维度（线程块数量）
	blockDim：		线程块维度（每块线程数）
	sharedMemSize：	动态共享内存大小（字节，可选）
	stream：		CUDA流（可选）
*/
kernelName<<<gridDim, blockDim, sharedMemSize, stream>>>(arguments);

核函数关键约束

1、返回值：必须返回 void，不能有返回值

2、只能访问GPU内存(显存)

3、不能使用变长参数、不能使用静态变量、不能使用函数指针

4、调用限制：默认只能从主机(CPU)调用；从设备端启动核函数（包括递归启动）需要使用CUDA Dynamic Parallelism

核函数限定符

在这里插入图片描述

核函数执行模型

在这里插入图片描述

并行性实现：
所有线程同时执行相同代码（SIMT架构）

硬件映射：

线程块 → GPU流式多处理器(SM)

线程束(Warp) → 32线程 → SM的基本调度单位

执行流程

在这里插入图片描述

核函数简单示例1（一维网格、一维线程块）

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

/*
 * Demo kernel for a 1D grid of 1D thread blocks.
 * Every thread prints the grid size, the block size, its block index,
 * its thread index inside the block, and its flat global thread number.
 * Only the .x components of the built-in index variables are used.
 * Device-side printf is intended for debugging/demonstration only.
 */
__global__ void hello_from_gpu()
{
	// Signed copies of the built-in values so they match the %d conversions.
	const int blocksInGrid = static_cast<int>(gridDim.x);
	const int threadsInBlock = static_cast<int>(blockDim.x);
	const int blockId = static_cast<int>(blockIdx.x);
	const int localTid = static_cast<int>(threadIdx.x);

	// Flat global thread number: skip all threads of the preceding blocks,
	// then add this thread's offset inside its own block.
	const int globalTid = blockId * threadsInBlock + localTid;

	printf("网格大小:%d\t线程块大小:%d\t线程块索引:%d\t所在线程块的线程索引:%d\t线程编号:%d\n",
		blocksInGrid,
		threadsInBlock,
		blockId,
		localTid,
		globalTid);
}

/*
 * Host entry point: launches hello_from_gpu on a 1D grid of 2 blocks with
 * 4 threads per block, then waits for the device to finish.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(void)
{
	hello_from_gpu<<<2, 4>>>();

	// A kernel launch does not return a status; configuration errors are
	// reported through cudaGetLastError().
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// The launch is asynchronous and device-side printf output is buffered.
	// Without this synchronization the process may exit before the kernel
	// runs or before its output is flushed, so nothing would be printed.
	err = cudaDeviceSynchronize();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	return 0;
}

在这里插入图片描述

核函数简单示例2（二维网格、二维线程块）

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

/*
 * Demo kernel for a 2D grid of 2D thread blocks.
 * Every thread prints the grid and block dimensions, its linearised block ID
 * and in-block thread ID, its 2D block and thread indices, and its flat
 * global thread number. Only the .x/.y components of the built-in index
 * variables are used.
 * Device-side printf is intended for debugging/demonstration only.
 */
__global__ void hello_from_gpu()
{
	// Signed copies of the built-in values so they match the %d conversions.
	const int gridW = gridDim.x;
	const int gridH = gridDim.y;
	const int blockW = blockDim.x;
	const int blockH = blockDim.y;
	const int bx = blockIdx.x;
	const int by = blockIdx.y;
	const int tx = threadIdx.x;
	const int ty = threadIdx.y;

	// Row-major linear index of this block inside the grid.
	const int blockID = by * gridW + bx;
	// Row-major linear index of this thread inside its block.
	const int threadID = ty * blockW + tx;
	// Flat global number: all threads of the preceding blocks plus the local offset.
	const int threadsPerBlock = blockW * blockH;
	const int threadNumber = blockID * threadsPerBlock + threadID;

	printf("网格大小:(%d,%d)\t线程块大小:(%d,%d)\nblockID:%d,threadID:%d,\t线程块索引:(%d,%d)\t所在线程块的线程索引:(%d,%d)\t线程编号:%d\n\n",
		gridW, gridH,
		blockW, blockH,
		blockID, threadID,
		bx, by,
		tx, ty,
		threadNumber);
}

/*
 * Host entry point: launches hello_from_gpu on a 2x2 grid of blocks with
 * 4x4 threads per block, then waits for the device to finish.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(void)
{
	const dim3 grid(2, 2);
	const dim3 block(4, 4);
	hello_from_gpu<<<grid, block>>>();

	// A kernel launch does not return a status; configuration errors are
	// reported through cudaGetLastError().
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// The launch is asynchronous and device-side printf output is buffered.
	// Without this synchronization the process may exit before the kernel
	// runs or before its output is flushed, so nothing would be printed.
	err = cudaDeviceSynchronize();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	return 0;
}