【CUDA编程】GPU上的topK程序

原创已于 2025-03-28 19:59:49 修改 · 269 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#算法 #c++ #gpu算力 #深度学习 #CUDA

于 2025-03-21 10:47:14 首次发布

CUDA编程的一些例子

- 基于堆排序的topK
- 2路topK算法

最近在学习CUDA编程，写了一点简单的程序，刚开始用学习GPU编程，踩了很多坑，特此记录一下

基于堆排序的topK

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>


#define K 10

__device__ void mySwap(int& num1, int& num2) {
	int tmp = num1;
	num1 = num2;
	num2 = tmp;
}

// 保持堆的特性
__device__ void heapify(int* data, int idx, int maxSize) {
	if (idx >= maxSize)
		return;
	int left = idx * 2 + 1, right = idx * 2 + 2;
	int maxIdx = idx;
	if (left < maxSize && data[left] < data[maxIdx])
		maxIdx = left;
	if (right < maxSize && data[right] < data[maxIdx])
		maxIdx = right;
	if (maxIdx != idx) {
		mySwap(data[idx], data[maxIdx]);
		heapify(data, maxIdx, maxSize);
	}
}


template<
int NUM_PER_BLOCK,
int NUM_PER_THREAD
>
__global__ void myTopK(int* input, int* output) {
	__shared__ int sdata[NUM_PER_BLOCK / NUM_PER_THREAD * K];
	int tid = threadIdx.x;
	int idx = blockDim.x * blockIdx.x + tid;

	int data[K];

	for (int i = K - 1; i >= 0; i--) {
		data[i] = input[i + NUM_PER_THREAD * idx];
		heapify(data, i, K);
	}
	for (int i = K; i < NUM_PER_THREAD; i++) {
		if (input[i + NUM_PER_THREAD * idx] > data[0]) {
			data[0] = input[i + NUM_PER_THREAD * idx];
			heapify(data, 0, K);
		}
	}
	for (int i = 0; i < K; i++) {
		sdata[idx * K + i] = data[i];
	}
	__syncthreads();
	int curNum = NUM_PER_BLOCK * K / NUM_PER_THREAD;

	while (true) {
		if (idx * NUM_PER_THREAD >= curNum)
			break;

		int sortNum = min((idx + 1) * NUM_PER_THREAD, curNum) - idx * NUM_PER_THREAD;
		if (sortNum <= K) {
			for (int i = 0; i < sortNum; i++) {
				data[i] = sdata[idx * NUM_PER_THREAD + i];
			}
		}
		else {
			for (int i = K - 1; i >= 0; i--) {
				data[i] = sdata[i + NUM_PER_THREAD * idx];
				heapify(data, i, K);
			}
			int end = min(NUM_PER_THREAD, sortNum); // 目前需要排序的数据量可能位于NUM_PER_THREAD和K之间，即NUM_PER_THREAD>sortNum>K
			for (int i = K; i < end; i++) {
				if (sdata[i + NUM_PER_THREAD * idx] > data[0]) {
					data[0] = sdata[i + NUM_PER_THREAD * idx];
					heapify(data, 0, K);
				}
			}
			sortNum = K;
		}
		__syncthreads();

		if (idx * NUM_PER_THREAD < curNum) {
			#pragma unroll
			for (int i = 0; i < sortNum; i++) {
				sdata[idx * K + i] = data[i];
			}
		}
		__syncthreads();

		curNum = curNum * K / NUM_PER_THREAD;
	}
	if (tid == 0) {
		for (int i = 0; i < K; i++) {
			output[i] = sdata[i];
		}
	}
}

int main() {
	const int BLOCK_SIZE = 256;
	const int N = 1024 * 1024;
	const int NUM_PER_BLOCK = N; // 1024
	const int NUM_PER_THREAD = NUM_PER_BLOCK / BLOCK_SIZE; // 4096

	assert(K < NUM_PER_THREAD);

	int* input = (int*)malloc(N * sizeof(int));
	int* output = (int*)malloc(K * sizeof(int));
	int* d_input;
	int* d_output;
	cudaError_t error = cudaMalloc(&d_input, N * sizeof(int));
	error = cudaMalloc(&d_output, K * sizeof(int));

	for (int i = 0; i < N; i++) {
		input[i] = rand() % 100;
	}

	for (int i = 0; i < K; i++) {
		int randIdx = rand() % N;
		input[randIdx] = i + 100;
		printf("input[%d] = %d\n", randIdx, input[randIdx]);
	}
	error = cudaMemcpy(d_input, input, N * sizeof(int), cudaMemcpyHostToDevice);

	dim3 BlockSize(BLOCK_SIZE, 1);
	myTopK <NUM_PER_BLOCK, NUM_PER_THREAD><< <1, BlockSize >> > (d_input, d_output);
	error = cudaMemcpy(output, d_output, K * sizeof(int), cudaMemcpyDeviceToHost);

	for (int i = 0; i < K; i++) {
		printf("output[%d] = %d.\n", i, output[i]);
	}
}

2路topK算法

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>



__device__ void mySwap(int& num1, int& num2) {
	int tmp = num1;
	num1 = num2;
	num2 = tmp;
}

// 保持堆的特性
__device__ void heapify(int* data, int idx, int maxSize) {
	if (idx >= maxSize)
		return;
	int left = idx * 2 + 1, right = idx * 2 + 2;
	int maxIdx = idx;
	if (left < maxSize && data[left] < data[maxIdx])
		maxIdx = left;
	if (right < maxSize && data[right] < data[maxIdx])
		maxIdx = right;
	if (maxIdx != idx) {
		mySwap(data[idx], data[maxIdx]);
		heapify(data, maxIdx, maxSize);
	}
}


template<
	int NUM_PER_BLOCK,
	int NUM_PER_THREAD,
	int K
>
__global__ void myTopK(int* input, int* output) {
	__shared__ int sdata[NUM_PER_BLOCK / NUM_PER_THREAD * K];
	int tid = threadIdx.x;
	int idx = blockDim.x * blockIdx.x + tid;
	if (idx * NUM_PER_THREAD >= NUM_PER_BLOCK) // 数据量较少时，后面的线程直接退出
		return;

	int data[K];

	for (int i = K - 1; i >= 0; i--) {
		data[i] = input[i + NUM_PER_THREAD * idx];
		heapify(data, i, K);
	}

	for (int i = K; i < NUM_PER_THREAD; i++) {
		if (input[i + NUM_PER_THREAD * idx] > data[0]) {
			data[0] = input[i + NUM_PER_THREAD * idx];
			heapify(data, 0, K);
		}
	}

	for (int i = 0; i < K; i++) {
		sdata[tid * K + i] = data[i];
	}
	__syncthreads();
	int curNum = NUM_PER_BLOCK * K / NUM_PER_THREAD;

	while (true) {
		if (tid * NUM_PER_THREAD >= curNum)
			break;

		int sortNum = min((tid + 1) * NUM_PER_THREAD, curNum) - tid * NUM_PER_THREAD;
		if (sortNum <= K) {
			for (int i = 0; i < sortNum; i++) {
				data[i] = sdata[tid * NUM_PER_THREAD + i];
			}
		}
		else {
			for (int i = K - 1; i >= 0; i--) {
				data[i] = sdata[i + NUM_PER_THREAD * tid];
				heapify(data, i, K);
			}
			int end = min(NUM_PER_THREAD, sortNum); // 目前需要排序的数据量可能位于NUM_PER_THREAD和K之间，即NUM_PER_THREAD>sortNum>K

			for (int i = K; i < end; i++) {
				if (sdata[i + NUM_PER_THREAD * tid] > data[0]) {
					data[0] = sdata[i + NUM_PER_THREAD * tid];
					heapify(data, 0, K);
				}
			}
			sortNum = K;
		}
		__syncthreads();
		if (tid * NUM_PER_THREAD < curNum) {

			for (int i = 0; i < sortNum; i++) {
				sdata[tid * K + i] = data[i];
			}
		}
		__syncthreads();

		curNum = curNum * K / NUM_PER_THREAD;
	}
	if (tid == 0) {
		for (int i = 0; i < K; i++) {
			output[blockIdx.x * K + i] = sdata[i];
		}
	}
}

int main() {
	const int BLOCK_SIZE = 256;
	const int GRID_SIZE = 16;
	const int N = 1024 * 1024;

	const int GLOBAL_K = 16;
	const int BLOCK_K = 32;

	assert(BLOCK_K % GLOBAL_K == 0);

	int* input = (int*)malloc(N * sizeof(int));
	int* output = (int*)malloc(GLOBAL_K * sizeof(int));
	int* d_input;
	int* d_output;
	int* d_block_output;
	cudaError_t error = cudaMalloc(&d_input, N * sizeof(int));
	error = cudaMalloc(&d_output, GLOBAL_K * sizeof(int));
	error = cudaMalloc(&d_block_output, BLOCK_K * GRID_SIZE * sizeof(int));
	for (int i = 0; i < N; i++) {
		input[i] = rand() % 100;
	}

	for (int i = 0; i < GLOBAL_K; i++) {
		int randIdx = rand() % N;
		input[randIdx] = i + 100;
		printf("input[%d] = %d\n", randIdx, input[randIdx]);
	}
	error = cudaMemcpy(d_input, input, N * sizeof(int), cudaMemcpyHostToDevice);

	dim3 GridSize(GRID_SIZE, 1);
	dim3 BlockSize(BLOCK_SIZE, 1);
	myTopK <N / GRID_SIZE, N / (GRID_SIZE * BLOCK_SIZE), BLOCK_K> << <GridSize, BlockSize >> > (d_input, d_block_output);

	assert(BLOCK_K * GRID_SIZE / GLOBAL_K > GLOBAL_K); // 确保每个线程处理的数据量大于K
	myTopK <BLOCK_K * GRID_SIZE, BLOCK_K * GRID_SIZE / GLOBAL_K, GLOBAL_K> << <1, BlockSize >> > (d_block_output, d_output);
	error = cudaMemcpy(output, d_output, GLOBAL_K * sizeof(int), cudaMemcpyDeviceToHost);

	for (int i = 0; i < GLOBAL_K; i++) {
		printf("output[%d] = %d.\n", i, output[i]);
	}
}