CUDA编程的一些例子
最近在学习CUDA编程,写了一点简单的程序。刚开始学习GPU编程时踩了很多坑,特此记录一下。
基于最小堆的topK(每个线程维护一个大小为K的最小堆,筛选出最大的K个数)
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#define K 10
// Device-side helper: exchange the contents of two int references.
__device__ void mySwap(int& num1, int& num2) {
    const int original_first = num1;
    num1 = num2;
    num2 = original_first;
}
// Sift-down: restore the min-heap property of data[0..maxSize-1] starting at
// node idx. The heap keeps the SMALLEST element at data[0], which is exactly
// what the top-K scan needs (data[0] is the weakest of the K candidates).
// Fixed: iterative instead of recursive — recursion in device code consumes
// per-thread local-memory stack — and the tracking index is named `minIdx`
// (the original called it `maxIdx` even though it tracks the minimum).
__device__ void heapify(int* data, int idx, int maxSize) {
    while (idx < maxSize) {
        int left = idx * 2 + 1;
        int right = idx * 2 + 2;
        int minIdx = idx;
        if (left < maxSize && data[left] < data[minIdx])
            minIdx = left;
        if (right < maxSize && data[right] < data[minIdx])
            minIdx = right;
        if (minIdx == idx)
            break;  // heap property holds below this node
        mySwap(data[idx], data[minIdx]);
        idx = minIdx;  // continue sifting down the swapped element
    }
}
// Single-block top-K (largest K ints) over NUM_PER_BLOCK inputs.
// Launch: 1-D, one block of NUM_PER_BLOCK / NUM_PER_THREAD threads.
// Precondition: NUM_PER_THREAD > K (each thread's chunk exceeds K elements).
// Result: output[0..K-1] holds the top K values in min-heap order (unsorted).
template<
    int NUM_PER_BLOCK,
    int NUM_PER_THREAD
>
__global__ void myTopK(int* input, int* output) {
    // One K-sized candidate set per thread lives in shared memory.
    __shared__ int sdata[NUM_PER_BLOCK / NUM_PER_THREAD * K];
    int tid = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + tid;
    int data[K];  // per-thread min-heap; data[0] is the smallest kept value
    // Phase 1: each thread keeps the K largest of its NUM_PER_THREAD inputs.
    // Seed the heap with the first K elements (sift each as it is loaded).
    for (int i = K - 1; i >= 0; i--) {
        data[i] = input[i + NUM_PER_THREAD * idx];
        heapify(data, i, K);
    }
    for (int i = K; i < NUM_PER_THREAD; i++) {
        // Replace the current minimum whenever a larger element appears.
        if (input[i + NUM_PER_THREAD * idx] > data[0]) {
            data[0] = input[i + NUM_PER_THREAD * idx];
            heapify(data, 0, K);
        }
    }
    // Publish candidates. Fixed: index shared memory with the block-local tid,
    // not the global idx (idx would run out of bounds for any block but 0).
    for (int i = 0; i < K; i++) {
        sdata[tid * K + i] = data[i];
    }
    __syncthreads();
    // Phase 2: repeatedly merge candidate chunks until only K values remain.
    // Fixed: the original used `while (true)` with a per-thread `break` before
    // a __syncthreads(), i.e. a barrier inside divergent control flow, which
    // is undefined behavior. curNum is identical across all threads, so the
    // loop condition below is uniform and every thread reaches every barrier;
    // threads with nothing to do simply skip the work.
    int curNum = NUM_PER_BLOCK / NUM_PER_THREAD * K;  // candidates in sdata
    while (curNum > K) {
        int sortNum = 0;  // values this thread will write back (0 = idle)
        if (tid * NUM_PER_THREAD < curNum) {
            sortNum = min((tid + 1) * NUM_PER_THREAD, curNum) - tid * NUM_PER_THREAD;
            if (sortNum <= K) {
                // Chunk already fits in K slots: carry it over unchanged.
                for (int i = 0; i < sortNum; i++) {
                    data[i] = sdata[tid * NUM_PER_THREAD + i];
                }
            } else {
                // Rebuild a min-heap from the chunk's first K values...
                for (int i = K - 1; i >= 0; i--) {
                    data[i] = sdata[i + NUM_PER_THREAD * tid];
                    heapify(data, i, K);
                }
                // ...then filter the remainder of the chunk through it.
                for (int i = K; i < sortNum; i++) {
                    if (sdata[i + NUM_PER_THREAD * tid] > data[0]) {
                        data[0] = sdata[i + NUM_PER_THREAD * tid];
                        heapify(data, 0, K);
                    }
                }
                sortNum = K;
            }
        }
        __syncthreads();  // all reads of sdata finish before it is overwritten
        if (sortNum > 0) {
            for (int i = 0; i < sortNum; i++) {
                sdata[tid * K + i] = data[i];
            }
        }
        __syncthreads();  // writes visible before the next round's reads
        // Fixed: recompute the surviving count exactly. Every full chunk keeps
        // K values; a trailing partial chunk keeps min(remainder, K). The
        // original `curNum * K / NUM_PER_THREAD` is only correct when
        // NUM_PER_THREAD divides curNum and could drop below K.
        int fullChunks = curNum / NUM_PER_THREAD;
        int rem = curNum % NUM_PER_THREAD;
        curNum = fullChunks * K + min(rem, K);
    }
    // sdata[0..K-1] now holds the block-wide top K (heap order, not sorted).
    if (tid == 0) {
        for (int i = 0; i < K; i++) {
            output[i] = sdata[i];
        }
    }
}
// Host driver: fills N random ints, plants K known large values (>= 100) so
// the expected top-K answer is recognizable, runs the single-block kernel,
// and prints the result.
int main() {
    const int BLOCK_SIZE = 256;
    const int N = 1024 * 1024;
    const int NUM_PER_BLOCK = N;                            // one block handles all N elements
    const int NUM_PER_THREAD = NUM_PER_BLOCK / BLOCK_SIZE;  // = 4096 elements per thread
    assert(K < NUM_PER_THREAD);  // kernel precondition: each chunk exceeds K values
    // Fixed: the original assigned every cudaError_t to `error` and never
    // inspected it; check each call and fail loudly.
    auto check = [](cudaError_t e, const char* what) {
        if (e != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(e));
            exit(EXIT_FAILURE);
        }
    };
    int* input = (int*)malloc(N * sizeof(int));
    int* output = (int*)malloc(K * sizeof(int));
    if (input == NULL || output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    int* d_input;
    int* d_output;
    check(cudaMalloc(&d_input, N * sizeof(int)), "cudaMalloc d_input");
    check(cudaMalloc(&d_output, K * sizeof(int)), "cudaMalloc d_output");
    // Background noise in [0, 100), then K planted values in [100, 100+K).
    // (Duplicate randIdx collisions are possible but harmless for eyeballing.)
    for (int i = 0; i < N; i++) {
        input[i] = rand() % 100;
    }
    for (int i = 0; i < K; i++) {
        int randIdx = rand() % N;
        input[randIdx] = i + 100;
        printf("input[%d] = %d\n", randIdx, input[randIdx]);
    }
    check(cudaMemcpy(d_input, input, N * sizeof(int), cudaMemcpyHostToDevice), "H2D copy");
    dim3 BlockSize(BLOCK_SIZE, 1);
    myTopK<NUM_PER_BLOCK, NUM_PER_THREAD><<<1, BlockSize>>>(d_input, d_output);
    check(cudaGetLastError(), "kernel launch");          // catches bad launch config
    check(cudaDeviceSynchronize(), "kernel execution");  // catches in-kernel faults
    check(cudaMemcpy(output, d_output, K * sizeof(int), cudaMemcpyDeviceToHost), "D2H copy");
    for (int i = 0; i < K; i++) {
        printf("output[%d] = %d.\n", i, output[i]);
    }
    // Fixed: release device and host memory (the original leaked all four).
    check(cudaFree(d_input), "cudaFree d_input");
    check(cudaFree(d_output), "cudaFree d_output");
    free(input);
    free(output);
    return 0;
}
两阶段topK算法(第一轮每个block求出局部topK,第二轮由单个block把各block的候选归并成全局topK)
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
// Device-side helper: swap two ints passed by reference.
__device__ void mySwap(int& num1, int& num2) {
    const int saved = num1;
    num1 = num2;
    num2 = saved;
}
// Sift-down: restore the min-heap property of data[0..maxSize-1] starting at
// node idx. Despite the original's `maxIdx` naming, this maintains a MIN-heap
// (data[0] is the smallest element) — the top-K scan relies on data[0] being
// the weakest of the K candidates kept so far.
// Fixed: iterative instead of recursive (device recursion consumes per-thread
// local-memory stack), and the index variable renamed to `minIdx`.
__device__ void heapify(int* data, int idx, int maxSize) {
    while (idx < maxSize) {
        int left = idx * 2 + 1;
        int right = idx * 2 + 2;
        int minIdx = idx;
        if (left < maxSize && data[left] < data[minIdx])
            minIdx = left;
        if (right < maxSize && data[right] < data[minIdx])
            minIdx = right;
        if (minIdx == idx)
            break;  // subtree already satisfies the heap property
        mySwap(data[idx], data[minIdx]);
        idx = minIdx;  // keep sifting the displaced element down
    }
}
// Per-block top-K (largest K ints): each block reduces its NUM_PER_BLOCK
// inputs to K candidates and writes them to output[blockIdx.x*K ..].
// Launch: 1-D; blockDim.x may exceed NUM_PER_BLOCK / NUM_PER_THREAD — the
// surplus threads stay idle but still participate in the barriers.
// Precondition: NUM_PER_THREAD > K.
template<
    int NUM_PER_BLOCK,
    int NUM_PER_THREAD,
    int K
>
__global__ void myTopK(int* input, int* output) {
    __shared__ int sdata[NUM_PER_BLOCK / NUM_PER_THREAD * K];
    int tid = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + tid;
    int data[K];  // per-thread min-heap; data[0] is the smallest kept value
    // Fixed two bugs in the original guard `if (idx*NUM_PER_THREAD >=
    // NUM_PER_BLOCK) return;`:
    //  1. It compared the GLOBAL idx against the PER-BLOCK element count, so
    //     in a multi-block launch every block except block 0 exited at once
    //     and wrote nothing to its output slice.
    //  2. Returning early made surplus threads skip the __syncthreads() calls
    //     below, which is undefined behavior. Idle threads now fall through
    //     and hit every barrier.
    bool active = tid * NUM_PER_THREAD < NUM_PER_BLOCK;
    if (active) {
        // Phase 1: keep the K largest of this thread's NUM_PER_THREAD inputs
        // (input is still addressed with the global idx — the data really is
        // partitioned contiguously across blocks).
        for (int i = K - 1; i >= 0; i--) {
            data[i] = input[i + NUM_PER_THREAD * idx];
            heapify(data, i, K);
        }
        for (int i = K; i < NUM_PER_THREAD; i++) {
            if (input[i + NUM_PER_THREAD * idx] > data[0]) {
                data[0] = input[i + NUM_PER_THREAD * idx];
                heapify(data, 0, K);
            }
        }
        for (int i = 0; i < K; i++) {
            sdata[tid * K + i] = data[i];
        }
    }
    __syncthreads();
    // Phase 2: merge candidate chunks until only K remain. curNum is uniform
    // across the block, so the loop condition (and thus every barrier) is
    // reached by all threads — unlike the original `while(true)` + per-thread
    // `break` before __syncthreads().
    int curNum = NUM_PER_BLOCK / NUM_PER_THREAD * K;  // candidates in sdata
    while (curNum > K) {
        int sortNum = 0;  // values this thread will write back (0 = idle)
        if (tid * NUM_PER_THREAD < curNum) {
            sortNum = min((tid + 1) * NUM_PER_THREAD, curNum) - tid * NUM_PER_THREAD;
            if (sortNum <= K) {
                // Chunk already fits in K slots: carry it over unchanged.
                for (int i = 0; i < sortNum; i++) {
                    data[i] = sdata[tid * NUM_PER_THREAD + i];
                }
            } else {
                // Min-heap over the chunk's first K values...
                for (int i = K - 1; i >= 0; i--) {
                    data[i] = sdata[i + NUM_PER_THREAD * tid];
                    heapify(data, i, K);
                }
                // ...then filter the rest of the chunk through it.
                for (int i = K; i < sortNum; i++) {
                    if (sdata[i + NUM_PER_THREAD * tid] > data[0]) {
                        data[0] = sdata[i + NUM_PER_THREAD * tid];
                        heapify(data, 0, K);
                    }
                }
                sortNum = K;
            }
        }
        __syncthreads();  // finish all sdata reads before overwriting
        if (sortNum > 0) {
            for (int i = 0; i < sortNum; i++) {
                sdata[tid * K + i] = data[i];
            }
        }
        __syncthreads();  // make writes visible to the next round
        // Fixed: exact survivor count — full chunks keep K each, a trailing
        // partial chunk keeps min(remainder, K). The original
        // `curNum * K / NUM_PER_THREAD` truncates and could undershoot K.
        int fullChunks = curNum / NUM_PER_THREAD;
        int rem = curNum % NUM_PER_THREAD;
        curNum = fullChunks * K + min(rem, K);
    }
    // sdata[0..K-1] holds this block's top K (heap order, not sorted).
    if (tid == 0) {
        for (int i = 0; i < K; i++) {
            output[blockIdx.x * K + i] = sdata[i];
        }
    }
}
// Host driver for the two-pass top-K: pass 1 computes BLOCK_K local winners
// per block; pass 2 merges all GRID_SIZE*BLOCK_K candidates down to the
// global GLOBAL_K with a single block.
int main() {
    const int BLOCK_SIZE = 256;
    const int GRID_SIZE = 16;
    const int N = 1024 * 1024;
    const int GLOBAL_K = 16;  // final answer size
    const int BLOCK_K = 32;   // candidates kept per block by pass 1
    // Pass-1 config: each block reduces a contiguous N / GRID_SIZE slice.
    const int PASS1_NUM_PER_BLOCK = N / GRID_SIZE;
    const int PASS1_NUM_PER_THREAD = N / (GRID_SIZE * BLOCK_SIZE);
    // Pass-2 config: one block reduces all per-block candidates.
    const int PASS2_NUM_PER_BLOCK = BLOCK_K * GRID_SIZE;
    const int PASS2_NUM_PER_THREAD = PASS2_NUM_PER_BLOCK / GLOBAL_K;
    // Fixed: validate every kernel precondition BEFORE launching (the
    // original asserted between the two launches).
    assert(BLOCK_K % GLOBAL_K == 0);                // pass-2 chunks divide evenly
    assert(BLOCK_K < PASS1_NUM_PER_THREAD);         // pass-1 chunk exceeds its K
    assert(GLOBAL_K < PASS2_NUM_PER_THREAD);        // pass-2 chunk exceeds its K
    // Fixed: the original assigned each cudaError_t to `error` and ignored it.
    auto check = [](cudaError_t e, const char* what) {
        if (e != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(e));
            exit(EXIT_FAILURE);
        }
    };
    int* input = (int*)malloc(N * sizeof(int));
    int* output = (int*)malloc(GLOBAL_K * sizeof(int));
    if (input == NULL || output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    int* d_input;
    int* d_output;
    int* d_block_output;
    check(cudaMalloc(&d_input, N * sizeof(int)), "cudaMalloc d_input");
    check(cudaMalloc(&d_output, GLOBAL_K * sizeof(int)), "cudaMalloc d_output");
    check(cudaMalloc(&d_block_output, BLOCK_K * GRID_SIZE * sizeof(int)), "cudaMalloc d_block_output");
    // Background noise in [0, 100), then GLOBAL_K planted values >= 100 so
    // the expected winners are recognizable in the printed output.
    for (int i = 0; i < N; i++) {
        input[i] = rand() % 100;
    }
    for (int i = 0; i < GLOBAL_K; i++) {
        int randIdx = rand() % N;
        input[randIdx] = i + 100;
        printf("input[%d] = %d\n", randIdx, input[randIdx]);
    }
    check(cudaMemcpy(d_input, input, N * sizeof(int), cudaMemcpyHostToDevice), "H2D copy");
    dim3 GridSize(GRID_SIZE, 1);
    dim3 BlockSize(BLOCK_SIZE, 1);
    // Pass 1: every block keeps its local BLOCK_K candidates.
    myTopK<PASS1_NUM_PER_BLOCK, PASS1_NUM_PER_THREAD, BLOCK_K><<<GridSize, BlockSize>>>(d_input, d_block_output);
    check(cudaGetLastError(), "pass-1 launch");
    // Pass 2: a single block merges the candidates down to the global top K
    // (same-stream launches are ordered, so no explicit sync is needed here).
    myTopK<PASS2_NUM_PER_BLOCK, PASS2_NUM_PER_THREAD, GLOBAL_K><<<1, BlockSize>>>(d_block_output, d_output);
    check(cudaGetLastError(), "pass-2 launch");
    check(cudaDeviceSynchronize(), "kernel execution");
    check(cudaMemcpy(output, d_output, GLOBAL_K * sizeof(int), cudaMemcpyDeviceToHost), "D2H copy");
    for (int i = 0; i < GLOBAL_K; i++) {
        printf("output[%d] = %d.\n", i, output[i]);
    }
    // Fixed: release device and host memory (the original leaked everything).
    check(cudaFree(d_input), "cudaFree d_input");
    check(cudaFree(d_output), "cudaFree d_output");
    check(cudaFree(d_block_output), "cudaFree d_block_output");
    free(input);
    free(output);
    return 0;
}