【CUDA编程】手把手教你写CUDA

原创于 2025-06-18 15:30:00 发布 · 614 阅读

4 ·

CC 4.0 BY-SA版权

文章标签：

#算法 #c++ #CUDA #人工智能 #深度学习 #云计算

部署运行你感兴趣的模型镜像

端到端可调试的CUDA算子Demo：向量加法

以下是一个完整的CUDA向量加法算子Demo，包含详细的错误检查和调试功能：

#include <assert.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda_runtime.h>

// Error-checking wrapper for CUDA runtime calls (essential while debugging).
//
// Evaluates `call` exactly once. If the result is not cudaSuccess, prints the
// source location and the runtime's description of the error to stderr, then
// terminates the process with EXIT_FAILURE.
//
// The argument is parenthesized and the temporary has a macro-private name, so
// an argument expression that mentions a caller variable named `err` refers to
// the caller's variable rather than to the macro's own (uninitialized) one.
#define CUDA_CHECK(call) \
do { \
    cudaError_t cuda_check_err_ = (call); \
    if (cuda_check_err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Kernel: element-wise vector addition, c[i] = a[i] + b[i] for 0 <= i < size.
//
// Launch layout: 1-D grid of 1-D blocks; only the .x components of blockIdx,
// blockDim, gridDim and threadIdx are used. The kernel walks the data with a
// grid-stride loop, so every element is processed regardless of how many
// blocks are launched (a grid that covers `size` threads does one iteration
// per thread). Indices are 64-bit so that blockIdx.x * blockDim.x cannot wrap
// to a negative value and slip past the bounds check.
//
// Parameters:
//   a, b  input arrays, read at indices [0, size)
//   c     output array, written at indices [0, size)
//   size  number of elements; nothing is read or written when size <= 0
//
// No shared memory is used. When DEBUG is defined to a value <= 10, the thread
// that handles element 0 prints that element's operands and result.
__global__ void vectorAddKernel(const float* a, const float* b, float* c, int size) {
    // Total number of threads in the grid = distance between successive
    // elements handled by the same thread.
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < size;
         idx += stride) {
        // Core computation
        c[idx] = a[idx] + b[idx];

        // Debug output, limited to element 0 to keep device printf cheap
        #if defined(DEBUG) && DEBUG <= 10
        if (idx == 0) {
            const int tid = (int)idx;
            printf("Kernel: a[%d]=%.2f, b[%d]=%.2f, c[%d]=%.2f\n", 
                   tid, a[tid], tid, b[tid], tid, c[tid]);
        }
        #endif
    }
}

// Host reference implementation: stores a[k] + b[k] into c[k] for every k in
// [0, size). Does nothing when size <= 0.
void vectorAddCPU(const float* a, const float* b, float* c, int size) {
    int k = 0;
    while (k < size) {
        const float sum = a[k] + b[k];
        c[k] = sum;
        ++k;
    }
}

// Compares a GPU result against the CPU reference element by element.
//
// Parameters:
//   cpuRef     reference values, read at indices [0, size)
//   gpuRef     values under test, read at indices [0, size)
//   size       number of elements to compare (size <= 0 compares nothing)
//   threshold  largest absolute difference that still counts as a match
//
// Returns true when every pair matches. On the first mismatch, prints the
// index, both values and their difference, and returns false.
//
// Two elements match when they are exactly equal (this covers identical
// infinities, whose difference would be NaN), when both are NaN, or when their
// absolute difference is <= threshold. A NaN on only one side is a mismatch.
bool verifyResults(const float* cpuRef, const float* gpuRef, int size, float threshold = 1e-5f) {
    for (int i = 0; i < size; i++) {
        const float expected = cpuRef[i];
        const float actual = gpuRef[i];

        // x != x is true only for NaN.
        const bool expectedIsNan = (expected != expected);
        const bool actualIsNan = (actual != actual);
        if (expectedIsNan && actualIsNan) continue;
        if (expected == actual) continue;

        float diff = fabsf(expected - actual);
        // Written as !(diff <= threshold) rather than diff > threshold so that
        // a NaN difference is reported instead of silently passing.
        if (!(diff <= threshold)) {
            printf("Error at index %d: CPU=%.6f, GPU=%.6f, Diff=%.6f\n", 
                  i, cpuRef[i], gpuRef[i], diff);
            return false;
        }
    }
    return true;
}

// Debug helper: prints `name`, then the leading elements of `data` in the form
// "name: [x, y, z, ...]" with two decimals per value.
//
// At most `maxPrint` elements are shown; a trailing ", ..." marks that `size`
// is larger than the number of elements printed.
void printArray(const char* name, const float* data, int size, int maxPrint = 5) {
    // Number of elements actually shown: the smaller of size and maxPrint.
    const int shown = (size <= maxPrint) ? size : maxPrint;

    printf("%s: [", name);
    int k = 0;
    while (k < shown) {
        // Separator goes before every element except the first.
        if (k > 0) printf(", ");
        printf("%.2f", data[k]);
        ++k;
    }
    if (shown < size) printf(", ...");
    printf("]\n");
}

// Program entry point: runs the vector-addition demo end to end.
//
// Usage: <program> [size]
//   size  optional element count (default 1 << 20). Must be a decimal integer
//         in [1, INT_MAX - 255]; the upper bound keeps the grid-size
//         computation and every launched thread index within int range.
//
// Steps: allocate and randomly fill the host inputs, compute the CPU
// reference, copy the inputs to the device, launch vectorAddKernel timed with
// CUDA events, copy the result back, compare it with the reference, and
// release all resources.
//
// Returns EXIT_SUCCESS when the GPU result matches the reference, and
// EXIT_FAILURE on a mismatch, an invalid size argument, or a failed host
// allocation. Any failing CUDA call terminates the process via CUDA_CHECK.
int main(int argc, char** argv) {
    // Threads per block; declared first because it bounds the accepted size.
    const int blockSize = 256;

    // Vector size (defaults to 1M elements)
    int size = 1 << 20;
    if (argc > 1) {
        char* parseEnd = NULL;
        const long requested = strtol(argv[1], &parseEnd, 10);
        const int maxSize = INT_MAX - (blockSize - 1);
        // Reject empty/non-numeric input, trailing garbage, and out-of-range
        // values (strtol saturates at LONG_MAX/LONG_MIN on overflow, which the
        // range check also rejects).
        if (parseEnd == argv[1] || *parseEnd != '\0' ||
            requested < 1 || requested > maxSize) {
            fprintf(stderr, "Invalid vector size '%s': expected an integer in [1, %d]\n",
                    argv[1], maxSize);
            return EXIT_FAILURE;
        }
        size = (int)requested;
    }
    printf("Vector size: %d elements\n", size);

    // Byte count of one vector, computed once in size_t.
    const size_t bytes = (size_t)size * sizeof(float);

    // Allocate host memory
    float* h_a = (float*)malloc(bytes);
    float* h_b = (float*)malloc(bytes);
    float* h_c = (float*)malloc(bytes);
    float* h_c_ref = (float*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL || h_c_ref == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes per vector failed\n", bytes);
        // free(NULL) is a no-op, so releasing all four is safe.
        free(h_a);
        free(h_b);
        free(h_c);
        free(h_c_ref);
        return EXIT_FAILURE;
    }

    // Initialize the inputs with random values in [0, 1]
    srand((unsigned int)time(NULL));
    for (int i = 0; i < size; i++) {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }

    // Print the leading input elements (debug builds only)
    #ifdef DEBUG
    printArray("Input a", h_a, size);
    printArray("Input b", h_b, size);
    #endif

    // CPU reference computation (used for verification)
    clock_t cpu_start = clock();
    vectorAddCPU(h_a, h_b, h_c_ref, size);
    double cpu_time = (double)(clock() - cpu_start) / CLOCKS_PER_SEC;
    printf("CPU time: %.4f ms\n", cpu_time * 1000);

    #ifdef DEBUG
    printArray("CPU Result", h_c_ref, size);
    #endif

    // Allocate device memory
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c, bytes));

    // Timing events
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Copy the inputs to the device
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // Grid size: ceil(size / blockSize), enough blocks to cover every element.
    // Cannot overflow because size <= INT_MAX - (blockSize - 1).
    int gridSize = (size + blockSize - 1) / blockSize;

    printf("Launching kernel: gridDim=%d, blockDim=%d\n", gridSize, blockSize);

    // Launch the kernel between the two timing events
    CUDA_CHECK(cudaEventRecord(start));

    vectorAddKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, size);

    // Launch-configuration errors are only visible through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));

    // Wait for the stop event so the elapsed time is valid
    CUDA_CHECK(cudaEventSynchronize(stop));
    float kernel_time_ms = 0;
    CUDA_CHECK(cudaEventElapsedTime(&kernel_time_ms, start, stop));

    // Copy the result back to the host
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    printf("CUDA kernel time: %.4f ms\n", kernel_time_ms);

    // Print the leading result elements (debug builds only)
    #ifdef DEBUG
    printArray("GPU Result", h_c, size);
    #endif

    // Verify the result; the outcome also decides the process exit status
    int status = EXIT_SUCCESS;
    if (verifyResults(h_c_ref, h_c, size)) {
        printf("PASSED: Results match!\n");
    } else {
        printf("FAILED: Results mismatch!\n");
        status = EXIT_FAILURE;
    }

    // Release resources
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    free(h_a);
    free(h_b);
    free(h_c);
    free(h_c_ref);

    printf("Done!\n");
    return status;
}

编译与运行指南

1. 编译选项

# 基本编译（适合调试；-g 生成主机端调试信息，-G 生成设备端调试信息，cuda-gdb 在核函数内设断点需要 -G）
nvcc vector_add.cu -o vector_add_demo -O0 -g -G -DDEBUG

# 性能优化编译
nvcc vector_add.cu -o vector_add_demo -O3 -arch=sm_80

2. 运行测试

# 简单测试（不带参数时使用默认的1M元素，即1048576个）
./vector_add_demo

# 通过命令行参数指定向量大小（此处为1M元素）
./vector_add_demo 1048576

# 在调试器中运行
cuda-gdb ./vector_add_demo
# 或使用内存检查工具运行
compute-sanitizer ./vector_add_demo

3. 预期输出（示例）

Vector size: 1048576 elements
CPU time: 10.2452 ms
Launching kernel: gridDim=4096, blockDim=256
CUDA kernel time: 0.1024 ms
PASSED: Results match!
Done!

调试技巧与工具

1. 核函数调试打印

启用DEBUG宏可在核函数中输出调试信息：

// 在核函数内部
if (threadIdx.x == 0 && blockIdx.x == 0) {
    printf("Debug: a[0]=%.2f, b[0]=%.2f, c[0]=%.2f\n", 
           a[0], b[0], c[0]);
}

2. CUDA-GDB调试

# 启动调试器
cuda-gdb ./vector_add_demo

# 设置断点
(cuda-gdb) break vectorAddKernel
(cuda-gdb) run

# 调试命令
(cuda-gdb) cuda block 0 thread 0  # 切换到block 0的线程0
(cuda-gdb) print tid            # 打印变量
(cuda-gdb) next                 # 下一步

3. NVIDIA Compute Sanitizer

内存访问检查工具：

# 内存检查（越界访问）
compute-sanitizer --tool memcheck ./vector_add_demo

# 内存初始化检查
compute-sanitizer --tool initcheck ./vector_add_demo

# 竞争条件检查
compute-sanitizer --tool racecheck ./vector_add_demo

4. 性能分析（Nsight Systems）

# 命令行分析
nsys profile --stats=true ./vector_add_demo

# GUI分析（较新版本的nsys生成 .nsys-rep 报告文件，旧版本为 .qdrep）
nsys-ui report*.nsys-rep

关键调试功能说明

全面的错误检查

#define CUDA_CHECK(call) // 检查所有API调用
CUDA_CHECK(cudaGetLastError()); // 检查核函数启动错误

CPU验证参考

vectorAddCPU(); // 提供参考实现
verifyResults(); // 结果比对

精确的计时

cudaEventRecord(start);
// 核函数执行
cudaEventRecord(stop);
cudaEventElapsedTime(); // 精确计算耗时

边界处理
```
if (tid < size) // 避免越界访问
```

配置打印

printf("gridDim=%d, blockDim=%d\n", gridSize, blockSize);

内存检查工具
```
compute-sanitizer --tool memcheck
```

这个Demo展示了CUDA算子开发的核心模式：内存管理+核函数+结果验证+调试工具链。无论您调试小型核函数还是复杂模型，这些调试模式都是通用的最佳实践。

您可能感兴趣的与本文相关的镜像

PyTorch 2.6

PyTorch

Cuda

PyTorch 是一个开源的 Python 机器学习库，基于 Torch 库，底层由 C++ 实现，应用于人工智能领域，如计算机视觉和自然语言处理