c++高性能多进程 cuda编程: safe_softmax实现 + cub::BlockReduce自定义归约操作-优快云博客

本文链接：https://blog.youkuaiyun.com/ResumeProject/article/details/146166125

cub::BlockReduce

cub::BlockReduce 是 CUB 库（CUDA UnBound）提供的一种用于 GPU 线程块内数据归约 (一般完成所有数据规约需要两次规约) 的高效工具。它允许线程块内的多个线程并行地对数据执行归约操作，cub::BlockReduce 支持的常见归约操作包括：
- Sum(value)：求和
- Max(value)：求最大值
- Min(value)：求最小值
- Reduce(value, reduction_op)：自定义归约操作

例如，计算线程块内的最大值：

float maxVal = cub::BlockReduce<float, BLOCK_SIZE>(temp_storage).Max(value);

或者使用自定义操作：

struct CustomOp {
    __device__ float operator()(const float &a, const float &b) const {
        return a * b;  // 例如，实现乘法归约
    }
};

float product = cub::BlockReduce<float, BLOCK_SIZE>(temp_storage).Reduce(value, CustomOp());

自定义归约操作 (`cub::BlockReduce::Reduce`)

1. 语法

Result = BlockReduce<T, BLOCK_SIZE>(temp_storage).Reduce(value, Op());

T：数据类型（如 float, int, struct 等）。
BLOCK_SIZE：线程块的大小。
value：每个线程提供的输入值。
Op()：自定义归约操作（必须是一个可调用对象，如函数对象（或仿函数）或 Lambda 表达式）。

safe_softmax实现

safe_softmax相比naive_softmax多了一个BlockReduce(temp_storage).Reduce(m_partial, max_op);计算最大值的方法：

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cub/cub.cuh>
#include <curand.h>
#include <iomanip>
#include <iostream>
#include <limits>
#include <math.h> // C语言fmaxf函数
#include <stdio.h>
#include <string>
#include <tuple>
#include <vector>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            std::exit(EXIT_FAILURE); \
        } \
    } while (0)

__device__ __forceinline__ float max_op(float a, float b)
{
    return fmaxf(a, b);
}

// https://github.com/NVIDIA/online-softmax/blob/master/online_softmax_benchmark.cu
// 定义一个模板参数 THREADBLOCK_SIZE，表示线程块的大小
template<int THREADBLOCK_SIZE>
// 限制线程块的最大线程数，以优化寄存器使用
__launch_bounds__(THREADBLOCK_SIZE)
// CUDA 核函数，计算 softmax 并保证数值稳定
__global__ void safe_softmax(
    const float * __restrict x,  // 输入数据指针，大小为 (num_vectors, V)
    float * __restrict y,        // 输出数据指针，大小为 (num_vectors, V)
    int V)                       // 向量长度
{
    // 线程 ID（在线程块内部）
    int thread_id = threadIdx.x;

    // 线程块 ID（表示当前处理的向量索引）
    int vector_id = blockIdx.x;

    // 将输入指针和输出指针定位到当前向量的数据起点
    x += vector_id * V;
    y += vector_id * V;

    // 定义 CUB 库的 BlockReduce 结构体，用于高效地进行块级归约操作
    typedef cub::BlockReduce<float, THREADBLOCK_SIZE> BlockReduce;

    // 共享内存：用于存储归约操作的临时数据,作用范围是整个线程块（block）
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // 共享内存变量：存储整个向量的最大值
    __shared__ float m_total;

    // 共享内存变量：存储 softmax 归一化项的倒数
    __shared__ float d_total_inverse;

    // Step 1: 计算当“前线程处理元素的”最大值（部分最大值）
    float m_partial = -FLT_MAX; // 初始值设为最小浮点数
    for(int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        m_partial = max(m_partial, x[elem_id]); // 计算每个线程负责的元素的最大值

    // Step 2: 线程“块内归约”，找到整个向量的最大值
    float m = BlockReduce(temp_storage).Reduce(m_partial, max_op);
    if (thread_id == 0)
        m_total = m;  // 线程 0 负责存储最终最大值
    __syncthreads();  // 确保所有线程都能访问 m_total

    // Step 3: 计算指数和（分母部分）
    float d_partial = 0.0F;
    for(int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        d_partial += __expf(x[elem_id] - m_total); // 计算 e^(x[i] - max(x))

    // Step 4: 线程块内求和，得到 softmax 分母
    float d = BlockReduce(temp_storage).Sum(d_partial);
    if (thread_id == 0)
        d_total_inverse = __fdividef(1.0F, d); // 计算分母的倒数，避免除法运算
    __syncthreads();  // 确保所有线程都能访问 d_total_inverse

    // Step 5: 计算最终的 softmax 输出
    for(int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        y[elem_id] = __expf(x[elem_id] - m_total) * d_total_inverse;
}




void fill_random_values(float* d_x, int size) {
    std::vector<float> h_x(size);
    for (int i = 0; i < size; ++i) {
        h_x[i] = static_cast<float>(rand()) / RAND_MAX;
        std::cout << "INDEX:"<<i<<"VALUE" << h_x[i]<< std::endl;
    }
    
    CUDA_CHECK(cudaMemcpy(d_x, h_x.data(), size * sizeof(float), cudaMemcpyHostToDevice));
}

int main() {
    const int V = 8; // Feature size
    const int batch_size = 2;
    const int THREADBLOCK_SIZE = 4;

    float* x;
    float* y;
    CUDA_CHECK(cudaMalloc(&x, V * batch_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&y, V * batch_size * sizeof(float)));
    fill_random_values(x, V * batch_size);

    safe_softmax<THREADBLOCK_SIZE><<<batch_size, THREADBLOCK_SIZE>>>(x, y, V);
    CUDA_CHECK(cudaDeviceSynchronize());

    std::cout << "Softmax computation completed successfully!" << std::endl;

    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));
    return 0;
}