Code Implementation
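The full listing below implements online softmax as a single CUDA kernel: each thread block handles one row of the input, computing the running maximum and the denominator in one fused pass with cub::BlockReduce, and then normalizing the row in a second pass.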
#include <algorithm>   // std::max (used by the CPU reference below)
#include <cfloat>      // FLT_MAX
#include <cmath>       // std::exp (used by the CPU reference below)
#include <cstdlib>     // rand(), RAND_MAX, std::exit
#include <cub/cub.cuh> // cub::BlockReduce
#include <iostream>
#include <vector>
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err = (call);                                            \
        if (err != cudaSuccess) {                                            \
            std::cerr << "CUDA Error: " << cudaGetErrorString(err)           \
                      << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)
// Per-thread state for online softmax: the running maximum m and the
// running denominator d = sum(exp(x_i - m)) over the elements seen so far.
struct __align__(8) MD
{
    float m; // running maximum
    float d; // running denominator
};
// Merge two partial (max, denominator) pairs into one.
__device__ __forceinline__ MD reduce_md_op(MD a, MD b)
{
    bool a_bigger = (a.m > b.m);
    MD bigger_m  = a_bigger ? a : b;
    MD smaller_m = a_bigger ? b : a;
    MD res;
    // Rescale the partial sum with the smaller maximum into the frame of the larger one.
    res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.m - bigger_m.m);
    res.m = bigger_m.m;
    return res;
}
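// In math terms, merging two partial results (m_a, d_a) and (m_b, d_b) gives
//   m = max(m_a, m_b)
//   d = d_a * exp(m_a - m) + d_b * exp(m_b - m)
// The operation is associative and commutative (up to floating-point
// rounding), which is what lets cub::BlockReduce combine the per-thread
// partials in an arbitrary tree order.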
// Reference: https://github.com/NVIDIA/online-softmax/blob/master/online_softmax_benchmark.cu
// online_softmax needs less code than safe_softmax and is easier to follow,
// since the maximum and the denominator are computed in a single fused pass.
template<int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE)
__global__ void online_softmax(
    const float * __restrict x,
    float * __restrict y,
    int V)
{
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;
    // Reposition x and y to the row handled by this block.
    x += vector_id * V;
    y += vector_id * V;

    typedef cub::BlockReduce<MD, THREADBLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ MD md_total;

    // Start from the identity element of reduce_md_op: max = -inf, denominator = 0.
    MD md_partial;
    md_partial.m = -FLT_MAX;
    md_partial.d = 0.0F;

    // Single pass over the row: reduce_md_op updates the running maximum and
    // the denominator at the same time.
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
    {
        MD new_elem;
        new_elem.m = x[elem_id];
        new_elem.d = 1.0F; // a single element contributes exp(x - x) = 1
        md_partial = reduce_md_op(md_partial, new_elem);
    }

    // Block-wide reduction of the per-thread partials (internally this usually
    // takes more than one combining step).
    MD md = BlockReduce(temp_storage).Reduce(md_partial, reduce_md_op);
    if (thread_id == 0)
        md_total = md;
    __syncthreads();

    // Compute the final softmax output: y_i = exp(x_i - m) / d.
    float d_total_inverse = __fdividef(1.0F, md_total.d);
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        y[elem_id] = __expf(x[elem_id] - md_total.m) * d_total_inverse;
}
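// A scalar CPU version of the same online algorithm, handy for checking the
// kernel's output (a reference sketch added here for illustration; it is not
// part of the original NVIDIA benchmark):
void online_softmax_cpu(const float* x, float* y, int V) {
    float m = -FLT_MAX; // running maximum
    float d = 0.0f;     // running denominator sum(exp(x_i - m))
    for (int i = 0; i < V; ++i) {
        float m_new = std::max(m, x[i]);
        // Rescale the old denominator to the new maximum before adding the new term.
        d = d * std::exp(m - m_new) + std::exp(x[i] - m_new);
        m = m_new;
    }
    for (int i = 0; i < V; ++i)
        y[i] = std::exp(x[i] - m) / d;
}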
void fill_random_values(float* d_x, int size) {
    std::vector<float> h_x(size);
    for (int i = 0; i < size; ++i) {
        h_x[i] = static_cast<float>(rand()) / RAND_MAX; // uniform in [0, 1]
        std::cout << "index: " << i << " value: " << h_x[i] << std::endl;
    }
    CUDA_CHECK(cudaMemcpy(d_x, h_x.data(), size * sizeof(float), cudaMemcpyHostToDevice));
}
int main() {
    const int V = 8;                // elements per row (feature size)
    const int batch_size = 2;       // number of rows; one CUDA block per row
    const int THREADBLOCK_SIZE = 4; // threads per block, must match the template argument
    float* x;
    float* y;
    CUDA_CHECK(cudaMalloc(&x, V * batch_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&y, V * batch_size * sizeof(float)));
    fill_random_values(x, V * batch_size);
    online_softmax<THREADBLOCK_SIZE><<<batch_size, THREADBLOCK_SIZE>>>(x, y, V);
    CUDA_CHECK(cudaGetLastError()); // catch kernel launch errors
    CUDA_CHECK(cudaDeviceSynchronize());
    std::cout << "Softmax computation completed successfully!" << std::endl;
    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));
    return 0;
}
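To try the listing, save it as a .cu file and build it with nvcc (a minimal sketch; the file name and architecture flag below are assumptions and may need adjusting for your GPU):

nvcc -O2 -arch=sm_70 online_softmax.cu -o online_softmax
./online_softmax

Copying y back to the host with cudaMemcpy and comparing it against online_softmax_cpu on the same input is a quick way to validate the kernel.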