Code Implementation
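The full listing below implements online softmax as a single CUDA kernel: each thread block handles one row of the input, computing the running maximum and the denominator in one fused pass with cub::BlockReduce, and then normalizing the row in a second pass.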
#include <algorithm>   // std::max (used by the CPU reference below)
#include <cfloat>      // FLT_MAX
#include <cmath>       // std::exp (used by the CPU reference below)
#include <cstdlib>     // rand(), RAND_MAX, std::exit
#include <cub/cub.cuh> // cub::BlockReduce
#include <iostream>
#include <vector>
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err = (call);                                            \
        if (err != cudaSuccess) {                                            \
            std::cerr << "CUDA Error: " << cudaGetErrorString(err)           \
                      << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)
// Per-thread state for online softmax: the running maximum m and the
// running denominator d = sum(exp(x_i - m)) over the elements seen so far.
struct __align__(8) MD
{
    float m; // running maximum
    float d; // running denominator
};
// Merge two partial (max, denominator) pairs into one.
__device__ __forceinline__ MD reduce_md_op(MD a, MD b)
{
    bool a_bigger = (a.m > b.m);
    MD bigger_m  = a_bigger ? a : b;
    MD smaller_m = a_bigger ? b : a;
    MD res;
    // Rescale the partial sum with the smaller maximum into the frame of the larger one.
    res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.m - bigger_m.m);
    res.m = bigger_m.m;
    return res;
}
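// In math terms, merging two partial results (m_a, d_a) and (m_b, d_b) gives
//   m = max(m_a, m_b)
//   d = d_a * exp(m_a - m) + d_b * exp(m_b - m)
// The operation is associative and commutative (up to floating-point
// rounding), which is what lets cub::BlockReduce combine the per-thread
// partials in an arbitrary tree order.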
// Reference: https://github.com/NVIDIA/online-softmax/blob/master/online_softmax_benchmark.cu
// online_softmax needs less code than safe_softmax and is easier to follow,
// since the maximum and the denominator are computed in a single fused pass.
template<int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE)
__global__ void online_softmax(
    const float * __restrict x,
    float * __restrict y,
    int V)
{
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;
    // Reposition x and y to the row handled by this block.
    x += vector_id * V;
    y += vector_id * V;

    typedef cub::BlockReduce<MD, THREADBLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ MD md_total;

    // Start from the identity element of reduce_md_op: max = -inf, denominator = 0.
    MD md_partial;
    md_partial.m = -FLT_MAX;
    md_partial.d = 0.0F;

    // Single pass over the row: reduce_md_op updates the running maximum and
    // the denominator at the same time.
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
    {
        MD new_elem;
        new_elem.m = x[elem_id];
        new_elem.d = 1.0F; // a single element contributes exp(x - x) = 1
        md_partial = reduce_md_op(md_partial, new_elem);
    }

    // Block-wide reduction of the per-thread partials (internally this usually
    // takes more than one combining step).
    MD md = BlockReduce(temp_storage).Reduce(md_partial, reduce_md_op);
    if (thread_id == 0)
        md_total = md;
    __syncthreads();

    // Compute the final softmax output: y_i = exp(x_i - m) / d.
    float d_total_inverse = __fdividef(1.0F, md_total.d);
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        y[elem_id] = __expf(x[elem_id] - md_total.m) * d_total_inverse;
}
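// A scalar CPU version of the same online algorithm, handy for checking the
// kernel's output (a reference sketch added here for illustration; it is not
// part of the original NVIDIA benchmark):
void online_softmax_cpu(const float* x, float* y, int V) {
    float m = -FLT_MAX; // running maximum
    float d = 0.0f;     // running denominator sum(exp(x_i - m))
    for (int i = 0; i < V; ++i) {
        float m_new = std::max(m, x[i]);
        // Rescale the old denominator to the new maximum before adding the new term.
        d = d * std::exp(m - m_new) + std::exp(x[i] - m_new);
        m = m_new;
    }
    for (int i = 0; i < V; ++i)
        y[i] = std::exp(x[i] - m) / d;
}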
void fill_random_values(float* d_x, int size) {
    std::vector<float> h_x(size);
    for (int i = 0; i < size; ++i) {
        h_x[i] = static_cast<float>(rand()) / RAND_MAX; // uniform in [0, 1]
        std::cout << "index: " << i << " value: " << h_x[i] << std::endl;
    }
    CUDA_CHECK(cudaMemcpy(d_x, h_x.data(), size * sizeof(float), cudaMemcpyHostToDevice));
}
int main() {
    const int V = 8;                // elements per row (feature size)
    const int batch_size = 2;       // number of rows; one CUDA block per row
    const int THREADBLOCK_SIZE = 4; // threads per block, must match the template argument
    float* x;
    float* y;
    CUDA_CHECK(cudaMalloc(&x, V * batch_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&y, V * batch_size * sizeof(float)));
    fill_random_values(x, V * batch_size);
    online_softmax<THREADBLOCK_SIZE><<<batch_size, THREADBLOCK_SIZE>>>(x, y, V);
    CUDA_CHECK(cudaGetLastError()); // catch kernel launch errors
    CUDA_CHECK(cudaDeviceSynchronize());
    std::cout << "Softmax computation completed successfully!" << std::endl;
    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));
    return 0;
}
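To try the listing, save it as a .cu file and build it with nvcc (a minimal sketch; the file name and architecture flag below are assumptions and may need adjusting for your GPU):

nvcc -O2 -arch=sm_70 online_softmax.cu -o online_softmax
./online_softmax

Copying y back to the host with cudaMemcpy and comparing it against online_softmax_cpu on the same input is a quick way to validate the kernel.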