naive_softmax Implementation and Illustration
template<int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE)
__global__ void naive_softmax(
    const float * __restrict__ x,
    float * __restrict__ y,
    int V
)
{
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;
    // Each block owns one row (vector) of length V.
    x += vector_id * V;
    y += vector_id * V;
    typedef cub::BlockReduce<float, THREADBLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float d_total_inverse;
    // Pass 1: each thread accumulates exp() over a strided slice of the row.
    float d_partial = 0.0F;
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        d_partial += __expf(x[elem_id]);
    // Block-wide sum of the partials; the result is only valid in thread 0.
    float d = BlockReduce(temp_storage).Sum(d_partial);
    if (thread_id == 0)
        d_total_inverse = __fdividef(1.0F, d);
    __syncthreads(); // broadcast d_total_inverse to the whole block
    // Pass 2: normalize, re-evaluating exp() instead of caching it.
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        y[elem_id] = __expf(x[elem_id]) * d_total_inverse;
}
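Per row, this computes the standard softmax: each of the batch_size blocks owns one vector of length V, each thread strides over that vector with step THREADBLOCK_SIZE and accumulates a partial sum of exponentials, cub::BlockReduce combines the partials, and thread 0 broadcasts the reciprocal of the total through shared memory:

$$ y_i = \frac{e^{x_i}}{\sum_{j=1}^{V} e^{x_j}} $$

It is "naive" on two counts: it exponentiates the raw inputs (no max subtraction, so __expf overflows once x_i gets large), and it evaluates __expf(x[elem_id]) a second time in the normalization loop instead of caching the first pass.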
Full Code
#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cstdlib> // for rand(), RAND_MAX, std::exit
#include <cub/cub.cuh>
#include <curand.h>
#include <iomanip>
#include <iostream>
#include <limits>
#include <math.h>
#include <stdio.h>
#include <string>
#include <tuple>
#include <vector>
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA Error: " << cudaGetErrorString(err) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            std::exit(EXIT_FAILURE); \
        } \
    } while (0)
template<int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE)
__global__ void naive_softmax(const float* __restrict__ x, float* __restrict__ y, int V) {
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;
    x += vector_id * V;
    y += vector_id * V;
    typedef cub::BlockReduce<float, THREADBLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float d_total_inverse;
    float d_partial = 0.0F;
    printf(" INDEX: %d*%d", vector_id, V); // debug: this block's row starts at offset vector_id * V
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
        d_partial += __expf(x[elem_id]);
        printf(" elem %f ", x[elem_id]); // debug: element visited by this thread
    }
    printf("\n");
    printf(" d_partial: %f\n", d_partial);
    float d = BlockReduce(temp_storage).Sum(d_partial);
    if (thread_id == 0) {
        d_total_inverse = __fdividef(1.0F, d);
        printf("d_in_d_total_inverse: %f\n", d); // debug: only thread 0 holds the full sum
    }
    __syncthreads();
    printf(" d: %f\n", d); // debug: undefined on threads other than 0
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
        y[elem_id] = __expf(x[elem_id]) * d_total_inverse;
    }
}
void fill_random_values(float* d_x, int size) {
    std::vector<float> h_x(size);
    for (int i = 0; i < size; ++i) {
        h_x[i] = static_cast<float>(rand()) / RAND_MAX;
        std::cout << "INDEX:" << i << "VALUE" << h_x[i] << std::endl;
    }
    CUDA_CHECK(cudaMemcpy(d_x, h_x.data(), size * sizeof(float), cudaMemcpyHostToDevice));
}
int main() {
    const int V = 8;
    const int batch_size = 2;
    const int THREADBLOCK_SIZE = 4;
    float* x;
    float* y;
    CUDA_CHECK(cudaMalloc(&x, V * batch_size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&y, V * batch_size * sizeof(float)));
    fill_random_values(x, V * batch_size);
    naive_softmax<THREADBLOCK_SIZE><<<batch_size, THREADBLOCK_SIZE>>>(x, y, V);
    CUDA_CHECK(cudaDeviceSynchronize());
    std::cout << "Softmax computation completed successfully!" << std::endl;
    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));
    return 0;
}
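For contrast, the usual numerically stable variant first reduces the row maximum and subtracts it before exponentiating, at the cost of one extra pass over the data. The sketch below is mine, not part of the original post (the name safe_softmax and its exact structure are assumptions); it keeps the same block-per-row layout and cub primitives:

// Sketch only: a numerically stable ("safe") softmax; name and structure assumed.
template<int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE)
__global__ void safe_softmax(const float* __restrict__ x, float* __restrict__ y, int V) {
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;
    x += vector_id * V;
    y += vector_id * V;
    typedef cub::BlockReduce<float, THREADBLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float m_total;         // row maximum, broadcast via shared memory
    __shared__ float d_total_inverse; // 1 / sum(exp(x - m)), likewise broadcast
    // Pass 1: block-wide maximum.
    float m_partial = -FLT_MAX;
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        m_partial = fmaxf(m_partial, x[elem_id]);
    float m = BlockReduce(temp_storage).Reduce(m_partial, cub::Max());
    if (thread_id == 0)
        m_total = m; // only thread 0 holds the valid aggregate
    __syncthreads(); // broadcasts m_total and makes temp_storage safe to reuse
    // Pass 2: block-wide sum of exp(x - max); every argument is <= 0.
    float d_partial = 0.0F;
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        d_partial += __expf(x[elem_id] - m_total);
    float d = BlockReduce(temp_storage).Sum(d_partial);
    if (thread_id == 0)
        d_total_inverse = __fdividef(1.0F, d);
    __syncthreads();
    // Pass 3: normalize.
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE)
        y[elem_id] = __expf(x[elem_id] - m_total) * d_total_inverse;
}

Subtracting the maximum leaves the result mathematically unchanged (numerator and denominator are both scaled by e^{-m}) but keeps every argument to __expf non-positive, so nothing overflows.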
Output
nvcc -I ./cub -o softmax softmax.cu
./softmax
INDEX:0VALUE0.840188
INDEX:1VALUE0.394383
INDEX:2VALUE0.783099
INDEX:3VALUE0.79844
INDEX:4VALUE0.911647
INDEX:5VALUE0.197551
INDEX:6VALUE0.335223
INDEX:7VALUE0.76823
INDEX:8VALUE0.277775
INDEX:9VALUE0.55397
INDEX:10VALUE0.477397
INDEX:11VALUE0.628871
INDEX:12VALUE0.364784
INDEX:13VALUE0.513401
INDEX:14VALUE0.95223
INDEX:15VALUE0.916195
INDEX: 0*8 INDEX: 0*8 INDEX: 0*8 INDEX: 0*8 INDEX: 1*8 INDEX: 1*8 INDEX: 1*8 INDEX: 1*8 elem 0.840188 elem 0.394383 elem 0.783099 elem 0.798440 elem 0.277775 elem 0.553970 elem 0.477397 elem 0.628871 elem 0.911647 elem 0.197551 elem 0.335223 elem 0.768230 elem 0.364784 elem 0.513401 elem 0.952230 elem 0.916195
d_partial: 4.805221
d_partial: 2.701884
d_partial: 3.586495
d_partial: 4.378018
d_partial: 2.760392
d_partial: 3.411112
d_partial: 4.203355
d_partial: 4.375253
d_in_d_total_inverse: 15.471619
d_in_d_total_inverse: 14.750113
d: 15.471619
d: 10.666397
d: 7.964513
d: 4.378018
d: 14.750113
d: 11.989719
d: 8.578608
d: 4.375253
Softmax computation completed successfully!
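The numbers check out. With THREADBLOCK_SIZE = 4 and V = 8, thread 0 of block 0 strides over elements 0 and 4, and the four partials of each block sum to the total printed by its thread 0:

$$ d_{partial}^{(0,0)} = e^{0.840188} + e^{0.911647} \approx 4.805221 $$
$$ d^{(0)} = 4.805221 + 2.701884 + 3.586495 + 4.378018 \approx 15.471619 $$

The remaining `d:` lines (10.666397, 7.964513, ...) are not row sums: cub::BlockReduce::Sum guarantees a valid aggregate only in thread 0 (the other threads here happen to print suffix partial sums from the reduction tree), which is exactly why the kernel lets thread 0 compute d_total_inverse and broadcast it through shared memory.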
Notes
- 8 × 2 values in all: two rows of V = 8 elements, handled by a grid of 2 blocks × 4 threads, one block per row.
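Beyond eyeballing the printf log, the result can be checked programmatically. A minimal host-side sketch that slots into the file above (the helper verify_softmax and the 1e-5 tolerance are my own choices, not from the original post):

// Hypothetical helper (not in the original post): copy the GPU result back,
// recompute softmax on the CPU in double precision, and compare element-wise.
// Needs #include <cmath> for std::exp / std::fabs.
bool verify_softmax(const float* d_x, const float* d_y, int batch_size, int V) {
    std::vector<float> h_x(batch_size * V), h_y(batch_size * V);
    CUDA_CHECK(cudaMemcpy(h_x.data(), d_x, h_x.size() * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_y.data(), d_y, h_y.size() * sizeof(float), cudaMemcpyDeviceToHost));
    for (int b = 0; b < batch_size; ++b) {
        double d = 0.0;
        for (int i = 0; i < V; ++i)
            d += std::exp(static_cast<double>(h_x[b * V + i]));
        for (int i = 0; i < V; ++i) {
            double expected = std::exp(static_cast<double>(h_x[b * V + i])) / d;
            // Loose tolerance: __expf and __fdividef are fast approximations.
            if (std::fabs(expected - static_cast<double>(h_y[b * V + i])) > 1e-5)
                return false;
        }
    }
    return true;
}

In main it would run after cudaDeviceSynchronize(), e.g. assert(verify_softmax(x, y, batch_size, V));.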
