【Cuda 编程思想】手写一个量化反量化算子Quant

最新推荐文章于 2025-04-05 23:46:25 发布
Mr.Lee jack
最新推荐文章于 2025-04-05 23:46:25 发布
阅读量180
点赞数 3
分类专栏： cuda 文章标签： c++ cuda
本文链接：https://blog.youkuaiyun.com/xzpdxz/article/details/146978055
版权
cuda 专栏收录该内容
4 篇文章
订阅专栏
Cuda 量化核函数

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <float.h>

// 定义INT4的范围 2^4 = 16， 因为他只能处理4位的整数
// 正数范围：0000(0) 到 0111(7)，共8个值
// 负数范围：1000(-8) 到 1111(-1)，共8个值
#define INT4_MIN -8
#define INT4_MAX 7

// 将两个 INT4 值打包到一个字节中
__device__ unsigned char pack_int4(int a, int b) {
    // 确保值在 INT4 范围内
    a = max(INT4_MIN, min(INT4_MAX, a));
    b = max(INT4_MIN, min(INT4_MAX, b));
    
    // 将负值转换为无符号表示（0-15）
    unsigned char ua = (unsigned char)(a & 0xF);
    unsigned char ub = (unsigned char)(b & 0xF);
    
    // 打包：高4位存储第一个值，低4位存储第二个值
    return (ua << 4) | ub;
    // 如果 ua = 0000 0101（十进制的5），左移后变成 0101 0000
    // 左移后的 ua 与 ub 进行按位或操作
    // 如果 ub = 0000 0011（十进制的3），则结果为 0101 0011
    // 这样这两个值就存到了一个字节中
}

// 从一个字节中解包两个 INT4 值
__device__ void unpack_int4(unsigned char packed, int& a, int& b) {
    // 提取高4位和低4位
    a = (packed >> 4) & 0xF; // 将包右移4位后，在取低位
    b = packed & 0xF; // 直接取低位即可
    
    // 处理负数（如果第4位是1，则为负数）
    // 如果无符号a 大于7 说明原始值是一个负数，所以直接-16 可以将值还原位原来有符号整数
    if (a > 7) a -= 16;
    if (b > 7) b -= 16;
}

// 查找张量中的最大值和最小值
__global__ void find_min_max_kernel(const half* input, float* min_max, int size) {
    extern __shared__ float shared_data[];
    float* s_min = shared_data;
    float* s_max = shared_data + blockDim.x;
    
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    
    // 初始化
    s_min[tid] = gid < size ? __half2float(input[gid]) : FLT_MAX;
    s_max[tid] = gid < size ? __half2float(input[gid]) : -FLT_MAX;
    __syncthreads();
    
    // 归约找最小值和最大值
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            s_min[tid] = fminf(s_min[tid], s_min[tid + stride]);
            s_max[tid] = fmaxf(s_max[tid], s_max[tid + stride]);
        }
        __syncthreads();
    }
    
    // 将结果写回全局内存
    if (tid == 0) {
        atomicMin((int*)&min_max[0], __float_as_int(s_min[0]));
        atomicMax((int*)&min_max[1], __float_as_int(s_max[0]));
    }
}

// FP16 到 INT4 的量化
__global__ void fp16_to_int4_kernel(const half* input, unsigned char* output, 
                                   float input_min, float input_max, int size) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    
    // 每个线程处理两个元素
    if (gid * 2 < size) {
        // 计算缩放因子
        float scale = (INT4_MAX - INT4_MIN) / (input_max - input_min);
        
        // 获取两个输入值
        float val1 = __half2float(input[gid * 2]);
        float val2 = (gid * 2 + 1 < size) ? __half2float(input[gid * 2 + 1]) : 0.0f;
        
        // 量化到 INT4 范围
        int q1 = (int)roundf(scale * (val1 - input_min) + INT4_MIN);
        int q2 = (gid * 2 + 1 < size) ? (int)roundf(scale * (val2 - input_min) + INT4_MIN) : 0;
        
        // 打包并存储
        output[gid] = pack_int4(q1, q2);
    }
}

// INT4 到 FP16 的反量化（用于测试）
__global__ void int4_to_fp16_kernel(const unsigned char* input, half* output, 
                                   float input_min, float input_max, int size) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (gid * 2 < size) {
        // 计算缩放因子
        float scale = (input_max - input_min) / (INT4_MAX - INT4_MIN);
        
        // 解包两个 INT4 值
        int q1, q2;
        unpack_int4(input[gid], q1, q2);
        
        // 反量化到 FP16
        float val1 = scale * (q1 - INT4_MIN) + input_min;
        float val2 = scale * (q2 - INT4_MIN) + input_min;
        
        // 存储结果
        output[gid * 2] = __float2half(val1);
        if (gid * 2 + 1 < size) {
            output[gid * 2 + 1] = __float2half(val2);
        }
    }
}

// 主函数：FP16 到 INT4 的量化
void fp16_to_int4(const half* d_input, unsigned char* d_output, int size) {
    // 计算需要的块数
    int block_size = 256;
    int grid_size = (size + block_size - 1) / block_size;
    
    // 分配内存用于存储最小值和最大值
    float* d_min_max;
    cudaMalloc(&d_min_max, 2 * sizeof(float));
    float h_min_max[2] = {FLT_MAX, -FLT_MAX};
    cudaMemcpy(d_min_max, h_min_max, 2 * sizeof(float), cudaMemcpyHostToDevice);
    
    // 查找最小值和最大值
    find_min_max_kernel<<<grid_size, block_size, 2 * block_size * sizeof(float)>>>(
        d_input, d_min_max, size);
    
    // 将最小值和最大值复制回主机
    cudaMemcpy(h_min_max, d_min_max, 2 * sizeof(float), cudaMemcpyDeviceToHost);
    float input_min = h_min_max[0];
    float input_max = h_min_max[1];
    
    // 计算量化所需的网格大小（每个线程处理两个元素）
    grid_size = (size + 2 * block_size - 1) / (2 * block_size);
    
    // 执行量化
    fp16_to_int4_kernel<<<grid_size, block_size>>>(
        d_input, d_output, input_min, input_max, size);
    
    // 释放临时内存
    cudaFree(d_min_max);
}

// 测试函数：INT4 到 FP16 的反量化
void int4_to_fp16(const unsigned char* d_input, half* d_output, float input_min, float input_max, int size) {
    int block_size = 256;
    int grid_size = (size + 2 * block_size - 1) / (2 * block_size);
    
    int4_to_fp16_kernel<<<grid_size, block_size>>>(
        d_input, d_output, input_min, input_max, size);
}

// 测试主函数
int main() {
    const int size = 1024;
    
    // 分配主机内存
    half* h_input = (half*)malloc(size * sizeof(half));
    unsigned char* h_output = (unsigned char*)malloc((size + 1) / 2 * sizeof(unsigned char));
    half* h_recovered = (half*)malloc(size * sizeof(half));
    
    // 初始化输入数据
    for (int i = 0; i < size; i++) {
        h_input[i] = __float2half((float)rand() / RAND_MAX * 2.0f - 1.0f); // 随机值在 -1 到 1 之间
    }
    
    // 分配设备内存
    half* d_input;
    unsigned char* d_output;
    half* d_recovered;
    cudaMalloc(&d_input, size * sizeof(half));
    cudaMalloc(&d_output, (size + 1) / 2 * sizeof(unsigned char));
    cudaMalloc(&d_recovered, size * sizeof(half));
    
    // 将输入数据复制到设备
    cudaMemcpy(d_input, h_input, size * sizeof(half), cudaMemcpyHostToDevice);
    
    // 执行量化
    fp16_to_int4(d_input, d_output, size);
    
    // 将量化结果复制回主机
    cudaMemcpy(h_output, d_output, (size + 1) / 2 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
    
    // 计算输入数据的最小值和最大值（用于反量化测试）
    float min_val = FLT_MAX;
    float max_val = -FLT_MAX;
    for (int i = 0; i < size; i++) {
        float val = __half2float(h_input[i]);
        min_val = fminf(min_val, val);
        max_val = fmaxf(max_val, val);
    }
    
    // 执行反量化（测试用）
    int4_to_fp16(d_output, d_recovered, min_val, max_val, size);
    
    // 将反量化结果复制回主机
    cudaMemcpy(h_recovered, d_recovered, size * sizeof(half), cudaMemcpyDeviceToHost);
    
    // 计算量化误差
    float total_error = 0.0f;
    float max_error = 0.0f;
    for (int i = 0; i < size; i++) {
        float original = __half2float(h_input[i]);
        float recovered = __half2float(h_recovered[i]);
        float error = fabsf(original - recovered);
        total_error += error;
        max_error = fmaxf(max_error, error);
    }
    
    printf("量化结果统计:\n");
    printf("平均误差: %f\n", total_error / size);
    printf("最大误差: %f\n", max_error);
    printf("压缩率: %f\n", (float)(size * sizeof(half)) / ((size + 1) / 2 * sizeof(unsigned char)));
    
    // 释放内存
    free(h_input);
    free(h_output);
    free(h_recovered);
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_recovered);
    
    return 0;
}