#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <cstdio>   // fprintf
#include <cstdlib>  // exit
#include <cmath>
#include <vector>
#include "Tensor.h"
// CUDA error-checking macro: aborts with file/line context on any CUDA API failure
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(error)); \
            exit(1); \
        } \
    } while (0)
// Definition of the static cuBLAS handle declared in Tensor.h
cublasHandle_t Tensor::cublas_handle_ = nullptr;
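// Nothing in this file creates the handle; a minimal lazy-initialization sketch, assuming a
// hypothetical Tensor::init_cublas() declared in Tensor.h, could look like:
//
//     void Tensor::init_cublas() {
//         if (cublas_handle_ == nullptr) {
//             cublasCreate(&cublas_handle_);  // check the returned cublasStatus_t in real code
//         }
//     }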
// CUDA kernel for the ReLU forward pass: output = max(input, 0), elementwise
__global__ void relu_forward_cuda_kernel(const float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx] > 0.0f ? input[idx] : 0.0f;
    }
}
// CUDA kernel for the ReLU backward pass: the gradient passes through only where the input was positive
__global__ void relu_backward_cuda_kernel(const float* grad_output, const float* input, float* grad_input, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        grad_input[idx] = input[idx] > 0.0f ? grad_output[idx] : 0.0f;
    }
}
// Wrapper functions with C linkage for external callers; all pointers must be device pointers
extern "C" void relu_forward_kernel(float* input, float* output, int size) {
    int block_size = 256;
    int num_blocks = (size + block_size - 1) / block_size;
    relu_forward_cuda_kernel<<<num_blocks, block_size>>>(input, output, size);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}
extern "C" void relu_backward_kernel(float* grad_output, float* input, float* grad_input, int size) {
    int block_size = 256;
    int num_blocks = (size + block_size - 1) / block_size;
    relu_backward_cuda_kernel<<<num_blocks, block_size>>>(grad_output, input, grad_input, size);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}
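// A minimal host-side usage sketch (not part of the original API; `run_relu_example` and the
// host buffer names are hypothetical): allocate device memory, copy the input over, call the
// wrapper above, and copy the result back.
extern "C" void run_relu_example(const float* host_in, float* host_out, int size) {
    float* d_in = nullptr;
    float* d_out = nullptr;
    CUDA_CHECK(cudaMalloc(&d_in, size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_out, size * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_in, host_in, size * sizeof(float), cudaMemcpyHostToDevice));
    relu_forward_kernel(d_in, d_out, size);  // synchronizes internally via cudaDeviceSynchronize
    CUDA_CHECK(cudaMemcpy(host_out, d_out, size * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
}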
// CUDA kernel that finds the per-row maximum (one block per batch row, single-threaded scan)
__global__ void find_max_cuda_kernel(const float* input, float* max_vals, int batch_size, int num_classes) {
    int batch_idx = blockIdx.x;
    if (batch_idx >= batch_size) return;
    float max_val = input[batch_idx * num_classes];
    for (int j = 1; j < num_classes; ++j) {
        max_val = fmaxf(max_val, input[batch_idx * num_classes + j]);
    }
    max_vals[batch_idx] = max_val;
}
// CUDA kernel that computes a numerically stable softmax. It must be launched with exactly one
// block per batch row and blockDim.x == num_classes (so num_classes <= 1024); otherwise the
// early return below would diverge across the __syncthreads() calls.
__global__ void softmax_cuda_kernel(const float* input, float* output, float* max_vals,
                                    int batch_size, int num_classes) {
    int batch_idx = blockIdx.x;
    int class_idx = threadIdx.x;
    if (batch_idx >= batch_size || class_idx >= num_classes) return;
    int idx = batch_idx * num_classes + class_idx;
    float max_val = max_vals[batch_idx];
    // Compute exp(x - max_val) for numerical stability
    float exp_val = expf(input[idx] - max_val);
    output[idx] = exp_val;
    __syncthreads();
    // Thread 0 of each block sums the exponentials for its row
    if (class_idx == 0) {
        float sum_exp = 0.0f;
        for (int j = 0; j < num_classes; ++j) {
            sum_exp += output[batch_idx * num_classes + j];
        }
        max_vals[batch_idx] = sum_exp; // reuse max_vals to hold the row sum
    }
    __syncthreads();
    // Normalize
    output[idx] /= max_vals[batch_idx];
}
// CUDA kernel that accumulates the mean cross-entropy loss over the batch
__global__ void cross_entropy_cuda_kernel(const float* softmax_output, const float* target,
                                          float* loss, int batch_size, int num_classes) {
    int batch_idx = blockIdx.x;
    if (batch_idx >= batch_size) return;
    float batch_loss = 0.0f;
    for (int j = 0; j < num_classes; ++j) {
        int idx = batch_idx * num_classes + j;
        if (target[idx] > 0) {
            batch_loss -= target[idx] * logf(softmax_output[idx] + 1e-10f);
        }
    }
    atomicAdd(loss, batch_loss / batch_size);
}
// Wrapper function for external callers. `workspace` must hold at least
// batch_size * (1 + num_classes) floats: the first batch_size elements store the per-row maxima
// (later reused for the row sums) and the remainder holds the softmax output.
extern "C" void cuda_cross_entropy_loss(float* input, float* target, float* output,
                                        float* workspace, int batch_size, int num_classes) {
    // Carve the temporaries out of the caller-provided workspace
    float* max_vals = workspace;                    // first batch_size elements
    float* softmax_output = workspace + batch_size; // remaining space for the softmax output
    // Zero the scalar loss accumulator
    CUDA_CHECK(cudaMemset(output, 0, sizeof(float)));
    // Per-row maximum for numerical stability
    find_max_cuda_kernel<<<batch_size, 1>>>(input, max_vals, batch_size, num_classes);
    // Softmax (one block per row, one thread per class)
    softmax_cuda_kernel<<<batch_size, num_classes>>>(input, softmax_output, max_vals,
                                                     batch_size, num_classes);
    // Cross-entropy loss
    cross_entropy_cuda_kernel<<<batch_size, 1>>>(softmax_output, target, output,
                                                 batch_size, num_classes);
    // Check for errors
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}
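// End-to-end usage sketch (an assumption for illustration; `run_loss_example` is a hypothetical
// name): device buffers for the logits, one-hot targets, the scalar loss, and a workspace sized
// batch_size * (1 + num_classes) as required above.
extern "C" float run_loss_example(const float* h_logits, const float* h_targets,
                                  int batch_size, int num_classes) {
    int n = batch_size * num_classes;
    float *d_logits, *d_targets, *d_loss, *d_workspace;
    CUDA_CHECK(cudaMalloc(&d_logits, n * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_targets, n * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_loss, sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_workspace, batch_size * (1 + num_classes) * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_logits, h_logits, n * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_targets, h_targets, n * sizeof(float), cudaMemcpyHostToDevice));
    cuda_cross_entropy_loss(d_logits, d_targets, d_loss, d_workspace, batch_size, num_classes);
    float loss = 0.0f;
    CUDA_CHECK(cudaMemcpy(&loss, d_loss, sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_logits));
    CUDA_CHECK(cudaFree(d_targets));
    CUDA_CHECK(cudaFree(d_loss));
    CUDA_CHECK(cudaFree(d_workspace));
    return loss;
}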
// Naive direct convolution (no padding). Intended launch: grid = (batch_size, out_channels,
// out_h * out_w) with one thread per block, so each block computes one output element.
__global__ void conv2d_forward_kernel(float* input, float* weight, float* output,
                                      int batch_size, int in_channels, int out_channels,
                                      int height, int width, int kernel_size, int stride) {
    int out_h = (height - kernel_size) / stride + 1;
    int out_w = (width - kernel_size) / stride + 1;
    int b = blockIdx.x;
    int oc = blockIdx.y;
    int oh = blockIdx.z / out_w;
    int ow = blockIdx.z % out_w;
    if (b >= batch_size || oc >= out_channels || oh >= out_h) return;
    float sum = 0.0f;
    for (int ic = 0; ic < in_channels; ic++) {
        for (int kh = 0; kh < kernel_size; kh++) {
            for (int kw = 0; kw < kernel_size; kw++) {
                int ih = oh * stride + kh;
                int iw = ow * stride + kw;
                int input_idx = ((b * in_channels + ic) * height + ih) * width + iw;
                int weight_idx = ((oc * in_channels + ic) * kernel_size + kh) * kernel_size + kw;
                sum += input[input_idx] * weight[weight_idx];
            }
        }
    }
    int output_idx = ((b * out_channels + oc) * out_h + oh) * out_w + ow;
    output[output_idx] = sum;
}
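// Host-side launch sketch (an assumption inferred from the kernel's indexing; `conv2d_forward`
// is a hypothetical wrapper name): one single-threaded block per output element. All pointers
// are device pointers. Note that grid.z is limited to 65535, so out_h * out_w must not exceed that.
extern "C" void conv2d_forward(float* input, float* weight, float* output,
                               int batch_size, int in_channels, int out_channels,
                               int height, int width, int kernel_size, int stride) {
    int out_h = (height - kernel_size) / stride + 1;
    int out_w = (width - kernel_size) / stride + 1;
    dim3 grid(batch_size, out_channels, out_h * out_w);
    conv2d_forward_kernel<<<grid, 1>>>(input, weight, output, batch_size, in_channels,
                                       out_channels, height, width, kernel_size, stride);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}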
// Tensor's CUDA-related methods
void Tensor::to_cuda() {
    if (device_ == "cuda") return;
    allocate_cuda_memory();
    sync_to_cuda();
    device_ = "cuda";
}
void Tensor::to_cpu() {
    if (device_ == "cpu") return;
    sync_to_cpu();
    free_cuda_memory();
    device_ = "cpu";
}
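// Usage sketch (hypothetical; assumes a Tensor constructor and member access beyond what is
// shown in this file): move data to the GPU, run a kernel on it, then bring the result back.
//
//     Tensor t(shape);                 // hypothetical constructor
//     t.to_cuda();                     // allocates device memory and uploads host data
//     relu_forward_kernel(t.cuda_data_, t.cuda_data_, t.size_);  // assumes accessible members
//     t.to_cpu();                      // downloads the result and frees device memory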
void Tensor::allocate_cuda_memory() {
    if (cuda_data_ != nullptr) return;
    CUDA_CHECK(cudaMalloc(&cuda_data_, size_ * sizeof(float)));
}
void Tensor::free_cuda_memory() {
    if (cuda_data_ != nullptr) {
        CUDA_CHECK(cudaFree(cuda_data_));
        cuda_data_ = nullptr;
    }
}
void Tensor::sync_to_cuda() {
    if (data_ == nullptr || cuda_data_ == nullptr) return;
    CUDA_CHECK(cudaMemcpy(cuda_data_, data_.get(), size_ * sizeof(float), cudaMemcpyHostToDevice));
}
void Tensor::sync_to_cpu() {
    if (data_ == nullptr || cuda_data_ == nullptr) return;
    CUDA_CHECK(cudaMemcpy(data_.get(), cuda_data_, size_ * sizeof(float), cudaMemcpyDeviceToHost));
}
Why does this report the error: name followed by "::" must be a class or namespace name?

This diagnostic means the compiler does not recognize `Tensor` as a class or namespace at the point where a `Tensor::`-qualified name appears. It usually indicates that the `Tensor` class declaration is not visible in that translation unit, e.g. "Tensor.h" is missing from the include path, the include was dropped, or its include guard clashes with another header. Note also that a static data member must be defined exactly once per program, so a second `cublasHandle_t Tensor::cublas_handle_ = nullptr;` in the same file is an error in its own right.