CUDA in Visual Studio: "unresolved external symbol" error

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <vector>
#include "Tensor.h"
#include <cmath>
#include <cstdio>   // fprintf (needed by CUDA_CHECK)
#include <cstdlib>  // exit (needed by CUDA_CHECK)

// CUDA error-checking macro
#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(error)); \
            exit(1); \
        } \
    } while (0)

// Initialize the cuBLAS handle
cublasHandle_t Tensor::cublas_handle_ = nullptr;

// CUDA kernel for the ReLU forward pass
__global__ void relu_forward_cuda_kernel(const float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx] > 0 ? input[idx] : 0;
    }
}

// CUDA kernel for the ReLU backward pass
__global__ void relu_backward_cuda_kernel(const float* grad_output, const float* input,
                                          float* grad_input, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        grad_input[idx] = input[idx] > 0 ? grad_output[idx] : 0;
    }
}

// Wrapper functions for external callers
extern "C" void relu_forward_kernel(float* input, float* output, int size) {
    int block_size = 256;
    int num_blocks = (size + block_size - 1) / block_size;
    relu_forward_cuda_kernel<<<num_blocks, block_size>>>(input, output, size);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}

extern "C" void relu_backward_kernel(float* grad_output, float* input, float* grad_input, int size) {
    int block_size = 256;
    int num_blocks = (size + block_size - 1) / block_size;
    relu_backward_cuda_kernel<<<num_blocks, block_size>>>(grad_output, input, grad_input, size);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}

// CUDA kernel that finds the per-batch maximum
__global__ void find_max_cuda_kernel(const float* input, float* max_vals,
                                     int batch_size, int num_classes) {
    int batch_idx = blockIdx.x;
    if (batch_idx >= batch_size) return;
    float max_val = input[batch_idx * num_classes];
    for (int j = 1; j < num_classes; ++j) {
        max_val = fmaxf(max_val, input[batch_idx * num_classes + j]);
    }
    max_vals[batch_idx] = max_val;
}

// CUDA kernel that computes softmax
__global__ void softmax_cuda_kernel(const float* input, float* output, float* max_vals,
                                    int batch_size, int num_classes) {
    int batch_idx = blockIdx.x;
    int class_idx = threadIdx.x;
    if (batch_idx >= batch_size || class_idx >= num_classes) return;
    int idx = batch_idx * num_classes + class_idx;
    float max_val = max_vals[batch_idx];
    // Compute exp(x - max_val)
    float exp_val = expf(input[idx] - max_val);
    output[idx] = exp_val;
    __syncthreads();
    // Compute the sum
    if (class_idx == 0) {
        float sum_exp = 0.0f;
        for (int j = 0; j < num_classes; ++j) {
            sum_exp += output[batch_idx * num_classes + j];
        }
        max_vals[batch_idx] = sum_exp; // reuse the max_vals array to store the sum
    }
    __syncthreads();
    // Normalize
    output[idx] /= max_vals[batch_idx];
}

// CUDA kernel that computes the cross-entropy loss
__global__ void cross_entropy_cuda_kernel(const float* softmax_output, const float* target,
                                          float* loss, int batch_size, int num_classes) {
    int batch_idx = blockIdx.x;
    if (batch_idx >= batch_size) return;
    float batch_loss = 0.0f;
    for (int j = 0; j < num_classes; ++j) {
        int idx = batch_idx * num_classes + j;
        if (target[idx] > 0) {
            batch_loss -= target[idx] * logf(softmax_output[idx] + 1e-10f);
        }
    }
    atomicAdd(loss, batch_loss / batch_size);
}

// Wrapper function for external callers
extern "C" void cuda_cross_entropy_loss(float* input, float* target, float* output,
                                        float* workspace, int batch_size, int num_classes) {
    // Carve temporary storage out of the workspace
    float* max_vals = workspace;                    // first batch_size elements of the workspace
    float* softmax_output = workspace + batch_size; // remaining space holds the softmax output
    // Initialize the output to 0
    CUDA_CHECK(cudaMemset(output, 0, sizeof(float)));
    // Per-batch maximum
    find_max_cuda_kernel<<<batch_size, 1>>>(input, max_vals, batch_size, num_classes);
    // Softmax
    softmax_cuda_kernel<<<batch_size, num_classes>>>(input, softmax_output, max_vals,
                                                     batch_size, num_classes);
    // Cross-entropy loss
    cross_entropy_cuda_kernel<<<batch_size, 1>>>(softmax_output, target, output,
                                                 batch_size, num_classes);
    // Check for errors
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}

__global__ void conv2d_forward_kernel(float* input, float* weight, float* output,
                                      int batch_size, int in_channels, int out_channels,
                                      int height, int width, int kernel_size, int stride) {
    int b = blockIdx.x;
    int oc = blockIdx.y;
    int oh = blockIdx.z / ((width - kernel_size) / stride + 1);
    int ow = blockIdx.z % ((width - kernel_size) / stride + 1);
    float sum = 0.0f;
    for (int ic = 0; ic < in_channels; ic++) {
        for (int kh = 0; kh < kernel_size; kh++) {
            for (int kw = 0; kw < kernel_size; kw++) {
                int ih = oh * stride + kh;
                int iw = ow * stride + kw;
                int input_idx = ((b * in_channels + ic) * height + ih) * width + iw;
                int weight_idx = ((oc * in_channels + ic) * kernel_size + kh) * kernel_size + kw;
                sum += input[input_idx] * weight[weight_idx];
            }
        }
    }
    int output_idx = ((b * out_channels + oc) * ((height - kernel_size) / stride + 1) + oh)
                     * ((width - kernel_size) / stride + 1) + ow;
    output[output_idx] = sum;
}

// CUDA-related Tensor methods
void Tensor::to_cuda() {
    if (device_ == "cuda") return;
    allocate_cuda_memory();
    sync_to_cuda();
    device_ = "cuda";
}

void Tensor::to_cpu() {
    if (device_ == "cpu") return;
    sync_to_cpu();
    free_cuda_memory();
    device_ = "cpu";
}

void Tensor::allocate_cuda_memory() {
    if (cuda_data_ != nullptr) return;
    CUDA_CHECK(cudaMalloc(&cuda_data_, size_ * sizeof(float)));
}

void Tensor::free_cuda_memory() {
    if (cuda_data_ != nullptr) {
        CUDA_CHECK(cudaFree(cuda_data_));
        cuda_data_ = nullptr;
    }
}

void Tensor::sync_to_cuda() {
    if (data_ == nullptr || cuda_data_ == nullptr) return;
    CUDA_CHECK(cudaMemcpy(cuda_data_, data_.get(), size_ * sizeof(float), cudaMemcpyHostToDevice));
}

void Tensor::sync_to_cpu() {
    if (data_ == nullptr || cuda_data_ == nullptr) return;
    CUDA_CHECK(cudaMemcpy(data_.get(), cuda_data_, size_ * sizeof(float), cudaMemcpyDeviceToHost));
}

// Note: this is a second definition of cublas_handle_; it is already defined near the top of this file
cublasHandle_t Tensor::cublas_handle_ = nullptr;

Why does this code produce the error: name followed by "::" must be a class or namespace name?
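That diagnostic means that, at the point of the `Tensor::...` definitions, the compiler (or Visual Studio's IntelliSense) does not know `Tensor` as a class type — typically because Tensor.h is not found on the include path for this .cu file, because the header does not actually declare a class named Tensor before these lines, or because the .cu file is not being compiled as CUDA C/C++ (in Visual Studio, check that the file's Item Type is "CUDA C/C++"; otherwise MSVC parses it and trips over the CUDA constructs as well). For reference, here is a minimal sketch of what Tensor.h would need to declare for the definitions above to compile. The member types (std::shared_ptr<float> for data_, std::string for device_) are assumptions inferred from how the code uses them, not taken from the original project.

// Hypothetical minimal Tensor.h sketch — member types are inferred assumptions
#pragma once
#include <cublas_v2.h>
#include <memory>
#include <string>

class Tensor {
public:
    void to_cuda();
    void to_cpu();

private:
    void allocate_cuda_memory();
    void free_cuda_memory();
    void sync_to_cuda();
    void sync_to_cpu();

    std::shared_ptr<float> data_;          // host buffer; the code above calls data_.get()
    float* cuda_data_ = nullptr;           // device buffer
    size_t size_ = 0;                      // number of float elements
    std::string device_ = "cpu";           // "cpu" or "cuda"
    static cublasHandle_t cublas_handle_;  // declared here, defined exactly once in a .cu/.cpp file
};

Two related issues are worth noting. First, `cublasHandle_t Tensor::cublas_handle_ = nullptr;` appears twice in the .cu file above; even once the header problem is fixed, the second definition will be rejected as a redefinition, so one of the two lines should be removed. Second, the "unresolved external symbol" error mentioned in the title usually points the other way: a static member (or function) that is declared but never defined in any translation unit that actually gets compiled and linked, which can happen if the .cu file is excluded from the build.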