#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cfloat>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <random>
#include <sstream>
#include <string>
#include <vector>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublas_v2.h>
#include <cudnn.h>
#include "readubyte.h"
// Block width for CUDA kernels
#define BW 128
#ifdef USE_GFLAGS
#include <gflags/gflags.h>
#ifndef _WIN32
#define gflags google
#endif
#else
// Constant versions of gflags
#define DEFINE_int32(flag, default_value, description) const int FLAGS_##flag = (default_value)
#define DEFINE_uint64(flag, default_value, description) const unsigned long long FLAGS_##flag = (default_value)
#define DEFINE_bool(flag, default_value, description) const bool FLAGS_##flag = (default_value)
#define DEFINE_double(flag, default_value, description) const double FLAGS_##flag = (default_value)
#define DEFINE_string(flag, default_value, description) const std::string FLAGS_##flag ((default_value))
#endif
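// When USE_GFLAGS is defined, main() is expected to parse the flags at
// startup so the FLAGS_* values below become runtime-configurable; without
// it, the macros above bake the defaults in as compile-time constants.
// A sketch of the expected call site (not part of this excerpt):
//
//   #ifdef USE_GFLAGS
//   gflags::ParseCommandLineFlags(&argc, &argv, true);
//   #endif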
/**
* Computes ceil(x / y) for integral nonnegative values.
*/
static inline unsigned int RoundUp(unsigned int numerator, unsigned int denominator)
{
    return (numerator + denominator - 1) / denominator;
}
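// RoundUp is the usual grid-size computation for 1D kernel launches, e.g.
// covering 'size' elements with blocks of BW threads (see FillOnes below;
// 'd_ones' is an illustrative device pointer, not part of this excerpt):
//
//   FillOnes<<<RoundUp(size, BW), BW>>>(d_ones, size);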
/**
* Saves a PGM grayscale image out of unsigned 8-bit data
*/
void SavePGMFile(const unsigned char *data, size_t width, size_t height, const char *filename)
{
    FILE *fp = fopen(filename, "wb");
    if (fp)
    {
        // Binary PGM ("P5") header: magic number, dimensions, maximum gray value.
        // %zu is the portable format specifier for size_t.
        fprintf(fp, "P5\n%zu %zu\n255\n", width, height);
        fwrite(data, sizeof(unsigned char), width * height, fp);
        fclose(fp);
    }
}
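// Example: dumping a 28x28 MNIST digit for visual inspection ('image_data'
// is an illustrative host buffer of 8-bit pixels, not part of this excerpt):
//
//   SavePGMFile(image_data, 28, 28, "digit.pgm");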
//
// Error handling
// Adapted from the CUDNN classification code sample: https://developer.nvidia.com/cuDNN
#define FatalError(s) do { \
    std::stringstream _message; \
    _message << std::string(s) << "\n" << __FILE__ << ':' << __LINE__; \
    std::cerr << _message.str() << "\nAborting...\n"; \
    cudaDeviceReset(); \
    exit(1); \
} while(0)
// Evaluate the status expression only once so that a failing call is not
// re-executed while formatting the error message.
#define checkCUDNN(status) do { \
    cudnnStatus_t _status = (status); \
    if (_status != CUDNN_STATUS_SUCCESS) { \
        std::stringstream _error; \
        _error << "CUDNN failure: " << cudnnGetErrorString(_status); \
        FatalError(_error.str()); \
    } \
} while(0)
// Used for both CUDA runtime and cuBLAS calls: cudaError_t and
// cublasStatus_t both define success as 0.
#define checkCudaErrors(status) do { \
    int _status = static_cast<int>(status); \
    if (_status != 0) { \
        std::stringstream _error; \
        _error << "Cuda failure: " << _status; \
        FatalError(_error.str()); \
    } \
} while(0)
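// Typical usage wraps every CUDA/cuBLAS/cuDNN call, e.g. (d_data, n, c, h, w
// are illustrative names, not part of this excerpt):
//
//   checkCudaErrors(cudaMalloc(&d_data, sizeof(float) * n * c * h * w));
//   checkCUDNN(cudnnSetTensor4dDescriptor(dataTensor, CUDNN_TENSOR_NCHW,
//                                         CUDNN_DATA_FLOAT, n, c, h, w));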
///
// Command-line flags
// Application parameters
DEFINE_int32(gpu, 0, "The GPU ID to use");
DEFINE_int32(iterations, 1000, "Number of iterations for training");
DEFINE_int32(random_seed, -1, "Override random seed (default uses std::random_device)");
DEFINE_int32(classify, -1, "Number of images to classify to compute error rate (default uses entire test set)");
// Batch parameters
DEFINE_uint64(batch_size, 64, "Batch size for training");
// Filenames
DEFINE_bool(pretrained, false, "Use the pretrained CUDNN model as input");
DEFINE_bool(save_data, false, "Save pretrained weights to file");
DEFINE_string(train_images, "train-images-idx3-ubyte", "Training images filename");
DEFINE_string(train_labels, "train-labels-idx1-ubyte", "Training labels filename");
DEFINE_string(test_images, "t10k-images-idx3-ubyte", "Test images filename");
DEFINE_string(test_labels, "t10k-labels-idx1-ubyte", "Test labels filename");
// Solver parameters
DEFINE_double(learning_rate, 0.01, "Base learning rate");
DEFINE_double(lr_gamma, 0.0001, "Learning rate policy gamma");
DEFINE_double(lr_power, 0.75, "Learning rate policy power");
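// Together these implement Caffe's "inv" learning rate policy; the training
// loop (outside this excerpt) computes the per-iteration rate as:
//
//   float lr = static_cast<float>(FLAGS_learning_rate *
//                                 pow(1.0 + FLAGS_lr_gamma * iter, -FLAGS_lr_power));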
/**
* Represents a convolutional layer with bias.
*/
struct ConvBiasLayer
{
    int in_channels, out_channels, kernel_size;
    int in_width, in_height, out_width, out_height;
    std::vector<float> pconv, pbias;
    ConvBiasLayer(int in_channels_, int out_channels_, int kernel_size_,
                  int in_w_, int in_h_) :
        pconv(in_channels_ * kernel_size_ * kernel_size_ * out_channels_),
        pbias(out_channels_)
    {
        in_channels = in_channels_;
        out_channels = out_channels_;
        kernel_size = kernel_size_;
        in_width = in_w_;
        in_height = in_h_;
        // Valid convolution (no padding, stride 1): each spatial dimension
        // shrinks by kernel_size - 1.
        out_width = in_w_ - kernel_size_ + 1;
        out_height = in_h_ - kernel_size_ + 1;
    }
};
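/**
 * Represents a fully-connected neural network layer with bias.
 * (Referenced by TrainingContext below but missing from this excerpt;
 * a minimal sketch consistent with that usage.)
 */
struct FullyConnectedLayer
{
    int inputs, outputs;
    std::vector<float> pneurons, pbias;
    FullyConnectedLayer(int inputs_, int outputs_) :
        inputs(inputs_), outputs(outputs_),
        pneurons(inputs_ * outputs_), pbias(outputs_) {}
};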
///
// GPU Kernels
/**
* Fills a floating-point array with ones.
*
* @param vec The array to fill.
* @param size The number of elements in the array.
*/
__global__ void FillOnes(float *vec, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= size)
        return;
    vec[idx] = 1.0f;
}
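// A ones vector is the standard GEMM trick for biases: a rank-1 update
// against it broadcasts the bias over the batch in the forward pass, and a
// matrix-vector product against it sums the per-sample bias gradients in the
// backward pass. Launch sketch ('d_onevec' is an illustrative device buffer
// of batch_size floats):
//
//   FillOnes<<<RoundUp(batch_size, BW), BW>>>(d_onevec, batch_size);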
/**
* Computes the backpropagation results of the Softmax loss for each result in a batch.
* Uses the softmax values obtained from forward propagation to compute the difference.
*
* @param label The training batch label values.
* @param num_labels The number of possible labels.
* @param batch_size The size of the trained batch.
 * @param diff The resulting gradient; on entry it must contain the softmax probabilities from forward propagation.
*/
__global__ void SoftmaxLossBackprop(const float *label, int num_labels, int batch_size, float *diff)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= batch_size)
        return;
    const int label_value = static_cast<int>(label[idx]);
    // For each item in the batch, subtract 1 from the probability stored at
    // the true label's index.
    diff[idx * num_labels + label_value] -= 1.0f;
}
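// For softmax followed by cross-entropy loss, the gradient w.r.t. the logits
// is p - y: the softmax probabilities, minus 1 at the true label. Call-site
// sketch ('d_diff', 'd_softmax', 'd_labels' are illustrative buffers):
//
//   checkCudaErrors(cudaMemcpyAsync(d_diff, d_softmax,
//       sizeof(float) * batch_size * num_labels, cudaMemcpyDeviceToDevice));
//   SoftmaxLossBackprop<<<RoundUp(batch_size, BW), BW>>>(d_labels, num_labels,
//                                                        batch_size, d_diff);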
///
// CUDNN/CUBLAS training context
struct TrainingContext
{
    cudnnHandle_t cudnnHandle;
    cublasHandle_t cublasHandle;
    cudnnTensorDescriptor_t dataTensor, conv1Tensor, conv1BiasTensor,
                            conv2Tensor, conv2BiasTensor, fc1Tensor;
    cudnnFilterDescriptor_t conv1filterDesc, conv2filterDesc;
    cudnnConvolutionDescriptor_t conv1Desc, conv2Desc;
    cudnnConvolutionFwdAlgo_t conv1algo, conv2algo;
    cudnnConvolutionBwdFilterAlgo_t conv1bwfalgo, conv2bwfalgo;
    cudnnConvolutionBwdDataAlgo_t conv2bwdalgo;
    cudnnActivationDescriptor_t fc1Activation;
    int m_gpuid;
    int m_batchSize;
    size_t m_workspaceSize;
    FullyConnectedLayer& ref_fc1;
    // Disable copying
    TrainingContext& operator=(const TrainingContext&) = delete;
    TrainingContext(const TrainingContext&) = delete;
    TrainingContext(int gpuid, int batch_size,
                    ConvBiasLayer& conv1, ConvBiasLayer& conv2,
                    FullyConnectedLayer& fc1) : ref_fc1(fc1), m_gpuid(gpuid)
    {
        m_batchSize = batch_size;
        // Create CUBLAS and CUDNN handles
        checkCudaErrors(cudaSetDevice(gpuid));
        checkCudaErrors(cublasCreate(&cublasHandle));
        checkCUDNN(cudnnCreate(&cudnnHandle));
        // Create tensor descriptors
        checkCUDNN(cudnnCreateTensorDescriptor(&dataTensor));
        checkCUDNN(cudnnCreateTensorDescriptor(&conv1Tensor));
        checkCUDNN(cudnnCreateTensorDescriptor(&conv1BiasTensor));
        checkCUDNN(cudnnCreateTensorDescriptor(&conv2Tensor));
        checkCUDNN(cudnnCreateTensorDescriptor(&conv2BiasTensor));
        // (The listing is truncated here in the source; the remaining
        // descriptor creation and layer setup are not part of this excerpt.)