使用cublas实现矩阵乘法

原创已于 2023-03-20 15:32:25 修改 · 2.5k 阅读

17 ·

CC 4.0 BY-SA版权

文章标签：

#矩阵 #CUDA #C++

于 2023-03-16 11:55:15 首次发布

CUDA 专栏收录该内容

3 篇文章

订阅专栏

部署运行你感兴趣的模型镜像

使用CUDA写一个矩阵乘法C = A X B（矩阵维度：A: M X K, B: K X N, C: M X N），当然可以自己写核函数，但效率不如CUDA自带的cublas算法效率高。使用cublas唯一值得注意的地方是，在CPU中的矩阵数据存储是行优先存储，而在GPU中是列优先存储，这相当于对原矩阵做了一次转置，我们知道矩阵的两次转置等于原矩阵，要让最后的结果正确，在GPU中计算要使用：TC = T(A X B) = TB x TA，这里的T表示矩阵转置。具体原理可参见这篇博客：https://blog.youkuaiyun.com/HaoBBNuanMM/article/details/103054357，里面解释得非常直观详细。我刚开始没搞清楚这个差异，结果始终不对，还以为数据复制出了问题，直到查到这篇博客后才豁然开朗。下面是实现代码（matrix_multiply_cublas.cu）：

#include <algorithm>
#include <chrono>
#include <iostream>
#include <iterator>
#include <numeric>
#include <random>
#include <vector>

#include <cuda_runtime.h>
#include <cublas_v2.h>

// For units of time such as h, min, s, ms, us, ns
using namespace std::literals::chrono_literals;

namespace {
constexpr size_t M = 400;
constexpr size_t N = 500;
constexpr size_t K = 600;
}  // namespace

int main() {
  // Initialize CUBLAS
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "!!!! CUBLAS initialization error\n";
    return EXIT_FAILURE;
  }

  // Initialize the host input matrix;
  std::vector<float> mat_a(M * K, 0.0f);
  std::vector<float> mat_b(K * N, 0.0f);

  // Fill the matrices with random numbers
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<int> dis(0, 100000);
  auto rand_num = [&dis, &gen]() { return dis(gen); };
  std::generate(mat_a.begin(), mat_a.end(), rand_num);
  std::generate(mat_b.begin(), mat_b.end(), rand_num);

  // Show the test matrix data.
  if (M == 2 && N == 4 && K == 3) {
    //      1 2 3
    // A =
    //      4 5 6
    std::iota(mat_a.begin(), mat_a.end(), 1.0f);
    //      1  2  3  4
    // B =  5  6  7  8
    //      9 10 11 12

    std::iota(mat_b.begin(), mat_b.end(), 1.0f);
    //           38 44  50  56
    // C = AB =
    //           83 98 113 128

    std::cout << "A = \n";
    for (size_t i = 0; i < M * K; ++i) {
      std::cout << mat_a[i] << '\t';
      if ((i + 1) % K == 0) {
        std::cout << '\n';
      }
    }
    std::cout << "B = \n";
    for (size_t i = 0; i < K * N; ++i) {
      std::cout << mat_b[i] << '\t';
      if ((i + 1) % N == 0) {
        std::cout << '\n';
      }
    }
  }

  // Allocate device memory for the matrices
  float *device_mat_a = nullptr;
  float *device_mat_b = nullptr;
  float *device_mat_c = nullptr;
  if (cudaMalloc(reinterpret_cast<void **>(&device_mat_a),
                 M * K * sizeof(float)) != cudaSuccess) {
    std::cerr << "!!!! device memory allocation error (allocate A)\n";
    return EXIT_FAILURE;
  }

  if (cudaMalloc(reinterpret_cast<void **>(&device_mat_b),
                 K * N * sizeof(float)) != cudaSuccess) {
    std::cerr << "!!!! device memory allocation error (allocate B)\n";
    cudaFree(device_mat_a);
    return EXIT_FAILURE;
  }

  if (cudaMalloc(reinterpret_cast<void **>(&device_mat_c),
                 M * N * sizeof(float)) != cudaSuccess) {
    std::cerr << "!!!! device memory allocation error (allocate C)\n";
    cudaFree(device_mat_a);
    cudaFree(device_mat_b);
    return EXIT_FAILURE;
  }

  // Initialize the device matrices with the host matrices.
  cudaMemcpy(device_mat_a, mat_a.data(), M * K * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(device_mat_b, mat_b.data(), K * N * sizeof(float),
             cudaMemcpyHostToDevice);

  // Free up host memories.
  mat_a.clear();
  mat_a.shrink_to_fit();
  mat_b.clear();
  mat_b.shrink_to_fit();

  // Performs operation using cublas
  // C = alpha * transa(A)*transb(B) + beta * C
  // `transa` indicates whether the matrix A is transposed or not.
  // `transb` indicates whether the matrix B is transposed or not.
  // A: m x k
  // B: k x n
  // C: m x n
  // LDA, LDB, LDC are the leading dimensions of the three matrices,
  // respectively.
  // If C = A x B is calculated, there is alpha = 1.0, beta = 0.0,
  // transa = CUBLAS_OP_N, transb = CUBLAS_OP_N

  // cublasStatus_t cublasSgemm(cublasHandle_t handle,
  //                          cublasOperation_t transa, cublasOperation_t
  //                          transb, int m, int n, int k, const float *alpha,
  //                          const float *A, int lda,
  //                          const float *B, int ldb,
  //                          const float *beta,
  //                          float *C, int ldc);
  float alpha = 1.0f;
  float beta = 0.0f;
  // Note that cublas is column primary! We need to transpose the order!
  // In CPU: C = A x B
  // But in GPU: CT = (A x B)T = BT x AT, T means transpose
  status =
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
                  device_mat_b, N, device_mat_a, K, &beta, device_mat_c, N);
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "!!!! cublas kernel execution error.\n";
    cudaFree(device_mat_a);
    cudaFree(device_mat_b);
    cudaFree(device_mat_c);
    return EXIT_FAILURE;
  }

  // Read back the result from device memory.
  std::vector<float> mat_c(M * N, 0.0f);
  cudaMemcpy(mat_c.data(), device_mat_c, M * N * sizeof(float),
             cudaMemcpyDeviceToHost);

  // Show the test results.
  if (M == 2 && N == 4 && K == 3) {
    std::cout << "C = AB = \n";
    for (size_t i = 0; i < M * N; ++i) {
      std::cout << mat_c[i] << '\t';
      if ((i + 1) % N == 0) {
        std::cout << '\n';
      }
    }
  }

  // Memory clean up
  cudaFree(device_mat_a);
  cudaFree(device_mat_b);
  cudaFree(device_mat_c);

  // Shutdown cublas
  cublasDestroy(handle);

  return 0;
}

上述代码中，关于CUBLAS函数：

cublasStatus_t cublasSgemm(cublasHandle_t handle,
                        cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, 
                        const float *alpha, const float *A, int lda, const float *B, int ldb,
                        const float *beta, float *C, int ldc);

的解释如下，该函数实际上计算的是如下公式：

C = alpha * transa(A)*transb(B) + beta * C

矩阵维度为：A: m x k， B: k x n，C: m x n，
我们现在要计算两个矩阵的乘法：C = AB，则要求：
alpha = 1.0, beta = 0.0， transa = transa = CUBLAS_OP_N（即矩阵不转置）
函数各参数的含义如下：
handle：cublas库句柄（即指针）；transa，transb：矩阵是否转置（包括Ｈermite转置）；m, n, k：矩阵的维度；alpha, beta：公式的系数；A, B, C：矩阵数据指针，要求使用一维指针；lda, ldb, ldc：矩阵Ａ、B、C的主维度，即A、B、C数据指针中连续存储的元素长度。在CUBLAS中，矩阵元素以列优先方式存储，那么对于矩阵A、B、C而言，其数据指针中连续存储的元素长度分别为m, n, k（可以思考一下为什么是这样），这就是lda, ldb, ldc的取值。
前已述及，我们在代码中调用该函数，实际上使用的是如下公式：

TC = T(A X B) = TB x TA

矩阵维度为：TB: n x k， TA: k x m，TC: n x m，于是调用参数为：

cublasHandle_t handle: handle
cublasOperation_t transa: CUBLAS_OP_N
cublasOperation_t transb: CUBLAS_OP_N
int m: N
int n: M
int k: K
const float *alpha: &alpha
const float *A: device_mat_b
int lda: N
const float *B: device_mat_a
int ldb: K
const float *beta: &beta
float *C: device_mat_c
int ldc: N

生成随机数，我使用的是C++ 11方式：

std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 100000);
auto rand_num = [&dis, &gen]() { return dis(gen); };
std::generate(mat_a.begin(), mat_a.end(), rand_num);
std::generate(mat_b.begin(), mat_b.end(), rand_num);

生成测试数据，我也是使用的C++ STL库算法std::iota来实现：

std::iota(mat_a.begin(), mat_a.end(), 1.0f);    
std::iota(mat_b.begin(), mat_b.end(), 1.0f);

主机内存我直接使用std::vector，而不是使用C语言函数malloc来分配，释放内存代码如下：

mat_a.clear();
mat_a.shrink_to_fit();
mat_b.clear();
mat_b.shrink_to_fit();

请大家熟悉C++的编码方式。

CMake编译文件CMakeLists.txt内容如下所示：

cmake_minimum_required(VERSION 3.0.0)
# Set the project name and its version
project(matrix_multiply VERSION 0.1.0)

# Set the c++17 standard
set(CMAKE_CXX_STANDARD 17)

# Set the -O3 optimization level with debug information
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")

# Disable warning messages
cmake_policy(SET CMP0074 NEW)

# Find CUDA
find_package(CUDA REQUIRED)

# These flags embed debugging information for both host and device code
set(CUDA_NVCC_FLAGS -G -g)

# Generate a cuda executable file.
cuda_add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
target_link_libraries(${PROJECT_NAME} ${CUDA_cublas_LIBRARY})

上述配置文件的含义如下图所示：
CUDA配置

编译方式说明：

使用cmake的编译：

mkdir build && cd build
# 生成Makefile
cmake ..
# 构建目标，也可直接使用：make
cmake --build .

如直接使用nvcc编译，指令为：

nvcc -O3 -G -g -std=c++17 matrix_multiply_cublas.cu -lcublas -o matrix_multiply_cublas

上述编译语句的选项含义为：-O3使用O3级优化，-G设备（device）代码包含调试信息，-g主机（host）代码包含调试信息，-std=c++17使用C++17标准，-lcublas表示链接cublas库。
注意：CUDA代码带调试信息会极大降低效率，除非调试需要，否则不要添加-G选项。

GCC编译器版本必须为9.1以上版本（Ubuntu 20.04 2021年以后的版本默认就是GCC 9.3）

将常量修改为M = 2, N = 4, K = 3，得到的测试结果如下：

 ./matrix_multiply_cublas 
A = 
1       2       3
4       5       6
B = 
1       2       3       4
5       6       7       8
9       10      11      12
C = AB = 
38      44      50      56
83      98      113     128

将常量修改为M = 400, N = 500, K = 600，得到的性能分析结果如下：

nvprof ./matrix_multiply_cublas 
==216602== NVPROF is profiling process 216602, command: ./matrix_multiply
==216602== Profiling application: ./matrix_multiply
==216602== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   42.78%  184.93us         2  92.463us  75.519us  109.41us  [CUDA memcpy HtoD]
                   42.44%  183.49us         1  183.49us  183.49us  183.49us  volta_sgemm_128x32_nn
                   14.50%  62.687us         1  62.687us  62.687us  62.687us  [CUDA memcpy DtoH]
                    0.28%  1.2160us         1  1.2160us  1.2160us  1.2160us  [CUDA memset]
      API calls:   99.77%  670.35ms         8  83.794ms  2.8420us  352.22ms  cudaFree
                    0.08%  547.51us         3  182.50us  58.392us  347.88us  cudaMemcpy
                    0.06%  406.58us       297  1.3680us      81ns  98.280us  cuDeviceGetAttribute
                    0.05%  315.62us         6  52.602us  3.5290us  117.89us  cudaMalloc
                    0.02%  113.83us       751     151ns      88ns     772ns  cuGetProcAddress
                    0.01%  92.881us         3  30.960us  15.184us  60.794us  cuDeviceGetName
                    0.00%  14.385us         1  14.385us  14.385us  14.385us  cudaLaunchKernel
                    0.00%  11.389us        18     632ns     269ns  5.8650us  cudaEventCreateWithFlags
                    0.00%  10.808us         1  10.808us  10.808us  10.808us  cuDeviceGetPCIBusId
                    0.00%  6.9020us         2  3.4510us     524ns  6.3780us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
                    0.00%  6.3610us         4  1.5900us     628ns  2.7490us  cudaDeviceSynchronize
                    0.00%  5.6670us         1  5.6670us  5.6670us  5.6670us  cudaMemsetAsync
                    0.00%  5.5610us        18     308ns     226ns     947ns  cudaEventDestroy
                    0.00%  5.3820us         3  1.7940us     321ns  3.5870us  cudaGetDevice
                    0.00%  4.2940us         5     858ns     202ns  2.6630us  cuDeviceGetCount
                    0.00%  4.0470us        14     289ns     171ns     960ns  cudaDeviceGetAttribute
                    0.00%  2.6340us         2  1.3170us  1.2010us  1.4330us  cuInit
                    0.00%  2.1070us         1  2.1070us  2.1070us  2.1070us  cudaEventRecord
                    0.00%  1.9590us         1  1.9590us  1.9590us  1.9590us  cudaEventQuery
                    0.00%  1.7620us         3     587ns     197ns  1.1320us  cudaStreamGetCaptureInfo
                    0.00%  1.7510us         4     437ns     101ns  1.2370us  cuDeviceGet
                    0.00%  1.2910us         3     430ns     219ns     777ns  cuDeviceTotalMem
                    0.00%  1.0030us         1  1.0030us  1.0030us  1.0030us  cudaGetSymbolAddress
                    0.00%     944ns         3     314ns     133ns     649ns  cuDeviceGetUuid
                    0.00%     282ns         2     141ns     127ns     155ns  cuDriverGetVersion
                    0.00%     236ns         1     236ns     236ns     236ns  cudaGetLastError
                    0.00%     100ns         1     100ns     100ns     100ns  cuModuleGetLoadingMode

您可能感兴趣的与本文相关的镜像