一个简洁的cublasSmatinvBatched应用示例

Eloudy

已于 2022-05-31 01:03:26 修改

阅读量439

点赞数

分类专栏： blas cuda 文章标签： cublas 线性代数深度学习

于 2021-12-19 01:31:43 首次发布

本文链接：https://blog.youkuaiyun.com/eloudy/article/details/122019885

版权

blas 同时被 2 个专栏收录

65 篇文章

订阅专栏

cuda

34 篇文章

订阅专栏

可以把下面的代码直接粘贴到任意一个 cuBLAS sample 的源文件中，替换其原有内容后编译运行：


// A concise cublasSmatinvBatched example:



/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>

/* Matrix size */
#define N (2)
#define BATCH_SIZE (1)


/* Main */
int main(int argc, char **argv) {
  cublasStatus_t status;
  float* h_A;
  float* d_A = 0;
  float* d_Ainv = 0;
  float* h_Ainv = 0;

  int n2 = N * N;
  int* info=NULL;
  float** A=NULL;
  float** Ainv=NULL;

  cublasHandle_t handle;

  printf("LL:: main()\n");

  int dev = findCudaDevice(argc, (const char **)argv);

  if (dev == -1) {
    return EXIT_FAILURE;
  }

  printf("simpleCUBLAS_Smatinv test running..\n");

  status = cublasCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
  }

  h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(h_A[0])));

  if (h_A == 0) {
    fprintf(stderr, "!!!! host memory allocation error (A)\n");
    return EXIT_FAILURE;
  }

  h_Ainv = reinterpret_cast<float *>(malloc(n2 * sizeof(h_Ainv[0])));

  if (h_Ainv == 0) {
    fprintf(stderr, "!!!! host memory allocation error (A)\n");
    return EXIT_FAILURE;
  }

  for (int i = 0; i < n2; i++) {
    h_A[i] = rand() / static_cast<float>(RAND_MAX);
  }

  cudaMalloc(&A, sizeof(float*));
  cudaMalloc(&Ainv, sizeof(float*));

  if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
      cudaSuccess) {
    fprintf(stderr, "!!!! device memory allocation error (allocate d_A)\n");
    return EXIT_FAILURE;
  }

  cudaMemcpy(A, &d_A, sizeof(float*), cudaMemcpyHostToDevice);


  if (cudaMalloc(reinterpret_cast<void **>(&d_Ainv), n2 * sizeof(d_Ainv[0])) !=
      cudaSuccess) {
    fprintf(stderr, "!!!! device memory allocation error (allocate d_Ainv)\n");
    return EXIT_FAILURE;
  }

  cudaMemcpy(Ainv, &d_Ainv, sizeof(float*), cudaMemcpyHostToDevice);

  if (cudaMalloc(reinterpret_cast<void **>(&info), BATCH_SIZE*sizeof(int)) !=
      cudaSuccess) {
    fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
    return EXIT_FAILURE;
  }

  status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! device access error (write A)\n");
    return EXIT_FAILURE;
  }

  status = cublasSmatinvBatched(handle, N, A, N,
                                Ainv, N, info, BATCH_SIZE);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
  }

  status = cublasGetVector(n2, sizeof(float), d_Ainv, 1, h_Ainv, 1);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! device access error (read C)\n");
    return EXIT_FAILURE;
  }

  printf("A =\n");

  for(int i=0; i<n2; i++){
    if(i%N==0)printf("\n");
    printf("%3.3f  ",h_A[i]);
  }

  printf("\ninversion of A:\n");
  printf("Ainv =\n");

  for(int i=0; i<n2; i++){
    if(i%N==0) printf("\n");
    printf("%3.3f  ",h_Ainv[i]);
  }

  printf("\n\n");

  free(h_A);
  free(h_Ainv);

  if(cudaFree(d_A) != cudaSuccess) {
    fprintf(stderr, "!!!! memory free error (d_A)\n");
    return EXIT_FAILURE;
  }

  if (cudaFree(d_Ainv) != cudaSuccess) {
    fprintf(stderr, "!!!! memory free error (d_Ainv)\n");
    return EXIT_FAILURE;
  }

  status = cublasDestroy(handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
  }

  return 0;
}

不检查版：

// A concise cublasSmatinvBatched example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cublas_v2.h>
#include <cuda_runtime.h>
 
#define N (5)
#define BATCH_SIZE (1)
int NV_smatinv(float* matrixA, int n2);

/* cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float *A[], int lda,
                                               float *Ainv[], int lda_inv, int *info, int batchSize); */

/*
 * Entry point: fills an N x N matrix with rand() values scaled into [0, 1]
 * and passes it to NV_smatinv.
 *
 * rand() is never seeded here, so the same matrix is generated on every run.
 *
 * Returns the status reported by NV_smatinv, or EXIT_FAILURE when the host
 * matrix cannot be allocated.
 */
int main(){
  const int n2 = N * N;

  float* matrixA = reinterpret_cast<float *>(malloc(n2 * sizeof(float)));
  if (matrixA == NULL) {
    fprintf(stderr, "!!!! host memory allocation error (matrixA)\n");
    return EXIT_FAILURE;
  }

  for(int i=0; i<n2; i++){
    matrixA[i] = rand() / static_cast<float>(RAND_MAX);
  }

  /* Propagate the result so a failed inversion yields a non-zero exit code. */
  int rc = NV_smatinv(matrixA, n2);
  //IX_smatinv(matrixA, N*N);

  free(matrixA);

  return rc;
}





/*
 * Inverts one N x N single-precision matrix on the GPU with
 * cublasSmatinvBatched and prints the input matrix and its inverse.
 *
 * matrixA : host pointer to n2 floats holding the matrix. It is only read.
 * n2      : number of elements in matrixA. It must equal N * N because the
 *           cuBLAS call uses the compile-time dimension N; any other value is
 *           rejected instead of letting cuBLAS read past the buffers.
 *
 * The function receives exactly one matrix, so the batched API is invoked
 * with a batch count of one and the device pointer arrays hold one pointer
 * each.
 *
 * Returns 0 on success and EXIT_FAILURE on invalid arguments, on any
 * CUDA/cuBLAS failure, or when cuBLAS reports the matrix as singular through
 * info. All resources acquired here are released on every path.
 *
 * Notes:
 *  - The cuBLAS documentation restricts cublas<t>matinvBatched to n <= 32.
 *  - Values are printed in storage order, N per line; cuBLAS interprets the
 *    storage as column-major.
 */
int NV_smatinv(float* matrixA, int n2) {

  const int batch_count = 1;  /* one input matrix -> batch of one */
  int rc = EXIT_FAILURE;
  int handle_ready = 0;
  int h_info = 0;

  cublasStatus_t status;
  cublasHandle_t handle;

  float* h_Ainv = NULL;  /* host copy of the inverse */
  float* d_A = NULL;     /* device copy of the input matrix */
  float* d_Ainv = NULL;  /* device inverse */
  float** A = NULL;      /* device array holding the pointer d_A */
  float** Ainv = NULL;   /* device array holding the pointer d_Ainv */
  int* info = NULL;      /* device status code for the single matrix */

  if (matrixA == NULL || n2 != N * N) {
    fprintf(stderr, "!!!! invalid argument (expected %d elements)\n", N * N);
    return EXIT_FAILURE;
  }

  /* Single-pass block: `break` jumps to the shared cleanup code below. */
  do {
    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf(stderr, "!!!! CUBLAS initialization error\n");
      break;
    }

    handle_ready = 1;

    h_Ainv = reinterpret_cast<float *>(malloc(n2 * sizeof(h_Ainv[0])));

    if (h_Ainv == NULL) {
      fprintf(stderr, "!!!! host memory allocation error (Ainv)\n");
      break;
    }

    if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate d_A)\n");
      break;
    }

    if (cudaMalloc(reinterpret_cast<void **>(&d_Ainv), n2 * sizeof(d_Ainv[0])) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate d_Ainv)\n");
      break;
    }

    if (cudaMalloc(reinterpret_cast<void **>(&A), batch_count * sizeof(float*)) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
      break;
    }

    if (cudaMalloc(reinterpret_cast<void **>(&Ainv), batch_count * sizeof(float*)) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate Ainv)\n");
      break;
    }

    if (cudaMalloc(reinterpret_cast<void **>(&info), batch_count * sizeof(int)) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate info)\n");
      break;
    }

    /* Store the device addresses d_A / d_Ainv into the device pointer arrays. */
    if (cudaMemcpy(A, &d_A, sizeof(float*), cudaMemcpyHostToDevice) != cudaSuccess ||
        cudaMemcpy(Ainv, &d_Ainv, sizeof(float*), cudaMemcpyHostToDevice) != cudaSuccess) {
      fprintf(stderr, "!!!! device access error (write matrix pointers)\n");
      break;
    }

    /* Upload straight from the caller's buffer; no intermediate host copy. */
    if (cudaMemcpy(d_A, matrixA, n2 * sizeof(matrixA[0]), cudaMemcpyHostToDevice) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device access error (write A)\n");
      break;
    }

    status = cublasSmatinvBatched(handle, N, A, N,
                                  Ainv, N, info, batch_count);

    if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf(stderr, "!!!! kernel execution error.\n");
      break;
    }

    /* The call status does not say whether the matrix was invertible; that is
       reported through info. */
    if (cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) {
      fprintf(stderr, "!!!! device access error (read info)\n");
      break;
    }

    if (h_info != 0) {
      fprintf(stderr, "!!!! matrix is singular (info = %d)\n", h_info);
      break;
    }

    if (cudaMemcpy(h_Ainv, d_Ainv, n2 * sizeof(h_Ainv[0]), cudaMemcpyDeviceToHost) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device access error (read Ainv)\n");
      break;
    }

    printf("\nnew A =");
    for (int i = 0; i < n2; i++) {
      if (i % N == 0) printf("\n");
      printf("%3.3f  ", matrixA[i]);
    }

    printf("\n\nnew Ainv =");
    for (int i = 0; i < n2; i++) {
      if (i % N == 0) printf("\n");
      printf("%3.3f  ", h_Ainv[i]);
    }
    printf("\n\n");

    rc = 0;
  } while (0);

  /* Shared cleanup. free(NULL) and cudaFree(NULL) are harmless, so buffers
     that were never allocated need no special handling. */
  free(h_Ainv);

  {
    void* dev_bufs[] = {d_A, d_Ainv, A, Ainv, info};
    const char* dev_names[] = {"d_A", "d_Ainv", "A", "Ainv", "info"};

    for (int k = 0; k < 5; k++) {
      if (cudaFree(dev_bufs[k]) != cudaSuccess) {
        fprintf(stderr, "!!!! memory free error (%s)\n", dev_names[k]);
        rc = EXIT_FAILURE;
      }
    }
  }

  if (handle_ready) {
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS) {
      fprintf(stderr, "!!!! shutdown error\n");
      rc = EXIT_FAILURE;
    }
  }

  return rc;
}