可以直接把下面的代码粘贴到任意一个 cuBLAS sample 的源文件中,替换原有内容后编译运行:
// 一个简洁的 cublasSmatinvBatched 示例:
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
/* Matrix size: every matrix in this sample is square, N x N. N is passed to
   cublasSmatinvBatched both as the dimension and as the leading dimension. */
#define N (2)
/* Number of matrices passed to cublasSmatinvBatched as its batch count. */
#define BATCH_SIZE (1)
/* Main */
int main(int argc, char **argv) {
cublasStatus_t status;
float* h_A;
float* d_A = 0;
float* d_Ainv = 0;
float* h_Ainv = 0;
int n2 = N * N;
int* info=NULL;
float** A=NULL;
float** Ainv=NULL;
cublasHandle_t handle;
printf("LL:: main()\n");
int dev = findCudaDevice(argc, (const char **)argv);
if (dev == -1) {
return EXIT_FAILURE;
}
printf("simpleCUBLAS_Smatinv test running..\n");
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(h_A[0])));
if (h_A == 0) {
fprintf(stderr, "!!!! host memory allocation error (A)\n");
return EXIT_FAILURE;
}
h_Ainv = reinterpret_cast<float *>(malloc(n2 * sizeof(h_Ainv[0])));
if (h_Ainv == 0) {
fprintf(stderr, "!!!! host memory allocation error (A)\n");
return EXIT_FAILURE;
}
for (int i = 0; i < n2; i++) {
h_A[i] = rand() / static_cast<float>(RAND_MAX);
}
cudaMalloc(&A, sizeof(float*));
cudaMalloc(&Ainv, sizeof(float*));
if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
cudaSuccess) {
fprintf(stderr, "!!!! device memory allocation error (allocate d_A)\n");
return EXIT_FAILURE;
}
cudaMemcpy(A, &d_A, sizeof(float*), cudaMemcpyHostToDevice);
if (cudaMalloc(reinterpret_cast<void **>(&d_Ainv), n2 * sizeof(d_Ainv[0])) !=
cudaSuccess) {
fprintf(stderr, "!!!! device memory allocation error (allocate d_Ainv)\n");
return EXIT_FAILURE;
}
cudaMemcpy(Ainv, &d_Ainv, sizeof(float*), cudaMemcpyHostToDevice);
if (cudaMalloc(reinterpret_cast<void **>(&info), BATCH_SIZE*sizeof(int)) !=
cudaSuccess) {
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! device access error (write A)\n");
return EXIT_FAILURE;
}
status = cublasSmatinvBatched(handle, N, A, N,
Ainv, N, info, BATCH_SIZE);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
status = cublasGetVector(n2, sizeof(float), d_Ainv, 1, h_Ainv, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
printf("A =\n");
for(int i=0; i<n2; i++){
if(i%N==0)printf("\n");
printf("%3.3f ",h_A[i]);
}
printf("\ninversion of A:\n");
printf("Ainv =\n");
for(int i=0; i<n2; i++){
if(i%N==0) printf("\n");
printf("%3.3f ",h_Ainv[i]);
}
printf("\n\n");
free(h_A);
free(h_Ainv);
if(cudaFree(d_A) != cudaSuccess) {
fprintf(stderr, "!!!! memory free error (d_A)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_Ainv) != cudaSuccess) {
fprintf(stderr, "!!!! memory free error (d_Ainv)\n");
return EXIT_FAILURE;
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! shutdown error (A)\n");
return EXIT_FAILURE;
}
return 0;
}
精简版(求逆过程封装为函数 NV_smatinv):
// 一个简洁的 cublasSmatinvBatched 示例:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
/* Dimension of the square matrix handled by this sample (N x N elements). */
#define N (5)
/* Number of matrices passed to cublasSmatinvBatched as its batch count. */
#define BATCH_SIZE (1)
/* Inverts the matrix stored in matrixA (n2 elements) with cuBLAS and prints
   both the input and the inverse. Returns 0 on success, EXIT_FAILURE on error. */
int NV_smatinv(float* matrixA, int n2);
/* For reference, the cuBLAS routine called by NV_smatinv:
cublasStatus_t cublasSmatinvBatched(cublasHandle_t handle, int n, const float *A[], int lda,
float *Ainv[], int lda_inv, int *info, int batchSize); */
/*
 * Driver for the compact sample: builds one random N x N matrix on the host
 * and hands it to NV_smatinv, which inverts it on the GPU and prints the
 * result.
 *
 * Returns the status reported by NV_smatinv (0 on success), or EXIT_FAILURE
 * when the host buffer cannot be allocated, so that a failed run is visible
 * in the process exit code.
 */
int main(){
  const int elementCount = N * N;
  float* matrixA;

  matrixA = reinterpret_cast<float *>(malloc(elementCount * sizeof(matrixA[0])));
  if (matrixA == NULL) {
    /* Without this check the fill loop below would write through NULL. */
    fprintf(stderr, "!!!! host memory allocation error (matrixA)\n");
    return EXIT_FAILURE;
  }

  for (int i = 0; i < elementCount; i++) {
    matrixA[i] = rand() / static_cast<float>(RAND_MAX);
  }

  const int result = NV_smatinv(matrixA, elementCount);
  /* Alternative implementation left disabled by the original author; it is
     not defined in the visible part of this file. */
  //IX_smatinv(matrixA, N*N);

  free(matrixA);
  return result;
}
/*
 * Inverts one N x N single-precision matrix on the GPU with
 * cublasSmatinvBatched and prints the input matrix and its inverse to stdout.
 *
 * matrixA  Host buffer with the matrix elements. They are uploaded in memory
 *          order, which cuBLAS interprets as column-major storage; the
 *          printout uses the same memory order. The buffer is not modified.
 * n2       Number of elements in matrixA. It must equal N * N, because the
 *          dimension handed to cuBLAS is the compile-time constant N.
 *
 * Returns 0 on success and EXIT_FAILURE on any error (invalid argument,
 * allocation/copy failure, cuBLAS failure, or a matrix that cuBLAS reports as
 * not invertible). The cuBLAS handle and all buffers are released on every
 * path.
 */
int NV_smatinv(float* matrixA, int n2) {
  if (matrixA == NULL || n2 != N * N) {
    fprintf(stderr, "!!!! invalid argument (matrixA must hold N*N elements)\n");
    return EXIT_FAILURE;
  }
  /* The device pointer arrays below hold exactly one entry, so a larger batch
     count would make cuBLAS read past them. */
  if (BATCH_SIZE != 1) {
    fprintf(stderr, "!!!! NV_smatinv handles exactly one matrix (BATCH_SIZE must be 1)\n");
    return EXIT_FAILURE;
  }

  cublasHandle_t handle;
  if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
  }

  const size_t matrixBytes = static_cast<size_t>(n2) * sizeof(float);
  int result = EXIT_FAILURE;
  float* h_Ainv = NULL; /* host copy of the inverse */
  float* d_A = NULL;    /* device storage of the input matrix */
  float* d_Ainv = NULL; /* device storage of the inverse */
  float** A = NULL;     /* device array holding the single pointer d_A */
  float** Ainv = NULL;  /* device array holding the single pointer d_Ainv */
  int* info = NULL;     /* device status word written by cuBLAS */
  int h_info = 0;       /* host copy of info */

  /* Single-pass block: any failure breaks out to the shared cleanup below. */
  do {
    h_Ainv = reinterpret_cast<float *>(malloc(matrixBytes));
    if (h_Ainv == NULL) {
      fprintf(stderr, "!!!! host memory allocation error (Ainv)\n");
      break;
    }

    if (cudaMalloc(reinterpret_cast<void **>(&A), sizeof(float*)) != cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
      break;
    }
    if (cudaMalloc(reinterpret_cast<void **>(&Ainv), sizeof(float*)) != cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate Ainv)\n");
      break;
    }
    if (cudaMalloc(reinterpret_cast<void **>(&d_A), matrixBytes) != cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate d_A)\n");
      break;
    }
    if (cudaMalloc(reinterpret_cast<void **>(&d_Ainv), matrixBytes) != cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate d_Ainv)\n");
      break;
    }
    if (cudaMalloc(reinterpret_cast<void **>(&info), BATCH_SIZE * sizeof(int)) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate info)\n");
      break;
    }

    /* cuBLAS expects the pointer arrays themselves to live in device memory. */
    if (cudaMemcpy(A, &d_A, sizeof(float*), cudaMemcpyHostToDevice) != cudaSuccess ||
        cudaMemcpy(Ainv, &d_Ainv, sizeof(float*), cudaMemcpyHostToDevice) != cudaSuccess) {
      fprintf(stderr, "!!!! device access error (write pointer arrays)\n");
      break;
    }

    if (cudaMemcpy(d_A, matrixA, matrixBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
      fprintf(stderr, "!!!! device access error (write A)\n");
      break;
    }

    /* NOTE(review): the cuBLAS documentation restricts matinvBatched to small
       matrices (n up to 32) - confirm before raising N. */
    if (cublasSmatinvBatched(handle, N, A, N, Ainv, N, info, BATCH_SIZE) !=
        CUBLAS_STATUS_SUCCESS) {
      fprintf(stderr, "!!!! kernel execution error.\n");
      break;
    }

    /* The return status does not say whether the matrix was invertible; that
       is reported through info. */
    if (cudaMemcpy(&h_info, info, sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess) {
      fprintf(stderr, "!!!! device access error (read info)\n");
      break;
    }
    if (h_info != 0) {
      fprintf(stderr, "!!!! matrix could not be inverted (info = %d)\n", h_info);
      break;
    }

    if (cudaMemcpy(h_Ainv, d_Ainv, matrixBytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
      fprintf(stderr, "!!!! device access error (read Ainv)\n");
      break;
    }

    printf("\nnew A =");
    for (int i = 0; i < n2; i++) {
      if (i % N == 0) printf("\n");
      printf("%3.3f ", matrixA[i]);
    }
    printf("\n\nnew Ainv =");
    for (int i = 0; i < n2; i++) {
      if (i % N == 0) printf("\n");
      printf("%3.3f ", h_Ainv[i]);
    }
    printf("\n\n");

    result = 0;
  } while (0);

  /* Shared cleanup. free(NULL) and cudaFree(NULL) are no-ops, so this is safe
     no matter how far the block above got. */
  free(h_Ainv);

  void* deviceBuffers[] = {d_A, d_Ainv, A, Ainv, info};
  const char* bufferNames[] = {"d_A", "d_Ainv", "A", "Ainv", "info"};
  for (int i = 0; i < 5; i++) {
    if (cudaFree(deviceBuffers[i]) != cudaSuccess) {
      fprintf(stderr, "!!!! memory free error (%s)\n", bufferNames[i]);
      result = EXIT_FAILURE;
    }
  }

  if (cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! shutdown error\n");
    result = EXIT_FAILURE;
  }
  return result;
}