How to actually get the NCCL example program to compile and run


Let me start with the final compile and run commands. If anyone knows a more elegant way, please share it; reading the documentation but still not being able to get the program to run is genuinely frustrating.

nvcc -ccbin mpic++ test.cu -o test -L/usr/local/cuda/lib64 -lnccl
mpirun -n 4 ./test
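
Note that the run command launches 4 MPI ranks, and the example below uses two GPUs per rank (nDev = 2), so running all 4 ranks on a single node needs 8 visible GPUs; reduce -n or nDev if you have fewer. You can list the GPUs on the machine with:

nvidia-smi -L
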
“During compilation, the CUDA compiler driver nvcc splits each .cu file into host code and device code, invokes the host compiler to compile the host code, and compiles the device code separately. Finally, it merges the generated host object code and the device PTX code into a single executable. This is where the nvcc -ccbin option comes in: for example, to use icpc (the Intel C++ compiler), run nvcc -ccbin=icpc (assuming icpc is available in $PATH).”

Based on the point above: the NCCL example code depends on MPI and includes the "mpi.h" header, so the compile command needs -ccbin mpic++ to make nvcc use the MPI compiler wrapper as the host compiler.
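
If you would rather keep nvcc's default host compiler, another route (just a sketch; MPI_HOME below is a placeholder for your actual MPI installation prefix) is to hand nvcc the MPI include and library paths directly:

# MPI_HOME is a placeholder for the MPI installation prefix on your system
nvcc test.cu -o test -I${MPI_HOME}/include -L${MPI_HOME}/lib -lmpi -L/usr/local/cuda/lib64 -lnccl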

-L /usr/local/cuda/lib64 makes /usr/local/cuda/lib64 the first directory searched for library files; the search order is /usr/local/cuda/lib64 -> /lib -> /usr/lib -> /usr/local/lib.
-l nccl tells the linker to look for the shared library libnccl.so (i.e., the library file named by adding the lib prefix and the .so suffix to the -l argument).
If the -static option is added, the linker looks for the static library instead, i.e. libnccl.a.
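
To confirm that the resulting binary actually resolved the shared NCCL library, a quick sanity check on Linux (assuming ldd is available) is:

ldd ./test | grep nccl
# expect something like: libnccl.so.2 => /usr/local/cuda/lib64/libnccl.so.2 (the exact path depends on your installation)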

The example code given in the official documentation is below. Note that the file extension must be .cu (or pass -x cu to force nvcc to treat the source as CUDA code); I kept trying to compile it as .cpp or .c and it simply would not work.

#include <stdio.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include "mpi.h"
#include <unistd.h>
#include <stdint.h>
 
 
#define MPICHECK(cmd) do {                          \
  int e = cmd;                                      \
  if( e != MPI_SUCCESS ) {                          \
    printf("Failed: MPI error %s:%d '%d'\n",        \
        __FILE__,__LINE__, e);   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)
 
 
#define CUDACHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",             \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)
 
 
#define NCCLCHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r!= ncclSuccess) {                            \
    printf("Failed, NCCL error %s:%d '%s'\n",             \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)
 
 
static uint64_t getHostHash(const char* string) {
  // Based on DJB2a, result = result * 33 ^ char
  uint64_t result = 5381;
  for (int c = 0; string[c] != '\0'; c++){
    result = ((result << 5) + result) ^ string[c];
  }
  return result;
}
 
 
static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);
  for (int i=0; i< maxlen; i++) {
    if (hostname[i] == '.') {
        hostname[i] = '\0';
        return;
    }
  }
}
 
 
int main(int argc, char* argv[])
{
  int size = 32*1024*1024;
 
 
  int myRank, nRanks, localRank = 0;
 
 
  //initializing MPI
  MPICHECK(MPI_Init(&argc, &argv));
  MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
  MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));
 
 
  //calculating localRank which is used in selecting a GPU
  uint64_t hostHashs[nRanks];
  char hostname[1024];
  getHostName(hostname, 1024);
  hostHashs[myRank] = getHostHash(hostname);
  MPICHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
  for (int p=0; p<nRanks; p++) {
     if (p == myRank) break;
     if (hostHashs[p] == hostHashs[myRank]) localRank++;
  }
 
 
  //each process is using two GPUs
  int nDev = 2;
 
 
  float** sendbuff = (float**)malloc(nDev * sizeof(float*));
  float** recvbuff = (float**)malloc(nDev * sizeof(float*));
  cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
 
 
  //picking GPUs based on localRank
  for (int i = 0; i < nDev; ++i) {
    CUDACHECK(cudaSetDevice(localRank*nDev + i));
    CUDACHECK(cudaMalloc(sendbuff + i, size * sizeof(float)));
    CUDACHECK(cudaMalloc(recvbuff + i, size * sizeof(float)));
    // note: cudaMemset sets individual bytes, so the buffers hold the byte patterns 0x01 / 0x00
    // rather than the float values 1.0f / 0.0f; it is just placeholder data for the allreduce
    CUDACHECK(cudaMemset(sendbuff[i], 1, size * sizeof(float)));
    CUDACHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));
    CUDACHECK(cudaStreamCreate(s+i));
  }
 
 
  ncclUniqueId id;
  ncclComm_t comms[nDev];
 
 
  //generating NCCL unique ID at one process and broadcasting it to all
  if (myRank == 0) ncclGetUniqueId(&id);
  MPICHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
 
 
  //initializing NCCL, group API is required around ncclCommInitRank as it is
  //called across multiple GPUs in each thread/process
  NCCLCHECK(ncclGroupStart());
  for (int i=0; i<nDev; i++) {
     CUDACHECK(cudaSetDevice(localRank*nDev + i));
     NCCLCHECK(ncclCommInitRank(comms+i, nRanks*nDev, id, myRank*nDev + i));
  }
  NCCLCHECK(ncclGroupEnd());
 
 
  //calling NCCL communication API. Group API is required when using
  //multiple devices per thread/process
  NCCLCHECK(ncclGroupStart());
  for (int i=0; i<nDev; i++)
     NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], size, ncclFloat, ncclSum,
           comms[i], s[i]));
  NCCLCHECK(ncclGroupEnd());
 
 
  //synchronizing on CUDA stream to complete NCCL communication
  for (int i=0; i<nDev; i++)
      CUDACHECK(cudaStreamSynchronize(s[i]));
 
 
  //freeing device memory
  for (int i=0; i<nDev; i++) {
     CUDACHECK(cudaFree(sendbuff[i]));
     CUDACHECK(cudaFree(recvbuff[i]));
  }
 
 
  //finalizing NCCL
  for (int i=0; i<nDev; i++) {
     ncclCommDestroy(comms[i]);
  }
 
 
  //finalizing MPI
  MPICHECK(MPI_Finalize());
 
 
  printf("[MPI Rank %d] Success \n", myRank);
  return 0;
}
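
If everything is set up correctly, each rank prints a success line at the end, so with mpirun -n 4 the expected output (order across ranks may vary) looks like:

[MPI Rank 0] Success
[MPI Rank 1] Success
[MPI Rank 2] Success
[MPI Rank 3] Success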
