[4.1] How does NCCL orchestrate the many send/recv tasks of alltoall?


Code location: https://github.com/NVIDIA/nccl/blob/master/src/group.cc

1. The alltoall definition in nccl-tests

First, look at how nccl-tests writes alltoall:

NCCLCHECK(ncclGroupStart());
  for (int r=0; r<nRanks; r++) {
    NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
    NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
  }
NCCLCHECK(ncclGroupEnd());

2. GroupStart

The many send/recv calls of alltoall are placed inside an NCCL group. Executing ncclGroupStart() does not launch anything directly; it only increments ncclGroupDepth. A depth greater than 0 tells NCCL that it is currently collecting a whole batch of operations.

inline ncclResult_t ncclGroupStartInternal() {
  ncclGroupDepth++;
  return ncclSuccess;
}

Real work only starts when we reach ncclGroupEnd(). A toy sketch of this deferred-execution idea follows.
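
To make the deferred-execution idea concrete, here is a minimal standalone sketch (toy code, not NCCL's implementation; all names are made up): operations issued between a start/end pair are only buffered, and only the outermost end flushes them.

#include <cstdio>
#include <functional>
#include <vector>

static int groupDepth = 0;
static std::vector<std::function<void()>> pendingOps;

void GroupStart() { groupDepth++; }

void Enqueue(std::function<void()> op) { pendingOps.push_back(std::move(op)); }

void GroupEnd() {
  if (--groupDepth > 0) return;       // inner end: keep collecting
  for (auto& op : pendingOps) op();   // outermost end: launch everything
  pendingOps.clear();
}

int main() {
  GroupStart();
  for (int r = 0; r < 4; r++) {
    Enqueue([r] { printf("send to %d\n", r); });
    Enqueue([r] { printf("recv from %d\n", r); });
  }
  GroupEnd();  // all 8 ops run here, not earlier
}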

3. GroupEnd

3.1 Before groupLaunch

// group.cc
ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) {
  // ... the simInfo (NCCL simulation) path is skipped here
  if (ncclGroupDepth == 0) {
    WARN("ncclGroupEnd: not in a group call.");
    ret = ncclInvalidUsage;
    goto exit;
  }
  if ((--ncclGroupDepth) > 0) goto exit;

  //
  if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) {
    ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead;
    ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs;
    //...
    ncclGroupJobMain.initialized = true;
    ncclGroupJobMainPtr = &ncclGroupJobMain;
    //...
  }

  if (ncclGroupBlocking == 0) {
    // non-blocking: returning here does not mean the alltoall result is ready
    ...
  } else {
    // blocking: the alltoall result is available once this returns
    int savedDev;
    CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail);
    NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail);
    CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail);
    if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize);
    groupResetJobState(ncclGroupJobMainPtr);
  }
}
  • The code first checks ncclGroupDepth: if it is already 0, there was no matching ncclGroupStart, so it exits immediately. Then --ncclGroupDepth > 0 makes sure the current ncclGroupEnd is the outermost one.
  • Next, the communication tasks to run, the async jobs, the error state, and so on are packed into a single job structure reachable through ncclGroupJobMainPtr.
  • Assuming this is a blocking alltoall, ncclGroupJobMainPtr->base (which is an ncclAsyncJob) is now passed to groupLaunch.

How do the upper-level ncclSend/ncclRecv calls attach their tasks and data to groupStart/groupEnd? Looking at the send and recv implementations in collectives.cc: the ncclSend arguments (buffer, count, type, target peer, comm, and which stream to use) are wrapped into an ncclInfo structure and handed to the ncclEnqueueCheck API (in enqueue.cc).

// enqueue.cc
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
	ncclGroupStartInternal()      // make sure a group is open
  CommCheck(...)                // validate the comm
  ncclCommEnsureReady(...)      // make sure the comm has finished initializing
  ArgsCheck(...)                // argument checks (count, datatype, ...)
  taskAppend(comm, info)        // core step: append the task to the planner's per-peer queues
  ncclGroupEndInternal()        // if depth==1, this triggers the actual execution
  ncclCommGetAsyncError(...)    // for non-blocking comms, check whether an error occurred
}
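
As a hedged illustration of that flow (hypothetical simplified names, not the real collectives.cc/enqueue.cc code): a p2p call packs its arguments into an info record and relies on an internal start/end pair, so a bare send outside any user group still launches immediately, while one inside a user group is only batched.

#include <cstdio>
#include <cstddef>
#include <vector>

struct Info {            // stand-in for ncclInfo: what taskAppend needs later
  const void* buff;
  size_t count;
  int peer;
  bool isSend;
};

static int depth = 0;
static std::vector<Info> pending;

void groupStartInternal() { depth++; }
void groupEndInternal() {
  if (--depth > 0) return;                       // still inside a user group
  for (auto& i : pending)
    printf("%s peer=%d count=%zu\n", i.isSend ? "send" : "recv", i.peer, i.count);
  pending.clear();
}

// Mirrors the enqueue-check flow: open an internal group, append the task, close it.
void enqueueCheck(const Info& info) {
  groupStartInternal();
  pending.push_back(info);   // taskAppend stand-in
  groupEndInternal();        // only launches if this was the outermost level
}

void send(const void* buf, size_t count, int peer) { enqueueCheck({buf, count, peer, true}); }

int main() {
  int dummy[4] = {};
  send(dummy, 4, 1);        // no user group: launches right away
  groupStartInternal();     // user-level group (ncclGroupStart analogue)
  send(dummy, 4, 2);        // batched
  send(dummy, 4, 3);        // batched
  groupEndInternal();       // both launch here
}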

This API wraps the work in one internal group start and one internal group end, which guarantees that the caller's groupEnd is the outermost one. The core operation is taskAppend, in the same file.

// enqueue.cc
static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
  struct ncclKernelPlanner *planner = &comm->planner;
  if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) {
	  int peer = info->root;
    ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
    bool isSendNotRecv = info->coll == ncclFuncSend;

    // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
    ncclGroupCommJoin(info->comm);
    struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
    p2p->func = info->coll;
    p2p->buff = (void*)info->recvbuff;
    p2p->count = info->count;
    p2p->datatype = info->datatype;
    p2p->root = info->root;
    p2p->bytes = nBytes;
    p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED);
    ncclIntruQueueEnqueue(isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue,p2p);
    planner->nTasksP2p += 1;
    
	  if (comm->rank != peer) {
		  ...
		  while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank
                                      : comm->p2pSchedule[round].recvRank)) {
          round += 1;
        }
      uint8_t base = ncclP2pChannelBaseForRound(comm, round);
        for (int c=0; c < comm->p2pnChannelsPerPeer; c++) {
          int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c);
          if (isSendNotRecv) {
            if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector
              // the send/recv connector is shared among split shared comms. We need to set hasSeen to
              // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split
              // shared comms together.
              comm->channels[channelId].peers[peer]->send[1].hasSeen = 1;
              comm->connectSend[peer] |= (1UL<<channelId);
              ncclGroupCommPreconnect(comm);
            }
          } else {
            // the mirror-image operations for the receive side
          }
	} else {
		...
	  // this is a collective operation (AllReduce, Broadcast, ...)
  }
  
  // record the stream used by the current operation
  if (info->stream != planner->streamRecent || planner->streams == nullptr) {
    planner->streamRecent = info->stream;
    struct ncclCudaStreamList* l = planner->streams;
    while (true) {
      if (l == nullptr) { // Got to the end, this must be a new stream.
        struct ncclCudaGraph graph;
        NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream));
        if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) {
          WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph.");
          return ncclInvalidUsage;
        }
        planner->capturingGraph = graph; // C++ struct assignment
        // Add stream to list
        l = ncclMemoryStackAlloc<struct ncclCudaStreamList>(&comm->memScoped);
        l->stream = info->stream;
        l->next = planner->streams;
        planner->streams = l;
        break;
      }
      if (l->stream == info->stream)
        break; // Already seen stream.
      l = l->next;
    }
  }
}
  • A planner (a struct of type ncclKernelPlanner) is obtained. Since our task is P2P, we take the true branch of the first if (the ncclFuncSend and ncclFuncRecv enum values tell us whether this is an ncclSend or an ncclRecv).

  • Inside this branch, the key line is ncclGroupCommJoin(info->comm);

    inline void ncclGroupCommJoin(struct ncclComm* comm) {
      if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
        // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
        // the users program order yet insures siblings occur consecutively. This
        // is required by doLaunches() in "group.cc".
        struct ncclComm** pp = &ncclGroupCommHead;
        while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
          pp = &(*pp)->groupNext;
    
        // didn't find its clique, we need to insert it with ascending order based on commHash
        if (*pp == nullptr) {
          pp = &ncclGroupCommHead;
          while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext;
        }
        comm->groupNext = *pp;
        *pp = comm;
        // Comms gets a new memory stack scope upon joining. Each task batched for
        // this comm is allocated there.
        ncclMemoryStackPush(&comm->memScoped);
        // Initialize planner
        ncclKernelPlanner::Peer* tmp = comm->planner.peers;
        memset(&comm->planner, 0, sizeof(comm->planner));
        comm->planner.peers = tmp;
      }
    
      ncclGroupBlocking = comm->config.blocking;
    }
    
    1. The condition in the while loop keeps this comm adjacent to its sibling comms, preserving the order and contiguity of comms from the user program. This is a precondition for doLaunches().
    2. If no sibling / clique (comms sharing the same intraComm0, i.e. the same NVLink-connected group of GPUs) is found, the comm is inserted in ascending commHash order to keep the list sorted. pp starts from ncclGroupCommHead, so the walk begins at the head of the comms collected for this group of operations.
    3. The comm is then linked into the list, and a new memory-stack scope is pushed for it; subsequent tasks batched for this comm are allocated there. The peers array is preserved while the rest of the planner is zeroed.

    Next, a struct ncclTaskP2p named p2p is created and filled from info (the profiler event mask is read with an atomic load); each p2p is roughly equivalent to one info. ncclIntruQueueEnqueue then pushes this p2p into the send queue (or recv queue) of the corresponding peer inside the planner, and planner->nTasksP2p counts how many send and recv tasks this GPU has in total for the whole alltoall.

  • For connections between different ranks, the hasSeen flag on each channel's send/recv connector guarantees that a peer is connected only once, so the same connection is not set up twice within a group. p2pSchedule[round] records which peer this rank sends to / receives from in each round, so the while loop looks up in p2pSchedule the round in which this peer is scheduled for sending (or receiving) by this rank. Given the round and the comm, ncclP2pChannelBaseForRound yields the base channel for that round. The for loop then prepares a connector on each channel (two NICs need such a connector; NCCL performs the actual RDMA reads/writes over it), and ncclGroupCommPreconnect adds this comm to the preconnect list, whose head again collects the comms of this group of operations.

  • The collective path is skipped here. Next comes the stream handling: the current stream is compared against the planner's most recently seen stream, and a new stream is added to the head of the planner's streams list (struct ncclCudaStreamList* streams;, the list of all user streams involved in the current batch of tasks). A new stream is also checked against the CUDA Graph being captured: a stream captured by a different graph raises an error.

At this point every task sits in the send or recv queue of some peer inside the planner. The head of the comms in this group and the head of the comms needing preconnect are ncclGroupCommHead and ncclGroupCommPreconnectHead. When groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr) is invoked, the planner is not passed in explicitly; the concrete send/recv task queues are managed from outside and reached through the comms. The only thing passed in is ncclGroupJobMainPtr->base; ncclGroupJobMainPtr is a pointer to the group job context shared by the current thread, and inside groupLaunch it is cast back to struct ncclGroupJob.
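
As a summary of the state the planner is left in, here is a small standalone sketch (simplified, hypothetical types, not NCCL's intrusive queues) of the per-peer send/recv queues and the nTasksP2p counter that taskAppend fills for a 4-rank alltoall on rank 0.

#include <cstdio>
#include <deque>
#include <vector>

struct TaskP2p { bool isSend; int peer; size_t bytes; };

struct Peer { std::deque<TaskP2p> sendQueue, recvQueue; };

struct Planner {
  std::vector<Peer> peers;
  int nTasksP2p = 0;
  explicit Planner(int nRanks) : peers(nRanks) {}
  void append(const TaskP2p& t) {
    (t.isSend ? peers[t.peer].sendQueue : peers[t.peer].recvQueue).push_back(t);
    nTasksP2p++;
  }
};

int main() {
  const int nRanks = 4, myRank = 0;
  const size_t bytes = 1 << 20;
  Planner planner(nRanks);
  // What the alltoall loop on rank 0 produces:
  for (int r = 0; r < nRanks; r++) {
    planner.append({true,  r, bytes});   // ncclSend(..., r, ...)
    planner.append({false, r, bytes});   // ncclRecv(..., r, ...)
  }
  printf("rank %d queued %d p2p tasks (one per direction per peer)\n",
         myRank, planner.nTasksP2p);
}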

3.2 groupLaunch

static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) {
  ncclResult_t ret = ncclSuccess;
  struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_;
  struct ncclComm *groupCommHeadMain = *gjob->groupCommHeadPtr;
  struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr;
  struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsMain = gjob->asyncJobsPtr;

  bool *groupAbortFlag = gjob->abortFlagPtr;

  if (!simInfo && groupCommPreconnectHeadMain != nullptr) {
    struct ncclComm* comm = groupCommPreconnectHeadMain;
    do {
      struct ncclPreconnectJob* job;
      NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
      job->base.func = ncclP2PPreconnectFunc;
      job->base.undo = nullptr;
      job->base.destructor = free;
      job->base.state = ncclGroupJobRunning;
      job->base.abortFlag = comm->abortFlag;
      job->base.abortFlagDev = comm->abortFlagDev;
      job->comm = comm;
      ncclIntruQueueEnqueue(asyncJobsMain, (struct ncclAsyncJob*)job); // enqueue into the async queue

      struct ncclComm* next = comm->preconnectNext;
      comm->preconnectNext = reinterpret_cast<struct ncclComm*>(0x1);
      comm = next;
    } while (comm != nullptr); // walk the comm list, queueing a preconnect job for each
  }

  NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail);

  /* Connect channels at runtime if cumem is supported */
  if (groupCommHeadMain != nullptr) {
    struct ncclComm* cliqueHead = groupCommHeadMain;
    struct ncclComm* comm = NULL;
    struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> asyncCollJobs;
    ncclIntruQueueConstruct(&asyncCollJobs);
    do {
      // We need to preconnect connections for collectives clique by clique to avoid
      // race condition for split shared comms which can connect the same connections
      // at the same time.
      comm = cliqueHead;
      do {
        bool needConnect = false;
        bool algoNeedConnect[NCCL_NUM_ALGORITHMS];
        memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS);

        CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
        NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail);

        if (comm->cuMemSupport && needConnect) {
          struct ncclPreconnectJob* job;
          NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail);
          job->base.func = ncclCollPreconnectFunc;
          job->base.undo = nullptr;
          job->base.destructor = free;
          job->base.state = ncclGroupJobRunning;
          job->base.abortFlag = comm->abortFlag;
          job->base.abortFlagDev = comm->abortFlagDev;
          job->comm = comm;
          NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail);
          memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS);
          ncclIntruQueueEnqueue(&asyncCollJobs, &job->base);
        }
        comm = comm->groupNext;
      } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0);
      // connect
      NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail);
      while (!ncclIntruQueueEmpty(&asyncCollJobs)) {
        struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs);
        if (job->destructor) job->destructor((void*)job);
      }
      cliqueHead = comm;
    } while (cliqueHead != nullptr);

    // done with all buffer allocation, start registration and enqueue
    comm = groupCommHeadMain;
    do {
      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail);
      NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail); // ************** key point: register and enqueue
      comm = comm->groupNext;
    } while (comm);
  }

  if ((!simInfo) && (groupCommHeadMain != nullptr)) {
    NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail);
  }

  while (!ncclIntruQueueEmpty(asyncJobsMain)) {
    struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain);
    if (!job->destroyFlag && job->comm && !job->comm->config.blocking)
      (void) ncclCommSetAsyncError(job->comm, ret);
    if (job->destructor) job->destructor((void*)job);
  }

  while (groupCommHeadMain != nullptr) {
    struct ncclComm* comm = groupCommHeadMain;
    struct ncclComm* next = comm->groupNext;
    // Poll for callbacks sent to us from other threads. Typically these free
    // resources from to our memory pools and UB
    NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail);
    (void) ncclGroupCommLeave(comm);
    if (!comm->config.blocking) {
      (void) ncclCommSetAsyncError(comm, ret);
    }
    groupCommHeadMain = next;
  }

exit:
  return ret;
fail:
  groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret);
  goto exit;
}
  • The job_ passed in is cast via struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; in the rest of this function gjob is only used for cleanup when an error occurs, so groupLaunch does have the group-level information available. (The concrete send/recv tasks are reached through each comm found via the job.)
  • Next, a preconnect job is defined for each comm: its function pointer is bound to ncclP2PPreconnectFunc (the job holds the comm, and the comm holds nRanks; knowing which GPU you are and how many GPUs there are is enough to fully connect, intra-node), its state is set to ncclGroupJobRunning, and its communicator is set to the current comm. The job is then cast to ncclAsyncJob* and enqueued into asyncJobsMain, an intrusive queue (ncclIntruQueue).
  • NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); is where the preconnect jobs are actually launched. What a job does is determined by .func = ncclP2PPreconnectFunc, and this also covers the choice among ring, tree, collnet, nvls, and so on. (A sketch of this launch-and-reap pattern follows this list.)
  • Further down comes the clique handling (for example, if comm0 is defined and comm0_1 and comm0_2 are split from it, preconnecting each comm independently could create the same channel between rank0 and rank1 twice); sub-comms are processed clique by clique to avoid setting up duplicate connections.
  • Then comm, algoNeedConnect, needConnect, and simInfo are passed to the ncclPrepareTasks API (called once per ncclGroup to organize the tasks the user submitted into comm->planner so they can be peeled off into plans). [For our alltoall, which is pure p2p, ncclPrepareTasks is effectively a no-op.] Afterwards the comm is passed to ncclTasksRegAndEnqueue, which does not drain the actual tasks from planner->peers[].sendQueue and planner->peers[].recvQueue either, so the p2p tasks proceed directly to doLaunches.
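
A hedged sketch of the launch-and-reap pattern referenced above (not NCCL's asyncJobLaunch; running one thread per job here is only an illustration): jobs carry a function pointer plus a destructor, all of them are launched, and the queue is then drained with each destructor called, which is the shape of what groupLaunch does with the preconnect jobs.

#include <cstdio>
#include <cstdlib>
#include <queue>
#include <thread>
#include <vector>

struct AsyncJob {
  void (*func)(AsyncJob*);      // what to run (e.g. a preconnect step)
  void (*destructor)(void*);    // how to free the job afterwards
  int commRank;                 // minimal stand-in for job->comm
};

void preconnectFunc(AsyncJob* job) {
  printf("preconnect for comm of rank %d\n", job->commRank);
}

int main() {
  std::queue<AsyncJob*> jobs;
  for (int r = 0; r < 4; r++) {
    AsyncJob* j = (AsyncJob*)calloc(1, sizeof(AsyncJob));
    j->func = preconnectFunc;
    j->destructor = free;
    j->commRank = r;
    jobs.push(j);
  }

  // "asyncJobLaunch": run every queued job in parallel, then wait for all of them.
  std::vector<std::thread> workers;
  {
    std::queue<AsyncJob*> q = jobs;            // iterate a copy; `jobs` is drained below
    while (!q.empty()) { workers.emplace_back(q.front()->func, q.front()); q.pop(); }
  }
  for (auto& t : workers) t.join();

  // Drain the queue and let each job clean itself up, as groupLaunch does at the end.
  while (!jobs.empty()) {
    AsyncJob* j = jobs.front(); jobs.pop();
    if (j->destructor) j->destructor(j);
  }
}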

3.3 doLaunches

The preparation before the real launch happens in ncclLaunchPrepare (in enqueue.cc).

// enqueue.cc
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
	ncclResult_t result = ncclSuccess;
  struct ncclKernelPlanner* planner = &comm->planner;
  bool persistent = ncclCudaGraphValid(planner->capturingGraph);
  planner->persistent = persistent;
  int nPlans = 0;
  if (planner->nTasksColl + planner->nTasksP2p != 0) {
    do {
      memset(&planner->wipPlan, 0, sizeof(planner->wipPlan));

      struct ncclKernelPlan* plan = ncclMemoryPoolAlloc<struct ncclKernelPlan>(&comm->memPool_ncclKernelPlan, &comm->memPermanent);
      plan->comm = comm;
      plan->reclaimer.fn = reclaimPlan;
      plan->persistent = persistent;
      // finishPlan() promotes ncclDevWorkStorageType[Fifo|Persistent]->Args if the work can fit.
      plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent
                                         : ncclDevWorkStorageTypeFifo;

      struct ncclKernelPlanBudget budget;
      budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs);
      // Non-persistent kernels fill up at most half of our fifo per kernel.
      budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2;

      // Drain coll tasks first. This is essential since we partition tasks based
      // on the work budget and p2p work isn't collective. If we were to drain p2p
      // first, the place where we cut the kernel could vary by rank which would
      // cause the "shortest channel first" channel picker to have divergent results.
      if (planner->nTasksColl != 0) {
        NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure);
      }
      // And only drain p2p tasks once colls are depleted.
      if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) {
        NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure);
  // ... (rest of the function omitted)
}

The planner has already collected all tasks; the plan is responsible for packaging them into a kernel launch. A new plan is allocated from an internal NCCL memory pool and holds the content of the work to be launched (the workFifo). A budget is also computed here and passed, together with the comm and the plan, into scheduleP2pTasksToPlan. That function moves the tasks from planner.peers[rank].sendQueue / recvQueue into plan.p2pTaskQueue and at the same time fills the corresponding parameters into plan->workFifo for the GPU kernel to execute.
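
Before looking at the real scheduleP2pTasksToPlan, here is a toy sketch of the budget idea (made-up numbers and struct names): a per-kernel byte budget forces the works to be cut into several plans, which is what the testBudget / "open a new plan" logic described below amounts to.

#include <cstdio>
#include <vector>

struct Work { int peer; size_t workBytes; };   // stand-in for ncclDevWorkP2p
struct Plan { std::vector<Work> works; size_t workBytes = 0; };

int main() {
  const size_t budgetBytes = 64;        // pretend the kernel-arg space holds 64 B of work
  const size_t perWorkBytes = 16;       // each p2p work descriptor costs 16 B
  std::vector<Plan> plans(1);

  for (int peer = 0; peer < 8; peer++) {
    // testBudget analogue: does one more work still fit in the current plan?
    if (plans.back().workBytes + perWorkBytes > budgetBytes)
      plans.emplace_back();             // finish this plan, open a new one
    plans.back().works.push_back({peer, perWorkBytes});
    plans.back().workBytes += perWorkBytes;
  }

  for (size_t i = 0; i < plans.size(); i++)
    printf("plan %zu: %zu works, %zu bytes\n", i, plans[i].works.size(), plans[i].workBytes);
  // With these numbers: 8 works of 16 B against a 64 B budget -> 2 plans (kernel launches).
}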

// enqueue.cc
static ncclResult_t scheduleP2pTasksToPlan(
    struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget
  ) {
	struct ncclKernelPlanner::Peer* peers = comm->planner.peers;
	
	plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MAX_NTHREADS);
	if (!plan->kernelSpecialized) {
	    plan->kernelFn = ncclDevKernelForFunc[ncclDevFuncId_P2p()];
	    plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[ncclDevFuncId_P2p()];
	}
	
	while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2;
	
	while (comm->planner.nTasksP2p != 0) {
	// ... (a large block of code)
	}

}
  • peers holds the sendQueue and recvQueue described earlier. The plan's kernel is then initialized as a P2P kernel. nChannelsMin and nChannelsMax bound the number of channels so the hardware is not oversubscribed.

  • The core task-moving loop keeps running while comm->planner.nTasksP2p != 0; as long as tasks remain, it walks the p2pSchedule table. The head send and recv tasks are found via ncclIntruQueueHead(&peers[comm->p2pSchedule[round].sendRank].sendQueue), where sendRank comes from the p2pSchedule lookup. A send to oneself is simply freed and skipped, and the task count drops by 2.

  • When sending to another rank, the testBudget check verifies that the current plan can still hold the task; once the budget is exceeded a new plan is opened. Note that struct ncclTaskP2p* p2pTasks[2] = { recv, send }; stores the pair in reverse order (recv first).

  • addP2pToPlan then places the concrete work into the plan: roughly, one send/recv pair from the planner becomes one ncclDevWorkP2p, which addWorkBatchToPlan attaches to the current plan; if the transfer crosses a NIC buffer, a corresponding proxyOp is generated to assist with scheduling.

    // the addP2pToPlan API in enqueue.cc
    static ncclResult_t addP2pToPlan( ...) {
    	for (int part=0; part < nChannelsMax; part++) {
    				// ........
            protoLL[dir] &= conn->conn.buffs[NCCL_PROTO_LL] != nullptr;
            network[dir] |= conn->transportComm == (dir ? &netTransport.send : &netTransport.recv);
            proxySameProcess[dir] &= conn->proxyConn.sameProcess;
          }
        }
      // computes the following two flags: 1. netRegistered[dir]  2. ipcRegistered[dir]
      
      struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray<ncclWorkList, ncclDevWorkP2p>(&comm->memScoped, 1);
      workNode->workType = ncclDevWorkTypeP2p;
      workNode->size = sizeof(struct ncclDevWorkP2p);
      ncclIntruQueueEnqueue(&plan->workQueue, workNode);
      uint32_t workOffset = plan->workBytes;
      plan->workBytes += sizeof(struct ncclDevWorkP2p);
      struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1);
      // ... (a block of code filling in the work fields)
      struct ncclProxyOp proxyOps[2] = {};
      // ... (a block of code filling in the proxy ops)
    
      for (int part=0; part < nChannelsMax; part++) {
      int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part);
      addWorkBatchToPlan(...);
      addProxyOpIfNeeded(...);
    	}	
    }	
    
    • Some auxiliary information is inspected first to decide whether buffers will be registered and which protocol to use: protoLL[0/1] says whether this pair of tasks can use the LL protocol (faster); network[] says whether this is inter-node communication; proxySameProcess[] says whether both ends are in the same process.
    • If netRegistered is true, ncclRegisterP2pNetBuffer is called for inter-node network communication. If ipcRegistered is true, the IPC-buffer path via ncclRegisterP2pIpcBuffer handles intra-node cross-GPU communication, and the result is written into the subsequent work/proxyOp. (Interleaved in this code are computations of chunkDataSize[dir], nChannels[dir], stepSize, etc., which decide how much data each channel carries.)
    • Next the ncclDevWorkP2p is filled in and added to the plan: send/recv ranks, addresses, sizes, chunk size, protocol type, and whether buffers are registered all go into work, which is then inserted into plan->workQueue. For cross-NIC traffic a proxyOp is also generated, carrying the chunk step count, offset, buffer handle, and so on.
    • The final for loop assigns the batch of work and the corresponding proxyOp to each channel.

    In short, an ncclDevWorkP2p is "one pair of jobs (recv + send) that this rank has to do", described by a work. Why build a batch on top of an already-paired p2p task and only then put it into the plan? Because:

    work  (ncclDevWorkP2p)   describes the actual content of one pair of p2p (send/recv) tasks: addresses, sizes, who sends, who receives
    batch (ncclDevWorkBatch) describes how many works a GPU kernel executes at once, their type (p2p/coll), and on which channel

    Now the work sits in the plan's workQueue (for the subsequent kernel launch), and each channel's workBatchQueue holds that channel's batch organization of the work (think of it as the launch grouping information).

    // the remaining small part of scheduleP2pTasksToPlan
    if (send != nullptr) {
      ncclIntruQueueDequeue(&peers[sendRank].sendQueue);
      ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send);
      comm->planner.nTasksP2p -= 1;
    }
    if (recv != nullptr) {
      ncclIntruQueueDequeue(&peers[recvRank].recvQueue);
      ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv);
      comm->planner.nTasksP2p -= 1;
    }
    // end of scheduleP2pTasksToPlan
    

    The data structures plan->workQueue, plan->p2pTaskQueue, etc. are now ready.
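
To visualize what "works plus per-channel batches" looks like before uploadWork(), here is a hedged, heavily simplified sketch (hypothetical field sets; the real ncclDevWorkP2p/ncclDevWorkBatch carry more fields): fixed-size work records are appended to a byte buffer and per-channel batch descriptors record their offsets.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

struct DevWorkP2p { int sendRank, recvRank; uint32_t sendBytes, recvBytes; }; // 16 B stand-in
struct DevWorkBatch { int channelId; uint32_t offsetBase; uint64_t offsetBitset; };

int main() {
  std::vector<uint8_t> workBuf;           // host-side staging for the FIFO payload
  std::vector<DevWorkBatch> batches;

  for (int ch = 0; ch < 3; ch++) {
    DevWorkP2p w{/*sendRank=*/1, /*recvRank=*/3, 1u << 20, 1u << 20};
    uint32_t offset = (uint32_t)workBuf.size();          // plan->workBytes before the append
    workBuf.resize(workBuf.size() + sizeof w);
    std::memcpy(workBuf.data() + offset, &w, sizeof w);  // append the work record
    batches.push_back({ch, offset, /*offsetBitset=*/1}); // this channel owns the work at `offset`
  }

  printf("%zu batches, %zu bytes of work payload\n", batches.size(), workBuf.size());
  for (auto& b : batches)
    printf("channel %d -> work at offset %u\n", b.channelId, b.offsetBase);
}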

Now back to the remaining code inside ncclLaunchPrepare.

It makes the main launchStream wait for the work already queued on all participating user streams:

ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) {
	//....
	struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue);
  planner->unlaunchedPlansHead = planHead;
  
	cudaStream_t launchStream = planner->streams->stream;
  cudaStream_t deviceStream, launchOrder;
	for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) {
	  CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure);
    CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure);
	}
	...
		
}

The important point to remember is that planner->unlaunchedPlansHead is the head of the planner->planQueue queue (it is used later during the real launch). The main launchStream is made to wait for the work on all participating user streams, i.e. a stream dependency is forced. In other words, any additional streams you used between groupStart and groupEnd are added to planner->streams, and the first stream in that list waits for the events recorded on the other streams before the kernel executes.
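
A standalone CUDA sketch of that stream-dependency trick (not NCCL code; error handling is minimal): record an event on every other user stream and make the launch stream wait on it, which is the scratchEvent pattern in the excerpt above.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

#define CHECK(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
  printf("CUDA error %s at %s:%d\n", cudaGetErrorString(e), __FILE__, __LINE__); return 1; } } while (0)

int main() {
  const int nStreams = 4;
  std::vector<cudaStream_t> userStreams(nStreams);
  for (auto& s : userStreams) CHECK(cudaStreamCreate(&s));

  cudaEvent_t scratchEvent;
  CHECK(cudaEventCreateWithFlags(&scratchEvent, cudaEventDisableTiming));

  // The first user stream plays the role of launchStream.
  cudaStream_t launchStream = userStreams[0];

  // For every other stream, record an event capturing its current tail and make
  // launchStream wait on it. A kernel launched on launchStream afterwards cannot
  // start before the prior work on the other streams has finished.
  for (int i = 1; i < nStreams; i++) {
    CHECK(cudaEventRecord(scratchEvent, userStreams[i]));
    CHECK(cudaStreamWaitEvent(launchStream, scratchEvent, 0));
  }

  CHECK(cudaStreamSynchronize(launchStream));
  for (auto& s : userStreams) CHECK(cudaStreamDestroy(s));
  CHECK(cudaEventDestroy(scratchEvent));
  printf("launchStream now depends on all user streams\n");
  return 0;
}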

Back to the doLaunches code in group.cc:

static ncclResult_t doLaunches(struct ncclComm* head) {
  // ... preparation is done at this point
  if (useBarrier) ncclCommIntraBarrierIn(comm, 1);
      comm = comm->groupNext;
      
  while (true) { // Iterate rounds of launches for clique.
      bool moreRounds = false;
      comm = cliqueHead;
      do { // Iterate clique members.
        struct ncclComm* next = comm->groupNext;
        if (useBarrier) {
          // Barrier reduction result tells us if this was the final round.
          moreRounds = 0 != ncclCommIntraBarrierOut(comm);
        } else {
          moreRounds |= comm->planner.unlaunchedPlansHead != nullptr;
        }
        if (moreRounds) {
          // Pop next unlaunched kernel
          struct ncclKernelPlan* plan = comm->planner.unlaunchedPlansHead;
          if (plan != nullptr) {
            comm->planner.unlaunchedPlansHead = plan->next;
            CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
            NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure);
            NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure);
          }
          // Barrier reduction input indicates if we require further rounds.
          if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0);
          if (plan != nullptr) {
            NCCLCHECKGOTO(ncclLaunchKernelAfter_NoCuda(comm, plan), result, failure);
          }
        } else { // Final round.
          CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure);
          NCCLCHECKGOTO(ncclLaunchFinish(comm), result, failure);
        }
        comm = next;
      } while (comm != cliqueNextHead);
      if (!moreRounds) break;
    }
    cliqueHead = cliqueNextHead;
  } while (cliqueHead != nullptr);
}

After one comm's p2p tasks in a round are handled, a barrier is taken before moving on to the next comm (sibling / clique comm). There are three core launch steps: ncclLaunchKernelBefore_NoUncapturedCuda, ncclLaunchKernel, and ncclLaunchKernelAfter_NoCuda. (A host-only sketch of this round loop appears at the end of this subsection.)

  • The first one: internally it passes comm and plan to uploadWork(), which mainly stages the data in the plan so the kernel can consume it directly for its computation. The storage type is chosen according to workStorageType (how exactly it is determined is not covered here). Below we focus on the ncclDevWorkStorageTypeFifo case.
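
Stepping back to the round loop at the top of doLaunches, here is a host-only toy sketch of its control flow (no CUDA, no barriers; the plan IDs are made up): each comm in the clique pops at most one unlaunched plan per round, and rounds continue until every comm's list is empty, after which the finish step runs.

#include <cstdio>
#include <deque>
#include <vector>

struct Comm { int cudaDev; std::deque<int> unlaunchedPlans; };

int main() {
  std::vector<Comm> clique = {
    {0, {101, 102}},   // comm on GPU 0 has two plans
    {1, {201}},        // comm on GPU 1 has one plan
  };

  while (true) {                       // iterate rounds of launches
    bool moreRounds = false;
    for (auto& comm : clique) {
      if (!comm.unlaunchedPlans.empty()) {
        int plan = comm.unlaunchedPlans.front();   // pop next unlaunched plan
        comm.unlaunchedPlans.pop_front();
        printf("round launch: dev %d plan %d\n", comm.cudaDev, plan);
      }
      moreRounds |= !comm.unlaunchedPlans.empty();
    }
    if (!moreRounds) break;
  }
  for (auto& comm : clique)
    printf("finish on dev %d\n", comm.cudaDev);    // ncclLaunchFinish analogue
}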

4 Simulation

abstract: assume an alltoall across 4 ranks

The official implementation:

NCCLCHECK(ncclGroupStart());
  for (int r=0; r<nRanks; r++) {
    NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
    NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
  }
NCCLCHECK(ncclGroupEnd());
  1. Step one collects all send and recv tasks into the planner's sendQueue and recvQueue; the concrete work happens in taskAppend(). Rank 0 enqueues the four send/recv pairs below for the alltoall (only three of which involve another rank; the self pair is skipped later).

    rank0 peers   sendQueue                              recvQueue
    0             "S0→0" (send to self, skipped later)   "R0←0" (recv from self, skipped later)
    1             "S0→1"                                 "R0←1"
    2             "S0→2"                                 "R0←2"
    3             "S0→3"                                 "R0←3"

    The p2p tasks of rank 1 in the alltoall are as follows (the rank1→rank1 row is omitted, since the self send/recv pair gets skipped):

    rank1 peers   sendQueue   recvQueue
    0             "S1→0"      "R1←0"
    2             "S1→2"      "R1←2"
    3             "S1→3"      "R1←3"

    Likewise, ranks 2 and 3 have analogous sends and receives.

  2. Skipping the auxiliary operations, we go straight into scheduleP2pTasksToPlan(). The loop variable round indexes the p2pSchedule table (see section 5 for a detailed explanation of this table), which reorders the tasks collected above for ranks 0 through 3. Here is what the first send and recv task of each of the four ranks becomes at round = 1:

    // the table lookup in scheduleP2pTasksToPlan()
    int sendRank = comm->p2pSchedule[round].sendRank;
    int recvRank = comm->p2pSchedule[round].recvRank;
    struct ncclTaskP2p* send = ncclIntruQueueHead(&peers[sendRank].sendQueue);
    struct ncclTaskP2p* recv = ncclIntruQueueHead(&peers[recvRank].recvQueue);
    
    1. Combining with the table in section 5, the round-1 send and recv tasks of the four ranks are:

      rank   sendPeer   card taken from sendQueue   recvPeer   card taken from recvQueue
      0      1          "S0→1"                      3          "R0←3"
      1      2          "S1→2"                      0          "R1←0"
      2      3          "S2→3"                      1          "R2←1"
      3      0          "S3→0"                      2          "R3←2"
    2. struct ncclTaskP2p* p2pTasks[2] = { recv, send }; turns the current rank's tasks into {"R0←3", "S0→1"}. These ncclTaskP2p entries go into addP2pToPlan, which writes each rank's current "who to receive from + who to send to" into the same work, yielding one ncclDevWorkP2p (with sendRank = 1, recvRank = 3). This gets laid out into workQueue / p2pTaskQueue, and the raw parameters needed by the GPU are serialized into the workFifo.

    3. Then uploadWork() + ncclLaunchKernelBefore_* copy the plan's workFifo into comm->devWorkFifo (via UVA or cudaMemcpyAsync) and build the kernelArg structure.

      What these two steps do:

      host: addP2pToPlan()         ← fills these fields
      uploadWork()                 ← writes the work entries into the FIFO
      GPU:  ncclKernel_P2p
        └──── doOneP2p()
                ├─ runSend<Proto>()
                |     ↘ writes across multiple channels based on sendChunkSize/nSendChannels
                └─ runRecv<Proto>()
                      ↘ reads across multiple channels based on recvChunkSize/nRecvChannels
      proxy thread (if enabled)
        └──── netSend / netRecv
                ↘ uses sendNetReg/recvNetReg to decide whether to go through registered buffers
      
    4. ncclLaunchKernel

                     ┌──────── workMask (= fifoBytes - 1)
      kernelArgs     │ workBuf (points to the same FIFO mapping)
      ───────────────┼───────────────────────────────────────────── fifoCursor = start
      batch[0]       │ struct ncclDevWorkBatch (24 B)  ← channel 0
      batch[1]       │ struct ncclDevWorkBatch (24 B)  ← channel 1
      batch[2]       │ struct ncclDevWorkBatch (24 B)  ← channel 2
      ───────────────┼───────────────────────────────────────────── fifoCursor = 3×24 = 72
      work #0        │ 16 B ncclDevWorkP2p (ch0, round0)
      work #1        │ 16 B ncclDevWorkP2p (ch1, round1)
      work #2        │ 16 B ncclDevWorkP2p (ch2, round2)
      ───────────────┼───────────────────────────────────────────── fifoCursor = 72 + 3×16 = 120
      
      • batch array: each entry tells the GPU

        • which channel this batch of work belongs to
        • where it starts in the FIFO (offsetBase)
        • which slots already hold a work (offsetBitset)

      • work array: the actual instructions, 16 B each, aligned to 16.

        Each one contains

        • directionMask (SEND / RECV)
        • sendRank/recvRank
        • per-direction conn addresses, chunkSize, etc.

      All offsets have fifoCursor added in uploadWork(), turning them into absolute addresses inside the FIFO.

5 p2p schedule

The code in init.cc that initializes this schedule table (NCCL 2.22 and later):

  do { // Build p2p schedule
    int node = comm->node;
    int nNodes = comm->nNodes;
    int nRanks = comm->nRanks;
    int local = comm->localRank;
    int nLocals = comm->maxLocalRanks;
    struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
    bool flat = false;
    for (int node = 0; node < nNodes; node++) {
      if (nodeRanks[node].localRanks != nLocals) {
        flat = true;
        nNodes = 1; node = 0;
        nLocals = nRanks; local = rank;
        break;
      }
    }
    int nNodesPow2 = pow2Up(nNodes);
    int nLocalsPow2 = pow2Up(nLocals);
    comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, nRanks);
    comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, nRanks);
    uint32_t nodeRound = 0;
    uint32_t nodeDelta = 0;
    int round = 0;
    // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N.
    // Since that formula only produces valid permutations when N is a pow of 2,
    // we let N = pow2Up(n) and filter out results greater-eq to n.
    // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8
    do {
      if (nodeDelta < nNodes) { // Filter nonsensical node deltas
        int sendNode = (node + nodeDelta) % nNodes;
        int recvNode = (node - nodeDelta + nNodes) % nNodes;
        uint32_t localRound = 0;
        uint32_t localDelta = 0;
        do {
          if (localDelta < nLocals) { // Filter nonsensical node-local deltas
            int sendLocal = (local + localDelta) % nLocals;
            int recvLocal = (local - localDelta + nLocals) % nLocals;
            comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal];
            comm->p2pSchedule[round].recvRank = flat ? recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal];
            round += 1;
          }
          localRound += 1;
          localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update
        } while (localRound != nLocalsPow2);
      }
      nodeRound += 1;
      nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update
    } while (nodeRound != nNodesPow2);

    if (round != nRanks) {
      WARN("P2p schedule creation has bugs.");
      ret = ncclInternalError;
      goto fail;
    }
  } while (0);

The middle do loop iterates across nodes: nodeRound runs from 0 up to nNodesPow2 (the smallest power of two ≥ nNodes). The innermost do loop iterates within a node, using localDelta = (localDelta + localRound) & (nLocalsPow2 - 1) to compute, for each round, which rank the current rank sends to (sendLocal) and which it receives from (recvLocal). Assume four ranks. [Rounding up to a power of two is purely for speed: & 3 here is equivalent to mod 4.] Expanding all iterations gives each rank's tasks after the p2p schedule table is initialized:

Rank   round 0   round 1   round 2   round 3
0      (0, 0)    (1, 3)    (3, 1)    (2, 2)
1      (1, 1)    (2, 0)    (0, 2)    (3, 3)
2      (2, 2)    (3, 1)    (1, 3)    (0, 0)
3      (3, 3)    (0, 2)    (2, 0)    (1, 1)

(ps: in each pair, the first value is the rank to send to and the second is the rank to receive from; both come from the formulas below, and a standalone snippet reproducing this table follows them.)

int sendLocal = (local + localDelta) % nLocals;             // local rank to send to
int recvLocal = (local - localDelta + nLocals) % nLocals;   // local rank to receive from
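
The following standalone snippet reproduces the loop above for the flat single-node 4-rank case and prints the same (send, recv) table; pow2Up here is a local helper mirroring the one used by init.cc.

#include <cstdint>
#include <cstdio>

static int pow2Up(int x) { int p = 1; while (p < x) p <<= 1; return p; }

int main() {
  const int nLocals = 4;
  const int nLocalsPow2 = pow2Up(nLocals);

  for (int local = 0; local < nLocals; local++) {       // one row per rank
    printf("rank %d:", local);
    uint32_t localRound = 0, localDelta = 0;
    do {
      if ((int)localDelta < nLocals) {                  // filter deltas >= nLocals
        int sendLocal = (local + (int)localDelta) % nLocals;
        int recvLocal = (local - (int)localDelta + nLocals) % nLocals;
        printf(" (%d,%d)", sendLocal, recvLocal);
      }
      localRound += 1;
      localDelta = (localDelta + localRound) & (nLocalsPow2 - 1);  // quadratic update
    } while (localRound != (uint32_t)nLocalsPow2);
    printf("\n");
  }
  // Output matches the table: rank 0 -> (0,0) (1,3) (3,1) (2,2), and so on.
}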

6 introduction

6.1 FIFO

A shared region stored in the NCCL comm that holds the ncclDevWork data the device is about to execute. The "task instructions" used across multiple kernel launches are placed in this FIFO, avoiding repeated malloc and memcpy. It is a ring buffer, and a mask is used for fast wrap-around (returning to the start).
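
A hedged sketch of the wrap-around idea (toy code, not the NCCL FIFO): with a power-of-two size, an ever-growing byte cursor maps into the buffer with a single AND against mask = size - 1, which is how a workMask-style field is used.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const uint32_t fifoBytes = 1u << 10;        // must be a power of two
  const uint32_t mask = fifoBytes - 1;        // workMask analogue
  std::vector<uint8_t> fifo(fifoBytes);

  uint64_t cursor = 0;                        // monotonically increasing producer offset
  for (int i = 0; i < 100; i++) {
    uint8_t record[16];
    std::memset(record, i, sizeof record);    // pretend this is one 16 B work entry
    for (uint32_t b = 0; b < sizeof record; b++)
      fifo[(cursor + b) & mask] = record[b];  // wrap-around without a modulo or a branch
    cursor += sizeof record;                  // 100 * 16 B = 1600 B > 1024 B: the cursor wraps
  }
  printf("cursor=%llu maps to fifo offset %llu\n",
         (unsigned long long)cursor, (unsigned long long)(cursor & mask));
}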

6.2 ncclDevKernelArgs

struct alignas(16) ncclDevKernelArgs {
  struct ncclDevComm* comm;                    // points to the device-side communication context ncclDevComm: rank, channels, buffers, and other device-side state
  uint64_t channelMask;                        // marks which channels are enabled (64-bit mask, up to 64 channels)
  enum ncclDevWorkStorageType workStorageType; // storage type of the work buffer (Args, Fifo, Persistent)
  uint32_t workMask;                           // mask used for FIFO wrap-around (typically size-1)
  void* workBuf;                               // points to the device-side work data array (prepared and copied over by the host in uploadWork)
  // A channel's first batch is at `blockIdx.x`. Use `nextJump` to follow rest of list.
  // struct ncclDevWorkBatch batches[];
};

