Basics
- Each GPU contains multiple SMs (streaming multiprocessors).
- When a grid is launched, its blocks are distributed across the SMs for execution; once a block is scheduled onto an SM, it runs only on that SM.
- Several blocks can be assigned to the same SM.
- Threads are grouped in sets of 32 called warps.
- The threads of a block can logically run in parallel, but not all of them execute at the same time physically; that is, different threads within a block may make progress at different rates.
Warps and blocks
- When a block is assigned to an SM, it is partitioned into warps.
- A warp consists of 32 consecutive threads.
- A block can be configured as one-, two-, or three-dimensional, but in hardware it is always organized as one-dimensional.
- For a 1D block, the unique thread ID is stored in threadIdx.x, and threads with consecutive threadIdx.x values are grouped into the same warp. For example, a block of 128 threads is partitioned into 4 warps.
- For a 2D block, each thread's linear ID is computed from threadIdx and blockDim: threadIdx.y * blockDim.x + threadIdx.x
- For a 3D block: threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x (see the sketch after this list)
- If the block size is not a multiple of the warp size, resources are wasted. For example, launching 80 threads still allocates 3 warps, i.e. 96 hardware threads; the 16 extra threads stay inactive but still consume SM resources such as registers.
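A minimal sketch of how these index formulas combine (the kernel and the launch configuration are illustrative, not from the original notes): it prints each thread's linear ID inside its block, plus the warp and lane that ID falls into.
#include <stdio.h>
#include <cuda_runtime.h>

// Hypothetical kernel: compute the linear thread ID inside the block for a 3D
// configuration (the 1D and 2D formulas are special cases of this one).
__global__ void printLinearTid(void)
{
    unsigned int linearTid = threadIdx.z * blockDim.y * blockDim.x
                           + threadIdx.y * blockDim.x
                           + threadIdx.x;
    unsigned int warpId = linearTid / warpSize;   // which warp this thread belongs to
    unsigned int laneId = linearTid % warpSize;   // position inside that warp
    printf("thread (%u,%u,%u) -> linear %u, warp %u, lane %u\n",
           threadIdx.x, threadIdx.y, threadIdx.z, linearTid, warpId, laneId);
}

int main(void)
{
    dim3 block(8, 4, 2);                          // 64 threads -> 2 warps
    printLinearTid<<<1, block>>>();
    cudaDeviceSynchronize();
    return 0;
}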
Warp divergence
if (cond) {
    ...
} else {
    ...
}
- Suppose 32 threads in a warp where 16 take the true branch and 16 take the false branch; threads of the same warp executing different instructions is called warp divergence.
- The number of threads running in parallel in the warp is halved: only 16 threads are active at a time while the other 16 are disabled (while 16 execute the if branch the other 16 wait, and vice versa for the else branch).
- Avoid different execution paths within the same warp.
- The different if-then-else branches are executed one after another (serialized).
- Adjust the branch granularity to a multiple of the warp size; the kernels below illustrate this, and a host-side timing sketch follows them.
__global__ void mathKernel1(float *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    float ia, ib;
    ia = ib = 0.0f;
    if (tid % 2 == 0) {   // even and odd threads take different branches -> divergence inside every warp
        ia = 100.0f;
    } else {
        ib = 200.0f;
    }
    c[tid] = ia + ib;
}
__global__ void mathKernel2(float *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    float ia, ib;
    ia = ib = 0.0f;
    if ((tid / warpSize) % 2 == 0) {   // even and odd warps take different branches -> no divergence inside a warp
        ia = 100.0f;
    } else {
        ib = 200.0f;
    }
    c[tid] = ia + ib;
}
__global__ void mathKernel3(float *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    float ia, ib;
    ia = ib = 0.0f;
    bool ipred = (tid % 2 == 0);
    if (ipred) {   // two independent if statements instead of if-else expose the short branches to compiler optimizations such as branch predication
        ia = 100.0f;
    }
    if (!ipred) {
        ib = 200.0f;
    }
    c[tid] = ia + ib;
}
__global__ void mathKernel4(float *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    float ia, ib;
    ia = ib = 0.0f;
    int itid = tid >> 5;          // warp index (tid / 32)
    if ((itid & 0x01) == 0) {     // even and odd warps take different branches
        ia = 100.0f;
    } else {
        ib = 200.0f;
    }
    c[tid] = ia + ib;
}
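A host-side harness is not included in the notes; the sketch below (array size, block size, and the use of CUDA events are my assumptions, and the kernels are assumed to be defined in the same file) launches the divergent and warp-granularity versions so their times can be compared. The original material measures branch efficiency with a profiler; this only compares wall-clock time.
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int n = 1 << 20;                          // assumed element count
    dim3 block(256);
    dim3 grid((n + block.x - 1) / block.x);

    float *d_c;
    cudaMalloc(&d_c, n * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    mathKernel1<<<grid, block>>>(d_c);        // divergence within every warp
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms1 = 0.0f;
    cudaEventElapsedTime(&ms1, start, stop);

    cudaEventRecord(start);
    mathKernel2<<<grid, block>>>(d_c);        // branch granularity = warp size
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms2 = 0.0f;
    cudaEventElapsedTime(&ms2, start, stop);

    printf("mathKernel1 %.3f ms, mathKernel2 %.3f ms\n", ms1, ms2);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_c);
    return 0;
}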
Occupancy
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(int argc, char *argv[])
{
    int iDev = 0;
    cudaDeviceProp iProp;
    cudaGetDeviceProperties(&iProp, iDev);
    printf("Device %d: %s\n", iDev, iProp.name);
    printf("  Number of multiprocessors: %d\n", iProp.multiProcessorCount);
    printf("  Total amount of constant memory: %4.2f KB\n", iProp.totalConstMem / 1024.0);
    printf("  Total amount of shared memory per block: %4.2f KB\n", iProp.sharedMemPerBlock / 1024.0);
    printf("  Total number of registers available per block: %d\n", iProp.regsPerBlock);
    printf("  Warp size: %d\n", iProp.warpSize);
    printf("  Maximum number of threads per block: %d\n", iProp.maxThreadsPerBlock);
    printf("  Maximum number of threads per multiprocessor: %d\n", iProp.maxThreadsPerMultiProcessor);
    printf("  Maximum number of warps per multiprocessor: %d\n", iProp.maxThreadsPerMultiProcessor / 32);
    return EXIT_SUCCESS;
}
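Not covered in the notes, but related: the runtime also provides cudaOccupancyMaxActiveBlocksPerMultiprocessor, which estimates how many blocks of a given kernel can be resident on one SM. A minimal sketch (it reuses mathKernel2 from above, so it assumes that kernel is defined in the same file; the block size is an assumption):
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int blockSize = 256;                      // assumed block size
    int numBlocks = 0;                        // resident blocks per SM for this kernel
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, mathKernel2, blockSize, 0);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // occupancy = active warps per SM / maximum warps per SM
    float activeWarps = numBlocks * blockSize / (float)prop.warpSize;
    float maxWarps = prop.maxThreadsPerMultiProcessor / (float)prop.warpSize;
    printf("resident blocks per SM: %d, estimated occupancy: %.2f\n",
           numBlocks, activeWarps / maxWarps);
    return 0;
}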
Guidelines for grid and block size
- Keep the number of threads per block a multiple of the warp size (32).
- Use at least 128 or 256 threads per block (avoid blocks that are too small).
- Adjust the block size according to the kernel's resource requirements.
- Keep the number of blocks well above the number of SMs (see the sketch below).
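A minimal sketch of these rules (the data size and the placeholder kernel are my assumptions): the block size is a multiple of 32, and the grid size is rounded up so every element is covered.
#include <stdio.h>
#include <cuda_runtime.h>

// placeholder kernel, not from the original notes
__global__ void fillKernel(float *data, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) data[idx] = (float)idx;
}

int main(void)
{
    int n = 1 << 22;                          // assumed number of elements
    float *d_data;
    cudaMalloc(&d_data, n * sizeof(float));

    dim3 block(256);                          // multiple of the warp size, >= 128
    dim3 grid((n + block.x - 1) / block.x);   // round up: the grid covers all n elements
    printf("grid %u, block %u\n", grid.x, block.x);

    fillKernel<<<grid, block>>>(d_data, n);
    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}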
Synchronization
- System level: wait for the host and the device to complete.
- Block level: all threads of a block wait until they reach the same point.
cudaError_t cudaDeviceSynchronize(void)
blocks the host until the device has completed all previously issued work.
__device__ void __syncthreads(void)
makes every thread of the block wait until all threads of that block have reached the same point (a usage sketch follows).
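A small sketch (the kernel is hypothetical) showing both levels: __syncthreads() separates the write and read phases inside a block, and cudaDeviceSynchronize() makes the host wait for the kernel.
#include <stdio.h>
#include <cuda_runtime.h>

// Hypothetical kernel: every thread writes one value into shared memory, waits
// for the whole block at __syncthreads(), then reads its neighbour's value.
__global__ void shiftKernel(int *out)
{
    __shared__ int tile[128];
    int tid = threadIdx.x;
    tile[tid] = tid;
    __syncthreads();                          // block level: all writes to tile are done
    out[blockIdx.x * blockDim.x + tid] = tile[(tid + 1) % blockDim.x];
}

int main(void)
{
    int *d_out;
    cudaMalloc(&d_out, 128 * sizeof(int));
    shiftKernel<<<1, 128>>>(d_out);           // block size must match the tile size here
    cudaDeviceSynchronize();                  // system level: the host waits for the device
    cudaFree(d_out);
    return 0;
}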
Avoiding branch divergence
- Performing a commutative and associative operation over a vector is known as a reduction problem; a parallel reduction carries it out in parallel (a sequential CPU version is sketched below for reference).
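Not in the original notes: a sequential CPU reduction, useful as a reference and for verifying the GPU results.
#include <stdio.h>

// sequential reduction: any commutative and associative operator works; sum is used here
int cpuReduce(int *data, int n)
{
    int sum = 0;
    for (int i = 0; i < n; i++) sum += data[i];
    return sum;
}

int main(void)
{
    int data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("sum = %d\n", cpuReduce(data, 8));  // prints 36
    return 0;
}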
Neighbored pair
// Neighbored Pair Implementation with divergence
__global__ void reduceNeighbored(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;                           // boundary check
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x; // local pointer to this block's slice of global memory
    // in-place reduction in global memory
    for (int stride = 1; stride < blockDim.x; stride *= 2)  // the stride doubles every round
    {
        if ((tid % (2 * stride)) == 0)              // add neighboring pairs (this condition causes warp divergence)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();                            // wait until all threads of the block finish this round
    }
    if (tid == 0) g_odata[blockIdx.x] = idata[0];   // thread 0 writes the block's partial sum to global memory
}
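The kernel leaves one partial sum per block in g_odata, so the reduction has to be finished on the host. A hedged host-side sketch (input size, block size, and the all-ones test data are my assumptions; reduceNeighbored is assumed to be defined in the same file):
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(void)
{
    unsigned int n = 1 << 24;                 // assumed input size
    dim3 block(512);
    dim3 grid((n + block.x - 1) / block.x);

    size_t ibytes = n * sizeof(int);
    size_t obytes = grid.x * sizeof(int);
    int *h_idata = (int *)malloc(ibytes);
    int *h_odata = (int *)malloc(obytes);
    for (unsigned int i = 0; i < n; i++) h_idata[i] = 1;

    int *g_idata, *g_odata;
    cudaMalloc(&g_idata, ibytes);
    cudaMalloc(&g_odata, obytes);
    cudaMemcpy(g_idata, h_idata, ibytes, cudaMemcpyHostToDevice);

    reduceNeighbored<<<grid, block>>>(g_idata, g_odata, n);
    cudaMemcpy(h_odata, g_odata, obytes, cudaMemcpyDeviceToHost);

    // the kernel produced one partial sum per block; add them up on the host
    long long sum = 0;
    for (unsigned int i = 0; i < grid.x; i++) sum += h_odata[i];
    printf("gpu sum = %lld (expected %u)\n", sum, n);

    cudaFree(g_idata); cudaFree(g_odata);
    free(h_idata); free(h_odata);
    return 0;
}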
Optimization (I did not fully understand this at first; see the explanation after the code)
__global__ void reduceNeighboredLess(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x;
    for (int stride = 1; stride < blockDim.x; stride *= 2)
    {
        // map thread tid onto the data index it works on, keeping the working threads packed together
        int index = 2 * stride * tid;
        if (index < blockDim.x)
        {
            idata[index] += idata[index + stride];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
The key change is int index = 2 * stride * tid; together with the test index < blockDim.x: instead of keeping every (2 * stride)-th thread active, the work is remapped onto the lowest thread IDs, so each warp is either fully active or fully idle. For a block of 512 threads (16 warps), the first 8 warps (8 * 32 = 256 threads) perform the first round of the reduction while the remaining 8 warps do nothing; in the second round the first 4 warps work while the remaining 12 warps do nothing. Divergence only appears in the last rounds, when the number of active threads drops below the warp size.
Interleaved pair
- The stride starts at half the block size and is halved after each iteration.
- Compared with the neighbored version, the number of working threads is unchanged, but each thread loads from different locations in global memory.
__global__ void reduceInterleaved(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x;
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)  // the stride halves every round
    {
        if (tid < stride)                      // only the first 'stride' threads stay active
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
Loop unrolling
for (int i = 0; i < 100; i++) {               // the loop condition is checked 100 times
    a[i] = b[i] + c[i];
}
for (int i = 0; i < 100; i += 2) {            // unrolled by a factor of 2: the condition is checked only 50 times
    a[i] = b[i] + c[i];
    a[i + 1] = b[i + 1] + c[i + 1];
}
__global__ void reduceUnrolling2(int *g_idata, int *g_odata, unsigned int n)
{
    // set thread ID
    unsigned int tid = threadIdx.x;
    // each thread block processes two data blocks
    unsigned int idx = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 2;
    // unrolling 2: add the element one data block away before the in-block reduction
    if (idx + blockDim.x < n) g_idata[idx] += g_idata[idx + blockDim.x];
    __syncthreads();
    // in-place reduction in global memory
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
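Because each block now consumes two data blocks, the kernel must be launched with half as many blocks as the earlier kernels. A hedged launch sketch (sizes are assumptions; reduceUnrolling2 is assumed to be defined in the same file):
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    unsigned int n = 1 << 24;                 // assumed input size
    dim3 block(512);
    dim3 grid((n + block.x - 1) / block.x);

    int *g_idata, *g_odata;
    cudaMalloc(&g_idata, n * sizeof(int));
    cudaMalloc(&g_odata, grid.x * sizeof(int));
    cudaMemset(g_idata, 0, n * sizeof(int));

    // half as many blocks: each block reduces 2 * blockDim.x elements,
    // and only grid.x / 2 partial sums are produced in g_odata
    reduceUnrolling2<<<grid.x / 2, block>>>(g_idata, g_odata, n);
    cudaDeviceSynchronize();
    printf("launched %u blocks of %u threads\n", grid.x / 2, block.x);

    cudaFree(g_idata);
    cudaFree(g_odata);
    return 0;
}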
Reducing with unrolled warps
__syncthreads is used for intra-block synchronization: it guarantees that, before any thread moves on to the next round, all threads of the block have written their partial results of the current round to global memory.
__global__ void reduceUnrolling8(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    // each thread block processes eight data blocks
    unsigned int idx = blockIdx.x * blockDim.x * 8 + threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 8;
    // unrolling 8: each thread first sums its eight elements
    if (idx + 7 * blockDim.x < n)
    {
        int a1 = g_idata[idx];
        int a2 = g_idata[idx + blockDim.x];
        int a3 = g_idata[idx + 2 * blockDim.x];
        int a4 = g_idata[idx + 3 * blockDim.x];
        int b1 = g_idata[idx + 4 * blockDim.x];
        int b2 = g_idata[idx + 5 * blockDim.x];
        int b3 = g_idata[idx + 6 * blockDim.x];
        int b4 = g_idata[idx + 7 * blockDim.x];
        g_idata[idx] = a1 + a2 + a3 + a4 + b1 + b2 + b3 + b4;
    }
    __syncthreads();
    // in-place reduction in global memory
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }
    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
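reduceUnrolling8 above only unrolls the data loads; the warp unrolling that gives this section its name replaces the last iterations of the stride loop (when 32 or fewer threads remain) with straight-line, warp-synchronous code. The sketch below follows the usual reduceUnrollWarps8 pattern and is my reconstruction, not code from the notes; the volatile qualifier forces each intermediate value to be read from and written to memory instead of being cached in registers.
__global__ void reduceUnrollWarps8(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x * 8 + threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x * 8;

    // unrolling 8: same data unrolling as reduceUnrolling8
    if (idx + 7 * blockDim.x < n)
    {
        int sum = 0;
        for (int i = 0; i < 8; i++) sum += g_idata[idx + i * blockDim.x];
        g_idata[idx] = sum;
    }
    __syncthreads();

    // stop the stride loop one warp early
    for (int stride = blockDim.x / 2; stride > 32; stride >>= 1)
    {
        if (tid < stride) idata[tid] += idata[tid + stride];
        __syncthreads();
    }

    // unrolled, warp-synchronous tail for the last 32 threads
    if (tid < 32)
    {
        volatile int *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}
Note that this tail relies on implicit warp-synchronous execution; on architectures with independent thread scheduling (Volta and later) it should use __syncwarp() or warp shuffle intrinsics instead.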
Reducing with complete unrolling
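This section is empty in the notes. As a hedged sketch: since the block size is bounded (at most 1024 threads) and assumed to be a power of two, the stride loop can be unrolled completely into a fixed sequence of if statements.
// Sketch (my reconstruction): completely unrolled in-block reduction.
// The 8-way data unrolling is omitted; it is identical to reduceUnrolling8.
// Assumes blockDim.x is a power of two between 64 and 1024, and n is a multiple of blockDim.x.
__global__ void reduceCompleteUnrollWarps(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x;

    if (blockDim.x >= 1024 && tid < 512) idata[tid] += idata[tid + 512];
    __syncthreads();
    if (blockDim.x >= 512 && tid < 256) idata[tid] += idata[tid + 256];
    __syncthreads();
    if (blockDim.x >= 256 && tid < 128) idata[tid] += idata[tid + 128];
    __syncthreads();
    if (blockDim.x >= 128 && tid < 64) idata[tid] += idata[tid + 64];
    __syncthreads();

    // warp-synchronous tail, as in the previous sketch
    if (tid < 32)
    {
        volatile int *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}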
Reducing with template functions
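Also empty in the notes. A hedged sketch of the template variant: the block size becomes a compile-time template parameter, so the size checks are resolved at compile time and the dead branches disappear entirely; the body otherwise mirrors the complete-unroll sketch above.
// Sketch (my reconstruction): block size as a compile-time template parameter.
template <unsigned int iBlockSize>
__global__ void reduceCompleteUnroll(int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    int *idata = g_idata + blockIdx.x * blockDim.x;

    // these conditions are evaluated at compile time for each instantiation
    if (iBlockSize >= 1024 && tid < 512) idata[tid] += idata[tid + 512];
    __syncthreads();
    if (iBlockSize >= 512 && tid < 256) idata[tid] += idata[tid + 256];
    __syncthreads();
    if (iBlockSize >= 256 && tid < 128) idata[tid] += idata[tid + 128];
    __syncthreads();
    if (iBlockSize >= 128 && tid < 64) idata[tid] += idata[tid + 64];
    __syncthreads();

    if (tid < 32)
    {
        volatile int *vmem = idata;
        vmem[tid] += vmem[tid + 32];
        vmem[tid] += vmem[tid + 16];
        vmem[tid] += vmem[tid + 8];
        vmem[tid] += vmem[tid + 4];
        vmem[tid] += vmem[tid + 2];
        vmem[tid] += vmem[tid + 1];
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}

// The block size must be known when the kernel is instantiated, for example:
// switch (blocksize) {
//     case 512: reduceCompleteUnroll<512><<<grid, 512>>>(d_idata, d_odata, n); break;
//     case 256: reduceCompleteUnroll<256><<<grid, 256>>>(d_idata, d_odata, n); break;
//     case 128: reduceCompleteUnroll<128><<<grid, 128>>>(d_idata, d_odata, n); break;
// }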
Dynamic parallelism
Nested execution
- Kernel launches are organized into parent and child grids; a parent grid is not considered complete until all of its child grids have finished.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void nestedHelloWorld(int const iSize, int iDepth)
{
    int tid = threadIdx.x;
    printf("Recursion=%d: Hello World from thread %d block %d\n", iDepth, tid, blockIdx.x);
    if (iSize == 1) return;                   // stop the recursion at one thread
    int nthreads = iSize >> 1;                // halve the number of threads for the child grid
    if (tid == 0 && nthreads > 0)
    {
        // thread 0 launches a child grid from the device
        nestedHelloWorld<<<1, nthreads>>>(nthreads, ++iDepth);
        printf("-------> nested execution depth: %d\n", iDepth);
    }
}
int main(int argc, char **argv)
{
    int size = 8;
    int blocksize = 8;   // initial block size
    int igrid = 1;
    if (argc > 1)
    {
        igrid = atoi(argv[1]);
        size = igrid * blocksize;
    }
    dim3 block(blocksize, 1);
    dim3 grid((size + block.x - 1) / block.x, 1);
    printf("%s Execution Configuration: grid %d block %d\n", argv[0], grid.x, block.x);
    nestedHelloWorld<<<grid, block>>>(block.x, 0);
    cudaDeviceReset();
    return 0;
}
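Not stated in the notes: kernels that launch other kernels (dynamic parallelism) require a device of compute capability 3.5 or higher and must be compiled with relocatable device code and linked against the device runtime, for example (assuming the file is named nestedHelloWorld.cu; use a newer -arch value matching your GPU):
nvcc -arch=sm_35 -rdc=true nestedHelloWorld.cu -o nestedHelloWorld -lcudadevrt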