1 CUDA basics
cudaDeviceSynchronize() blocks the host until all device work has finished, preventing the program from exiting early, because device kernels run asynchronously with respect to the host.
add<<<N, 1>>>();
This launches the kernel with N thread blocks (N copies of this code), each block responsible for one element; this is called parallel blocks. The alternative is parallel threads:
add<<<1, N>>>();
CUDA APIs take raw host pointers (shared/smart pointers are not supported), and cudaMemcpy involves a bus transfer. To support vectors of arbitrary size, where the thread count is not an integer multiple of the length N to be processed, use:
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < N)
{
    ...
}
This if-guard pattern is extremely common.
On the host, round the grid size up:
add<<<(N + block_size - 1) / block_size, block_size>>>(...);
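To make the pattern concrete, here is a minimal self-contained sketch (not from the course code): a vec_add kernel with the index guard, launched with a rounded-up grid. The kernel name, N, and block_size are illustrative choices.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Hypothetical kernel name; the index guard keeps extra threads from writing past N.
__global__ void vec_add(const float *a, const float *b, float *c, int N)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < N)
        c[index] = a[index] + b[index];
}

int main()
{
    const int N = 1000000;          // deliberately not a multiple of the block size
    const int block_size = 256;
    std::vector<float> ha(N, 1.0f), hb(N, 2.0f), hc(N);

    float *da, *db, *dc;
    cudaMalloc(&da, N * sizeof(float));
    cudaMalloc(&db, N * sizeof(float));
    cudaMalloc(&dc, N * sizeof(float));
    cudaMemcpy(da, ha.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb.data(), N * sizeof(float), cudaMemcpyHostToDevice);

    // Round the grid size up so every element has a thread.
    vec_add<<<(N + block_size - 1) / block_size, block_size>>>(da, db, dc, N);

    cudaMemcpy(hc.data(), dc, N * sizeof(float), cudaMemcpyDeviceToHost); // blocking copy also synchronizes
    printf("hc[0] = %f\n", hc[0]);

    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}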
2 optimization part 1
2.1 Indexing in the stencil example
int idx = threadIdx.x + blockDim.x * blockIdx.x; // global index
The shared-memory tile needs blockDim.x + 2 * RADIUS elements (a halo of RADIUS cells on each side):
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
Host-side call:
add<<<(N + M - 1) / M, M>>>(a,b,c,d)
int gidx = threadIdx.x + blockIdx.x * blockDim.x;
int lidx = threadIdx.x + RADIUS;
// threads can execute in any order, so every thread must reach the barrier: __syncthreads();
int result = 0;
for (int offset = -RADIUS; offset <= RADIUS; ++offset)
{
    result += temp[lidx + offset];
}
out[gidx] = result;
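Putting the pieces together, this is a minimal sketch of the full 1D stencil kernel the snippet above belongs to, assuming BLOCK_SIZE and RADIUS are compile-time constants and that the caller offsets the input pointer so RADIUS halo cells exist on each side.

#define RADIUS 3
#define BLOCK_SIZE 256

// 1D stencil: each output point is the sum of the 2*RADIUS+1 neighbouring inputs.
__global__ void stencil_1d(const int *in, int *out)
{
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gidx = threadIdx.x + blockIdx.x * blockDim.x;
    int lidx = threadIdx.x + RADIUS;

    // Load this block's elements plus the halo into shared memory.
    temp[lidx] = in[gidx];
    if (threadIdx.x < RADIUS)
    {
        temp[lidx - RADIUS] = in[gidx - RADIUS];
        temp[lidx + BLOCK_SIZE] = in[gidx + BLOCK_SIZE];
    }

    __syncthreads(); // all loads must finish before any thread reads temp

    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; ++offset)
        result += temp[lidx + offset];

    out[gidx] = result;
}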
Using shared memory
// cooperative groups
// shared memory: 48 KB or 64 KB (the limit depends on the architecture)
// dynamic shared memory
__global__ void dynamic_shared_memory_kernel(..., int shared_mem_in_bytes)
{
    extern __shared__ int temp[];
}
// launch: the byte count goes in as the third launch-configuration parameter
int shared_mem_in_bytes = 192;
dynamic_shared_memory_kernel<<<grid, block, shared_mem_in_bytes>>>(..., shared_mem_in_bytes);
The key idea is to expose parallelism.
2.2 CUDA hardware characteristics
CUDA GPUs come in the Kepler/Maxwell/Pascal/Volta architecture generations.
- The number of cores is an important specification.
- DP units handle double-precision data; Kepler has 64 of them per SM.
- LD/ST units handle loads and stores; the register file is 64K registers per SM.
- Product lines: GeForce → Titan → Tesla.
- Instructions are issued per warp, not per individual thread.
- Global memory latency: > 100 cycles (roughly 100~400 cycles).
- Arithmetic latency: < 100 cycles (roughly 5~10 cycles), much easier to hide; launch enough threads to hide latency and keep the pipeline full.
- Low-level assembly language: SASS.
- The GPU issues instructions in order; more warps give more latency hiding. With up to 64 warps per SM, a machine with 80 SMs can hold 64 × 32 × 80 = 163840 resident threads (at most 2048 threads per SM, 1024 threads per block).
- Dual issue: adjacent instructions with no dependency between them can be issued together.
2.3 summary
Use enough total threads to keep the GPU busy; practice analysis-driven optimization; set the block size to a multiple of the warp size (32); give each SM at least 16 thread blocks (Maxwell/Pascal/Volta allow 32 thread blocks per SM).
3 optimization part 2
- global memory and shared memory,
- local storage,
- optimize code without using profiler.
3.1 global memory throughput
Registers are managed by the compiler; shared memory has very high throughput, > 1 TB/s.
The load granularity is 128 bytes; non-caching loads (which use smaller 32-byte segments) can be an optimization.
Bus utilization for three access patterns:
bus utilization: 50%
int c = a[idx - 2];
Fortunately the 50% utilization only happens on a warp's first load; subsequent loads hit in the cache, so bus utilization becomes 100%.
bus utilization: 100%
int c = a[rand() % warpSize];
bus utilization: 128 / (N * 128)
int c = a[rand()];
With caching loads, the random-access pattern is where the performance bottleneck is likely to be. With non-caching loads, the random-access pattern can reach higher bus utilization, because a full 128-byte line does not have to be loaded each time (compile with -Xptxas -dlcm=cg).
The L1 and L2 caches are also optimization points.
Coalescing:
Shared memory has 32 banks, each 4 bytes wide; when different threads read the same word it is multicast to them. Global-memory throughput depends only on DRAM and does not go through PCIe/NVLink. Thread blocks that are too large can cause cache thrashing; for cache blocking, target the L2 cache first.
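As a hedged illustration of coalescing (not course code), the two kernels below read the same array with a unit stride and with a larger stride; the first lets a warp's 32 loads map onto a single 128-byte transaction, while the second wastes most of every line it touches. The kernel names and the stride parameter are illustrative.

// Coalesced: consecutive threads read consecutive 4-byte words,
// so one warp's 32 loads fit in a single 128-byte transaction.
__global__ void copy_coalesced(const float *in, float *out, int n)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
        out[i] = in[i];
}

// Strided: consecutive threads are 'stride' elements apart, so each
// 128-byte line transferred contains only one useful word per thread.
__global__ void copy_strided(const float *in, float *out, int n, int stride)
{
    int i = (threadIdx.x + blockIdx.x * blockDim.x) * stride;
    if (i < n)
        out[i] = in[i];
}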
The topics below are more specific.
4 Atomics, reductions, warp shuffle
4.1 atomic
The thread strategy is one of the following two:
- transformation: one thread per output point
- reduction: what should each thread do?
*c += a[i];
A reduction needs synchronization. CUDA does not enforce any ordering among threads, so we need atomic operations; the coordination is implemented at the L2 cache, and the types atomics support differ between chips. The classic parallel reduction uses a tree-based approach, which also needs synchronization, so-called global synchronization; another approach is to decompose the work into multiple kernels, since thread blocks retire one after another. Within a block, synchronization uses the interface
__syncthreads();
The summation below uses the sweep indexing pattern (parallel sweep reduction):
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) // parallel sweep reduction
{
    if (tid < s)
    {
        sdata[tid] += sdata[tid + s];
    }
    __syncthreads();
}
The complete reduction code combines two parts: a grid-stride loop and a parallel sweep reduction.
__global__ void reduce_a(float *gdata, float *out)
{
    __shared__ float sdata[BLOCK_SIZE];
    int tid = threadIdx.x;
    sdata[tid] = 0.0f;
    size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    /* collect data into this block */
    while (idx < N) // grid-stride loop
    {
        sdata[tid] += gdata[idx];
        idx += gridDim.x * blockDim.x;
    }
    /* reduction within one block */
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) // parallel sweep reduction
    {
        __syncthreads();
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
    }
    if (tid == 0)
    {
        atomicAdd(out, sdata[0]); // one atomic per block (not per thread) yields the total sum, a significant optimization;
                                  // atomicAdd on double requires compute capability >= 6.0
        // out[blockIdx.x] = sdata[0]; // alternative: write per-block partial sums instead
    }
}
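A minimal host-side driver for reduce_a might look like the sketch below. It assumes reduce_a plus its N and BLOCK_SIZE constants are defined in the same file, and the grid size of 160 blocks is just an illustrative choice (the grid-stride loop covers any N).

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

int main()
{
    std::vector<float> h(N, 1.0f);               // expected sum: N
    float *gdata, *out;
    cudaMalloc(&gdata, N * sizeof(float));
    cudaMalloc(&out, sizeof(float));
    cudaMemcpy(gdata, h.data(), N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(float));           // reduce_a accumulates into *out via atomicAdd

    reduce_a<<<160, BLOCK_SIZE>>>(gdata, out);   // any grid size works: the grid-stride loop covers all N elements

    float result;
    cudaMemcpy(&result, out, sizeof(float), cudaMemcpyDeviceToHost); // this copy synchronizes with the kernel
    printf("sum = %f\n", result);
    cudaFree(gdata);
    cudaFree(out);
    return 0;
}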
Atomics can become a performance constraint; a warp is 32 threads and they run at the same time.
4.2 warp shuffle
Advantages: reduced or eliminated shared-memory usage, less explicit synchronization logic, and one instruction instead of several. Warp shuffle needs no shared memory or any other kind of memory; the threads within a warp can communicate with each other directly. __shfl_sync() and related operations were introduced in CUDA 9; they carry the _sync suffix and take a mask, where the mask specifies which lanes participate.
__shared__ float sdata[32];
int lane = threadIdx.x % warpSize;   ///< lane: which thread within the warp
int warpID = threadIdx.x / warpSize; ///< warpID: which warp; threadIdx.x runs over 0..BLOCK_SIZE-1 within the block
if (lane == 0) sdata[warpID] = val;  ///< one result per warp: warp0, warp1, warp2, warp3, ...
The complete code:
__global__ void reduce_ws(float *gdata, float *out)
{
    __shared__ float sdata[32]; // 32 warps * 32 lanes = 1024, so block_size <= 1024
    int tid = threadIdx.x;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    float val = 0.0f;
    unsigned mask = 0xffffffffU;
    int lane = threadIdx.x % warpSize;
    int warpID = threadIdx.x / warpSize;
    while (idx < N)
    {
        val += gdata[idx];
        idx += gridDim.x * blockDim.x; ///< grid-stride loop: stride = number of blocks * threads per block
    }
    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    {
        val += __shfl_down_sync(mask, val, offset); ///< accumulate within this warp; __shfl_down_sync is a single instruction
    }
    if (lane == 0)
    {
        sdata[warpID] = val;
    }
    __syncthreads(); ///< wait until every warp has finished its sum and written val to shared memory
    if (warpID == 0) ///< the per-warp partial sums are now in sdata; only the first warp reduces them
    {
        /* lanes of warp 0 read the per-warp partial sums */
        val = (tid < blockDim.x / warpSize) ? sdata[lane] : 0;
        /* one more round of __shfl_down_sync to accumulate within warp 0 */
        for (int offset = warpSize / 2; offset > 0; offset >>= 1)
        {
            val += __shfl_down_sync(mask, val, offset);
        }
        /* combine the per-block results across the grid */
        if (tid == 0) atomicAdd(out, val);
    }
}
Warp shuffle can also be used to broadcast one value to the other lanes of a warp, to perform atomic aggregation, and inside cooperative groups. An SM supports up to 64 warps.
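A small hedged sketch of the broadcast use case: __shfl_sync(mask, var, srcLane) returns var as seen by srcLane, so every lane can pick up lane 0's value. The kernel name and the value 42 are illustrative.

// Broadcast the value held by lane 0 to every lane of the warp.
__global__ void broadcast_example(int *out)
{
    unsigned mask = 0xffffffffU;
    int lane = threadIdx.x % warpSize;
    int val = (lane == 0) ? 42 : 0;   // only lane 0 holds the interesting value
    val = __shfl_sync(mask, val, 0);  // now every lane sees 42
    out[threadIdx.x] = val;
}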
4 managed memory
Unified memory (UM) is used through this interface:
signed char *data;
cudaMallocManaged(&data, N);
// do something with data
cudaFree(data);
Note that UM does not guarantee ordering, nor does it provide data-visibility guarantees; Kepler/Maxwell have no system-wide atomics. In most cases UM is not faster than manual data movement; its main benefit is simplifying porting.
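A minimal UM sketch, with a hypothetical inc kernel just to have something touch the data on the device; note the cudaDeviceSynchronize() before the host reads the managed buffer again.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative kernel (not from the course code): increment every element.
__global__ void inc(signed char *data, size_t n)
{
    size_t i = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
    if (i < n) data[i] += 1;
}

int main()
{
    const size_t N = 1 << 20;
    signed char *data;
    cudaMallocManaged(&data, N);          // the same pointer is valid on host and device
    for (size_t i = 0; i < N; ++i) data[i] = 0;

    inc<<<(N + 255) / 256, 256>>>(data, N);
    cudaDeviceSynchronize();              // required before the host touches managed data again

    printf("data[0] = %d\n", data[0]);
    cudaFree(data);
    return 0;
}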
5 concurrency
Overlap memory copies with computation.
Pinned memory gives faster host-device copies and allows transfers to run asynchronously with respect to the host:
cudaMemcpyAsync(...);
kernel<<<...>>>(..., ...);
cudaMemcpyAsync(...);
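A minimal overlap sketch along these lines, assuming a hypothetical per-chunk kernel named process; the element count, the chunk split, and the four streams are illustrative choices, and N is assumed to divide evenly by the number of streams.

#include <cuda_runtime.h>

// Hypothetical per-chunk kernel, only here to make the sketch self-contained.
__global__ void process(float *d, size_t n)
{
    size_t i = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
    if (i < n) d[i] *= 2.0f;
}

int main()
{
    const size_t N = 1 << 22;
    const int NSTREAMS = 4;
    const size_t CHUNK = N / NSTREAMS;   // assume N divides evenly, for brevity

    float *h_buf, *d_buf;
    cudaHostAlloc((void **)&h_buf, N * sizeof(float), cudaHostAllocDefault); // pinned host memory
    cudaMalloc(&d_buf, N * sizeof(float));

    cudaStream_t streams[4];
    for (int s = 0; s < NSTREAMS; ++s) cudaStreamCreate(&streams[s]);

    for (int s = 0; s < NSTREAMS; ++s)
    {
        size_t off = s * CHUNK;
        // H2D copy, kernel, and D2H copy for each chunk go into the same stream,
        // so work on different chunks can overlap.
        cudaMemcpyAsync(d_buf + off, h_buf + off, CHUNK * sizeof(float),
                        cudaMemcpyHostToDevice, streams[s]);
        process<<<(CHUNK + 255) / 256, 256, 0, streams[s]>>>(d_buf + off, CHUNK);
        cudaMemcpyAsync(h_buf + off, d_buf + off, CHUNK * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[s]);
    }
    cudaDeviceSynchronize();

    for (int s = 0; s < NSTREAMS; ++s) cudaStreamDestroy(streams[s]);
    cudaFreeHost(h_buf);
    cudaFree(d_buf);
    return 0;
}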
Multi-GPU device management:
cudaGetDeviceProperties(cudaDeviceProp *prop, int device)
Data can be copied directly between devices (device-to-device):
cudaDeviceCanAccessPeer(&canPeer, 0, 1),
cudaMemcpyPeerAsync(dst_ptr, 0, src_ptr, 1, size, stream0);
Streams have a priority attribute: higher-priority work is issued ahead of lower-priority work (currently only two priority levels). CUDA graphs (CUDA 10) support both a manual definition method and a capture method.
6 GPU Performance analysis
- memory hierarchy: shared, constant, texture, caches, etc.
- Analysis-driven optimization: memory bound, compute bound, latency analysis.
- Start from first principles: if a unit shows high utilization, the bound is probably in that part; high SM efficiency means latency is being handled well.
- Compare achieved throughput against peak throughput.
- Compute bound: optimize the instruction mix or shift the computational load onto other unit types; the GPU has single-precision floating-point units, separate double-precision floating-point units, and dedicated integer units.
- Latency bound: have each thread process more data (more work per thread).
- Occupancy: the main limiters are registers per thread, threads per block, and shared-memory usage, i.e. how much of the SM's resources each block loads (for example, an SM can hold 2048 threads). Higher occupancy is an important route to better performance.
nvcc -o t5 t5.cu -arch=sm_70 -lineinfo
-lineinfo is a compile option that makes source-level profiling easier.
OMP_NUM_THREADS=32 ./t5 to speed up the host-side (OpenMP) part of the run.
nsys profile --stats=true ./t5 produces a .qdrep file.
nsys-ui opens the graphical view.
ncu --page details -f -o t5.profout ./t5 collects kernel-level metrics.
ncu-ui opens Nsight Compute.
Issuing an instruction every 6.6 cycles; ideally we want one instruction per cycle.
Example profiler warning:
uncoalesced global access
Optimization applied in the exercise solution: originally a single block was used; now N blocks are launched and the first (outer) loop is removed.
int L, M;            // data dimensions, set elsewhere
int idx = threadIdx.x;
int k = blockIdx.x;  // one block per k instead of looping over k
float vl = 0.0f;
for (int i = 0; i < M; i++)
{
    vl += input[k * M * L + idx * M + i];
}
7 Cooperative Groups
Coalesced groups appeared in CUDA 9; cooperative groups are a way to reason about synchronization and communication among large numbers of threads.
Defining a decomposition:
thread_block g = this_thread_block();
tiled_partition(g, 32);
Key point: the standout benefit of cooperative groups is grid-wide synchronization.
So far we have seen:
- __syncthreads() works at block level;
- shared memory lets the threads of a block communicate;
- warp shuffle works at warp level and does not rely on shared memory.
Multi-grid synchronization; the corresponding namespace:
using namespace cooperative_groups;
Block-level decomposition:
thread_block g = this_thread_block();
tiled_partition(g, threadsPerBlock);
threadsPerBlock should be a power of two (2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096).
g.sync() synchronizes the group's view of global memory. With this_thread_block and tiled_partition, shfl is no longer tied to warpSize as its width; tiles of 4, 8, etc. can be used as needed.
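A minimal sketch of a tile-level sum written with cooperative groups rather than raw warp intrinsics; the 32-wide tile and the kernel name tile_sum are assumptions, and smaller power-of-two tiles work the same way.

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Sum within a 32-thread tile; one atomicAdd per tile accumulates into *out.
__global__ void tile_sum(const float *in, float *out, int n)
{
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float val = (idx < n) ? in[idx] : 0.0f;

    // tile.shfl_down() plays the role of __shfl_down_sync(); the mask is implicit.
    for (int offset = tile.size() / 2; offset > 0; offset /= 2)
        val += tile.shfl_down(val, offset);

    if (tile.thread_rank() == 0)
        atomicAdd(out, val);   // one atomic per tile
}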
7.1 grid group
grid_group grid = this_grid();
grid.sync();
A grid-group kernel is not launched with <<<>>> but with cudaLaunchCooperativeKernel; query whether cooperative launch is supported before using it. Persistent kernels: a queue feeding a GPU pipeline, with different parts of the GPU taking on different roles.
coalesced_group active = coalesced_threads();
Communication and synchronization among the threads of a warp:
g.shfl(prev, 0);
shfl carries its own synchronization, and the mask is implicit.
- How to create a group: auto g1 = this_thread_block();
- How to partition a block: tiled_partition(g1, 32);
- How to use grid-wide sync: auto gg = this_grid(); then gg.sync();
- When using auto, compile with -std=c++11.
8 concurrency with multithreading
- pre-emptive scheduling;
- CUDA streams: two operations issued into the same stream are ordered; two operations issued into separate streams are not;
- MPI: handles communication between nodes;
- OpenMP: provides convenient shared-memory multithreading;
- multithreading + CUDA streams is supported from 11.4/R470 onward;
- use -alloc_flags=gpumps for the Multi-Process Service (MPS);
- by default everything goes into the default stream, so to get concurrency, create streams first.
9 CUDA debugging
cudaError_t cudaSetDevice(int device);
CUDA errors may be detected during kernel execution (asynchronous errors) as well as synchronously. There are also sticky and non-sticky errors: after a sticky error every subsequent runtime API call returns the same error, while after a non-sticky error subsequent runtime API calls execute normally. compute-sanitizer is a functional correctness checking tool; it can also check __syncthreads() usage (synchronization problems):
compute-sanitizer --tool racecheck/synccheck ./my_execute
cuda-gdb
-g : debug host code
-G : debug device code
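A common error-checking pattern (a sketch, not the course's exact macro): wrap runtime calls in a check, and after a kernel launch check both the synchronous launch error and the asynchronous execution error.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Check the cudaError_t returned by a runtime call and abort with a message on failure.
#define CUDA_CHECK(call)                                                        \
    do {                                                                        \
        cudaError_t err__ = (call);                                             \
        if (err__ != cudaSuccess) {                                             \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                         \
                    cudaGetErrorString(err__), __FILE__, __LINE__);             \
            exit(1);                                                            \
        }                                                                       \
    } while (0)

// Usage sketch:
//   CUDA_CHECK(cudaSetDevice(0));
//   my_kernel<<<grid, block>>>(args);
//   CUDA_CHECK(cudaGetLastError());        // launch-time (synchronous) error
//   CUDA_CHECK(cudaDeviceSynchronize());   // execution-time (asynchronous) error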
10 hw
10.1 max active block nums
Suppose each block uses
shared_memory_bytes_per_block,
register_nums_per_block,
warp_nums_per_block,
and each SM provides
max_shared_memory_bytes,
max_register_nums,
max_warp_nums,
max_active_blocks.
The number of active blocks per SM is limited by these properties together with what each block actually occupies:
active_block[0] = max_shared_memory_bytes / shared_memory_bytes_per_block
active_block[1] = max_register_nums / register_nums_per_block
active_block[2] = max_warp_nums / warp_nums_per_block
active_block[3] = max_active_blocks
The maximum number of active blocks is then min(active_block).
The number of blocks in the grid should be several times larger than (number of SMs) × active_block_SM.
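The same min() calculation can be delegated to the occupancy API; a small hedged sketch, where dummy_kernel and the block size of 256 are just placeholders.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical kernel, used only to have something to query.
__global__ void dummy_kernel(float *x) { x[threadIdx.x] += 1.0f; }

int main()
{
    int device = 0;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);

    int block_size = 256;
    size_t dyn_smem = 0;     // dynamic shared memory per block
    int active_blocks = 0;
    // The runtime performs the min() over the register / shared-memory /
    // warp / block-count limits described above.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&active_blocks, dummy_kernel,
                                                  block_size, dyn_smem);

    printf("SMs: %d, active blocks per SM: %d, suggested minimum grid: %d blocks\n",
           prop.multiProcessorCount, active_blocks,
           prop.multiProcessorCount * active_blocks);
    return 0;
}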
nvcc --keep  # keeps intermediate files, from which the registers, shared memory, etc. consumed by a kernel can be read
nvidia-smi
10.2 hw2
./matrix_mul
Init took 0.109592 seconds. Begin compute
Done. Compute took 6.545127 seconds
Success!
./matrix_mul_shared
Init took 0.109291 seconds. Begin compute
Done. Compute took 1.938015 seconds
Success!
Q: Fatal error: kernel execution failure or cudaMemcpy H2D failure (an illegal memory access was encountered at
A: illegal memory access.
10.3 hw3
Adjust the block and thread counts.
nvprof --print-gpu-trace --unified-memory-profiling off ./vector_add
method 1: 1 block, 1 thread; vadd consumed 10.5677 s
nvprof --print-gpu-trace --unified-memory-profiling off ./vector_add_t1024_b1 1 1
consumed 875.157000 ms blocks 1 threads 1
==20625== Profiling application: ./vector_add_t1024_b1 1 1
==20625== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
214.40ms 23.485ms - - - - - 128.00MB 5.3226GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
238.01ms 23.446ms - - - - - 128.00MB 5.3314GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
261.62ms 10.5677s (1 1 1) (1 1 1) 10 0B 0B - - - - GeForce GT 730M 1 7 vadd(float const *, float const *, float*, int) [113]
10.8294s 127.78ms - - - - - 128.00MB 0.9782GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
method 2: 1 block, 1024 threads; vadd consumed 29.219 ms
consumed 888.046000 ms blocks 1 threads 1024
==20749== Profiling application: ./vector_add_t1024_b1 1 1024
==20749== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
220.11ms 26.485ms - - - - - 128.00MB 4.7196GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
246.73ms 23.499ms - - - - - 128.00MB 5.3194GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
270.43ms 29.219ms (1 1 1) (1024 1 1) 10 0B 0B - - - - GeForce GT 730M 1 7 vadd(float const *, float const *, float*, int) [113]
299.66ms 24.603ms - - - - - 128.00MB 5.0806GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
method 3: 160 blocks, 1024 threads; the vadd kernel consumed 28.353 ms. The GeForce GT 730M used here has only two SMs.
consumed 871.433000 ms blocks 160 threads 1024
==21120== Profiling application: ./vector_add_t1024_b1 160 1024
==21120== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
212.18ms 26.602ms - - - - - 128.00MB 4.6988GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
238.91ms 23.453ms - - - - - 128.00MB 5.3299GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
262.56ms 28.353ms (160 1 1) (1024 1 1) 10 0B 0B - - - - GeForce GT 730M 1 7 vadd(float const *, float const *, float*, int) [113]
290.92ms 24.656ms - - - - - 128.00MB 5.0697GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
10.4 hw4
Q:
Fatal error: cudaMalloc failure (out of memory at matrix_sums.cu:58)
*** FAILED - ABORTING
A: out of GPU memory.
nvprof --metrics all ./matrix_sums
which prints the following:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GT 730M (0)"
Kernel: column_sums(float const *, float*, unsigned long)
1 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00%
1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00%
1 sm_efficiency Multiprocessor Activity 99.39% 99.39% 99.39%
1 ipc Executed IPC 0.421180 0.421180 0.421180
1 achieved_occupancy Achieved Occupancy 0.997211 0.997211 0.997211
1 gld_requested_throughput Requested Global Load Throughput 13.552GB/s 13.552GB/s 13.552GB/s
1 gst_requested_throughput Requested Global Store Throughput 3.3881MB/s 3.3881MB/s 3.3881MB/s
1 sm_efficiency_instance Multiprocessor Activity 99.39% 99.39% 99.39%
1 ipc_instance Executed IPC 0.421180 0.421180 0.421180
1 inst_replay_overhead Instruction Replay Overhead 0.718624 0.718624 0.718624
1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000
1 global_replay_overhead Global Memory Replay Overhead 0.000000 0.000000 0.000000
1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.000000 0.000000 0.000000
1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00%
1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 dram_read_throughput Device Memory Read Throughput 13.563GB/s 13.563GB/s 13.563GB/s
1 dram_write_throughput Device Memory Write Throughput 3.5006MB/s 3.5006MB/s 3.5006MB/s
1 gst_throughput Global Store Throughput 3.3881MB/s 3.3881MB/s 3.3881MB/s
1 gld_throughput Global Load Throughput 13.552GB/s 13.552GB/s 13.552GB/s
1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000
1 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
1 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00%
1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00%
1 l2_l1_read_throughput L2 Throughput (L1 Reads) 13.552GB/s 13.552GB/s 13.552GB/s
1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s
1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00%
1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00%
1 nc_gld_requested_throughput Requested Non-Coherent Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
10.5 hw5
- Three reductions: atomic-only reduction, classical parallel reduction with atomic finish, warp-shuffle reduction with atomic finish;
- memory bandwidth = data size in bytes / execution time; this can be compared against the bandwidthTest sample code.
Reduction grid-stride loop:
int tid = threadIdx.x;
int idx = threadIdx.x + blockDim.x * blockIdx.x;
while (idx < N)
{
    sdata[tid] += gdata[idx];      // accumulate rather than overwrite
    idx += gridDim.x * blockDim.x; // grid stride: one thread's consecutive idx values are gridDim.x * blockDim.x apart
}
In reduce_ws:
- a grid-stride loop accumulates across the grid, down to one block's worth of partial values;
- within the block, each warp reduces its own values, leaving one partial sum per warp;
- warp 0 (warpID == 0) then sums the per-warp partials (warp-shuffle reduction);
val += __shfl_down_sync(mask, val, offset); is what accumulates the data within a warp.
matrix_sums includes:
- a block-stride loop
- a parallel sweep reduction
The block-stride loop requires blockDim.x == rows.
unsigned int bidx = blockIdx.x;
unsigned int tidx = threadIdx.x;
while (tidx < ds)
{
    sdata[threadIdx.x] += A[tidx + bidx * blockDim.x]; // fold many elements into one block: block-stride loop
    tidx += blockDim.x;
}
parallel sweep reduction
for (int s = blockDim.x / 2; s > 0; s >>= 1)
{
    __syncthreads();
    if (threadIdx.x < s)
    {
        sdata[threadIdx.x] += sdata[threadIdx.x + s]; // within the block: parallel sweep reduction
    }
}
The final per-block result is res[blockIdx.x] = sdata[0];
Profiling of the optimized matrix_sums:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
213.57ms 46.387ms - - - - - 256.00MB 5.3894GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
260.12ms 18.304ms (8192 1 1) (256 1 1) 9 1.0000KB 0B - - - - GeForce GT 730M 1 7 row_sums(float const *, float*, unsigned long) [110]
278.43ms 8.8300us - - - - - 32.000KB 3.4561GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
278.51ms 10.975us - - - - - 32.000KB 2.7806GB/s Device - GeForce GT 730M 1 7 [CUDA memset]
278.54ms 17.724ms (32 1 1) (256 1 1) 32 0B 0B - - - - GeForce GT 730M 1 7 column_sums(float const *, float*, unsigned long) [119]
296.27ms 8.8300us - - - - - 32.000KB 3.4561GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
Profiling of hw4 with the same data configuration (DSIZE = 8192, BLOCK_SIZE = 256):
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
212.40ms 46.698ms - - - - - 256.00MB 5.3536GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
259.26ms 98.417ms (32 1 1) (256 1 1) 28 0B 0B - - - - GeForce GT 730M 1 7 row_sums(float const *, float*, unsigned long) [110]
357.68ms 8.7990us - - - - - 32.000KB 3.4683GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
357.77ms 5.7590us - - - - - 32.000KB 5.2991GB/s Device - GeForce GT 730M 1 7 [CUDA memset]
357.81ms 17.724ms (32 1 1) (256 1 1) 32 0B 0B - - - - GeForce GT 730M 1 7 column_sums(float const *, float*, unsigned long) [119]
375.53ms 10.175us - - - - - 32.000KB 2.9993GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH
So the optimized row_sums takes 18.304 ms, versus 98.417 ms for the hw4 row_sums.
10.6 hw6
unified memory; not runnable with the CUDA version here, skipped
cudaMallocManaged
10.7 hw7
nvcc -o overlap overlap.cu -DUSE_STREAMS
Add -DUSE_STREAMS when building the stream version.
cudaMemcpyAsync(dst, src, size, direction, stream);
kernel<<<grid, block, 0, stream>>>(...);
cudaMemcpyAsync(...);
cudaDeviceSynchronize();
nvvp ./overlap to visualize the schedule
GPU information can be queried. With only one GPU here, the multi-GPU experiment cannot be done. To drive multiple GPUs, call cudaSetDevice(i) before each cudaMalloc/cudaMemcpy/kernel launch.
nvidia-smi
Sat Mar 29 11:27:44 2025
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.130 Driver Version: 384.130 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GT 730M Off | 00000000:02:00.0 N/A | N/A |
| N/A 54C P0 N/A / N/A | 410MiB / 983MiB | N/A Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 Not Supported |
+-----------------------------------------------------------------------------+
10.8 hw8
In this homework, task1 kept failing and was only fixed after rebooting the system and recompiling. Apparently the GPU can occasionally get into a bad state or crash, so that results never match expectations.
In the matrix transpose, tiles are swapped to their transposed positions first, and the elements inside each tile are then transposed:
tileX = blockIdx.x * blockDim.x
tileY = blockIdx.y * blockDim.y
Load:  in(tileX + threadIdx.x, tileY + threadIdx.y) -> sdata[threadIdx.x][threadIdx.y]
Store, option 1: sdata[threadIdx.x][threadIdx.y] -> out(tileY + threadIdx.y, tileX + threadIdx.x)
Store, option 2: sdata[threadIdx.y][threadIdx.x] -> out(tileY + threadIdx.x, tileX + threadIdx.y)  // better: threadIdx.x stays in the fast-varying global-memory coordinate
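For reference, here is a hedged reconstruction of the tiled shared-memory transpose using the option 2 store pattern; the real task2 kernel may differ in details, and the bounds checks and the THREADS_PER_DIM name are assumptions (the (32 32 1) block size matches the profiles below).

#define THREADS_PER_DIM 32

__global__ void smem_cuda_transpose(int n, const double *in, double *out)
{
    // Note: this [x][y] layout causes shared-memory bank conflicts,
    // which the padded version later in this section removes.
    __shared__ double sdata[THREADS_PER_DIM][THREADS_PER_DIM];

    int tileX = blockIdx.x * blockDim.x;
    int tileY = blockIdx.y * blockDim.y;

    int ix = tileX + threadIdx.x;   // column read from in
    int iy = tileY + threadIdx.y;   // row read from in
    if (ix < n && iy < n)
        sdata[threadIdx.x][threadIdx.y] = in[iy * n + ix];   // coalesced read

    __syncthreads();                // the tile must be fully loaded before it is reused

    int ox = tileY + threadIdx.x;   // column written to out
    int oy = tileX + threadIdx.y;   // row written to out
    if (ox < n && oy < n)
        out[oy * n + ox] = sdata[threadIdx.y][threadIdx.x];  // coalesced write (option 2)
}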
Profiling with option 1 (sdata[threadIdx.x][threadIdx.y] -> out(tileY + threadIdx.y, tileX + threadIdx.x)):
Matrix size is 4096
==4424== NVPROF is profiling process 4424, command: ./task2
Total memory required per matrix is 134.217728 MB
Total time CPU is 0.285797 sec
Performance is 0.939252 GB/s
Total time GPU is 0.049847 sec
Performance is 5.385138 GB/s
PASS
==4424== Profiling application: ./task2
==4424== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
264.51ms 1.3760us - - - - - 128.00MB 9e+04GB/s Device - GeForce GT 730M 1 7 [CUDA memset]
644.59ms 23.251ms - - - - - 128.00MB 5.3761GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
953.96ms 49.538ms (129 129 1) (32 32 1) 8 8.0000KB 0B - - - - GeForce GT 730M 1 7 smem_cuda_transpose(int, double const *, double*) [116]
1.00362s 1.2480us - - - - - 128.00MB 1e+05GB/s Device - GeForce GT 730M 1 7 [CUDA memset]
1.01262s 27.887ms - - - - - 128.00MB 4.4824GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
sk@sk:~/Documents/update_general/general/cuda-training-series/exercises/hw8/task2$ nvcc -o task2 task2.cu
Profiling with option 2 (sdata[threadIdx.y][threadIdx.x] -> out(tileY + threadIdx.x, tileX + threadIdx.y)):
sk@sk:~/Documents/update_general/general/cuda-training-series/exercises/hw8/task2$ nvprof --print-gpu-trace --unified-memory-profiling off ./task2
Matrix size is 4096
==4484== NVPROF is profiling process 4484, command: ./task2
Total memory required per matrix is 134.217728 MB
Total time CPU is 0.304207 sec
Performance is 0.882411 GB/s
Total time GPU is 0.028356 sec
Performance is 9.466672 GB/s
PASS
==4484== Profiling application: ./task2
==4484== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
255.00ms 1.3760us - - - - - 128.00MB 9e+04GB/s Device - GeForce GT 730M 1 7 [CUDA memset]
600.56ms 26.541ms - - - - - 128.00MB 4.7097GB/s Pageable Device GeForce GT 730M 1 7 [CUDA memcpy HtoD]
931.58ms 27.933ms (129 129 1) (32 32 1) 8 8.0000KB 0B - - - - GeForce GT 730M 1 7 smem_cuda_transpose(int, double const *, double*) [116]
959.80ms 512ns - - - - - 128.00MB 2e+05GB/s Device - GeForce GT 730M 1 7 [CUDA memset]
968.80ms 30.481ms - - - - - 128.00MB 4.1009GB/s Device Pageable GeForce GT 730M 1 7 [CUDA memcpy DtoH]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
Option 2 cuts the kernel time from 49.5 ms to 27.9 ms; the rule is that in global memory the threadIdx.x direction should be the contiguous, fastest-varying one.
Adding a column to the shared-memory definition fixes the shared-memory bank conflicts.
Metrics for the task2 version:
min max avg
shared_load_throughput Shared Memory Load Throughput 6.0826GB/s 6.0826GB/s 6.0826GB/s
shared_store_throughput Shared Memory Store Throughput 204.75GB/s 204.75GB/s 204.75GB/s
shared_efficiency Shared Memory Efficiency 5.77% 5.77% 5.77%
To avoid bank conflicts, change the shared memory to sdata[THREAD_PER_BLOCK_X][THREAD_PER_BLOCK_Y + 1]; the shared-memory throughput then becomes:
shared_load_throughput Shared Memory Load Throughput 12.724GB/s 12.724GB/s 12.724GB/s
1 shared_store_throughput Shared Memory Store Throughput 13.510GB/s 13.510GB/s 13.510GB/s
shared_efficiency Shared Memory Efficiency 49.27% 49.27% 49.27%
10.9 hw9
this_thread_block(): cooperative groups could not be made to work here with CUDA 9 (see the compile errors below).
Key APIs: g1 = this_thread_block(); tiled_partition(g1, 32);
task1.cu(19): error: no suitable conversion function from "cooperative_groups::__v1::thread_block" to "int" exists
task1.cu(20): error: expression must have class type
task1.cu(20): error: expression must have class type
task1.cu(22): error: explicit type is missing ("int" assumed)
task1.cu(22): error: "cooperative_groups::__v1::coalesced_group::coalesced_group(unsigned int)"
/usr/local/cuda-9.0/bin/..//include/cooperative_groups.h(408): here is inaccessible
task1.cu(22): error: no suitable conversion function from "cooperative_groups::__v1::coalesced_group" to "int" exists
task1.cu(24): error: explicit type is missing ("int" assumed)
task1.cu(24): error: "cooperative_groups::__v1::coalesced_group::coalesced_group(unsigned int)"
/usr/local/cuda-9.0/bin/..//include/cooperative_groups.h(408): here is inaccessible
task1.cu(24): error: no suitable conversion function from "cooperative_groups::__v1::coalesced_group" to "int" exists
10.10 hw10
NA
10.11 hw11
NA
10.12 hw12
__syncthreads() goes after shared memory has been updated and before the updated shared memory is read.
compute-sanitizer was introduced starting with CUDA 11.6 and cannot be used with CUDA 9.
References
[1] https://www.bilibili.com/video/BV1JJ4m1P7xW?spm_id_from=333.788.player.switch&vd_source=236371aef24d78454cb09583ea8be378&p=3