CUDA 编程之对原子操作与并行性的理解

最新推荐文章于 2024-07-17 00:57:19 发布

原创最新推荐文章于 2024-07-17 00:57:19 发布 · 1.2k 阅读

6 ·

CC 4.0 BY-SA版权

文章标签：

#CUDA #人工智能 #深度学习 #GPU

CUDA编程专栏收录该内容

4 篇文章

订阅专栏

代码编写与测试

以下代码实现这样的功能：

定义一百万个线程，并且线程块的大小为一千。从而有一千个线程块。
在全局 memory 中开辟一段大小为ARRAY_BYTES的空间用来放置数据（初始化为0）。
令一百万个线程去全局 memory 中按线程 ID 取数据（线程 ID 要对数组大小取余，从而保证有些线程 ID 过大的线程也会取到数据），并且进行累加（+1）。
对运行过程进行计时。

#include <stdio.h>
#include "gputimer.h"

#define NUM_THREADS 1000000
#define ARRAY_SIZE  100

#define BLOCK_WIDTH 1000

// Prints the first `size` ints of `array` on one line of stdout, formatted as
// "{ v0 v1 ... }" followed by a newline. Prints "{ }" when size <= 0.
void print_array(int *array, int size)
{
    printf("{ ");
    int idx = 0;
    while (idx < size)
    {
        printf("%d ", array[idx]);
        ++idx;
    }
    printf("}\n");
}

// Kernel: every thread adds 1 to one element of g using a plain
// (non-atomic) read-modify-write.
//
// g must point to at least ARRAY_SIZE ints. The element is chosen from the
// flat 1D thread index modulo ARRAY_SIZE, so threads whose indices are
// congruent modulo ARRAY_SIZE update the same element. Because the load, add
// and store are separate steps with no synchronization, concurrent threads
// targeting the same element can overwrite each other's updates.
__global__ void increment_naive(int *g)
{
	// flat global thread index for a 1D launch
	const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

	// wrap the index into the ARRAY_SIZE-element array
	const int slot = global_tid % ARRAY_SIZE;

	// unsynchronized increment
	g[slot] += 1;
}

// Kernel: every thread adds 1 to one element of g using atomicAdd.
//
// g must point to at least ARRAY_SIZE ints. The element is chosen from the
// flat 1D thread index modulo ARRAY_SIZE, so threads whose indices are
// congruent modulo ARRAY_SIZE update the same element; atomicAdd performs each
// read-modify-write as one indivisible operation.
__global__ void increment_atomic(int *g)
{
	// flat global thread index for a 1D launch
	const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

	// address of the element this thread contributes to
	int *slot = g + (global_tid % ARRAY_SIZE);

	atomicAdd(slot, 1);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) { return true; }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches NUM_THREADS threads (NUM_THREADS / BLOCK_WIDTH blocks of BLOCK_WIDTH
// threads) that increment an ARRAY_SIZE-element device array, then prints the
// resulting array and the time measured around the launch.
//
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
// reports an error. argc/argv are unused.
int main(int argc,char **argv)
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // declare and allocate host memory
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // declare, allocate, and zero out GPU memory
    int * d_array = NULL;
    if (!cuda_ok(cudaMalloc((void **) &d_array, ARRAY_BYTES), "cudaMalloc"))
    {
        return 1;
    }
    bool ok = cuda_ok(cudaMemset((void *) d_array, 0, ARRAY_BYTES), "cudaMemset");

    if (ok)
    {
        // launch the kernel - comment out one of these
        timer.Start();
        // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
        increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
        timer.Stop();

        // A launch does not return a status itself; a bad launch configuration
        // is only visible through cudaGetLastError(). Checked after Stop() so
        // the check is not part of the timed region.
        ok = cuda_ok(cudaGetLastError(), "kernel launch");
    }

    if (ok)
    {
        // copy back the array of sums from GPU; errors raised while the kernel
        // was running are reported by this call
        ok = cuda_ok(cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                     "cudaMemcpy");
    }

    if (ok)
    {
        // only print results that were actually produced by the device
        print_array(h_array, ARRAY_SIZE);
        printf("Time elapsed = %g ms\n", timer.Elapsed());
    }

    // free GPU memory allocation and exit
    cudaFree(d_array);
    return ok ? 0 : 1;
}

如果我们按照该代码运行会出现以下结果。
在这里插入图片描述
为什么会出现这样的结果？？？
其实这正是原子操作的作用：atomicAdd 把“读取—加一—写回”合并成一个不可分割的操作，同一地址上的多个原子操作会被串行化——在一个线程对某个元素的更新完成之前，其他线程对该元素的原子更新无法插入其中。因此不会有更新丢失，数组每个元素都是10000——表示每个“坑位”分别有10000个线程（1000000 / 100）在此累加过。需要注意的是，串行化只保证这些累加逐个完成，并不保证各线程执行的先后顺序。

如果我们改为运行 **increment_naive()** 函数，则会发现累加结果不是10000，并且每次运行结果都不一样，运行时间也会少一些——因为 g[i] = g[i] + 1 的读取、加一、写回是三个独立的步骤，各个线程以不可预知的顺序异步地访问同一内存区域进行累加：多个线程可能读到同一个旧值，后写回的结果会覆盖先写回的结果，从而丢失一部分累加。

这个程序很好地体现了 CUDA 程序中线程的并行性。

下面附上头文件的代码：

#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

// Timer built on a pair of CUDA events recorded on stream 0.
//
// Usage: call Start(), enqueue the work to be timed, call Stop(), then call
// Elapsed() to obtain the time between the two recorded events.
//
// The constructor creates both events and the destructor destroys them. No
// copy operations are defined, so a copied GpuTimer would share the same event
// handles and both destructors would destroy them; do not copy instances.
struct GpuTimer
{
      cudaEvent_t start;   // event recorded by Start()
      cudaEvent_t stop;    // event recorded by Stop()

      // Creates the start and stop events.
      GpuTimer()
      {
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
      }

      // Destroys both events.
      ~GpuTimer()
      {
            cudaEventDestroy(start);
            cudaEventDestroy(stop);
      }

      // Records the start event on stream 0.
      void Start()
      {
            cudaEventRecord(start, 0);
      }

      // Records the stop event on stream 0.
      void Stop()
      {
            cudaEventRecord(stop, 0);
      }

      // Waits for the stop event to complete, then returns the value reported
      // by cudaEventElapsedTime for the start/stop pair. Returns 0.0f if that
      // call does not write a result.
      float Elapsed()
      {
            // initialized so a failed cudaEventElapsedTime cannot make this
            // function return an indeterminate value
            float elapsed = 0.0f;
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsed, start, stop);
            return elapsed;
      }
};

#endif  /* __GPU_TIMER_H__ */