1.cuda直方图统计
//代码 GPU高性能编程CUDA实战
// 256-bin histogram of byte values (from "CUDA by Example").
// Each block accumulates counts in shared-memory bins, then folds them
// into the global histogram with one atomicAdd per bin.
// Launch requirement: blockDim.x == 256 (one thread per bin).
__global__ void histo_kernel(unsigned char* buffer, int nsize, int* histo)
{
    // One shared bin per possible byte value; shared memory is
    // uninitialized, so clear it before counting.
    __shared__ int temp[256];
    temp[threadIdx.x] = 0;
    __syncthreads();

    // Grid-stride walk over the input buffer.
    const int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nsize; idx += stride)
        atomicAdd(&temp[buffer[idx]], 1);

    // All block-local counts are in; merge this block's bins into the
    // global result.
    __syncthreads();
    atomicAdd(&histo[threadIdx.x], temp[threadIdx.x]);
}
//代码一
#include <cstdlib>
#include <cstring>
#include <iostream>
#include "cuda_runtime.h"
#include "time.h"
using namespace std;
#define num (256 * 1024 * 1024)
// 核函数
// 注意,为了方便验证GPU的统计结果,这里采用了"逆直方图",
// 即每发现一个数字,就从CPU的统计结果中减1
// Kernel: 256-bin histogram of byte data.
// Note: to make verification easy, this is an "inverse histogram" — every
// time a value is seen, 1 is subtracted from the CPU's precomputed count,
// so a correct run leaves every bin of outputhist at zero.
// Launch requirement: blockDim.x == 256 (one thread per shared bin).
__global__ void hist(unsigned char* inputdata, int* outputhist, long size)
{
// Use shared memory for the per-block bins; atomics straight to global
// memory would be very slow because of contention.
__shared__ int temp[256];
temp[threadIdx.x] = 0;
__syncthreads();
// Compute this thread's starting index and the grid-wide stride
int ids = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
while (ids < size)
{
// Atomically accumulate this block's counts in shared memory
atomicAdd(&temp[inputdata[ids]],1);
ids += offset;
}
// Wait until the whole block has finished counting, then subtract this
// block's bins from the global (CPU-seeded) result
__syncthreads();
atomicSub(&outputhist[threadIdx.x], temp[threadIdx.x]);
}
// Host driver: builds a random byte array, histograms it on the CPU, then
// launches the GPU "inverse histogram" kernel, which subtracts its counts
// from the CPU result. If GPU and CPU agree, every bin ends up zero.
int main()
{
    // Generate random input data in [0, 255].
    unsigned char* cpudata = new unsigned char[num];
    for (size_t i = 0; i < num; i++)
        cpudata[i] = static_cast<unsigned char>(rand() % 256);

    // 256-bin histogram accumulated on the CPU.
    int cpuhist[256];
    memset(cpuhist, 0, 256 * sizeof(int));

    /******************************* CPU timing *********************************/
    clock_t cpu_start = clock();
    for (size_t i = 0; i < num; i++)
        cpuhist[cpudata[i]]++;
    clock_t cpu_stop = clock();
    // clock() returns ticks, not milliseconds — convert via CLOCKS_PER_SEC.
    cout << "CPU time: "
         << 1000.0 * (cpu_stop - cpu_start) / CLOCKS_PER_SEC << "ms" << endl;

    /******************************* GPU timing *********************************/
    // Events used to time the kernel on the GPU.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Allocate device buffers; upload the input and the CPU histogram
    // (the kernel subtracts from the latter).
    unsigned char* gpudata;
    cudaMalloc((void**)&gpudata, num * sizeof(unsigned char));
    cudaMemcpy(gpudata, cpudata, num * sizeof(unsigned char), cudaMemcpyHostToDevice);
    int* gpuhist;
    cudaMalloc((void**)&gpuhist, 256 * sizeof(int));
    cudaMemcpy(gpuhist, cpuhist, 256 * sizeof(int), cudaMemcpyHostToDevice);

    // Launch the kernel bracketed by timing events.
    cudaEventRecord(start, 0);
    hist << <1024, 256 >> >(gpudata, gpuhist, num);
    cudaEventRecord(stop, 0);

    // Kernel launches do not return errors directly — surface
    // launch-configuration failures here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        cout << "Kernel launch failed: " << cudaGetErrorString(err) << endl;

    // Blocking copy of the (inverse) histogram back to the host; this also
    // guarantees the kernel has finished before the result is inspected.
    cudaMemcpy(cpuhist, gpuhist, 256 * sizeof(int), cudaMemcpyDeviceToHost);

    // Release device buffers and the input array.
    // cpudata was allocated with new[] — must use the array form of delete.
    cudaFree(gpudata);
    cudaFree(gpuhist);
    delete[] cpudata;

    // Compute the elapsed GPU time and destroy the events.
    cudaEventSynchronize(stop);
    float gputime;
    cudaEventElapsedTime(&gputime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << "GPU time: " << gputime << "ms" << endl;

    // Verify: the kernel subtracted its counts from the CPU's, so a correct
    // run leaves every bin (and therefore the total) at zero.
    long result = 0;
    for (size_t i = 0; i < 256; i++)
        result += cpuhist[i];
    if (result == 0)
        cout << "GPU has the same result with CPU." << endl;
    else
        cout << "Error: GPU has a different result with CPU." << endl;

    system("pause");
    return 0;
}
//代码二
// Kernel (code sample 2): histogram via atomics directly on global memory,
// one increment per element, using a 2D grid / 2D block launch.
// NOTE(review): dataIn is ushort, so values can reach 65535 — hist must
// provide one bin per possible input value; confirm its size at the call
// site.
__global__ void imHistInCuda(ushort *dataIn, int *hist,int isize)
{
// Flatten the 2D thread/block coordinates into a linear element index.
int threadIndex = threadIdx.x + threadIdx.y * blockDim.x;
int blockIndex = blockIdx.x + blockIdx.y * gridDim.x;
int index = threadIndex + blockIndex * blockDim.x * blockDim.y;
// Guard the grid tail, then count this element with a global atomic.
if(index<isize)
atomicAdd(&hist[dataIn[index]], 1);
}
个人感觉还是cuda自带的sample例子好一点。
2.图像像素求和
///1214
const int threadsPerBlock = 512;
//const int N = 2048;
//const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
//规约求和
// Block-wise tree reduction: each block sums its blockDim.x consecutive
// elements of d_a and writes the partial sum to d_partial_sum[blockIdx.x].
// The per-block partials must then be summed by the host or a second pass.
// Assumptions (not checked here): blockDim.x == threadsPerBlock, blockDim.x
// is a power of two, and grid*block exactly covers d_a — there is no bounds
// guard on the load, so an oversized grid reads out of range. TODO confirm
// at the call site.
__global__ void ReductionSum(float *d_a, float *d_partial_sum)
{
// Shared scratch buffer, one instance per block
__shared__ float partialSum[threadsPerBlock];
// Global index of this thread's element, and its slot within the block
int i = threadIdx.x + blockIdx.x * blockDim.x;
int tid = threadIdx.x;
// Stage this thread's element from global into shared memory
partialSum[tid] = d_a[i];
// Barrier: every load must land before any thread reads a neighbour's slot
__syncthreads();
// Pairwise tree reduction in shared memory, halving the stride each pass;
// the barrier sits outside the divergent if, so all threads reach it
for (int stride = blockDim.x / 2; stride > 0; stride /= 2)
{
if (tid<stride)
partialSum[tid] += partialSum[tid + stride];
__syncthreads();
}
// Thread 0 publishes this block's partial sum
if (tid == 0)
d_partial_sum[blockIdx.x] = partialSum[0];
}
待续;
参考:https://blog.youkuaiyun.com/shuzfan/article/details/77388865