DCU 异构程序——归约

目录

一、概述

二、程序实现

三、编译运行


一、概述

        HIP属于显式编程模型,需要在程序中明确写出并行控制语句,包括数据传输、核函数启动等。核函数是运行在DCU上的函数,在CPU端运行的部分称为主机端(主要是执行管理和启动),DCU端运行的部分称为设备端(用于执行计算)。大概的流程如下图:

HIP程序流程

        ①主机端将需要并行计算的数据通过hipMemcpy()传递给DCU(将CPU存储的内容传递给DCU的显存);

        ②调用核函数启动函数hipLaunchKernelGGL()启动DCU,开始执行计算;

        ③设备端将计算好的结果数据通过hipMemcpy()从DCU复制回CPU。

        hipMemcpy()是阻塞式的,数据复制完成后才可以执行后续的程序;hipLanuchKernelGGL()是非阻塞式的,执行完后程序继续向后执行,但是在Kernel没有计算完成之前,最后一个hipMemcpy()是不会开始的,这是由于HIP的Stream机制。

二、程序实现

        下面是对归约的具体实现,reduce.cpp:

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

int recursiveReduce(int *data, int const size)
{
    if (size == 1) return data[0];

    int const stride = size / 2;

    for (int i = 0; i < stride; i++)
    {
        data[i] += data[i + stride];
    }

    return recursiveReduce(data, stride);
}

__global__ void reduceNeighbored (int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    int *idata = g_idata + blockIdx.x * blockDim.x;

    if (idx >= n) return;

    for (int stride = 1; stride < blockDim.x; stride *= 2)
    {
        if ((tid % (2 * stride)) == 0)
        {
            idata[tid] += idata[tid + stride];
        }

        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}

__global__ void reduceNeighboredLess (int *g_idata, int *g_odata,
                                      unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    int *idata = g_idata + blockIdx.x * blockDim.x;

    if(idx >= n) return;

    for (int stride = 1; stride < blockDim.x; stride *= 2)
    {
        int index = 2 * stride * tid;

        if (index < blockDim.x)
        {
            idata[index] += idata[index + stride];
        }

        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}

__global__ void reduceInterleaved (int *g_idata, int *g_odata, unsigned int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    int *idata = g_idata + blockIdx.x * blockDim.x;

    if(idx >= n) return;

    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            idata[tid] += idata[tid + stride];
        }

        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = idata[0];
}



int main(int argc, char **argv)
{

    int dev = 0;
    bool bResult = false;

    int size = 1 << 24;
    printf("    with array size %d  ", size);

    int blocksize = 512;


    dim3 block (blocksize, 1);
    dim3 grid  ((size + block.x - 1) / block.x, 1);
    printf("grid %d block %d\n", grid.x, block.x);

    hipEvent_t start_time,stop_time;

    hipEventCreate(&start_time); 
    hipEventCreate(&stop_time);
    
    size_t bytes = size * sizeof(int);
    int *h_idata = (int *) malloc(bytes);
    int *h_odata = (int *) malloc(grid.x * sizeof(int));
    int *tmp     = (int *) malloc(bytes);
    float time_elapsed  = 1.0f  ;
    

    for (int i = 0; i < size; i++)
    {
        h_idata[i] = (int)( rand() & 0xFF );
    }

    memcpy (tmp, h_idata, bytes);

    int gpu_sum = 0;

    int *d_idata = NULL;
    int *d_odata = NULL;
    hipMalloc((void **) &d_idata, bytes);
    hipMalloc((void **) &d_odata, grid.x * sizeof(int));

    hipEventRecord(start_time,NULL);
    int cpu_sum = recursiveReduce (tmp, size);
    hipEventRecord(stop_time,NULL);
    hipEventElapsedTime(&time_elapsed, start_time,stop_time);
    printf("cpu reduce elapsed %f sec cpu_sum: %d\n", time_elapsed, cpu_sum);

    hipMemcpy(d_idata, h_idata, bytes, hipMemcpyHostToDevice);
    hipDeviceSynchronize();
    hipEventRecord(start_time,NULL);
    hipLaunchKernelGGL(reduceNeighbored,grid,block,0,0,d_idata,d_odata,size);
    hipDeviceSynchronize();
    hipEventRecord(stop_time,NULL);
    hipEventElapsedTime(&time_elapsed, start_time,stop_time);
    hipMemcpy(h_odata, d_odata, grid.x * sizeof(int),hipMemcpyDeviceToHost);
    gpu_sum = 0;

    for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];

    printf("gpu Neighbored  elapsed %f sec gpu_sum: %d <<<grid %d block "
           "%d>>>\n", time_elapsed, gpu_sum, grid.x, block.x);

  
    hipMemcpy(d_idata, h_idata, bytes, hipMemcpyHostToDevice);
    hipDeviceSynchronize();
    hipEventRecord(start_time,NULL);
    hipLaunchKernelGGL(reduceNeighboredLess,grid,block,0,0,d_idata,d_odata,size);
    hipDeviceSynchronize();
    hipEventRecord(stop_time,NULL);
    hipEventElapsedTime(&time_elapsed, start_time,stop_time);
    hipMemcpy(h_odata, d_odata, grid.x * sizeof(int),hipMemcpyDeviceToHost);
    gpu_sum = 0;

    for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];

    printf("gpu Neighbored2 elapsed %f sec gpu_sum: %d <<<grid %d block "
           "%d>>>\n", time_elapsed, gpu_sum, grid.x, block.x);


    hipMemcpy(d_idata, h_idata, bytes, hipMemcpyHostToDevice);
    hipDeviceSynchronize();
    hipEventRecord(start_time,NULL);
    hipLaunchKernelGGL(reduceInterleaved,grid,block,0,0,d_idata,d_odata,size);
    hipDeviceSynchronize();
    hipEventRecord(stop_time,NULL);
    hipEventElapsedTime(&time_elapsed, start_time,stop_time);
    hipMemcpy(h_odata, d_odata, grid.x * sizeof(int),hipMemcpyDeviceToHost);
    gpu_sum = 0;

    for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];

    printf("gpu Interleaved elapsed %f sec gpu_sum: %d <<<grid %d block "
           "%d>>>\n", time_elapsed, gpu_sum, grid.x, block.x);


    hipEventDestroy(start_time);
    hipEventDestroy(stop_time);
    free(h_idata);
    free(h_odata);

    hipFree(d_idata);
    hipFree(d_odata);

    hipDeviceReset();

    bResult = (gpu_sum == cpu_sum);

    if(!bResult) 
    {
        printf("Test failed!\n");
    }
    else
    {
        printf("Test success!\n");
    }

    return 0;
}

三、编译运行

HIP程序采用hipcc编译。

运行结果:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

猿核试Bug愁

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值