DCU异构程序--可迁移内存中系统范围的原子操作_基于原子操作的任务迁移-优快云博客

本文链接：https://blog.youkuaiyun.com/L_peanut/article/details/145336709

一、概述

二、程序实现

一、概述

HIP属于显式编程模型，需要在程序中明确写出并行控制语句，包括数据传输、核函数启动等。核函数是运行在DCU上的函数，在CPU端运行的部分称为主机端（主要是执行管理和启动），DCU端运行的部分称为设备端（用于执行计算）。大概的流程如下图：

①主机端将需要并行计算的数据通过hipMemcpy()传递给DCU（将CPU存储的内容传递给DCU的显存）；

②调用核函数启动函数hipLaunchKernelGGL()启动DCU，开始执行计算；

③设备端将计算好的结果数据通过hipMemcpy()从DCU复制回CPU。

hipMemcpy()是阻塞式的，数据复制完成后才可以执行后续的程序；hipLanuchKernelGGL()是非阻塞式的，执行完后程序继续向后执行，但是在Kernel没有计算完成之前，最后一个hipMemcpy()是不会开始的，这是由于HIP的Stream机制。

二、程序实现

下面是对可迁移内存中系统范围的原子操作的具体实现，systemWideAtomics.cpp：

#include <math.h>
#include <stdint.h>
#include <cstdio>
#include <ctime>
#include <hip/hip_runtime.h>
#include <helper_hip.h>

#define min(a,b) (a) < (b) ? (a) : (b)
#define max(a,b) (a) > (b) ? (a) : (b)

#define LOOP_NUM 50

__global__ void atomicKernel(int *atom_arr)
{
    unsigned int tid = blockDim.x * blockIdx.x +threadIdx.x;
    
    for(int i = 0; i < LOOP_NUM; i++)
    {
        atomicAdd_system(&atom_arr[0], 10);

        atomicExch_system(&atom_arr[1], tid);

        atomicMax_system(&atom_arr[2], tid);

        atomicMin_system(&atom_arr[3], tid);

        atomicInc_system((unsigned int *)&atom_arr[4], 17);

        atomicDec_system((unsigned int *)&atom_arr[5], 137);

        atomicCAS_system(&atom_arr[6], tid-1, tid);

        atomicAnd_system(&atom_arr[7], 2*tid+7);

        atomicOr_system(&atom_arr[8], 1<<tid);

        atomicXor_system(&atom_arr[9], tid);
    }
}


void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
    for(int i = no_of_threads; i < 2*no_of_threads; i++)
    {
        for(int j = 0; j < LOOP_NUM; j++)
        {
            __sync_fetch_and_add(&atom_arr[0], 10);
            __sync_lock_test_and_set(&atom_arr[1], i);
    
            int old, expected;
            do{
                expected = atom_arr[2];
                old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, 1));
            }while(old != expected);

            do{
                expected = atom_arr[3];
                old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, 1));
            }while(old != expected);

            int limit = 17;
            do{
                expected = atom_arr[4];
                old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected+1);
            }while(old != expected);

            limit = 137;
            do{
                expected = atom_arr[5];
                old = __sync_val_compare_and_swap(&atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected-1);
            }while(old != expected);

            __sync_val_compare_and_swap(&atom_arr[6], i-1, i);

            __sync_fetch_and_and(&atom_arr[7], 2*i+7);

            __sync_fetch_and_or(&atom_arr[8], 1<<i);
        
            __sync_fecch_and_xor(&atom_arr[9], i);
        }
    }
}

int verify(int *testData, const int len)
{
    int val = 0;
    
    for(int i = 0; i < len*LOOP_NUM; ++i)
    {
        val += 10;
    }

    if(val != testData[0])
    {
        printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
        return false;
    }

    val = 0;

    bool found = false;

    for(int i = 0; i < len; i++)
    {
        if(i == testData[1])
        {
            found = true;
            break;
        }
    }

    if(!found)
    {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for(int i = 0; i < len; ++i)
    {
        val = max(val, i);
    }

    if(val != testData[2])
    {
        printf("atomicMax failed\n");
        return false;
    }
    
    val = 1 << 8;
    
    for(int i = 0; i < len; ++i)
    {
        val = min(val, i);
    }

    if(val != testData[3])
    {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val = 0;

    for(int i = 0; i < len * LOOP_NUM; ++i)
    {
        val = (val >= limit) ? 0 : val+1;
    }

    if(val != testData[4])
    {
        printf("atomicInc failed");
        return false;
    }

    limit = 137;
    val = 0;

    for(int i = 0; i < len * LOOP_NUM; ++i)
    {
        val = ((val == 0) || (val > limit)) ? limit : val-1;
    }

    if(val != testData[5])
    {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;
    for(int i = 0; i < len; i++)
    {
        if(i == testData[6])
        {
            found = true;
            break;
        }
    }

    if(!found)
    {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for(int i = 0; i < len; ++i)
    {
        val &= (2*i+7);
    }

    if(val != testData[7])
    {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for(int i = 0; i < len; i++)
    {
        val |= (1<<i);
    }

    if(val != testData[8])
    {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;
    
    for(int i = 0; i < len; ++i)
    {
        val ^= i;
    }

    if(val != testData[9])
    {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}


int main(int argc, char *argv[])
{
    hipDeviceProp_t device_prop;

    int dev_id = findHIPDevice(argc, (const char **) argv);
    checkHIPErrors(hipGetDeviceProperties(&device_prop, dev_id));

    if(!device_prop.managedMemory)
    {
        fprintf(stderr, "Unified Memory not supported on this device\n");
        exit(EXIT_WAIVED);
    }

    if(device_prop.computeMode == hipComputeModeProhibited)
    {
        fprintf(stderr, "This sample requires a device in either default or process exclusive mode\n");
        exit(EXIT_WAIVED);
    }

    if(device_prop.major < 6)
    {
        printf("%s: requires a minimum CUDA compute 6.0 capability, waiving testing.\n", argv[0]);
        exit(EXIT_WAIVED);
    }

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 10;

    int *atom_arr;

    if(device_prop.pageableMemoryAccess)
    {
        printf("CAN access pageable memory\n");
        atom_arr = (int *)malloc(sizeof(int)*numData);
    }
    else
    {
        printf("CANNOT access pageable memory\n");
        checkHIPErrors(hipMallocManaged(&atom_arr, sizeof(int)*numData));
    }

    for(unsigned int i = 0; i < numData; i++)
    {
        atom_arr[i] = 0;
    }

    atom_arr[7] = atom_arr[9] = 0xff;

    hipLaunchKernelGGL(atomicKernel, dim3(numBlocks), dim3(numThreads),0,0,atom_arr);
    atomicKernel_CPU(atom_arr, numBlocks*numThreads);

    checkHIPErrors(hipDeviceSynchronize());

    int testResult = verify(atom_arr, 2*numThreads*numBlocks);

    if(device_prop.pageableMemoryAccess)
    {
        free(atom_arr);
    }
    else
    {
        hipFree(atom_arr);
    }

    printf("systemWideAtomics completed, returned %s\n", testResult ? "OK" : "ERROR");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}