目录
一、概述
HIP属于显式编程模型,需要在程序中明确写出并行控制语句,包括数据传输、核函数启动等。核函数是运行在DCU上的函数,在CPU端运行的部分称为主机端(主要是执行管理和启动),DCU端运行的部分称为设备端(用于执行计算)。大概的流程如下图:

①主机端将需要并行计算的数据通过hipMemcpy()传递给DCU(将CPU存储的内容传递给DCU的显存);
②调用核函数启动函数hipLaunchKernelGGL()启动DCU,开始执行计算;
③设备端将计算好的结果数据通过hipMemcpy()从DCU复制回CPU。
hipMemcpy()是阻塞式的,数据复制完成后才可以执行后续的程序;hipLanuchKernelGGL()是非阻塞式的,执行完后程序继续向后执行,但是在Kernel没有计算完成之前,最后一个hipMemcpy()是不会开始的,这是由于HIP的Stream机制。
二、程序实现
下面是对可迁移内存中系统范围的原子操作的具体实现,systemWideAtomics.cpp:
#include <math.h>
#include <stdint.h>
#include <cstdio>
#include <ctime>
#include <hip/hip_runtime.h>
#include <helper_hip.h>
#define min(a,b) (a) < (b) ? (a) : (b)
#define max(a,b) (a) > (b) ? (a) : (b)
#define LOOP_NUM 50
__global__ void atomicKernel(int *atom_arr)
{
unsigned int tid = blockDim.x * blockIdx.x +threadIdx.x;
for(int i = 0; i < LOOP_NUM; i++)
{
atomicAdd_system(&atom_arr[0], 10);
atomicExch_system(&atom_arr[1], tid);
atomicMax_system(&atom_arr[2], tid);
atomicMin_system(&atom_arr[3], tid);
atomicInc_system((unsigned int *)&atom_arr[4], 17);
atomicDec_system((unsigned int *)&atom_arr[5], 137);
atomicCAS_system(&atom_arr[6], tid-1, tid);
atomicAnd_system(&atom_arr[7], 2*tid+7);
atomicOr_system(&atom_arr[8], 1<<tid);
atomicXor_system(&atom_arr[9], tid);
}
}
void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
for(int i = no_of_threads; i < 2*no_of_threads; i++)
{
for(int j = 0; j < LOOP_NUM; j++)
{
__sync_fetch_and_add(&atom_arr[0], 10);
__sync_lock_test_and_set(&atom_arr[1], i);
int old, expected;
do{
expected = atom_arr[2];
old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, 1));
}while(old != expected);
do{
expected = atom_arr[3];
old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, 1));
}while(old != expected);
int limit = 17;
do{
expected = atom_arr[4];
old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected+1);
}while(old != expected);
limit = 137;
do{
expected = atom_arr[5];
old = __sync_val_compare_and_swap(&atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected-1);
}while(old != expected);
__sync_val_compare_and_swap(&atom_arr[6], i-1, i);
__sync_fetch_and_and(&atom_arr[7], 2*i+7);
__sync_fetch_and_or(&atom_arr[8], 1<<i);
__sync_fecch_and_xor(&atom_arr[9], i);
}
}
}
int verify(int *testData, const int len)
{
int val = 0;
for(int i = 0; i < len*LOOP_NUM; ++i)
{
val += 10;
}
if(val != testData[0])
{
printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
return false;
}
val = 0;
bool found = false;
for(int i = 0; i < len; i++)
{
if(i == testData[1])
{
found = true;
break;
}
}
if(!found)
{
printf("atomicExch failed\n");
return false;
}
val = -(1 << 8);
for(int i = 0; i < len; ++i)
{
val = max(val, i);
}
if(val != testData[2])
{
printf("atomicMax failed\n");
return false;
}
val = 1 << 8;
for(int i = 0; i < len; ++i)
{
val = min(val, i);
}
if(val != testData[3])
{
printf("atomicMin failed\n");
return false;
}
int limit = 17;
val = 0;
for(int i = 0; i < len * LOOP_NUM; ++i)
{
val = (val >= limit) ? 0 : val+1;
}
if(val != testData[4])
{
printf("atomicInc failed");
return false;
}
limit = 137;
val = 0;
for(int i = 0; i < len * LOOP_NUM; ++i)
{
val = ((val == 0) || (val > limit)) ? limit : val-1;
}
if(val != testData[5])
{
printf("atomicDec failed\n");
return false;
}
found = false;
for(int i = 0; i < len; i++)
{
if(i == testData[6])
{
found = true;
break;
}
}
if(!found)
{
printf("atomicCAS failed\n");
return false;
}
val = 0xff;
for(int i = 0; i < len; ++i)
{
val &= (2*i+7);
}
if(val != testData[7])
{
printf("atomicAnd failed\n");
return false;
}
val = 0;
for(int i = 0; i < len; i++)
{
val |= (1<<i);
}
if(val != testData[8])
{
printf("atomicOr failed\n");
return false;
}
val = 0xff;
for(int i = 0; i < len; ++i)
{
val ^= i;
}
if(val != testData[9])
{
printf("atomicXor failed\n");
return false;
}
return true;
}
int main(int argc, char *argv[])
{
hipDeviceProp_t device_prop;
int dev_id = findHIPDevice(argc, (const char **) argv);
checkHIPErrors(hipGetDeviceProperties(&device_prop, dev_id));
if(!device_prop.managedMemory)
{
fprintf(stderr, "Unified Memory not supported on this device\n");
exit(EXIT_WAIVED);
}
if(device_prop.computeMode == hipComputeModeProhibited)
{
fprintf(stderr, "This sample requires a device in either default or process exclusive mode\n");
exit(EXIT_WAIVED);
}
if(device_prop.major < 6)
{
printf("%s: requires a minimum CUDA compute 6.0 capability, waiving testing.\n", argv[0]);
exit(EXIT_WAIVED);
}
unsigned int numThreads = 256;
unsigned int numBlocks = 64;
unsigned int numData = 10;
int *atom_arr;
if(device_prop.pageableMemoryAccess)
{
printf("CAN access pageable memory\n");
atom_arr = (int *)malloc(sizeof(int)*numData);
}
else
{
printf("CANNOT access pageable memory\n");
checkHIPErrors(hipMallocManaged(&atom_arr, sizeof(int)*numData));
}
for(unsigned int i = 0; i < numData; i++)
{
atom_arr[i] = 0;
}
atom_arr[7] = atom_arr[9] = 0xff;
hipLaunchKernelGGL(atomicKernel, dim3(numBlocks), dim3(numThreads),0,0,atom_arr);
atomicKernel_CPU(atom_arr, numBlocks*numThreads);
checkHIPErrors(hipDeviceSynchronize());
int testResult = verify(atom_arr, 2*numThreads*numBlocks);
if(device_prop.pageableMemoryAccess)
{
free(atom_arr);
}
else
{
hipFree(atom_arr);
}
printf("systemWideAtomics completed, returned %s\n", testResult ? "OK" : "ERROR");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}