The next projects in line would be MultiGPU and OpenGLInterop, but I'm skipping both for now: my machine only has one GPU, and I have no immediate need for OpenGL interop.
So the next project is the OpenCL Scan sample. At first glance it looks quite different from the previous ones, and rather intimidating.
The .cl file:
////////////////////////////////////////////////////////////////////////////////
// Scan codelets
////////////////////////////////////////////////////////////////////////////////
#if(1)
    //Naive inclusive scan: O(N * log2(N)) operations
    //Allocate 2 * 'size' local memory, initialize the first half
    //with 'size' zeros avoiding if(pos >= offset) condition evaluation
    //and saving instructions
    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
        l_Data[pos] = 0;
        pos += size;
        l_Data[pos] = idata;

        for(uint offset = 1; offset < size; offset <<= 1){
            barrier(CLK_LOCAL_MEM_FENCE);
            uint t = l_Data[pos] + l_Data[pos - offset];
            barrier(CLK_LOCAL_MEM_FENCE);
            l_Data[pos] = t;
        }

        return l_Data[pos];
    }

    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
        return scan1Inclusive(idata, l_Data, size) - idata;
    }
#else
    #define LOG2_WARP_SIZE 5U
    #define WARP_SIZE (1U << LOG2_WARP_SIZE)

    //Almost the same as naive scan1Inclusive but doesn't need barriers
    //and works only for size <= WARP_SIZE
    inline uint warpScanInclusive(uint idata, volatile __local uint *l_Data, uint size){
        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
        l_Data[pos] = 0;
        pos += size;
        l_Data[pos] = idata;

        if(size >=  2) l_Data[pos] += l_Data[pos -  1];
        if(size >=  4) l_Data[pos] += l_Data[pos -  2];
        if(size >=  8) l_Data[pos] += l_Data[pos -  4];
        if(size >= 16) l_Data[pos] += l_Data[pos -  8];
        if(size >= 32) l_Data[pos] += l_Data[pos - 16];

        return l_Data[pos];
    }

    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
        return warpScanInclusive(idata, l_Data, size) - idata;
    }

    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
        if(size > WARP_SIZE){
            //Bottom-level inclusive warp scan
            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);

            //Save top elements of each warp for exclusive warp scan
            //sync to wait for warp scans to complete (because l_Data is being overwritten)
            barrier(CLK_LOCAL_MEM_FENCE);
            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;

            //wait for warp scans to complete
            barrier(CLK_LOCAL_MEM_FENCE);
            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
                //grab top warp elements
                uint val = l_Data[get_local_id(0)];
                //calculate exclusive scan and write back to shared memory
                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
            }

            //return updated warp scans with exclusive scan results
            barrier(CLK_LOCAL_MEM_FENCE);
            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
        }else{
            return warpScanInclusive(idata, l_Data, size);
        }
    }

    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
        return scan1Inclusive(idata, l_Data, size) - idata;
    }
#endif
//Vector scan: the array to be scanned is stored
//in work-item private memory as uint4
inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
    //Level-0 inclusive scan
    data4.y += data4.x;
    data4.z += data4.y;
    data4.w += data4.z;

    //Level-1 exclusive scan
    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;

    return (data4 + (uint4)val);
}

inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
    return scan4Inclusive(data4, l_Data, size) - data4;
}

////////////////////////////////////////////////////////////////////////////////
// Scan kernels
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void scanExclusiveLocal1(
    __global uint4 *d_Dst,
    __global uint4 *d_Src,
    __local uint *l_Data,
    uint size
){
    //Load data
    uint4 idata4 = d_Src[get_global_id(0)];

    //Calculate exclusive scan
    uint4 odata4 = scan4Exclusive(idata4, l_Data, size);

    //Write back
    d_Dst[get_global_id(0)] = odata4;
}

//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void scanExclusiveLocal2(
    __global uint *d_Buf,
    __global uint *d_Dst,
    __global uint *d_Src,
    __local uint *l_Data,
    uint N,
    uint arrayLength
){
    //Load top elements
    //Convert results of bottom-level scan back to inclusive
    //Skip loads and stores for inactive work-items of the work-group with highest index(pos >= N)
    uint data = 0;
    if(get_global_id(0) < N)
        data =
            d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] +
            d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];

    //Compute
    uint odata = scan1Exclusive(data, l_Data, arrayLength);

    //Avoid out-of-bound access
    if(get_global_id(0) < N)
        d_Buf[get_global_id(0)] = odata;
}

//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void uniformUpdate(
    __global uint4 *d_Data,
    __global uint *d_Buf
){
    __local uint buf[1];

    uint4 data4 = d_Data[get_global_id(0)];

    if(get_local_id(0) == 0)
        buf[0] = d_Buf[get_group_id(0)];

    barrier(CLK_LOCAL_MEM_FENCE);
    data4 += (uint4)buf[0];
    d_Data[get_global_id(0)] = data4;
}
The main() part:
int main(int argc, const char **argv)
{
    shrQAStart(argc, (char **)argv);

    // Start logs
    shrSetLogFileName ("oclScan.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    cl_platform_id cpPlatform;       //OpenCL platform
    cl_device_id cdDevice;           //OpenCL device
    cl_context cxGPUContext;         //OpenCL context
    cl_command_queue cqCommandQueue; //OpenCL command queue
    cl_mem d_Input, d_Output;        //OpenCL memory buffer objects
    cl_int ciErrNum;
    uint *h_Input, *h_OutputCPU, *h_OutputGPU;
    const uint N = 13 * 1048576 / 2;

    shrLog("Allocating and initializing host arrays...\n");
    h_Input     = (uint *)malloc(N * sizeof(uint));
    h_OutputCPU = (uint *)malloc(N * sizeof(uint));
    h_OutputGPU = (uint *)malloc(N * sizeof(uint));
    srand(2009);
    for(uint i = 0; i < N; i++)
        h_Input[i] = rand();

    shrLog("Initializing OpenCL...\n");
    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Get a GPU device
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    //Create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    shrLog("Initializing OpenCL scan...\n");
    initScan(cxGPUContext, cqCommandQueue, argv);

    shrLog("Creating OpenCL memory objects...\n\n");
    d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N * sizeof(uint), h_Input, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);
    d_Output = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * sizeof(uint), NULL, &ciErrNum);
    oclCheckError(ciErrNum, CL_SUCCESS);

    int globalFlag = 1; // init pass/fail flag to pass
    size_t szWorkgroup;
    const int iCycles = 100;
shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles);
for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength *= 2)
{
shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
clFinish(cqCommandQueue);
shrDeltaT(0);
for (int i = 0; i<iCycles; i++)
{
szWorkgroup = scanExclusiveShort(
cqCommandQueue,
d_Output,
d_Input,
N / arrayLength,
arrayLength
);
}
clFinish(cqCommandQueue);
double timerValue = shrDeltaT(0)/(double)iCycles;
shrLog("Validating the results...\n");
shrLog(" ...reading back OpenCL memory\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * sizeof(uint), h_OutputGPU, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog(" ...scanExclusiveHost()\n");
scanExclusiveHost(
h_OutputCPU,
h_Input,
N / arrayLength,
arrayLength
);
// Compare GPU results with CPU results and accumulate error for this test
shrLog(" ...comparing the results\n");
int localFlag = 1;
for(uint i = 0; i < N; i++)
{
if(h_OutputCPU[i] != h_OutputGPU[i])
{
localFlag = 0;
break;
}
}
// Log message on individual test result, then accumulate to global flag
shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
globalFlag = globalFlag && localFlag;
#ifdef GPU_PROFILING
if (arrayLength == MAX_SHORT_ARRAY_SIZE)
{
shrLog("\n");
shrLogEx(LOGBOTH | MASTER, 0, "oclScan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
(1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
shrLog("\n");
}
#endif
}
shrLog("*** Running GPU scan for large arrays (%d identical iterations)...\n\n", iCycles);
for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength *= 2)
{
shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength);
clFinish(cqCommandQueue);
shrDeltaT(0);
for (int i = 0; i<iCycles; i++)
{
szWorkgroup = scanExclusiveLarge(
cqCommandQueue,
d_Output,
d_Input,
N / arrayLength,
arrayLength
);
}
clFinish(cqCommandQueue);
double timerValue = shrDeltaT(0)/(double)iCycles;
shrLog("Validating the results...\n");
shrLog(" ...reading back OpenCL memory\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * sizeof(uint), h_OutputGPU, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog(" ...scanExclusiveHost()\n");
scanExclusiveHost(
h_OutputCPU,
h_Input,
N / arrayLength,
arrayLength
);
// Compare GPU results with CPU results and accumulate error for this test
shrLog(" ...comparing the results\n");
int localFlag = 1;
for(uint i = 0; i < N; i++)
{
if(h_OutputCPU[i] != h_OutputGPU[i])
{
localFlag = 0;
break;
}
}
// Log message on individual test result, then accumulate to global flag
shrLog(" ...Results %s\n\n", (localFlag == 1) ? "Match" : "DON'T Match !!!");
globalFlag = globalFlag && localFlag;
#ifdef GPU_PROFILING
if (arrayLength == MAX_LARGE_ARRAY_SIZE)
{
shrLog("\n");
shrLogEx(LOGBOTH | MASTER, 0, "oclScan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
(1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup);
shrLog("\n");
}
#endif
}
shrLog("Shutting down...\n");
//Release kernels and program
closeScan();
//Release other OpenCL Objects
ciErrNum = clReleaseMemObject(d_Output);
ciErrNum |= clReleaseMemObject(d_Input);
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
ciErrNum |= clReleaseContext(cxGPUContext);
oclCheckError(ciErrNum, CL_SUCCESS);
//Release host buffers
free(h_OutputGPU);
free(h_OutputCPU);
free(h_Input);
// finish
// pass or fail (cumulative... all tests in the loop)
shrQAFinishExit(argc, (const char **)argv, globalFlag ? QA_PASSED : QA_FAILED);
//Finish
shrEXIT(argc, argv);
}
So the idea is this: the source is an array of N = 6,815,744 elements, and the program produces another array of the same size in which each value is the sum of all the values at the preceding positions (an exclusive scan). But one thing puzzled me. On the host side the actual argument h_Input is an array of N uints, the buffer d_Input created from it is also N uints in size, and this d_Input is passed to the kernel scanExclusiveLocal1, whose corresponding parameter is __global uint4 *d_Src. So in this example a uint buffer is handed to a uint4 parameter just like that, without even an explicit cast?! I assumed that passing it this way would only fill uint4.x, and that uint4.y, uint4.z and uint4.w would end up as random, meaningless values. It turns out that is not what happens at all!

Suppose there is a buffer made of 100 uint elements (on the AMD implementation cl_int/cl_uint have the same size as plain int/uint, so I won't bother distinguishing the cl_-prefixed types here). If you point a uint *p1 at it, it naturally still has 100 valid elements, p1[0] through p1[99]. You can also point a uint4 *p2 at it; that works too, but then there are only 25 valid elements, p2[0] through p2[24]. What becomes invalid is p2[25] through p2[99], not, as I had imagined, that in p2[0] through p2[99] only the x component is valid while y/z/w are garbage! A more experienced reader pointed out that this is the Nth time I have stumbled over this kind of memory-layout issue; the previous two examples tripped me up in exactly the same way.
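To convince myself, I wrote a minimal host-side sketch of that reinterpretation. It is plain C; the struct uint4_t below is only a hypothetical stand-in for OpenCL's uint4 (ignoring the device type's 16-byte alignment), and the 100-element buffer is just the example size from above:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical host-side stand-in for OpenCL's uint4 (illustration only). */
typedef struct { unsigned int x, y, z, w; } uint4_t;

int main(void)
{
    /* A buffer of 100 uints, viewed first as uint*, then as a 4-wide type. */
    unsigned int *p1 = malloc(100 * sizeof(unsigned int));
    for (unsigned int i = 0; i < 100; i++)
        p1[i] = i;                                   /* p1[0] .. p1[99] are all valid */

    uint4_t *p2 = (uint4_t *)p1;                     /* same memory, 4 uints per slot */
    printf("%u %u %u %u\n", p2[0].x,  p2[0].y,  p2[0].z,  p2[0].w);   /* 0 1 2 3      */
    printf("%u %u %u %u\n", p2[24].x, p2[24].y, p2[24].z, p2[24].w);  /* 96 97 98 99  */
    /* Only p2[0] .. p2[24] are valid; p2[25] onward would read past the buffer. */

    free(p1);
    return 0;
}

This is also why handing the N-uint d_Input buffer to the __global uint4 *d_Src parameter works: the kernel is launched with N/4 work-items and each one reads a single uint4, i.e. 4 consecutive uints of the original data.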
Now I am working through main() step by step. With arrayLength = 4, the local size is 256 and the global size is N/4 = 1,703,936. The first kernel runs, and each of those global work-items calls scan4Exclusive(); because the uint buffer is read as uint4, all N numbers are still covered. scan4Exclusive() in turn calls scan4Inclusive(), so each global ID handles 4 of the original uints, and inside scan4Inclusive() those 4 values are accumulated step by step, so that within each group of 4 every value becomes the sum of itself and the values before it. scan4Inclusive() then calls scan1Inclusive(), and that is where I got stuck. scan1Inclusive() works off the local ID: the local size is 256, and the function does l_Data[pos] = 0; pos += size; l_Data[pos] = idata; (with arrayLength = 4, size here is 1), so each local ID writes 2 slots, and 256 local IDs * 2 = 512, which is exactly why l_Data is sized at 512. Since size = 1, pos works out to 2 * localID, so the even slots of l_Data all hold 0, while the odd slots hold, respectively, the sum of the first 4 original uints, the sum of uints 5 to 8, the sum of uints 9 to 12, and so on... and at this point my head was spinning.
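To double-check that index pattern, here is a small CPU-side sketch of my own (not from the SDK) that replays the writes scan1Inclusive() performs into l_Data, assuming WORKGROUP_SIZE = 256 and size = 1 (the arrayLength = 4 case) and using a made-up idata value per work-item in place of the real 4-element sums:

#include <stdio.h>

#define WORKGROUP_SIZE 256

int main(void)
{
    unsigned int l_Data[2 * WORKGROUP_SIZE];
    unsigned int size = 1;                                /* = arrayLength / 4 */

    for (unsigned int lid = 0; lid < WORKGROUP_SIZE; lid++) {
        unsigned int idata = 4 * lid;                     /* placeholder for data4.w, this item's 4-element sum */
        unsigned int pos = 2 * lid - (lid & (size - 1));  /* size == 1, so pos = 2 * lid */
        l_Data[pos] = 0;                                  /* even slot: padding zero      */
        pos += size;
        l_Data[pos] = idata;                              /* odd slot: the item's value   */
        /* the for(offset = 1; offset < size; ...) loop never runs, since 1 < 1 is false */
    }

    for (unsigned int i = 0; i < 8; i++)
        printf("l_Data[%u] = %u\n", i, l_Data[i]);        /* 0, 0, 0, 4, 0, 8, 0, 12 */
    return 0;
}

The printout confirms the layout described above: zeros in the even slots, the per-work-item values in the odd slots, and for this size the accumulation loop does not execute at all.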