Questions About Unrolling
Theory: for the same workload, memory efficiency is inversely proportional to the number of memory transactions.
Low memory efficiency means more memory transactions are needed.
This can be confirmed experimentally: when the accesses are shifted by some bytes (misaligned access), memory efficiency drops while the transaction count rises, and the product of the two stays constant: it equals the minimum number of transactions needed to process the data.
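The efficiency and transaction numbers referred to below can be collected with the profiler, for example (a sketch, assuming a GPU and driver old enough for nvprof, and that the compiled binary is named unrolling; on newer architectures Nsight Compute reports the corresponding metrics instead):

nvprof --metrics gld_efficiency,gst_efficiency,gld_transactions,gst_transactions ./unrolling 11 512

gld_efficiency and gst_efficiency report global load/store efficiency, gld_transactions and gst_transactions the transaction counts; with a nonzero offset such as 11 the store efficiency drops and the transaction count rises, which is consistent with the constant-product claim above.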
Questions:
The experiments, however, also raised some questions.
The two unrolled versions compared are:

Version 1:
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    i *= 2;
    unsigned int k = i + offset;
    C[k] = A[i] + B[i];
    C[k+1] = A[i+1] + B[i+1];

Version 2:
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;
    if (k + blockDim.x < n)
    {
        C[k] = A[i] + B[i];
        C[k + blockDim.x] = A[i + blockDim.x] + B[i + blockDim.x];
    }
The second version roughly doubles the speed, but the first gives no speedup, and its memory efficiency is only 50% while the second reaches 100%. Why? Is it because the two accesses in the second version are far apart and therefore form independent memory transactions, while the two accesses in the first version are adjacent and do not count as independent transactions?
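To make the adjacency question concrete, here is a small host-side sketch (illustration only; the warp size of 32, blockDim.x = 512, and offset = 0 are assumed values) that prints the C indices the 32 threads of one warp would write under each indexing scheme:

#include <stdio.h>

int main(void)
{
    // Assumed values for illustration: one warp of 32 threads, blockDim.x = 512, offset = 0.
    const int warpSize = 32, blockDimx = 512, offset = 0;

    printf("version 1 (i *= 2), first statement C[k]:\n");
    for (int t = 0; t < warpSize; t++)
        printf("%d ", 2 * t + offset);          // stride-2 indices: 0 2 4 ... 62

    printf("\n\nversion 2, first statement C[k]:\n");
    for (int t = 0; t < warpSize; t++)
        printf("%d ", t + offset);              // contiguous indices: 0 1 2 ... 31

    printf("\nversion 2, second statement C[k + blockDim.x]:\n");
    for (int t = 0; t < warpSize; t++)
        printf("%d ", t + blockDimx + offset);  // contiguous indices: 512 513 ... 543

    printf("\n");
    return 0;
}

Each statement in version 2 issues 32 contiguous indices, while each statement in version 1 issues indices strided by 2, so one warp-wide store in version 1 spans twice as many bytes as it actually writes; that is the access pattern the efficiency counters are reporting on.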
Another question:
Why do independent memory transactions give a speedup (comparing the unrolled and non-unrolled kernels) when both report 100% efficiency? Can they draw on more of the cache? If so, why can the non-independent case not use more cache? Or is it that in a single cycle too little memory is requested, so much of the cache sits idle, and when the number of independent requests grows, more work is in flight, the idle cache is put to use, and the kernel gets faster?
And is it the case that once no cache is left idle, memory bandwidth is close to the theoretical peak, and once requests exceed what the peak can service, performance stops improving and stays at the peak?
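One way to put numbers on the peak-bandwidth question is to convert the measured kernel times into an effective bandwidth (a sketch; the two-reads-plus-one-write traffic model fits this particular kernel, and the peak formula is the usual DDR estimate, both assumptions rather than anything measured here):

// Effective bandwidth for C[k] = A[i] + B[i]: two arrays are read, one is written.
double effectiveBandwidthGBs(size_t nBytes, double elapsedSeconds)
{
    return 3.0 * nBytes / elapsedSeconds / 1.0e9;   // GB/s
}

// Theoretical peak from cudaDeviceProp, for comparison:
//   peak (GB/s) ~= 2 * memoryClockRate (kHz) * 1e3 * memoryBusWidth (bits) / 8 / 1e9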
Experiment:
unrolling.cu
The included "../common/common.h" provides the CHECK macro and the CPU timing function seconds(); strip those uses out (or supply replacements) and the file builds on its own.
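If common.h is not at hand, a minimal stand-in for the two things the listing uses from it could look like this (a sketch based only on how CHECK and seconds() are used below; the actual header from the book's sample code may differ):

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

// Abort with a message if a CUDA runtime call returns an error.
#define CHECK(call)                                                          \
{                                                                            \
    const cudaError_t err = (call);                                          \
    if (err != cudaSuccess)                                                  \
    {                                                                        \
        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
                cudaGetErrorString(err));                                    \
        exit(EXIT_FAILURE);                                                  \
    }                                                                        \
}

// Wall-clock time in seconds, used for the host-side timing below.
static double seconds(void)
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return (double)tp.tv_sec + (double)tp.tv_usec * 1.0e-6;
}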
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
/*
* This example demonstrates the impact of misaligned writes on performance by
* forcing misaligned writes to occur on a float*.
*/
void checkResult(float *hostRef, float *gpuRef, const int N, const int offset)
{
double epsilon = 1.0E-8;
bool match = 1;
for (int i = offset; i < N; i++)
{
if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
{
match = 0;
printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
gpuRef[i]);
break;
}
}
if (!match) printf("Arrays do not match.\n\n");
}
void initialData(float *ip, int size)
{
for (int i = 0; i < size; i++)
{
ip[i] = (float)( rand() & 0xFF ) / 100.0f;
}
return;
}
void sumArraysOnHost(float *A, float *B, float *C, const int n, int offset)
{
for (int idx = offset, k = 0; idx < n; idx++, k++)
{
C[idx] = A[k] + B[k];
}
}
__global__ void writeOffset(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
if (k < n) C[k] = A[i] + B[i];
}
__global__ void warmup(float *A, float *B, float *C, const int n, int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
if (k < n) C[k] = A[i] + B[i];
}
__global__ void writeOffsetUnroll2(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
if (k + blockDim.x < n)
{
C[k] = A[i] + B[i];
C[k + blockDim.x] = A[i + blockDim.x] + B[i + blockDim.x];
}
}
__global__ void writeOffsetUnroll2_Me1(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
for(int j = 0;j<2;j++)
C[k+j*blockDim.x] = A[i+j*blockDim.x] + B[i+j*blockDim.x];
}
__global__ void writeOffsetUnroll2_Me2(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
i*=2;
unsigned int k = i + offset;
C[k] = A[i] + B[i];
C[k+1] = A[i+1] + B[i+1];
}
__global__ void writeOffsetUnroll4(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
if (k + 3 * blockDim.x < n)
{
C[k] = A[i] + B[i];
C[k + blockDim.x] = A[i + blockDim.x] + B[i + blockDim.x];
C[k + 2 * blockDim.x] = A[i + 2 * blockDim.x] + B[i + 2 * blockDim.x];
C[k + 3 * blockDim.x] = A[i + 3 * blockDim.x] + B[i + 3 * blockDim.x];
}
}
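// NOTE: in the loop-based variants below (and in writeOffsetUnroll2_Me1 above) the base
// index i is not scaled by the unroll factor, so neighbouring blocks overlap and only a
// fraction of the array is actually written; see the errata at the end of this post.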
__global__ void writeOffsetUnroll8(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
for(int j = 0;j<8;j++)
C[k+j*blockDim.x] = A[i+j*blockDim.x] + B[i+j*blockDim.x];
}
__global__ void writeOffsetUnroll16(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
for(int j = 0;j<16;j++)
C[k+j*blockDim.x] = A[i+j*blockDim.x] + B[i+j*blockDim.x];
}
__global__ void writeOffsetUnroll32(float *A, float *B, float *C, const int n,
int offset)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
for(int j = 0;j<32;j++)
C[k+j*blockDim.x] = A[i+j*blockDim.x] + B[i+j*blockDim.x];
}
int main(int argc, char **argv)
{
// set up device
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("%s starting reduction at ", argv[0]);
printf("device %d: %s ", dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
// set up array size
int nElem = 1 << 20; // total number of elements
printf(" with array size %d\n", nElem);
size_t nBytes = nElem * sizeof(float);
// set up offset for summary
int blocksize = 512;
int offset = 0;
if (argc > 1) offset = atoi(argv[1]);
if (argc > 2) blocksize = atoi(argv[2]);
// execution configuration
dim3 block (blocksize, 1);
dim3 grid ((nElem + block.x - 1) / block.x, 1);
// allocate host memory
float *h_A = (float *)malloc(nBytes);
float *h_B = (float *)malloc(nBytes);
float *hostRef = (float *)malloc(nBytes);
float *gpuRef = (float *)malloc(nBytes);
// initialize host array
initialData(h_A, nElem);
memcpy(h_B, h_A, nBytes);
// summary at host side
sumArraysOnHost(h_A, h_B, hostRef, nElem, offset);
// allocate device memory
float *d_A, *d_B, *d_C;
CHECK(cudaMalloc((float**)&d_A, nBytes));
CHECK(cudaMalloc((float**)&d_B, nBytes));
CHECK(cudaMalloc((float**)&d_C, nBytes));
// copy data from host to device
CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_B, h_A, nBytes, cudaMemcpyHostToDevice));
// warmup
double iStart = seconds();
warmup<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
CHECK(cudaDeviceSynchronize());
double iElaps = seconds() - iStart;
printf("warmup <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
block.x, offset, iElaps);
CHECK(cudaGetLastError());
// kernel 1:
iStart = seconds();
writeOffset<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("writeOffset <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
block.x, offset, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
// kernel 2
iStart = seconds();
writeOffsetUnroll2<<<grid.x / 2, block>>>(d_A, d_B, d_C, nElem / 2, offset);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("unroll2 <<< %4d, %4d >>> offset %4d elapsed %f sec\n",
grid.x / 2, block.x, offset, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
// kernel 3
iStart = seconds();
writeOffsetUnroll4<<<grid.x / 4, block>>>(d_A, d_B, d_C, nElem / 2, offset);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("unroll4 <<< %4d, %4d >>> offset %4d elapsed %f sec\n",
grid.x / 4, block.x, offset, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
iStart = seconds();
writeOffsetUnroll8<<<grid.x / 8, block>>>(d_A, d_B, d_C, nElem / 2, offset);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("unroll4 <<< %4d, %4d >>> offset %4d elapsed %f sec\n",
grid.x / 8, block.x, offset, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
iStart = seconds();
writeOffsetUnroll16<<<grid.x / 16, block>>>(d_A, d_B, d_C, nElem / 2, offset);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("unroll4 <<< %4d, %4d >>> offset %4d elapsed %f sec\n",
grid.x / 8, block.x, offset, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
iStart = seconds();
writeOffsetUnroll32<<<grid.x / 32, block>>>(d_A, d_B, d_C, nElem / 2, offset);
CHECK(cudaDeviceSynchronize());
iElaps = seconds() - iStart;
printf("unroll4 <<< %4d, %4d >>> offset %4d elapsed %f sec\n",
grid.x / 8, block.x, offset, iElaps);
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
writeOffsetUnroll2_Me1<<<grid.x / 2, block>>>(d_A, d_B, d_C, nElem / 2, offset);
CHECK(cudaDeviceSynchronize());
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
writeOffsetUnroll2_Me2<<<grid.x / 2, block>>>(d_A, d_B, d_C, nElem / 2, offset);
CHECK(cudaDeviceSynchronize());
CHECK(cudaGetLastError());
// copy kernel result back to host side and check device results
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
checkResult(hostRef, gpuRef, nElem, offset);
// free host and device memory
CHECK(cudaFree(d_A));
CHECK(cudaFree(d_B));
CHECK(cudaFree(d_C));
free(h_A);
free(h_B);
free(hostRef);
free(gpuRef);
// reset device
CHECK(cudaDeviceReset());
return EXIT_SUCCESS;
}
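To build and run the listing (a sketch; the output name is an arbitrary choice, and it assumes common.h is available or the stand-in above has been substituted for the include; the two optional command-line arguments are the write offset and the block size read in main):

nvcc -O3 -o unrolling unrolling.cu
./unrolling             # offset 0, block size 512
./unrolling 11 512      # misaligned writes with offset 11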
----------
Experimental results
Conclusions
Roughly as conjectured:
in a single cycle too little memory is requested and much of the cache sits idle; when the number of independent requests grows, more work is in flight, the idle cache is put to full use, and the kernel runs faster.
When no cache is left idle, memory bandwidth is close to the theoretical peak; once requests exceed what the peak can service, performance stops improving and stays at the peak. However, why the memory efficiency of the Me_2 kernel (writeOffsetUnroll2_Me2) is only 50% cannot be explained this way.
Takeaway: optimizing global memory means pushing memory throughput toward the theoretical peak; once the peak is reached, performance is as good as it gets. Unrolling is a very effective way to raise global memory bandwidth: when memory efficiency is already 100% but bandwidth is still below the peak, i.e. the cache is not fully utilized, unrolling is the optimization to reach for, because it adds independent memory requests, keeps the cache fully busy, and pushes throughput toward the peak.
Since the data in global memory is normally all touched at least once anyway, unrolling has plenty of room to work with; shared memory, by contrast, exists to reduce the number of global memory accesses.
Errata
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
for(int j = 0;j<32;j++)
C[k+j*blockDim.x] = A[i+j*blockDim.x] + B[i+j*blockDim.x];
This code is wrong: it only computes 1/32 of the data.
In fact it repeatedly processes that same 1/32 of the data, which is why the apparent bandwidth was high, so the earlier conclusion is wrong: there is no such thing as under-utilized cache here; the cache has been working at full capacity all along.
It should be:
unsigned int i = blockIdx.x * 32 * blockDim.x + threadIdx.x;
unsigned int k = i + offset;
for(int j = 0;j<32;j++)
C[k+j*blockDim.x] = A[i+j*blockDim.x] + B[i+j*blockDim.x];
This is still far from the unrolling gains described in the book. Unrolling should be reserved for cases where global memory efficiency is too low; after unrolling, the number of adjacent independent memory requests increases, which can raise memory efficiency.
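For reference, here is the corrected indexing written out as a complete kernel (a sketch; the kernel name and the bounds guard are additions not present in the original snippet), together with the launch configuration that makes the grid cover all nElem elements:

__global__ void writeOffsetUnroll32Fixed(float *A, float *B, float *C, const int n,
                                         int offset)
{
    // Each block now owns a contiguous slab of 32 * blockDim.x elements.
    unsigned int i = blockIdx.x * 32 * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;

    for (int j = 0; j < 32; j++)
    {
        // Guard every write so a nonzero offset cannot run past the end of C.
        if (k + j * blockDim.x < n)
            C[k + j * blockDim.x] = A[i + j * blockDim.x] + B[i + j * blockDim.x];
    }
}

// launch: writeOffsetUnroll32Fixed<<<grid.x / 32, block>>>(d_A, d_B, d_C, nElem, offset);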
The experiments suggest that modern GPUs have a very small memory-transaction granularity, so memory efficiency is generally high; that favours programs with scattered (out-of-order) memory accesses and leaves little to gain in the simple aligned, coalesced case.
Shared memory is also good at dealing with scattered memory accesses, so optimizing with shared memory looks like the way things are heading.
Current view of the optimization problem: to process data, accesses to global memory are unavoidable, so the key to optimizing a program is optimizing those memory accesses (instruction latency can be hidden). In general every byte of the data has to be processed; otherwise transferring that much data only adds allocation and copy time and is not worth GPU acceleration. Since all of the global memory has to be accessed anyway, the question becomes how many memory transactions are used to access it. For a fixed amount of data the number of transactions is inversely proportional to memory efficiency, so transaction efficiency is the key to the optimization. In the best case every piece of global memory is accessed exactly once, memory efficiency is 100%, and the number of transactions is at its minimum, i.e. transaction granularity × number of transactions = data volume. The most effective ways to raise transaction efficiency today are unrolling and shared memory.
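As a concrete check of that relation (the 32-byte transaction granularity is an assumption; depending on the device and cache configuration the relevant granularity can also be a 128-byte line): with the nElem = 1 << 20 floats used above, one array is 4 MB, so writing C once needs at least 4 * 2^20 / 32 = 131072 transactions. 100% store efficiency means no transactions beyond that minimum are issued; every extra transaction shows up directly as a lower efficiency percentage.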
Unrolling means turning a loop into a sequence of consecutive statements. Modern GPU toolchains apply this optimization by default, so on current GPUs an explicitly unrolled kernel performs the same as the plain loop.
In CUDA code, #pragma unroll tells the compiler to unroll a loop automatically.
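A minimal sketch of how #pragma unroll is written (the kernel name and the fixed factor of 4 are illustrative choices, not code from this experiment):

__global__ void addUnrolledByPragma(const float *A, const float *B, float *C, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x * 4 + threadIdx.x;

    // Ask the compiler to replace the loop with 4 consecutive statements.
    #pragma unroll
    for (int j = 0; j < 4; j++)
    {
        unsigned int idx = i + j * blockDim.x;
        if (idx < n) C[idx] = A[idx] + B[idx];
    }
}

Because the trip count is known at compile time, nvcc will usually unroll such a loop even without the pragma, which matches the observation above that the explicitly unrolled and plain versions perform the same.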