// Bind a pitched 2D device buffer to a 2D texture reference.
// NOTE(review): dev_features, sfeatures, sfeaturesw and sfeaturesh are declared outside this snippet.
// NOTE(review): none of the CUDA return codes below are checked.
// Receives the row pitch (in bytes) reported by cudaMallocPitch.
size_t fea_pitch;
// 2D texture reference whose element type is unsigned char.
// NOTE(review): texture references normally have to live at file scope — confirm where this line is placed.
texture<unsigned char, 2> features2D;
// Allocate sfeaturesh rows of at least sfeaturesw bytes each; the actual row stride is written to fea_pitch.
cudaMallocPitch((void**)(&dev_features), &fea_pitch, sizeof(unsigned char) * sfeaturesw, sfeaturesh);
// Channel descriptor matching the unsigned char element type.
cudaChannelFormatDesc feaDesc = cudaCreateChannelDesc<unsigned char>();
// Host-to-device copy: the source pitch equals the row width in bytes (no padding passed for sfeatures),
// the destination pitch is fea_pitch.
cudaMemcpy2D(dev_features, fea_pitch, sfeatures, sizeof(unsigned char) * sfeaturesw, sizeof(unsigned char) * sfeaturesw, sfeaturesh, cudaMemcpyHostToDevice);
cudaBindTexture2D(NULL, features2D, dev_features, feaDesc, sfeaturesw, sfeaturesh, fea_pitch);// bind the pitched buffer to the texture reference
纹理拾取(读取)的步骤
// Fetch the texel at x = box_x+x1 (column) and y = box_y+y1 (row), i.e. row y1, column x1 — the same meaning as in the CPU version.
// Pay special attention to the argument order: x comes before y, unlike ordinary array[row][col] indexing.
// NOTE(review): this reads imageData2D, which is not declared in this snippet.
point1=tex2D(imageData2D,box_x+x1,box_y+y1);
// Pitched-memory variant: allocate, upload and bind a 2D buffer to the features2D texture reference.
// NOTE(review): dev_features, fea_pitch, features2D, sfeatures, sfeaturesw and sfeaturesh are declared outside this snippet;
// return codes are not checked.
// Allocate sfeaturesh rows of at least sfeaturesw bytes each; the actual row stride is written to fea_pitch.
cudaMallocPitch((void**)(&dev_features), &fea_pitch, sizeof(unsigned char) * sfeaturesw, sfeaturesh);
// Channel descriptor matching the unsigned char element type.
cudaChannelFormatDesc feaDesc = cudaCreateChannelDesc<unsigned char>();
// Host-to-device copy: source pitch is the row width in bytes, destination pitch is fea_pitch.
cudaMemcpy2D(dev_features, fea_pitch, sfeatures, sizeof(unsigned char) * sfeaturesw, sizeof(unsigned char) * sfeaturesw, sfeaturesh, cudaMemcpyHostToDevice);
// Bind the pitched buffer; width and height are given in elements, the pitch in bytes.
cudaBindTexture2D(NULL, features2D, dev_features, feaDesc, sfeaturesw, sfeaturesh, fea_pitch);
--------------------------------------------------------------------------------
// cudaArray variant: back the 2D texture with a cudaArray instead of pitched linear memory.
// NOTE(review): featuresArray, features2D, sfeatures, sfeaturesw and sfeaturesh are declared outside this snippet;
// sfeatures is treated as a tightly packed host buffer of sfeaturesw x sfeaturesh bytes, as in the original copy.
// NOTE(review): return codes are not checked.
// Bytes per host row; size_t avoids narrowing the sizeof-based product to int.
size_t sfeatures_row_bytes = sizeof(unsigned char) * sfeaturesw;
// Total image size in bytes (kept under its original name for any later code that uses it).
size_t sfeatures_size = sfeatures_row_bytes * sfeaturesh;
// Channel descriptor matching the unsigned char element type.
cudaChannelFormatDesc chDesc2 = cudaCreateChannelDesc<unsigned char>();
cudaMallocArray(&featuresArray, &chDesc2, sfeaturesw, sfeaturesh);
// cudaMemcpyToArray is deprecated; the 2D variant copies sfeaturesh rows of sfeatures_row_bytes bytes each
// (source pitch == row width because the host buffer has no padding).
cudaMemcpy2DToArray(featuresArray, 0, 0, sfeatures, sfeatures_row_bytes, sfeatures_row_bytes, sfeaturesh, cudaMemcpyHostToDevice);
cudaBindTextureToArray(features2D, featuresArray);
-------------------------------------------------------------------------------------
// Linear-memory variant: upload a buffer and bind it to gridData1D (presumably a 1D texture reference — confirm).
// NOTE(review): dev_grid, sgrid, gridl and gridData1D are declared outside this snippet; return codes are not checked.
// Buffer size in bytes: gridl elements of sizeof(float) each.
int grid_data_size = sizeof(float) * gridl;
cudaMalloc((void**)&dev_grid,grid_data_size);
// Copy grid_data_size bytes from sgrid to dev_grid (host to device).
cudaMemcpy(dev_grid,sgrid,grid_data_size,cudaMemcpyHostToDevice);
// Bind dev_grid to gridData1D; the first argument is the optional offset output pointer, passed as 0 (null) here.
cudaBindTexture(0,gridData1D,dev_grid);
对于一维纹理,绑定到线性内存(Linear Memory,无论是cudaMalloc还是cudaMallocPitch分配)的纹理只能使用tex1Dfetch按整数下标读取;tex1D用于绑定到cudaArray的一维纹理
而对于二维纹理,不管是cudaArray还是cudaMallocPitch都是使用tex2D
下面是关于cudaMemcpy2D和cudaMallocPitch两个函数的参数和用法
最近学习了下CUDA矩阵内存对齐分配的方法,主要是cudaMemcpy2D和cudaMallocPitch两个函数的用法,先看看cudalibrary中如何定义的这两个函数:
cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height)
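Allocates at least widthInBytes * height bytes of linear memory on the device and returns in *devPtr a pointer to the allocated memory. The function may pad the allocation to ensure that corresponding pointers in any given row will continue to meet the alignment requirements for coalescing as the address is updated from row to row. The pitch returned in *pitch by cudaMallocPitch() is the width in bytes of the allocation. The intended usage of pitch is as a separate parameter of the allocation, used to compute addresses within the 2D array. Given the row and column of an array element of type T, the address is computed as: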
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
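For allocations of 2D arrays, it is recommended that programmers consider performing pitch allocations using cudaMallocPitch(). Due to pitch alignment restrictions in the hardware, this is especially true if the application will be performing 2D memory copies between different regions of device memory (whether linear memory or CUDA arrays).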
-
Parameters:
-
- devPtr - Pointer to allocated pitched device memory
- pitch - Pitch for allocation
- width - Requested pitched allocation width (in bytes)
- height - Requested pitched allocation height
cudaError_t cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
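Copies a matrix (height rows of width bytes each) from the memory area pointed to by src to the memory area pointed to by dst, where kind is one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, or cudaMemcpyDeviceToDevice, and specifies the direction of the copy. dpitch and spitch are the widths in memory in bytes of the 2D arrays pointed to by dst and src, including any padding added to the end of each row. The memory areas may not overlap. Calling cudaMemcpy2D() with dst and src pointers that do not match the direction of the copy results in undefined behavior. cudaMemcpy2D() returns an error if dpitch or spitch is greater than the maximum allowed.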
-
Parameters:
-
- dst - Destination memory address
- dpitch - Pitch of destination memory
- src - Source memory address
- spitch - Pitch of source memory
- width - Width of matrix transfer (columns in bytes)
- height - Height of matrix transfer (rows)
- kind - Type of transfer
由此,可以对这两个函数有个充分的认识。此外,cudaMallocPitch和cudaMemcpy2D,一般用于二维数组各维度size不是2的幂次方的问题。使用cudaMallocPitch()那么该数组的对齐、大小、起始址等就自动做好了,其返回的pitch就是每一行实际分配的字节数(往往大于申请的行宽)。
PS:
pitch的理解:
C语言申请2维内存时,一般是连续存放的。a[y][x]存放在第y*widthofx*sizeof(元素)+x*sizeof(元素)个字节。但在cuda的global memory访问中,从256字节对齐的地址(addr=0, 256, 512, ...)开始的连续访问是最有效率的。 这样,为了提高内存访问的效率,有了cudaMallocPitch函数。 cudaMallocPitch函数分配的内存中,数组的每一行的第一个元素的开始地址都保证是对齐的。因为每行有多少个数据是不确定的widthofx*sizeof(元素)不一定是256的倍数。故此,为保证数组的每一行的第一个元素的开始地址对齐,cudaMallocPitch在分配内存时,每行会多分配一些字节,以保证widthofx*sizeof(元素)+多分配的字节是256的倍数(对齐)。这样,y*widthofx*sizeof(元素)+x*sizeof(元素)来计算a[y][x]的地址就不正确了。而应该是y*[widthofx*sizeof(元素)+多分配的字节]+x*sizeof(元素)。而函数中返回的pitch的值就是widthofx*sizeof(元素)+多分配的字节。
一、内存对齐的原因
大部分的参考资料都是如是说的:
1、平台原因(移植原因):不是所有的硬件平台都能访问任意地址上的任意数据 的;某些硬件平台只能在某些地址处取某些特定类型的数据,否则抛出硬件异常。
2、性能原因:数据结构(尤其是栈)应该尽可能地在自然边界上对齐。 原因在于,为了访问未对齐的内存,处理器需要作两次内存访问;而对齐的内存访问仅需要一次访问。

本文深入探讨了CUDA中纹理绑定与拾取的基本步骤,包括数组定义、多维数组访问、绑定纹理实例及使用纹理进行数据读取。同时解释了CUDA矩阵内存对齐分配方法,包括`cudaMallocPitch`和`cudaMemcpy2D`函数的应用场景与参数说明,旨在提高GPU内存访问效率。
295

被折叠的 条评论
为什么被折叠?



