1. 声明 __shared__ 变量或数组:
// Fixed-size array of 256 floats declared with the __shared__ qualifier.
// Shared arrays start uninitialized, so write before reading.
__shared__ float sh_farr[ 256];
// A single int declared with the __shared__ qualifier.
__shared__ int a;
2. 为结构体的指针成员分配设备内存:
// Example aggregate with one plain value and two raw pointer members.
// The pointer members are intended to reference device memory.
typedef struct Teacher_t
{
    int a;                 // plain scalar member
    unsigned int *g_mem1;  // pointer member for a buffer of unsigned ints
    float *g_mem2;         // pointer member for a buffer of floats
} Teacher;
// Allocates device memory for both pointer members of a Teacher.
//
// @param t         struct whose g_mem1 and g_mem2 members receive the
//                  pointers returned by cudaMalloc
// @param mat_size  number of elements to allocate for each buffer
void initMem( Teacher& t, const unsigned int mat_size)
{
    // The byte counts are formed directly from sizeof(), so the multiplication
    // is carried out in the (wider) type of sizeof rather than being
    // truncated into a 32-bit unsigned int first.
    CUDA_SAFE_CALL( cudaMalloc((void**)&t.g_mem1, sizeof(unsigned int) * mat_size) );

    // The float buffer belongs to g_mem2. Allocating into g_mem1 a second
    // time would overwrite (and leak) the first allocation and leave g_mem2
    // unset.
    CUDA_SAFE_CALL( cudaMalloc((void**)&t.g_mem2, sizeof(float) * mat_size) );

    // ... any further initialization goes here (elided in the original note)
}
3.计时:
// Measure the wall-clock time of a kernel launch with the cutil timer helpers.
unsigned int timer = 0;
CUT_SAFE_CALL( cutCreateTimer( &timer));
CUT_SAFE_CALL( cutStartTimer( timer));
{
    // ... launch the kernel(s) to be timed here
}
// Kernel launches are asynchronous with respect to the host: wait for the
// device to finish before stopping the timer, otherwise only the launch
// overhead is measured.
CUDA_SAFE_CALL( cudaThreadSynchronize());
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "Total time: %f ms ", cutGetTimerValue( timer) );
CUT_SAFE_CALL( cutDeleteTimer( timer));
4. 获取输入命令行中包含的文件名:
////////////////////////////////////////////////////////////////////////////////
//! Check if a particular filename has to be used for the file where the result
//! is stored
//! @param argc      number of command line arguments (from main(argc, argv))
//! @param argv      pointers to command line arguments (from main(argc, argv))
//! @param filename  filename of result file; replaced by a malloc()'d copy of
//!                  the "filename-result" argument if the user specified one,
//!                  otherwise left unchanged
//! @note The previous value of filename is not freed here, because its
//!       ownership is not visible from this function.
////////////////////////////////////////////////////////////////////////////////
void
getResultFilename( int argc, char** argv, char*& filename)
{
    char* temp = NULL;
    cutGetCmdLineArgumentstr( argc, (const char**) argv, "filename-result", &temp);

    if( NULL != temp)
    {
        // strlen() does not count the terminating '\0' that strcpy() writes,
        // so one extra byte is required to avoid a heap overflow.
        char* copy = (char*) malloc( sizeof(char) * (strlen( temp) + 1));
        if( NULL != copy)
        {
            strcpy( copy, temp);
            filename = copy;
        }
        // On allocation failure the caller's existing filename is kept.
        cutFree( temp);
    }

    // Passing a null pointer to %s is undefined behavior; print a placeholder
    // when no filename is available at all.
    printf( "Result filename: '%s' ", (NULL != filename) ? filename : "(null)");
}
类似的:
////////////////////////////////////////////////////////////////////////////////
//! Check if a specific precision of the eigenvalue has to be obtained
//! @param argc       number of command line arguments (from main(argc, argv))
//! @param argv       pointers to command line arguments (from main(argc, argv))
//! @param precision  precision to use; updated only if a positive value is
//!                   specified with the "precision" command line argument
////////////////////////////////////////////////////////////////////////////////
void
getPrecision( int argc, char** argv, float& precision)
{
    // Negative sentinel: stays negative unless the argument parser writes a
    // value into it, so "not specified" and non-positive inputs are both
    // ignored by the check below.
    float temp = -1.0f;
    cutGetCmdLineArgumentf( argc, (const char**) argv, "precision", &temp);

    if( temp > 0.0f)
    {
        precision = temp;
    }

    printf( "Precision: %f ", precision);
}
5. Host 端调用完 kernel 函数后需要进行同步(等待 kernel 执行完毕);而在 kernel(__global__ 函数)或 __device__ 函数内部,只需在必要的位置调用 __syncthreads(); 即可同步同一线程块内的线程:
// Host-side synchronization issued after a kernel launch; the result of the
// call is checked through the CUDA_SAFE_CALL wrapper.
// NOTE(review): cudaThreadSynchronize is deprecated in later CUDA toolkits in
// favor of cudaDeviceSynchronize — confirm against the toolkit version in use.
CUDA_SAFE_CALL( cudaThreadSynchronize());
本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/dvchn/archive/2008/02/25/2119590.aspx