// Time the host->device copy, the kernel, and the device->host copy using
// CUDA events as GPU-side timestamps.
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));

// Record a timestamp immediately before and after the copy/kernel/copy
// sequence. All of this work is enqueued on stream 0; these calls only
// submit work, so every return code is checked to catch submission errors.
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
// A kernel launch has no return value; launch-configuration errors must be
// fetched explicitly.
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
checkCudaErrors(cudaEventRecord(stop, 0));

// Wait until the stop event has actually been reached by the GPU. Without
// this, cudaEventElapsedTime fails with cudaErrorNotReady because the
// enqueued work has usually not finished yet. It also guarantees that the
// device->host copy into `a` is complete before the host reads it.
checkCudaErrors(cudaEventSynchronize(stop));

// cudaEventElapsedTime reports the interval between the two events in
// milliseconds.
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// gpu_time is in ms; dividing by 1000 prints the value in seconds.
printf("time spent executing by the GPU: %.2f\n", gpu_time/1000);
2、使用CPU记录内核程序运行时间
// Initialization: set up a host-side (CPU) stopwatch used to time the kernel.
// NOTE(review): StopWatchInterface / sdkCreateTimer / sdkResetTimer look like
// the helper_timer.h utilities shipped with the CUDA samples; confirm against
// the file's includes.
StopWatchInterface *timer = NULL;
// Create the timer; the handle is passed by address so the call can fill it in.
sdkCreateTimer(&timer);
// Reset the timer so the measurement starts from a clean state.
sdkResetTimer(&timer);
//get the ti