cuda——使用多个stream

最新推荐文章于 2025-06-11 10:55:58 发布

原创最新推荐文章于 2025-06-11 10:55:58 发布 · 2.9k 阅读

3 ·

CC 4.0 BY-SA版权

cuda 专栏收录该内容

5 篇文章

订阅专栏

/* Introduction to CUDA streams: overlapping transfers and kernels using two streams. */
#include <stdio.h>
#include <stdlib.h>
/* Number of ints moved from the CPU to the GPU per transfer (one chunk).
   Parenthesized so the macro stays a single value inside larger expressions such as x / N. */
#define N (1024*1024)
/* Total number of ints held on the CPU: 20 chunks. */
#define M (N*20)
/*
 * Report whether the current device can copy memory and execute a kernel
 * at the same time.
 *
 * Returns true when the device exposes at least one asynchronous copy engine.
 * Returns false when it exposes none, or when the device or its properties
 * cannot be queried (in which case the property struct must not be read).
 */
bool support_overlap(){
	int dev = 0;
	if(cudaGetDevice(&dev) != cudaSuccess)
		return false;
	cudaDeviceProp prop;
	if(cudaGetDeviceProperties(&prop,dev) != cudaSuccess)
		return false;
	/* asyncEngineCount is the documented replacement for the deprecated
	   deviceOverlap flag: non-zero means copy/kernel overlap is possible. */
	return prop.asyncEngineCount > 0;
}
/*
 * Element-wise integer addition: c[k] = a[k] + b[k] for every index k below N.
 * Uses only the x components of the thread/block indices, so it expects a
 * 1D launch; threads whose global index is N or larger do nothing.
 */
__global__ void add(int* a,int* b,int* c){
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;
	if(idx >= N)
		return;
	c[idx] = b[idx] + a[idx];
}
/* Abort with a readable message when a CUDA runtime call reports failure. */
static void check_cuda(cudaError_t status,const char* what){
	if(status != cudaSuccess){
		fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Adds two host arrays of M ints on the GPU, N ints at a time, alternating
 * chunks between two streams, and prints the elapsed time in milliseconds
 * as measured by CUDA events.
 *
 * Returns 0 on completion, and also 0 (after printing a notice) when the
 * device cannot overlap copies with kernel execution. Any failing CUDA call
 * terminates the process through check_cuda.
 */
int main(){
	const int chunk = N;                 /* ints per transfer */
	const int total = M;                 /* ints in each host array */
	const size_t chunkBytes = (size_t)chunk*sizeof(int);
	const size_t totalBytes = (size_t)total*sizeof(int);
	const int threads = 256;
	const int blocks = (chunk + threads - 1)/threads;   /* ceil-div; the kernel guards the tail */

	cudaEvent_t start,stop;
	float elapsedTime;
	/* The two streams */
	cudaStream_t stream1;
	cudaStream_t stream2;
	int *a,*b,*c,*d_a1,*d_b1,*d_c1,*d_a2,*d_b2,*d_c2;

	/* The function must be called; testing its address is always "true". */
	if(!support_overlap()){
		printf("Sorry,the device cannot support overlap.\n");
		return 0;
	}

	/* The timed region starts here, so it also covers stream creation,
	   allocation and filling the host buffers. */
	check_cuda(cudaEventCreate(&start),"cudaEventCreate(start)");
	check_cuda(cudaEventCreate(&stop),"cudaEventCreate(stop)");
	check_cuda(cudaEventRecord(start,0),"cudaEventRecord(start)");

	/* Create the streams */
	check_cuda(cudaStreamCreate(&stream1),"cudaStreamCreate(stream1)");
	check_cuda(cudaStreamCreate(&stream2),"cudaStreamCreate(stream2)");

	/* One set of device buffers per stream */
	check_cuda(cudaMalloc((void**)&d_a1,chunkBytes),"cudaMalloc(d_a1)");
	check_cuda(cudaMalloc((void**)&d_b1,chunkBytes),"cudaMalloc(d_b1)");
	check_cuda(cudaMalloc((void**)&d_c1,chunkBytes),"cudaMalloc(d_c1)");
	check_cuda(cudaMalloc((void**)&d_a2,chunkBytes),"cudaMalloc(d_a2)");
	check_cuda(cudaMalloc((void**)&d_b2,chunkBytes),"cudaMalloc(d_b2)");
	check_cuda(cudaMalloc((void**)&d_c2,chunkBytes),"cudaMalloc(d_c2)");

	/* Page-locked host memory, needed for the asynchronous copies below */
	check_cuda(cudaHostAlloc((void**)&a,totalBytes,cudaHostAllocDefault),"cudaHostAlloc(a)");
	check_cuda(cudaHostAlloc((void**)&b,totalBytes,cudaHostAllocDefault),"cudaHostAlloc(b)");
	check_cuda(cudaHostAlloc((void**)&c,totalBytes,cudaHostAllocDefault),"cudaHostAlloc(c)");

	/* Fill the inputs with random numbers */
	for(int i = 0;i < total;i++){
		a[i] = rand();
		b[i] = rand();
	}

	/* Process the input two chunks per iteration: stream1 takes the chunk at
	   off, stream2 the chunk at off+chunk. Assumes total is a multiple of
	   2*chunk, which holds for M = 20*N. */
	for(int off = 0;off < total;off += 2*chunk){
		/* Issue order matters: the work is enqueued stage by stage across
		   both streams (all uploads, then both kernels, then both downloads)
		   rather than one stream at a time. */
		check_cuda(cudaMemcpyAsync(d_a1,a+off,chunkBytes,cudaMemcpyHostToDevice,stream1),"upload a (stream1)");
		check_cuda(cudaMemcpyAsync(d_b1,b+off,chunkBytes,cudaMemcpyHostToDevice,stream1),"upload b (stream1)");
		check_cuda(cudaMemcpyAsync(d_a2,a+off+chunk,chunkBytes,cudaMemcpyHostToDevice,stream2),"upload a (stream2)");
		check_cuda(cudaMemcpyAsync(d_b2,b+off+chunk,chunkBytes,cudaMemcpyHostToDevice,stream2),"upload b (stream2)");

		add<<<blocks,threads,0,stream1>>>(d_a1,d_b1,d_c1);
		check_cuda(cudaGetLastError(),"add launch (stream1)");
		add<<<blocks,threads,0,stream2>>>(d_a2,d_b2,d_c2);
		check_cuda(cudaGetLastError(),"add launch (stream2)");

		/* Copy back the whole chunk, not a single int. */
		check_cuda(cudaMemcpyAsync(c+off,d_c1,chunkBytes,cudaMemcpyDeviceToHost,stream1),"download c (stream1)");
		check_cuda(cudaMemcpyAsync(c+off+chunk,d_c2,chunkBytes,cudaMemcpyDeviceToHost,stream2),"download c (stream2)");
	}

	/* Wait for both streams; asynchronous execution errors surface here. */
	check_cuda(cudaStreamSynchronize(stream1),"cudaStreamSynchronize(stream1)");
	check_cuda(cudaStreamSynchronize(stream2),"cudaStreamSynchronize(stream2)");

	check_cuda(cudaEventRecord(stop,0),"cudaEventRecord(stop)");
	check_cuda(cudaEventSynchronize(stop),"cudaEventSynchronize(stop)");
	check_cuda(cudaEventElapsedTime(&elapsedTime,start,stop),"cudaEventElapsedTime");
	printf("%f",elapsedTime);

	check_cuda(cudaFree(d_a1),"cudaFree(d_a1)");
	check_cuda(cudaFree(d_b1),"cudaFree(d_b1)");
	check_cuda(cudaFree(d_c1),"cudaFree(d_c1)");
	check_cuda(cudaFree(d_a2),"cudaFree(d_a2)");
	check_cuda(cudaFree(d_b2),"cudaFree(d_b2)");
	check_cuda(cudaFree(d_c2),"cudaFree(d_c2)");
	check_cuda(cudaFreeHost(a),"cudaFreeHost(a)");
	check_cuda(cudaFreeHost(b),"cudaFreeHost(b)");
	/* Release c; freeing b a second time would leak c and double-free b. */
	check_cuda(cudaFreeHost(c),"cudaFreeHost(c)");
	check_cuda(cudaEventDestroy(start),"cudaEventDestroy(start)");
	check_cuda(cudaEventDestroy(stop),"cudaEventDestroy(stop)");
	check_cuda(cudaStreamDestroy(stream1),"cudaStreamDestroy(stream1)");
	check_cuda(cudaStreamDestroy(stream2),"cudaStreamDestroy(stream2)");
	return 0;
}