/* Introduction to CUDA streams: an example that uses two streams. */
#include <stdio.h>
#include <stdlib.h>
/* Number of int elements in each chunk copied from the CPU to the GPU.
   Parenthesized so the macro expands safely inside larger expressions
   such as x / N or x % N. */
#define N (1024*1024)
/* Total number of int elements held on the CPU (20 chunks). */
#define M (N*20)
/* Report whether the current device can copy data while a kernel is executing.
 *
 * Queries the async engine count attribute of the device that is current for
 * the calling thread (this replaces the deprecated cudaDeviceProp::deviceOverlap
 * flag).
 *
 * Returns true when the device reports at least one asynchronous engine.
 * Returns false when it reports none, or when either runtime query fails, so
 * the result never depends on uninitialized data.
 */
bool support_overlap(){
    int dev = 0;
    if(cudaGetDevice(&dev) != cudaSuccess)
        return false;

    int asyncEngines = 0;
    if(cudaDeviceGetAttribute(&asyncEngines,cudaDevAttrAsyncEngineCount,dev) != cudaSuccess)
        return false;

    return asyncEngines > 0;
}
/* Element-wise integer addition: c[i] = a[i] + b[i].
 *
 * Each thread handles the single element at its global index, computed from
 * the x dimension of the block and grid only. Threads whose index is N or
 * greater return without touching memory.
 */
__global__ void add(int* a,int* b,int* c){
    const int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if(idx >= N)
        return;
    c[idx] = a[idx] + b[idx];
}
/* Abort with a file/line diagnostic when a CUDA runtime call does not succeed. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Add two host arrays of M ints on the GPU, N elements at a time, using two streams.
 *
 * Each loop iteration hands one N-element chunk to stream1 and the next chunk
 * to stream2; for every chunk the steps are: copy inputs to the device, run
 * add, copy the result back. The elapsed time between the start and stop
 * events is printed in the format "%f".
 *
 * Returns 0 on completion, and also 0 after printing a message when
 * support_overlap() reports no overlap capability. Returns 1 if M is not a
 * multiple of 2*N. Any failing CUDA call terminates the process through
 * CUDA_CHECK.
 */
int main(){
    /* Copy the macros into typed locals so the arithmetic below does not
       depend on how the macro text expands. */
    const int chunk = N;
    const int total = M;
    const size_t chunkBytes = (size_t)chunk*sizeof(int);
    const size_t totalBytes = (size_t)total*sizeof(int);
    const int threads = 256;
    const int blocks = (chunk + threads - 1)/threads;

    cudaEvent_t start,stop;
    float elapsedTime;
    /* The two streams that alternate over the chunks. */
    cudaStream_t stream1;
    cudaStream_t stream2;
    int *a,*b,*c,*d_a1,*d_b1,*d_c1,*d_a2,*d_b2,*d_c2;

    /* Call the function; testing its address would always be non-null. */
    if(!support_overlap()){
        printf("Sorry,the device cannot support overlap.\n");
        return 0;
    }
    /* Every iteration consumes two full chunks, so the total must divide evenly. */
    if(total % (2*chunk) != 0){
        fprintf(stderr,"M must be a multiple of 2*N.\n");
        return 1;
    }

    /* Note: start is recorded before allocation and host initialization, so
       the reported time covers setup as well as the streamed work. */
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start,0));

    /* Create the streams. */
    CUDA_CHECK(cudaStreamCreate(&stream1));
    CUDA_CHECK(cudaStreamCreate(&stream2));

    /* Allocate one set of device buffers per stream. */
    CUDA_CHECK(cudaMalloc((void**)&d_a1,chunkBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b1,chunkBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c1,chunkBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_a2,chunkBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_b2,chunkBytes));
    CUDA_CHECK(cudaMalloc((void**)&d_c2,chunkBytes));

    /* Allocate page-locked host memory, used by the asynchronous copies below. */
    CUDA_CHECK(cudaHostAlloc((void**)&a,totalBytes,cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void**)&b,totalBytes,cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void**)&c,totalBytes,cudaHostAllocDefault));

    /* Fill the host inputs with random numbers. */
    for(int i = 0;i<total;i++){
        a[i] = rand();
        b[i] = rand();
    }

    /* Split the input into chunks and run "copy to GPU", "compute",
       "copy back to CPU" on each one. Work is enqueued alternating between
       the two streams at every stage (input copies for both, kernels for
       both, result copies for both); the original author flags this
       ordering as something to pay special attention to. */
    for(int i = 0;i<total;i+=2*chunk){
        CUDA_CHECK(cudaMemcpyAsync(d_a1,a+i,chunkBytes,cudaMemcpyHostToDevice,stream1));
        CUDA_CHECK(cudaMemcpyAsync(d_b1,b+i,chunkBytes,cudaMemcpyHostToDevice,stream1));
        CUDA_CHECK(cudaMemcpyAsync(d_a2,a+i+chunk,chunkBytes,cudaMemcpyHostToDevice,stream2));
        CUDA_CHECK(cudaMemcpyAsync(d_b2,b+i+chunk,chunkBytes,cudaMemcpyHostToDevice,stream2));

        add<<<blocks,threads,0,stream1>>>(d_a1,d_b1,d_c1);
        CUDA_CHECK(cudaGetLastError());
        add<<<blocks,threads,0,stream2>>>(d_a2,d_b2,d_c2);
        CUDA_CHECK(cudaGetLastError());

        /* Copy back the whole chunk, not just a single int. */
        CUDA_CHECK(cudaMemcpyAsync(c+i,d_c1,chunkBytes,cudaMemcpyDeviceToHost,stream1));
        CUDA_CHECK(cudaMemcpyAsync(c+i+chunk,d_c2,chunkBytes,cudaMemcpyDeviceToHost,stream2));
    }

    /* Wait on the host until both streams have finished. */
    CUDA_CHECK(cudaStreamSynchronize(stream1));
    CUDA_CHECK(cudaStreamSynchronize(stream2));

    CUDA_CHECK(cudaEventRecord(stop,0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime,start,stop));
    printf("%f",elapsedTime);

    CUDA_CHECK(cudaFree(d_a1));
    CUDA_CHECK(cudaFree(d_b1));
    CUDA_CHECK(cudaFree(d_c1));
    CUDA_CHECK(cudaFree(d_a2));
    CUDA_CHECK(cudaFree(d_b2));
    CUDA_CHECK(cudaFree(d_c2));
    /* Release each pinned host buffer exactly once: a, b and c. */
    CUDA_CHECK(cudaFreeHost(a));
    CUDA_CHECK(cudaFreeHost(b));
    CUDA_CHECK(cudaFreeHost(c));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaStreamDestroy(stream1));
    CUDA_CHECK(cudaStreamDestroy(stream2));
    return 0;
}
/* Residue from the source blog page: "CUDA - using multiple streams";
   latest recommended article published 2025-06-11 10:55:58. */