///**** Test: swapping the values held in global memory from inside a kernel
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int N=10000;
/**
 * Read-only benchmark kernel: for every index i in [0, N) load a[i] and b[i]
 * and add them into a thread-local temporary. Nothing is written back to
 * either array, so the contents of a and b are unchanged by this kernel.
 *
 * Launch layout: any 1-D grid/block shape is valid. The loop starts at the
 * thread's flat global index and advances by the total number of launched
 * threads, so with <<<1,1>>> the single thread visits every element in order
 * (same as a plain 0..N-1 loop), while larger launches split the range
 * between threads instead of having each thread repeat all N iterations.
 * No shared memory is used.
 *
 * @param a  device pointer to at least N ints (read only here)
 * @param b  device pointer to at least N ints (read only here)
 *
 * NOTE(review): the sum is never stored, so the compiler may be able to
 * remove the loads altogether -- confirm what is actually being timed.
 * The element swap this kernel was originally sketched for is disabled; if
 * it is re-enabled, the second store must write the saved temporary
 * (b[i] = temp). Writing a[i] there would leave both arrays holding b's
 * values.
 */
__global__ void change(int *a,int *b)
{
    // Total number of threads in the (1-D) launch.
    const int stride = gridDim.x * blockDim.x;
    int temp = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    {
        temp = a[i] + b[i];
    }
    (void)temp;  // intentionally unused; silences the set-but-unused warning
}
// Evaluate a CUDA runtime call; on failure print the error string together
// with the source line and leave main() with a non-zero exit status.
// Only usable inside main(), because it expands to a `return` statement.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            std::cerr << "CUDA error at line " << __LINE__ << ": "         \
                      << cudaGetErrorString(status_) << std::endl;         \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/**
 * Benchmark driver.
 *
 * 1. Fills two host arrays (ha[i] = i, hb[i] = 2*i) and copies them to the
 *    device.
 * 2. Launches change<<<1,1>>> 1000 times and prints the elapsed GPU time,
 *    measured with CUDA events, as "gpu__<milliseconds>".
 * 3. Copies both arrays back to the host.
 * 4. Runs the equivalent add loop 1000 times on the CPU and prints the
 *    elapsed processor time as "cpu__<milliseconds>".
 *
 * Returns 0 on success, 1 if any CUDA call fails. On the failure path the
 * function returns immediately without releasing device resources.
 */
int main(void)
{
    int ha[N], hb[N];
    int *da = 0, *db = 0;
    const size_t bytes = N * sizeof(int);

    for (int i = 0; i < N; i++)
    {
        ha[i] = i;
        hb[i] = 2 * i;
    }

    CUDA_CHECK(cudaMalloc((void**)&da, bytes));
    CUDA_CHECK(cudaMalloc((void**)&db, bytes));
    CUDA_CHECK(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice));

    // ---- GPU timing ------------------------------------------------------
    cudaEvent_t start, stop;
    float elapsed = 0.0f;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start, 0));
    for (int i = 0; i < 1000; i++)
    {
        change<<<1,1>>>(da, db);
    }
    // Launch-configuration errors are not returned by the launch itself.
    CUDA_CHECK(cudaGetLastError());
    // Record the stop event right behind the launches and wait on the event,
    // so the measured interval is not extended by a separate host-side
    // device synchronisation placed before the stop marker.
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed, start, stop));
    cout << "gpu__" << elapsed << endl;

    CUDA_CHECK(cudaMemcpy(ha, da, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(hb, db, bytes, cudaMemcpyDeviceToHost));

    // ---- CPU timing ------------------------------------------------------
    // NOTE(review): the sum is never consumed, so an optimising build may
    // drop this loop entirely -- confirm the build flags used for timing.
    clock_t h_start = clock();
    int temp = 0;
    for (int j = 0; j < 1000; j++)
    {
        for (int i = 0; i < N; i++)
        {
            temp = ha[i] + hb[i];
        }
    }
    (void)temp;
    clock_t h_elapsed = clock() - h_start;
    // clock() counts in CLOCKS_PER_SEC units; convert to milliseconds so the
    // figure uses the same unit as the GPU measurement printed above.
    const double cpu_ms = 1000.0 * (double)h_elapsed / CLOCKS_PER_SEC;
    cout << "cpu__" << cpu_ms << endl;

    // ---- cleanup ---------------------------------------------------------
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}
#undef CUDA_CHECK
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int N=10000;
/**
 * Read-only benchmark kernel: for every index i in [0, N) load a[i] and b[i]
 * and add them into a thread-local temporary. Nothing is written back to
 * either array, so the contents of a and b are unchanged by this kernel.
 *
 * Launch layout: any 1-D grid/block shape is valid. The loop starts at the
 * thread's flat global index and advances by the total number of launched
 * threads, so with <<<1,1>>> the single thread visits every element in order
 * (same as a plain 0..N-1 loop), while larger launches split the range
 * between threads instead of having each thread repeat all N iterations.
 * No shared memory is used.
 *
 * @param a  device pointer to at least N ints (read only here)
 * @param b  device pointer to at least N ints (read only here)
 *
 * NOTE(review): the sum is never stored, so the compiler may be able to
 * remove the loads altogether -- confirm what is actually being timed.
 * The element swap this kernel was originally sketched for is disabled; if
 * it is re-enabled, the second store must write the saved temporary
 * (b[i] = temp). Writing a[i] there would leave both arrays holding b's
 * values.
 */
__global__ void change(int *a,int *b)
{
    // Total number of threads in the (1-D) launch.
    const int stride = gridDim.x * blockDim.x;
    int temp = 0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    {
        temp = a[i] + b[i];
    }
    (void)temp;  // intentionally unused; silences the set-but-unused warning
}
// Evaluate a CUDA runtime call; on failure print the error string together
// with the source line and leave main() with a non-zero exit status.
// Only usable inside main(), because it expands to a `return` statement.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            std::cerr << "CUDA error at line " << __LINE__ << ": "         \
                      << cudaGetErrorString(status_) << std::endl;         \
            return 1;                                                      \
        }                                                                  \
    } while (0)

/**
 * Benchmark driver.
 *
 * 1. Fills two host arrays (ha[i] = i, hb[i] = 2*i) and copies them to the
 *    device.
 * 2. Launches change<<<1,1>>> 1000 times and prints the elapsed GPU time,
 *    measured with CUDA events, as "gpu__<milliseconds>".
 * 3. Copies both arrays back to the host.
 * 4. Runs the equivalent add loop 1000 times on the CPU and prints the
 *    elapsed processor time as "cpu__<milliseconds>".
 *
 * Returns 0 on success, 1 if any CUDA call fails. On the failure path the
 * function returns immediately without releasing device resources.
 */
int main(void)
{
    int ha[N], hb[N];
    int *da = 0, *db = 0;
    const size_t bytes = N * sizeof(int);

    for (int i = 0; i < N; i++)
    {
        ha[i] = i;
        hb[i] = 2 * i;
    }

    CUDA_CHECK(cudaMalloc((void**)&da, bytes));
    CUDA_CHECK(cudaMalloc((void**)&db, bytes));
    CUDA_CHECK(cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice));

    // ---- GPU timing ------------------------------------------------------
    cudaEvent_t start, stop;
    float elapsed = 0.0f;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start, 0));
    for (int i = 0; i < 1000; i++)
    {
        change<<<1,1>>>(da, db);
    }
    // Launch-configuration errors are not returned by the launch itself.
    CUDA_CHECK(cudaGetLastError());
    // Record the stop event right behind the launches and wait on the event,
    // so the measured interval is not extended by a separate host-side
    // device synchronisation placed before the stop marker.
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsed, start, stop));
    cout << "gpu__" << elapsed << endl;

    CUDA_CHECK(cudaMemcpy(ha, da, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(hb, db, bytes, cudaMemcpyDeviceToHost));

    // ---- CPU timing ------------------------------------------------------
    // NOTE(review): the sum is never consumed, so an optimising build may
    // drop this loop entirely -- confirm the build flags used for timing.
    clock_t h_start = clock();
    int temp = 0;
    for (int j = 0; j < 1000; j++)
    {
        for (int i = 0; i < N; i++)
        {
            temp = ha[i] + hb[i];
        }
    }
    (void)temp;
    clock_t h_elapsed = clock() - h_start;
    // clock() counts in CLOCKS_PER_SEC units; convert to milliseconds so the
    // figure uses the same unit as the GPU measurement printed above.
    const double cpu_ms = 1000.0 * (double)h_elapsed / CLOCKS_PER_SEC;
    cout << "cpu__" << cpu_ms << endl;

    // ---- cleanup ---------------------------------------------------------
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    return 0;
}
#undef CUDA_CHECK