#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define data_size 1026
#define thread_num 256
using namespace std;
// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the step being checked so the failing call can be identified.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Squares every input element: d_odata[i] = d_idata[i] * d_idata[i] for i in [0, n).
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// contain more threads than elements; surplus threads do nothing.
//
// d_idata  device pointer to at least n readable ints
// d_odata  device pointer to at least n writable ints
// n        number of elements to process (defaults to data_size)
__global__ static void sumOfSquares(int *d_idata, int *d_odata, int n = data_size)
{
    // Global element index; threadIdx.x alone is only unique within one block.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        printf("%d ", idx); // debug trace of the element index handled by this thread
        d_odata[idx] = d_idata[idx] * d_idata[idx];
    }
}

// Fills an array with 0..data_size-1, squares it on the GPU, then prints the
// sum of the GPU results next to a CPU-computed reference sum.
int main()
{
    int h_idata[data_size];
    for (int i = 0; i < data_size; i++)
    {
        h_idata[i] = i; // rand()%10;
    }

    const size_t bytes = sizeof(int) * data_size;
    int *d_idata = NULL;
    int *d_odata = NULL;
    checkCuda(cudaMalloc((void **)&d_idata, bytes), "cudaMalloc(d_idata)");
    checkCuda(cudaMalloc((void **)&d_odata, bytes), "cudaMalloc(d_odata)");
    checkCuda(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    // A single block cannot hold data_size (1026) threads, so split the work
    // into ceil(data_size / thread_num) blocks of thread_num threads each.
    const int blocks = (data_size + thread_num - 1) / thread_num;
    sumOfSquares<<<blocks, thread_num>>>(d_idata, d_odata, data_size);
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "sumOfSquares launch");

    int gpu_sum[data_size];
    // The blocking copy also waits for the kernel, so execution errors surface here.
    checkCuda(cudaMemcpy(gpu_sum, d_odata, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");
    checkCuda(cudaFree(d_idata), "cudaFree(d_idata)");
    checkCuda(cudaFree(d_odata), "cudaFree(d_odata)");

    int final_gpu_sum = 0;
    for (int i = 0; i < data_size; i++)
    {
        final_gpu_sum += gpu_sum[i];
    }
    printf("final_gpu_sum=%d\n", final_gpu_sum);

    // CPU reference for comparison with the GPU result.
    int cpu_sum = 0;
    for (int i = 0; i < data_size; i++)
    {
        cpu_sum += h_idata[i] * h_idata[i];
    }
    printf("cpu_sum: %d\n", cpu_sum);

    cin.get(); // keep the console window open until a key is pressed
    return 0;
}
// CUDA: simple array operations
// Latest recommended article published on 2024-01-05 09:35:40