之前做一道题时,需要把多重 for 循环改写成并行实现。由于没有调用同步函数,CPU 端主线程与 GPU 端之间的数据传输出现了问题。下面是最终运行成功的示例代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cmath>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Simulation constants, all written as single-precision literals.
// NOTE(review): 9.8 looks like Earth's surface gravity rather than the
// universal gravitational constant - confirm the intended meaning.
#define G 9.8f // gravitational constant
#define dt 0.01f // time step: the duration of one time slice
// NOTE(review): the original author was unsure whether SOFTENING models the
// influence of air; the only stated purpose is numerical stability.
#define SOFTENING 2.0f // softening parameter to help with numerical instability
// One simulated body, stored as five consecutive single-precision fields.
// main() fills x and y with random values in [0, 1], zeroes vx and vy,
// and sets m to 1/N.
typedef struct nbody {
    float x;   // x coordinate
    float y;   // y coordinate
    float vx;  // presumably the x velocity component - confirm against step()
    float vy;  // presumably the y velocity component - confirm against step()
    float m;   // mass
} nbody;
// Forward declarations; the definitions are not visible in this part of the file.
//void print_help();
// Presumably the host (CPU) version of one simulation pass; its only call in
// main() is currently commented out - confirm against the definition.
void step(void);
// Called from main() inside the timed "device computation" section.
void d_step(void);
// CUDA kernel (__global__). The parameter `input_data` shadows the global
// host pointer of the same name.
// NOTE(review): as kernel arguments, k, x_total, y_total and input_data are
// expected to be device-accessible pointers - verify at the launch site.
__global__ void d_step3(int* k, float* x_total, float* y_total,const nbody*input_data);
// Global configuration and state shared by main() and the step functions.
int N = 1000; // Number of bodies; main() allocates N entries for input_data
int D = 3; // dimension (original author was unsure: size? volume?); den_arr holds D * D floats
int Iter = 1000; // iteration number
nbody* input_data; // host array of bodies, allocated with malloc in main()
float* den_arr; // density array, allocated with malloc in main()
// Device-side globals, both initialised to 0.
// NOTE(review): presumably accumulators used by the kernel - their use is not
// visible in this part of the file.
__device__ float d_x_total = 0;
__device__ float d_y_total = 0;
int main() {
input_data = (nbody*)malloc(sizeof(nbody) * N);
den_arr = (float*)malloc(sizeof(float) * D * D);
if (!input_data || !den_arr) {
printf("malloc failed!\n");
exit(0);
}
for (int i = 0; i < N; i++) {
input_data[i].m = 1.0 / N;
input_data[i].x = (float)rand() / (float)RAND_MAX;
input_data[i].y = (float)rand() / (float)RAND_MAX;
input_data[i].vx = 0;
input_data[i].vy = 0;
}
unsigned long long start1, end1;
//这里是global代码:
//创建新的device数组保存用于计算:
start1 = clock();
//进行device端计算:
d_step();
end1 = clock();
//释放device内存数据
printf("t2= %lf\n", (double)(end1 - start1) / CLOCKS_PER_SEC);
//host端代码:
unsigned long long start, end;
start = clock();
//step();
end = clock();
printf("t1 = %lf\n", (double)(end - start) / CLOCKS_PER_SEC);
int