CUDA 多重 for 循环中使用同步函数

最新推荐文章于 2025-03-13 15:38:20 发布

Alfafar

最新推荐文章于 2025-03-13 15:38:20 发布

阅读量1.8k

点赞数

文章标签： cuda c++

本文链接：https://blog.youkuaiyun.com/Alfafar/article/details/110346642

版权

之前做一道题时，需要把多重 for 循环改写成并行版本。由于没有使用同步函数，CPU 端主线程与 GPU 端之间的数据传输出现了问题。下面是最终运行成功的示例代码：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cmath>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Simulation constants (all single-precision float literals).
// NOTE(review): the original comment calls G the "gravitational constant" (gravity);
// the value 9.8 looks like standard gravitational acceleration instead — confirm intended meaning/units.
#define G			9.8f		//gravitational constant (gravity)
#define dt			0.01f		//time step: the duration of one time slice
#define SOFTENING	2.0f		//softening parameter to help with numerical instability (original author was unsure whether it models air influence — unconfirmed)

/*
 * One body of the simulation, stored as five consecutive floats in the
 * order x, y, vx, vy, m.
 *
 * The typedef makes the type usable as plain `nbody` (without the
 * `struct` keyword).
 */
typedef struct nbody {
	float x;	// presumably the x coordinate of the position
	float y;	// presumably the y coordinate of the position
	float vx;	// presumably the velocity along x
	float vy;	// presumably the velocity along y
	float m;	// mass
} nbody;

// Forward declarations and file-scope state shared by the host code.
//void print_help();
// step(): its only visible use is a commented-out call in main's host-side section
// — presumably the CPU version of one simulation run; definition not visible here.
void step(void);
// d_step(): called from main as the device-side computation; definition not visible here.
void d_step(void);
// d_step3: CUDA kernel (__global__). Takes an int pointer, two float accumulator-style
// pointers and a read-only array of bodies. NOTE(review): launch configuration and the
// exact meaning of k / x_total / y_total are not visible in this chunk — confirm at the definition.
__global__  void d_step3(int* k, float* x_total, float* y_total,const nbody*input_data);
int N = 1000; // Number of bodies; main allocates input_data with N elements
int D = 3; // dimension (original author unsure: size? volume?); main allocates den_arr with D * D floats
int Iter = 1000; // iteration number (iteration count)
nbody* input_data; // host array of bodies, malloc'ed and initialised in main
float* den_arr; //density array, malloc'ed in main (D * D floats)
// Device-resident scalars, zero-initialised. NOTE(review): their use is not visible in this chunk.
__device__ float d_x_total = 0;
__device__ float d_y_total = 0;


int main() {
   
	input_data = (nbody*)malloc(sizeof(nbody) * N);
	den_arr = (float*)malloc(sizeof(float) * D * D);
	if (!input_data || !den_arr) {
   
		printf("malloc failed!\n");
		exit(0);
	}
	for (int i = 0; i < N; i++) {
   
		input_data[i].m = 1.0 / N;
		input_data[i].x = (float)rand() / (float)RAND_MAX;
		input_data[i].y = (float)rand() / (float)RAND_MAX;
		input_data[i].vx = 0;
		input_data[i].vy = 0;
	}


	unsigned long long start1, end1;
	//这里是global代码：
	//创建新的device数组保存用于计算：
	start1 = clock();
	//进行device端计算：
	d_step();

	end1 = clock();

	//释放device内存数据


	printf("t2= %lf\n", (double)(end1 - start1) / CLOCKS_PER_SEC);
	
	//host端代码：
	unsigned long long start, end;
	start = clock();
	//step();
	end = clock();
	printf("t1 = %lf\n", (double)(end - start) / CLOCKS_PER_SEC);
	int