Laplace方程求解程序的性能优化

1.首先我们拿到源码,进行编译执行看一下执行时间

当前花费时钟周期为14596455,就是14.596455微秒

2.使用gprof分析一下热点函数

gprof Laplace gmon.out

可以看到LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)函数占据100%的时间

3.找到源码
 

LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
	double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
	std::vector<double> errors;
	double max_error;
	double eps = 0.01;

	for (uint8_t i = 0; i < N; ++i)
		for (uint8_t j = 0; j < K; ++j) {
			u1[i][j] = 0;
			u2[i][j] = 0;
			u_new[i][j] = 0;
		}


	for (uint8_t i = 0; i < N; ++i)
	{
		u1[i][0] = downSideRectangle(x[i]);
		u1[i][K - 1] = upSideRectangle(x[i]);
	}

	for (uint8_t j = 0; j < K; ++j)
	{
		u1[0][j] = leftSideRectangle(y[j]);
		u1[N - 1][j] = rightSideRectangle(y[j]);
	}

	for (uint8_t i = 0; i < N; ++i)
	{
		u2[i][0] = downSideRectangle(x[i]);
		u2[i][K - 1] = upSideRectangle(x[i]);
	}

	for (uint8_t j = 0; j < K; ++j)
	{
		u2[0][j] = leftSideRectangle(y[j]);
		u2[N - 1][j] = rightSideRectangle(y[j]);
	}

	int k = 0;
	clock_t start_s=clock();
	do {
		for (uint8_t j = 1; j < K - 1; ++j) {
			for (uint8_t i = 1; i < N - 1; ++i) {
				u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
			}
		}

		for (uint8_t j = 1; j < K - 1; ++j) {
			for (uint8_t i = 1; i < N - 1; ++i) {
				u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
			}
		}

		for (uint8_t j = 0; j < K; ++j) {
			for (uint8_t i = 0; i < N; ++i) {
				u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
			}
		}

		for (uint8_t i = 0; i < N; ++i) {
			for (uint8_t j = 0; j < K; ++j) {
				errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
			}
		}
		max_error = *std::max_element(errors.begin(), errors.end());
		//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
		errors.clear();
		for (uint8_t i = 0; i < N; ++i) {
			for (uint8_t j = 0; j < K; ++j) {
				u1[i][j] = u_new[i][j];
			}
		}
		k++;
	} while (max_error > eps);
	clock_t end_s=clock();

	std::cout << "Input = "<<N<<" "<<K <<std::endl;
	std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
	std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}

我们可以看到,代码大体是2个矩阵u1,u2,先填充边缘,再填充内部

4.对于大量的for循环,可采用的以下几种方式:循环合并,循环展开,循环交换,循环分布,循环不变量外提,循环分块,循环分裂。要注意循环体之间容易产生影响,要注意先后次序。

循环合并,减少循环迭代开销,下面是合并结果

LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
	double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
	std::vector<double> errors;
	double max_error;
	double eps = 0.01;

	for (uint8_t i = 0; i < N; ++i)
		for (uint8_t j = 0; j < K; ++j) {
			u1[i][j] = 0;
			u2[i][j] = 0;
			u_new[i][j] = 0;
		}


	for (uint8_t i = 0; i < N; ++i)
	{
		u1[i][0] = downSideRectangle(x[i]);
		u1[i][K - 1] = upSideRectangle(x[i]);
		u2[i][0] = downSideRectangle(x[i]);
		u2[i][K - 1] = upSideRectangle(x[i]);
	}

	for (uint8_t j = 0; j < K; ++j)
	{
		u1[0][j] = leftSideRectangle(y[j]);
		u1[N - 1][j] = rightSideRectangle(y[j]);
		u2[0][j] = leftSideRectangle(y[j]);
		u2[N - 1][j] = rightSideRectangle(y[j]);
	}

	int k = 0;
	clock_t start_s=clock();
	do {
		for (uint8_t j = 1; j < K - 1; ++j) {
			for (uint8_t i = 1; i < N - 1; ++i) {
				u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
			}
		}
		for (uint8_t j = 1; j < K - 1; ++j) {
			for (uint8_t i = 1; i < N - 1; ++i) {
				u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
			}
		}

		for (uint8_t j = 0; j < K; ++j) {
			for (uint8_t i = 0; i < N; ++i) {
				u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
				errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
			}
		}

		max_error = *std::max_element(errors.begin(), errors.end());
		//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
		errors.clear();
		for (uint8_t i = 0; i < N; ++i) {
			for (uint8_t j = 0; j < K; ++j) {
				u1[i][j] = u_new[i][j];
			}
		}
		k++;
	} while (max_error > eps);
	clock_t end_s=clock();

	std::cout << "Input = "<<N<<" "<<K <<std::endl;
	std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
	std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}

编译执行

我们发现执行时间为12.69168微秒

循环交换,增强向量化识别,下面是交换结果

LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
	double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
	std::vector<double> errors;
	double max_error;
	double eps = 0.01;

	for (uint8_t i = 0; i < N; ++i)
		for (uint8_t j = 0; j < K; ++j) {
			u1[i][j] = 0;
			u2[i][j] = 0;
			u_new[i][j] = 0;
		}


	for (uint8_t i = 0; i < N; ++i)
	{
		u1[i][0] = downSideRectangle(x[i]);
		u1[i][K - 1] = upSideRectangle(x[i]);
		u2[i][0] = downSideRectangle(x[i]);
		u2[i][K - 1] = upSideRectangle(x[i]);
	}

	for (uint8_t j = 0; j < K; ++j)
	{
		u1[0][j] = leftSideRectangle(y[j]);
		u1[N - 1][j] = rightSideRectangle(y[j]);
		u2[0][j] = leftSideRectangle(y[j]);
		u2[N - 1][j] = rightSideRectangle(y[j]);
	}

	int k = 0;
	clock_t start_s=clock();
	do {
		for (uint8_t i = 1; i < K - 1; ++i) {
			for (uint8_t j = 1; j < N - 1; ++j) {
				u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
				
			}
		}
		for (uint8_t i = 1; i < K - 1; ++i) {
			for (uint8_t j = 1; j < N - 1; ++j) {
				u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
				
			}
		}

		for (uint8_t i = 0; i < K; ++i) {
			for (uint8_t j = 0; j < N; ++j) {
				u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
				errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
			}
		}

		max_error = *std::max_element(errors.begin(), errors.end());
		//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
		errors.clear();
		for (uint8_t i = 0; i < N; ++i) {
			for (uint8_t j = 0; j < K; ++j) {
				u1[i][j] = u_new[i][j];
			}
		}
		k++;
	} while (max_error > eps);
	clock_t end_s=clock();

	std::cout << "Input = "<<N<<" "<<K <<std::endl;
	std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
	std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}

编译执行

我们可以看到执行时间9.660195微秒

循环不变量外提,减少循环内计算量,下面是简化结果

LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
	double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
	std::vector<double> errors;
	double max_error;
	double eps = 0.01;

	for (uint8_t i = 0; i < N; ++i)
		for (uint8_t j = 0; j < K; ++j) {
			u1[i][j] = 0;
			u2[i][j] = 0;
			u_new[i][j] = 0;
		}


	for (uint8_t i = 0; i < N; ++i)
	{
		u1[i][0] = downSideRectangle(x[i]);
		u1[i][K - 1] = upSideRectangle(x[i]);
		u2[i][0] = u1[i][0];
		u2[i][K - 1] = u1[i][K - 1];
	}

	for (uint8_t j = 0; j < K; ++j)
	{
		u1[0][j] = leftSideRectangle(y[j]);
		u1[N - 1][j] = rightSideRectangle(y[j]);
		u2[0][j] = u1[0][j];
		u2[N - 1][j] = u1[N - 1][j];
	}

	int k = 0;
	clock_t start_s=clock();
	do {
		for (uint8_t i = 1; i < K - 1; ++i) {
			for (uint8_t j = 1; j < N - 1; ++j) {
				u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
				
			}
		}
		for (uint8_t i = 1; i < K - 1; ++i) {
			for (uint8_t j = 1; j < N - 1; ++j) {
				u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
				
			}
		}

		for (uint8_t i = 0; i < K; ++i) {
			for (uint8_t j = 0; j < N; ++j) {
				u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
				errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
			}
		}

		max_error = *std::max_element(errors.begin(), errors.end());
		//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
		errors.clear();
		for (uint8_t i = 0; i < N; ++i) {
			for (uint8_t j = 0; j < K; ++j) {
				u1[i][j] = u_new[i][j];
			}
		}
		k++;
	} while (max_error > eps);
	clock_t end_s=clock();

	std::cout << "Input = "<<N<<" "<<K <<std::endl;
	std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
	std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}

编译执行

9.918358微秒,不降反升,说明赋值开销比计算开销还大

循环展开,获得更多指令级并行

LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
	double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
	std::vector<double> errors;
	double max_error;
	double eps = 0.01;

	for (uint8_t i = 0; i < N; ++i)
		for (uint8_t j = 0; j < K; ++j) {
			u1[i][j] = 0;
			u2[i][j] = 0;
			u_new[i][j] = 0;
		}


	for (uint8_t i = 0; i < N; ++i)
	{
		u1[i][0] = downSideRectangle(x[i]);
		u1[i][K - 1] = upSideRectangle(x[i]);
		u2[i][0] = downSideRectangle(x[i]);
		u2[i][K - 1] = upSideRectangle(x[i]);
	}

	for (uint8_t j = 0; j < K; ++j)
	{
		u1[0][j] = leftSideRectangle(y[j]);
		u1[N - 1][j] = rightSideRectangle(y[j]);
		u2[0][j] = leftSideRectangle(y[j]);
		u2[N - 1][j] = rightSideRectangle(y[j]);
	}

	int k = 0;
	clock_t start_s=clock();
	do {
for (uint8_t i = 1; i < K - 1; i += 2) {
    for (uint8_t j = 1; j < N - 1; ++j) {
        u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
        u1[i + 1][j] = (1.0 / 4.0) * (u1[i][j] + u1[i + 2][j] + u1[i + 1][j - 1] + u1[i + 1][j + 1]);
    }
}

for (uint8_t i = 1; i < K - 1; i += 2) {
    for (uint8_t j = 1; j < N - 1; ++j) {
        u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
        u2[i + 1][j] = (1.0 / 4.0) * (u1[i][j] + u1[i + 2][j] + u1[i + 1][j - 1] + u1[i + 1][j + 1]);
    }
}

		for (uint8_t i = 0; i < K; ++i) {
			for (uint8_t j = 0; j < N; ++j) {
				u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
				errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
			}
		}

		max_error = *std::max_element(errors.begin(), errors.end());
		//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
		errors.clear();
		for (uint8_t i = 0; i < N; ++i) {
			for (uint8_t j = 0; j < K; ++j) {
				u1[i][j] = u_new[i][j];
			}
		}
		k++;
	} while (max_error > eps);
	clock_t end_s=clock();

	std::cout << "Input = "<<N<<" "<<K <<std::endl;
	std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
	std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}

编译运行

我们看到展开后时间变为7.790696微秒

其他循环也可以展开,这个在添加-O2,-O3参数尤为明显,面对这些简单赋值的循环,编译器的优化效果似乎比人力更优秀。

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值