1.首先我们拿到源码,进行编译执行看一下执行时间
当前花费时钟周期为14596455,就是14.596455微秒
2.使用gprof分析一下热点函数
gprof Laplace gmon.out
可以看到LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)函数占据100%的时间
3.找到源码
LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
std::vector<double> errors;
double max_error;
double eps = 0.01;
for (uint8_t i = 0; i < N; ++i)
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = 0;
u2[i][j] = 0;
u_new[i][j] = 0;
}
for (uint8_t i = 0; i < N; ++i)
{
u1[i][0] = downSideRectangle(x[i]);
u1[i][K - 1] = upSideRectangle(x[i]);
}
for (uint8_t j = 0; j < K; ++j)
{
u1[0][j] = leftSideRectangle(y[j]);
u1[N - 1][j] = rightSideRectangle(y[j]);
}
for (uint8_t i = 0; i < N; ++i)
{
u2[i][0] = downSideRectangle(x[i]);
u2[i][K - 1] = upSideRectangle(x[i]);
}
for (uint8_t j = 0; j < K; ++j)
{
u2[0][j] = leftSideRectangle(y[j]);
u2[N - 1][j] = rightSideRectangle(y[j]);
}
int k = 0;
clock_t start_s=clock();
do {
for (uint8_t j = 1; j < K - 1; ++j) {
for (uint8_t i = 1; i < N - 1; ++i) {
u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t j = 1; j < K - 1; ++j) {
for (uint8_t i = 1; i < N - 1; ++i) {
u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t j = 0; j < K; ++j) {
for (uint8_t i = 0; i < N; ++i) {
u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
}
}
for (uint8_t i = 0; i < N; ++i) {
for (uint8_t j = 0; j < K; ++j) {
errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
}
}
max_error = *std::max_element(errors.begin(), errors.end());
//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
errors.clear();
for (uint8_t i = 0; i < N; ++i) {
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = u_new[i][j];
}
}
k++;
} while (max_error > eps);
clock_t end_s=clock();
std::cout << "Input = "<<N<<" "<<K <<std::endl;
std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}
我们可以看到,代码大体是2个矩阵u1,u2,先填充边缘,再填充内部
4.对于大量的for循环,可采用的以下几种方式:循环合并,循环展开,循环交换,循环分布,循环不变量外提,循环分块,循环分裂。要注意循环体之间容易产生影响,要注意先后次序。
循环合并,减少循环迭代开销,下面是合并结果
LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
std::vector<double> errors;
double max_error;
double eps = 0.01;
for (uint8_t i = 0; i < N; ++i)
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = 0;
u2[i][j] = 0;
u_new[i][j] = 0;
}
for (uint8_t i = 0; i < N; ++i)
{
u1[i][0] = downSideRectangle(x[i]);
u1[i][K - 1] = upSideRectangle(x[i]);
u2[i][0] = downSideRectangle(x[i]);
u2[i][K - 1] = upSideRectangle(x[i]);
}
for (uint8_t j = 0; j < K; ++j)
{
u1[0][j] = leftSideRectangle(y[j]);
u1[N - 1][j] = rightSideRectangle(y[j]);
u2[0][j] = leftSideRectangle(y[j]);
u2[N - 1][j] = rightSideRectangle(y[j]);
}
int k = 0;
clock_t start_s=clock();
do {
for (uint8_t j = 1; j < K - 1; ++j) {
for (uint8_t i = 1; i < N - 1; ++i) {
u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t j = 1; j < K - 1; ++j) {
for (uint8_t i = 1; i < N - 1; ++i) {
u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t j = 0; j < K; ++j) {
for (uint8_t i = 0; i < N; ++i) {
u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
}
}
max_error = *std::max_element(errors.begin(), errors.end());
//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
errors.clear();
for (uint8_t i = 0; i < N; ++i) {
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = u_new[i][j];
}
}
k++;
} while (max_error > eps);
clock_t end_s=clock();
std::cout << "Input = "<<N<<" "<<K <<std::endl;
std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}
编译执行
我们发现执行时间为12.69168微秒
循环交换,增强向量化识别,下面是交换结果
LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
std::vector<double> errors;
double max_error;
double eps = 0.01;
for (uint8_t i = 0; i < N; ++i)
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = 0;
u2[i][j] = 0;
u_new[i][j] = 0;
}
for (uint8_t i = 0; i < N; ++i)
{
u1[i][0] = downSideRectangle(x[i]);
u1[i][K - 1] = upSideRectangle(x[i]);
u2[i][0] = downSideRectangle(x[i]);
u2[i][K - 1] = upSideRectangle(x[i]);
}
for (uint8_t j = 0; j < K; ++j)
{
u1[0][j] = leftSideRectangle(y[j]);
u1[N - 1][j] = rightSideRectangle(y[j]);
u2[0][j] = leftSideRectangle(y[j]);
u2[N - 1][j] = rightSideRectangle(y[j]);
}
int k = 0;
clock_t start_s=clock();
do {
for (uint8_t i = 1; i < K - 1; ++i) {
for (uint8_t j = 1; j < N - 1; ++j) {
u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t i = 1; i < K - 1; ++i) {
for (uint8_t j = 1; j < N - 1; ++j) {
u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t i = 0; i < K; ++i) {
for (uint8_t j = 0; j < N; ++j) {
u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
}
}
max_error = *std::max_element(errors.begin(), errors.end());
//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
errors.clear();
for (uint8_t i = 0; i < N; ++i) {
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = u_new[i][j];
}
}
k++;
} while (max_error > eps);
clock_t end_s=clock();
std::cout << "Input = "<<N<<" "<<K <<std::endl;
std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}
编译执行
我们可以看到执行时间9.660195微秒
循环不变量外提,减少循环内计算量,下面是简化结果
LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
std::vector<double> errors;
double max_error;
double eps = 0.01;
for (uint8_t i = 0; i < N; ++i)
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = 0;
u2[i][j] = 0;
u_new[i][j] = 0;
}
for (uint8_t i = 0; i < N; ++i)
{
u1[i][0] = downSideRectangle(x[i]);
u1[i][K - 1] = upSideRectangle(x[i]);
u2[i][0] = u1[i][0];
u2[i][K - 1] = u1[i][K - 1];
}
for (uint8_t j = 0; j < K; ++j)
{
u1[0][j] = leftSideRectangle(y[j]);
u1[N - 1][j] = rightSideRectangle(y[j]);
u2[0][j] = u1[0][j];
u2[N - 1][j] = u1[N - 1][j];
}
int k = 0;
clock_t start_s=clock();
do {
for (uint8_t i = 1; i < K - 1; ++i) {
for (uint8_t j = 1; j < N - 1; ++j) {
u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t i = 1; i < K - 1; ++i) {
for (uint8_t j = 1; j < N - 1; ++j) {
u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
}
}
for (uint8_t i = 0; i < K; ++i) {
for (uint8_t j = 0; j < N; ++j) {
u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
}
}
max_error = *std::max_element(errors.begin(), errors.end());
//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
errors.clear();
for (uint8_t i = 0; i < N; ++i) {
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = u_new[i][j];
}
}
k++;
} while (max_error > eps);
clock_t end_s=clock();
std::cout << "Input = "<<N<<" "<<K <<std::endl;
std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}
编译执行
9.918358微秒,不降反升,说明赋值开销比计算开销还大
循环展开,获得更多指令级并行
LaplasEquationSolver::LaplasEquationSolver(double(*u1)[N], std::array<double, N>& x, std::array<double, K>& y, double w)
{
double u2[N][K] = { 0 }, u_new[N][K] = { 0 };
std::vector<double> errors;
double max_error;
double eps = 0.01;
for (uint8_t i = 0; i < N; ++i)
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = 0;
u2[i][j] = 0;
u_new[i][j] = 0;
}
for (uint8_t i = 0; i < N; ++i)
{
u1[i][0] = downSideRectangle(x[i]);
u1[i][K - 1] = upSideRectangle(x[i]);
u2[i][0] = downSideRectangle(x[i]);
u2[i][K - 1] = upSideRectangle(x[i]);
}
for (uint8_t j = 0; j < K; ++j)
{
u1[0][j] = leftSideRectangle(y[j]);
u1[N - 1][j] = rightSideRectangle(y[j]);
u2[0][j] = leftSideRectangle(y[j]);
u2[N - 1][j] = rightSideRectangle(y[j]);
}
int k = 0;
clock_t start_s=clock();
do {
for (uint8_t i = 1; i < K - 1; i += 2) {
for (uint8_t j = 1; j < N - 1; ++j) {
u1[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
u1[i + 1][j] = (1.0 / 4.0) * (u1[i][j] + u1[i + 2][j] + u1[i + 1][j - 1] + u1[i + 1][j + 1]);
}
}
for (uint8_t i = 1; i < K - 1; i += 2) {
for (uint8_t j = 1; j < N - 1; ++j) {
u2[i][j] = (1.0 / 4.0) * (u1[i - 1][j] + u1[i + 1][j] + u1[i][j - 1] + u1[i][j + 1]);
u2[i + 1][j] = (1.0 / 4.0) * (u1[i][j] + u1[i + 2][j] + u1[i + 1][j - 1] + u1[i + 1][j + 1]);
}
}
for (uint8_t i = 0; i < K; ++i) {
for (uint8_t j = 0; j < N; ++j) {
u_new[i][j] = u1[i][j] + w * (u2[i][j] - u1[i][j]);
errors.push_back(fabs(u_new[i][j] - u1[i][j]));;
}
}
max_error = *std::max_element(errors.begin(), errors.end());
//if(k%100==0)std::cout<<k<<" "<<max_error<<"\n";
errors.clear();
for (uint8_t i = 0; i < N; ++i) {
for (uint8_t j = 0; j < K; ++j) {
u1[i][j] = u_new[i][j];
}
}
k++;
} while (max_error > eps);
clock_t end_s=clock();
std::cout << "Input = "<<N<<" "<<K <<std::endl;
std::cout << "w= " << w << ", iteration= " << k << ", max error = " << max_error << std::endl;
std::cout << "cost time = "<< end_s - start_s << " clocks"<< std::endl;
}
编译运行
我们看到展开后时间变为7.790696微秒
其他循环也可以展开,这个在添加-O2,-O3参数尤为明显,面对这些简单赋值的循环,编译器的优化效果似乎比人力更优秀。