关于缓存和程序的局部性,可参见 http://rednaxelafx.javaeye.com/blog/412560 。这里用 C++ 编写了两个版本的矩阵相乘程序,实测结果表明:改善访存局部性、减少缓存未命中,确实可以显著提高程序的执行速度。测试环境:VC2010(Release 模式),Intel® Core(TM)2 Duo CPU T5870 2.00GHz,内存 2GB。
#include<ctime>
#include<iostream>
#include<random>
#include<vector>
using namespace std;
//
// Matrix multiplication, written without regard to cache behaviour or
// locality of reference (textbook i-j-k loop order).
//
// Computes C = A * B for square n-by-n matrices stored contiguously in
// row-major order (element (r, c) lives at index n*r + c).
//
//   A, B  input matrices, n*n elements each; only read.
//   C     output matrix, n*n elements; every element is overwritten.
//         C must not overlap A or B, because results are stored while the
//         inputs are still being read.
//   n     matrix dimension; nothing is written when n <= 0.
//
void mulmat1(double * const A, double * const B, double *C,int n)
{
    // Do all index arithmetic in 64 bits: n*i evaluated in int overflows
    // (undefined behaviour) as soon as n exceeds 46340.
    const long long dim = n;
    for (long long i = 0; i < dim; ++i)
    {
        const double* a_row = A + dim * i;
        double* c_row = C + dim * i;
        for (long long j = 0; j < dim; ++j)
        {
            double sum = 0.0;
            // The inner loop walks down column j of B, i.e. it advances
            // through B with a stride of n elements per step.
            for (long long k = 0; k < dim; ++k)
                sum += a_row[k] * B[dim * k + j];
            c_row[j] = sum;
        }
    }
}
//
// Matrix multiplication arranged with cache behaviour and locality of
// reference in mind (i-j-k order with row-wise accumulation).
//
// Computes C = A * B for square n-by-n matrices stored contiguously in
// row-major order (element (r, c) lives at index n*r + c).
//
//   A, B  input matrices, n*n elements each; only read.
//   C     output matrix, n*n elements; cleared first, then accumulated into.
//         C must not overlap A or B, because C is zeroed and updated while
//         the inputs are still being read.
//   n     matrix dimension; nothing is written when n <= 0.
//
void mulmat2(double * const A, double * const B, double *C,int n)
{
    // A non-positive dimension describes an empty matrix. Without this guard
    // a negative n would still zero n*n elements of C, since n*n is positive.
    if (n <= 0) return;

    // Do all index arithmetic in 64 bits: n*n and n*i evaluated in int
    // overflow (undefined behaviour) as soon as n exceeds 46340.
    const long long dim = n;
    const long long total = dim * dim;

    for (long long idx = 0; idx < total; ++idx) C[idx] = 0.0;

    for (long long i = 0; i < dim; ++i)
    {
        const double* a_row = A + dim * i;
        double* c_row = C + dim * i;
        for (long long j = 0; j < dim; ++j)
        {
            // Scale row j of B by A(i, j) and add it onto row i of C; every
            // array touched by the inner loop is traversed contiguously.
            const double a_ij = a_row[j];
            const double* b_row = B + dim * j;
            for (long long k = 0; k < dim; ++k)
                c_row[k] += a_ij * b_row[k];
        }
    }
}
// Benchmark driver: fills two n-by-n matrices with pseudo-random values, then
// times mulmat1 and mulmat2 on the same inputs and prints the CPU time each
// one took, in milliseconds.
int main()
{
    const int n = 1000;
    const std::size_t count = static_cast<std::size_t>(n) * n;

    // Default-constructed engine: fixed seed, so every run uses the same data.
    std::mt19937 rr;

    // std::vector owns the storage, so it is released on every exit path.
    std::vector<double> A(count);
    std::vector<double> B(count);
    std::vector<double> C(count);

    // Row-major fill, drawing alternately for A and B.
    for (std::size_t idx = 0; idx < count; ++idx)
    {
        A[idx] = rr() % 10000;
        B[idx] = rr() % 10000;
    }

    // std::clock() counts in units of 1/CLOCKS_PER_SEC seconds, which is not
    // milliseconds on every platform; convert before printing with "ms".
    // std::endl is kept on purpose so the first timing is flushed to the
    // screen before the second multiplication starts.
    std::clock_t t = std::clock();
    mulmat1(A.data(), B.data(), C.data(), n);
    std::cout << "mulmat1执行时间:" << (std::clock() - t) * 1000 / CLOCKS_PER_SEC << "ms" << std::endl;

    t = std::clock();
    mulmat2(A.data(), B.data(), C.data(), n);
    std::cout << "mulmat2执行时间:" << (std::clock() - t) * 1000 / CLOCKS_PER_SEC << "ms" << std::endl;
}
mulmat1执行时间:8125ms
mulmat2执行时间:1734ms