/* 通过矩阵乘法性能的测量,理解Cache的作用
 * (Understanding the role of the cache by measuring matrix-multiplication performance) */
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

/* Code to clear cache */
/* Core i7 has 8M L3 cache, which is 1M longs */
#define ASIZE (1 << 20)
/* Cache block size is 64 bytes */
#define STRIDE 8

/*
 * Buffer walked by clear().  It is volatile because nothing ever writes to
 * it: without the qualifier a compiler may treat every element as a known
 * zero and delete the loads that are the whole point of clear().
 */
static volatile long stuff[ASIZE];
static long sink;

/*
 * Read one element out of every STRIDE elements of `stuff` and fold the
 * values into `sink`, so that the buffer is pulled through the cache and
 * previously cached data is displaced.
 */
static void clear(void)
{
    long x = sink;

    for (int i = 0; i < ASIZE; i += STRIDE) {
        x += stuff[i];
    }
    sink = x;
}

/*
 * Time the i-j-k ordering of the N x N matrix product C += A * B.
 *
 * The matrices are allocated here as arrays of row pointers; A and B are
 * filled with 1.0 and C with 0.0.  The product is computed 10 times.  For
 * each run the elapsed clock() ticks divided by N*N (the number of times the
 * innermost loop is entered) is printed, followed by "average is <mean>".
 *
 * In the innermost loop k varies, so A[i][k] is read along a row while
 * B[k][j] is read down a column (a different row allocation on every step).
 *
 * Returns the mean of the 10 per-run values, 0.0 when N <= 0 (nothing is
 * measured), or -1.0 when memory allocation fails.
 */
double ijk(int N)
{
    enum { TRIALS = 10 };
    double per_trial[TRIALS];
    double total = 0.0;
    double **A, **B, **C;
    size_t n;
    int i, j, k, t;
    int ok = 0;

    if (N <= 0) {
        return 0.0;
    }
    n = (size_t)N;

    A = malloc(n * sizeof *A);
    B = malloc(n * sizeof *B);
    C = malloc(n * sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        free(A);
        free(B);
        free(C);
        fprintf(stderr, "ijk: memory allocation failed for N=%d\n", N);
        return -1.0;
    }

    /* Null every row pointer first so the cleanup below can free all rows
     * unconditionally even if a row allocation fails part-way through. */
    for (i = 0; i < N; i++) {
        A[i] = NULL;
        B[i] = NULL;
        C[i] = NULL;
    }

    for (i = 0; i < N; i++) {
        A[i] = malloc(n * sizeof **A);
        B[i] = malloc(n * sizeof **B);
        C[i] = malloc(n * sizeof **C);
        if (A[i] == NULL || B[i] == NULL || C[i] == NULL) {
            goto out;
        }
        for (j = 0; j < N; j++) {
            A[i][j] = 1.0;
            B[i][j] = 1.0;
            C[i][j] = 0.0;
        }
    }

    for (t = 0; t < TRIALS; t++) {
        clock_t c1 = clock();

        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                double sum = 0.0;

                for (k = 0; k < N; k++) {
                    sum += A[i][k] * B[k][j];
                }
                C[i][j] += sum;
            }
        }

        clock_t c2 = clock();

        /* Divide in double so N * N cannot overflow int. */
        per_trial[t] = (double)(c2 - c1) / ((double)N * (double)N);
    }

    for (t = 0; t < TRIALS; t++) {
        total += per_trial[t];
        printf("%lf ", per_trial[t]);
    }
    printf("average is %lf", total / TRIALS);
    ok = 1;

out:
    for (i = 0; i < N; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
    }
    free(A);
    free(B);
    free(C);

    if (!ok) {
        fprintf(stderr, "ijk: memory allocation failed for N=%d\n", N);
        return -1.0;
    }
    return total / TRIALS;
}
/*
 * Time the k-i-j ordering of the N x N matrix product C += A * B.
 *
 * The matrices are allocated here as arrays of row pointers; A and B are
 * filled with 1.0 and C with 0.0.  The product is computed 10 times.  For
 * each run the elapsed clock() ticks divided by N*N (the number of times the
 * innermost loop is entered) is printed, followed by "average is <mean>".
 *
 * In the innermost loop j varies while A[i][k] is held in a local, so both
 * B[k][j] and C[i][j] are accessed along a row.
 *
 * Returns the mean of the 10 per-run values, 0.0 when N <= 0 (nothing is
 * measured), or -1.0 when memory allocation fails.
 */
double kij(int N)
{
    enum { TRIALS = 10 };
    double per_trial[TRIALS];
    double total = 0.0;
    double **A, **B, **C;
    size_t n;
    int i, j, k, t;
    int ok = 0;

    if (N <= 0) {
        return 0.0;
    }
    n = (size_t)N;

    A = malloc(n * sizeof *A);
    B = malloc(n * sizeof *B);
    C = malloc(n * sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        free(A);
        free(B);
        free(C);
        fprintf(stderr, "kij: memory allocation failed for N=%d\n", N);
        return -1.0;
    }

    /* Null every row pointer first so the cleanup below can free all rows
     * unconditionally even if a row allocation fails part-way through. */
    for (i = 0; i < N; i++) {
        A[i] = NULL;
        B[i] = NULL;
        C[i] = NULL;
    }

    for (i = 0; i < N; i++) {
        A[i] = malloc(n * sizeof **A);
        B[i] = malloc(n * sizeof **B);
        C[i] = malloc(n * sizeof **C);
        if (A[i] == NULL || B[i] == NULL || C[i] == NULL) {
            goto out;
        }
        for (j = 0; j < N; j++) {
            A[i][j] = 1.0;
            B[i][j] = 1.0;
            C[i][j] = 0.0;
        }
    }

    for (t = 0; t < TRIALS; t++) {
        clock_t c1 = clock();

        for (k = 0; k < N; k++) {
            for (i = 0; i < N; i++) {
                /* A[i][k] does not depend on j: load it once per (k, i). */
                double r = A[i][k];

                for (j = 0; j < N; j++) {
                    C[i][j] += r * B[k][j];
                }
            }
        }

        clock_t c2 = clock();

        /* Divide in double so N * N cannot overflow int. */
        per_trial[t] = (double)(c2 - c1) / ((double)N * (double)N);
    }

    for (t = 0; t < TRIALS; t++) {
        total += per_trial[t];
        printf("%lf ", per_trial[t]);
    }
    printf("average is %lf", total / TRIALS);
    ok = 1;

out:
    for (i = 0; i < N; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
    }
    free(A);
    free(B);
    free(C);

    if (!ok) {
        fprintf(stderr, "kij: memory allocation failed for N=%d\n", N);
        return -1.0;
    }
    return total / TRIALS;
}
/*
 * Time the j-k-i ordering of the N x N matrix product C += A * B.
 *
 * The matrices are allocated here as arrays of row pointers; A and B are
 * filled with 1.0 and C with 0.0.  The product is computed 10 times.  For
 * each run the elapsed clock() ticks divided by N*N (the number of times the
 * innermost loop is entered) is printed, followed by "average is <mean>".
 *
 * In the innermost loop i varies while B[k][j] is held in a local, so both
 * A[i][k] and C[i][j] are accessed down a column (a different row allocation
 * on every step).
 *
 * Returns the mean of the 10 per-run values, 0.0 when N <= 0 (nothing is
 * measured), or -1.0 when memory allocation fails.
 */
double jki(int N)
{
    enum { TRIALS = 10 };
    double per_trial[TRIALS];
    double total = 0.0;
    double **A, **B, **C;
    size_t n;
    int i, j, k, t;
    int ok = 0;

    if (N <= 0) {
        return 0.0;
    }
    n = (size_t)N;

    A = malloc(n * sizeof *A);
    B = malloc(n * sizeof *B);
    C = malloc(n * sizeof *C);
    if (A == NULL || B == NULL || C == NULL) {
        free(A);
        free(B);
        free(C);
        fprintf(stderr, "jki: memory allocation failed for N=%d\n", N);
        return -1.0;
    }

    /* Null every row pointer first so the cleanup below can free all rows
     * unconditionally even if a row allocation fails part-way through. */
    for (i = 0; i < N; i++) {
        A[i] = NULL;
        B[i] = NULL;
        C[i] = NULL;
    }

    for (i = 0; i < N; i++) {
        A[i] = malloc(n * sizeof **A);
        B[i] = malloc(n * sizeof **B);
        C[i] = malloc(n * sizeof **C);
        if (A[i] == NULL || B[i] == NULL || C[i] == NULL) {
            goto out;
        }
        for (j = 0; j < N; j++) {
            A[i][j] = 1.0;
            B[i][j] = 1.0;
            C[i][j] = 0.0;
        }
    }

    for (t = 0; t < TRIALS; t++) {
        clock_t c1 = clock();

        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) {
                /* B[k][j] does not depend on i: load it once per (j, k). */
                double r = B[k][j];

                for (i = 0; i < N; i++) {
                    C[i][j] += A[i][k] * r;
                }
            }
        }

        clock_t c2 = clock();

        /* Divide in double so N * N cannot overflow int. */
        per_trial[t] = (double)(c2 - c1) / ((double)N * (double)N);
    }

    for (t = 0; t < TRIALS; t++) {
        total += per_trial[t];
        printf("%lf ", per_trial[t]);
    }
    printf("average is %lf", total / TRIALS);
    ok = 1;

out:
    for (i = 0; i < N; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
    }
    free(A);
    free(B);
    free(C);

    if (!ok) {
        fprintf(stderr, "jki: memory allocation failed for N=%d\n", N);
        return -1.0;
    }
    return total / TRIALS;
}

/*
 * Run the three loop orderings for N = 100, 200, ..., 1000 and print the
 * timings each one reports.  Each timing function allocates and initialises
 * its own matrices, so none are created here.
 */
int main(void)
{
    for (int n = 100; n <= 1000; n += 100) {
        printf("\n");
        printf("dang N wei %d shi\n", n);

        printf("ijk matrix mult time :");
        clear(); /* displace cached data before each measurement */
        ijk(n);
        printf("\n");

        printf("kij matrix mult time :");
        clear();
        kij(n);
        printf("\n");

        printf("jki matrix mult time :");
        clear();
        jki(n);
        printf("\n");
    }
    return 0;
}