根据
7个示例科普CPU CACHE
测试缓存着色
E3 1230v2, L2缓存:4* 256KB 8way (1MB)
cacheline 64B,页大小64KB,1页含有1024cacheline。
set数量=1MB/64/8=2048.
根据文中所述,2048个set×64B=128KB(理解不够深,不晓得算对没有;注意4×256KB是每个核心私有256KB的L2,若按单核256KB计算,则set数量为256KB/64/8=512,对应的步长为512×64B=32KB)
每128K的物理内存会映射到同一个cache slot上竞争,这个slot容量是8.
// Base size, in bytes, of the benchmark buffers (1 MiB).
enum { BufferSize = 1 << 20 };
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <type_traits>
#include <vector>
#include <windows.h>
#include <intrin.h>
// 8 MiB scratch area that clear_cache() overwrites before every measurement.
char clearcachebuff[8 * 1024 * 1024];
// Buffers walked by the benchmark; both start empty and are resized in main().
std::vector<char> buff1{};
std::vector<char> buff2{};
// Fills the entire clearcachebuff scratch buffer with the byte value `i`.
// The intent (per the function name) is that writing 8 MiB displaces whatever
// the caches held before the next measurement.
// The time-stamp counter is sampled around the fill; the elapsed value is only
// consumed by the disabled debug print at the end.
void clear_cache(int i)
{
    unsigned int aux = 0;
    const auto start = __rdtscp(&aux);
    memset(clearcachebuff, i, sizeof(clearcachebuff));
    const auto elapsed = __rdtscp(&aux) - start;
    (void)elapsed; // kept only for the debug print below
    //printf(" clear cache %d\n", (int)elapsed / 1024);
}
// Times a strided read-modify-write walk over `buff`.
//
// Calls clear_cache() first, then performs 1024 * 1024 byte increments. Step i
// increments buff[p + (i & 31)], after which p advances by K and wraps to 0
// once it leaves the valid range.
//
// Parameters:
//   buff - buffer to walk; its bytes are modified.
//   K    - stride in bytes between consecutive accesses; must be >= 0.
// Returns the elapsed time-stamp-counter ticks reported by __rdtscp, converted
// to long. Returns 0 without touching anything when `buff` is empty or K < 0.
static long UpdateEveryKthByte(std::vector<char>& buff, int K)
{
    if (buff.empty() || K < 0) return 0;

    const std::size_t size = buff.size();
    // & 31 adds a varying offset so the CPU cannot guess a fixed stride.
    // Buffers shorter than 32 bytes cannot hold that offset, so it is disabled.
    const std::size_t jitterMask = size >= 32 ? 31 : 0;
    // p must stay below this bound so that p + jitterMask is still inside the
    // buffer; the original wrapped at `size` and wrote up to 31 bytes past the
    // end. Power-of-two strides >= 32 that divide the size are unaffected.
    const std::size_t limit = size - jitterMask;
    const std::size_t stride = static_cast<std::size_t>(K);
    const int rep = 1024 * 1024; // Number of iterations – arbitrary

    clear_cache(rand());
    unsigned int junk = 0;
    const auto t = __rdtscp(&junk);
    char* parr = buff.data();
    std::size_t p = 0;
    for (int i = 0; i < rep; i++)
    {
        parr[p + (static_cast<std::size_t>(i) & jitterMask)]++;
        p += stride;
        //_mm_prefetch(parr + p + K, _MM_HINT_T0);
        if (p >= limit) p = 0;
    }
    return static_cast<long>(__rdtscp(&junk) - t);
}
// Runs UpdateEveryKthByte(buff, jump) three times and prints each result
// divided by 1024 in a 6-wide column.
static void PrintTimings(std::vector<char>& buff, int jump)
{
    for (int run = 0; run < 3; ++run)
    {
        // The function returns long; convert explicitly so the argument
        // matches the %d conversion on every platform.
        printf(" %6d", static_cast<int>(UpdateEveryKthByte(buff, jump) / 1024));
    }
}

// Prints one output row: the jump value followed by three timings for buff1
// and three for buff2. Rows for exact power-of-two jumps are marked with '*'.
static void PrintJumpRow(int jump, bool powerOfTwo)
{
    printf("%cjump:%7d Buff1:", powerOfTwo ? '*' : ' ', jump);
    PrintTimings(buff1, jump);
    printf(" %sBuff2:", powerOfTwo ? "*" : "");
    PrintTimings(buff2, jump);
    puts("");
}

// Sizes the two buffers (BufferSize and 2 * BufferSize bytes), prints a header,
// then for every power of two i from 1 to 128 * 1024 prints three rows:
// a stride slightly below i (i * 53 / 64), i itself, and slightly above
// (i * 66 / 64).
int main()
{
    buff1.resize(BufferSize);
    buff2.resize(BufferSize * 2);
    // size() yields size_t; convert explicitly to match the %d conversions.
    printf("每jump数值读取内存, 对于每个jump测试3次,打印相对耗时。\n缓冲区%dMB和%dMB\n",
           static_cast<int>(buff1.size() / 1024 / 1024),
           static_cast<int>(buff2.size() / 1024 / 1024));
    for (int i = 1; i <= 1024 * 128; i *= 2)
    {
        PrintJumpRow(i * 53 / 64, false);
        PrintJumpRow(i, true);
        PrintJumpRow(i * 66 / 64, false);
    }
}
输出为(除了jump,其他数值的单位均为1024个cpu周期,约一千周期,比如4924代表约4924千周期;循环共1024×1024步,可知循环每步约消耗4924/1024≈4.8个cpu周期):
每jump数值读取内存, 对于每个jump测试3次,打印相对耗时。
缓冲区1MB和2MB
jump: 0 Buff1: 4924 4737 4717 Buff2: 4747 5049 5095
*jump: 1 Buff1: 4691 4693 4725 *Buff2: 4695 4690 5533
jump: 1 Buff1: 4890 4833 5045 Buff2: 4806 4733 4786
jump: 1 Buff1: 4817 5367 4704 Buff2: 4834 4989 4926
*jump: 2 Buff1: 4911 4821 5086 *Buff2: 4698 4699 4746
jump: 2 Buff1: 4641 4746 4718 Buff2: 4696 4843 4703
jump: 3 Buff1: 4771 4674 4687 Buff2: 4667 4730 4763
*jump: 4 Buff1: 4648 5483 4685 *Buff2: 5291 4709 4727
jump: 4 Buff1: 4842 4616 5001 Buff2: 4780 4644 4676
jump: 6 Buff1: 4898 4744 4754 Buff2: 5178 4707 4877
*jump: 8 Buff1: 4657 5065 4697 *Buff2: 5254 5350 4795
jump: 8 Buff1: 4773 4722 4656 Buff2: 4669 5181 4716
jump: 13 Buff1: 4985 4979 5036 Buff2: 4908 4891 4968
*jump: 16 Buff1: 4860 4849 4697 *Buff2: 4954 7003 5111
jump: 16 Buff1: 4981 4787 4672 Buff2: 5042 5208 5223
jump: 26 Buff1: 6065 5175 5414 Buff2: 6464 5683 5388
*jump: 32 Buff1: 6336 5598 5558 *Buff2: 6516 5948 5612
jump: 33 Buff1: 6318 6193 6260 Buff2: 6774 6175 5883
jump: 53 Buff1: 8281 7958 7894 Buff2: 11367 8027 8106
*jump: 64 Buff1: 9686 8846 8730 *Buff2: 11110 10006 8855
jump: 66 Buff1: 10131 8895 8684 Buff2: 10781 9173 8875
jump: 106 Buff1: 10278 9601 9207 Buff2: 10900 9352 9578
*jump: 128 Buff1: 9119 9045 9015 *Buff2: 9488 9176 9863
jump: 132 Buff1: 8961 8642 8742 Buff2: 9445 9111 9324
jump: 212 Buff1: 8046 8056 8804 Buff2: 9760 9128 9251
*jump: 256 Buff1: 8891 8834 10498 *Buff2: 9162 8945 9046
jump: 264 Buff1: 7737 7353 8047 Buff2: 8935 9660 9196
jump: 424 Buff1: 6245 6510 6811 Buff2: 7670 7182 7415
*jump: 512 Buff1: 9068 9058 8821 *Buff2: 8732 9195 10241
jump: 528 Buff1: 6410 6150 6462 Buff2: 6548 6594 6360
jump: 848 Buff1: 6173 6786 6586 Buff2: 6495 5980 6028
*jump: 1024 Buff1: 10959 12582 11729 *Buff2: 11751 11693 11527
jump: 1056 Buff1: 6233 6147 6200 Buff2: 6222 6165 6340
jump: 1696 Buff1: 6628 6123 6132 Buff2: 6350 6230 6237
*jump: 2048 Buff1: 14240 14012 14062 *Buff2: 15153 14452 14133
jump: 2112 Buff1: 5617 5028 5447 Buff2: 6621 6216 6076
jump: 3392 Buff1: 5096 5076 6490 Buff2: 6171 6510 6169
*jump: 4096 Buff1: 17242 17716 17222 *Buff2: 17829 17342 17223
jump: 4224 Buff1: 5324 5068 5045 Buff2: 6234 6274 6088
jump: 6784 Buff1: 5048 5010 5060 Buff2: 6077 6406 6149
*jump: 8192 Buff1: 17002 17483 17315 *Buff2: 19475 17946 18244
jump: 8448 Buff1: 4987 5112 5307 Buff2: 6118 6176 6180
jump: 13568 Buff1: 4892 4861 4934 Buff2: 6132 6237 6323
*jump: 16384 Buff1: 16980 17361 17246 *Buff2: 19211 18657 18046
jump: 16896 Buff1: 4690 4690 4673 Buff2: 6064 6224 6067
jump: 27136 Buff1: 4690 4647 4698 Buff2: 5820 5620 6144
*jump: 32768 Buff1: 11719 11759 11568 *Buff2: 17638 17688 17567
jump: 33792 Buff1: 8039 8120 8035 Buff2: 10058 10515 9967
jump: 54272 Buff1: 4673 5022 4724 Buff2: 8816 7979 8715
*jump: 65536 Buff1: 13166 12643 13971 *Buff2: 11876 11878 11797
jump: 67584 Buff1: 4709 5107 4633 Buff2: 8936 10220 8958
jump: 108544 Buff1: 4836 4718 4890 Buff2: 7238 7141 6939
*jump: 131072 Buff1: 5015 5015 5015 *Buff2: 14106 13760 13943
jump: 135168 Buff1: 4909 5018 4674 Buff2: 10131 9988 9293
每隔jump字节访问一次内存。从输出容易看出,当jump为2的整数次幂(尤其是不小于512)时,性能都很慢,这是因为这样访问的内存地址很容易竞争同一个cache slot,导致之前的缓存被挤出而失效。
131072=128K=1/8 L2,对1MB的Buff1来说,整个循环中正好只会访问到8个位置(各占1条cacheline),而L2是8way的,因此全部放入同一个slot也不会溢出,所以性能极高(循环每步约4.9周期)。对于结果中出现的约19周期,猜测是L3的功效,否则每步耗时将高达45周期。
这告诉我们什么?如果内存中有512KB的大型数据结构×32个实例,如
// Illustrative large object: a small int field at offset 0 followed by a byte
// array of 512 KiB minus 1000 bytes.
struct MyArray
{
    int usedSize{0};                  // presumably the number of bytes of buff in use
    char buff[(512 * 1024) - 1000];   // payload storage
};
每当访问这个大型数据结构时,往往要访问usedSize,而512KB的大型数据结构通常对齐到页首。
如此顺序访问32个MyArray的usedSize时,每访问一个usedSize,就会将另一个实例的usedSize踢出缓存。1MB大的L2缓存的同一个slot容不下第9个usedSize。
解决方案:故意让usedSize不落在页首,而是放在页首 + 随机值 × 64字节的偏移处?
另一方面反向应用:将8个实例全部对齐到一个slot中,而安排其他内存尽量不映射到这个slot或顺序遍历内存,这样从概率上更不容易将这8个实例踢出缓存?