多线程的cache调优

原创已于 2023-10-08 09:44:42 修改 · 2.5k 阅读

8 ·

CC 4.0 BY-SA版权

文章标签：

#算法

于 2019-08-31 23:10:27 首次发布

linux 同时被 2 个专栏收录

63 篇文章

订阅专栏

编程

12 篇文章

订阅专栏

本文探讨了多线程程序中因Cache冲突导致性能下降的问题，通过实例对比单线程和多线程程序的性能差异，分析了L1 Cache miss的原因，并提供了两种解决方案：变量对齐和线程绑定。

一般使用多线程是希望它能带来比单线程更高的效率，但是事实上常常事与愿违：一个粗糙的多线程程序可能得到比单线程更差的性能。其中的原因可能是锁竞争，也可能是调度，本文重点讨论cache对多线程的影响。

举个简单的例子：

我的机器上CPU拓扑：

我有四个CPU，但是每两个CPU共享一级、二级cache，这一点对编程影响很大。cache line是64Byte。硬件上知道这些就好。

演示程序很简单，计算从0到1999999999（共2000000000个数）的和。

单线程程序：

//sig.c
#include<stdio.h>

/* Global accumulator: sum() adds into it and main() prints it. */
long long s=0;
void sum(long long num);
/* Single-threaded baseline: sums the 2000000000 integers starting at 0 and prints the total. */
int main() {
	sum(2000000000);
	printf("sum is %lld\n", s);
	return 0;
}

/* Adds every integer i with 0 <= i < num to the global s, one addition per loop iteration. */
void sum(long long num){
	for(long long i=0; i<num; i++)
		s+=i;
}

未经调优的多线程程序：

//mul_raw.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sched.h>
#include <pthread.h>

void* one(void*);
void* two(void*);
/* Two plain 8-byte accumulators declared side by side. According to the
 * article they end up adjacent in memory and therefore in the same 64-byte
 * cache line, which is exactly the case this untuned listing demonstrates. */
long long sum,sum1;


/* Runs the two half-range workers in parallel, then prints the combined total.
 * Returns 0 on success, 1 if a thread could not be created. */
int main(){
        pthread_t id1, id2;
        int rc;

        /* Joining a thread that was never created is undefined, so bail out early. */
        rc = pthread_create(&id1, NULL, one, NULL);
        if (rc != 0) {
                fprintf(stderr, "pthread_create failed: %d\n", rc);
                return 1;
        }
        rc = pthread_create(&id2, NULL, two, NULL);
        if (rc != 0) {
                fprintf(stderr, "pthread_create failed: %d\n", rc);
                pthread_join(id1, NULL);
                return 1;
        }
        pthread_join(id2, NULL);
        pthread_join(id1, NULL);
        /* Both workers are finished here, so the partial sums can be merged safely. */
        sum+=sum1;
        printf("sum is %lld\n", sum);
        printf("main answer is %lld\n", sum);
        return 0;
}

/* Worker 1: adds every integer in [0, 1000000000) to the global sum. */
void *one(void *arg){
        (void)arg;      /* unused */
        for(long long i=0; i<1000000000; i++)
                sum+=i;
        return NULL;
}

/* Worker 2: adds every integer in [1000000000, 2000000000) to the global sum1. */
void *two(void *arg){
        (void)arg;      /* unused */
        for(long long i=1000000000; i<2000000000; i++)
                sum1+=i;
        return NULL;
}
~

乍一看这个程序没啥毛病，我们希望两个线程的程序效率会是单线程的两倍，现在我们来验证一下。

编译一下：

gcc sig.c -o sig

gcc mul_raw.c -o mul_raw -lpthread

time ./sig

real   0m6.244s
user   0m6.240s
sys   0m0.000s

time ./mul_raw

real   0m13.941s
user   0m27.344s
sys   0m0.000s
这就奇了，明明我们多了一个线程，反而比单线程耗时多了一倍。这是什么缘故呢？

使用perf查看一下：

root@linux:~/apue/thread# perf stat ./mul_raw
sum is 1999999999000000000
main answer is 1999999999000000000

 Performance counter stats for './mul_raw':

      29919.198148      task-clock (msec)         #    1.996 CPUs utilized          
                48      context-switches          #    0.002 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
                52      page-faults               #    0.002 K/sec                  
    86,389,698,427      cycles                    #    2.887 GHz                      (83.33%)
    72,688,374,009      stalled-cycles-frontend   #   84.14% frontend cycles idle     (83.33%)
    59,983,962,423      stalled-cycles-backend    #   69.43% backend cycles idle      (66.67%)
    32,046,402,378      instructions              #    0.37  insn per cycle         
                                                  #    2.27  stalled cycles per insn  (83.34%)
     6,010,272,485      branches                  #  200.883 M/sec                    (83.34%)
           161,936      branch-misses             #    0.00% of all branches          (83.33%)

      14.993156022 seconds time elapsed

root@linux:~/apue/thread# perf stat ./sig
sum is 1999999999000000000

 Performance counter stats for './sig':

       6239.371664      task-clock (msec)         #    1.000 CPUs utilized          
                 5      context-switches          #    0.001 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
                37      page-faults               #    0.006 K/sec                  
    19,182,184,749      cycles                    #    3.074 GHz                      (83.33%)
     7,173,644,336      stalled-cycles-frontend   #   37.40% frontend cycles idle     (83.33%)
       486,244,240      stalled-cycles-backend    #    2.53% backend cycles idle      (66.67%)
    35,992,953,509      instructions              #    1.88  insn per cycle         
                                                  #    0.20  stalled cycles per insn  (83.33%)
     6,000,037,633      branches                  #  961.641 M/sec                    (83.33%)
            16,189      branch-misses             #    0.00% of all branches          (83.34%)

       6.240563368 seconds time elapsed

我们发现最大的不同在于stalled-cycles-frontend和stalled-cycles-backend，这两个值表示CPU流水线的前端和后端处于空闲（停顿）状态的周期数。空闲的周期多了，自然需要更多的cycle去完成剩下的任务。可是CPU的pipeline为什么会空闲呢？一个很大的原因是在等待数据到来，没有数据啥也干不了。

看一下cache miss情况：

root@linux:~/apue/thread# perf stat -e stalled-cycles-frontend -e instructions -e cache-references -e cache-misses -e L1-dcache-loads -e L1-dcache-load-misses -e L1-dcache-stores -e L1-dcache-store-misses -e LLC-loads -e LLC-load-misses -e LLC-prefetches -e cycles -e cs ./mul_raw
sum is 1999999999000000000
main answer is 1999999999000000000

 Performance counter stats for './mul_raw':

    61,941,521,437      stalled-cycles-frontend   #   78.29% frontend cycles idle     (36.37%)
    32,009,980,834      instructions              #    0.40  insn per cycle         
                                                  #    1.94  stalled cycles per insn  (45.46%)
       994,955,974      cache-references                                              (45.47%)
            51,309      cache-misses              #    0.005 % of all cache refs      (45.48%)
    18,064,894,757      L1-dcache-loads                                               (36.35%)
     1,021,561,094      L1-dcache-load-misses     #    5.65% of all L1-dcache hits    (27.25%)
     7,993,162,021      L1-dcache-stores                                              (18.18%)
           338,003      L1-dcache-store-misses                                        (18.18%)
       517,820,815      LLC-loads                                                     (18.19%)
   <not supported>      LLC-load-misses                                             
       313,489,826      LLC-prefetches                                                (27.28%)
    79,115,964,792      cycles                                                        (36.37%)
                18      cs                                                          

      13.967708372 seconds time elapsed

root@linux:~/apue/thread# perf stat -e stalled-cycles-frontend -e instructions -e cache-references -e cache-misses -e L1-dcache-loads -e L1-dcache-load-misses -e L1-dcache-stores -e L1-dcache-store-misses -e LLC-loads -e LLC-load-misses -e LLC-prefetches -e cycles -e cs ./sig
sum is 1999999999000000000

 Performance counter stats for './sig':

     7,184,096,512      stalled-cycles-frontend   #   37.45% frontend cycles idle     (36.28%)
    35,967,727,582      instructions              #    1.88  insn per cycle         
                                                  #    0.20  stalled cycles per insn  (45.38%)
            87,333      cache-references                                              (45.38%)
            20,731      cache-misses              #   23.738 % of all cache refs      (45.43%)
    21,976,499,282      L1-dcache-loads                                               (36.39%)
            78,642      L1-dcache-load-misses     #    0.00% of all L1-dcache hits    (27.24%)
     8,003,536,712      L1-dcache-stores                                              (18.24%)
            31,290      L1-dcache-store-misses                                        (18.22%)
            31,256      LLC-loads                                                     (18.21%)
   <not supported>      LLC-load-misses                                             
            21,116      LLC-prefetches                                                (27.27%)
    19,182,127,328      cycles                                                        (36.31%)
                 5      cs                                                          

       6.240371557 seconds time elapsed

可以明显看出多线程程序的L1 cache miss远大于单线程（L1-dcache-load-misses约10.2亿次，而单线程只有约7.9万次）。

可是数据在哪里呢？为什么没有按时取到呢？请看下图：

这是计算机通常的缓存结构，自上而下离CPU越来越远、容量越来越大、速度越来越慢。CPU最喜欢把数据放在离自己最近的地方，除了寄存器那就是L1 cache了，最近使用的数据都放在L1 cache里面方便下次取用，这就用到了时间局部性原理。在我们的程序中时间局部性满足得很好：我们把数据都放在sum和sum1中，每次访问的都是同样的地址，按理说不应该会cache miss，怎么解释这种看似违反直觉的事呢？

首先我们要知道这两个线程被调度到哪两个CPU上了，因为同一个core中的CPU共享L1 cache。通过top -H以及增选Last used cpu，经多次实验发现系统一直将两个线程分别调度到两个core中，也就是保持线程不共享L1 cache，可能这是Linux kernel的调度策略吧。由于sum和sum1在内存中的位置是连续的（可以通过readelf -s查看），而上面提到我机器的cache line是64B。cache line是cache中的最小单元，数据在cache和内存之间传递的最小单元就是cache line。因为sum和sum1定义在了一起，地址是相连的，而且长整型也只有8个字节，两个变量能放在同一个cache line中，从而在两个独立的L1 cache中各有一份拷贝。这会产生另外一个问题：线程1修改了sum，而线程2的cache并不知道这一切。这涉及到另外一个重要的原理：缓存一致性原理。什么意思呢？第一次看到计算机的缓存结构就会发现这里有个致命的问题，那就是数据是放在每一级cache中的，那么一致性怎么保证呢？我把新数据放在L1 cache，但是内存中的数据就过时了，这个时候有另外一个程序去读内存中的这个数据，不就读到过时的数据了么？因此计算机必须有一种机制能保证这种一致性：当有程序试图读取过时数据的时候，就要先把最新的数据同步过去。可以想象，当线程1更改了sum并放在L1 cache中（对于回写策略并不会马上写到内存中），那么这条cache line在其他cache中的拷贝都将变成无效的，也就是线程2的L1 cache需要去同步线程1的cache，尽管线程2真正关心的只是同一条cache line里的sum1。这种现象通常称为伪共享（false sharing）。这将浪费大量的cycle，而且几乎每一步都要去同步这个数据，cache miss就大大提高了，耗时也就上去了。

怎么避免这个问题呢？针对产生问题的两个原因有两种解决方案。第一种是将两个变量隔开，使其不在同一个cache line中，一个很土的办法是：将sum和sum1分别改为sum[8]和sum1[8]（各占64字节，只使用第0个元素），这样它们就不在同一个cache line中了。这一步所做的就是通常所讲的按cache line填充（padding）/对齐，而且这种方法与内核调度无关，只依赖cache line的大小，具有较好的可移植性。

//mul.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sched.h>
#include <pthread.h>

void* one(void*);
void* two(void*);
/* Only element 0 of each array is used; the other seven elements are padding.
 * With an 8-byte long long (as on the article's machine) each array is 64
 * bytes, so sum[0] and sum1[0] are at least one 64-byte cache line apart. */
long long sum[8],sum1[8];


/* Runs the two half-range workers in parallel, then prints the combined total.
 * Returns 0 on success, 1 if a thread could not be created. */
int main(){
        pthread_t id1, id2;
        int rc;

        /* Joining a thread that was never created is undefined, so bail out early. */
        rc = pthread_create(&id1, NULL, one, NULL);
        if (rc != 0) {
                fprintf(stderr, "pthread_create failed: %d\n", rc);
                return 1;
        }
        rc = pthread_create(&id2, NULL, two, NULL);
        if (rc != 0) {
                fprintf(stderr, "pthread_create failed: %d\n", rc);
                pthread_join(id1, NULL);
                return 1;
        }
        pthread_join(id2, NULL);
        pthread_join(id1, NULL);
        /* Both workers are finished here, so the partial sums can be merged safely. */
        sum[0]+=sum1[0];
        printf("sum is %lld\n", sum[0]);
        return 0;
}

/* Worker 1: adds every integer in [0, 1000000000) to sum[0].
 * Always returns NULL; the result is communicated through the global. */
void *one(void *arg){
        (void)arg;      /* unused */
        for(long long i=0; i<1000000000; i++)
                sum[0]+=i;
        return NULL;
}

/* Worker 2: adds every integer in [1000000000, 2000000000) to sum1[0].
 * Always returns NULL; the result is communicated through the global. */
void *two(void *arg){
        (void)arg;      /* unused */
        for(long long i=1000000000; i<2000000000; i++)
                sum1[0]+=i;
        return NULL;
}

重新编译，测一下时间

time ./mul

real   0m3.761s
user   0m7.508s
sys   0m0.004s
只需3秒多，基本达到单线程耗时一半的目标。

看看cache miss

Performance counter stats for './mul':

     9,585,835,790      stalled-cycles-frontend   #   44.43% frontend cycles idle     (36.40%)
    31,976,249,965      instructions              #    1.48  insn per cycle         
                                                  #    0.30  stalled cycles per insn  (45.54%)
           276,700      cache-references                                              (45.57%)
            47,672      cache-misses              #   17.229 % of all cache refs      (45.59%)
    18,033,054,134      L1-dcache-loads                                               (36.26%)
           249,984      L1-dcache-load-misses     #    0.00% of all L1-dcache hits    (27.03%)
     7,993,761,054      L1-dcache-stores                                              (18.21%)
            76,136      L1-dcache-store-misses                                        (18.25%)
            92,511      LLC-loads                                                     (18.23%)
   <not supported>      LLC-load-misses                                             
            15,447      LLC-prefetches                                                (27.32%)
    21,576,954,442      cycles                                                        (36.39%)
                21      cs                                                          

       3.753967271 seconds time elapsed

L1的cache miss果然大大降低。

第二个办法是将两个线程绑定在同一个core中，这样由于它们共享同一个L1 cache，同一条cache line只有一份拷贝，就不需要在不同的cache之间来回同步了。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sched.h>
#include <pthread.h>

void* one(void*);
void* two(void*);
/* Deliberately left unpadded: this variant relies on thread placement, not
 * on data layout, to avoid bouncing the shared cache line between caches. */
long long sum,sum1;


/* Starts the two workers, pins each one to a fixed logical CPU, then prints
 * the combined total. Returns 0 on success, 1 if a thread could not be created.
 *
 * pthread_setaffinity_np() is used because the handle is a pthread_t;
 * sched_setaffinity() expects a pid/tid and does not accept a pthread_t.
 * NOTE(review): CPUs 0 and 2 are assumed to be the two hardware threads of
 * one core on the author's machine -- confirm on yours, e.g. via
 * /sys/devices/system/cpu/cpu0/topology/thread_siblings_list. */
int main(){
	pthread_t id1, id2;
	cpu_set_t mask;
	int rc;

	rc = pthread_create(&id1, NULL, one, NULL);
	if (rc != 0) {
		fprintf(stderr, "pthread_create failed: %d\n", rc);
		return 1;
	}
	CPU_ZERO(&mask);    /* start from an empty CPU set */
	CPU_SET(0,&mask);
	rc = pthread_setaffinity_np(id1, sizeof(mask), &mask);
	if (rc != 0)
		fprintf(stderr, "pthread_setaffinity_np(cpu 0) failed: %d\n", rc);

	rc = pthread_create(&id2, NULL, two, NULL);
	if (rc != 0) {
		fprintf(stderr, "pthread_create failed: %d\n", rc);
		pthread_join(id1, NULL);
		return 1;
	}
	CPU_ZERO(&mask);    /* start from an empty CPU set */
	CPU_SET(2,&mask);
	rc = pthread_setaffinity_np(id2, sizeof(mask), &mask);
	if (rc != 0)
		fprintf(stderr, "pthread_setaffinity_np(cpu 2) failed: %d\n", rc);

	pthread_join(id2, NULL);
	pthread_join(id1, NULL);
	/* Both workers are finished here, so the partial sums can be merged safely. */
	sum+=sum1;
	printf("sum is %lld\n", sum);
	return 0;
}

/* Worker 1: adds every integer in [0, 1000000000) to the global sum.
 * Always returns NULL; the result is communicated through the global. */
void *one(void *arg){
	(void)arg;	/* unused */
	for(long long i=0; i<1000000000; i++)
		sum+=i;
	return NULL;
}

/* Worker 2: adds every integer in [1000000000, 2000000000) to the global sum1.
 * Always returns NULL; the result is communicated through the global. */
void *two(void *arg){
	(void)arg;	/* unused */
	for(long long i=1000000000; i<2000000000; i++)
		sum1+=i;
	return NULL;
}

编译时要加上-D_GNU_SOURCE。结果符合预期。这种方法需要针对机器优化，可移植性差。