kernel hacker修炼之道之内存管理-OOM Killer

最新推荐文章于 2022-08-16 18:11:10 发布

转载最新推荐文章于 2022-08-16 18:11:10 发布 · 1.5k 阅读

文章标签：

本文详细解析了Linux系统的OOM（Out of Memory）机制。介绍了当系统内存不足时如何选择并杀死进程来释放内存，包括关键函数out_of_memory的工作流程，以及如何评估进程的badness值来决定哪个进程被终止。

在系统内存不足的时候会回收页框，但是在这个过程中可能会发现，系统即使是以最高优先级扫描都无法释放足够的页面来满足请求。如果系统不能够释放页面，就会调用out_of_memory函数，告知系统内存已经耗尽（out of memory），这时就会杀死某个进程。在__alloc_pages函数中，当调用try_to_free_pages回收页框无效的时候，会调用out_of_memory杀死一个进程，待其释放所占有的page后，再重新尝试分配。

这个是out_of_memory的流程图，主要分为两部分，左侧这一部分主要是选择要杀死的进程，右边这一部分执行杀死操作。

[html]view plaincopy 
   
 /*
  * out_of_memory - pick a victim task and kill it to free memory.
  * @gfp_mask: allocation flags of the failed request; only printed here.
  *
  * Holds tasklist_lock for reading while a victim is selected and killed.
  * Panics when select_bad_process() finds no candidate at all.  Before
  * returning, the calling task naps briefly so the victim can start exiting.
  */
 256void out_of_memory(int gfp_mask)  
 257{  
 258        struct mm_struct *mm = NULL;  
 259        task_t * p;  
 260  
 261        read_lock(&tasklist_lock);  
 262retry:  
 263        p = select_bad_process();  
 264  
            /*
             * ERR_PTR(-1UL) from select_bad_process(): some task is already
             * flagged as dying, so kill nothing more and just wait below.
             */
 265        if (PTR_ERR(p) == -1UL)  
 266                goto out;  
 267  
 268        /* Found nothing?!?! Either we hang forever, or we panic. */  
 269        if (!p) {  
 270                read_unlock(&tasklist_lock);  
 271                show_free_areas();  
 272                panic("Out of memory and no killable processes...\n");  
 273        }  
 274  
 275        printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);  
 276        show_free_areas();  
 277        mm = oom_kill_process(p);  
            /* NULL mm: nothing was killed for this candidate, choose again. */
 278        if (!mm)  
 279                goto retry;  
 280  
 281 out:  
 282        read_unlock(&tasklist_lock);  
            /*
             * Release the mm handed back by oom_kill_process(); presumably it
             * carries a reference taken by get_task_mm() - confirm.
             */
 283        if (mm)  
 284                mmput(mm);  
 285  
 286        /*  
 287         * Give "p" a good chance of killing itself before we  
 288         * retry to allocate memory.  
 289         */  
 290        __set_current_state(TASK_INTERRUPTIBLE);  
            /* schedule_timeout() counts in jiffies: a one-tick nap, not one second. */
 291        schedule_timeout(1);  
 292}  

调用select_bad_process函数选择一个即将被杀死的进程
调用oom_kill_process函数杀死进程
如果在调用select_bad_process函数的时候返回-1，说明已经有进程被OOM Killer选中，等它死就行了
给被选中的进程一点儿时间：当前进程调用schedule_timeout(1)休眠一个时钟节拍（1个jiffy，而不是一秒），然后返回，由调用者重新尝试分配内存

下面看这个选择"best" process的函数：

[html]view plaincopy 
   
 /*
  * select_bad_process - walk the thread list and choose the OOM victim.
  *
  * Returns:
  *   ERR_PTR(-1UL) if a task with pid > 1 already has TIF_MEMDIE or
  *                 PF_EXITING set and is not yet PF_DEAD, so no new victim
  *                 is chosen;
  *   a task with PF_SWAPOFF set, immediately, if one is encountered;
  *   otherwise the task with the highest badness() score, or NULL when no
  *   task with pid > 1 was seen.
  *
  * The caller shown in this file, out_of_memory(), calls this with
  * tasklist_lock read-held.
  */
 138static struct task_struct * select_bad_process(void)  
 139{  
 140        unsigned long maxpoints = 0;  
 141        struct task_struct *g, *p;  
 142        struct task_struct *chosen = NULL;  
 143        struct timespec uptime;  
 144  
            /* Only uptime.tv_sec is used, as the "now" passed to badness(). */
 145        do_posix_clock_monotonic_gettime(&uptime);  
 146        do_each_thread(g, p)  
 147                /* skip the init task with pid == 1 */  
 148                if (p->pid > 1) {  
 149                        unsigned long points;  
 150  
 151                        /*  
 152                         * This is in the process of releasing memory so wait it  
 153                         * to finish before killing some other task by mistake.  
 154                         */  
 155                        if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&  
 156                            !(p->flags & PF_DEAD))  
 157                                return ERR_PTR(-1UL);  
                            /* A PF_SWAPOFF task is returned at once, without scoring. */
 158                        if (p->flags & PF_SWAPOFF)  
 159                                return p;  
 160  
 161                        points = badness(p, uptime.tv_sec);  
                            /*
                             * "!chosen" lets the first candidate win even with a
                             * score of 0, so NULL is returned only when there was
                             * no candidate at all.
                             */
 162                        if (points > maxpoints || !chosen) {  
 163                                chosen = p;  
 164                                maxpoints = points;  
 165                        }  
 166                }  
 167        while_each_thread(g, p);  
 168        return chosen;  
 169}  

跳过init进程
如果进程的TIF_MEMDIE标志被设置（表示进程已经被OOM Killer机制选中），或者PF_EXITING标志被设置（表示进程正在退出），并且PF_DEAD标志尚未被设置（即进程还没有彻底死亡），则返回ERR_PTR(-1)，这样告诉调用out_of_memory函数的进程：已经有进程正在释放内存，等一下再重新分配就好
如果进程的PF_SWAPOFF标志被设置，表示那个进程调用了sys_swapoff函数，这个函数迫使进程所有驻留在swap中的page进入RAM中，并设置相应的页表，所以这个进程直接被返回，被选中杀死
如果进程没有设置上述标志，则调用badness为它打分，最终选出得分最高、最该杀死的进程。什么是最该杀死的呢？它选择的是使用了最大量内存而又没有生存很久的进程

看badness函数实现：

[html]view plaincopy 
   
  45unsigned long badness(struct task_struct *p, unsigned long uptime)  
  46{  
  47        unsigned long points, cpu_time, run_time, s;  
  48        struct list_head *tsk;  
  49  
  50        if (!p->mm)  
  51                return 0;  
  52  
  53        /*  
  54         * The memory size of the process is the basis for the badness.  
  55         */  
  56        ppoints = p->mm->total_vm;  
  57  
  58        /*  
  59         * Processes which fork a lot of child processes are likely  
  60         * a good choice. We add the vmsize of the childs if they  
  61         * have an own mm. This prevents forking servers to flood the  
  62         * machine with an endless amount of childs  
  63         */  
  64        list_for_each(tsk, &p->children) {  
  65                struct task_struct *chld;  
  66                chld = list_entry(tsk, struct task_struct, sibling);  
  67                if (chld->mm != p->mm && chld->mm)  
  68                        points += chld->mm->total_vm;  
  69        }  
  70  
  71        /*  
  72         * CPU time is in tens of seconds and run time is in thousands  
  73         * of seconds. There is no particular reason for this other than  
  74         * that it turned out to work very well in practice.  
  75         */  
  76        cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))  
  77                >> (SHIFT_HZ + 3);  
  78  
  79        if (uptime >= p->start_time.tv_sec)  
  80                run_time = (uptime - p->start_time.tv_sec) >> 10;  
  81        else  
  82                run_time = 0;  
  83  
  84        s = int_sqrt(cpu_time);  
  85        if (s)  
  86                points /= s;  
  87        s = int_sqrt(int_sqrt(run_time));  
  88        if (s)  
  89                points /= s;  
  90  
  91        /*  
  92         * Niced processes are most likely less important, so double  
  93         * their badness points.  
  94         */  
  95        if (task_nice(p) > 0)  
  96                points *= 2;  
  97  
  98        /*  
  99         * Superuser processes are usually more important, so we make it  
 100         * less likely that we kill those.  
 101         */  
 102        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||  
 103                                p->uid == 0 || p->euid == 0)  
 104                points /= 4;  
 105  
 106        /*  
 107         * We don't want to kill a process with direct hardware access.  
 108         * Not only could that mess up the hardware, but usually users  
 109         * tend to only have this flag set on applications they think  
 110         * of as important.  
 111         */  
 112        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))  
 113                points /= 4;  
 114  
 115        /*  
 116         * Adjust the score by oomkilladj.  
 117         */  
 118        if (p->oomkilladj) {  
 119                if (p->oomkilladj > 0)  
 120                        points <<= p->oomkilladj;  
 121                else  
 122                        points >>= -(p->oomkilladj);  
 123        }  
 124  
 125#ifdef DEBUG  
 126        printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",  
 127        p->pid, p->comm, points);  
 128#endif  
 129        return points;  
 130}  

获得这个进程拥有的内存size，记为权重
遍历这个进程的子进程，如果它们有自己的内存空间，则增加权重vmsize，防止一个进程创建无限的子进程占用内存
获得占用cpu的时间cpu_time，占有CPU时间越多，即越忙越可能生存
获得运行的时间run_time，运行时间越久越可能生存
如果nice值大于0，说明静态优先级很低，你越nice，越会被杀掉，权重×2
CAP_SYS_ADMIN，或者uid/euid为0：超级用户（管理员）的程序通常更重要，要尽量保留下来，权重/4
CAP_SYS_RAWIO，如果进程有直接访问硬件（原始I/O）的能力，尽量保留，权重/4
使用p->oomkilladj调节一下权重

至此，已经找到bad points最大的进程了，主要是占用内存大，运行时间短，比较空闲的。下面分析另一半，杀死进程，看oom_kill_process函数：

[html]view plaincopy 
   
 /*
  * oom_kill_process - kill a child of @p if possible, otherwise @p itself.
  * @p: task chosen by select_bad_process()
  *
  * Children that share @p's mm are skipped.  Returns the mm_struct returned
  * by the first oom_kill_task() call that succeeds, or NULL if neither a
  * child nor @p could be killed.
  */
 230static struct mm_struct *oom_kill_process(struct task_struct *p)  
 231{  
 232        struct mm_struct *mm;  
 233        struct task_struct *c;  
 234        struct list_head *tsk;  
 235  
 236        /* Try to kill a child first */  
 237        list_for_each(tsk, &p->children) {  
 238                c = list_entry(tsk, struct task_struct, sibling);  
                    /* Skip children that share the parent's mm. */
 239                if (c->mm == p->mm)  
 240                        continue;  
 241                mm = oom_kill_task(c);  
 242                if (mm)  
 243                        return mm;  
 244        }  
            /* No child was killed: fall back to the selected task itself. */
 245        return oom_kill_task(p);  
 246}  

尝试杀死找到的bad points最大的进程的子进程，如果那个子进程有与父进程不同的内存则杀了子进程，否则杀死父进程。

[html]view plaincopy 
   
 /*
  * oom_kill_task - kill @p and every task in another thread group that
  * shares its mm.
  * @p: task to kill
  *
  * Returns the mm obtained from get_task_mm(p); out_of_memory() later calls
  * mmput() on it.  Returns NULL, killing nothing, when @p has no mm or its
  * mm is init_mm.
  */
 205static struct mm_struct *oom_kill_task(task_t *p)  
 206{  
 207        struct mm_struct *mm = get_task_mm(p);  
 208        task_t * g, * q;  
 209  
 210        if (!mm)  
 211                return NULL;  
            /* init_mm is never a valid victim: give the mm back and report failure. */
 212        if (mm == &init_mm) {  
 213                mmput(mm);  
 214                return NULL;  
 215        }  
 216  
 217        __oom_kill_task(p);  
 218        /*  
 219         * kill all processes that share the ->mm (i.e. all threads),  
 220         * but are in a different thread group  
 221         */  
 222        do_each_thread(g, q)  
 223                if (q->mm == mm && q->tgid != p->tgid)  
 224                        __oom_kill_task(q);  
 225        while_each_thread(g, q);  
 226  
 227        return mm;  
 228}  

不能杀init进程。杀死选中的进程和所有的共享那个进程mm的线程，并且这些线程在不同的线程组。主要调用__oom_kill_task函数，进行实际的杀死操作。

[html]view plaincopy 
   
 /*
  * __oom_kill_task - deliver the actual kill to @p.
  * @p: task to kill
  *
  * Warns and returns without killing when @p is init (pid 1), has no mm,
  * or uses init_mm.  Otherwise logs the kill, sets the task's time_slice
  * to HZ, sets TIF_MEMDIE and forces SIGKILL on it.
  */
 176static void __oom_kill_task(task_t *p)  
 177{  
 178        if (p->pid == 1) {  
 179                WARN_ON(1);  
 180                printk(KERN_WARNING "tried to kill init!\n");  
 181                return;  
 182        }  
 183  
            /* p->mm is only inspected while task_lock(p) is held. */
 184        task_lock(p);  
 185        if (!p->mm || p->mm == &init_mm) {  
 186                WARN_ON(1);  
 187                printk(KERN_WARNING "tried to kill an mm-less task!\n");  
 188                task_unlock(p);  
 189                return;  
 190        }  
 191        task_unlock(p);  
 192        printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);  
 193  
 194        /*  
 195         * We give our sacrificial lamb high priority and access to  
 196         * all the memory it needs. That way it should be able to  
 197         * exit() and clear out its resources quickly...  
 198         */  
 199        p->time_slice = HZ;  
            /* select_bad_process() tests TIF_MEMDIE to detect a kill already in progress. */
 200        set_tsk_thread_flag(p, TIF_MEMDIE);  
 201  
 202        force_sig(SIGKILL, p);  
 203}  

杀那个进程之前先让他把该干的事干完，给它时间片和很高的优先级。然后发送SIGKILL信号，让它去死吧。

[html]view plaincopy 
   
 /*
  * force_sig - force signal @sig onto task @p.
  * @sig: signal number
  * @p:   target task
  *
  * Thin wrapper around force_sig_info().
  * NOTE(review): the (void*)1L passed as the info argument is a sentinel,
  * not a real pointer; presumably it marks a kernel-generated signal -
  * confirm against force_sig_info() in kernel/signal.c.
  */
 1268void  
 1269force_sig(int sig, struct task_struct *p)  
 1270{  
 1271        force_sig_info(sig, (void*)1L, p);  
 1272}  

总结一下，当分配内存，内存不足的时候，会以高优先级调用回收函数，如果还无法回收足够的内存，只能找一个既占内存，又比较空闲的进程杀死，在它死前给它比较高的优先级和时间片，让他把该干的事干一下。杀死后，可以释放出大量内存。

下面是一个用来触发OOM Killer的示例程序：

[html]view plaincopy 
   
 #include <stdio.h>  
 #include <stdlib.h>  
 #include <string.h>  
   
 /*
  * Memory hog used to provoke the OOM killer.
  *
  * Repeatedly allocates 100 MB and writes zeros over all of it.  The blocks
  * are deliberately never freed: the program is meant to keep growing until
  * the kernel kills it.
  *
  * Returns 1 if malloc() itself reports failure; otherwise it never returns
  * normally.
  */
 int main(void)
 {
     enum { CHUNK_BYTES = 1024 * 1024 * 100 };
     void *p;

     while (1)
     {
         p = malloc(CHUNK_BYTES);
         if (p == NULL)
         {
             /* Passing NULL to memset() would be undefined behaviour. */
             fprintf(stderr, "malloc failed\n");
             return 1;
         }
         memset(p, 0, CHUNK_BYTES);
         printf("100MB memory has been allocated!\n");
     }
     return 0;
 }