Analysis of the OOM mechanism


Kernel version: openEuler 5.10.0

The function that handles OOM is out_of_memory(); it is called whenever the kernel cannot satisfy a memory allocation request.

bool out_of_memory(struct oom_control *oc)
{
        unsigned long freed = 0;

        if (oom_killer_disabled)//the OOM killer is disabled, nothing to do
                return false;

        if (!sysctl_enable_oom_killer) {//OOM killer disabled via sysctl: just run the registered notifier chain and return
                oom_type_notifier_call(0, oc);
                return false;
        }

        if (!is_memcg_oom(oc)) {//not a memcg OOM: let the notifier callbacks try to free some memory; if anything was freed, return -- it may already be enough for the allocation, so no further handling
                blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
                if (freed > 0)
                        /* Got some memory back in the last second. */
                        return true;
        }

        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        //if the current task is already exiting and its memory is not pinned by other live tasks, have the oom_reaper kernel thread free its memory quickly and return
        if (task_will_free_mem(current)) {//the current task can free its memory on its own
                mark_oom_victim(current);//mark the current task as an OOM victim
                wake_oom_reaper(current);//wake the oom_reaper kernel thread to reap the victim's memory
                return true;
        }

        /*
         * The OOM killer does not compensate for IO-less reclaim.
         * pagefault_out_of_memory lost its gfp context so we have to
         * make sure exclude 0 mask - all other users should have at least
         * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
         * invoke the OOM killer even if it is a GFP_NOFS allocation.
         */
        // A task allocating memory on a filesystem-critical path uses GFP_NOFS to avoid recursive deadlock. If memory runs out there, the OOM killer must not kill anything: killing a process needs filesystem operations (writing logs, cleaning up inodes, ...), which GFP_NOFS forbids -- deadlock! So this case simply returns.
        if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))//not a memcg OOM, and filesystem operations are not allowed in this context
                return true;

        /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA and memcg) that may require different handling.
         */
        oc->constraint = constrained_alloc(oc);//determine the constraint type of the failed allocation: the cause and scope of the shortage
        if (oc->constraint != CONSTRAINT_MEMORY_POLICY)//if the OOM was not caused by a mempolicy restriction, the nodemask can be ignored from here on (only task->mems_allowed matters); clearing oc->nodemask also makes the policy and cpuset constraint types easy to tell apart
                oc->nodemask = NULL;
        check_panic_on_oom(oc);//check whether panic-on-OOM is configured; if so, take the panic path
        //if killing the allocating task is enabled and the conditions below all hold, kill the current task
        if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&//not a memcg OOM (a memcg OOM means the system as a whole still has memory, no need to kill anything else) and oom_kill_allocating_task is set
            current->mm && !oom_unkillable_task(current) && //kernel threads cannot be killed
            oom_cpuset_eligible(current, oc) && //per-constraint eligibility check, to avoid needlessly killing tasks unrelated to this OOM
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {//and the task has not opted out of being killed via oom_score_adj
                get_task_struct(current);
                oc->chosen = current;//choose the current task as the victim
                oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
                return true;
        }

        select_bad_process(oc);//pick a suitable OOM victim according to the rules
        /* Found nothing?!?! */
        if (!oc->chosen) {//no killable task was found; unless the OOM was triggered via sysrq or by a memcg, panic the system
                dump_header(oc, NULL);
                pr_warn("Out of memory and no killable processes...\n");
                /*
                 * If we got here due to an actual allocation at the
                 * system level, we cannot survive this and will enter
                 * an endless loop in the allocator. Bail out now.
                 */
                if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
                        panic("System is deadlocked on memory\n");
        }
        if (oc->chosen && oc->chosen != (void *)-1UL)//a proper victim was chosen: kill it to free memory
                oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
                                 "Memory cgroup out of memory");
        return !!oc->chosen;
}

task_will_free_mem() decides whether a task's memory can be freed right away. That is the case when the following hold:

1. __task_will_free_mem() confirms the task is exiting; only an exiting task can be picked to free memory quickly.

2. The task's mm is not shared with other processes, in which case it can be freed directly; if it is shared, every process sharing the mm must itself be exiting (so nothing pins the mm) before it can be freed.

static bool task_will_free_mem(struct task_struct *task)
{
        struct mm_struct *mm = task->mm;
        struct task_struct *p;
        bool ret = true;

        /*
         * Skip tasks without mm because it might have passed its exit_mm and
         * exit_oom_victim. oom_reaper could have rescued that but do not rely
         * on that for now. We can consider find_lock_task_mm in future.
         */
        if (!mm)
                return false;

        if (!__task_will_free_mem(task))//quick check whether the task can free its memory immediately
                return false;

        /*
         * This task has already been drained by the oom reaper so there are
         * only small chances it will free some more
         */
        if (test_bit(MMF_OOM_SKIP, &mm->flags))//the mm has already been reaped by the oom_reaper thread (or reaping failed), so it will not be reaped again
                return false;

        if (atomic_read(&mm->mm_users) <= 1) //the mm is not shared, it can be freed directly
                return true;

        /*
         * Make sure that all tasks which share the mm with the given tasks
         * are dying as well to make sure that a) nobody pins its mm and
         * b) the task is also reapable by the oom reaper.
         */
        rcu_read_lock();
        for_each_process(p) {//the mm is shared: make sure every process sharing it with task will also release it
                if (!process_shares_mm(p, mm))//skip processes that do not share this mm
                        continue;
                if (same_thread_group(task, p))//p is in task's own thread group, already covered above
                        continue;
                ret = __task_will_free_mem(p);//the sharing process must be exiting as well
                if (!ret)
                        break;
        }
        rcu_read_unlock();

        return ret;
}

__task_will_free_mem() checks whether a task is exiting:

static inline bool __task_will_free_mem(struct task_struct *task)
{
        struct signal_struct *sig = task->signal;

        /*
         * A coredumping process may sleep for an extended period in exit_mm(),
         * so the oom killer cannot assume that the process will promptly exit
         * and release memory.
         */
        if (sig->flags & SIGNAL_GROUP_COREDUMP)//a coredumping task may sleep and will not free memory soon
                return false;

        if (sig->flags & SIGNAL_GROUP_EXIT)//the whole thread group is exiting and will release all of its memory
                return true;

        if (thread_group_empty(task) && (task->flags & PF_EXITING))//single-threaded and the thread is exiting: all memory will be released
                return true;

        return false;
}

constrained_alloc() determines the current memory-constraint type and computes the total memory limit for each type.

A word about the constraint types:

enum oom_constraint {
    CONSTRAINT_NONE,
    CONSTRAINT_CPUSET,
    CONSTRAINT_MEMORY_POLICY,
    CONSTRAINT_MEMCG,
};

CONSTRAINT_NONE: a non-NUMA system, or a NUMA system with no node restriction; an OOM here means global physical memory is exhausted.

CONSTRAINT_CPUSET: cpuset is a kernel mechanism that assigns a set of CPUs and nodes to a group of processes; an OOM only means the nodes this process may allocate from are exhausted, other nodes may still have free memory.

CONSTRAINT_MEMORY_POLICY: on a NUMA system a user process has voluntarily bound itself to certain nodes (see the sketch below); an OOM does not mean system memory is exhausted.

CONSTRAINT_MEMCG: caps the memory usage of a group of processes; an OOM fires when the group's total allocations exceed its configured limit. This says nothing about global physical memory: it is about the quota, not about nodes.
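
To make the mempolicy case concrete, here is a minimal userspace sketch (my own illustration, not from the kernel source) that binds the calling process to nodes 0-1 with set_mempolicy(2). The node numbers are hypothetical; it assumes a NUMA kernel and the libnuma headers, and is compiled with -lnuma. After the call, the process's allocations honor the bound nodemask, so exhausting nodes 0-1 yields a CONSTRAINT_MEMORY_POLICY OOM even if other nodes still have free pages.

#define _GNU_SOURCE
#include <numaif.h>     /* set_mempolicy(), MPOL_BIND; link with -lnuma */
#include <stdio.h>

int main(void)
{
        unsigned long nodemask = (1UL << 0) | (1UL << 1); /* hypothetical nodes 0 and 1 */

        /* Bind all future allocations of this process to nodes 0-1 */
        if (set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask))) {
                perror("set_mempolicy");
                return 1;
        }
        printf("allocations now restricted to nodes 0-1\n");
        return 0;
}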

static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
        struct zone *zone;
        struct zoneref *z;
        enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
        bool cpuset_limited = false;
        int nid;

        if (is_memcg_oom(oc)) {//memcg-induced shortage: use the memcg's limit as the total and report the MEMCG constraint
                oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
                return CONSTRAINT_MEMCG;
        }

        /* Default to all available memory */
        oc->totalpages = totalram_pages() + total_swap_pages;//total memory limit for the unconstrained case

        if (!IS_ENABLED(CONFIG_NUMA))//without NUMA, or (below) without a zonelist, nodes and zones are irrelevant: treat the OOM as unconstrained
                return CONSTRAINT_NONE;

        if (!oc->zonelist)
                return CONSTRAINT_NONE;
        /*
         * Reach here only when __GFP_NOFAIL is used. So, we should avoid
         * to kill current.We have to random task kill in this case.
         * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
         */
        if (oc->gfp_mask & __GFP_THISNODE)
                return CONSTRAINT_NONE;

        /*
         * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
         * the page allocator means a mempolicy is in effect.  Cpuset policy
         * is enforced in get_page_from_freelist().
         */
         //mempolicy-constraint detection: if the set of nodes that have memory is not a subset of the allowed nodes -- i.e. some node with memory lies outside the allowed set -- this is a mempolicy OOM: the user program restricted its own nodes (e.g. the system has nodes 0-3 but the program limits itself to nodes 0-2)
        if (oc->nodemask &&
            !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
                oc->totalpages = total_swap_pages;//recompute the total limit for the mempolicy constraint
                for_each_node_mask(nid, *oc->nodemask)
                        oc->totalpages += node_present_pages(nid);//add up the memory of the allowed nodes
                return CONSTRAINT_MEMORY_POLICY;
        }

        //here the user did not restrict its own nodes, but a cpuset did (e.g. the program may use nodes 0-3 while its cpuset limits it to nodes 0-2)
        //cpuset-constraint detection: walk all usable zones and check whether any of them is excluded by the cpuset
        /* Check this allocation failure is caused by cpuset's wall function */
        for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
                        highest_zoneidx, oc->nodemask)
                if (!cpuset_zone_allowed(zone, oc->gfp_mask))//e.g. both the user and the system allow nodes 0-3, but the walk finds a zone on node 3 that lies outside the cpuset's 0-2
                        cpuset_limited = true;

        if (cpuset_limited) {//cpuset-limited: recompute the allowed total as swap space plus the memory of the cpuset-allowed nodes
                oc->totalpages = total_swap_pages;
                for_each_node_mask(nid, cpuset_current_mems_allowed)
                        oc->totalpages += node_present_pages(nid);
                return CONSTRAINT_CPUSET;
        }
        return CONSTRAINT_NONE;
}

check_panic_on_oom() checks whether the system is configured to panic (hang or reboot) after an OOM.

It mainly looks at /proc/sys/vm/panic_on_oom. With 0, with 1 when the OOM was not caused by a system-wide memory shortage, or when the OOM was triggered manually via echo m > /proc/sysrq-trigger, the function returns and a suitable victim is selected as usual; with 2, every type of OOM triggers a panic (except, as the code shows, a sysrq-triggered one). A small sketch of setting this knob follows the function.

static void check_panic_on_oom(struct oom_control *oc)
{
        if (likely(!sysctl_panic_on_oom))//not set, return
                return;
        if (sysctl_panic_on_oom != 2) {//value 1: return unless the OOM comes from a system-wide shortage
                /*                     //an OOM caused by a cpuset, mempolicy or memcg does not mean the whole system is out of memory, so no panic is needed
                 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
                 * does not panic for cpuset, mempolicy, or memcg allocation
                 * failures.
                 */
                if (oc->constraint != CONSTRAINT_NONE)
                        return;
        }
        }
        /* Do not panic for oom kills triggered by sysrq */
        if (is_sysrq_oom(oc))//manually triggered OOMs never panic
                return;
        dump_header(oc, NULL);//dump diagnostic info, then panic
        panic("Out of memory: %s panic_on_oom is enabled\n",
                sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
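
For reference, the knob can be flipped from userspace; below is a minimal sketch (equivalent to sysctl vm.panic_on_oom=2; it needs root):

#include <stdio.h>

int main(void)
{
        /* Write "2" to /proc/sys/vm/panic_on_oom: panic on every OOM.
         * 0 = never panic, select a victim instead;
         * 1 = panic only for system-wide (CONSTRAINT_NONE) OOMs;
         * 2 = panic for all OOM types (sysrq-triggered ones excepted). */
        FILE *f = fopen("/proc/sys/vm/panic_on_oom", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "2\n");
        fclose(f);
        return 0;
}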

select_bad_process() picks the task that will be killed.

static void select_bad_process(struct oom_control *oc)
{
        oc->chosen_points = LONG_MIN;

        if (is_memcg_oom(oc))//memcg OOM: pick among the tasks of that memcg
                mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
        else {//non-memcg OOM: if CONFIG_MEMCG_QOS is enabled, first try to find a target in a low-priority memcg
                struct task_struct *p;

#ifdef CONFIG_MEMCG_QOS
                if (memcg_low_priority_scan_tasks(oom_evaluate_task, oc))//try the low-priority memcgs first
                        return;
#endif
                rcu_read_lock();
                for_each_process(p)//otherwise scan the whole system for a suitable task
                        if (oom_evaluate_task(p, oc))
                                break;
                rcu_read_unlock();
        }
}

Whichever of the three scans runs, the target task is ultimately chosen by oom_evaluate_task().

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
        struct oom_control *oc = arg;
        long points;

        if (oom_unkillable_task(task))//skip init (pid 1) and kernel threads; they have no reapable mm anyway
                goto next;

        //for a non-memcg OOM, check whether this candidate (and its threads) is eligible under the current constraint
        /* p may not have freeable memory in nodemask */
        if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
                goto next;

        /*
         * This task already has access to memory reserves and is being killed.
         * Don't allow any other task to have access to the reserves unless
         * the task has MMF_OOM_SKIP because chances that it would release
         * any memory is quite low.
         */
        if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {//not a sysrq OOM, and the task is already marked as an OOM victim
                if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))//MMF_OOM_SKIP means it has already been killed/reaped: move on, so the same task is not picked twice
                        goto next;
                goto abort;//abort this selection pass: that victim is still in the middle of being killed
        }

        /*
         * If task is allocating a lot of memory and has been marked to be
         * killed first if it triggers an oom, then select it.
         */
        if (oom_task_origin(task)) {//the task is flagged to be killed first: give it the top score, so the scan can only end up choosing it
                points = LONG_MAX;
                goto select;
        }

        points = oom_badness(task, oc->totalpages);//score the task by its memory usage and adjustment factor
        if (oom_next_task(task, oc, points))//does this score beat the best score among the tasks scanned so far?
                goto next;

select:
        //(possible optimization: in the oom_task_origin() case points is already LONG_MAX, so the remaining tasks would not need to be scanned at all)
        if (oc->chosen)//reaching here means the task is either flagged kill-first or outscores everything scanned so far
                put_task_struct(oc->chosen);
        get_task_struct(task);
        oc->chosen = task;//make this task the candidate, it has the highest score
        oc->chosen_points = points;//remember the score
next:
        return 0;
abort:
        if (oc->chosen) //drop the reference to the candidate chosen so far
                put_task_struct(oc->chosen);
        oc->chosen = (void *)-1UL;//special marker meaning the selection was aborted
        return 1;
}

oom_cpuset_eligible() decides whether the candidate tsk can even reach the memory that triggered the OOM (the nodes current is allowed to allocate from): if even a single node intersects, tsk may be holding some of that memory, so it may be killed. I honestly don't quite see why mere intersection is enough. Isn't there a case like this: tsk's nodes are 0-1 and the mask is 1-2, so they intersect, yet tsk actually allocated from node 0; killing it frees node-0 memory that the current process cannot use anyway, being confined to nodes 1-2. The kernel code is surely correct, I just haven't understood it fully. (The bitmask sketch after the function walks through exactly this scenario.)

For a mempolicy OOM the process bound itself to certain nodes, so only processes bound to (some of) those nodes should be killed.

For a cpuset OOM, processes are confined to a group that shares a memory budget, so only processes in the group that triggered the OOM may be killed.

static bool oom_cpuset_eligible(struct task_struct *start,
                                struct oom_control *oc)
{
        struct task_struct *tsk;
        bool ret = false;
        const nodemask_t *mask = oc->nodemask;//the nodes of the failed allocation

        if (is_memcg_oom(oc))//memcg OOMs are node-agnostic
                return true;

        rcu_read_lock();
        for_each_thread(start, tsk) {
                if (mask) {//out_of_memory() already cleared oc->nodemask for non-mempolicy OOMs, so a non-NULL mask means a mempolicy OOM
                        /*
                         * If this is a mempolicy constrained oom, tsk's
                         * cpuset is irrelevant.  Only return true if its
                         * mempolicy intersects current, otherwise it may be
                         * needlessly killed.
                         */
                        ret = mempolicy_nodemask_intersects(tsk, mask);//does tsk's mempolicy intersect the constrained nodes? only memory freed there can relieve a mempolicy OOM
                } else { //cpuset-constrained OOM
                        /*
                         * This is not a mempolicy constrained oom, so only
                         * check the mems of tsk's cpuset.
                         */
                        ret = cpuset_mems_allowed_intersects(current, tsk);//do tsk's cpuset-allowed nodes intersect current's?
                }                                                          //this avoids killing tasks in a completely different resource domain:
                                                                           //memory freed from a task in another cpuset may do nothing for this cpuset's pressure
                if (ret)
                        break;
        }
        rcu_read_unlock();

        return ret;
}
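
The intersection test above is essentially a bitwise AND of nodemasks. Below is a minimal userspace sketch (plain unsigned long bitmasks standing in for nodemask_t) of exactly the scenario questioned earlier: task nodes 0-1 versus OOM mask 1-2 share node 1, so the kernel treats the task as a potential holder of the constrained memory. Determining whether its pages actually live on node 1 would require walking them, far too expensive at OOM time, so the kernel settles for this cheap over-approximation.

#include <stdio.h>

/* Plain bitmasks stand in for the kernel's nodemask_t. */
static int nodes_intersect(unsigned long a, unsigned long b)
{
        return (a & b) != 0;
}

int main(void)
{
        unsigned long task_nodes = (1UL << 0) | (1UL << 1); /* task allowed: nodes 0-1 */
        unsigned long oom_mask   = (1UL << 1) | (1UL << 2); /* OOM constraint: nodes 1-2 */

        /* Node 1 is in both masks, so the task *may* hold memory that
         * would relieve the OOM; the kernel accepts this false positive
         * rather than scan the task's pages to find out. */
        printf("eligible: %d\n", nodes_intersect(task_nodes, oom_mask));
        return 0;
}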

oom_badness() scores a task: the higher the score, the better a kill candidate the task is. A return of LONG_MIN, the lowest possible score, means the task will never be killed. A worked example of the arithmetic follows the function.

long oom_badness(struct task_struct *p, unsigned long totalpages)
{
        long points;
        long adj;

        if (oom_unkillable_task(p))//kernel threads (and init) are never killed
                return LONG_MIN;

        p = find_lock_task_mm(p);//the task must still have an mm; without one, killing it frees nothing, so it is pointless
        if (!p)
                return LONG_MIN;

        /*
         * Do not even consider tasks which are explicitly marked oom
         * unkillable or have been already oom reaped or they are in
         * the middle of vfork
         */
        adj = (long)p->signal->oom_score_adj;//oom_score_adj == -1000: the user marked it never-kill; MMF_OOM_SKIP: the kernel marked it to be skipped
        if (adj == OOM_SCORE_ADJ_MIN ||      //a vforked child still sharing its parent's mm is the third case; none of the three become candidates
                        test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
                        in_vfork(p)) {
                task_unlock(p);
                return LONG_MIN;
        }

        /*
         * The baseline for the badness score is the proportion of RAM that each
         * task's rss, pagetable and swap space use.
         */
        //base score: the task's resident pages + swap usage + page-table usage
        points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
                mm_pgtables_bytes(p->mm) / PAGE_SIZE;
        task_unlock(p);

        /* Normalize to oom_score_adj units */
        adj *= totalpages / 1000;//scale the adjustment factor: adj is in [-1000, 1000], so the final score is the base score plus or minus up to that many thousandths of total memory
        points += adj;

        return points;
}
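
To make the scoring arithmetic concrete, here is a small userspace sketch that re-runs the formula with made-up numbers (all of the process statistics below are hypothetical):

#include <stdio.h>

int main(void)
{
        long totalpages = 4L * 1024 * 1024; /* 16 GiB of 4 KiB pages */
        long rss        = 500000;           /* resident pages */
        long swapents   = 20000;            /* swapped-out pages */
        long pgtables   = 3000;             /* page-table pages */
        long adj        = -100;             /* oom_score_adj in [-1000, 1000] */

        long points = rss + swapents + pgtables;   /* base score: 523000 */
        points += adj * (totalpages / 1000);       /* -100 * 4194 = -419400 */

        printf("badness = %ld\n", points);         /* prints 103600 */
        return 0;
}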

oom_next_task() compares the current task's score with the best score among the tasks scanned so far; returning true means the current task loses and the scan moves on.

static bool oom_next_task(struct task_struct *task, struct oom_control *oc,
                        long points)
{
        struct mem_cgroup *cur_memcg;
        struct mem_cgroup *oc_memcg;
        //without CONFIG_MEMCG_QOS: skip tasks that are unkillable (LONG_MIN) or score below the currently chosen one
        if (!static_branch_likely(&memcg_qos_stat_key))
                return (points == LONG_MIN || points < oc->chosen_points);

        if (points == LONG_MIN)
                return true;

        if (!oc->chosen)//no candidate yet, take the current task
                return false;

        oc_memcg = mem_cgroup_from_task(oc->chosen);
        cur_memcg = mem_cgroup_from_task(task);

        //same priority: compare scores, skip the current task if it scores lower; different priority: ignore the scores and prefer the task from the low-priority memcg as the victim
        if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) {
                if (points < oc->chosen_points)
                        return true;
                return false;
        }
        //from what I could find, memcg_priority only takes the values 0 and 1
        /* if oc is low-priority, so skip the task */
        if (oc_memcg->memcg_priority) //the chosen task's memcg is already low priority (1), the preferred victim: keep it and skip the current task
                return true;

        return false;//the chosen is priority 0, so the current task's memcg must be low priority (1): select the current task
}

oom_kill_process() handles the chosen task: if it is already exiting, the oom_reaper thread is left to reap its memory; otherwise __oom_kill_process() takes over.

static void oom_kill_process(struct oom_control *oc, const char *message)
{
        struct task_struct *victim = oc->chosen;
        struct mem_cgroup *oom_group;
        static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
                                              DEFAULT_RATELIMIT_BURST);

        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
         * its children or threads, just give it access to memory reserves
         * so it can die quickly
         */
        task_lock(victim);
        if (task_will_free_mem(victim)) {//the victim is already exiting and nothing pins its mm, so its memory can be freed
                mark_oom_victim(victim);//mark it as a victim
                wake_oom_reaper(victim);//wake the oom_reaper thread to handle it
                task_unlock(victim);
                put_task_struct(victim);
                return;
        }
        task_unlock(victim);

        if (__ratelimit(&oom_rs))//rate-limited logging of the OOM report
                dump_header(oc, victim);

        /*
         * Do we need to kill the entire memory cgroup?
         * Or even one of the ancestor memory cgroups?
         * Check this out before killing the victim task.
         */
        oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

        __oom_kill_process(victim, message);

        /*
         * If necessary, kill all tasks in the selected memory cgroup.
         */
        if (oom_group) {
                mem_cgroup_print_oom_group(oom_group);
                mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
                                      (void*)message);
                mem_cgroup_put(oom_group);
        }
}

__oom_kill_process() handles a victim that is not already exiting (probably the common case): it sends SIGKILL to the victim and to every task outside its thread group that shares its mm, then leaves the memory to the oom_reaper kernel thread to reap.

static void __oom_kill_process(struct task_struct *victim, const char *message)
{
        struct task_struct *p;
        struct mm_struct *mm;
        bool can_oom_reap = true;

        p = find_lock_task_mm(victim);//the mm is already gone, nothing can be freed: skip this task
        if (!p) {
                pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
                        message, task_pid_nr(victim), victim->comm);
                put_task_struct(victim);
                return;
        } else if (victim != p) {//the chosen task's mm is gone but a sub-thread still holds it: operate on that thread's task instead
                get_task_struct(p);
                put_task_struct(victim);
                victim = p;
        }

        /* Get a reference to safely compare mm after task_unlock(victim) */
        mm = victim->mm;
        mmgrab(mm);

        /* Raise event before sending signal: task reaper must see this */
        count_vm_event(OOM_KILL);
        memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

        /*
         * We should send SIGKILL before granting access to memory reserves
         * in order to prevent the OOM victim from depleting the memory
         * reserves from the user space under its control.
         */
        do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);//send SIGKILL to the victim
        mark_oom_victim(victim);//mark it as a victim, then log
        pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
                message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
                K(get_mm_counter(mm, MM_ANONPAGES)),
                K(get_mm_counter(mm, MM_FILEPAGES)),
                K(get_mm_counter(mm, MM_SHMEMPAGES)),
                from_kuid(&init_user_ns, task_uid(victim)),
                mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
        task_unlock(victim);

        /*
         * Kill all user processes sharing victim->mm in other thread groups, if
         * any.  They don't get access to memory reserves, though, to avoid
         * depletion of all memory.  This prevents mm->mmap_lock livelock when an
         * oom killed thread cannot exit because it requires the semaphore and
         * its contended by another thread trying to allocate memory itself.
         * That thread will now get access to memory reserves since it has a
         * pending fatal signal.
         */
        rcu_read_lock();
        for_each_process(p) {//walk every process in the system: any task sharing the victim's mm outside its thread group is killed too, except init and kernel threads
                if (!process_shares_mm(p, mm))
                        continue;
                if (same_thread_group(p, victim))
                        continue;
                if (is_global_init(p)) {
                        can_oom_reap = false;
                        set_bit(MMF_OOM_SKIP, &mm->flags);
                        pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
                                        task_pid_nr(victim), victim->comm,
                                        task_pid_nr(p), p->comm);
                        continue;
                }
                /*
                 * No kthread_use_mm() user needs to read from the userspace so
                 * we are ok to reap it.
                 */
                if (unlikely(p->flags & PF_KTHREAD))
                        continue;
                do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
        }
        rcu_read_unlock();

        if (can_oom_reap)//in the end the reaping is still done by the oom_reaper thread
                wake_oom_reaper(victim);

        mmdrop(mm);
        put_task_struct(victim);
}

wake_oom_reaper() adds the victim to the oom_reaper_list and wakes the oom_reaper kernel thread.

static void wake_oom_reaper(struct task_struct *tsk)
{
        /* mm is already queued? */
        if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))//the victim is already queued
                return;

        get_task_struct(tsk);

        spin_lock(&oom_reaper_lock);
        tsk->oom_reaper_list = oom_reaper_list; //push the victim onto the oom_reaper_list
        oom_reaper_list = tsk;
        spin_unlock(&oom_reaper_lock);
        trace_wake_reaper(tsk->pid);
        wake_up(&oom_reaper_wait);//wake the oom_reaper thread
}

As the code shows, oom_reaper() is an infinite loop: it sleeps until oom_reaper_list becomes non-empty, then pops one task off the list and hands it to oom_reap_task(). A userspace sketch of this producer/consumer pattern follows the function.

static int oom_reaper(void *unused)//the oom_reaper thread loops, freeing the memory of the victims on oom_reaper_list
{
        while (true) {
                struct task_struct *tsk = NULL;

                wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);//sleep until oom_reaper_list becomes non-empty
                spin_lock(&oom_reaper_lock);
                if (oom_reaper_list != NULL) {
                        tsk = oom_reaper_list;//pop one victim off the list
                        oom_reaper_list = tsk->oom_reaper_list;
                }
                spin_unlock(&oom_reaper_lock);

                if (tsk)
                        oom_reap_task(tsk);//free the chosen task's memory
        }

        return 0;
}
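
wake_oom_reaper()/oom_reaper() together form a classic single-consumer work queue over an intrusive LIFO list. The following minimal userspace sketch mirrors that pattern; a pthread mutex and condition variable stand in for the kernel's spinlock and waitqueue, and struct victim is a made-up stand-in for task_struct (compile with -pthread; the demo reaper loops forever by design):

#include <pthread.h>
#include <stdio.h>

struct victim {
        int pid;
        struct victim *next;          /* plays the role of oom_reaper_list */
};

static struct victim *reap_list;
static pthread_mutex_t reap_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reap_wait = PTHREAD_COND_INITIALIZER;

static void wake_reaper(struct victim *v)     /* cf. wake_oom_reaper() */
{
        pthread_mutex_lock(&reap_lock);
        v->next = reap_list;                  /* push onto the LIFO list */
        reap_list = v;
        pthread_mutex_unlock(&reap_lock);
        pthread_cond_signal(&reap_wait);      /* cf. wake_up() */
}

static void *reaper(void *unused)             /* cf. oom_reaper() */
{
        for (;;) {
                struct victim *v;

                pthread_mutex_lock(&reap_lock);
                while (!reap_list)            /* cf. wait_event_freezable() */
                        pthread_cond_wait(&reap_wait, &reap_lock);
                v = reap_list;                /* pop one victim */
                reap_list = v->next;
                pthread_mutex_unlock(&reap_lock);

                printf("reaping pid %d\n", v->pid);
        }
        return unused;
}

int main(void)
{
        pthread_t tid;
        struct victim v = { .pid = 1234 };

        pthread_create(&tid, NULL, reaper, NULL);
        wake_reaper(&v);
        pthread_join(tid, NULL);              /* never returns; demo only */
        return 0;
}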

oom_reap_task() calls oom_reap_task_mm() to do the actual reaping; a sketch of its retry policy follows the function.

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
        int attempts = 0;
        struct mm_struct *mm = tsk->signal->oom_mm;

        /* Retry the mmap_read_trylock(mm) a few times */ //try to reap the victim's physical memory: up to 10 attempts, sleeping 0.1 s between them
        while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
                schedule_timeout_idle(HZ/10);

        if (attempts <= MAX_OOM_REAP_RETRIES ||//leaving the loop within 10 attempts means the reap succeeded; alternatively the mm already carries MMF_OOM_SKIP, so skip it
            test_bit(MMF_OOM_SKIP, &mm->flags))
                goto done;

        pr_info("oom_reaper: unable to reap pid:%d (%s)\n",//回收失败,打印相关信息
                task_pid_nr(tsk), tsk->comm);
        sched_show_task(tsk);
        debug_show_all_locks();

done:
        tsk->oom_reaper_list = NULL;

        /*
         * Hide this mm from OOM killer because it has been either reaped or
         * somebody can't call mmap_write_unlock(mm).
         */
        set_bit(MMF_OOM_SKIP, &mm->flags); //set MMF_OOM_SKIP whether the reap succeeded or all 10 attempts failed, so this mm is never reaped again

        /* Drop a reference taken by wake_oom_reaper */
        put_task_struct(tsk);
}
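
The retry policy is worth a second look: because of the post-increment, attempts ends at most one past MAX_OOM_REAP_RETRIES, so attempts <= MAX_OOM_REAP_RETRIES after the loop means exactly that some oom_reap_task_mm() call succeeded. A minimal userspace sketch of the same pattern (try_reap() is a made-up stand-in that always fails here):

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_OOM_REAP_RETRIES 10

static bool try_reap(void)
{
        return false;   /* pretend the mmap lock is always contended */
}

int main(void)
{
        int attempts = 0;

        /* Same shape as the kernel loop: up to 10 tries, 100 ms apart. */
        while (attempts++ < MAX_OOM_REAP_RETRIES && !try_reap())
                usleep(100 * 1000);           /* cf. schedule_timeout_idle(HZ/10) */

        if (attempts <= MAX_OOM_REAP_RETRIES)
                puts("reap succeeded");
        else
                puts("unable to reap; set MMF_OOM_SKIP and move on");
        return 0;
}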

oom_reap_task_mm() reaps the task's address space.

static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
        bool ret = true;

        if (!mmap_read_trylock(mm)) {
                trace_skip_task_reaping(tsk->pid);
                return false;
        }

        /*
         * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
         * work on the mm anymore. The check for MMF_OOM_SKIP must run
         * under mmap_lock for reading because it serializes against the
         * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
         */
        if (test_bit(MMF_OOM_SKIP, &mm->flags)) {//never reap an mm that has MMF_OOM_SKIP set
                trace_skip_task_reaping(tsk->pid);
                goto out_unlock;
        }

        trace_start_task_reaping(tsk->pid);

        /* failed to reap part of the address space. Try again later */
        ret = __oom_reap_task_mm(mm);//do the actual reaping
        if (!ret)
                goto out_finish;

        pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
                        task_pid_nr(tsk), tsk->comm,
                        K(get_mm_counter(mm, MM_ANONPAGES)),
                        K(get_mm_counter(mm, MM_FILEPAGES)),
                        K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
        trace_finish_task_reaping(tsk->pid);
out_unlock:
        mmap_read_unlock(mm);

        return ret;
}
