Analysis of the OOM mechanism


Kernel version: openEuler 5.10.0

The function that handles OOM is out_of_memory(); it is called whenever the kernel cannot satisfy a memory allocation request.

bool out_of_memory(struct oom_control *oc)
{
        unsigned long freed = 0;

        if (oom_killer_disabled)//the OOM killer is disabled, nothing to do
                return false;

        if (!sysctl_enable_oom_killer) {//OOM killer disabled via sysctl: just run the registered notifier chain and return
                oom_type_notifier_call(0, oc);
                return false;
        }

        if (!is_memcg_oom(oc)) {//not a memcg OOM: let the notifier callbacks try to free some memory; if anything was freed, return -- it may already be enough for the allocation, so no further handling
                blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
                if (freed > 0)
                        /* Got some memory back in the last second. */
                        return true;
        }

        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        //if the current task is already exiting and its memory is not pinned by other live tasks, have the oom_reaper kernel thread free its memory quickly and return
        if (task_will_free_mem(current)) {//the current task can free its memory on its own
                mark_oom_victim(current);//mark the current task as an OOM victim
                wake_oom_reaper(current);//wake the oom_reaper kernel thread to reap the victim's memory
                return true;
        }

        /*
         * The OOM killer does not compensate for IO-less reclaim.
         * pagefault_out_of_memory lost its gfp context so we have to
         * make sure exclude 0 mask - all other users should have at least
         * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
         * invoke the OOM killer even if it is a GFP_NOFS allocation.
         */
        // A task allocating memory on a filesystem-critical path uses GFP_NOFS to avoid recursive deadlock. If memory runs out there, the OOM killer must not kill anything: killing a process needs filesystem operations (writing logs, cleaning up inodes, ...), which GFP_NOFS forbids -- deadlock! So this case simply returns.
        if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))//not a memcg OOM, and filesystem operations are not allowed in this context
                return true;

        /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA and memcg) that may require different handling.
         */
        oc->constraint = constrained_alloc(oc);//determine the constraint type of the failed allocation: the cause and scope of the shortage
        if (oc->constraint != CONSTRAINT_MEMORY_POLICY)//if the OOM was not caused by a mempolicy restriction, the nodemask can be ignored from here on (only task->mems_allowed matters); clearing oc->nodemask also makes the policy and cpuset constraint types easy to tell apart
                oc->nodemask = NULL;
        check_panic_on_oom(oc);//check whether panic-on-OOM is configured; if so, take the panic path
        //if killing the allocating task is enabled and the conditions below all hold, kill the current task
        if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&//not a memcg OOM (a memcg OOM means the system as a whole still has memory, no need to kill anything else) and oom_kill_allocating_task is set
            current->mm && !oom_unkillable_task(current) && //kernel threads cannot be killed
            oom_cpuset_eligible(current, oc) && //per-constraint eligibility check, to avoid needlessly killing tasks unrelated to this OOM
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {//and the task has not opted out of being killed via oom_score_adj
                get_task_struct(current);
                oc->chosen = current;//choose the current task as the victim
                oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
                return true;
        }

        select_bad_process(oc);//pick a suitable OOM victim according to the rules
        /* Found nothing?!?! */
        if (!oc->chosen) {//no killable task was found; unless the OOM was triggered via sysrq or by a memcg, panic the system
                dump_header(oc, NULL);
                pr_warn("Out of memory and no killable processes...\n");
                /*
                 * If we got here due to an actual allocation at the
                 * system level, we cannot survive this and will enter
                 * an endless loop in the allocator. Bail out now.
                 */
                if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
                        panic("System is deadlocked on memory\n");
        }
        if (oc->chosen && oc->chosen != (void *)-1UL)//a proper victim was chosen: kill it to free memory
                oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
                                 "Memory cgroup out of memory");
        return !!oc->chosen;
}

task_will_free_mem() decides whether a task's memory can be freed right away. That is the case when the following hold:

1. __task_will_free_mem() confirms the task is exiting; only an exiting task can be picked to free memory quickly.

2. The task's mm is not shared with other processes, in which case it can be freed directly; if it is shared, every process sharing the mm must itself be exiting (so nothing pins the mm) before it can be freed.

static bool task_will_free_mem(struct task_struct *task)
{
        struct mm_struct *mm = task->mm;
        struct task_struct *p;
        bool ret = true;

        /*
         * Skip tasks without mm because it might have passed its exit_mm and
         * exit_oom_victim. oom_reaper could have rescued that but do not rely
         * on that for now. We can consider find_lock_task_mm in future.
         */
        if (!mm)
                return false;

        if (!__task_will_free_mem(task))//quick check whether the task can free its memory immediately
                return false;

        /*
         * This task has already been drained by the oom reaper so there are
         * only small chances it will free some more
         */
        if (test_bit(MMF_OOM_SKIP, &mm->flags))//the mm has already been reaped by the oom_reaper thread (or reaping failed), so it will not be reaped again
                return false;

        if (atomic_read(&mm->mm_users) <= 1) //the mm is not shared, it can be freed directly
                return true;

        /*
         * Make sure that all tasks which share the mm with the given tasks
         * are dying as well to make sure that a) nobody pins its mm and
         * b) the task is also reapable by the oom reaper.
         */
        rcu_read_lock();
        for_each_process(p) {//the mm is shared: make sure every process sharing it with task will also release it
                if (!process_shares_mm(p, mm))//skip processes that do not share this mm
                        continue;
                if (same_thread_group(task, p))//p is in task's own thread group, already covered above
                        continue;
                ret = __task_will_free_mem(p);//the sharing process must be exiting as well
                if (!ret)
                        break;
        }
        rcu_read_unlock();

        return ret;
}

__task_will_free_mem() checks whether a task is exiting:

static inline bool __task_will_free_mem(struct task_struct *task)
{
        struct signal_struct *sig = task->signal;

        /*
         * A coredumping process may sleep for an extended period in exit_mm(),
         * so the oom killer cannot assume that the process will promptly exit
         * and release memory.
         */
        if (sig->flags & SIGNAL_GROUP_COREDUMP)//a coredumping task may sleep and will not free memory soon
                return false;

        if (sig->flags & SIGNAL_GROUP_EXIT)//the whole thread group is exiting and will release all of its memory
                return true;

        if (thread_group_empty(task) && (task->flags & PF_EXITING))//single-threaded and the thread is exiting: all memory will be released
                return true;

        return false;
}

constrained_alloc() determines the current memory-constraint type and computes the total memory limit for each type.

A word about the constraint types:

enum oom_constraint {
    CONSTRAINT_NONE,
    CONSTRAINT_CPUSET,
    CONSTRAINT_MEMORY_POLICY,
    CONSTRAINT_MEMCG,
};

CONSTRAINT_NONE: a non-NUMA system, or a NUMA system with no node restriction; an OOM here means global physical memory is exhausted.

CONSTRAINT_CPUSET: cpuset is a kernel mechanism that assigns a set of CPUs and nodes to a group of processes; an OOM only means the nodes this process may allocate from are exhausted, other nodes may still have free memory.

CONSTRAINT_MEMORY_POLICY: on a NUMA system a user process has voluntarily bound itself to certain nodes (see the sketch below); an OOM does not mean system memory is exhausted.

CONSTRAINT_MEMCG: caps the memory usage of a group of processes; an OOM fires when the group's total allocations exceed its configured limit. This says nothing about global physical memory: it is about the quota, not about nodes.
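
To make the mempolicy case concrete, here is a minimal userspace sketch (my own illustration, not from the kernel source) that binds the calling process to nodes 0-1 with set_mempolicy(2). The node numbers are hypothetical; it assumes a NUMA kernel and the libnuma headers, and is compiled with -lnuma. After the call, the process's allocations honor the bound nodemask, so exhausting nodes 0-1 yields a CONSTRAINT_MEMORY_POLICY OOM even if other nodes still have free pages.

#define _GNU_SOURCE
#include <numaif.h>     /* set_mempolicy(), MPOL_BIND; link with -lnuma */
#include <stdio.h>

int main(void)
{
        unsigned long nodemask = (1UL << 0) | (1UL << 1); /* hypothetical nodes 0 and 1 */

        /* Bind all future allocations of this process to nodes 0-1 */
        if (set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask))) {
                perror("set_mempolicy");
                return 1;
        }
        printf("allocations now restricted to nodes 0-1\n");
        return 0;
}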

static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
        struct zone *zone;
        struct zoneref *z;
        enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
        bool cpuset_limited = false;
        int nid;

        if (is_memcg_oom(oc)) {//memcg-induced shortage: use the memcg's limit as the total and report the MEMCG constraint
                oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
                return CONSTRAINT_MEMCG;
        }

        /* Default to all available memory */
        oc->totalpages = totalram_pages() + total_swap_pages;//total memory limit for the unconstrained case

        if (!IS_ENABLED(CONFIG_NUMA))//without NUMA, or (below) without a zonelist, nodes and zones are irrelevant: treat the OOM as unconstrained
                return CONSTRAINT_NONE;

        if (!oc->zonelist)
                return CONSTRAINT_NONE;
        /*
         * Reach here only when __GFP_NOFAIL is used. So, we should avoid
         * to kill current.We have to random task kill in this case.
         * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
         */
        if (oc->gfp_mask & __GFP_THISNODE)
                return CONSTRAINT_NONE;

        /*
         * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
         * the page allocator means a mempolicy is in effect.  Cpuset policy
         * is enforced in get_page_from_freelist().
         */
         //mempolicy-constraint detection: if the set of nodes that have memory is not a subset of the allowed nodes -- i.e. some node with memory lies outside the allowed set -- this is a mempolicy OOM: the user program restricted its own nodes (e.g. the system has nodes 0-3 but the program limits itself to nodes 0-2)
        if (oc->nodemask &&
            !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
                oc->totalpages = total_swap_pages;//recompute the total limit for the mempolicy constraint
                for_each_node_mask(nid, *oc->nodemask)
                        oc->totalpages += node_present_pages(nid);//add up the memory of the allowed nodes
                return CONSTRAINT_MEMORY_POLICY;
        }

        //here the user did not restrict its own nodes, but a cpuset did (e.g. the program may use nodes 0-3 while its cpuset limits it to nodes 0-2)
        //cpuset-constraint detection: walk all usable zones and check whether any of them is excluded by the cpuset
        /* Check this allocation failure is caused by cpuset's wall function */
        for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
                        highest_zoneidx, oc->nodemask)
                if (!cpuset_zone_allowed(zone, oc->gfp_mask))//e.g. both the user and the system allow nodes 0-3, but the walk finds a zone on node 3 that lies outside the cpuset's 0-2
                        cpuset_limited = true;

        if (cpuset_limited) {//cpuset-limited: recompute the allowed total as swap space plus the memory of the cpuset-allowed nodes
                oc->totalpages = total_swap_pages;
                for_each_node_mask(nid, cpuset_current_mems_allowed)
                        oc->totalpages += node_present_pages(nid);
                return CONSTRAINT_CPUSET;
        }
        return CONSTRAINT_NONE;
}

check_panic_on_oom() checks whether the system is configured to panic (hang or reboot) after an OOM.

It mainly looks at /proc/sys/vm/panic_on_oom. With 0, with 1 when the OOM was not caused by a system-wide memory shortage, or when the OOM was triggered manually via echo m > /proc/sysrq-trigger, the function returns and a suitable victim is selected as usual; with 2, every type of OOM triggers a panic (except, as the code shows, a sysrq-triggered one). A small sketch of setting this knob follows the function.

static void check_panic_on_oom(struct oom_control *oc)
{
        if (likely(!sysctl_panic_on_oom))//not set, return
                return;
        if (sysctl_panic_on_oom != 2) {//value 1: return unless the OOM comes from a system-wide shortage
                /*                     //an OOM caused by a cpuset, mempolicy or memcg does not mean the whole system is out of memory, so no panic is needed
                 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
                 * does not panic for cpuset, mempolicy, or memcg allocation
                 * failures.
                 */
                if (oc->constraint != CONSTRAINT_NONE)
                        return;
        }
        }
        /* Do not panic for oom kills triggered by sysrq */
        if (is_sysrq_oom(oc))//manually triggered OOMs never panic
                return;
        dump_header(oc, NULL);//dump diagnostic info, then panic
        panic("Out of memory: %s panic_on_oom is enabled\n",
                sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
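
For reference, the knob can be flipped from userspace; below is a minimal sketch (equivalent to sysctl vm.panic_on_oom=2; it needs root):

#include <stdio.h>

int main(void)
{
        /* Write "2" to /proc/sys/vm/panic_on_oom: panic on every OOM.
         * 0 = never panic, select a victim instead;
         * 1 = panic only for system-wide (CONSTRAINT_NONE) OOMs;
         * 2 = panic for all OOM types (sysrq-triggered ones excepted). */
        FILE *f = fopen("/proc/sys/vm/panic_on_oom", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "2\n");
        fclose(f);
        return 0;
}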

select_bad_process() picks the task that will be killed.

static void select_bad_process(struct oom_control *oc)
{
        oc->chosen_points = LONG_MIN;

        if (is_memcg_oom(oc))//memcg OOM: pick among the tasks of that memcg
                mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
        else {//non-memcg OOM: if CONFIG_MEMCG_QOS is enabled, first try to find a target in a low-priority memcg
                struct task_struct *p;

#ifdef CONFIG_MEMCG_QOS
                if (memcg_low_priority_scan_tasks(oom_evaluate_task, oc))//try the low-priority memcgs first
                        return;
#endif
                rcu_read_lock();
                for_each_process(p)//otherwise scan the whole system for a suitable task
                        if (oom_evaluate_task(p, oc))
                                break;
                rcu_read_unlock();
        }
}

Whichever of the three scans runs, the target task is ultimately chosen by oom_evaluate_task().

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
        struct oom_control *oc = arg;
        long points;

        if (oom_unkillable_task(task))//skip init (pid 1) and kernel threads; they have no reapable mm anyway
                goto next;

        //for a non-memcg OOM, check whether this candidate (and its threads) is eligible under the current constraint
        /* p may not have freeable memory in nodemask */
        if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
                goto next;

        /*
         * This task already has access to memory reserves and is being killed.
         * Don't allow any other task to have access to the reserves unless
         * the task has MMF_OOM_SKIP because chances that it would release
         * any memory is quite low.
         */
        if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {//not a sysrq OOM, and the task is already marked as an OOM victim
                if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))//MMF_OOM_SKIP means it has already been killed/reaped: move on, so the same task is not picked twice
                        goto next;
                goto abort;//abort this selection pass: that victim is still in the middle of being killed
        }

        /*
         * If task is allocating a lot of memory and has been marked to be
         * killed first if it triggers an oom, then select it.
         */
        if (oom_task_origin(task)) {//the task is flagged to be killed first: give it the top score, so the scan can only end up choosing it
                points = LONG_MAX;
                goto select;
        }

        points = oom_badness(task, oc->totalpages);//score the task by its memory usage and adjustment factor
        if (oom_next_task(task, oc, points))//does this score beat the best score among the tasks scanned so far?
                goto next;

select:
        //(possible optimization: in the oom_task_origin() case points is already LONG_MAX, so the remaining tasks would not need to be scanned at all)
        if (oc->chosen)//reaching here means the task is either flagged kill-first or outscores everything scanned so far
                put_task_struct(oc->chosen);
        get_task_struct(task);
        oc->chosen = task;//make this task the candidate, it has the highest score
        oc->chosen_points = points;//remember the score
next:
        return 0;
abort:
        if (oc->chosen) //drop the reference to the candidate chosen so far
                put_task_struct(oc->chosen);
        oc->chosen = (void *)-1UL;//special marker meaning the selection was aborted
        return 1;
}

oom_cpuset_eligible() decides whether the candidate tsk can even reach the memory that triggered the OOM (the nodes current is allowed to allocate from): if even a single node intersects, tsk may be holding some of that memory, so it may be killed. I honestly don't quite see why mere intersection is enough. Isn't there a case like this: tsk's nodes are 0-1 and the mask is 1-2, so they intersect, yet tsk actually allocated from node 0; killing it frees node-0 memory that the current process cannot use anyway, being confined to nodes 1-2. The kernel code is surely correct, I just haven't understood it fully. (The bitmask sketch after the function walks through exactly this scenario.)

For a mempolicy OOM the process bound itself to certain nodes, so only processes bound to (some of) those nodes should be killed.

For a cpuset OOM, processes are confined to a group that shares a memory budget, so only processes in the group that triggered the OOM may be killed.

static bool oom_cpuset_eligible(struct task_struct *start,
                                struct oom_control *oc)
{
        struct task_struct *tsk;
        bool ret = false;
        const nodemask_t *mask = oc->nodemask;//the nodes of the failed allocation

        if (is_memcg_oom(oc))//memcg OOMs are node-agnostic
                return true;

        rcu_read_lock();
        for_each_thread(start, tsk) {
                if (mask) {//out_of_memory() already cleared oc->nodemask for non-mempolicy OOMs, so a non-NULL mask means a mempolicy OOM
                        /*
                         * If this is a mempolicy constrained oom, tsk's
                         * cpuset is irrelevant.  Only return true if its
                         * mempolicy intersects current, otherwise it may be
                         * needlessly killed.
                         */
                        ret = mempolicy_nodemask_intersects(tsk, mask);//does tsk's mempolicy intersect the constrained nodes? only memory freed there can relieve a mempolicy OOM
                } else { //cpuset-constrained OOM
                        /*
                         * This is not a mempolicy constrained oom, so only
                         * check the mems of tsk's cpuset.
                         */
                        ret = cpuset_mems_allowed_intersects(current, tsk);//do tsk's cpuset-allowed nodes intersect current's?
                }                                                          //this avoids killing tasks in a completely different resource domain:
                                                                           //memory freed from a task in another cpuset may do nothing for this cpuset's pressure
                if (ret)
                        break;
        }
        rcu_read_unlock();

        return ret;
}
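
The intersection test above is essentially a bitwise AND of nodemasks. Below is a minimal userspace sketch (plain unsigned long bitmasks standing in for nodemask_t) of exactly the scenario questioned earlier: task nodes 0-1 versus OOM mask 1-2 share node 1, so the kernel treats the task as a potential holder of the constrained memory. Determining whether its pages actually live on node 1 would require walking them, far too expensive at OOM time, so the kernel settles for this cheap over-approximation.

#include <stdio.h>

/* Plain bitmasks stand in for the kernel's nodemask_t. */
static int nodes_intersect(unsigned long a, unsigned long b)
{
        return (a & b) != 0;
}

int main(void)
{
        unsigned long task_nodes = (1UL << 0) | (1UL << 1); /* task allowed: nodes 0-1 */
        unsigned long oom_mask   = (1UL << 1) | (1UL << 2); /* OOM constraint: nodes 1-2 */

        /* Node 1 is in both masks, so the task *may* hold memory that
         * would relieve the OOM; the kernel accepts this false positive
         * rather than scan the task's pages to find out. */
        printf("eligible: %d\n", nodes_intersect(task_nodes, oom_mask));
        return 0;
}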

oom_badness() scores a task: the higher the score, the better a kill candidate the task is. A return of LONG_MIN, the lowest possible score, means the task will never be killed. A worked example of the arithmetic follows the function.

long oom_badness(struct task_struct *p, unsigned long totalpages)
{
        long points;
        long adj;

        if (oom_unkillable_task(p))//kernel threads (and init) are never killed
                return LONG_MIN;

        p = find_lock_task_mm(p);//the task must still have an mm; without one, killing it frees nothing, so it is pointless
        if (!p)
                return LONG_MIN;

        /*
         * Do not even consider tasks which are explicitly marked oom
         * unkillable or have been already oom reaped or they are in
         * the middle of vfork
         */
        adj = (long)p->signal->oom_score_adj;//oom_score_adj == -1000: the user marked it never-kill; MMF_OOM_SKIP: the kernel marked it to be skipped
        if (adj == OOM_SCORE_ADJ_MIN ||      //a vforked child still sharing its parent's mm is the third case; none of the three become candidates
                        test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
                        in_vfork(p)) {
                task_unlock(p);
                return LONG_MIN;
        }

        /*
         * The baseline for the badness score is the proportion of RAM that each
         * task's rss, pagetable and swap space use.
         */
        //base score: the task's resident pages + swap usage + page-table usage
        points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
                mm_pgtables_bytes(p->mm) / PAGE_SIZE;
        task_unlock(p);

        /* Normalize to oom_score_adj units */
        adj *= totalpages / 1000;//scale the adjustment factor: adj is in [-1000, 1000], so the final score is the base score plus or minus up to that many thousandths of total memory
        points += adj;

        return points;
}
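
To make the scoring arithmetic concrete, here is a small userspace sketch that re-runs the formula with made-up numbers (all of the process statistics below are hypothetical):

#include <stdio.h>

int main(void)
{
        long totalpages = 4L * 1024 * 1024; /* 16 GiB of 4 KiB pages */
        long rss        = 500000;           /* resident pages */
        long swapents   = 20000;            /* swapped-out pages */
        long pgtables   = 3000;             /* page-table pages */
        long adj        = -100;             /* oom_score_adj in [-1000, 1000] */

        long points = rss + swapents + pgtables;   /* base score: 523000 */
        points += adj * (totalpages / 1000);       /* -100 * 4194 = -419400 */

        printf("badness = %ld\n", points);         /* prints 103600 */
        return 0;
}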

oom_next_task() compares the current task's score with the best score among the tasks scanned so far; returning true means the current task loses and the scan moves on.

static bool oom_next_task(struct task_struct *task, struct oom_control *oc,
                        long points)
{
        struct mem_cgroup *cur_memcg;
        struct mem_cgroup *oc_memcg;
        //without CONFIG_MEMCG_QOS: skip tasks that are unkillable (LONG_MIN) or score below the currently chosen one
        if (!static_branch_likely(&memcg_qos_stat_key))
                return (points == LONG_MIN || points < oc->chosen_points);

        if (points == LONG_MIN)
                return true;

        if (!oc->chosen)//no candidate yet, take the current task
                return false;

        oc_memcg = mem_cgroup_from_task(oc->chosen);
        cur_memcg = mem_cgroup_from_task(task);

        //same priority: compare scores, skip the current task if it scores lower; different priority: ignore the scores and prefer the task from the low-priority memcg as the victim
        if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) {
                if (points < oc->chosen_points)
                        return true;
                return false;
        }
        //from what I could find, memcg_priority only takes the values 0 and 1
        /* if oc is low-priority, so skip the task */
        if (oc_memcg->memcg_priority) //the chosen task's memcg is already low priority (1), the preferred victim: keep it and skip the current task
                return true;

        return false;//the chosen is priority 0, so the current task's memcg must be low priority (1): select the current task
}

oom_kill_process() handles the chosen task: if it is already exiting, the oom_reaper thread is left to reap its memory; otherwise __oom_kill_process() takes over.

static void oom_kill_process(struct oom_control *oc, const char *message)
{
        struct task_struct *victim = oc->chosen;
        struct mem_cgroup *oom_group;
        static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
                                              DEFAULT_RATELIMIT_BURST);

        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
         * its children or threads, just give it access to memory reserves
         * so it can die quickly
         */
        task_lock(victim);
        if (task_will_free_mem(victim)) {//the victim is already exiting and nothing pins its mm, so its memory can be freed
                mark_oom_victim(victim);//mark it as a victim
                wake_oom_reaper(victim);//wake the oom_reaper thread to handle it
                task_unlock(victim);
                put_task_struct(victim);
                return;
        }
        task_unlock(victim);

        if (__ratelimit(&oom_rs))//rate-limited logging of the OOM report
                dump_header(oc, victim);

        /*
         * Do we need to kill the entire memory cgroup?
         * Or even one of the ancestor memory cgroups?
         * Check this out before killing the victim task.
         */
        oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

        __oom_kill_process(victim, message);

        /*
         * If necessary, kill all tasks in the selected memory cgroup.
         */
        if (oom_group) {
                mem_cgroup_print_oom_group(oom_group);
                mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
                                      (void*)message);
                mem_cgroup_put(oom_group);
        }
}

__oom_kill_process() handles a victim that is not already exiting (probably the common case): it sends SIGKILL to the victim and to every task outside its thread group that shares its mm, then leaves the memory to the oom_reaper kernel thread to reap.

static void __oom_kill_process(struct task_struct *victim, const char *message)
{
        struct task_struct *p;
        struct mm_struct *mm;
        bool can_oom_reap = true;

        p = find_lock_task_mm(victim);//the mm is already gone, nothing can be freed: skip this task
        if (!p) {
                pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
                        message, task_pid_nr(victim), victim->comm);
                put_task_struct(victim);
                return;
        } else if (victim != p) {//the chosen task's mm is gone but a sub-thread still holds it: operate on that thread's task instead
                get_task_struct(p);
                put_task_struct(victim);
                victim = p;
        }

        /* Get a reference to safely compare mm after task_unlock(victim) */
        mm = victim->mm;
        mmgrab(mm);

        /* Raise event before sending signal: task reaper must see this */
        count_vm_event(OOM_KILL);
        memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

        /*
         * We should send SIGKILL before granting access to memory reserves
         * in order to prevent the OOM victim from depleting the memory
         * reserves from the user space under its control.
         */
        do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);//send SIGKILL to the victim
        mark_oom_victim(victim);//mark it as a victim, then log
        pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
                message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
                K(get_mm_counter(mm, MM_ANONPAGES)),
                K(get_mm_counter(mm, MM_FILEPAGES)),
                K(get_mm_counter(mm, MM_SHMEMPAGES)),
                from_kuid(&init_user_ns, task_uid(victim)),
                mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
        task_unlock(victim);

        /*
         * Kill all user processes sharing victim->mm in other thread groups, if
         * any.  They don't get access to memory reserves, though, to avoid
         * depletion of all memory.  This prevents mm->mmap_lock livelock when an
         * oom killed thread cannot exit because it requires the semaphore and
         * its contended by another thread trying to allocate memory itself.
         * That thread will now get access to memory reserves since it has a
         * pending fatal signal.
         */
        rcu_read_lock();
        for_each_process(p) {//walk every process in the system: any task sharing the victim's mm outside its thread group is killed too, except init and kernel threads
                if (!process_shares_mm(p, mm))
                        continue;
                if (same_thread_group(p, victim))
                        continue;
                if (is_global_init(p)) {
                        can_oom_reap = false;
                        set_bit(MMF_OOM_SKIP, &mm->flags);
                        pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
                                        task_pid_nr(victim), victim->comm,
                                        task_pid_nr(p), p->comm);
                        continue;
                }
                /*
                 * No kthread_use_mm() user needs to read from the userspace so
                 * we are ok to reap it.
                 */
                if (unlikely(p->flags & PF_KTHREAD))
                        continue;
                do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
        }
        rcu_read_unlock();

        if (can_oom_reap)//in the end the reaping is still done by the oom_reaper thread
                wake_oom_reaper(victim);

        mmdrop(mm);
        put_task_struct(victim);
}

wake_oom_reaper() adds the victim to the oom_reaper_list and wakes the oom_reaper kernel thread.

static void wake_oom_reaper(struct task_struct *tsk)
{
        /* mm is already queued? */
        if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))//the victim is already queued
                return;

        get_task_struct(tsk);

        spin_lock(&oom_reaper_lock);
        tsk->oom_reaper_list = oom_reaper_list; //push the victim onto the oom_reaper_list
        oom_reaper_list = tsk;
        spin_unlock(&oom_reaper_lock);
        trace_wake_reaper(tsk->pid);
        wake_up(&oom_reaper_wait);//wake the oom_reaper thread
}

As the code shows, oom_reaper() is an infinite loop: it sleeps until oom_reaper_list becomes non-empty, then pops one task off the list and hands it to oom_reap_task(). A userspace sketch of this producer/consumer pattern follows the function.

static int oom_reaper(void *unused)//the oom_reaper thread loops, freeing the memory of the victims on oom_reaper_list
{
        while (true) {
                struct task_struct *tsk = NULL;

                wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);//sleep until oom_reaper_list becomes non-empty
                spin_lock(&oom_reaper_lock);
                if (oom_reaper_list != NULL) {
                        tsk = oom_reaper_list;//pop one victim off the list
                        oom_reaper_list = tsk->oom_reaper_list;
                }
                spin_unlock(&oom_reaper_lock);

                if (tsk)
                        oom_reap_task(tsk);//free the chosen task's memory
        }

        return 0;
}
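
wake_oom_reaper()/oom_reaper() together form a classic single-consumer work queue over an intrusive LIFO list. The following minimal userspace sketch mirrors that pattern; a pthread mutex and condition variable stand in for the kernel's spinlock and waitqueue, and struct victim is a made-up stand-in for task_struct (compile with -pthread; the demo reaper loops forever by design):

#include <pthread.h>
#include <stdio.h>

struct victim {
        int pid;
        struct victim *next;          /* plays the role of oom_reaper_list */
};

static struct victim *reap_list;
static pthread_mutex_t reap_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reap_wait = PTHREAD_COND_INITIALIZER;

static void wake_reaper(struct victim *v)     /* cf. wake_oom_reaper() */
{
        pthread_mutex_lock(&reap_lock);
        v->next = reap_list;                  /* push onto the LIFO list */
        reap_list = v;
        pthread_mutex_unlock(&reap_lock);
        pthread_cond_signal(&reap_wait);      /* cf. wake_up() */
}

static void *reaper(void *unused)             /* cf. oom_reaper() */
{
        for (;;) {
                struct victim *v;

                pthread_mutex_lock(&reap_lock);
                while (!reap_list)            /* cf. wait_event_freezable() */
                        pthread_cond_wait(&reap_wait, &reap_lock);
                v = reap_list;                /* pop one victim */
                reap_list = v->next;
                pthread_mutex_unlock(&reap_lock);

                printf("reaping pid %d\n", v->pid);
        }
        return unused;
}

int main(void)
{
        pthread_t tid;
        struct victim v = { .pid = 1234 };

        pthread_create(&tid, NULL, reaper, NULL);
        wake_reaper(&v);
        pthread_join(tid, NULL);              /* never returns; demo only */
        return 0;
}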

oom_reap_task() calls oom_reap_task_mm() to do the actual reaping; a sketch of its retry policy follows the function.

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
        int attempts = 0;
        struct mm_struct *mm = tsk->signal->oom_mm;

        /* Retry the mmap_read_trylock(mm) a few times */ //try to reap the victim's physical memory: up to 10 attempts, sleeping 0.1 s between them
        while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
                schedule_timeout_idle(HZ/10);

        if (attempts <= MAX_OOM_REAP_RETRIES ||//leaving the loop within 10 attempts means the reap succeeded; alternatively the mm already carries MMF_OOM_SKIP, so skip it
            test_bit(MMF_OOM_SKIP, &mm->flags))
                goto done;

        pr_info("oom_reaper: unable to reap pid:%d (%s)\n",//回收失败,打印相关信息
                task_pid_nr(tsk), tsk->comm);
        sched_show_task(tsk);
        debug_show_all_locks();

done:
        tsk->oom_reaper_list = NULL;

        /*
         * Hide this mm from OOM killer because it has been either reaped or
         * somebody can't call mmap_write_unlock(mm).
         */
        set_bit(MMF_OOM_SKIP, &mm->flags); //set MMF_OOM_SKIP whether the reap succeeded or all 10 attempts failed, so this mm is never reaped again

        /* Drop a reference taken by wake_oom_reaper */
        put_task_struct(tsk);
}
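
The retry policy is worth a second look: because of the post-increment, attempts ends at most one past MAX_OOM_REAP_RETRIES, so attempts <= MAX_OOM_REAP_RETRIES after the loop means exactly that some oom_reap_task_mm() call succeeded. A minimal userspace sketch of the same pattern (try_reap() is a made-up stand-in that always fails here):

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_OOM_REAP_RETRIES 10

static bool try_reap(void)
{
        return false;   /* pretend the mmap lock is always contended */
}

int main(void)
{
        int attempts = 0;

        /* Same shape as the kernel loop: up to 10 tries, 100 ms apart. */
        while (attempts++ < MAX_OOM_REAP_RETRIES && !try_reap())
                usleep(100 * 1000);           /* cf. schedule_timeout_idle(HZ/10) */

        if (attempts <= MAX_OOM_REAP_RETRIES)
                puts("reap succeeded");
        else
                puts("unable to reap; set MMF_OOM_SKIP and move on");
        return 0;
}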

oom_reap_task_mm() reaps the task's address space.

static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
        bool ret = true;

        if (!mmap_read_trylock(mm)) {
                trace_skip_task_reaping(tsk->pid);
                return false;
        }

        /*
         * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
         * work on the mm anymore. The check for MMF_OOM_SKIP must run
         * under mmap_lock for reading because it serializes against the
         * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
         */
        if (test_bit(MMF_OOM_SKIP, &mm->flags)) {//never reap an mm that has MMF_OOM_SKIP set
                trace_skip_task_reaping(tsk->pid);
                goto out_unlock;
        }

        trace_start_task_reaping(tsk->pid);

        /* failed to reap part of the address space. Try again later */
        ret = __oom_reap_task_mm(mm);//do the actual reaping
        if (!ret)
                goto out_finish;

        pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
                        task_pid_nr(tsk), tsk->comm,
                        K(get_mm_counter(mm, MM_ANONPAGES)),
                        K(get_mm_counter(mm, MM_FILEPAGES)),
                        K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
        trace_finish_task_reaping(tsk->pid);
out_unlock:
        mmap_read_unlock(mm);

        return ret;
}
