Kernel version: openEuler 5.10.0
The function that handles OOM is out_of_memory(); the kernel calls it when it cannot satisfy a memory allocation request.
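Before walking the code, here is a quick way to watch this path fire. The demo below is my own, not kernel code: it keeps allocating and touching anonymous memory until reclaim gives up and out_of_memory() SIGKILLs it. Only run it inside a throwaway VM or a memory-limited cgroup:

/* oom_demo.c - deliberately drive the system into OOM.
 * WARNING: run in a throwaway VM or a memory-limited cgroup. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        size_t chunk = 64UL << 20; /* grab 64 MiB per iteration */
        size_t total = 0;

        for (;;) {
                char *p = malloc(chunk);
                if (!p)
                        break; /* with default overcommit this rarely happens */
                memset(p, 0xa5, chunk); /* touch every page so it is really allocated */
                total += chunk;
                printf("allocated %zu MiB\n", total >> 20);
        }
        return 0;
}

Once physical memory and swap are exhausted, the OOM killer typically picks this process (it has the largest RSS), and the kernel log shows the "Out of memory: Killed process ..." line printed by the code analyzed below.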
bool out_of_memory(struct oom_control *oc)
{
        unsigned long freed = 0;

        if (oom_killer_disabled) // the OOM killer is disabled: return
                return false;

        if (!sysctl_enable_oom_killer) { // sysctl disabled the OOM killer: just run the registered notifiers through the notifier chain and return
                oom_type_notifier_call(0, oc);
                return false;
        }

        if (!is_memcg_oom(oc)) { // non-memcg OOM: let the notifier chain callbacks try to free some memory; if anything was freed, it may already be enough for the allocation, so stop here
                blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
                if (freed > 0)
                        /* Got some memory back in the last second. */
                        return true;
        }

        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it. The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        // if the current task is already exiting and its mm is not pinned by other live tasks, let the oom_reaper kernel thread free its memory quickly and return
        if (task_will_free_mem(current)) { // the current task can release its memory
                mark_oom_victim(current); // mark it as the OOM victim
                wake_oom_reaper(current); // wake the oom_reaper kernel thread to reclaim the victim's memory
                return true;
        }

        /*
         * The OOM killer does not compensate for IO-less reclaim.
         * pagefault_out_of_memory lost its gfp context so we have to
         * make sure exclude 0 mask - all other users should have at least
         * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
         * invoke the OOM killer even if it is a GFP_NOFS allocation.
         */
        // A task allocating on a filesystem-critical path uses GFP_NOFS to avoid recursive deadlock. Such an allocation cannot use the full reclaim machinery (no filesystem writeback), so its failure does not prove the system is truly out of memory; killing a task here would be premature, so just return.
        if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) // non-memcg OOM in a context where filesystem operations are forbidden
                return true;

        /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA and memcg) that may require different handling.
         */
        oc->constraint = constrained_alloc(oc); // classify the failed allocation: determine the cause and scope of the shortage
        if (oc->constraint != CONSTRAINT_MEMORY_POLICY) // if the OOM is not due to a memory policy, the nodemask is irrelevant below (only task->mems_allowed matters); clearing oc->nodemask also distinguishes the mempolicy and cpuset constraint types
                oc->nodemask = NULL;
        check_panic_on_oom(oc); // if panic-on-OOM is configured, take the panic path

        // if killing the allocating task is configured and the conditions below all hold, kill the current task
        if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && // not a memcg OOM (a memcg OOM means the system as a whole still has memory, so nothing else needs to die) and sysctl_oom_kill_allocating_task is set
            current->mm && !oom_unkillable_task(current) && // never kill init or kernel threads
            oom_cpuset_eligible(current, oc) && // per-constraint eligibility check, to avoid needlessly killing tasks unrelated to this OOM
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { // and the task has not opted out of OOM killing via oom_score_adj
                get_task_struct(current);
                oc->chosen = current; // kill the current task
                oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
                return true;
        }

        select_bad_process(oc); // pick the most suitable OOM victim

        /* Found nothing?!?! */
        if (!oc->chosen) { // no killable task was found: unless the OOM was triggered manually or is memcg-based, panic the system
                dump_header(oc, NULL);
                pr_warn("Out of memory and no killable processes...\n");
                /*
                 * If we got here due to an actual allocation at the
                 * system level, we cannot survive this and will enter
                 * an endless loop in the allocator. Bail out now.
                 */
                if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
                        panic("System is deadlocked on memory\n");
        }
        if (oc->chosen && oc->chosen != (void *)-1UL) // a suitable victim was chosen: kill it to free memory
                oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
                                 "Memory cgroup out of memory");
        return !!oc->chosen;
}
task_will_free_mem() decides whether a task can release its memory right away. That holds when:
1: __task_will_free_mem() says the task is already exiting; only an exiting task can be picked for fast memory release.
2: The task's mm is not shared with other processes. If it is shared, every process sharing the mm must itself be exiting and not otherwise pinning the memory.
static bool task_will_free_mem(struct task_struct *task)
{
        struct mm_struct *mm = task->mm;
        struct task_struct *p;
        bool ret = true;

        /*
         * Skip tasks without mm because it might have passed its exit_mm and
         * exit_oom_victim. oom_reaper could have rescued that but do not rely
         * on that for now. We can consider find_lock_task_mm in future.
         */
        if (!mm)
                return false;

        if (!__task_will_free_mem(task)) // quick check: is the task on its way out?
                return false;

        /*
         * This task has already been drained by the oom reaper so there are
         * only small chances it will free some more
         */
        if (test_bit(MMF_OOM_SKIP, &mm->flags)) // the mm is already being reaped by the oom_reaper thread (or reaping failed); it won't be reaped again
                return false;

        if (atomic_read(&mm->mm_users) <= 1) // the mm is not shared, so it can be freed directly
                return true;

        /*
         * Make sure that all tasks which share the mm with the given tasks
         * are dying as well to make sure that a) nobody pins its mm and
         * b) the task is also reapable by the oom reaper.
         */
        rcu_read_lock();
        for_each_process(p) { // the mm is shared: make sure every process sharing it with task can also release it
                if (!process_shares_mm(p, mm)) // skip processes that do not share this mm
                        continue;
                if (same_thread_group(task, p)) // skip threads in task's own thread group
                        continue;
                ret = __task_will_free_mem(p); // each sharer must itself be about to release its memory
                if (!ret)
                        break;
        }
        rcu_read_unlock();
        return ret;
}
__task_will_free_mem() is the quick check of whether a task is exiting.
static inline bool __task_will_free_mem(struct task_struct *task)
{
        struct signal_struct *sig = task->signal;

        /*
         * A coredumping process may sleep for an extended period in exit_mm(),
         * so the oom killer cannot assume that the process will promptly exit
         * and release memory.
         */
        if (sig->flags & SIGNAL_GROUP_COREDUMP) // a coredumping task may sleep for a long time and won't free memory soon
                return false;

        if (sig->flags & SIGNAL_GROUP_EXIT) // the whole thread group is exiting; all its memory will be freed
                return true;

        if (thread_group_empty(task) && (task->flags & PF_EXITING)) // single-threaded and exiting; all its memory will be freed
                return true;

        return false;
}
constrained_alloc() determines the constraint type behind the current OOM and computes the total memory limit for that type.
A few words on the constraint types first:
enum oom_constraint {
        CONSTRAINT_NONE,
        CONSTRAINT_CPUSET,
        CONSTRAINT_MEMORY_POLICY,
        CONSTRAINT_MEMCG,
};
CONSTRAINT_NONE: a non-NUMA system, or a NUMA system with no node restrictions; an OOM here means global physical memory is exhausted.
CONSTRAINT_CPUSET: cpuset is a kernel mechanism that dedicates a set of CPUs and memory nodes to a group of processes; an OOM only means the nodes that group may allocate from are exhausted, while other nodes may still have free memory.
CONSTRAINT_MEMORY_POLICY: a memory policy is a NUMA mechanism by which a user process binds itself to specific nodes; an OOM here does not imply the whole system is out of memory either.
CONSTRAINT_MEMCG: a memcg caps the total memory usage of a group of processes, and an OOM fires when the group exceeds its configured limit. It says nothing about global physical memory and cares only about the quota, not about nodes.
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
        struct zone *zone;
        struct zoneref *z;
        enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
        bool cpuset_limited = false;
        int nid;

        if (is_memcg_oom(oc)) { // memcg-induced OOM: the total limit is the memcg's own limit; report the memcg constraint type
                oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
                return CONSTRAINT_MEMCG;
        }

        /* Default to all available memory */
        oc->totalpages = totalram_pages() + total_swap_pages; // total limit for the unconstrained case

        if (!IS_ENABLED(CONFIG_NUMA)) // without NUMA, or with a NULL oc->zonelist, nodes and zones are irrelevant: treat the OOM as unconstrained
                return CONSTRAINT_NONE;

        if (!oc->zonelist)
                return CONSTRAINT_NONE;
        /*
         * Reach here only when __GFP_NOFAIL is used. So, we should avoid
         * to kill current.We have to random task kill in this case.
         * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
         */
        if (oc->gfp_mask & __GFP_THISNODE)
                return CONSTRAINT_NONE;

        /*
         * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
         * the page allocator means a mempolicy is in effect. Cpuset policy
         * is enforced in get_page_from_freelist().
         */
        // mempolicy detection: if the set of nodes that have memory is not a subset of the allowed nodes, i.e. some node with memory lies outside the allowed set, this is a mempolicy-constrained OOM: the program restricted itself to certain nodes, e.g. the system has nodes 0-3 but the program only allows 0-2
        if (oc->nodemask &&
            !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
                oc->totalpages = total_swap_pages; // recompute the total limit for the mempolicy case
                for_each_node_mask(nid, *oc->nodemask)
                        oc->totalpages += node_present_pages(nid); // add up the memory of the allowed nodes
                return CONSTRAINT_MEMORY_POLICY;
        }

        // cpuset detection, for when the program does not restrict nodes itself but its cpuset does (e.g. the program allows nodes 0-3 while the cpuset limits it to 0-2): walk all usable zones and check whether any is excluded by the cpuset
        /* Check this allocation failure is caused by cpuset's wall function */
        for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
                        highest_zoneidx, oc->nodemask)
                if (!cpuset_zone_allowed(zone, oc->gfp_mask)) // e.g. the task and the system both allow nodes 0-3, but the walk finds a zone on node 3 that is outside the cpuset's 0-2
                        cpuset_limited = true;

        if (cpuset_limited) { // cpuset-limited: recompute the allowed total as swap plus the memory of the cpuset-allowed nodes
                oc->totalpages = total_swap_pages;
                for_each_node_mask(nid, cpuset_current_mems_allowed)
                        oc->totalpages += node_present_pages(nid);
                return CONSTRAINT_CPUSET;
        }
        return CONSTRAINT_NONE;
}
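To see how a process ends up in the CONSTRAINT_MEMORY_POLICY case above, here is a small userspace sketch of my own (assuming libnuma is installed; build with gcc mempolicy_demo.c -lnuma). It binds the caller's future allocations to nodes 0-1, so if those nodes run dry, the kernel classifies the resulting OOM as mempolicy-constrained even though other nodes may still have free memory:

/* mempolicy_demo.c - bind this process's allocations to nodes 0-1 */
#include <numa.h>
#include <stdio.h>

int main(void)
{
        if (numa_available() < 0) {
                fprintf(stderr, "NUMA is not supported here\n");
                return 1;
        }
        struct bitmask *nodes = numa_parse_nodestring("0-1");
        if (!nodes) {
                fprintf(stderr, "bad or absent nodes\n");
                return 1;
        }
        numa_set_membind(nodes); /* MPOL_BIND to nodes 0-1 underneath */
        numa_bitmask_free(nodes);
        /* ... allocate as usual: pages now come only from nodes 0-1 ... */
        return 0;
}

The cpuset case is analogous, except the restriction is imposed from outside, through the cpuset cgroup's cpuset.mems file, rather than by the process itself.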
check_panic_on_oom() decides whether the system should panic (halt or reboot) on OOM. It mainly consults /proc/sys/vm/panic_on_oom: with 0, or with 1 when the OOM was not caused by global memory exhaustion, it returns and a victim is selected as usual; with 2, every OOM triggers a panic. In all cases an OOM triggered manually via echo m > /proc/sysrq-trigger never panics.
static void check_panic_on_oom(struct oom_control *oc)
{
        if (likely(!sysctl_panic_on_oom)) // not set: return
                return;

        if (sysctl_panic_on_oom != 2) { // set to 1: only panic when global memory is exhausted
                /*
                 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
                 * does not panic for cpuset, mempolicy, or memcg allocation
                 * failures.
                 */
                // an OOM caused by cpuset, mempolicy or memcg does not mean the whole system is out of memory, so no panic
                if (oc->constraint != CONSTRAINT_NONE)
                        return;
        }

        /* Do not panic for oom kills triggered by sysrq */
        if (is_sysrq_oom(oc)) // a manually triggered OOM never panics
                return;

        dump_header(oc, NULL); // dump diagnostics, then panic
        panic("Out of memory: %s panic_on_oom is enabled\n",
              sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
select_bad_process() picks the task that will be killed.
static void select_bad_process(struct oom_control *oc)
{
        oc->chosen_points = LONG_MIN;

        if (is_memcg_oom(oc)) // memcg OOM: scan only the tasks in that memcg
                mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
        else { // non-memcg OOM: with CONFIG_MEMCG_QOS, first look for a target among low-priority memcgs
                struct task_struct *p;

#ifdef CONFIG_MEMCG_QOS
                if (memcg_low_priority_scan_tasks(oom_evaluate_task, oc)) // try the low-priority cgroups first
                        return;
#endif
                rcu_read_lock();
                for_each_process(p) // otherwise scan every process in the system
                        if (oom_evaluate_task(p, oc))
                                break;
                rcu_read_unlock();
        }
}
Whichever of the three walks above runs, the target is ultimately chosen by oom_evaluate_task().
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
        struct oom_control *oc = arg;
        long points;

        if (oom_unkillable_task(task)) // skip init (PID 1) and kernel threads; they have no reapable mm anyway
                goto next;

        // for a non-memcg OOM, check whether the candidate (including its threads) is eligible under the constraint
        /* p may not have freeable memory in nodemask */
        if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
                goto next;

        /*
         * This task already has access to memory reserves and is being killed.
         * Don't allow any other task to have access to the reserves unless
         * the task has MMF_OOM_SKIP because chances that it would release
         * any memory is quite low.
         */
        if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { // not a sysrq OOM, and the task is already marked as an OOM victim
                if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) // MMF_OOM_SKIP set: it has already been killed and reaped, so move on and don't pick the same task twice
                        goto next;
                goto abort; // abort this selection round: a previously chosen victim is still in the middle of being killed
        }

        /*
         * If task is allocating a lot of memory and has been marked to be
         * killed first if it triggers an oom, then select it.
         */
        if (oom_task_origin(task)) { // the task is marked to be killed first on OOM: give it the top score, so the walk can only end up picking it
                points = LONG_MAX;
                goto select;
        }

        points = oom_badness(task, oc->totalpages); // score the task by its memory usage and the adjustment factor
        if (oom_next_task(task, oc, points)) // does this task's score beat the best score seen so far?
                goto next;

select:
        // possible optimization: in the oom_task_origin case points is already LONG_MAX, the highest possible score, so the remaining tasks would not need to be scanned
        if (oc->chosen) // reaching here: either the task is marked kill-first, or it outscored every task seen so far
                put_task_struct(oc->chosen);
        get_task_struct(task);
        oc->chosen = task; // make it the candidate, since it currently has the highest score
        oc->chosen_points = points; // remember the score
next:
        return 0;
abort:
        if (oc->chosen) // drop the reference on the already-chosen candidate
                put_task_struct(oc->chosen);
        oc->chosen = (void *)-1UL; // special marker: selection aborted
        return 1;
}
oom_cpuset_eligible() decides whether a candidate tsk can even reach the memory that triggered the OOM, i.e. the nodes current is allowed to allocate from: if even one node intersects, tsk may be occupying some of that memory, so it is eligible to be killed. I don't fully understand why mere intersection is enough. Consider this case: tsk's nodes are 0-1 and the mask is 1-2, but tsk actually allocated everything from node 0; killing it frees node 0 memory that the current task, confined to nodes 1-2, cannot use anyway. Still, the kernel code is presumably right and I just haven't grasped it yet. (My guess: intersection is a cheap necessary condition. With no overlap, killing the task is guaranteed useless; with overlap, it at least may help.)
For a mempolicy OOM, where processes bound themselves to certain nodes, only processes bound to nodes among the constrained ones may be killed.
For a cpuset OOM, where processes are confined to a group sharing a memory budget, only processes in the group that triggered the OOM (cpusets whose nodes intersect) may be killed.
static bool oom_cpuset_eligible(struct task_struct *start,
                                struct oom_control *oc)
{
        struct task_struct *tsk;
        bool ret = false;
        const nodemask_t *mask = oc->nodemask; // the nodes the failed allocation was allowed to use

        if (is_memcg_oom(oc)) // a memcg OOM has nothing to do with nodes
                return true;

        rcu_read_lock();
        for_each_thread(start, tsk) {
                if (mask) { // out_of_memory() cleared oc->nodemask for non-mempolicy OOMs, so non-NULL means a mempolicy OOM
                        /*
                         * If this is a mempolicy constrained oom, tsk's
                         * cpuset is irrelevant. Only return true if its
                         * mempolicy intersects current, otherwise it may be
                         * needlessly killed.
                         */
                        ret = mempolicy_nodemask_intersects(tsk, mask); // does tsk's mempolicy intersect the constrained nodes? only then can killing it relieve the policy-constrained pressure
                } else { // cpuset-constrained OOM
                        /*
                         * This is not a mempolicy constrained oom, so only
                         * check the mems of tsk's cpuset.
                         */
                        ret = cpuset_mems_allowed_intersects(current, tsk); // do tsk's cpuset-allowed nodes intersect current's?
                } // this avoids killing tasks living under entirely different resource constraints:
                // killing a task in a different cpuset would free memory that cannot relieve this cpuset's pressure
                if (ret)
                        break;
        }
        rcu_read_unlock();
        return ret;
}
oom_badness() scores a task: the higher the score, the better a kill candidate it is. A return of LONG_MIN, the lowest possible score, means the task will never be killed.
long oom_badness(struct task_struct *p, unsigned long totalpages)
{
        long points;
        long adj;

        if (oom_unkillable_task(p)) // init and kernel threads are never killed
                return LONG_MIN;

        p = find_lock_task_mm(p); // the task must have an mm; without one there is nothing to free, so killing it would be pointless
        if (!p)
                return LONG_MIN;

        /*
         * Do not even consider tasks which are explicitly marked oom
         * unkillable or have been already oom reaped or the are in
         * the middle of vfork
         */
        adj = (long)p->signal->oom_score_adj; // oom_score_adj of -1000 means userspace marked the task unkillable; MMF_OOM_SKIP means the kernel marked it to be skipped
        if (adj == OOM_SCORE_ADJ_MIN || // the third case is a task in the middle of vfork, a child sharing its parent's mm; none of the three can be a candidate
            test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
            in_vfork(p)) {
                task_unlock(p);
                return LONG_MIN;
        }

        /*
         * The baseline for the badness score is the proportion of RAM that each
         * task's rss, pagetable and swap space use.
         */
        // base score: the task's resident pages + swap entries + page-table pages
        points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
                 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
        task_unlock(p);

        /* Normalize to oom_score_adj units */
        adj *= totalpages / 1000; // adj ranges over [-1000, 1000], so the adjustment adds or subtracts up to that many thousandths of totalpages from the base score
        points += adj;
        return points;
}
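To make the formula concrete, here is a standalone sketch of my own with invented numbers (not kernel code), assembling a score exactly the way the last few lines above do:

/* badness_demo.c - worked example of the oom_badness() arithmetic */
#include <stdio.h>

int main(void)
{
        /* hypothetical machine: 8 GiB RAM + 2 GiB swap, 4 KiB pages */
        long totalpages = (8L + 2L) * 1024 * 1024 * 1024 / 4096; /* 2621440 */

        /* hypothetical task: 1 GiB resident, 256 MiB swapped out,
         * 8 MiB of page tables, oom_score_adj = 300 */
        long rss      = 1L * 1024 * 1024 * 1024 / 4096; /* 262144 pages */
        long swapents = 256L * 1024 * 1024 / 4096;      /*  65536 pages */
        long pgtables = 8L * 1024 * 1024 / 4096;        /*   2048 pages */
        long adj      = 300;

        long points = rss + swapents + pgtables; /* base score: 329728 */
        points += adj * (totalpages / 1000);     /* + 300 * 2621 = 786300 */

        printf("badness = %ld\n", points);       /* 1116028 */
        return 0;
}

A task can raise or lower its own adj by writing to /proc/self/oom_score_adj; writing -1000 (OOM_SCORE_ADJ_MIN) opts it out of OOM killing entirely, which is exactly the early-return case checked above.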
oom_next_task() compares the current task's score with the best score among the tasks walked so far; returning true means the current task should be skipped.
static bool oom_next_task(struct task_struct *task, struct oom_control *oc,
                          long points)
{
        struct mem_cgroup *cur_memcg;
        struct mem_cgroup *oc_memcg;

        // without CONFIG_MEMCG_QOS: skip tasks scored LONG_MIN (unkillable) or scored below the current candidate, and keep walking
        if (!static_branch_likely(&memcg_qos_stat_key))
                return (points == LONG_MIN || points < oc->chosen_points);

        if (points == LONG_MIN)
                return true;

        if (!oc->chosen) // no candidate yet: take the current task
                return false;

        oc_memcg = mem_cgroup_from_task(oc->chosen);
        cur_memcg = mem_cgroup_from_task(task);

        // same priority: compare scores, and skip the current task if it scores lower; different priorities: ignore the scores and decide by priority alone
        if (cur_memcg->memcg_priority == oc_memcg->memcg_priority) {
                if (points < oc->chosen_points)
                        return true;
                return false;
        }
        // from what I could find, memcg_priority only takes the values 0 and 1
        /* if oc is low-priority, so skip the task */
        if (oc_memcg->memcg_priority) // the chosen candidate is already in a low-priority (1) memcg: keep it and skip the current task
                return true;
        return false; // the chosen candidate has priority 0 (high) while the current task is low priority: select the current task
}
oom_kill_process() handles the chosen task: if it is already exiting, the oom_reaper thread is left to reclaim its memory; otherwise __oom_kill_process() takes over.
static void oom_kill_process(struct oom_control *oc, const char *message)
{
        struct task_struct *victim = oc->chosen;
        struct mem_cgroup *oom_group;
        static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);

        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
         * its children or threads, just give it access to memory reserves
         * so it can die quickly
         */
        task_lock(victim);
        if (task_will_free_mem(victim)) { // the candidate is already exiting and its mm isn't pinned, so its memory can be released
                mark_oom_victim(victim); // mark it as the victim
                wake_oom_reaper(victim); // and wake the oom_reaper thread to handle it
                task_unlock(victim);
                put_task_struct(victim);
                return;
        }
        task_unlock(victim);

        if (__ratelimit(&oom_rs)) // rate-limited diagnostic dump
                dump_header(oc, victim);

        /*
         * Do we need to kill the entire memory cgroup?
         * Or even one of the ancestor memory cgroups?
         * Check this out before killing the victim task.
         */
        oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

        __oom_kill_process(victim, message);

        /*
         * If necessary, kill all tasks in the selected memory cgroup.
         */
        if (oom_group) {
                mem_cgroup_print_oom_group(oom_group);
                mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
                                      (void*)message);
                mem_cgroup_put(oom_group);
        }
}
__oom_kill_process() handles a victim that is not already exiting (probably the common case): it sends SIGKILL to the victim and to every task outside its thread group that shares its mm, then hands the mm to the oom_reaper kernel thread for reclaim.
static void __oom_kill_process(struct task_struct *victim, const char *message)
{
        struct task_struct *p;
        struct mm_struct *mm;
        bool can_oom_reap = true;

        p = find_lock_task_mm(victim); // the mm is already gone and there is nothing to free: skip
        if (!p) {
                pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
                        message, task_pid_nr(victim), victim->comm);
                put_task_struct(victim);
                return;
        } else if (victim != p) { // the group leader's mm is gone but some thread's still exists: use that thread's task instead
                get_task_struct(p);
                put_task_struct(victim);
                victim = p;
        }

        /* Get a reference to safely compare mm after task_unlock(victim) */
        mm = victim->mm;
        mmgrab(mm);

        /* Raise event before sending signal: task reaper must see this */
        count_vm_event(OOM_KILL);
        memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

        /*
         * We should send SIGKILL before granting access to memory reserves
         * in order to prevent the OOM victim from depleting the memory
         * reserves from the user space under its control.
         */
        do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); // send the kill signal to the victim
        mark_oom_victim(victim); // mark it as the OOM victim, then log
        pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
               message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
               K(get_mm_counter(mm, MM_ANONPAGES)),
               K(get_mm_counter(mm, MM_FILEPAGES)),
               K(get_mm_counter(mm, MM_SHMEMPAGES)),
               from_kuid(&init_user_ns, task_uid(victim)),
               mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
        task_unlock(victim);

        /*
         * Kill all user processes sharing victim->mm in other thread groups, if
         * any. They don't get access to memory reserves, though, to avoid
         * depletion of all memory. This prevents mm->mmap_lock livelock when an
         * oom killed thread cannot exit because it requires the semaphore and
         * its contended by another thread trying to allocate memory itself.
         * That thread will now get access to memory reserves since it has a
         * pending fatal signal.
         */
        rcu_read_lock();
        for_each_process(p) { // walk all processes: any that shares this mm and is outside the victim's thread group also gets SIGKILL, except init and kernel threads
                if (!process_shares_mm(p, mm))
                        continue;
                if (same_thread_group(p, victim))
                        continue;
                if (is_global_init(p)) {
                        can_oom_reap = false;
                        set_bit(MMF_OOM_SKIP, &mm->flags);
                        pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
                                task_pid_nr(victim), victim->comm,
                                task_pid_nr(p), p->comm);
                        continue;
                }
                /*
                 * No kthead_use_mm() user needs to read from the userspace so
                 * we are ok to reap it.
                 */
                if (unlikely(p->flags & PF_KTHREAD))
                        continue;
                do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
        }
        rcu_read_unlock();

        if (can_oom_reap) // in the end the reclaim is still done by the oom_reaper thread
                wake_oom_reaper(victim);

        mmdrop(mm);
        put_task_struct(victim);
}
wake_oom_reaper() pushes the victim onto the oom_reaper_list queue and wakes the oom_reaper kernel thread.
static void wake_oom_reaper(struct task_struct *tsk)
{
        /* mm is already queued? */
        if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) // the victim is already queued
                return;

        get_task_struct(tsk);

        spin_lock(&oom_reaper_lock);
        tsk->oom_reaper_list = oom_reaper_list; // push the victim onto the oom_reaper_list
        oom_reaper_list = tsk;
        spin_unlock(&oom_reaper_lock);
        trace_wake_reaper(tsk->pid);
        wake_up(&oom_reaper_wait); // wake the oom_reaper thread
}
As you can see, oom_reaper() is an endless loop: it sleeps until oom_reaper_list becomes non-empty, then pops one task off the list and passes it to oom_reap_task().
static int oom_reaper(void *unused) // the oom_reaper thread: loop forever, freeing the memory of the OOM victims queued on oom_reaper_list
{
        while (true) {
                struct task_struct *tsk = NULL;

                wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); // sleep until oom_reaper_list becomes non-empty
                spin_lock(&oom_reaper_lock);
                if (oom_reaper_list != NULL) {
                        tsk = oom_reaper_list; // pop one victim
                        oom_reaper_list = tsk->oom_reaper_list;
                }
                spin_unlock(&oom_reaper_lock);

                if (tsk)
                        oom_reap_task(tsk); // free the chosen task's memory
        }
        return 0;
}
oom_reap_task() calls oom_reap_task_mm() to do the actual reclaim.
#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
        int attempts = 0;
        struct mm_struct *mm = tsk->signal->oom_mm;

        /* Retry the mmap_read_trylock(mm) a few times */
        // try to reap the victim's physical memory, with up to 10 attempts and a 0.1 s sleep between them
        while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
                schedule_timeout_idle(HZ/10);

        if (attempts <= MAX_OOM_REAP_RETRIES || // exiting with attempts still <= 10 means reaping succeeded; alternatively MMF_OOM_SKIP was set meanwhile, so skip this mm
            test_bit(MMF_OOM_SKIP, &mm->flags))
                goto done;

        pr_info("oom_reaper: unable to reap pid:%d (%s)\n", // reaping failed: log the details
                task_pid_nr(tsk), tsk->comm);
        sched_show_task(tsk);
        debug_show_all_locks();

done:
        tsk->oom_reaper_list = NULL;

        /*
         * Hide this mm from OOM killer because it has been either reaped or
         * somebody can't call mmap_write_unlock(mm).
         */
        set_bit(MMF_OOM_SKIP, &mm->flags); // set MMF_OOM_SKIP whether reaping succeeded or all 10 attempts failed, so this mm is never reaped again

        /* Drop a reference taken by wake_oom_reaper */
        put_task_struct(tsk);
}
oom_reap_task_mm() reclaims the task's address space.
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
        bool ret = true;

        if (!mmap_read_trylock(mm)) {
                trace_skip_task_reaping(tsk->pid);
                return false;
        }

        /*
         * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
         * work on the mm anymore. The check for MMF_OOM_SKIP must run
         * under mmap_lock for reading because it serializes against the
         * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
         */
        if (test_bit(MMF_OOM_SKIP, &mm->flags)) { // never reap an mm with MMF_OOM_SKIP set
                trace_skip_task_reaping(tsk->pid);
                goto out_unlock;
        }

        trace_start_task_reaping(tsk->pid);

        /* failed to reap part of the address space. Try again later */
        ret = __oom_reap_task_mm(mm); // do the reclaim
        if (!ret)
                goto out_finish;

        pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
                task_pid_nr(tsk), tsk->comm,
                K(get_mm_counter(mm, MM_ANONPAGES)),
                K(get_mm_counter(mm, MM_FILEPAGES)),
                K(get_mm_counter(mm, MM_SHMEMPAGES)));

out_finish:
        trace_finish_task_reaping(tsk->pid);
out_unlock:
        mmap_read_unlock(mm);

        return ret;
}