This code analysis is based on kernel 6.1.31. Please credit the source when reposting.
Overall call flow
schedule() -> __schedule() -> pick_next_task() -> __pick_next_task() -> pick_next_task_fair() (fast path), or p = class->pick_next_task(rq) (per-class slow path)
When a scheduling or preemption point is reached, schedule() goes through __schedule() and the generic pick_next_task() to select the best task to run next.
Analysis of pick_next_task_fair()
pick_next_task() tries the pick_next_task method of each scheduler class in priority order; here we only look at CFS's pick_next_task_fair().
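The slow path in __pick_next_task() simply walks the classes from highest to lowest priority and returns the first task it gets. A trimmed sketch of that loop (the CFS fast path and locking details are omitted):
    /* sketch of the slow path in __pick_next_task() */
    for_each_class(class) {
        p = class->pick_next_task(rq);
        if (p)
            return p;
    }

    BUG(); /* The idle class should always have a runnable task. */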
< kernel/sched/fair.c >
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    struct cfs_rq *cfs_rq = &rq->cfs;
    struct sched_entity *se;
    struct task_struct *p;
    int new_tasks;

again:
    if (!sched_fair_runnable(rq)) /* checks rq->cfs.nr_running; if it is 0, go straight to the idle path below */
        goto idle;

    /* the group-scheduling branch is not included here; it is not the focus of this analysis */
simple:
    if (prev)
        put_prev_task(rq, prev); /* put the previously running task back at the proper place in its runqueue */

    do {
        /* pick the next scheduling entity */
        se = pick_next_entity(cfs_rq, NULL);
        set_next_entity(cfs_rq, se);
        cfs_rq = group_cfs_rq(se);
    } while (cfs_rq);

    p = task_of(se);

done: __maybe_unused;
#ifdef CONFIG_SMP
    /*
     * Move the next running task to the front of
     * the list, so our cfs_tasks list becomes MRU
     * one.
     */
    list_move(&p->se.group_node, &rq->cfs_tasks);
#endif

    if (hrtick_enabled_fair(rq))
        hrtick_start_fair(rq, p);

    update_misfit_status(p, rq);

    return p;

idle:
    if (!rf)
        return NULL;

    new_tasks = newidle_balance(rq, rf);

    /*
     * Because newidle_balance() releases (and re-acquires) rq->lock, it is
     * possible for any higher priority task to appear. In that case we
     * must re-start the pick_next_entity() loop.
     */
    if (new_tasks < 0)
        return RETRY_TASK;

    if (new_tasks > 0)
        goto again;

    /*
     * rq is about to be idle, check if we need to update the
     * lost_idle_time of clock_pelt
     */
    update_idle_rq_clock_pelt(rq);

    return NULL;
}
Overall structure
Label | Purpose |
---|---|
again: | Restart the selection loop of this function |
simple: | Taken when group scheduling is not configured: put the previous task back on the queue and pick the next one |
idle: | Taken when there are no runnable tasks on the queue: fall back to scheduling the idle task |
Since the prev task is not necessarily managed by CFS, the generic put_prev_task() must be used: it dispatches to the put_prev_task method of prev's own sched_class, which puts the task back onto its ready queue.
The methods defined by the CFS scheduling class are as follows:
< kernel/sched/fair.c >
/*
 * All the scheduling class methods:
 */
DEFINE_SCHED_CLASS(fair) = {
    .enqueue_task       = enqueue_task_fair,
    .dequeue_task       = dequeue_task_fair,
    .yield_task         = yield_task_fair,
    .yield_to_task      = yield_to_task_fair,

    .check_preempt_curr = check_preempt_wakeup,

    .pick_next_task     = __pick_next_task_fair,
    .put_prev_task      = put_prev_task_fair,
    .set_next_task      = set_next_task_fair,

#ifdef CONFIG_SMP
    .balance            = balance_fair,
    .pick_task          = pick_task_fair,
    .select_task_rq     = select_task_rq_fair,
    .migrate_task_rq    = migrate_task_rq_fair,

    .rq_online          = rq_online_fair,
    .rq_offline         = rq_offline_fair,

    .task_dead          = task_dead_fair,
    .set_cpus_allowed   = set_cpus_allowed_common,
#endif

    .task_tick          = task_tick_fair,
    .task_fork          = task_fork_fair,

    .prio_changed       = prio_changed_fair,
    .switched_from      = switched_from_fair,
    .switched_to        = switched_to_fair,

    .get_rr_interval    = get_rr_interval_fair,

    .update_curr        = update_curr_fair, /* updates vruntime; the main accounting points are enqueue, dequeue, the periodic tick, etc. */

#ifdef CONFIG_FAIR_GROUP_SCHED
    .task_change_group  = task_change_group_fair,
#endif

#ifdef CONFIG_UCLAMP_TASK
    .uclamp_enabled     = 1,
#endif
};
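Incidentally, the priority order that pick_next_task() walks is not stored in the struct itself: DEFINE_SCHED_CLASS places each class instance in its own linker section, and the linker script lays those sections out from highest to lowest priority so that for_each_class() can iterate simply by address. Roughly, the 6.1 macro in kernel/sched/sched.h looks like this:
#define DEFINE_SCHED_CLASS(name) \
const struct sched_class name##_sched_class \
    __aligned(__alignof__(struct sched_class)) \
    __section("__" #name "_sched_class")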
Analysis of put_prev_task()
put_prev_task() puts the task that is currently running on the CPU back into the runqueue.
< kernel/sched/sched.h >
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
    WARN_ON_ONCE(rq->curr != prev);
    prev->sched_class->put_prev_task(rq, prev);
}
So the CFS call path is:
put_prev_task -> put_prev_task_fair()
< kernel/sched/fair.c >
/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
    struct sched_entity *se = &prev->se;
    struct cfs_rq *cfs_rq;

    /* with group scheduling, entities form a parent/child hierarchy: for_each_sched_entity starts from this entity and walks up through its parents, updating each level; without group scheduling it runs exactly once */
    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        put_prev_entity(cfs_rq, se);
    }
}
With group scheduling, scheduling entities form a parent/child hierarchy: for_each_sched_entity() starts from the given entity and walks up through its parents, updating each level; without group scheduling it runs exactly once.
for_each_sched_entity(se)
< kernel/sched/fair.c >
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
        for (; se; se = se->parent)
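For reference, without CONFIG_FAIR_GROUP_SCHED the same macro in kernel/sched/fair.c degenerates into a single iteration, since there is no hierarchy to walk:
#define for_each_sched_entity(se) \
        for (; se; se = NULL)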
cfs_rq_of() returns, via the se, the cfs_rq the entity is (or will be) queued on:
< kernel/sched/sched.h >
/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
    return se->cfs_rq;
}
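The version shown above is the CONFIG_FAIR_GROUP_SCHED one, where each entity records the cfs_rq it hangs off. Without group scheduling there is only the per-cpu rq->cfs, so the helper looks roughly like this (!CONFIG_FAIR_GROUP_SCHED variant):
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
    struct task_struct *p = task_of(se);
    struct rq *rq = task_rq(p);

    return &rq->cfs;
}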
put_prev_entity() puts the prev task back into the queue.
One point worth noting: prev->on_rq == 1 means the entity is still marked as being on the runqueue, yet the code still enqueues it again in that case.
The reason is that the earlier removal from the rbtree happened only because the task was picked to run, and it is now being preempted while still runnable; a task that goes to sleep normally is dequeued the regular way, which sets prev->on_rq = 0.
After updating the virtual runtime and other statistics, put_prev_entity() finally calls __enqueue_entity() to put the prev task (i.e. the current task) back onto the ready queue rq.
So when the current task is preempted, the first thing that happens is that it is put back onto the ready queue.
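The two cases are decided in __schedule(): only a task that really goes to sleep is deactivated (clearing on_rq); a preempted task keeps on_rq == 1 and is merely re-inserted into the rbtree by put_prev_entity(). A heavily trimmed sketch of the relevant branch, assuming the 6.1 shape of __schedule() (signal handling and other details omitted):
    if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
        /* prev is voluntarily giving up the CPU (sleep/exit):
         * dequeue it, which clears prev->on_rq */
        deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
        /* ... */
    }
    /* on preemption this branch is skipped: prev->on_rq stays 1 and
     * put_prev_task() -> put_prev_entity() puts prev back into the rbtree */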
Analysis of put_prev_entity()
< kernel/sched/fair.c >
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
    /*
     * If still on the runqueue then deactivate_task()
     * was not called and update_curr() has to be done:
     */
    if (prev->on_rq) /* the se is still on_rq yet gets enqueued again below: it was only taken out of the rbtree because it was picked to run, and now it is being preempted */
        update_curr(cfs_rq); /* update vruntime etc. */

    /* throttle cfs_rqs exceeding runtime */
    check_cfs_rq_runtime(cfs_rq);

    check_spread(cfs_rq, prev);

    if (prev->on_rq) {
        update_stats_wait_start_fair(cfs_rq, prev);
        /* Put 'current' back into the tree. */
        __enqueue_entity(cfs_rq, prev); /* put the preempted prev back onto the cfs_rq rbtree (a prev that went to sleep has on_rq == 0 and is not put back); after this, its on_rq flag once again matches the fact that it really is on the cfs_rq */
        /* in !on_rq case, update occurred at dequeue */
        update_load_avg(cfs_rq, prev, 0);
    }

    cfs_rq->curr = NULL; /* clear cfs_rq->curr */
}
update_curr()
This is one of the core functions of CFS: it updates a task's virtual runtime (vruntime). The most important aspect of this function is when it gets called.
update_curr() is reached from the following entry points:
task_fork_fair()
pick_next_task_fair()
pick_task_fair()
yield_task_fair()
check_preempt_wakeup()
entity_tick()
put_prev_entity()
dequeue_entity()
enqueue_entity()
reweight_entity()
update_curr_fair()
All of the functions above end up calling update_curr().
The point of having so many entry points is simple: never miss a moment at which the current task's vruntime should be accounted. The entry points above cover every event that can happen to the currently running task.
So as long as accounting happens at every point where the current task changes state, on entry and on exit, nothing is lost.
And of course every clock tick must be accounted as well.
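For example, the periodic-tick path reaches update_curr() through entity_tick(). A trimmed sketch of the 6.1 function (load-tracking and hrtick details omitted):
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
    /* account the time the current entity has run since exec_start */
    update_curr(cfs_rq);

    /* load-average and group-shares updates omitted */

    if (cfs_rq->nr_running > 1)
        check_preempt_tick(cfs_rq, curr); /* preempt curr if it has used up its ideal slice */
}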
The code of update_curr() is as follows:
< kernel/sched/fair.c >
/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    delta_exec = now - curr->exec_start; /* real (wall-clock) time the current se has run since exec_start */
    if (unlikely((s64)delta_exec <= 0))
        return;

    curr->exec_start = now; /* refresh the start-of-execution timestamp */

    if (schedstat_enabled()) {
        struct sched_statistics *stats;

        stats = __schedstats_from_se(curr);
        __schedstat_set(stats->exec_max,
                max(delta_exec, stats->exec_max));
    }

    curr->sum_exec_runtime += delta_exec; /* total real time this task has run so far */
    schedstat_add(cfs_rq->exec_clock, delta_exec); /* accumulate the real execution time of the cfs_rq in cfs_rq->exec_clock */

    curr->vruntime += calc_delta_fair(delta_exec, curr); /* convert the real-time delta into a weighted vruntime delta and accumulate it */
    update_min_vruntime(cfs_rq);

    if (entity_is_task(curr)) {
        struct task_struct *curtask = task_of(curr);

        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        cgroup_account_cputime(curtask, delta_exec); /* charge the cpu time to the task's cgroup cpuacct: ca->cpuusage[cpu]->cpuusage */
        account_group_exec_runtime(curtask, delta_exec); /* account the runtime to the task's thread group: tsk->signal->cputimer.cputime_atomic.sum_exec_runtime */
    }

    account_cfs_rq_runtime(cfs_rq, delta_exec); /* charge the runtime against the cfs_bandwidth quota: cfs_rq->runtime_remaining */
}
curr->exec_start is refreshed from rq_clock_task(rq_of(cfs_rq)); this is real (task-clock) time.
update_curr() can only account the runqueue's currently running task: a task that is not running accrues no real runtime and therefore no virtual runtime either.
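The conversion from real time to virtual time is done by calc_delta_fair(): at NICE_0_LOAD weight, vruntime advances exactly as fast as real time; heavier weights make it advance more slowly, lighter weights more quickly. The 6.1 helper is roughly the following, with __calc_delta() computing delta * NICE_0_LOAD / se->load.weight in fixed-point arithmetic:
/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}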
static inline u64 rq_clock_task(struct rq *rq)
{
    lockdep_assert_rq_held(rq);
    assert_clock_updated(rq);

    return rq->clock_task;
}
rq->clock_task is advanced on every clock tick; it is the clock used when computing a task's vruntime.
Update path:
tick_sched_handle() -> update_process_times() -> scheduler_tick() -> update_rq_clock() -> update_rq_clock_task()
Both tick_nohz_handler() and tick_sched_timer() end up in tick_sched_handle().
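update_rq_clock() reads the per-cpu scheduler clock, advances rq->clock by the elapsed delta, and passes that same delta on to update_rq_clock_task(). A trimmed sketch, assuming the 6.1 layout (skip/debug handling omitted):
void update_rq_clock(struct rq *rq)
{
    s64 delta;

    lockdep_assert_rq_held(rq);

    /* time elapsed since this runqueue's clock was last updated */
    delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
    if (delta < 0)
        return;

    rq->clock += delta;
    update_rq_clock_task(rq, delta); /* subtracts irq/steal time, then advances rq->clock_task */
}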
Now let's look at update_rq_clock_task():
< kernel/sched/core.c >
/*
 * RQ-clock updating methods:
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
    /*
     * In theory, the compile should just see 0 here, and optimize out the call
     * to sched_rt_avg_update. But I don't trust it...
     */
    s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
    irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

    /*
     * Since irq_time is only updated on {soft,}irq_exit, we might run into
     * this case when a previous update_rq_clock() happened inside a
     * {soft,}irq region.
     *
     * When this happens, we stop ->clock_task and only update the
     * prev_irq_time stamp to account for the part that fit, so that a next
     * update will consume the rest. This ensures ->clock_task is
     * monotonic.
     *
     * It does however cause some slight miss-attribution of {soft,}irq
     * time, a more accurate solution would be to update the irq_time using
     * the current rq->clock timestamp, except that would require using
     * atomic ops.
     */
    if (irq_delta > delta)
        irq_delta = delta;

    rq->prev_irq_time += irq_delta;
    delta -= irq_delta;
    psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
    if (static_key_false((&paravirt_steal_rq_enabled))) {
        steal = paravirt_steal_clock(cpu_of(rq));
        steal -= rq->prev_steal_time_rq;

        if (unlikely(steal > delta))
            steal = delta;

        rq->prev_steal_time_rq += steal;
        delta -= steal;
    }
#endif

    rq->clock_task += delta; /* rq->clock_task is advanced here */

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
    if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
        update_irq_load_avg(rq, irq_delta + steal);
#endif
    update_rq_clock_pelt(rq, delta);
}
Continuing the discussion above:
update_curr_fair() updates vruntime; it simply forwards to update_curr(), whose accounting points (enqueue, dequeue, the periodic tick, etc.) were discussed above:
< kernel/sched/fair.c >
static void update_curr_fair(struct rq *rq)
{
    update_curr(cfs_rq_of(&rq->curr->se));
}
set_next_task_fair()
Sets up task p as the current running task of its cfs_rq (called, for example, when a task changes scheduling policy or group):
< kernel/sched/fair.c >
/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
    struct sched_entity *se = &p->se;

#ifdef CONFIG_SMP
    if (task_on_rq_queued(p)) {
        /*
         * Move the next running task to the front of the list, so our
         * cfs_tasks list becomes MRU one.
         */
        list_move(&se->group_node, &rq->cfs_tasks);
    }
#endif

    for_each_sched_entity(se) {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);

        set_next_entity(cfs_rq, se); /* make se the current entity of this cfs_rq (it is taken out of the rbtree) */
        /* ensure bandwidth has been allocated on our new cfs_rq */
        account_cfs_rq_runtime(cfs_rq, 0);
    }
}
set_next_entity(): make the given se the entity that is currently running on the cfs_rq
< kernel/sched/fair.c >
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    clear_buddies(cfs_rq, se);

    /* 'current' is not kept within the tree. */
    if (se->on_rq) {
        /*
         * Any task has to be enqueued before it get to execute on
         * a CPU. So account for the time it spent waiting on the
         * runqueue.
         */
        update_stats_wait_end_fair(cfs_rq, se);
        __dequeue_entity(cfs_rq, se); /* remove the chosen se from the rbtree; note that although it is dequeued from the cfs_rq here, its se->on_rq is not cleared */
        update_load_avg(cfs_rq, se, UPDATE_TG); /* update load tracking */
    }

    update_stats_curr_start(cfs_rq, se);
    cfs_rq->curr = se; /* record se as the entity currently running on this cfs_rq */

    /*
     * Track our maximum slice length, if the CPU's load is at
     * least twice that of our own weight (i.e. dont track it
     * when there are only lesser-weight tasks around):
     */
    if (schedstat_enabled() &&
        rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
        struct sched_statistics *stats;

        stats = __schedstats_from_se(se);
        __schedstat_set(stats->slice_max,
                max((u64)stats->slice_max,
                    se->sum_exec_runtime - se->prev_sum_exec_runtime));
    }

    se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
Note: the task that has been picked and is currently executing is definitely no longer on the cfs_rq rbtree; it is only that its se->on_rq still reads as true.
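The rbtree insert/remove helpers used here and in put_prev_entity() are thin wrappers around the cached-rbtree API, ordered by vruntime. In 6.1 they look roughly like this:
/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}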
Analysis of pick_next_entity()
< kernel/sched/fair.c >
/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    struct sched_entity *left = __pick_first_entity(cfs_rq);
    struct sched_entity *se;

    /*
     * If curr is set we have to see if its left of the leftmost entity
     * still in the tree, provided there was anything in the tree at all.
     */
    if (!left || (curr && entity_before(curr, left)))
        left = curr;

    se = left; /* ideally we run the leftmost entity */

    /*
     * Avoid running the skip buddy, if running something else can
     * be done without getting too unfair.
     */
    if (cfs_rq->skip && cfs_rq->skip == se) {
        struct sched_entity *second;

        if (se == curr) {
            second = __pick_first_entity(cfs_rq);
        } else {
            second = __pick_next_entity(se);
            if (!second || (curr && entity_before(curr, second)))
                second = curr;
        }

        if (second && wakeup_preempt_entity(second, left) < 1)
            se = second;
    }

    if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
        /*
         * Someone really wants this to run. If it's not unfair, run it.
         */
        se = cfs_rq->next;
    } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
        /*
         * Prefer last buddy, try to return the CPU to a preempted task.
         */
        se = cfs_rq->last;
    }

    return se;
}
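__pick_first_entity() used above simply returns the leftmost (cached) node of the rbtree, i.e. the entity with the smallest vruntime; roughly:
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    if (!left)
        return NULL;

    return __node_2_se(left);
}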