CFS之pick_next_task_fair()方法分析

OS Developer

已于 2023-07-25 15:24:19 修改

阅读量491

点赞数

分类专栏：进程调度 CFS 文章标签： linux

于 2023-07-25 15:18:28 首次发布

本文链接：https://blog.youkuaiyun.com/lizhijun_buaa/article/details/131918518

版权

进程调度同时被 2 个专栏收录

11 篇文章

订阅专栏

CFS

6 篇文章

订阅专栏

本代码分析来自内核6.1.31，转载请注明出处

总体调用流程

schedule() -> __schedule() -> pick_next_task() -> __pick_next_task()，之后分两种情况：快速路径下直接调用 pick_next_task_fair(rq, prev, rf)；否则按优先级遍历各调度类，执行 p = class->pick_next_task(rq)（对CFS而言即 __pick_next_task_fair()）

调度器schedule函数在进程调度抢占时, 会通过__schedule函数调用全局pick_next_task选择一个最优的进程

pick_next_task_fair()分析

pick_next_task按照优先级依次调用不同调度器类提供的pick_next_task方法，我们这里只看cfs的pick_next_task_fair()

< kernel/sched/fair.c >

/*
 * Pick the next fair-class task to run on @rq.
 *
 * This is a trimmed copy: the group-scheduling branch has been removed.
 *
 * @rq:   runqueue to pick from
 * @prev: task that was running; if non-NULL it is handed back through
 *        put_prev_task() before the pick
 * @rf:   rq flags passed on to newidle_balance(); if NULL, no balancing is
 *        attempted when nothing is runnable
 *
 * Returns the chosen task; RETRY_TASK when newidle_balance() returns a
 * negative value; NULL when nothing is runnable and either @rf is NULL or
 * newidle_balance() returned 0.
 */
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;
	struct task_struct *p;
	int new_tasks;

again:
	if (!sched_fair_runnable(rq))             /* nothing runnable in rq->cfs (per the article: rq->cfs.nr_running is 0), so take the idle path below */
		goto idle;

/* The group-scheduling code is left out here; it is not the focus of this analysis. */

simple:

	if (prev)
		put_prev_task(rq, prev);              /* put the current task back at its proper place in the runqueue */

	do {
        /* pick the next scheduling entity and make it current on its cfs_rq */
		se = pick_next_entity(cfs_rq, NULL);
		set_next_entity(cfs_rq, se);
		/* keep descending while the picked entity has its own cfs_rq */
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);

done: __maybe_unused;
#ifdef CONFIG_SMP
	/*
	 * Move the next running task to the front of
	 * the list, so our cfs_tasks list becomes MRU
	 * one.
	 */
	list_move(&p->se.group_node, &rq->cfs_tasks);
#endif

	if (hrtick_enabled_fair(rq))
		hrtick_start_fair(rq, p);

	update_misfit_status(p, rq);

	return p;

idle:
	/* without rq flags no balancing is attempted */
	if (!rf)
		return NULL;

	new_tasks = newidle_balance(rq, rf);

	/*
	 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
	 * possible for any higher priority task to appear. In that case we
	 * must re-start the pick_next_entity() loop.
	 */
	if (new_tasks < 0)
		return RETRY_TASK;

	/* balancing produced runnable fair tasks: redo the pick from the top */
	if (new_tasks > 0)
		goto again;

	/*
	 * rq is about to be idle, check if we need to update the
	 * lost_idle_time of clock_pelt
	 */
	update_idle_rq_clock_pelt(rq);

	return NULL;
}

总体结构分析

关键节点	功能描述
again:	循环执行本函数挑选过程
simple:	如果没有定义组调度，就执行本节点流程，将当前进程重新加入队列，挑选下一个进程
idle:	如果队列中可运行进程为0，先调用newidle_balance()尝试获取新任务：返回值大于0则跳回again重新挑选，小于0(出现了更高优先级的任务)则返回RETRY_TASK，等于0则返回NULL，最终由idle进程运行

由于CFS调度的时候, prev进程不一定是一个CFS调度的进程, 因此必须调用全局的put_prev_task来调用prev进程所属调度器类sched_class的对应put_prev_task方法, 完成将进程放回到就绪队列中

CFS调度类定义的操作方法如下

< kernel/sched/fair.c >

*
 * All the scheduling class methods:
 */
DEFINE_SCHED_CLASS(fair) = {

	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= __pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,
	.set_next_task          = set_next_task_fair,

#ifdef CONFIG_SMP
	.balance		= balance_fair,
	.pick_task		= pick_task_fair,
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_dead		= task_dead_fair,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif

	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,                /* 用于更新vruntime,调度时机主要是入队，出队，周期时间等   */

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_change_group	= task_change_group_fair,
#endif

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};

put_prev_task()分析

put_prev_task是把正在CPU上运行的进程放回到运行队列中去

< kernel/sched/sched.h >

/*
 * Hand @prev back to its own scheduling class by calling that class's
 * put_prev_task hook. @prev need not be a fair-class task; the dispatch goes
 * through prev->sched_class. Warns once if @prev is not rq->curr.
 */
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
	WARN_ON_ONCE(rq->curr != prev);
	prev->sched_class->put_prev_task(rq, prev);
}

所以CFS调用路径

put_prev_task -> put_prev_task_fair()

< kernel/sched/fair.c >

/*
 * Account for a descheduled task:
 *
 * Calls put_prev_entity() for @prev's scheduling entity and for every
 * ancestor entity reached through for_each_sched_entity().
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	/*
	 * With group scheduling, entities form a parent/child hierarchy:
	 * for_each_sched_entity() starts at this entity and walks up through
	 * its parents, updating each level. Without group scheduling the body
	 * runs exactly once.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}

在组策略情况下, 调度实体之间存在父子的层次, for_each_sched_entity会从当前调度实体开始,　然后循环向其父调度实体进行更新, 非组调度情况下则只执行一次

for_each_sched_entity(se)

< kernel/sched/fair.c >

/*
 * Walk up scheduling entities hierarchy
 *
 * Visits se, se->parent, ... until a NULL parent is reached. The caller's
 * own 'se' variable is the loop cursor, so it is modified and is NULL once
 * the loop runs to completion.
 */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

cfs_rq_of()就是通过se获取cfs_rq

< kernel/sched/sched.h >

/*
 * runqueue on which this entity is (to be) queued
 *
 * Simply returns the entity's se->cfs_rq back-pointer.
 */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

put_prev_entity()将prev任务放回队列

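这里要注意一点：prev->on_rq==1 只表示该调度实体仍处于可运行状态，并不表示它此刻挂在红黑树上。正在运行的实体在 set_next_entity() 中已经被 __dequeue_entity() 从红黑树上摘下，但 on_rq 并没有清0，所以 put_prev_entity() 在 on_rq 为1时仍然要执行入队操作。

这种情况对应任务被抢占；如果任务是正常sleep主动放弃CPU，则之前已经完成出队并将 prev->on_rq 置0，这里就不会再把它放回去。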

put_prev_entity()在更新了虚拟运行时间等信息后, 最终通过__enqueue_entity函数将prev进程(即current进程)放回就绪队列rq上

所以说，当前进程如果被抢占时，首先要放回就绪队列

put_prev_entity()分析

< kernel/sched/fair.c >

/*
 * Stop treating @prev as the running entity of @cfs_rq.
 *
 * If @prev is still on_rq, its runtime accounting is refreshed and it is
 * re-inserted with __enqueue_entity(); in either case cfs_rq->curr is
 * cleared afterwards.
 */
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)                           /* prev is still runnable (per the article: it was preempted, not put to sleep) */
		update_curr(cfs_rq);              /* bring vruntime and related accounting up to date */

	/* throttle cfs_rqs exceeding runtime */
	check_cfs_rq_runtime(cfs_rq);

	check_spread(cfs_rq, prev);

	if (prev->on_rq) {
		update_stats_wait_start_fair(cfs_rq, prev);
		/*
		 * Put 'current' back into the tree.
		 *
		 * Only a prev that is still on_rq is re-inserted; per the
		 * article, a task that gave up the CPU by sleeping has
		 * on_rq == 0 and is not put back. After this call the on_rq
		 * flag and the entity's actual presence in the cfs_rq agree
		 * again.
		 */
		__enqueue_entity(cfs_rq, prev);
		/* in !on_rq case, update occurred at dequeue */
		update_load_avg(cfs_rq, prev, 0);
	}
	cfs_rq->curr = NULL;                           /* this cfs_rq no longer has a running entity */
}

update_curr()

这个函数是CFS中的核心函数，用于更新进程的虚拟时间vruntime,这个函数最关键的是他的调用时机，

update_curr()入口函数有如下

task_fork_fair()

pick_next_task_fair()

pick_task_fair()

yield_task_fair()

check_preempt_wakeup()

entity_tick()

put_prev_entity()

dequeue_entity()

enqueue_entity()

reweight_entity()

update_curr_fair()

上面这些函数都会调用到update_curr()

这么多入口，目的只有一个：不遗漏当前进程vruntime的任何统计时间点。上面这些入口涵盖了当前运行进程状态可能发生变化的所有时机

所以，只要当前进程发生变化的入口和出口都进行了统计，就不会遗漏。

当然，每个时钟tick也不能遗漏

update_curr()具体代码如下

< kernel/sched/fair.c >

*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;           /* 计算curr se的实际执行时间,物理时间差值 */
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;                    /*  更新当前时间  */

	if (schedstat_enabled()) {
		struct sched_statistics *stats;

		stats = __schedstats_from_se(curr);
		__schedstat_set(stats->exec_max,
				max(delta_exec, stats->exec_max));
	}

	curr->sum_exec_runtime += delta_exec;                          /* 当前进程累计运行时间，也是物理时间 */
	schedstat_add(cfs_rq->exec_clock, delta_exec);             /* 更新cfs_rq的实际执行时间cfs_rq->exec_clock */

	curr->vruntime += calc_delta_fair(delta_exec, curr);                /* 根据物理时间的差值，计算累计vruntime */
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);                                  /* 更新task所在cgroup之cpuacct的某个cpu运行时间ca->cpuusage[cpu]->cpuusage */
		account_group_exec_runtime(curtask, delta_exec);                         /* 统计task所在线程组(thread group)的运行时间,tsk->signal->cputimer.cputime_atomic.sum_exec_runtime */
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);                          /* 计算cfs_rq的运行时间，是否超过cfs_bandwidth的限制,cfs_rq->runtime_remaining */
}

curr->exec_start: 通过rq_clock_task(rq_of(cfs_rq))值进行的更新，这个是物理时间

update_curr只能更新运行队列的当前进程，如果进程不在运行，没有实际运行时间就没有对应的虚拟运行时间

/*
 * Return rq->clock_task, the clock update_curr() uses to measure how long
 * the current entity has run. The two assertions run before the read.
 */
static inline u64 rq_clock_task(struct rq *rq)
{
	lockdep_assert_rq_held(rq);
	assert_clock_updated(rq);

	return rq->clock_task;
}

rq->clock_task：每次时钟节拍到来时会更新这个时钟,计算进程vruntime时使用这个时钟

更新路径

tick_sched_handle() -> update_process_times() -> scheduler_tick() -> sched_core_tick -> __sched_core_tick() -> update_rq_clock() -> update_rq_clock_task()

tick_nohz_handler()和tick_sched_timer()都会进入tick_sched_handle()

看看update_rq_clock_task()

< kernel/sched/core.c >

/*
 * RQ-clock updating methods:
 */

/*
 * Advance rq->clock_task by @delta, minus any irq time and paravirt steal
 * time that the enabled config options attribute to this interval, then
 * forward the remaining delta to update_rq_clock_pelt().
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compile should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
	psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		/* never subtract more steal time than what is left of delta */
		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;               /* this is where rq->clock_task is advanced */

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}

接着我们上面的讨论

update_curr_fair用于更新vruntime,调用时机主要是入队，出队，周期时间等

< kernel/sched/fair.c >

/*
 * update_curr hook of the fair class: runs update_curr() on the cfs_rq that
 * holds the scheduling entity of rq->curr.
 */
static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}

set_next_task_fair()

将指定的任务p设置为其cfs_rq上当前运行的任务(它本身并不做挑选，挑选由pick_next_entity()完成)

< kernel/sched/fair.c >

/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 *
 * It does not choose a task: @p is given by the caller, and every entity
 * from @p's own up through its parents is installed via set_next_entity().
 */
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_entity *se = &p->se;

#ifdef CONFIG_SMP
	if (task_on_rq_queued(p)) {
		/*
		 * Move the next running task to the front of the list, so our
		 * cfs_tasks list becomes MRU one.
		 */
		list_move(&se->group_node, &rq->cfs_tasks);
	}
#endif

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);                               /* make se the current entity of this cfs_rq */
		/* ensure bandwidth has been allocated on our new cfs_rq */
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}

set_next_entity()：把已选中的调度实体se从红黑树上摘下，并设置为cfs_rq->curr

< kernel/sched/fair.c >

/*
 * Install @se as the running entity of @cfs_rq.
 *
 * If @se is on_rq it is taken out of the tree with __dequeue_entity(), but
 * se->on_rq itself is left untouched. cfs_rq->curr is then pointed at @se
 * and prev_sum_exec_runtime is snapshotted from sum_exec_runtime.
 */
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	clear_buddies(cfs_rq, se);

	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it get to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end_fair(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);                             /* take the chosen se out of the tree; note that se->on_rq is NOT cleared here */
		update_load_avg(cfs_rq, se, UPDATE_TG);         /* update the load tracking information */
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;                                                   /* se is now the entity running on this cfs_rq */

	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. dont track it
	 * when there are only lesser-weight tasks around):
	 */
	if (schedstat_enabled() &&
	    rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
		struct sched_statistics *stats;

		stats = __schedstats_from_se(se);
		__schedstat_set(stats->slice_max,
				max((u64)stats->slice_max,
				    se->sum_exec_runtime - se->prev_sum_exec_runtime));
	}

	/* snapshot of the total runtime taken as this entity is installed */
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

注意：被选出来正在执行的任务，一定已经不在 cfs_rq 的红黑树上了，只是其 se->on_rq 仍然为真

pick_next_entity()分析

< kernel/sched/fair.c >

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 *
 * @cfs_rq: runqueue to pick from
 * @curr:   currently running entity, or NULL; it is only considered when
 *          non-NULL
 *
 * The starting candidate is the leftmost entity in the tree, or @curr when
 * the tree is empty or @curr sorts before the leftmost. The skip, last and
 * next buddies may then replace that candidate, each only when
 * wakeup_preempt_entity() against the leftmost candidate returns < 1.
 * Returns the entity selected.
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	struct sched_entity *left = __pick_first_entity(cfs_rq);
	struct sched_entity *se;

	/*
	 * If curr is set we have to see if its left of the leftmost entity
	 * still in the tree, provided there was anything in the tree at all.
	 */
	if (!left || (curr && entity_before(curr, left)))
		left = curr;

	se = left; /* ideally we run the leftmost entity */

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip && cfs_rq->skip == se) {
		struct sched_entity *second;

		if (se == curr) {
			/* curr is not in the tree, so the runner-up is the tree's first entity */
			second = __pick_first_entity(cfs_rq);
		} else {
			/* runner-up is the entity after se, unless curr sorts before it */
			second = __pick_next_entity(se);
			if (!second || (curr && entity_before(curr, second)))
				second = curr;
		}

		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/* the next buddy is tested first, so it wins over the last buddy */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
		/*
		 * Someone really wants this to run. If it's not unfair, run it.
		 */
		se = cfs_rq->next;
	} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
		/*
		 * Prefer last buddy, try to return the CPU to a preempted task.
		 */
		se = cfs_rq->last;
	}

	return se;
}