Environment:
Kernel version: Linux-5.10
Arch: ARM64
I: Introduction
All the information about CFS (the Completely Fair Scheduler) in the Linux core scheduler is collected in the fair_sched_class instance:
#kernel/linux-5.10/kernel/sched/fair.c
/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class
    __section("__fair_sched_class") = {
    .enqueue_task = enqueue_task_fair,
    .dequeue_task = dequeue_task_fair,
    .yield_task = yield_task_fair,
    .yield_to_task = yield_to_task_fair,
    .check_preempt_curr = check_preempt_wakeup,
    .pick_next_task = __pick_next_task_fair,
    .put_prev_task = put_prev_task_fair,
    .set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
    .balance = balance_fair,
    .select_task_rq = select_task_rq_fair,
    .migrate_task_rq = migrate_task_rq_fair,
    .rq_online = rq_online_fair,
    .rq_offline = rq_offline_fair,
    .task_dead = task_dead_fair,
    .set_cpus_allowed = set_cpus_allowed_common,
#endif
    .task_tick = task_tick_fair,
    .task_fork = task_fork_fair,
    .prio_changed = prio_changed_fair,
    .switched_from = switched_from_fair,
    .switched_to = switched_to_fair,
    .get_rr_interval = get_rr_interval_fair,
    .update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
    .task_change_group = task_change_group_fair,
#endif
#ifdef CONFIG_UCLAMP_TASK
    .uclamp_enabled = 1,
#endif
};
II: Data structures
As mentioned in the previous article, every run queue rq of the main scheduler embeds a cfs_rq instance:
/* CFS-related fields in a runqueue */
struct cfs_rq {
    struct load_weight load;
    unsigned int nr_running;
    unsigned int h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
    unsigned int idle_h_nr_running; /* SCHED_IDLE */
    u64 exec_clock;
    u64 min_vruntime;
    struct rb_root_cached tasks_timeline;
    /*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
    struct sched_entity *curr;
    struct sched_entity *next;
    struct sched_entity *last;
    struct sched_entity *skip;
    ...
};
load: the accumulated load weight of the queue
nr_running: the number of runnable processes on the queue
h_nr_running: the number of processes using the SCHED_NORMAL, SCHED_BATCH, and SCHED_IDLE scheduling policies
idle_h_nr_running: the number of processes using the SCHED_IDLE scheduling policy
exec_clock: tracks the total scheduling clock time
min_vruntime: tracks the minimum virtual runtime of all processes on the queue. This value is the basis of the per-run-queue virtual clock; it may actually be larger than the vruntime of the leftmost tree node.
tasks_timeline: manages all processes in a red-black tree sorted by virtual runtime; tasks_timeline.rb_leftmost always points to the leftmost node of the tree, i.e. the process most in need of being scheduled.
curr, next, last, skip: point to the scheduling entities of particular processes
III: CFS operations
1. The virtual clock
The completely fair scheduling algorithm depends on a virtual clock that measures how much CPU time a waiting process would have received on a completely fair system. However, no data structure contains this virtual clock anywhere: all the necessary information can be derived from the existing real clock and the load weight associated with each process. All virtual-clock-related calculations are performed in update_curr, which is called from many places in the kernel, including the periodic scheduler.
/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    delta_exec = now - curr->exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;

    curr->exec_start = now;

    schedstat_set(curr->statistics.exec_max,
                  max(delta_exec, curr->statistics.exec_max));

    curr->sum_exec_runtime += delta_exec;
    schedstat_add(cfs_rq->exec_clock, delta_exec);

    curr->vruntime += calc_delta_fair(delta_exec, curr);
    update_min_vruntime(cfs_rq);

    if (entity_is_task(curr)) {
        struct task_struct *curtask = task_of(curr);

        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        cgroup_account_cputime(curtask, delta_exec);
        account_group_exec_runtime(curtask, delta_exec);
    }

    account_cfs_rq_runtime(cfs_rq, delta_exec);
}
(1) Determine the run queue's currently executing process curr.
(2) Obtain the real clock value now of the main scheduler's run queue; it is refreshed whenever the run queue clock is updated.
(3) Compute delta_exec, the difference between now and the time the runtime statistics were last updated.
(4) Update exec_start, the start of the current accounting interval of the process.
(5) Update the physical time sum_exec_runtime and the virtual time vruntime spent by the current process executing on the CPU.
The physical time is updated simply by adding the newly elapsed interval to the previously accumulated value. For the virtual time, a process at nice level 0 has, by definition, virtual time equal to physical time; for all other priorities the time must be rescaled according to the process's load weight via __calc_delta:
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}
Computing the virtual time from the load weight:
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
    u64 fact = scale_load_down(weight);
    int shift = WMULT_SHIFT;

    __update_inv_weight(lw);

    if (unlikely(fact >> 32)) {
        while (fact >> 32) {
            fact >>= 1;
            shift--;
        }
    }

    fact = mul_u32_u32(fact, lw->inv_weight);

    while (fact >> 32) {
        fact >>= 1;
        shift--;
    }

    return mul_u64_u32_shr(delta_exec, fact, shift);
}
Ignoring rounding and overflow handling, the computation above implements:

    delta_exec * weight / lw->weight
      == (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
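To make the rescaling concrete, here is a minimal userspace sketch (not kernel code) that evaluates the ideal formula directly; the weights are taken from the kernel's sched_prio_to_weight table, while the 6 ms sample interval and the helper name vruntime_delta are invented for the example:

/* Illustrative userspace sketch: it evaluates the ideal formula
 * vruntime_delta = delta_exec * NICE_0_LOAD / weight, which
 * __calc_delta() approximates with the inv_weight fixed-point trick.
 * Weights are from the kernel's sched_prio_to_weight table. */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL

static uint64_t vruntime_delta(uint64_t delta_exec_ns, uint64_t weight)
{
    /* exact formula; the kernel replaces the division by a
     * multiplication with the pre-computed inverse inv_weight */
    return delta_exec_ns * NICE_0_LOAD / weight;
}

int main(void)
{
    uint64_t delta = 6000000;   /* 6 ms of real execution time */

    printf("nice  0 (weight 1024): vruntime += %llu ns\n",
           (unsigned long long)vruntime_delta(delta, 1024));
    printf("nice  5 (weight  335): vruntime += %llu ns\n",
           (unsigned long long)vruntime_delta(delta, 335));
    printf("nice -5 (weight 3121): vruntime += %llu ns\n",
           (unsigned long long)vruntime_delta(delta, 3121));
    return 0;
}

A heavier (lower-nice) task accumulates vruntime more slowly than real time, a lighter task more quickly, which is exactly what moves it left or right in the red-black tree.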
(6) Update the run queue's minimum virtual runtime min_vruntime, taking care that the value only ever increases.
Depending on whether the run queue has a current entity curr, whether that entity is still marked runnable (on_rq), and whether the tree has a leftmost node (i.e. whether any process is waiting in the tree), the smallest candidate vruntime is selected. To keep min_vruntime monotonic, max_vruntime then picks the larger of that candidate and the queue's existing min_vruntime: the queue's min_vruntime is only advanced once it is overtaken by the vruntime of some entity. With this strategy the kernel ensures that min_vruntime can only increase, never decrease.
#kernel/linux-5.10/kernel/sched/fair.c
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

    u64 vruntime = cfs_rq->min_vruntime;

    if (curr) {
        if (curr->on_rq)
            vruntime = curr->vruntime;
        else
            curr = NULL;
    }

    if (leftmost) { /* non-empty tree */
        struct sched_entity *se;
        se = rb_entry(leftmost, struct sched_entity, run_node);

        if (!curr)
            vruntime = se->vruntime;
        else
            vruntime = min_vruntime(vruntime, se->vruntime);
    }

    /* ensure we never gain time by being placed backwards. */
    cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}
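The min_vruntime() and max_vruntime() helpers used above compare through a signed 64-bit difference, so the comparison stays correct even if the virtual clock ever wraps around. A minimal userspace sketch of the same idea:

/* Minimal userspace sketch of the kernel's max_vruntime() idea:
 * comparing through a signed difference keeps the result correct
 * even across u64 wraparound of the virtual clock. */
#include <stdio.h>
#include <stdint.h>

static uint64_t max_vruntime(uint64_t max_vr, uint64_t vr)
{
    int64_t delta = (int64_t)(vr - max_vr);

    if (delta > 0)
        max_vr = vr;
    return max_vr;
}

int main(void)
{
    /* near the wraparound point, 5 is logically "later" than
     * UINT64_MAX - 2, and the signed delta gets this right */
    uint64_t a = UINT64_MAX - 2, b = 5;

    printf("max_vruntime = %llu\n", (unsigned long long)max_vruntime(a, b));
    return 0;
}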
The real keys to completely fair scheduling:
- While a process runs, its vruntime steadily increases, so it moves to the right in the red-black tree. Because the vruntime of more important (heavier) processes increases more slowly, they drift to the right more slowly, which gives them a better chance of being scheduled again than less important processes.
- If a process sleeps, its vruntime stays unchanged. Since the queue's min_vruntime keeps increasing (monotonically) in the meantime, the sleeper ends up further to the left in the red-black tree when it wakes up.
2. Latency tracking
The kernel has a built-in notion of a good scheduling latency, i.e. an interval in which every runnable process should run at least once. It is given by sysctl_sched_latency; the (unscaled) default in this kernel version is 6,000,000 ns, or 6 ms, and it can be controlled via /proc/sys/kernel/sched_latency_ns. A second parameter, sched_nr_latency, limits the number of active processes handled within one latency period; its default is 8. If the number of active processes exceeds this limit, the latency period is extended linearly in proportion. sched_nr_latency is controlled indirectly through sysctl_sched_min_granularity, which defaults to 0.75 ms and is set via /proc/sys/kernel/sched_min_granularity_ns.
Whenever either sysctl_sched_latency or sysctl_sched_min_granularity is changed, sched_nr_latency is recomputed as their quotient.
static unsigned int sched_nr_latency = 8;
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int sysctl_sched_min_granularity = 750000ULL;
__sched_period determines the length of the latency period, which is normally just sysctl_sched_latency; if more tasks are runnable than sched_nr_latency allows, the period grows linearly with the number of tasks:
static u64 __sched_period(unsigned long nr_running)
{
    if (unlikely(nr_running > sched_nr_latency))
        return nr_running * sysctl_sched_min_granularity;
    else
        return sysctl_sched_latency;
}
Current system configuration (at boot these tunables are scaled by a factor of 1 + ilog2(nr_cpus) under the default logarithmic tunable scaling, which is why the values shown here are 24 ms and 3 ms rather than the 6 ms and 0.75 ms defaults):
root@ubuntu:~# sysctl -a | grep -e sched_min -e sched_latency
kernel.sched_latency_ns = 24000000
kernel.sched_min_granularity_ns = 3000000
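Putting the two tunables together, the following userspace sketch of __sched_period (using the values shown above, so sched_nr_latency = 24 ms / 3 ms = 8) shows how the period stretches once more than eight tasks are runnable; the helper name sched_period is local to the example:

/* Userspace sketch of __sched_period() using the tunables shown above
 * (24 ms latency, 3 ms minimum granularity => sched_nr_latency = 8). */
#include <stdio.h>
#include <stdint.h>

static const uint64_t sysctl_sched_latency         = 24000000ULL; /* ns */
static const uint64_t sysctl_sched_min_granularity =  3000000ULL; /* ns */
static const unsigned int sched_nr_latency         = 8;

static uint64_t sched_period(unsigned long nr_running)
{
    if (nr_running > sched_nr_latency)
        return nr_running * sysctl_sched_min_granularity;
    return sysctl_sched_latency;
}

int main(void)
{
    printf("4 tasks  -> period %llu ns\n", (unsigned long long)sched_period(4));  /* 24 ms */
    printf("8 tasks  -> period %llu ns\n", (unsigned long long)sched_period(8));  /* 24 ms */
    printf("12 tasks -> period %llu ns\n", (unsigned long long)sched_period(12)); /* 36 ms */
    return 0;
}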
By taking the relative weight of each process into account, the time of one latency period is distributed among the active processes:
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    unsigned int nr_running = cfs_rq->nr_running;
    u64 slice;

    if (sched_feat(ALT_PERIOD))
        nr_running = rq_of(cfs_rq)->cfs.h_nr_running;

    slice = __sched_period(nr_running + !se->on_rq);

    for_each_sched_entity(se) {
        struct load_weight *load;
        struct load_weight lw;

        cfs_rq = cfs_rq_of(se);
        load = &cfs_rq->load;

        if (unlikely(!se->on_rq)) {
            lw = cfs_rq->load;

            update_load_add(&lw, se->load.weight);
            load = &lw;
        }
        slice = __calc_delta(slice, se->load.weight, load);
    }

    if (sched_feat(BASE_SLICE))
        slice = max(slice, (u64)sysctl_sched_min_granularity);

    return slice;
}
As described earlier, the load weight of a run queue is the sum of the load weights of all active processes on it. Since sched_slice returns a span of real time, sched_vslice converts it into the equivalent virtual time needed by the kernel:
/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
For a process of weight weight, the virtual time corresponding to a real time slice of length time is:

    vtime = time * NICE_0_LOAD / weight
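As a worked example (a userspace sketch under simplifying assumptions, not kernel code): a flat run queue holds one nice-0 task (weight 1024) and one nice-5 task (weight 335) and the period is 24 ms. Each task's wall-time slice is the period scaled by its share of the queue weight, and converting that slice back with vs = s * NICE_0_LOAD / w yields nearly identical virtual-time slices for both tasks, which is the fairness invariant:

/* Userspace sketch of sched_slice()/sched_vslice() for a flat run queue
 * with two tasks: nice 0 (weight 1024) and nice 5 (weight 335).
 * The 24 ms period and the weights come from the text above; everything
 * else is simplified for illustration. */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024ULL

int main(void)
{
    uint64_t period = 24000000;            /* ns, one latency period */
    uint64_t w0 = 1024, w5 = 335;          /* task weights           */
    uint64_t rq_weight = w0 + w5;          /* cfs_rq->load.weight    */

    /* s = p * w / rw : wall-time slice of each task */
    uint64_t slice0 = period * w0 / rq_weight;
    uint64_t slice5 = period * w5 / rq_weight;

    /* vs = s * NICE_0_LOAD / w : the same slice expressed in vruntime */
    uint64_t vslice0 = slice0 * NICE_0_LOAD / w0;
    uint64_t vslice5 = slice5 * NICE_0_LOAD / w5;

    printf("nice 0: slice %llu ns, vslice %llu ns\n",
           (unsigned long long)slice0, (unsigned long long)vslice0);
    printf("nice 5: slice %llu ns, vslice %llu ns\n",
           (unsigned long long)slice5, (unsigned long long)vslice5);
    /* both vslices come out (almost) equal: in virtual time every
     * task gets the same share of the period */
    return 0;
}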
IV: Queue operations
1. enqueue_task_fair
CFS provides two methods for adding and removing members of the run queue: enqueue_task_fair and dequeue_task_fair. Besides pointers to the run queue rq and the task_struct, they take a further argument flags that describes the state of the process being enqueued, for example that it has just been woken up.
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04
#define ENQUEUE_NOCLOCK 0x08
#define ENQUEUE_HEAD 0x10
#define ENQUEUE_REPLENISH 0x20
#ifdef CONFIG_SMP
#define ENQUEUE_MIGRATED 0x40
#else
#define ENQUEUE_MIGRATED 0x00
#endif
#define ENQUEUE_WAKEUP_SYNC 0x80
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
- Walk up the scheduling-entity hierarchy via se->parent using for_each_sched_entity (a simplified sketch of this loop follows below).
- If the current entity se is already on the run queue, stop.
- cfs_rq_of yields the CFS run queue instance cfs_rq that the entity se belongs to.
- enqueue_entity does the actual work of adding the member. There the kernel updates the statistics with update_curr; if the process was sleeping and has just been woken up, its virtual runtime is first adjusted in place_entity. If the process was running until recently, its virtual runtime is still valid and it can be inserted into the red-black tree directly with __enqueue_entity.
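The following toy userspace model only illustrates the loop structure described above, walking up the entity hierarchy and stopping at the first entity that is already enqueued; struct toy_se and its fields are invented, and PELT, bandwidth control and all statistics handling of the real function are omitted:

/* Toy model (userspace, heavily simplified) of the enqueue loop:
 * walk a scheduling entity and its parents, and stop as soon as an
 * entity is already enqueued (on_rq). The struct and enqueue_entity()
 * here are stand-ins, not the kernel's definitions. */
#include <stdio.h>
#include <stddef.h>

struct toy_se {
    const char    *name;
    int            on_rq;
    struct toy_se *parent;   /* group entity one level up, or NULL */
};

static void enqueue_entity(struct toy_se *se)
{
    printf("enqueue %s\n", se->name);
    se->on_rq = 1;
}

static void enqueue_task_fair(struct toy_se *se)
{
    for (; se; se = se->parent) {   /* for_each_sched_entity() */
        if (se->on_rq)              /* ancestor already enqueued: done */
            break;
        enqueue_entity(se);
    }
}

int main(void)
{
    struct toy_se group = { "group_entity", 0, NULL };
    struct toy_se task  = { "task_entity",  0, &group };

    enqueue_task_fair(&task);       /* enqueues the task, then its group */
    enqueue_task_fair(&task);       /* second call: nothing left to do   */
    return 0;
}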
2. enqueue_entity
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    bool curr = cfs_rq->curr == se;

    /*
     * If we're the current task, we must renormalise before calling
     * update_curr().
     */
    if (renorm && curr)
        se->vruntime += cfs_rq->min_vruntime;

    update_curr(cfs_rq);

    /*
     * Otherwise, renormalise after, such that we're placed at the current
     * moment in time, instead of some random moment in the past. Being
     * placed in the past could significantly boost this task to the
     * fairness detriment of existing tasks.
     */
    if (renorm && !curr)
        se->vruntime += cfs_rq->min_vruntime;

    /*
     * When enqueuing a sched_entity, we must:
     *   - Update loads to have both entity and cfs_rq synced with now.
     *   - Add its load to cfs_rq->runnable_avg
     *   - For group_entity, update its weight to reflect the new share of
     *     its group cfs_rq
     *   - Add its new weight to cfs_rq->load.weight
     */
    update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
    se_update_runnable(se);
    update_cfs_group(se);
    account_entity_enqueue(cfs_rq, se);

    if (flags & ENQUEUE_WAKEUP)
        place_entity(cfs_rq, se, 0);

    check_schedstat_required();
    update_stats_enqueue(cfs_rq, se, flags);
    check_spread(cfs_rq, se);
    if (!curr)
        __enqueue_entity(cfs_rq, se);
    se->on_rq = 1;

    /*
     * When bandwidth control is enabled, cfs might have been removed
     * because of a parent been throttled but cfs->nr_running > 1. Try to
     * add it unconditionnally.
     */
    if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
        list_add_leaf_cfs_rq(cfs_rq);

    if (cfs_rq->nr_running == 1)
        check_enqueue_throttle(cfs_rq);
}
3. place_entity
This function adjusts a process's virtual runtime. It distinguishes two cases via the initial argument, which is set only when a new process is added to the system: enqueue_entity calls it with initial set to 0 when a task wakes up, while task_fork_fair calls it with initial set to 1.
For a new task (initial set, START_DEBIT enabled), the queue's min_vruntime is used as the base value and sched_vslice is added to it. Since the kernel has already promised that all active processes run at least once within the current latency period, this pushes the new task toward the end of the period rather than letting it run immediately. For a task that is waking up, sysctl_sched_latency (halved when GENTLE_FAIR_SLEEPERS is enabled) is subtracted from min_vruntime instead, giving the sleeper a bounded bonus that places it toward the left of the tree.
If, however, the sleeping process has already accumulated a larger vruntime of its own (i.e. se->vruntime exceeds the adjusted base value), that value is kept via max_vruntime, so a task can never gain virtual time simply by being re-inserted. The smaller the resulting vruntime, the further left the process sits in the red-black tree and the earlier it is scheduled.
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
    u64 vruntime = cfs_rq->min_vruntime;

    /*
     * The 'current' period is already promised to the current tasks,
     * however the extra weight of the new task will slow them down a
     * little, place the new task so that it fits in the slot that
     * stays open at the end.
     */
    if (initial && sched_feat(START_DEBIT))
        vruntime += sched_vslice(cfs_rq, se);

    /* sleeps up to a single latency don't count. */
    if (!initial) {
        unsigned long thresh = sysctl_sched_latency;

        /*
         * Halve their sleep time's effect, to allow
         * for a gentler effect of sleepers:
         */
        if (sched_feat(GENTLE_FAIR_SLEEPERS))
            thresh >>= 1;

        vruntime -= thresh;
    }

    /* ensure we never gain time by being placed backwards. */
    se->vruntime = max_vruntime(se->vruntime, vruntime);

    trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
}
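A userspace sketch of the wakeup branch (initial == 0) with GENTLE_FAIR_SLEEPERS enabled and a 24 ms sysctl_sched_latency; all numbers are invented for illustration:

/* Userspace sketch of place_entity() for a woken task (initial == 0),
 * GENTLE_FAIR_SLEEPERS enabled, sysctl_sched_latency = 24 ms.
 * All values are invented for illustration. */
#include <stdio.h>
#include <stdint.h>

static uint64_t max_vruntime(uint64_t a, uint64_t b)
{
    return ((int64_t)(b - a) > 0) ? b : a;
}

static uint64_t place_woken(uint64_t min_vruntime, uint64_t se_vruntime)
{
    uint64_t thresh = 24000000 >> 1;          /* gentle sleeper: half a latency */
    uint64_t base   = min_vruntime - thresh;  /* bounded wakeup bonus           */

    return max_vruntime(se_vruntime, base);   /* never move backwards           */
}

int main(void)
{
    uint64_t min_vr = 100000000;   /* cfs_rq->min_vruntime = 100 ms */

    /* long sleeper with a stale, small vruntime: gets the full bonus */
    printf("long sleeper : vruntime = %llu\n",
           (unsigned long long)place_woken(min_vr, 50000000));
    /* task that only slept briefly: keeps its own larger vruntime */
    printf("short sleeper: vruntime = %llu\n",
           (unsigned long long)place_woken(min_vr, 95000000));
    return 0;
}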
V: Selecting the next process
1. __pick_next_task_fair
const struct sched_class fair_sched_class
    __section("__fair_sched_class") = {
    ...
    .pick_next_task = __pick_next_task_fair,
    ...
}

static struct task_struct *__pick_next_task_fair(struct rq *rq)
{
    return pick_next_task_fair(rq, NULL, NULL);
}
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    ...
    if (!sched_fair_runnable(rq))
        ...
    do {
        se = pick_next_entity(cfs_rq, NULL);
        set_next_entity(cfs_rq, se);
        cfs_rq = group_cfs_rq(se);
    } while (cfs_rq);
    ...
}
2. pick_next_entity
__pick_first_entity fetches the scheduling entity of the leftmost process in the red-black tree; deriving the sched_entity from its embedded rb_node is done with the container_of mechanism. If curr is set and lies to the left of the leftmost entity in the tree (i.e. its vruntime is smaller than that of the rb_leftmost node), the passed-in curr is chosen as left. In the CFS path discussed in this section curr is not set (NULL is passed in), so this case can be ignored.
/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    struct sched_entity *left = __pick_first_entity(cfs_rq);
    ...
    /*
     * If curr is set we have to see if its left of the leftmost entity
     * still in the tree, provided there was anything in the tree at all.
     */
    if (!left || (curr && entity_before(curr, left)))
        left = curr;

    se = left; /* ideally we run the leftmost entity */
}

#define rb_entry(ptr, type, member) container_of(ptr, type, member)

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    if (!left)
        return NULL;

    return rb_entry(left, struct sched_entity, run_node);
}
3. container_of
container_of turns a pointer to a structure member into a pointer to the containing structure. A simplified expansion of the macro and an example:
#define container_of(ptr, type, member) \
    ((type *)((void *)(ptr) - (void *)&((type *)0)->member))

(struct user_t *)(((void *)(&user->age)) - (void *)(&((struct user_t *)0)->age));
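A self-contained userspace demo of the same mechanism, written with the standard offsetof instead of the null-pointer trick; struct user_t and its fields are made up for the example:

/* Self-contained userspace demo of the container_of idea used by rb_entry():
 * recover a pointer to the enclosing structure from a pointer to one of its
 * members. struct user_t and its fields are made up for this example. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct user_t {
    char name[16];
    int  age;
};

int main(void)
{
    struct user_t user = { "alice", 30 };
    int *age_ptr = &user.age;                 /* only the member is known */

    struct user_t *u = container_of(age_ptr, struct user_t, age);

    printf("recovered: name=%s age=%d (same object: %s)\n",
           u->name, u->age, (u == &user) ? "yes" : "no");
    return 0;
}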
4. set_next_entity
The currently executing entity is not kept in the red-black tree, so __dequeue_entity removes it from the tree. The link between the current process and the run queue is not lost, however: cfs_rq->curr marks se as the currently running entity:
void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* 'current' is not kept within the tree. */
    if (se->on_rq) {
        /*
         * Any task has to be enqueued before it get to execute on
         * a CPU. So account for the time it spent waiting on the
         * runqueue.
         */
        update_stats_wait_end(cfs_rq, se);
        __dequeue_entity(cfs_rq, se);
        update_load_avg(cfs_rq, se, UPDATE_TG);
    }

    update_stats_curr_start(cfs_rq, se);
    cfs_rq->curr = se;

    /*
     * Track our maximum slice length, if the CPU's load is at
     * least twice that of our own weight (i.e. dont track it
     * when there are only lesser-weight tasks around):
     */
    if (schedstat_enabled() &&
        rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
        schedstat_set(se->statistics.slice_max,
                      max((u64)schedstat_val(se->statistics.slice_max),
                          se->sum_exec_runtime - se->prev_sum_exec_runtime));
    }

    se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
EXPORT_SYMBOL_GPL(set_next_entity);
VI: The periodic scheduler
The periodic scheduler is handled by task_tick_fair; the actual work is done by entity_tick.
1. task_tick_fair
/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &curr->se;

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        entity_tick(cfs_rq, se, queued);
    }

    if (static_branch_unlikely(&sched_numa_balancing))
        task_tick_numa(rq, curr);

    update_misfit_status(curr, rq);
    update_overutilized_status(task_rq(curr));
}
2. entity_tick
update_curr updates the runtime statistics, update_load_avg updates the load averages, and update_cfs_group recalculates the group entity's weight from the current state of its group run queue.
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);

    /*
     * Ensure that runnable average is periodically updated.
     */
    update_load_avg(cfs_rq, curr, UPDATE_TG);
    update_cfs_group(curr);

#ifdef CONFIG_SCHED_HRTICK
    /*
     * queued ticks are scheduled to match the slice, so don't bother
     * validating it and just reschedule.
     */
    if (queued) {
        resched_curr(rq_of(cfs_rq));
        return;
    }
    /*
     * don't let the period tick interfere with the hrtick preemption
     */
    if (!sched_feat(DOUBLE_TICK) &&
        hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
        return;
#endif

    if (cfs_rq->nr_running > 1)
        check_preempt_tick(cfs_rq, curr);
}
3. check_preempt_tick
This step is only taken when more than one process is running on the queue; otherwise nothing happens.
The purpose of the function is to ensure that no process runs longer than its share of the latency period. That share, expressed in real time, is computed by sched_slice; the real time the process has already spent on the CPU is given by sum_exec_runtime - prev_sum_exec_runtime. The preemption decision is therefore easy to make: if the process has run longer than its allotted slice, rescheduling is requested via resched_curr, which sets the TIF_NEED_RESCHED flag in the task_struct; the core scheduler then initiates rescheduling at the next opportune moment.
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;
    struct sched_entity *se;
    s64 delta;
    bool skip_preempt = false;

    ideal_runtime = sched_slice(cfs_rq, curr);
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
                                         delta_exec, cfs_rq, curr,
                                         sysctl_sched_min_granularity);
    if (skip_preempt)
        return;
    if (delta_exec > ideal_runtime) {
        resched_curr(rq_of(cfs_rq));
        /*
         * The current task ran long enough, ensure it doesn't get
         * re-elected due to buddy favours.
         */
        clear_buddies(cfs_rq, curr);
        return;
    }

    /*
     * Ensure that a task that missed wakeup preemption by a
     * narrow margin doesn't have to wait for a full slice.
     * This also mitigates buddy induced latencies under load.
     */
    if (delta_exec < sysctl_sched_min_granularity)
        return;

    se = __pick_first_entity(cfs_rq);
    delta = curr->vruntime - se->vruntime;

    if (delta < 0)
        return;

    if (delta > ideal_runtime)
        resched_curr(rq_of(cfs_rq));
}
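A userspace sketch of the two preemption conditions checked above, with invented numbers (a 12 ms slice and a 3 ms minimum granularity):

/* Userspace sketch of the two conditions in check_preempt_tick():
 * (1) the task has exhausted its wall-time slice, or
 * (2) it has run at least the minimum granularity and leads the leftmost
 *     waiter by more than one slice in virtual time.
 * All values are invented for illustration. */
#include <stdio.h>
#include <stdint.h>

static int should_resched(uint64_t ideal_runtime, uint64_t delta_exec,
                          uint64_t min_granularity,
                          int64_t curr_vruntime, int64_t left_vruntime)
{
    if (delta_exec > ideal_runtime)            /* slice used up */
        return 1;
    if (delta_exec < min_granularity)          /* too early to bother */
        return 0;
    int64_t delta = curr_vruntime - left_vruntime;
    return delta > 0 && (uint64_t)delta > ideal_runtime;
}

int main(void)
{
    /* slice 12 ms, minimum granularity 3 ms */
    printf("%d\n", should_resched(12000000, 13000000, 3000000, 0, 0));        /* 1: over slice */
    printf("%d\n", should_resched(12000000,  2000000, 3000000, 90000000, 1)); /* 0: below granularity */
    printf("%d\n", should_resched(12000000,  5000000, 3000000, 90000000,
                                  70000000));                                 /* 1: far ahead in vruntime */
    return 0;
}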
VII: Wakeup preemption
1. check_preempt_curr
When a process is woken up in try_to_wake_up or wake_up_new_task, the kernel uses check_preempt_curr to see whether the new process may preempt the currently running one.
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
    if (p->sched_class == rq->curr->sched_class)
        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
    else if (p->sched_class > rq->curr->sched_class)
        resched_curr(rq);

    /*
     * A queue event has occurred, and we're going to schedule. In
     * this case, we can save a useless back to back clock update.
     */
    if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
        rq_clock_skip_update(rq);
}
EXPORT_SYMBOL_GPL(check_preempt_curr);
2. check_preempt_wakeup
This per-class check no longer involves the core scheduler; for processes handled by the completely fair scheduler it is performed by check_preempt_wakeup. The newly woken process need not be handled by CFS at all: if it is a real-time process, rescheduling is requested immediately, because real-time processes always preempt CFS processes:
.check_preempt_curr = check_preempt_wakeup,
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Idle tasks are preempted by non-idle tasks:
/* Idle tasks are by definition preempted by non-idle tasks. */
if (unlikely(task_has_idle_policy(curr)) &&
    likely(!task_has_idle_policy(p)))
    goto preempt;
Batch and idle tasks do not preempt non-idle tasks (that preemption is driven by the tick):
/*
 * Batch and idle tasks do not preempt non-idle tasks (their preemption
 * is driven by the tick):
 */
if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
    return;
When a running process is to be preempted by a newly woken one, the kernel ensures that the former has run at least for a certain minimum amount of time. This minimum is kept in sysctl_sched_wakeup_granularity and refers to real time, so the kernel first converts it to virtual time in wakeup_gran. If the virtual runtime of the new process (represented by its scheduling entity se) plus this granularity is still smaller than the virtual runtime of the currently executing process, rescheduling is requested:
/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;

static unsigned long wakeup_gran(struct sched_entity *se)
{
    unsigned long gran = sysctl_sched_wakeup_granularity;

    /*
     * Since its curr running now, convert the gran from real-time
     * to virtual-time in his units.
     *
     * By using 'se' instead of 'curr' we penalize light tasks, so
     * they get preempted easier. That is, if 'se' < 'curr' then
     * the resulting gran will be larger, therefore penalizing the
     * lighter, if otoh 'se' > 'curr' then the resulting gran will
     * be smaller, again penalizing the lighter task.
     *
     * This is especially important for buddies when the leftmost
     * task is higher priority than the buddy.
     */
    return calc_delta_fair(gran, se);
}

static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
    s64 gran, vdiff = curr->vruntime - se->vruntime;

    if (vdiff <= 0)
        return -1;

    gran = wakeup_gran(se);
    if (vdiff > gran)
        return 1;

    return 0;
}
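A userspace sketch of this decision; the 1 ms granularity is the unscaled default quoted above, and the woken task is assumed to run at nice 0 so that real and virtual granularity coincide:

/* Userspace sketch of wakeup_preempt_entity(): preempt only if the current
 * task is ahead of the woken task by more than the wakeup granularity.
 * 1 ms granularity, woken task at nice 0, all numbers invented. */
#include <stdio.h>
#include <stdint.h>

static int wakeup_preempt(int64_t curr_vruntime, int64_t woken_vruntime)
{
    int64_t gran  = 1000000;                       /* 1 ms, already in vtime */
    int64_t vdiff = curr_vruntime - woken_vruntime;

    if (vdiff <= 0)
        return -1;          /* woken task is not ahead: no preemption */
    if (vdiff > gran)
        return 1;           /* far enough ahead: preempt */
    return 0;               /* within the granularity: leave curr alone */
}

int main(void)
{
    printf("%d\n", wakeup_preempt(50000000, 45000000)); /*  1: preempt         */
    printf("%d\n", wakeup_preempt(50000000, 49500000)); /*  0: within 1 ms     */
    printf("%d\n", wakeup_preempt(50000000, 60000000)); /* -1: woken is behind */
    return 0;
}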
VIII: Handling new processes
The hook called when a new process is created is task_fork_fair:
.task_fork = task_fork_fair,
It updates the statistics of the current run queue with update_curr and then calls the place_entity interface discussed earlier with the initial argument set to 1, so that the vruntime is charged via sched_vslice. This effectively debits the new process its share of the latency period, converted into virtual time.
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr) {
    update_curr(cfs_rq);
    se->vruntime = curr->vruntime;
}
place_entity(cfs_rq, se, 1);
The variable sysctl_sched_child_runs_first controls whether a newly created child should run before its parent. It defaults to 0 (the parent runs first, see the code comment below) and can be changed via /proc/sys/kernel/sched_child_runs_first.
If the parent's virtual runtime (represented by curr) is smaller than the child's, the parent will run before the child: a smaller virtual runtime means a position further to the left in the red-black tree and therefore earlier scheduling. So if the child is supposed to run before the parent, their virtual runtimes are swapped; the child is then enqueued on the run queue in the usual way and rescheduling is requested via resched_curr.
/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
    /*
     * Upon rescheduling, sched_class::put_prev_task() will place
     * 'current' within the tree based on its new key value.
     */
    swap(curr->vruntime, se->vruntime);
    resched_curr(rq);
}