kernel/sched/core.c
一、主调度器
include/linux/sched.h中定义了struct task_struct {}。
进程调度最核心的两个问题是,什么时间放弃cpu,接下来由谁获得cpu。放弃cpu有两种方式,一种是主动调用schedule(),一种是周期调度器强行调度scheduler_tick()。
------------------------------------------------------------------------------------------------
pick_next_task();
判断当前cpu就绪队列中的进程总数(rq->nr_running)是否与普通进程(CFS)就绪队列中的进程数目(rq->cfs.h_nr_running)相同,如果相同就说明该就绪队列上全是普通进程,直接通过cfs调度类的pick_next_task_fair()函数(kernel/sched/fair.c)从普通进程的就绪队列中选择进程即可。
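否则,按优先级从高到低依次遍历各调度类,从第一个能够提供可运行进程的调度类中选出下一个进程。较早的内核中调度类通过next指针串成链表:rt_sched_class → fair_sched_class → idle_sched_class → NULL;在下文所贴代码对应的内核版本中,完整的优先级顺序为:stop_sched_class → dl_sched_class → rt_sched_class → fair_sched_class → idle_sched_class。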
__schedule()调用了pick_next_task(); __schedule()是主调度器。
schedule(),do_task_dead(),schedule_idle(),preempt_schedule_common(),preempt_schedule_lock(),preempt_schedule_notrace(),preempt_schedule_irq()都调用了__schedule(); __schedule()函数主要是对当前task做善后工作,对新task做准备工作。
/*
 * __schedule() - the main scheduler function.
 *
 * Takes the run queue of the current CPU, decides whether the currently
 * running task (prev) has to be removed from the run queue, picks the next
 * task to run and, if it differs from prev, switches context to it.
 *
 * @preempt: when false and prev has a non-zero state, prev is treated as
 *           going to sleep (it is dequeued unless a signal is pending) and
 *           the switch is counted in prev->nvcsw; otherwise prev stays on
 *           the run queue and the switch is counted in prev->nivcsw.
 *           NOTE(review): presumably true when called from a preemption
 *           path - confirm against the callers.
 *
 * Runs with local interrupts disabled and rq->lock held from rq_lock()
 * until either context_switch() (which also unlocks the rq, per the comment
 * at its call site) or the explicit raw_spin_unlock_irq() in the
 * "no switch" branch.
 */
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
cpu = smp_processor_id();// ID of the processor we are running on
rq = cpu_rq(cpu);// run queue belonging to that processor
prev = rq->curr;// task currently running on this run queue
schedule_debug(prev, preempt);// sanity checks / debug output for prev (helper defined elsewhere)
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))// with an HRTICK feature enabled, clear the high-resolution tick of this rq
hrtick_clear(rq);
local_irq_disable();// disable interrupts on the local CPU
rcu_note_context_switch(preempt);// tell RCU that this CPU is going through a context switch
/*
 * Make sure that signal_pending_state()->signal_pending() below
 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 * done by the caller to avoid the race with signal_wake_up():
 *
 *   __set_current_state(@state)       signal_wake_up()
 *   schedule()                          set_tsk_thread_flag(p, TIF_SIGPENDING)
 *                                       wake_up_state(p, state)
 *   LOCK rq->lock                       LOCK p->pi_state
 *   smp_mb__after_spinlock()            smp_mb__after_spinlock()
 *     if (signal_pending_state())       if (p->state & @state)
 *
 * Also, the membarrier system call requires a full memory barrier
 * after coming from user-space, before storing to rq->curr.
 */
rq_lock(rq, &rf);
smp_mb__after_spinlock();// memory barrier required by the ordering argument above
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);// bring the run queue clock up to date
// Default: count this switch as involuntary (nivcsw). Overridden below
// with nvcsw when the task gives up the CPU without being preempted.
switch_count = &prev->nivcsw;
/*
 * We must load prev->state once (task_struct::state is volatile), such
 * that:
 *
 * - we form a control dependency vs deactivate_task() below.
 * - ptrace_{,un}freeze_traced() can change ->state underneath us.
 */
prev_state = prev->state;// single snapshot of prev's state (a read, not an update)
if (!preempt && prev_state) {
if (signal_pending_state(prev_state, prev)) {// a signal is pending for this state: keep prev runnable instead of dequeuing it
prev->state = TASK_RUNNING;
} else {
// prev contributes to the load figure only if it sleeps
// uninterruptibly, is not flagged TASK_NOLOAD and is not frozen.
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev->flags & PF_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++; // one more uninterruptible task accounted on this rq
/*
 * __schedule()                     ttwu()
 *   prev_state = prev->state;        if (p->on_rq && ...)
 *   if (prev_state)                    goto out;
 *     p->on_rq = 0;                  smp_acquire__after_ctrl_dep();
 *                                    p->state = TASK_WAKING
 *
 * Where __schedule() and ttwu() have matching control dependencies.
 *
 * After this, schedule() must not care about p->state any more.
 */
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);// remove prev from the run queue (it is going to sleep)
if (prev->in_iowait) {// prev is waiting for I/O: bump rq->nr_iowait atomically and start block-I/O delay accounting
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
switch_count = &prev->nvcsw;// not preempted and state was non-zero: count as a voluntary switch
}
next = pick_next_task(rq, prev, &rf);// choose the task to run next
clear_tsk_need_resched(prev);// clear the need-resched flag on prev (the outgoing task, not the new one)
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
#endif
if (likely(prev != next)) {
rq->nr_switches++;// number of context switches performed on this run queue
/*
 * RCU users of rcu_dereference(rq->curr) may not see
 * changes to task_struct made by pick_next_task().
 */
RCU_INIT_POINTER(rq->curr, next);// store next into the RCU-read pointer rq->curr (a pointer store, not a lock)
/*
 * The membarrier system call requires each architecture
 * to have a full memory barrier after updating
 * rq->curr, before returning to user-space.
 *
 * Here are the schemes providing that barrier on the
 * various architectures:
 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
 * - finish_lock_switch() for weakly-ordered
 *   architectures where spin_unlock is a full barrier,
 * - switch_to() for arm64 (weakly-ordered, spin_unlock
 *   is a RELEASE barrier),
 */
// Increment whichever counter was selected above (nivcsw or nvcsw).
++*switch_count;
migrate_disable_switch(rq, prev);
// Third argument is true when prev is no longer queued on the run queue.
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(preempt, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);// switch from prev to next; rq is re-read from the return value
} else {
// No switch needed: clear the clock-skip flags, unpin the lock, run the
// balance callbacks, then drop rq->lock and re-enable interrupts.
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
}
}
二、调度器类
每一种调度器类都必须实现struct sched_class. 其成员包括task_tick等。