linux内核之调度算法（二）

最新推荐文章于 2025-12-05 18:30:00 发布

最新推荐文章于 2025-12-05 18:30:00 发布 · 87 阅读

文章标签：

#数据结构与算法

本文深入解析Linux内核的进程调度机制，重点介绍了核心函数schedule的工作流程，包括如何选择下一个运行的进程、如何处理进程上下文切换等关键步骤，并探讨了不同调度类之间的优先级关系。

上层调度：linux调度的核心函数为schedule，schedule函数封装了内核调度的框架，具体细节则通过调用各个调度类中的函数来实现。schedule函数主要流程为：

1，如果当前进程已不处于TASK_RUNNING状态（且本次调度不是由内核抢占触发、也没有需要处理的挂起信号），将当前进程从相应的运行队列中删除；

2，计算和更新调度实体和进程的相关调度信息；

3，将当前进程重新插入到调度运行队列中：对于CFS调度，根据具体的运行时间插入到合适的位置；对于实时调度，则插入到对应优先级队列的队尾；

4，从运行队列中选择运行的下一个进程；

5，进程调度信息和上下文切换；

当进程上下文切换后（关于进程切换在前面的文章中有介绍），调度就基本上完成了，当前运行的进程就是切换过来的进程了。

/*
 * schedule - entry point through which the rest of the kernel invokes the
 * process scheduler; it decides which task runs next and switches to it.
 *
 * (Translated from the original author's note:) schedule() normally works
 * together with a concrete scheduling class: it finds the highest-priority
 * scheduling class, which must own a runnable queue, and asks that class
 * which task should run next. The single most important thing this function
 * does is call pick_next_task().
 *
 * Takes no arguments and returns nothing. The function loops back to
 * need_resched while need_resched() reports true, and back to
 * need_resched_nonpreemptible when reacquire_kernel_lock() returns a
 * negative value.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);/* runqueue of this particular CPU */
	rcu_sched_qs(cpu);
	prev = rq->curr;/* the task currently running on this runqueue */
	/*
	 * Context-switch counter to bump later. Defaults to prev->nivcsw and is
	 * redirected to prev->nvcsw below when prev is leaving in a non-running
	 * state (presumably "involuntary" vs "voluntary" switches -- confirm
	 * against the task_struct definition).
	 */
	switch_count = &prev->nivcsw;

	release_kernel_lock(prev);
need_resched_nonpreemptible:

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	spin_lock_irq(&rq->lock);
	update_rq_clock(rq);/* refresh the runqueue's clock field */
	clear_tsk_need_resched(prev);/* clear prev's need-resched flag */

	/*
	 * Only when prev->state is non-zero and PREEMPT_ACTIVE is not set in the
	 * preempt count is prev considered for removal from the runqueue.
	 */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev)))
			prev->state = TASK_RUNNING;
		else/* remove prev from the runqueue; the implementation differs per scheduling class */
			deactivate_task(rq, prev, 1);
		switch_count = &prev->nvcsw;
	}

	/* Currently only relevant to real-time tasks (original author's note). */
	pre_schedule(rq, prev);

	/* Nothing runnable on this runqueue: give idle_balance() a chance first. */
	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	/*
	 * Put the current (outgoing) task back into its own run queue. Per the
	 * original author: CFS inserts it at the appropriate position, while the
	 * real-time class appends it to the tail of the list for its priority.
	 */
	put_prev_task(rq, prev);
	/* Choose the next task to run from the run queues. */
	next = pick_next_task(rq);

	if (likely(prev != next)) {
		/* Update bookkeeping for the outgoing task, the incoming task and the rq. */
		sched_info_switch(prev, next);
		perf_event_task_sched_out(prev, next, cpu);

		rq->nr_switches++;/* count this switch on the runqueue */
		rq->curr = next;
		++*switch_count;

		/* Perform the actual context switch. */
		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * the context switch might have flipped the stack from under
		 * us, hence refresh the local variables.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		spin_unlock_irq(&rq->lock);

	/* Used by real-time tasks (original author's note). */
	post_schedule(rq);

	if (unlikely(reacquire_kernel_lock(current) < 0))
		goto need_resched_nonpreemptible;

	preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}

对于cpu_rq函数

/*
 * cpu_rq - address of the runqueue that belongs to a given CPU.
 *
 * Expands to a pointer to the `runqueues` instance that per_cpu() selects
 * for @cpu. The original author describes this as adding an offset, i.e.
 * treating runqueues as an array of rq structures indexed by CPU number;
 * how per_cpu() actually locates the instance is not visible here.
 */
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))

deactivate_task函数实现

/*
 * deactivate_task - remove a task from the runqueue.
 *
 * @rq:    runqueue the task is being taken off
 * @p:     task to remove
 * @sleep: passed through unchanged to dequeue_task()
 *
 * When task_contributes_to_load(p) is true, rq->nr_uninterruptible is
 * incremented before the task is dequeued. Afterwards the runqueue's
 * running-task count is decremented via dec_nr_running().
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	if (task_contributes_to_load(p)) {
		rq->nr_uninterruptible += 1;
	}

	/* The actual removal is delegated to dequeue_task(). */
	dequeue_task(rq, p, sleep);

	/* One fewer task counted as running on this runqueue. */
	dec_nr_running(rq);
}

我们看具体的操作

/*
 * dequeue_task - take a task off its scheduling class's run queue.
 *
 * @rq:    runqueue the task currently sits on
 * @p:     task to dequeue
 * @sleep: when non-zero, the wakeup averages kept in p->se are updated
 *         before the task is removed; also forwarded to the class hook
 *
 * After the optional statistics update, the task's sched_info is told about
 * the dequeue, the class-specific dequeue_task hook is invoked, and
 * p->se.on_rq is cleared.
 */
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
	if (sleep && p->se.last_wakeup) {
		/* A wakeup timestamp is recorded: fold the elapsed runtime into avg_overlap. */
		update_avg(&p->se.avg_overlap,
			   p->se.sum_exec_runtime - p->se.last_wakeup);
		p->se.last_wakeup = 0;
	} else if (sleep) {
		/* No wakeup timestamp: feed the wakeup granularity into avg_wakeup. */
		update_avg(&p->se.avg_wakeup, sysctl_sched_wakeup_granularity);
	}

	/* Update the relevant fields of the task's sched_info structure. */
	sched_info_dequeued(p);

	/* Let the task's own scheduling class remove it from its run queue. */
	p->sched_class->dequeue_task(rq, p, sleep);

	p->se.on_rq = 0;
}

可见，最终调用的是进程所属调度类的出队（删除）函数。下面我们看最关键的选择下一个进程的方式。

/*
 * Pick up the highest-prio task:
 *
 * Scheduling classes are consulted in priority order, from highest to
 * lowest, and the first task any class offers is returned.
 *
 * @rq: runqueue to pick from
 *
 * Returns the task chosen to run next.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
	const struct sched_class *sc;
	struct task_struct *task;

	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.nr_running)) {
		task = fair_sched_class.pick_next_task(rq);
		if (likely(task))
			return task;
	}

	/*
	 * Walk the classes starting at sched_class_highest and following
	 * ->next. The walk has no explicit end condition:
	 * will never be NULL as the idle class always
	 * returns a non-NULL task.
	 */
	for (sc = sched_class_highest; ; sc = sc->next) {
		/* Ask this class for its next task. */
		task = sc->pick_next_task(rq);
		if (task)
			return task;
	}
}

可见，对于调度类的选择，同样以优先级进行。

对于进程调度信息的切换最终会调用__sched_info_switch

/* * Called when tasks are switched involuntarily due, typically, to expiring * their time slice. (This may also be called when switching to or from * the idle task.) We are only called when prev != next. */ static inline void __sched_info_switch(struct task_struct *prev, struct task_struct *next) { struct rq *rq = task_rq(prev); /* * prev now departs the cpu. It's not interesting to record * stats about how efficient we were at scheduling the idle * process, however. */ if (prev != rq->idle)/*如果被切换出去的进程不是idle进程*/ sched_info_depart(prev);/*更新prev进程和他对应rq的相关变量*/ if (next != rq->idle)/*如果切换进来的进程不是idle进程*/ sched_info_arrive(next);/*更新next进程和对应队列的相关变量*/ }/* * Called when a process ceases being the active-running process, either * voluntarily or involuntarily. Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call * sched_info_queued() to mark that it has now again started waiting on * the runqueue. */ static inline void sched_info_depart(struct task_struct *t) { /*计算在进程在rq中运行的时间长度*/ unsigned long long delta = task_rq(t)->clock - t->sched_info.last_arrival; /*更新RunQueue中的Task所得到CPU執行時間的累加值.*/ rq_sched_info_depart(task_rq(t), delta); /*如果被切换出去进程的状态是运行状态那么将进程sched_info.last_queued设置为rq的clock last_queued为最后一次排队等待运行的时间*/ if (t->state == TASK_RUNNING) sched_info_queued(t); }/* * Called when a task finally hits the cpu. We can now calculate how * long it was waiting to run. We also note when it began so that we * can keep stats on how long its timeslice is. 
*/ static void sched_info_arrive(struct task_struct *t) { unsigned long long now = task_rq(t)->clock, delta = 0; if (t->sched_info.last_queued)/*如果被切换进来前在运行进程中排队*/ delta = now - t->sched_info.last_queued;/*计算排队等待的时间长度*/ sched_info_reset_dequeued(t);/*因为进程将被切换进来运行，设定last_queued为0*/ t->sched_info.run_delay += delta;/*更新进程在运行队列里面等待的时间*/ t->sched_info.last_arrival = now;/*更新最后一次运行的时间*/ t->sched_info.pcount++;/*cpu上运行的次数加一*/ /*更新rq中rq_sched_info中的对应的变量*/ rq_sched_info_arrive(task_rq(t), delta); }

对于schedule调度函数框架的分析基本是这样了，对于具体的CFS和实时调度的实现在后面分析。