When returning to user space from a system call or from an exception/interrupt, a reschedule happens if TIF_NEED_RESCHED is set in thread_flags. There are several other scheduling points as well; this post focuses on the return-to-userspace path.
linux-4.10/arch/arm64/kernel/entry.S
744 ret_fast_syscall:
745 disable_irq // disable interrupts
746 str x0, [sp, #S_X0] // returned x0
747 ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
748 and x2, x1, #_TIF_SYSCALL_WORK
749 cbnz x2, ret_fast_syscall_trace
750 and x2, x1, #_TIF_WORK_MASK
751 cbnz x2, work_pending // branch to work_pending
752 enable_step_tsk x1, x2
753 kernel_exit 0
761 work_pending:
762 mov x0, sp // 'regs'
763 bl do_notify_resume // call do_notify_resume
764 #ifdef CONFIG_TRACE_IRQFLAGS
765 bl trace_hardirqs_on // enabled while in userspace
766 #endif
767 ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
768 b finish_ret_to_user
This flow appeared in an earlier post; it is also the path taken when an application crashes with an exception.
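For reference, _TIF_WORK_MASK is the mask that diverts the fast return path into work_pending; a sketch of its definition from arch/arm64/include/asm/thread_info.h (quoted from memory, so treat the exact flag set as indicative — the 4.10 version may also include _TIF_UPROBE):

/* Indicative sketch: any of these flags pending on return to user
 * space forces the slow path through work_pending above. */
#define _TIF_WORK_MASK	(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
			 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)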
linux-4.10/arch/arm64/kernel/signal.c
402 asmlinkage void do_notify_resume(struct pt_regs *regs,
403 unsigned int thread_flags)
404 {
405 /*
406 * The assembly code enters us with IRQs off, but it hasn't
407 * informed the tracing code of that for efficiency reasons.
408 * Update the trace code with the current status.
409 */
410 trace_hardirqs_off();
411 do {
412 if (thread_flags & _TIF_NEED_RESCHED) {
413 schedule(); // if the reschedule flag is set, schedule
414 } else {
...
430 }
431
432 local_irq_disable();
433 thread_flags = READ_ONCE(current_thread_info()->flags);
434 } while (thread_flags & _TIF_WORK_MASK);
435 }
When _TIF_NEED_RESCHED is set in (or added to) thread_flags, schedule() is called to perform the actual reschedule.
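For context on who sets this flag: it is typically raised by resched_curr() in kernel/sched/core.c, e.g. from scheduler_tick() when the current task has used up its slice, or when a higher-priority task wakes up. A simplified sketch of its logic (not verbatim 4.10 code; the real function also handles polling idle CPUs):

/* Simplified sketch of resched_curr() (kernel/sched/core.c) */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu = cpu_of(rq);

	if (test_tsk_need_resched(curr))	/* flag already set */
		return;

	if (cpu == smp_processor_id()) {
		/* Local CPU: just mark the flag; it is acted on at the
		 * next preemption point or return to user space. */
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	/* Remote CPU: set the flag and send an IPI so that CPU
	 * notices TIF_NEED_RESCHED promptly. */
	set_tsk_need_resched(curr);
	smp_send_reschedule(cpu);
}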
linux-4.10/kernel/sched/core.c
3451 asmlinkage __visible void __sched schedule(void)
3452 {
3453 struct task_struct *tsk = current;
3454
3455 sched_submit_work(tsk);
3456 do {
3457 preempt_disable(); // disable preemption
3458 __schedule(false);
3459 sched_preempt_enable_no_resched(); // re-enable preemption without an immediate resched check
3460 } while (need_resched());
3461 }
3462 EXPORT_SYMBOL(schedule);
Execution then enters __schedule(). The preempt parameter indicates whether this schedule was triggered by kernel preemption; here it is false, i.e. this is a voluntary (non-preemptive) schedule.
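For comparison, the kernel-preemption path runs the same loop with preempt = true; a trimmed sketch of preempt_schedule_common() in kernel/sched/core.c (comments in the real function omitted):

static void __sched notrace preempt_schedule_common(void)
{
	do {
		preempt_disable_notrace();
		__schedule(true);		/* preempt = true */
		preempt_enable_no_resched_notrace();
	} while (need_resched());
}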
3334 static void __sched notrace __schedule(bool preempt)
3335 {
3336 struct task_struct *prev, *next;
3337 unsigned long *switch_count;
3338 struct pin_cookie cookie;
3339 struct rq *rq;
3340 int cpu;
3341
/*
 * linux-4.10/include/linux/smp.h
 * # define smp_processor_id() raw_smp_processor_id()
 *
 * linux-4.10/arch/arm64/include/asm/smp.h
 * #define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number)) // the per-cpu
 *   form looks odd, but it simply yields the ID of the CPU we are running on;
 *   the arm32 version below is more direct:
 * linux-4.10/arch/arm/include/asm/smp.h
 * #define raw_smp_processor_id() (current_thread_info()->cpu)
 */
3342 cpu = smp_processor_id();
3343 rq = cpu_rq(cpu); // this CPU's runqueue
3344 prev = rq->curr; // the task_struct currently running on this CPU
3345
3346 schedule_debug(prev);
3347
3348 if (sched_feat(HRTICK)) // is the HRTICK scheduler feature enabled?
3349 hrtick_clear(rq); // cancel any pending high-resolution tick timer
3350
3351 local_irq_disable(); // disable local interrupts
3352 rcu_note_context_switch(); // RCU (Read-Copy-Update): report a context switch; readers of RCU-protected data need no locks
3353
3354 /*
3355 * Make sure that signal_pending_state()->signal_pending() below
3356 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3357 * done by the caller to avoid the race with signal_wake_up().
3358 */
3359 smp_mb__before_spinlock(); // boils down to a barrier, ensuring the code below is not reordered before this point
3360 raw_spin_lock(&rq->lock); // take the runqueue spinlock
3361 cookie = lockdep_pin_lock(&rq->lock);
3362
3363 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ // the left shift turns RQCF_REQ_SKIP (0x1) into RQCF_ACT_SKIP (0x2)
3364
3365 switch_count = &prev->nivcsw;
3366 if (!preempt && prev->state) {
3367 if (unlikely(signal_pending_state(prev->state, prev))) {
3368 prev->state = TASK_RUNNING;
3369 } else {
3370 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3371 prev->on_rq = 0;
3372
3373 /*
3374 * If a worker went to sleep, notify and ask workqueue
3375 * whether it wants to wake up a task to maintain
3376 * concurrency.
3377 */
3378 if (prev->flags & PF_WQ_WORKER) {
3379 struct task_struct *to_wakeup;
3380
3381 to_wakeup = wq_worker_sleeping(prev);
3382 if (to_wakeup)
3383 try_to_wake_up_local(to_wakeup, cookie);
3384 }
3385 }
3386 switch_count = &prev->nvcsw;
3387 }
3388
3389 if (task_on_rq_queued(prev))
3390 update_rq_clock(rq);
3391
3392 next = pick_next_task(rq, prev, cookie); // pick the next task (task_struct) to run
3393 clear_tsk_need_resched(prev);
3394 clear_preempt_need_resched();
3395 rq->clock_skip_update = 0;
3396
3397 if (likely(prev != next)) {
3398 rq->nr_switches++; // count context switches on this rq
3399 rq->curr = next; // make the chosen task the runqueue's current task
3400 ++*switch_count;
3401
3402 trace_sched_switch(preempt, prev, next);
3403 rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ // perform the context switch
3404 } else {
3405 lockdep_unpin_lock(&rq->lock, cookie);
3406 raw_spin_unlock_irq(&rq->lock);
3407 }
3408
3409 balance_callback(rq);
3410 }
pick_next_task() asks the scheduler classes for the next process (or thread) to run.
3259 static inline struct task_struct *
3260 pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
3261 {
3262 const struct sched_class *class = &fair_sched_class; // the Completely Fair Scheduler class
3263 struct task_struct *p;
3264
3265 /*
3266 * Optimization: we know that if all tasks are in
3267 * the fair class we can call that function directly:
3268 */
3269 if (likely(prev->sched_class == class &&
3270 rq->nr_running == rq->cfs.h_nr_running)) {
3271 p = fair_sched_class.pick_next_task(rq, prev, cookie); // let CFS pick the next entity to run
3272 if (unlikely(p == RETRY_TASK))
3273 goto again;
3274
3275 /* assumes fair_sched_class->next == idle_sched_class */
3276 if (unlikely(!p)) // CFS has nothing runnable
3277 p = idle_sched_class.pick_next_task(rq, prev, cookie); // fall back to the idle class
3278
3279 return p;
3280 }
3281
3282 again:
3283 for_each_class(class) { // stop_sched_class -> dl_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class
3284 p = class->pick_next_task(rq, prev, cookie);
3285 if (p) {
3286 if (unlikely(p == RETRY_TASK))
3287 goto again;
3288 return p;
3289 }
3290 }
3291
3292 BUG(); /* the idle class will always have a runnable task */
3293 }
linux-4.10/kernel/sched/sched.h
1303 #define for_each_class(class) \
1304 for (class = sched_class_highest; class; class = class->next)
1302 #define sched_class_highest (&stop_sched_class)
First it checks whether the current process belongs to fair_sched_class; if so, fair_sched_class.pick_next_task() is called directly to choose the next process. Normal (CFS) tasks are scheduled far more often than any other class, which is why the condition is wrapped in likely(): the task is first assumed to be an ordinary CFS task. If this fast path does not apply, the scheduler classes are tried in priority order: stop_sched_class -> dl_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class.
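To make the traversal order concrete, here is a small self-contained user-space demo of the same singly linked class list (the names mirror the kernel's, but this is a standalone sketch, not kernel code):

#include <stdio.h>

struct sched_class {
	const char *name;
	const struct sched_class *next;
};

/* Same priority chain as the kernel: stop -> dl -> rt -> fair -> idle */
static const struct sched_class idle_sched_class = { "idle", NULL };
static const struct sched_class fair_sched_class = { "fair", &idle_sched_class };
static const struct sched_class rt_sched_class   = { "rt",   &fair_sched_class };
static const struct sched_class dl_sched_class   = { "dl",   &rt_sched_class };
static const struct sched_class stop_sched_class = { "stop", &dl_sched_class };

#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
	for (class = sched_class_highest; class; class = class->next)

int main(void)
{
	const struct sched_class *class;

	for_each_class(class)		/* prints stop dl rt fair idle */
		printf("%s\n", class->name);
	return 0;
}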
linux-4.10/kernel/sched/core.c
2859 /*
2860 * context_switch - switch to the new MM and the new thread's register state.
2861 */
2862 static __always_inline struct rq *
2863 context_switch(struct rq *rq, struct task_struct *prev,
2864 struct task_struct *next, struct pin_cookie cookie)
2865 {
2866 struct mm_struct *mm, *oldmm;
2867
2868 prepare_task_switch(rq, prev, next); // generic bookkeeping before the switch (sched_info, perf events, preempt notifiers)
2869
2870 mm = next->mm;
2871 oldmm = prev->active_mm;
2872 /*
2873 * For paravirt, this is coupled with an exit in switch_to to
2874 * combine the page table reload and the switch backend into
2875 * one hypercall.
2876 */
2877 arch_start_context_switch(prev); // arch-specific hook; a no-op here: #define arch_start_context_switch(prev) do {} while (0)
2878
2879 if (!mm) { // next is a kernel thread: borrow prev's address space
2880 next->active_mm = oldmm;
2881 atomic_inc(&oldmm->mm_count);
2882 enter_lazy_tlb(oldmm, next);
2883 } else
2884 switch_mm_irqs_off(oldmm, mm, next); // switch the address space (mm)
2885
2886 if (!prev->mm) {
2887 prev->active_mm = NULL;
2888 rq->prev_mm = oldmm;
2889 }
2890 /*
2891 * Since the runqueue lock will be released by the next
2892 * task (which is an invalid locking op but in the case
2893 * of the scheduler it's an obvious special-case), so we
2894 * do an early lockdep release here:
2895 */
2896 lockdep_unpin_lock(&rq->lock, cookie);
2897 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2898
2899 /* Here we just switch the register state and the stack. */
2900 switch_to(prev, next, prev);
2901 barrier(); // compiler barrier: keeps the switch and the code after it in order
2902
2903 return finish_task_switch(prev);
2904 }
Once the scheduler has picked the next process and its mm address space has been set up, switch_to() is called to perform the task switch. Note the kernel-thread case: when next->mm is NULL, prev's mm is borrowed as active_mm and its reference count is raised; the matching mmdrop() happens later in finish_task_switch().
linux-4.10/include/asm-generic/switch_to.h
25 #define switch_to(prev, next, last) \
26 do { \
27 ((last) = __switch_to((prev), (next))); \
28 } while (0)
The third argument, last, receives the task that was actually running before the switch: __switch_to() returns it so the resumed context gets a valid prev even though its own stack contents date from an earlier switch.
linux-4.10/arch/arm64/kernel/process.c
339 /*
340 * Thread switching.
341 */
342 struct task_struct *__switch_to(struct task_struct *prev,
343 struct task_struct *next)
344 {
345 struct task_struct *last;
346
347 fpsimd_thread_switch(next); // save the live FPSIMD state into prev's thread.fpsimd_state and load next's
348 tls_thread_switch(next); // switch the TLS registers (tpidr_el0 / tpidrro_el0)
349 hw_breakpoint_thread_switch(next);
350 contextidr_thread_switch(next);
351 entry_task_switch(next);
352 uao_thread_switch(next);
353
354 /*
355 * Complete any pending TLB or cache maintenance on this CPU in case
356 * the thread migrates to a different CPU.
357 */
358 dsb(ish);
359
360 /* the actual thread switch */
361 last = cpu_switch_to(prev, next); // the actual CPU register/stack switch
362
363 return last;
364 }
After the per-thread state above has been saved and set up, cpu_switch_to() switches the CPU register state.
linux-4.10/arch/arm64/kernel/entry.S
708 /*
709 * Register switch for AArch64. The callee-saved registers need to be saved
710 * and restored. On entry:
711 * x0 = previous task_struct (must be preserved across the switch)
712 * x1 = next task_struct
713 * Previous and next are guaranteed not to be the same.
714 *
715 */
716 ENTRY(cpu_switch_to)
717 mov x10, #THREAD_CPU_CONTEXT //DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context));
718 add x8, x0, x10 // x0 = prev x8= prev->thread.cpu_context
719 mov x9, sp //x9 = sp
720 stp x19, x20, [x8], #16 //prev->thread.cpu_context.x19 =x19, prev->thread.cpu_context.x20 =x20
721 stp x21, x22, [x8], #16 //prev->thread.cpu_context.x21 =x21, prev->thread.cpu_context.x22 =x22
722 stp x23, x24, [x8], #16 //prev->thread.cpu_context.x23 =x23, prev->thread.cpu_context.x24 =x24
723 stp x25, x26, [x8], #16 //prev->thread.cpu_context.x25 =x25, prev->thread.cpu_context.x26 =x26
724 stp x27, x28, [x8], #16 //prev->thread.cpu_context.x27 =x27, prev->thread.cpu_context.x28 =x28
725 stp x29, x9, [x8], #16 //prev->thread.cpu_context.fp =x29, prev->thread.cpu_context.sp =x9
726 str lr, [x8]
727 add x8, x1, x10 // x1= next
728 ldp x19, x20, [x8], #16 //x19 =next->thread.cpu_context.x19, x20 =next->thread.cpu_context.x20
729 ldp x21, x22, [x8], #16 //x21 =next->thread.cpu_context.x21, x22 =next->thread.cpu_context.x22
730 ldp x23, x24, [x8], #16 //x23 =next->thread.cpu_context.x23, x24 =next->thread.cpu_context.x24
731 ldp x25, x26, [x8], #16 //x25 =next->thread.cpu_context.x25, x26 =next->thread.cpu_context.x26
732 ldp x27, x28, [x8], #16 //x27 =next->thread.cpu_context.x27, x28 =next->thread.cpu_context.x28
733 ldp x29, x9, [x8], #16 //x29 =next->thread.cpu_context.fp, x9 =next->thread.cpu_context.sp
734 ldr lr, [x8]
735 mov sp, x9
736 msr sp_el0, x1 //sp_el0 = x1 (next); arm64 keeps the current task_struct pointer here ('tsk'/current)
737 ret
738 ENDPROC(cpu_switch_to)
The CPU-context switch boils down to saving prev's register state into memory (prev->thread.cpu_context) and restoring next's previously saved state (next->thread.cpu_context) into the corresponding registers. Only the callee-saved registers (x19-x28, fp, sp, lr) need to be handled here; anything caller-saved was already spilled by the compiler around the call into __switch_to().
linux-4.10/include/linux/sched.h
1511 struct task_struct {
...
2001 /* CPU-specific state of this task */
2002 struct thread_struct thread;
...
2009 };
linux-4.10/arch/arm64/include/asm/processor.h
79 struct thread_struct {
80 struct cpu_context cpu_context; /* cpu context */
81 unsigned long tp_value; /* TLS register */
82 #ifdef CONFIG_COMPAT
83 unsigned long tp2_value;
84 #endif
85 struct fpsimd_state fpsimd_state;
86 unsigned long fault_address; /* fault info */
87 unsigned long fault_code; /* ESR_EL1 value */
88 struct debug_info debug; /* debugging */
89 };
linux-4.10/arch/arm64/include/asm/processor.h
63 struct cpu_context {
64 unsigned long x19;
65 unsigned long x20;
66 unsigned long x21;
67 unsigned long x22;
68 unsigned long x23;
69 unsigned long x24;
70 unsigned long x25;
71 unsigned long x26;
72 unsigned long x27;
73 unsigned long x28;
74 unsigned long fp;
75 unsigned long sp;
76 unsigned long pc;
77 };
The relationship between these data structures: task_struct embeds a thread_struct (its thread field), which in turn embeds the cpu_context that holds the saved registers.
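The THREAD_CPU_CONTEXT constant used by cpu_switch_to comes straight out of this nesting; a standalone sketch (with simplified stand-in structs, not the kernel's) showing how a single offsetof() hands the assembly code its register save area:

#include <stdio.h>
#include <stddef.h>

/* Simplified stand-ins for the kernel structs above */
struct cpu_context { unsigned long x19, fp, sp, pc; };
struct thread_struct { struct cpu_context cpu_context; };
struct task_struct { long state; struct thread_struct thread; };

int main(void)
{
	/* Mirrors DEFINE(THREAD_CPU_CONTEXT,
	 *   offsetof(struct task_struct, thread.cpu_context));
	 * in arch/arm64/kernel/asm-offsets.c */
	printf("THREAD_CPU_CONTEXT = %zu\n",
	       offsetof(struct task_struct, thread.cpu_context));
	return 0;
}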
After the switch completes, finish_task_switch() takes care of the remaining cleanup.
linux-4.10/kernel/sched/core.c
2731 static struct rq *finish_task_switch(struct task_struct *prev)
2732 __releases(rq->lock)
2733 {
2734 struct rq *rq = this_rq();
2735 struct mm_struct *mm = rq->prev_mm;
2736 long prev_state;
2737
2738 /*
2739 * The previous task will have left us with a preempt_count of 2
2740 * because it left us after:
2741 *
2742 * schedule()
2743 * preempt_disable(); // 1
2744 * __schedule()
2745 * raw_spin_lock_irq(&rq->lock) // 2
2746 *
2747 * Also, see FORK_PREEMPT_COUNT.
2748 */
2749 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2750 "corrupted preempt_count: %s/%d/0x%x\n",
2751 current->comm, current->pid, preempt_count()))
2752 preempt_count_set(FORK_PREEMPT_COUNT);
2753
2754 rq->prev_mm = NULL;
2755
2756 /*
2757 * A task struct has one reference for the use as "current".
2758 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2759 * schedule one last time. The schedule call will never return, and
2760 * the scheduled task must drop that reference.
2761 *
2762 * We must observe prev->state before clearing prev->on_cpu (in
2763 * finish_lock_switch), otherwise a concurrent wakeup can get prev
2764 * running on another CPU and we could race with its RUNNING -> DEAD
2765 * transition, resulting in a double drop.
2766 */
2767 prev_state = prev->state;
2768 vtime_task_switch(prev);
2769 perf_event_task_sched_in(prev, current);
2770 finish_lock_switch(rq, prev);
2771 finish_arch_post_lock_switch();
2772
2773 fire_sched_in_preempt_notifiers(current);
2774 if (mm) // drop the reference taken when a kernel thread borrowed this mm in context_switch()
2775 mmdrop(mm);
2776 if (unlikely(prev_state == TASK_DEAD)) { // the previous task has exited: release its task_struct
2777 if (prev->sched_class->task_dead)
2778 prev->sched_class->task_dead(prev);
2779
2780 /*
2781 * Remove function-return probe instances associated with this
2782 * task and put them back on the free list.
2783 */
2784 kprobe_flush_task(prev);
2785
2786 /* Task is done with its stack. */
2787 put_task_stack(prev);
2788
2789 put_task_struct(prev);
2790 }
2791
2792 tick_nohz_task_switch();
2793 return rq;
2794 }
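The TASK_DEAD branch above is reached from the exit path: a dying task sets TASK_DEAD and schedules one last time, and since __schedule() never returns to it, the final put_task_struct() has to happen here, in the next task's finish_task_switch(). A trimmed sketch of the other side (do_task_dead() in kernel/sched/core.c; locking details omitted):

void __noreturn do_task_dead(void)
{
	__set_current_state(TASK_DEAD);	/* seen as prev_state above */
	current->flags |= PF_NOFREEZE;	/* tell the freezer to ignore us */
	__schedule(false);		/* never returns */
	BUG();
}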
This post has walked through how, in Linux kernel 4.10 on arm64, a process switch is triggered when a system call or exception/interrupt returns to user space. The key code lives in entry.S and signal.c, together with the core scheduler functions in kernel/sched/core.c; once the switch is done, finish_task_switch() performs the final cleanup.