linux 进程调度切换过程分析

最新推荐文章于 2025-10-11 04:57:38 发布

原创最新推荐文章于 2025-10-11 04:57:38 发布 · 3.4k 阅读

6 ·

CC 4.0 BY-SA版权

文章标签：

#linux #schedule #linux schedule

linux 专栏收录该内容

9 篇文章

订阅专栏

本文主要探讨了在Linux内核4.10版本中，当系统调用或异常中断返回用户空间时，如何触发进程调度的过程。涉及到的关键代码位于entry.S和signal.c文件中，以及sched核心函数的实现。在切换完成后，通过finish_task_switch()函数进行后续清理工作。

从系统调用或者异常、中断返回用户空间时，如果当前线程的 thread_info flags 中设置了 TIF_NEED_RESCHED 标志位，就会发生调度。当然还有其他几个时机也会发生调度，这里主要介绍返回用户空间时的情况。

linux-4.10/arch/arm64/kernel/entry.S

744  ret_fast_syscall:
745  	disable_irq				// disable interrupts
746  	str	x0, [sp, #S_X0]			// returned x0
747  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
748  	and	x2, x1, #_TIF_SYSCALL_WORK
749  	cbnz	x2, ret_fast_syscall_trace
750  	and	x2, x1, #_TIF_WORK_MASK
751  	cbnz	x2, work_pending   //跳到work_pending
752  	enable_step_tsk x1, x2
753  	kernel_exit 0

761  work_pending:
762  	mov	x0, sp				// 'regs'
763  	bl	do_notify_resume     //跳到do_notify_resume
764  #ifdef CONFIG_TRACE_IRQFLAGS
765  	bl	trace_hardirqs_on		// enabled while in userspace
766  #endif
767  	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
768  	b	finish_ret_to_user

这个流程在前面的博客中有贴出来过，在应用发生crash 异常的时候，会走到这里。

linux-4.10/arch/arm64/kernel/signal.c

402  asmlinkage void do_notify_resume(struct pt_regs *regs,
403  				 unsigned int thread_flags)
404  {
405  	/*
406  	 * The assembly code enters us with IRQs off, but it hasn't
407  	 * informed the tracing code of that for efficiency reasons.
408  	 * Update the trace code with the current status.
409  	 */
410  	trace_hardirqs_off();
411  	do {
412  		if (thread_flags & _TIF_NEED_RESCHED) {
413  			schedule();  //如果调度的标识位被设置，则进行调度
414  		} else {
...
430  		}
431  
432  		local_irq_disable();
433  		thread_flags = READ_ONCE(current_thread_info()->flags);
434  	} while (thread_flags & _TIF_WORK_MASK);
435  }

当 thread_flags 中设置了（或者被添加了）_TIF_NEED_RESCHED 标志位时，就会调用 schedule() 进行调度。

linux-4.10/kernel/sched/core.c

3451  asmlinkage __visible void __sched schedule(void)
3452  {
3453  	struct task_struct *tsk = current;
3454  
3455  	sched_submit_work(tsk);
3456  	do {
3457  		preempt_disable();  //禁止抢占
3458  		__schedule(false);  
3459  		sched_preempt_enable_no_resched(); //开启抢占
3460  	} while (need_resched());
3461  }
3462  EXPORT_SYMBOL(schedule);

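接着进入 __schedule()。参数 preempt 表示本次调度是否由内核抢占触发；这里传入的参数是 false，表示这是一次主动调度，而不是抢占调度。因此在下面的代码中，当 `!preempt && prev->state` 成立（prev 已不处于运行状态）且没有待处理的信号时，prev 会被 deactivate_task() 移出运行队列。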

3334  static void __sched notrace __schedule(bool preempt)
3335  {
3336  	struct task_struct *prev, *next;
3337  	unsigned long *switch_count;
3338  	struct pin_cookie cookie;
3339  	struct rq *rq;
3340  	int cpu;
3341  
/*
  * linux-4.10/include/linux/smp.h
  *  # define smp_processor_id() raw_smp_processor_id()
  *   
  *   linux-4.10/arch/arm64/include/asm/smp.h
  *   #define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number))  //虽然写法比较奇怪，认为就是当前运行的cpu id就可以了,如果是下面的写就很直观
  *   linux-4.10/arch/arm/include/asm/smp.h
  *   #define raw_smp_processor_id() (current_thread_info()->cpu)
*/
3342  	cpu = smp_processor_id();
3343  	rq = cpu_rq(cpu);  //获取运行队列
3344  	prev = rq->curr;   //当前运行的task_struct
3345  
3346  	schedule_debug(prev);
3347  
3348  	if (sched_feat(HRTICK))  //是否开启了HRTICK
3349  		hrtick_clear(rq); //
3350  
3351  	local_irq_disable();  //关闭中断
3352  	rcu_note_context_switch(); //RCU(Read-Copy Update),对于被RCU保护的共享数据结构，读者不需要获得任何锁就可以访问它
3353  
3354  	/*
3355  	 * Make sure that signal_pending_state()->signal_pending() below
3356  	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3357  	 * done by the caller to avoid the race with signal_wake_up().
3358  	 */
3359  	smp_mb__before_spinlock();  //最终会调用到barrier()函数，确保后面的代码不会跑在这个函数之前
3360  	raw_spin_lock(&rq->lock);  //获取自旋锁
3361  	cookie = lockdep_pin_lock(&rq->lock);
3362  
3363  	rq->clock_skip_update <<= 1; /* promote REQ to ACT */ //rq->clock_skip_update = rq->clock_skip_update <<1;
3364  
3365  	switch_count = &prev->nivcsw;
3366  	if (!preempt && prev->state) {
3367  		if (unlikely(signal_pending_state(prev->state, prev))) {
3368  			prev->state = TASK_RUNNING;
3369  		} else {
3370  			deactivate_task(rq, prev, DEQUEUE_SLEEP);
3371  			prev->on_rq = 0;
3372  
3373  			/*
3374  			 * If a worker went to sleep, notify and ask workqueue
3375  			 * whether it wants to wake up a task to maintain
3376  			 * concurrency.
3377  			 */
3378  			if (prev->flags & PF_WQ_WORKER) {
3379  				struct task_struct *to_wakeup;
3380  
3381  				to_wakeup = wq_worker_sleeping(prev);
3382  				if (to_wakeup)
3383  					try_to_wake_up_local(to_wakeup, cookie);
3384  			}
3385  		}
3386  		switch_count = &prev->nvcsw;
3387  	}
3388  
3389  	if (task_on_rq_queued(prev))
3390  		update_rq_clock(rq);
3391  
3392  	next = pick_next_task(rq, prev, cookie);  //找到需要调度的对象(task_struct)
3393  	clear_tsk_need_resched(prev);
3394  	clear_preempt_need_resched();
3395  	rq->clock_skip_update = 0;
3396  
3397  	if (likely(prev != next)) {
3398  		rq->nr_switches++; //统计进程上下文切换次数
3399  		rq->curr = next;  //将运行队列中的当前运行进程设置为调度器选择的进程
3400  		++*switch_count;
3401  
3402  		trace_sched_switch(preempt, prev, next);
3403  		rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */  //切换上下文
3404  	} else {
3405  		lockdep_unpin_lock(&rq->lock, cookie);
3406  		raw_spin_unlock_irq(&rq->lock);
3407  	}
3408  
3409  	balance_callback(rq);  
3410  }

pick_next_task() 函数就是从调度器中获取下一个需要运行的进程（线程）。

3259  static inline struct task_struct *
3260  pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
3261  {
3262  	const struct sched_class *class = &fair_sched_class;  //完全公平调度策略
3263  	struct task_struct *p;
3264  
3265  	/*
3266  	 * Optimization: we know that if all tasks are in
3267  	 * the fair class we can call that function directly:
3268  	 */
3269  	if (likely(prev->sched_class == class &&
3270  		   rq->nr_running == rq->cfs.h_nr_running)) {
3271  		p = fair_sched_class.pick_next_task(rq, prev, cookie);  //使用公平调度器选择下一个要调度的实体
3272  		if (unlikely(p == RETRY_TASK))
3273  			goto again;
3274  
3275  		/* assumes fair_sched_class->next == idle_sched_class */
3276  		if (unlikely(!p))  //如果没有没有需要调度的进程
3277  			p = idle_sched_class.pick_next_task(rq, prev, cookie); //使用idle 调度器调度
3278  
3279  		return p;
3280  	}
3281  
3282  again:
3283  	for_each_class(class) { //stop_sched_class ->dl_sched_class ->rt_sched_class ->fair_sched_class ->idle_sched_class
3284  		p = class->pick_next_task(rq, prev, cookie); 
3285  		if (p) {
3286  			if (unlikely(p == RETRY_TASK))
3287  				goto again;
3288  			return p;
3289  		}
3290  	}
3291  
3292  	BUG(); /* the idle class will always have a runnable task */
3293  }

linux-4.10/kernel/sched/sched.h

1303  #define for_each_class(class) \
1304     for (class = sched_class_highest; class; class = class->next)
1302  #define sched_class_highest (&stop_sched_class)

首先判断当前的进程是不是使用 fair_sched_class 调度类，并且运行队列上所有可运行的进程是否都属于 CFS（rq->nr_running == rq->cfs.h_nr_running）。如果符合，就直接调用 fair_sched_class 选择下一个要运行的进程。因为普通进程才需要比较频繁的调度，所以用 likely 来修饰 if 中的条件，也就是优先按普通进程的方式来选择下一个进程。如果不符合，则按优先级从高到低依次尝试各个调度类，顺序是stop_sched_class ->dl_sched_class ->rt_sched_class ->fair_sched_class ->idle_sched_class。

linux-4.10/kernel/sched/core.c

2859  /*
2860   * context_switch - switch to the new MM and the new thread's register state.
2861   */
2862  static __always_inline struct rq *
2863  context_switch(struct rq *rq, struct task_struct *prev,
2864  	       struct task_struct *next, struct pin_cookie cookie)
2865  {
2866  	struct mm_struct *mm, *oldmm;
2867  
2868  	prepare_task_switch(rq, prev, next); //特定于体系结构的代码, 为切换做事先准备.
2869  
2870  	mm = next->mm;  
2871  	oldmm = prev->active_mm;
2872  	/*
2873  	 * For paravirt, this is coupled with an exit in switch_to to
2874  	 * combine the page table reload and the switch backend into
2875  	 * one hypercall.
2876  	 */
2877  	arch_start_context_switch(prev);  //体系结构体相关的切换，目前此函数什么都没做 #define arch_start_context_switch(prev)	do {} while (0)
2878  
2879  	if (!mm) {   //如果next是内核线程，则使用prev所使用的地址空间
2880  		next->active_mm = oldmm;
2881  		atomic_inc(&oldmm->mm_count);
2882  		enter_lazy_tlb(oldmm, next);
2883  	} else
2884  		switch_mm_irqs_off(oldmm, mm, next); //切换mm
2885  
2886  	if (!prev->mm) { 
2887  		prev->active_mm = NULL;
2888  		rq->prev_mm = oldmm;
2889  	}
2890  	/*
2891  	 * Since the runqueue lock will be released by the next
2892  	 * task (which is an invalid locking op but in the case
2893  	 * of the scheduler it's an obvious special-case), so we
2894  	 * do an early lockdep release here:
2895  	 */
2896  	lockdep_unpin_lock(&rq->lock, cookie);
2897  	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2898  
2899  	/* Here we just switch the register state and the stack. */
2900  	switch_to(prev, next, prev);  
2901  	barrier();   //屏障，确保前后读写操作不会错乱
2902  
2903  	return finish_task_switch(prev);
2904  }

当从调度器选择好下一个进程，设置好mm内存地址空间后，就开始调用switch_to()切换进程。

linux-4.10/include/asm-generic/switch_to.h

25  #define switch_to(prev, next, last)					\
26  	do {								\
27  		((last) = __switch_to((prev), (next)));			\
28  	} while (0)

linux-4.10/arch/arm64/kernel/process.c

339  /*
340   * Thread switching.
341   */
342  struct task_struct *__switch_to(struct task_struct *prev,
343  				struct task_struct *next)
344  {
345  	struct task_struct *last;
346  
347  	fpsimd_thread_switch(next);  //把当前进程(prev)的FPSIMD状态保存到prev自己的thread.fpsimd_state中，而不是保存到next
348  	tls_thread_switch(next);    //
349  	hw_breakpoint_thread_switch(next);
350  	contextidr_thread_switch(next);
351  	entry_task_switch(next);
352  	uao_thread_switch(next);
353  
354  	/*
355  	 * Complete any pending TLB or cache maintenance on this CPU in case
356  	 * the thread migrates to a different CPU.
357  	 */
358  	dsb(ish);
359  
360  	/* the actual thread switch */
361  	last = cpu_switch_to(prev, next); //cpu的上下文切换
362  
363  	return last;
364  }

做好一些状态的保存和设定之后，调用cpu_switch_to()进行cpu状态的切换。

linux-4.10/arch/arm64/kernel/entry.S

708  /*
709   * Register switch for AArch64. The callee-saved registers need to be saved
710   * and restored. On entry:
711   *   x0 = previous task_struct (must be preserved across the switch)
712   *   x1 = next task_struct
713   * Previous and next are guaranteed not to be the same.
714   *
715   */
716  ENTRY(cpu_switch_to)
717  	mov	x10, #THREAD_CPU_CONTEXT   //DEFINE(THREAD_CPU_CONTEXT,	offsetof(struct task_struct, thread.cpu_context));
718  	add	x8, x0, x10  // x0 = prev   x8= prev->thread.cpu_context
719  	mov	x9, sp   //x9 = sp
720  	stp	x19, x20, [x8], #16    //prev->thread.cpu_context.x19 =x19, prev->thread.cpu_context.x20 =x20		
721  	stp	x21, x22, [x8], #16    //prev->thread.cpu_context.x21 =x21, prev->thread.cpu_context.x22 =x22
722  	stp	x23, x24, [x8], #16    //prev->thread.cpu_context.x23 =x23, prev->thread.cpu_context.x24 =x24
723  	stp	x25, x26, [x8], #16    //prev->thread.cpu_context.x25 =x25, prev->thread.cpu_context.x26 =x26
724  	stp	x27, x28, [x8], #16    //prev->thread.cpu_context.x27 =x27, prev->thread.cpu_context.x28 =x28
725  	stp	x29, x9, [x8], #16     //prev->thread.cpu_context.fp =x29, prev->thread.cpu_context.sp =x9
726  	str	lr, [x8]
727  	add	x8, x1, x10  // x1= next
728  	ldp	x19, x20, [x8], #16    //x19 =next->thread.cpu_context.x19, x20 =next->thread.cpu_context.x20			
729  	ldp	x21, x22, [x8], #16    //x21 =next->thread.cpu_context.x21,  x22 =next->thread.cpu_context.x22
730  	ldp	x23, x24, [x8], #16    //x23 =next->thread.cpu_context.x23,  x24 =next->thread.cpu_context.x24
731  	ldp	x25, x26, [x8], #16    //x25 =next->thread.cpu_context.x25,  x26 =next->thread.cpu_context.x26
732  	ldp	x27, x28, [x8], #16    //x27 =next->thread.cpu_context.x27,  x28 =next->thread.cpu_context.x28
733  	ldp	x29, x9, [x8], #16     //x29 =next->thread.cpu_context.fp,  x9 =next->thread.cpu_context.sp
734  	ldr	lr, [x8]
735  	mov	sp, x9
736  	msr	sp_el0, x1  //sp_el0 = x1 (next)
737  	ret
738  ENDPROC(cpu_switch_to)

cpu context 的切换可以简单理解成：把 prev 的 callee-saved 寄存器（x19~x28、fp、sp 以及 lr）中的状态保存到内存中，保存在 prev->thread.cpu_context 中，然后把 next 之前保存的状态（next->thread.cpu_context）恢复到对应的寄存器。

linux-4.10/include/linux/sched.h

1511  struct task_struct {
...
2001  /* CPU-specific state of this task */
2002  	struct thread_struct thread;
...
2009  };

linux-4.10/arch/arm64/include/asm/processor.h

79  struct thread_struct {
80  	struct cpu_context	cpu_context;	/* cpu context */
81  	unsigned long		tp_value;	/* TLS register */
82  #ifdef CONFIG_COMPAT
83  	unsigned long		tp2_value;
84  #endif
85  	struct fpsimd_state	fpsimd_state;
86  	unsigned long		fault_address;	/* fault info */
87  	unsigned long		fault_code;	/* ESR_EL1 value */
88  	struct debug_info	debug;		/* debugging */
89  };

linux-4.10/arch/arm64/include/asm/processor.h

63  struct cpu_context {
64  	unsigned long x19;
65  	unsigned long x20;
66  	unsigned long x21;
67  	unsigned long x22;
68  	unsigned long x23;
69  	unsigned long x24;
70  	unsigned long x25;
71  	unsigned long x26;
72  	unsigned long x27;
73  	unsigned long x28;
74  	unsigned long fp;
75  	unsigned long sp;
76  	unsigned long pc;
77  };

上面这些数据结构体的关系如下图

完成切换后，会调用finish_task_switch()函数做一些收尾工作。
linux-4.10/kernel/sched/core.c

2731  static struct rq *finish_task_switch(struct task_struct *prev)
2732  	__releases(rq->lock)
2733  {
2734  	struct rq *rq = this_rq();
2735  	struct mm_struct *mm = rq->prev_mm;
2736  	long prev_state;
2737  
2738  	/*
2739  	 * The previous task will have left us with a preempt_count of 2
2740  	 * because it left us after:
2741  	 *
2742  	 *	schedule()
2743  	 *	  preempt_disable();			// 1
2744  	 *	  __schedule()
2745  	 *	    raw_spin_lock_irq(&rq->lock)	// 2
2746  	 *
2747  	 * Also, see FORK_PREEMPT_COUNT.
2748  	 */
2749  	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2750  		      "corrupted preempt_count: %s/%d/0x%x\n",
2751  		      current->comm, current->pid, preempt_count()))
2752  		preempt_count_set(FORK_PREEMPT_COUNT);
2753  
2754  	rq->prev_mm = NULL;
2755  
2756  	/*
2757  	 * A task struct has one reference for the use as "current".
2758  	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2759  	 * schedule one last time. The schedule call will never return, and
2760  	 * the scheduled task must drop that reference.
2761  	 *
2762  	 * We must observe prev->state before clearing prev->on_cpu (in
2763  	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
2764  	 * running on another CPU and we could rave with its RUNNING -> DEAD
2765  	 * transition, resulting in a double drop.
2766  	 */
2767  	prev_state = prev->state;
2768  	vtime_task_switch(prev);
2769  	perf_event_task_sched_in(prev, current);
2770  	finish_lock_switch(rq, prev);
2771  	finish_arch_post_lock_switch();
2772  
2773  	fire_sched_in_preempt_notifiers(current);
2774  	if (mm)    //将mm释放，考虑到内核线程使用用户线程的mm情况
2775  		mmdrop(mm);
2776  	if (unlikely(prev_state == TASK_DEAD)) {  //如果上一个进程已经终止，释放其task_struct 结构
2777  		if (prev->sched_class->task_dead)
2778  			prev->sched_class->task_dead(prev);
2779  
2780  		/*
2781  		 * Remove function-return probe instances associated with this
2782  		 * task and put them back on the free list.
2783  		 */
2784  		kprobe_flush_task(prev);
2785  
2786  		/* Task is done with its stack. */
2787  		put_task_stack(prev);
2788  
2789  		put_task_struct(prev);
2790  	}
2791  
2792  	tick_nohz_task_switch();
2793  	return rq;
2794  }