深入解读Linux进程调度系列（1）——调度的初始化

最新推荐文章于 2024-11-06 17:54:43 发布

LoneHugo

最新推荐文章于 2024-11-06 17:54:43 发布

阅读量1.6k

点赞数

分类专栏： Linux进程调度　文章标签： Schedule 中断 Linux Timer 初始化

本文链接：https://blog.youkuaiyun.com/Vince_/article/details/89054862

版权

Linux进程调度专栏收录该内容

13 篇文章

订阅专栏

日期	内核版本	CPU架构	作者
2019.04.06	Linux-5.0	PowerPC	LoneHugo

系列文章：https://blog.youkuaiyun.com/Vince_/article/details/89054330

1. 调度初始化涉及的内容

读者可以看到我们一直在强调调度与Linux中各个模块关联，所以在讲解调度初始化的过程中，我们会将与调度相关的子系统的初始化一并讲到，以便我们对整个概念有宏观的认识。

调度是由时钟中断和定时器驱动的，因此初始化过程讲到了时钟中断初始化的内容，包括硬件中断和软中断后半部的初始化。

2. 调度初始化入口sched_init

源码：https://elixir.bootlin.com/linux/v5.0.7/source/kernel/sched/core.c#L5927

start_kernel函数调用sched_init进入调度的初始化。首先分配alloc_size大小的内存，初始化root_task_group，root_task_group为系统默认的task group，系统启动阶段每个进程都属于该task group。需要注意的是，root_task_group中的成员是针对per-CPU的。初始化完成之后将init_task标记为idle进程。具体看下面函数中的注释。

/*
 * sched_init - boot-time initialisation of the scheduler.
 *
 * In the order the code below performs it:
 *   1. allocate the per-CPU pointer arrays of root_task_group (only when
 *      fair and/or RT group scheduling is configured);
 *   2. set up the default RT and deadline bandwidth;
 *   3. initialise the run queue of every possible CPU;
 *   4. turn the current task into the idle task of the current CPU;
 *   5. initialise the fair scheduling class and set scheduler_running.
 *
 * Takes no arguments and returns nothing. The surrounding article states
 * that it is called from start_kernel.
 */
void __init sched_init(void)
{
    int i, j;
    unsigned long alloc_size = 0, ptr;
 
    /* Calculate the size to be allocated for root_task_group items.
     * Some items in struct task_group are per-cpu pointer arrays, so the
     * size is scaled by nr_cpu_ids here (two arrays per enabled option).
     */
#ifdef CONFIG_FAIR_GROUP_SCHED
    alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
    alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
    if (alloc_size) {
        /* Allocate one zeroed chunk and carve it up below.
         * NOTE(review): the kzalloc() result is not checked for NULL
         * before being used as the base of the arrays.
         */
        ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
#ifdef CONFIG_FAIR_GROUP_SCHED
        /* First slice: per-CPU sched_entity pointers. */
        root_task_group.se = (struct sched_entity **)ptr;
        ptr += nr_cpu_ids * sizeof(void **);
 
        /* Second slice: per-CPU cfs_rq pointers. */
        root_task_group.cfs_rq = (struct cfs_rq **)ptr;
        ptr += nr_cpu_ids * sizeof(void **);
 
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
        /* Next slice: per-CPU sched_rt_entity pointers. */
        root_task_group.rt_se = (struct sched_rt_entity **)ptr;
        ptr += nr_cpu_ids * sizeof(void **);
 
        /* Last slice: per-CPU rt_rq pointers. */
        root_task_group.rt_rq = (struct rt_rq **)ptr;
        ptr += nr_cpu_ids * sizeof(void **);
 
#endif /* CONFIG_RT_GROUP_SCHED */
    }
#ifdef CONFIG_CPUMASK_OFFSTACK
    /* Use dynamic allocation for cpumask_var_t instead of putting them on
     * the stack. This is a bit more expensive, but avoids stack overflow.
     * Allocate load_balance_mask for every possible cpu below.
     * NOTE(review): this uses GFP_KERNEL while the other allocations in
     * this function use GFP_NOWAIT, and the result is not checked - confirm
     * this is intended this early in boot.
     */
    for_each_possible_cpu(i) {
        per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
            cpumask_size(), GFP_KERNEL, cpu_to_node(i));
    }
#endif /* CONFIG_CPUMASK_OFFSTACK */
 
    /* Init the real-time task group cpu time percentage.
     * The hrtimer of def_rt_bandwidth is initialized here.
     */
    init_rt_bandwidth(&def_rt_bandwidth,
            global_rt_period(), global_rt_runtime());
    /* Init the deadline task group cpu time percentage.
     * Note that it is seeded with the same global RT period/runtime values.
     */
    init_dl_bandwidth(&def_dl_bandwidth,
            global_rt_period(), global_rt_runtime());
 
#ifdef CONFIG_SMP
    /* Initialise the default root domain. Author's note: a domain contains
     * one or more CPUs; load balancing is performed inside a domain, and
     * domains are isolated from one another.
     */
    init_defrootdomain();
#endif
 
#ifdef CONFIG_RT_GROUP_SCHED
    /* The root task group gets its own RT bandwidth, same global values. */
    init_rt_bandwidth(&root_task_group.rt_bandwidth,
            global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */
 
#ifdef CONFIG_CGROUP_SCHED
    /* Add the allocated and initialised root_task_group to the global
     * task_groups list, and start it with empty children/siblings lists.
     */
    list_add(&root_task_group.list, &task_groups);
    INIT_LIST_HEAD(&root_task_group.children);
    INIT_LIST_HEAD(&root_task_group.siblings);
    /* Initialise automatic task grouping (autogroup) for init_task. */
    autogroup_init(&init_task);
 
#endif /* CONFIG_CGROUP_SCHED */
 
    /* Walk the run queue of every possible cpu and initialise it. */
    for_each_possible_cpu(i) {
        struct rq *rq;
 
        rq = cpu_rq(i);
        raw_spin_lock_init(&rq->lock);
        /* Number of scheduling entities (sched_entity) on this run queue. */
        rq->nr_running = 0;
        /* CPU load bookkeeping. */
        rq->calc_load_active = 0;
        /* Time of the next load update. */
        rq->calc_load_update = jiffies + LOAD_FREQ;
        /* Initialise the cfs, rt and dl sub-queues of this run queue. */
        init_cfs_rq(&rq->cfs);
        init_rt_rq(&rq->rt);
        init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
        /* Total CPU shares of the root group (re-assigned on every
         * iteration of this loop with the same constant).
         */
        root_task_group.shares = ROOT_TASK_GROUP_LOAD;
        INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
        /*
         * How much cpu bandwidth does root_task_group get?
         *
         * In case of task-groups formed thr' the cgroup filesystem, it
         * gets 100% of the cpu resources in the system. This overall
         * system cpu resource is divided among the tasks of
         * root_task_group and its child task-groups in a fair manner,
         * based on each entity's (task or task-group's) weight
         * (se->load.weight).
         *
         * In other words, if root_task_group has 10 tasks (of weight
         * 1024 each) and two child groups A0 and A1 (of weight 1024
         * each), then A0's share of the cpu resource is:
         *
         *    A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
         *
         * We achieve this by letting root_task_group's tasks sit
         * directly in rq->cfs (i.e root_task_group->se[] = NULL).
         */
        /* Initialise cfs_bandwidth. Author's note: this is the CPU
         * resource available to normal tasks, and the call also sets up
         * the high-resolution timers belonging to it.
         */
        init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
        /* Author's notes on the call below:
         *  - the task_group of this CPU's cfs_rq becomes root_task_group;
         *  - the rq of that cfs_rq becomes this CPU's run queue;
         *  - rq->cfs serves as root_task_group's cfs_rq on this CPU;
         *  - the sched_entity (se) argument is NULL for now.
         */
        init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
 
        /* The per-CPU RT queue starts with the default RT runtime. */
        rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
        /* Analogous to init_tg_cfs_entry above: links the root group
         * and this CPU's rt_rq to each other.
         */
        init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
 
        /* Zero the CPU load values kept by this run queue. */
        for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
            rq->cpu_load[j] = 0;
 
        /* Time at which this run queue last updated its CPU load. */
        rq->last_load_update_tick = jiffies;
 
#ifdef CONFIG_SMP
        /* Initialise the load-balancing related fields. */
        rq->sd = NULL;
        rq->rd = NULL;
        rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
        rq->balance_callback = NULL;
        rq->active_balance = 0;
        rq->next_balance = jiffies;
        rq->push_cpu = 0;
        rq->cpu = i;
        rq->online = 0;
        rq->idle_stamp = 0;
        rq->avg_idle = 2*sysctl_sched_migration_cost;
        rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
        INIT_LIST_HEAD(&rq->cfs_tasks);
 
        /* Attach this CPU's run queue to the default root domain. */
        rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
        /* Dynamic-tick (NO_HZ) flags; cleared, i.e. not in use initially. */
        rq->nohz_flags = 0;
#endif
#ifdef CONFIG_NO_HZ_FULL
        /* Used by dynamic tick to record when the last scheduler tick
         * happened; starts at 0.
         */
        rq->last_sched_tick = 0;
#endif
#endif
        /* Initialise the run queue's high-resolution tick timer. Author's
         * note: it does not take effect yet at this point.
         */
        init_rq_hrtick(rq);
        atomic_set(&rq->nr_iowait, 0);
    }
 
    /* Set the load weight of the init task. */
    set_load_weight(&init_task);
 
#ifdef CONFIG_PREEMPT_NOTIFIERS
    /* Initialise init_task's preempt-notifier list head. */
    INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
 
    /*
     * The boot idle thread does lazy MMU switching as well:
     */
    atomic_inc(&init_mm.mm_count);
    enter_lazy_tlb(&init_mm, current);
 
    /*
     * During early bootup we pretend to be a normal task:
     */
    /* Make the current (init) task use the fair scheduling class. */
    current->sched_class = &fair_sched_class;
 
    /*
     * Make us the idle thread. Technically, schedule() should not be
     * called from this thread, however somewhere below it might be,
     * but because we are the idle thread, we just pick up running again
     * when this runqueue becomes "idle".
     */
    /* Turn the current task into the idle task of this CPU. Author's
     * note: its state is re-initialised and its scheduling class is
     * switched to the idle class inside init_idle.
     */
    init_idle(current, smp_processor_id());
 
    /* Global (not per-rq) time of the next load calculation. */
    calc_load_update = jiffies + LOAD_FREQ;
 
#ifdef CONFIG_SMP
    zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
    /* May be allocated at isolcpus cmdline parse time */
    if (cpu_isolated_map == NULL)
        zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
    idle_thread_set_boot_cpu();
    set_cpu_rq_start_time();
#endif
    /* Initialise the fair scheduling class. Author's note: in practice
     * this registers run_rebalance_domains as the handler of the
     * SCHED_SOFTIRQ soft interrupt, which carries out load balancing.
     * Open question raised by the author: when is SCHED_SOFTIRQ raised?
     */
    init_sched_fair_class();
 
    /* Mark the scheduler as running. Author's note: at this point
     * init_task is the only task in the system and it is the idle task;
     * the timer has not been started yet, so no other task can be
     * scheduled and execution returns to start_kernel to continue the
     * initialisation.
     */
    scheduler_running = 1;
}

3. 时钟中断相关的初始化

在sched_init初始化之后，继续回到start_kernel执行，跟调度相关的内容是：

3.1 init_IRQ

该函数中会初始化IRQ的栈空间，包括系统中所有的软件中断和硬件中断。时钟中断是调度的驱动因素，包括硬件中断和软中断下半部，在这里也进行了初始化。中断相关的内容后面章节会有详细的介绍，此处需要了解整个初始化流程，知道这个点做了什么。

3.2 init_timers

此处会初始化timer，注册TIMER_SOFTIRQ软中断回调函数run_timer_softirq，关于softirq的内容我会在最后进行介绍。既然在这里注册了softirq，那么在哪里开始激活或启动该softirq呢？该softirq的作用是什么？

在时钟中断的注册章节我们会看到，tick_handle_periodic为时钟中断的事件回调函数，在time_init中被赋值到时钟中断的回调函数钩子处，发生时钟中断时会被调用做中断处理。该函数最终调用tick_periodic，继续调用update_process_times，进而再调用run_local_timers函数来触发TIMER_SOFTIRQ，同时run_local_timers也调用接口hrtimer_run_queues运行高精度定时器。这是中断处理的典型方式，即硬件中断处理关键部分，启动softirq后打开硬件中断响应，更多的事务在软中断下半部中处理。关于该软中断的具体作用后面会详细介绍，这里需要了解的是它会激活所有过期的定时器。

3.3 time_init

执行时钟相关的初始化，后面会看到，我们在系统初始化初期的汇编阶段会注册硬件中断向量表，但是中断设备和事件处理函数并未初始化，这里调用init_decrementer_clockevent初始化时钟中断设备，并初始化事件回调tick_handle_periodic；同时调用tick_setup_hrtimer_broadcast注册高精度定时器设备及其回调，在中断发生时实际会被执行。此时硬件中断被激活。