【内核调度、负载均衡】【update_sg_lb_stats】

本文深入探讨了Linux内核调度器中的负载均衡机制,详细解释了source_load和target_load函数如何评估迁移源和目标CPU的负载,以及update_sg_lb_stats函数如何更新调度组的统计信息来实现负载均衡。同时,分析了cpu_util函数如何计算CPU利用率,并介绍了group_is_overloaded函数判断调度组是否过载的标准。

load

target_load

target_load是迁移目标cpu(即目的cpu)的load估计值

/* Runnable load of a cfs_rq as tracked by PELT (weight-scaled). */
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
	return cfs_rq->runnable_load_avg;
}

/*
 * Used instead of source_load()/target_load() when we know type == 0:
 * the unbiased runnable PELT load of the CPU's root cfs_rq.
 */
static unsigned long weighted_cpuload(struct rq *rq)
{
	return cfs_rq_runnable_load_avg(&rq->cfs);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * Over-estimating the destination's load makes the balancer reluctant
 * to pile additional tasks onto this CPU ("my load is already high,
 * prefer not to migrate more work here").
 */
static unsigned long target_load(int cpu, int type)
{
	struct rq *dst_rq = cpu_rq(cpu);
	unsigned long pelt_load = weighted_cpuload(dst_rq);
	unsigned long hist_load;

	if (type == 0 || !sched_feat(LB_BIAS))
		return pelt_load;

	/* Bias high: pick the larger of the decayed history and PELT load. */
	hist_load = dst_rq->cpu_load[type - 1];
	return max(hist_load, pelt_load);
}

source_load

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively: a source that looks lightly loaded is less
 * likely to have tasks pulled away from it.
 */
static unsigned long source_load(int cpu, int type)
{
	struct rq *src_rq = cpu_rq(cpu);
	unsigned long pelt_load = weighted_cpuload(src_rq);

	if (type == 0 || !sched_feat(LB_BIAS))
		return pelt_load;

	/* Bias low: pick the smaller of the decayed history and PELT load. */
	return min(src_rq->cpu_load[type - 1], pelt_load);
}

util

 这里在未启用WALT时,基于PELT来计算util

/**
 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
 * @cpu: the CPU to get the utilization of
 *
 * The unit of the return value must be the one of capacity so we can compare
 * the utilization with the capacity of the CPU that is available for CFS task
 * (ie cpu_capacity).
 *
 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
 * recent utilization of currently non-runnable tasks on a CPU. It represents
 * the amount of utilization of a CPU in the range [0..capacity_orig] where
 * capacity_orig is the cpu_capacity available at the highest frequency,
 * i.e. arch_scale_cpu_capacity().
 * The utilization of a CPU converges towards a sum equal to or less than the
 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
 * the running time on this CPU scaled by capacity_curr.
 *
 * The estimated utilization of a CPU is defined to be the maximum between its
 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
 * currently RUNNABLE on that CPU.
 * This allows to properly represent the expected utilization of a CPU which
 * has just got a big task running since a long sleep period. At the same time
 * however it preserves the benefits of the "blocked utilization" in
 * describing the potential for other tasks waking up on the same CPU.
 *
 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
 * higher than capacity_orig because of unfortunate rounding in
 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
 * the average stabilizes with the new running time. We need to check that the
 * utilization stays within the range of [0..capacity_orig] and cap it if
 * necessary. Without utilization capping, a group could be seen as overloaded
 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
 * available capacity. We allow utilization to overshoot capacity_curr (but not
 * capacity_orig) as it useful for predicting the capacity required after task
 * migrations (scheduler-driven DVFS).
 *
 * Return: the (estimated) utilization for the specified CPU
 */
static inline unsigned long cpu_util(int cpu)
{
	struct cfs_rq *cfs_rq;
	unsigned int util;

#ifdef CONFIG_SCHED_WALT
	/*
	 * WALT path: scale the cumulative runnable average to capacity
	 * units over the WALT window, still capped at capacity_orig.
	 */
	if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) {
		u64 walt_cpu_util = cpu_rq(cpu)->cumulative_runnable_avg;

		walt_cpu_util <<= SCHED_CAPACITY_SHIFT;
		do_div(walt_cpu_util, walt_ravg_window);

		return min_t(unsigned long, walt_cpu_util,
			     capacity_orig_of(cpu));
	}
#endif

	cfs_rq = &cpu_rq(cpu)->cfs;
	util = READ_ONCE(cfs_rq->avg.util_avg);

	if (sched_feat(UTIL_EST))
		/*
		 * Freshly enqueued tasks can make util_est.enqueued exceed
		 * util_avg (e.g. a big task waking from a long sleep); take
		 * the larger of the two as the estimate.
		 */
		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
	/* Clamp the result to [0..capacity_orig] as explained above. */
	return min_t(unsigned long, util, capacity_orig_of(cpu));
}

 

update_sg_lb_stats

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @sgs: variable to hold the statistics for this group.
 * @overload: Indicate pullable load (e.g. >1 runnable task).
 * @overutilized: Indicate overutilization for any CPU.
 * @misfit_task: Indicate misfit_task for any CPU
 */
static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
			int local_group, struct sg_lb_stats *sgs,
			bool *overload, bool *overutilized, bool *misfit_task)
{
	unsigned long load;
	int i, nr_running;

	memset(sgs, 0, sizeof(*sgs));

	/* Walk every CPU of this sched_group that is allowed by env->cpus */
	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
		struct rq *rq = cpu_rq(i);

		/* Bias balancing toward cpus of our domain */
		if (local_group)
			/*
			 * Local group (contains this_cpu, the migration
			 * destination): take the HIGH guess,
			 * max(rq->cpu_load[load_idx-1], weighted_cpuload(rq)),
			 * so we do not over-eagerly pull work toward us.
			 */
			load = target_load(i, load_idx);
		else
			/*
			 * Remote group (a potential migration source): take
			 * the LOW guess,
			 * min(rq->cpu_load[load_idx-1], weighted_cpuload(rq)),
			 * to under-estimate sources and balance conservatively.
			 */
			load = source_load(i, load_idx);

		/*
		 * Accumulate per-cpu contributions:
		 *   group_load        - biased runnable load (see above)
		 *   group_util        - cpu_util(), capped at capacity_orig
		 *   sum_nr_running    - total CFS tasks in the group
		 *   sum_weighted_load - unbiased cfs runnable_load_avg
		 *   idle_cpus         - number of idle CPUs in the group
		 */
		sgs->group_load += load;
		sgs->group_util += cpu_util(i);
		sgs->sum_nr_running += rq->cfs.h_nr_running;

		nr_running = rq->nr_running;
		/*
		 * More than one task on the rq means at least one of them is
		 * merely runnable (waiting), i.e. there is load to pull.
		 */
		if (nr_running > 1)
			*overload = true;

#ifdef CONFIG_NUMA_BALANCING
		sgs->nr_numa_running += rq->nr_numa_running;
		sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
		sgs->sum_weighted_load += weighted_cpuload(rq);
		/*
		 * No need to call idle_cpu() if nr_running is not 0
		 */
		if (!nr_running && idle_cpu(i))
			sgs->idle_cpus++;

		/* On asymmetric-capacity systems, track the largest misfit task */
		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
		    sgs->group_misfit_task_load < rq->misfit_task_load) {
			sgs->group_misfit_task_load = rq->misfit_task_load;
			*overload = true;	/* was "= 1"; keep bool assignments consistent */
		}

		/* CPU capacity cannot hold its running load: overutilized */
		if (cpu_overutilized(i)) {
			*overutilized = true;

			if (rq->misfit_task_load)
				*misfit_task = true;
		}
	}

	/*
	 * Derive the group-level summary:
	 *   group_capacity    - total capacity of the group's CPUs
	 *   avg_load          - group_load scaled by capacity; the smaller
	 *                       the capacity, the larger avg_load
	 *   load_per_task     - mean weighted load per runnable task
	 *   group_weight      - number of CPUs in the group
	 *   group_no_capacity - capacity no longer covers the utilization
	 *   group_type        - severity: group_overloaded >
	 *                       group_imbalanced > group_other
	 *                       (group_imbalanced: a lower-level balance
	 *                       could not complete due to cpu affinity)
	 */

	/* Adjust by relative CPU capacity of the group */
	sgs->group_capacity = group->sgc->capacity;
	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

	if (sgs->sum_nr_running)
		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	sgs->group_weight = group->group_weight;

	sgs->group_no_capacity = group_is_overloaded(env, sgs);
	sgs->group_type = group_classify(group, sgs);
}

负载值

计算方法

说明

sgs->group_load

+= load

Target_cpu取大值,source_cpu取小值

 

sgs->group_util

+=

min( max(READ_ONCE(cfs_rq->avg.util_avg),READ_ONCE(cfs_rq->avg.util_est.enqueued)),  capacity_orig_of(cpu))

 

sgs->sum_nr_running

+= rq->cfs.h_nr_running

 

sgs->sum_weighted_load

+= cfs_rq->runnable_load_avg

 

 

sgs->group_misfit_task_load

实际上他只是一个值

 

sgs->idle_cpus

++

 

sgs->group_capacity

group->sgc->capacity

 

sgs->avg_load

= (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity

 

 

sgs->load_per_task

= sgs->sum_weighted_load / sgs->sum_nr_running;

 

sgs->group_weight

= group->group_weight

 

sgs->group_no_capacity

group_is_overloaded(env, sgs)

 

sgs->group_type

= group_classify(group, sgs)

严重级别 group_overloaded > group_imbalanced > group_other

 

 

 

sgs->group_no_capacity实际上只由一个返回true的条件决定:即group的util(按imbalance_pct放大后)比group本身的capacity还要大。

/*
 *  group_is_overloaded returns true if the group has more tasks than it can
 *  handle.
 *  group_is_overloaded is not equal to !group_has_capacity: a group with
 *  exactly the right number of tasks has no spare capacity left but is not
 *  overloaded either, so both group_has_capacity and group_is_overloaded
 *  return false for it.
 */
static inline bool
group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
{
	/* Fewer (or as many) tasks than CPUs can never be overload. */
	if (sgs->sum_nr_running <= sgs->group_weight)
		return false;

	/* Overloaded when utilization, scaled by imbalance_pct, exceeds capacity. */
	return (sgs->group_capacity * 100) <
	       (sgs->group_util * env->sd->imbalance_pct);
}

 

### 文件总结与注释 #### 文件概述 该文件是Linux内核版本4.19中关于完全公平调度器(Completely Fair Scheduler, CFS)的部分实现代码。CFS主要用于处理常规进程(即SCHED_NORMAL和SCHED_BATCH类型的任务)。该文件包含了CFS的核心逻辑以及相关配置参数的定义,还包括了多处理器系统上的负载均衡、NUMA平衡等高级特性。此外,还涉及到了一些调试和统计功能的实现。 #### 主要知识点总结 1. **核心调度算法**:CFS通过虚拟运行时间(`vruntime`)来决定哪个任务应该优先执行,以保证每个任务都能获得公平的CPU时间。 2. **负载均衡**:在多处理器环境中,CFS会尝试将负载从繁忙的CPU迁移到空闲的CPU上,以保持系统的整体性能。负载均衡涉及到周期性地检查各个CPU的工作量并进行调整。 3. **NUMA支持**:为了优化内存访问延迟,在NUMA架构下,CFS能够根据任务的历史内存访问模式动态调整任务的放置位置,尽量使任务靠近其使用的物理节点。 4. **可调参数**:内核提供了多个可调节参数如`sched_latency`, `sched_min_granularity`等,允许管理员根据不同工作负载需求微调调度行为。 5. **任务组调度**:如果启用了任务组调度(CONFIG_FAIR_GROUP_SCHED),则可以为不同的控制组分配不同比例的CPU资源。 6. **带宽限制**:对于某些场景下的CFS队列,可以通过设置最大运行时间和最小间隔时间来进行流量控制。 7. **调试工具**:提供了多种用于跟踪和记录调度事件及状态变化的日志接口,方便开发人员分析系统调度情况。 --- ### 关键部分中文注释 以下是针对文件中的几个关键函数添加的中文注释: #### 1. `task_tick_fair` ```c /* * 当一个属于我们调度类的任务被计时器命中时调用此方法。 * 注意:这个函数可能会由远程CPU触发(例如,当启用全动态滴答时), * 因此不能做任何本地假设,所有东西都必须通过传递给我们的@rq 和 @curr 参数来访问。 */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; // 遍历当前任务的所有层级实体,更新它们的时间片信息 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } // 如果启用了NUMA调度,则在此处更新相关的统计数据 if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); } ``` #### 2. 
`find_busiest_group` ```c /* * 找到本调度域中最忙碌的组,并计算需要迁移多少负载才能达到平衡。 * * @param env 负载均衡环境描述符 */ static struct sched_group *find_busiest_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; // 初始化调度域的状态统计结构体 init_sd_lb_stats(&sds); /* * 计算调度域内的各种统计信息,包括但不限于平均负载、总容量等。 * 这些数据有助于后续确定是否存在不平衡现象。 */ update_sd_lb_stats(env, &sds); // 获取指向最忙组及其对应状态统计信息的指针 local = &sds.local_stat; busiest = &sds.busiest_stat; // 检查是否满足不对称打包条件,若满足直接返回最忙组指针 if (check_asym_packing(env, &sds)) return sds.busiest; // 若没有找到最忙的组或者最忙的组没有足够的负载,则认为已经平衡完毕 if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; // 根据配置标志位判断是否强制进行负载转移 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; // 对于新进入空闲态的情况特殊处理,即使得平均负载较高的情况下也允许迁移 if (env->idle == CPU_IDLE) { if (local->idle_cpus <= (busiest->idle_cpus + 1)) goto out_balanced; } else { // 新唤醒或非空闲状态下使用imbalance_pct作为阈值 if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } force_balance: // 强制计算所需迁移的负载量 calculate_imbalance(env, &sds); // 返回最忙的组引用 return env->imbalance ? sds.busiest : NULL; out_balanced: env->imbalance = 0; return NULL; } ``` #### 3. `prio_changed_fair` ```c /* * 处理任务优先级变更后的行为。 * 具体来说就是判断当前正在运行的任务是否需要重新安排, * 或者其他未运行但处于就绪队列中的任务是否应抢占当前任务的位置。 */ static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { if (!task_on_rq_queued(p)) return; // 如果当前任务是p且它的优先级升高了,则立即重排;否则仅检查是否有更高优先级的任务等待执行 if (rq->curr == p) { if (p->prio > oldprio) resched_curr(rq); } else check_preempt_curr(rq, p, 0); } ``` 这些只是文件的一部分内容示例,完整理解还需要结合上下文仔细阅读整个源码文件。希望以上总结能帮助您更好地理解和学习这段代码!如果有更具体的问题或者其他想要了解的内容,请随时告知我。
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值