【内核调度、负载均衡】【update_sg_lb_stats】

本文深入探讨了Linux内核调度器中的负载均衡机制,详细解释了source_load和target_load函数如何评估迁移源和目标CPU的负载,以及update_sg_lb_stats函数如何更新调度组的统计信息来实现负载均衡。同时,分析了cpu_util函数如何计算CPU利用率,并介绍了group_is_overloaded函数判断调度组是否过载的标准。

load

target_load

target_load是迁移目标cpu(即目的cpu)的load估计值

/* Runnable load of a cfs_rq as tracked by PELT (weight-scaled). */
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
	return cfs_rq->runnable_load_avg;
}

/*
 * Used instead of source_load()/target_load() when we know type == 0:
 * the unbiased runnable PELT load of the CPU's root cfs_rq.
 */
static unsigned long weighted_cpuload(struct rq *rq)
{
	return cfs_rq_runnable_load_avg(&rq->cfs);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * Over-estimating the destination's load makes the balancer reluctant
 * to pile additional tasks onto this CPU ("my load is already high,
 * prefer not to migrate more work here").
 */
static unsigned long target_load(int cpu, int type)
{
	struct rq *dst_rq = cpu_rq(cpu);
	unsigned long pelt_load = weighted_cpuload(dst_rq);
	unsigned long hist_load;

	if (type == 0 || !sched_feat(LB_BIAS))
		return pelt_load;

	/* Bias high: pick the larger of the decayed history and PELT load. */
	hist_load = dst_rq->cpu_load[type - 1];
	return max(hist_load, pelt_load);
}

source_load

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively: a source that looks lightly loaded is less
 * likely to have tasks pulled away from it.
 */
static unsigned long source_load(int cpu, int type)
{
	struct rq *src_rq = cpu_rq(cpu);
	unsigned long pelt_load = weighted_cpuload(src_rq);

	if (type == 0 || !sched_feat(LB_BIAS))
		return pelt_load;

	/* Bias low: pick the smaller of the decayed history and PELT load. */
	return min(src_rq->cpu_load[type - 1], pelt_load);
}

util

 这里在未启用WALT时,基于PELT来计算util

/**
 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
 * @cpu: the CPU to get the utilization of
 *
 * The unit of the return value must be the one of capacity so we can compare
 * the utilization with the capacity of the CPU that is available for CFS task
 * (ie cpu_capacity).
 *
 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
 * recent utilization of currently non-runnable tasks on a CPU. It represents
 * the amount of utilization of a CPU in the range [0..capacity_orig] where
 * capacity_orig is the cpu_capacity available at the highest frequency,
 * i.e. arch_scale_cpu_capacity().
 * The utilization of a CPU converges towards a sum equal to or less than the
 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
 * the running time on this CPU scaled by capacity_curr.
 *
 * The estimated utilization of a CPU is defined to be the maximum between its
 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
 * currently RUNNABLE on that CPU.
 * This allows to properly represent the expected utilization of a CPU which
 * has just got a big task running since a long sleep period. At the same time
 * however it preserves the benefits of the "blocked utilization" in
 * describing the potential for other tasks waking up on the same CPU.
 *
 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
 * higher than capacity_orig because of unfortunate rounding in
 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
 * the average stabilizes with the new running time. We need to check that the
 * utilization stays within the range of [0..capacity_orig] and cap it if
 * necessary. Without utilization capping, a group could be seen as overloaded
 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
 * available capacity. We allow utilization to overshoot capacity_curr (but not
 * capacity_orig) as it useful for predicting the capacity required after task
 * migrations (scheduler-driven DVFS).
 *
 * Return: the (estimated) utilization for the specified CPU
 */
static inline unsigned long cpu_util(int cpu)
{
	struct cfs_rq *cfs_rq;
	unsigned int util;

#ifdef CONFIG_SCHED_WALT
	/*
	 * WALT path: scale the cumulative runnable average to capacity
	 * units over the WALT window, still capped at capacity_orig.
	 */
	if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) {
		u64 walt_cpu_util = cpu_rq(cpu)->cumulative_runnable_avg;

		walt_cpu_util <<= SCHED_CAPACITY_SHIFT;
		do_div(walt_cpu_util, walt_ravg_window);

		return min_t(unsigned long, walt_cpu_util,
			     capacity_orig_of(cpu));
	}
#endif

	cfs_rq = &cpu_rq(cpu)->cfs;
	util = READ_ONCE(cfs_rq->avg.util_avg);

	if (sched_feat(UTIL_EST))
		/*
		 * Freshly enqueued tasks can make util_est.enqueued exceed
		 * util_avg (e.g. a big task waking from a long sleep); take
		 * the larger of the two as the estimate.
		 */
		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
	/* Clamp the result to [0..capacity_orig] as explained above. */
	return min_t(unsigned long, util, capacity_orig_of(cpu));
}

 

update_sg_lb_stats

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @sgs: variable to hold the statistics for this group.
 * @overload: Indicate pullable load (e.g. >1 runnable task).
 * @overutilized: Indicate overutilization for any CPU.
 * @misfit_task: Indicate misfit_task for any CPU
 */
static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
			int local_group, struct sg_lb_stats *sgs,
			bool *overload, bool *overutilized, bool *misfit_task)
{
	unsigned long load;
	int i, nr_running;

	memset(sgs, 0, sizeof(*sgs));

	/* Walk every CPU of this sched_group that is allowed by env->cpus */
	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
		struct rq *rq = cpu_rq(i);

		/* Bias balancing toward cpus of our domain */
		if (local_group)
			/*
			 * Local group (contains this_cpu, the migration
			 * destination): take the HIGH guess,
			 * max(rq->cpu_load[load_idx-1], weighted_cpuload(rq)),
			 * so we do not over-eagerly pull work toward us.
			 */
			load = target_load(i, load_idx);
		else
			/*
			 * Remote group (a potential migration source): take
			 * the LOW guess,
			 * min(rq->cpu_load[load_idx-1], weighted_cpuload(rq)),
			 * to under-estimate sources and balance conservatively.
			 */
			load = source_load(i, load_idx);

		/*
		 * Accumulate per-cpu contributions:
		 *   group_load        - biased runnable load (see above)
		 *   group_util        - cpu_util(), capped at capacity_orig
		 *   sum_nr_running    - total CFS tasks in the group
		 *   sum_weighted_load - unbiased cfs runnable_load_avg
		 *   idle_cpus         - number of idle CPUs in the group
		 */
		sgs->group_load += load;
		sgs->group_util += cpu_util(i);
		sgs->sum_nr_running += rq->cfs.h_nr_running;

		nr_running = rq->nr_running;
		/*
		 * More than one task on the rq means at least one of them is
		 * merely runnable (waiting), i.e. there is load to pull.
		 */
		if (nr_running > 1)
			*overload = true;

#ifdef CONFIG_NUMA_BALANCING
		sgs->nr_numa_running += rq->nr_numa_running;
		sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
		sgs->sum_weighted_load += weighted_cpuload(rq);
		/*
		 * No need to call idle_cpu() if nr_running is not 0
		 */
		if (!nr_running && idle_cpu(i))
			sgs->idle_cpus++;

		/* On asymmetric-capacity systems, track the largest misfit task */
		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
		    sgs->group_misfit_task_load < rq->misfit_task_load) {
			sgs->group_misfit_task_load = rq->misfit_task_load;
			*overload = true;	/* was "= 1"; keep bool assignments consistent */
		}

		/* CPU capacity cannot hold its running load: overutilized */
		if (cpu_overutilized(i)) {
			*overutilized = true;

			if (rq->misfit_task_load)
				*misfit_task = true;
		}
	}

	/*
	 * Derive the group-level summary:
	 *   group_capacity    - total capacity of the group's CPUs
	 *   avg_load          - group_load scaled by capacity; the smaller
	 *                       the capacity, the larger avg_load
	 *   load_per_task     - mean weighted load per runnable task
	 *   group_weight      - number of CPUs in the group
	 *   group_no_capacity - capacity no longer covers the utilization
	 *   group_type        - severity: group_overloaded >
	 *                       group_imbalanced > group_other
	 *                       (group_imbalanced: a lower-level balance
	 *                       could not complete due to cpu affinity)
	 */

	/* Adjust by relative CPU capacity of the group */
	sgs->group_capacity = group->sgc->capacity;
	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

	if (sgs->sum_nr_running)
		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	sgs->group_weight = group->group_weight;

	sgs->group_no_capacity = group_is_overloaded(env, sgs);
	sgs->group_type = group_classify(group, sgs);
}

负载值

计算方法

说明

sgs->group_load

+= load

Target_cpu取大值,source_cpu取小值

 

sgs->group_util

+=

min( max(READ_ONCE(cfs_rq->avg.util_avg),READ_ONCE(cfs_rq->avg.util_est.enqueued)),  capacity_orig_of(cpu))

 

sgs->sum_nr_running

+= rq->cfs.h_nr_running

 

sgs->sum_weighted_load

+= cfs_rq->runnable_load_avg

 

 

sgs->group_misfit_task_load

实际上他只是一个值

 

sgs->idle_cpus

++

 

sgs->group_capacity

group->sgc->capacity

 

sgs->avg_load

= (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity

 

 

sgs->load_per_task

= sgs->sum_weighted_load / sgs->sum_nr_running;

 

sgs->group_weight

= group->group_weight

 

sgs->group_no_capacity

group_is_overloaded(env, sgs)

 

sgs->group_type

= group_classify(group, sgs)

严重级别 group_overloaded > group_imbalanced > group_other

 

 

 

sgs->group_no_capacity实际上只由一个返回true的条件决定:即group的util(按imbalance_pct放大后)比group本身的capacity还要大。

/*
 *  group_is_overloaded returns true if the group has more tasks than it can
 *  handle.
 *  group_is_overloaded is not equal to !group_has_capacity: a group with
 *  exactly the right number of tasks has no spare capacity left but is not
 *  overloaded either, so both group_has_capacity and group_is_overloaded
 *  return false for it.
 */
static inline bool
group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
{
	/* Fewer (or as many) tasks than CPUs can never be overload. */
	if (sgs->sum_nr_running <= sgs->group_weight)
		return false;

	/* Overloaded when utilization, scaled by imbalance_pct, exceeds capacity. */
	return (sgs->group_capacity * 100) <
	       (sgs->group_util * env->sd->imbalance_pct);
}

 

### 文件总结与注释 #### 文件概述 该文件是Linux内核版本4.19中关于完全公平调度器(Completely Fair Scheduler, CFS)的部分实现代码。CFS主要用于处理常规进程(即SCHED_NORMAL和SCHED_BATCH类型的任务)。该文件包含了CFS的核心逻辑以及相关配置参数的定义,还包括了多处理器系统上的负载均衡、NUMA平衡等高级特性。此外,还涉及到了一些调试和统计功能的实现。 #### 主要知识点总结 1. **核心调度算法**:CFS通过虚拟运行时间(`vruntime`)来决定哪个任务应该优先执行,以保证每个任务都能获得公平的CPU时间。 2. **负载均衡**:在多处理器环境中,CFS会尝试将负载从繁忙的CPU迁移到空闲的CPU上,以保持系统的整体性能。负载均衡涉及到周期性地检查各个CPU的工作量并进行调整。 3. **NUMA支持**:为了优化内存访问延迟,在NUMA架构下,CFS能够根据任务的历史内存访问模式动态调整任务的放置位置,尽量使任务靠近其使用的物理节点。 4. **可调参数**:内核提供了多个可调节参数如`sched_latency`, `sched_min_granularity`等,允许管理员根据不同工作负载需求微调调度行为。 5. **任务组调度**:如果启用了任务组调度(CONFIG_FAIR_GROUP_SCHED),则可以为不同的控制组分配不同比例的CPU资源。 6. **带宽限制**:对于某些场景下的CFS队列,可以通过设置最大运行时间和最小间隔时间来进行流量控制。 7. **调试工具**:提供了多种用于跟踪和记录调度事件及状态变化的日志接口,方便开发人员分析系统调度情况。 --- ### 关键部分中文注释 以下是针对文件中的几个关键函数添加的中文注释: #### 1. `task_tick_fair` ```c /* * 当一个属于我们调度类的任务被计时器命中时调用此方法。 * 注意:这个函数可能会由远程CPU触发(例如,当启用全动态滴答时), * 因此不能做任何本地假设,所有东西都必须通过传递给我们的@rq 和 @curr 参数来访问。 */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; // 遍历当前任务的所有层级实体,更新它们的时间片信息 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } // 如果启用了NUMA调度,则在此处更新相关的统计数据 if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); } ``` #### 2. 
`find_busiest_group` ```c /* * 找到本调度域中最忙碌的组,并计算需要迁移多少负载才能达到平衡。 * * @param env 负载均衡环境描述符 */ static struct sched_group *find_busiest_group(struct lb_env *env) { struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; // 初始化调度域的状态统计结构体 init_sd_lb_stats(&sds); /* * 计算调度域内的各种统计信息,包括但不限于平均负载、总容量等。 * 这些数据有助于后续确定是否存在不平衡现象。 */ update_sd_lb_stats(env, &sds); // 获取指向最忙组及其对应状态统计信息的指针 local = &sds.local_stat; busiest = &sds.busiest_stat; // 检查是否满足不对称打包条件,若满足直接返回最忙组指针 if (check_asym_packing(env, &sds)) return sds.busiest; // 若没有找到最忙的组或者最忙的组没有足够的负载,则认为已经平衡完毕 if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; // 根据配置标志位判断是否强制进行负载转移 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; // 对于新进入空闲态的情况特殊处理,即使得平均负载较高的情况下也允许迁移 if (env->idle == CPU_IDLE) { if (local->idle_cpus <= (busiest->idle_cpus + 1)) goto out_balanced; } else { // 新唤醒或非空闲状态下使用imbalance_pct作为阈值 if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; } force_balance: // 强制计算所需迁移的负载量 calculate_imbalance(env, &sds); // 返回最忙的组引用 return env->imbalance ? sds.busiest : NULL; out_balanced: env->imbalance = 0; return NULL; } ``` #### 3. `prio_changed_fair` ```c /* * 处理任务优先级变更后的行为。 * 具体来说就是判断当前正在运行的任务是否需要重新安排, * 或者其他未运行但处于就绪队列中的任务是否应抢占当前任务的位置。 */ static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { if (!task_on_rq_queued(p)) return; // 如果当前任务是p且它的优先级升高了,则立即重排;否则仅检查是否有更高优先级的任务等待执行 if (rq->curr == p) { if (p->prio > oldprio) resched_curr(rq); } else check_preempt_curr(rq, p, 0); } ``` 这些只是文件的一部分内容示例,完整理解还需要结合上下文仔细阅读整个源码文件。希望以上总结能帮助您更好地理解和学习这段代码!如果有更具体的问题或者其他想要了解的内容,请随时告知我。
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值