Koordinator-CpuEvictor

New()

Initializes the cpuEvictor, another framework.QOSStrategy implementation.
Fields: eviction interval (1s), cooling duration (20s), metric collection interval (1s), the statesInformer, and the metric cache.

func New(opt *framework.Options) framework.QOSStrategy {
    return &cpuEvictor{
        evictInterval:         time.Duration(opt.Config.CPUEvictIntervalSeconds) * time.Second,
        evictCoolingInterval:  time.Duration(opt.Config.CPUEvictCoolTimeSeconds) * time.Second,
        metricCollectInterval: opt.MetricAdvisorConfig.CollectResUsedInterval,
        statesInformer:        opt.StatesInformer,
        metricCache:           opt.MetricCache,
        lastEvictTime:         time.Now(),
    }
}

Run()

Starts the strategy: wait.Until runs cpuEvict in an endless loop at evictInterval (1s) until the stop channel is closed; a sketch of wait.Until's semantics follows the snippet.

func (c *cpuEvictor) Run(stopCh <-chan struct{}) {
    go wait.Until(c.cpuEvict, c.evictInterval, stopCh)
}
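
For reference, wait.Until comes from k8s.io/apimachinery/pkg/util/wait: it runs the function immediately, then once per period, until the stop channel is closed. A minimal standalone sketch of that behavior (not koordinator code):

package main

import (
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    stopCh := make(chan struct{})
    // Prints "tick" immediately, then once per second, until stopCh closes.
    go wait.Until(func() { fmt.Println("tick") }, time.Second, stopCh)
    time.Sleep(3 * time.Second)
    close(stopCh) // ends the loop; no further ticks after this
    time.Sleep(time.Second)
}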

cpuEvict()

  • Fetch the nodeSLO
  • If the BE CPU eviction feature gate is disabled, return
  • If the current time is still within the cooling window, return
  • Read the time window from config; if unset or not longer than one metric collection interval, default to twice the collection interval (see the sketch after the function)
  • Read the eviction threshold config and validate the node's CPU capacity
  • Call c.evictByResourceSatisfaction(node, thresholdConfig, windowSeconds)
func (c *cpuEvictor) cpuEvict() {
    klog.V(5).Infof("cpu evict process start")
    nodeSLO := c.statesInformer.GetNodeSLO()
    if disabled, err := features.IsFeatureDisabled(nodeSLO, features.BECPUEvict); err != nil {
        klog.Warningf("cpuEvict failed, cannot check the feature gate, err: %s", err)
        return
    } else if disabled {
        klog.V(4).Infof("cpuEvict skipped, nodeSLO disable the feature gate")
        return
    }
    if time.Since(c.lastEvictTime) < c.evictCoolingInterval {
        klog.V(4).Infof("skip CPU evict process, still in evict cool time")
        return
    }
    thresholdConfig := nodeSLO.Spec.ResourceUsedThresholdWithBE
    windowSeconds := int64(c.metricCollectInterval.Seconds() * 2)
    if thresholdConfig.CPUEvictTimeWindowSeconds != nil && *thresholdConfig.CPUEvictTimeWindowSeconds > int64(c.metricCollectInterval.Seconds()) {
        windowSeconds = *thresholdConfig.CPUEvictTimeWindowSeconds
    }
    node := c.statesInformer.GetNode()
    if node == nil {
        klog.Warningf("cpuEvict failed, got nil node")
        return
    }
    cpuCapacity := node.Status.Capacity.Cpu().Value()
    if cpuCapacity <= 0 {
        klog.Warningf("cpuEvict failed, node cpuCapacity not valid,value: %d", cpuCapacity)
        return
    }
    c.evictByResourceSatisfaction(node, thresholdConfig, windowSeconds)
    klog.V(5).Info("cpu evict process finished.")
}
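
The window-length rule is easy to misread: a configured CPUEvictTimeWindowSeconds only takes effect when it is longer than one metric collection interval; otherwise the window falls back to twice that interval. A minimal sketch of the rule with hypothetical values:

package main

import "fmt"

// windowSeconds mirrors the defaulting logic in cpuEvict above.
func windowSeconds(collectIntervalSec int64, configured *int64) int64 {
    w := collectIntervalSec * 2
    if configured != nil && *configured > collectIntervalSec {
        w = *configured
    }
    return w
}

func main() {
    ten := int64(10)
    fmt.Println(windowSeconds(1, nil))  // 2: falls back to 2x the collect interval
    fmt.Println(windowSeconds(1, &ten)) // 10: configured window is long enough
}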

evictByResourceSatisfaction()

  • isSatisfactionConfigValid() checks that the satisfaction watermark thresholds are sane
  • c.calculateMilliRelease() computes how many milli-cores need to be released
func (c *cpuEvictor) evictByResourceSatisfaction(node *corev1.Node, thresholdConfig *slov1alpha1.ResourceThresholdStrategy, windowSeconds int64) {
    if !isSatisfactionConfigValid(thresholdConfig) {
        return
    }
    milliRelease := c.calculateMilliRelease(thresholdConfig, windowSeconds)
    if milliRelease > 0 {
        bePodInfos := c.getPodEvictInfoAndSort()
        c.killAndEvictBEPodsRelease(node, bePodInfos, milliRelease)
    }
}
isSatisfactionConfigValid()
  • thresholdConfig is taken from nodeSLO.Spec.ResourceUsedThresholdWithBE
  • Reads the lower and upper satisfaction thresholds
  • Essentially validates that the two thresholds are sane:
    • both must be positive, each must stay within its cap, and the lower must not exceed the upper (a few example pairs follow the function)
func isSatisfactionConfigValid(thresholdConfig *slov1alpha1.ResourceThresholdStrategy) bool {
    lowPercent := thresholdConfig.CPUEvictBESatisfactionLowerPercent
    upperPercent := thresholdConfig.CPUEvictBESatisfactionUpperPercent
    if lowPercent == nil || upperPercent == nil {
        klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionLowerPercent or CPUEvictBESatisfactionUpperPercent not config")
        return false
    }
    if *lowPercent > beCPUSatisfactionLowPercentMax || *lowPercent <= 0 {
        klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionLowerPercent(%d) is not valid! must (0,%d]", *lowPercent, beCPUSatisfactionLowPercentMax)
        return false
    }
    if *upperPercent >= beCPUSatisfactionUpperPercentMax || *upperPercent <= 0 {
        klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionUpperPercent(%d) is not valid,must (0,%d)!", *upperPercent, beCPUSatisfactionUpperPercentMax)
        return false
    } else if *upperPercent < *lowPercent {
        klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionUpperPercent(%d) < CPUEvictBESatisfactionLowerPercent(%d)", *upperPercent, *lowPercent)
        return false
    }
    return true
}
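
A few hypothetical threshold pairs run through the same rules. The 60/100 caps below are assumptions inferred from the beCPUSatisfactionLowPercentMax and beCPUSatisfactionUpperPercentMax constants referenced above, not values confirmed here:

package main

import "fmt"

// valid mirrors the checks in isSatisfactionConfigValid, with the caps inlined.
func valid(lower, upper int64) bool {
    if lower <= 0 || lower > 60 { // assumed beCPUSatisfactionLowPercentMax = 60
        return false
    }
    if upper <= 0 || upper >= 100 { // assumed beCPUSatisfactionUpperPercentMax = 100
        return false
    }
    return upper >= lower
}

func main() {
    fmt.Println(valid(60, 90)) // true: the documented defaults
    fmt.Println(valid(70, 90)) // false: lower exceeds its cap
    fmt.Println(valid(50, 40)) // false: upper < lower
}
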
calculateMilliRelease()
  • Build the query parameters: the window's time range plus an average aggregation
  • Query the metric cache for:
    • BECPUUsage (node_be:be_resource, usage): actual usage computed from cpuacct.usage
    • BECPURequest (node_be:be_resource, request): the sum of batch-cpu requests
    • BECPULimit (node_be:be_resource, real-limit): the cgroup CPU limit
  • Compare the BE CPU usage ratio against CPUEvictBEUsageThresholdPercent (default 90%) to decide whether usage is high enough
    • only when usage is high is a release amount calculated
  • Compute one result from the window average and a second from the latest values, then use the lower of the two (a worked sketch follows the function)
func (c *cpuEvictor) calculateMilliRelease(thresholdConfig *slov1alpha1.ResourceThresholdStrategy, windowSeconds int64) int64 {
    // Step1: Calculate release resource by BECPUResourceMetric in window
    queryParam := helpers.GenerateQueryParamsAvg(time.Duration(windowSeconds) * time.Second)
    querier, err := c.metricCache.Querier(*queryParam.Start, *queryParam.End)
    if err != nil {
        klog.Warningf("get query failed, error %v", err)
        return 0
    }
    defer querier.Close()
    // BECPUUsage
    avgBECPUMilliUsage, count01 := getBECPUMetric(metriccache.BEResourceAllocationUsage, querier, queryParam.Aggregate)
    // BECPURequest
    avgBECPUMilliRequest, count02 := getBECPUMetric(metriccache.BEResourceAllocationRequest, querier, queryParam.Aggregate)
    // BECPULimit
    avgBECPUMilliRealLimit, count03 := getBECPUMetric(metriccache.BEResourceAllocationRealLimit, querier, queryParam.Aggregate)
    // CPU Satisfaction considers the allocatable when policy=evictByAllocatable.
    avgBECPUMilliLimit := avgBECPUMilliRealLimit
    beCPUMilliAllocatable := c.getBEMilliAllocatable()
    if thresholdConfig.CPUEvictPolicy == slov1alpha1.EvictByAllocatablePolicy {
        avgBECPUMilliLimit = beCPUMilliAllocatable
    }
    // get min count
    count := minInt64(count01, count02, count03)
    if !isAvgQueryResultValid(windowSeconds, int64(c.metricCollectInterval.Seconds()), count) {
        return 0
    }
    if !isBECPUUsageHighEnough(avgBECPUMilliUsage, avgBECPUMilliLimit, thresholdConfig.CPUEvictBEUsageThresholdPercent) {
        klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, avg usage not enough, "+
            "BEUsage:%v, BERequest:%v, BELimit:%v, BERealLimit:%v, BEAllocatable:%v",
            avgBECPUMilliUsage, avgBECPUMilliRequest, avgBECPUMilliLimit, avgBECPUMilliRealLimit, beCPUMilliAllocatable)
        return 0
    }
    milliRelease := calculateResourceMilliToRelease(avgBECPUMilliRequest, avgBECPUMilliLimit, thresholdConfig)
    if milliRelease <= 0 {
        klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, releaseByAvg: %v", milliRelease)
        return 0
    }
    // Step2: Calculate release resource current
    queryParam = helpers.GenerateQueryParamsLast(c.metricCollectInterval * 2)
    querier, err = c.metricCache.Querier(*queryParam.Start, *queryParam.End)
    if err != nil {
        klog.Warningf("get query failed, error %v", err)
        return 0
    }
    defer querier.Close()
    // BECPUUsage
    currentBECPUMilliUsage, _ := getBECPUMetric(metriccache.BEResourceAllocationUsage, querier, queryParam.Aggregate)
    // BECPURequest
    currentBECPUMilliRequest, _ := getBECPUMetric(metriccache.BEResourceAllocationRequest, querier, queryParam.Aggregate)
    // BECPULimit
    currentBECPUMilliRealLimit, _ := getBECPUMetric(metriccache.BEResourceAllocationRealLimit, querier, queryParam.Aggregate)
    // CPU Satisfaction considers the allocatable when policy=evictByAllocatable.
    currentBECPUMilliLimit := currentBECPUMilliRealLimit
    if thresholdConfig.CPUEvictPolicy == slov1alpha1.EvictByAllocatablePolicy {
        currentBECPUMilliLimit = beCPUMilliAllocatable
    }
    if !isBECPUUsageHighEnough(currentBECPUMilliUsage, currentBECPUMilliLimit, thresholdConfig.CPUEvictBEUsageThresholdPercent) {
        klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, current usage not enough, "+
            "BEUsage:%v, BERequest:%v, BELimit:%v, BERealLimit:%v, BEAllocatable:%v",
            currentBECPUMilliUsage, currentBECPUMilliRequest, currentBECPUMilliLimit, currentBECPUMilliRealLimit,
            beCPUMilliAllocatable)
        return 0
    }
    // Requests and limits do not change frequently.
    // If the current request and limit are equal to the average request and limit within the window period, there is no need to recalculate.
    if currentBECPUMilliRequest == avgBECPUMilliRequest && currentBECPUMilliLimit == avgBECPUMilliLimit {
        return milliRelease
    }
    milliReleaseByCurrent := calculateResourceMilliToRelease(currentBECPUMilliRequest, currentBECPUMilliLimit, thresholdConfig)
    if milliReleaseByCurrent <= 0 {
        klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, releaseByCurrent: %v", milliReleaseByCurrent)
        return 0
    }
    // Step3:release = min(releaseByAvg,releaseByCurrent)
    if milliReleaseByCurrent < milliRelease {
        milliRelease = milliReleaseByCurrent
    }
    if milliRelease > 0 {
        klog.V(4).Infof("cpuEvict by ResourceSatisfaction start to evict, milliRelease: %v,"+
            "current status (BEUsage:%v, BERequest:%v, BELimit:%v, BERealLimit:%v, BEAllocatable:%v)",
            milliRelease, currentBECPUMilliUsage, currentBECPUMilliRequest, currentBECPUMilliLimit, currentBECPUMilliRealLimit,
            beCPUMilliAllocatable)
    }
    return milliRelease
}
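
To make the two-pass flow concrete, here is a standalone sketch with hypothetical numbers (assuming lower=60% and upper=90%). A release is computed from the window average, recomputed from the latest sample, and the smaller value wins, so a short dip cannot trigger an oversized eviction:

package main

import "fmt"

// releaseMilli mirrors the satisfaction arithmetic of calculateResourceMilliToRelease.
func releaseMilli(requestMilli, limitMilli, lower, upper float64) float64 {
    if requestMilli == 0 {
        return 0
    }
    satisfaction := limitMilli / requestMilli
    if satisfaction > lower { // above the low watermark: nothing to release
        return 0
    }
    gap := upper - satisfaction
    if gap <= 0 {
        return 0
    }
    return requestMilli * gap
}

func main() {
    byAvg := releaseMilli(16000, 8000, 0.60, 0.90)     // 16000*(0.90-0.50) = 6400
    byCurrent := releaseMilli(16000, 9000, 0.60, 0.90) // 16000*(0.90-0.5625) = 5400
    release := byAvg
    if byCurrent < release {
        release = byCurrent
    }
    fmt.Printf("avg=%vm current=%vm release=%vm\n", byAvg, byCurrent, release)
}
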
isBECPUUsageHighEnough()

Decides whether BE (offline) CPU usage is high enough to justify eviction.

  • If realLimit is 0, return false
  • If beCPUMilliUsage < 1000 (under one core), return true directly, skipping the ratio check
  • Otherwise cpuUsage = beCPUMilliUsage / beCPUMilliRealLimit
  • If cpuUsage < thresholdPercent, return false
    • thresholdPercent defaults to CPUEvictBEUsageThresholdPercent (90%), i.e. the BE usage-ratio threshold
  • Otherwise return true: BE CPU usage is high (a numeric sketch follows the function)
func isBECPUUsageHighEnough(beCPUMilliUsage, beCPUMilliRequest, beCPUMilliRealLimit float64, thresholdPercent *int64) bool {
    if beCPUMilliRealLimit == 0 {
        klog.Warningf("cpuEvict by ResourceSatisfaction skipped! CPURealLimit is zero!")
        return false
    }
    if beCPUMilliUsage < 1000 {
        return true
    }
    cpuUsage := beCPUMilliUsage / beCPUMilliRealLimit
    if thresholdPercent == nil {
        thresholdPercent = pointer.Int64(beCPUUsageThresholdPercent)
    }
    if cpuUsage < float64(*thresholdPercent)/100 {
        klog.Warningf("cpuEvict by ResourceSatisfaction skipped! cpuUsage(%.2f) and thresholdPercent %d!", cpuUsage, *thresholdPercent)
        return false
    }
    return true
}
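
Hypothetical numbers for this gate (default threshold 90%), including the sub-core short-circuit:

package main

import "fmt"

// highEnough mirrors isBECPUUsageHighEnough with the threshold passed directly.
func highEnough(usageMilli, realLimitMilli float64, thresholdPercent int64) bool {
    if realLimitMilli == 0 {
        return false
    }
    if usageMilli < 1000 { // under one core: skip the ratio check, treat as high
        return true
    }
    return usageMilli/realLimitMilli >= float64(thresholdPercent)/100
}

func main() {
    fmt.Println(highEnough(9500, 10000, 90)) // true: 95% >= 90%
    fmt.Println(highEnough(7000, 10000, 90)) // false: 70% < 90%
    fmt.Println(highEnough(500, 10000, 90))  // true: sub-core usage short-circuits
}
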
calculateResourceMilliToRelease()
  • If beCPUMilliRequest is 0, return 0
  • satisfaction = beCPUMilliRealLimit / beCPUMilliRequest
    • i.e. the cgroup limit divided by the sum of batch-cpu requests
    • a low value means BE pods were granted far less CPU than they requested, so tasks run too slowly
  • If satisfaction > CPUEvictBESatisfactionLowerPercent (default 60%), return 0
  • gap = CPUEvictBESatisfactionUpperPercent (default 90%) - satisfaction
  • If gap <= 0, return 0
  • Return beCPUMilliRequest * gap
    (Could colocated resources be idle while the scheduled pods simply cannot use them, so satisfaction stays low and pods get evicted anyway, e.g. workloads whose usage is inherently low? No: the earlier gate requires BE usage to reach ~90% of the limit before any release is calculated, so reaching this point means the pods genuinely lack resources.)
func calculateResourceMilliToRelease(beCPUMilliRequest, beCPUMilliRealLimit float64, thresholdConfig *slov1alpha1.ResourceThresholdStrategy) int64 {
    if beCPUMilliRequest == 0 {
        klog.Warningf("cpuEvict by ResourceSatisfaction skipped! be pods requests is zero!")
        return 0
    }
    satisfactionRate := beCPUMilliRealLimit / beCPUMilliRequest
    if satisfactionRate > float64(*thresholdConfig.CPUEvictBESatisfactionLowerPercent)/100 {
        klog.Warningf("cpuEvict by ResourceSatisfaction skipped! satisfactionRate(%.2f) and lowPercent(%f)", satisfactionRate, float64(*thresholdConfig.CPUEvictBESatisfactionLowerPercent))
        return 0
    }
    rateGap := float64(*thresholdConfig.CPUEvictBESatisfactionUpperPercent)/100 - satisfactionRate
    if rateGap <= 0 {
        klog.Warningf("cpuEvict by ResourceSatisfaction skipped! satisfactionRate(%.2f) > upperPercent(%f)", satisfactionRate, float64(*thresholdConfig.CPUEvictBESatisfactionUpperPercent))
        return 0
    }
    milliRelease := beCPUMilliRequest * rateGap
    return int64(milliRelease)
}
getPodEvictInfoAndSort()
  • Iterate over all pods on the node (from the statesInformer)
  • For each BE (offline) pod:
    • query the latest pod_cpu_usage metric (the number of CPU cores actually used)
    • sum the batch-cpu requests across all of the pod's containers
    • compute cpuUsage = used cores / batch-cpu request
  • Sort: first by priority ascending (lower priority in front), then by cpuUsage descending (heavier relative users in front); a sketch of this ordering follows the function.
func (c *cpuEvictor) getPodEvictInfoAndSort() []*podEvictCPUInfo {
    var bePodInfos []*podEvictCPUInfo
    for _, podMeta := range c.statesInformer.GetAllPods() {
        pod := podMeta.Pod
        if apiext.GetPodQoSClassRaw(pod) == apiext.QoSBE {
            bePodInfo := &podEvictCPUInfo{pod: podMeta.Pod}
            queryMeta, err := metriccache.PodCPUUsageMetric.BuildQueryMeta(metriccache.MetricPropertiesFunc.Pod(string(pod.UID)))
            if err == nil {
                result, err := helpers.CollectPodMetricLast(c.metricCache, queryMeta, c.metricCollectInterval)
                if err == nil {
                    bePodInfo.milliUsedCores = int64(result * 1000)
                }
            }
            milliRequestSum := int64(0)
            for _, container := range pod.Spec.Containers {
                containerCPUReq := util.GetContainerBatchMilliCPURequest(&container)
                if containerCPUReq > 0 {
                    milliRequestSum = milliRequestSum + containerCPUReq
                }
            }
            bePodInfo.milliRequest = milliRequestSum
            if bePodInfo.milliRequest > 0 {
                bePodInfo.cpuUsage = float64(bePodInfo.milliUsedCores) / float64(bePodInfo.milliRequest)
            }
            bePodInfos = append(bePodInfos, bePodInfo)
        }
    }
    sort.Slice(bePodInfos, func(i, j int) bool {
        if bePodInfos[i].pod.Spec.Priority == nil || bePodInfos[j].pod.Spec.Priority == nil ||
            *bePodInfos[i].pod.Spec.Priority == *bePodInfos[j].pod.Spec.Priority {
            return bePodInfos[i].cpuUsage > bePodInfos[j].cpuUsage
        }
        return *bePodInfos[i].pod.Spec.Priority < *bePodInfos[j].pod.Spec.Priority
    })
    return bePodInfos
}
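
A standalone sketch of the two-key ordering with three hypothetical pods: lowest priority first, and within equal priority, the heavier relative CPU user first:

package main

import (
    "fmt"
    "sort"
)

type podInfo struct {
    name     string
    priority int32
    cpuUsage float64 // milliUsedCores / milliRequest
}

func main() {
    pods := []podInfo{
        {"a", 100, 0.5},
        {"b", 50, 0.2},
        {"c", 50, 0.9},
    }
    sort.Slice(pods, func(i, j int) bool {
        if pods[i].priority == pods[j].priority {
            return pods[i].cpuUsage > pods[j].cpuUsage // heavier user first
        }
        return pods[i].priority < pods[j].priority // lower priority first
    })
    fmt.Println(pods) // [{c 50 0.9} {b 50 0.2} {a 100 0.5}]
}
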
killAndEvictBEPodsRelease()

Kills containers of the sorted BE pods, accumulating each victim's batch-cpu request until the release target is met, then records the evictions via the evictor; a walk-through follows the function.

func (c *cpuEvictor) killAndEvictBEPodsRelease(node *corev1.Node, bePodInfos []*podEvictCPUInfo, cpuNeedMilliRelease int64) {
    message := fmt.Sprintf("killAndEvictBEPodsRelease for node(%s), need release CPU : %d", node.Name, cpuNeedMilliRelease)
    cpuMilliReleased := int64(0)
    var killedPods []*corev1.Pod
    for _, bePod := range bePodInfos {
        if cpuMilliReleased >= cpuNeedMilliRelease {
            break
        }
        podKillMsg := fmt.Sprintf("%s, kill pod : %s", message, bePod.pod.Name)
        helpers.KillContainers(bePod.pod, podKillMsg)
        killedPods = append(killedPods, bePod.pod)
        cpuMilliReleased = cpuMilliReleased + bePod.milliRequest
    }
    c.evictor.EvictPodsIfNotEvicted(killedPods, node, resourceexecutor.EvictPodByBECPUSatisfaction, message)
    if len(killedPods) > 0 {
        c.lastEvictTime = time.Now()
    }
    klog.V(5).Infof("killAndEvictBEPodsRelease finished!cpuNeedMilliRelease(%d) cpuMilliReleased(%d)", cpuNeedMilliRelease, cpuMilliReleased)
}
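
A hypothetical walk-through of the greedy loop: release is accounted by each victim's batch-cpu request rather than its live usage, and killing stops as soon as the accumulated requests cover the target:

package main

import "fmt"

func main() {
    target := int64(5400)                 // milli-cores to release (hypothetical)
    requests := []int64{4000, 2000, 3000} // sorted victims' batch-cpu requests
    released, killed := int64(0), 0
    for _, r := range requests {
        if released >= target {
            break // enough released; remaining pods survive this round
        }
        released += r
        killed++
    }
    fmt.Printf("killed %d pods, released %dm for a %dm target\n", killed, released, target)
    // killed 2 pods, released 6000m for a 5400m target
}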