New()
Initializes cpuEvictor, which is also a framework.QOSStrategy.
Parameters: eviction interval of 1s, cool-down of 20s, metric collection interval of 1s, the statesInformer, and the metric cache.
func New(opt *framework.Options) framework.QOSStrategy {
	return &cpuEvictor{
		evictInterval:         time.Duration(opt.Config.CPUEvictIntervalSeconds) * time.Second,
		evictCoolingInterval:  time.Duration(opt.Config.CPUEvictCoolTimeSeconds) * time.Second,
		metricCollectInterval: opt.MetricAdvisorConfig.CollectResUsedInterval,
		statesInformer:        opt.StatesInformer,
		metricCache:           opt.MetricCache,
		lastEvictTime:         time.Now(),
	}
}
Run()
Starts the strategy: wait.Until runs cpuEvict in a loop at evictInterval (1s) until the stop channel is closed.
func (c *cpuEvictor) Run(stopCh <-chan struct{}) {
	go wait.Until(c.cpuEvict, c.evictInterval, stopCh)
}
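A minimal standalone sketch (not from the source) of the wait.Until contract this relies on: the callback runs immediately, then once per period, and the loop ends when the stop channel is closed, not when a value is sent.

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stopCh := make(chan struct{})
	go wait.Until(func() {
		fmt.Println("tick", time.Now().Format(time.RFC3339))
	}, time.Second, stopCh)

	time.Sleep(3 * time.Second)
	close(stopCh) // closing (not sending on) the channel stops the loop
	time.Sleep(100 * time.Millisecond)
}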
cpuEvict()
- Fetch the nodeSLO.
- If the BECPUEvict feature gate is disabled, return.
- If the current time is still within the cool-down period since the last eviction, return.
- Read the window length from the config; it only takes effect if larger than the metric collection interval, otherwise default to twice the collection interval.
- Fetch the node, validate its CPU capacity, and read the eviction threshold config from nodeSLO.Spec.ResourceUsedThresholdWithBE.
- Call c.evictByResourceSatisfaction(node, thresholdConfig, windowSeconds).
func (c *cpuEvictor) cpuEvict() {
	klog.V(5).Infof("cpu evict process start")

	nodeSLO := c.statesInformer.GetNodeSLO()
	if disabled, err := features.IsFeatureDisabled(nodeSLO, features.BECPUEvict); err != nil {
		klog.Warningf("cpuEvict failed, cannot check the feature gate, err: %s", err)
		return
	} else if disabled {
		klog.V(4).Infof("cpuEvict skipped, nodeSLO disable the feature gate")
		return
	}

	if time.Since(c.lastEvictTime) < c.evictCoolingInterval {
		klog.V(4).Infof("skip CPU evict process, still in evict cool time")
		return
	}

	thresholdConfig := nodeSLO.Spec.ResourceUsedThresholdWithBE
	windowSeconds := int64(c.metricCollectInterval.Seconds() * 2)
	if thresholdConfig.CPUEvictTimeWindowSeconds != nil && *thresholdConfig.CPUEvictTimeWindowSeconds > int64(c.metricCollectInterval.Seconds()) {
		windowSeconds = *thresholdConfig.CPUEvictTimeWindowSeconds
	}

	node := c.statesInformer.GetNode()
	if node == nil {
		klog.Warningf("cpuEvict failed, got nil node")
		return
	}

	cpuCapacity := node.Status.Capacity.Cpu().Value()
	if cpuCapacity <= 0 {
		klog.Warningf("cpuEvict failed, node cpuCapacity not valid,value: %d", cpuCapacity)
		return
	}

	c.evictByResourceSatisfaction(node, thresholdConfig, windowSeconds)
	klog.V(5).Info("cpu evict process finished.")
}
evictByResourceSatisfaction()
isSatisfactionConfigValid() checks that the watermark thresholds are sane; c.calculateMilliRelease() computes the milli-cores to release. If the result is positive, the BE pods are collected, sorted, and killed/evicted.
func (c *cpuEvictor) evictByResourceSatisfaction(node *corev1.Node, thresholdConfig *slov1alpha1.ResourceThresholdStrategy, windowSeconds int64) {
	if !isSatisfactionConfigValid(thresholdConfig) {
		return
	}
	milliRelease := c.calculateMilliRelease(thresholdConfig, windowSeconds)
	if milliRelease > 0 {
		bePodInfos := c.getPodEvictInfoAndSort()
		c.killAndEvictBEPodsRelease(node, bePodInfos, milliRelease)
	}
}
isSatisfactionConfigValid()
thresholdConfig comes from nodeSLO.Spec.ResourceUsedThresholdWithBE.
- Reads the upper and lower satisfaction thresholds.
- This is really just a sanity check on the two values:
- Neither may be <= 0, both must stay within their maxima, and the lower must not exceed the upper.
func isSatisfactionConfigValid(thresholdConfig *slov1alpha1.ResourceThresholdStrategy) bool {
	lowPercent := thresholdConfig.CPUEvictBESatisfactionLowerPercent
	upperPercent := thresholdConfig.CPUEvictBESatisfactionUpperPercent
	if lowPercent == nil || upperPercent == nil {
		klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionLowerPercent or CPUEvictBESatisfactionUpperPercent not config")
		return false
	}
	if *lowPercent > beCPUSatisfactionLowPercentMax || *lowPercent <= 0 {
		klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionLowerPercent(%d) is not valid! must (0,%d]", *lowPercent, beCPUSatisfactionLowPercentMax)
		return false
	}
	if *upperPercent >= beCPUSatisfactionUpperPercentMax || *upperPercent <= 0 {
		klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionUpperPercent(%d) is not valid,must (0,%d)!", *upperPercent, beCPUSatisfactionUpperPercentMax)
		return false
	} else if *upperPercent < *lowPercent {
		klog.V(4).Infof("cpuEvict by ResourceSatisfaction skipped, CPUEvictBESatisfactionUpperPercent(%d) < CPUEvictBESatisfactionLowerPercent(%d)", *upperPercent, *lowPercent)
		return false
	}
	return true
}
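A hypothetical usage sketch (the wrapper name and values are made up; assumes it lives in the same package as isSatisfactionConfigValid, with the defaults mentioned above: lower 60, upper 90):

func demoSatisfactionConfig() {
	cfg := &slov1alpha1.ResourceThresholdStrategy{
		CPUEvictBESatisfactionLowerPercent: pointer.Int64(60),
		CPUEvictBESatisfactionUpperPercent: pointer.Int64(90),
	}
	fmt.Println(isSatisfactionConfigValid(cfg)) // true: 0 < 60 <= 90

	cfg.CPUEvictBESatisfactionUpperPercent = pointer.Int64(30)
	fmt.Println(isSatisfactionConfigValid(cfg)) // false: upper(30) < lower(60)
}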
calculateMilliRelease()
- Build the query params: the window as the time range, with average aggregation.
- Query the metric cache for:
  - BECPUUsage (node_be, be_resource=usage): the actual usage computed from cpuacct.usage
  - BECPURequest (node_be, be_resource=request): the sum of batch-cpu requests
  - BECPULimit (node_be, be_resource=real-limit): the usage cap set on the cgroup
- When CPUEvictPolicy is evictByAllocatable, the limit side of the ratio is replaced by the node's BE CPU allocatable.
- Compare the CPU usage ratio against CPUEvictBEUsageThresholdPercent (default 90%): is it too high?
- If it is, compute the number of milli-cores that must be released.
- The calculation runs twice: once with the window averages and once with the latest values, and the lower result wins (see the sketch after the function below).
func (c *cpuEvictor) calculateMilliRelease(thresholdConfig *slov1alpha1.ResourceThresholdStrategy, windowSeconds int64) int64 {
	// Step1: Calculate release resource by BECPUResourceMetric in window
	queryParam := helpers.GenerateQueryParamsAvg(time.Duration(windowSeconds) * time.Second)
	querier, err := c.metricCache.Querier(*queryParam.Start, *queryParam.End)
	if err != nil {
		klog.Warningf("get query failed, error %v", err)
		return 0
	}
	defer querier.Close()

	// BECPUUsage
	avgBECPUMilliUsage, count01 := getBECPUMetric(metriccache.BEResourceAllocationUsage, querier, queryParam.Aggregate)
	// BECPURequest
	avgBECPUMilliRequest, count02 := getBECPUMetric(metriccache.BEResourceAllocationRequest, querier, queryParam.Aggregate)
	// BECPULimit
	avgBECPUMilliRealLimit, count03 := getBECPUMetric(metriccache.BEResourceAllocationRealLimit, querier, queryParam.Aggregate)

	// CPU Satisfaction considers the allocatable when policy=evictByAllocatable.
	avgBECPUMilliLimit := avgBECPUMilliRealLimit
	beCPUMilliAllocatable := c.getBEMilliAllocatable()
	if thresholdConfig.CPUEvictPolicy == slov1alpha1.EvictByAllocatablePolicy {
		avgBECPUMilliLimit = beCPUMilliAllocatable
	}

	// get min count
	count := minInt64(count01, count02, count03)
	if !isAvgQueryResultValid(windowSeconds, int64(c.metricCollectInterval.Seconds()), count) {
		return 0
	}

	if !isBECPUUsageHighEnough(avgBECPUMilliUsage, avgBECPUMilliLimit, thresholdConfig.CPUEvictBEUsageThresholdPercent) {
		klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, avg usage not enough, "+
			"BEUsage:%v, BERequest:%v, BELimit:%v, BERealLimit:%v, BEAllocatable:%v",
			avgBECPUMilliUsage, avgBECPUMilliRequest, avgBECPUMilliLimit, avgBECPUMilliRealLimit, beCPUMilliAllocatable)
		return 0
	}

	milliRelease := calculateResourceMilliToRelease(avgBECPUMilliRequest, avgBECPUMilliLimit, thresholdConfig)
	if milliRelease <= 0 {
		klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, releaseByAvg: %v", milliRelease)
		return 0
	}

	// Step2: Calculate release resource current
	queryParam = helpers.GenerateQueryParamsLast(c.metricCollectInterval * 2)
	querier, err = c.metricCache.Querier(*queryParam.Start, *queryParam.End)
	if err != nil {
		klog.Warningf("get query failed, error %v", err)
		return 0
	}
	defer querier.Close()

	// BECPUUsage
	currentBECPUMilliUsage, _ := getBECPUMetric(metriccache.BEResourceAllocationUsage, querier, queryParam.Aggregate)
	// BECPURequest
	currentBECPUMilliRequest, _ := getBECPUMetric(metriccache.BEResourceAllocationRequest, querier, queryParam.Aggregate)
	// BECPULimit
	currentBECPUMilliRealLimit, _ := getBECPUMetric(metriccache.BEResourceAllocationRealLimit, querier, queryParam.Aggregate)

	// CPU Satisfaction considers the allocatable when policy=evictByAllocatable.
	currentBECPUMilliLimit := currentBECPUMilliRealLimit
	if thresholdConfig.CPUEvictPolicy == slov1alpha1.EvictByAllocatablePolicy {
		currentBECPUMilliLimit = beCPUMilliAllocatable
	}

	if !isBECPUUsageHighEnough(currentBECPUMilliUsage, currentBECPUMilliLimit, thresholdConfig.CPUEvictBEUsageThresholdPercent) {
		klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, current usage not enough, "+
			"BEUsage:%v, BERequest:%v, BELimit:%v, BERealLimit:%v, BEAllocatable:%v",
			currentBECPUMilliUsage, currentBECPUMilliRequest, currentBECPUMilliLimit, currentBECPUMilliRealLimit,
			beCPUMilliAllocatable)
		return 0
	}

	// Requests and limits do not change frequently.
	// If the current request and limit are equal to the average request and limit within the window period, there is no need to recalculate.
	if currentBECPUMilliRequest == avgBECPUMilliRequest && currentBECPUMilliLimit == avgBECPUMilliLimit {
		return milliRelease
	}

	milliReleaseByCurrent := calculateResourceMilliToRelease(currentBECPUMilliRequest, currentBECPUMilliLimit, thresholdConfig)
	if milliReleaseByCurrent <= 0 {
		klog.V(5).Infof("cpuEvict by ResourceSatisfaction skipped, releaseByCurrent: %v", milliReleaseByCurrent)
		return 0
	}

	// Step3: release = min(releaseByAvg, releaseByCurrent)
	if milliReleaseByCurrent < milliRelease {
		milliRelease = milliReleaseByCurrent
	}

	if milliRelease > 0 {
		klog.V(4).Infof("cpuEvict by ResourceSatisfaction start to evict, milliRelease: %v,"+
			"current status (BEUsage:%v, BERequest:%v, BELimit:%v, BERealLimit:%v, BEAllocatable:%v)",
			milliRelease, currentBECPUMilliUsage, currentBECPUMilliRequest, currentBECPUMilliLimit, currentBECPUMilliRealLimit,
			beCPUMilliAllocatable)
	}
	return milliRelease
}
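Hypothetical numbers for Step 3 (the wrapper name is made up): taking the smaller of the two results keeps a stale window average from forcing a larger eviction than the latest samples justify.

func demoMinRelease() {
	releaseByAvg := int64(5000)     // milli-cores suggested by the window average
	releaseByCurrent := int64(3000) // milli-cores suggested by the latest samples
	release := releaseByAvg
	if releaseByCurrent < release {
		release = releaseByCurrent
	}
	fmt.Println(release) // 3000: only what both views agree on gets evicted
}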
isBECPUUsageHighEnough()
Decides whether the BE (offline) CPU usage ratio is high enough:
- If realLimit is 0, return false.
- If beCPUMilliUsage < 1000 (less than one core in use), return true directly.
- cpuUsage = beCPUMilliUsage / beCPUMilliRealLimit.
- If cpuUsage < thresholdPercent, return false; thresholdPercent defaults to CPUEvictBEUsageThresholdPercent (90%), i.e. the BE usage watermark.
- Otherwise return true: the CPU usage is too high.
func isBECPUUsageHighEnough(beCPUMilliUsage, beCPUMilliRealLimit float64, thresholdPercent *int64) bool {
	if beCPUMilliRealLimit == 0 {
		klog.Warningf("cpuEvict by ResourceSatisfaction skipped! CPURealLimit is zero!")
		return false
	}
	if beCPUMilliUsage < 1000 {
		return true
	}
	cpuUsage := beCPUMilliUsage / beCPUMilliRealLimit
	if thresholdPercent == nil {
		thresholdPercent = pointer.Int64(beCPUUsageThresholdPercent)
	}
	if cpuUsage < float64(*thresholdPercent)/100 {
		klog.Warningf("cpuEvict by ResourceSatisfaction skipped! cpuUsage(%.2f) and thresholdPercent %d!", cpuUsage, *thresholdPercent)
		return false
	}
	return true
}
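Worked examples with made-up numbers (assumed to run in the same package; a nil thresholdPercent falls back to the 90% default):

func demoBECPUUsageHighEnough() {
	fmt.Println(isBECPUUsageHighEnough(3600, 4000, nil)) // true:  3600/4000 = 0.90 >= 0.90
	fmt.Println(isBECPUUsageHighEnough(2000, 4000, nil)) // false: 2000/4000 = 0.50 <  0.90
	fmt.Println(isBECPUUsageHighEnough(500, 4000, nil))  // true:  usage under one core returns true directly
}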
calculateResourceMilliToRelease()
- If beCpuRequest is 0, return 0.
- satisfaction = beCPUMilliRealLimit / beCPUMilliRequest, i.e. the cgroup limit divided by the batch-cpu request.
- A low satisfaction means the BE pods got far less CPU than they requested, so tasks run too slowly.
- If satisfaction > CPUEvictBESatisfactionLowerPercent (default 60%), return 0.
- gap = CPUEvictBESatisfactionUpperPercent (default 90%) - satisfaction.
- If gap <= 0, return 0.
- Return beCpuRequest * gap; a worked example follows the function below.
(Could colocation resources actually be idle while the scheduled pods simply cannot use them, so satisfaction stays low and they get evicted anyway, e.g. workloads whose own usage is inherently low? No: the earlier check requires the BE usage ratio to exceed 90% before the release is computed, so reaching this point means the resources really are insufficient.)
func calculateResourceMilliToRelease(beCPUMilliRequest, beCPUMilliRealLimit float64, thresholdConfig *slov1alpha1.ResourceThresholdStrategy) int64 {
	if beCPUMilliRequest == 0 {
		klog.Warningf("cpuEvict by ResourceSatisfaction skipped! be pods requests is zero!")
		return 0
	}

	satisfactionRate := beCPUMilliRealLimit / beCPUMilliRequest
	if satisfactionRate > float64(*thresholdConfig.CPUEvictBESatisfactionLowerPercent)/100 {
		klog.Warningf("cpuEvict by ResourceSatisfaction skipped! satisfactionRate(%.2f) and lowPercent(%f)", satisfactionRate, float64(*thresholdConfig.CPUEvictBESatisfactionLowerPercent))
		return 0
	}

	rateGap := float64(*thresholdConfig.CPUEvictBESatisfactionUpperPercent)/100 - satisfactionRate
	if rateGap <= 0 {
		klog.Warningf("cpuEvict by ResourceSatisfaction skipped! satisfactionRate(%.2f) > upperPercent(%f)", satisfactionRate, float64(*thresholdConfig.CPUEvictBESatisfactionUpperPercent))
		return 0
	}

	milliRelease := beCPUMilliRequest * rateGap
	return int64(milliRelease)
}
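A worked example with made-up numbers (same-package sketch, default watermarks 60/90): BE pods request 10 cores but the cgroup only grants 4.

func demoCalculateRelease() {
	cfg := &slov1alpha1.ResourceThresholdStrategy{
		CPUEvictBESatisfactionLowerPercent: pointer.Int64(60),
		CPUEvictBESatisfactionUpperPercent: pointer.Int64(90),
	}
	// satisfaction = 4000/10000 = 0.40 <= 0.60, so eviction proceeds;
	// gap = 0.90 - 0.40 = 0.50; release = 10000 * 0.50 = 5000 milli-cores.
	fmt.Println(calculateResourceMilliToRelease(10000, 4000, cfg)) // 5000
}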
getPodEvictInfoAndSort()
- Iterate over all pods on the node (from statesInformer).
- For every BE (offline) pod:
  - Query the latest value of the pod_cpu_usage metric (effectively the number of CPU cores in use).
  - Sum the batch-cpu requests of all the pod's containers.
  - Compute cpuUsage = used cores / batch-cpu request.
- Sort: 1) by priority ascending, lower priority in front; 2) ties broken by cpuUsage descending, heavier users in front (see the standalone sketch after the function).
func (c *cpuEvictor) getPodEvictInfoAndSort() []*podEvictCPUInfo {
	var bePodInfos []*podEvictCPUInfo

	for _, podMeta := range c.statesInformer.GetAllPods() {
		pod := podMeta.Pod
		if apiext.GetPodQoSClassRaw(pod) == apiext.QoSBE {
			bePodInfo := &podEvictCPUInfo{pod: podMeta.Pod}

			queryMeta, err := metriccache.PodCPUUsageMetric.BuildQueryMeta(metriccache.MetricPropertiesFunc.Pod(string(pod.UID)))
			if err == nil {
				result, err := helpers.CollectPodMetricLast(c.metricCache, queryMeta, c.metricCollectInterval)
				if err == nil {
					bePodInfo.milliUsedCores = int64(result * 1000)
				}
			}

			milliRequestSum := int64(0)
			for _, container := range pod.Spec.Containers {
				containerCPUReq := util.GetContainerBatchMilliCPURequest(&container)
				if containerCPUReq > 0 {
					milliRequestSum = milliRequestSum + containerCPUReq
				}
			}

			bePodInfo.milliRequest = milliRequestSum
			if bePodInfo.milliRequest > 0 {
				bePodInfo.cpuUsage = float64(bePodInfo.milliUsedCores) / float64(bePodInfo.milliRequest)
			}

			bePodInfos = append(bePodInfos, bePodInfo)
		}
	}

	sort.Slice(bePodInfos, func(i, j int) bool {
		if bePodInfos[i].pod.Spec.Priority == nil || bePodInfos[j].pod.Spec.Priority == nil ||
			*bePodInfos[i].pod.Spec.Priority == *bePodInfos[j].pod.Spec.Priority {
			return bePodInfos[i].cpuUsage > bePodInfos[j].cpuUsage
		}
		return *bePodInfos[i].pod.Spec.Priority < *bePodInfos[j].pod.Spec.Priority
	})
	return bePodInfos
}
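A self-contained sketch with hypothetical pods, using the same comparator, to show that within equal priority the heavier user is evicted first:

package main

import (
	"fmt"
	"sort"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/pointer"
)

type podEvictCPUInfo struct {
	pod      *corev1.Pod
	cpuUsage float64 // milliUsedCores / milliRequest
}

func bePod(name string, prio int32, usage float64) *podEvictCPUInfo {
	return &podEvictCPUInfo{
		pod: &corev1.Pod{
			ObjectMeta: metav1.ObjectMeta{Name: name},
			Spec:       corev1.PodSpec{Priority: pointer.Int32(prio)},
		},
		cpuUsage: usage,
	}
}

func main() {
	infos := []*podEvictCPUInfo{bePod("a", 100, 0.5), bePod("b", 100, 0.9), bePod("c", 200, 0.1)}
	// Same ordering as getPodEvictInfoAndSort: priority ascending, then cpuUsage descending.
	sort.Slice(infos, func(i, j int) bool {
		pi, pj := infos[i].pod.Spec.Priority, infos[j].pod.Spec.Priority
		if pi == nil || pj == nil || *pi == *pj {
			return infos[i].cpuUsage > infos[j].cpuUsage
		}
		return *pi < *pj
	})
	for _, info := range infos {
		fmt.Println(info.pod.Name) // b, a, c
	}
}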
killAndEvictBEPodsRelease()
Kills the containers of the sorted BE pods one by one until the accumulated batch-cpu requests cover the release target, records an eviction for each killed pod, and updates lastEvictTime to start the cool-down.
func (c *cpuEvictor) killAndEvictBEPodsRelease(node *corev1.Node, bePodInfos []*podEvictCPUInfo, cpuNeedMilliRelease int64) {
	message := fmt.Sprintf("killAndEvictBEPodsRelease for node(%s), need realase CPU : %d", node.Name, cpuNeedMilliRelease)

	cpuMilliReleased := int64(0)
	var killedPods []*corev1.Pod
	for _, bePod := range bePodInfos {
		if cpuMilliReleased >= cpuNeedMilliRelease {
			break
		}
		podKillMsg := fmt.Sprintf("%s, kill pod : %s", message, bePod.pod.Name)
		helpers.KillContainers(bePod.pod, podKillMsg)

		killedPods = append(killedPods, bePod.pod)
		cpuMilliReleased = cpuMilliReleased + bePod.milliRequest
	}

	c.evictor.EvictPodsIfNotEvicted(killedPods, node, resourceexecutor.EvictPodByBECPUSatisfaction, message)

	if len(killedPods) > 0 {
		c.lastEvictTime = time.Now()
	}
	klog.V(5).Infof("killAndEvictBEPodsRelease finished!cpuNeedMilliRelease(%d) cpuMilliReleased(%d)", cpuNeedMilliRelease, cpuMilliReleased)
}