Koordinator-Predict

Creating the predictServer

predictServer := prediction.NewPeakPredictServer(config.PredictionConf)
predictorFactory := prediction.NewPredictorFactory(predictServer, config.PredictionConf.ColdStartDuration, config.PredictionConf.SafetyMarginPercent)

Let's take a look at the NewPeakPredictServer method.

  • store cfg
  • initialize the uidGenerator, the models map (one PredictModel per UID), the clock, the hasSynced flag, and the checkpointer
func NewPeakPredictServer(cfg *Config) PredictServer {
    return &peakPredictServer{
        cfg:          cfg,
        uidGenerator: &generator{},
        models:       make(map[UIDType]*PredictModel),
        clock:        clock.RealClock{},
        hasSynced:    &atomic.Bool{},
        checkpointer: NewFileCheckpointer(cfg.CheckpointFilepath),
    }
}
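
NewFileCheckpointer itself is not shown in this article. As a rough sketch, a file-based checkpointer can keep one JSON file per model UID under the configured path, which is what makes the per-UID Remove calls in Run() and gcModels() possible. The interface and implementation below are inferred from how the checkpointer is used in this walkthrough (Save/Restore/Remove) and are assumptions, not the upstream definitions; UIDType and ModelCheckpoint are package types that appear later. Imports needed: encoding/json, os, path/filepath.

// Sketch only: shape inferred from the call sites in this article.
type Checkpointer interface {
    Save(ckpt ModelCheckpoint) error
    Restore() ([]*ModelCheckpoint, error)
    Remove(uid UIDType) error
}

type fileCheckpointer struct {
    dir string
}

func NewFileCheckpointer(dir string) Checkpointer {
    return &fileCheckpointer{dir: dir}
}

// Save writes one JSON file per model UID, so a single model's state can be
// removed later without rewriting the others.
func (f *fileCheckpointer) Save(ckpt ModelCheckpoint) error {
    data, err := json.Marshal(ckpt)
    if err != nil {
        return err
    }
    return os.WriteFile(filepath.Join(f.dir, string(ckpt.UID)), data, 0644)
}

func (f *fileCheckpointer) Remove(uid UIDType) error {
    return os.Remove(filepath.Join(f.dir, string(uid)))
}

// Restore loads every checkpoint file; restoreModels() (sketched later)
// decides which of them still map to live pods or the node.
func (f *fileCheckpointer) Restore() ([]*ModelCheckpoint, error) {
    entries, err := os.ReadDir(f.dir)
    if err != nil {
        return nil, err
    }
    ckpts := make([]*ModelCheckpoint, 0, len(entries))
    for _, e := range entries {
        data, err := os.ReadFile(filepath.Join(f.dir, e.Name()))
        if err != nil {
            return nil, err
        }
        ckpt := &ModelCheckpoint{}
        if err := json.Unmarshal(data, ckpt); err != nil {
            return nil, err
        }
        ckpts = append(ckpts, ckpt)
    }
    return ckpts, nil
}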

NewPredictorFactory

It simply stores its arguments:

func NewPredictorFactory(predictServer PredictServer, coldStartDuration time.Duration, safetyMarginPercent int) PredictorFactory {
    return &predictorFactory{
        predictServer:       predictServer,
        coldStartDuration:   coldStartDuration,
        safetyMarginPercent: safetyMarginPercent,
    }
}

predict.Config

type Config struct {
	CheckpointFilepath           string
	ColdStartDuration            time.Duration
	SafetyMarginPercent          int
	MemoryHistogramDecayHalfLife time.Duration
	CPUHistogramDecayHalfLife    time.Duration
	TrainingInterval             time.Duration
	ModelExpirationDuration      time.Duration
	ModelCheckpointInterval      time.Duration
	ModelCheckpointMaxPerStep    int
}
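
To make the knobs concrete, here is a hypothetical Config. Every value below is an illustrative assumption, not Koordinator's shipped default; the comments tie each field to the code paths covered in this article.

// Illustrative values only; these are assumptions, not upstream defaults.
cfg := &Config{
    CheckpointFilepath:           "/prediction/checkpoints/",
    ColdStartDuration:            time.Hour,        // how long a brand-new workload's prediction is distrusted
    SafetyMarginPercent:          10,               // inflate predicted peaks by 10%
    CPUHistogramDecayHalfLife:    12 * time.Hour,   // CPU samples fade faster than memory samples
    MemoryHistogramDecayHalfLife: 24 * time.Hour,
    TrainingInterval:             time.Minute,      // how often training() runs in Run()
    ModelExpirationDuration:      30 * time.Minute, // gcModels() drops models idle longer than this
    ModelCheckpointInterval:      10 * time.Minute, // doCheckpoint() re-saves a model at most this often
    ModelCheckpointMaxPerStep:    12,               // caps checkpoint IO per round
}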

Starting the predictServer

// start predict server
go func() {
    if err := d.predictServer.Setup(d.statesInformer, d.metricCache); err != nil {
        klog.Fatal("Unable to setup the predict server: ", err)
    }
    if err := d.predictServer.Run(stopCh); err != nil {
        klog.Fatal("Unable to run the predict server: ", err)
    }
}()

Run()

  • Restore models from checkpoints and delete checkpoints that no longer map to anything (restoreModels, sketched after the code below)
  • Train the metric models every TrainingInterval
  • GC expired models once per minute
  • Write model checkpoints once per minute
func (p *peakPredictServer) Run(stopCh <-chan struct{}) error {
    if !cache.WaitForCacheSync(stopCh, p.informer.HasSynced) {
        return fmt.Errorf("time out waiting for states informer caches to sync")
    }
    unknownUIDs := p.restoreModels()
    // remove unknown checkpoints before starting to work
    for _, uid := range unknownUIDs {
        err := p.checkpointer.Remove(uid)
        klog.InfoS("remove unknown checkpoint", "uid", uid)
        if err != nil {
            klog.Errorf("remove checkpoint %v failed, err: %v", uid, err)
        }
    }
    go wait.Until(p.training, p.cfg.TrainingInterval, stopCh)
    go wait.Until(p.gcModels, time.Minute, stopCh)
    go wait.Until(p.doCheckpoint, time.Minute, stopCh)
    <-stopCh
    return nil
}
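
restoreModels() is called above but not shown. The sketch below captures what it plausibly does, with assumed helper names (Restore(), LoadFromCheckpoint()): rebuild a model from every checkpoint whose UID still maps to a live pod, the node, or a node item, and return the UIDs that don't, so Run() can delete their files.

// Sketch only: method and field names here are assumptions. Run() calls this
// before the training goroutines start, so p.models needs no lock yet.
func (p *peakPredictServer) restoreModels() []UIDType {
    // collect the UIDs that are still valid on this node
    known := map[UIDType]bool{p.uidGenerator.Node(): true}
    for _, pod := range p.informer.ListPods() {
        known[p.uidGenerator.Pod(pod)] = true
    }
    for _, priorityClass := range extension.KnownPriorityClasses {
        known[p.uidGenerator.NodeItem(string(priorityClass))] = true
    }
    known[p.uidGenerator.NodeItem(SystemItemID)] = true
    known[p.uidGenerator.NodeItem(AllPodsItemID)] = true

    ckpts, err := p.checkpointer.Restore()
    if err != nil {
        klog.Errorf("failed to restore checkpoints, err: %v", err)
        return nil
    }
    unknownUIDs := make([]UIDType, 0)
    for _, ckpt := range ckpts {
        if !known[ckpt.UID] {
            unknownUIDs = append(unknownUIDs, ckpt.UID)
            continue
        }
        model := &PredictModel{
            CPU:    p.defaultCPUHistogram(),
            Memory: p.defaultMemoryHistogram(),
        }
        // LoadFromCheckpoint is assumed to be the inverse of the
        // SaveToCheckpoint method shown at the end of this article.
        _ = model.CPU.LoadFromCheckpoint(ckpt.CPU)
        _ = model.Memory.LoadFromCheckpoint(ckpt.Memory)
        model.LastUpdated = ckpt.LastUpdated.Time
        p.models[ckpt.UID] = model
    }
    return unknownUIDs
}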

training()

  • Iterate all pods:

    • fetch each pod's recent CPU/memory usage and update that pod's model;
    • accumulate the usage into the per-priority-class and all-pods node items.
  • Fetch the node's recent CPU/memory usage and update the node model.

  • Update the per-priority-class node models (the NodeItemUsage accumulator is sketched after the code below).

  • Update the system model (node usage minus all pods' usage).

func (p *peakPredictServer) training() {
    // get pod metrics
    // 1. list pods, update models
    pods := p.informer.ListPods()
    // count the node-level usages of different priority classes and system
    nodeItemsMetric := NewNodeItemUsage()
    for _, pod := range pods {
        uid := p.uidGenerator.Pod(pod)
        lastCPUUsage, err := p.metricServer.GetPodMetric(MetricDesc{UID: uid}, CPUUsage)
        if err != nil {
            klog.Warningf("failed to query pod cpu metric, pod %s, err: %s", util.GetPodKey(pod), err)
            continue
        }
        lastMemoryUsage, err := p.metricServer.GetPodMetric(MetricDesc{UID: uid}, MemoryUsage)
        if err != nil {
            klog.Warningf("failed to query pod memory metric, pod %s, err: %s", util.GetPodKey(pod), err)
            continue
        }
        // update the pod model
        p.updateModel(uid, lastCPUUsage, lastMemoryUsage)
        // update the node priority metric
        priorityItemID := string(extension.GetPodPriorityClassWithDefault(pod))
        nodeItemsMetric.AddMetric(priorityItemID, lastCPUUsage, lastMemoryUsage)
        // count all pods metric
        nodeItemsMetric.AddMetric(AllPodsItemID, lastCPUUsage, lastMemoryUsage)
    }

    // 2. get node, update models
    nodeUID := p.uidGenerator.Node()
    lastNodeCPUUsage, errCPU := p.metricServer.GetNodeMetric(MetricDesc{UID: nodeUID}, CPUUsage)
    lastNodeMemoryUsage, errMem := p.metricServer.GetNodeMetric(MetricDesc{UID: nodeUID}, MemoryUsage)
    if errCPU != nil || errMem != nil {
        klog.Warningf("failed to query node cpu and memory metric, CPU err: %s, Memory err: %s", errCPU, errMem)
    } else {
        p.updateModel(nodeUID, lastNodeCPUUsage, lastNodeMemoryUsage)
    }

    // 3. update node priority models
    for _, priorityClass := range extension.KnownPriorityClasses {
        itemID := string(priorityClass)
        priorityUID := p.uidGenerator.NodeItem(itemID)
        metric, ok := nodeItemsMetric.GetMetric(itemID)
        if ok {
            p.updateModel(priorityUID, metric.LastCPUUsage, metric.LastMemoryUsage)
        } else {
            // reset the priority usage
            p.updateModel(priorityUID, 0, 0)
        }
    }

    // 4. update system model
    sysCPUUsage := lastNodeCPUUsage
    sysMemoryUsage := lastNodeMemoryUsage
    allPodsMetric, ok := nodeItemsMetric.GetMetric(AllPodsItemID)
    if ok {
        sysCPUUsage = math.Max(sysCPUUsage-allPodsMetric.LastCPUUsage, 0)
        sysMemoryUsage = math.Max(sysMemoryUsage-allPodsMetric.LastMemoryUsage, 0)
    }
    systemUID := p.uidGenerator.NodeItem(SystemItemID)
    p.updateModel(systemUID, sysCPUUsage, sysMemoryUsage)

    p.hasSynced.Store(true)
}
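
NewNodeItemUsage, AddMetric, and GetMetric are used above without being shown. They only need to be a small per-item accumulator; the sketch below is one plausible shape (type and field names are assumptions, chosen to match the call sites above):

// Sketch: per-item accumulator for node-level usages, keyed by a
// priority-class name, SystemItemID, or AllPodsItemID.
type itemMetric struct {
    LastCPUUsage    float64
    LastMemoryUsage float64
}

type NodeItemUsage struct {
    items map[string]*itemMetric
}

func NewNodeItemUsage() *NodeItemUsage {
    return &NodeItemUsage{items: map[string]*itemMetric{}}
}

// AddMetric sums usages per item, so every pod contributes both to its
// priority class and to the AllPodsItemID aggregate.
func (n *NodeItemUsage) AddMetric(itemID string, cpu, memory float64) {
    m, ok := n.items[itemID]
    if !ok {
        m = &itemMetric{}
        n.items[itemID] = m
    }
    m.LastCPUUsage += cpu
    m.LastMemoryUsage += memory
}

func (n *NodeItemUsage) GetMetric(itemID string) (*itemMetric, bool) {
    m, ok := n.items[itemID]
    return m, ok
}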

updateModel()

If no model exists for this UID yet, one is created with the default CPU and memory histograms; then AddSample() is called on both histograms and LastUpdated is refreshed.

func (p *peakPredictServer) updateModel(uid UIDType, cpu, memory float64) {
    p.modelsLock.Lock()
    defer p.modelsLock.Unlock()
    model, ok := p.models[uid]
    if !ok {
        model = &PredictModel{
            CPU:    p.defaultCPUHistogram(),
            Memory: p.defaultMemoryHistogram(),
        }
        p.models[uid] = model
    }
    now := p.clock.Now()
    model.Lock.Lock()
    defer model.Lock.Unlock()
    model.LastUpdated = now
    // TODO Add adjusted weights
    model.CPU.AddSample(cpu, 1, now)
    model.Memory.AddSample(memory, 1, now)
}
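
AddSample feeds decaying histograms whose half-lives come from CPUHistogramDecayHalfLife and MemoryHistogramDecayHalfLife in the Config. The snippet below is a self-contained illustration of half-life weighting, not the vendored histogram code: a sample's weight is scaled by 2^(Δt/halfLife) against a fixed reference time, so a sample one half-life older effectively counts half as much.

package main

import (
    "fmt"
    "math"
    "time"
)

// effectiveWeight scales a sample's weight by 2^(Δt/halfLife) relative to a
// fixed reference time; samples one half-life older count half as much.
func effectiveWeight(weight float64, sampleTime, referenceTime time.Time, halfLife time.Duration) float64 {
    return weight * math.Pow(2, float64(sampleTime.Sub(referenceTime))/float64(halfLife))
}

func main() {
    ref := time.Now()
    halfLife := 24 * time.Hour
    fmt.Printf("fresh sample:   %.2f\n", effectiveWeight(1, ref, ref, halfLife))                    // 1.00
    fmt.Printf("24h old sample: %.2f\n", effectiveWeight(1, ref.Add(-24*time.Hour), ref, halfLife)) // 0.50
}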

gcModels()

Cleans up expired models:

  • Iterate all models; delete any whose LastUpdated is older than the configured ModelExpirationDuration.

    • The corresponding checkpoint is removed as well, with the file IO done outside the models lock.
func (p *peakPredictServer) gcModels() {
    if !p.HasSynced() {
        klog.Infof("wait for the state to be synchronized, skipping the step of model GC")
        return
    }
    tobeRemovedModels := make([]UIDType, 0)
    p.modelsLock.Lock()
    for uid, model := range p.models {
        if p.clock.Since(model.LastUpdated) > p.cfg.ModelExpirationDuration {
            delete(p.models, uid)
            klog.InfoS("gc model", "uid", uid)
            tobeRemovedModels = append(tobeRemovedModels, uid)
        }
    }
    p.modelsLock.Unlock()
    // do the io operations out of lock
    for _, uid := range tobeRemovedModels {
        err := p.checkpointer.Remove(uid)
        klog.InfoS("remove checkpoint", "uid", uid)
        if err != nil {
            klog.Errorf("remove checkpoint %v failed, err: %v", uid, err)
        }
    }
}

doCheckpoint()

  • Collect all models into (UID, model) pairs and sort the pairs by LastCheckpointed, oldest first.

  • Walk the sorted pairs: stop once ModelCheckpointMaxPerStep models have been saved, or once a model was checkpointed within ModelCheckpointInterval (every later pair is even fresher, so break is safe). For each remaining pair, snapshot both histograms with SaveToCheckpoint() and persist the result via checkpointer.Save(). The checkpoint payload is sketched after the code below.

func (p *peakPredictServer) doCheckpoint() {
    if !p.HasSynced() {
        klog.Infof("wait for the state to be synchronized, skipping the step of model GC")
        return
    }
    type pair struct {
        UID   UIDType
        Model *PredictModel
    }
    p.modelsLock.Lock()
    pairs := make([]pair, 0, len(p.models))
    for key, model := range p.models {
        pairs = append(pairs, pair{UID: key, Model: model})
    }
    p.modelsLock.Unlock()
    // Sort models and keys by LastCheckpointed time
    sort.Slice(pairs, func(i, j int) bool {
        return pairs[i].Model.LastCheckpointed.Before(pairs[j].Model.LastCheckpointed)
    })
    checkpointModelsCount := 0
    for _, pair := range pairs {
        if checkpointModelsCount >= p.cfg.ModelCheckpointMaxPerStep {
            break
        }
        if p.clock.Since(pair.Model.LastCheckpointed) < p.cfg.ModelCheckpointInterval {
            break
        }
        ckpt := ModelCheckpoint{
            UID:         pair.UID,
            LastUpdated: metav1.NewTime(p.clock.Now()),
        }
        pair.Model.Lock.Lock()
        ckpt.CPU, _ = pair.Model.CPU.SaveToCheckpoint()
        ckpt.Memory, _ = pair.Model.Memory.SaveToCheckpoint()
        pair.Model.Lock.Unlock()
        err := p.checkpointer.Save(ckpt)
        if err != nil {
            klog.Errorf("save checkpoint uid %v failed, err: %s", pair.UID, err)
        } else {
            klog.InfoS("save checkpoint", "uid", pair.UID)
        }
        pair.Model.LastCheckpointed = p.clock.Now()
        checkpointModelsCount++
    }
}
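
The ModelCheckpoint payload written above can be inferred from its usage; a plausible definition follows (field names and JSON tags are assumptions, matching the SaveToCheckpoint signature shown next):

// Inferred from the usage in doCheckpoint(); an assumption, not the
// verbatim upstream type.
type ModelCheckpoint struct {
    UID         UIDType              `json:"uid"`
    LastUpdated metav1.Time          `json:"lastUpdated"`
    CPU         *HistogramCheckpoint `json:"cpu,omitempty"`
    Memory      *HistogramCheckpoint `json:"memory,omitempty"`
}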

SaveToCheckpoint()

func (h *histogram) SaveToCheckpoint() (*HistogramCheckpoint, error) {
	result := HistogramCheckpoint{
		BucketWeights: make(map[int]uint32),
	}
	result.TotalWeight = h.totalWeight
	// Find max
	max := 0.
	for bucket := h.minBucket; bucket <= h.maxBucket; bucket++ {
		if h.bucketWeight[bucket] > max {
			max = h.bucketWeight[bucket]
		}
	}
	// Compute ratio
	ratio := float64(MaxCheckpointWeight) / max
	// Convert weights and drop near-zero weights
	for bucket := h.minBucket; bucket <= h.maxBucket; bucket++ {
		newWeight := uint32(round(h.bucketWeight[bucket] * ratio))
		if newWeight > 0 {
			result.BucketWeights[bucket] = newWeight
		}
	}

	return &result, nil
}
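
A quick worked example of the normalization: suppose MaxCheckpointWeight is 10000 and the raw bucket weights are {0: 2.5, 1: 10.0}. The max is 10.0, so ratio = 10000 / 10.0 = 1000, and the checkpoint stores {0: 2500, 1: 10000}. TotalWeight keeps the unscaled sum so the absolute scale can be recovered on load, and buckets whose scaled weight rounds to 0 are dropped to keep the checkpoint small.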

So far we have only seen how data is fed into the models; we have not yet seen how the model data is consumed.

Next, let's find out where the models are used.
