Koordinator-Predict

创建predictServer

// Build the prediction server from the daemon's prediction config.
predictServer := prediction.NewPeakPredictServer(config.PredictionConf)
// The factory bundles the server with the cold-start window and safety margin
// so individual predictors can be handed out later.
predictorFactory := prediction.NewPredictorFactory(predictServer, config.PredictionConf.ColdStartDuration, config.PredictionConf.SafetyMarginPercent)

看下NewPeakPredictServer方法。

  • 设置cfg
  • 初始化uidGenerator、PredictModelMap、checkPointer
// NewPeakPredictServer builds the default PredictServer implementation:
// an empty model table keyed by UID, a real wall clock, and a file-based
// checkpointer rooted at cfg.CheckpointFilepath. hasSynced stays false
// until the first training pass completes.
func NewPeakPredictServer(cfg *Config) PredictServer {
    server := &peakPredictServer{
        cfg:          cfg,
        uidGenerator: &generator{},
        models:       map[UIDType]*PredictModel{},
        clock:        clock.RealClock{},
        hasSynced:    &atomic.Bool{},
        checkpointer: NewFileCheckpointer(cfg.CheckpointFilepath),
    }
    return server
}

NewPredictorFactory

简单设置属性

// NewPredictorFactory returns a PredictorFactory whose predictors are backed
// by predictServer, apply the given cold-start duration, and pad estimates by
// safetyMarginPercent percent.
func NewPredictorFactory(predictServer PredictServer, coldStartDuration time.Duration, safetyMarginPercent int) PredictorFactory {
    f := predictorFactory{
        predictServer:       predictServer,
        coldStartDuration:   coldStartDuration,
        safetyMarginPercent: safetyMarginPercent,
    }
    return &f
}

predict.Config

// Config collects the tunables for the peak prediction server and the
// predictors it backs.
type Config struct {
	// CheckpointFilepath is the path handed to NewFileCheckpointer for
	// persisting model checkpoints.
	CheckpointFilepath           string
	// ColdStartDuration is forwarded to NewPredictorFactory; presumably the
	// window during which a freshly created model is not trusted — TODO confirm.
	ColdStartDuration            time.Duration
	// SafetyMarginPercent is forwarded to NewPredictorFactory to pad estimates.
	SafetyMarginPercent          int
	// MemoryHistogramDecayHalfLife / CPUHistogramDecayHalfLife: histogram decay
	// settings; usage not visible in this file — assumed to parameterize the
	// default histograms, verify against defaultCPUHistogram/defaultMemoryHistogram.
	MemoryHistogramDecayHalfLife time.Duration
	CPUHistogramDecayHalfLife    time.Duration
	// TrainingInterval is the period of the training loop started in Run.
	TrainingInterval             time.Duration
	// ModelExpirationDuration: models not updated for longer than this are
	// removed by gcModels.
	ModelExpirationDuration      time.Duration
	// ModelCheckpointInterval: minimum time between two checkpoints of the
	// same model (see doCheckpoint).
	ModelCheckpointInterval      time.Duration
	// ModelCheckpointMaxPerStep caps how many models doCheckpoint persists
	// per invocation.
	ModelCheckpointMaxPerStep    int
}

启动predictServer

// Start the predict server in the background: wire it to the states informer
// and metric cache, then run its loops until stopCh closes.
// NOTE: klog.Fatal terminates the whole process on either failure.
    go func() {
        if err := d.predictServer.Setup(d.statesInformer, d.metricCache); err != nil {
            klog.Fatal("Unable to setup the predict server: ", err)
        }
        if err := d.predictServer.Run(stopCh); err != nil {
            klog.Fatal("Unable to run the predict server: ", err)
        }
    }()

Run()

  • 重置模型数据
  • metric模型训练,按配置的TrainingInterval周期执行
  • 清理过期metric,1min一次
  • checkpoint,1min一次
// Run blocks until stopCh is closed. It waits for the states informer cache
// to sync, restores persisted models from checkpoints, deletes checkpoints
// that no longer correspond to a known UID, and then starts the periodic
// training, model-GC and checkpoint goroutines.
func (p *peakPredictServer) Run(stopCh <-chan struct{}) error {
	if !cache.WaitForCacheSync(stopCh, p.informer.HasSynced) {
		return fmt.Errorf("time out waiting for states informer caches to sync")
	}
	// restoreModels returns the UIDs of checkpoints it could not match to a
	// live model; remove those stale files before starting to work.
	unknownUIDs := p.restoreModels()
	for _, uid := range unknownUIDs {
		// Fix: only log success after Remove actually succeeded; the original
		// logged "remove unknown checkpoint" even when the removal failed.
		if err := p.checkpointer.Remove(uid); err != nil {
			klog.Errorf("remove checkpoint %v failed, err: %v", uid, err)
			continue
		}
		klog.InfoS("remove unknown checkpoint", "uid", uid)
	}
	go wait.Until(p.training, p.cfg.TrainingInterval, stopCh)
	go wait.Until(p.gcModels, time.Minute, stopCh)
	go wait.Until(p.doCheckpoint, time.Minute, stopCh)
	<-stopCh
	return nil
}

training()

  • 遍历所有容器

    • 获取容器最近cpu/mem使用率,更新容器id model。
  • 获取node最近cpu/mem使用率,更新nodeid model。

  • 更新node priority class model

  • 更新system model

// training runs one training pass: it feeds the latest CPU/memory usage of
// every pod, the node, each known priority class, and the residual "system"
// usage into their respective models, then marks the server as synced.
func (p *peakPredictServer) training() {
    // get pod metrics
    // 1. list pods, update models
    pods := p.informer.ListPods()
    // count the node-level usages of different priority classes and system
    nodeItemsMetric := NewNodeItemUsage()
    for _, pod := range pods {
       uid := p.uidGenerator.Pod(pod)
       lastCPUUsage, err := p.metricServer.GetPodMetric(MetricDesc{UID: uid}, CPUUsage)
       if err != nil {
          // A pod whose metrics cannot be fetched is skipped entirely, so it
          // also does not contribute to the priority-class or AllPods totals.
          klog.Warningf("failed to query pod cpu metric, pod %s, err: %s", util.GetPodKey(pod), err)
          continue
       }
       lastMemoryUsage, err := p.metricServer.GetPodMetric(MetricDesc{UID: uid}, MemoryUsage)
       if err != nil {
          klog.Warningf("failed to query pod memory metric, pod %s, err: %s", util.GetPodKey(pod), err)
          continue
       }
       // update the pod model
       p.updateModel(uid, lastCPUUsage, lastMemoryUsage)
       // update the node priority metric
       priorityItemID := string(extension.GetPodPriorityClassWithDefault(pod))
       nodeItemsMetric.AddMetric(priorityItemID, lastCPUUsage, lastMemoryUsage)
       // count all pods metric
       nodeItemsMetric.AddMetric(AllPodsItemID, lastCPUUsage, lastMemoryUsage)
    }


    // 2. get node, update models
    nodeUID := p.uidGenerator.Node()
    lastNodeCPUUsage, errCPU := p.metricServer.GetNodeMetric(MetricDesc{UID: nodeUID}, CPUUsage)
    lastNodeMemoryUsage, errMem := p.metricServer.GetNodeMetric(MetricDesc{UID: nodeUID}, MemoryUsage)
    if errCPU != nil || errMem != nil {
       klog.Warningf("failed to query node cpu and memory metric, CPU err: %s, Memory err: %s", errCPU, errMem)
    } else {
       p.updateModel(nodeUID, lastNodeCPUUsage, lastNodeMemoryUsage)
    }


    // 3. update node priority models
    for _, priorityClass := range extension.KnownPriorityClasses {
       itemID := string(priorityClass)
       priorityUID := p.uidGenerator.NodeItem(itemID)
       metric, ok := nodeItemsMetric.GetMetric(itemID)
       if ok {
          p.updateModel(priorityUID, metric.LastCPUUsage, metric.LastMemoryUsage)
       } else {
          // reset the priority usage
          p.updateModel(priorityUID, 0, 0)
       }
    }


    // 4. update system model
    // System usage = node usage minus the sum of all pod usages, floored at 0.
    // NOTE(review): if the node metric query above failed, lastNodeCPUUsage /
    // lastNodeMemoryUsage are zero values here, so the system model is trained
    // with a (near-)zero sample on that cycle — confirm this is intended.
    sysCPUUsage := lastNodeCPUUsage
    sysMemoryUsage := lastNodeMemoryUsage
    allPodsMetric, ok := nodeItemsMetric.GetMetric(AllPodsItemID)
    if ok {
       sysCPUUsage = math.Max(sysCPUUsage-allPodsMetric.LastCPUUsage, 0)
       sysMemoryUsage = math.Max(sysMemoryUsage-allPodsMetric.LastMemoryUsage, 0)
    }
    systemUID := p.uidGenerator.NodeItem(SystemItemID)
    p.updateModel(systemUID, sysCPUUsage, sysMemoryUsage)


    // Signal gcModels/doCheckpoint that at least one training pass ran.
    p.hasSynced.Store(true)
}

updateModel()

不存在对应model则设置一下model,调用AddSample()

// updateModel feeds one (cpu, memory) usage sample into the model for uid,
// creating the model with default histograms on first sight. Both the model
// table and the individual model are locked around their respective updates.
func (p *peakPredictServer) updateModel(uid UIDType, cpu, memory float64) {
	p.modelsLock.Lock()
	defer p.modelsLock.Unlock()

	model := p.models[uid]
	if model == nil {
		// First sample for this UID: start fresh histograms.
		model = &PredictModel{
			CPU:    p.defaultCPUHistogram(),
			Memory: p.defaultMemoryHistogram(),
		}
		p.models[uid] = model
	}

	sampleTime := p.clock.Now()
	model.Lock.Lock()
	defer model.Lock.Unlock()
	model.LastUpdated = sampleTime
	// TODO Add adjusted weights
	model.CPU.AddSample(cpu, 1, sampleTime)
	model.Memory.AddSample(memory, 1, sampleTime)
}

gcModels()

清理过期model

  • 遍历所有的model,若距上次更新时间超过配置的过期时长,则删除。

    • 同时删除对应checkpointer中的model
// gcModels deletes models whose last update is older than
// cfg.ModelExpirationDuration and removes their on-disk checkpoints.
// It is a no-op until the first training pass has completed.
func (p *peakPredictServer) gcModels() {
	if !p.HasSynced() {
		klog.Infof("wait for the state to be synchronized, skipping the step of model GC")
		return
	}
	tobeRemovedModels := make([]UIDType, 0)
	p.modelsLock.Lock()
	for uid, model := range p.models {
		if p.clock.Since(model.LastUpdated) > p.cfg.ModelExpirationDuration {
			delete(p.models, uid)
			klog.InfoS("gc model", "uid", uid)
			tobeRemovedModels = append(tobeRemovedModels, uid)
		}
	}
	p.modelsLock.Unlock()
	// do the io operations out of lock
	for _, uid := range tobeRemovedModels {
		// Fix: only log success after Remove actually succeeded; the original
		// logged "remove checkpoint" even when the removal failed.
		if err := p.checkpointer.Remove(uid); err != nil {
			klog.Errorf("remove checkpoint %v failed, err: %v", uid, err)
			continue
		}
		klog.InfoS("remove checkpoint", "uid", uid)
	}
}

doCheckpoint()

  • 遍历所有的model

    • 按key:model进行pair组装,然后对上次checkpoint时间进行排序。
  • 遍历所有的pair,最大检查个数和间隔时间判断下,然后调用model的SaveToCheckpointer方法。

// doCheckpoint persists up to cfg.ModelCheckpointMaxPerStep models to disk
// per invocation, oldest-checkpointed first. Models checkpointed within
// cfg.ModelCheckpointInterval are skipped. No-op until the first training
// pass has completed.
func (p *peakPredictServer) doCheckpoint() {
	if !p.HasSynced() {
		// Fixed copy-paste: the original message said "model GC" here,
		// but this is the checkpoint step.
		klog.Infof("wait for the state to be synchronized, skipping the step of checkpoint")
		return
	}
	type pair struct {
		UID   UIDType
		Model *PredictModel
	}
	// Snapshot the model table so the disk IO below runs outside modelsLock.
	p.modelsLock.Lock()
	pairs := make([]pair, 0, len(p.models))
	for key, model := range p.models {
		pairs = append(pairs, pair{UID: key, Model: model})
	}
	p.modelsLock.Unlock()
	// Sort models and keys by LastCheckpointed time, oldest first.
	sort.Slice(pairs, func(i, j int) bool {
		return pairs[i].Model.LastCheckpointed.Before(pairs[j].Model.LastCheckpointed)
	})
	checkpointModelsCount := 0
	for _, pair := range pairs {
		if checkpointModelsCount >= p.cfg.ModelCheckpointMaxPerStep {
			break
		}
		if p.clock.Since(pair.Model.LastCheckpointed) < p.cfg.ModelCheckpointInterval {
			// The slice is sorted by LastCheckpointed, so every remaining
			// entry is even more recent — safe to stop here.
			break
		}
		ckpt := ModelCheckpoint{
			UID:         pair.UID,
			LastUpdated: metav1.NewTime(p.clock.Now()),
		}
		pair.Model.Lock.Lock()
		ckpt.CPU, _ = pair.Model.CPU.SaveToCheckpoint()
		ckpt.Memory, _ = pair.Model.Memory.SaveToCheckpoint()
		pair.Model.Lock.Unlock()
		if err := p.checkpointer.Save(ckpt); err != nil {
			klog.Errorf("save checkpoint uid %v failed, err: %s", pair.UID, err)
		} else {
			klog.InfoS("save checkpoint", "uid", pair.UID)
		}
		// NOTE(review): LastCheckpointed advances and the per-step budget is
		// consumed even when Save fails, so a failing model is not retried
		// until the interval elapses again — confirm this throttling is intended.
		pair.Model.LastCheckpointed = p.clock.Now()
		checkpointModelsCount++
	}
}

SaveToCheckpoint()

// SaveToCheckpoint serializes the histogram into a HistogramCheckpoint.
// Bucket weights are rescaled so the heaviest bucket maps to
// MaxCheckpointWeight, and buckets that round down to zero are dropped.
func (h *histogram) SaveToCheckpoint() (*HistogramCheckpoint, error) {
	result := HistogramCheckpoint{
		BucketWeights: make(map[int]uint32),
	}
	result.TotalWeight = h.totalWeight
	// Find the heaviest bucket in the populated range.
	max := 0.
	for bucket := h.minBucket; bucket <= h.maxBucket; bucket++ {
		if h.bucketWeight[bucket] > max {
			max = h.bucketWeight[bucket]
		}
	}
	// Fix: if every bucket weight is zero, the original computed
	// ratio = MaxCheckpointWeight/0 = +Inf, and 0*Inf = NaN; converting NaN
	// to uint32 has an unspecified result in Go. Return the empty checkpoint.
	if max == 0 {
		return &result, nil
	}
	// Compute ratio
	ratio := float64(MaxCheckpointWeight) / max
	// Convert weights and drop near-zero weights
	for bucket := h.minBucket; bucket <= h.maxBucket; bucket++ {
		newWeight := uint32(round(h.bucketWeight[bucket] * ratio))
		if newWeight > 0 {
			result.BucketWeights[bucket] = newWeight
		}
	}

	return &result, nil
}

目前只看到了怎样把数据放入到model中,还没有看到怎样去使用model数据。

接下来去找下哪里使用了model。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值