创建predictServer
- 使用PredictionConf来构造创建predictServer。
- predictServer和冷启动时长、安全比例构造predictFactory
predictServer := prediction.NewPeakPredictServer(config.PredictionConf)
predictorFactory := prediction.NewPredictorFactory(predictServer, config.PredictionConf.ColdStartDuration, config.PredictionConf.SafetyMarginPercent)
看下NewPeakPredictServer方法。
- 设置cfg
- 初始化uidGenerator、PredictModelMap、checkPointer
// NewPeakPredictServer constructs a PredictServer that maintains peak-usage
// prediction models keyed by UID. Model state is persisted through a file
// checkpointer rooted at cfg.CheckpointFilepath.
func NewPeakPredictServer(cfg *Config) PredictServer {
	server := &peakPredictServer{
		cfg:          cfg,
		uidGenerator: &generator{},
		models:       map[UIDType]*PredictModel{},
		clock:        clock.RealClock{},
		hasSynced:    &atomic.Bool{},
		checkpointer: NewFileCheckpointer(cfg.CheckpointFilepath),
	}
	return server
}
NewPredictorFactory
简单设置属性
// NewPredictorFactory returns a PredictorFactory that builds predictors on
// top of the given PredictServer, applying the configured cold-start window
// and safety margin percentage.
func NewPredictorFactory(predictServer PredictServer, coldStartDuration time.Duration, safetyMarginPercent int) PredictorFactory {
	factory := predictorFactory{
		predictServer:       predictServer,
		coldStartDuration:   coldStartDuration,
		safetyMarginPercent: safetyMarginPercent,
	}
	return &factory
}
predict.Config
- 包括checkpointFilePath、冷启动时间、安全比例、内存统计半时长、cpu统计半时长、训练间隔、model失效时长、model checkpoint间隔、model checkpoint最大步长。
// Config holds the tunables of the peak prediction server and its predictors.
type Config struct {
	// CheckpointFilepath is where the file checkpointer persists model state
	// (passed to NewFileCheckpointer).
	CheckpointFilepath string
	// ColdStartDuration is the cold-start window handed to the predictor
	// factory (see NewPredictorFactory).
	ColdStartDuration time.Duration
	// SafetyMarginPercent is the safety margin applied by predictors,
	// expressed as a percentage (see NewPredictorFactory).
	SafetyMarginPercent int
	// MemoryHistogramDecayHalfLife — presumably the exponential-decay
	// half-life of the memory usage histogram; not referenced in the visible
	// code, confirm against the histogram construction.
	MemoryHistogramDecayHalfLife time.Duration
	// CPUHistogramDecayHalfLife — presumably the exponential-decay half-life
	// of the CPU usage histogram; not referenced in the visible code.
	CPUHistogramDecayHalfLife time.Duration
	// TrainingInterval is the period between training steps (see Run).
	TrainingInterval time.Duration
	// ModelExpirationDuration: models not updated within this duration are
	// removed by gcModels.
	ModelExpirationDuration time.Duration
	// ModelCheckpointInterval is the minimum interval between two checkpoints
	// of the same model (see doCheckpoint).
	ModelCheckpointInterval time.Duration
	// ModelCheckpointMaxPerStep caps how many models are checkpointed in one
	// doCheckpoint pass.
	ModelCheckpointMaxPerStep int
}
启动predictServer
// start predict server
go func() {
if err := d.predictServer.Setup(d.statesInformer, d.metricCache); err != nil {
klog.Fatal("Unable to setup the predict server: ", err)
}
if err := d.predictServer.Run(stopCh); err != nil {
klog.Fatal("Unable to run the predict server: ", err)
}
}()
Run()
- 重置模型数据
- metric模型训练,1min一次
- 清理过期metric,1min一次
- checkpoint,1min一次
// Run restores persisted models, prunes checkpoints that no longer map to a
// known UID, then drives the periodic training / GC / checkpoint loops until
// stopCh is closed. It blocks for the lifetime of the server.
func (p *peakPredictServer) Run(stopCh <-chan struct{}) error {
	if !cache.WaitForCacheSync(stopCh, p.informer.HasSynced) {
		return fmt.Errorf("time out waiting for states informer caches to sync")
	}
	// Drop checkpoints whose owners are unknown before starting to work.
	for _, uid := range p.restoreModels() {
		err := p.checkpointer.Remove(uid)
		klog.InfoS("remove unknown checkpoint", "uid", uid)
		if err != nil {
			klog.Errorf("remove checkpoint %v failed, err: %v", uid, err)
		}
	}
	go wait.Until(p.training, p.cfg.TrainingInterval, stopCh)
	go wait.Until(p.gcModels, time.Minute, stopCh)
	go wait.Until(p.doCheckpoint, time.Minute, stopCh)
	<-stopCh
	return nil
}
training()
- 遍历所有 Pod,获取每个 Pod 最近的 cpu/mem 使用率,更新对应 Pod UID 的 model。
- 获取 node 最近的 cpu/mem 使用率,更新 node UID 的 model。
- 更新 node priority class 的 model。
- 更新 system 的 model(node 使用量减去所有 Pod 使用量)。
// training runs one train step: it samples the latest CPU/memory usage of
// every pod, the node, each known priority class, and the system residual,
// and feeds each sample into the corresponding prediction model via
// updateModel. Marks the server as synced at the end of the first pass.
func (p *peakPredictServer) training() {
	// get pod metrics
	// 1. list pods, update models
	pods := p.informer.ListPods()
	// count the node-level usages of different priority classes and system
	nodeItemsMetric := NewNodeItemUsage()
	for _, pod := range pods {
		uid := p.uidGenerator.Pod(pod)
		lastCPUUsage, err := p.metricServer.GetPodMetric(MetricDesc{UID: uid}, CPUUsage)
		if err != nil {
			// a pod missing either metric is skipped entirely so it does not
			// skew the priority-class / all-pods aggregates below
			klog.Warningf("failed to query pod cpu metric, pod %s, err: %s", util.GetPodKey(pod), err)
			continue
		}
		lastMemoryUsage, err := p.metricServer.GetPodMetric(MetricDesc{UID: uid}, MemoryUsage)
		if err != nil {
			klog.Warningf("failed to query pod memory metric, pod %s, err: %s", util.GetPodKey(pod), err)
			continue
		}
		// update the pod model
		p.updateModel(uid, lastCPUUsage, lastMemoryUsage)
		// update the node priority metric
		priorityItemID := string(extension.GetPodPriorityClassWithDefault(pod))
		nodeItemsMetric.AddMetric(priorityItemID, lastCPUUsage, lastMemoryUsage)
		// count all pods metric
		nodeItemsMetric.AddMetric(AllPodsItemID, lastCPUUsage, lastMemoryUsage)
	}
	// 2. get node, update models
	nodeUID := p.uidGenerator.Node()
	lastNodeCPUUsage, errCPU := p.metricServer.GetNodeMetric(MetricDesc{UID: nodeUID}, CPUUsage)
	lastNodeMemoryUsage, errMem := p.metricServer.GetNodeMetric(MetricDesc{UID: nodeUID}, MemoryUsage)
	if errCPU != nil || errMem != nil {
		klog.Warningf("failed to query node cpu and memory metric, CPU err: %s, Memory err: %s", errCPU, errMem)
	} else {
		p.updateModel(nodeUID, lastNodeCPUUsage, lastNodeMemoryUsage)
	}
	// 3. update node priority models
	for _, priorityClass := range extension.KnownPriorityClasses {
		itemID := string(priorityClass)
		priorityUID := p.uidGenerator.NodeItem(itemID)
		metric, ok := nodeItemsMetric.GetMetric(itemID)
		if ok {
			p.updateModel(priorityUID, metric.LastCPUUsage, metric.LastMemoryUsage)
		} else {
			// reset the priority usage
			p.updateModel(priorityUID, 0, 0)
		}
	}
	// 4. update system model: system usage = node usage minus all pods usage,
	// clamped at zero
	// NOTE(review): if the node metric queries above failed, the node usage
	// values here are the zero values, so the system model is trained with
	// zero usage for this step — confirm this is intended.
	sysCPUUsage := lastNodeCPUUsage
	sysMemoryUsage := lastNodeMemoryUsage
	allPodsMetric, ok := nodeItemsMetric.GetMetric(AllPodsItemID)
	if ok {
		sysCPUUsage = math.Max(sysCPUUsage-allPodsMetric.LastCPUUsage, 0)
		sysMemoryUsage = math.Max(sysMemoryUsage-allPodsMetric.LastMemoryUsage, 0)
	}
	systemUID := p.uidGenerator.NodeItem(SystemItemID)
	p.updateModel(systemUID, sysCPUUsage, sysMemoryUsage)
	p.hasSynced.Store(true)
}
updateModel()
不存在对应model则设置一下model,调用AddSample()
// updateModel feeds the latest cpu/memory usage samples into the model
// registered under uid, lazily creating the model on first sight. The map is
// guarded by modelsLock; the histograms by the per-model lock.
func (p *peakPredictServer) updateModel(uid UIDType, cpu, memory float64) {
	p.modelsLock.Lock()
	defer p.modelsLock.Unlock()
	m, found := p.models[uid]
	if !found {
		m = &PredictModel{
			CPU:    p.defaultCPUHistogram(),
			Memory: p.defaultMemoryHistogram(),
		}
		p.models[uid] = m
	}
	sampleTime := p.clock.Now()
	m.Lock.Lock()
	defer m.Lock.Unlock()
	m.LastUpdated = sampleTime
	// TODO Add adjusted weights
	m.CPU.AddSample(cpu, 1, sampleTime)
	m.Memory.AddSample(memory, 1, sampleTime)
}
gcModels()
清理过期model
- 遍历所有的 model,若距上次更新时间超过配置的过期时长,则删除。
- 同时删除 checkpointer 中对应的 checkpoint。
// gcModels drops every model whose last update is older than
// cfg.ModelExpirationDuration, and removes the matching on-disk checkpoint.
// Checkpoint file IO happens only after the models lock is released.
func (p *peakPredictServer) gcModels() {
	if !p.HasSynced() {
		klog.Infof("wait for the state to be synchronized, skipping the step of model GC")
		return
	}
	var expired []UIDType
	p.modelsLock.Lock()
	for uid, model := range p.models {
		if p.clock.Since(model.LastUpdated) <= p.cfg.ModelExpirationDuration {
			continue
		}
		delete(p.models, uid)
		klog.InfoS("gc model", "uid", uid)
		expired = append(expired, uid)
	}
	p.modelsLock.Unlock()
	// do the io operations out of lock
	for _, uid := range expired {
		err := p.checkpointer.Remove(uid)
		klog.InfoS("remove checkpoint", "uid", uid)
		if err != nil {
			klog.Errorf("remove checkpoint %v failed, err: %v", uid, err)
		}
	}
}
doCheckpoint()
- 遍历所有的 model,按 key:model 组装成 pair,并按上次 checkpoint 时间排序。
- 遍历所有的 pair,受最大检查个数与间隔时间限制,调用 model 的 SaveToCheckpoint 方法保存。
// doCheckpoint persists up to cfg.ModelCheckpointMaxPerStep models through
// the checkpointer, picking the models with the oldest LastCheckpointed time
// first. A model checkpointed within cfg.ModelCheckpointInterval is skipped,
// and since the pairs are sorted by that time, the first fresh model ends the
// pass early.
func (p *peakPredictServer) doCheckpoint() {
	if !p.HasSynced() {
		// Fixed log message: it previously said "model GC", copy-pasted from
		// gcModels, which made the two skip paths indistinguishable in logs.
		klog.Infof("wait for the state to be synchronized, skipping the step of model checkpoint")
		return
	}
	type pair struct {
		UID   UIDType
		Model *PredictModel
	}
	// Snapshot the models under the lock; the (slow) serialization and file
	// IO below run without holding modelsLock.
	p.modelsLock.Lock()
	pairs := make([]pair, 0, len(p.models))
	for key, model := range p.models {
		pairs = append(pairs, pair{UID: key, Model: model})
	}
	p.modelsLock.Unlock()
	// Sort models and keys by LastCheckpointed time
	sort.Slice(pairs, func(i, j int) bool {
		return pairs[i].Model.LastCheckpointed.Before(pairs[j].Model.LastCheckpointed)
	})
	checkpointModelsCount := 0
	for _, pair := range pairs {
		if checkpointModelsCount >= p.cfg.ModelCheckpointMaxPerStep {
			break
		}
		if p.clock.Since(pair.Model.LastCheckpointed) < p.cfg.ModelCheckpointInterval {
			break
		}
		ckpt := ModelCheckpoint{
			UID:         pair.UID,
			LastUpdated: metav1.NewTime(p.clock.Now()),
		}
		// Hold the per-model lock only while serializing the histograms.
		pair.Model.Lock.Lock()
		ckpt.CPU, _ = pair.Model.CPU.SaveToCheckpoint()
		ckpt.Memory, _ = pair.Model.Memory.SaveToCheckpoint()
		pair.Model.Lock.Unlock()
		err := p.checkpointer.Save(ckpt)
		if err != nil {
			klog.Errorf("save checkpoint uid %v failed, err: %s", pair.UID, err)
		} else {
			klog.InfoS("save checkpoint", "uid", pair.UID)
		}
		// LastCheckpointed advances even on a failed save, so one persistently
		// failing model cannot monopolize the per-step budget.
		pair.Model.LastCheckpointed = p.clock.Now()
		checkpointModelsCount++
	}
}
SaveToCheckpoint()
// SaveToCheckpoint serializes the histogram into a HistogramCheckpoint.
// Bucket weights are scaled so the heaviest bucket maps to
// MaxCheckpointWeight, and weights that round down to zero are dropped to
// keep the checkpoint compact.
func (h *histogram) SaveToCheckpoint() (*HistogramCheckpoint, error) {
	result := HistogramCheckpoint{
		BucketWeights: make(map[int]uint32),
	}
	result.TotalWeight = h.totalWeight
	// Find the heaviest bucket; it defines the scaling ratio.
	// (renamed from `max`, which shadows the Go 1.21 builtin)
	maxWeight := 0.
	for bucket := h.minBucket; bucket <= h.maxBucket; bucket++ {
		if h.bucketWeight[bucket] > maxWeight {
			maxWeight = h.bucketWeight[bucket]
		}
	}
	// Guard the division: if every weight in [minBucket, maxBucket] is zero,
	// the original ratio would be +Inf and 0*Inf = NaN would be fed to
	// uint32(round(...)), which is unspecified. An all-zero histogram now
	// serializes with an empty BucketWeights map, which a non-empty histogram
	// would also produce after dropping zero weights.
	if maxWeight > 0 {
		ratio := float64(MaxCheckpointWeight) / maxWeight
		// Convert weights and drop near-zero weights
		for bucket := h.minBucket; bucket <= h.maxBucket; bucket++ {
			newWeight := uint32(round(h.bucketWeight[bucket] * ratio))
			if newWeight > 0 {
				result.BucketWeights[bucket] = newWeight
			}
		}
	}
	return &result, nil
}
目前只看到了怎样把数据放入到model中,还没有看到怎样去使用model数据。
接下来去找下哪里使用了model。