[Source Code Analysis] kubelet Source Code Analysis (Part 3): Pod Creation
The following code analysis is based on Kubernetes v1.12.0.
This article mainly analyzes the code under https://github.com/kubernetes/kubernetes/tree/v1.12.0/pkg/kubelet.
1.syncLoopIteration
syncLoopIteration listens on several channels and handles the different kinds of events it receives. The channels are configCh, plegCh, syncCh, housekeepingCh, and livenessManager.Updates().
func (kl *Kubelet) syncLoopIteration(configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
    syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
    select {
    case u, open := <-configCh:
        // Update from a config source; dispatch it to the right handler
        // callback.
        if !open {
            glog.Errorf("Update channel is closed. Exiting the sync loop.")
            return false
        }

        switch u.Op {
        case kubetypes.ADD:
            glog.V(2).Infof("SyncLoop (ADD, %q): %q", u.Source, format.Pods(u.Pods))
            // After restarting, kubelet will get all existing pods through
            // ADD as if they are new pods. These pods will then go through the
            // admission process and *may* be rejected. This can be resolved
            // once we have checkpointing.
            handler.HandlePodAdditions(u.Pods)
        case kubetypes.UPDATE:
            glog.V(2).Infof("SyncLoop (UPDATE, %q): %q", u.Source, format.PodsWithDeletionTimestamps(u.Pods))
            handler.HandlePodUpdates(u.Pods)
        case kubetypes.REMOVE:
            glog.V(2).Infof("SyncLoop (REMOVE, %q): %q", u.Source, format.Pods(u.Pods))
            handler.HandlePodRemoves(u.Pods)
        case kubetypes.RECONCILE:
            glog.V(4).Infof("SyncLoop (RECONCILE, %q): %q", u.Source, format.Pods(u.Pods))
            handler.HandlePodReconcile(u.Pods)
        case kubetypes.DELETE:
            glog.V(2).Infof("SyncLoop (DELETE, %q): %q", u.Source, format.Pods(u.Pods))
            // DELETE is treated as a UPDATE because of graceful deletion.
            handler.HandlePodUpdates(u.Pods)
        case kubetypes.RESTORE:
            glog.V(2).Infof("SyncLoop (RESTORE, %q): %q", u.Source, format.Pods(u.Pods))
            // These are pods restored from the checkpoint. Treat them as new
            // pods.
            handler.HandlePodAdditions(u.Pods)
        case kubetypes.SET:
            // TODO: Do we want to support this?
            glog.Errorf("Kubelet does not support snapshot update")
        }

        if u.Op != kubetypes.RESTORE {
            // If the update type is RESTORE, it means that the update is from
            // the pod checkpoints and may be incomplete. Do not mark the
            // source as ready.

            // Mark the source ready after receiving at least one update from the
            // source. Once all the sources are marked ready, various cleanup
            // routines will start reclaiming resources. It is important that this
            // takes place only after kubelet calls the update handler to process
            // the update to ensure the internal pod cache is up-to-date.
            kl.sourcesReady.AddSource(u.Source)
        }
    case e := <-plegCh:
        if isSyncPodWorthy(e) {
            // PLEG event for a pod; sync it.
            if pod, ok := kl.podManager.GetPodByUID(e.ID); ok {
                glog.V(2).Infof("SyncLoop (PLEG): %q, event: %#v", format.Pod(pod), e)
                handler.HandlePodSyncs([]*v1.Pod{pod})
            } else {
                // If the pod no longer exists, ignore the event.
                glog.V(4).Infof("SyncLoop (PLEG): ignore irrelevant event: %#v", e)
            }
        }

        if e.Type == pleg.ContainerDied {
            if containerID, ok := e.Data.(string); ok {
                kl.cleanUpContainersInPod(e.ID, containerID)
            }
        }
    case <-syncCh:
        // Sync pods waiting for sync
        podsToSync := kl.getPodsToSync()
        if len(podsToSync) == 0 {
            break
        }
        glog.V(4).Infof("SyncLoop (SYNC): %d pods; %s", len(podsToSync), format.Pods(podsToSync))
        handler.HandlePodSyncs(podsToSync)
    case update := <-kl.livenessManager.Updates():
        if update.Result == proberesults.Failure {
            // The liveness manager detected a failure; sync the pod.

            // We should not use the pod from livenessManager, because it is never updated after
            // initialization.
            pod, ok := kl.podManager.GetPodByUID(update.PodUID)
            if !ok {
                // If the pod no longer exists, ignore the update.
                glog.V(4).Infof("SyncLoop (container unhealthy): ignore irrelevant update: %#v", update)
                break
            }
            glog.V(1).Infof("SyncLoop (container unhealthy): %q", format.Pod(pod))
            handler.HandlePodSyncs([]*v1.Pod{pod})
        }
    case <-housekeepingCh:
        if !kl.sourcesReady.AllReady() {
            // If the sources aren't ready or volume manager has not yet synced the states,
            // skip housekeeping, as we may accidentally delete pods from unready sources.
            glog.V(4).Infof("SyncLoop (housekeeping, skipped): sources aren't ready yet.")
        } else {
            glog.V(4).Infof("SyncLoop (housekeeping)")
            if err := handler.HandlePodCleanups(); err != nil {
                glog.Errorf("Failed cleaning pods: %v", err)
            }
        }
    }
    return true
}
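The plegCh branch above filters events with isSyncPodWorthy, which is not shown in the listing. For reference, in this version it simply ignores ContainerRemoved events, since removing a dead container does not change pod state; roughly:

// isSyncPodWorthy filters out events that are not worthy of pod syncing
func isSyncPodWorthy(event *pleg.PodLifecycleEvent) bool {
    // ContainerRemoved doesn't affect pod state
    return event.Type != pleg.ContainerRemoved
}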
2.SyncHandler
SyncHandler is an interface that defines the handlers for the different pod operations. It is implemented by Kubelet, and its methods are called mainly from syncLoopIteration. The interface is defined as follows:
// SyncHandler is an interface implemented by Kubelet, for testability
type SyncHandler interface {
    HandlePodAdditions(pods []*v1.Pod)
    HandlePodUpdates(pods []*v1.Pod)
    HandlePodRemoves(pods []*v1.Pod)
    HandlePodReconcile(pods []*v1.Pod)
    HandlePodSyncs(pods []*v1.Pod)
    HandlePodCleanups() error
}
2.1.HandlePodAdditions
HandlePodAdditions first sorts the pods by creation time, then iterates over the pod list and performs the relevant operations on each pod.
// HandlePodAdditions is the callback in SyncHandler for pods being added from
// a config source.
func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
    start := kl.clock.Now()
    sort.Sort(sliceutils.PodsByCreationTime(pods))
    for _, pod := range pods {
        ...
    }
}
Add the pod to the pod manager.
for _, pod := range pods {
    // Responsible for checking limits in resolv.conf
    if kl.dnsConfigurer != nil && kl.dnsConfigurer.ResolverConfig != "" {
        kl.dnsConfigurer.CheckLimitsForResolvConf()
    }
    existingPods := kl.podManager.GetPods()
    // Always add the pod to the pod manager. Kubelet relies on the pod
    // manager as the source of truth for the desired state. If a pod does
    // not exist in the pod manager, it means that it has been deleted in
    // the apiserver and no action (other than cleanup) is required.
    kl.podManager.AddPod(pod)
    ...
}
If it is a mirror pod, handle it as a mirror pod.
if kubepod.IsMirrorPod(pod) {
    kl.handleMirrorPod(pod, start)
    continue
}
If the pod is not in a Terminated state, run it through admission; if it is rejected, its status is set to Failed.
if !kl.podIsTerminated(pod) {
    // Only go through the admission process if the pod is not
    // terminated.

    // We failed pods that we rejected, so activePods include all admitted
    // pods that are alive.
    activePods := kl.filterOutTerminatedPods(existingPods)

    // Check if we can admit the pod; if not, reject it.
    if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok {
        kl.rejectPod(pod, reason, message)
        continue
    }
}
Call dispatchWork, the core function invoked by the SyncHandler callbacks. It starts an asynchronous loop in a pod worker that dispatches the pod's operations; its details are analyzed further below.
mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)
kl.dispatchWork(pod, kubetypes.SyncPodCreate, mirrorPod, start)
Finally, add the pod to the probe manager.
kl.probeManager.AddPod(pod)
2.2.HandlePodUpdates
HandlePodUpdates likewise iterates over the pod list and performs the corresponding operations.
// HandlePodUpdates is the callback in the SyncHandler interface for pods
// being updated from a config source.
func (kl *Kubelet) HandlePodUpdates(pods []*v1.Pod) {
    start := kl.clock.Now()
    for _, pod := range pods {
        ...
    }
}
Update the pod in the pod manager.
for _, pod := range pods {
    // Responsible for checking limits in resolv.conf
    if kl.dnsConfigurer != nil && kl.dnsConfigurer.ResolverConfig != "" {
        kl.dnsConfigurer.CheckLimitsForResolvConf()
    }
    kl.podManager.UpdatePod(pod)
    ...
}
If it is a mirror pod, handle it as a mirror pod.
if kubepod.IsMirrorPod(pod) {
    kl.handleMirrorPod(pod, start)
    continue
}
Call dispatchWork.
// TODO: Evaluate if we need to validate and reject updates.
mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)
kl.dispatchWork(pod, kubetypes.SyncPodUpdate, mirrorPod, start)
2.3.HandlePodRemoves
HandlePodRemoves iterates over the pod list.
// HandlePodRemoves is the callback in the SyncHandler interface for pods
// being removed from a config source.
func (kl *Kubelet) HandlePodRemoves(pods []*v1.Pod) {
    start := kl.clock.Now()
    for _, pod := range pods {
        ...
    }
}
Delete the pod from the pod manager.
for _, pod := range pods {
    kl.podManager.DeletePod(pod)
    ...
}
If it is a mirror pod, handle it as a mirror pod.
if kubepod.IsMirrorPod(pod) {
    kl.handleMirrorPod(pod, start)
    continue
}
Call the kubelet's deletePod function to delete the pod.
// Deletion is allowed to fail because the periodic cleanup routine
// will trigger deletion again.
if err := kl.deletePod(pod); err != nil {
    glog.V(2).Infof("Failed to delete pod %q, err: %v", format.Pod(pod), err)
}
deletePod puts the pod to be deleted on the podKillingCh channel; the podKiller goroutine listens on that channel and performs the actual kill (a sketch of the consumer side follows the code below). The implementation is as follows:
// deletePod deletes the pod from the internal state of the kubelet by:
// 1.stopping the associated pod worker asynchronously
// 2.signaling to kill the pod by sending on the podKillingCh channel
//
// deletePod returns an error if not all sources are ready or the pod is not
// found in the runtime cache.
func (kl *Kubelet) deletePod(pod *v1.Pod) error {
    if pod == nil {
        return fmt.Errorf("deletePod does not allow nil pod")
    }
    if !kl.sourcesReady.AllReady() {
        // If the sources aren't ready, skip deletion, as we may accidentally delete pods
        // for sources that haven't reported yet.
        return fmt.Errorf("skipping delete because sources aren't ready yet")
    }
    kl.podWorkers.ForgetWorker(pod.UID)

    // Runtime cache may not have been updated to with the pod, but it's okay
    // because the periodic cleanup routine will attempt to delete again later.
    runningPods, err := kl.runtimeCache.GetPods()
    if err != nil {
        return fmt.Errorf("error listing containers: %v", err)
    }
    runningPod := kubecontainer.Pods(runningPods).FindPod("", pod.UID)
    if runningPod.IsEmpty() {
        return fmt.Errorf("pod not found")
    }
    podPair := kubecontainer.PodPair{APIPod: pod, RunningPod: &runningPod}

    kl.podKillingCh <- &podPair
    // TODO: delete the mirror pod here?

    // We leave the volume/directory cleanup to the periodic cleanup routine.
    return nil
}
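For context, the consumer side of podKillingCh is the podKiller goroutine started from the kubelet's main Run path. A simplified sketch of that loop (details may differ slightly from the v1.12 source) looks like this:

// podKiller (simplified sketch): consume pod pairs from podKillingCh and kill
// each pod in its own goroutine, skipping pods that are already being killed.
func (kl *Kubelet) podKiller() {
    killing := sets.NewString() // UIDs of pods currently being killed
    lock := sync.Mutex{}        // guards the killing set
    for podPair := range kl.podKillingCh {
        runningPod := podPair.RunningPod
        apiPod := podPair.APIPod

        lock.Lock()
        exists := killing.Has(string(runningPod.ID))
        if !exists {
            killing.Insert(string(runningPod.ID))
        }
        lock.Unlock()

        if !exists {
            go func(apiPod *v1.Pod, runningPod *kubecontainer.Pod) {
                glog.V(2).Infof("Killing unwanted pod %q", runningPod.Name)
                if err := kl.killPod(apiPod, runningPod, nil, nil); err != nil {
                    glog.Errorf("Failed killing the pod %q: %v", runningPod.Name, err)
                }
                lock.Lock()
                killing.Delete(string(runningPod.ID))
                lock.Unlock()
            }(apiPod, runningPod)
        }
    }
}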
Remove the pod from the probe manager.
kl.probeManager.RemovePod(pod)
2.4.HandlePodReconcile
Iterate over the pod list.
// HandlePodReconcile is the callback in the SyncHandler interface for pods
// that should be reconciled.
func (kl *Kubelet) HandlePodReconcile(pods []*v1.Pod) {
    start := kl.clock.Now()
    for _, pod := range pods {
        ...
    }
}
Update the pod in the pod manager.
for _, pod := range pods {
    // Update the pod in pod manager, status manager will do periodically reconcile according
    // to the pod manager.
    kl.podManager.UpdatePod(pod)
    ...
}
Reconcile the pod's Ready condition if necessary and call dispatchWork.
// Reconcile Pod "Ready" condition if necessary. Trigger sync pod for reconciliation.
if status.NeedToReconcilePodReadiness(pod) {
    mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)
    kl.dispatchWork(pod, kubetypes.SyncPodSync, mirrorPod, start)
}
If the pod has been evicted, remove the pod's dead containers.
// After an evicted pod is synced, all dead containers in the pod can be removed.
if eviction.PodIsEvicted(pod.Status) {
    if podStatus, err := kl.podCache.Get(pod.UID); err == nil {
        kl.containerDeletor.deleteContainersInPod("", podStatus, true)
    }
}
2.5.HandlePodSyncs
HandlePodSyncs is the SyncHandler callback that calls dispatchWork so the work is executed by a pod worker.
// HandlePodSyncs is the callback in the syncHandler interface for pods
// that should be dispatched to pod workers for sync.
func (kl *Kubelet) HandlePodSyncs(pods []*v1.Pod) {
    start := kl.clock.Now()
    for _, pod := range pods {
        mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)
        kl.dispatchWork(pod, kubetypes.SyncPodSync, mirrorPod, start)
    }
}
2.6.HandlePodCleanups
HandlePodCleanups performs the pod cleanup work, which covers terminating pods, orphaned pods, and so on.
First, look up the cgroups currently used by pods.
// HandlePodCleanups performs a series of cleanup work, including terminating
// pod workers, killing unwanted pods, and removing orphaned volumes/pod
// directories.
// NOTE: This function is executed by the main sync loop, so it
// should not contain any blocking calls.
func (kl *Kubelet) HandlePodCleanups() error {
    // The kubelet lacks checkpointing, so we need to introspect the set of pods
    // in the cgroup tree prior to inspecting the set of pods in our pod manager.
    // this ensures our view of the cgroup tree does not mistakenly observe pods
    // that are added after the fact...
    var (
        cgroupPods map[types.UID]cm.CgroupName
        err        error
    )
    if kl.cgroupsPerQOS {
        pcm := kl.containerManager.NewPodContainerManager()
        cgroupPods, err = pcm.GetAllPodsFromCgroups()
        if err != nil {
            return fmt.Errorf("failed to get list of pods that still exist on cgroup mounts: %v", err)
        }
    }
    ...
}
List all pods, including mirror pods.
allPods, mirrorPods := kl.podManager.GetPodsAndMirrorPods()
// Pod phase progresses monotonically. Once a pod has reached a final state,
// it should never leave regardless of the restart policy. The statuses
// of such pods should not be changed, and there is no need to sync them.
// TODO: the logic here does not handle two cases:
//   1. If the containers were removed immediately after they died, kubelet
//      may fail to generate correct statuses, let alone filtering correctly.
//   2. If kubelet restarted before writing the terminated status for a pod
//      to the apiserver, it could still restart the terminated pod (even
//      though the pod was not considered terminated by the apiserver).
// These two conditions could be alleviated by checkpointing kubelet.
activePods := kl.filterOutTerminatedPods(allPods)

desiredPods := make(map[types.UID]empty)
for _, pod := range activePods {
    desiredPods[pod.UID] = empty{}
}
Stop the pod workers of pods that no longer exist, and clean them up from the probe manager.
// Stop the workers for no-longer existing pods.
// TODO: is here the best place to forget pod workers?
kl.podWorkers.ForgetNonExistingPodWorkers(desiredPods)
kl.probeManager.CleanupPods(activePods)
Pods that need to be killed are put on the podKillingCh channel; the podKiller goroutine (see the sketch in section 2.3) listens on that channel and kills the pods it receives.
runningPods, err := kl.runtimeCache.GetPods()
if err != nil {
    glog.Errorf("Error listing containers: %#v", err)
    return err
}
for _, pod := range runningPods {
    if _, found := desiredPods[pod.ID]; !found {
        kl.podKillingCh <- &kubecontainer.PodPair{APIPod: nil, RunningPod: pod}
    }
}
When a pod is no longer bound to this node, its podStatus is removed; removeOrphanedPodStatuses ultimately calls the status manager's RemoveOrphanedStatuses method.
kl.removeOrphanedPodStatuses(allPods, mirrorPods)
Remove all orphaned volumes.
// Remove any orphaned volumes.
// Note that we pass all pods (including terminated pods) to the function,
// so that we don't remove volumes associated with terminated but not yet
// deleted pods.
err = kl.cleanupOrphanedPodDirs(allPods, runningPods)
if err != nil {
    // We want all cleanup tasks to be run even if one of them failed. So
    // we just log an error here and continue other cleanup tasks.
    // This also applies to the other clean up tasks.
    glog.Errorf("Failed cleaning up orphaned pod directories: %v", err)
}
Remove any orphaned mirror pods.
// Remove any orphaned mirror pods.
kl.podManager.DeleteOrphanedMirrorPods()
Remove the cgroups of pods that are no longer running.
// Remove any cgroups in the hierarchy for pods that are no longer running.
if kl.cgroupsPerQOS {
    kl.cleanupOrphanedPodCgroups(cgroupPods, activePods)
}
Finally, perform garbage collection (GC) on the back-off records.
kl.backOff.GC()
2.7. Summary
- The various SyncHandler handlers are invoked from syncLoopIteration.
- HandlePodAdditions, HandlePodUpdates, HandlePodReconcile, and HandlePodSyncs all end up calling dispatchWork to carry out the pod's operations.
- The pod cleanup done by HandlePodCleanups hands the pods that need to be cleaned up to podKiller over a channel.
- The handlers rely on the pod manager, probe manager, pod workers, and podKiller to do their work.
- Which handler runs is determined by the operation type (ADD, UPDATE, REMOVE, RECONCILE, etc.) carried in the PodUpdate; see syncLoopIteration above. The PodUpdate type is sketched below.
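For reference, the PodUpdate type that carries the operation type and pod list from the config sources (defined in pkg/kubelet/types) looks roughly like this:

// PodUpdate defines an operation sent on the configCh channel: the pods the
// operation applies to, the operation type, and the source of the update
// (file, http, or apiserver).
type PodUpdate struct {
    Pods   []*v1.Pod
    Op     PodOperation
    Source string
}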
3.dispatchWork
If the pod is in a Terminated state, dispatchWork runs the status manager's TerminatePod operation.
// dispatchWork starts the asynchronous sync of the pod in a pod worker.
// If the pod is terminated, dispatchWork
func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mirrorPod *v1.Pod, start time.Time) {
    if kl.podIsTerminated(pod) {
        if pod.DeletionTimestamp != nil {
            // If the pod is in a terminated state, there is no pod worker to
            // handle the work item. Check if the DeletionTimestamp has been
            // set, and force a status update to trigger a pod deletion request
            // to the apiserver.
            kl.statusManager.TerminatePod(pod)
        }
        return
    }
    ...
}
Otherwise it calls the pod worker's UpdatePod function, the pod worker's core function for executing pod operations; its logic is analyzed below.
// Run the sync in an async worker.
kl.podWorkers.UpdatePod(&UpdatePodOptions{
    Pod:        pod,
    MirrorPod:  mirrorPod,
    UpdateType: syncType,
    OnCompleteFunc: func(err error) {
        if err != nil {
            metrics.PodWorkerLatency.WithLabelValues(syncType.String()).Observe(metrics.SinceInMicroseconds(start))
        }
    },
})
When the sync type is SyncPodCreate (i.e. the pod is being created), record the number of containers in the new pod.
// Note the number of containers for new pods.
if syncType == kubetypes.SyncPodCreate {
    metrics.ContainersPerPodCount.Observe(float64(len(pod.Spec.Containers)))
}
4.PodWorkers.UpdatePod
PodWorkers is an interface type:
// PodWorkers is an abstract interface for testability.
type PodWorkers interface {
    UpdatePod(options *UpdatePodOptions)
    ForgetNonExistingPodWorkers(desiredPods map[types.UID]empty)
    ForgetWorker(uid types.UID)
}
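The concrete implementation behind this interface is the podWorkers struct. A rough sketch of its fields, which can be read off the newPodWorkers constructor shown in section 5 (the comments here are paraphrased, not copied from the source):

// podWorkers keeps one update channel and one worker goroutine per pod.
type podWorkers struct {
    podLock sync.Mutex // protects the maps below

    // per-pod channel of pending updates, consumed by managePodLoop
    podUpdates map[types.UID]chan UpdatePodOptions
    // whether a worker goroutine is currently processing an update for the pod
    isWorking map[types.UID]bool
    // the last update that arrived while the worker was still busy
    lastUndeliveredWorkUpdate map[types.UID]UpdatePodOptions

    workQueue      queue.WorkQueue
    syncPodFn      syncPodFnType // kubelet.syncPod
    recorder       record.EventRecorder
    backOffPeriod  time.Duration
    resyncInterval time.Duration
    podCache       kubecontainer.Cache
}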
UpdatePod is the core method. It passes the pod information to be processed over a per-pod podUpdates channel; for a newly seen pod, a dedicated goroutine is started to run managePodLoop.
// Apply the new setting to the specified pod.
// If the options provide an OnCompleteFunc, the function is invoked if the update is accepted.
// Update requests are ignored if a kill pod request is pending.
func (p *podWorkers) UpdatePod(options *UpdatePodOptions) {
    pod := options.Pod
    uid := pod.UID
    var podUpdates chan UpdatePodOptions
    var exists bool

    p.podLock.Lock()
    defer p.podLock.Unlock()
    if podUpdates, exists = p.podUpdates[uid]; !exists {
        // We need to have a buffer here, because checkForUpdates() method that
        // puts an update into channel is called from the same goroutine where
        // the channel is consumed. However, it is guaranteed that in such case
        // the channel is empty, so buffer of size 1 is enough.
        podUpdates = make(chan UpdatePodOptions, 1)
        p.podUpdates[uid] = podUpdates

        // Creating a new pod worker either means this is a new pod, or that the
        // kubelet just restarted. In either case the kubelet is willing to believe
        // the status of the pod for the first pod worker sync. See corresponding
        // comment in syncPod.
        go func() {
            defer runtime.HandleCrash()
            p.managePodLoop(podUpdates)
        }()
    }
    if !p.isWorking[pod.UID] {
        p.isWorking[pod.UID] = true
        podUpdates <- *options
    } else {
        // if a request to kill a pod is pending, we do not let anything overwrite that request.
        update, found := p.lastUndeliveredWorkUpdate[pod.UID]
        if !found || update.UpdateType != kubetypes.SyncPodKill {
            p.lastUndeliveredWorkUpdate[pod.UID] = *options
        }
    }
}
5.managePodLoop
managePodLoop reads from the podUpdates channel and executes syncPodFn; syncPodFn is assigned when newPodWorkers is called, and it is kubelet.syncPod.
klet.podWorkers = newPodWorkers(klet.syncPod, kubeDeps.Recorder, klet.workQueue, klet.resyncInterval, backOffPeriod, klet.podCache)
The newPodWorkers function for reference:
func newPodWorkers(syncPodFn syncPodFnType, recorder record.EventRecorder, workQueue queue.WorkQueue,
    resyncInterval, backOffPeriod time.Duration, podCache kubecontainer.Cache) *podWorkers {
    return &podWorkers{
        podUpdates:                map[types.UID]chan UpdatePodOptions{},
        isWorking:                 map[types.UID]bool{},
        lastUndeliveredWorkUpdate: map[types.UID]UpdatePodOptions{},
        syncPodFn:                 syncPodFn,
        recorder:                  recorder,
        workQueue:                 workQueue,
        resyncInterval:            resyncInterval,
        backOffPeriod:             backOffPeriod,
        podCache:                  podCache,
    }
}
The managePodLoop function for reference:
func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
    var lastSyncTime time.Time
    for update := range podUpdates {
        err := func() error {
            podUID := update.Pod.UID
            // This is a blocking call that would return only if the cache
            // has an entry for the pod that is newer than minRuntimeCache
            // Time. This ensures the worker doesn't start syncing until
            // after the cache is at least newer than the finished time of
            // the previous sync.
            status, err := p.podCache.GetNewerThan(podUID, lastSyncTime)
            if err != nil {
                // This is the legacy event thrown by manage pod loop
                // all other events are now dispatched from syncPodFn
                p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err)
                return err
            }
            err = p.syncPodFn(syncPodOptions{
                mirrorPod:      update.MirrorPod,
                pod:            update.Pod,
                podStatus:      status,
                killPodOptions: update.KillPodOptions,
                updateType:     update.UpdateType,
            })
            lastSyncTime = time.Now()
            return err
        }()
        // notify the call-back function if the operation succeeded or not
        if update.OnCompleteFunc != nil {
            update.OnCompleteFunc(err)
        }
        if err != nil {
            // IMPORTANT: we do not log errors here, the syncPodFn is responsible for logging errors
            glog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err)
        }
        p.wrapUp(update.Pod.UID, err)
    }
}
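Each iteration of managePodLoop ends with p.wrapUp, which is not shown above. Roughly, it requeues the pod into the work queue (after the resync interval on success, or the back-off period on error), so that the syncCh branch of syncLoopIteration picks the pod up again later; a sketch, with the jitter-factor constant names assumed:

// wrapUp (rough sketch): requeue the pod for a future sync and deliver any
// update that arrived while the worker was busy.
func (p *podWorkers) wrapUp(uid types.UID, syncErr error) {
    switch {
    case syncErr == nil:
        // No error; requeue at the regular resync interval.
        p.workQueue.Enqueue(uid, wait.Jitter(p.resyncInterval, workerResyncIntervalJitterFactor))
    default:
        // Error occurred during the sync; back off and then retry.
        p.workQueue.Enqueue(uid, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor))
    }
    p.checkForUpdates(uid)
}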
6.syncPod
syncPod can be thought of as the transaction script that syncs a single pod. Its input is a syncPodOptions, which records the information about the pod to be synced. It is defined as follows:
// syncPodOptions provides the arguments to a SyncPod operation.
type syncPodOptions struct {
    // the mirror pod for the pod to sync, if it is a static pod
    mirrorPod *v1.Pod
    // pod to sync
    pod *v1.Pod
    // the type of update (create, update, sync)
    updateType kubetypes.SyncPodType
    // the current status
    podStatus *kubecontainer.PodStatus
    // if update type is kill, use the specified options to kill the pod.
    killPodOptions *KillPodOptions
}
syncPod executes the following workflow:
- If the pod is being created, record the pod worker start latency.
- Call generateAPIPodStatus to produce a v1.PodStatus for the pod.
- If the pod is running for the first time, record the pod start latency.
- Update the pod's status in the status manager.
- Kill the pod if it should not be running.
- If the pod is a static pod with no corresponding mirror pod, create the mirror pod.
- Create the pod's data directories if they do not exist yet.
- Wait for volumes to attach/mount.
- Fetch the pull secrets for the pod.
- Call the container runtime's SyncPod function to perform the pod operations.
- Update the pod's ingress and egress traffic limits.
If any step in this workflow returns an error, syncPod returns that error and the whole workflow is executed again on the next syncPod run. Errors are recorded as events to make debugging easier.
The execution of syncPod is analyzed below.
6.1. SyncPodKill
First, pull the pod information out of syncPodOptions.
func (kl *Kubelet) syncPod(o syncPodOptions) error {
    // pull out the required options
    pod := o.pod
    mirrorPod := o.mirrorPod
    podStatus := o.podStatus
    updateType := o.updateType
    ...
}
If the pod is to be killed, call killPod, which kills the pod within the specified grace period.
// if we want to kill a pod, do it now!
if updateType == kubetypes.SyncPodKill {
    killPodOptions := o.killPodOptions
    if killPodOptions == nil || killPodOptions.PodStatusFunc == nil {
        return fmt.Errorf("kill pod options are required if update type is kill")
    }
    apiPodStatus := killPodOptions.PodStatusFunc(pod, podStatus)
    kl.statusManager.SetPodStatus(pod, apiPodStatus)
    // we kill the pod with the specified grace period since this is a termination
    if err := kl.killPod(pod, nil, podStatus, killPodOptions.PodTerminationGracePeriodSecondsOverride); err != nil {
        kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
        // there was an error killing the pod, so we return that error directly
        utilruntime.HandleError(err)
        return err
    }
    return nil
}
6.2. SyncPodCreate
If the pod is being created, record its start latency, measured relative to when the pod was first seen by the API server.
// Latency measurements for the main workflow are relative to the
// first time the pod was seen by the API server.
var firstSeenTime time.Time
if firstSeenTimeStr, ok := pod.Annotations[kubetypes.ConfigFirstSeenAnnotationKey]; ok {
    firstSeenTime = kubetypes.ConvertToTimestamp(firstSeenTimeStr).Get()
}

// Record pod worker start latency if being created
// TODO: make pod workers record their own latencies
if updateType == kubetypes.SyncPodCreate {
    if !firstSeenTime.IsZero() {
        // This is the first time we are syncing the pod. Record the latency
        // since kubelet first saw the pod if firstSeenTime is set.
        metrics.PodWorkerStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime))
    } else {
        glog.V(3).Infof("First seen time not recorded for pod %q", pod.UID)
    }
}
Generate the final API pod status from the pod and its runtime status, and set the pod IP.
// Generate final API pod status with pod and status manager status
apiPodStatus := kl.generateAPIPodStatus(pod, podStatus)
// The pod IP may be changed in generateAPIPodStatus if the pod is using host network. (See #24576)
// TODO(random-liu): After writing pod spec into container labels, check whether pod is using host network, and
// set pod IP to hostIP directly in runtime.GetPodStatus
podStatus.IP = apiPodStatus.PodIP
Record how long it takes for the pod to become running.
// Record the time it takes for the pod to become running.
existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID)
if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning &&
    !firstSeenTime.IsZero() {
    metrics.PodStartLatency.Observe(metrics.SinceInMicroseconds(firstSeenTime))
}
If the pod is not runnable, update the pod and container statuses with the reason.
runnable := kl.canRunPod(pod)
if !runnable.Admit {
    // Pod is not runnable; update the Pod and Container statuses to why.
    apiPodStatus.Reason = runnable.Reason
    apiPodStatus.Message = runnable.Message
    // Waiting containers are not creating.
    const waitingReason = "Blocked"
    for _, cs := range apiPodStatus.InitContainerStatuses {
        if cs.State.Waiting != nil {
            cs.State.Waiting.Reason = waitingReason
        }
    }
    for _, cs := range apiPodStatus.ContainerStatuses {
        if cs.State.Waiting != nil {
            cs.State.Waiting.Reason = waitingReason
        }
    }
}
Then update the status in the status manager, and kill the pod if it should not be running.
// Update status in the status manager
kl.statusManager.SetPodStatus(pod, apiPodStatus)

// Kill pod if it should not be running
if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed {
    var syncErr error
    if err := kl.killPod(pod, nil, podStatus, nil); err != nil {
        kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
        syncErr = fmt.Errorf("error killing pod: %v", err)
        utilruntime.HandleError(syncErr)
    } else {
        if !runnable.Admit {
            // There was no error killing the pod, but the pod cannot be run.
            // Return an error to signal that the sync loop should back off.
            syncErr = fmt.Errorf("pod cannot be run: %s", runnable.Message)
        }
    }
    return syncErr
}
If the network plugin is not yet Ready, only start the pod if it uses the host network.
// If the network plugin is not ready, only start the pod if it uses the host network
if rs := kl.runtimeState.networkErrors(); len(rs) != 0 && !kubecontainer.IsHostNetworkPod(pod) {
    kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "%s: %v", NetworkNotReadyErrorMsg, rs)
    return fmt.Errorf("%s: %v", NetworkNotReadyErrorMsg, rs)
}
6.3. Cgroups
Create cgroups for the pod and, if the cgroups-per-qos flag is enabled, apply the corresponding resource parameters to them. For a terminated pod there is no need to create or update its cgroups.
When the kubelet is restarted with cgroups-per-qos enabled, all of the pod's running containers should be killed and brought back up under the QoS cgroup hierarchy.
If the pod's cgroup already exists, or the pod is running for the first time, the pod's containers are not killed.
// Create Cgroups for the pod and apply resource parameters
// to them if cgroups-per-qos flag is enabled.
pcm := kl.containerManager.NewPodContainerManager()
// If pod has already been terminated then we need not create
// or update the pod's cgroup
if !kl.podIsTerminated(pod) {
    // When the kubelet is restarted with the cgroups-per-qos
    // flag enabled, all the pod's running containers
    // should be killed intermittently and brought back up
    // under the qos cgroup hierarchy.
    // Check if this is the pod's first sync
    firstSync := true
    for _, containerStatus := range apiPodStatus.ContainerStatuses {
        if containerStatus.State.Running != nil {
            firstSync = false
            break
        }
    }
    // Don't kill containers in pod if pod's cgroups already
    // exists or the pod is running for the first time
    podKilled := false
    if !pcm.Exists(pod) && !firstSync {
        if err := kl.killPod(pod, nil, podStatus, nil); err == nil {
            podKilled = true
        }
    }
    ...
If the pod was killed above and its restart policy is Never, do not create or update its cgroups; otherwise create and update the pod's cgroups.
    // Create and Update pod's Cgroups
    // Don't create cgroups for run once pod if it was killed above
    // The current policy is not to restart the run once pods when
    // the kubelet is restarted with the new flag as run once pods are
    // expected to run only once and if the kubelet is restarted then
    // they are not expected to run again.
    // We don't create and apply updates to cgroup if its a run once pod and was killed above
    if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) {
        if !pcm.Exists(pod) {
            if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
                glog.V(2).Infof("Failed to update QoS cgroups while syncing pod: %v", err)
            }
            if err := pcm.EnsureExists(pod); err != nil {
                kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err)
                return fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
            }
        }
    }
The QoS-level cgroups are updated through the container manager's UpdateQOSCgroups:
if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
    glog.V(2).Infof("Failed to update QoS cgroups while syncing pod: %v", err)
}
6.4. Mirror Pod
If the pod is a static pod without a corresponding mirror pod, create one; if a mirror pod exists but is out of date, delete it and recreate it.
// Create Mirror Pod for Static Pod if it doesn't already exist
if kubepod.IsStaticPod(pod) {
    podFullName := kubecontainer.GetPodFullName(pod)
    deleted := false
    if mirrorPod != nil {
        if mirrorPod.DeletionTimestamp != nil || !kl.podManager.IsMirrorPodOf(mirrorPod, pod) {
            // The mirror pod is semantically different from the static pod. Remove
            // it. The mirror pod will get recreated later.
            glog.Warningf("Deleting mirror pod %q because it is outdated", format.Pod(mirrorPod))
            if err := kl.podManager.DeleteMirrorPod(podFullName); err != nil {
                glog.Errorf("Failed deleting mirror pod %q: %v", format.Pod(mirrorPod), err)
            } else {
                deleted = true
            }
        }
    }
    if mirrorPod == nil || deleted {
        node, err := kl.GetNode()
        if err != nil || node.DeletionTimestamp != nil {
            glog.V(4).Infof("No need to create a mirror pod, since node %q has been removed from the cluster", kl.nodeName)
        } else {
            glog.V(4).Infof("Creating a mirror pod for static pod %q", format.Pod(pod))
            if err := kl.podManager.CreateMirrorPod(pod); err != nil {
                glog.Errorf("Failed creating a mirror pod for %q: %v", format.Pod(pod), err)
            }
        }
    }
}
6.5. makePodDataDirs
Create the data directories for the pod.
// Make data directories for the pod
if err := kl.makePodDataDirs(pod); err != nil {
    kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err)
    glog.Errorf("Unable to make pod data directories for pod %q: %v", format.Pod(pod), err)
    return err
}
The data directories are the pod dir, the pod volumes dir, and the pod plugins dir.
// makePodDataDirs creates the dirs for the pod datas.
func (kl *Kubelet) makePodDataDirs(pod *v1.Pod) error {
    uid := pod.UID
    if err := os.MkdirAll(kl.getPodDir(uid), 0750); err != nil && !os.IsExist(err) {
        return err
    }
    if err := os.MkdirAll(kl.getPodVolumesDir(uid), 0750); err != nil && !os.IsExist(err) {
        return err
    }
    if err := os.MkdirAll(kl.getPodPluginsDir(uid), 0750); err != nil && !os.IsExist(err) {
        return err
    }
    return nil
}
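For orientation, assuming the default kubelet root directory /var/lib/kubelet (configurable via --root-dir), these helpers resolve to per-pod paths like the following (illustrative, not part of the source listing):

/var/lib/kubelet/pods/<pod-UID>/           <- getPodDir
/var/lib/kubelet/pods/<pod-UID>/volumes/   <- getPodVolumesDir
/var/lib/kubelet/pods/<pod-UID>/plugins/   <- getPodPluginsDir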
6.6. mount volumes
For pods that are not terminated, wait for the volumes to be attached and mounted.
// Volume manager will not mount volumes for terminated pods
if !kl.podIsTerminated(pod) {
    // Wait for volumes to attach/mount
    if err := kl.volumeManager.WaitForAttachAndMount(pod); err != nil {
        kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to mount volumes for pod %q: %v", format.Pod(pod), err)
        glog.Errorf("Unable to mount volumes for pod %q: %v; skipping pod", format.Pod(pod), err)
        return err
    }
}
6.7. PullSecretsForPod
Fetch the pull secrets for the pod.
// Fetch the pull secrets for the pod
pullSecrets := kl.getPullSecretsForPod(pod)
The implementation of getPullSecretsForPod is as follows:
// getPullSecretsForPod inspects the Pod and retrieves the referenced pull
// secrets.
func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret {
    pullSecrets := []v1.Secret{}

    for _, secretRef := range pod.Spec.ImagePullSecrets {
        secret, err := kl.secretManager.GetSecret(pod.Namespace, secretRef.Name)
        if err != nil {
            glog.Warningf("Unable to retrieve pull secret %s/%s for %s/%s due to %v.The image pull may not succeed.", pod.Namespace, secretRef.Name, pod.Namespace, pod.Name, err)
            continue
        }

        pullSecrets = append(pullSecrets, *secret)
    }

    return pullSecrets
}
6.8. containerRuntime.SyncPod
Call the container runtime's SyncPod function to perform the pod operations; from here the logic of kubelet.syncPod hands over to containerRuntime.SyncPod.
// Call the container runtime's SyncPod callback
result := kl.containerRuntime.SyncPod(pod, apiPodStatus, podStatus, pullSecrets, kl.backOff)
kl.reasonCache.Update(pod.UID, result)
if err := result.Error(); err != nil {
    // Do not return error if the only failures were pods in backoff
    for _, r := range result.SyncResults {
        if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff {
            // Do not record an event here, as we keep all event logging for sync pod failures
            // local to container runtime so we get better errors
            return err
        }
    }

    return nil
}
7.Runtime.SyncPod
SyncPod syncs the running pod towards its desired state. It performs the following steps:

- Compute the sandbox and container changes.
- Kill the pod if necessary.
- Kill any containers that should no longer be running.
- Create the sandbox if necessary.
- Start the init container.
- Start the regular containers.
7.1. computePodActions
Compute the sandbox and container changes.
// Step 1: Compute sandbox and container changes.
podContainerChanges := m.computePodActions(pod, podStatus)
glog.V(3).Infof("computePodActions got %+v for pod %q", podContainerChanges, format.Pod(pod))
if podContainerChanges.CreateSandbox {
    ref, err := ref.GetReference(legacyscheme.Scheme, pod)
    if err != nil {
        glog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), err)
    }
    if podContainerChanges.SandboxID != "" {
        m.recorder.Eventf(ref, v1.EventTypeNormal, events.SandboxChanged, "Pod sandbox changed, it will be killed and re-created.")
    } else {
        glog.V(4).Infof("SyncPod received new pod %q, will create a sandbox for it", format.Pod(pod))
    }
}
7.2. killPodWithSyncResult
Kill the pod if necessary.
// Step 2: Kill the pod if the sandbox has changed.
if podContainerChanges.KillPod {
    if !podContainerChanges.CreateSandbox {
        glog.V(4).Infof("Stopping PodSandbox for %q because all other containers are dead.", format.Pod(pod))
    } else {
        glog.V(4).Infof("Stopping PodSandbox for %q, will start new one", format.Pod(pod))
    }

    killResult := m.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(m.runtimeName, podStatus), nil)
    result.AddPodSyncResult(killResult)
    if killResult.Error() != nil {
        glog.Errorf("killPodWithSyncResult failed: %v", killResult.Error())
        return
    }

    if podContainerChanges.CreateSandbox {
        m.purgeInitContainers(pod, podStatus)
    }
}
7.3. killContainer
Kill any containers that should no longer be running.
// Step 3: kill any running containers in this pod which are not to keep.
for containerID, containerInfo := range podContainerChanges.ContainersToKill {
    glog.V(3).Infof("Killing unwanted container %q(id=%q) for pod %q", containerInfo.name, containerID, format.Pod(pod))
    killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerInfo.name)
    result.AddSyncResult(killContainerResult)
    if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, nil); err != nil {
        killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
        glog.Errorf("killContainer %q(id=%q) for pod %q failed: %v", containerInfo.name, containerID, format.Pod(pod), err)
        return
    }
}
7.4. createPodSandbox
Create the sandbox if necessary.
// Step 4: Create a sandbox for the pod if necessary.
...
glog.V(4).Infof("Creating sandbox for pod %q", format.Pod(pod))
createSandboxResult := kubecontainer.NewSyncResult(kubecontainer.CreatePodSandbox, format.Pod(pod))
result.AddSyncResult(createSandboxResult)
podSandboxID, msg, err = m.createPodSandbox(pod, podContainerChanges.Attempt)
if err != nil {
    createSandboxResult.Fail(kubecontainer.ErrCreatePodSandbox, msg)
    glog.Errorf("createPodSandbox for pod %q failed: %v", format.Pod(pod), err)
    ref, referr := ref.GetReference(legacyscheme.Scheme, pod)
    if referr != nil {
        glog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr)
    }
    m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedCreatePodSandBox, "Failed create pod sandbox: %v", err)
    return
}
glog.V(4).Infof("Created PodSandbox %q for pod %q", podSandboxID, format.Pod(pod))
7.5. start init container
Start the init container.
// Step 5: start the init container.
if container := podContainerChanges.NextInitContainerToStart; container != nil {
    // Start the next init container.
    startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name)
    result.AddSyncResult(startContainerResult)
    isInBackOff, msg, err := m.doBackOff(pod, container, podStatus, backOff)
    if isInBackOff {
        startContainerResult.Fail(err, msg)
        glog.V(4).Infof("Backing Off restarting init container %+v in pod %v", container, format.Pod(pod))
        return
    }

    glog.V(4).Infof("Creating init container %+v in pod %v", container, format.Pod(pod))
    if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP, kubecontainer.ContainerTypeInit); err != nil {
        startContainerResult.Fail(err, msg)
        utilruntime.HandleError(fmt.Errorf("init container start failed: %v: %s", err, msg))
        return
    }

    // Successfully started the container; clear the entry in the failure
    glog.V(4).Infof("Completed init container %q for pod %q", container.Name, format.Pod(pod))
}
7.6. start containers
Start the regular containers.
// Step 6: start containers in podContainerChanges.ContainersToStart.
for _, idx := range podContainerChanges.ContainersToStart {
    container := &pod.Spec.Containers[idx]
    startContainerResult := kubecontainer.NewSyncResult(kubecontainer.StartContainer, container.Name)
    result.AddSyncResult(startContainerResult)

    isInBackOff, msg, err := m.doBackOff(pod, container, podStatus, backOff)
    if isInBackOff {
        startContainerResult.Fail(err, msg)
        glog.V(4).Infof("Backing Off restarting container %+v in pod %v", container, format.Pod(pod))
        continue
    }

    glog.V(4).Infof("Creating container %+v in pod %v", container, format.Pod(pod))
    if msg, err := m.startContainer(podSandboxID, podSandboxConfig, container, pod, podStatus, pullSecrets, podIP, kubecontainer.ContainerTypeRegular); err != nil {
        startContainerResult.Fail(err, msg)
        // known errors that are logged in other places are logged at higher levels here to avoid
        // repetitive log spam
        switch {
        case err == images.ErrImagePullBackOff:
            glog.V(3).Infof("container start failed: %v: %s", err, msg)
        default:
            utilruntime.HandleError(fmt.Errorf("container start failed: %v: %s", err, msg))
        }
        continue
    }
}
8.startContainer
startContainer starts a single container and, on failure, returns a message describing why it failed. It performs the following steps:

- Pull the container image.
- Create the container.
- Start the container.
- Run the post start hook.
// startContainer starts a container and returns a message indicates why it is failed on error.
// It starts the container through the following steps:
// * pull the image
// * create the container
// * start the container
// * run the post start lifecycle hooks (if applicable)
func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, container *v1.Container, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, containerType kubecontainer.ContainerType) (string, error) {
    ...
}
8.1. pull image
The EnsureImageExists method pulls the image for the specified container and returns the image reference and any error.
// Step 1: pull the image.
imageRef, msg, err := m.imagePuller.EnsureImageExists(pod, container, pullSecrets)
if err != nil {
    m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
    return msg, err
}
8.2. CreateContainer
First generate the container's *v1.ObjectReference, which carries the container's identifying information.
// Step 2: create the container.
ref, err := kubecontainer.GenerateContainerRef(pod, container)
if err != nil {
    glog.Errorf("Can't make a ref to pod %q, container %v: %v", format.Pod(pod), container.Name, err)
}
glog.V(4).Infof("Generating ref for container %s: %#v", container.Name, ref)
Compute the container's restart count; a brand-new container starts at 0.
// For a new container, the RestartCount should be 0
restartCount := 0
containerStatus := podStatus.FindContainerStatusByName(container.Name)
if containerStatus != nil {
    restartCount = containerStatus.RestartCount + 1
}
Generate the container's configuration.
containerConfig, cleanupAction, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef, containerType)
if cleanupAction != nil {
    defer cleanupAction()
}
if err != nil {
    m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
    return grpc.ErrorDesc(err), ErrCreateContainerConfig
}
Then call the runtime service's CreateContainer operation.
containerID, err := m.runtimeService.CreateContainer(podSandboxID, containerConfig, podSandboxConfig)
if err != nil {
    m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
    return grpc.ErrorDesc(err), ErrCreateContainer
}
err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
if err != nil {
    m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Internal PreStartContainer hook failed: %v", grpc.ErrorDesc(err))
    return grpc.ErrorDesc(err), ErrPreStartHook
}
m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, "Created container")

if ref != nil {
    m.containerRefManager.SetRef(kubecontainer.ContainerID{
        Type: m.runtimeName,
        ID:   containerID,
    }, ref)
}
8.3. StartContainer
Call the runtime service's StartContainer method to start the container.
// Step 3: start the container.
err = m.runtimeService.StartContainer(containerID)
if err != nil {
    m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", grpc.ErrorDesc(err))
    return grpc.ErrorDesc(err), kubecontainer.ErrRunContainer
}
m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, "Started container")

// Symlink container logs to the legacy container log location for cluster logging
// support.
// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
containerMeta := containerConfig.GetMetadata()
sandboxMeta := podSandboxConfig.GetMetadata()
legacySymlink := legacyLogSymlink(containerID, containerMeta.Name, sandboxMeta.Name, sandboxMeta.Namespace)
containerLog := filepath.Join(podSandboxConfig.LogDirectory, containerConfig.LogPath)
// only create legacy symlink if containerLog path exists (or the error is not IsNotExist).
// Because if containerLog path does not exist, only dandling legacySymlink is created.
// This dangling legacySymlink is later removed by container gc, so it does not make sense
// to create it in the first place. it happens when journald logging driver is used with docker.
if _, err := m.osInterface.Stat(containerLog); !os.IsNotExist(err) {
    if err := m.osInterface.Symlink(containerLog, legacySymlink); err != nil {
        glog.Errorf("Failed to create legacy symbolic link %q to container %q log %q: %v", legacySymlink, containerID, containerLog, err)
    }
}
8.4. execute post start hook
If Lifecycle.PostStart is specified, run the PostStart hook; if it fails, the container is killed and then restarted according to its restart policy.
// Step 4: execute the post start hook.
if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
    kubeContainerID := kubecontainer.ContainerID{
        Type: m.runtimeName,
        ID:   containerID,
    }
    msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart)
    if handlerErr != nil {
        m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
        if err := m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", nil); err != nil {
            glog.Errorf("Failed to kill container %q(id=%q) in pod %q: %v, %v",
                container.Name, kubeContainerID.String(), format.Pod(pod), ErrPostStartHook, err)
        }
        return msg, fmt.Errorf("%s: %v", ErrPostStartHook, handlerErr)
    }
}
9. Summary
The kubelet's job is to manage the lifecycle of Pods on its node (creation, deletion, updates, and so on). It does this through various managers, each working asynchronously on its own tasks, and it uses a number of channels to propagate state changes between them; one of the more important ones is podUpdates <-chan UpdatePodOptions, which delivers pod changes to the pod workers.
9.1. Call chain for creating a pod
syncLoopIteration
  --> kubetypes.ADD
  --> HandlePodAdditions(u.Pods)
  --> dispatchWork(pod, kubetypes.SyncPodCreate, mirrorPod, start)
  --> podWorkers.UpdatePod
  --> managePodLoop(podUpdates)
  --> syncPod(o syncPodOptions)
  --> containerRuntime.SyncPod
  --> startContainer
9.2. Managers involved
- podManager
- probeManager
- statusManager
- podWorkers
- containerManager
- containerRuntime
- runtimeService
9.3. Channels involved
- configCh <-chan kubetypes.PodUpdate
- syncCh <-chan time.Time
- housekeepingCh <-chan time.Time
- plegCh <-chan *pleg.PodLifecycleEvent
- podUpdates <-chan UpdatePodOptions