1. 程式人生 > >【kubernetes/k8s原始碼分析】kubelet原始碼分析之容器網路初始化原始碼分析

【kubernetes/k8s原始碼分析】kubelet原始碼分析之容器網路初始化原始碼分析

一. 網路基礎

  1.1 網路名稱空間的操作

  • 建立網路名稱空間: ip netns add <名稱>
  • 名稱空間內執行命令: ip netns exec <名稱> <命令>
  • 進入名稱空間: ip netns exec <名稱> bash

  1.2 bridge-nf-call-iptables

  資料包進入網絡卡,協議棧程式碼就能“看到”整個資料包,剩下的問題就是如何來解析和過濾的問題了

  由於網橋工作於資料鏈路層,在iptables沒有開啟 bridge-nf時,資料會直接經過網橋轉發,結果就是對FORWARD的設定失效;
centos預設不開啟 bridge-nf

啟動bridge-nf方式:編輯檔案vim /etc/sysctl.conf 新增:

net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-arptables = 1

二. docker網路基礎

  • 網路的名稱空間:Linux在網路棧中引入網路名稱空間,將獨立的網路協議棧隔離到不同的名稱空間中,彼此間無法通訊;docker利用這一特性,實現不同容器間的網路隔離。
  • Veth裝置對:Veth裝置對的引入是為了實現在不同網路名稱空間的通訊。
  • Iptables/Netfilter:Netfilter負責在核心中執行各種規則(過濾、修改、丟棄等),執行在核心模式中;Iptables模式是在使用者模式下執行的程序,負責協助維護核心中Netfilter的各種規則表;通過二者的配合來實現整個Linux網路協議棧中靈活的資料包處理機制。
  • 網橋:網橋是二層網路裝置,通過網橋可以將linux支援的不同的埠連線起來,並實現類似交換機那樣的多對多的通訊。
  • 路由:當IP層在處理資料傳送或轉發的時候,會使用路由表來決定去向

  別人特別好的圖片,引用

三. kubernetes網路基礎

kubernetes網路基礎原則

  • 每個Pod都擁有一個獨立的IP地址,所有Pod都在一個可以直接連通的、扁平的網路空間中,叢集內所有Pod可以使用Pod的IP來訪問。
  • 同一個Pod內所有的容器共享一個網路堆疊,該模型稱為IP-per-Pod模型。

kubernetes對叢集的網路要求

  • 所有容器都可以不用NAT的方式同別的容器通訊。
  • 所有節點都可以在不用NAT的方式下同所有容器通訊,反之亦然。
  • 容器的地址和別人看到的地址是同一個地址。

0. 資料流

  初始化: 

    NewMainKubelet -> NewDockerService

  SyncPod -> createPodSandbox -> RunPodSandbox -> SetUpPod

1. NewMainKubelet 函式

  路徑: pkg/kubelet/kubelet.go

--hairpin-mode string     Default: "promiscuous-bridge" How should the kubelet setup hairpin NAT. This allows endpoints of a Service to loadbalance back to themselves if they should try to access their own Service. Valid values are "promiscuous-bridge", "hairpin-veth" and "none".
	// Collect the network plugin settings for dockershim: hairpin mode,
	// non-masquerade CIDR, plugin name, CNI conf/bin directories and MTU.
	// The values come from the kubelet configuration (kubeCfg) and the
	// container runtime options (crOptions).
	pluginSettings := dockershim.NetworkPluginSettings{
		HairpinMode:        kubeletconfiginternal.HairpinMode(kubeCfg.HairpinMode),
		NonMasqueradeCIDR:  nonMasqueradeCIDR,
		PluginName:         crOptions.NetworkPluginName,
		PluginConfDir:      crOptions.CNIConfDir,
		PluginBinDirString: crOptions.CNIBinDir,
		MTU:                int(crOptions.NetworkPluginMTU),
	}

  根據containerRuntime為docker,執行NewDockerService

	switch containerRuntime {
	case kubetypes.DockerContainerRuntime:
		// Create and start the CRI shim running as a grpc server.
		streamingConfig := getStreamingConfig(kubeCfg, kubeDeps, crOptions)
		ds, err := dockershim.NewDockerService(kubeDeps.DockerClientConfig, crOptions.PodSandboxImage, streamingConfig,
			&pluginSettings, runtimeCgroups, kubeCfg.CgroupDriver, crOptions.DockershimRootDirectory, !crOptions.RedirectContainerStreaming)
		if err != nil {
			return nil, err
		}
		if crOptions.RedirectContainerStreaming {
			klet.criHandler = ds
		}

		// The unix socket for kubelet <-> dockershim communication.
		glog.V(5).Infof("RemoteRuntimeEndpoint: %q, RemoteImageEndpoint: %q",
			remoteRuntimeEndpoint,
			remoteImageEndpoint)
		glog.V(2).Infof("Starting the GRPC server for the docker CRI shim.")
		server := dockerremote.NewDockerServer(remoteRuntimeEndpoint, ds)
		if err := server.Start(); err != nil {
			return nil, err
		}

		// Create dockerLegacyService when the logging driver is not supported.
		supported, err := ds.IsCRISupportedLogDriver()
		if err != nil {
			return nil, err
		}
		if !supported {
			klet.dockerLegacyService = ds
			legacyLogProvider = ds
		}

2. NewDockerService函式

  路徑: pkg/kubelet/dockershim/docker_service.go

  初始化CNI網路外掛,本文使用calico plugin,有前面文章講解,呼叫network.InitNetworkPlugin進行初始化

  InitNetworkPlugin函式呼叫Init進行初始化網路設定

	// dockershim currently only supports CNI plugins.
	// Split the configured bin-dir string into the individual directories.
	pluginSettings.PluginBinDirs = cni.SplitDirs(pluginSettings.PluginBinDirString)
	// Probe for CNI plugins using the conf dir and bin dirs, then always
	// append kubenet to the list of candidates.
	cniPlugins := cni.ProbeNetworkPlugins(pluginSettings.PluginConfDir, pluginSettings.PluginBinDirs)
	cniPlugins = append(cniPlugins, kubenet.NewPlugin(pluginSettings.PluginBinDirs))
	// netHost bundles the namespace getter and port-mapping getter, both
	// backed by the docker service ds.
	netHost := &dockerNetworkHost{
		&namespaceGetter{ds},
		&portMappingGetter{ds},
	}
	// Initialise a plugin from the candidates using the configured plugin
	// name; a failure here aborts construction of the docker service.
	plug, err := network.InitNetworkPlugin(cniPlugins, pluginSettings.PluginName, netHost, pluginSettings.HairpinMode, pluginSettings.NonMasqueradeCIDR, pluginSettings.MTU)
	if err != nil {
		return nil, fmt.Errorf("didn't find compatible CNI plugin with given settings %+v: %v", pluginSettings, err)
	}
	// Wrap the returned plugin in a plugin manager and keep it on the service.
	ds.network = network.NewPluginManager(plug)

3. Init函式

  路徑 pkg/kubelet/dockershim/network/plugins.go

  前面講解bridge-nf-call-iptables,路徑在/proc/sys/net/bridge/bridge-nf-call-iptables

// Init prepares the host for the no-op network plugin. It tries to load the
// br-netfilter kernel module and to set the bridge-nf-call-iptables and
// bridge-nf-call-ip6tables sysctls to 1.
//
// Every step is best-effort: failures are logged as warnings and Init always
// returns nil. The host, hairpinMode, nonMasqueradeCIDR and mtu arguments are
// not used by this implementation.
func (plugin *NoopNetworkPlugin) Init(host Host, hairpinMode kubeletconfig.HairpinMode, nonMasqueradeCIDR string, mtu int) error {
	// Set bridge-nf-call-iptables=1 to maintain compatibility with older
	// kubernetes versions to ensure the iptables-based kube proxy functions
	// correctly.  Other plugins are responsible for setting this correctly
	// depending on whether or not they connect containers to Linux bridges
	// or use some other mechanism (ie, SDN vswitch).

	// Ensure the netfilter module is loaded on kernel >= 3.18; previously
	// it was built-in. A failure is not fatal (the module may be built into
	// the kernel), but it is logged instead of being silently dropped so a
	// later sysctl failure can be diagnosed.
	if out, err := utilexec.New().Command("modprobe", "br-netfilter").CombinedOutput(); err != nil {
		glog.Warningf("can't load br-netfilter module (it may be built into the kernel): %v, output: %q", err, out)
	}

	if err := plugin.Sysctl.SetSysctl(sysctlBridgeCallIPTables, 1); err != nil {
		glog.Warningf("can't set sysctl %s: %v", sysctlBridgeCallIPTables, err)
	}

	// The ip6tables sysctl is only written when it can be read and is not
	// already 1; a read error is ignored on purpose.
	if val, err := plugin.Sysctl.GetSysctl(sysctlBridgeCallIP6Tables); err == nil && val != 1 {
		if err = plugin.Sysctl.SetSysctl(sysctlBridgeCallIP6Tables, 1); err != nil {
			glog.Warningf("can't set sysctl %s: %v", sysctlBridgeCallIP6Tables, err)
		}
	}

	return nil
}

4.  cniNetworkPlugin結構體

  路徑 pkg/kubelet/dockershim/network/cni/cni.go

  如果啟動不為空的話,比如Kubelet啟動引數--network-plugin=cni,則執行這個目錄下init操作

// CNIPluginName is the name under which the CNI network plugin is known.
const CNIPluginName = "cni"

// cniNetworkPlugin is the CNI-backed network plugin. It embeds
// network.NoopNetworkPlugin and keeps the loopback and default CNI networks
// together with the host, exec and path configuration it works with.
type cniNetworkPlugin struct {
	network.NoopNetworkPlugin

	// loNetwork is the loopback CNI network. It may be nil: SetUpPod skips
	// it in that case and notes that Windows does not have one.
	loNetwork *cniNetwork

	// NOTE(review): the embedded RWMutex presumably guards defaultNetwork,
	// judging by the field grouping — confirm against getDefaultNetwork and
	// setDefaultNetwork.
	sync.RWMutex
	defaultNetwork *cniNetwork

	// host is stored by Init and used by SetUpPod to look up the network
	// namespace path of a sandbox.
	host network.Host
	// execer is used by platformInit to locate the nsenter binary.
	execer utilexec.Interface
	// nsenterPath holds the result of execer.LookPath("nsenter").
	nsenterPath string
	// confDir and binDirs are handed to getDefaultCNINetwork when the
	// network configuration is synced.
	confDir string
	binDirs []string
	podCidr string
}

5. Init函式

// Init runs the platform-specific initialisation, records the network host
// and then performs one sync of the CNI network configuration.
//
// It returns the error from platformInit, if any; otherwise nil. The
// hairpinMode, nonMasqueradeCIDR and mtu arguments are not used here.
func (plugin *cniNetworkPlugin) Init(host network.Host, hairpinMode kubeletconfig.HairpinMode, nonMasqueradeCIDR string, mtu int) error {
	if initErr := plugin.platformInit(); initErr != nil {
		return initErr
	}

	plugin.host = host
	// syncNetworkConfig reports problems through the log only, so a failed
	// sync does not fail Init.
	plugin.syncNetworkConfig()

	return nil
}

  5.1 platformInit函式

    主要使用nsenter

// platformInit looks up the nsenter binary through the plugin's execer and
// stores the path that LookPath returns in nsenterPath. The lookup error, if
// any, is returned to the caller.
func (plugin *cniNetworkPlugin) platformInit() error {
	path, lookErr := plugin.execer.LookPath("nsenter")
	// The path is stored even when the lookup fails, exactly as LookPath
	// returned it.
	plugin.nsenterPath = path
	return lookErr
}

  5.2 syncNetworkConfig函式

  讀取配置檔案比如kubelet引數路徑--cni-conf-dir=/etc/cni/net.d

// syncNetworkConfig loads the default CNI network from confDir/binDirs and
// installs it as the plugin's default network. If loading fails, a warning is
// logged and the current default network is left untouched.
func (plugin *cniNetworkPlugin) syncNetworkConfig() {
	defaultNet, loadErr := getDefaultCNINetwork(plugin.confDir, plugin.binDirs)
	if loadErr == nil {
		plugin.setDefaultNetwork(defaultNet)
		return
	}
	glog.Warningf("Unable to update cni config: %s", loadErr)
}

6. SetUpPod

 根據資料流   SyncPod -> createPodSandbox -> RunPodSandbox -> SetUpPod

 主要函式為addToNetwork

// SetUpPod attaches the pod sandbox identified by id to the loopback network
// (when one is configured) and then to the default CNI network.
//
// It returns an error if the plugin is not initialised, if the network
// namespace path of the sandbox cannot be retrieved, or if either
// addToNetwork call fails.
func (plugin *cniNetworkPlugin) SetUpPod(namespace string, name string, id kubecontainer.ContainerID, annotations map[string]string) error {
	if initErr := plugin.checkInitialized(); initErr != nil {
		return initErr
	}

	netnsPath, nsErr := plugin.host.GetNetNS(id.ID)
	if nsErr != nil {
		return fmt.Errorf("CNI failed to retrieve network namespace path: %v", nsErr)
	}

	// Windows doesn't have loNetwork. It comes only with Linux
	if lo := plugin.loNetwork; lo != nil {
		if _, loErr := plugin.addToNetwork(lo, name, namespace, id, netnsPath, annotations); loErr != nil {
			glog.Errorf("Error while adding to cni lo network: %s", loErr)
			return loErr
		}
	}

	if _, addErr := plugin.addToNetwork(plugin.getDefaultNetwork(), name, namespace, id, netnsPath, annotations); addErr != nil {
		glog.Errorf("Error while adding to cni network: %s", addErr)
		return addErr
	}

	return nil
}

7. addToNetwork函式

  發現最終呼叫AddNetworkList函式,這個就是根據具體的網路外掛進行呼叫

// addToNetwork builds the CNI runtime configuration for the pod sandbox and
// calls AddNetworkList on the given network's CNI config with the network's
// plugin list.
//
// It returns the result from AddNetworkList, or a nil result and the error
// when building the runtime configuration or adding the network fails. Both
// failures are also logged.
func (plugin *cniNetworkPlugin) addToNetwork(network *cniNetwork, podName string, podNamespace string, podSandboxID kubecontainer.ContainerID, podNetnsPath string, annotations map[string]string) (cnitypes.Result, error) {
	runtimeConf, err := plugin.buildCNIRuntimeConf(podName, podNamespace, podSandboxID, podNetnsPath, annotations)
	if err != nil {
		glog.Errorf("Error adding network when building cni runtime conf: %v", err)
		return nil, err
	}

	confList := network.NetworkConfig
	// The log line reports the type of the first plugin in the list.
	glog.V(4).Infof("About to add CNI network %v (type=%v)", confList.Name, confList.Plugins[0].Network.Type)

	result, err := network.CNIConfig.AddNetworkList(confList, runtimeConf)
	if err != nil {
		glog.Errorf("Error adding network: %v", err)
		return nil, err
	}
	return result, nil
}

8. CNI介面

  各種外掛主要就是實現了新增(Add)與刪除(Del)兩類方法

// CNI is the interface through which networks are added and deleted. Each
// operation exists in two forms: one taking a plugin list configuration
// (NetworkConfigList) and one taking a single NetworkConfig. Both forms also
// take the RuntimeConf to operate with.
type CNI interface {
	// AddNetworkList takes a plugin list configuration and returns the
	// resulting types.Result or an error.
	AddNetworkList(list *NetworkConfigList, rt *RuntimeConf) (types.Result, error)
	// DelNetworkList is the delete counterpart of AddNetworkList.
	DelNetworkList(list *NetworkConfigList, rt *RuntimeConf) error

	// AddNetwork takes a single network configuration and returns the
	// resulting types.Result or an error.
	AddNetwork(conf *NetworkConfig, rt *RuntimeConf) (types.Result, error)
	// DelNetwork is the delete counterpart of AddNetwork.
	DelNetwork(conf *NetworkConfig, rt *RuntimeConf) error
}

9. 例如使用calico外掛

{
    "name": "calico-k8s-network",
    "cniVersion": "0.1.0",
    "type": "calico",
    "etcd_endpoints": "XXXXXX",
    "etcd_key_file": "/etc/calico/ssl/calico-key.pem",
    "etcd_cert_file": "/etc/calico/ssl/calico.pem",
    "etcd_ca_cert_file": "/etc/calico/ssl/ca.pem",
    "log_level": "info",
    "mtu": 1500,
    "ipam": {
        "type": "calico-ipam"
    },
    "policy": {
        "type": "k8s"
    },
    "kubernetes": {
        "kubeconfig": "/root/.kube/config"
    }
}