k8s proxy

K8S的proxy在每个node上运行,提供代理服务。它有两种模式:用户态的userspace模式和内核态的iptables模式。proxy通过reflector函数监控service和endpoint变化,更新iptables规则。服务或端点的增删会触发iptables同步。此外,系统每30秒会自动调用函数进行iptables同步。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

K8S 的 proxy 运行在每一个 node 上面,主要起到代理作用,分为两种模式:

一种是用户态的 userspace 模式;

一种是内核态的 iptables 模式。

通过调用reflector函数进行watch,watch service和endpoint资源:如发现有新建或者变动,则调用proxier.syncProxyRules()进行iptables的同步,并存储service;endpoint也是如此,同步iptables并存储endpoint。

运行以后每隔30s,都会调用函数同步iptable

s.Proxier.SyncLoop();
// SyncLoop runs periodic work.  This is expected to run as a goroutine or
// as the main loop of the app.  It does not return: it blocks forever,
// waking up once per proxier.syncPeriod to resync the iptables rules.
func (proxier *Proxier) SyncLoop() {
   ticker := time.NewTicker(proxier.syncPeriod)
   defer ticker.Stop()
   // The ticker channel is never closed, so this loop never exits.
   for range ticker.C {
      glog.V(6).Infof("Periodic sync")
      proxier.Sync()
   }
}
// Sync regenerates the proxier's iptables rules immediately.  It is the
// entry point used both by the periodic SyncLoop ticker and by on-demand
// resyncs; proxier.mu serializes those callers, since syncProxyRules
// itself assumes the lock is already held.
func (proxier *Proxier) Sync() {
   proxier.mu.Lock()
   defer proxier.mu.Unlock()
   proxier.syncProxyRules()
}

 

// execConntrackTool executes the conntrack tool using the given parameters.
// It looks up the "conntrack" binary on PATH via the proxier's exec
// interface, runs it, and returns an error (including the command's
// combined stdout/stderr output) when lookup or execution fails.
func (proxier *Proxier) execConntrackTool(parameters ...string) error {
   conntrackPath, err := proxier.exec.LookPath("conntrack")
   if err != nil {
      // Error strings are lowercase per Go convention (they get wrapped
      // by callers and should read naturally mid-sentence).
      return fmt.Errorf("error looking for path of conntrack: %v", err)
   }
   output, err := proxier.exec.Command(conntrackPath, parameters...).CombinedOutput()
   if err != nil {
      return fmt.Errorf("conntrack command returned: %q, error message: %s", string(output), err)
   }
   return nil
}

// This is where all of the iptables-save/restore calls happen.
// The only other iptables rules are those that are setup in iptablesInit()
// assumes proxier.mu is held
//
// Overall flow:
//  1. ensure the top-level KUBE-* chains exist and are jumped to from the
//     built-in chains (via individual iptables calls),
//  2. capture the current chain lines via iptables-save so packet/byte
//     counters on existing chains can be preserved,
//  3. build complete replacement *filter and *nat table inputs in memory:
//     one KUBE-SVC-* chain per service port and one KUBE-SEP-* chain per
//     endpoint, plus firewall (KUBE-FW-*) chains for load-balancer IPs,
//  4. apply everything atomically with a single iptables-restore
//     (NoFlushTables, so non-kubernetes chains are untouched).
func (proxier *Proxier) syncProxyRules() {
   start := time.Now()
   defer func() {
      glog.V(4).Infof("syncProxyRules took %v", time.Since(start))
   }()
   // don't sync rules till we've received services and endpoints
   if !proxier.haveReceivedEndpointsUpdate || !proxier.haveReceivedServiceUpdate {
      glog.V(2).Info("Not syncing iptables until Services and Endpoints have been received from master")
      return
   }
   glog.V(3).Infof("Syncing iptables rules")

   // Create and link the kube services chain.
   {
      tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT}
      for _, table := range tablesNeedServicesChain {
         if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil {
            glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err)
            return
         }
      }

      // Jump to the services chain from OUTPUT (locally-generated traffic)
      // and PREROUTING (incoming traffic).
      tableChainsNeedJumpServices := []struct {
         table utiliptables.Table
         chain utiliptables.Chain
      }{
         {utiliptables.TableFilter, utiliptables.ChainOutput},
         {utiliptables.TableNAT, utiliptables.ChainOutput},
         {utiliptables.TableNAT, utiliptables.ChainPrerouting},
      }
      comment := "kubernetes service portals"
      args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)}
      for _, tc := range tableChainsNeedJumpServices {
         if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil {
            glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err)
            return
         }
      }
   }

   // Create and link the kube postrouting chain.
   {
      if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil {
         glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err)
         return
      }

      comment := "kubernetes postrouting rules"
      args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)}
      if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
         glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err)
         return
      }
   }

   // Get iptables-save output so we can check for existing chains and rules.
   // This will be a map of chain name to chain with rules as stored in iptables-save/iptables-restore
   // NOTE: on iptables-save failure we log and continue with an empty map,
   // which means all chains look "new" and are synced from scratch.
   existingFilterChains := make(map[utiliptables.Chain]string)
   iptablesSaveRaw, err := proxier.iptables.Save(utiliptables.TableFilter)
   if err != nil { // if we failed to get any rules
      glog.Errorf("Failed to execute iptables-save, syncing all rules: %v", err)
   } else { // otherwise parse the output
      existingFilterChains = utiliptables.GetChainLines(utiliptables.TableFilter, iptablesSaveRaw)
   }

   existingNATChains := make(map[utiliptables.Chain]string)
   iptablesSaveRaw, err = proxier.iptables.Save(utiliptables.TableNAT)
   if err != nil { // if we failed to get any rules
      glog.Errorf("Failed to execute iptables-save, syncing all rules: %v", err)
   } else { // otherwise parse the output
      existingNATChains = utiliptables.GetChainLines(utiliptables.TableNAT, iptablesSaveRaw)
   }

   // Buffers accumulating the iptables-restore input: chain declarations
   // and rules are kept separate per table and concatenated at the end.
   filterChains := bytes.NewBuffer(nil)
   filterRules := bytes.NewBuffer(nil)
   natChains := bytes.NewBuffer(nil)
   natRules := bytes.NewBuffer(nil)

   // Write table headers.
   writeLine(filterChains, "*filter")
   writeLine(natChains, "*nat")

   // Make sure we keep stats for the top-level chains, if they existed
   // (which most should have because we created them above).
   // Reusing the saved chain line (rather than MakeChainLine) preserves
   // the existing packet/byte counters under RestoreCounters.
   if chain, ok := existingFilterChains[kubeServicesChain]; ok {
      writeLine(filterChains, chain)
   } else {
      writeLine(filterChains, utiliptables.MakeChainLine(kubeServicesChain))
   }
   if chain, ok := existingNATChains[kubeServicesChain]; ok {
      writeLine(natChains, chain)
   } else {
      writeLine(natChains, utiliptables.MakeChainLine(kubeServicesChain))
   }
   if chain, ok := existingNATChains[kubeNodePortsChain]; ok {
      writeLine(natChains, chain)
   } else {
      writeLine(natChains, utiliptables.MakeChainLine(kubeNodePortsChain))
   }
   if chain, ok := existingNATChains[kubePostroutingChain]; ok {
      writeLine(natChains, chain)
   } else {
      writeLine(natChains, utiliptables.MakeChainLine(kubePostroutingChain))
   }
   if chain, ok := existingNATChains[KubeMarkMasqChain]; ok {
      writeLine(natChains, chain)
   } else {
      writeLine(natChains, utiliptables.MakeChainLine(KubeMarkMasqChain))
   }

   // Install the kubernetes-specific postrouting rules. We use a whole chain for
   // this so that it is easier to flush and change, for example if the mark
   // value should ever change.
   writeLine(natRules, []string{
      "-A", string(kubePostroutingChain),
      "-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`,
      "-m", "mark", "--mark", proxier.masqueradeMark,
      "-j", "MASQUERADE",
   }...)

   // Install the kubernetes-specific masquerade mark rule. We use a whole chain for
   // this so that it is easier to flush and change, for example if the mark
   // value should ever change.
   writeLine(natRules, []string{
      "-A", string(KubeMarkMasqChain),
      "-j", "MARK", "--set-xmark", proxier.masqueradeMark,
   }...)

   // Accumulate NAT chains to keep.
   activeNATChains := map[utiliptables.Chain]bool{} // use a map as a set

   // Accumulate the set of local ports that we will be holding open once this update is complete
   replacementPortsMap := map[localPort]closeable{}

   // Build rules for each service.
   for svcName, svcInfo := range proxier.serviceMap {
      protocol := strings.ToLower(string(svcInfo.protocol))

      // Create the per-service chain, retaining counters if possible.
      svcChain := servicePortChainName(svcName, protocol)
      if chain, ok := existingNATChains[svcChain]; ok {
         writeLine(natChains, chain)
      } else {
         writeLine(natChains, utiliptables.MakeChainLine(svcChain))
      }
      activeNATChains[svcChain] = true

      // Capture the clusterIP.
      // NOTE: the repeated append(args, ...) calls below may share args'
      // backing array, so a later append can overwrite an earlier one's
      // extension.  This is safe only because writeLine copies the joined
      // line into the buffer before the next append happens.
      args := []string{
         "-A", string(kubeServicesChain),
         "-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcName.String()),
         "-m", protocol, "-p", protocol,
         "-d", fmt.Sprintf("%s/32", svcInfo.clusterIP.String()),
         "--dport", fmt.Sprintf("%d", svcInfo.port),
      }
      if proxier.masqueradeAll {
         writeLine(natRules, append(args, "-j", string(KubeMarkMasqChain))...)
      }
      // "! -s" is a single element here; writeLine joins elements with
      // spaces, so the restore input still reads `! -s <cidr>` correctly.
      if len(proxier.clusterCIDR) > 0 {
         writeLine(natRules, append(args, "! -s", proxier.clusterCIDR, "-j", string(KubeMarkMasqChain))...)
      }
      writeLine(natRules, append(args, "-j", string(svcChain))...)

      // Capture externalIPs.
      for _, externalIP := range svcInfo.externalIPs {
         // If the "external" IP happens to be an IP that is local to this
         // machine, hold the local port open so no other process can open it
         // (because the socket might open but it would never work).
         if local, err := isLocalIP(externalIP); err != nil {
            glog.Errorf("can't determine if IP is local, assuming not: %v", err)
         } else if local {
            lp := localPort{
               desc:     "externalIP for " + svcName.String(),
               ip:       externalIP,
               port:     svcInfo.port,
               protocol: protocol,
            }
            if proxier.portsMap[lp] != nil {
               glog.V(4).Infof("Port %s was open before and is still needed", lp.String())
               replacementPortsMap[lp] = proxier.portsMap[lp]
            } else {
               socket, err := openLocalPort(&lp)
               if err != nil {
                  glog.Errorf("can't open %s, skipping this externalIP: %v", lp.String(), err)
                  continue
               }
               replacementPortsMap[lp] = socket
            }
         } // We're holding the port, so it's OK to install iptables rules.
         args := []string{
            "-A", string(kubeServicesChain),
            "-m", "comment", "--comment", fmt.Sprintf(`"%s external IP"`, svcName.String()),
            "-m", protocol, "-p", protocol,
            "-d", fmt.Sprintf("%s/32", externalIP),
            "--dport", fmt.Sprintf("%d", svcInfo.port),
         }
         // We have to SNAT packets to external IPs.
         writeLine(natRules, append(args, "-j", string(KubeMarkMasqChain))...)

         // Allow traffic for external IPs that does not come from a bridge (i.e. not from a container)
         // nor from a local process to be forwarded to the service.
         // This rule roughly translates to "all traffic from off-machine".
         // This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later.
         externalTrafficOnlyArgs := append(args,
            "-m", "physdev", "!", "--physdev-is-in",
            "-m", "addrtype", "!", "--src-type", "LOCAL")
         writeLine(natRules, append(externalTrafficOnlyArgs, "-j", string(svcChain))...)
         dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL")
         // Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local.
         // This covers cases like GCE load-balancers which get added to the local routing table.
         writeLine(natRules, append(dstLocalOnlyArgs, "-j", string(svcChain))...)
      }

      // Capture load-balancer ingress.
      for _, ingress := range svcInfo.loadBalancerStatus.Ingress {
         if ingress.IP != "" {
            // create service firewall chain
            fwChain := serviceFirewallChainName(svcName, protocol)
            if chain, ok := existingNATChains[fwChain]; ok {
               writeLine(natChains, chain)
            } else {
               writeLine(natChains, utiliptables.MakeChainLine(fwChain))
            }
            activeNATChains[fwChain] = true
            // The service firewall rules are created based on ServiceSpec.loadBalancerSourceRanges field.
            // This currently works for loadbalancers that preserves source ips.
            // For loadbalancers which direct traffic to service NodePort, the firewall rules will not apply.

            args := []string{
               "-A", string(kubeServicesChain),
               "-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcName.String()),
               "-m", protocol, "-p", protocol,
               "-d", fmt.Sprintf("%s/32", ingress.IP),
               "--dport", fmt.Sprintf("%d", svcInfo.port),
            }
            // jump to service firewall chain
            writeLine(natRules, append(args, "-j", string(fwChain))...)

            args = []string{
               "-A", string(fwChain),
               "-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcName.String()),
            }
            // We have to SNAT packets from external IPs.
            writeLine(natRules, append(args, "-j", string(KubeMarkMasqChain))...)

            if len(svcInfo.loadBalancerSourceRanges) == 0 {
               // allow all sources, so jump directly to KUBE-SVC chain
               writeLine(natRules, append(args, "-j", string(svcChain))...)
            } else {
               // firewall filter based on each source range
               allowFromNode := false
               for _, src := range svcInfo.loadBalancerSourceRanges {
                  writeLine(natRules, append(args, "-s", src, "-j", string(svcChain))...)
                  // ignore error because it has been validated
                  _, cidr, _ := net.ParseCIDR(src)
                  if cidr.Contains(proxier.nodeIP) {
                     allowFromNode = true
                  }
               }
               // generally, ip route rule was added to intercept request to loadbalancer vip from the
               // loadbalancer's backend hosts. In this case, request will not hit the loadbalancer but loop back directly.
               // Need to add the following rule to allow request on host.
               if allowFromNode {
                  writeLine(natRules, append(args, "-s", fmt.Sprintf("%s/32", ingress.IP), "-j", string(svcChain))...)
               }
            }

            // If the packet was able to reach the end of firewall chain, then it did not get DNATed.
            // It means the packet cannot go thru the firewall, then mark it for DROP
            writeLine(natRules, append(args, "-j", string(KubeMarkDropChain))...)
         }
      }

      // Capture nodeports.  If we had more than 2 rules it might be
      // worthwhile to make a new per-service chain for nodeport rules, but
      // with just 2 rules it ends up being a waste and a cognitive burden.
      if svcInfo.nodePort != 0 {
         // Hold the local port open so no other process can open it
         // (because the socket might open but it would never work).
         lp := localPort{
            desc:     "nodePort for " + svcName.String(),
            ip:       "",
            port:     svcInfo.nodePort,
            protocol: protocol,
         }
         if proxier.portsMap[lp] != nil {
            glog.V(4).Infof("Port %s was open before and is still needed", lp.String())
            replacementPortsMap[lp] = proxier.portsMap[lp]
         } else {
            socket, err := openLocalPort(&lp)
            if err != nil {
               glog.Errorf("can't open %s, skipping this nodePort: %v", lp.String(), err)
               continue
            }
            replacementPortsMap[lp] = socket
         } // We're holding the port, so it's OK to install iptables rules.

         args := []string{
            "-A", string(kubeNodePortsChain),
            "-m", "comment", "--comment", svcName.String(),
            "-m", protocol, "-p", protocol,
            "--dport", fmt.Sprintf("%d", svcInfo.nodePort),
         }
         // Nodeports need SNAT.
         writeLine(natRules, append(args, "-j", string(KubeMarkMasqChain))...)
         // Jump to the service chain.
         writeLine(natRules, append(args, "-j", string(svcChain))...)
      }

      // If the service has no endpoints then reject packets.
      if len(proxier.endpointsMap[svcName]) == 0 {
         writeLine(filterRules,
            "-A", string(kubeServicesChain),
            "-m", "comment", "--comment", fmt.Sprintf(`"%s has no endpoints"`, svcName.String()),
            "-m", protocol, "-p", protocol,
            "-d", fmt.Sprintf("%s/32", svcInfo.clusterIP.String()),
            "--dport", fmt.Sprintf("%d", svcInfo.port),
            "-j", "REJECT",
         )
         continue
      }

      // Generate the per-endpoint chains.  We do this in multiple passes so we
      // can group rules together.
      endpoints := make([]string, 0)
      endpointChains := make([]utiliptables.Chain, 0)
      for _, ep := range proxier.endpointsMap[svcName] {
         endpoints = append(endpoints, ep)
         endpointChain := servicePortEndpointChainName(svcName, protocol, ep)
         endpointChains = append(endpointChains, endpointChain)

         // Create the endpoint chain, retaining counters if possible.
         if chain, ok := existingNATChains[utiliptables.Chain(endpointChain)]; ok {
            writeLine(natChains, chain)
         } else {
            writeLine(natChains, utiliptables.MakeChainLine(endpointChain))
         }
         activeNATChains[endpointChain] = true
      }

      // First write session affinity rules, if applicable.
      // A client that recently hit an endpoint (within stickyMaxAgeSeconds)
      // is routed straight back to that endpoint, bypassing the
      // probabilistic load-balancing rules below.
      if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP {
         for _, endpointChain := range endpointChains {
            writeLine(natRules,
               "-A", string(svcChain),
               "-m", "comment", "--comment", svcName.String(),
               "-m", "recent", "--name", string(endpointChain),
               "--rcheck", "--seconds", fmt.Sprintf("%d", svcInfo.stickyMaxAgeSeconds), "--reap",
               "-j", string(endpointChain))
         }
      }

      // Now write loadbalancing & DNAT rules.
      n := len(endpointChains)
      for i, endpointChain := range endpointChains {
         // Balancing rules in the per-service chain.
         args := []string{
            "-A", string(svcChain),
            "-m", "comment", "--comment", svcName.String(),
         }
         if i < (n - 1) {
            // Each rule is a probabilistic match.
            // Probability 1/(n-i) at each step gives every endpoint an
            // equal 1/n chance overall of being selected.
            args = append(args,
               "-m", "statistic",
               "--mode", "random",
               "--probability", fmt.Sprintf("%0.5f", 1.0/float64(n-i)))
         }
         // The final (or only if n == 1) rule is a guaranteed match.
         args = append(args, "-j", string(endpointChain))
         writeLine(natRules, args...)

         // Rules in the per-endpoint chain.
         args = []string{
            "-A", string(endpointChain),
            "-m", "comment", "--comment", svcName.String(),
         }
         // Handle traffic that loops back to the originator with SNAT.
         // Technically we only need to do this if the endpoint is on this
         // host, but we don't have that information, so we just do this for
         // all endpoints.
         // TODO: if we grow logic to get this node's pod CIDR, we can use it.
         // NOTE(review): endpoints[i] looks like "IP:port" and the split
         // takes the IP; this assumes IPv4 (an IPv6 address contains ':').
         writeLine(natRules, append(args,
            "-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i], ":")[0]),
            "-j", string(KubeMarkMasqChain))...)

         // Update client-affinity lists.
         if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP {
            args = append(args, "-m", "recent", "--name", string(endpointChain), "--set")
         }
         // DNAT to final destination.
         args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", endpoints[i])
         writeLine(natRules, args...)
      }
   }

   // Delete chains no longer in use.
   for chain := range existingNATChains {
      if !activeNATChains[chain] {
         chainString := string(chain)
         if !strings.HasPrefix(chainString, "KUBE-SVC-") && !strings.HasPrefix(chainString, "KUBE-SEP-") && !strings.HasPrefix(chainString, "KUBE-FW-") {
            // Ignore chains that aren't ours.
            continue
         }
         // We must (as per iptables) write a chain-line for it, which has
         // the nice effect of flushing the chain.  Then we can remove the
         // chain.
         writeLine(natChains, existingNATChains[chain])
         writeLine(natRules, "-X", chainString)
      }
   }

   // Finally, tail-call to the nodeports chain.  This needs to be after all
   // other service portal rules.
   writeLine(natRules,
      "-A", string(kubeServicesChain),
      "-m", "comment", "--comment", `"kubernetes service nodeports; NOTE: this must be the last rule in this chain"`,
      "-m", "addrtype", "--dst-type", "LOCAL",
      "-j", string(kubeNodePortsChain))

   // Write the end-of-table markers.
   writeLine(filterRules, "COMMIT")
   writeLine(natRules, "COMMIT")

   // Sync rules.
   // NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table.
   // Appending to the []byte returned by Bytes() is acceptable here only
   // because the buffers are not used again after this point.
   filterLines := append(filterChains.Bytes(), filterRules.Bytes()...)
   natLines := append(natChains.Bytes(), natRules.Bytes()...)
   lines := append(filterLines, natLines...)

   glog.V(3).Infof("Restoring iptables rules: %s", lines)
   err = proxier.iptables.RestoreAll(lines, utiliptables.NoFlushTables, utiliptables.RestoreCounters)
   if err != nil {
      glog.Errorf("Failed to execute iptables-restore: %v", err)
      // Revert new local ports.
      revertPorts(replacementPortsMap, proxier.portsMap)
      return
   }

   // Close old local ports and save new ones.
   for k, v := range proxier.portsMap {
      if replacementPortsMap[k] == nil {
         v.Close()
      }
   }
   proxier.portsMap = replacementPortsMap

   // Clean up the older SNAT rule which was directly in POSTROUTING.
   // TODO(thockin): Remove this for v1.3 or v1.4.
   args := []string{
      "-m", "comment", "--comment", "kubernetes service traffic requiring SNAT",
      "-m", "mark", "--mark", oldIptablesMasqueradeMark,
      "-j", "MASQUERADE",
   }
   if err := proxier.iptables.DeleteRule(utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
      if !utiliptables.IsNotFoundError(err) {
         glog.Errorf("Error removing old-style SNAT rule: %v", err)
      }
   }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值