root@rke2-cilium-01:~# ip link show | grep -E "cilium|lxc_health"
3: cilium_net@cilium_host: <BROADCAST,MULTICAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
4: cilium_host@cilium_net: <BROADCAST,MULTICAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
7: cilium_vxlan: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000
71: lxc_health@if70: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
Note that the underlying virtualization platform may use the same port for its own VXLAN traffic (Sangfor, for example). When the ports conflict, the platform drops packets on port 8472 instead of forwarding them, in order to prevent VXLAN loops (a packet leaving through the business-facing interface and coming back in through the VXLAN interface), and cross-node Pod communication fails as a result.
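A quick way to check for this is to capture on the node uplink while a cross-node ping runs; if the encapsulated packets leave the source node but never show up on the destination node, something in between is dropping UDP 8472. A minimal sketch (ens34 is this lab's uplink interface, adjust as needed):

# run on both nodes while pinging a Pod on the other node
tcpdump -ni ens34 udp port 8472

If the platform side cannot be changed, recent Cilium releases also allow moving the VXLAN destination port (the tunnel-port / Helm tunnelPort setting); treat this as something to verify against the documentation for your Cilium version.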
root@rke2-cilium-01:~# kubectl get pod -owide
NAME                    READY   STATUS    RESTARTS   AGE     IP            NODE             NOMINATED NODE   READINESS GATES
busybox-9pzgx           1/1     Running   0          3h10m   10.42.1.64    rke2-cilium-02   <none>           <none>
busybox-s2q6k           1/1     Running   0          3h10m   10.42.0.97    rke2-cilium-01   <none>           <none>
nginx-596795d4c-b94m5   1/1     Running   0          3h17m   10.42.1.145   rke2-cilium-02   <none>           <none>
nginx-596795d4c-zw7x6   1/1     Running   0          3h17m   10.42.0.182   rke2-cilium-01   <none>           <none>
The topology is as follows:
As with Calico's VXLAN mode, each Pod has a corresponding interface on the host; the Pod's eth0 and that host interface form a veth pair.
The peer interface can be found as follows:
root@rke2-cilium-01:~# kubectl exec -it busybox-s2q6k -- ip a
...
74: eth0@if75: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1500 qdisc noqueue qlen 1000
    link/ether 1e:a1:7a:8e:6b:f4 brd ff:ff:ff:ff:ff:ff
    inet 10.42.0.97/32 scope global eth0
       valid_lft forever preferred_lft forever
    inet6 fe80::1ca1:7aff:fe8e:6bf4/64 scope link
       valid_lft forever preferred_lft forever
The interface inside the Pod is eth0@if75, so we just look for the interface with index 75 on the host; the corresponding device is lxcee24765fd490:
root@rke2-cilium-01:~# ip link show | grep 75
75: lxcee24765fd490@if74: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether ee:bc:36:fb:d3:05 brd ff:ff:ff:ff:ff:ff link-netns cni-98efb29a-479a-8587-9bea-711b56275933
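An equivalent shortcut, assuming /sys is readable inside the Pod, is to print the peer interface index directly and then match it on the host:

# inside the Pod: a veth device exposes its peer's ifindex via iflink
kubectl exec -it busybox-s2q6k -- cat /sys/class/net/eth0/iflink    # prints 75 here
# on the host: show only the interface with that index
ip -o link | grep '^75:'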
If the Pod image does not ship commands like ip, you can instead get the container's PID and enter its network namespace with nsenter:
root@rke2-cilium-01:~# nerdctl ps | grep nginx
00b34d4b9161    harbor.warnerchen.com/library/nginx:mainline    "/docker-entrypoint.…"    3 hours ago    Up    k8s://default/nginx-596795d4c-zw7x6/nginx
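The nsenter step itself is not shown in the transcript; a minimal sketch, assuming nerdctl's docker-compatible inspect output exposes the process PID, looks like this:

# resolve the container PID from the ID printed by `nerdctl ps`,
# then run `ip` inside only that container's network namespace
PID=$(nerdctl inspect --format '{{.State.Pid}}' 00b34d4b9161)
nsenter -t "$PID" -n ip -4 a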
root@rke2-cilium-01:~# ip -4 a
...
72: eth0@if73: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link-netnsid 0
    inet 10.42.0.182/32 scope global eth0
       valid_lft forever preferred_lft forever

root@rke2-cilium-01:~# ip link show | grep 73
73: lxca72683320161@if72: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
Same-Node Pod Communication
When a Pod is started with an address from the container CIDR, Cilium assigns the address and configures its routes. The default route points to the host's cilium_host interface, and the ARP table is initially empty:
root@rke2-cilium-01:~# kubectl exec -it busybox-s2q6k -- ip -4 a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue qlen 1000
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
74: eth0@if75: <BROADCAST,MULTICAST,UP,LOWER_UP,M-DOWN> mtu 1500 qdisc noqueue qlen 1000
    inet 10.42.0.97/32 scope global eth0
       valid_lft forever preferred_lft forever
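The default route mentioned above can be checked the same way; the output shape below is illustrative for this environment (exact attributes may differ):

kubectl exec -it busybox-s2q6k -- ip route
# illustrative shape: the default route points at cilium_host's address
#   default via 10.42.0.26 dev eth0
#   10.42.0.26 dev eth0 scope link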
root@rke2-cilium-01:~# kubectl exec -it busybox-s2q6k -- ping -c 3 10.42.0.182
PING 10.42.0.182 (10.42.0.182): 56 data bytes
64 bytes from 10.42.0.182: seq=0 ttl=63 time=0.146 ms
64 bytes from 10.42.0.182: seq=1 ttl=63 time=0.102 ms
64 bytes from 10.42.0.182: seq=2 ttl=63 time=0.115 ms

--- 10.42.0.182 ping statistics ---
3 packets transmitted, 3 packets received, 0% packet loss
round-trip min/avg/max = 0.102/0.121/0.146 ms
busybox-s2q6k has IP 10.42.0.97 and nginx-596795d4c-zw7x6 has IP 10.42.0.182. When busybox-s2q6k sends an ICMP request to nginx-596795d4c-zw7x6, the routing lookup inside the Pod returns 10.42.0.26 as the next hop, i.e. the host's cilium_host interface. The Pod then broadcasts an ARP request to resolve that IP's MAC address, but cilium_host is in NOARP state and will not answer ARP requests.
Instead, Cilium answers with the MAC address of the Pod's veth-pair peer device, so busybox-s2q6k ends up learning and caching the MAC address of lxcee24765fd490, and then sends the ICMP packet:
root@rke2-cilium-01:~# kubectl exec -it busybox-s2q6k -- arp
? (10.42.0.26) at ee:bc:36:fb:d3:05 [ether] on eth0

root@rke2-cilium-01:~# kubectl exec -it busybox-s2q6k -- ip neigh
10.42.0.26 dev eth0 lladdr ee:bc:36:fb:d3:05 used 0/0/0 probes 1 STALE

root@rke2-cilium-01:~# ip link show | grep 75
75: lxcee24765fd490@if74: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether ee:bc:36:fb:d3:05 brd ff:ff:ff:ff:ff:ff link-netns cni-98efb29a-479a-8587-9bea-711b56275933
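The 10.42.0.26 address the Pod resolved belongs to the cilium_host interface itself, which can be confirmed on the host (output shape illustrative):

ip -4 addr show cilium_host
# expect the router IP here, e.g. "inet 10.42.0.26/32 ... cilium_host"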
When lxcee24765fd490 receives the ICMP packet from the Pod, the destination MAC address is its own while the layer-3 destination is 10.42.0.182. Looking at the host routing table:
root@rke2-cilium-01:~# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         172.16.16.1     0.0.0.0         UG    0      0        0 ens34
10.42.0.0       10.42.0.26      255.255.255.0   UG    0      0        0 cilium_host
10.42.0.26      0.0.0.0         255.255.255.255 UH    0      0        0 cilium_host
10.42.1.0       10.42.0.26      255.255.255.0   UG    0      0        0 cilium_host
10.42.2.0       10.42.0.26      255.255.255.0   UG    0      0        0 cilium_host
10.42.3.0       10.42.0.26      255.255.255.0   UG    0      0        0 cilium_host
10.42.4.0       10.42.0.26      255.255.255.0   UG    0      0        0 cilium_host
172.16.16.0     0.0.0.0         255.255.255.0   U     0      0        0 ens34
The matching route again points at cilium_host, and from there the actual forwarding is done by eBPF. The relevant endpoint entry can be inspected from the cilium-agent Pod:

root@rke2-cilium-01:~# kubectl -n kube-system exec -it cilium-45jt2 -c cilium-agent -- cilium bpf endpoint list
IP ADDRESS       LOCAL ENDPOINT INFO
...
# entry for nginx-596795d4c-zw7x6
10.42.0.182:0    id=2288  sec_id=16621 flags=0x0000 ifindex=73  mac=1E:7F:57:19:9E:CA nodemac=32:0C:2E:E0:F3:F9 parent_ifindex=0
The node MAC address in this entry is 32:0C:2E:E0:F3:F9. Looking it up on the host, the matching device is lxca72683320161, i.e. the veth-pair peer of the nginx-596795d4c-zw7x6 Pod:
root@rke2-cilium-01:~# ip link show | grep -i "32:0C:2E:E0:F3:F9" -B 1
73: lxca72683320161@if72: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 32:0c:2e:e0:f3:f9 brd ff:ff:ff:ff:ff:ff link-netns cni-b05ebd80-4a9e-e1b2-1d79-622cfca693ed
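To watch this hop directly, one can capture on the destination Pod's lxc device while the ping from busybox-s2q6k is running (sketch):

# the ICMP request shows up here after the eBPF redirect,
# and the reply leaves through the same device
tcpdump -ni lxca72683320161 icmp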
The same mechanism applies on rke2-cilium-02; its host routing table looks analogous:

root@rke2-cilium-02:~# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         172.16.16.1     0.0.0.0         UG    0      0        0 ens34
10.42.0.0       10.42.1.217     255.255.255.0   UG    0      0        0 cilium_host
10.42.1.0       10.42.1.217     255.255.255.0   UG    0      0        0 cilium_host
10.42.1.217     0.0.0.0         255.255.255.255 UH    0      0        0 cilium_host
10.42.2.0       10.42.1.217     255.255.255.0   UG    0      0        0 cilium_host
10.42.3.0       10.42.1.217     255.255.255.0   UG    0      0        0 cilium_host
10.42.4.0       10.42.1.217     255.255.255.0   UG    0      0        0 cilium_host
172.16.16.0     0.0.0.0         255.255.255.0   U     0      0        0 ens34
As before, the packet is handed to cilium_host and then forwarded by eBPF:
root@rke2-cilium-01:~# kubectl -n kube-system exec -it cilium-rrzgw -c cilium-agent -- cilium bpf endpoint list
IP ADDRESS      LOCAL ENDPOINT INFO
...
# entry for busybox-9pzgx
10.42.1.64:0    id=540   sec_id=7162  flags=0x0000 ifindex=119 mac=DA:1A:3D:E0:1E:1E nodemac=AE:21:12:07:BB:79 parent_ifindex=0
Looking up that node MAC, the corresponding interface is lxcef30caaf53d5:
root@rke2-cilium-02:~# ip link show | grep -i "AE:21:12:07:BB:79" -B 1
119: lxcef30caaf53d5@if118: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether ae:21:12:07:bb:79 brd ff:ff:ff:ff:ff:ff link-netns cni-985b9ef6-6539-d0b6-f66a-bc333b794b2c
This is the veth-pair peer of busybox-9pzgx on the host; the packet is forwarded to that device and finally delivered to the Pod's eth0 interface, completing the communication.
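The same path can also be observed live from the agent with cilium monitor; a sketch, filtering on the destination Pod's IP (flag availability may vary across Cilium versions):

kubectl -n kube-system exec -it cilium-rrzgw -c cilium-agent -- \
  cilium monitor --type trace | grep 10.42.1.64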