add flannel vxlan demo · lx1036/code@c7d9eb1 · GitHub
[go: up one dir, main page]

Skip to content

Commit c7d9eb1

Browse files
author
shenming
committed
add flannel vxlan demo
1 parent e79805c commit c7d9eb1

File tree

5 files changed

+292
-5
lines changed

5 files changed

+292
-5
lines changed

go/k8s/bpf/xdp-l4lb/xdp-cilium-l4lb/cilium/test/tunnel/vxlan/flannel/pkg/backend/vxlan/vxlan_network.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ type vxlanLeaseAttrs struct {
6767
VtepMAC hardwareAddr
6868
}
6969

70+
/**
71+
INFO: 调试 flannel vxlan,`kubectl -n kube-flannel edit ds kube-flannel-ds` 新加参数 --v=5 就行
72+
*/
73+
7074
// 配置路由和 arp, 这里 batch 包含除了当前 node subnet 之外的其他所有 node subnet
7175
// INFO: flanneld 通过 watch k8s Node 来动态地维护各节点通信所需的ARP、FDB以及路由条目
7276
func (network *vxlanNetwork) handleSubnetEvents(batch []subnet.Event) {
@@ -98,7 +102,7 @@ func (network *vxlanNetwork) handleSubnetEvents(batch []subnet.Event) {
98102
Scope: netlink.SCOPE_UNIVERSE,
99103
// 到达新 node 的 vxlan 网卡的路由,`10.230.91.0/24 via 10.230.91.0 dev flannel.1 onlink`,
100104
// 然后 10.230.91.0 对应的 mac 在 ARP 表里,下面 AddARP() 设置了,然后这个 mac 对应的 IP 在 FDB 表里设置了,就是另一台 nodeIP,最后封包结束
101-
Dst: sn.ToIPNet(),
105+
Dst: sn.ToIPNet(), // 10.230.91.0/24
102106
Gw: sn.IP.ToIP(), // 10.230.91.0
103107
}
104108
// flanneld在加入集群时会为每个其他节点生成一条on-link路由,on-link路由表示是直连路由,匹配该条路由的数据包将触发ARP请求获取目的IP的MAC地址
@@ -123,6 +127,7 @@ func (network *vxlanNetwork) handleSubnetEvents(batch []subnet.Event) {
123127

124128
// INFO: 这里是最重点之处,主要配置 route/arp/fdb,就可以实现了 vxlan 封包,回答了上面的问题
125129
// 其实主要就是实现了这篇文章所说的配置 http://just4coding.com/2020/04/20/vxlan-fdb/ , 配置 arp/fdb 来实现 vxlan 封包
130+
// 注意:只有增加新的 node 时才会触发这个 vxlan_network.go 订阅逻辑
126131
switch event.Type {
127132
case subnet.EventAdded:
128133
if event.Lease.EnableIPv4 {
@@ -135,12 +140,16 @@ func (network *vxlanNetwork) handleSubnetEvents(batch []subnet.Event) {
135140
} else {
136141
klog.Infof(fmt.Sprintf("adding subnet: %s for nodeIP: %s VtepMAC: %s",
137142
sn, attrs.PublicIP, net.HardwareAddr(vxlanAttrs.VtepMAC)))
138-
// 这里 sn.IP 是 vxlan 网卡的 IP
143+
// (1)这里 sn.IP 是新加 node 的 vtep 的 IP,即 vxlan 网卡的 IP
144+
// vxlanAttrs.VtepMAC 是新加 node 的 vtep mac 地址,即 vxlan 网卡的 mac
145+
// 10.244.1.0, 72:ff:29:6f:e7:98
139146
if err := network.dev.AddARP(neighbor{IP: sn.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
140147
klog.Error("AddARP failed: ", err)
141148
continue
142149
}
143-
// 这里 attrs.PublicIP 就是 nodeIP, 或者也叫 VTEP IP 地址, 可以参考验证 @see http://just4coding.com/2020/04/20/vxlan-fdb/
150+
// (2)这里 attrs.PublicIP 就是新加 node 的 nodeIP(k8s里验证是这个结论),
151+
// vxlanAttrs.VtepMAC 是新加 node 的 vtep mac 地址,即 vxlan 网卡的 mac
152+
// attrs.PublicIP(nodeIP)或者也叫 VTEP IP 地址, 可以参考验证 @see http://just4coding.com/2020/04/20/vxlan-fdb/
144153
if err := network.dev.AddFDB(neighbor{IP: attrs.PublicIP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
145154
// Try to clean up the ARP entry then continue
146155
if err := network.dev.DelARP(neighbor{IP: sn.IP, MAC: net.HardwareAddr(vxlanAttrs.VtepMAC)}); err != nil {
@@ -150,8 +159,8 @@ func (network *vxlanNetwork) handleSubnetEvents(batch []subnet.Event) {
150159
continue
151160
}
152161

153-
// Set the route - the kernel would ARP for the Gw IP address if it hadn't already been set above so make sure
154-
// this is done last.
162+
// (3)10.244.1.0/24 via 10.244.1.0 dev flannel.1 onlink,10.244.1.0/24 是新加 node 的 pod 子网,网关 10.244.1.0 是新加 node 的 flannel.1 vxlan 网卡 IP,dev flannel.1 是本 node 的网卡
163+
// 即新增路由,访问 pod 子网 10.244.1.0/24(node2的pod网段),需要从本地 vtep 设备 flannel.1 vxlan 网卡走
155164
if err := netlink.RouteReplace(&vxlanRoute); err != nil {
156165
klog.Errorf(fmt.Sprintf("failed to add vxlanRoute (%s -> %s): %v", vxlanRoute.Dst, vxlanRoute.Gw, err))
157166
// Try to clean up both the ARP and FDB entries then continue
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
2+
3+
4+
# 抓包 vxlan 跨节点 pod
5+
6+
7+
8+
9+
10+
11+
12+
13+
14+
15+
16+
# 抓包 ipip 跨节点 pod
17+
18+
19+
20+
21+
22+
23+
24+
25+
26+

go/k8s/bpf/xdp-l4lb/xdp-cilium-l4lb/cilium/test/tunnel/vxlan/flannel/vxlan_in_flannel.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,7 @@ https://github.com/flannel-io/flannel/blob/master/Documentation/backends.md#vxla
66

77

88

9+
10+
# 参考文献
11+
https://github.com/flannel-io/flannel/tree/v0.22.3
12+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
2+
3+
# 同 Node 的 pod-to-pod 通信
4+
5+
6+
7+
8+
9+
# 跨 Node 的 pod-to-pod 通信
10+
11+
12+
13+
# 参考文献
14+
https://github.com/y805939188/simple-k8s-cni/blob/master/plugins/vxlan/ebpf/vxlan_ingress.c
15+
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
2+
/**
3+
* https://github.com/y805939188/simple-k8s-cni/blob/master/plugins/vxlan/ebpf/vxlan_ingress.c
4+
*/
5+
6+
7+
// /root/linux-5.10.142/include/uapi/linux/bpf.h
8+
#include <linux/bpf.h>
9+
#include <linux/if_ether.h>
10+
#include <linux/if_packet.h>
11+
#include <linux/ip.h>
12+
#include <linux/pkt_cls.h>
13+
14+
// /root/linux-5.10.142/tools/lib/bpf/bpf_helpers.h
15+
#include <bpf/bpf_helpers.h>
16+
#include <bpf/bpf_endian.h>
17+
18+
#ifndef __section
19+
#define __section(X) __attribute__((section(X), used))
20+
#endif
21+
#ifndef __section_maps
22+
#define __section_maps __section("maps")
23+
#endif
24+
25+
#define PIN_GLOBAL_NS 2
26+
#define DEFAULT_TUNNEL_ID 13190
27+
#define LOCAL_DEV_VXLAN 1 /* ding_local key type used to look up the local vxlan device's ifindex; no trailing ';' so the macro is usable inside expressions */
28+
#define LOCAL_DEV_VETH 2 /* presumably the ding_local key type for the local veth device (not referenced in this file) - confirm; no trailing ';' so the macro is usable inside expressions */
29+
30+
// Map definition record placed in the "maps" ELF section by the map
// declarations in this file.
// NOTE(review): the layout appears to mirror iproute2's struct bpf_elf_map
// (bpf_elf.h) - confirm against the loader actually used.
struct bpf_elf_map {
31+
__u32 type; // map type; every map in this file uses BPF_MAP_TYPE_HASH
32+
__u32 size_key; // sizeof() of the key struct
33+
__u32 size_value; // sizeof() of the value struct
34+
__u32 max_elem; // set to 256 for every map in this file
35+
__u32 flags; // not set by any map in this file
36+
__u32 id; // not set by any map in this file
37+
__u32 pinning; // pinning mode; every map in this file uses PIN_GLOBAL_NS
38+
__u32 inner_id; // not set by any map in this file
39+
__u32 inner_idx; // not set by any map in this file
40+
};
41+
42+
// Key of the ding_lxc map: an IPv4 address. The programs in this file fill it
// with bpf_htonl(ip->daddr) of the packet being processed.
struct endpointKey {
43+
__u32 ip;
44+
};
45+
// Value of the ding_lxc map: interface indexes and MAC addresses associated
// with one endpoint IP.
struct endpointInfo {
46+
__u32 ifIndex; // not read by the programs in this file
47+
__u32 lxcIfIndex; // interface index passed to bpf_redirect_peer()/bpf_redirect()
48+
__u8 mac[8]; // 8 bytes reserved; only the first ETH_ALEN bytes are copied by the programs here
49+
__u8 nodeMac[8]; // 8 bytes reserved; only the first ETH_ALEN bytes are copied by the programs here
50+
};
51+
52+
// ding_lxc: hash map from endpointKey (IPv4 address) to endpointInfo.
// veth_pair_ingress and vxlan_ingress look up the packet's destination IP here
// to decide whether it targets an endpoint known on this node.
struct bpf_elf_map __section_maps ding_lxc = {
53+
.type = BPF_MAP_TYPE_HASH,
54+
.size_key = sizeof(struct endpointKey),
55+
.size_value = sizeof(struct endpointInfo),
56+
.pinning = PIN_GLOBAL_NS, // Setting LIBBPF_PIN_BY_NAME makes the pin path /sys/fs/bpf/tc/424e22c8e74276a6484f398886d426f441d9b849/
57+
.max_elem = 256,
58+
};
59+
60+
// Key of the ding_ip map: an IPv4 address. The programs in this file fill it
// with bpf_htonl(ip->daddr) of the packet being processed.
struct podNodeKey {
61+
__u32 ip;
62+
};
63+
64+
// Value of the ding_ip map: an IPv4 address. vxlan_egress copies it into
// bpf_tunnel_key.remote_ipv4, i.e. it is used as the tunnel's remote endpoint.
struct podNodeValue {
65+
__u32 ip;
66+
};
67+
68+
// ding_ip: hash map from podNodeKey (destination IPv4 address) to podNodeValue
// (the IPv4 address used as the tunnel remote in vxlan_egress).
struct bpf_elf_map __section_maps ding_ip = {
69+
.type = BPF_MAP_TYPE_HASH,
70+
.size_key = sizeof(struct podNodeKey),
71+
.size_value = sizeof(struct podNodeValue),
72+
.pinning = PIN_GLOBAL_NS, // Setting LIBBPF_PIN_BY_NAME makes the pin path /sys/fs/bpf/tc/424e22c8e74276a6484f398886d426f441d9b849/
73+
.max_elem = 256,
74+
};
75+
76+
// Key of the ding_local map: a device type selector. veth_pair_ingress sets it
// to LOCAL_DEV_VXLAN.
struct localNodeMapKey {
77+
__u32 type;
78+
};
79+
// Value of the ding_local map: the interface index that veth_pair_ingress
// passes to bpf_redirect().
struct localNodeMapValue {
80+
__u32 ifIndex;
81+
};
82+
// ding_local: hash map from localNodeMapKey (device type) to localNodeMapValue
// (interface index of that local device).
struct bpf_elf_map __section_maps ding_local = {
83+
.type = BPF_MAP_TYPE_HASH,
84+
.size_key = sizeof(struct localNodeMapKey),
85+
.size_value = sizeof(struct localNodeMapValue),
86+
.pinning = PIN_GLOBAL_NS, // Setting LIBBPF_PIN_BY_NAME makes the pin path /sys/fs/bpf/tc/424e22c8e74276a6484f398886d426f441d9b849/
87+
.max_elem = 256,
88+
};
89+
90+
// veth_pair_ingress: tc program that steers an IPv4 packet by destination IP.
//   1. Destination found in ding_lxc: rewrite the Ethernet MACs from the map
//      entry and return bpf_redirect_peer(ep->lxcIfIndex, 0).
//   2. Otherwise, destination found in ding_ip: look up the LOCAL_DEV_VXLAN
//      entry in ding_local and return bpf_redirect() to that ifIndex.
//   3. Anything else (short packet, non-IPv4, no map hit): TC_ACT_UNSPEC.
SEC("veth_pair_ingress")
91+
int veth_pair_ingress(struct __sk_buff *skb) {
92+
void *data = (void *)(long)skb->data;
93+
void *data_end = (void *)(long)skb->data_end;
94+
// Bounds check: the Ethernet + IPv4 headers must lie inside the packet
// before either header is dereferenced.
if (data + sizeof(struct ethhdr) + sizeof(struct iphdr) > data_end) {
95+
return TC_ACT_UNSPEC;
96+
}
97+
98+
struct ethhdr *eth = data;
99+
struct iphdr *ip = (data + sizeof(struct ethhdr));
100+
// Only IPv4 frames are handled.
if (eth->h_proto != bpf_htons(ETH_P_IP)) {
101+
return TC_ACT_UNSPEC;
102+
}
103+
104+
// On the Go side, the values written into the eBPF maps were stored in little-endian order on this ARM machine,
105+
// so convert here to network (big-endian) order.
// NOTE(review): ip->saddr/ip->daddr are already in network byte order, so on a
// little-endian host bpf_htonl() byte-swaps them into host order, which is
// presumably what matches the map keys - the comment above seems to describe
// the conversion the other way around; confirm.
106+
__u32 src_ip = bpf_htonl(ip->saddr); // computed but not used in this function
107+
__u32 dst_ip = bpf_htonl(ip->daddr);
108+
// Buffers for the MAC addresses
109+
__u8 src_mac[ETH_ALEN];
110+
__u8 dst_mac[ETH_ALEN];
111+
struct endpointKey epKey = {};
112+
epKey.ip = dst_ip;
113+
// Look up the destination in ding_lxc
114+
struct endpointInfo *ep = bpf_map_lookup_elem(&ding_lxc, &epKey);
115+
if (ep) {
116+
// A hit means the packet is destined for another pod on this node
117+
// Rewrite the MAC addresses to those of the target pod's veth pair
118+
__builtin_memcpy(src_mac, ep->nodeMac, ETH_ALEN);
119+
__builtin_memcpy(dst_mac, ep->mac, ETH_ALEN);
120+
// NOTE(review): h_source receives dst_mac (ep->mac) and h_dest receives
// src_mac (ep->nodeMac) - the opposite pairing from vxlan_ingress, which
// writes ep->mac to h_dest. Confirm this is intended for bpf_redirect_peer().
bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_source), dst_mac, ETH_ALEN, 0);
121+
bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), src_mac, ETH_ALEN, 0);
122+
return bpf_redirect_peer(ep->lxcIfIndex, 0);
123+
}
124+
125+
struct podNodeKey podNodeKey = {
126+
.ip = dst_ip,
127+
};
128+
struct podNodeValue *podNode = bpf_map_lookup_elem(&ding_ip, &podNodeKey);
129+
if (podNode) {
130+
// Reaching here means the destination IP belongs to this cluster
131+
struct localNodeMapKey localKey = {};
132+
localKey.type = LOCAL_DEV_VXLAN;
133+
struct localNodeMapValue *localValue = bpf_map_lookup_elem(&ding_local, &localKey);
134+
if (localValue) {
135+
// Redirect to the vxlan device's egress
136+
return bpf_redirect(localValue->ifIndex, 0);
137+
}
138+
return TC_ACT_UNSPEC;
139+
}
140+
141+
return TC_ACT_UNSPEC;
142+
}
143+
144+
145+
// vxlan_ingress: tc program that delivers an IPv4 packet to a local endpoint.
// Looks the destination IP up in ding_lxc; on a hit it rewrites the Ethernet
// MACs from the map entry and returns bpf_redirect(ep->lxcIfIndex, 0).
// Returns TC_ACT_UNSPEC for short or non-IPv4 packets and TC_ACT_OK when the
// destination is not in ding_lxc.
SEC("vxlan_ingress")
146+
int vxlan_ingress(struct __sk_buff *skb) {
147+
void *data = (void *)(long)skb->data;
148+
void *data_end = (void *)(long)skb->data_end;
149+
// Bounds check: the Ethernet + IPv4 headers must lie inside the packet
// before either header is dereferenced.
if (data + sizeof(struct ethhdr) + sizeof(struct iphdr) > data_end) {
150+
return TC_ACT_UNSPEC;
151+
}
152+
153+
struct ethhdr *eth = data;
154+
struct iphdr *ip = (data + sizeof(struct ethhdr));
155+
// Only IPv4 frames are handled.
if (eth->h_proto != bpf_htons(ETH_P_IP)) {
156+
return TC_ACT_UNSPEC;
157+
}
158+
159+
__u32 src_ip = bpf_htonl(ip->saddr); // computed but not used in this function
160+
__u32 dst_ip = bpf_htonl(ip->daddr);
161+
// Debug output: the byte-swapped and the raw destination address.
// NOTE(review): both are __u32 but are printed with %d (signed).
bpf_printk("the dst_ip is: %d", dst_ip);
162+
bpf_printk("the ip->daddr is: %d", ip->daddr);
163+
164+
struct endpointKey epKey = {
165+
.ip = dst_ip,
166+
};
167+
struct endpointInfo *ep = bpf_map_lookup_elem(&ding_lxc, &epKey);
168+
if (!ep) {
169+
return TC_ACT_OK;
170+
}
171+
// A hit means the traffic is destined for a pod on this node,
172+
// so the src MAC and dst MAC need to be updated
173+
174+
// Buffers for the MAC addresses
175+
__u8 src_mac[ETH_ALEN];
176+
__u8 dst_mac[ETH_ALEN];
177+
// Use the MACs of the local pod's veth pair
178+
__builtin_memcpy(src_mac, ep->nodeMac, ETH_ALEN);
179+
__builtin_memcpy(dst_mac, ep->mac, ETH_ALEN);
180+
// Write the MACs into the skb
181+
bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), dst_mac, ETH_ALEN, 0);
182+
bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_source), src_mac, ETH_ALEN, 0);
183+
184+
return bpf_redirect(ep->lxcIfIndex, 0);
185+
}
186+
187+
// vxlan_egress: tc program that sets the tunnel key for an outgoing IPv4
// packet. Looks the destination IP up in ding_ip; on a hit it fills a
// bpf_tunnel_key (remote_ipv4 from the map value, tunnel_id DEFAULT_TUNNEL_ID,
// tos 0, ttl 64) and calls bpf_skb_set_tunnel_key() with BPF_F_ZERO_CSUM_TX.
// Returns TC_ACT_UNSPEC for short or non-IPv4 packets, TC_ACT_SHOT when
// setting the tunnel key fails, and TC_ACT_OK otherwise (including no map hit).
SEC("vxlan_egress")
188+
int vxlan_egress(struct __sk_buff *skb) {
189+
void *data = (void *)(long)skb->data;
190+
void *data_end = (void *)(long)skb->data_end;
191+
// Bounds check: the Ethernet + IPv4 headers must lie inside the packet
// before either header is dereferenced.
if (data + sizeof(struct ethhdr) + sizeof(struct iphdr) > data_end) {
192+
return TC_ACT_UNSPEC;
193+
}
194+
195+
struct ethhdr *eth = data;
196+
struct iphdr *ip = (data + sizeof(struct ethhdr));
197+
// Only IPv4 frames are handled.
if (eth->h_proto != bpf_htons(ETH_P_IP)) {
198+
return TC_ACT_UNSPEC;
199+
}
200+
201+
__u32 src_ip = bpf_htonl(ip->saddr); // computed but not used in this function
202+
__u32 dst_ip = bpf_htonl(ip->daddr);
203+
// Debug output: the byte-swapped and the raw destination address.
// NOTE(review): both are __u32 but are printed with %d (signed).
bpf_printk("the dst_ip is: %d", dst_ip);
204+
bpf_printk("the ip->daddr is: %d", ip->daddr);
205+
206+
// Look up the IP of the node hosting the destination IP
207+
struct podNodeKey podNodeKey = {};
208+
podNodeKey.ip = dst_ip;
209+
struct podNodeValue *podNode = bpf_map_lookup_elem(&ding_ip, &podNodeKey);
210+
if (podNode) {
211+
__u32 dst_node_ip = podNode->ip; // assigned but not used; podNode->ip is read directly below
212+
// Prepare a tunnel key
213+
struct bpf_tunnel_key key;
214+
int ret;
215+
// Zero the whole key so fields not assigned below are not left uninitialized.
__builtin_memset(&key, 0x0, sizeof(key));
216+
key.remote_ipv4 = podNode->ip;
217+
key.tunnel_id = DEFAULT_TUNNEL_ID;
218+
key.tunnel_tos = 0;
219+
key.tunnel_ttl = 64;
220+
// Attach the outer tunnel (UDP) information
221+
ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
222+
if (ret < 0) {
223+
bpf_printk("bpf_skb_set_tunnel_key failed");
224+
return TC_ACT_SHOT;
225+
}
226+
return TC_ACT_OK;
227+
}
228+
229+
return TC_ACT_OK;
230+
}
231+
232+
233+
// License string emitted into the "license" ELF section.
// NOTE(review): presumably required to be GPL-compatible for helpers such as
// bpf_printk used in this file - confirm.
char _license[] SEC("license") = "GPL";

0 commit comments

Comments
 (0)
0