Skip to content

Commit 50f6fa8

Browse files
committed
bpf: Support external IPv4 DSR
Support IPIP termination from the Cilium L4LB against a regular Cilium cluster. This work covers both the termination and the DSR aspect, so that replies go directly back to clients instead of through the Cilium L4LB. Given the VIP:port of an external L4LB is not known in our K8s cluster, we also cannot hold it in the revNat map. Therefore, store the tuple info in the CT map. Guard this under a compilation flag, given this is only relevant for users who really want to terminate the external L4LB in the workload cluster; others don't need to pay for the additional cycles. On the agent side, the --enable-external-dsr={true,false} flag controls this setting. The default is false. Example with IPIP termination: Cilium L4LB node: # ./cilium-dbg/cilium-dbg service list ID Frontend Service Type Backend [...] 11 1.1.1.1:80 ExternalIPs 1 => 192.168.2.12:80 (active) Cilium regular cluster with --enable-external-dsr=true: # ./cilium-dbg/cilium-dbg service list ID Frontend Service Type Backend [...] 11 192.168.2.12:80 ExternalIPs 1 => 193.99.144.80:80 (active) tcpdump on Cilium regular node: [...] 09:36:17.421507 IP 192.168.2.11 > 192.168.2.12: IP 192.168.2.13.43196 > 1.1.1.1.80: Flags [S], seq 3976047959, win 42340, options [mss 1460,sackOK,TS val 4083238462 ecr 0,nop,wscale 9], length 0 09:36:17.421529 IP 192.168.2.12.43196 > 193.99.144.80.80: Flags [S], seq 3976047959, win 42340, options [mss 1460,sackOK,TS val 4083238462 ecr 0,nop,wscale 9], length 0 09:36:17.428443 IP 193.99.144.80.80 > 192.168.2.12.43196: Flags [S.], seq 1717159938, ack 3976047960, win 14600, options [mss 1460,nop,wscale 0,sackOK,TS val 1591760912 ecr 4083238462], length 0 09:36:17.428680 IP 1.1.1.1.80 > 192.168.2.13.43196: Flags [S.], seq 1717159938, ack 3976047960, win 14600, options [mss 1460,nop,wscale 0,sackOK,TS val 1591760912 ecr 4083238462], length 0 [...] 
The trace shows the IPIP termination, then the regular Cilium node performing the service request to the backend, and, upon reply, reversing the translation and sending the response directly to the client with the external L4LB VIP (1.1.1.1.80) as source, i.e. DSR. Signed-off-by: Daniel Borkmann <[email protected]>
1 parent c94f8cc commit 50f6fa8

File tree

11 files changed

+134
-25
lines changed

11 files changed

+134
-25
lines changed

Documentation/cmdref/cilium-agent.md

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bpf/bpf_lxc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ static __always_inline int __per_packet_lb_svc_xlate_4(void *ctx, struct iphdr *
8989

9090
has_l4_header = ipv4_has_l4_header(ip4);
9191

92-
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple);
92+
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple, NULL);
9393
if (IS_ERR(ret)) {
9494
if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4)
9595
goto skip_service_lookup;
@@ -2082,7 +2082,7 @@ ipv4_to_endpoint_is_hairpin_flow(struct __ctx_buff *ctx, struct iphdr *ip4)
20822082
/* Extract the tuple from the packet so we can freely access addrs and ports.
20832083
* All values are in network byte order.
20842084
*/
2085-
err = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple);
2085+
err = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple, NULL);
20862086
if (IS_ERR(err))
20872087
return false;
20882088

bpf/lib/common.h

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,7 @@ enum {
581581
#define DROP_CT_INVALID_HDR -135
582582
#define DROP_FRAG_NEEDED -136
583583
#define DROP_CT_UNKNOWN_PROTO -137
584-
#define DROP_UNUSED4 -138 /* unused */
584+
#define DROP_READ_ERROR -138
585585
#define DROP_UNKNOWN_L3 -139
586586
#define DROP_MISSED_TAIL_CALL -140
587587
#define DROP_WRITE_ERROR -141
@@ -901,6 +901,11 @@ enum {
901901
BE_STATE_MAINTENANCE,
902902
};
903903

904+
struct lb4_reverse_nat {
905+
__be32 address;
906+
__be16 port;
907+
} __packed;
908+
904909
struct ipv6_ct_tuple {
905910
/* Address fields are reversed, i.e.,
906911
* these field names are correct for reply direction traffic.
@@ -939,11 +944,12 @@ struct ct_entry {
939944
__u64 packets;
940945
__u64 bytes;
941946
};
947+
struct lb4_reverse_nat dsr4;
942948
};
943949
__u32 lifetime;
944950
__u16 rx_closing:1,
945951
tx_closing:1,
946-
unused_nat46:1, /* unused since v1.12 / 81dee05e82fb */
952+
dsr_external:1, /* DSR for a VIP from an external L4LB (dsr4) */
947953
lb_loopback:1,
948954
seen_non_syn:1,
949955
node_port:1,
@@ -1074,11 +1080,6 @@ struct lb4_health {
10741080
struct lb4_backend peer;
10751081
};
10761082

1077-
struct lb4_reverse_nat {
1078-
__be32 address;
1079-
__be16 port;
1080-
} __packed;
1081-
10821083
struct ipv4_revnat_tuple {
10831084
__sock_cookie cookie;
10841085
__be32 address;
@@ -1146,12 +1147,18 @@ struct ct_state {
11461147
reserved1:1, /* Was auth_required, not used in production anywhere */
11471148
from_tunnel:1, /* Connection is from tunnel */
11481149
dsr_internal:1,
1149-
reserved:8;
1150+
dsr_external:1,
1151+
reserved:7;
11501152
__u32 src_sec_id;
11511153
#ifndef HAVE_FIB_IFINDEX
11521154
__u16 ifindex;
11531155
#endif
11541156
__u32 backend_id; /* Backend ID in lb4_backends */
1157+
#ifdef ENABLE_DSR_EXTERNAL
1158+
union {
1159+
struct lb4_reverse_nat dsr4;
1160+
};
1161+
#endif
11551162
};
11561163

11571164
static __always_inline bool ct_state_is_from_l7lb(const struct ct_state *ct_state __maybe_unused)

bpf/lib/conntrack.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,11 @@ ct_lookup_fill_state(struct ct_state *state, const struct ct_entry *entry,
206206
state->proxy_redirect = entry->proxy_redirect;
207207
state->from_l7lb = entry->from_l7lb;
208208
state->from_tunnel = entry->from_tunnel;
209+
#ifdef ENABLE_DSR_EXTERNAL
210+
state->dsr_external = entry->dsr_external;
211+
if (state->dsr_external)
212+
state->dsr4 = entry->dsr4;
213+
#endif
209214
#ifndef HAVE_FIB_IFINDEX
210215
state->ifindex = entry->ifindex;
211216
#endif
@@ -910,6 +915,9 @@ ct_create_fill_entry(struct ct_entry *entry, const struct ct_state *state,
910915
entry->node_port = state->node_port;
911916
entry->dsr_internal = state->dsr_internal;
912917
entry->from_tunnel = state->from_tunnel;
918+
#ifdef ENABLE_DSR_EXTERNAL
919+
entry->dsr_external = state->dsr_external;
920+
#endif
913921
#ifndef HAVE_FIB_IFINDEX
914922
entry->ifindex = state->ifindex;
915923
#endif
@@ -988,8 +996,13 @@ static __always_inline int ct_create4(const void *map_main,
988996
union tcp_flags seen_flags = { .value = 0 };
989997
int err;
990998

991-
if (ct_state)
999+
if (ct_state) {
9921000
ct_create_fill_entry(&entry, ct_state, dir);
1001+
#ifdef ENABLE_DSR_EXTERNAL
1002+
if (entry.dsr_external)
1003+
entry.dsr4 = ct_state->dsr4;
1004+
#endif
1005+
}
9931006

9941007
seen_flags.value |= is_tcp ? TCP_FLAG_SYN : 0;
9951008
ct_update_timeout(&entry, is_tcp, dir, seen_flags);

bpf/lib/lb.h

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,6 +1155,14 @@ static __always_inline int lb4_rev_nat(struct __ctx_buff *ctx, int l3_off, int l
11551155
loopback, has_l4_header);
11561156
}
11571157

1158+
static __always_inline int lb4_rev_dsr(struct __ctx_buff *ctx, int l3_off, int l4_off,
1159+
struct lb4_reverse_nat *rev_dsr, bool loopback,
1160+
struct ipv4_ct_tuple *tuple, int flags, bool has_l4_header)
1161+
{
1162+
return __lb4_rev_nat(ctx, l3_off, l4_off, tuple, flags, rev_dsr,
1163+
loopback, has_l4_header);
1164+
}
1165+
11581166
static __always_inline void
11591167
lb4_fill_key(struct lb4_key *key, const struct ipv4_ct_tuple *tuple)
11601168
{
@@ -1179,7 +1187,7 @@ lb4_fill_key(struct lb4_key *key, const struct ipv4_ct_tuple *tuple)
11791187
*/
11801188
static __always_inline int
11811189
lb4_extract_tuple(struct __ctx_buff *ctx, struct iphdr *ip4, int l3_off, int *l4_off,
1182-
struct ipv4_ct_tuple *tuple)
1190+
struct ipv4_ct_tuple *tuple, __be32 *external_vip __maybe_unused)
11831191
{
11841192
int ret;
11851193

@@ -1190,14 +1198,41 @@ lb4_extract_tuple(struct __ctx_buff *ctx, struct iphdr *ip4, int l3_off, int *l4
11901198
*l4_off = l3_off + ipv4_hdrlen(ip4);
11911199

11921200
switch (tuple->nexthdr) {
1201+
#ifdef ENABLE_DSR_EXTERNAL
1202+
# if __ctx_is == __ctx_skb
1203+
case IPPROTO_IPIP: {
1204+
struct iphdr inner;
1205+
1206+
/* The initial packet hits the Cilium L4LB as:
1207+
* - [ client-ip -> l4lb-vip ]
1208+
*
1209+
* The IPIP packet from the Cilium L4LB looks as follows:
1210+
* - outer: [ l4lb-rss-ip -> k8s-svc-ip ]
1211+
* - inner: [ client-ip -> l4lb-vip ]
1212+
*
1213+
* We extract [ client-ip -> k8s-svc-ip ] and later need
1214+
* to reply with l4lb-vip as source. The l4lb-vip and
1215+
* k8s-svc-ip ports are the same / must match.
1216+
*/
1217+
ctx_load_bytes(ctx, *l4_off, &inner, sizeof(inner));
1218+
tuple->nexthdr = inner.protocol;
1219+
tuple->saddr = inner.saddr;
1220+
if (external_vip)
1221+
*external_vip = inner.daddr;
1222+
if (ipv4_hdrlen(&inner) != sizeof(*ip4))
1223+
return DROP_NAT_UNSUPP_PROTO;
1224+
*l4_off += sizeof(*ip4);
1225+
fallthrough;
1226+
};
1227+
# endif
1228+
#endif
11931229
case IPPROTO_TCP:
11941230
case IPPROTO_UDP:
11951231
#ifdef ENABLE_SCTP
11961232
case IPPROTO_SCTP:
11971233
#endif /* ENABLE_SCTP */
11981234
ret = ipv4_load_l4_ports(ctx, ip4, *l4_off, CT_EGRESS,
11991235
&tuple->dport, NULL);
1200-
12011236
if (IS_ERR(ret))
12021237
return ret;
12031238
return 0;
@@ -1366,6 +1401,7 @@ lb4_xlate(struct __ctx_buff *ctx, __be32 *new_saddr __maybe_unused,
13661401
{
13671402
const __be32 *new_daddr = &backend->address;
13681403
struct csum_offset csum_off = {};
1404+
__be32 old_daddr;
13691405
__be32 sum;
13701406
int ret;
13711407

@@ -1375,12 +1411,16 @@ lb4_xlate(struct __ctx_buff *ctx, __be32 *new_saddr __maybe_unused,
13751411
if (skip_l3_xlate)
13761412
goto l4_xlate;
13771413

1414+
ret = ctx_load_bytes(ctx, l3_off + offsetof(struct iphdr, daddr),
1415+
&old_daddr, 4);
1416+
if (unlikely(ret < 0))
1417+
return DROP_READ_ERROR;
13781418
ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, daddr),
13791419
new_daddr, 4, 0);
1380-
if (ret < 0)
1420+
if (unlikely(ret < 0))
13811421
return DROP_WRITE_ERROR;
13821422

1383-
sum = csum_diff(&key->address, 4, new_daddr, 4, 0);
1423+
sum = csum_diff(&old_daddr, 4, new_daddr, 4, 0);
13841424
#ifndef DISABLE_LOOPBACK_LB
13851425
if (new_saddr && *new_saddr) {
13861426
cilium_dbg_lb(ctx, DBG_LB4_LOOPBACK_SNAT, *old_saddr, *new_saddr);

bpf/lib/nodeport.h

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2416,7 +2416,7 @@ nodeport_rev_dnat_ingress_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace,
24162416

24172417
has_l4_header = ipv4_has_l4_header(ip4);
24182418

2419-
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple);
2419+
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple, NULL);
24202420
if (ret < 0) {
24212421
/* If it's not a SVC protocol, we don't need to check for RevDNAT: */
24222422
if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4)
@@ -2444,8 +2444,18 @@ nodeport_rev_dnat_ingress_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace,
24442444
CT_ENTRY_NODEPORT, &ct_state, &trace->monitor);
24452445
if (ret == CT_REPLY) {
24462446
trace->reason = TRACE_REASON_CT_REPLY;
2447-
ret = lb4_rev_nat(ctx, l3_off, l4_off, ct_state.rev_nat_index, false,
2448-
&tuple, REV_NAT_F_TUPLE_SADDR, has_l4_header);
2447+
#ifdef ENABLE_DSR_EXTERNAL
2448+
if (ct_state.dsr_external) {
2449+
ret = lb4_rev_dsr(ctx, l3_off, l4_off, &ct_state.dsr4,
2450+
false, &tuple, REV_NAT_F_TUPLE_SADDR,
2451+
has_l4_header);
2452+
} else
2453+
#endif
2454+
{
2455+
ret = lb4_rev_nat(ctx, l3_off, l4_off, ct_state.rev_nat_index,
2456+
false, &tuple, REV_NAT_F_TUPLE_SADDR,
2457+
has_l4_header);
2458+
}
24492459
if (IS_ERR(ret))
24502460
return ret;
24512461
if (!revalidate_data(ctx, &data, &data_end, &ip4))
@@ -2683,7 +2693,7 @@ int tail_nodeport_nat_egress_ipv4(struct __ctx_buff *ctx)
26832693
}
26842694
#endif
26852695

2686-
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple);
2696+
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple, NULL);
26872697
if (IS_ERR(ret))
26882698
goto drop_err;
26892699

@@ -2770,6 +2780,7 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
27702780
struct lb4_service *svc;
27712781
struct lb4_key key = {};
27722782
struct ct_state ct_state_new = {};
2783+
__be32 external_vip = 0;
27732784
__u32 cluster_id = 0;
27742785
__u32 monitor = 0;
27752786
int ret, l4_off;
@@ -2778,7 +2789,7 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
27782789

27792790
has_l4_header = ipv4_has_l4_header(ip4);
27802791

2781-
ret = lb4_extract_tuple(ctx, ip4, l3_off, &l4_off, &tuple);
2792+
ret = lb4_extract_tuple(ctx, ip4, l3_off, &l4_off, &tuple, &external_vip);
27822793
if (IS_ERR(ret)) {
27832794
if (ret == DROP_UNSUPP_SERVICE_PROTO) {
27842795
is_svc_proto = false;
@@ -2799,6 +2810,16 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
27992810

28002811
if (!lb4_src_range_ok(svc, ip4->saddr))
28012812
return DROP_NOT_IN_SRC_RANGE;
2813+
#ifdef ENABLE_DSR_EXTERNAL
2814+
if (external_vip) {
2815+
if (ctx_adjust_hroom(ctx, -(int)sizeof(*ip4),
2816+
BPF_ADJ_ROOM_MAC,
2817+
BPF_F_ADJ_ROOM_FIXED_GSO))
2818+
return DROP_UNSUPP_SERVICE_PROTO;
2819+
tuple.daddr = external_vip;
2820+
l4_off -= sizeof(*ip4);
2821+
}
2822+
#endif
28022823
#if defined(ENABLE_L7_LB)
28032824
if (lb4_svc_is_l7loadbalancer(svc) && svc->l7_lb_proxy_port > 0) {
28042825
/* We cannot redirect from the XDP layer to cilium_host.
@@ -2930,6 +2951,15 @@ static __always_inline int nodeport_lb4(struct __ctx_buff *ctx,
29302951
redo:
29312952
ct_state_new.src_sec_id = WORLD_IPV4_ID;
29322953
ct_state_new.node_port = 1;
2954+
#ifdef ENABLE_DSR_EXTERNAL
2955+
ct_state_new.dsr_external = !!external_vip;
2956+
if (ct_state_new.dsr_external) {
2957+
ct_state_new.dsr4.address = external_vip;
2958+
ct_state_new.dsr4.port = key.dport;
2959+
}
2960+
#else
2961+
ct_state_new.dsr_external = 0;
2962+
#endif
29332963
#ifndef HAVE_FIB_IFINDEX
29342964
ct_state_new.ifindex = (__u16)NATIVE_DEV_IFINDEX;
29352965
#endif
@@ -3003,7 +3033,7 @@ nodeport_rev_dnat_fwd_ipv4(struct __ctx_buff *ctx, struct trace_ctx *trace,
30033033
has_l4_header = ipv4_has_l4_header(ip4);
30043034
is_fragment = ipv4_is_fragment(ip4);
30053035

3006-
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple);
3036+
ret = lb4_extract_tuple(ctx, ip4, ETH_HLEN, &l4_off, &tuple, NULL);
30073037
if (ret < 0) {
30083038
/* If it's not a SVC protocol, we don't need to check for RevDNAT: */
30093039
if (ret == DROP_UNSUPP_SERVICE_PROTO || ret == DROP_UNKNOWN_L4)

daemon/cmd/daemon_main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,9 @@ func InitGlobalFlags(cmd *cobra.Command, vp *viper.Viper) {
253253
flags.Bool(option.EnableLocalNodeRoute, defaults.EnableLocalNodeRoute, "Enable installation of the route which points the allocation prefix of the local node")
254254
option.BindEnv(vp, option.EnableLocalNodeRoute)
255255

256+
flags.Bool(option.EnableExternalDSR, false, "Enable termination and DSR handling of external L4LBs")
257+
option.BindEnv(vp, option.EnableExternalDSR)
258+
256259
flags.Bool(option.EnableIPv4Name, defaults.EnableIPv4, "Enable IPv4 support")
257260
option.BindEnv(vp, option.EnableIPv4Name)
258261

daemon/cmd/kube_proxy_replacement.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,7 @@ func disableNodePort() {
501501
option.Config.EnableHostPort = false
502502
option.Config.EnableExternalIPs = false
503503
option.Config.EnableSVCSourceRangeCheck = false
504+
option.Config.EnableExternalDSR = false
504505
option.Config.EnableHostLegacyRouting = true
505506
}
506507

pkg/datapath/linux/config/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,9 @@ func (h *HeaderfileWriter) WriteNodeConfig(w io.Writer, cfg *datapath.LocalNodeC
381381
cDefinesMap["ENABLE_MKE"] = "1"
382382
cDefinesMap["MKE_HOST"] = fmt.Sprintf("%d", option.HostExtensionMKE)
383383
}
384+
if option.Config.EnableExternalDSR {
385+
cDefinesMap["ENABLE_DSR_EXTERNAL"] = "1"
386+
}
384387
if option.Config.EnableRecorder {
385388
cDefinesMap["ENABLE_CAPTURE"] = "1"
386389
if option.Config.EnableIPv4 {

pkg/maps/ctmap/types.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,7 @@ const SizeofCtEntry = int(unsafe.Sizeof(CtEntry{}))
558558
const (
559559
RxClosing = 1 << iota
560560
TxClosing
561-
Nat64
561+
DSRExternal
562562
LBLoopback
563563
SeenNonSyn
564564
NodePort
@@ -584,9 +584,6 @@ func (c *CtEntry) flagsString() string {
584584
if (c.Flags & TxClosing) != 0 {
585585
sb.WriteString("TxClosing ")
586586
}
587-
if (c.Flags & Nat64) != 0 {
588-
sb.WriteString("Nat64 ")
589-
}
590587
if (c.Flags & LBLoopback) != 0 {
591588
sb.WriteString("LBLoopback ")
592589
}
@@ -602,6 +599,9 @@ func (c *CtEntry) flagsString() string {
602599
if (c.Flags & DSRInternal) != 0 {
603600
sb.WriteString("DSR ")
604601
}
602+
if (c.Flags & DSRExternal) != 0 {
603+
sb.WriteString("DSRExt ")
604+
}
605605
if (c.Flags & FromL7LB) != 0 {
606606
sb.WriteString("FromL7LB ")
607607
}

0 commit comments

Comments
 (0)