Skip to content

Commit 8d21ec0

Browse files
iamkafai authored and davem330 committed
bpf: Add __sk_buff->delivery_time_type and bpf_skb_set_delivery_time()
* __sk_buff->delivery_time_type: This patch adds __sk_buff->delivery_time_type. It tells whether the delivery_time is stored in __sk_buff->tstamp or not. It will be most useful at ingress to tell if the __sk_buff->tstamp has the (rcv) timestamp or the delivery_time. If delivery_time_type is 0 (BPF_SKB_DELIVERY_TIME_NONE), it has the (rcv) timestamp. Two non-zero types are defined for the delivery_time_type, BPF_SKB_DELIVERY_TIME_MONO and BPF_SKB_DELIVERY_TIME_UNSPEC. UNSPEC can only happen at egress because only a mono delivery_time can be forwarded to ingress now. The clock of an UNSPEC delivery_time can be deduced from skb->sk->sk_clockid, which is also how sch_etf does it.
* Provide forwarded delivery_time to tc-bpf@ingress: With the help of the new delivery_time_type, the tc-bpf has a way to tell if the __sk_buff->tstamp has the (rcv) timestamp or the delivery_time. During bpf load time, the verifier will learn if the bpf prog has accessed the new __sk_buff->delivery_time_type. If it has, it means the tc-bpf@ingress expects that the skb->tstamp could have the delivery_time. The kernel will then read the skb->tstamp as-is during bpf insn rewrite without checking the skb->mono_delivery_time. This is done by adding a new prog->delivery_time_access bit. The same goes for writing skb->tstamp.
* bpf_skb_set_delivery_time(): The bpf_skb_set_delivery_time() helper is added to allow setting both the delivery_time and the delivery_time_type at the same time. If the tc-bpf does not need to change the delivery_time_type, it can directly write to the __sk_buff->tstamp as the existing tc-bpf has already been doing. It will be most useful at ingress to change the __sk_buff->tstamp from the (rcv) timestamp to a mono delivery_time and then bpf_redirect_*().
bpf only has mono clock helper (bpf_ktime_get_ns), and the current known use case is the mono EDT for fq, and only mono delivery time can be kept during forward now, so bpf_skb_set_delivery_time() only supports setting BPF_SKB_DELIVERY_TIME_MONO. It can be extended later when use cases come up and the forwarding path also supports other clock bases. Signed-off-by: Martin KaFai Lau <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 7449197 commit 8d21ec0

4 files changed

Lines changed: 216 additions & 38 deletions

File tree

include/linux/filter.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,8 @@ struct bpf_prog {
572572
has_callchain_buf:1, /* callchain buffer allocated? */
573573
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
574574
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
575-
call_get_func_ip:1; /* Do we call get_func_ip() */
575+
call_get_func_ip:1, /* Do we call get_func_ip() */
576+
delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */
576577
enum bpf_prog_type type; /* Type of BPF program */
577578
enum bpf_attach_type expected_attach_type; /* For some prog types */
578579
u32 len; /* Number of filter blocks */

include/uapi/linux/bpf.h

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5086,6 +5086,37 @@ union bpf_attr {
50865086
* Return
50875087
* 0 on success, or a negative error in case of failure. On error
50885088
* *dst* buffer is zeroed out.
5089+
*
5090+
* long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type)
5091+
* Description
5092+
* Set a *dtime* (delivery time) to the __sk_buff->tstamp and also
5093+
* change the __sk_buff->delivery_time_type to *dtime_type*.
5094+
*
5095+
* When setting a delivery time (non zero *dtime*) to
5096+
* __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type*
5097+
* is supported. It is the only delivery_time_type that will be
5098+
* kept after bpf_redirect_*().
5099+
*
5100+
* If there is no need to change the __sk_buff->delivery_time_type,
5101+
* the delivery time can be directly written to __sk_buff->tstamp
5102+
* instead.
5103+
*
5104+
* *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE
5105+
* can be used to clear any delivery time stored in
5106+
* __sk_buff->tstamp.
5107+
*
5108+
* Only IPv4 and IPv6 skb->protocol are supported.
5109+
*
5110+
* This function is most useful when it needs to set a
5111+
* mono delivery time to __sk_buff->tstamp and then
5112+
* bpf_redirect_*() to the egress of an iface. For example,
5113+
* changing the (rcv) timestamp in __sk_buff->tstamp at
5114+
* ingress to a mono delivery time and then bpf_redirect_*()
5115+
* to sch_fq@phy-dev.
5116+
* Return
5117+
* 0 on success.
5118+
* **-EINVAL** for invalid input
5119+
* **-EOPNOTSUPP** for unsupported delivery_time_type and protocol
50895120
*/
50905121
#define __BPF_FUNC_MAPPER(FN) \
50915122
FN(unspec), \
@@ -5280,6 +5311,7 @@ union bpf_attr {
52805311
FN(xdp_load_bytes), \
52815312
FN(xdp_store_bytes), \
52825313
FN(copy_from_user_task), \
5314+
FN(skb_set_delivery_time), \
52835315
/* */
52845316

52855317
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5469,6 +5501,12 @@ union { \
54695501
__u64 :64; \
54705502
} __attribute__((aligned(8)))
54715503

5504+
/* Values reported in __sk_buff->delivery_time_type and accepted as the
 * dtime_type argument of bpf_skb_set_delivery_time().
 */
enum {
5505+
/* 0: no delivery time is stored in __sk_buff->tstamp; the field is
 * either 0 or holds the (rcv) timestamp.
 */
BPF_SKB_DELIVERY_TIME_NONE,
5506+
/* A delivery time that is not known to be in the mono clock base.
 * bpf_skb_set_delivery_time() does not accept this type.
 */
BPF_SKB_DELIVERY_TIME_UNSPEC,
5507+
/* A delivery time in the mono clock base. This is the only type
 * bpf_skb_set_delivery_time() accepts for a non-zero dtime.
 */
BPF_SKB_DELIVERY_TIME_MONO,
5508+
};
5509+
54725510
/* user accessible mirror of in-kernel sk_buff.
54735511
* new fields can only be added to the end of this structure
54745512
*/
@@ -5509,7 +5547,8 @@ struct __sk_buff {
55095547
__u32 gso_segs;
55105548
__bpf_md_ptr(struct bpf_sock *, sk);
55115549
__u32 gso_size;
5512-
__u32 :32; /* Padding, future use. */
5550+
__u8 delivery_time_type;
5551+
__u32 :24; /* Padding, future use. */
55135552
__u64 hwtstamp;
55145553
};
55155554

net/core/filter.c

Lines changed: 134 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7388,6 +7388,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
73887388
.arg3_type = ARG_ANYTHING,
73897389
};
73907390

7391+
/* bpf_skb_set_delivery_time() helper: update skb->tstamp and the
 * skb->mono_delivery_time bit together.
 *
 * @skb:        packet to update; its protocol must be IPv4 or IPv6.
 * @dtime:      delivery time to store; must be non-zero for
 *              BPF_SKB_DELIVERY_TIME_MONO and zero for
 *              BPF_SKB_DELIVERY_TIME_NONE.
 * @dtime_type: BPF_SKB_DELIVERY_TIME_MONO or BPF_SKB_DELIVERY_TIME_NONE.
 *
 * Return: 0 on success, -EINVAL when @dtime does not match the
 * requirement of @dtime_type, -EOPNOTSUPP for any other @dtime_type
 * or a non IPv4/IPv6 skb->protocol.
 */
BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb,
7392+
u64, dtime, u32, dtime_type)
7393+
{
7394+
/* skb_clear_delivery_time() is done for inet protocol, so only
 * IPv4 and IPv6 skbs are accepted here.
 */
7395+
if (skb->protocol != htons(ETH_P_IP) &&
7396+
skb->protocol != htons(ETH_P_IPV6))
7397+
return -EOPNOTSUPP;
7398+
7399+
switch (dtime_type) {
7400+
case BPF_SKB_DELIVERY_TIME_MONO:
7401+
/* A mono delivery time of 0 is rejected. */
if (!dtime)
7402+
return -EINVAL;
7403+
skb->tstamp = dtime;
7404+
skb->mono_delivery_time = 1;
7405+
break;
7406+
case BPF_SKB_DELIVERY_TIME_NONE:
7407+
/* NONE only clears: a non-zero dtime is rejected. */
if (dtime)
7408+
return -EINVAL;
7409+
skb->tstamp = 0;
7410+
skb->mono_delivery_time = 0;
7411+
break;
7412+
default:
7413+
/* Every other type, including BPF_SKB_DELIVERY_TIME_UNSPEC. */
return -EOPNOTSUPP;
7414+
}
7415+
7416+
return 0;
7417+
}
7418+
7419+
/* Prototype of the bpf_skb_set_delivery_time() helper: takes the program
 * context as the first argument followed by two ARG_ANYTHING arguments
 * (dtime and dtime_type), returns an integer, and is not marked gpl_only.
 */
static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = {
7420+
.func = bpf_skb_set_delivery_time,
7421+
.gpl_only = false,
7422+
.ret_type = RET_INTEGER,
7423+
.arg1_type = ARG_PTR_TO_CTX,
7424+
.arg2_type = ARG_ANYTHING,
7425+
.arg3_type = ARG_ANYTHING,
7426+
};
7427+
73917428
#endif /* CONFIG_INET */
73927429

73937430
bool bpf_helper_changes_pkt_data(void *func)
@@ -7749,6 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
77497786
return &bpf_tcp_gen_syncookie_proto;
77507787
case BPF_FUNC_sk_assign:
77517788
return &bpf_sk_assign_proto;
7789+
case BPF_FUNC_skb_set_delivery_time:
7790+
return &bpf_skb_set_delivery_time_proto;
77527791
#endif
77537792
default:
77547793
return bpf_sk_base_func_proto(func_id);
@@ -8088,7 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
80888127
return false;
80898128
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
80908129
break;
8091-
case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1:
8130+
case offsetof(struct __sk_buff, delivery_time_type):
8131+
return false;
8132+
case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
80928133
/* Explicitly prohibit access to padding in __sk_buff. */
80938134
return false;
80948135
default:
@@ -8443,6 +8484,15 @@ static bool tc_cls_act_is_valid_access(int off, int size,
84438484
break;
84448485
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
84458486
return false;
8487+
case offsetof(struct __sk_buff, delivery_time_type):
8488+
/* The convert_ctx_access() on reading and writing
8489+
* __sk_buff->tstamp depends on whether the bpf prog
8490+
* has used __sk_buff->delivery_time_type or not.
8491+
* Thus, we need to set prog->delivery_time_access
8492+
* earlier during is_valid_access() here.
8493+
*/
8494+
((struct bpf_prog *)prog)->delivery_time_access = 1;
8495+
return size == sizeof(__u8);
84468496
}
84478497

84488498
return bpf_skb_is_valid_access(off, size, type, prog, info);
@@ -8838,6 +8888,45 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
88388888
return insn - insn_buf;
88398889
}
88408890

8891+
/* Emit the insns that compute __sk_buff->delivery_time_type for a read.
 *
 * @si:   the ctx-access insn being rewritten; si->src_reg is used as the
 *        skb pointer and si->dst_reg receives the resulting type.
 * @insn: where to emit; the pointer just past the last emitted insn is
 *        returned.
 *
 * BPF_REG_AX is used as a scratch register. The emitted sequence is:
 *
 *   if (skb->mono_delivery_time)         value = BPF_SKB_DELIVERY_TIME_MONO
 *   else if (!skb->tstamp)               value = BPF_SKB_DELIVERY_TIME_NONE
 *   else if (at tc ingress)              value = 0
 *                                        (CONFIG_NET_CLS_ACT only)
 *   else                                 value = BPF_SKB_DELIVERY_TIME_UNSPEC
 *
 * Every jump offset below is relative to this fixed layout, so adding or
 * removing an insn requires re-counting all of them.
 */
static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si,
8892+
struct bpf_insn *insn)
8893+
{
8894+
__u8 value_reg = si->dst_reg;
8895+
__u8 skb_reg = si->src_reg;
8896+
__u8 tmp_reg = BPF_REG_AX;
8897+
8898+
/* insn 0-2: test the skb->mono_delivery_time bit; when it is clear,
 * skip the two MONO insns below.
 */
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
8899+
SKB_MONO_DELIVERY_TIME_OFFSET);
8900+
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
8901+
SKB_MONO_DELIVERY_TIME_MASK);
8902+
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
8903+
/* value_reg = BPF_SKB_DELIVERY_TIME_MONO */
8904+
*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO);
8905+
/* insn 4: done, jump over all remaining insns to the end. */
*insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5);
8906+
8907+
/* insn 5-6: mono bit is clear. Load the 64-bit skb->tstamp; when it
 * is non-zero, skip the two NONE insns below.
 */
*insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg,
8908+
offsetof(struct sk_buff, tstamp));
8909+
*insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2);
8910+
/* value_reg = BPF_SKB_DELIVERY_TIME_NONE */
8911+
*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE);
8912+
/* insn 8: done, jump over all remaining insns to the end. */
*insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1);
8913+
8914+
#ifdef CONFIG_NET_CLS_ACT
8915+
/* insn 9-11: tstamp is non-zero and not mono. Test the tc_at_ingress
 * bit; when it is clear (egress), skip to the UNSPEC insn.
 */
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
8916+
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
8917+
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
8918+
/* At ingress, value_reg = 0 (BPF_SKB_DELIVERY_TIME_NONE): a non-mono
 * tstamp at ingress is reported as not being a delivery time.
 */
8919+
*insn++ = BPF_MOV32_IMM(value_reg, 0);
8920+
/* insn 13: skip the UNSPEC insn. */
*insn++ = BPF_JMP_A(1);
8921+
#endif
8922+
8923+
/* value_reg = BPF_SKB_DELIVERY_TIME_UNSPEC */
8924+
*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC);
8925+
8926+
/* 15 insns with CONFIG_NET_CLS_ACT, 10 insns without */
8927+
return insn;
8928+
}
8929+
88418930
static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
88428931
struct bpf_insn *insn)
88438932
{
@@ -8859,57 +8948,63 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
88598948
return insn;
88608949
}
88618950

8862-
static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si,
8951+
static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
8952+
const struct bpf_insn *si,
88638953
struct bpf_insn *insn)
88648954
{
88658955
__u8 value_reg = si->dst_reg;
88668956
__u8 skb_reg = si->src_reg;
88678957

88688958
#ifdef CONFIG_NET_CLS_ACT
8869-
__u8 tmp_reg = BPF_REG_AX;
8870-
8871-
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
8872-
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
8873-
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5);
8874-
/* @ingress, read __sk_buff->tstamp as the (rcv) timestamp,
8875-
* so check the skb->mono_delivery_time.
8876-
*/
8877-
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
8878-
SKB_MONO_DELIVERY_TIME_OFFSET);
8879-
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
8880-
SKB_MONO_DELIVERY_TIME_MASK);
8881-
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
8882-
/* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */
8883-
*insn++ = BPF_MOV64_IMM(value_reg, 0);
8884-
*insn++ = BPF_JMP_A(1);
8959+
if (!prog->delivery_time_access) {
8960+
__u8 tmp_reg = BPF_REG_AX;
8961+
8962+
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
8963+
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
8964+
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5);
8965+
/* @ingress, read __sk_buff->tstamp as the (rcv) timestamp,
8966+
* so check the skb->mono_delivery_time.
8967+
*/
8968+
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
8969+
SKB_MONO_DELIVERY_TIME_OFFSET);
8970+
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
8971+
SKB_MONO_DELIVERY_TIME_MASK);
8972+
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
8973+
/* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */
8974+
*insn++ = BPF_MOV64_IMM(value_reg, 0);
8975+
*insn++ = BPF_JMP_A(1);
8976+
}
88858977
#endif
88868978

88878979
*insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
88888980
offsetof(struct sk_buff, tstamp));
88898981
return insn;
88908982
}
88918983

8892-
static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_insn *si,
8984+
static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
8985+
const struct bpf_insn *si,
88938986
struct bpf_insn *insn)
88948987
{
88958988
__u8 value_reg = si->src_reg;
88968989
__u8 skb_reg = si->dst_reg;
88978990

88988991
#ifdef CONFIG_NET_CLS_ACT
8899-
__u8 tmp_reg = BPF_REG_AX;
8900-
8901-
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
8902-
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
8903-
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3);
8904-
/* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp.
8905-
* Clear the skb->mono_delivery_time.
8906-
*/
8907-
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
8908-
SKB_MONO_DELIVERY_TIME_OFFSET);
8909-
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
8910-
~SKB_MONO_DELIVERY_TIME_MASK);
8911-
*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg,
8912-
SKB_MONO_DELIVERY_TIME_OFFSET);
8992+
if (!prog->delivery_time_access) {
8993+
__u8 tmp_reg = BPF_REG_AX;
8994+
8995+
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
8996+
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
8997+
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3);
8998+
/* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp.
8999+
* Clear the skb->mono_delivery_time.
9000+
*/
9001+
*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
9002+
SKB_MONO_DELIVERY_TIME_OFFSET);
9003+
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
9004+
~SKB_MONO_DELIVERY_TIME_MASK);
9005+
*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg,
9006+
SKB_MONO_DELIVERY_TIME_OFFSET);
9007+
}
89139008
#endif
89149009

89159010
/* skb->tstamp = tstamp */
@@ -9226,9 +9321,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
92269321
BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
92279322

92289323
if (type == BPF_WRITE)
9229-
insn = bpf_convert_tstamp_write(si, insn);
9324+
insn = bpf_convert_tstamp_write(prog, si, insn);
92309325
else
9231-
insn = bpf_convert_tstamp_read(si, insn);
9326+
insn = bpf_convert_tstamp_read(prog, si, insn);
9327+
break;
9328+
9329+
case offsetof(struct __sk_buff, delivery_time_type):
9330+
insn = bpf_convert_dtime_type_read(si, insn);
92329331
break;
92339332

92349333
case offsetof(struct __sk_buff, gso_segs):

tools/include/uapi/linux/bpf.h

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5086,6 +5086,37 @@ union bpf_attr {
50865086
* Return
50875087
* 0 on success, or a negative error in case of failure. On error
50885088
* *dst* buffer is zeroed out.
5089+
*
5090+
* long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type)
5091+
* Description
5092+
* Set a *dtime* (delivery time) to the __sk_buff->tstamp and also
5093+
* change the __sk_buff->delivery_time_type to *dtime_type*.
5094+
*
5095+
* When setting a delivery time (non zero *dtime*) to
5096+
* __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type*
5097+
* is supported. It is the only delivery_time_type that will be
5098+
* kept after bpf_redirect_*().
5099+
*
5100+
* If there is no need to change the __sk_buff->delivery_time_type,
5101+
* the delivery time can be directly written to __sk_buff->tstamp
5102+
* instead.
5103+
*
5104+
* *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE
5105+
* can be used to clear any delivery time stored in
5106+
* __sk_buff->tstamp.
5107+
*
5108+
* Only IPv4 and IPv6 skb->protocol are supported.
5109+
*
5110+
* This function is most useful when it needs to set a
5111+
* mono delivery time to __sk_buff->tstamp and then
5112+
* bpf_redirect_*() to the egress of an iface. For example,
5113+
* changing the (rcv) timestamp in __sk_buff->tstamp at
5114+
* ingress to a mono delivery time and then bpf_redirect_*()
5115+
* to sch_fq@phy-dev.
5116+
* Return
5117+
* 0 on success.
5118+
* **-EINVAL** for invalid input
5119+
* **-EOPNOTSUPP** for unsupported delivery_time_type and protocol
50895120
*/
50905121
#define __BPF_FUNC_MAPPER(FN) \
50915122
FN(unspec), \
@@ -5280,6 +5311,7 @@ union bpf_attr {
52805311
FN(xdp_load_bytes), \
52815312
FN(xdp_store_bytes), \
52825313
FN(copy_from_user_task), \
5314+
FN(skb_set_delivery_time), \
52835315
/* */
52845316

52855317
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5469,6 +5501,12 @@ union { \
54695501
__u64 :64; \
54705502
} __attribute__((aligned(8)))
54715503

5504+
/* Values reported in __sk_buff->delivery_time_type and accepted as the
 * dtime_type argument of bpf_skb_set_delivery_time().
 */
enum {
5505+
/* 0: no delivery time is stored in __sk_buff->tstamp; the field is
 * either 0 or holds the (rcv) timestamp.
 */
BPF_SKB_DELIVERY_TIME_NONE,
5506+
/* A delivery time that is not known to be in the mono clock base.
 * bpf_skb_set_delivery_time() does not accept this type.
 */
BPF_SKB_DELIVERY_TIME_UNSPEC,
5507+
/* A delivery time in the mono clock base. This is the only type
 * bpf_skb_set_delivery_time() accepts for a non-zero dtime.
 */
BPF_SKB_DELIVERY_TIME_MONO,
5508+
};
5509+
54725510
/* user accessible mirror of in-kernel sk_buff.
54735511
* new fields can only be added to the end of this structure
54745512
*/
@@ -5509,7 +5547,8 @@ struct __sk_buff {
55095547
__u32 gso_segs;
55105548
__bpf_md_ptr(struct bpf_sock *, sk);
55115549
__u32 gso_size;
5512-
__u32 :32; /* Padding, future use. */
5550+
__u8 delivery_time_type;
5551+
__u32 :24; /* Padding, future use. */
55135552
__u64 hwtstamp;
55145553
};
55155554

0 commit comments

Comments
 (0)