@@ -572,7 +572,8 @@ struct bpf_prog {
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
- call_get_func_ip:1; /* Do we call get_func_ip() */
+ call_get_func_ip:1, /* Do we call get_func_ip() */
+ delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
@@ -5086,6 +5086,37 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure. On error
* *dst* buffer is zeroed out.
+ *
+ * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type)
+ * Description
+ * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also
+ * change the __sk_buff->delivery_time_type to *dtime_type*.
+ *
+ * When setting a delivery time (non zero *dtime*) to
+ * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type*
+ * is supported. It is the only delivery_time_type that will be
+ * kept after bpf_redirect_*().
+ *
+ * If there is no need to change the __sk_buff->delivery_time_type,
+ * the delivery time can be directly written to __sk_buff->tstamp
+ * instead.
+ *
+ * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE
+ * can be used to clear any delivery time stored in
+ * __sk_buff->tstamp.
+ *
+ * Only IPv4 and IPv6 skb->protocol are supported.
+ *
+ * This function is most useful when it needs to set a
+ * mono delivery time to __sk_buff->tstamp and then
+ * bpf_redirect_*() to the egress of an iface. For example,
+ * changing the (rcv) timestamp in __sk_buff->tstamp at
+ * ingress to a mono delivery time and then bpf_redirect_*()
+ * to sch_fq@phy-dev.
+ * Return
+ * 0 on success.
+ * **-EINVAL** for invalid input
+ * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5280,6 +5311,7 @@ union bpf_attr {
FN(xdp_load_bytes), \
FN(xdp_store_bytes), \
FN(copy_from_user_task), \
+ FN(skb_set_delivery_time), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5469,6 +5501,12 @@ union { \
__u64 :64; \
} __attribute__((aligned(8)))
+enum {
+ BPF_SKB_DELIVERY_TIME_NONE, /* no delivery time: tstamp is 0 or has the (rcv) timestamp */
+ BPF_SKB_DELIVERY_TIME_UNSPEC, /* delivery time, clock deduced from skb->sk->sk_clockid (egress only) */
+ BPF_SKB_DELIVERY_TIME_MONO, /* mono delivery time */
+};
+
/* user accessible mirror of in-kernel sk_buff.
* new fields can only be added to the end of this structure
*/
@@ -5509,7 +5547,8 @@ struct __sk_buff {
__u32 gso_segs;
__bpf_md_ptr(struct bpf_sock *, sk);
__u32 gso_size;
- __u32 :32; /* Padding, future use. */
+ __u8 delivery_time_type;
+ __u32 :24; /* Padding, future use. */
__u64 hwtstamp;
};
@@ -7388,6 +7388,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb,
+ u64, dtime, u32, dtime_type)
+{
+ /* Only IPv4/IPv6: skb_clear_delivery_time() is done for inet protocol */
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ return -EOPNOTSUPP;
+
+ switch (dtime_type) {
+ case BPF_SKB_DELIVERY_TIME_MONO:
+ if (!dtime)
+ return -EINVAL; /* a mono delivery time must be non zero */
+ skb->tstamp = dtime;
+ skb->mono_delivery_time = 1;
+ break;
+ case BPF_SKB_DELIVERY_TIME_NONE:
+ if (dtime)
+ return -EINVAL; /* NONE only clears: dtime must be 0 */
+ skb->tstamp = 0;
+ skb->mono_delivery_time = 0;
+ break;
+ default: /* any other type, e.g. BPF_SKB_DELIVERY_TIME_UNSPEC */
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = {
+ .func = bpf_skb_set_delivery_time,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX, /* skb */
+ .arg2_type = ARG_ANYTHING, /* dtime */
+ .arg3_type = ARG_ANYTHING, /* dtime_type */
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -7749,6 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_tcp_gen_syncookie_proto;
case BPF_FUNC_sk_assign:
return &bpf_sk_assign_proto;
+ case BPF_FUNC_skb_set_delivery_time:
+ return &bpf_skb_set_delivery_time_proto;
#endif
default:
return bpf_sk_base_func_proto(func_id);
@@ -8088,7 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
return false;
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
break;
- case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1:
+ case offsetof(struct __sk_buff, delivery_time_type):
+ return false;
+ case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
/* Explicitly prohibit access to padding in __sk_buff. */
return false;
default:
@@ -8443,6 +8484,15 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
+ case offsetof(struct __sk_buff, delivery_time_type):
+ /* The convert_ctx_access() on reading and writing
+ * __sk_buff->tstamp depends on whether the bpf prog
+ * has used __sk_buff->delivery_time_type or not.
+ * Thus, we need to set prog->delivery_time_access
+ * earlier during is_valid_access() here.
+ */
+ ((struct bpf_prog *)prog)->delivery_time_access = 1;
+ return size == sizeof(__u8);
}
return bpf_skb_is_valid_access(off, size, type, prog, info);
@@ -8838,6 +8888,45 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg; /* receives the delivery_time_type */
+ __u8 skb_reg = si->src_reg; /* base register for the sk_buff loads */
+ __u8 tmp_reg = BPF_REG_AX; /* scratch for the bit tests */
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+ SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); /* mono bit clear: go check tstamp */
+ /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO);
+ *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); /* jump to the end */
+
+ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg,
+ offsetof(struct sk_buff, tstamp));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); /* tstamp != 0: skip the NONE case */
+ /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE);
+ *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); /* jump to the end */
+
+#ifdef CONFIG_NET_CLS_ACT
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); /* not at ingress: UNSPEC */
+ /* At ingress, value_reg = BPF_SKB_DELIVERY_TIME_NONE (0) */
+ *insn++ = BPF_MOV32_IMM(value_reg, 0);
+ *insn++ = BPF_JMP_A(1);
+#endif
+
+ /* value_reg = BPF_SKB_DELIVERY_TIME_UNSPEC */
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC);
+
+ /* 15 insns with CONFIG_NET_CLS_ACT, 10 without */
+ return insn;
+}
+
static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
struct bpf_insn *insn)
{
@@ -8859,29 +8948,32 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
return insn;
}
-static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si,
+static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
struct bpf_insn *insn)
{
__u8 value_reg = si->dst_reg;
__u8 skb_reg = si->src_reg;
#ifdef CONFIG_NET_CLS_ACT
- __u8 tmp_reg = BPF_REG_AX;
-
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
- *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5);
- /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp,
- * so check the skb->mono_delivery_time.
- */
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
- SKB_MONO_DELIVERY_TIME_OFFSET);
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
- SKB_MONO_DELIVERY_TIME_MASK);
- *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
- /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */
- *insn++ = BPF_MOV64_IMM(value_reg, 0);
- *insn++ = BPF_JMP_A(1);
+ if (!prog->delivery_time_access) {
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5);
+ /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp,
+ * so check the skb->mono_delivery_time.
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+ SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
+ /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */
+ *insn++ = BPF_MOV64_IMM(value_reg, 0);
+ *insn++ = BPF_JMP_A(1);
+ }
#endif
*insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
@@ -8889,27 +8981,30 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si,
return insn;
}
-static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_insn *si,
+static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
struct bpf_insn *insn)
{
__u8 value_reg = si->src_reg;
__u8 skb_reg = si->dst_reg;
#ifdef CONFIG_NET_CLS_ACT
- __u8 tmp_reg = BPF_REG_AX;
-
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
- *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3);
- /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp.
- * Clear the skb->mono_delivery_time.
- */
- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
- SKB_MONO_DELIVERY_TIME_OFFSET);
- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
- ~SKB_MONO_DELIVERY_TIME_MASK);
- *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg,
- SKB_MONO_DELIVERY_TIME_OFFSET);
+ if (!prog->delivery_time_access) {
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3);
+ /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp.
+ * Clear the skb->mono_delivery_time.
+ */
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+ ~SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg,
+ SKB_MONO_DELIVERY_TIME_OFFSET);
+ }
#endif
/* skb->tstamp = tstamp */
@@ -9226,9 +9321,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
if (type == BPF_WRITE)
- insn = bpf_convert_tstamp_write(si, insn);
+ insn = bpf_convert_tstamp_write(prog, si, insn);
else
- insn = bpf_convert_tstamp_read(si, insn);
+ insn = bpf_convert_tstamp_read(prog, si, insn);
+ break;
+
+ case offsetof(struct __sk_buff, delivery_time_type):
+ insn = bpf_convert_dtime_type_read(si, insn);
break;
case offsetof(struct __sk_buff, gso_segs):
@@ -5086,6 +5086,37 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure. On error
* *dst* buffer is zeroed out.
+ *
+ * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type)
+ * Description
+ * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also
+ * change the __sk_buff->delivery_time_type to *dtime_type*.
+ *
+ * When setting a delivery time (non zero *dtime*) to
+ * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type*
+ * is supported. It is the only delivery_time_type that will be
+ * kept after bpf_redirect_*().
+ *
+ * If there is no need to change the __sk_buff->delivery_time_type,
+ * the delivery time can be directly written to __sk_buff->tstamp
+ * instead.
+ *
+ * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE
+ * can be used to clear any delivery time stored in
+ * __sk_buff->tstamp.
+ *
+ * Only IPv4 and IPv6 skb->protocol are supported.
+ *
+ * This function is most useful when it needs to set a
+ * mono delivery time to __sk_buff->tstamp and then
+ * bpf_redirect_*() to the egress of an iface. For example,
+ * changing the (rcv) timestamp in __sk_buff->tstamp at
+ * ingress to a mono delivery time and then bpf_redirect_*()
+ * to sch_fq@phy-dev.
+ * Return
+ * 0 on success.
+ * **-EINVAL** for invalid input
+ * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -5280,6 +5311,7 @@ union bpf_attr {
FN(xdp_load_bytes), \
FN(xdp_store_bytes), \
FN(copy_from_user_task), \
+ FN(skb_set_delivery_time), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5469,6 +5501,12 @@ union { \
__u64 :64; \
} __attribute__((aligned(8)))
+enum {
+ BPF_SKB_DELIVERY_TIME_NONE, /* no delivery time: tstamp is 0 or has the (rcv) timestamp */
+ BPF_SKB_DELIVERY_TIME_UNSPEC, /* delivery time, clock deduced from skb->sk->sk_clockid (egress only) */
+ BPF_SKB_DELIVERY_TIME_MONO, /* mono delivery time */
+};
+
/* user accessible mirror of in-kernel sk_buff.
* new fields can only be added to the end of this structure
*/
@@ -5509,7 +5547,8 @@ struct __sk_buff {
__u32 gso_segs;
__bpf_md_ptr(struct bpf_sock *, sk);
__u32 gso_size;
- __u32 :32; /* Padding, future use. */
+ __u8 delivery_time_type;
+ __u32 :24; /* Padding, future use. */
__u64 hwtstamp;
};
* __sk_buff->delivery_time_type: This patch adds __sk_buff->delivery_time_type. It tells whether the delivery_time is stored in __sk_buff->tstamp or not. It will be most useful for ingress to tell if the __sk_buff->tstamp has the (rcv) timestamp or delivery_time. If delivery_time_type is 0 (BPF_SKB_DELIVERY_TIME_NONE), it has the (rcv) timestamp. Two non-zero types are defined for the delivery_time_type, BPF_SKB_DELIVERY_TIME_MONO and BPF_SKB_DELIVERY_TIME_UNSPEC. UNSPEC can only happen at egress because only mono delivery_time can be forwarded to ingress now. The clock of UNSPEC delivery_time can be deduced from the skb->sk->sk_clockid, which is also how sch_etf does it. * Provide forwarded delivery_time to tc-bpf@ingress: With the help of the new delivery_time_type, the tc-bpf has a way to tell if the __sk_buff->tstamp has the (rcv) timestamp or the delivery_time. During bpf load time, the verifier will learn if the bpf prog has accessed the new __sk_buff->delivery_time_type. If it has, it means the tc-bpf@ingress is expecting that the skb->tstamp could have the delivery_time. The kernel will then read the skb->tstamp as-is during bpf insn rewrite without checking the skb->mono_delivery_time. This is done by adding a new prog->delivery_time_access bit. The same goes for writing skb->tstamp. * bpf_skb_set_delivery_time(): The bpf_skb_set_delivery_time() helper is added to allow setting both delivery_time and the delivery_time_type at the same time. If the tc-bpf does not need to change the delivery_time_type, it can directly write to the __sk_buff->tstamp as the existing tc-bpf has already been doing. It will be most useful at ingress to change the __sk_buff->tstamp from the (rcv) timestamp to a mono delivery_time and then bpf_redirect_*(). 
bpf only has a mono clock helper (bpf_ktime_get_ns()), the only currently known use case is the mono EDT for fq, and only a mono delivery time can be kept during forwarding now, so bpf_skb_set_delivery_time() only supports setting BPF_SKB_DELIVERY_TIME_MONO. It can be extended later when use cases come up and the forwarding path also supports other clock bases. Signed-off-by: Martin KaFai Lau <kafai@fb.com> --- include/linux/filter.h | 3 +- include/uapi/linux/bpf.h | 41 +++++++- net/core/filter.c | 169 ++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 41 +++++++- 4 files changed, 216 insertions(+), 38 deletions(-)