diff mbox series

[v4,net-next,7/8] bpf: Add __sk_buff->delivery_time_type and bpf_skb_set_delivery_time()

Message ID 20220211071316.892630-1-kafai@fb.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series Preserve mono delivery time (EDT) in skb->tstamp | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/apply fail Patch does not apply to net-next

Commit Message

Martin KaFai Lau Feb. 11, 2022, 7:13 a.m. UTC
* __sk_buff->delivery_time_type:
This patch adds __sk_buff->delivery_time_type.  It tells if the
delivery_time is stored in __sk_buff->tstamp or not.

It will be most useful for ingress to tell if the __sk_buff->tstamp
has the (rcv) timestamp or delivery_time.  If delivery_time_type
is 0 (BPF_SKB_DELIVERY_TIME_NONE), it has the (rcv) timestamp.

Two non-zero types are defined for the delivery_time_type,
BPF_SKB_DELIVERY_TIME_MONO and BPF_SKB_DELIVERY_TIME_UNSPEC.  For UNSPEC,
it can only happen in egress because only mono delivery_time can be
forwarded to ingress now.  The clock of UNSPEC delivery_time
can be deduced from the skb->sk->sk_clockid which is how
the sch_etf doing it also.

Thus, while delivery_time_type provides (rcv) timestamp
vs delivery_time signal to tc-bpf@ingress, it should not change the
existing way of doing thing for tc-bpf@egress other than spelling
out more explicitly in the new __sk_buff->delivery_time_type
instead of having the tc-bpf to deduce it by checking the sk
is tcp (mono EDT) or by checking sk->sk_clockid for non-tcp.

delivery_time_type is read only.  Its convert_ctx_access() requires
the skb's mono_delivery_time bit and tc_at_ingress bit.
They are moved up in sk_buff so that bpf rewrite can be done at a
fixed offset.  tc_skip_classify is moved together with tc_at_ingress.
To get one bit for mono_delivery_time, csum_not_inet is moved down and
this bit is currently used by sctp.

* Provide forwarded delivery_time to tc-bpf@ingress:
With the help of the new delivery_time_type, the tc-bpf has a way
to tell if the __sk_buff->tstamp has the (rcv) timestamp or
the delivery_time.  During bpf load time, the verifier will learn if
the bpf prog has accessed the new __sk_buff->delivery_time_type.
If it does, it means the tc-bpf@ingress is expecting the
skb->tstamp could have the delivery_time.  The kernel will then keep
the forwarded delivery_time in skb->tstamp.  This is done by adding a
new prog->delivery_time_access bit.

Since the tc-bpf@ingress can access the delivery_time,
it also needs to clear the skb->mono_delivery_time after
running the bpf if 0 has been written to skb->tstamp.  This
is the same as the tc-bpf@egress in the previous patch.

For tail call, the callee will follow the __sk_buff->tstamp
expectation of its caller at ingress.  If caller does not have
its prog->delivery_time_access set, the callee prog will not have
the forwarded delivery_time in __sk_buff->tstamp and will have
the (rcv) timestamp instead.  If needed, in the future, a new
attach_type can be added to allow the tc-bpf to explicitly specify
its expectation on the __sk_buff->tstamp.

* bpf_skb_set_delivery_time():
The bpf_skb_set_delivery_time() helper is added to allow setting both
delivery_time and the delivery_time_type at the same time.  If the
tc-bpf does not need to change the delivery_time_type, it can directly
write to the __sk_buff->tstamp as the existing tc-bpf has already been
doing.  It will be most useful at ingress to change the
__sk_buff->tstamp from the (rcv) timestamp to
a mono delivery_time and then bpf_redirect_*().

bpf only has mono clock helper (bpf_ktime_get_ns), and
the current known use case is the mono EDT for fq, and
only mono delivery time can be kept during forward now,
so bpf_skb_set_delivery_time() only supports setting
BPF_SKB_DELIVERY_TIME_MONO.  It can be extended later when use cases
come up and the forwarding path also supports other clock bases.
This function could be inline and is left as a future exercise.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/linux/filter.h         |  7 ++-
 include/linux/skbuff.h         | 20 ++++++---
 include/uapi/linux/bpf.h       | 35 ++++++++++++++-
 net/core/filter.c              | 79 +++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
 5 files changed, 164 insertions(+), 12 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index e43e1701a80e..00bbde352ad0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -572,7 +572,8 @@  struct bpf_prog {
 				has_callchain_buf:1, /* callchain buffer allocated? */
 				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
 				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
-				call_get_func_ip:1; /* Do we call get_func_ip() */
+				call_get_func_ip:1, /* Do we call get_func_ip() */
+				delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
@@ -705,7 +706,7 @@  static __always_inline u32 bpf_prog_run_at_ingress(const struct bpf_prog *prog,
 	ktime_t tstamp, saved_mono_dtime = 0;
 	int filter_res;
 
-	if (unlikely(skb->mono_delivery_time)) {
+	if (unlikely(skb->mono_delivery_time) && !prog->delivery_time_access) {
 		saved_mono_dtime = skb->tstamp;
 		skb->mono_delivery_time = 0;
 		if (static_branch_unlikely(&netstamp_needed_key))
@@ -723,6 +724,8 @@  static __always_inline u32 bpf_prog_run_at_ingress(const struct bpf_prog *prog,
 	/* __sk_buff->tstamp was not changed, restore the delivery_time */
 	if (unlikely(saved_mono_dtime) && skb_tstamp(skb) == tstamp)
 		skb_set_delivery_time(skb, saved_mono_dtime, true);
+	if (unlikely(skb->mono_delivery_time && !skb->tstamp))
+		skb->mono_delivery_time = 0;
 
 	return filter_res;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0e09e75fa787..fb7146be48f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -893,22 +893,23 @@  struct sk_buff {
 	__u8			vlan_present:1;	/* See PKT_VLAN_PRESENT_BIT */
 	__u8			csum_complete_sw:1;
 	__u8			csum_level:2;
-	__u8			csum_not_inet:1;
 	__u8			dst_pending_confirm:1;
+	__u8			mono_delivery_time:1;
+
+#ifdef CONFIG_NET_CLS_ACT
+	__u8			tc_skip_classify:1;
+	__u8			tc_at_ingress:1;
+#endif
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
-
+	__u8			csum_not_inet:1;
 	__u8			ipvs_property:1;
 	__u8			inner_protocol_type:1;
 	__u8			remcsum_offload:1;
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 	__u8			offload_l3_fwd_mark:1;
-#endif
-#ifdef CONFIG_NET_CLS_ACT
-	__u8			tc_skip_classify:1;
-	__u8			tc_at_ingress:1;
 #endif
 	__u8			redirected:1;
 #ifdef CONFIG_NET_REDIRECT
@@ -921,7 +922,6 @@  struct sk_buff {
 	__u8			decrypted:1;
 #endif
 	__u8			slow_gro:1;
-	__u8			mono_delivery_time:1;
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
@@ -999,10 +999,16 @@  struct sk_buff {
 /* if you move pkt_vlan_present around you also must adapt these constants */
 #ifdef __BIG_ENDIAN_BITFIELD
 #define PKT_VLAN_PRESENT_BIT	7
+#define TC_AT_INGRESS_MASK		(1 << 0)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 2)
 #else
 #define PKT_VLAN_PRESENT_BIT	0
+#define TC_AT_INGRESS_MASK		(1 << 7)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
 #endif
 #define PKT_VLAN_PRESENT_OFFSET	offsetof(struct sk_buff, __pkt_vlan_present_offset)
+#define TC_AT_INGRESS_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset)
+#define SKB_MONO_DELIVERY_TIME_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset)
 
 #ifdef __KERNEL__
 /*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 16a7574292a5..b36771b1bfa1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5076,6 +5076,31 @@  union bpf_attr {
  *		associated to *xdp_md*, at *offset*.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type)
+ *	Description
+ *		Set a *dtime* (delivery time) to the __sk_buff->tstamp and also
+ *		change the __sk_buff->delivery_time_type to *dtime_type*.
+ *
+ *		Only BPF_SKB_DELIVERY_TIME_MONO is supported in *dtime_type*
+ *		and it is the only delivery_time_type that will be kept
+ *		after bpf_redirect_*().
+ *		Only ipv4 and ipv6 skb->protocol is supported.
+ *
+ *		If there is no need to change the __sk_buff->delivery_time_type,
+ *		the delivery_time can be directly written to __sk_buff->tstamp
+ *		instead.
+ *
+ *		This function is most useful when it needs to set a
+ *		mono delivery_time to __sk_buff->tstamp and then
+ *		bpf_redirect_*() to the egress of an iface.  For example,
+ *		changing the (rcv) timestamp in __sk_buff->tstamp at
+ *		ingress to a mono delivery time and then bpf_redirect_*()
+ *		to sch_fq@phy-dev.
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** for invalid input
+ *		**-EOPNOTSUPP** for unsupported delivery_time_type and protocol
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5269,6 +5294,7 @@  union bpf_attr {
 	FN(xdp_get_buff_len),		\
 	FN(xdp_load_bytes),		\
 	FN(xdp_store_bytes),		\
+	FN(skb_set_delivery_time),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5458,6 +5484,12 @@  union {					\
 	__u64 :64;			\
 } __attribute__((aligned(8)))
 
+enum {
+	BPF_SKB_DELIVERY_TIME_NONE,
+	BPF_SKB_DELIVERY_TIME_UNSPEC,
+	BPF_SKB_DELIVERY_TIME_MONO,
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
@@ -5498,7 +5530,8 @@  struct __sk_buff {
 	__u32 gso_segs;
 	__bpf_md_ptr(struct bpf_sock *, sk);
 	__u32 gso_size;
-	__u32 :32;		/* Padding, future use. */
+	__u8  delivery_time_type;
+	__u32 :24;		/* Padding, future use. */
 	__u64 hwtstamp;
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index a2d712be4985..0e79a6ca4a95 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7159,6 +7159,33 @@  static const struct bpf_func_proto bpf_sk_assign_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb,
+	   u64, dtime, u32, dtime_type)
+{
+	if (!dtime)
+		return -EINVAL;
+
+	/* skb_clear_delivery_time() is done for inet protocol */
+	if (dtime_type != BPF_SKB_DELIVERY_TIME_MONO ||
+	    (skb->protocol != htons(ETH_P_IP) &&
+	     skb->protocol != htons(ETH_P_IPV6)))
+		return -EOPNOTSUPP;
+
+	skb->mono_delivery_time = 1;
+	skb->tstamp = dtime;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = {
+	.func		= bpf_skb_set_delivery_time,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
 				    u8 search_kind, const u8 *magic,
 				    u8 magic_len, bool *eol)
@@ -7746,6 +7773,8 @@  tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_tcp_gen_syncookie_proto;
 	case BPF_FUNC_sk_assign:
 		return &bpf_sk_assign_proto;
+	case BPF_FUNC_skb_set_delivery_time:
+		return &bpf_skb_set_delivery_time_proto;
 #endif
 	default:
 		return bpf_sk_base_func_proto(func_id);
@@ -8085,7 +8114,9 @@  static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 			return false;
 		info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
 		break;
-	case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1:
+	case offsetof(struct __sk_buff, delivery_time_type):
+		return false;
+	case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
 		/* Explicitly prohibit access to padding in __sk_buff. */
 		return false;
 	default:
@@ -8432,6 +8463,8 @@  static bool tc_cls_act_is_valid_access(int off, int size,
 		break;
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
+	case offsetof(struct __sk_buff, delivery_time_type):
+		return size == sizeof(__u8);
 	}
 
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
@@ -8848,6 +8881,45 @@  static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
 	return insn;
 }
 
+static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si,
+						    struct bpf_insn *insn)
+{
+	__u8 value_reg = si->dst_reg;
+	__u8 skb_reg = si->src_reg;
+	__u8 tmp_reg = BPF_REG_AX;
+
+	*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+			      SKB_MONO_DELIVERY_TIME_OFFSET);
+	*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+				SKB_MONO_DELIVERY_TIME_MASK);
+	*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
+	/* value_reg = BPF_SKB_DELIVERY_TIME_MONO */
+	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO);
+	*insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5);
+
+	*insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg,
+			      offsetof(struct sk_buff, tstamp));
+	*insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2);
+	/* value_reg = BPF_SKB_DELIVERY_TIME_NONE */
+	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE);
+	*insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1);
+
+#ifdef CONFIG_NET_CLS_ACT
+	*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET);
+	*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK);
+	*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2);
+	/* At ingress, value_reg = 0 */
+	*insn++ = BPF_MOV32_IMM(value_reg, 0);
+	*insn++ = BPF_JMP_A(1);
+#endif
+
+	/* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */
+	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC);
+
+	/* 15 insns with CONFIG_NET_CLS_ACT */
+	return insn;
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -9169,6 +9241,11 @@  static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 							     target_size));
 		break;
 
+	case offsetof(struct __sk_buff, delivery_time_type):
+		insn = bpf_convert_dtime_type_read(si, insn);
+		prog->delivery_time_access = 1;
+		break;
+
 	case offsetof(struct __sk_buff, gso_segs):
 		insn = bpf_convert_shinfo_access(si, insn);
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 16a7574292a5..b36771b1bfa1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5076,6 +5076,31 @@  union bpf_attr {
  *		associated to *xdp_md*, at *offset*.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type)
+ *	Description
+ *		Set a *dtime* (delivery time) to the __sk_buff->tstamp and also
+ *		change the __sk_buff->delivery_time_type to *dtime_type*.
+ *
+ *		Only BPF_SKB_DELIVERY_TIME_MONO is supported in *dtime_type*
+ *		and it is the only delivery_time_type that will be kept
+ *		after bpf_redirect_*().
+ *		Only ipv4 and ipv6 skb->protocol is supported.
+ *
+ *		If there is no need to change the __sk_buff->delivery_time_type,
+ *		the delivery_time can be directly written to __sk_buff->tstamp
+ *		instead.
+ *
+ *		This function is most useful when it needs to set a
+ *		mono delivery_time to __sk_buff->tstamp and then
+ *		bpf_redirect_*() to the egress of an iface.  For example,
+ *		changing the (rcv) timestamp in __sk_buff->tstamp at
+ *		ingress to a mono delivery time and then bpf_redirect_*()
+ *		to sch_fq@phy-dev.
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** for invalid input
+ *		**-EOPNOTSUPP** for unsupported delivery_time_type and protocol
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5269,6 +5294,7 @@  union bpf_attr {
 	FN(xdp_get_buff_len),		\
 	FN(xdp_load_bytes),		\
 	FN(xdp_store_bytes),		\
+	FN(skb_set_delivery_time),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5458,6 +5484,12 @@  union {					\
 	__u64 :64;			\
 } __attribute__((aligned(8)))
 
+enum {
+	BPF_SKB_DELIVERY_TIME_NONE,
+	BPF_SKB_DELIVERY_TIME_UNSPEC,
+	BPF_SKB_DELIVERY_TIME_MONO,
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
@@ -5498,7 +5530,8 @@  struct __sk_buff {
 	__u32 gso_segs;
 	__bpf_md_ptr(struct bpf_sock *, sk);
 	__u32 gso_size;
-	__u32 :32;		/* Padding, future use. */
+	__u8  delivery_time_type;
+	__u32 :24;		/* Padding, future use. */
 	__u64 hwtstamp;
 };