diff mbox series

[RFC,bpf-next,v2,06/14] xdp: Carry over xdp metadata into skb context

Message ID 20221104032532.1615099-7-sdf@google.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series xdp: hints via kfuncs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 pending Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain }}
bpf/vmtest-bpf-next-VM_Test-2 pending Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 fail Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-7 success Logs for set-matrix
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline fail Detected static functions without inline keyword in header files: 1
netdev/build_32bit fail Errors and warnings before: 5338 this patch: 868
netdev/cc_maintainers warning 5 maintainers not CCed: hawk@kernel.org pabeni@redhat.com davem@davemloft.net edumazet@google.com imagedong@tencent.com
netdev/build_clang fail Errors and warnings before: 1123 this patch: 505
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 5520 this patch: 901
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Please use a blank line after function/struct/union/enum declarations WARNING: braces {} are not necessary for single statement blocks WARNING: line length of 106 exceeds 80 columns WARNING: line length of 117 exceeds 80 columns WARNING: line length of 81 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: networking block comments don't use an empty /* line, use /* Comment...
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Stanislav Fomichev Nov. 4, 2022, 3:25 a.m. UTC
Implement new bpf_xdp_metadata_export_to_skb kfunc which
prepares compatible xdp metadata for kernel consumption.
This kfunc should be called prior to bpf_redirect
or (unless already called) when XDP_PASS'ing the frame
into the kernel.

The implementation currently maintains xdp_to_skb_metadata
layout by calling bpf_xdp_metadata_rx_timestamp and placing
small magic number. From skb_metdata_set, when we get expected magic number,
we interpret metadata accordingly.

Both magic number and struct layout are randomized to make sure
it doesn't leak into the userspace.

skb_metadata_set is amended with skb_metadata_import_from_xdp which
tries to parse out the metadata and put it into skb.

See the comment for r1 vs r2/r3/r4/r5 conventions.

Cc: John Fastabend <john.fastabend@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Anatoly Burakov <anatoly.burakov@intel.com>
Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
Cc: Maryam Tahhan <mtahhan@redhat.com>
Cc: xdp-hints@xdp-project.net
Cc: netdev@vger.kernel.org
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 drivers/net/veth.c        |   4 +-
 include/linux/bpf_patch.h |   2 +
 include/linux/skbuff.h    |   4 ++
 include/net/xdp.h         |  13 +++++
 kernel/bpf/bpf_patch.c    |  30 +++++++++++
 kernel/bpf/verifier.c     |  18 +++++++
 net/core/skbuff.c         |  25 +++++++++
 net/core/xdp.c            | 104 +++++++++++++++++++++++++++++++++++---
 8 files changed, 193 insertions(+), 7 deletions(-)

Comments

Martin KaFai Lau Nov. 7, 2022, 10:01 p.m. UTC | #1
On 11/3/22 8:25 PM, Stanislav Fomichev wrote:
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 59c9fd55699d..dba857f212d7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
>   	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
>   }
>   
> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len);
> +
>   static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
>   {
>   	skb_shinfo(skb)->meta_len = meta_len;
> +	if (meta_len)
> +		skb_metadata_import_from_xdp(skb, meta_len);
>   }
>
[ ... ]

> +struct xdp_to_skb_metadata {
> +	u32 magic; /* xdp_metadata_magic */
> +	u64 rx_timestamp;
> +} __randomize_layout;
> +
> +struct bpf_patch;
> +

[ ... ]

> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len)
> +{
> +	struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len);
> +
> +	/* Optional SKB info, currently missing:
> +	 * - HW checksum info		(skb->ip_summed)
> +	 * - HW RX hash			(skb_set_hash)
> +	 * - RX ring dev queue index	(skb_record_rx_queue)
> +	 */
> +
> +	if (len != sizeof(struct xdp_to_skb_metadata))
> +		return;
> +
> +	if (meta->magic != xdp_metadata_magic)
> +		return;
> +
> +	if (meta->rx_timestamp) {
> +		*skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){
> +			.hwtstamp = ns_to_ktime(meta->rx_timestamp),
> +		};
> +	}
> +}

Considering the metadata will affect the gro, should the meta be cleared after 
importing to the skb?

[ ... ]

> +/* Since we're not actually doing a call but instead rewriting
> + * in place, we can only afford to use R0-R5 scratch registers.
> + *
> + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual
> + * metadata kfuncs use only R0,R4-R5.
> + *
> + * The above also means we _cannot_ easily call any other helper/kfunc
> + * because there is no place for us to preserve our R1 argument;
> + * existing R6-R9 belong to the callee.
> + */
> +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch)
> +{
> +	u32 func_id;
> +
> +	/*
> +	 * The code below generates the following:
> +	 *
> +	 * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx)
> +	 * {
> +	 *	struct xdp_to_skb_metadata *meta;
> +	 *	int ret;
> +	 *
> +	 *	ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta));
> +	 *	if (!ret)
> +	 *		return;
> +	 *
> +	 *	meta = ctx->data_meta;
> +	 *	meta->magic = xdp_metadata_magic;
> +	 *	meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx);
> +	 * }
> +	 *
> +	 */
> +
> +	bpf_patch_append(patch,
> +		/* r2 = ((struct xdp_buff *)r1)->data_meta; */
> +		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
> +			    offsetof(struct xdp_buff, data_meta)),
> +		/* r3 = ((struct xdp_buff *)r1)->data; */
> +		BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
> +			    offsetof(struct xdp_buff, data)),
> +		/* if (data_meta != data) return;
> +		 *
> +		 *	data_meta > data: xdp_data_meta_unsupported()
> +		 *	data_meta < data: already used, no need to touch
> +		 */
> +		BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX),
> +
> +		/* r2 -= sizeof(struct xdp_to_skb_metadata); */
> +		BPF_ALU64_IMM(BPF_SUB, BPF_REG_2,
> +			      sizeof(struct xdp_to_skb_metadata)),
> +		/* r3 = ((struct xdp_buff *)r1)->data_hard_start; */
> +		BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
> +			    offsetof(struct xdp_buff, data_hard_start)),
> +		/* r3 += sizeof(struct xdp_frame) */
> +		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3,
> +			      sizeof(struct xdp_frame)),
> +		/* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */
> +		BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX),
> +
> +		/* ((struct xdp_buff *)r1)->data_meta = r2; */
> +		BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2,
> +			    offsetof(struct xdp_buff, data_meta)),
> +
> +		/* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */
> +		BPF_ST_MEM(BPF_W, BPF_REG_2,
> +			   offsetof(struct xdp_to_skb_metadata, magic),
> +			   xdp_metadata_magic),
> +	);
> +
> +	/*	r0 = bpf_xdp_metadata_rx_timestamp(ctx); */
> +	func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP);
> +	prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch);
> +
> +	bpf_patch_append(patch,
> +		/* r2 = ((struct xdp_buff *)r1)->data_meta; */
> +		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
> +			    offsetof(struct xdp_buff, data_meta)),
> +		/* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */
> +		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0,
> +			    offsetof(struct xdp_to_skb_metadata, rx_timestamp)),

Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not 
sure it is solid enough by asking the xdp prog not to use the same random number 
in its own metadata + not to change the metadata through xdp->data_meta after 
calling bpf_xdp_metadata_export_to_skb().

Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the 
xdp_to_skb_metadata can be limited to XDP_REDIRECT only?


> +	);
> +
> +	bpf_patch_resolve_jmp(patch);
> +}
> +
>   static int __init xdp_metadata_init(void)
>   {
> +	xdp_metadata_magic = get_random_u32() | 1;
>   	return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
>   }
>   late_initcall(xdp_metadata_init);
Stanislav Fomichev Nov. 8, 2022, 9:54 p.m. UTC | #2
On Mon, Nov 7, 2022 at 2:02 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/3/22 8:25 PM, Stanislav Fomichev wrote:
> > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 59c9fd55699d..dba857f212d7 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
> >              true : __skb_metadata_differs(skb_a, skb_b, len_a);
> >   }
> >
> > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len);
> > +
> >   static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
> >   {
> >       skb_shinfo(skb)->meta_len = meta_len;
> > +     if (meta_len)
> > +             skb_metadata_import_from_xdp(skb, meta_len);
> >   }
> >
> [ ... ]
>
> > +struct xdp_to_skb_metadata {
> > +     u32 magic; /* xdp_metadata_magic */
> > +     u64 rx_timestamp;
> > +} __randomize_layout;
> > +
> > +struct bpf_patch;
> > +
>
> [ ... ]
>
> > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len)
> > +{
> > +     struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len);
> > +
> > +     /* Optional SKB info, currently missing:
> > +      * - HW checksum info           (skb->ip_summed)
> > +      * - HW RX hash                 (skb_set_hash)
> > +      * - RX ring dev queue index    (skb_record_rx_queue)
> > +      */
> > +
> > +     if (len != sizeof(struct xdp_to_skb_metadata))
> > +             return;
> > +
> > +     if (meta->magic != xdp_metadata_magic)
> > +             return;
> > +
> > +     if (meta->rx_timestamp) {
> > +             *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){
> > +                     .hwtstamp = ns_to_ktime(meta->rx_timestamp),
> > +             };
> > +     }
> > +}
>
> Considering the metadata will affect the gro, should the meta be cleared after
> importing to the skb?

Yeah, good suggestion, will clear it here.

> [ ... ]
>
> > +/* Since we're not actually doing a call but instead rewriting
> > + * in place, we can only afford to use R0-R5 scratch registers.
> > + *
> > + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual
> > + * metadata kfuncs use only R0,R4-R5.
> > + *
> > + * The above also means we _cannot_ easily call any other helper/kfunc
> > + * because there is no place for us to preserve our R1 argument;
> > + * existing R6-R9 belong to the callee.
> > + */
> > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch)
> > +{
> > +     u32 func_id;
> > +
> > +     /*
> > +      * The code below generates the following:
> > +      *
> > +      * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx)
> > +      * {
> > +      *      struct xdp_to_skb_metadata *meta;
> > +      *      int ret;
> > +      *
> > +      *      ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta));
> > +      *      if (!ret)
> > +      *              return;
> > +      *
> > +      *      meta = ctx->data_meta;
> > +      *      meta->magic = xdp_metadata_magic;
> > +      *      meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx);
> > +      * }
> > +      *
> > +      */
> > +
> > +     bpf_patch_append(patch,
> > +             /* r2 = ((struct xdp_buff *)r1)->data_meta; */
> > +             BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
> > +                         offsetof(struct xdp_buff, data_meta)),
> > +             /* r3 = ((struct xdp_buff *)r1)->data; */
> > +             BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
> > +                         offsetof(struct xdp_buff, data)),
> > +             /* if (data_meta != data) return;
> > +              *
> > +              *      data_meta > data: xdp_data_meta_unsupported()
> > +              *      data_meta < data: already used, no need to touch
> > +              */
> > +             BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX),
> > +
> > +             /* r2 -= sizeof(struct xdp_to_skb_metadata); */
> > +             BPF_ALU64_IMM(BPF_SUB, BPF_REG_2,
> > +                           sizeof(struct xdp_to_skb_metadata)),
> > +             /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */
> > +             BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
> > +                         offsetof(struct xdp_buff, data_hard_start)),
> > +             /* r3 += sizeof(struct xdp_frame) */
> > +             BPF_ALU64_IMM(BPF_ADD, BPF_REG_3,
> > +                           sizeof(struct xdp_frame)),
> > +             /* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */
> > +             BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX),
> > +
> > +             /* ((struct xdp_buff *)r1)->data_meta = r2; */
> > +             BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2,
> > +                         offsetof(struct xdp_buff, data_meta)),
> > +
> > +             /* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */
> > +             BPF_ST_MEM(BPF_W, BPF_REG_2,
> > +                        offsetof(struct xdp_to_skb_metadata, magic),
> > +                        xdp_metadata_magic),
> > +     );
> > +
> > +     /*      r0 = bpf_xdp_metadata_rx_timestamp(ctx); */
> > +     func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP);
> > +     prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch);
> > +
> > +     bpf_patch_append(patch,
> > +             /* r2 = ((struct xdp_buff *)r1)->data_meta; */
> > +             BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
> > +                         offsetof(struct xdp_buff, data_meta)),
> > +             /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */
> > +             BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0,
> > +                         offsetof(struct xdp_to_skb_metadata, rx_timestamp)),
>
> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
> sure it is solid enough by asking the xdp prog not to use the same random number
> in its own metadata + not to change the metadata through xdp->data_meta after
> calling bpf_xdp_metadata_export_to_skb().

What do you think the usecase here might be? Or are you suggesting we
reject further access to data_meta after
bpf_xdp_metadata_export_to_skb somehow?

If we want to let the programs override some of this
bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
more kfuncs instead of exposing the layout?

bpf_xdp_metadata_export_to_skb(ctx);
bpf_xdp_metadata_export_skb_hash(ctx, 1234);
...

> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?

XDP_PASS cases where we convert xdp_buff into skb in the drivers right
now usually have C code to manually pull out the metadata (out of hw
desc) and put it into skb.

So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
XDP_PASS, we're doing a double amount of work:
skb_metadata_import_from_xdp first, then custom driver code second.

In theory, maybe we should completely skip drivers custom parsing when
there is a prog with BPF_F_XDP_HAS_METADATA?
Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
and won't require any mental work (plus, the drivers won't have to
care either in the future).

WDYT?

> > +     );
> > +
> > +     bpf_patch_resolve_jmp(patch);
> > +}
> > +
> >   static int __init xdp_metadata_init(void)
> >   {
> > +     xdp_metadata_magic = get_random_u32() | 1;
> >       return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
> >   }
> >   late_initcall(xdp_metadata_init);
Martin KaFai Lau Nov. 9, 2022, 3:07 a.m. UTC | #3
On 11/8/22 1:54 PM, Stanislav Fomichev wrote:
> On Mon, Nov 7, 2022 at 2:02 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/3/22 8:25 PM, Stanislav Fomichev wrote:
>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>> index 59c9fd55699d..dba857f212d7 100644
>>> --- a/include/linux/skbuff.h
>>> +++ b/include/linux/skbuff.h
>>> @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
>>>               true : __skb_metadata_differs(skb_a, skb_b, len_a);
>>>    }
>>>
>>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len);
>>> +
>>>    static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
>>>    {
>>>        skb_shinfo(skb)->meta_len = meta_len;
>>> +     if (meta_len)
>>> +             skb_metadata_import_from_xdp(skb, meta_len);
>>>    }
>>>
>> [ ... ]
>>
>>> +struct xdp_to_skb_metadata {
>>> +     u32 magic; /* xdp_metadata_magic */
>>> +     u64 rx_timestamp;
>>> +} __randomize_layout;
>>> +
>>> +struct bpf_patch;
>>> +
>>
>> [ ... ]
>>
>>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len)
>>> +{
>>> +     struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len);
>>> +
>>> +     /* Optional SKB info, currently missing:
>>> +      * - HW checksum info           (skb->ip_summed)
>>> +      * - HW RX hash                 (skb_set_hash)
>>> +      * - RX ring dev queue index    (skb_record_rx_queue)
>>> +      */
>>> +
>>> +     if (len != sizeof(struct xdp_to_skb_metadata))
>>> +             return;
>>> +
>>> +     if (meta->magic != xdp_metadata_magic)
>>> +             return;
>>> +
>>> +     if (meta->rx_timestamp) {
>>> +             *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){
>>> +                     .hwtstamp = ns_to_ktime(meta->rx_timestamp),
>>> +             };
>>> +     }
>>> +}
>>
>> Considering the metadata will affect the gro, should the meta be cleared after
>> importing to the skb?
> 
> Yeah, good suggestion, will clear it here.
> 
>> [ ... ]
>>
>>> +/* Since we're not actually doing a call but instead rewriting
>>> + * in place, we can only afford to use R0-R5 scratch registers.
>>> + *
>>> + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual
>>> + * metadata kfuncs use only R0,R4-R5.
>>> + *
>>> + * The above also means we _cannot_ easily call any other helper/kfunc
>>> + * because there is no place for us to preserve our R1 argument;
>>> + * existing R6-R9 belong to the callee.
>>> + */
>>> +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch)
>>> +{
>>> +     u32 func_id;
>>> +
>>> +     /*
>>> +      * The code below generates the following:
>>> +      *
>>> +      * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx)
>>> +      * {
>>> +      *      struct xdp_to_skb_metadata *meta;
>>> +      *      int ret;
>>> +      *
>>> +      *      ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta));
>>> +      *      if (!ret)
>>> +      *              return;
>>> +      *
>>> +      *      meta = ctx->data_meta;
>>> +      *      meta->magic = xdp_metadata_magic;
>>> +      *      meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx);
>>> +      * }
>>> +      *
>>> +      */
>>> +
>>> +     bpf_patch_append(patch,
>>> +             /* r2 = ((struct xdp_buff *)r1)->data_meta; */
>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
>>> +                         offsetof(struct xdp_buff, data_meta)),
>>> +             /* r3 = ((struct xdp_buff *)r1)->data; */
>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
>>> +                         offsetof(struct xdp_buff, data)),
>>> +             /* if (data_meta != data) return;
>>> +              *
>>> +              *      data_meta > data: xdp_data_meta_unsupported()
>>> +              *      data_meta < data: already used, no need to touch
>>> +              */
>>> +             BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX),
>>> +
>>> +             /* r2 -= sizeof(struct xdp_to_skb_metadata); */
>>> +             BPF_ALU64_IMM(BPF_SUB, BPF_REG_2,
>>> +                           sizeof(struct xdp_to_skb_metadata)),
>>> +             /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */
>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
>>> +                         offsetof(struct xdp_buff, data_hard_start)),
>>> +             /* r3 += sizeof(struct xdp_frame) */
>>> +             BPF_ALU64_IMM(BPF_ADD, BPF_REG_3,
>>> +                           sizeof(struct xdp_frame)),
>>> +             /* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */
>>> +             BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX),
>>> +
>>> +             /* ((struct xdp_buff *)r1)->data_meta = r2; */
>>> +             BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2,
>>> +                         offsetof(struct xdp_buff, data_meta)),
>>> +
>>> +             /* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */
>>> +             BPF_ST_MEM(BPF_W, BPF_REG_2,
>>> +                        offsetof(struct xdp_to_skb_metadata, magic),
>>> +                        xdp_metadata_magic),
>>> +     );
>>> +
>>> +     /*      r0 = bpf_xdp_metadata_rx_timestamp(ctx); */
>>> +     func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP);
>>> +     prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch);
>>> +
>>> +     bpf_patch_append(patch,
>>> +             /* r2 = ((struct xdp_buff *)r1)->data_meta; */
>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
>>> +                         offsetof(struct xdp_buff, data_meta)),
>>> +             /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */
>>> +             BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0,
>>> +                         offsetof(struct xdp_to_skb_metadata, rx_timestamp)),
>>
>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>> sure it is solid enough by asking the xdp prog not to use the same random number
>> in its own metadata + not to change the metadata through xdp->data_meta after
>> calling bpf_xdp_metadata_export_to_skb().
> 
> What do you think the usecase here might be? Or are you suggesting we
> reject further access to data_meta after
> bpf_xdp_metadata_export_to_skb somehow?
> 
> If we want to let the programs override some of this
> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
> more kfuncs instead of exposing the layout?
> 
> bpf_xdp_metadata_export_to_skb(ctx);
> bpf_xdp_metadata_export_skb_hash(ctx, 1234);


I can't think of a use case now for the xdp prog to use the xdp_to_skb_metadata 
while the xdp prog can directly call the kfunc (eg 
bpf_xdp_metadata_rx_timestamp) to get individual hint.  I was asking if patch 7 
is an actual use case because it does test the tstamp in XDP_PASS or it is 
mostly for selftest purpose?  Yeah, may be the xdp prog will be able to change 
the xdp_to_skb_metadata eventually but that is for later.

My concern is the xdp prog is allowed to change xdp_to_skb_metadata or 
by-coincident writing metadata that matches the random and the sizeof(struct 
xdp_to_skb_metadata).

Also, the added opacity of xdp_to_skb_metadata (__randomize_layout + random int) 
is trying very hard to hide it from xdp prog.  Instead, would it be cleaner to 
have a flag in xdp->flags (to be set by bpf_xdp_metadata_export_to_skb?) to 
guard this, like one of Jesper's patch.  The xdp_convert_ctx_access() and 
bpf_xdp_adjust_meta() can check this bit to ensure the xdp_to_skb_metadata 
cannot be read and no metadata can be added/deleted after that.  btw, is it 
possible to keep both xdp_to_skb_metadata and the xdp_prog's metadata?  After 
skb_metadata_import_from_xdp popping the xdp_to_skb_metadata, the remaining 
xdp_prog's metatdata can still be used by the bpf-tc.

> ...
> 
>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
> 
> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
> now usually have C code to manually pull out the metadata (out of hw
> desc) and put it into skb.
> 
> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
> XDP_PASS, we're doing a double amount of work:
> skb_metadata_import_from_xdp first, then custom driver code second.
> 
> In theory, maybe we should completely skip drivers custom parsing when
> there is a prog with BPF_F_XDP_HAS_METADATA?
> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
> and won't require any mental work (plus, the drivers won't have to
> care either in the future).
>  > WDYT?


Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes 
sense to only use the hints (if ever written) from xdp prog especially if it 
will eventually support xdp prog changing some of the hints in the future.  For 
now, I think either way is fine since they are the same and the xdp prog is sort 
of doing extra unnecessary work anyway by calling 
bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be 
changed now.


> 
>>> +     );
>>> +
>>> +     bpf_patch_resolve_jmp(patch);
>>> +}
>>> +
>>>    static int __init xdp_metadata_init(void)
>>>    {
>>> +     xdp_metadata_magic = get_random_u32() | 1;
>>>        return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
>>>    }
>>>    late_initcall(xdp_metadata_init);
Martin KaFai Lau Nov. 9, 2022, 4:19 a.m. UTC | #4
On 11/8/22 7:07 PM, Martin KaFai Lau wrote:
> On 11/8/22 1:54 PM, Stanislav Fomichev wrote:
>> On Mon, Nov 7, 2022 at 2:02 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>
>>> On 11/3/22 8:25 PM, Stanislav Fomichev wrote:
>>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>>> index 59c9fd55699d..dba857f212d7 100644
>>>> --- a/include/linux/skbuff.h
>>>> +++ b/include/linux/skbuff.h
>>>> @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct 
>>>> sk_buff *skb_a,
>>>>               true : __skb_metadata_differs(skb_a, skb_b, len_a);
>>>>    }
>>>>
>>>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len);
>>>> +
>>>>    static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
>>>>    {
>>>>        skb_shinfo(skb)->meta_len = meta_len;
>>>> +     if (meta_len)
>>>> +             skb_metadata_import_from_xdp(skb, meta_len);
>>>>    }
>>>>
>>> [ ... ]
>>>
>>>> +struct xdp_to_skb_metadata {
>>>> +     u32 magic; /* xdp_metadata_magic */
>>>> +     u64 rx_timestamp;
>>>> +} __randomize_layout;
>>>> +
>>>> +struct bpf_patch;
>>>> +
>>>
>>> [ ... ]
>>>
>>>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len)
>>>> +{
>>>> +     struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len);
>>>> +
>>>> +     /* Optional SKB info, currently missing:
>>>> +      * - HW checksum info           (skb->ip_summed)
>>>> +      * - HW RX hash                 (skb_set_hash)
>>>> +      * - RX ring dev queue index    (skb_record_rx_queue)
>>>> +      */
>>>> +
>>>> +     if (len != sizeof(struct xdp_to_skb_metadata))
>>>> +             return;
>>>> +
>>>> +     if (meta->magic != xdp_metadata_magic)
>>>> +             return;
>>>> +
>>>> +     if (meta->rx_timestamp) {
>>>> +             *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){
>>>> +                     .hwtstamp = ns_to_ktime(meta->rx_timestamp),
>>>> +             };
>>>> +     }
>>>> +}
>>>
>>> Considering the metadata will affect the gro, should the meta be cleared after
>>> importing to the skb?
>>
>> Yeah, good suggestion, will clear it here.
>>
>>> [ ... ]
>>>
>>>> +/* Since we're not actually doing a call but instead rewriting
>>>> + * in place, we can only afford to use R0-R5 scratch registers.
>>>> + *
>>>> + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual
>>>> + * metadata kfuncs use only R0,R4-R5.
>>>> + *
>>>> + * The above also means we _cannot_ easily call any other helper/kfunc
>>>> + * because there is no place for us to preserve our R1 argument;
>>>> + * existing R6-R9 belong to the callee.
>>>> + */
>>>> +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct 
>>>> bpf_patch *patch)
>>>> +{
>>>> +     u32 func_id;
>>>> +
>>>> +     /*
>>>> +      * The code below generates the following:
>>>> +      *
>>>> +      * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx)
>>>> +      * {
>>>> +      *      struct xdp_to_skb_metadata *meta;
>>>> +      *      int ret;
>>>> +      *
>>>> +      *      ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta));
>>>> +      *      if (!ret)
>>>> +      *              return;
>>>> +      *
>>>> +      *      meta = ctx->data_meta;
>>>> +      *      meta->magic = xdp_metadata_magic;
>>>> +      *      meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx);
>>>> +      * }
>>>> +      *
>>>> +      */
>>>> +
>>>> +     bpf_patch_append(patch,
>>>> +             /* r2 = ((struct xdp_buff *)r1)->data_meta; */
>>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
>>>> +                         offsetof(struct xdp_buff, data_meta)),
>>>> +             /* r3 = ((struct xdp_buff *)r1)->data; */
>>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
>>>> +                         offsetof(struct xdp_buff, data)),
>>>> +             /* if (data_meta != data) return;
>>>> +              *
>>>> +              *      data_meta > data: xdp_data_meta_unsupported()
>>>> +              *      data_meta < data: already used, no need to touch
>>>> +              */
>>>> +             BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX),
>>>> +
>>>> +             /* r2 -= sizeof(struct xdp_to_skb_metadata); */
>>>> +             BPF_ALU64_IMM(BPF_SUB, BPF_REG_2,
>>>> +                           sizeof(struct xdp_to_skb_metadata)),
>>>> +             /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */
>>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
>>>> +                         offsetof(struct xdp_buff, data_hard_start)),
>>>> +             /* r3 += sizeof(struct xdp_frame) */
>>>> +             BPF_ALU64_IMM(BPF_ADD, BPF_REG_3,
>>>> +                           sizeof(struct xdp_frame)),
>>>> +             /* if (data-sizeof(struct xdp_to_skb_metadata) < 
>>>> data_hard_start+sizeof(struct xdp_frame)) return; */
>>>> +             BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX),
>>>> +
>>>> +             /* ((struct xdp_buff *)r1)->data_meta = r2; */
>>>> +             BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2,
>>>> +                         offsetof(struct xdp_buff, data_meta)),
>>>> +
>>>> +             /* *((struct xdp_to_skb_metadata *)r2)->magic = 
>>>> xdp_metadata_magic; */
>>>> +             BPF_ST_MEM(BPF_W, BPF_REG_2,
>>>> +                        offsetof(struct xdp_to_skb_metadata, magic),
>>>> +                        xdp_metadata_magic),
>>>> +     );
>>>> +
>>>> +     /*      r0 = bpf_xdp_metadata_rx_timestamp(ctx); */
>>>> +     func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP);
>>>> +     prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch);
>>>> +
>>>> +     bpf_patch_append(patch,
>>>> +             /* r2 = ((struct xdp_buff *)r1)->data_meta; */
>>>> +             BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
>>>> +                         offsetof(struct xdp_buff, data_meta)),
>>>> +             /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */
>>>> +             BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0,
>>>> +                         offsetof(struct xdp_to_skb_metadata, rx_timestamp)),
>>>
>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>>> sure it is solid enough by asking the xdp prog not to use the same random number
>>> in its own metadata + not to change the metadata through xdp->data_meta after
>>> calling bpf_xdp_metadata_export_to_skb().
>>
>> What do you think the usecase here might be? Or are you suggesting we
>> reject further access to data_meta after
>> bpf_xdp_metadata_export_to_skb somehow?
>>
>> If we want to let the programs override some of this
>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>> more kfuncs instead of exposing the layout?
>>
>> bpf_xdp_metadata_export_to_skb(ctx);
>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);

After re-reading patch 6, have another question.  The 'void 
bpf_xdp_metadata_export_to_skb();' function signature.  Should it at least 
return ok/err? or even return a 'struct xdp_to_skb_metadata *' pointer and the 
xdp prog can directly read (or even write) it?  A related question, why 'struct 
xdp_to_skb_metadata' needs __randomize_layout?


> 
> 
> I can't think of a use case now for the xdp prog to use the xdp_to_skb_metadata 
> while the xdp prog can directly call the kfunc (eg 
> bpf_xdp_metadata_rx_timestamp) to get individual hint.  I was asking if patch 7 
> is an actual use case because it does test the tstamp in XDP_PASS or it is 
> mostly for selftest purpose?  Yeah, may be the xdp prog will be able to change 
> the xdp_to_skb_metadata eventually but that is for later.
> 
> My concern is the xdp prog is allowed to change xdp_to_skb_metadata or 
> by-coincident writing metadata that matches the random and the sizeof(struct 
> xdp_to_skb_metadata).
> 
> Also, the added opacity of xdp_to_skb_metadata (__randomize_layout + random int) 
> is trying very hard to hide it from xdp prog.  Instead, would it be cleaner to 
> have a flag in xdp->flags (to be set by bpf_xdp_metadata_export_to_skb?) to 
> guard this, like one of Jesper's patch.  The xdp_convert_ctx_access() and 
> bpf_xdp_adjust_meta() can check this bit to ensure the xdp_to_skb_metadata 
> cannot be read and no metadata can be added/deleted after that.  btw, is it 
> possible to keep both xdp_to_skb_metadata and the xdp_prog's metadata?  After 
> skb_metadata_import_from_xdp popping the xdp_to_skb_metadata, the remaining 
> xdp_prog's metatdata can still be used by the bpf-tc.
> 
>> ...
>>
>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
>>
>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
>> now usually have C code to manually pull out the metadata (out of hw
>> desc) and put it into skb.
>>
>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
>> XDP_PASS, we're doing a double amount of work:
>> skb_metadata_import_from_xdp first, then custom driver code second.
>>
>> In theory, maybe we should completely skip drivers custom parsing when
>> there is a prog with BPF_F_XDP_HAS_METADATA?
>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
>> and won't require any mental work (plus, the drivers won't have to
>> care either in the future).
>>  > WDYT?
> 
> 
> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes 
> sense to only use the hints (if ever written) from xdp prog especially if it 
> will eventually support xdp prog changing some of the hints in the future.  For 
> now, I think either way is fine since they are the same and the xdp prog is sort 
> of doing extra unnecessary work anyway by calling 
> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be 
> changed now.
> 
> 
>>
>>>> +     );
>>>> +
>>>> +     bpf_patch_resolve_jmp(patch);
>>>> +}
>>>> +
>>>>    static int __init xdp_metadata_init(void)
>>>>    {
>>>> +     xdp_metadata_magic = get_random_u32() | 1;
>>>>        return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, 
>>>> &xdp_metadata_kfunc_set);
>>>>    }
>>>>    late_initcall(xdp_metadata_init);
>
Toke Høiland-Jørgensen Nov. 9, 2022, 11:10 a.m. UTC | #5
Snipping a bit of context to reply to this bit:

>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>>>> sure it is solid enough by asking the xdp prog not to use the same random number
>>>> in its own metadata + not to change the metadata through xdp->data_meta after
>>>> calling bpf_xdp_metadata_export_to_skb().
>>>
>>> What do you think the usecase here might be? Or are you suggesting we
>>> reject further access to data_meta after
>>> bpf_xdp_metadata_export_to_skb somehow?
>>>
>>> If we want to let the programs override some of this
>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>>> more kfuncs instead of exposing the layout?
>>>
>>> bpf_xdp_metadata_export_to_skb(ctx);
>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);

There are several use cases for needing to access the metadata after
calling bpf_xdp_metdata_export_to_skb():

- Accessing the metadata after redirect (in a cpumap or devmap program,
  or on a veth device)
- Transferring the packet+metadata to AF_XDP
- Returning XDP_PASS, but accessing some of the metadata first (whether
  to read or change it)

The last one could be solved by calling additional kfuncs, but that
would be less efficient than just directly editing the struct which
will be cache-hot after the helper returns.

And yeah, this will allow the XDP program to inject arbitrary metadata
into the netstack; but it can already inject arbitrary *packet* data
into the stack, so not sure if this is much of an additional risk? If it
does lead to trivial crashes, we should probably harden the stack
against that?

As for the random number, Jesper and I discussed replacing this with the
same BTF-ID scheme that he was using in his patch series. I.e., instead
of just putting in a random number, we insert the BTF ID of the metadata
struct at the end of it. This will allow us to support multiple
different formats in the future (not just changing the layout, but
having multiple simultaneous formats in the same kernel image), in case
we run out of space.

We should probably also have a flag set on the xdp_frame so the stack
knows that the metadata area contains relevant-to-skb data, to guard
against an XDP program accidentally hitting the "magic number" (BTF_ID)
in unrelated stuff it puts into the metadata area.

> After re-reading patch 6, have another question. The 'void
> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
> pointer and the xdp prog can directly read (or even write) it?

Hmm, I'm not sure returning a failure makes sense? Failure to read one
or more fields just means that those fields will not be populated? We
should probably have a flags field inside the metadata struct itself to
indicate which fields are set or not, but I'm not sure returning an
error value adds anything? Returning a pointer to the metadata field
might be convenient for users (it would just be an alias to the
data_meta pointer, but the verifier could know its size, so the program
doesn't have to bounds check it).

> A related question, why 'struct xdp_to_skb_metadata' needs
> __randomize_layout?

The __randomize_layout thing is there to force BPF programs to use CO-RE
to access the field. This is to avoid the struct layout accidentally
ossifying because people in practice rely on a particular layout, even
though we tell them to use CO-RE. There are lots of examples of this
happening in other domains (IP header options, TCP options, etc), and
__randomize_layout seemed like a neat trick to enforce CO-RE usage :)

>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
>>>
>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
>>> now usually have C code to manually pull out the metadata (out of hw
>>> desc) and put it into skb.
>>>
>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
>>> XDP_PASS, we're doing a double amount of work:
>>> skb_metadata_import_from_xdp first, then custom driver code second.
>>>
>>> In theory, maybe we should completely skip drivers custom parsing when
>>> there is a prog with BPF_F_XDP_HAS_METADATA?
>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
>>> and won't require any mental work (plus, the drivers won't have to
>>> care either in the future).
>>>  > WDYT?
>> 
>> 
>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes 
>> sense to only use the hints (if ever written) from xdp prog especially if it 
>> will eventually support xdp prog changing some of the hints in the future.  For 
>> now, I think either way is fine since they are the same and the xdp prog is sort 
>> of doing extra unnecessary work anyway by calling 
>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be 
>> changed now.

I agree it would be best if the drivers also use the XDP metadata (if
present) on XDP_PASS. Longer term my hope is we can make the XDP
metadata support the only thing drivers need to implement (i.e., have
the stack call into that code even when no XDP program is loaded), but
for now just for consistency (and allowing the XDP program to update the
metadata), we should probably at least consume it on XDP_PASS.

-Toke
Martin KaFai Lau Nov. 9, 2022, 6:22 p.m. UTC | #6
On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
> Snipping a bit of context to reply to this bit:
> 
>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
>>>>> calling bpf_xdp_metadata_export_to_skb().
>>>>
>>>> What do you think the usecase here might be? Or are you suggesting we
>>>> reject further access to data_meta after
>>>> bpf_xdp_metadata_export_to_skb somehow?
>>>>
>>>> If we want to let the programs override some of this
>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>>>> more kfuncs instead of exposing the layout?
>>>>
>>>> bpf_xdp_metadata_export_to_skb(ctx);
>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
> 
> There are several use cases for needing to access the metadata after
> calling bpf_xdp_metdata_export_to_skb():
> 
> - Accessing the metadata after redirect (in a cpumap or devmap program,
>    or on a veth device)
> - Transferring the packet+metadata to AF_XDP
fwiw, the xdp prog could also be more selective and only stores one of the hints 
instead of the whole 'struct xdp_to_skb_metadata'.

> - Returning XDP_PASS, but accessing some of the metadata first (whether
>    to read or change it)
> 
> The last one could be solved by calling additional kfuncs, but that
> would be less efficient than just directly editing the struct which
> will be cache-hot after the helper returns.

Yeah, it is more efficient to directly write if possible.  I think this set 
allows the direct reading and writing already through data_meta (as a _u8 *).

> 
> And yeah, this will allow the XDP program to inject arbitrary metadata
> into the netstack; but it can already inject arbitrary *packet* data
> into the stack, so not sure if this is much of an additional risk? If it
> does lead to trivial crashes, we should probably harden the stack
> against that?
> 
> As for the random number, Jesper and I discussed replacing this with the
> same BTF-ID scheme that he was using in his patch series. I.e., instead
> of just putting in a random number, we insert the BTF ID of the metadata
> struct at the end of it. This will allow us to support multiple
> different formats in the future (not just changing the layout, but
> having multiple simultaneous formats in the same kernel image), in case
> we run out of space.

This seems a bit hypothetical.  How much headroom does it usually have for the 
xdp prog?  Potentially the hints can use all the remaining space left after the 
header encap and the current bpf_xdp_adjust_meta() usage?

> 
> We should probably also have a flag set on the xdp_frame so the stack
> knows that the metadata area contains relevant-to-skb data, to guard
> against an XDP program accidentally hitting the "magic number" (BTF_ID)
> in unrelated stuff it puts into the metadata area.

Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and 
then transfer to the xdp_frame?

> 
>> After re-reading patch 6, have another question. The 'void
>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
>> pointer and the xdp prog can directly read (or even write) it?
> 
> Hmm, I'm not sure returning a failure makes sense? Failure to read one
> or more fields just means that those fields will not be populated? We
> should probably have a flags field inside the metadata struct itself to
> indicate which fields are set or not, but I'm not sure returning an
> error value adds anything? Returning a pointer to the metadata field
> might be convenient for users (it would just be an alias to the
> data_meta pointer, but the verifier could know its size, so the program
> doesn't have to bounds check it).

If some hints are not available, those hints should be initialized to 
0/CHECKSUM_NONE/...etc.  The xdp prog needs a direct way to tell hard failure 
when it cannot write the meta area because of not enough space.  Comparing 
xdp->data_meta with xdp->data as a side effect is not intuitive.

It is more than saving the bound check.  With type info of 'struct 
xdp_to_skb_metadata *', the verifier can do more checks like reading in the 
middle of an integer member.  The verifier could also limit write access only to 
a few struct's members if it is needed.

The returning 'struct xdp_to_skb_metadata *' should not be an alias to the 
xdp->data_meta.  They should actually point to different locations in the 
headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. 
xdp->data_meta won't be changed and keeps pointing to the last 
bpf_xdp_adjust_meta() location.  The kernel will know if there is 
xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the 
xdp_{buff,frame}.  Would it work?

> 
>> A related question, why 'struct xdp_to_skb_metadata' needs
>> __randomize_layout?
> 
> The __randomize_layout thing is there to force BPF programs to use CO-RE
> to access the field. This is to avoid the struct layout accidentally
> ossifying because people in practice rely on a particular layout, even
> though we tell them to use CO-RE. There are lots of examples of this
> happening in other domains (IP header options, TCP options, etc), and
> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)

I am not sure if it is necessary or helpful to only enforce __randomize_layout 
in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and 
non tracing) that already have direct access (reading and/or writing) to other 
kernel structures.

It is more important for the verifier to see the xdp prog accessing it as a 
'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so 
that the verifier can enforce the rules of access.

> 
>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
>>>>
>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
>>>> now usually have C code to manually pull out the metadata (out of hw
>>>> desc) and put it into skb.
>>>>
>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
>>>> XDP_PASS, we're doing a double amount of work:
>>>> skb_metadata_import_from_xdp first, then custom driver code second.
>>>>
>>>> In theory, maybe we should completely skip drivers custom parsing when
>>>> there is a prog with BPF_F_XDP_HAS_METADATA?
>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
>>>> and won't require any mental work (plus, the drivers won't have to
>>>> care either in the future).
>>>>   > WDYT?
>>>
>>>
>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes
>>> sense to only use the hints (if ever written) from xdp prog especially if it
>>> will eventually support xdp prog changing some of the hints in the future.  For
>>> now, I think either way is fine since they are the same and the xdp prog is sort
>>> of doing extra unnecessary work anyway by calling
>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be
>>> changed now.
> 
> I agree it would be best if the drivers also use the XDP metadata (if
> present) on XDP_PASS. Longer term my hope is we can make the XDP
> metadata support the only thing drivers need to implement (i.e., have
> the stack call into that code even when no XDP program is loaded), but
> for now just for consistency (and allowing the XDP program to update the
> metadata), we should probably at least consume it on XDP_PASS.
> 
> -Toke
>
Stanislav Fomichev Nov. 9, 2022, 9:33 p.m. UTC | #7
On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
> > Snipping a bit of context to reply to this bit:
> >
> >>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
> >>>>> sure it is solid enough by asking the xdp prog not to use the same random number
> >>>>> in its own metadata + not to change the metadata through xdp->data_meta after
> >>>>> calling bpf_xdp_metadata_export_to_skb().
> >>>>
> >>>> What do you think the usecase here might be? Or are you suggesting we
> >>>> reject further access to data_meta after
> >>>> bpf_xdp_metadata_export_to_skb somehow?
> >>>>
> >>>> If we want to let the programs override some of this
> >>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
> >>>> more kfuncs instead of exposing the layout?
> >>>>
> >>>> bpf_xdp_metadata_export_to_skb(ctx);
> >>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
> >
> > There are several use cases for needing to access the metadata after
> > calling bpf_xdp_metdata_export_to_skb():
> >
> > - Accessing the metadata after redirect (in a cpumap or devmap program,
> >    or on a veth device)
> > - Transferring the packet+metadata to AF_XDP
> fwiw, the xdp prog could also be more selective and only stores one of the hints
> instead of the whole 'struct xdp_to_skb_metadata'.
>
> > - Returning XDP_PASS, but accessing some of the metadata first (whether
> >    to read or change it)
> >
> > The last one could be solved by calling additional kfuncs, but that
> > would be less efficient than just directly editing the struct which
> > will be cache-hot after the helper returns.
>
> Yeah, it is more efficient to directly write if possible.  I think this set
> allows the direct reading and writing already through data_meta (as a _u8 *).
>
> >
> > And yeah, this will allow the XDP program to inject arbitrary metadata
> > into the netstack; but it can already inject arbitrary *packet* data
> > into the stack, so not sure if this is much of an additional risk? If it
> > does lead to trivial crashes, we should probably harden the stack
> > against that?
> >
> > As for the random number, Jesper and I discussed replacing this with the
> > same BTF-ID scheme that he was using in his patch series. I.e., instead
> > of just putting in a random number, we insert the BTF ID of the metadata
> > struct at the end of it. This will allow us to support multiple
> > different formats in the future (not just changing the layout, but
> > having multiple simultaneous formats in the same kernel image), in case
> > we run out of space.
>
> This seems a bit hypothetical.  How much headroom does it usually have for the
> xdp prog?  Potentially the hints can use all the remaining space left after the
> header encap and the current bpf_xdp_adjust_meta() usage?
>
> >
> > We should probably also have a flag set on the xdp_frame so the stack
> > knows that the metadata area contains relevant-to-skb data, to guard
> > against an XDP program accidentally hitting the "magic number" (BTF_ID)
> > in unrelated stuff it puts into the metadata area.
>
> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
> then transfer to the xdp_frame?
>
> >
> >> After re-reading patch 6, have another question. The 'void
> >> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
> >> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
> >> pointer and the xdp prog can directly read (or even write) it?
> >
> > Hmm, I'm not sure returning a failure makes sense? Failure to read one
> > or more fields just means that those fields will not be populated? We
> > should probably have a flags field inside the metadata struct itself to
> > indicate which fields are set or not, but I'm not sure returning an
> > error value adds anything? Returning a pointer to the metadata field
> > might be convenient for users (it would just be an alias to the
> > data_meta pointer, but the verifier could know its size, so the program
> > doesn't have to bounds check it).
>
> If some hints are not available, those hints should be initialized to
> 0/CHECKSUM_NONE/...etc.  The xdp prog needs a direct way to tell hard failure
> when it cannot write the meta area because of not enough space.  Comparing
> xdp->data_meta with xdp->data as a side effect is not intuitive.
>
> It is more than saving the bound check.  With type info of 'struct
> xdp_to_skb_metadata *', the verifier can do more checks like reading in the
> middle of an integer member.  The verifier could also limit write access only to
> a few struct's members if it is needed.
>
> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the
> xdp->data_meta.  They should actually point to different locations in the
> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff.
> xdp->data_meta won't be changed and keeps pointing to the last
> bpf_xdp_adjust_meta() location.  The kernel will know if there is
> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the
> xdp_{buff,frame}.  Would it work?
>
> >
> >> A related question, why 'struct xdp_to_skb_metadata' needs
> >> __randomize_layout?
> >
> > The __randomize_layout thing is there to force BPF programs to use CO-RE
> > to access the field. This is to avoid the struct layout accidentally
> > ossifying because people in practice rely on a particular layout, even
> > though we tell them to use CO-RE. There are lots of examples of this
> > happening in other domains (IP header options, TCP options, etc), and
> > __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
>
> I am not sure if it is necessary or helpful to only enforce __randomize_layout
> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and
> non tracing) that already have direct access (reading and/or writing) to other
> kernel structures.
>
> It is more important for the verifier to see the xdp prog accessing it as a
> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so
> that the verifier can enforce the rules of access.
>
> >
> >>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
> >>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
> >>>>
> >>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
> >>>> now usually have C code to manually pull out the metadata (out of hw
> >>>> desc) and put it into skb.
> >>>>
> >>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
> >>>> XDP_PASS, we're doing a double amount of work:
> >>>> skb_metadata_import_from_xdp first, then custom driver code second.
> >>>>
> >>>> In theory, maybe we should completely skip drivers custom parsing when
> >>>> there is a prog with BPF_F_XDP_HAS_METADATA?
> >>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
> >>>> and won't require any mental work (plus, the drivers won't have to
> >>>> care either in the future).
> >>>>   > WDYT?
> >>>
> >>>
> >>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes
> >>> sense to only use the hints (if ever written) from xdp prog especially if it
> >>> will eventually support xdp prog changing some of the hints in the future.  For
> >>> now, I think either way is fine since they are the same and the xdp prog is sort
> >>> of doing extra unnecessary work anyway by calling
> >>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be
> >>> changed now.
> >
> > I agree it would be best if the drivers also use the XDP metadata (if
> > present) on XDP_PASS. Longer term my hope is we can make the XDP
> > metadata support the only thing drivers need to implement (i.e., have
> > the stack call into that code even when no XDP program is loaded), but
> > for now just for consistency (and allowing the XDP program to update the
> > metadata), we should probably at least consume it on XDP_PASS.
> >
> > -Toke
> >

Not to derail the discussion (left the last message intact on top,
feel free to continue), but to summarize. The proposed changes seem to
be:

1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct
xdp_to_skb_metadata"
  - This should let bpf programs change the metadata passed to the skb

2. "struct xdp_to_skb_metadata" should have its btf_id as the first
__u32 member (and remove the magic)
  - This is for the redirect case where the end users, including
AF_XDP, can parse this metadata from btf_id
  - This, however, is not all the metadata that the device can
support, but a much narrower set that the kernel is expected to use
for skb construction

3. __randomize_layout isn't really helping, CO-RE will trigger
regardless; maybe only the case where it matters is probably AF_XDP,
so still useful?

4. The presence of the metadata generated by
bpf_xdp_metadata_export_to_skb should be indicated by a flag in
xdp_{buff,frame}->flags
  - Assuming exposing it via xdp_md->has_skb_metadata is ok?
  - Since the programs probably need to do the following:

  if (xdp_md->has_skb_metadata) {
    access/change skb metadata by doing struct xdp_to_skb_metadata *p
= data_meta;
  } else {
    use kfuncs
  }

5. Support the case where we keep program's metadata and kernel's
xdp_to_skb_metadata
  - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
rest of the metadata over it and adjusting the headroom


I think the above solves all the cases Toke points to?

a) Accessing the metadata after redirect (in a cpumap or devmap
program, or on a veth device)
  - only a small xdp_to_skb_metadata subset will work out of the box
iff the redirecttor calls bpf_xdp_metadata_export_to_skb; for the rest
the progs will have to agree on the layout, right?

b) Transferring the packet+metadata to AF_XDP
  - here, again, the AF_XDP consumer will have to either expect
xdp_to_skb_metadata with a smaller set of skb-related metadata, or
will have to make sure the producer builds a custom layout using
kfuncs; there is also no flag to indicate whether xdp_to_skb_metadata
is there or not; the consumer will have to test btf_id at the right
offset

c) Returning XDP_PASS, but accessing some of the metadata first
(whether to read or change it)
  - can read via kfuncs, can change via
bpf_xdp_metadata_export_to_skb(); m->xyz=abc;

Anything I'm missing?
Martin KaFai Lau Nov. 10, 2022, 12:13 a.m. UTC | #8
On 11/9/22 1:33 PM, Stanislav Fomichev wrote:
> On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
>>> Snipping a bit of context to reply to this bit:
>>>
>>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
>>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
>>>>>>> calling bpf_xdp_metadata_export_to_skb().
>>>>>>
>>>>>> What do you think the usecase here might be? Or are you suggesting we
>>>>>> reject further access to data_meta after
>>>>>> bpf_xdp_metadata_export_to_skb somehow?
>>>>>>
>>>>>> If we want to let the programs override some of this
>>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>>>>>> more kfuncs instead of exposing the layout?
>>>>>>
>>>>>> bpf_xdp_metadata_export_to_skb(ctx);
>>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
>>>
>>> There are several use cases for needing to access the metadata after
>>> calling bpf_xdp_metdata_export_to_skb():
>>>
>>> - Accessing the metadata after redirect (in a cpumap or devmap program,
>>>     or on a veth device)
>>> - Transferring the packet+metadata to AF_XDP
>> fwiw, the xdp prog could also be more selective and only stores one of the hints
>> instead of the whole 'struct xdp_to_skb_metadata'.
>>
>>> - Returning XDP_PASS, but accessing some of the metadata first (whether
>>>     to read or change it)
>>>
>>> The last one could be solved by calling additional kfuncs, but that
>>> would be less efficient than just directly editing the struct which
>>> will be cache-hot after the helper returns.
>>
>> Yeah, it is more efficient to directly write if possible.  I think this set
>> allows the direct reading and writing already through data_meta (as a _u8 *).
>>
>>>
>>> And yeah, this will allow the XDP program to inject arbitrary metadata
>>> into the netstack; but it can already inject arbitrary *packet* data
>>> into the stack, so not sure if this is much of an additional risk? If it
>>> does lead to trivial crashes, we should probably harden the stack
>>> against that?
>>>
>>> As for the random number, Jesper and I discussed replacing this with the
>>> same BTF-ID scheme that he was using in his patch series. I.e., instead
>>> of just putting in a random number, we insert the BTF ID of the metadata
>>> struct at the end of it. This will allow us to support multiple
>>> different formats in the future (not just changing the layout, but
>>> having multiple simultaneous formats in the same kernel image), in case
>>> we run out of space.
>>
>> This seems a bit hypothetical.  How much headroom does it usually have for the
>> xdp prog?  Potentially the hints can use all the remaining space left after the
>> header encap and the current bpf_xdp_adjust_meta() usage?
>>
>>>
>>> We should probably also have a flag set on the xdp_frame so the stack
>>> knows that the metadata area contains relevant-to-skb data, to guard
>>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
>>> in unrelated stuff it puts into the metadata area.
>>
>> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
>> then transfer to the xdp_frame?
>>
>>>
>>>> After re-reading patch 6, have another question. The 'void
>>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
>>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
>>>> pointer and the xdp prog can directly read (or even write) it?
>>>
>>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
>>> or more fields just means that those fields will not be populated? We
>>> should probably have a flags field inside the metadata struct itself to
>>> indicate which fields are set or not, but I'm not sure returning an
>>> error value adds anything? Returning a pointer to the metadata field
>>> might be convenient for users (it would just be an alias to the
>>> data_meta pointer, but the verifier could know its size, so the program
>>> doesn't have to bounds check it).
>>
>> If some hints are not available, those hints should be initialized to
>> 0/CHECKSUM_NONE/...etc.  The xdp prog needs a direct way to tell hard failure
>> when it cannot write the meta area because of not enough space.  Comparing
>> xdp->data_meta with xdp->data as a side effect is not intuitive.
>>
>> It is more than saving the bound check.  With type info of 'struct
>> xdp_to_skb_metadata *', the verifier can do more checks like reading in the
>> middle of an integer member.  The verifier could also limit write access only to
>> a few struct's members if it is needed.
>>
>> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the
>> xdp->data_meta.  They should actually point to different locations in the
>> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff.
>> xdp->data_meta won't be changed and keeps pointing to the last
>> bpf_xdp_adjust_meta() location.  The kernel will know if there is
>> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the
>> xdp_{buff,frame}.  Would it work?
>>
>>>
>>>> A related question, why 'struct xdp_to_skb_metadata' needs
>>>> __randomize_layout?
>>>
>>> The __randomize_layout thing is there to force BPF programs to use CO-RE
>>> to access the field. This is to avoid the struct layout accidentally
>>> ossifying because people in practice rely on a particular layout, even
>>> though we tell them to use CO-RE. There are lots of examples of this
>>> happening in other domains (IP header options, TCP options, etc), and
>>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
>>
>> I am not sure if it is necessary or helpful to only enforce __randomize_layout
>> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and
>> non tracing) that already have direct access (reading and/or writing) to other
>> kernel structures.
>>
>> It is more important for the verifier to see the xdp prog accessing it as a
>> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so
>> that the verifier can enforce the rules of access.
>>
>>>
>>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
>>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
>>>>>>
>>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
>>>>>> now usually have C code to manually pull out the metadata (out of hw
>>>>>> desc) and put it into skb.
>>>>>>
>>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
>>>>>> XDP_PASS, we're doing a double amount of work:
>>>>>> skb_metadata_import_from_xdp first, then custom driver code second.
>>>>>>
>>>>>> In theory, maybe we should completely skip drivers custom parsing when
>>>>>> there is a prog with BPF_F_XDP_HAS_METADATA?
>>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
>>>>>> and won't require any mental work (plus, the drivers won't have to
>>>>>> care either in the future).
>>>>>>    > WDYT?
>>>>>
>>>>>
>>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes
>>>>> sense to only use the hints (if ever written) from xdp prog especially if it
>>>>> will eventually support xdp prog changing some of the hints in the future.  For
>>>>> now, I think either way is fine since they are the same and the xdp prog is sort
>>>>> of doing extra unnecessary work anyway by calling
>>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be
>>>>> changed now.
>>>
>>> I agree it would be best if the drivers also use the XDP metadata (if
>>> present) on XDP_PASS. Longer term my hope is we can make the XDP
>>> metadata support the only thing drivers need to implement (i.e., have
>>> the stack call into that code even when no XDP program is loaded), but
>>> for now just for consistency (and allowing the XDP program to update the
>>> metadata), we should probably at least consume it on XDP_PASS.
>>>
>>> -Toke
>>>
> 
> Not to derail the discussion (left the last message intact on top,
> feel free to continue), but to summarize. The proposed changes seem to
> be:
> 
> 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct
> xdp_to_skb_metadata"
>    - This should let bpf programs change the metadata passed to the skb
> 
> 2. "struct xdp_to_skb_metadata" should have its btf_id as the first
> __u32 member (and remove the magic)
>    - This is for the redirect case where the end users, including
> AF_XDP, can parse this metadata from btf_id

I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata.  I 
can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP 
rx_desc currently cannot tell if there is metadata written by the xdp prog or 
not.  However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP 
rx_desc->options, the btf_id may as well be not needed now.  However, the btf_id 
and other future new members can be added to the xdp_to_skb_metadata later if 
there is a need.

For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to 
the xdp_to_skb_metadata.  The xdp prog will use CO-RE to access the members in 
xdp_to_skb_metadata.

>    - This, however, is not all the metadata that the device can
> support, but a much narrower set that the kernel is expected to use
> for skb construction
> 
> 3. __randomize_layout isn't really helping, CO-RE will trigger
> regardless; maybe only the case where it matters is probably AF_XDP,
> so still useful?
> 
> 4. The presence of the metadata generated by
> bpf_xdp_metadata_export_to_skb should be indicated by a flag in
> xdp_{buff,frame}->flags
>    - Assuming exposing it via xdp_md->has_skb_metadata is ok?

probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type 
will be PTR_TO_BTF_ID_OR_NULL.

>    - Since the programs probably need to do the following:
> 
>    if (xdp_md->has_skb_metadata) {
>      access/change skb metadata by doing struct xdp_to_skb_metadata *p
> = data_meta;

and directly access/change xdp->skb_metadata instead of using xdp->data_meta.

>    } else {
>      use kfuncs
>    }
> 
> 5. Support the case where we keep program's metadata and kernel's
> xdp_to_skb_metadata
>    - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
> rest of the metadata over it and adjusting the headroom

I was thinking the kernel's xdp_to_skb_metadata is always before the program's 
metadata.  xdp prog should usually work in this order also: read/write headers, 
write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return 
XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the 
xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.

For the kernel and xdp prog, I don't think it matters where the 
xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to 
be before xdp->data because of the current data_meta and data comparison usage 
in the xdp prog.

The order of the kernel's xdp_to_skb_metadata and the program's metadata 
probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP 
supports the program's metadata now.  afaict, it can only work now if there is 
some sort of contract between them or the AF_XDP currently does not use the 
program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it 
should be a no op if there is no program's metadata?  This behavior could also 
be configurable through setsockopt?

Thanks for the summary!

> 
> 
> I think the above solves all the cases Toke points to?
> 
> a) Accessing the metadata after redirect (in a cpumap or devmap
> program, or on a veth device)
>    - only a small xdp_to_skb_metadata subset will work out of the box
> iff the redirecttor calls bpf_xdp_metadata_export_to_skb; for the rest
> the progs will have to agree on the layout, right?
> 
> b) Transferring the packet+metadata to AF_XDP
>    - here, again, the AF_XDP consumer will have to either expect
> xdp_to_skb_metadata with a smaller set of skb-related metadata, or
> will have to make sure the producer builds a custom layout using
> kfuncs; there is also no flag to indicate whether xdp_to_skb_metadata
> is there or not; the consumer will have to test btf_id at the right
> offset
> 
> c) Returning XDP_PASS, but accessing some of the metadata first
> (whether to read or change it)
>    - can read via kfuncs, can change via
> bpf_xdp_metadata_export_to_skb(); m->xyz=abc;
> 
> Anything I'm missing?
Stanislav Fomichev Nov. 10, 2022, 1:02 a.m. UTC | #9
On Wed, Nov 9, 2022 at 4:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/9/22 1:33 PM, Stanislav Fomichev wrote:
> > On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
> >>> Snipping a bit of context to reply to this bit:
> >>>
> >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
> >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
> >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
> >>>>>>> calling bpf_xdp_metadata_export_to_skb().
> >>>>>>
> >>>>>> What do you think the usecase here might be? Or are you suggesting we
> >>>>>> reject further access to data_meta after
> >>>>>> bpf_xdp_metadata_export_to_skb somehow?
> >>>>>>
> >>>>>> If we want to let the programs override some of this
> >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
> >>>>>> more kfuncs instead of exposing the layout?
> >>>>>>
> >>>>>> bpf_xdp_metadata_export_to_skb(ctx);
> >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
> >>>
> >>> There are several use cases for needing to access the metadata after
> >>> calling bpf_xdp_metdata_export_to_skb():
> >>>
> >>> - Accessing the metadata after redirect (in a cpumap or devmap program,
> >>>     or on a veth device)
> >>> - Transferring the packet+metadata to AF_XDP
> >> fwiw, the xdp prog could also be more selective and only stores one of the hints
> >> instead of the whole 'struct xdp_to_skb_metadata'.
> >>
> >>> - Returning XDP_PASS, but accessing some of the metadata first (whether
> >>>     to read or change it)
> >>>
> >>> The last one could be solved by calling additional kfuncs, but that
> >>> would be less efficient than just directly editing the struct which
> >>> will be cache-hot after the helper returns.
> >>
> >> Yeah, it is more efficient to directly write if possible.  I think this set
> >> allows the direct reading and writing already through data_meta (as a _u8 *).
> >>
> >>>
> >>> And yeah, this will allow the XDP program to inject arbitrary metadata
> >>> into the netstack; but it can already inject arbitrary *packet* data
> >>> into the stack, so not sure if this is much of an additional risk? If it
> >>> does lead to trivial crashes, we should probably harden the stack
> >>> against that?
> >>>
> >>> As for the random number, Jesper and I discussed replacing this with the
> >>> same BTF-ID scheme that he was using in his patch series. I.e., instead
> >>> of just putting in a random number, we insert the BTF ID of the metadata
> >>> struct at the end of it. This will allow us to support multiple
> >>> different formats in the future (not just changing the layout, but
> >>> having multiple simultaneous formats in the same kernel image), in case
> >>> we run out of space.
> >>
> >> This seems a bit hypothetical.  How much headroom does it usually have for the
> >> xdp prog?  Potentially the hints can use all the remaining space left after the
> >> header encap and the current bpf_xdp_adjust_meta() usage?
> >>
> >>>
> >>> We should probably also have a flag set on the xdp_frame so the stack
> >>> knows that the metadata area contains relevant-to-skb data, to guard
> >>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
> >>> in unrelated stuff it puts into the metadata area.
> >>
> >> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
> >> then transfer to the xdp_frame?
> >>
> >>>
> >>>> After re-reading patch 6, have another question. The 'void
> >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
> >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
> >>>> pointer and the xdp prog can directly read (or even write) it?
> >>>
> >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
> >>> or more fields just means that those fields will not be populated? We
> >>> should probably have a flags field inside the metadata struct itself to
> >>> indicate which fields are set or not, but I'm not sure returning an
> >>> error value adds anything? Returning a pointer to the metadata field
> >>> might be convenient for users (it would just be an alias to the
> >>> data_meta pointer, but the verifier could know its size, so the program
> >>> doesn't have to bounds check it).
> >>
> >> If some hints are not available, those hints should be initialized to
> >> 0/CHECKSUM_NONE/...etc.  The xdp prog needs a direct way to tell hard failure
> >> when it cannot write the meta area because of not enough space.  Comparing
> >> xdp->data_meta with xdp->data as a side effect is not intuitive.
> >>
> >> It is more than saving the bound check.  With type info of 'struct
> >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the
> >> middle of an integer member.  The verifier could also limit write access only to
> >> a few struct's members if it is needed.
> >>
> >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the
> >> xdp->data_meta.  They should actually point to different locations in the
> >> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff.
> >> xdp->data_meta won't be changed and keeps pointing to the last
> >> bpf_xdp_adjust_meta() location.  The kernel will know if there is
> >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the
> >> xdp_{buff,frame}.  Would it work?
> >>
> >>>
> >>>> A related question, why 'struct xdp_to_skb_metadata' needs
> >>>> __randomize_layout?
> >>>
> >>> The __randomize_layout thing is there to force BPF programs to use CO-RE
> >>> to access the field. This is to avoid the struct layout accidentally
> >>> ossifying because people in practice rely on a particular layout, even
> >>> though we tell them to use CO-RE. There are lots of examples of this
> >>> happening in other domains (IP header options, TCP options, etc), and
> >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
> >>
> >> I am not sure if it is necessary or helpful to only enforce __randomize_layout
> >> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and
> >> non tracing) that already have direct access (reading and/or writing) to other
> >> kernel structures.
> >>
> >> It is more important for the verifier to see the xdp prog accessing it as a
> >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so
> >> that the verifier can enforce the rules of access.
> >>
> >>>
> >>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
> >>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
> >>>>>>
> >>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
> >>>>>> now usually have C code to manually pull out the metadata (out of hw
> >>>>>> desc) and put it into skb.
> >>>>>>
> >>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
> >>>>>> XDP_PASS, we're doing a double amount of work:
> >>>>>> skb_metadata_import_from_xdp first, then custom driver code second.
> >>>>>>
> >>>>>> In theory, maybe we should completely skip drivers custom parsing when
> >>>>>> there is a prog with BPF_F_XDP_HAS_METADATA?
> >>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
> >>>>>> and won't require any mental work (plus, the drivers won't have to
> >>>>>> care either in the future).
> >>>>>>    > WDYT?
> >>>>>
> >>>>>
> >>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes
> >>>>> sense to only use the hints (if ever written) from xdp prog especially if it
> >>>>> will eventually support xdp prog changing some of the hints in the future.  For
> >>>>> now, I think either way is fine since they are the same and the xdp prog is sort
> >>>>> of doing extra unnecessary work anyway by calling
> >>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be
> >>>>> changed now.
> >>>
> >>> I agree it would be best if the drivers also use the XDP metadata (if
> >>> present) on XDP_PASS. Longer term my hope is we can make the XDP
> >>> metadata support the only thing drivers need to implement (i.e., have
> >>> the stack call into that code even when no XDP program is loaded), but
> >>> for now just for consistency (and allowing the XDP program to update the
> >>> metadata), we should probably at least consume it on XDP_PASS.
> >>>
> >>> -Toke
> >>>
> >
> > Not to derail the discussion (left the last message intact on top,
> > feel free to continue), but to summarize. The proposed changes seem to
> > be:
> >
> > 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct
> > xdp_to_skb_metadata"
> >    - This should let bpf programs change the metadata passed to the skb
> >
> > 2. "struct xdp_to_skb_metadata" should have its btf_id as the first
> > __u32 member (and remove the magic)
> >    - This is for the redirect case where the end users, including
> > AF_XDP, can parse this metadata from btf_id
>
> I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata.  I
> can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP
> rx_desc currently cannot tell if there is metadata written by the xdp prog or
> not.  However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP
> rx_desc->options, the btf_id may as well be not needed now.  However, the btf_id
> and other future new members can be added to the xdp_to_skb_metadata later if
> there is a need.
>
> For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to
> the xdp_to_skb_metadata.  The xdp prog will use CO-RE to access the members in
> xdp_to_skb_metadata.

Ack, good points on putting it at the end.
Regarding bit in desc->options vs btf_id: since it seems that btf_id
is useful anyway, let's start with that? We can add a bit later on if
it turns out using metadata is problematic otherwise.

> >    - This, however, is not all the metadata that the device can
> > support, but a much narrower set that the kernel is expected to use
> > for skb construction
> >
> > 3. __randomize_layout isn't really helping, CO-RE will trigger
> > regardless; maybe only the case where it matters is probably AF_XDP,
> > so still useful?
> >
> > 4. The presence of the metadata generated by
> > bpf_xdp_metadata_export_to_skb should be indicated by a flag in
> > xdp_{buff,frame}->flags
> >    - Assuming exposing it via xdp_md->has_skb_metadata is ok?
>
> probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type
> will be PTR_TO_BTF_ID_OR_NULL.

Oh, that seems even better than returning it from
bpf_xdp_metadata_export_to_skb.
bpf_xdp_metadata_export_to_skb can return true/false and the rest goes
via default verifier ctx resolution mechanism..
(returning ptr from a kfunc seems to be a bit complicated right now)

> >    - Since the programs probably need to do the following:
> >
> >    if (xdp_md->has_skb_metadata) {
> >      access/change skb metadata by doing struct xdp_to_skb_metadata *p
> > = data_meta;
>
> and directly access/change xdp->skb_metadata instead of using xdp->data_meta.

Ack.

> >    } else {
> >      use kfuncs
> >    }
> >
> > 5. Support the case where we keep program's metadata and kernel's
> > xdp_to_skb_metadata
> >    - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
> > rest of the metadata over it and adjusting the headroom
>
> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
> metadata.  xdp prog should usually work in this order also: read/write headers,
> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
>
> For the kernel and xdp prog, I don't think it matters where the
> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
> be before xdp->data because of the current data_meta and data comparison usage
> in the xdp prog.
>
> The order of the kernel's xdp_to_skb_metadata and the program's metadata
> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
> supports the program's metadata now.  afaict, it can only work now if there is
> some sort of contract between them or the AF_XDP currently does not use the
> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
> should be a no op if there is no program's metadata?  This behavior could also
> be configurable through setsockopt?

Agreed on all of the above. For now it seems like the safest thing to
do is to put xdp_to_skb_metadata last to allow af_xdp to properly
locate btf_id.
Let's see if Toke disagrees :-)


> Thanks for the summary!
>
> >
> >
> > I think the above solves all the cases Toke points to?
> >
> > a) Accessing the metadata after redirect (in a cpumap or devmap
> > program, or on a veth device)
> >    - only a small xdp_to_skb_metadata subset will work out of the box
> > iff the redirecttor calls bpf_xdp_metadata_export_to_skb; for the rest
> > the progs will have to agree on the layout, right?
> >
> > b) Transferring the packet+metadata to AF_XDP
> >    - here, again, the AF_XDP consumer will have to either expect
> > xdp_to_skb_metadata with a smaller set of skb-related metadata, or
> > will have to make sure the producer builds a custom layout using
> > kfuncs; there is also no flag to indicate whether xdp_to_skb_metadata
> > is there or not; the consumer will have to test btf_id at the right
> > offset
> >
> > c) Returning XDP_PASS, but accessing some of the metadata first
> > (whether to read or change it)
> >    - can read via kfuncs, can change via
> > bpf_xdp_metadata_export_to_skb(); m->xyz=abc;
> >
> > Anything I'm missing?
>
John Fastabend Nov. 10, 2022, 1:09 a.m. UTC | #10
Stanislav Fomichev wrote:
> Implement new bpf_xdp_metadata_export_to_skb kfunc which
> prepares compatible xdp metadata for kernel consumption.
> This kfunc should be called prior to bpf_redirect
> or (unless already called) when XDP_PASS'ing the frame
> into the kernel.

Hi,

Had a couple high level questions so starting a new thread thought
it would be more confusing than helpful to add to the thread on
this patch.

> 
> The implementation currently maintains xdp_to_skb_metadata
> layout by calling bpf_xdp_metadata_rx_timestamp and placing
> small magic number. From skb_metdata_set, when we get expected magic number,
> we interpret metadata accordingly.

From commit message side I'm not able to parse this paragraph without
reading code. Maybe expand it a bit for next version or it could
just be me.

> 
> Both magic number and struct layout are randomized to make sure
> it doesn't leak into the userspace.

Are we worried about leaking pointers into XDP program here? We already
leak pointers into XDP through helpers so I'm not sure it matters.

> 
> skb_metadata_set is amended with skb_metadata_import_from_xdp which
> tries to parse out the metadata and put it into skb.
> 
> See the comment for r1 vs r2/r3/r4/r5 conventions.

I think for next version an expanded commit message with use
cases would help. I had to follow the thread to get some ideas
why this might be useful.

> 
> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: David Ahern <dsahern@gmail.com>
> Cc: Martin KaFai Lau <martin.lau@linux.dev>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: Willem de Bruijn <willemb@google.com>
> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>
> Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
> Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
> Cc: Maryam Tahhan <mtahhan@redhat.com>
> Cc: xdp-hints@xdp-project.net
> Cc: netdev@vger.kernel.org
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  drivers/net/veth.c        |   4 +-
>  include/linux/bpf_patch.h |   2 +
>  include/linux/skbuff.h    |   4 ++
>  include/net/xdp.h         |  13 +++++
>  kernel/bpf/bpf_patch.c    |  30 +++++++++++
>  kernel/bpf/verifier.c     |  18 +++++++
>  net/core/skbuff.c         |  25 +++++++++
>  net/core/xdp.c            | 104 +++++++++++++++++++++++++++++++++++---
>  8 files changed, 193 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 0e629ceb087b..d4cd0938360b 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -1673,7 +1673,9 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  static void veth_unroll_kfunc(const struct bpf_prog *prog, u32 func_id,
>  			      struct bpf_patch *patch)
>  {
> -	if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) {
> +	if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) {
> +		return xdp_metadata_export_to_skb(prog, patch);
> +	} else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) {
>  		/* return true; */
>  		bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 1));
>  	} else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) {
> diff --git a/include/linux/bpf_patch.h b/include/linux/bpf_patch.h
> index 81ff738eef8d..359c165ad68b 100644
> --- a/include/linux/bpf_patch.h
> +++ b/include/linux/bpf_patch.h
> @@ -16,6 +16,8 @@ size_t bpf_patch_len(const struct bpf_patch *patch);
>  int bpf_patch_err(const struct bpf_patch *patch);
>  void __bpf_patch_append(struct bpf_patch *patch, struct bpf_insn insn);
>  struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch);
> +void bpf_patch_resolve_jmp(struct bpf_patch *patch);
> +u32 bpf_patch_magles_registers(const struct bpf_patch *patch);
>  
>  #define bpf_patch_append(patch, ...) ({ \
>  	struct bpf_insn insn[] = { __VA_ARGS__ }; \
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 59c9fd55699d..dba857f212d7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
>  	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
>  }
>  
> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len);
> +
>  static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
>  {
>  	skb_shinfo(skb)->meta_len = meta_len;
> +	if (meta_len)
> +		skb_metadata_import_from_xdp(skb, meta_len);
>  }
>  
>  static inline void skb_metadata_clear(struct sk_buff *skb)
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 2a82a98f2f9f..8c97c6996172 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -411,6 +411,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
>  #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE
>  
>  #define XDP_METADATA_KFUNC_xxx	\
> +	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_EXPORT_TO_SKB, \
> +			   bpf_xdp_metadata_export_to_skb) \
>  	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED, \
>  			   bpf_xdp_metadata_rx_timestamp_supported) \
>  	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
> @@ -423,14 +425,25 @@ XDP_METADATA_KFUNC_xxx
>  MAX_XDP_METADATA_KFUNC,
>  };
>  
> +struct xdp_to_skb_metadata {
> +	u32 magic; /* xdp_metadata_magic */
> +	u64 rx_timestamp;

Slightly confused. I thought/think most drivers populate the skb timestamp
if they can already? So why do we need to bounce these through some xdp
metadata? Won't all this cost more than the load/store directly from the
descriptor into the skb? Even if drivers are not populating skb now
shouldn't an ethtool knob be enough to turn this on?

I don't see the value of getting this in veth side its just a sw
timestamp there.

If its specific to cpumap shouldn't we land this in cpumap code paths
out of general XDP code paths?


> +} __randomize_layout;
> +
> +struct bpf_patch;
> +
>  #ifdef CONFIG_DEBUG_INFO_BTF
> +extern u32 xdp_metadata_magic;
>  extern struct btf_id_set8 xdp_metadata_kfunc_ids;
>  static inline u32 xdp_metadata_kfunc_id(int id)
>  {
>  	return xdp_metadata_kfunc_ids.pairs[id].id;
>  }
> +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch);
>  #else
> +#define xdp_metadata_magic 0
>  static inline u32 xdp_metadata_kfunc_id(int id) { return 0; }
> +static void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) { return 0; }
>  #endif
>  
>  #endif /* __LINUX_NET_XDP_H__ */
> diff --git a/kernel/bpf/bpf_patch.c b/kernel/bpf/bpf_patch.c
> index 82a10bf5624a..8f1fef74299c 100644
> --- a/kernel/bpf/bpf_patch.c
> +++ b/kernel/bpf/bpf_patch.c
> @@ -49,3 +49,33 @@ struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch)
>  {
>  	return patch->insn;
>  }

[...]

>  
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 42a35b59fb1e..37e3aef46525 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -72,6 +72,7 @@
>  #include <net/mptcp.h>
>  #include <net/mctp.h>
>  #include <net/page_pool.h>
> +#include <net/xdp.h>
>  
>  #include <linux/uaccess.h>
>  #include <trace/events/skb.h>
> @@ -6672,3 +6673,27 @@ nodefer:	__kfree_skb(skb);
>  	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
>  		smp_call_function_single_async(cpu, &sd->defer_csd);
>  }
> +
> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len)
> +{
> +	struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len);
> +
> +	/* Optional SKB info, currently missing:
> +	 * - HW checksum info		(skb->ip_summed)
> +	 * - HW RX hash			(skb_set_hash)
> +	 * - RX ring dev queue index	(skb_record_rx_queue)
> +	 */
> +
> +	if (len != sizeof(struct xdp_to_skb_metadata))
> +		return;
> +
> +	if (meta->magic != xdp_metadata_magic)
> +		return;
> +
> +	if (meta->rx_timestamp) {
> +		*skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){
> +			.hwtstamp = ns_to_ktime(meta->rx_timestamp),
> +		};
> +	}
> +}
> +EXPORT_SYMBOL(skb_metadata_import_from_xdp);
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 22f1e44700eb..8204fa05c5e9 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -653,12 +653,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
>  	/* Essential SKB info: protocol and skb->dev */
>  	skb->protocol = eth_type_trans(skb, dev);
>  
> -	/* Optional SKB info, currently missing:
> -	 * - HW checksum info		(skb->ip_summed)
> -	 * - HW RX hash			(skb_set_hash)
> -	 * - RX ring dev queue index	(skb_record_rx_queue)
> -	 */
> -
>  	/* Until page_pool get SKB return path, release DMA here */
>  	xdp_release_frame(xdpf);
>  
> @@ -712,6 +706,13 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
>  	return nxdpf;
>  }
>  
> +/* For the packets directed to the kernel, this kfunc exports XDP metadata
> + * into skb context.
> + */
> +noinline void bpf_xdp_metadata_export_to_skb(const struct xdp_md *ctx)
> +{
> +}
> +
>  /* Indicates whether particular device supports rx_timestamp metadata.
>   * This is an optional helper to support marking some branches as
>   * "dead code" in the BPF programs.
> @@ -737,13 +738,104 @@ XDP_METADATA_KFUNC_xxx
>  #undef XDP_METADATA_KFUNC
>  BTF_SET8_END(xdp_metadata_kfunc_ids)
>  
> +/* Make sure userspace doesn't depend on our layout by using
> + * different pseudo-generated magic value.
> + */
> +u32 xdp_metadata_magic;
> +
>  static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
>  	.owner = THIS_MODULE,
>  	.set   = &xdp_metadata_kfunc_ids,
>  };
>  
> +/* Since we're not actually doing a call but instead rewriting
> + * in place, we can only afford to use R0-R5 scratch registers.

Why not just do a call? Its neat to inline this but your going
to build an skb next. Thats not cheap and the cost of a call
should be complete noise when hitting the entire stack?

Any benchmark to convince us this is worthwhile optimizations?

> + *
> + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual
> + * metadata kfuncs use only R0,R4-R5.
> + *
> + * The above also means we _cannot_ easily call any other helper/kfunc
> + * because there is no place for us to preserve our R1 argument;
> + * existing R6-R9 belong to the callee.
> + */
> +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch)
> +{

[...]

>  }
>  late_initcall(xdp_metadata_init);

Thanks,
John
John Fastabend Nov. 10, 2022, 1:26 a.m. UTC | #11
Toke Høiland-Jørgensen wrote:
> Snipping a bit of context to reply to this bit:
> 
> >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
> >>>> sure it is solid enough by asking the xdp prog not to use the same random number
> >>>> in its own metadata + not to change the metadata through xdp->data_meta after
> >>>> calling bpf_xdp_metadata_export_to_skb().
> >>>
> >>> What do you think the usecase here might be? Or are you suggesting we
> >>> reject further access to data_meta after
> >>> bpf_xdp_metadata_export_to_skb somehow?
> >>>
> >>> If we want to let the programs override some of this
> >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
> >>> more kfuncs instead of exposing the layout?
> >>>
> >>> bpf_xdp_metadata_export_to_skb(ctx);
> >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
> 

Hi Toke,

Trying not to bifurcate your thread. Can I start a new one here to
elaborate on these use cases. I'm still a bit lost on any use case
for this that makes sense to actually deploy on a network.

> There are several use cases for needing to access the metadata after
> calling bpf_xdp_metdata_export_to_skb():
> 
> - Accessing the metadata after redirect (in a cpumap or devmap program,
>   or on a veth device)

I think for devmap there are still lots of opens how/where the skb
is even built.

For cpumap I'm a bit unsure what the use case is. For ice, mlx and
such you should use the hardware RSS if performance is top of mind.
And then for specific devices on cpumap (maybe realtime or ptp things?)
could we just throw it through the xdp_frame?

> - Transferring the packet+metadata to AF_XDP

In this case we have the metadata and AF_XDP program and XDP program
simply need to agree on metadata format. No need to have some magic
numbers and driver specific kfuncs.

> - Returning XDP_PASS, but accessing some of the metadata first (whether
>   to read or change it)
> 

I don't get this case? XDP_PASS should go to stack normally through
drivers build_skb routines. These will populate timestamp normally.
My guess is simply descriptor->skb load/store is cheaper than carrying
around this metadata and doing the call in BPF side. Anyways you
just built an entire skb and hit the stack I don't think you will
notice this noise in any benchmark.
Stanislav Fomichev Nov. 10, 2022, 6:44 a.m. UTC | #12
On Wed, Nov 9, 2022 at 5:09 PM John Fastabend <john.fastabend@gmail.com> wrote:
>
> Stanislav Fomichev wrote:
> > Implement new bpf_xdp_metadata_export_to_skb kfunc which
> > prepares compatible xdp metadata for kernel consumption.
> > This kfunc should be called prior to bpf_redirect
> > or (unless already called) when XDP_PASS'ing the frame
> > into the kernel.
>
> Hi,
>
> Had a couple high level questions so starting a new thread thought
> it would be more confusing than helpful to add to the thread on
> this patch.
>
> >
> > The implementation currently maintains xdp_to_skb_metadata
> > layout by calling bpf_xdp_metadata_rx_timestamp and placing
> > small magic number. From skb_metdata_set, when we get expected magic number,
> > we interpret metadata accordingly.
>
> From commit message side I'm not able to parse this paragraph without
> reading code. Maybe expand it a bit for next version or it could
> just be me.

Sure, will try to reword. In another thread we've discussed removing
the magic number, so I'll have to rewrite this part anyway :-)

> >
> > Both magic number and struct layout are randomized to make sure
> > it doesn't leak into the userspace.
>
> Are we worried about leaking pointers into XDP program here? We already
> leak pointers into XDP through helpers so I'm not sure it matters.

I wasn't sure here whether we want to have that xdp->skb (redirect)
path to be completely opaque or it's ok to expose this to the
bpf/af_xdp.
Seems like everybody's on board about exposing, so I'll be most likely
removing that randomization part.
(see my recent reply to Martin/Toke)

> >
> > skb_metadata_set is amended with skb_metadata_import_from_xdp which
> > tries to parse out the metadata and put it into skb.
> >
> > See the comment for r1 vs r2/r3/r4/r5 conventions.
>
> I think for next version an expanded commit message with use
> cases would help. I had to follow the thread to get some ideas
> why this might be useful.

Sure. I was planning to put together
Documentation/bpf/xdp_metadata.rst with more
details/assumptions/use-cases for the final non-rfc submissions..
Maybe that's better to help build up a full picture?

> > Cc: John Fastabend <john.fastabend@gmail.com>
> > Cc: David Ahern <dsahern@gmail.com>
> > Cc: Martin KaFai Lau <martin.lau@linux.dev>
> > Cc: Jakub Kicinski <kuba@kernel.org>
> > Cc: Willem de Bruijn <willemb@google.com>
> > Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>
> > Cc: Alexander Lobakin <alexandr.lobakin@intel.com>
> > Cc: Magnus Karlsson <magnus.karlsson@gmail.com>
> > Cc: Maryam Tahhan <mtahhan@redhat.com>
> > Cc: xdp-hints@xdp-project.net
> > Cc: netdev@vger.kernel.org
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  drivers/net/veth.c        |   4 +-
> >  include/linux/bpf_patch.h |   2 +
> >  include/linux/skbuff.h    |   4 ++
> >  include/net/xdp.h         |  13 +++++
> >  kernel/bpf/bpf_patch.c    |  30 +++++++++++
> >  kernel/bpf/verifier.c     |  18 +++++++
> >  net/core/skbuff.c         |  25 +++++++++
> >  net/core/xdp.c            | 104 +++++++++++++++++++++++++++++++++++---
> >  8 files changed, 193 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index 0e629ceb087b..d4cd0938360b 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -1673,7 +1673,9 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
> >  static void veth_unroll_kfunc(const struct bpf_prog *prog, u32 func_id,
> >                             struct bpf_patch *patch)
> >  {
> > -     if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) {
> > +     if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) {
> > +             return xdp_metadata_export_to_skb(prog, patch);
> > +     } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) {
> >               /* return true; */
> >               bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 1));
> >       } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) {
> > diff --git a/include/linux/bpf_patch.h b/include/linux/bpf_patch.h
> > index 81ff738eef8d..359c165ad68b 100644
> > --- a/include/linux/bpf_patch.h
> > +++ b/include/linux/bpf_patch.h
> > @@ -16,6 +16,8 @@ size_t bpf_patch_len(const struct bpf_patch *patch);
> >  int bpf_patch_err(const struct bpf_patch *patch);
> >  void __bpf_patch_append(struct bpf_patch *patch, struct bpf_insn insn);
> >  struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch);
> > +void bpf_patch_resolve_jmp(struct bpf_patch *patch);
> > +u32 bpf_patch_magles_registers(const struct bpf_patch *patch);
> >
> >  #define bpf_patch_append(patch, ...) ({ \
> >       struct bpf_insn insn[] = { __VA_ARGS__ }; \
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 59c9fd55699d..dba857f212d7 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
> >              true : __skb_metadata_differs(skb_a, skb_b, len_a);
> >  }
> >
> > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len);
> > +
> >  static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
> >  {
> >       skb_shinfo(skb)->meta_len = meta_len;
> > +     if (meta_len)
> > +             skb_metadata_import_from_xdp(skb, meta_len);
> >  }
> >
> >  static inline void skb_metadata_clear(struct sk_buff *skb)
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index 2a82a98f2f9f..8c97c6996172 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -411,6 +411,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
> >  #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE
> >
> >  #define XDP_METADATA_KFUNC_xxx       \
> > +     XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_EXPORT_TO_SKB, \
> > +                        bpf_xdp_metadata_export_to_skb) \
> >       XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED, \
> >                          bpf_xdp_metadata_rx_timestamp_supported) \
> >       XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
> > @@ -423,14 +425,25 @@ XDP_METADATA_KFUNC_xxx
> >  MAX_XDP_METADATA_KFUNC,
> >  };
> >
> > +struct xdp_to_skb_metadata {
> > +     u32 magic; /* xdp_metadata_magic */
> > +     u64 rx_timestamp;
>
> Slightly confused. I thought/think most drivers populate the skb timestamp
> if they can already? So why do we need to bounce these through some xdp
> metadata? Won't all this cost more than the load/store directly from the
> descriptor into the skb? Even if drivers are not populating skb now
> shouldn't an ethtool knob be enough to turn this on?

dsahern@ pointed out that it might be useful for the program to be
able to override some of that metadata.
Or, for example, if there is no rx vlan offload, the program can strip
it (as part of parsing) and put the vlan tag into that
xdp_to_skb_metadata.

> I don't see the value of getting this in veth side its just a sw
> timestamp there.

(veth is there so we can have some selftests)

> If its specific to cpumap shouldn't we land this in cpumap code paths
> out of general XDP code paths?

See above, if we run this at cpumap time it's too late?

> > +} __randomize_layout;
> > +
> > +struct bpf_patch;
> > +
> >  #ifdef CONFIG_DEBUG_INFO_BTF
> > +extern u32 xdp_metadata_magic;
> >  extern struct btf_id_set8 xdp_metadata_kfunc_ids;
> >  static inline u32 xdp_metadata_kfunc_id(int id)
> >  {
> >       return xdp_metadata_kfunc_ids.pairs[id].id;
> >  }
> > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch);
> >  #else
> > +#define xdp_metadata_magic 0
> >  static inline u32 xdp_metadata_kfunc_id(int id) { return 0; }
> > +static void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) { return 0; }
> >  #endif
> >
> >  #endif /* __LINUX_NET_XDP_H__ */
> > diff --git a/kernel/bpf/bpf_patch.c b/kernel/bpf/bpf_patch.c
> > index 82a10bf5624a..8f1fef74299c 100644
> > --- a/kernel/bpf/bpf_patch.c
> > +++ b/kernel/bpf/bpf_patch.c
> > @@ -49,3 +49,33 @@ struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch)
> >  {
> >       return patch->insn;
> >  }
>
> [...]
>
> >
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 42a35b59fb1e..37e3aef46525 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -72,6 +72,7 @@
> >  #include <net/mptcp.h>
> >  #include <net/mctp.h>
> >  #include <net/page_pool.h>
> > +#include <net/xdp.h>
> >
> >  #include <linux/uaccess.h>
> >  #include <trace/events/skb.h>
> > @@ -6672,3 +6673,27 @@ nodefer:       __kfree_skb(skb);
> >       if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
> >               smp_call_function_single_async(cpu, &sd->defer_csd);
> >  }
> > +
> > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len)
> > +{
> > +     struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len);
> > +
> > +     /* Optional SKB info, currently missing:
> > +      * - HW checksum info           (skb->ip_summed)
> > +      * - HW RX hash                 (skb_set_hash)
> > +      * - RX ring dev queue index    (skb_record_rx_queue)
> > +      */
> > +
> > +     if (len != sizeof(struct xdp_to_skb_metadata))
> > +             return;
> > +
> > +     if (meta->magic != xdp_metadata_magic)
> > +             return;
> > +
> > +     if (meta->rx_timestamp) {
> > +             *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){
> > +                     .hwtstamp = ns_to_ktime(meta->rx_timestamp),
> > +             };
> > +     }
> > +}
> > +EXPORT_SYMBOL(skb_metadata_import_from_xdp);
> > diff --git a/net/core/xdp.c b/net/core/xdp.c
> > index 22f1e44700eb..8204fa05c5e9 100644
> > --- a/net/core/xdp.c
> > +++ b/net/core/xdp.c
> > @@ -653,12 +653,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
> >       /* Essential SKB info: protocol and skb->dev */
> >       skb->protocol = eth_type_trans(skb, dev);
> >
> > -     /* Optional SKB info, currently missing:
> > -      * - HW checksum info           (skb->ip_summed)
> > -      * - HW RX hash                 (skb_set_hash)
> > -      * - RX ring dev queue index    (skb_record_rx_queue)
> > -      */
> > -
> >       /* Until page_pool get SKB return path, release DMA here */
> >       xdp_release_frame(xdpf);
> >
> > @@ -712,6 +706,13 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
> >       return nxdpf;
> >  }
> >
> > +/* For the packets directed to the kernel, this kfunc exports XDP metadata
> > + * into skb context.
> > + */
> > +noinline void bpf_xdp_metadata_export_to_skb(const struct xdp_md *ctx)
> > +{
> > +}
> > +
> >  /* Indicates whether particular device supports rx_timestamp metadata.
> >   * This is an optional helper to support marking some branches as
> >   * "dead code" in the BPF programs.
> > @@ -737,13 +738,104 @@ XDP_METADATA_KFUNC_xxx
> >  #undef XDP_METADATA_KFUNC
> >  BTF_SET8_END(xdp_metadata_kfunc_ids)
> >
> > +/* Make sure userspace doesn't depend on our layout by using
> > + * different pseudo-generated magic value.
> > + */
> > +u32 xdp_metadata_magic;
> > +
> >  static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
> >       .owner = THIS_MODULE,
> >       .set   = &xdp_metadata_kfunc_ids,
> >  };
> >
> > +/* Since we're not actually doing a call but instead rewriting
> > + * in place, we can only afford to use R0-R5 scratch registers.
>
> Why not just do a call? Its neat to inline this but your going
> to build an skb next. Thats not cheap and the cost of a call
> should be complete noise when hitting the entire stack?
>
> Any benchmark to convince us this is worthwhile optimizations?

I do have a suspicion that a non-zero amount of drivers will actually
resort to calling kernel instead of writing bpf bytecode (especially
since there might be some locks involved).

However if we prefer to go back to calls, there still has to be some
translation table.
I'm assuming we want to at least resolve indirect
netdev->kfunc_rx_metatada() calls at prog load time.
So we still need a per-netdev map from kfunc_id to real implementation func.
And I don't think that it would be more simple than the switch
statement that I have?



> > + *
> > + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual
> > + * metadata kfuncs use only R0,R4-R5.
> > + *
> > + * The above also means we _cannot_ easily call any other helper/kfunc
> > + * because there is no place for us to preserve our R1 argument;
> > + * existing R6-R9 belong to the callee.
> > + */
> > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch)
> > +{
>
> [...]
>
> >  }
> >  late_initcall(xdp_metadata_init);
>
> Thanks,
> John
>
>
Toke Høiland-Jørgensen Nov. 10, 2022, 2:19 p.m. UTC | #13
Martin KaFai Lau <martin.lau@linux.dev> writes:

> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
>> Snipping a bit of context to reply to this bit:
>> 
>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
>>>>>> calling bpf_xdp_metadata_export_to_skb().
>>>>>
>>>>> What do you think the usecase here might be? Or are you suggesting we
>>>>> reject further access to data_meta after
>>>>> bpf_xdp_metadata_export_to_skb somehow?
>>>>>
>>>>> If we want to let the programs override some of this
>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>>>>> more kfuncs instead of exposing the layout?
>>>>>
>>>>> bpf_xdp_metadata_export_to_skb(ctx);
>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
>> 
>> There are several use cases for needing to access the metadata after
>> calling bpf_xdp_metdata_export_to_skb():
>> 
>> - Accessing the metadata after redirect (in a cpumap or devmap program,
>>    or on a veth device)
>> - Transferring the packet+metadata to AF_XDP
> fwiw, the xdp prog could also be more selective and only stores one of the hints 
> instead of the whole 'struct xdp_to_skb_metadata'.

Yup, absolutely! In that sense, reusing the SKB format is mostly a
convenience feature. However, lots of people consume AF_XDP through the
default program installed by libxdp in the XSK setup code, and including
custom metadata there is awkward. So having the metadata consumed by the
stack as the "default subset" would enable easy consumption by
non-advanced users, while advanced users can still do custom stuff by
writing their own XDP program that calls the kfuncs.

>> - Returning XDP_PASS, but accessing some of the metadata first (whether
>>    to read or change it)
>> 
>> The last one could be solved by calling additional kfuncs, but that
>> would be less efficient than just directly editing the struct which
>> will be cache-hot after the helper returns.
>
> Yeah, it is more efficient to directly write if possible.  I think this set 
> allows the direct reading and writing already through data_meta (as a _u8 *).

Yup, totally fine with just keeping that capability :)

>> And yeah, this will allow the XDP program to inject arbitrary metadata
>> into the netstack; but it can already inject arbitrary *packet* data
>> into the stack, so not sure if this is much of an additional risk? If it
>> does lead to trivial crashes, we should probably harden the stack
>> against that?
>> 
>> As for the random number, Jesper and I discussed replacing this with the
>> same BTF-ID scheme that he was using in his patch series. I.e., instead
>> of just putting in a random number, we insert the BTF ID of the metadata
>> struct at the end of it. This will allow us to support multiple
>> different formats in the future (not just changing the layout, but
>> having multiple simultaneous formats in the same kernel image), in case
>> we run out of space.
>
> This seems a bit hypothetical.  How much headroom does it usually have for the 
> xdp prog?  Potentially the hints can use all the remaining space left after the 
> header encap and the current bpf_xdp_adjust_meta() usage?

For the metadata consumed by the stack right now it's a bit
hypothetical, yeah. However, there's a bunch of metadata commonly
supported by hardware that the stack currently doesn't consume and that
hopefully this feature will end up making more accessible. My hope is
that the stack can also learn how to use this in the future, in which
case we may run out of space. So I think of that bit mostly as
future-proofing...

>> We should probably also have a flag set on the xdp_frame so the stack
>> knows that the metadata area contains relevant-to-skb data, to guard
>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
>> in unrelated stuff it puts into the metadata area.
>
> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and 
> then transfer to the xdp_frame?

Yeah, exactly!

>>> After re-reading patch 6, have another question. The 'void
>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
>>> pointer and the xdp prog can directly read (or even write) it?
>> 
>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
>> or more fields just means that those fields will not be populated? We
>> should probably have a flags field inside the metadata struct itself to
>> indicate which fields are set or not, but I'm not sure returning an
>> error value adds anything? Returning a pointer to the metadata field
>> might be convenient for users (it would just be an alias to the
>> data_meta pointer, but the verifier could know its size, so the program
>> doesn't have to bounds check it).
>
> If some hints are not available, those hints should be initialized to
> 0/CHECKSUM_NONE/...etc.

The problem with that is that then we have to spend cycles writing
eight bytes of zeroes into the checksum field :)

> The xdp prog needs a direct way to tell hard failure when it cannot
> write the meta area because of not enough space. Comparing
> xdp->data_meta with xdp->data as a side effect is not intuitive.

Yeah, hence a flags field so we can just see if setting each field
succeeded?

> It is more than saving the bound check.  With type info of 'struct 
> xdp_to_skb_metadata *', the verifier can do more checks like reading in the 
> middle of an integer member.  The verifier could also limit write access only to 
> a few struct's members if it is needed.
>
> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the 
> xdp->data_meta.  They should actually point to different locations in the 
> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. 
> xdp->data_meta won't be changed and keeps pointing to the last 
> bpf_xdp_adjust_meta() location.  The kernel will know if there is 
> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the 
> xdp_{buff,frame}.  Would it work?

Hmm, logically splitting the program metadata and the xdp_hints metadata
(but having them share the same area) *could* work, I guess, I'm just
not sure it's worth the extra complexity?

>>> A related question, why 'struct xdp_to_skb_metadata' needs
>>> __randomize_layout?
>> 
>> The __randomize_layout thing is there to force BPF programs to use CO-RE
>> to access the field. This is to avoid the struct layout accidentally
>> ossifying because people in practice rely on a particular layout, even
>> though we tell them to use CO-RE. There are lots of examples of this
>> happening in other domains (IP header options, TCP options, etc), and
>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
>
> I am not sure if it is necessary or helpful to only enforce __randomize_layout 
> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and 
> non tracing) that already have direct access (reading and/or writing) to other 
> kernel structures.
>
> It is more important for the verifier to see the xdp prog accessing it as a 
> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so 
> that the verifier can enforce the rules of access.

That only works inside the kernel, though. Since the metadata field can
be copied wholesale to AF_XDP, having it randomized forces userspace
consumers to also write code to deal with the layout being dynamic...

-Toke
Toke Høiland-Jørgensen Nov. 10, 2022, 2:26 p.m. UTC | #14
Stanislav Fomichev <sdf@google.com> writes:

> On Wed, Nov 9, 2022 at 4:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/9/22 1:33 PM, Stanislav Fomichev wrote:
>> > On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>> >>
>> >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
>> >>> Snipping a bit of context to reply to this bit:
>> >>>
>> >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>> >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
>> >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
>> >>>>>>> calling bpf_xdp_metadata_export_to_skb().
>> >>>>>>
>> >>>>>> What do you think the usecase here might be? Or are you suggesting we
>> >>>>>> reject further access to data_meta after
>> >>>>>> bpf_xdp_metadata_export_to_skb somehow?
>> >>>>>>
>> >>>>>> If we want to let the programs override some of this
>> >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>> >>>>>> more kfuncs instead of exposing the layout?
>> >>>>>>
>> >>>>>> bpf_xdp_metadata_export_to_skb(ctx);
>> >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
>> >>>
>> >>> There are several use cases for needing to access the metadata after
>> >>> calling bpf_xdp_metdata_export_to_skb():
>> >>>
>> >>> - Accessing the metadata after redirect (in a cpumap or devmap program,
>> >>>     or on a veth device)
>> >>> - Transferring the packet+metadata to AF_XDP
>> >> fwiw, the xdp prog could also be more selective and only stores one of the hints
>> >> instead of the whole 'struct xdp_to_skb_metadata'.
>> >>
>> >>> - Returning XDP_PASS, but accessing some of the metadata first (whether
>> >>>     to read or change it)
>> >>>
>> >>> The last one could be solved by calling additional kfuncs, but that
>> >>> would be less efficient than just directly editing the struct which
>> >>> will be cache-hot after the helper returns.
>> >>
>> >> Yeah, it is more efficient to directly write if possible.  I think this set
>> >> allows the direct reading and writing already through data_meta (as a _u8 *).
>> >>
>> >>>
>> >>> And yeah, this will allow the XDP program to inject arbitrary metadata
>> >>> into the netstack; but it can already inject arbitrary *packet* data
>> >>> into the stack, so not sure if this is much of an additional risk? If it
>> >>> does lead to trivial crashes, we should probably harden the stack
>> >>> against that?
>> >>>
>> >>> As for the random number, Jesper and I discussed replacing this with the
>> >>> same BTF-ID scheme that he was using in his patch series. I.e., instead
>> >>> of just putting in a random number, we insert the BTF ID of the metadata
>> >>> struct at the end of it. This will allow us to support multiple
>> >>> different formats in the future (not just changing the layout, but
>> >>> having multiple simultaneous formats in the same kernel image), in case
>> >>> we run out of space.
>> >>
>> >> This seems a bit hypothetical.  How much headroom does it usually have for the
>> >> xdp prog?  Potentially the hints can use all the remaining space left after the
>> >> header encap and the current bpf_xdp_adjust_meta() usage?
>> >>
>> >>>
>> >>> We should probably also have a flag set on the xdp_frame so the stack
>> >>> knows that the metadata area contains relevant-to-skb data, to guard
>> >>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
>> >>> in unrelated stuff it puts into the metadata area.
>> >>
>> >> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
>> >> then transfer to the xdp_frame?
>> >>
>> >>>
>> >>>> After re-reading patch 6, have another question. The 'void
>> >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
>> >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
>> >>>> pointer and the xdp prog can directly read (or even write) it?
>> >>>
>> >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
>> >>> or more fields just means that those fields will not be populated? We
>> >>> should probably have a flags field inside the metadata struct itself to
>> >>> indicate which fields are set or not, but I'm not sure returning an
>> >>> error value adds anything? Returning a pointer to the metadata field
>> >>> might be convenient for users (it would just be an alias to the
>> >>> data_meta pointer, but the verifier could know its size, so the program
>> >>> doesn't have to bounds check it).
>> >>
>> >> If some hints are not available, those hints should be initialized to
>> >> 0/CHECKSUM_NONE/...etc.  The xdp prog needs a direct way to tell hard failure
>> >> when it cannot write the meta area because of not enough space.  Comparing
>> >> xdp->data_meta with xdp->data as a side effect is not intuitive.
>> >>
>> >> It is more than saving the bound check.  With type info of 'struct
>> >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the
>> >> middle of an integer member.  The verifier could also limit write access only to
>> >> a few struct's members if it is needed.
>> >>
>> >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the
>> >> xdp->data_meta.  They should actually point to different locations in the
>> >> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff.
>> >> xdp->data_meta won't be changed and keeps pointing to the last
>> >> bpf_xdp_adjust_meta() location.  The kernel will know if there is
>> >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the
>> >> xdp_{buff,frame}.  Would it work?
>> >>
>> >>>
>> >>>> A related question, why 'struct xdp_to_skb_metadata' needs
>> >>>> __randomize_layout?
>> >>>
>> >>> The __randomize_layout thing is there to force BPF programs to use CO-RE
>> >>> to access the field. This is to avoid the struct layout accidentally
>> >>> ossifying because people in practice rely on a particular layout, even
>> >>> though we tell them to use CO-RE. There are lots of examples of this
>> >>> happening in other domains (IP header options, TCP options, etc), and
>> >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
>> >>
>> >> I am not sure if it is necessary or helpful to only enforce __randomize_layout
>> >> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and
>> >> non tracing) that already have direct access (reading and/or writing) to other
>> >> kernel structures.
>> >>
>> >> It is more important for the verifier to see the xdp prog accessing it as a
>> >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so
>> >> that the verifier can enforce the rules of access.
>> >>
>> >>>
>> >>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
>> >>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
>> >>>>>>
>> >>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
>> >>>>>> now usually have C code to manually pull out the metadata (out of hw
>> >>>>>> desc) and put it into skb.
>> >>>>>>
>> >>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
>> >>>>>> XDP_PASS, we're doing a double amount of work:
>> >>>>>> skb_metadata_import_from_xdp first, then custom driver code second.
>> >>>>>>
>> >>>>>> In theory, maybe we should completely skip drivers custom parsing when
>> >>>>>> there is a prog with BPF_F_XDP_HAS_METADATA?
>> >>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
>> >>>>>> and won't require any mental work (plus, the drivers won't have to
>> >>>>>> care either in the future).
>> >>>>>>    > WDYT?
>> >>>>>
>> >>>>>
>> >>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes
>> >>>>> sense to only use the hints (if ever written) from xdp prog especially if it
>> >>>>> will eventually support xdp prog changing some of the hints in the future.  For
>> >>>>> now, I think either way is fine since they are the same and the xdp prog is sort
>> >>>>> of doing extra unnecessary work anyway by calling
>> >>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be
>> >>>>> changed now.
>> >>>
>> >>> I agree it would be best if the drivers also use the XDP metadata (if
>> >>> present) on XDP_PASS. Longer term my hope is we can make the XDP
>> >>> metadata support the only thing drivers need to implement (i.e., have
>> >>> the stack call into that code even when no XDP program is loaded), but
>> >>> for now just for consistency (and allowing the XDP program to update the
>> >>> metadata), we should probably at least consume it on XDP_PASS.
>> >>>
>> >>> -Toke
>> >>>
>> >
>> > Not to derail the discussion (left the last message intact on top,
>> > feel free to continue), but to summarize. The proposed changes seem to
>> > be:
>> >
>> > 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct
>> > xdp_to_skb_metadata"
>> >    - This should let bpf programs change the metadata passed to the skb
>> >
>> > 2. "struct xdp_to_skb_metadata" should have its btf_id as the first
>> > __u32 member (and remove the magic)
>> >    - This is for the redirect case where the end users, including
>> > AF_XDP, can parse this metadata from btf_id
>>
>> I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata.  I
>> can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP
>> rx_desc currently cannot tell if there is metadata written by the xdp prog or
>> not.  However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP
>> rx_desc->options, the btf_id may as well be not needed now.  However, the btf_id
>> and other future new members can be added to the xdp_to_skb_metadata later if
>> there is a need.
>>
>> For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to
>> the xdp_to_skb_metadata.  The xdp prog will use CO-RE to access the members in
>> xdp_to_skb_metadata.
>
> Ack, good points on putting it at the end.
> Regarding bit in desc->options vs btf_id: since it seems that btf_id
> is useful anyway, let's start with that? We can add a bit later on if
> it turns out using metadata is problematic otherwise.

I think the bit is mostly useful so that the stack can know that the
metadata has been set before consuming it (to guard against regular
xdp_metadata usage accidentally hitting the "right" BTF ID). I don't
think it needs to be exposed to the XDP programs themselves.

>> >    - This, however, is not all the metadata that the device can
>> > support, but a much narrower set that the kernel is expected to use
>> > for skb construction
>> >
>> > 3. __randomize_layout isn't really helping, CO-RE will trigger
>> > regardless; maybe only the case where it matters is probably AF_XDP,
>> > so still useful?

Yeah, see my response to Martin, I think the randomisation is useful for
AF_XDP transfer.

>> > 4. The presence of the metadata generated by
>> > bpf_xdp_metadata_export_to_skb should be indicated by a flag in
>> > xdp_{buff,frame}->flags
>> >    - Assuming exposing it via xdp_md->has_skb_metadata is ok?
>>
>> probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type
>> will be PTR_TO_BTF_ID_OR_NULL.
>
> Oh, that seems even better than returning it from
> bpf_xdp_metadata_export_to_skb.
> bpf_xdp_metadata_export_to_skb can return true/false and the rest goes
> via default verifier ctx resolution mechanism..
> (returning ptr from a kfunc seems to be a bit complicated right now)

See my response to John in the other thread about mixing stable UAPI (in
xdp_md) and unstable BTF structures in the xdp_md struct: I think this
is confusing and would prefer a kfunc.

>> >    - Since the programs probably need to do the following:
>> >
>> >    if (xdp_md->has_skb_metadata) {
>> >      access/change skb metadata by doing struct xdp_to_skb_metadata *p
>> > = data_meta;
>>
>> and directly access/change xdp->skb_metadata instead of using xdp->data_meta.
>
> Ack.
>
>> >    } else {
>> >      use kfuncs
>> >    }
>> >
>> > 5. Support the case where we keep program's metadata and kernel's
>> > xdp_to_skb_metadata
>> >    - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
>> > rest of the metadata over it and adjusting the headroom
>>
>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
>> metadata.  xdp prog should usually work in this order also: read/write headers,
>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
>> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
>>
>> For the kernel and xdp prog, I don't think it matters where the
>> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
>> be before xdp->data because of the current data_meta and data comparison usage
>> in the xdp prog.
>>
>> The order of the kernel's xdp_to_skb_metadata and the program's metadata
>> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
>> supports the program's metadata now.  afaict, it can only work now if there is
>> some sort of contract between them or the AF_XDP currently does not use the
>> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
>> should be a no op if there is no program's metadata?  This behavior could also
>> be configurable through setsockopt?
>
> Agreed on all of the above. For now it seems like the safest thing to
> do is to put xdp_to_skb_metadata last to allow af_xdp to properly
> locate btf_id.
> Let's see if Toke disagrees :-)

As I replied to Martin, I'm not sure it's worth the complexity to
logically split the SKB metadata from the program's own metadata (as
opposed to just reusing the existing data_meta pointer)?

However, if we do, the layout that makes most sense to me is putting the
skb metadata before the program metadata, like:

--------------
| skb_metadata
--------------
| data_meta
--------------
| data
--------------

Not sure if that's what you meant? :)

-Toke
Toke Høiland-Jørgensen Nov. 10, 2022, 2:32 p.m. UTC | #15
John Fastabend <john.fastabend@gmail.com> writes:

> Toke Høiland-Jørgensen wrote:
>> Snipping a bit of context to reply to this bit:
>> 
>> >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>> >>>> sure it is solid enough by asking the xdp prog not to use the same random number
>> >>>> in its own metadata + not to change the metadata through xdp->data_meta after
>> >>>> calling bpf_xdp_metadata_export_to_skb().
>> >>>
>> >>> What do you think the usecase here might be? Or are you suggesting we
>> >>> reject further access to data_meta after
>> >>> bpf_xdp_metadata_export_to_skb somehow?
>> >>>
>> >>> If we want to let the programs override some of this
>> >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>> >>> more kfuncs instead of exposing the layout?
>> >>>
>> >>> bpf_xdp_metadata_export_to_skb(ctx);
>> >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
>> 
>
> Hi Toke,
>
> Trying not to bifurcate your thread. Can I start a new one here to
> elaborate on these use cases. I'm still a bit lost on any use case
> for this that makes sense to actually deploy on a network.
>
>> There are several use cases for needing to access the metadata after
>> calling bpf_xdp_metdata_export_to_skb():
>> 
>> - Accessing the metadata after redirect (in a cpumap or devmap program,
>>   or on a veth device)
>
> I think for devmap there are still lots of opens how/where the skb
> is even built.

For veth it's pretty clear; i.e., when redirecting into containers.

> For cpumap I'm a bit unsure what the use case is. For ice, mlx and
> such you should use the hardware RSS if performance is top of mind.

Hardware RSS works fine if your hardware supports the hashing you want;
many do not. As an example, Jesper wrote this application that uses
cpumap to divide out ISP customer traffic among different CPUs (solving
an HTB scaling problem):

https://github.com/xdp-project/xdp-cpumap-tc

> And then for specific devices on cpumap (maybe realtime or ptp
> things?) could we just throw it through the xdp_frame?

Not sure what you mean here? Throw what through the xdp_frame?

>> - Transferring the packet+metadata to AF_XDP
>
> In this case we have the metadata and AF_XDP program and XDP program
> simply need to agree on metadata format. No need to have some magic
> numbers and driver specific kfuncs.

See my other reply to Martin: Yeah, for AF_XDP users that write their
own kernel XDP programs, they can just do whatever they want. But many
users just rely on the default program in libxdp, so having a standard
format to include with that is useful.

>> - Returning XDP_PASS, but accessing some of the metadata first (whether
>>   to read or change it)
>> 
>
> I don't get this case? XDP_PASS should go to stack normally through
> drivers build_skb routines. These will populate timestamp normally.
> My guess is simply descriptor->skb load/store is cheaper than carrying
> around this metadata and doing the call in BPF side. Anyways you
> just built an entire skb and hit the stack I don't think you will
> notice this noise in any benchmark.

If you modify the packet before calling XDP_PASS you may want to update
the metadata as well (for instance the RX hash, or in the future the
metadata could also carry transport header offsets).

-Toke
John Fastabend Nov. 10, 2022, 5:30 p.m. UTC | #16
Toke Høiland-Jørgensen wrote:
> John Fastabend <john.fastabend@gmail.com> writes:
> 
> > Toke Høiland-Jørgensen wrote:
> >> Snipping a bit of context to reply to this bit:
> >> 
> >> >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
> >> >>>> sure it is solid enough by asking the xdp prog not to use the same random number
> >> >>>> in its own metadata + not to change the metadata through xdp->data_meta after
> >> >>>> calling bpf_xdp_metadata_export_to_skb().
> >> >>>
> >> >>> What do you think the usecase here might be? Or are you suggesting we
> >> >>> reject further access to data_meta after
> >> >>> bpf_xdp_metadata_export_to_skb somehow?
> >> >>>
> >> >>> If we want to let the programs override some of this
> >> >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
> >> >>> more kfuncs instead of exposing the layout?
> >> >>>
> >> >>> bpf_xdp_metadata_export_to_skb(ctx);
> >> >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
> >> 
> >
> > Hi Toke,
> >
> > Trying not to bifurcate your thread. Can I start a new one here to
> > elaborate on these use cases. I'm still a bit lost on any use case
> > for this that makes sense to actually deploy on a network.
> >
> >> There are several use cases for needing to access the metadata after
> >> calling bpf_xdp_metdata_export_to_skb():
> >> 
> >> - Accessing the metadata after redirect (in a cpumap or devmap program,
> >>   or on a veth device)
> >
> > I think for devmap there are still lots of opens how/where the skb
> > is even built.
> 
> For veth it's pretty clear; i.e., when redirecting into containers.

Ah but I think XDP on veth is a bit questionable in general. The use
case is NFV I guess but its not how I would build out NFV. I've never
seen it actually deployed other than in CI. Anyways not necessary to
drop into that debate here. It exists so OK.

> 
> > For cpumap I'm a bit unsure what the use case is. For ice, mlx and
> > such you should use the hardware RSS if performance is top of mind.
> 
> Hardware RSS works fine if your hardware supports the hashing you want;
> many do not. As an example, Jesper wrote this application that uses
> cpumap to divide out ISP customer traffic among different CPUs (solving
> an HTB scaling problem):
> 
> https://github.com/xdp-project/xdp-cpumap-tc

I'm going to argue hw should be able to do this still and we
should fix the hw but maybe not easily doable without convincing
hardware folks to talk to us.

Also not obvious tto me how linked code works without more studying, its
ingress HTB? So you would push the rxhash and timestamp into cpumap and
then build the skb here with the correct skb->timestamp?

OK even if I can't exactly find the use case for cpumap if I had
a use case I can see how passing metadata through is useful.

> 
> > And then for specific devices on cpumap (maybe realtime or ptp
> > things?) could we just throw it through the xdp_frame?
> 
> Not sure what you mean here? Throw what through the xdp_frame?

Doesn't matter reread patches and figured it out I was slightly
confused.

> 
> >> - Transferring the packet+metadata to AF_XDP
> >
> > In this case we have the metadata and AF_XDP program and XDP program
> > simply need to agree on metadata format. No need to have some magic
> > numbers and driver specific kfuncs.
> 
> See my other reply to Martin: Yeah, for AF_XDP users that write their
> own kernel XDP programs, they can just do whatever they want. But many
> users just rely on the default program in libxdp, so having a standard
> format to include with that is useful.
> 

I don't think your AF_XDP program is any different than other AF_XDP
programs. Your lib can create a standard format if it wants but
kernel doesn't need to enforce it anyway.


> >> - Returning XDP_PASS, but accessing some of the metadata first (whether
> >>   to read or change it)
> >> 
> >
> > I don't get this case? XDP_PASS should go to stack normally through
> > drivers build_skb routines. These will populate timestamp normally.
> > My guess is simply descriptor->skb load/store is cheaper than carrying
> > around this metadata and doing the call in BPF side. Anyways you
> > just built an entire skb and hit the stack I don't think you will
> > notice this noise in any benchmark.
> 
> If you modify the packet before calling XDP_PASS you may want to update
> the metadata as well (for instance the RX hash, or in the future the
> metadata could also carry transport header offsets).

OK. So when you modify the pkt fixing up the rxhash makes sense. Thanks
for the response I can see the argument.

Thanks,
John
Stanislav Fomichev Nov. 10, 2022, 6:52 p.m. UTC | #17
On Thu, Nov 10, 2022 at 6:27 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Stanislav Fomichev <sdf@google.com> writes:
>
> > On Wed, Nov 9, 2022 at 4:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 11/9/22 1:33 PM, Stanislav Fomichev wrote:
> >> > On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >> >>
> >> >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
> >> >>> Snipping a bit of context to reply to this bit:
> >> >>>
> >> >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
> >> >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
> >> >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
> >> >>>>>>> calling bpf_xdp_metadata_export_to_skb().
> >> >>>>>>
> >> >>>>>> What do you think the usecase here might be? Or are you suggesting we
> >> >>>>>> reject further access to data_meta after
> >> >>>>>> bpf_xdp_metadata_export_to_skb somehow?
> >> >>>>>>
> >> >>>>>> If we want to let the programs override some of this
> >> >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
> >> >>>>>> more kfuncs instead of exposing the layout?
> >> >>>>>>
> >> >>>>>> bpf_xdp_metadata_export_to_skb(ctx);
> >> >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
> >> >>>
> >> >>> There are several use cases for needing to access the metadata after
> >> >>> calling bpf_xdp_metdata_export_to_skb():
> >> >>>
> >> >>> - Accessing the metadata after redirect (in a cpumap or devmap program,
> >> >>>     or on a veth device)
> >> >>> - Transferring the packet+metadata to AF_XDP
> >> >> fwiw, the xdp prog could also be more selective and only stores one of the hints
> >> >> instead of the whole 'struct xdp_to_skb_metadata'.
> >> >>
> >> >>> - Returning XDP_PASS, but accessing some of the metadata first (whether
> >> >>>     to read or change it)
> >> >>>
> >> >>> The last one could be solved by calling additional kfuncs, but that
> >> >>> would be less efficient than just directly editing the struct which
> >> >>> will be cache-hot after the helper returns.
> >> >>
> >> >> Yeah, it is more efficient to directly write if possible.  I think this set
> >> >> allows the direct reading and writing already through data_meta (as a _u8 *).
> >> >>
> >> >>>
> >> >>> And yeah, this will allow the XDP program to inject arbitrary metadata
> >> >>> into the netstack; but it can already inject arbitrary *packet* data
> >> >>> into the stack, so not sure if this is much of an additional risk? If it
> >> >>> does lead to trivial crashes, we should probably harden the stack
> >> >>> against that?
> >> >>>
> >> >>> As for the random number, Jesper and I discussed replacing this with the
> >> >>> same BTF-ID scheme that he was using in his patch series. I.e., instead
> >> >>> of just putting in a random number, we insert the BTF ID of the metadata
> >> >>> struct at the end of it. This will allow us to support multiple
> >> >>> different formats in the future (not just changing the layout, but
> >> >>> having multiple simultaneous formats in the same kernel image), in case
> >> >>> we run out of space.
> >> >>
> >> >> This seems a bit hypothetical.  How much headroom does it usually have for the
> >> >> xdp prog?  Potentially the hints can use all the remaining space left after the
> >> >> header encap and the current bpf_xdp_adjust_meta() usage?
> >> >>
> >> >>>
> >> >>> We should probably also have a flag set on the xdp_frame so the stack
> >> >>> knows that the metadata area contains relevant-to-skb data, to guard
> >> >>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
> >> >>> in unrelated stuff it puts into the metadata area.
> >> >>
> >> >> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
> >> >> then transfer to the xdp_frame?
> >> >>
> >> >>>
> >> >>>> After re-reading patch 6, have another question. The 'void
> >> >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
> >> >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
> >> >>>> pointer and the xdp prog can directly read (or even write) it?
> >> >>>
> >> >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
> >> >>> or more fields just means that those fields will not be populated? We
> >> >>> should probably have a flags field inside the metadata struct itself to
> >> >>> indicate which fields are set or not, but I'm not sure returning an
> >> >>> error value adds anything? Returning a pointer to the metadata field
> >> >>> might be convenient for users (it would just be an alias to the
> >> >>> data_meta pointer, but the verifier could know its size, so the program
> >> >>> doesn't have to bounds check it).
> >> >>
> >> >> If some hints are not available, those hints should be initialized to
> >> >> 0/CHECKSUM_NONE/...etc.  The xdp prog needs a direct way to tell hard failure
> >> >> when it cannot write the meta area because of not enough space.  Comparing
> >> >> xdp->data_meta with xdp->data as a side effect is not intuitive.
> >> >>
> >> >> It is more than saving the bound check.  With type info of 'struct
> >> >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the
> >> >> middle of an integer member.  The verifier could also limit write access only to
> >> >> a few struct's members if it is needed.
> >> >>
> >> >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the
> >> >> xdp->data_meta.  They should actually point to different locations in the
> >> >> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff.
> >> >> xdp->data_meta won't be changed and keeps pointing to the last
> >> >> bpf_xdp_adjust_meta() location.  The kernel will know if there is
> >> >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the
> >> >> xdp_{buff,frame}.  Would it work?
> >> >>
> >> >>>
> >> >>>> A related question, why 'struct xdp_to_skb_metadata' needs
> >> >>>> __randomize_layout?
> >> >>>
> >> >>> The __randomize_layout thing is there to force BPF programs to use CO-RE
> >> >>> to access the field. This is to avoid the struct layout accidentally
> >> >>> ossifying because people in practice rely on a particular layout, even
> >> >>> though we tell them to use CO-RE. There are lots of examples of this
> >> >>> happening in other domains (IP header options, TCP options, etc), and
> >> >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
> >> >>
> >> >> I am not sure if it is necessary or helpful to only enforce __randomize_layout
> >> >> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and
> >> >> non tracing) that already have direct access (reading and/or writing) to other
> >> >> kernel structures.
> >> >>
> >> >> It is more important for the verifier to see the xdp prog accessing it as a
> >> >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so
> >> >> that the verifier can enforce the rules of access.
> >> >>
> >> >>>
> >> >>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the
> >> >>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only?
> >> >>>>>>
> >> >>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right
> >> >>>>>> now usually have C code to manually pull out the metadata (out of hw
> >> >>>>>> desc) and put it into skb.
> >> >>>>>>
> >> >>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for
> >> >>>>>> XDP_PASS, we're doing a double amount of work:
> >> >>>>>> skb_metadata_import_from_xdp first, then custom driver code second.
> >> >>>>>>
> >> >>>>>> In theory, maybe we should completely skip drivers custom parsing when
> >> >>>>>> there is a prog with BPF_F_XDP_HAS_METADATA?
> >> >>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven
> >> >>>>>> and won't require any mental work (plus, the drivers won't have to
> >> >>>>>> care either in the future).
> >> >>>>>>    > WDYT?
> >> >>>>>
> >> >>>>>
> >> >>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes
> >> >>>>> sense to only use the hints (if ever written) from xdp prog especially if it
> >> >>>>> will eventually support xdp prog changing some of the hints in the future.  For
> >> >>>>> now, I think either way is fine since they are the same and the xdp prog is sort
> >> >>>>> of doing extra unnecessary work anyway by calling
> >> >>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be
> >> >>>>> changed now.
> >> >>>
> >> >>> I agree it would be best if the drivers also use the XDP metadata (if
> >> >>> present) on XDP_PASS. Longer term my hope is we can make the XDP
> >> >>> metadata support the only thing drivers need to implement (i.e., have
> >> >>> the stack call into that code even when no XDP program is loaded), but
> >> >>> for now just for consistency (and allowing the XDP program to update the
> >> >>> metadata), we should probably at least consume it on XDP_PASS.
> >> >>>
> >> >>> -Toke
> >> >>>
> >> >
> >> > Not to derail the discussion (left the last message intact on top,
> >> > feel free to continue), but to summarize. The proposed changes seem to
> >> > be:
> >> >
> >> > 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct
> >> > xdp_to_skb_metadata"
> >> >    - This should let bpf programs change the metadata passed to the skb
> >> >
> >> > 2. "struct xdp_to_skb_metadata" should have its btf_id as the first
> >> > __u32 member (and remove the magic)
> >> >    - This is for the redirect case where the end users, including
> >> > AF_XDP, can parse this metadata from btf_id
> >>
> >> I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata.  I
> >> can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP
> >> rx_desc currently cannot tell if there is metadata written by the xdp prog or
> >> not.  However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP
> >> rx_desc->options, the btf_id may as well be not needed now.  However, the btf_id
> >> and other future new members can be added to the xdp_to_skb_metadata later if
> >> there is a need.
> >>
> >> For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to
> >> the xdp_to_skb_metadata.  The xdp prog will use CO-RE to access the members in
> >> xdp_to_skb_metadata.
> >
> > Ack, good points on putting it at the end.
> > Regarding bit in desc->options vs btf_id: since it seems that btf_id
> > is useful anyway, let's start with that? We can add a bit later on if
> > it turns out using metadata is problematic otherwise.
>
> I think the bit is mostly useful so that the stack can know that the
> metadata has been set before consuming it (to guard against regular
> xdp_metadata usage accidentally hitting the "right" BTF ID). I don't
> think it needs to be exposed to the XDP programs themselves.

SG! A flag for xdp_buff/frame and a kfunc to query it for bpf.

> >> >    - This, however, is not all the metadata that the device can
> >> > support, but a much narrower set that the kernel is expected to use
> >> > for skb construction
> >> >
> >> > 3. __randomize_layout isn't really helping, CO-RE will trigger
> >> > regardless; maybe only the case where it matters is probably AF_XDP,
> >> > so still useful?
>
> Yeah, see my response to Martin, I think the randomisation is useful for
> AF_XDP transfer.

SG. Let's keep it for now. Worst case, if it hurts, we can remove it later...

> >> > 4. The presence of the metadata generated by
> >> > bpf_xdp_metadata_export_to_skb should be indicated by a flag in
> >> > xdp_{buff,frame}->flags
> >> >    - Assuming exposing it via xdp_md->has_skb_metadata is ok?
> >>
> >> probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type
> >> will be PTR_TO_BTF_ID_OR_NULL.
> >
> > Oh, that seems even better than returning it from
> > bpf_xdp_metadata_export_to_skb.
> > bpf_xdp_metadata_export_to_skb can return true/false and the rest goes
> > via default verifier ctx resolution mechanism..
> > (returning ptr from a kfunc seems to be a bit complicated right now)
>
> See my response to John in the other thread about mixing stable UAPI (in
> xdp_md) and unstable BTF structures in the xdp_md struct: I think this
> is confusing and would prefer a kfunc.

SG!

> >> >    - Since the programs probably need to do the following:
> >> >
> >> >    if (xdp_md->has_skb_metadata) {
> >> >      access/change skb metadata by doing struct xdp_to_skb_metadata *p
> >> > = data_meta;
> >>
> >> and directly access/change xdp->skb_metadata instead of using xdp->data_meta.
> >
> > Ack.
> >
> >> >    } else {
> >> >      use kfuncs
> >> >    }
> >> >
> >> > 5. Support the case where we keep program's metadata and kernel's
> >> > xdp_to_skb_metadata
> >> >    - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
> >> > rest of the metadata over it and adjusting the headroom
> >>
> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
> >> metadata.  xdp prog should usually work in this order also: read/write headers,
> >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
> >> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
> >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
> >>
> >> For the kernel and xdp prog, I don't think it matters where the
> >> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
> >> be before xdp->data because of the current data_meta and data comparison usage
> >> in the xdp prog.
> >>
> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata
> >> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
> >> supports the program's metadata now.  afaict, it can only work now if there is
> >> some sort of contract between them or the AF_XDP currently does not use the
> >> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
> >> should be a no op if there is no program's metadata?  This behavior could also
> >> be configurable through setsockopt?
> >
> > Agreed on all of the above. For now it seems like the safest thing to
> > do is to put xdp_to_skb_metadata last to allow af_xdp to properly
> > locate btf_id.
> > Let's see if Toke disagrees :-)
>
> As I replied to Martin, I'm not sure it's worth the complexity to
> logically split the SKB metadata from the program's own metadata (as
> opposed to just reusing the existing data_meta pointer)?

I'd gladly keep my current requirement where it's either or, but not both :-)
We can relax it later if required?

> However, if we do, the layout that makes most sense to me is putting the
> skb metadata before the program metadata, like:
>
> --------------
> | skb_metadata
> --------------
> | data_meta
> --------------
> | data
> --------------
>
> Not sure if that's what you meant? :)

I was suggesting the other way around: |custom meta|skb_metadata|data|
(but, as Martin points out, consuming skb_metadata in the kernel
becomes messier)

af_xdp can check whether skb_metdata is present by looking at data -
offsetof(struct skb_metadata, btf_id).
progs that know how to handle custom metadata, will look at data -
sizeof(skb_metadata)

Otherwise, if it's the other way around, how do we find skb_metadata
in a redirected frame?
Let's say we have |skb_metadata|custom meta|data|, how does the final
program find skb_metadata?
All the progs have to agree on the sizeof(tc/custom meta), right?
Martin KaFai Lau Nov. 10, 2022, 7:04 p.m. UTC | #18
On 11/10/22 6:19 AM, Toke Høiland-Jørgensen wrote:
> Martin KaFai Lau <martin.lau@linux.dev> writes:
> 
>> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
>>> Snipping a bit of context to reply to this bit:
>>>
>>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
>>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
>>>>>>> calling bpf_xdp_metadata_export_to_skb().
>>>>>>
>>>>>> What do you think the usecase here might be? Or are you suggesting we
>>>>>> reject further access to data_meta after
>>>>>> bpf_xdp_metadata_export_to_skb somehow?
>>>>>>
>>>>>> If we want to let the programs override some of this
>>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>>>>>> more kfuncs instead of exposing the layout?
>>>>>>
>>>>>> bpf_xdp_metadata_export_to_skb(ctx);
>>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
>>>
>>> There are several use cases for needing to access the metadata after
>>> calling bpf_xdp_metdata_export_to_skb():
>>>
>>> - Accessing the metadata after redirect (in a cpumap or devmap program,
>>>     or on a veth device)
>>> - Transferring the packet+metadata to AF_XDP
>> fwiw, the xdp prog could also be more selective and only stores one of the hints
>> instead of the whole 'struct xdp_to_skb_metadata'.
> 
> Yup, absolutely! In that sense, reusing the SKB format is mostly a
> convenience feature. However, lots of people consume AF_XDP through the
> default program installed by libxdp in the XSK setup code, and including
> custom metadata there is awkward. So having the metadata consumed by the
> stack as the "default subset" would enable easy consumption by
> non-advanced users, while advanced users can still do custom stuff by
> writing their own XDP program that calls the kfuncs.
> 
>>> - Returning XDP_PASS, but accessing some of the metadata first (whether
>>>     to read or change it)
>>>
>>> The last one could be solved by calling additional kfuncs, but that
>>> would be less efficient than just directly editing the struct which
>>> will be cache-hot after the helper returns.
>>
>> Yeah, it is more efficient to directly write if possible.  I think this set
>> allows the direct reading and writing already through data_meta (as a _u8 *).
> 
> Yup, totally fine with just keeping that capability :)
> 
>>> And yeah, this will allow the XDP program to inject arbitrary metadata
>>> into the netstack; but it can already inject arbitrary *packet* data
>>> into the stack, so not sure if this is much of an additional risk? If it
>>> does lead to trivial crashes, we should probably harden the stack
>>> against that?
>>>
>>> As for the random number, Jesper and I discussed replacing this with the
>>> same BTF-ID scheme that he was using in his patch series. I.e., instead
>>> of just putting in a random number, we insert the BTF ID of the metadata
>>> struct at the end of it. This will allow us to support multiple
>>> different formats in the future (not just changing the layout, but
>>> having multiple simultaneous formats in the same kernel image), in case
>>> we run out of space.
>>
>> This seems a bit hypothetical.  How much headroom does it usually have for the
>> xdp prog?  Potentially the hints can use all the remaining space left after the
>> header encap and the current bpf_xdp_adjust_meta() usage?
> 
> For the metadata consumed by the stack right now it's a bit
> hypothetical, yeah. However, there's a bunch of metadata commonly
> supported by hardware that the stack currently doesn't consume and that
> hopefully this feature will end up making more accessible. My hope is
> that the stack can also learn how to use this in the future, in which
> case we may run out of space. So I think of that bit mostly as
> future-proofing...

ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later 
if it is indeed needed?  The 'struct xdp_to_skb_metadata' is not in UAPI and 
doing it with CO-RE is to give us flexibility to make this kind of changes in 
the future.

> 
>>> We should probably also have a flag set on the xdp_frame so the stack
>>> knows that the metadata area contains relevant-to-skb data, to guard
>>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
>>> in unrelated stuff it puts into the metadata area.
>>
>> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
>> then transfer to the xdp_frame?
> 
> Yeah, exactly!
> 
>>>> After re-reading patch 6, have another question. The 'void
>>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
>>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
>>>> pointer and the xdp prog can directly read (or even write) it?
>>>
>>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
>>> or more fields just means that those fields will not be populated? We
>>> should probably have a flags field inside the metadata struct itself to
>>> indicate which fields are set or not, but I'm not sure returning an
>>> error value adds anything? Returning a pointer to the metadata field
>>> might be convenient for users (it would just be an alias to the
>>> data_meta pointer, but the verifier could know its size, so the program
>>> doesn't have to bounds check it).
>>
>> If some hints are not available, those hints should be initialized to
>> 0/CHECKSUM_NONE/...etc.
> 
> The problem with that is that then we have to spend cycles writing
> eight bytes of zeroes into the checksum field :)

With a common 'struct xdp_to_skb_metadata', I am not sure how some of these zero 
writes can be avoided.  If the xdp prog wants to optimize, it can call 
individual kfunc to get individual hints.

> 
>> The xdp prog needs a direct way to tell hard failure when it cannot
>> write the meta area because of not enough space. Comparing
>> xdp->data_meta with xdp->data as a side effect is not intuitive.
> 
> Yeah, hence a flags field so we can just see if setting each field
> succeeded?

How testing a flag is different from checking 0/invalid-value of a field?  or 
some fields just don't have an invalid value to check for like vlan_tci?

You meant a flags field as a return value or in the 'struct xdp_to_skb_metadata' ?

> 
>> It is more than saving the bound check.  With type info of 'struct
>> xdp_to_skb_metadata *', the verifier can do more checks like reading in the
>> middle of an integer member.  The verifier could also limit write access only to
>> a few struct's members if it is needed.
>>
>> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the
>> xdp->data_meta.  They should actually point to different locations in the
>> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff.
>> xdp->data_meta won't be changed and keeps pointing to the last
>> bpf_xdp_adjust_meta() location.  The kernel will know if there is
>> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the
>> xdp_{buff,frame}.  Would it work?
> 
> Hmm, logically splitting the program metadata and the xdp_hints metadata
> (but having them share the same area) *could* work, I guess, I'm just
> not sure it's worth the extra complexity?

It shouldn't stop the existing xdp prog writing its own metadata from using the 
the new bpf_xdp_metadata_export_to_skb().

> 
>>>> A related question, why 'struct xdp_to_skb_metadata' needs
>>>> __randomize_layout?
>>>
>>> The __randomize_layout thing is there to force BPF programs to use CO-RE
>>> to access the field. This is to avoid the struct layout accidentally
>>> ossifying because people in practice rely on a particular layout, even
>>> though we tell them to use CO-RE. There are lots of examples of this
>>> happening in other domains (IP header options, TCP options, etc), and
>>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
>>
>> I am not sure if it is necessary or helpful to only enforce __randomize_layout
>> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and
>> non tracing) that already have direct access (reading and/or writing) to other
>> kernel structures.
>>
>> It is more important for the verifier to see the xdp prog accessing it as a
>> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so
>> that the verifier can enforce the rules of access.
> 
> That only works inside the kernel, though. Since the metadata field can
> be copied wholesale to AF_XDP, having it randomized forces userspace
> consumers to also write code to deal with the layout being dynamic...

hm... I still don't see how useful it is, in particular you mentioned the libxdp 
will install a xdp prog to write this default format (xdp_to_skb_metadata) and 
likely libxdp will also provide some helpers to parse the xdp_to_skb_metadata 
and the libxdp user should not need to know if CO-RE is used or not. 
Considering it is a kernel internal struct, I think it is fine to keep it and 
can be revisited later if needed.  Lets get on to other things first :)
David Ahern Nov. 10, 2022, 9:21 p.m. UTC | #19
On 11/9/22 11:44 PM, Stanislav Fomichev wrote:
>>> @@ -423,14 +425,25 @@ XDP_METADATA_KFUNC_xxx
>>>  MAX_XDP_METADATA_KFUNC,
>>>  };
>>>
>>> +struct xdp_to_skb_metadata {
>>> +     u32 magic; /* xdp_metadata_magic */
>>> +     u64 rx_timestamp;
>>
>> Slightly confused. I thought/think most drivers populate the skb timestamp
>> if they can already? So why do we need to bounce these through some xdp
>> metadata? Won't all this cost more than the load/store directly from the
>> descriptor into the skb? Even if drivers are not populating skb now
>> shouldn't an ethtool knob be enough to turn this on?
> 
> dsahern@ pointed out that it might be useful for the program to be
> able to override some of that metadata.

Examples that come to mind from previous work:
1. changing vlans on a redirect: Rx on vlan 1 with h/w popping the vlan
so it is provided in metadata. Then on a redirect it shifts to vlan 2
for Tx. But this example use case assumes Tx accesses the metadata to
ask h/w to insert the header.

2. popping or updating an encap header and wanting to update the checksum

3. changing the h/w provided hash to affect steering of a subsequent skb
Toke Høiland-Jørgensen Nov. 10, 2022, 10:49 p.m. UTC | #20
John Fastabend <john.fastabend@gmail.com> writes:

> Toke Høiland-Jørgensen wrote:
>> John Fastabend <john.fastabend@gmail.com> writes:
>> 
>> > Toke Høiland-Jørgensen wrote:
>> >> Snipping a bit of context to reply to this bit:
>> >> 
>> >> >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>> >> >>>> sure it is solid enough by asking the xdp prog not to use the same random number
>> >> >>>> in its own metadata + not to change the metadata through xdp->data_meta after
>> >> >>>> calling bpf_xdp_metadata_export_to_skb().
>> >> >>>
>> >> >>> What do you think the usecase here might be? Or are you suggesting we
>> >> >>> reject further access to data_meta after
>> >> >>> bpf_xdp_metadata_export_to_skb somehow?
>> >> >>>
>> >> >>> If we want to let the programs override some of this
>> >> >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>> >> >>> more kfuncs instead of exposing the layout?
>> >> >>>
>> >> >>> bpf_xdp_metadata_export_to_skb(ctx);
>> >> >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
>> >> 
>> >
>> > Hi Toke,
>> >
>> > Trying not to bifurcate your thread. Can I start a new one here to
>> > elaborate on these use cases. I'm still a bit lost on any use case
>> > for this that makes sense to actually deploy on a network.
>> >
>> >> There are several use cases for needing to access the metadata after
>> >> calling bpf_xdp_metdata_export_to_skb():
>> >> 
>> >> - Accessing the metadata after redirect (in a cpumap or devmap program,
>> >>   or on a veth device)
>> >
>> > I think for devmap there are still lots of opens how/where the skb
>> > is even built.
>> 
>> For veth it's pretty clear; i.e., when redirecting into containers.
>
> Ah but I think XDP on veth is a bit questionable in general. The use
> case is NFV I guess but its not how I would build out NFV. I've never
> seen it actually deployed other than in CI. Anyways not necessary to
> drop into that debate here. It exists so OK.
>
>> 
>> > For cpumap I'm a bit unsure what the use case is. For ice, mlx and
>> > such you should use the hardware RSS if performance is top of mind.
>> 
>> Hardware RSS works fine if your hardware supports the hashing you want;
>> many do not. As an example, Jesper wrote this application that uses
>> cpumap to divide out ISP customer traffic among different CPUs (solving
>> an HTB scaling problem):
>> 
>> https://github.com/xdp-project/xdp-cpumap-tc
>
> I'm going to argue hw should be able to do this still and we
> should fix the hw but maybe not easily doable without convincing
> hardware folks to talk to us.

Sure, in the ideal world the hardware should just be able to do this.
Unfortunately, we don't live in that ideal world :)

> Also not obvious tto me how linked code works without more studying,
> its ingress HTB? So you would push the rxhash and timestamp into
> cpumap and then build the skb here with the correct skb->timestamp?

No, the HTB tree is on egress. The use case is an ISP middlebox that
shapes (say) 1000 customers to their subscribed rate, using a big HTB
tree. If you just do this with a single HTB instance on the egress NIC
you run into the global qdisc lock and you can't scale beyond a pretty
measly bandwidth. Whereas if you use multiple HW TXQs and the mq qdisc,
you can partition the HTB tree so you only have a subset of customers on
each HWQ/HTB instance. But for this to work, and still guarantee each
customer gets shaped to the right rate, you need to ensure that all that
customer's traffic hits the same HWQ. The xdp-cpumap-tc tool does this
by configuring the TXQs to correspond to individual CPUs, and then runs
an XDP program that matches traffic to customers and redirects them to
the right CPU (using an LPM map).

This solution runs in production in quite a number of smallish WISPs,
BTW, with quite nice results. The software to set it all up is also open
sourced: https://libreqos.io/

Coming back to HW metadata, the LibreQoS system could benefit from the
hardware flow hash in particular, since that would save a hash operation
when enqueueing the packet into sch_cake.

> OK even if I can't exactly find the use case for cpumap if I had
> a use case I can see how passing metadata through is useful.

Great!

>> > And then for specific devices on cpumap (maybe realtime or ptp
>> > things?) could we just throw it through the xdp_frame?
>> 
>> Not sure what you mean here? Throw what through the xdp_frame?
>
> Doesn't matter reread patches and figured it out I was slightly
> confused.

Right, OK.

>> 
>> >> - Transferring the packet+metadata to AF_XDP
>> >
>> > In this case we have the metadata and AF_XDP program and XDP program
>> > simply need to agree on metadata format. No need to have some magic
>> > numbers and driver specific kfuncs.
>> 
>> See my other reply to Martin: Yeah, for AF_XDP users that write their
>> own kernel XDP programs, they can just do whatever they want. But many
>> users just rely on the default program in libxdp, so having a standard
>> format to include with that is useful.
>> 
>
> I don't think your AF_XDP program is any different than other AF_XDP
> programs. Your lib can create a standard format if it wants but
> kernel doesn't need to enforce it anyway.

Yeah, we totally could. But since we're defining a "standard" format for
kernel (skb) consumption anyway, making this available to AF_XDP is
kinda convenient so we don't have to :)

>> >> - Returning XDP_PASS, but accessing some of the metadata first (whether
>> >>   to read or change it)
>> >> 
>> >
>> > I don't get this case? XDP_PASS should go to stack normally through
>> > drivers build_skb routines. These will populate timestamp normally.
>> > My guess is simply descriptor->skb load/store is cheaper than carrying
>> > around this metadata and doing the call in BPF side. Anyways you
>> > just built an entire skb and hit the stack I don't think you will
>> > notice this noise in any benchmark.
>> 
>> If you modify the packet before calling XDP_PASS you may want to update
>> the metadata as well (for instance the RX hash, or in the future the
>> metadata could also carry transport header offsets).
>
> OK. So when you modify the pkt fixing up the rxhash makes sense. Thanks
> for the response I can see the argument.

Great! You're welcome :)

-Toke
Toke Høiland-Jørgensen Nov. 10, 2022, 11:14 p.m. UTC | #21
Skipping to the last bit:

>> >> >    } else {
>> >> >      use kfuncs
>> >> >    }
>> >> >
>> >> > 5. Support the case where we keep program's metadata and kernel's
>> >> > xdp_to_skb_metadata
>> >> >    - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
>> >> > rest of the metadata over it and adjusting the headroom
>> >>
>> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
>> >> metadata.  xdp prog should usually work in this order also: read/write headers,
>> >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
>> >> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
>> >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
>> >>
>> >> For the kernel and xdp prog, I don't think it matters where the
>> >> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
>> >> be before xdp->data because of the current data_meta and data comparison usage
>> >> in the xdp prog.
>> >>
>> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata
>> >> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
>> >> supports the program's metadata now.  afaict, it can only work now if there is
>> >> some sort of contract between them or the AF_XDP currently does not use the
>> >> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
>> >> should be a no op if there is no program's metadata?  This behavior could also
>> >> be configurable through setsockopt?
>> >
>> > Agreed on all of the above. For now it seems like the safest thing to
>> > do is to put xdp_to_skb_metadata last to allow af_xdp to properly
>> > locate btf_id.
>> > Let's see if Toke disagrees :-)
>>
>> As I replied to Martin, I'm not sure it's worth the complexity to
>> logically split the SKB metadata from the program's own metadata (as
>> opposed to just reusing the existing data_meta pointer)?
>
> I'd gladly keep my current requirement where it's either or, but not both :-)
> We can relax it later if required?

So the way I've been thinking about it is simply that the skb_metadata
would live in the same place at the data_meta pointer (including
adjusting that pointer to accommodate it), and just overriding the
existing program metadata, if any exists. But looking at it now, I guess
having the split makes it easier for a program to write its own custom
metadata and still use the skb metadata. See below about the ordering.

>> However, if we do, the layout that makes most sense to me is putting the
>> skb metadata before the program metadata, like:
>>
>> --------------
>> | skb_metadata
>> --------------
>> | data_meta
>> --------------
>> | data
>> --------------
>>
>> Not sure if that's what you meant? :)
>
> I was suggesting the other way around: |custom meta|skb_metadata|data|
> (but, as Martin points out, consuming skb_metadata in the kernel
> becomes messier)
>
> af_xdp can check whether skb_metdata is present by looking at data -
> offsetof(struct skb_metadata, btf_id).
> progs that know how to handle custom metadata, will look at data -
> sizeof(skb_metadata)
>
> Otherwise, if it's the other way around, how do we find skb_metadata
> in a redirected frame?
> Let's say we have |skb_metadata|custom meta|data|, how does the final
> program find skb_metadata?
> All the progs have to agree on the sizeof(tc/custom meta), right?

Erm, maybe I'm missing something here, but skb_metadata is fixed size,
right? So if the "skb_metadata is present" flag is set, we know that the
sizeof(skb_metadata) bytes before the data_meta pointer contains the
metadata, and if the flag is not set, we know those bytes are not valid
metadata.

For AF_XDP, we'd need to transfer the flag as well, and it could apply
the same logic (getting the size from the vmlinux BTF).

By this logic, the BTF_ID should be the *first* entry of struct
skb_metadata, since that will be the field AF_XDP programs can find
right off the bat, no?

-Toke
Toke Høiland-Jørgensen Nov. 10, 2022, 11:29 p.m. UTC | #22
Martin KaFai Lau <martin.lau@linux.dev> writes:

> On 11/10/22 6:19 AM, Toke Høiland-Jørgensen wrote:
>> Martin KaFai Lau <martin.lau@linux.dev> writes:
>> 
>>> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote:
>>>> Snipping a bit of context to reply to this bit:
>>>>
>>>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not
>>>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number
>>>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after
>>>>>>>> calling bpf_xdp_metadata_export_to_skb().
>>>>>>>
>>>>>>> What do you think the usecase here might be? Or are you suggesting we
>>>>>>> reject further access to data_meta after
>>>>>>> bpf_xdp_metadata_export_to_skb somehow?
>>>>>>>
>>>>>>> If we want to let the programs override some of this
>>>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add
>>>>>>> more kfuncs instead of exposing the layout?
>>>>>>>
>>>>>>> bpf_xdp_metadata_export_to_skb(ctx);
>>>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234);
>>>>
>>>> There are several use cases for needing to access the metadata after
>>>> calling bpf_xdp_metdata_export_to_skb():
>>>>
>>>> - Accessing the metadata after redirect (in a cpumap or devmap program,
>>>>     or on a veth device)
>>>> - Transferring the packet+metadata to AF_XDP
>>> fwiw, the xdp prog could also be more selective and only stores one of the hints
>>> instead of the whole 'struct xdp_to_skb_metadata'.
>> 
>> Yup, absolutely! In that sense, reusing the SKB format is mostly a
>> convenience feature. However, lots of people consume AF_XDP through the
>> default program installed by libxdp in the XSK setup code, and including
>> custom metadata there is awkward. So having the metadata consumed by the
>> stack as the "default subset" would enable easy consumption by
>> non-advanced users, while advanced users can still do custom stuff by
>> writing their own XDP program that calls the kfuncs.
>> 
>>>> - Returning XDP_PASS, but accessing some of the metadata first (whether
>>>>     to read or change it)
>>>>
>>>> The last one could be solved by calling additional kfuncs, but that
>>>> would be less efficient than just directly editing the struct which
>>>> will be cache-hot after the helper returns.
>>>
>>> Yeah, it is more efficient to directly write if possible.  I think this set
>>> allows the direct reading and writing already through data_meta (as a _u8 *).
>> 
>> Yup, totally fine with just keeping that capability :)
>> 
>>>> And yeah, this will allow the XDP program to inject arbitrary metadata
>>>> into the netstack; but it can already inject arbitrary *packet* data
>>>> into the stack, so not sure if this is much of an additional risk? If it
>>>> does lead to trivial crashes, we should probably harden the stack
>>>> against that?
>>>>
>>>> As for the random number, Jesper and I discussed replacing this with the
>>>> same BTF-ID scheme that he was using in his patch series. I.e., instead
>>>> of just putting in a random number, we insert the BTF ID of the metadata
>>>> struct at the end of it. This will allow us to support multiple
>>>> different formats in the future (not just changing the layout, but
>>>> having multiple simultaneous formats in the same kernel image), in case
>>>> we run out of space.
>>>
>>> This seems a bit hypothetical.  How much headroom does it usually have for the
>>> xdp prog?  Potentially the hints can use all the remaining space left after the
>>> header encap and the current bpf_xdp_adjust_meta() usage?
>> 
>> For the metadata consumed by the stack right now it's a bit
>> hypothetical, yeah. However, there's a bunch of metadata commonly
>> supported by hardware that the stack currently doesn't consume and that
>> hopefully this feature will end up making more accessible. My hope is
>> that the stack can also learn how to use this in the future, in which
>> case we may run out of space. So I think of that bit mostly as
>> future-proofing...
>
> ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later 
> if it is indeed needed?  The 'struct xdp_to_skb_metadata' is not in UAPI and 
> doing it with CO-RE is to give us flexibility to make this kind of changes in 
> the future.

My worry is mostly that it'll be more painful to add it later than just
including it from the start, mostly because of AF_XDP users. But if we
do the randomisation thing (thus forcing AF_XDP users to deal with the
dynamic layout as well), it should be possible to add it later, and I
can live with that option as well...

>>>> We should probably also have a flag set on the xdp_frame so the stack
>>>> knows that the metadata area contains relevant-to-skb data, to guard
>>>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
>>>> in unrelated stuff it puts into the metadata area.
>>>
>>> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
>>> then transfer to the xdp_frame?
>> 
>> Yeah, exactly!
>> 
>>>>> After re-reading patch 6, have another question. The 'void
>>>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
>>>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
>>>>> pointer and the xdp prog can directly read (or even write) it?
>>>>
>>>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
>>>> or more fields just means that those fields will not be populated? We
>>>> should probably have a flags field inside the metadata struct itself to
>>>> indicate which fields are set or not, but I'm not sure returning an
>>>> error value adds anything? Returning a pointer to the metadata field
>>>> might be convenient for users (it would just be an alias to the
>>>> data_meta pointer, but the verifier could know its size, so the program
>>>> doesn't have to bounds check it).
>>>
>>> If some hints are not available, those hints should be initialized to
>>> 0/CHECKSUM_NONE/...etc.
>> 
>> The problem with that is that then we have to spend cycles writing
>> eight bytes of zeroes into the checksum field :)
>
> With a common 'struct xdp_to_skb_metadata', I am not sure how some of these zero 
> writes can be avoided.  If the xdp prog wants to optimize, it can call 
> individual kfunc to get individual hints.

Erm, we just... don't write those fields? Something like:

void write_skb_meta(hw, ctx) {
  struct xdp_skb_metadata meta = ctx->data_meta - sizeof(struct xdp_skb_metadata);
  meta->valid_fields = 0;

  if (hw_has_timestamp) {
    meta->timestamp = hw->timestamp;
    meta->valid_fields |= FIELD_TIMESTAMP;
  } /* otherwise meta->timestamp is just uninitialised */

  if (hw_has_rxhash) {
    meta->rxhash = hw->rxhash;
    meta->valid_fields |= FIELD_RXHASH;
  } /* otherwise meta->rxhash is just uninitialised */
  ...etc...
}

>>> The xdp prog needs a direct way to tell hard failure when it cannot
>>> write the meta area because of not enough space. Comparing
>>> xdp->data_meta with xdp->data as a side effect is not intuitive.
>> 
>> Yeah, hence a flags field so we can just see if setting each field
>> succeeded?
>
> How testing a flag is different from checking 0/invalid-value of a
> field?

The flags field is smaller, so we write fewer bytes if not all fields
are populated.

> or some fields just don't have an invalid value to check for
> like vlan_tci?

Yeah, that could also be an issue.

> You meant a flags field as a return value or in the 'struct xdp_to_skb_metadata' ?

See above.


>> 
>>> It is more than saving the bound check.  With type info of 'struct
>>> xdp_to_skb_metadata *', the verifier can do more checks like reading in the
>>> middle of an integer member.  The verifier could also limit write access only to
>>> a few struct's members if it is needed.
>>>
>>> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the
>>> xdp->data_meta.  They should actually point to different locations in the
>>> headroom.  bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff.
>>> xdp->data_meta won't be changed and keeps pointing to the last
>>> bpf_xdp_adjust_meta() location.  The kernel will know if there is
>>> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the
>>> xdp_{buff,frame}.  Would it work?
>> 
>> Hmm, logically splitting the program metadata and the xdp_hints metadata
>> (but having them share the same area) *could* work, I guess, I'm just
>> not sure it's worth the extra complexity?
>
> It shouldn't stop the existing xdp prog writing its own metadata from using the 
> the new bpf_xdp_metadata_export_to_skb().

Right, I think I see what you mean, and I agree that splitting it
internally in the kernel does make sense (see my other reply to
Stanislav).

>> 
>>>>> A related question, why 'struct xdp_to_skb_metadata' needs
>>>>> __randomize_layout?
>>>>
>>>> The __randomize_layout thing is there to force BPF programs to use CO-RE
>>>> to access the field. This is to avoid the struct layout accidentally
>>>> ossifying because people in practice rely on a particular layout, even
>>>> though we tell them to use CO-RE. There are lots of examples of this
>>>> happening in other domains (IP header options, TCP options, etc), and
>>>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :)
>>>
>>> I am not sure if it is necessary or helpful to only enforce __randomize_layout
>>> in 'struct xdp_to_skb_metadata'.  There are other CO-RE use cases (tracing and
>>> non tracing) that already have direct access (reading and/or writing) to other
>>> kernel structures.
>>>
>>> It is more important for the verifier to see the xdp prog accessing it as a
>>> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so
>>> that the verifier can enforce the rules of access.
>> 
>> That only works inside the kernel, though. Since the metadata field can
>> be copied wholesale to AF_XDP, having it randomized forces userspace
>> consumers to also write code to deal with the layout being dynamic...
>
> hm... I still don't see how useful it is, in particular you mentioned
> the libxdp will install a xdp prog to write this default format
> (xdp_to_skb_metadata) and likely libxdp will also provide some helpers
> to parse the xdp_to_skb_metadata and the libxdp user should not need
> to know if CO-RE is used or not. Considering it is a kernel internal
> struct, I think it is fine to keep it and can be revisited later if
> needed. Lets get on to other things first :)

Well, if it was just kernel-internal, sure. But we're also exporting it
to userspace (through AF_XDP). Well-behaved users will obviously do the
right thing and use CO-RE. I'm trying to guard against users just
blindly copy-pasting the struct definition from the kernel and doing a
static cast, then complaining about their code breaking the first time
we change the struct layout. Which I sadly expect that there absolutely
will be people who do unless we actively make sure that doesn't work
from the get-go. And since the randomisation is literally just adding
__randomize_layout to the struct definition, I'd rather start out with
having it, and removing it later if it turns out not to be needed... :)

-Toke
Stanislav Fomichev Nov. 10, 2022, 11:52 p.m. UTC | #23
On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Skipping to the last bit:
>
> >> >> >    } else {
> >> >> >      use kfuncs
> >> >> >    }
> >> >> >
> >> >> > 5. Support the case where we keep program's metadata and kernel's
> >> >> > xdp_to_skb_metadata
> >> >> >    - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
> >> >> > rest of the metadata over it and adjusting the headroom
> >> >>
> >> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
> >> >> metadata.  xdp prog should usually work in this order also: read/write headers,
> >> >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
> >> >> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
> >> >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
> >> >>
> >> >> For the kernel and xdp prog, I don't think it matters where the
> >> >> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
> >> >> be before xdp->data because of the current data_meta and data comparison usage
> >> >> in the xdp prog.
> >> >>
> >> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata
> >> >> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
> >> >> supports the program's metadata now.  afaict, it can only work now if there is
> >> >> some sort of contract between them or the AF_XDP currently does not use the
> >> >> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
> >> >> should be a no op if there is no program's metadata?  This behavior could also
> >> >> be configurable through setsockopt?
> >> >
> >> > Agreed on all of the above. For now it seems like the safest thing to
> >> > do is to put xdp_to_skb_metadata last to allow af_xdp to properly
> >> > locate btf_id.
> >> > Let's see if Toke disagrees :-)
> >>
> >> As I replied to Martin, I'm not sure it's worth the complexity to
> >> logically split the SKB metadata from the program's own metadata (as
> >> opposed to just reusing the existing data_meta pointer)?
> >
> > I'd gladly keep my current requirement where it's either or, but not both :-)
> > We can relax it later if required?
>
> So the way I've been thinking about it is simply that the skb_metadata
> would live in the same place at the data_meta pointer (including
> adjusting that pointer to accommodate it), and just overriding the
> existing program metadata, if any exists. But looking at it now, I guess
> having the split makes it easier for a program to write its own custom
> metadata and still use the skb metadata. See below about the ordering.
>
> >> However, if we do, the layout that makes most sense to me is putting the
> >> skb metadata before the program metadata, like:
> >>
> >> --------------
> >> | skb_metadata
> >> --------------
> >> | data_meta
> >> --------------
> >> | data
> >> --------------
> >>
> >> Not sure if that's what you meant? :)
> >
> > I was suggesting the other way around: |custom meta|skb_metadata|data|
> > (but, as Martin points out, consuming skb_metadata in the kernel
> > becomes messier)
> >
> > af_xdp can check whether skb_metdata is present by looking at data -
> > offsetof(struct skb_metadata, btf_id).
> > progs that know how to handle custom metadata, will look at data -
> > sizeof(skb_metadata)
> >
> > Otherwise, if it's the other way around, how do we find skb_metadata
> > in a redirected frame?
> > Let's say we have |skb_metadata|custom meta|data|, how does the final
> > program find skb_metadata?
> > All the progs have to agree on the sizeof(tc/custom meta), right?
>
> Erm, maybe I'm missing something here, but skb_metadata is fixed size,
> right? So if the "skb_metadata is present" flag is set, we know that the
> sizeof(skb_metadata) bytes before the data_meta pointer contains the
> metadata, and if the flag is not set, we know those bytes are not valid
> metadata.
>
> For AF_XDP, we'd need to transfer the flag as well, and it could apply
> the same logic (getting the size from the vmlinux BTF).
>
> By this logic, the BTF_ID should be the *first* entry of struct
> skb_metadata, since that will be the field AF_XDP programs can find
> right off the bat, no?

The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
pointer in the userspace.

You get an rx descriptor where the address points to the 'data':
| 256 bytes headroom where metadata can go | data |

So you have (at most) 256 bytes of headroom, some of that might be the
metadata, but you really don't know where it starts. But you know it
definitely ends where the data begins.

So if we have the following, we can locate skb_metadata:
| 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
data - sizeof(skb_metadata) will get you there

But if it's the other way around, the program has to know
sizeof(custom metadata) to locate skb_metadata:
| 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |

Am I missing something here?
Martin KaFai Lau Nov. 10, 2022, 11:58 p.m. UTC | #24
On 11/10/22 10:52 AM, Stanislav Fomichev wrote:
>>> Oh, that seems even better than returning it from
>>> bpf_xdp_metadata_export_to_skb.
>>> bpf_xdp_metadata_export_to_skb can return true/false and the rest goes
>>> via default verifier ctx resolution mechanism..
>>> (returning ptr from a kfunc seems to be a bit complicated right now)

What is the complication in returning ptr from a kfunc?  I want to see if it can 
be solved because it will be an easier API to use instead of calling another 
kfunc to get the ptr.

>> See my response to John in the other thread about mixing stable UAPI (in
>> xdp_md) and unstable BTF structures in the xdp_md struct: I think this
>> is confusing and would prefer a kfunc.

hmm... right, it will be one of the first considering the current __bpf_md_ptr() 
usages are only exposing another struct in uapi, eg. __bpf_md_ptr(struct 
bpf_sock *, sk).

A kfunc will also be fine.
Toke Høiland-Jørgensen Nov. 11, 2022, 12:10 a.m. UTC | #25
Stanislav Fomichev <sdf@google.com> writes:

> On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Skipping to the last bit:
>>
>> >> >> >    } else {
>> >> >> >      use kfuncs
>> >> >> >    }
>> >> >> >
>> >> >> > 5. Support the case where we keep program's metadata and kernel's
>> >> >> > xdp_to_skb_metadata
>> >> >> >    - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
>> >> >> > rest of the metadata over it and adjusting the headroom
>> >> >>
>> >> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
>> >> >> metadata.  xdp prog should usually work in this order also: read/write headers,
>> >> >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
>> >> >> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
>> >> >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
>> >> >>
>> >> >> For the kernel and xdp prog, I don't think it matters where the
>> >> >> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
>> >> >> be before xdp->data because of the current data_meta and data comparison usage
>> >> >> in the xdp prog.
>> >> >>
>> >> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata
>> >> >> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
>> >> >> supports the program's metadata now.  afaict, it can only work now if there is
>> >> >> some sort of contract between them or the AF_XDP currently does not use the
>> >> >> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
>> >> >> should be a no op if there is no program's metadata?  This behavior could also
>> >> >> be configurable through setsockopt?
>> >> >
>> >> > Agreed on all of the above. For now it seems like the safest thing to
>> >> > do is to put xdp_to_skb_metadata last to allow af_xdp to properly
>> >> > locate btf_id.
>> >> > Let's see if Toke disagrees :-)
>> >>
>> >> As I replied to Martin, I'm not sure it's worth the complexity to
>> >> logically split the SKB metadata from the program's own metadata (as
>> >> opposed to just reusing the existing data_meta pointer)?
>> >
>> > I'd gladly keep my current requirement where it's either or, but not both :-)
>> > We can relax it later if required?
>>
>> So the way I've been thinking about it is simply that the skb_metadata
>> would live in the same place at the data_meta pointer (including
>> adjusting that pointer to accommodate it), and just overriding the
>> existing program metadata, if any exists. But looking at it now, I guess
>> having the split makes it easier for a program to write its own custom
>> metadata and still use the skb metadata. See below about the ordering.
>>
>> >> However, if we do, the layout that makes most sense to me is putting the
>> >> skb metadata before the program metadata, like:
>> >>
>> >> --------------
>> >> | skb_metadata
>> >> --------------
>> >> | data_meta
>> >> --------------
>> >> | data
>> >> --------------
>> >>
>> >> Not sure if that's what you meant? :)
>> >
>> > I was suggesting the other way around: |custom meta|skb_metadata|data|
>> > (but, as Martin points out, consuming skb_metadata in the kernel
>> > becomes messier)
>> >
>> > af_xdp can check whether skb_metdata is present by looking at data -
>> > offsetof(struct skb_metadata, btf_id).
>> > progs that know how to handle custom metadata, will look at data -
>> > sizeof(skb_metadata)
>> >
>> > Otherwise, if it's the other way around, how do we find skb_metadata
>> > in a redirected frame?
>> > Let's say we have |skb_metadata|custom meta|data|, how does the final
>> > program find skb_metadata?
>> > All the progs have to agree on the sizeof(tc/custom meta), right?
>>
>> Erm, maybe I'm missing something here, but skb_metadata is fixed size,
>> right? So if the "skb_metadata is present" flag is set, we know that the
>> sizeof(skb_metadata) bytes before the data_meta pointer contains the
>> metadata, and if the flag is not set, we know those bytes are not valid
>> metadata.
>>
>> For AF_XDP, we'd need to transfer the flag as well, and it could apply
>> the same logic (getting the size from the vmlinux BTF).
>>
>> By this logic, the BTF_ID should be the *first* entry of struct
>> skb_metadata, since that will be the field AF_XDP programs can find
>> right off the bat, no?
>
> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
> pointer in the userspace.
>
> You get an rx descriptor where the address points to the 'data':
> | 256 bytes headroom where metadata can go | data |

Ah, I was missing the bit where the data pointer actually points at
data, not the start of the buf. Oops, my bad!

> So you have (at most) 256 bytes of headroom, some of that might be the
> metadata, but you really don't know where it starts. But you know it
> definitely ends where the data begins.
>
> So if we have the following, we can locate skb_metadata:
> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
> data - sizeof(skb_metadata) will get you there
>
> But if it's the other way around, the program has to know
> sizeof(custom metadata) to locate skb_metadata:
> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |
>
> Am I missing something here?

Hmm, so one could argue that the only way AF_XDP can consume custom
metadata today is if it knows out of band what the size of it is. And if
it knows that, it can just skip over it to go back to the skb_metadata,
no?

The only problem left then is if there were multiple XDP programs called
in sequence (whether before a redirect, or by libxdp chaining or tail
calls), and the first one resized the metadata area without the last one
knowing about it. For this, we could add a CLOBBER_PROGRAM_META flag to
the skb_metadata helper which if set will ensure that the program
metadata length is reset to 0?

-Toke
Stanislav Fomichev Nov. 11, 2022, 12:20 a.m. UTC | #26
"

On Thu, Nov 10, 2022 at 3:58 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/10/22 10:52 AM, Stanislav Fomichev wrote:
> >>> Oh, that seems even better than returning it from
> >>> bpf_xdp_metadata_export_to_skb.
> >>> bpf_xdp_metadata_export_to_skb can return true/false and the rest goes
> >>> via default verifier ctx resolution mechanism..
> >>> (returning ptr from a kfunc seems to be a bit complicated right now)
>
> What is the complication in returning ptr from a kfunc?  I want to see if it can
> be solved because it will be an easier API to use instead of calling another
> kfunc to get the ptr.

At least for returning a pointer to the basic c types like int/long, I
was hitting the following:

commit eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae
Author:     Benjamin Tissoires <benjamin.tissoires@redhat.com>
AuthorDate: Tue Sep 6 17:13:02 2022 +0200
Commit:     Alexei Starovoitov <ast@kernel.org>
CommitDate: Wed Sep 7 11:05:17 2022 -0700

    bpf/verifier: allow kfunc to return an allocated mem

Where I had to add an extra "const int rdonly_buf_size" argument to
the kfunc to make the verifier happy.
But I haven't really looked at what would happen with the pointers to
structs (or why, at all, we have this restriction).


> >> See my response to John in the other thread about mixing stable UAPI (in
> >> xdp_md) and unstable BTF structures in the xdp_md struct: I think this
> >> is confusing and would prefer a kfunc.
>
> hmm... right, it will be one of the first considering the current __bpf_md_ptr()
> usages are only exposing another struct in uapi, eg. __bpf_md_ptr(struct
> bpf_sock *, sk).
>
> A kfunc will also be fine.
>
Martin KaFai Lau Nov. 11, 2022, 12:33 a.m. UTC | #27
On 11/10/22 3:52 PM, Stanislav Fomichev wrote:
> On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Skipping to the last bit:
>>
>>>>>>>     } else {
>>>>>>>       use kfuncs
>>>>>>>     }
>>>>>>>
>>>>>>> 5. Support the case where we keep program's metadata and kernel's
>>>>>>> xdp_to_skb_metadata
>>>>>>>     - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
>>>>>>> rest of the metadata over it and adjusting the headroom
>>>>>>
>>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
>>>>>> metadata.  xdp prog should usually work in this order also: read/write headers,
>>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
>>>>>> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
>>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
>>>>>>
>>>>>> For the kernel and xdp prog, I don't think it matters where the
>>>>>> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
>>>>>> be before xdp->data because of the current data_meta and data comparison usage
>>>>>> in the xdp prog.
>>>>>>
>>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata
>>>>>> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
>>>>>> supports the program's metadata now.  afaict, it can only work now if there is
>>>>>> some sort of contract between them or the AF_XDP currently does not use the
>>>>>> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
>>>>>> should be a no op if there is no program's metadata?  This behavior could also
>>>>>> be configurable through setsockopt?
>>>>>
>>>>> Agreed on all of the above. For now it seems like the safest thing to
>>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly
>>>>> locate btf_id.
>>>>> Let's see if Toke disagrees :-)
>>>>
>>>> As I replied to Martin, I'm not sure it's worth the complexity to
>>>> logically split the SKB metadata from the program's own metadata (as
>>>> opposed to just reusing the existing data_meta pointer)?
>>>
>>> I'd gladly keep my current requirement where it's either or, but not both :-)
>>> We can relax it later if required?
>>
>> So the way I've been thinking about it is simply that the skb_metadata
>> would live in the same place at the data_meta pointer (including
>> adjusting that pointer to accommodate it), and just overriding the
>> existing program metadata, if any exists. But looking at it now, I guess
>> having the split makes it easier for a program to write its own custom
>> metadata and still use the skb metadata. See below about the ordering.
>>
>>>> However, if we do, the layout that makes most sense to me is putting the
>>>> skb metadata before the program metadata, like:
>>>>
>>>> --------------
>>>> | skb_metadata
>>>> --------------
>>>> | data_meta
>>>> --------------
>>>> | data
>>>> --------------
>>>>

Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this:

| skb_metadata | custom metadata | data |

>>>> Not sure if that's what you meant? :)
>>>
>>> I was suggesting the other way around: |custom meta|skb_metadata|data|
>>> (but, as Martin points out, consuming skb_metadata in the kernel
>>> becomes messier)
>>>
>>> af_xdp can check whether skb_metdata is present by looking at data -
>>> offsetof(struct skb_metadata, btf_id).
>>> progs that know how to handle custom metadata, will look at data -
>>> sizeof(skb_metadata)
>>>
>>> Otherwise, if it's the other way around, how do we find skb_metadata
>>> in a redirected frame?
>>> Let's say we have |skb_metadata|custom meta|data|, how does the final
>>> program find skb_metadata?
>>> All the progs have to agree on the sizeof(tc/custom meta), right?
>>
>> Erm, maybe I'm missing something here, but skb_metadata is fixed size,
>> right? So if the "skb_metadata is present" flag is set, we know that the
>> sizeof(skb_metadata) bytes before the data_meta pointer contains the
>> metadata, and if the flag is not set, we know those bytes are not valid
>> metadata.

right, so to get to the skb_metadata, it will be
data_meta -= sizeof(skb_metadata);  /* probably need alignment */

>>
>> For AF_XDP, we'd need to transfer the flag as well, and it could apply
>> the same logic (getting the size from the vmlinux BTF).
>>
>> By this logic, the BTF_ID should be the *first* entry of struct
>> skb_metadata, since that will be the field AF_XDP programs can find
>> right off the bat, no? >
> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
> pointer in the userspace.

Yep. It is my understanding also.  Missing data_meta pointer in the AF_XDP 
rx_desc is a potential problem.  Having BTF_ID or not won't help.

> 
> You get an rx descriptor where the address points to the 'data':
> | 256 bytes headroom where metadata can go | data |
> 
> So you have (at most) 256 bytes of headroom, some of that might be the
> metadata, but you really don't know where it starts. But you know it
> definitely ends where the data begins.
> 
> So if we have the following, we can locate skb_metadata:
> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
> data - sizeof(skb_metadata) will get you there
> 
> But if it's the other way around, the program has to know
> sizeof(custom metadata) to locate skb_metadata:
> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |

Right, this won't work if the AF_XDP user does not know how big the custom 
metadata is.  The kernel then needs to swap the "skb_metadata" and "custom 
metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like 
this:
| custom metadata | skb_metadata | data |

However, since data_meta is missing from the rx_desc, may be we can safely 
assume the AF_XDP user always knows the size of the custom metadata or there is 
usually no "custom metadata" and no swap is needed?
Martin KaFai Lau Nov. 11, 2022, 12:45 a.m. UTC | #28
On 11/10/22 4:10 PM, Toke Høiland-Jørgensen wrote:
>> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
>> pointer in the userspace.
>>
>> You get an rx descriptor where the address points to the 'data':
>> | 256 bytes headroom where metadata can go | data |
> 
> Ah, I was missing the bit where the data pointer actually points at
> data, not the start of the buf. Oops, my bad!
> 
>> So you have (at most) 256 bytes of headroom, some of that might be the
>> metadata, but you really don't know where it starts. But you know it
>> definitely ends where the data begins.
>>
>> So if we have the following, we can locate skb_metadata:
>> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
>> data - sizeof(skb_metadata) will get you there
>>
>> But if it's the other way around, the program has to know
>> sizeof(custom metadata) to locate skb_metadata:
>> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |
>>
>> Am I missing something here?
> 
> Hmm, so one could argue that the only way AF_XDP can consume custom
> metadata today is if it knows out of band what the size of it is. And if
> it knows that, it can just skip over it to go back to the skb_metadata,
> no?

+1 I replied with a similar point in another email.  I also think we can safely 
assume this.

> 
> The only problem left then is if there were multiple XDP programs called
> in sequence (whether before a redirect, or by libxdp chaining or tail
> calls), and the first one resized the metadata area without the last one
> knowing about it. For this, we could add a CLOBBER_PROGRAM_META flag to
> the skb_metadata helper which if set will ensure that the program
> metadata length is reset to 0?

How is it different from the same xdp prog calling bpf_xdp_adjust_meta() and 
bpf_xdp_metadata_export_to_skb() multiple times.  The earlier stored 
skb_metadata needs to be moved during the latter bpf_xdp_adjust_meta().  The 
latter bpf_xdp_metadata_export_to_skb() will overwrite the earlier skb_metadata.
Stanislav Fomichev Nov. 11, 2022, 12:57 a.m. UTC | #29
On Thu, Nov 10, 2022 at 4:33 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/10/22 3:52 PM, Stanislav Fomichev wrote:
> > On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> Skipping to the last bit:
> >>
> >>>>>>>     } else {
> >>>>>>>       use kfuncs
> >>>>>>>     }
> >>>>>>>
> >>>>>>> 5. Support the case where we keep program's metadata and kernel's
> >>>>>>> xdp_to_skb_metadata
> >>>>>>>     - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
> >>>>>>> rest of the metadata over it and adjusting the headroom
> >>>>>>
> >>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
> >>>>>> metadata.  xdp prog should usually work in this order also: read/write headers,
> >>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
> >>>>>> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
> >>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
> >>>>>>
> >>>>>> For the kernel and xdp prog, I don't think it matters where the
> >>>>>> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
> >>>>>> be before xdp->data because of the current data_meta and data comparison usage
> >>>>>> in the xdp prog.
> >>>>>>
> >>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata
> >>>>>> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
> >>>>>> supports the program's metadata now.  afaict, it can only work now if there is
> >>>>>> some sort of contract between them or the AF_XDP currently does not use the
> >>>>>> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
> >>>>>> should be a no op if there is no program's metadata?  This behavior could also
> >>>>>> be configurable through setsockopt?
> >>>>>
> >>>>> Agreed on all of the above. For now it seems like the safest thing to
> >>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly
> >>>>> locate btf_id.
> >>>>> Let's see if Toke disagrees :-)
> >>>>
> >>>> As I replied to Martin, I'm not sure it's worth the complexity to
> >>>> logically split the SKB metadata from the program's own metadata (as
> >>>> opposed to just reusing the existing data_meta pointer)?
> >>>
> >>> I'd gladly keep my current requirement where it's either or, but not both :-)
> >>> We can relax it later if required?
> >>
> >> So the way I've been thinking about it is simply that the skb_metadata
> >> would live in the same place at the data_meta pointer (including
> >> adjusting that pointer to accommodate it), and just overriding the
> >> existing program metadata, if any exists. But looking at it now, I guess
> >> having the split makes it easier for a program to write its own custom
> >> metadata and still use the skb metadata. See below about the ordering.
> >>
> >>>> However, if we do, the layout that makes most sense to me is putting the
> >>>> skb metadata before the program metadata, like:
> >>>>
> >>>> --------------
> >>>> | skb_metadata
> >>>> --------------
> >>>> | data_meta
> >>>> --------------
> >>>> | data
> >>>> --------------
> >>>>
>
> Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this:
>
> | skb_metadata | custom metadata | data |
>
> >>>> Not sure if that's what you meant? :)
> >>>
> >>> I was suggesting the other way around: |custom meta|skb_metadata|data|
> >>> (but, as Martin points out, consuming skb_metadata in the kernel
> >>> becomes messier)
> >>>
> >>> af_xdp can check whether skb_metdata is present by looking at data -
> >>> offsetof(struct skb_metadata, btf_id).
> >>> progs that know how to handle custom metadata, will look at data -
> >>> sizeof(skb_metadata)
> >>>
> >>> Otherwise, if it's the other way around, how do we find skb_metadata
> >>> in a redirected frame?
> >>> Let's say we have |skb_metadata|custom meta|data|, how does the final
> >>> program find skb_metadata?
> >>> All the progs have to agree on the sizeof(tc/custom meta), right?
> >>
> >> Erm, maybe I'm missing something here, but skb_metadata is fixed size,
> >> right? So if the "skb_metadata is present" flag is set, we know that the
> >> sizeof(skb_metadata) bytes before the data_meta pointer contains the
> >> metadata, and if the flag is not set, we know those bytes are not valid
> >> metadata.
>
> right, so to get to the skb_metadata, it will be
> data_meta -= sizeof(skb_metadata);  /* probably need alignment */
>
> >>
> >> For AF_XDP, we'd need to transfer the flag as well, and it could apply
> >> the same logic (getting the size from the vmlinux BTF).
> >>
> >> By this logic, the BTF_ID should be the *first* entry of struct
> >> skb_metadata, since that will be the field AF_XDP programs can find
> >> right off the bat, no? >
> > The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
> > pointer in the userspace.
>
> Yep. It is my understanding also.  Missing data_meta pointer in the AF_XDP
> rx_desc is a potential problem.  Having BTF_ID or not won't help.
>
> >
> > You get an rx descriptor where the address points to the 'data':
> > | 256 bytes headroom where metadata can go | data |
> >
> > So you have (at most) 256 bytes of headroom, some of that might be the
> > metadata, but you really don't know where it starts. But you know it
> > definitely ends where the data begins.
> >
> > So if we have the following, we can locate skb_metadata:
> > | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
> > data - sizeof(skb_metadata) will get you there
> >
> > But if it's the other way around, the program has to know
> > sizeof(custom metadata) to locate skb_metadata:
> > | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |
>
> Right, this won't work if the AF_XDP user does not know how big the custom
> metadata is.  The kernel then needs to swap the "skb_metadata" and "custom
> metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like
> this:
> | custom metadata | skb_metadata | data |
>
> However, since data_meta is missing from the rx_desc, may be we can safely
> assume the AF_XDP user always knows the size of the custom metadata or there is
> usually no "custom metadata" and no swap is needed?

If we can assume they can share that info, can they also share more
info on what kind of metadata they would prefer to get?
If they can agree on the size, maybe they also can agree on the flows
that need skb_metdata vs the flows that need a custom one?

Seems like we can start with supporting either one, but not both and
extend in the future once we have more understanding on whether it's
actually needed or not?

bpf_xdp_metadata_export_to_skb: adjust data meta, add uses-skb-metadata flag
bpf_xdp_adjust_meta: unconditionally reset uses-skb-metadata flag
Martin KaFai Lau Nov. 11, 2022, 1:26 a.m. UTC | #30
On 11/10/22 4:57 PM, Stanislav Fomichev wrote:
> On Thu, Nov 10, 2022 at 4:33 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 11/10/22 3:52 PM, Stanislav Fomichev wrote:
>>> On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>>
>>>> Skipping to the last bit:
>>>>
>>>>>>>>>      } else {
>>>>>>>>>        use kfuncs
>>>>>>>>>      }
>>>>>>>>>
>>>>>>>>> 5. Support the case where we keep program's metadata and kernel's
>>>>>>>>> xdp_to_skb_metadata
>>>>>>>>>      - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
>>>>>>>>> rest of the metadata over it and adjusting the headroom
>>>>>>>>
>>>>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
>>>>>>>> metadata.  xdp prog should usually work in this order also: read/write headers,
>>>>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
>>>>>>>> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
>>>>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
>>>>>>>>
>>>>>>>> For the kernel and xdp prog, I don't think it matters where the
>>>>>>>> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
>>>>>>>> be before xdp->data because of the current data_meta and data comparison usage
>>>>>>>> in the xdp prog.
>>>>>>>>
>>>>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata
>>>>>>>> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
>>>>>>>> supports the program's metadata now.  afaict, it can only work now if there is
>>>>>>>> some sort of contract between them or the AF_XDP currently does not use the
>>>>>>>> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
>>>>>>>> should be a no op if there is no program's metadata?  This behavior could also
>>>>>>>> be configurable through setsockopt?
>>>>>>>
>>>>>>> Agreed on all of the above. For now it seems like the safest thing to
>>>>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly
>>>>>>> locate btf_id.
>>>>>>> Let's see if Toke disagrees :-)
>>>>>>
>>>>>> As I replied to Martin, I'm not sure it's worth the complexity to
>>>>>> logically split the SKB metadata from the program's own metadata (as
>>>>>> opposed to just reusing the existing data_meta pointer)?
>>>>>
>>>>> I'd gladly keep my current requirement where it's either or, but not both :-)
>>>>> We can relax it later if required?
>>>>
>>>> So the way I've been thinking about it is simply that the skb_metadata
>>>> would live in the same place at the data_meta pointer (including
>>>> adjusting that pointer to accommodate it), and just overriding the
>>>> existing program metadata, if any exists. But looking at it now, I guess
>>>> having the split makes it easier for a program to write its own custom
>>>> metadata and still use the skb metadata. See below about the ordering.
>>>>
>>>>>> However, if we do, the layout that makes most sense to me is putting the
>>>>>> skb metadata before the program metadata, like:
>>>>>>
>>>>>> --------------
>>>>>> | skb_metadata
>>>>>> --------------
>>>>>> | data_meta
>>>>>> --------------
>>>>>> | data
>>>>>> --------------
>>>>>>
>>
>> Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this:
>>
>> | skb_metadata | custom metadata | data |
>>
>>>>>> Not sure if that's what you meant? :)
>>>>>
>>>>> I was suggesting the other way around: |custom meta|skb_metadata|data|
>>>>> (but, as Martin points out, consuming skb_metadata in the kernel
>>>>> becomes messier)
>>>>>
>>>>> af_xdp can check whether skb_metdata is present by looking at data -
>>>>> offsetof(struct skb_metadata, btf_id).
>>>>> progs that know how to handle custom metadata, will look at data -
>>>>> sizeof(skb_metadata)
>>>>>
>>>>> Otherwise, if it's the other way around, how do we find skb_metadata
>>>>> in a redirected frame?
>>>>> Let's say we have |skb_metadata|custom meta|data|, how does the final
>>>>> program find skb_metadata?
>>>>> All the progs have to agree on the sizeof(tc/custom meta), right?
>>>>
>>>> Erm, maybe I'm missing something here, but skb_metadata is fixed size,
>>>> right? So if the "skb_metadata is present" flag is set, we know that the
>>>> sizeof(skb_metadata) bytes before the data_meta pointer contains the
>>>> metadata, and if the flag is not set, we know those bytes are not valid
>>>> metadata.
>>
>> right, so to get to the skb_metadata, it will be
>> data_meta -= sizeof(skb_metadata);  /* probably need alignment */
>>
>>>>
>>>> For AF_XDP, we'd need to transfer the flag as well, and it could apply
>>>> the same logic (getting the size from the vmlinux BTF).
>>>>
>>>> By this logic, the BTF_ID should be the *first* entry of struct
>>>> skb_metadata, since that will be the field AF_XDP programs can find
>>>> right off the bat, no? >
>>> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
>>> pointer in the userspace.
>>
>> Yep. It is my understanding also.  Missing data_meta pointer in the AF_XDP
>> rx_desc is a potential problem.  Having BTF_ID or not won't help.
>>
>>>
>>> You get an rx descriptor where the address points to the 'data':
>>> | 256 bytes headroom where metadata can go | data |
>>>
>>> So you have (at most) 256 bytes of headroom, some of that might be the
>>> metadata, but you really don't know where it starts. But you know it
>>> definitely ends where the data begins.
>>>
>>> So if we have the following, we can locate skb_metadata:
>>> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
>>> data - sizeof(skb_metadata) will get you there
>>>
>>> But if it's the other way around, the program has to know
>>> sizeof(custom metadata) to locate skb_metadata:
>>> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |
>>
>> Right, this won't work if the AF_XDP user does not know how big the custom
>> metadata is.  The kernel then needs to swap the "skb_metadata" and "custom
>> metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like
>> this:
>> | custom metadata | skb_metadata | data |
>>
>> However, since data_meta is missing from the rx_desc, may be we can safely
>> assume the AF_XDP user always knows the size of the custom metadata or there is
>> usually no "custom metadata" and no swap is needed?
> 
> If we can assume they can share that info, can they also share more
> info on what kind of metadata they would prefer to get?
> If they can agree on the size, maybe they also can agree on the flows
> that need skb_metdata vs the flows that need a custom one?
> 
> Seems like we can start with supporting either one, but not both and
> extend in the future once we have more understanding on whether it's
> actually needed or not?
> 
> bpf_xdp_metadata_export_to_skb: adjust data meta, add uses-skb-metadata flag
> bpf_xdp_adjust_meta: unconditionally reset uses-skb-metadata flag
hmm... I am thinking:

bpf_xdp_adjust_meta: move the existing (if any) skb_metadata and adjust 
xdp->data_meta.

bpf_xdp_metadata_export_to_skb: If skb_metadata exists, overwrites the existing 
one.  If not exists, gets headroom before xdp->data_meta and writes hints.
Martin KaFai Lau Nov. 11, 2022, 1:39 a.m. UTC | #31
On 11/10/22 3:29 PM, Toke Høiland-Jørgensen wrote:
>>> For the metadata consumed by the stack right now it's a bit
>>> hypothetical, yeah. However, there's a bunch of metadata commonly
>>> supported by hardware that the stack currently doesn't consume and that
>>> hopefully this feature will end up making more accessible. My hope is
>>> that the stack can also learn how to use this in the future, in which
>>> case we may run out of space. So I think of that bit mostly as
>>> future-proofing...
>>
>> ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later
>> if it is indeed needed?  The 'struct xdp_to_skb_metadata' is not in UAPI and
>> doing it with CO-RE is to give us flexibility to make this kind of changes in
>> the future.
> 
> My worry is mostly that it'll be more painful to add it later than just
> including it from the start, mostly because of AF_XDP users. But if we
> do the randomisation thing (thus forcing AF_XDP users to deal with the
> dynamic layout as well), it should be possible to add it later, and I
> can live with that option as well...

imo, considering we are trying to optimize unnecessary field initialization as 
below, it is sort of wasteful to always initialize the btf_id with the same 
value.  It is better to add it in the future when there is a need.

>>>>> We should probably also have a flag set on the xdp_frame so the stack
>>>>> knows that the metadata area contains relevant-to-skb data, to guard
>>>>> against an XDP program accidentally hitting the "magic number" (BTF_ID)
>>>>> in unrelated stuff it puts into the metadata area.
>>>>
>>>> Yeah, I think having a flag is useful.  The flag will be set at xdp_buff and
>>>> then transfer to the xdp_frame?
>>>
>>> Yeah, exactly!
>>>
>>>>>> After re-reading patch 6, have another question. The 'void
>>>>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at
>>>>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *'
>>>>>> pointer and the xdp prog can directly read (or even write) it?
>>>>>
>>>>> Hmm, I'm not sure returning a failure makes sense? Failure to read one
>>>>> or more fields just means that those fields will not be populated? We
>>>>> should probably have a flags field inside the metadata struct itself to
>>>>> indicate which fields are set or not, but I'm not sure returning an
>>>>> error value adds anything? Returning a pointer to the metadata field
>>>>> might be convenient for users (it would just be an alias to the
>>>>> data_meta pointer, but the verifier could know its size, so the program
>>>>> doesn't have to bounds check it).
>>>>
>>>> If some hints are not available, those hints should be initialized to
>>>> 0/CHECKSUM_NONE/...etc.
>>>
>>> The problem with that is that then we have to spend cycles writing
>>> eight bytes of zeroes into the checksum field :)
>>
>> With a common 'struct xdp_to_skb_metadata', I am not sure how some of these zero
>> writes can be avoided.  If the xdp prog wants to optimize, it can call
>> individual kfunc to get individual hints.
> 
> Erm, we just... don't write those fields? Something like:
> 
> void write_skb_meta(hw, ctx) {
>    struct xdp_skb_metadata meta = ctx->data_meta - sizeof(struct xdp_skb_metadata);
>    meta->valid_fields = 0;
> 
>    if (hw_has_timestamp) {
>      meta->timestamp = hw->timestamp;
>      meta->valid_fields |= FIELD_TIMESTAMP;
>    } /* otherwise meta->timestamp is just uninitialised */
> 
>    if (hw_has_rxhash) {
>      meta->rxhash = hw->rxhash;
>      meta->valid_fields |= FIELD_RXHASH;
>    } /* otherwise meta->rxhash is just uninitialised */
>    ...etc...
> }

Ah, got it.  Make sense.  My mind was stalled in the paradigm that a helper that 
needs to initialize the result.
Toke Høiland-Jørgensen Nov. 11, 2022, 9:37 a.m. UTC | #32
Martin KaFai Lau <martin.lau@linux.dev> writes:

> On 11/10/22 4:10 PM, Toke Høiland-Jørgensen wrote:
>>> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
>>> pointer in the userspace.
>>>
>>> You get an rx descriptor where the address points to the 'data':
>>> | 256 bytes headroom where metadata can go | data |
>> 
>> Ah, I was missing the bit where the data pointer actually points at
>> data, not the start of the buf. Oops, my bad!
>> 
>>> So you have (at most) 256 bytes of headroom, some of that might be the
>>> metadata, but you really don't know where it starts. But you know it
>>> definitely ends where the data begins.
>>>
>>> So if we have the following, we can locate skb_metadata:
>>> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
>>> data - sizeof(skb_metadata) will get you there
>>>
>>> But if it's the other way around, the program has to know
>>> sizeof(custom metadata) to locate skb_metadata:
>>> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |
>>>
>>> Am I missing something here?
>> 
>> Hmm, so one could argue that the only way AF_XDP can consume custom
>> metadata today is if it knows out of band what the size of it is. And if
>> it knows that, it can just skip over it to go back to the skb_metadata,
>> no?
>
> +1 I replied with a similar point in another email. I also think we
> can safely assume this.

Great!

>> 
>> The only problem left then is if there were multiple XDP programs called
>> in sequence (whether before a redirect, or by libxdp chaining or tail
>> calls), and the first one resized the metadata area without the last one
>> knowing about it. For this, we could add a CLOBBER_PROGRAM_META flag to
>> the skb_metadata helper which if set will ensure that the program
>> metadata length is reset to 0?
>
> How is it different from the same xdp prog calling bpf_xdp_adjust_meta() and 
> bpf_xdp_metadata_export_to_skb() multiple times.  The earlier stored 
> skb_metadata needs to be moved during the latter bpf_xdp_adjust_meta().  The 
> latter bpf_xdp_metadata_export_to_skb() will overwrite the earlier skb_metadata.

Well, it would just be a convenience flag, so instead of doing:

metalen = ctx->data - ctx->data_meta;
if (metalen)
  xdp_adjust_meta(-metalen);
bpf_xdp_metadata_export_to_skb(ctx);

you could just do:

bpf_xdp_metadata_export_to_skb(ctx, CLOBBER_PROGRAM_META);

and the kernel would do the check+move for you. But, well, the couple of
extra instructions to do the check in BPF is probably fine.

(I'm talking here about a program that wants to make sure that any
custom metadata that may have been added by an earlier program is
removed before redirecting to an XSK socket; I expect we'd want to do
something like this in the default program in libxdp).

-Toke
Toke Høiland-Jørgensen Nov. 11, 2022, 9:41 a.m. UTC | #33
Martin KaFai Lau <martin.lau@linux.dev> writes:

> On 11/10/22 4:57 PM, Stanislav Fomichev wrote:
>> On Thu, Nov 10, 2022 at 4:33 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>>
>>> On 11/10/22 3:52 PM, Stanislav Fomichev wrote:
>>>> On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>>>
>>>>> Skipping to the last bit:
>>>>>
>>>>>>>>>>      } else {
>>>>>>>>>>        use kfuncs
>>>>>>>>>>      }
>>>>>>>>>>
>>>>>>>>>> 5. Support the case where we keep program's metadata and kernel's
>>>>>>>>>> xdp_to_skb_metadata
>>>>>>>>>>      - skb_metadata_import_from_xdp() will "consume" it by mem-moving the
>>>>>>>>>> rest of the metadata over it and adjusting the headroom
>>>>>>>>>
>>>>>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's
>>>>>>>>> metadata.  xdp prog should usually work in this order also: read/write headers,
>>>>>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return
>>>>>>>>> XDP_PASS/XDP_REDIRECT.  When it is XDP_PASS, the kernel just needs to pop the
>>>>>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc.
>>>>>>>>>
>>>>>>>>> For the kernel and xdp prog, I don't think it matters where the
>>>>>>>>> xdp_to_skb_metadata is.  However, the xdp->data_meta (program's metadata) has to
>>>>>>>>> be before xdp->data because of the current data_meta and data comparison usage
>>>>>>>>> in the xdp prog.
>>>>>>>>>
>>>>>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata
>>>>>>>>> probably only matters to the userspace AF_XDP.  However, I don't see how AF_XDP
>>>>>>>>> supports the program's metadata now.  afaict, it can only work now if there is
>>>>>>>>> some sort of contract between them or the AF_XDP currently does not use the
>>>>>>>>> program's metadata.  Either way, we can do the mem-moving only for AF_XDP and it
>>>>>>>>> should be a no op if there is no program's metadata?  This behavior could also
>>>>>>>>> be configurable through setsockopt?
>>>>>>>>
>>>>>>>> Agreed on all of the above. For now it seems like the safest thing to
>>>>>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly
>>>>>>>> locate btf_id.
>>>>>>>> Let's see if Toke disagrees :-)
>>>>>>>
>>>>>>> As I replied to Martin, I'm not sure it's worth the complexity to
>>>>>>> logically split the SKB metadata from the program's own metadata (as
>>>>>>> opposed to just reusing the existing data_meta pointer)?
>>>>>>
>>>>>> I'd gladly keep my current requirement where it's either or, but not both :-)
>>>>>> We can relax it later if required?
>>>>>
>>>>> So the way I've been thinking about it is simply that the skb_metadata
>>>>> would live in the same place at the data_meta pointer (including
>>>>> adjusting that pointer to accommodate it), and just overriding the
>>>>> existing program metadata, if any exists. But looking at it now, I guess
>>>>> having the split makes it easier for a program to write its own custom
>>>>> metadata and still use the skb metadata. See below about the ordering.
>>>>>
>>>>>>> However, if we do, the layout that makes most sense to me is putting the
>>>>>>> skb metadata before the program metadata, like:
>>>>>>>
>>>>>>> --------------
>>>>>>> | skb_metadata
>>>>>>> --------------
>>>>>>> | data_meta
>>>>>>> --------------
>>>>>>> | data
>>>>>>> --------------
>>>>>>>
>>>
>>> Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this:
>>>
>>> | skb_metadata | custom metadata | data |
>>>
>>>>>>> Not sure if that's what you meant? :)
>>>>>>
>>>>>> I was suggesting the other way around: |custom meta|skb_metadata|data|
>>>>>> (but, as Martin points out, consuming skb_metadata in the kernel
>>>>>> becomes messier)
>>>>>>
>>>>>> af_xdp can check whether skb_metdata is present by looking at data -
>>>>>> offsetof(struct skb_metadata, btf_id).
>>>>>> progs that know how to handle custom metadata, will look at data -
>>>>>> sizeof(skb_metadata)
>>>>>>
>>>>>> Otherwise, if it's the other way around, how do we find skb_metadata
>>>>>> in a redirected frame?
>>>>>> Let's say we have |skb_metadata|custom meta|data|, how does the final
>>>>>> program find skb_metadata?
>>>>>> All the progs have to agree on the sizeof(tc/custom meta), right?
>>>>>
>>>>> Erm, maybe I'm missing something here, but skb_metadata is fixed size,
>>>>> right? So if the "skb_metadata is present" flag is set, we know that the
>>>>> sizeof(skb_metadata) bytes before the data_meta pointer contains the
>>>>> metadata, and if the flag is not set, we know those bytes are not valid
>>>>> metadata.
>>>
>>> right, so to get to the skb_metadata, it will be
>>> data_meta -= sizeof(skb_metadata);  /* probably need alignment */
>>>
>>>>>
>>>>> For AF_XDP, we'd need to transfer the flag as well, and it could apply
>>>>> the same logic (getting the size from the vmlinux BTF).
>>>>>
>>>>> By this logic, the BTF_ID should be the *first* entry of struct
>>>>> skb_metadata, since that will be the field AF_XDP programs can find
>>>>> right off the bat, no? >
>>>> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta
>>>> pointer in the userspace.
>>>
>>> Yep. It is my understanding also.  Missing data_meta pointer in the AF_XDP
>>> rx_desc is a potential problem.  Having BTF_ID or not won't help.
>>>
>>>>
>>>> You get an rx descriptor where the address points to the 'data':
>>>> | 256 bytes headroom where metadata can go | data |
>>>>
>>>> So you have (at most) 256 bytes of headroom, some of that might be the
>>>> metadata, but you really don't know where it starts. But you know it
>>>> definitely ends where the data begins.
>>>>
>>>> So if we have the following, we can locate skb_metadata:
>>>> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data |
>>>> data - sizeof(skb_metadata) will get you there
>>>>
>>>> But if it's the other way around, the program has to know
>>>> sizeof(custom metadata) to locate skb_metadata:
>>>> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data |
>>>
>>> Right, this won't work if the AF_XDP user does not know how big the custom
>>> metadata is.  The kernel then needs to swap the "skb_metadata" and "custom
>>> metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like
>>> this:
>>> | custom metadata | skb_metadata | data |
>>>
>>> However, since data_meta is missing from the rx_desc, may be we can safely
>>> assume the AF_XDP user always knows the size of the custom metadata or there is
>>> usually no "custom metadata" and no swap is needed?
>> 
>> If we can assume they can share that info, can they also share more
>> info on what kind of metadata they would prefer to get?
>> If they can agree on the size, maybe they also can agree on the flows
>> that need skb_metdata vs the flows that need a custom one?
>> 
>> Seems like we can start with supporting either one, but not both and
>> extend in the future once we have more understanding on whether it's
>> actually needed or not?
>> 
>> bpf_xdp_metadata_export_to_skb: adjust data meta, add uses-skb-metadata flag
>> bpf_xdp_adjust_meta: unconditionally reset uses-skb-metadata flag
> hmm... I am thinking:
>
> bpf_xdp_adjust_meta: move the existing (if any) skb_metadata and adjust 
> xdp->data_meta.
>
> bpf_xdp_metadata_export_to_skb: If skb_metadata exists, overwrites the existing 
> one.  If not exists, gets headroom before xdp->data_meta and writes hints.

Yeah, +1 on this. For AF_XDP that means that if you only do
bpf_xdp_metadata_export_to_skb() you'll get the skb metadata right
before data, and if you add custom metadata yourself, that goes
in-between so you'll need to have some way to communicate the length to
the AF_XDP consumer. And a default program (like in libxdp) can just
check the metadata pointer and make sure to remove any metadata before
redirecting to userspace, like in the example from my other reply to
Martin.

-Toke
Toke Høiland-Jørgensen Nov. 11, 2022, 9:44 a.m. UTC | #34
Martin KaFai Lau <martin.lau@linux.dev> writes:

> On 11/10/22 3:29 PM, Toke Høiland-Jørgensen wrote:
>>>> For the metadata consumed by the stack right now it's a bit
>>>> hypothetical, yeah. However, there's a bunch of metadata commonly
>>>> supported by hardware that the stack currently doesn't consume and that
>>>> hopefully this feature will end up making more accessible. My hope is
>>>> that the stack can also learn how to use this in the future, in which
>>>> case we may run out of space. So I think of that bit mostly as
>>>> future-proofing...
>>>
>>> ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later
>>> if it is indeed needed?  The 'struct xdp_to_skb_metadata' is not in UAPI and
>>> doing it with CO-RE is to give us flexibility to make this kind of changes in
>>> the future.
>> 
>> My worry is mostly that it'll be more painful to add it later than just
>> including it from the start, mostly because of AF_XDP users. But if we
>> do the randomisation thing (thus forcing AF_XDP users to deal with the
>> dynamic layout as well), it should be possible to add it later, and I
>> can live with that option as well...
>
> imo, considering we are trying to optimize unnecessary field
> initialization as below, it is sort of wasteful to always initialize
> the btf_id with the same value. It is better to add it in the future
> when there is a need.

Okay, let's omit the BTF ID for now, and see what that looks like. I'll
try to keep in mind to see if I can find any reasons why we'd need to
add it back and make sure to complain before this lands if I find any :)

-Toke
diff mbox series

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 0e629ceb087b..d4cd0938360b 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1673,7 +1673,9 @@  static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 static void veth_unroll_kfunc(const struct bpf_prog *prog, u32 func_id,
 			      struct bpf_patch *patch)
 {
-	if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) {
+	if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) {
+		return xdp_metadata_export_to_skb(prog, patch);
+	} else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) {
 		/* return true; */
 		bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 1));
 	} else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) {
diff --git a/include/linux/bpf_patch.h b/include/linux/bpf_patch.h
index 81ff738eef8d..359c165ad68b 100644
--- a/include/linux/bpf_patch.h
+++ b/include/linux/bpf_patch.h
@@ -16,6 +16,8 @@  size_t bpf_patch_len(const struct bpf_patch *patch);
 int bpf_patch_err(const struct bpf_patch *patch);
 void __bpf_patch_append(struct bpf_patch *patch, struct bpf_insn insn);
 struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch);
+void bpf_patch_resolve_jmp(struct bpf_patch *patch);
+u32 bpf_patch_magles_registers(const struct bpf_patch *patch);
 
 #define bpf_patch_append(patch, ...) ({ \
 	struct bpf_insn insn[] = { __VA_ARGS__ }; \
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 59c9fd55699d..dba857f212d7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4217,9 +4217,13 @@  static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
 	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
 }
 
+void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len);
+
 static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
 {
 	skb_shinfo(skb)->meta_len = meta_len;
+	if (meta_len)
+		skb_metadata_import_from_xdp(skb, meta_len);
 }
 
 static inline void skb_metadata_clear(struct sk_buff *skb)
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 2a82a98f2f9f..8c97c6996172 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -411,6 +411,8 @@  void xdp_attachment_setup(struct xdp_attachment_info *info,
 #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE
 
 #define XDP_METADATA_KFUNC_xxx	\
+	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_EXPORT_TO_SKB, \
+			   bpf_xdp_metadata_export_to_skb) \
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED, \
 			   bpf_xdp_metadata_rx_timestamp_supported) \
 	XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
@@ -423,14 +425,25 @@  XDP_METADATA_KFUNC_xxx
 MAX_XDP_METADATA_KFUNC,
 };
 
+struct xdp_to_skb_metadata {
+	u32 magic; /* xdp_metadata_magic */
+	u64 rx_timestamp;
+} __randomize_layout;
+
+struct bpf_patch;
+
 #ifdef CONFIG_DEBUG_INFO_BTF
+extern u32 xdp_metadata_magic;
 extern struct btf_id_set8 xdp_metadata_kfunc_ids;
 static inline u32 xdp_metadata_kfunc_id(int id)
 {
 	return xdp_metadata_kfunc_ids.pairs[id].id;
 }
+void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch);
 #else
+#define xdp_metadata_magic 0
 static inline u32 xdp_metadata_kfunc_id(int id) { return 0; }
+static void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) { return 0; }
 #endif
 
 #endif /* __LINUX_NET_XDP_H__ */
diff --git a/kernel/bpf/bpf_patch.c b/kernel/bpf/bpf_patch.c
index 82a10bf5624a..8f1fef74299c 100644
--- a/kernel/bpf/bpf_patch.c
+++ b/kernel/bpf/bpf_patch.c
@@ -49,3 +49,33 @@  struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch)
 {
 	return patch->insn;
 }
+
+void bpf_patch_resolve_jmp(struct bpf_patch *patch)
+{
+	int i;
+
+	for (i = 0; i < patch->len; i++) {
+		if (BPF_CLASS(patch->insn[i].code) != BPF_JMP)
+			continue;
+
+		if (BPF_SRC(patch->insn[i].code) != BPF_X)
+			continue;
+
+		if (patch->insn[i].off != S16_MAX)
+			continue;
+
+		patch->insn[i].off = patch->len - i - 1;
+	}
+}
+
+u32 bpf_patch_magles_registers(const struct bpf_patch *patch)
+{
+	u32 mask = 0;
+	int i;
+
+	for (i = 0; i < patch->len; i++) {
+		mask |= 1 << patch->insn[i].dst_reg;
+	}
+
+	return mask;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4e5c5ff35d5f..49f55f81e7f3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13902,6 +13902,24 @@  static int unroll_kfunc_call(struct bpf_verifier_env *env,
 		 */
 		bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 0));
 	}
+
+	if (func_id != xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) {
+		u32 allowed = 0;
+		u32 mangled = bpf_patch_magles_registers(patch);
+
+		/* See xdp_metadata_export_to_skb comment for the
+		 * reason about these constraints.
+		 */
+
+		allowed |= 1 << BPF_REG_0;
+		allowed |= 1 << BPF_REG_2;
+		allowed |= 1 << BPF_REG_3;
+		allowed |= 1 << BPF_REG_4;
+		allowed |= 1 << BPF_REG_5;
+
+		WARN_ON_ONCE(mangled & ~allowed);
+	}
+
 	return bpf_patch_err(patch);
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 42a35b59fb1e..37e3aef46525 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -72,6 +72,7 @@ 
 #include <net/mptcp.h>
 #include <net/mctp.h>
 #include <net/page_pool.h>
+#include <net/xdp.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -6672,3 +6673,27 @@  nodefer:	__kfree_skb(skb);
 	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
+
+void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len)
+{
+	struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len);
+
+	/* Optional SKB info, currently missing:
+	 * - HW checksum info		(skb->ip_summed)
+	 * - HW RX hash			(skb_set_hash)
+	 * - RX ring dev queue index	(skb_record_rx_queue)
+	 */
+
+	if (len != sizeof(struct xdp_to_skb_metadata))
+		return;
+
+	if (meta->magic != xdp_metadata_magic)
+		return;
+
+	if (meta->rx_timestamp) {
+		*skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){
+			.hwtstamp = ns_to_ktime(meta->rx_timestamp),
+		};
+	}
+}
+EXPORT_SYMBOL(skb_metadata_import_from_xdp);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 22f1e44700eb..8204fa05c5e9 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -653,12 +653,6 @@  struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 	/* Essential SKB info: protocol and skb->dev */
 	skb->protocol = eth_type_trans(skb, dev);
 
-	/* Optional SKB info, currently missing:
-	 * - HW checksum info		(skb->ip_summed)
-	 * - HW RX hash			(skb_set_hash)
-	 * - RX ring dev queue index	(skb_record_rx_queue)
-	 */
-
 	/* Until page_pool get SKB return path, release DMA here */
 	xdp_release_frame(xdpf);
 
@@ -712,6 +706,13 @@  struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
 	return nxdpf;
 }
 
+/* For the packets directed to the kernel, this kfunc exports XDP metadata
+ * into skb context.
+ */
+noinline void bpf_xdp_metadata_export_to_skb(const struct xdp_md *ctx)
+{
+}
+
 /* Indicates whether particular device supports rx_timestamp metadata.
  * This is an optional helper to support marking some branches as
  * "dead code" in the BPF programs.
@@ -737,13 +738,104 @@  XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 BTF_SET8_END(xdp_metadata_kfunc_ids)
 
+/* Make sure userspace doesn't depend on our layout by using
+ * different pseudo-generated magic value.
+ */
+u32 xdp_metadata_magic;
+
 static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
 	.owner = THIS_MODULE,
 	.set   = &xdp_metadata_kfunc_ids,
 };
 
+/* Since we're not actually doing a call but instead rewriting
+ * in place, we can only afford to use R0-R5 scratch registers.
+ *
+ * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual
+ * metadata kfuncs use only R0,R4-R5.
+ *
+ * The above also means we _cannot_ easily call any other helper/kfunc
+ * because there is no place for us to preserve our R1 argument;
+ * existing R6-R9 belong to the callee.
+ */
+void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch)
+{
+	u32 func_id;
+
+	/*
+	 * The code below generates the following:
+	 *
+	 * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx)
+	 * {
+	 *	struct xdp_to_skb_metadata *meta;
+	 *	int ret;
+	 *
+	 *	ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta));
+	 *	if (!ret)
+	 *		return;
+	 *
+	 *	meta = ctx->data_meta;
+	 *	meta->magic = xdp_metadata_magic;
+	 *	meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx);
+	 * }
+	 *
+	 */
+
+	bpf_patch_append(patch,
+		/* r2 = ((struct xdp_buff *)r1)->data_meta; */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+			    offsetof(struct xdp_buff, data_meta)),
+		/* r3 = ((struct xdp_buff *)r1)->data; */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+			    offsetof(struct xdp_buff, data)),
+		/* if (data_meta != data) return;
+		 *
+		 *	data_meta > data: xdp_data_meta_unsupported()
+		 *	data_meta < data: already used, no need to touch
+		 */
+		BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX),
+
+		/* r2 -= sizeof(struct xdp_to_skb_metadata); */
+		BPF_ALU64_IMM(BPF_SUB, BPF_REG_2,
+			      sizeof(struct xdp_to_skb_metadata)),
+		/* r3 = ((struct xdp_buff *)r1)->data_hard_start; */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+			    offsetof(struct xdp_buff, data_hard_start)),
+		/* r3 += sizeof(struct xdp_frame) */
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3,
+			      sizeof(struct xdp_frame)),
+		/* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */
+		BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX),
+
+		/* ((struct xdp_buff *)r1)->data_meta = r2; */
+		BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2,
+			    offsetof(struct xdp_buff, data_meta)),
+
+		/* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */
+		BPF_ST_MEM(BPF_W, BPF_REG_2,
+			   offsetof(struct xdp_to_skb_metadata, magic),
+			   xdp_metadata_magic),
+	);
+
+	/*	r0 = bpf_xdp_metadata_rx_timestamp(ctx); */
+	func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP);
+	prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch);
+
+	bpf_patch_append(patch,
+		/* r2 = ((struct xdp_buff *)r1)->data_meta; */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+			    offsetof(struct xdp_buff, data_meta)),
+		/* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0,
+			    offsetof(struct xdp_to_skb_metadata, rx_timestamp)),
+	);
+
+	bpf_patch_resolve_jmp(patch);
+}
+
 static int __init xdp_metadata_init(void)
 {
+	xdp_metadata_magic = get_random_u32() | 1;
 	return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
 }
 late_initcall(xdp_metadata_init);