Message ID | 20221104032532.1615099-7-sdf@google.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | xdp: hints via kfuncs | expand |
On 11/3/22 8:25 PM, Stanislav Fomichev wrote: > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 59c9fd55699d..dba857f212d7 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, > true : __skb_metadata_differs(skb_a, skb_b, len_a); > } > > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len); > + > static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) > { > skb_shinfo(skb)->meta_len = meta_len; > + if (meta_len) > + skb_metadata_import_from_xdp(skb, meta_len); > } > [ ... ] > +struct xdp_to_skb_metadata { > + u32 magic; /* xdp_metadata_magic */ > + u64 rx_timestamp; > +} __randomize_layout; > + > +struct bpf_patch; > + [ ... ] > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len) > +{ > + struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len); > + > + /* Optional SKB info, currently missing: > + * - HW checksum info (skb->ip_summed) > + * - HW RX hash (skb_set_hash) > + * - RX ring dev queue index (skb_record_rx_queue) > + */ > + > + if (len != sizeof(struct xdp_to_skb_metadata)) > + return; > + > + if (meta->magic != xdp_metadata_magic) > + return; > + > + if (meta->rx_timestamp) { > + *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){ > + .hwtstamp = ns_to_ktime(meta->rx_timestamp), > + }; > + } > +} Considering the metadata will affect the gro, should the meta be cleared after importing to the skb? [ ... ] > +/* Since we're not actually doing a call but instead rewriting > + * in place, we can only afford to use R0-R5 scratch registers. > + * > + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual > + * metadata kfuncs use only R0,R4-R5. > + * > + * The above also means we _cannot_ easily call any other helper/kfunc > + * because there is no place for us to preserve our R1 argument; > + * existing R6-R9 belong to the callee. > + */ > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) > +{ > + u32 func_id; > + > + /* > + * The code below generates the following: > + * > + * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx) > + * { > + * struct xdp_to_skb_metadata *meta; > + * int ret; > + * > + * ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta)); > + * if (!ret) > + * return; > + * > + * meta = ctx->data_meta; > + * meta->magic = xdp_metadata_magic; > + * meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx); > + * } > + * > + */ > + > + bpf_patch_append(patch, > + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ > + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, > + offsetof(struct xdp_buff, data_meta)), > + /* r3 = ((struct xdp_buff *)r1)->data; */ > + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, > + offsetof(struct xdp_buff, data)), > + /* if (data_meta != data) return; > + * > + * data_meta > data: xdp_data_meta_unsupported() > + * data_meta < data: already used, no need to touch > + */ > + BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX), > + > + /* r2 -= sizeof(struct xdp_to_skb_metadata); */ > + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, > + sizeof(struct xdp_to_skb_metadata)), > + /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */ > + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, > + offsetof(struct xdp_buff, data_hard_start)), > + /* r3 += sizeof(struct xdp_frame) */ > + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, > + sizeof(struct xdp_frame)), > + /* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */ > + BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX), > + > + /* ((struct xdp_buff *)r1)->data_meta = r2; */ > + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, > + offsetof(struct xdp_buff, data_meta)), > + > + /* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */ > + BPF_ST_MEM(BPF_W, BPF_REG_2, > + offsetof(struct xdp_to_skb_metadata, magic), > + xdp_metadata_magic), > + ); > + > + /* r0 = bpf_xdp_metadata_rx_timestamp(ctx); */ > + func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP); > + prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch); > + > + bpf_patch_append(patch, > + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ > + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, > + offsetof(struct xdp_buff, data_meta)), > + /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */ > + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, > + offsetof(struct xdp_to_skb_metadata, rx_timestamp)), Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not sure it is solid enough by asking the xdp prog not to use the same random number in its own metadata + not to change the metadata through xdp->data_meta after calling bpf_xdp_metadata_export_to_skb(). Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the xdp_to_skb_metadata can be limited to XDP_REDIRECT only? > + ); > + > + bpf_patch_resolve_jmp(patch); > +} > + > static int __init xdp_metadata_init(void) > { > + xdp_metadata_magic = get_random_u32() | 1; > return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); > } > late_initcall(xdp_metadata_init);
On Mon, Nov 7, 2022 at 2:02 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 11/3/22 8:25 PM, Stanislav Fomichev wrote: > > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > > index 59c9fd55699d..dba857f212d7 100644 > > --- a/include/linux/skbuff.h > > +++ b/include/linux/skbuff.h > > @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, > > true : __skb_metadata_differs(skb_a, skb_b, len_a); > > } > > > > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len); > > + > > static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) > > { > > skb_shinfo(skb)->meta_len = meta_len; > > + if (meta_len) > > + skb_metadata_import_from_xdp(skb, meta_len); > > } > > > [ ... ] > > > +struct xdp_to_skb_metadata { > > + u32 magic; /* xdp_metadata_magic */ > > + u64 rx_timestamp; > > +} __randomize_layout; > > + > > +struct bpf_patch; > > + > > [ ... ] > > > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len) > > +{ > > + struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len); > > + > > + /* Optional SKB info, currently missing: > > + * - HW checksum info (skb->ip_summed) > > + * - HW RX hash (skb_set_hash) > > + * - RX ring dev queue index (skb_record_rx_queue) > > + */ > > + > > + if (len != sizeof(struct xdp_to_skb_metadata)) > > + return; > > + > > + if (meta->magic != xdp_metadata_magic) > > + return; > > + > > + if (meta->rx_timestamp) { > > + *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){ > > + .hwtstamp = ns_to_ktime(meta->rx_timestamp), > > + }; > > + } > > +} > > Considering the metadata will affect the gro, should the meta be cleared after > importing to the skb? Yeah, good suggestion, will clear it here. > [ ... ] > > > +/* Since we're not actually doing a call but instead rewriting > > + * in place, we can only afford to use R0-R5 scratch registers. > > + * > > + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual > > + * metadata kfuncs use only R0,R4-R5. > > + * > > + * The above also means we _cannot_ easily call any other helper/kfunc > > + * because there is no place for us to preserve our R1 argument; > > + * existing R6-R9 belong to the callee. > > + */ > > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) > > +{ > > + u32 func_id; > > + > > + /* > > + * The code below generates the following: > > + * > > + * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx) > > + * { > > + * struct xdp_to_skb_metadata *meta; > > + * int ret; > > + * > > + * ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta)); > > + * if (!ret) > > + * return; > > + * > > + * meta = ctx->data_meta; > > + * meta->magic = xdp_metadata_magic; > > + * meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx); > > + * } > > + * > > + */ > > + > > + bpf_patch_append(patch, > > + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ > > + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, > > + offsetof(struct xdp_buff, data_meta)), > > + /* r3 = ((struct xdp_buff *)r1)->data; */ > > + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, > > + offsetof(struct xdp_buff, data)), > > + /* if (data_meta != data) return; > > + * > > + * data_meta > data: xdp_data_meta_unsupported() > > + * data_meta < data: already used, no need to touch > > + */ > > + BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX), > > + > > + /* r2 -= sizeof(struct xdp_to_skb_metadata); */ > > + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, > > + sizeof(struct xdp_to_skb_metadata)), > > + /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */ > > + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, > > + offsetof(struct xdp_buff, data_hard_start)), > > + /* r3 += sizeof(struct xdp_frame) */ > > + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, > > + sizeof(struct xdp_frame)), > > + /* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */ > > + BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX), > > + > > + /* ((struct xdp_buff *)r1)->data_meta = r2; */ > > + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, > > + offsetof(struct xdp_buff, data_meta)), > > + > > + /* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */ > > + BPF_ST_MEM(BPF_W, BPF_REG_2, > > + offsetof(struct xdp_to_skb_metadata, magic), > > + xdp_metadata_magic), > > + ); > > + > > + /* r0 = bpf_xdp_metadata_rx_timestamp(ctx); */ > > + func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP); > > + prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch); > > + > > + bpf_patch_append(patch, > > + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ > > + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, > > + offsetof(struct xdp_buff, data_meta)), > > + /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */ > > + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, > > + offsetof(struct xdp_to_skb_metadata, rx_timestamp)), > > Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not > sure it is solid enough by asking the xdp prog not to use the same random number > in its own metadata + not to change the metadata through xdp->data_meta after > calling bpf_xdp_metadata_export_to_skb(). What do you think the usecase here might be? Or are you suggesting we reject further access to data_meta after bpf_xdp_metadata_export_to_skb somehow? If we want to let the programs override some of this bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add more kfuncs instead of exposing the layout? bpf_xdp_metadata_export_to_skb(ctx); bpf_xdp_metadata_export_skb_hash(ctx, 1234); ... > Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the > xdp_to_skb_metadata can be limited to XDP_REDIRECT only? XDP_PASS cases where we convert xdp_buff into skb in the drivers right now usually have C code to manually pull out the metadata (out of hw desc) and put it into skb. So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for XDP_PASS, we're doing a double amount of work: skb_metadata_import_from_xdp first, then custom driver code second. In theory, maybe we should completely skip drivers custom parsing when there is a prog with BPF_F_XDP_HAS_METADATA? Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven and won't require any mental work (plus, the drivers won't have to care either in the future). WDYT? > > + ); > > + > > + bpf_patch_resolve_jmp(patch); > > +} > > + > > static int __init xdp_metadata_init(void) > > { > > + xdp_metadata_magic = get_random_u32() | 1; > > return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); > > } > > late_initcall(xdp_metadata_init);
On 11/8/22 1:54 PM, Stanislav Fomichev wrote: > On Mon, Nov 7, 2022 at 2:02 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 11/3/22 8:25 PM, Stanislav Fomichev wrote: >>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >>> index 59c9fd55699d..dba857f212d7 100644 >>> --- a/include/linux/skbuff.h >>> +++ b/include/linux/skbuff.h >>> @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, >>> true : __skb_metadata_differs(skb_a, skb_b, len_a); >>> } >>> >>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len); >>> + >>> static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) >>> { >>> skb_shinfo(skb)->meta_len = meta_len; >>> + if (meta_len) >>> + skb_metadata_import_from_xdp(skb, meta_len); >>> } >>> >> [ ... ] >> >>> +struct xdp_to_skb_metadata { >>> + u32 magic; /* xdp_metadata_magic */ >>> + u64 rx_timestamp; >>> +} __randomize_layout; >>> + >>> +struct bpf_patch; >>> + >> >> [ ... ] >> >>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len) >>> +{ >>> + struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len); >>> + >>> + /* Optional SKB info, currently missing: >>> + * - HW checksum info (skb->ip_summed) >>> + * - HW RX hash (skb_set_hash) >>> + * - RX ring dev queue index (skb_record_rx_queue) >>> + */ >>> + >>> + if (len != sizeof(struct xdp_to_skb_metadata)) >>> + return; >>> + >>> + if (meta->magic != xdp_metadata_magic) >>> + return; >>> + >>> + if (meta->rx_timestamp) { >>> + *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){ >>> + .hwtstamp = ns_to_ktime(meta->rx_timestamp), >>> + }; >>> + } >>> +} >> >> Considering the metadata will affect the gro, should the meta be cleared after >> importing to the skb? > > Yeah, good suggestion, will clear it here. > >> [ ... ] >> >>> +/* Since we're not actually doing a call but instead rewriting >>> + * in place, we can only afford to use R0-R5 scratch registers. >>> + * >>> + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual >>> + * metadata kfuncs use only R0,R4-R5. >>> + * >>> + * The above also means we _cannot_ easily call any other helper/kfunc >>> + * because there is no place for us to preserve our R1 argument; >>> + * existing R6-R9 belong to the callee. >>> + */ >>> +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) >>> +{ >>> + u32 func_id; >>> + >>> + /* >>> + * The code below generates the following: >>> + * >>> + * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx) >>> + * { >>> + * struct xdp_to_skb_metadata *meta; >>> + * int ret; >>> + * >>> + * ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta)); >>> + * if (!ret) >>> + * return; >>> + * >>> + * meta = ctx->data_meta; >>> + * meta->magic = xdp_metadata_magic; >>> + * meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx); >>> + * } >>> + * >>> + */ >>> + >>> + bpf_patch_append(patch, >>> + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ >>> + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, >>> + offsetof(struct xdp_buff, data_meta)), >>> + /* r3 = ((struct xdp_buff *)r1)->data; */ >>> + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, >>> + offsetof(struct xdp_buff, data)), >>> + /* if (data_meta != data) return; >>> + * >>> + * data_meta > data: xdp_data_meta_unsupported() >>> + * data_meta < data: already used, no need to touch >>> + */ >>> + BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX), >>> + >>> + /* r2 -= sizeof(struct xdp_to_skb_metadata); */ >>> + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, >>> + sizeof(struct xdp_to_skb_metadata)), >>> + /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */ >>> + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, >>> + offsetof(struct xdp_buff, data_hard_start)), >>> + /* r3 += sizeof(struct xdp_frame) */ >>> + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, >>> + sizeof(struct xdp_frame)), >>> + /* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */ >>> + BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX), >>> + >>> + /* ((struct xdp_buff *)r1)->data_meta = r2; */ >>> + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, >>> + offsetof(struct xdp_buff, data_meta)), >>> + >>> + /* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */ >>> + BPF_ST_MEM(BPF_W, BPF_REG_2, >>> + offsetof(struct xdp_to_skb_metadata, magic), >>> + xdp_metadata_magic), >>> + ); >>> + >>> + /* r0 = bpf_xdp_metadata_rx_timestamp(ctx); */ >>> + func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP); >>> + prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch); >>> + >>> + bpf_patch_append(patch, >>> + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ >>> + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, >>> + offsetof(struct xdp_buff, data_meta)), >>> + /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */ >>> + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, >>> + offsetof(struct xdp_to_skb_metadata, rx_timestamp)), >> >> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >> sure it is solid enough by asking the xdp prog not to use the same random number >> in its own metadata + not to change the metadata through xdp->data_meta after >> calling bpf_xdp_metadata_export_to_skb(). > > What do you think the usecase here might be? Or are you suggesting we > reject further access to data_meta after > bpf_xdp_metadata_export_to_skb somehow? > > If we want to let the programs override some of this > bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add > more kfuncs instead of exposing the layout? > > bpf_xdp_metadata_export_to_skb(ctx); > bpf_xdp_metadata_export_skb_hash(ctx, 1234); I can't think of a use case now for the xdp prog to use the xdp_to_skb_metadata while the xdp prog can directly call the kfunc (eg bpf_xdp_metadata_rx_timestamp) to get individual hint. I was asking if patch 7 is an actual use case because it does test the tstamp in XDP_PASS or it is mostly for selftest purpose? Yeah, may be the xdp prog will be able to change the xdp_to_skb_metadata eventually but that is for later. My concern is the xdp prog is allowed to change xdp_to_skb_metadata or by-coincident writing metadata that matches the random and the sizeof(struct xdp_to_skb_metadata). Also, the added opacity of xdp_to_skb_metadata (__randomize_layout + random int) is trying very hard to hide it from xdp prog. Instead, would it be cleaner to have a flag in xdp->flags (to be set by bpf_xdp_metadata_export_to_skb?) to guard this, like one of Jesper's patch. The xdp_convert_ctx_access() and bpf_xdp_adjust_meta() can check this bit to ensure the xdp_to_skb_metadata cannot be read and no metadata can be added/deleted after that. btw, is it possible to keep both xdp_to_skb_metadata and the xdp_prog's metadata? After skb_metadata_import_from_xdp popping the xdp_to_skb_metadata, the remaining xdp_prog's metatdata can still be used by the bpf-tc. > ... > >> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the >> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? > > XDP_PASS cases where we convert xdp_buff into skb in the drivers right > now usually have C code to manually pull out the metadata (out of hw > desc) and put it into skb. > > So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for > XDP_PASS, we're doing a double amount of work: > skb_metadata_import_from_xdp first, then custom driver code second. > > In theory, maybe we should completely skip drivers custom parsing when > there is a prog with BPF_F_XDP_HAS_METADATA? > Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven > and won't require any mental work (plus, the drivers won't have to > care either in the future). > > WDYT? Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes sense to only use the hints (if ever written) from xdp prog especially if it will eventually support xdp prog changing some of the hints in the future. For now, I think either way is fine since they are the same and the xdp prog is sort of doing extra unnecessary work anyway by calling bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be changed now. > >>> + ); >>> + >>> + bpf_patch_resolve_jmp(patch); >>> +} >>> + >>> static int __init xdp_metadata_init(void) >>> { >>> + xdp_metadata_magic = get_random_u32() | 1; >>> return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); >>> } >>> late_initcall(xdp_metadata_init);
On 11/8/22 7:07 PM, Martin KaFai Lau wrote: > On 11/8/22 1:54 PM, Stanislav Fomichev wrote: >> On Mon, Nov 7, 2022 at 2:02 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: >>> >>> On 11/3/22 8:25 PM, Stanislav Fomichev wrote: >>>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >>>> index 59c9fd55699d..dba857f212d7 100644 >>>> --- a/include/linux/skbuff.h >>>> +++ b/include/linux/skbuff.h >>>> @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct >>>> sk_buff *skb_a, >>>> true : __skb_metadata_differs(skb_a, skb_b, len_a); >>>> } >>>> >>>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len); >>>> + >>>> static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) >>>> { >>>> skb_shinfo(skb)->meta_len = meta_len; >>>> + if (meta_len) >>>> + skb_metadata_import_from_xdp(skb, meta_len); >>>> } >>>> >>> [ ... ] >>> >>>> +struct xdp_to_skb_metadata { >>>> + u32 magic; /* xdp_metadata_magic */ >>>> + u64 rx_timestamp; >>>> +} __randomize_layout; >>>> + >>>> +struct bpf_patch; >>>> + >>> >>> [ ... ] >>> >>>> +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len) >>>> +{ >>>> + struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len); >>>> + >>>> + /* Optional SKB info, currently missing: >>>> + * - HW checksum info (skb->ip_summed) >>>> + * - HW RX hash (skb_set_hash) >>>> + * - RX ring dev queue index (skb_record_rx_queue) >>>> + */ >>>> + >>>> + if (len != sizeof(struct xdp_to_skb_metadata)) >>>> + return; >>>> + >>>> + if (meta->magic != xdp_metadata_magic) >>>> + return; >>>> + >>>> + if (meta->rx_timestamp) { >>>> + *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){ >>>> + .hwtstamp = ns_to_ktime(meta->rx_timestamp), >>>> + }; >>>> + } >>>> +} >>> >>> Considering the metadata will affect the gro, should the meta be cleared after >>> importing to the skb? >> >> Yeah, good suggestion, will clear it here. >> >>> [ ... ] >>> >>>> +/* Since we're not actually doing a call but instead rewriting >>>> + * in place, we can only afford to use R0-R5 scratch registers. >>>> + * >>>> + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual >>>> + * metadata kfuncs use only R0,R4-R5. >>>> + * >>>> + * The above also means we _cannot_ easily call any other helper/kfunc >>>> + * because there is no place for us to preserve our R1 argument; >>>> + * existing R6-R9 belong to the callee. >>>> + */ >>>> +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct >>>> bpf_patch *patch) >>>> +{ >>>> + u32 func_id; >>>> + >>>> + /* >>>> + * The code below generates the following: >>>> + * >>>> + * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx) >>>> + * { >>>> + * struct xdp_to_skb_metadata *meta; >>>> + * int ret; >>>> + * >>>> + * ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta)); >>>> + * if (!ret) >>>> + * return; >>>> + * >>>> + * meta = ctx->data_meta; >>>> + * meta->magic = xdp_metadata_magic; >>>> + * meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx); >>>> + * } >>>> + * >>>> + */ >>>> + >>>> + bpf_patch_append(patch, >>>> + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ >>>> + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, >>>> + offsetof(struct xdp_buff, data_meta)), >>>> + /* r3 = ((struct xdp_buff *)r1)->data; */ >>>> + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, >>>> + offsetof(struct xdp_buff, data)), >>>> + /* if (data_meta != data) return; >>>> + * >>>> + * data_meta > data: xdp_data_meta_unsupported() >>>> + * data_meta < data: already used, no need to touch >>>> + */ >>>> + BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX), >>>> + >>>> + /* r2 -= sizeof(struct xdp_to_skb_metadata); */ >>>> + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, >>>> + sizeof(struct xdp_to_skb_metadata)), >>>> + /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */ >>>> + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, >>>> + offsetof(struct xdp_buff, data_hard_start)), >>>> + /* r3 += sizeof(struct xdp_frame) */ >>>> + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, >>>> + sizeof(struct xdp_frame)), >>>> + /* if (data-sizeof(struct xdp_to_skb_metadata) < >>>> data_hard_start+sizeof(struct xdp_frame)) return; */ >>>> + BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX), >>>> + >>>> + /* ((struct xdp_buff *)r1)->data_meta = r2; */ >>>> + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, >>>> + offsetof(struct xdp_buff, data_meta)), >>>> + >>>> + /* *((struct xdp_to_skb_metadata *)r2)->magic = >>>> xdp_metadata_magic; */ >>>> + BPF_ST_MEM(BPF_W, BPF_REG_2, >>>> + offsetof(struct xdp_to_skb_metadata, magic), >>>> + xdp_metadata_magic), >>>> + ); >>>> + >>>> + /* r0 = bpf_xdp_metadata_rx_timestamp(ctx); */ >>>> + func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP); >>>> + prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch); >>>> + >>>> + bpf_patch_append(patch, >>>> + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ >>>> + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, >>>> + offsetof(struct xdp_buff, data_meta)), >>>> + /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */ >>>> + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, >>>> + offsetof(struct xdp_to_skb_metadata, rx_timestamp)), >>> >>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >>> sure it is solid enough by asking the xdp prog not to use the same random number >>> in its own metadata + not to change the metadata through xdp->data_meta after >>> calling bpf_xdp_metadata_export_to_skb(). >> >> What do you think the usecase here might be? Or are you suggesting we >> reject further access to data_meta after >> bpf_xdp_metadata_export_to_skb somehow? >> >> If we want to let the programs override some of this >> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >> more kfuncs instead of exposing the layout? >> >> bpf_xdp_metadata_export_to_skb(ctx); >> bpf_xdp_metadata_export_skb_hash(ctx, 1234); After re-reading patch 6, have another question. The 'void bpf_xdp_metadata_export_to_skb();' function signature. Should it at least return ok/err? or even return a 'struct xdp_to_skb_metadata *' pointer and the xdp prog can directly read (or even write) it? A related question, why 'struct xdp_to_skb_metadata' needs __randomize_layout? > > > I can't think of a use case now for the xdp prog to use the xdp_to_skb_metadata > while the xdp prog can directly call the kfunc (eg > bpf_xdp_metadata_rx_timestamp) to get individual hint. I was asking if patch 7 > is an actual use case because it does test the tstamp in XDP_PASS or it is > mostly for selftest purpose? Yeah, may be the xdp prog will be able to change > the xdp_to_skb_metadata eventually but that is for later. > > My concern is the xdp prog is allowed to change xdp_to_skb_metadata or > by-coincident writing metadata that matches the random and the sizeof(struct > xdp_to_skb_metadata). > > Also, the added opacity of xdp_to_skb_metadata (__randomize_layout + random int) > is trying very hard to hide it from xdp prog. Instead, would it be cleaner to > have a flag in xdp->flags (to be set by bpf_xdp_metadata_export_to_skb?) to > guard this, like one of Jesper's patch. The xdp_convert_ctx_access() and > bpf_xdp_adjust_meta() can check this bit to ensure the xdp_to_skb_metadata > cannot be read and no metadata can be added/deleted after that. btw, is it > possible to keep both xdp_to_skb_metadata and the xdp_prog's metadata? After > skb_metadata_import_from_xdp popping the xdp_to_skb_metadata, the remaining > xdp_prog's metatdata can still be used by the bpf-tc. > >> ... >> >>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the >>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? >> >> XDP_PASS cases where we convert xdp_buff into skb in the drivers right >> now usually have C code to manually pull out the metadata (out of hw >> desc) and put it into skb. >> >> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for >> XDP_PASS, we're doing a double amount of work: >> skb_metadata_import_from_xdp first, then custom driver code second. >> >> In theory, maybe we should completely skip drivers custom parsing when >> there is a prog with BPF_F_XDP_HAS_METADATA? >> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven >> and won't require any mental work (plus, the drivers won't have to >> care either in the future). >> > WDYT? > > > Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes > sense to only use the hints (if ever written) from xdp prog especially if it > will eventually support xdp prog changing some of the hints in the future. For > now, I think either way is fine since they are the same and the xdp prog is sort > of doing extra unnecessary work anyway by calling > bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be > changed now. > > >> >>>> + ); >>>> + >>>> + bpf_patch_resolve_jmp(patch); >>>> +} >>>> + >>>> static int __init xdp_metadata_init(void) >>>> { >>>> + xdp_metadata_magic = get_random_u32() | 1; >>>> return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, >>>> &xdp_metadata_kfunc_set); >>>> } >>>> late_initcall(xdp_metadata_init); >
Snipping a bit of context to reply to this bit: >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >>>> sure it is solid enough by asking the xdp prog not to use the same random number >>>> in its own metadata + not to change the metadata through xdp->data_meta after >>>> calling bpf_xdp_metadata_export_to_skb(). >>> >>> What do you think the usecase here might be? Or are you suggesting we >>> reject further access to data_meta after >>> bpf_xdp_metadata_export_to_skb somehow? >>> >>> If we want to let the programs override some of this >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >>> more kfuncs instead of exposing the layout? >>> >>> bpf_xdp_metadata_export_to_skb(ctx); >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); There are several use cases for needing to access the metadata after calling bpf_xdp_metdata_export_to_skb(): - Accessing the metadata after redirect (in a cpumap or devmap program, or on a veth device) - Transferring the packet+metadata to AF_XDP - Returning XDP_PASS, but accessing some of the metadata first (whether to read or change it) The last one could be solved by calling additional kfuncs, but that would be less efficient than just directly editing the struct which will be cache-hot after the helper returns. And yeah, this will allow the XDP program to inject arbitrary metadata into the netstack; but it can already inject arbitrary *packet* data into the stack, so not sure if this is much of an additional risk? If it does lead to trivial crashes, we should probably harden the stack against that? As for the random number, Jesper and I discussed replacing this with the same BTF-ID scheme that he was using in his patch series. I.e., instead of just putting in a random number, we insert the BTF ID of the metadata struct at the end of it. This will allow us to support multiple different formats in the future (not just changing the layout, but having multiple simultaneous formats in the same kernel image), in case we run out of space. We should probably also have a flag set on the xdp_frame so the stack knows that the metadata area contains relevant-to-skb data, to guard against an XDP program accidentally hitting the "magic number" (BTF_ID) in unrelated stuff it puts into the metadata area. > After re-reading patch 6, have another question. The 'void > bpf_xdp_metadata_export_to_skb();' function signature. Should it at > least return ok/err? or even return a 'struct xdp_to_skb_metadata *' > pointer and the xdp prog can directly read (or even write) it? Hmm, I'm not sure returning a failure makes sense? Failure to read one or more fields just means that those fields will not be populated? We should probably have a flags field inside the metadata struct itself to indicate which fields are set or not, but I'm not sure returning an error value adds anything? Returning a pointer to the metadata field might be convenient for users (it would just be an alias to the data_meta pointer, but the verifier could know its size, so the program doesn't have to bounds check it). > A related question, why 'struct xdp_to_skb_metadata' needs > __randomize_layout? The __randomize_layout thing is there to force BPF programs to use CO-RE to access the field. This is to avoid the struct layout accidentally ossifying because people in practice rely on a particular layout, even though we tell them to use CO-RE. There are lots of examples of this happening in other domains (IP header options, TCP options, etc), and __randomize_layout seemed like a neat trick to enforce CO-RE usage :) >>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the >>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? >>> >>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right >>> now usually have C code to manually pull out the metadata (out of hw >>> desc) and put it into skb. >>> >>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for >>> XDP_PASS, we're doing a double amount of work: >>> skb_metadata_import_from_xdp first, then custom driver code second. >>> >>> In theory, maybe we should completely skip drivers custom parsing when >>> there is a prog with BPF_F_XDP_HAS_METADATA? >>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven >>> and won't require any mental work (plus, the drivers won't have to >>> care either in the future). >>> > WDYT? >> >> >> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes >> sense to only use the hints (if ever written) from xdp prog especially if it >> will eventually support xdp prog changing some of the hints in the future. For >> now, I think either way is fine since they are the same and the xdp prog is sort >> of doing extra unnecessary work anyway by calling >> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be >> changed now. I agree it would be best if the drivers also use the XDP metadata (if present) on XDP_PASS. Longer term my hope is we can make the XDP metadata support the only thing drivers need to implement (i.e., have the stack call into that code even when no XDP program is loaded), but for now just for consistency (and allowing the XDP program to update the metadata), we should probably at least consume it on XDP_PASS. -Toke
On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: > Snipping a bit of context to reply to this bit: > >>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >>>>> sure it is solid enough by asking the xdp prog not to use the same random number >>>>> in its own metadata + not to change the metadata through xdp->data_meta after >>>>> calling bpf_xdp_metadata_export_to_skb(). >>>> >>>> What do you think the usecase here might be? Or are you suggesting we >>>> reject further access to data_meta after >>>> bpf_xdp_metadata_export_to_skb somehow? >>>> >>>> If we want to let the programs override some of this >>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >>>> more kfuncs instead of exposing the layout? >>>> >>>> bpf_xdp_metadata_export_to_skb(ctx); >>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); > > There are several use cases for needing to access the metadata after > calling bpf_xdp_metdata_export_to_skb(): > > - Accessing the metadata after redirect (in a cpumap or devmap program, > or on a veth device) > - Transferring the packet+metadata to AF_XDP fwiw, the xdp prog could also be more selective and only stores one of the hints instead of the whole 'struct xdp_to_skb_metadata'. > - Returning XDP_PASS, but accessing some of the metadata first (whether > to read or change it) > > The last one could be solved by calling additional kfuncs, but that > would be less efficient than just directly editing the struct which > will be cache-hot after the helper returns. Yeah, it is more efficient to directly write if possible. I think this set allows the direct reading and writing already through data_meta (as a _u8 *). > > And yeah, this will allow the XDP program to inject arbitrary metadata > into the netstack; but it can already inject arbitrary *packet* data > into the stack, so not sure if this is much of an additional risk? If it > does lead to trivial crashes, we should probably harden the stack > against that? > > As for the random number, Jesper and I discussed replacing this with the > same BTF-ID scheme that he was using in his patch series. I.e., instead > of just putting in a random number, we insert the BTF ID of the metadata > struct at the end of it. This will allow us to support multiple > different formats in the future (not just changing the layout, but > having multiple simultaneous formats in the same kernel image), in case > we run out of space. This seems a bit hypothetical. How much headroom does it usually have for the xdp prog? Potentially the hints can use all the remaining space left after the header encap and the current bpf_xdp_adjust_meta() usage? > > We should probably also have a flag set on the xdp_frame so the stack > knows that the metadata area contains relevant-to-skb data, to guard > against an XDP program accidentally hitting the "magic number" (BTF_ID) > in unrelated stuff it puts into the metadata area. Yeah, I think having a flag is useful. The flag will be set at xdp_buff and then transfer to the xdp_frame? > >> After re-reading patch 6, have another question. The 'void >> bpf_xdp_metadata_export_to_skb();' function signature. Should it at >> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' >> pointer and the xdp prog can directly read (or even write) it? > > Hmm, I'm not sure returning a failure makes sense? Failure to read one > or more fields just means that those fields will not be populated? We > should probably have a flags field inside the metadata struct itself to > indicate which fields are set or not, but I'm not sure returning an > error value adds anything? Returning a pointer to the metadata field > might be convenient for users (it would just be an alias to the > data_meta pointer, but the verifier could know its size, so the program > doesn't have to bounds check it). If some hints are not available, those hints should be initialized to 0/CHECKSUM_NONE/...etc. The xdp prog needs a direct way to tell hard failure when it cannot write the meta area because of not enough space. Comparing xdp->data_meta with xdp->data as a side effect is not intuitive. It is more than saving the bound check. With type info of 'struct xdp_to_skb_metadata *', the verifier can do more checks like reading in the middle of an integer member. The verifier could also limit write access only to a few struct's members if it is needed. The returning 'struct xdp_to_skb_metadata *' should not be an alias to the xdp->data_meta. They should actually point to different locations in the headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. xdp->data_meta won't be changed and keeps pointing to the last bpf_xdp_adjust_meta() location. The kernel will know if there is xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the xdp_{buff,frame}. Would it work? > >> A related question, why 'struct xdp_to_skb_metadata' needs >> __randomize_layout? > > The __randomize_layout thing is there to force BPF programs to use CO-RE > to access the field. This is to avoid the struct layout accidentally > ossifying because people in practice rely on a particular layout, even > though we tell them to use CO-RE. There are lots of examples of this > happening in other domains (IP header options, TCP options, etc), and > __randomize_layout seemed like a neat trick to enforce CO-RE usage :) I am not sure if it is necessary or helpful to only enforce __randomize_layout in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and non tracing) that already have direct access (reading and/or writing) to other kernel structures. It is more important for the verifier to see the xdp prog accessing it as a 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so that the verifier can enforce the rules of access. > >>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the >>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? >>>> >>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right >>>> now usually have C code to manually pull out the metadata (out of hw >>>> desc) and put it into skb. >>>> >>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for >>>> XDP_PASS, we're doing a double amount of work: >>>> skb_metadata_import_from_xdp first, then custom driver code second. >>>> >>>> In theory, maybe we should completely skip drivers custom parsing when >>>> there is a prog with BPF_F_XDP_HAS_METADATA? >>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven >>>> and won't require any mental work (plus, the drivers won't have to >>>> care either in the future). >>>> > WDYT? >>> >>> >>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes >>> sense to only use the hints (if ever written) from xdp prog especially if it >>> will eventually support xdp prog changing some of the hints in the future. For >>> now, I think either way is fine since they are the same and the xdp prog is sort >>> of doing extra unnecessary work anyway by calling >>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be >>> changed now. > > I agree it would be best if the drivers also use the XDP metadata (if > present) on XDP_PASS. Longer term my hope is we can make the XDP > metadata support the only thing drivers need to implement (i.e., have > the stack call into that code even when no XDP program is loaded), but > for now just for consistency (and allowing the XDP program to update the > metadata), we should probably at least consume it on XDP_PASS. > > -Toke >
On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: > > Snipping a bit of context to reply to this bit: > > > >>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not > >>>>> sure it is solid enough by asking the xdp prog not to use the same random number > >>>>> in its own metadata + not to change the metadata through xdp->data_meta after > >>>>> calling bpf_xdp_metadata_export_to_skb(). > >>>> > >>>> What do you think the usecase here might be? Or are you suggesting we > >>>> reject further access to data_meta after > >>>> bpf_xdp_metadata_export_to_skb somehow? > >>>> > >>>> If we want to let the programs override some of this > >>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add > >>>> more kfuncs instead of exposing the layout? > >>>> > >>>> bpf_xdp_metadata_export_to_skb(ctx); > >>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); > > > > There are several use cases for needing to access the metadata after > > calling bpf_xdp_metdata_export_to_skb(): > > > > - Accessing the metadata after redirect (in a cpumap or devmap program, > > or on a veth device) > > - Transferring the packet+metadata to AF_XDP > fwiw, the xdp prog could also be more selective and only stores one of the hints > instead of the whole 'struct xdp_to_skb_metadata'. > > > - Returning XDP_PASS, but accessing some of the metadata first (whether > > to read or change it) > > > > The last one could be solved by calling additional kfuncs, but that > > would be less efficient than just directly editing the struct which > > will be cache-hot after the helper returns. > > Yeah, it is more efficient to directly write if possible. I think this set > allows the direct reading and writing already through data_meta (as a _u8 *). > > > > > And yeah, this will allow the XDP program to inject arbitrary metadata > > into the netstack; but it can already inject arbitrary *packet* data > > into the stack, so not sure if this is much of an additional risk? If it > > does lead to trivial crashes, we should probably harden the stack > > against that? > > > > As for the random number, Jesper and I discussed replacing this with the > > same BTF-ID scheme that he was using in his patch series. I.e., instead > > of just putting in a random number, we insert the BTF ID of the metadata > > struct at the end of it. This will allow us to support multiple > > different formats in the future (not just changing the layout, but > > having multiple simultaneous formats in the same kernel image), in case > > we run out of space. > > This seems a bit hypothetical. How much headroom does it usually have for the > xdp prog? Potentially the hints can use all the remaining space left after the > header encap and the current bpf_xdp_adjust_meta() usage? > > > > > We should probably also have a flag set on the xdp_frame so the stack > > knows that the metadata area contains relevant-to-skb data, to guard > > against an XDP program accidentally hitting the "magic number" (BTF_ID) > > in unrelated stuff it puts into the metadata area. > > Yeah, I think having a flag is useful. The flag will be set at xdp_buff and > then transfer to the xdp_frame? > > > > >> After re-reading patch 6, have another question. The 'void > >> bpf_xdp_metadata_export_to_skb();' function signature. Should it at > >> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' > >> pointer and the xdp prog can directly read (or even write) it? > > > > Hmm, I'm not sure returning a failure makes sense? Failure to read one > > or more fields just means that those fields will not be populated? We > > should probably have a flags field inside the metadata struct itself to > > indicate which fields are set or not, but I'm not sure returning an > > error value adds anything? Returning a pointer to the metadata field > > might be convenient for users (it would just be an alias to the > > data_meta pointer, but the verifier could know its size, so the program > > doesn't have to bounds check it). > > If some hints are not available, those hints should be initialized to > 0/CHECKSUM_NONE/...etc. The xdp prog needs a direct way to tell hard failure > when it cannot write the meta area because of not enough space. Comparing > xdp->data_meta with xdp->data as a side effect is not intuitive. > > It is more than saving the bound check. With type info of 'struct > xdp_to_skb_metadata *', the verifier can do more checks like reading in the > middle of an integer member. The verifier could also limit write access only to > a few struct's members if it is needed. > > The returning 'struct xdp_to_skb_metadata *' should not be an alias to the > xdp->data_meta. They should actually point to different locations in the > headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. > xdp->data_meta won't be changed and keeps pointing to the last > bpf_xdp_adjust_meta() location. The kernel will know if there is > xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the > xdp_{buff,frame}. Would it work? > > > > >> A related question, why 'struct xdp_to_skb_metadata' needs > >> __randomize_layout? > > > > The __randomize_layout thing is there to force BPF programs to use CO-RE > > to access the field. This is to avoid the struct layout accidentally > > ossifying because people in practice rely on a particular layout, even > > though we tell them to use CO-RE. There are lots of examples of this > > happening in other domains (IP header options, TCP options, etc), and > > __randomize_layout seemed like a neat trick to enforce CO-RE usage :) > > I am not sure if it is necessary or helpful to only enforce __randomize_layout > in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and > non tracing) that already have direct access (reading and/or writing) to other > kernel structures. > > It is more important for the verifier to see the xdp prog accessing it as a > 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so > that the verifier can enforce the rules of access. > > > > >>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the > >>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? > >>>> > >>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right > >>>> now usually have C code to manually pull out the metadata (out of hw > >>>> desc) and put it into skb. > >>>> > >>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for > >>>> XDP_PASS, we're doing a double amount of work: > >>>> skb_metadata_import_from_xdp first, then custom driver code second. > >>>> > >>>> In theory, maybe we should completely skip drivers custom parsing when > >>>> there is a prog with BPF_F_XDP_HAS_METADATA? > >>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven > >>>> and won't require any mental work (plus, the drivers won't have to > >>>> care either in the future). > >>>> > WDYT? > >>> > >>> > >>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes > >>> sense to only use the hints (if ever written) from xdp prog especially if it > >>> will eventually support xdp prog changing some of the hints in the future. For > >>> now, I think either way is fine since they are the same and the xdp prog is sort > >>> of doing extra unnecessary work anyway by calling > >>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be > >>> changed now. > > > > I agree it would be best if the drivers also use the XDP metadata (if > > present) on XDP_PASS. Longer term my hope is we can make the XDP > > metadata support the only thing drivers need to implement (i.e., have > > the stack call into that code even when no XDP program is loaded), but > > for now just for consistency (and allowing the XDP program to update the > > metadata), we should probably at least consume it on XDP_PASS. > > > > -Toke > > Not to derail the discussion (left the last message intact on top, feel free to continue), but to summarize. The proposed changes seem to be: 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct xdp_to_skb_metadata" - This should let bpf programs change the metadata passed to the skb 2. "struct xdp_to_skb_metadata" should have its btf_id as the first __u32 member (and remove the magic) - This is for the redirect case where the end users, including AF_XDP, can parse this metadata from btf_id - This, however, is not all the metadata that the device can support, but a much narrower set that the kernel is expected to use for skb construction 3. __randomize_layout isn't really helping, CO-RE will trigger regardless; maybe only the case where it matters is probably AF_XDP, so still useful? 4. The presence of the metadata generated by bpf_xdp_metadata_export_to_skb should be indicated by a flag in xdp_{buff,frame}->flags - Assuming exposing it via xdp_md->has_skb_metadata is ok? - Since the programs probably need to do the following: if (xdp_md->has_skb_metadata) { access/change skb metadata by doing struct xdp_to_skb_metadata *p = data_meta; } else { use kfuncs } 5. Support the case where we keep program's metadata and kernel's xdp_to_skb_metadata - skb_metadata_import_from_xdp() will "consume" it by mem-moving the rest of the metadata over it and adjusting the headroom I think the above solves all the cases Toke points to? a) Accessing the metadata after redirect (in a cpumap or devmap program, or on a veth device) - only a small xdp_to_skb_metadata subset will work out of the box iff the redirecttor calls bpf_xdp_metadata_export_to_skb; for the rest the progs will have to agree on the layout, right? b) Transferring the packet+metadata to AF_XDP - here, again, the AF_XDP consumer will have to either expect xdp_to_skb_metadata with a smaller set of skb-related metadata, or will have to make sure the producer builds a custom layout using kfuncs; there is also no flag to indicate whether xdp_to_skb_metadata is there or not; the consumer will have to test btf_id at the right offset c) Returning XDP_PASS, but accessing some of the metadata first (whether to read or change it) - can read via kfuncs, can change via bpf_xdp_metadata_export_to_skb(); m->xyz=abc; Anything I'm missing?
On 11/9/22 1:33 PM, Stanislav Fomichev wrote: > On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: >>> Snipping a bit of context to reply to this bit: >>> >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after >>>>>>> calling bpf_xdp_metadata_export_to_skb(). >>>>>> >>>>>> What do you think the usecase here might be? Or are you suggesting we >>>>>> reject further access to data_meta after >>>>>> bpf_xdp_metadata_export_to_skb somehow? >>>>>> >>>>>> If we want to let the programs override some of this >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >>>>>> more kfuncs instead of exposing the layout? >>>>>> >>>>>> bpf_xdp_metadata_export_to_skb(ctx); >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); >>> >>> There are several use cases for needing to access the metadata after >>> calling bpf_xdp_metdata_export_to_skb(): >>> >>> - Accessing the metadata after redirect (in a cpumap or devmap program, >>> or on a veth device) >>> - Transferring the packet+metadata to AF_XDP >> fwiw, the xdp prog could also be more selective and only stores one of the hints >> instead of the whole 'struct xdp_to_skb_metadata'. >> >>> - Returning XDP_PASS, but accessing some of the metadata first (whether >>> to read or change it) >>> >>> The last one could be solved by calling additional kfuncs, but that >>> would be less efficient than just directly editing the struct which >>> will be cache-hot after the helper returns. >> >> Yeah, it is more efficient to directly write if possible. I think this set >> allows the direct reading and writing already through data_meta (as a _u8 *). >> >>> >>> And yeah, this will allow the XDP program to inject arbitrary metadata >>> into the netstack; but it can already inject arbitrary *packet* data >>> into the stack, so not sure if this is much of an additional risk? If it >>> does lead to trivial crashes, we should probably harden the stack >>> against that? >>> >>> As for the random number, Jesper and I discussed replacing this with the >>> same BTF-ID scheme that he was using in his patch series. I.e., instead >>> of just putting in a random number, we insert the BTF ID of the metadata >>> struct at the end of it. This will allow us to support multiple >>> different formats in the future (not just changing the layout, but >>> having multiple simultaneous formats in the same kernel image), in case >>> we run out of space. >> >> This seems a bit hypothetical. How much headroom does it usually have for the >> xdp prog? Potentially the hints can use all the remaining space left after the >> header encap and the current bpf_xdp_adjust_meta() usage? >> >>> >>> We should probably also have a flag set on the xdp_frame so the stack >>> knows that the metadata area contains relevant-to-skb data, to guard >>> against an XDP program accidentally hitting the "magic number" (BTF_ID) >>> in unrelated stuff it puts into the metadata area. >> >> Yeah, I think having a flag is useful. The flag will be set at xdp_buff and >> then transfer to the xdp_frame? >> >>> >>>> After re-reading patch 6, have another question. The 'void >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' >>>> pointer and the xdp prog can directly read (or even write) it? >>> >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one >>> or more fields just means that those fields will not be populated? We >>> should probably have a flags field inside the metadata struct itself to >>> indicate which fields are set or not, but I'm not sure returning an >>> error value adds anything? Returning a pointer to the metadata field >>> might be convenient for users (it would just be an alias to the >>> data_meta pointer, but the verifier could know its size, so the program >>> doesn't have to bounds check it). >> >> If some hints are not available, those hints should be initialized to >> 0/CHECKSUM_NONE/...etc. The xdp prog needs a direct way to tell hard failure >> when it cannot write the meta area because of not enough space. Comparing >> xdp->data_meta with xdp->data as a side effect is not intuitive. >> >> It is more than saving the bound check. With type info of 'struct >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the >> middle of an integer member. The verifier could also limit write access only to >> a few struct's members if it is needed. >> >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the >> xdp->data_meta. They should actually point to different locations in the >> headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. >> xdp->data_meta won't be changed and keeps pointing to the last >> bpf_xdp_adjust_meta() location. The kernel will know if there is >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the >> xdp_{buff,frame}. Would it work? >> >>> >>>> A related question, why 'struct xdp_to_skb_metadata' needs >>>> __randomize_layout? >>> >>> The __randomize_layout thing is there to force BPF programs to use CO-RE >>> to access the field. This is to avoid the struct layout accidentally >>> ossifying because people in practice rely on a particular layout, even >>> though we tell them to use CO-RE. There are lots of examples of this >>> happening in other domains (IP header options, TCP options, etc), and >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :) >> >> I am not sure if it is necessary or helpful to only enforce __randomize_layout >> in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and >> non tracing) that already have direct access (reading and/or writing) to other >> kernel structures. >> >> It is more important for the verifier to see the xdp prog accessing it as a >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so >> that the verifier can enforce the rules of access. >> >>> >>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the >>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? >>>>>> >>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right >>>>>> now usually have C code to manually pull out the metadata (out of hw >>>>>> desc) and put it into skb. >>>>>> >>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for >>>>>> XDP_PASS, we're doing a double amount of work: >>>>>> skb_metadata_import_from_xdp first, then custom driver code second. >>>>>> >>>>>> In theory, maybe we should completely skip drivers custom parsing when >>>>>> there is a prog with BPF_F_XDP_HAS_METADATA? >>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven >>>>>> and won't require any mental work (plus, the drivers won't have to >>>>>> care either in the future). >>>>>> > WDYT? >>>>> >>>>> >>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes >>>>> sense to only use the hints (if ever written) from xdp prog especially if it >>>>> will eventually support xdp prog changing some of the hints in the future. For >>>>> now, I think either way is fine since they are the same and the xdp prog is sort >>>>> of doing extra unnecessary work anyway by calling >>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be >>>>> changed now. >>> >>> I agree it would be best if the drivers also use the XDP metadata (if >>> present) on XDP_PASS. Longer term my hope is we can make the XDP >>> metadata support the only thing drivers need to implement (i.e., have >>> the stack call into that code even when no XDP program is loaded), but >>> for now just for consistency (and allowing the XDP program to update the >>> metadata), we should probably at least consume it on XDP_PASS. >>> >>> -Toke >>> > > Not to derail the discussion (left the last message intact on top, > feel free to continue), but to summarize. The proposed changes seem to > be: > > 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct > xdp_to_skb_metadata" > - This should let bpf programs change the metadata passed to the skb > > 2. "struct xdp_to_skb_metadata" should have its btf_id as the first > __u32 member (and remove the magic) > - This is for the redirect case where the end users, including > AF_XDP, can parse this metadata from btf_id I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata. I can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP rx_desc currently cannot tell if there is metadata written by the xdp prog or not. However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP rx_desc->options, the btf_id may as well be not needed now. However, the btf_id and other future new members can be added to the xdp_to_skb_metadata later if there is a need. For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to the xdp_to_skb_metadata. The xdp prog will use CO-RE to access the members in xdp_to_skb_metadata. > - This, however, is not all the metadata that the device can > support, but a much narrower set that the kernel is expected to use > for skb construction > > 3. __randomize_layout isn't really helping, CO-RE will trigger > regardless; maybe only the case where it matters is probably AF_XDP, > so still useful? > > 4. The presence of the metadata generated by > bpf_xdp_metadata_export_to_skb should be indicated by a flag in > xdp_{buff,frame}->flags > - Assuming exposing it via xdp_md->has_skb_metadata is ok? probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type will be PTR_TO_BTF_ID_OR_NULL. > - Since the programs probably need to do the following: > > if (xdp_md->has_skb_metadata) { > access/change skb metadata by doing struct xdp_to_skb_metadata *p > = data_meta; and directly access/change xdp->skb_metadata instead of using xdp->data_meta. > } else { > use kfuncs > } > > 5. Support the case where we keep program's metadata and kernel's > xdp_to_skb_metadata > - skb_metadata_import_from_xdp() will "consume" it by mem-moving the > rest of the metadata over it and adjusting the headroom I was thinking the kernel's xdp_to_skb_metadata is always before the program's metadata. xdp prog should usually work in this order also: read/write headers, write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. For the kernel and xdp prog, I don't think it matters where the xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to be before xdp->data because of the current data_meta and data comparison usage in the xdp prog. The order of the kernel's xdp_to_skb_metadata and the program's metadata probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP supports the program's metadata now. afaict, it can only work now if there is some sort of contract between them or the AF_XDP currently does not use the program's metadata. Either way, we can do the mem-moving only for AF_XDP and it should be a no op if there is no program's metadata? This behavior could also be configurable through setsockopt? Thanks for the summary! > > > I think the above solves all the cases Toke points to? > > a) Accessing the metadata after redirect (in a cpumap or devmap > program, or on a veth device) > - only a small xdp_to_skb_metadata subset will work out of the box > iff the redirecttor calls bpf_xdp_metadata_export_to_skb; for the rest > the progs will have to agree on the layout, right? > > b) Transferring the packet+metadata to AF_XDP > - here, again, the AF_XDP consumer will have to either expect > xdp_to_skb_metadata with a smaller set of skb-related metadata, or > will have to make sure the producer builds a custom layout using > kfuncs; there is also no flag to indicate whether xdp_to_skb_metadata > is there or not; the consumer will have to test btf_id at the right > offset > > c) Returning XDP_PASS, but accessing some of the metadata first > (whether to read or change it) > - can read via kfuncs, can change via > bpf_xdp_metadata_export_to_skb(); m->xyz=abc; > > Anything I'm missing?
On Wed, Nov 9, 2022 at 4:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 11/9/22 1:33 PM, Stanislav Fomichev wrote: > > On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: > >> > >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: > >>> Snipping a bit of context to reply to this bit: > >>> > >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not > >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number > >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after > >>>>>>> calling bpf_xdp_metadata_export_to_skb(). > >>>>>> > >>>>>> What do you think the usecase here might be? Or are you suggesting we > >>>>>> reject further access to data_meta after > >>>>>> bpf_xdp_metadata_export_to_skb somehow? > >>>>>> > >>>>>> If we want to let the programs override some of this > >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add > >>>>>> more kfuncs instead of exposing the layout? > >>>>>> > >>>>>> bpf_xdp_metadata_export_to_skb(ctx); > >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); > >>> > >>> There are several use cases for needing to access the metadata after > >>> calling bpf_xdp_metdata_export_to_skb(): > >>> > >>> - Accessing the metadata after redirect (in a cpumap or devmap program, > >>> or on a veth device) > >>> - Transferring the packet+metadata to AF_XDP > >> fwiw, the xdp prog could also be more selective and only stores one of the hints > >> instead of the whole 'struct xdp_to_skb_metadata'. > >> > >>> - Returning XDP_PASS, but accessing some of the metadata first (whether > >>> to read or change it) > >>> > >>> The last one could be solved by calling additional kfuncs, but that > >>> would be less efficient than just directly editing the struct which > >>> will be cache-hot after the helper returns. > >> > >> Yeah, it is more efficient to directly write if possible. I think this set > >> allows the direct reading and writing already through data_meta (as a _u8 *). > >> > >>> > >>> And yeah, this will allow the XDP program to inject arbitrary metadata > >>> into the netstack; but it can already inject arbitrary *packet* data > >>> into the stack, so not sure if this is much of an additional risk? If it > >>> does lead to trivial crashes, we should probably harden the stack > >>> against that? > >>> > >>> As for the random number, Jesper and I discussed replacing this with the > >>> same BTF-ID scheme that he was using in his patch series. I.e., instead > >>> of just putting in a random number, we insert the BTF ID of the metadata > >>> struct at the end of it. This will allow us to support multiple > >>> different formats in the future (not just changing the layout, but > >>> having multiple simultaneous formats in the same kernel image), in case > >>> we run out of space. > >> > >> This seems a bit hypothetical. How much headroom does it usually have for the > >> xdp prog? Potentially the hints can use all the remaining space left after the > >> header encap and the current bpf_xdp_adjust_meta() usage? > >> > >>> > >>> We should probably also have a flag set on the xdp_frame so the stack > >>> knows that the metadata area contains relevant-to-skb data, to guard > >>> against an XDP program accidentally hitting the "magic number" (BTF_ID) > >>> in unrelated stuff it puts into the metadata area. > >> > >> Yeah, I think having a flag is useful. The flag will be set at xdp_buff and > >> then transfer to the xdp_frame? > >> > >>> > >>>> After re-reading patch 6, have another question. The 'void > >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at > >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' > >>>> pointer and the xdp prog can directly read (or even write) it? > >>> > >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one > >>> or more fields just means that those fields will not be populated? We > >>> should probably have a flags field inside the metadata struct itself to > >>> indicate which fields are set or not, but I'm not sure returning an > >>> error value adds anything? Returning a pointer to the metadata field > >>> might be convenient for users (it would just be an alias to the > >>> data_meta pointer, but the verifier could know its size, so the program > >>> doesn't have to bounds check it). > >> > >> If some hints are not available, those hints should be initialized to > >> 0/CHECKSUM_NONE/...etc. The xdp prog needs a direct way to tell hard failure > >> when it cannot write the meta area because of not enough space. Comparing > >> xdp->data_meta with xdp->data as a side effect is not intuitive. > >> > >> It is more than saving the bound check. With type info of 'struct > >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the > >> middle of an integer member. The verifier could also limit write access only to > >> a few struct's members if it is needed. > >> > >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the > >> xdp->data_meta. They should actually point to different locations in the > >> headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. > >> xdp->data_meta won't be changed and keeps pointing to the last > >> bpf_xdp_adjust_meta() location. The kernel will know if there is > >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the > >> xdp_{buff,frame}. Would it work? > >> > >>> > >>>> A related question, why 'struct xdp_to_skb_metadata' needs > >>>> __randomize_layout? > >>> > >>> The __randomize_layout thing is there to force BPF programs to use CO-RE > >>> to access the field. This is to avoid the struct layout accidentally > >>> ossifying because people in practice rely on a particular layout, even > >>> though we tell them to use CO-RE. There are lots of examples of this > >>> happening in other domains (IP header options, TCP options, etc), and > >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :) > >> > >> I am not sure if it is necessary or helpful to only enforce __randomize_layout > >> in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and > >> non tracing) that already have direct access (reading and/or writing) to other > >> kernel structures. > >> > >> It is more important for the verifier to see the xdp prog accessing it as a > >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so > >> that the verifier can enforce the rules of access. > >> > >>> > >>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the > >>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? > >>>>>> > >>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right > >>>>>> now usually have C code to manually pull out the metadata (out of hw > >>>>>> desc) and put it into skb. > >>>>>> > >>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for > >>>>>> XDP_PASS, we're doing a double amount of work: > >>>>>> skb_metadata_import_from_xdp first, then custom driver code second. > >>>>>> > >>>>>> In theory, maybe we should completely skip drivers custom parsing when > >>>>>> there is a prog with BPF_F_XDP_HAS_METADATA? > >>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven > >>>>>> and won't require any mental work (plus, the drivers won't have to > >>>>>> care either in the future). > >>>>>> > WDYT? > >>>>> > >>>>> > >>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes > >>>>> sense to only use the hints (if ever written) from xdp prog especially if it > >>>>> will eventually support xdp prog changing some of the hints in the future. For > >>>>> now, I think either way is fine since they are the same and the xdp prog is sort > >>>>> of doing extra unnecessary work anyway by calling > >>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be > >>>>> changed now. > >>> > >>> I agree it would be best if the drivers also use the XDP metadata (if > >>> present) on XDP_PASS. Longer term my hope is we can make the XDP > >>> metadata support the only thing drivers need to implement (i.e., have > >>> the stack call into that code even when no XDP program is loaded), but > >>> for now just for consistency (and allowing the XDP program to update the > >>> metadata), we should probably at least consume it on XDP_PASS. > >>> > >>> -Toke > >>> > > > > Not to derail the discussion (left the last message intact on top, > > feel free to continue), but to summarize. The proposed changes seem to > > be: > > > > 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct > > xdp_to_skb_metadata" > > - This should let bpf programs change the metadata passed to the skb > > > > 2. "struct xdp_to_skb_metadata" should have its btf_id as the first > > __u32 member (and remove the magic) > > - This is for the redirect case where the end users, including > > AF_XDP, can parse this metadata from btf_id > > I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata. I > can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP > rx_desc currently cannot tell if there is metadata written by the xdp prog or > not. However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP > rx_desc->options, the btf_id may as well be not needed now. However, the btf_id > and other future new members can be added to the xdp_to_skb_metadata later if > there is a need. > > For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to > the xdp_to_skb_metadata. The xdp prog will use CO-RE to access the members in > xdp_to_skb_metadata. Ack, good points on putting it at the end. Regarding bit in desc->options vs btf_id: since it seems that btf_id is useful anyway, let's start with that? We can add a bit later on if it turns out using metadata is problematic otherwise. > > - This, however, is not all the metadata that the device can > > support, but a much narrower set that the kernel is expected to use > > for skb construction > > > > 3. __randomize_layout isn't really helping, CO-RE will trigger > > regardless; maybe only the case where it matters is probably AF_XDP, > > so still useful? > > > > 4. The presence of the metadata generated by > > bpf_xdp_metadata_export_to_skb should be indicated by a flag in > > xdp_{buff,frame}->flags > > - Assuming exposing it via xdp_md->has_skb_metadata is ok? > > probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type > will be PTR_TO_BTF_ID_OR_NULL. Oh, that seems even better than returning it from bpf_xdp_metadata_export_to_skb. bpf_xdp_metadata_export_to_skb can return true/false and the rest goes via default verifier ctx resolution mechanism.. (returning ptr from a kfunc seems to be a bit complicated right now) > > - Since the programs probably need to do the following: > > > > if (xdp_md->has_skb_metadata) { > > access/change skb metadata by doing struct xdp_to_skb_metadata *p > > = data_meta; > > and directly access/change xdp->skb_metadata instead of using xdp->data_meta. Ack. > > } else { > > use kfuncs > > } > > > > 5. Support the case where we keep program's metadata and kernel's > > xdp_to_skb_metadata > > - skb_metadata_import_from_xdp() will "consume" it by mem-moving the > > rest of the metadata over it and adjusting the headroom > > I was thinking the kernel's xdp_to_skb_metadata is always before the program's > metadata. xdp prog should usually work in this order also: read/write headers, > write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return > XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the > xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. > > For the kernel and xdp prog, I don't think it matters where the > xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to > be before xdp->data because of the current data_meta and data comparison usage > in the xdp prog. > > The order of the kernel's xdp_to_skb_metadata and the program's metadata > probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP > supports the program's metadata now. afaict, it can only work now if there is > some sort of contract between them or the AF_XDP currently does not use the > program's metadata. Either way, we can do the mem-moving only for AF_XDP and it > should be a no op if there is no program's metadata? This behavior could also > be configurable through setsockopt? Agreed on all of the above. For now it seems like the safest thing to do is to put xdp_to_skb_metadata last to allow af_xdp to properly locate btf_id. Let's see if Toke disagrees :-) > Thanks for the summary! > > > > > > > I think the above solves all the cases Toke points to? > > > > a) Accessing the metadata after redirect (in a cpumap or devmap > > program, or on a veth device) > > - only a small xdp_to_skb_metadata subset will work out of the box > > iff the redirecttor calls bpf_xdp_metadata_export_to_skb; for the rest > > the progs will have to agree on the layout, right? > > > > b) Transferring the packet+metadata to AF_XDP > > - here, again, the AF_XDP consumer will have to either expect > > xdp_to_skb_metadata with a smaller set of skb-related metadata, or > > will have to make sure the producer builds a custom layout using > > kfuncs; there is also no flag to indicate whether xdp_to_skb_metadata > > is there or not; the consumer will have to test btf_id at the right > > offset > > > > c) Returning XDP_PASS, but accessing some of the metadata first > > (whether to read or change it) > > - can read via kfuncs, can change via > > bpf_xdp_metadata_export_to_skb(); m->xyz=abc; > > > > Anything I'm missing? >
Stanislav Fomichev wrote: > Implement new bpf_xdp_metadata_export_to_skb kfunc which > prepares compatible xdp metadata for kernel consumption. > This kfunc should be called prior to bpf_redirect > or (unless already called) when XDP_PASS'ing the frame > into the kernel. Hi, Had a couple high level questions so starting a new thread thought it would be more confusing than helpful to add to the thread on this patch. > > The implementation currently maintains xdp_to_skb_metadata > layout by calling bpf_xdp_metadata_rx_timestamp and placing > small magic number. From skb_metdata_set, when we get expected magic number, > we interpret metadata accordingly. From commit message side I'm not able to parse this paragraph without reading code. Maybe expand it a bit for next version or it could just be me. > > Both magic number and struct layout are randomized to make sure > it doesn't leak into the userspace. Are we worried about leaking pointers into XDP program here? We already leak pointers into XDP through helpers so I'm not sure it matters. > > skb_metadata_set is amended with skb_metadata_import_from_xdp which > tries to parse out the metadata and put it into skb. > > See the comment for r1 vs r2/r3/r4/r5 conventions. I think for next version an expanded commit message with use cases would help. I had to follow the thread to get some ideas why this might be useful. > > Cc: John Fastabend <john.fastabend@gmail.com> > Cc: David Ahern <dsahern@gmail.com> > Cc: Martin KaFai Lau <martin.lau@linux.dev> > Cc: Jakub Kicinski <kuba@kernel.org> > Cc: Willem de Bruijn <willemb@google.com> > Cc: Jesper Dangaard Brouer <brouer@redhat.com> > Cc: Anatoly Burakov <anatoly.burakov@intel.com> > Cc: Alexander Lobakin <alexandr.lobakin@intel.com> > Cc: Magnus Karlsson <magnus.karlsson@gmail.com> > Cc: Maryam Tahhan <mtahhan@redhat.com> > Cc: xdp-hints@xdp-project.net > Cc: netdev@vger.kernel.org > Signed-off-by: Stanislav Fomichev <sdf@google.com> > --- > drivers/net/veth.c | 4 +- > include/linux/bpf_patch.h | 2 + > include/linux/skbuff.h | 4 ++ > include/net/xdp.h | 13 +++++ > kernel/bpf/bpf_patch.c | 30 +++++++++++ > kernel/bpf/verifier.c | 18 +++++++ > net/core/skbuff.c | 25 +++++++++ > net/core/xdp.c | 104 +++++++++++++++++++++++++++++++++++--- > 8 files changed, 193 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/veth.c b/drivers/net/veth.c > index 0e629ceb087b..d4cd0938360b 100644 > --- a/drivers/net/veth.c > +++ b/drivers/net/veth.c > @@ -1673,7 +1673,9 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) > static void veth_unroll_kfunc(const struct bpf_prog *prog, u32 func_id, > struct bpf_patch *patch) > { > - if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) { > + if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) { > + return xdp_metadata_export_to_skb(prog, patch); > + } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) { > /* return true; */ > bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 1)); > } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) { > diff --git a/include/linux/bpf_patch.h b/include/linux/bpf_patch.h > index 81ff738eef8d..359c165ad68b 100644 > --- a/include/linux/bpf_patch.h > +++ b/include/linux/bpf_patch.h > @@ -16,6 +16,8 @@ size_t bpf_patch_len(const struct bpf_patch *patch); > int bpf_patch_err(const struct bpf_patch *patch); > void __bpf_patch_append(struct bpf_patch *patch, struct bpf_insn insn); > struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch); > +void bpf_patch_resolve_jmp(struct bpf_patch *patch); > +u32 bpf_patch_magles_registers(const struct bpf_patch *patch); > > #define bpf_patch_append(patch, ...) ({ \ > struct bpf_insn insn[] = { __VA_ARGS__ }; \ > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 59c9fd55699d..dba857f212d7 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, > true : __skb_metadata_differs(skb_a, skb_b, len_a); > } > > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len); > + > static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) > { > skb_shinfo(skb)->meta_len = meta_len; > + if (meta_len) > + skb_metadata_import_from_xdp(skb, meta_len); > } > > static inline void skb_metadata_clear(struct sk_buff *skb) > diff --git a/include/net/xdp.h b/include/net/xdp.h > index 2a82a98f2f9f..8c97c6996172 100644 > --- a/include/net/xdp.h > +++ b/include/net/xdp.h > @@ -411,6 +411,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, > #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE > > #define XDP_METADATA_KFUNC_xxx \ > + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_EXPORT_TO_SKB, \ > + bpf_xdp_metadata_export_to_skb) \ > XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED, \ > bpf_xdp_metadata_rx_timestamp_supported) \ > XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ > @@ -423,14 +425,25 @@ XDP_METADATA_KFUNC_xxx > MAX_XDP_METADATA_KFUNC, > }; > > +struct xdp_to_skb_metadata { > + u32 magic; /* xdp_metadata_magic */ > + u64 rx_timestamp; Slightly confused. I thought/think most drivers populate the skb timestamp if they can already? So why do we need to bounce these through some xdp metadata? Won't all this cost more than the load/store directly from the descriptor into the skb? Even if drivers are not populating skb now shouldn't an ethtool knob be enough to turn this on? I don't see the value of getting this in veth side its just a sw timestamp there. If its specific to cpumap shouldn't we land this in cpumap code paths out of general XDP code paths? > +} __randomize_layout; > + > +struct bpf_patch; > + > #ifdef CONFIG_DEBUG_INFO_BTF > +extern u32 xdp_metadata_magic; > extern struct btf_id_set8 xdp_metadata_kfunc_ids; > static inline u32 xdp_metadata_kfunc_id(int id) > { > return xdp_metadata_kfunc_ids.pairs[id].id; > } > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch); > #else > +#define xdp_metadata_magic 0 > static inline u32 xdp_metadata_kfunc_id(int id) { return 0; } > +static void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) { return 0; } > #endif > > #endif /* __LINUX_NET_XDP_H__ */ > diff --git a/kernel/bpf/bpf_patch.c b/kernel/bpf/bpf_patch.c > index 82a10bf5624a..8f1fef74299c 100644 > --- a/kernel/bpf/bpf_patch.c > +++ b/kernel/bpf/bpf_patch.c > @@ -49,3 +49,33 @@ struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch) > { > return patch->insn; > } [...] > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 42a35b59fb1e..37e3aef46525 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -72,6 +72,7 @@ > #include <net/mptcp.h> > #include <net/mctp.h> > #include <net/page_pool.h> > +#include <net/xdp.h> > > #include <linux/uaccess.h> > #include <trace/events/skb.h> > @@ -6672,3 +6673,27 @@ nodefer: __kfree_skb(skb); > if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) > smp_call_function_single_async(cpu, &sd->defer_csd); > } > + > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len) > +{ > + struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len); > + > + /* Optional SKB info, currently missing: > + * - HW checksum info (skb->ip_summed) > + * - HW RX hash (skb_set_hash) > + * - RX ring dev queue index (skb_record_rx_queue) > + */ > + > + if (len != sizeof(struct xdp_to_skb_metadata)) > + return; > + > + if (meta->magic != xdp_metadata_magic) > + return; > + > + if (meta->rx_timestamp) { > + *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){ > + .hwtstamp = ns_to_ktime(meta->rx_timestamp), > + }; > + } > +} > +EXPORT_SYMBOL(skb_metadata_import_from_xdp); > diff --git a/net/core/xdp.c b/net/core/xdp.c > index 22f1e44700eb..8204fa05c5e9 100644 > --- a/net/core/xdp.c > +++ b/net/core/xdp.c > @@ -653,12 +653,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, > /* Essential SKB info: protocol and skb->dev */ > skb->protocol = eth_type_trans(skb, dev); > > - /* Optional SKB info, currently missing: > - * - HW checksum info (skb->ip_summed) > - * - HW RX hash (skb_set_hash) > - * - RX ring dev queue index (skb_record_rx_queue) > - */ > - > /* Until page_pool get SKB return path, release DMA here */ > xdp_release_frame(xdpf); > > @@ -712,6 +706,13 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) > return nxdpf; > } > > +/* For the packets directed to the kernel, this kfunc exports XDP metadata > + * into skb context. > + */ > +noinline void bpf_xdp_metadata_export_to_skb(const struct xdp_md *ctx) > +{ > +} > + > /* Indicates whether particular device supports rx_timestamp metadata. > * This is an optional helper to support marking some branches as > * "dead code" in the BPF programs. > @@ -737,13 +738,104 @@ XDP_METADATA_KFUNC_xxx > #undef XDP_METADATA_KFUNC > BTF_SET8_END(xdp_metadata_kfunc_ids) > > +/* Make sure userspace doesn't depend on our layout by using > + * different pseudo-generated magic value. > + */ > +u32 xdp_metadata_magic; > + > static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { > .owner = THIS_MODULE, > .set = &xdp_metadata_kfunc_ids, > }; > > +/* Since we're not actually doing a call but instead rewriting > + * in place, we can only afford to use R0-R5 scratch registers. Why not just do a call? Its neat to inline this but your going to build an skb next. Thats not cheap and the cost of a call should be complete noise when hitting the entire stack? Any benchmark to convince us this is worthwhile optimizations? > + * > + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual > + * metadata kfuncs use only R0,R4-R5. > + * > + * The above also means we _cannot_ easily call any other helper/kfunc > + * because there is no place for us to preserve our R1 argument; > + * existing R6-R9 belong to the callee. > + */ > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) > +{ [...] > } > late_initcall(xdp_metadata_init); Thanks, John
Toke Høiland-Jørgensen wrote: > Snipping a bit of context to reply to this bit: > > >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not > >>>> sure it is solid enough by asking the xdp prog not to use the same random number > >>>> in its own metadata + not to change the metadata through xdp->data_meta after > >>>> calling bpf_xdp_metadata_export_to_skb(). > >>> > >>> What do you think the usecase here might be? Or are you suggesting we > >>> reject further access to data_meta after > >>> bpf_xdp_metadata_export_to_skb somehow? > >>> > >>> If we want to let the programs override some of this > >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add > >>> more kfuncs instead of exposing the layout? > >>> > >>> bpf_xdp_metadata_export_to_skb(ctx); > >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); > Hi Toke, Trying not to bifurcate your thread. Can I start a new one here to elaborate on these use cases. I'm still a bit lost on any use case for this that makes sense to actually deploy on a network. > There are several use cases for needing to access the metadata after > calling bpf_xdp_metdata_export_to_skb(): > > - Accessing the metadata after redirect (in a cpumap or devmap program, > or on a veth device) I think for devmap there are still lots of opens how/where the skb is even built. For cpumap I'm a bit unsure what the use case is. For ice, mlx and such you should use the hardware RSS if performance is top of mind. And then for specific devices on cpumap (maybe realtime or ptp things?) could we just throw it through the xdp_frame? > - Transferring the packet+metadata to AF_XDP In this case we have the metadata and AF_XDP program and XDP program simply need to agree on metadata format. No need to have some magic numbers and driver specific kfuncs. > - Returning XDP_PASS, but accessing some of the metadata first (whether > to read or change it) > I don't get this case? XDP_PASS should go to stack normally through drivers build_skb routines. These will populate timestamp normally. My guess is simply descriptor->skb load/store is cheaper than carrying around this metadata and doing the call in BPF side. Anyways you just built an entire skb and hit the stack I don't think you will notice this noise in any benchmark.
On Wed, Nov 9, 2022 at 5:09 PM John Fastabend <john.fastabend@gmail.com> wrote: > > Stanislav Fomichev wrote: > > Implement new bpf_xdp_metadata_export_to_skb kfunc which > > prepares compatible xdp metadata for kernel consumption. > > This kfunc should be called prior to bpf_redirect > > or (unless already called) when XDP_PASS'ing the frame > > into the kernel. > > Hi, > > Had a couple high level questions so starting a new thread thought > it would be more confusing than helpful to add to the thread on > this patch. > > > > > The implementation currently maintains xdp_to_skb_metadata > > layout by calling bpf_xdp_metadata_rx_timestamp and placing > > small magic number. From skb_metdata_set, when we get expected magic number, > > we interpret metadata accordingly. > > From commit message side I'm not able to parse this paragraph without > reading code. Maybe expand it a bit for next version or it could > just be me. Sure, will try to reword. In another thread we've discussed removing the magic number, so I'll have to rewrite this part anyway :-) > > > > Both magic number and struct layout are randomized to make sure > > it doesn't leak into the userspace. > > Are we worried about leaking pointers into XDP program here? We already > leak pointers into XDP through helpers so I'm not sure it matters. I wasn't sure here whether we want to have that xdp->skb (redirect) path to be completely opaque or it's ok to expose this to the bpf/af_xdp. Seems like everybody's on board about exposing, so I'll be most likely removing that randomization part. (see my recent reply to Martin/Toke) > > > > skb_metadata_set is amended with skb_metadata_import_from_xdp which > > tries to parse out the metadata and put it into skb. > > > > See the comment for r1 vs r2/r3/r4/r5 conventions. > > I think for next version an expanded commit message with use > cases would help. I had to follow the thread to get some ideas > why this might be useful. Sure. I was planning to put together Documentation/bpf/xdp_metadata.rst with more details/assumptions/use-cases for the final non-rfc submissions.. Maybe that's better to help build up a full picture? > > Cc: John Fastabend <john.fastabend@gmail.com> > > Cc: David Ahern <dsahern@gmail.com> > > Cc: Martin KaFai Lau <martin.lau@linux.dev> > > Cc: Jakub Kicinski <kuba@kernel.org> > > Cc: Willem de Bruijn <willemb@google.com> > > Cc: Jesper Dangaard Brouer <brouer@redhat.com> > > Cc: Anatoly Burakov <anatoly.burakov@intel.com> > > Cc: Alexander Lobakin <alexandr.lobakin@intel.com> > > Cc: Magnus Karlsson <magnus.karlsson@gmail.com> > > Cc: Maryam Tahhan <mtahhan@redhat.com> > > Cc: xdp-hints@xdp-project.net > > Cc: netdev@vger.kernel.org > > Signed-off-by: Stanislav Fomichev <sdf@google.com> > > --- > > drivers/net/veth.c | 4 +- > > include/linux/bpf_patch.h | 2 + > > include/linux/skbuff.h | 4 ++ > > include/net/xdp.h | 13 +++++ > > kernel/bpf/bpf_patch.c | 30 +++++++++++ > > kernel/bpf/verifier.c | 18 +++++++ > > net/core/skbuff.c | 25 +++++++++ > > net/core/xdp.c | 104 +++++++++++++++++++++++++++++++++++--- > > 8 files changed, 193 insertions(+), 7 deletions(-) > > > > diff --git a/drivers/net/veth.c b/drivers/net/veth.c > > index 0e629ceb087b..d4cd0938360b 100644 > > --- a/drivers/net/veth.c > > +++ b/drivers/net/veth.c > > @@ -1673,7 +1673,9 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) > > static void veth_unroll_kfunc(const struct bpf_prog *prog, u32 func_id, > > struct bpf_patch *patch) > > { > > - if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) { > > + if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) { > > + return xdp_metadata_export_to_skb(prog, patch); > > + } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) { > > /* return true; */ > > bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 1)); > > } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) { > > diff --git a/include/linux/bpf_patch.h b/include/linux/bpf_patch.h > > index 81ff738eef8d..359c165ad68b 100644 > > --- a/include/linux/bpf_patch.h > > +++ b/include/linux/bpf_patch.h > > @@ -16,6 +16,8 @@ size_t bpf_patch_len(const struct bpf_patch *patch); > > int bpf_patch_err(const struct bpf_patch *patch); > > void __bpf_patch_append(struct bpf_patch *patch, struct bpf_insn insn); > > struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch); > > +void bpf_patch_resolve_jmp(struct bpf_patch *patch); > > +u32 bpf_patch_magles_registers(const struct bpf_patch *patch); > > > > #define bpf_patch_append(patch, ...) ({ \ > > struct bpf_insn insn[] = { __VA_ARGS__ }; \ > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > > index 59c9fd55699d..dba857f212d7 100644 > > --- a/include/linux/skbuff.h > > +++ b/include/linux/skbuff.h > > @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, > > true : __skb_metadata_differs(skb_a, skb_b, len_a); > > } > > > > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len); > > + > > static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) > > { > > skb_shinfo(skb)->meta_len = meta_len; > > + if (meta_len) > > + skb_metadata_import_from_xdp(skb, meta_len); > > } > > > > static inline void skb_metadata_clear(struct sk_buff *skb) > > diff --git a/include/net/xdp.h b/include/net/xdp.h > > index 2a82a98f2f9f..8c97c6996172 100644 > > --- a/include/net/xdp.h > > +++ b/include/net/xdp.h > > @@ -411,6 +411,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, > > #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE > > > > #define XDP_METADATA_KFUNC_xxx \ > > + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_EXPORT_TO_SKB, \ > > + bpf_xdp_metadata_export_to_skb) \ > > XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED, \ > > bpf_xdp_metadata_rx_timestamp_supported) \ > > XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ > > @@ -423,14 +425,25 @@ XDP_METADATA_KFUNC_xxx > > MAX_XDP_METADATA_KFUNC, > > }; > > > > +struct xdp_to_skb_metadata { > > + u32 magic; /* xdp_metadata_magic */ > > + u64 rx_timestamp; > > Slightly confused. I thought/think most drivers populate the skb timestamp > if they can already? So why do we need to bounce these through some xdp > metadata? Won't all this cost more than the load/store directly from the > descriptor into the skb? Even if drivers are not populating skb now > shouldn't an ethtool knob be enough to turn this on? dsahern@ pointed out that it might be useful for the program to be able to override some of that metadata. Or, for example, if there is no rx vlan offload, the program can strip it (as part of parsing) and put the vlan tag into that xdp_to_skb_metadata. > I don't see the value of getting this in veth side its just a sw > timestamp there. (veth is there so we can have some selftests) > If its specific to cpumap shouldn't we land this in cpumap code paths > out of general XDP code paths? See above, if we run this at cpumap time it's too late? > > +} __randomize_layout; > > + > > +struct bpf_patch; > > + > > #ifdef CONFIG_DEBUG_INFO_BTF > > +extern u32 xdp_metadata_magic; > > extern struct btf_id_set8 xdp_metadata_kfunc_ids; > > static inline u32 xdp_metadata_kfunc_id(int id) > > { > > return xdp_metadata_kfunc_ids.pairs[id].id; > > } > > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch); > > #else > > +#define xdp_metadata_magic 0 > > static inline u32 xdp_metadata_kfunc_id(int id) { return 0; } > > +static void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) { return 0; } > > #endif > > > > #endif /* __LINUX_NET_XDP_H__ */ > > diff --git a/kernel/bpf/bpf_patch.c b/kernel/bpf/bpf_patch.c > > index 82a10bf5624a..8f1fef74299c 100644 > > --- a/kernel/bpf/bpf_patch.c > > +++ b/kernel/bpf/bpf_patch.c > > @@ -49,3 +49,33 @@ struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch) > > { > > return patch->insn; > > } > > [...] > > > > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > > index 42a35b59fb1e..37e3aef46525 100644 > > --- a/net/core/skbuff.c > > +++ b/net/core/skbuff.c > > @@ -72,6 +72,7 @@ > > #include <net/mptcp.h> > > #include <net/mctp.h> > > #include <net/page_pool.h> > > +#include <net/xdp.h> > > > > #include <linux/uaccess.h> > > #include <trace/events/skb.h> > > @@ -6672,3 +6673,27 @@ nodefer: __kfree_skb(skb); > > if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) > > smp_call_function_single_async(cpu, &sd->defer_csd); > > } > > + > > +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len) > > +{ > > + struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len); > > + > > + /* Optional SKB info, currently missing: > > + * - HW checksum info (skb->ip_summed) > > + * - HW RX hash (skb_set_hash) > > + * - RX ring dev queue index (skb_record_rx_queue) > > + */ > > + > > + if (len != sizeof(struct xdp_to_skb_metadata)) > > + return; > > + > > + if (meta->magic != xdp_metadata_magic) > > + return; > > + > > + if (meta->rx_timestamp) { > > + *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){ > > + .hwtstamp = ns_to_ktime(meta->rx_timestamp), > > + }; > > + } > > +} > > +EXPORT_SYMBOL(skb_metadata_import_from_xdp); > > diff --git a/net/core/xdp.c b/net/core/xdp.c > > index 22f1e44700eb..8204fa05c5e9 100644 > > --- a/net/core/xdp.c > > +++ b/net/core/xdp.c > > @@ -653,12 +653,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, > > /* Essential SKB info: protocol and skb->dev */ > > skb->protocol = eth_type_trans(skb, dev); > > > > - /* Optional SKB info, currently missing: > > - * - HW checksum info (skb->ip_summed) > > - * - HW RX hash (skb_set_hash) > > - * - RX ring dev queue index (skb_record_rx_queue) > > - */ > > - > > /* Until page_pool get SKB return path, release DMA here */ > > xdp_release_frame(xdpf); > > > > @@ -712,6 +706,13 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) > > return nxdpf; > > } > > > > +/* For the packets directed to the kernel, this kfunc exports XDP metadata > > + * into skb context. > > + */ > > +noinline void bpf_xdp_metadata_export_to_skb(const struct xdp_md *ctx) > > +{ > > +} > > + > > /* Indicates whether particular device supports rx_timestamp metadata. > > * This is an optional helper to support marking some branches as > > * "dead code" in the BPF programs. > > @@ -737,13 +738,104 @@ XDP_METADATA_KFUNC_xxx > > #undef XDP_METADATA_KFUNC > > BTF_SET8_END(xdp_metadata_kfunc_ids) > > > > +/* Make sure userspace doesn't depend on our layout by using > > + * different pseudo-generated magic value. > > + */ > > +u32 xdp_metadata_magic; > > + > > static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { > > .owner = THIS_MODULE, > > .set = &xdp_metadata_kfunc_ids, > > }; > > > > +/* Since we're not actually doing a call but instead rewriting > > + * in place, we can only afford to use R0-R5 scratch registers. > > Why not just do a call? Its neat to inline this but your going > to build an skb next. Thats not cheap and the cost of a call > should be complete noise when hitting the entire stack? > > Any benchmark to convince us this is worthwhile optimizations? I do have a suspicion that a non-zero amount of drivers will actually resort to calling kernel instead of writing bpf bytecode (especially since there might be some locks involved). However if we prefer to go back to calls, there still has to be some translation table. I'm assuming we want to at least resolve indirect netdev->kfunc_rx_metatada() calls at prog load time. So we still need a per-netdev map from kfunc_id to real implementation func. And I don't think that it would be more simple than the switch statement that I have? > > + * > > + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual > > + * metadata kfuncs use only R0,R4-R5. > > + * > > + * The above also means we _cannot_ easily call any other helper/kfunc > > + * because there is no place for us to preserve our R1 argument; > > + * existing R6-R9 belong to the callee. > > + */ > > +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) > > +{ > > [...] > > > } > > late_initcall(xdp_metadata_init); > > Thanks, > John > >
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: >> Snipping a bit of context to reply to this bit: >> >>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >>>>>> sure it is solid enough by asking the xdp prog not to use the same random number >>>>>> in its own metadata + not to change the metadata through xdp->data_meta after >>>>>> calling bpf_xdp_metadata_export_to_skb(). >>>>> >>>>> What do you think the usecase here might be? Or are you suggesting we >>>>> reject further access to data_meta after >>>>> bpf_xdp_metadata_export_to_skb somehow? >>>>> >>>>> If we want to let the programs override some of this >>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >>>>> more kfuncs instead of exposing the layout? >>>>> >>>>> bpf_xdp_metadata_export_to_skb(ctx); >>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); >> >> There are several use cases for needing to access the metadata after >> calling bpf_xdp_metdata_export_to_skb(): >> >> - Accessing the metadata after redirect (in a cpumap or devmap program, >> or on a veth device) >> - Transferring the packet+metadata to AF_XDP > fwiw, the xdp prog could also be more selective and only stores one of the hints > instead of the whole 'struct xdp_to_skb_metadata'. Yup, absolutely! In that sense, reusing the SKB format is mostly a convenience feature. However, lots of people consume AF_XDP through the default program installed by libxdp in the XSK setup code, and including custom metadata there is awkward. So having the metadata consumed by the stack as the "default subset" would enable easy consumption by non-advanced users, while advanced users can still do custom stuff by writing their own XDP program that calls the kfuncs. >> - Returning XDP_PASS, but accessing some of the metadata first (whether >> to read or change it) >> >> The last one could be solved by calling additional kfuncs, but that >> would be less efficient than just directly editing the struct which >> will be cache-hot after the helper returns. > > Yeah, it is more efficient to directly write if possible. I think this set > allows the direct reading and writing already through data_meta (as a _u8 *). Yup, totally fine with just keeping that capability :) >> And yeah, this will allow the XDP program to inject arbitrary metadata >> into the netstack; but it can already inject arbitrary *packet* data >> into the stack, so not sure if this is much of an additional risk? If it >> does lead to trivial crashes, we should probably harden the stack >> against that? >> >> As for the random number, Jesper and I discussed replacing this with the >> same BTF-ID scheme that he was using in his patch series. I.e., instead >> of just putting in a random number, we insert the BTF ID of the metadata >> struct at the end of it. This will allow us to support multiple >> different formats in the future (not just changing the layout, but >> having multiple simultaneous formats in the same kernel image), in case >> we run out of space. > > This seems a bit hypothetical. How much headroom does it usually have for the > xdp prog? Potentially the hints can use all the remaining space left after the > header encap and the current bpf_xdp_adjust_meta() usage? For the metadata consumed by the stack right now it's a bit hypothetical, yeah. However, there's a bunch of metadata commonly supported by hardware that the stack currently doesn't consume and that hopefully this feature will end up making more accessible. My hope is that the stack can also learn how to use this in the future, in which case we may run out of space. So I think of that bit mostly as future-proofing... >> We should probably also have a flag set on the xdp_frame so the stack >> knows that the metadata area contains relevant-to-skb data, to guard >> against an XDP program accidentally hitting the "magic number" (BTF_ID) >> in unrelated stuff it puts into the metadata area. > > Yeah, I think having a flag is useful. The flag will be set at xdp_buff and > then transfer to the xdp_frame? Yeah, exactly! >>> After re-reading patch 6, have another question. The 'void >>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at >>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' >>> pointer and the xdp prog can directly read (or even write) it? >> >> Hmm, I'm not sure returning a failure makes sense? Failure to read one >> or more fields just means that those fields will not be populated? We >> should probably have a flags field inside the metadata struct itself to >> indicate which fields are set or not, but I'm not sure returning an >> error value adds anything? Returning a pointer to the metadata field >> might be convenient for users (it would just be an alias to the >> data_meta pointer, but the verifier could know its size, so the program >> doesn't have to bounds check it). > > If some hints are not available, those hints should be initialized to > 0/CHECKSUM_NONE/...etc. The problem with that is that then we have to spend cycles writing eight bytes of zeroes into the checksum field :) > The xdp prog needs a direct way to tell hard failure when it cannot > write the meta area because of not enough space. Comparing > xdp->data_meta with xdp->data as a side effect is not intuitive. Yeah, hence a flags field so we can just see if setting each field succeeded? > It is more than saving the bound check. With type info of 'struct > xdp_to_skb_metadata *', the verifier can do more checks like reading in the > middle of an integer member. The verifier could also limit write access only to > a few struct's members if it is needed. > > The returning 'struct xdp_to_skb_metadata *' should not be an alias to the > xdp->data_meta. They should actually point to different locations in the > headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. > xdp->data_meta won't be changed and keeps pointing to the last > bpf_xdp_adjust_meta() location. The kernel will know if there is > xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the > xdp_{buff,frame}. Would it work? Hmm, logically splitting the program metadata and the xdp_hints metadata (but having them share the same area) *could* work, I guess, I'm just not sure it's worth the extra complexity? >>> A related question, why 'struct xdp_to_skb_metadata' needs >>> __randomize_layout? >> >> The __randomize_layout thing is there to force BPF programs to use CO-RE >> to access the field. This is to avoid the struct layout accidentally >> ossifying because people in practice rely on a particular layout, even >> though we tell them to use CO-RE. There are lots of examples of this >> happening in other domains (IP header options, TCP options, etc), and >> __randomize_layout seemed like a neat trick to enforce CO-RE usage :) > > I am not sure if it is necessary or helpful to only enforce __randomize_layout > in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and > non tracing) that already have direct access (reading and/or writing) to other > kernel structures. > > It is more important for the verifier to see the xdp prog accessing it as a > 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so > that the verifier can enforce the rules of access. That only works inside the kernel, though. Since the metadata field can be copied wholesale to AF_XDP, having it randomized forces userspace consumers to also write code to deal with the layout being dynamic... -Toke
Stanislav Fomichev <sdf@google.com> writes: > On Wed, Nov 9, 2022 at 4:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 11/9/22 1:33 PM, Stanislav Fomichev wrote: >> > On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> >> >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: >> >>> Snipping a bit of context to reply to this bit: >> >>> >> >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >> >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number >> >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after >> >>>>>>> calling bpf_xdp_metadata_export_to_skb(). >> >>>>>> >> >>>>>> What do you think the usecase here might be? Or are you suggesting we >> >>>>>> reject further access to data_meta after >> >>>>>> bpf_xdp_metadata_export_to_skb somehow? >> >>>>>> >> >>>>>> If we want to let the programs override some of this >> >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >> >>>>>> more kfuncs instead of exposing the layout? >> >>>>>> >> >>>>>> bpf_xdp_metadata_export_to_skb(ctx); >> >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); >> >>> >> >>> There are several use cases for needing to access the metadata after >> >>> calling bpf_xdp_metdata_export_to_skb(): >> >>> >> >>> - Accessing the metadata after redirect (in a cpumap or devmap program, >> >>> or on a veth device) >> >>> - Transferring the packet+metadata to AF_XDP >> >> fwiw, the xdp prog could also be more selective and only stores one of the hints >> >> instead of the whole 'struct xdp_to_skb_metadata'. >> >> >> >>> - Returning XDP_PASS, but accessing some of the metadata first (whether >> >>> to read or change it) >> >>> >> >>> The last one could be solved by calling additional kfuncs, but that >> >>> would be less efficient than just directly editing the struct which >> >>> will be cache-hot after the helper returns. >> >> >> >> Yeah, it is more efficient to directly write if possible. I think this set >> >> allows the direct reading and writing already through data_meta (as a _u8 *). >> >> >> >>> >> >>> And yeah, this will allow the XDP program to inject arbitrary metadata >> >>> into the netstack; but it can already inject arbitrary *packet* data >> >>> into the stack, so not sure if this is much of an additional risk? If it >> >>> does lead to trivial crashes, we should probably harden the stack >> >>> against that? >> >>> >> >>> As for the random number, Jesper and I discussed replacing this with the >> >>> same BTF-ID scheme that he was using in his patch series. I.e., instead >> >>> of just putting in a random number, we insert the BTF ID of the metadata >> >>> struct at the end of it. This will allow us to support multiple >> >>> different formats in the future (not just changing the layout, but >> >>> having multiple simultaneous formats in the same kernel image), in case >> >>> we run out of space. >> >> >> >> This seems a bit hypothetical. How much headroom does it usually have for the >> >> xdp prog? Potentially the hints can use all the remaining space left after the >> >> header encap and the current bpf_xdp_adjust_meta() usage? >> >> >> >>> >> >>> We should probably also have a flag set on the xdp_frame so the stack >> >>> knows that the metadata area contains relevant-to-skb data, to guard >> >>> against an XDP program accidentally hitting the "magic number" (BTF_ID) >> >>> in unrelated stuff it puts into the metadata area. >> >> >> >> Yeah, I think having a flag is useful. The flag will be set at xdp_buff and >> >> then transfer to the xdp_frame? >> >> >> >>> >> >>>> After re-reading patch 6, have another question. The 'void >> >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at >> >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' >> >>>> pointer and the xdp prog can directly read (or even write) it? >> >>> >> >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one >> >>> or more fields just means that those fields will not be populated? We >> >>> should probably have a flags field inside the metadata struct itself to >> >>> indicate which fields are set or not, but I'm not sure returning an >> >>> error value adds anything? Returning a pointer to the metadata field >> >>> might be convenient for users (it would just be an alias to the >> >>> data_meta pointer, but the verifier could know its size, so the program >> >>> doesn't have to bounds check it). >> >> >> >> If some hints are not available, those hints should be initialized to >> >> 0/CHECKSUM_NONE/...etc. The xdp prog needs a direct way to tell hard failure >> >> when it cannot write the meta area because of not enough space. Comparing >> >> xdp->data_meta with xdp->data as a side effect is not intuitive. >> >> >> >> It is more than saving the bound check. With type info of 'struct >> >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the >> >> middle of an integer member. The verifier could also limit write access only to >> >> a few struct's members if it is needed. >> >> >> >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the >> >> xdp->data_meta. They should actually point to different locations in the >> >> headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. >> >> xdp->data_meta won't be changed and keeps pointing to the last >> >> bpf_xdp_adjust_meta() location. The kernel will know if there is >> >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the >> >> xdp_{buff,frame}. Would it work? >> >> >> >>> >> >>>> A related question, why 'struct xdp_to_skb_metadata' needs >> >>>> __randomize_layout? >> >>> >> >>> The __randomize_layout thing is there to force BPF programs to use CO-RE >> >>> to access the field. This is to avoid the struct layout accidentally >> >>> ossifying because people in practice rely on a particular layout, even >> >>> though we tell them to use CO-RE. There are lots of examples of this >> >>> happening in other domains (IP header options, TCP options, etc), and >> >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :) >> >> >> >> I am not sure if it is necessary or helpful to only enforce __randomize_layout >> >> in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and >> >> non tracing) that already have direct access (reading and/or writing) to other >> >> kernel structures. >> >> >> >> It is more important for the verifier to see the xdp prog accessing it as a >> >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so >> >> that the verifier can enforce the rules of access. >> >> >> >>> >> >>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the >> >>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? >> >>>>>> >> >>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right >> >>>>>> now usually have C code to manually pull out the metadata (out of hw >> >>>>>> desc) and put it into skb. >> >>>>>> >> >>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for >> >>>>>> XDP_PASS, we're doing a double amount of work: >> >>>>>> skb_metadata_import_from_xdp first, then custom driver code second. >> >>>>>> >> >>>>>> In theory, maybe we should completely skip drivers custom parsing when >> >>>>>> there is a prog with BPF_F_XDP_HAS_METADATA? >> >>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven >> >>>>>> and won't require any mental work (plus, the drivers won't have to >> >>>>>> care either in the future). >> >>>>>> > WDYT? >> >>>>> >> >>>>> >> >>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes >> >>>>> sense to only use the hints (if ever written) from xdp prog especially if it >> >>>>> will eventually support xdp prog changing some of the hints in the future. For >> >>>>> now, I think either way is fine since they are the same and the xdp prog is sort >> >>>>> of doing extra unnecessary work anyway by calling >> >>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be >> >>>>> changed now. >> >>> >> >>> I agree it would be best if the drivers also use the XDP metadata (if >> >>> present) on XDP_PASS. Longer term my hope is we can make the XDP >> >>> metadata support the only thing drivers need to implement (i.e., have >> >>> the stack call into that code even when no XDP program is loaded), but >> >>> for now just for consistency (and allowing the XDP program to update the >> >>> metadata), we should probably at least consume it on XDP_PASS. >> >>> >> >>> -Toke >> >>> >> > >> > Not to derail the discussion (left the last message intact on top, >> > feel free to continue), but to summarize. The proposed changes seem to >> > be: >> > >> > 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct >> > xdp_to_skb_metadata" >> > - This should let bpf programs change the metadata passed to the skb >> > >> > 2. "struct xdp_to_skb_metadata" should have its btf_id as the first >> > __u32 member (and remove the magic) >> > - This is for the redirect case where the end users, including >> > AF_XDP, can parse this metadata from btf_id >> >> I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata. I >> can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP >> rx_desc currently cannot tell if there is metadata written by the xdp prog or >> not. However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP >> rx_desc->options, the btf_id may as well be not needed now. However, the btf_id >> and other future new members can be added to the xdp_to_skb_metadata later if >> there is a need. >> >> For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to >> the xdp_to_skb_metadata. The xdp prog will use CO-RE to access the members in >> xdp_to_skb_metadata. > > Ack, good points on putting it at the end. > Regarding bit in desc->options vs btf_id: since it seems that btf_id > is useful anyway, let's start with that? We can add a bit later on if > it turns out using metadata is problematic otherwise. I think the bit is mostly useful so that the stack can know that the metadata has been set before consuming it (to guard against regular xdp_metadata usage accidentally hitting the "right" BTF ID). I don't think it needs to be exposed to the XDP programs themselves. >> > - This, however, is not all the metadata that the device can >> > support, but a much narrower set that the kernel is expected to use >> > for skb construction >> > >> > 3. __randomize_layout isn't really helping, CO-RE will trigger >> > regardless; maybe only the case where it matters is probably AF_XDP, >> > so still useful? Yeah, see my response to Martin, I think the randomisation is useful for AF_XDP transfer. >> > 4. The presence of the metadata generated by >> > bpf_xdp_metadata_export_to_skb should be indicated by a flag in >> > xdp_{buff,frame}->flags >> > - Assuming exposing it via xdp_md->has_skb_metadata is ok? >> >> probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type >> will be PTR_TO_BTF_ID_OR_NULL. > > Oh, that seems even better than returning it from > bpf_xdp_metadata_export_to_skb. > bpf_xdp_metadata_export_to_skb can return true/false and the rest goes > via default verifier ctx resolution mechanism.. > (returning ptr from a kfunc seems to be a bit complicated right now) See my response to John in the other thread about mixing stable UAPI (in xdp_md) and unstable BTF structures in the xdp_md struct: I think this is confusing and would prefer a kfunc. >> > - Since the programs probably need to do the following: >> > >> > if (xdp_md->has_skb_metadata) { >> > access/change skb metadata by doing struct xdp_to_skb_metadata *p >> > = data_meta; >> >> and directly access/change xdp->skb_metadata instead of using xdp->data_meta. > > Ack. > >> > } else { >> > use kfuncs >> > } >> > >> > 5. Support the case where we keep program's metadata and kernel's >> > xdp_to_skb_metadata >> > - skb_metadata_import_from_xdp() will "consume" it by mem-moving the >> > rest of the metadata over it and adjusting the headroom >> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's >> metadata. xdp prog should usually work in this order also: read/write headers, >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return >> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. >> >> For the kernel and xdp prog, I don't think it matters where the >> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to >> be before xdp->data because of the current data_meta and data comparison usage >> in the xdp prog. >> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata >> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP >> supports the program's metadata now. afaict, it can only work now if there is >> some sort of contract between them or the AF_XDP currently does not use the >> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it >> should be a no op if there is no program's metadata? This behavior could also >> be configurable through setsockopt? > > Agreed on all of the above. For now it seems like the safest thing to > do is to put xdp_to_skb_metadata last to allow af_xdp to properly > locate btf_id. > Let's see if Toke disagrees :-) As I replied to Martin, I'm not sure it's worth the complexity to logically split the SKB metadata from the program's own metadata (as opposed to just reusing the existing data_meta pointer)? However, if we do, the layout that makes most sense to me is putting the skb metadata before the program metadata, like: -------------- | skb_metadata -------------- | data_meta -------------- | data -------------- Not sure if that's what you meant? :) -Toke
John Fastabend <john.fastabend@gmail.com> writes: > Toke Høiland-Jørgensen wrote: >> Snipping a bit of context to reply to this bit: >> >> >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >> >>>> sure it is solid enough by asking the xdp prog not to use the same random number >> >>>> in its own metadata + not to change the metadata through xdp->data_meta after >> >>>> calling bpf_xdp_metadata_export_to_skb(). >> >>> >> >>> What do you think the usecase here might be? Or are you suggesting we >> >>> reject further access to data_meta after >> >>> bpf_xdp_metadata_export_to_skb somehow? >> >>> >> >>> If we want to let the programs override some of this >> >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >> >>> more kfuncs instead of exposing the layout? >> >>> >> >>> bpf_xdp_metadata_export_to_skb(ctx); >> >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); >> > > Hi Toke, > > Trying not to bifurcate your thread. Can I start a new one here to > elaborate on these use cases. I'm still a bit lost on any use case > for this that makes sense to actually deploy on a network. > >> There are several use cases for needing to access the metadata after >> calling bpf_xdp_metdata_export_to_skb(): >> >> - Accessing the metadata after redirect (in a cpumap or devmap program, >> or on a veth device) > > I think for devmap there are still lots of opens how/where the skb > is even built. For veth it's pretty clear; i.e., when redirecting into containers. > For cpumap I'm a bit unsure what the use case is. For ice, mlx and > such you should use the hardware RSS if performance is top of mind. Hardware RSS works fine if your hardware supports the hashing you want; many do not. As an example, Jesper wrote this application that uses cpumap to divide out ISP customer traffic among different CPUs (solving an HTB scaling problem): https://github.com/xdp-project/xdp-cpumap-tc > And then for specific devices on cpumap (maybe realtime or ptp > things?) could we just throw it through the xdp_frame? Not sure what you mean here? Throw what through the xdp_frame? >> - Transferring the packet+metadata to AF_XDP > > In this case we have the metadata and AF_XDP program and XDP program > simply need to agree on metadata format. No need to have some magic > numbers and driver specific kfuncs. See my other reply to Martin: Yeah, for AF_XDP users that write their own kernel XDP programs, they can just do whatever they want. But many users just rely on the default program in libxdp, so having a standard format to include with that is useful. >> - Returning XDP_PASS, but accessing some of the metadata first (whether >> to read or change it) >> > > I don't get this case? XDP_PASS should go to stack normally through > drivers build_skb routines. These will populate timestamp normally. > My guess is simply descriptor->skb load/store is cheaper than carrying > around this metadata and doing the call in BPF side. Anyways you > just built an entire skb and hit the stack I don't think you will > notice this noise in any benchmark. If you modify the packet before calling XDP_PASS you may want to update the metadata as well (for instance the RX hash, or in the future the metadata could also carry transport header offsets). -Toke
Toke Høiland-Jørgensen wrote: > John Fastabend <john.fastabend@gmail.com> writes: > > > Toke Høiland-Jørgensen wrote: > >> Snipping a bit of context to reply to this bit: > >> > >> >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not > >> >>>> sure it is solid enough by asking the xdp prog not to use the same random number > >> >>>> in its own metadata + not to change the metadata through xdp->data_meta after > >> >>>> calling bpf_xdp_metadata_export_to_skb(). > >> >>> > >> >>> What do you think the usecase here might be? Or are you suggesting we > >> >>> reject further access to data_meta after > >> >>> bpf_xdp_metadata_export_to_skb somehow? > >> >>> > >> >>> If we want to let the programs override some of this > >> >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add > >> >>> more kfuncs instead of exposing the layout? > >> >>> > >> >>> bpf_xdp_metadata_export_to_skb(ctx); > >> >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); > >> > > > > Hi Toke, > > > > Trying not to bifurcate your thread. Can I start a new one here to > > elaborate on these use cases. I'm still a bit lost on any use case > > for this that makes sense to actually deploy on a network. > > > >> There are several use cases for needing to access the metadata after > >> calling bpf_xdp_metdata_export_to_skb(): > >> > >> - Accessing the metadata after redirect (in a cpumap or devmap program, > >> or on a veth device) > > > > I think for devmap there are still lots of opens how/where the skb > > is even built. > > For veth it's pretty clear; i.e., when redirecting into containers. Ah but I think XDP on veth is a bit questionable in general. The use case is NFV I guess but its not how I would build out NFV. I've never seen it actually deployed other than in CI. Anyways not necessary to drop into that debate here. It exists so OK. > > > For cpumap I'm a bit unsure what the use case is. For ice, mlx and > > such you should use the hardware RSS if performance is top of mind. > > Hardware RSS works fine if your hardware supports the hashing you want; > many do not. As an example, Jesper wrote this application that uses > cpumap to divide out ISP customer traffic among different CPUs (solving > an HTB scaling problem): > > https://github.com/xdp-project/xdp-cpumap-tc I'm going to argue hw should be able to do this still and we should fix the hw but maybe not easily doable without convincing hardware folks to talk to us. Also not obvious tto me how linked code works without more studying, its ingress HTB? So you would push the rxhash and timestamp into cpumap and then build the skb here with the correct skb->timestamp? OK even if I can't exactly find the use case for cpumap if I had a use case I can see how passing metadata through is useful. > > > And then for specific devices on cpumap (maybe realtime or ptp > > things?) could we just throw it through the xdp_frame? > > Not sure what you mean here? Throw what through the xdp_frame? Doesn't matter reread patches and figured it out I was slightly confused. > > >> - Transferring the packet+metadata to AF_XDP > > > > In this case we have the metadata and AF_XDP program and XDP program > > simply need to agree on metadata format. No need to have some magic > > numbers and driver specific kfuncs. > > See my other reply to Martin: Yeah, for AF_XDP users that write their > own kernel XDP programs, they can just do whatever they want. But many > users just rely on the default program in libxdp, so having a standard > format to include with that is useful. > I don't think your AF_XDP program is any different than other AF_XDP programs. Your lib can create a standard format if it wants but kernel doesn't need to enforce it anyway. > >> - Returning XDP_PASS, but accessing some of the metadata first (whether > >> to read or change it) > >> > > > > I don't get this case? XDP_PASS should go to stack normally through > > drivers build_skb routines. These will populate timestamp normally. > > My guess is simply descriptor->skb load/store is cheaper than carrying > > around this metadata and doing the call in BPF side. Anyways you > > just built an entire skb and hit the stack I don't think you will > > notice this noise in any benchmark. > > If you modify the packet before calling XDP_PASS you may want to update > the metadata as well (for instance the RX hash, or in the future the > metadata could also carry transport header offsets). OK. So when you modify the pkt fixing up the rxhash makes sense. Thanks for the response I can see the argument. Thanks, John
On Thu, Nov 10, 2022 at 6:27 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > Stanislav Fomichev <sdf@google.com> writes: > > > On Wed, Nov 9, 2022 at 4:13 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > >> > >> On 11/9/22 1:33 PM, Stanislav Fomichev wrote: > >> > On Wed, Nov 9, 2022 at 10:22 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: > >> >> > >> >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: > >> >>> Snipping a bit of context to reply to this bit: > >> >>> > >> >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not > >> >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number > >> >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after > >> >>>>>>> calling bpf_xdp_metadata_export_to_skb(). > >> >>>>>> > >> >>>>>> What do you think the usecase here might be? Or are you suggesting we > >> >>>>>> reject further access to data_meta after > >> >>>>>> bpf_xdp_metadata_export_to_skb somehow? > >> >>>>>> > >> >>>>>> If we want to let the programs override some of this > >> >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add > >> >>>>>> more kfuncs instead of exposing the layout? > >> >>>>>> > >> >>>>>> bpf_xdp_metadata_export_to_skb(ctx); > >> >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); > >> >>> > >> >>> There are several use cases for needing to access the metadata after > >> >>> calling bpf_xdp_metdata_export_to_skb(): > >> >>> > >> >>> - Accessing the metadata after redirect (in a cpumap or devmap program, > >> >>> or on a veth device) > >> >>> - Transferring the packet+metadata to AF_XDP > >> >> fwiw, the xdp prog could also be more selective and only stores one of the hints > >> >> instead of the whole 'struct xdp_to_skb_metadata'. > >> >> > >> >>> - Returning XDP_PASS, but accessing some of the metadata first (whether > >> >>> to read or change it) > >> >>> > >> >>> The last one could be solved by calling additional kfuncs, but that > >> >>> would be less efficient than just directly editing the struct which > >> >>> will be cache-hot after the helper returns. > >> >> > >> >> Yeah, it is more efficient to directly write if possible. I think this set > >> >> allows the direct reading and writing already through data_meta (as a _u8 *). > >> >> > >> >>> > >> >>> And yeah, this will allow the XDP program to inject arbitrary metadata > >> >>> into the netstack; but it can already inject arbitrary *packet* data > >> >>> into the stack, so not sure if this is much of an additional risk? If it > >> >>> does lead to trivial crashes, we should probably harden the stack > >> >>> against that? > >> >>> > >> >>> As for the random number, Jesper and I discussed replacing this with the > >> >>> same BTF-ID scheme that he was using in his patch series. I.e., instead > >> >>> of just putting in a random number, we insert the BTF ID of the metadata > >> >>> struct at the end of it. This will allow us to support multiple > >> >>> different formats in the future (not just changing the layout, but > >> >>> having multiple simultaneous formats in the same kernel image), in case > >> >>> we run out of space. > >> >> > >> >> This seems a bit hypothetical. How much headroom does it usually have for the > >> >> xdp prog? Potentially the hints can use all the remaining space left after the > >> >> header encap and the current bpf_xdp_adjust_meta() usage? > >> >> > >> >>> > >> >>> We should probably also have a flag set on the xdp_frame so the stack > >> >>> knows that the metadata area contains relevant-to-skb data, to guard > >> >>> against an XDP program accidentally hitting the "magic number" (BTF_ID) > >> >>> in unrelated stuff it puts into the metadata area. > >> >> > >> >> Yeah, I think having a flag is useful. The flag will be set at xdp_buff and > >> >> then transfer to the xdp_frame? > >> >> > >> >>> > >> >>>> After re-reading patch 6, have another question. The 'void > >> >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at > >> >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' > >> >>>> pointer and the xdp prog can directly read (or even write) it? > >> >>> > >> >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one > >> >>> or more fields just means that those fields will not be populated? We > >> >>> should probably have a flags field inside the metadata struct itself to > >> >>> indicate which fields are set or not, but I'm not sure returning an > >> >>> error value adds anything? Returning a pointer to the metadata field > >> >>> might be convenient for users (it would just be an alias to the > >> >>> data_meta pointer, but the verifier could know its size, so the program > >> >>> doesn't have to bounds check it). > >> >> > >> >> If some hints are not available, those hints should be initialized to > >> >> 0/CHECKSUM_NONE/...etc. The xdp prog needs a direct way to tell hard failure > >> >> when it cannot write the meta area because of not enough space. Comparing > >> >> xdp->data_meta with xdp->data as a side effect is not intuitive. > >> >> > >> >> It is more than saving the bound check. With type info of 'struct > >> >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the > >> >> middle of an integer member. The verifier could also limit write access only to > >> >> a few struct's members if it is needed. > >> >> > >> >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the > >> >> xdp->data_meta. They should actually point to different locations in the > >> >> headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. > >> >> xdp->data_meta won't be changed and keeps pointing to the last > >> >> bpf_xdp_adjust_meta() location. The kernel will know if there is > >> >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the > >> >> xdp_{buff,frame}. Would it work? > >> >> > >> >>> > >> >>>> A related question, why 'struct xdp_to_skb_metadata' needs > >> >>>> __randomize_layout? > >> >>> > >> >>> The __randomize_layout thing is there to force BPF programs to use CO-RE > >> >>> to access the field. This is to avoid the struct layout accidentally > >> >>> ossifying because people in practice rely on a particular layout, even > >> >>> though we tell them to use CO-RE. There are lots of examples of this > >> >>> happening in other domains (IP header options, TCP options, etc), and > >> >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :) > >> >> > >> >> I am not sure if it is necessary or helpful to only enforce __randomize_layout > >> >> in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and > >> >> non tracing) that already have direct access (reading and/or writing) to other > >> >> kernel structures. > >> >> > >> >> It is more important for the verifier to see the xdp prog accessing it as a > >> >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so > >> >> that the verifier can enforce the rules of access. > >> >> > >> >>> > >> >>>>>>> Does xdp_to_skb_metadata have a use case for XDP_PASS (like patch 7) or the > >> >>>>>>> xdp_to_skb_metadata can be limited to XDP_REDIRECT only? > >> >>>>>> > >> >>>>>> XDP_PASS cases where we convert xdp_buff into skb in the drivers right > >> >>>>>> now usually have C code to manually pull out the metadata (out of hw > >> >>>>>> desc) and put it into skb. > >> >>>>>> > >> >>>>>> So, currently, if we're calling bpf_xdp_metadata_export_to_skb() for > >> >>>>>> XDP_PASS, we're doing a double amount of work: > >> >>>>>> skb_metadata_import_from_xdp first, then custom driver code second. > >> >>>>>> > >> >>>>>> In theory, maybe we should completely skip drivers custom parsing when > >> >>>>>> there is a prog with BPF_F_XDP_HAS_METADATA? > >> >>>>>> Then both xdp->skb paths (XDP_PASS+XDP_REDIRECT) will be bpf-driven > >> >>>>>> and won't require any mental work (plus, the drivers won't have to > >> >>>>>> care either in the future). > >> >>>>>> > WDYT? > >> >>>>> > >> >>>>> > >> >>>>> Yeah, not sure if it can solely depend on BPF_F_XDP_HAS_METADATA but it makes > >> >>>>> sense to only use the hints (if ever written) from xdp prog especially if it > >> >>>>> will eventually support xdp prog changing some of the hints in the future. For > >> >>>>> now, I think either way is fine since they are the same and the xdp prog is sort > >> >>>>> of doing extra unnecessary work anyway by calling > >> >>>>> bpf_xdp_metadata_export_to_skb() with XDP_PASS and knowing nothing can be > >> >>>>> changed now. > >> >>> > >> >>> I agree it would be best if the drivers also use the XDP metadata (if > >> >>> present) on XDP_PASS. Longer term my hope is we can make the XDP > >> >>> metadata support the only thing drivers need to implement (i.e., have > >> >>> the stack call into that code even when no XDP program is loaded), but > >> >>> for now just for consistency (and allowing the XDP program to update the > >> >>> metadata), we should probably at least consume it on XDP_PASS. > >> >>> > >> >>> -Toke > >> >>> > >> > > >> > Not to derail the discussion (left the last message intact on top, > >> > feel free to continue), but to summarize. The proposed changes seem to > >> > be: > >> > > >> > 1. bpf_xdp_metadata_export_to_skb() should return pointer to "struct > >> > xdp_to_skb_metadata" > >> > - This should let bpf programs change the metadata passed to the skb > >> > > >> > 2. "struct xdp_to_skb_metadata" should have its btf_id as the first > >> > __u32 member (and remove the magic) > >> > - This is for the redirect case where the end users, including > >> > AF_XDP, can parse this metadata from btf_id > >> > >> I think Toke's idea is to put the btf_id at the end of xdp_to_skb_metadata. I > >> can see why the end is needed for the userspace AF_XDP because, afaict, AF_XDP > >> rx_desc currently cannot tell if there is metadata written by the xdp prog or > >> not. However, if the 'has_skb_metadata' bit can also be passed to the AF_XDP > >> rx_desc->options, the btf_id may as well be not needed now. However, the btf_id > >> and other future new members can be added to the xdp_to_skb_metadata later if > >> there is a need. > >> > >> For the kernel and xdp prog, a bit in the xdp->flags should be enough to get to > >> the xdp_to_skb_metadata. The xdp prog will use CO-RE to access the members in > >> xdp_to_skb_metadata. > > > > Ack, good points on putting it at the end. > > Regarding bit in desc->options vs btf_id: since it seems that btf_id > > is useful anyway, let's start with that? We can add a bit later on if > > it turns out using metadata is problematic otherwise. > > I think the bit is mostly useful so that the stack can know that the > metadata has been set before consuming it (to guard against regular > xdp_metadata usage accidentally hitting the "right" BTF ID). I don't > think it needs to be exposed to the XDP programs themselves. SG! A flag for xdp_buff/frame and a kfunc to query it for bpf. > >> > - This, however, is not all the metadata that the device can > >> > support, but a much narrower set that the kernel is expected to use > >> > for skb construction > >> > > >> > 3. __randomize_layout isn't really helping, CO-RE will trigger > >> > regardless; maybe only the case where it matters is probably AF_XDP, > >> > so still useful? > > Yeah, see my response to Martin, I think the randomisation is useful for > AF_XDP transfer. SG. Let's keep it for now. Worst case, if it hurts, we can remove it later... > >> > 4. The presence of the metadata generated by > >> > bpf_xdp_metadata_export_to_skb should be indicated by a flag in > >> > xdp_{buff,frame}->flags > >> > - Assuming exposing it via xdp_md->has_skb_metadata is ok? > >> > >> probably __bpf_md_ptr(struct xdp_to_skb_metadata *, skb_metadata) and the type > >> will be PTR_TO_BTF_ID_OR_NULL. > > > > Oh, that seems even better than returning it from > > bpf_xdp_metadata_export_to_skb. > > bpf_xdp_metadata_export_to_skb can return true/false and the rest goes > > via default verifier ctx resolution mechanism.. > > (returning ptr from a kfunc seems to be a bit complicated right now) > > See my response to John in the other thread about mixing stable UAPI (in > xdp_md) and unstable BTF structures in the xdp_md struct: I think this > is confusing and would prefer a kfunc. SG! > >> > - Since the programs probably need to do the following: > >> > > >> > if (xdp_md->has_skb_metadata) { > >> > access/change skb metadata by doing struct xdp_to_skb_metadata *p > >> > = data_meta; > >> > >> and directly access/change xdp->skb_metadata instead of using xdp->data_meta. > > > > Ack. > > > >> > } else { > >> > use kfuncs > >> > } > >> > > >> > 5. Support the case where we keep program's metadata and kernel's > >> > xdp_to_skb_metadata > >> > - skb_metadata_import_from_xdp() will "consume" it by mem-moving the > >> > rest of the metadata over it and adjusting the headroom > >> > >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's > >> metadata. xdp prog should usually work in this order also: read/write headers, > >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return > >> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the > >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. > >> > >> For the kernel and xdp prog, I don't think it matters where the > >> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to > >> be before xdp->data because of the current data_meta and data comparison usage > >> in the xdp prog. > >> > >> The order of the kernel's xdp_to_skb_metadata and the program's metadata > >> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP > >> supports the program's metadata now. afaict, it can only work now if there is > >> some sort of contract between them or the AF_XDP currently does not use the > >> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it > >> should be a no op if there is no program's metadata? This behavior could also > >> be configurable through setsockopt? > > > > Agreed on all of the above. For now it seems like the safest thing to > > do is to put xdp_to_skb_metadata last to allow af_xdp to properly > > locate btf_id. > > Let's see if Toke disagrees :-) > > As I replied to Martin, I'm not sure it's worth the complexity to > logically split the SKB metadata from the program's own metadata (as > opposed to just reusing the existing data_meta pointer)? I'd gladly keep my current requirement where it's either or, but not both :-) We can relax it later if required? > However, if we do, the layout that makes most sense to me is putting the > skb metadata before the program metadata, like: > > -------------- > | skb_metadata > -------------- > | data_meta > -------------- > | data > -------------- > > Not sure if that's what you meant? :) I was suggesting the other way around: |custom meta|skb_metadata|data| (but, as Martin points out, consuming skb_metadata in the kernel becomes messier) af_xdp can check whether skb_metdata is present by looking at data - offsetof(struct skb_metadata, btf_id). progs that know how to handle custom metadata, will look at data - sizeof(skb_metadata) Otherwise, if it's the other way around, how do we find skb_metadata in a redirected frame? Let's say we have |skb_metadata|custom meta|data|, how does the final program find skb_metadata? All the progs have to agree on the sizeof(tc/custom meta), right?
On 11/10/22 6:19 AM, Toke Høiland-Jørgensen wrote: > Martin KaFai Lau <martin.lau@linux.dev> writes: > >> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: >>> Snipping a bit of context to reply to this bit: >>> >>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number >>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after >>>>>>> calling bpf_xdp_metadata_export_to_skb(). >>>>>> >>>>>> What do you think the usecase here might be? Or are you suggesting we >>>>>> reject further access to data_meta after >>>>>> bpf_xdp_metadata_export_to_skb somehow? >>>>>> >>>>>> If we want to let the programs override some of this >>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >>>>>> more kfuncs instead of exposing the layout? >>>>>> >>>>>> bpf_xdp_metadata_export_to_skb(ctx); >>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); >>> >>> There are several use cases for needing to access the metadata after >>> calling bpf_xdp_metdata_export_to_skb(): >>> >>> - Accessing the metadata after redirect (in a cpumap or devmap program, >>> or on a veth device) >>> - Transferring the packet+metadata to AF_XDP >> fwiw, the xdp prog could also be more selective and only stores one of the hints >> instead of the whole 'struct xdp_to_skb_metadata'. > > Yup, absolutely! In that sense, reusing the SKB format is mostly a > convenience feature. However, lots of people consume AF_XDP through the > default program installed by libxdp in the XSK setup code, and including > custom metadata there is awkward. So having the metadata consumed by the > stack as the "default subset" would enable easy consumption by > non-advanced users, while advanced users can still do custom stuff by > writing their own XDP program that calls the kfuncs. > >>> - Returning XDP_PASS, but accessing some of the metadata first (whether >>> to read or change it) >>> >>> The last one could be solved by calling additional kfuncs, but that >>> would be less efficient than just directly editing the struct which >>> will be cache-hot after the helper returns. >> >> Yeah, it is more efficient to directly write if possible. I think this set >> allows the direct reading and writing already through data_meta (as a _u8 *). > > Yup, totally fine with just keeping that capability :) > >>> And yeah, this will allow the XDP program to inject arbitrary metadata >>> into the netstack; but it can already inject arbitrary *packet* data >>> into the stack, so not sure if this is much of an additional risk? If it >>> does lead to trivial crashes, we should probably harden the stack >>> against that? >>> >>> As for the random number, Jesper and I discussed replacing this with the >>> same BTF-ID scheme that he was using in his patch series. I.e., instead >>> of just putting in a random number, we insert the BTF ID of the metadata >>> struct at the end of it. This will allow us to support multiple >>> different formats in the future (not just changing the layout, but >>> having multiple simultaneous formats in the same kernel image), in case >>> we run out of space. >> >> This seems a bit hypothetical. How much headroom does it usually have for the >> xdp prog? Potentially the hints can use all the remaining space left after the >> header encap and the current bpf_xdp_adjust_meta() usage? > > For the metadata consumed by the stack right now it's a bit > hypothetical, yeah. However, there's a bunch of metadata commonly > supported by hardware that the stack currently doesn't consume and that > hopefully this feature will end up making more accessible. My hope is > that the stack can also learn how to use this in the future, in which > case we may run out of space. So I think of that bit mostly as > future-proofing... ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later if it is indeed needed? The 'struct xdp_to_skb_metadata' is not in UAPI and doing it with CO-RE is to give us flexibility to make this kind of changes in the future. > >>> We should probably also have a flag set on the xdp_frame so the stack >>> knows that the metadata area contains relevant-to-skb data, to guard >>> against an XDP program accidentally hitting the "magic number" (BTF_ID) >>> in unrelated stuff it puts into the metadata area. >> >> Yeah, I think having a flag is useful. The flag will be set at xdp_buff and >> then transfer to the xdp_frame? > > Yeah, exactly! > >>>> After re-reading patch 6, have another question. The 'void >>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at >>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' >>>> pointer and the xdp prog can directly read (or even write) it? >>> >>> Hmm, I'm not sure returning a failure makes sense? Failure to read one >>> or more fields just means that those fields will not be populated? We >>> should probably have a flags field inside the metadata struct itself to >>> indicate which fields are set or not, but I'm not sure returning an >>> error value adds anything? Returning a pointer to the metadata field >>> might be convenient for users (it would just be an alias to the >>> data_meta pointer, but the verifier could know its size, so the program >>> doesn't have to bounds check it). >> >> If some hints are not available, those hints should be initialized to >> 0/CHECKSUM_NONE/...etc. > > The problem with that is that then we have to spend cycles writing > eight bytes of zeroes into the checksum field :) With a common 'struct xdp_to_skb_metadata', I am not sure how some of these zero writes can be avoided. If the xdp prog wants to optimize, it can call individual kfunc to get individual hints. > >> The xdp prog needs a direct way to tell hard failure when it cannot >> write the meta area because of not enough space. Comparing >> xdp->data_meta with xdp->data as a side effect is not intuitive. > > Yeah, hence a flags field so we can just see if setting each field > succeeded? How testing a flag is different from checking 0/invalid-value of a field? or some fields just don't have an invalid value to check for like vlan_tci? You meant a flags field as a return value or in the 'struct xdp_to_skb_metadata' ? > >> It is more than saving the bound check. With type info of 'struct >> xdp_to_skb_metadata *', the verifier can do more checks like reading in the >> middle of an integer member. The verifier could also limit write access only to >> a few struct's members if it is needed. >> >> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the >> xdp->data_meta. They should actually point to different locations in the >> headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. >> xdp->data_meta won't be changed and keeps pointing to the last >> bpf_xdp_adjust_meta() location. The kernel will know if there is >> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the >> xdp_{buff,frame}. Would it work? > > Hmm, logically splitting the program metadata and the xdp_hints metadata > (but having them share the same area) *could* work, I guess, I'm just > not sure it's worth the extra complexity? It shouldn't stop the existing xdp prog writing its own metadata from using the the new bpf_xdp_metadata_export_to_skb(). > >>>> A related question, why 'struct xdp_to_skb_metadata' needs >>>> __randomize_layout? >>> >>> The __randomize_layout thing is there to force BPF programs to use CO-RE >>> to access the field. This is to avoid the struct layout accidentally >>> ossifying because people in practice rely on a particular layout, even >>> though we tell them to use CO-RE. There are lots of examples of this >>> happening in other domains (IP header options, TCP options, etc), and >>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :) >> >> I am not sure if it is necessary or helpful to only enforce __randomize_layout >> in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and >> non tracing) that already have direct access (reading and/or writing) to other >> kernel structures. >> >> It is more important for the verifier to see the xdp prog accessing it as a >> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so >> that the verifier can enforce the rules of access. > > That only works inside the kernel, though. Since the metadata field can > be copied wholesale to AF_XDP, having it randomized forces userspace > consumers to also write code to deal with the layout being dynamic... hm... I still don't see how useful it is, in particular you mentioned the libxdp will install a xdp prog to write this default format (xdp_to_skb_metadata) and likely libxdp will also provide some helpers to parse the xdp_to_skb_metadata and the libxdp user should not need to know if CO-RE is used or not. Considering it is a kernel internal struct, I think it is fine to keep it and can be revisited later if needed. Lets get on to other things first :)
On 11/9/22 11:44 PM, Stanislav Fomichev wrote: >>> @@ -423,14 +425,25 @@ XDP_METADATA_KFUNC_xxx >>> MAX_XDP_METADATA_KFUNC, >>> }; >>> >>> +struct xdp_to_skb_metadata { >>> + u32 magic; /* xdp_metadata_magic */ >>> + u64 rx_timestamp; >> >> Slightly confused. I thought/think most drivers populate the skb timestamp >> if they can already? So why do we need to bounce these through some xdp >> metadata? Won't all this cost more than the load/store directly from the >> descriptor into the skb? Even if drivers are not populating skb now >> shouldn't an ethtool knob be enough to turn this on? > > dsahern@ pointed out that it might be useful for the program to be > able to override some of that metadata. Examples that come to mind from previous work: 1. changing vlans on a redirect: Rx on vlan 1 with h/w popping the vlan so it is provided in metadata. Then on a redirect it shifts to vlan 2 for Tx. But this example use case assumes Tx accesses the metadata to ask h/w to insert the header. 2. popping or updating an encap header and wanting to update the checksum 3. changing the h/w provided hash to affect steering of a subsequent skb
John Fastabend <john.fastabend@gmail.com> writes: > Toke Høiland-Jørgensen wrote: >> John Fastabend <john.fastabend@gmail.com> writes: >> >> > Toke Høiland-Jørgensen wrote: >> >> Snipping a bit of context to reply to this bit: >> >> >> >> >>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >> >> >>>> sure it is solid enough by asking the xdp prog not to use the same random number >> >> >>>> in its own metadata + not to change the metadata through xdp->data_meta after >> >> >>>> calling bpf_xdp_metadata_export_to_skb(). >> >> >>> >> >> >>> What do you think the usecase here might be? Or are you suggesting we >> >> >>> reject further access to data_meta after >> >> >>> bpf_xdp_metadata_export_to_skb somehow? >> >> >>> >> >> >>> If we want to let the programs override some of this >> >> >>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >> >> >>> more kfuncs instead of exposing the layout? >> >> >>> >> >> >>> bpf_xdp_metadata_export_to_skb(ctx); >> >> >>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); >> >> >> > >> > Hi Toke, >> > >> > Trying not to bifurcate your thread. Can I start a new one here to >> > elaborate on these use cases. I'm still a bit lost on any use case >> > for this that makes sense to actually deploy on a network. >> > >> >> There are several use cases for needing to access the metadata after >> >> calling bpf_xdp_metdata_export_to_skb(): >> >> >> >> - Accessing the metadata after redirect (in a cpumap or devmap program, >> >> or on a veth device) >> > >> > I think for devmap there are still lots of opens how/where the skb >> > is even built. >> >> For veth it's pretty clear; i.e., when redirecting into containers. > > Ah but I think XDP on veth is a bit questionable in general. The use > case is NFV I guess but its not how I would build out NFV. I've never > seen it actually deployed other than in CI. Anyways not necessary to > drop into that debate here. It exists so OK. > >> >> > For cpumap I'm a bit unsure what the use case is. For ice, mlx and >> > such you should use the hardware RSS if performance is top of mind. >> >> Hardware RSS works fine if your hardware supports the hashing you want; >> many do not. As an example, Jesper wrote this application that uses >> cpumap to divide out ISP customer traffic among different CPUs (solving >> an HTB scaling problem): >> >> https://github.com/xdp-project/xdp-cpumap-tc > > I'm going to argue hw should be able to do this still and we > should fix the hw but maybe not easily doable without convincing > hardware folks to talk to us. Sure, in the ideal world the hardware should just be able to do this. Unfortunately, we don't live in that ideal world :) > Also not obvious tto me how linked code works without more studying, > its ingress HTB? So you would push the rxhash and timestamp into > cpumap and then build the skb here with the correct skb->timestamp? No, the HTB tree is on egress. The use case is an ISP middlebox that shapes (say) 1000 customers to their subscribed rate, using a big HTB tree. If you just do this with a single HTB instance on the egress NIC you run into the global qdisc lock and you can't scale beyond a pretty measly bandwidth. Whereas if you use multiple HW TXQs and the mq qdisc, you can partition the HTB tree so you only have a subset of customers on each HWQ/HTB instance. But for this to work, and still guarantee each customer gets shaped to the right rate, you need to ensure that all that customer's traffic hits the same HWQ. The xdp-cpumap-tc tool does this by configuring the TXQs to correspond to individual CPUs, and then runs an XDP program that matches traffic to customers and redirects them to the right CPU (using an LPM map). This solution runs in production in quite a number of smallish WISPs, BTW, with quite nice results. The software to set it all up is also open sourced: https://libreqos.io/ Coming back to HW metadata, the LibreQoS system could benefit from the hardware flow hash in particular, since that would save a hash operation when enqueueing the packet into sch_cake. > OK even if I can't exactly find the use case for cpumap if I had > a use case I can see how passing metadata through is useful. Great! >> > And then for specific devices on cpumap (maybe realtime or ptp >> > things?) could we just throw it through the xdp_frame? >> >> Not sure what you mean here? Throw what through the xdp_frame? > > Doesn't matter reread patches and figured it out I was slightly > confused. Right, OK. >> >> >> - Transferring the packet+metadata to AF_XDP >> > >> > In this case we have the metadata and AF_XDP program and XDP program >> > simply need to agree on metadata format. No need to have some magic >> > numbers and driver specific kfuncs. >> >> See my other reply to Martin: Yeah, for AF_XDP users that write their >> own kernel XDP programs, they can just do whatever they want. But many >> users just rely on the default program in libxdp, so having a standard >> format to include with that is useful. >> > > I don't think your AF_XDP program is any different than other AF_XDP > programs. Your lib can create a standard format if it wants but > kernel doesn't need to enforce it anyway. Yeah, we totally could. But since we're defining a "standard" format for kernel (skb) consumption anyway, making this available to AF_XDP is kinda convenient so we don't have to :) >> >> - Returning XDP_PASS, but accessing some of the metadata first (whether >> >> to read or change it) >> >> >> > >> > I don't get this case? XDP_PASS should go to stack normally through >> > drivers build_skb routines. These will populate timestamp normally. >> > My guess is simply descriptor->skb load/store is cheaper than carrying >> > around this metadata and doing the call in BPF side. Anyways you >> > just built an entire skb and hit the stack I don't think you will >> > notice this noise in any benchmark. >> >> If you modify the packet before calling XDP_PASS you may want to update >> the metadata as well (for instance the RX hash, or in the future the >> metadata could also carry transport header offsets). > > OK. So when you modify the pkt fixing up the rxhash makes sense. Thanks > for the response I can see the argument. Great! You're welcome :) -Toke
Skipping to the last bit: >> >> > } else { >> >> > use kfuncs >> >> > } >> >> > >> >> > 5. Support the case where we keep program's metadata and kernel's >> >> > xdp_to_skb_metadata >> >> > - skb_metadata_import_from_xdp() will "consume" it by mem-moving the >> >> > rest of the metadata over it and adjusting the headroom >> >> >> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's >> >> metadata. xdp prog should usually work in this order also: read/write headers, >> >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return >> >> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the >> >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. >> >> >> >> For the kernel and xdp prog, I don't think it matters where the >> >> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to >> >> be before xdp->data because of the current data_meta and data comparison usage >> >> in the xdp prog. >> >> >> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata >> >> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP >> >> supports the program's metadata now. afaict, it can only work now if there is >> >> some sort of contract between them or the AF_XDP currently does not use the >> >> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it >> >> should be a no op if there is no program's metadata? This behavior could also >> >> be configurable through setsockopt? >> > >> > Agreed on all of the above. For now it seems like the safest thing to >> > do is to put xdp_to_skb_metadata last to allow af_xdp to properly >> > locate btf_id. >> > Let's see if Toke disagrees :-) >> >> As I replied to Martin, I'm not sure it's worth the complexity to >> logically split the SKB metadata from the program's own metadata (as >> opposed to just reusing the existing data_meta pointer)? > > I'd gladly keep my current requirement where it's either or, but not both :-) > We can relax it later if required? So the way I've been thinking about it is simply that the skb_metadata would live in the same place at the data_meta pointer (including adjusting that pointer to accommodate it), and just overriding the existing program metadata, if any exists. But looking at it now, I guess having the split makes it easier for a program to write its own custom metadata and still use the skb metadata. See below about the ordering. >> However, if we do, the layout that makes most sense to me is putting the >> skb metadata before the program metadata, like: >> >> -------------- >> | skb_metadata >> -------------- >> | data_meta >> -------------- >> | data >> -------------- >> >> Not sure if that's what you meant? :) > > I was suggesting the other way around: |custom meta|skb_metadata|data| > (but, as Martin points out, consuming skb_metadata in the kernel > becomes messier) > > af_xdp can check whether skb_metdata is present by looking at data - > offsetof(struct skb_metadata, btf_id). > progs that know how to handle custom metadata, will look at data - > sizeof(skb_metadata) > > Otherwise, if it's the other way around, how do we find skb_metadata > in a redirected frame? > Let's say we have |skb_metadata|custom meta|data|, how does the final > program find skb_metadata? > All the progs have to agree on the sizeof(tc/custom meta), right? Erm, maybe I'm missing something here, but skb_metadata is fixed size, right? So if the "skb_metadata is present" flag is set, we know that the sizeof(skb_metadata) bytes before the data_meta pointer contains the metadata, and if the flag is not set, we know those bytes are not valid metadata. For AF_XDP, we'd need to transfer the flag as well, and it could apply the same logic (getting the size from the vmlinux BTF). By this logic, the BTF_ID should be the *first* entry of struct skb_metadata, since that will be the field AF_XDP programs can find right off the bat, no? -Toke
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 11/10/22 6:19 AM, Toke Høiland-Jørgensen wrote: >> Martin KaFai Lau <martin.lau@linux.dev> writes: >> >>> On 11/9/22 3:10 AM, Toke Høiland-Jørgensen wrote: >>>> Snipping a bit of context to reply to this bit: >>>> >>>>>>>> Can the xdp prog still change the metadata through xdp->data_meta? tbh, I am not >>>>>>>> sure it is solid enough by asking the xdp prog not to use the same random number >>>>>>>> in its own metadata + not to change the metadata through xdp->data_meta after >>>>>>>> calling bpf_xdp_metadata_export_to_skb(). >>>>>>> >>>>>>> What do you think the usecase here might be? Or are you suggesting we >>>>>>> reject further access to data_meta after >>>>>>> bpf_xdp_metadata_export_to_skb somehow? >>>>>>> >>>>>>> If we want to let the programs override some of this >>>>>>> bpf_xdp_metadata_export_to_skb() metadata, it feels like we can add >>>>>>> more kfuncs instead of exposing the layout? >>>>>>> >>>>>>> bpf_xdp_metadata_export_to_skb(ctx); >>>>>>> bpf_xdp_metadata_export_skb_hash(ctx, 1234); >>>> >>>> There are several use cases for needing to access the metadata after >>>> calling bpf_xdp_metdata_export_to_skb(): >>>> >>>> - Accessing the metadata after redirect (in a cpumap or devmap program, >>>> or on a veth device) >>>> - Transferring the packet+metadata to AF_XDP >>> fwiw, the xdp prog could also be more selective and only stores one of the hints >>> instead of the whole 'struct xdp_to_skb_metadata'. >> >> Yup, absolutely! In that sense, reusing the SKB format is mostly a >> convenience feature. However, lots of people consume AF_XDP through the >> default program installed by libxdp in the XSK setup code, and including >> custom metadata there is awkward. So having the metadata consumed by the >> stack as the "default subset" would enable easy consumption by >> non-advanced users, while advanced users can still do custom stuff by >> writing their own XDP program that calls the kfuncs. >> >>>> - Returning XDP_PASS, but accessing some of the metadata first (whether >>>> to read or change it) >>>> >>>> The last one could be solved by calling additional kfuncs, but that >>>> would be less efficient than just directly editing the struct which >>>> will be cache-hot after the helper returns. >>> >>> Yeah, it is more efficient to directly write if possible. I think this set >>> allows the direct reading and writing already through data_meta (as a _u8 *). >> >> Yup, totally fine with just keeping that capability :) >> >>>> And yeah, this will allow the XDP program to inject arbitrary metadata >>>> into the netstack; but it can already inject arbitrary *packet* data >>>> into the stack, so not sure if this is much of an additional risk? If it >>>> does lead to trivial crashes, we should probably harden the stack >>>> against that? >>>> >>>> As for the random number, Jesper and I discussed replacing this with the >>>> same BTF-ID scheme that he was using in his patch series. I.e., instead >>>> of just putting in a random number, we insert the BTF ID of the metadata >>>> struct at the end of it. This will allow us to support multiple >>>> different formats in the future (not just changing the layout, but >>>> having multiple simultaneous formats in the same kernel image), in case >>>> we run out of space. >>> >>> This seems a bit hypothetical. How much headroom does it usually have for the >>> xdp prog? Potentially the hints can use all the remaining space left after the >>> header encap and the current bpf_xdp_adjust_meta() usage? >> >> For the metadata consumed by the stack right now it's a bit >> hypothetical, yeah. However, there's a bunch of metadata commonly >> supported by hardware that the stack currently doesn't consume and that >> hopefully this feature will end up making more accessible. My hope is >> that the stack can also learn how to use this in the future, in which >> case we may run out of space. So I think of that bit mostly as >> future-proofing... > > ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later > if it is indeed needed? The 'struct xdp_to_skb_metadata' is not in UAPI and > doing it with CO-RE is to give us flexibility to make this kind of changes in > the future. My worry is mostly that it'll be more painful to add it later than just including it from the start, mostly because of AF_XDP users. But if we do the randomisation thing (thus forcing AF_XDP users to deal with the dynamic layout as well), it should be possible to add it later, and I can live with that option as well... >>>> We should probably also have a flag set on the xdp_frame so the stack >>>> knows that the metadata area contains relevant-to-skb data, to guard >>>> against an XDP program accidentally hitting the "magic number" (BTF_ID) >>>> in unrelated stuff it puts into the metadata area. >>> >>> Yeah, I think having a flag is useful. The flag will be set at xdp_buff and >>> then transfer to the xdp_frame? >> >> Yeah, exactly! >> >>>>> After re-reading patch 6, have another question. The 'void >>>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at >>>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' >>>>> pointer and the xdp prog can directly read (or even write) it? >>>> >>>> Hmm, I'm not sure returning a failure makes sense? Failure to read one >>>> or more fields just means that those fields will not be populated? We >>>> should probably have a flags field inside the metadata struct itself to >>>> indicate which fields are set or not, but I'm not sure returning an >>>> error value adds anything? Returning a pointer to the metadata field >>>> might be convenient for users (it would just be an alias to the >>>> data_meta pointer, but the verifier could know its size, so the program >>>> doesn't have to bounds check it). >>> >>> If some hints are not available, those hints should be initialized to >>> 0/CHECKSUM_NONE/...etc. >> >> The problem with that is that then we have to spend cycles writing >> eight bytes of zeroes into the checksum field :) > > With a common 'struct xdp_to_skb_metadata', I am not sure how some of these zero > writes can be avoided. If the xdp prog wants to optimize, it can call > individual kfunc to get individual hints. Erm, we just... don't write those fields? Something like: void write_skb_meta(hw, ctx) { struct xdp_skb_metadata meta = ctx->data_meta - sizeof(struct xdp_skb_metadata); meta->valid_fields = 0; if (hw_has_timestamp) { meta->timestamp = hw->timestamp; meta->valid_fields |= FIELD_TIMESTAMP; } /* otherwise meta->timestamp is just uninitialised */ if (hw_has_rxhash) { meta->rxhash = hw->rxhash; meta->valid_fields |= FIELD_RXHASH; } /* otherwise meta->rxhash is just uninitialised */ ...etc... } >>> The xdp prog needs a direct way to tell hard failure when it cannot >>> write the meta area because of not enough space. Comparing >>> xdp->data_meta with xdp->data as a side effect is not intuitive. >> >> Yeah, hence a flags field so we can just see if setting each field >> succeeded? > > How testing a flag is different from checking 0/invalid-value of a > field? The flags field is smaller, so we write fewer bytes if not all fields are populated. > or some fields just don't have an invalid value to check for > like vlan_tci? Yeah, that could also be an issue. > You meant a flags field as a return value or in the 'struct xdp_to_skb_metadata' ? See above. >> >>> It is more than saving the bound check. With type info of 'struct >>> xdp_to_skb_metadata *', the verifier can do more checks like reading in the >>> middle of an integer member. The verifier could also limit write access only to >>> a few struct's members if it is needed. >>> >>> The returning 'struct xdp_to_skb_metadata *' should not be an alias to the >>> xdp->data_meta. They should actually point to different locations in the >>> headroom. bpf_xdp_metadata_export_to_skb() sets a flag in xdp_buff. >>> xdp->data_meta won't be changed and keeps pointing to the last >>> bpf_xdp_adjust_meta() location. The kernel will know if there is >>> xdp_to_skb_metadata before the xdp->data_meta when that bit is set in the >>> xdp_{buff,frame}. Would it work? >> >> Hmm, logically splitting the program metadata and the xdp_hints metadata >> (but having them share the same area) *could* work, I guess, I'm just >> not sure it's worth the extra complexity? > > It shouldn't stop the existing xdp prog writing its own metadata from using the > the new bpf_xdp_metadata_export_to_skb(). Right, I think I see what you mean, and I agree that splitting it internally in the kernel does make sense (see my other reply to Stanislav). >> >>>>> A related question, why 'struct xdp_to_skb_metadata' needs >>>>> __randomize_layout? >>>> >>>> The __randomize_layout thing is there to force BPF programs to use CO-RE >>>> to access the field. This is to avoid the struct layout accidentally >>>> ossifying because people in practice rely on a particular layout, even >>>> though we tell them to use CO-RE. There are lots of examples of this >>>> happening in other domains (IP header options, TCP options, etc), and >>>> __randomize_layout seemed like a neat trick to enforce CO-RE usage :) >>> >>> I am not sure if it is necessary or helpful to only enforce __randomize_layout >>> in 'struct xdp_to_skb_metadata'. There are other CO-RE use cases (tracing and >>> non tracing) that already have direct access (reading and/or writing) to other >>> kernel structures. >>> >>> It is more important for the verifier to see the xdp prog accessing it as a >>> 'struct xdp_to_skb_metadata *' instead of xdp->data_meta which is a __u8 * so >>> that the verifier can enforce the rules of access. >> >> That only works inside the kernel, though. Since the metadata field can >> be copied wholesale to AF_XDP, having it randomized forces userspace >> consumers to also write code to deal with the layout being dynamic... > > hm... I still don't see how useful it is, in particular you mentioned > the libxdp will install a xdp prog to write this default format > (xdp_to_skb_metadata) and likely libxdp will also provide some helpers > to parse the xdp_to_skb_metadata and the libxdp user should not need > to know if CO-RE is used or not. Considering it is a kernel internal > struct, I think it is fine to keep it and can be revisited later if > needed. Lets get on to other things first :) Well, if it was just kernel-internal, sure. But we're also exporting it to userspace (through AF_XDP). Well-behaved users will obviously do the right thing and use CO-RE. I'm trying to guard against users just blindly copy-pasting the struct definition from the kernel and doing a static cast, then complaining about their code breaking the first time we change the struct layout. Which I sadly expect that there absolutely will be people who do unless we actively make sure that doesn't work from the get-go. And since the randomisation is literally just adding __randomize_layout to the struct definition, I'd rather start out with having it, and removing it later if it turns out not to be needed... :) -Toke
On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > Skipping to the last bit: > > >> >> > } else { > >> >> > use kfuncs > >> >> > } > >> >> > > >> >> > 5. Support the case where we keep program's metadata and kernel's > >> >> > xdp_to_skb_metadata > >> >> > - skb_metadata_import_from_xdp() will "consume" it by mem-moving the > >> >> > rest of the metadata over it and adjusting the headroom > >> >> > >> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's > >> >> metadata. xdp prog should usually work in this order also: read/write headers, > >> >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return > >> >> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the > >> >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. > >> >> > >> >> For the kernel and xdp prog, I don't think it matters where the > >> >> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to > >> >> be before xdp->data because of the current data_meta and data comparison usage > >> >> in the xdp prog. > >> >> > >> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata > >> >> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP > >> >> supports the program's metadata now. afaict, it can only work now if there is > >> >> some sort of contract between them or the AF_XDP currently does not use the > >> >> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it > >> >> should be a no op if there is no program's metadata? This behavior could also > >> >> be configurable through setsockopt? > >> > > >> > Agreed on all of the above. For now it seems like the safest thing to > >> > do is to put xdp_to_skb_metadata last to allow af_xdp to properly > >> > locate btf_id. > >> > Let's see if Toke disagrees :-) > >> > >> As I replied to Martin, I'm not sure it's worth the complexity to > >> logically split the SKB metadata from the program's own metadata (as > >> opposed to just reusing the existing data_meta pointer)? > > > > I'd gladly keep my current requirement where it's either or, but not both :-) > > We can relax it later if required? > > So the way I've been thinking about it is simply that the skb_metadata > would live in the same place at the data_meta pointer (including > adjusting that pointer to accommodate it), and just overriding the > existing program metadata, if any exists. But looking at it now, I guess > having the split makes it easier for a program to write its own custom > metadata and still use the skb metadata. See below about the ordering. > > >> However, if we do, the layout that makes most sense to me is putting the > >> skb metadata before the program metadata, like: > >> > >> -------------- > >> | skb_metadata > >> -------------- > >> | data_meta > >> -------------- > >> | data > >> -------------- > >> > >> Not sure if that's what you meant? :) > > > > I was suggesting the other way around: |custom meta|skb_metadata|data| > > (but, as Martin points out, consuming skb_metadata in the kernel > > becomes messier) > > > > af_xdp can check whether skb_metdata is present by looking at data - > > offsetof(struct skb_metadata, btf_id). > > progs that know how to handle custom metadata, will look at data - > > sizeof(skb_metadata) > > > > Otherwise, if it's the other way around, how do we find skb_metadata > > in a redirected frame? > > Let's say we have |skb_metadata|custom meta|data|, how does the final > > program find skb_metadata? > > All the progs have to agree on the sizeof(tc/custom meta), right? > > Erm, maybe I'm missing something here, but skb_metadata is fixed size, > right? So if the "skb_metadata is present" flag is set, we know that the > sizeof(skb_metadata) bytes before the data_meta pointer contains the > metadata, and if the flag is not set, we know those bytes are not valid > metadata. > > For AF_XDP, we'd need to transfer the flag as well, and it could apply > the same logic (getting the size from the vmlinux BTF). > > By this logic, the BTF_ID should be the *first* entry of struct > skb_metadata, since that will be the field AF_XDP programs can find > right off the bat, no? The problem with AF_XDP is that, IIUC, it doesn't have a data_meta pointer in the userspace. You get an rx descriptor where the address points to the 'data': | 256 bytes headroom where metadata can go | data | So you have (at most) 256 bytes of headroom, some of that might be the metadata, but you really don't know where it starts. But you know it definitely ends where the data begins. So if we have the following, we can locate skb_metadata: | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | data - sizeof(skb_metadata) will get you there But if it's the other way around, the program has to know sizeof(custom metadata) to locate skb_metadata: | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | Am I missing something here?
On 11/10/22 10:52 AM, Stanislav Fomichev wrote: >>> Oh, that seems even better than returning it from >>> bpf_xdp_metadata_export_to_skb. >>> bpf_xdp_metadata_export_to_skb can return true/false and the rest goes >>> via default verifier ctx resolution mechanism.. >>> (returning ptr from a kfunc seems to be a bit complicated right now) What is the complication in returning ptr from a kfunc? I want to see if it can be solved because it will be an easier API to use instead of calling another kfunc to get the ptr. >> See my response to John in the other thread about mixing stable UAPI (in >> xdp_md) and unstable BTF structures in the xdp_md struct: I think this >> is confusing and would prefer a kfunc. hmm... right, it will be one of the first considering the current __bpf_md_ptr() usages are only exposing another struct in uapi, eg. __bpf_md_ptr(struct bpf_sock *, sk). A kfunc will also be fine.
Stanislav Fomichev <sdf@google.com> writes: > On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: >> >> Skipping to the last bit: >> >> >> >> > } else { >> >> >> > use kfuncs >> >> >> > } >> >> >> > >> >> >> > 5. Support the case where we keep program's metadata and kernel's >> >> >> > xdp_to_skb_metadata >> >> >> > - skb_metadata_import_from_xdp() will "consume" it by mem-moving the >> >> >> > rest of the metadata over it and adjusting the headroom >> >> >> >> >> >> I was thinking the kernel's xdp_to_skb_metadata is always before the program's >> >> >> metadata. xdp prog should usually work in this order also: read/write headers, >> >> >> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return >> >> >> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the >> >> >> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. >> >> >> >> >> >> For the kernel and xdp prog, I don't think it matters where the >> >> >> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to >> >> >> be before xdp->data because of the current data_meta and data comparison usage >> >> >> in the xdp prog. >> >> >> >> >> >> The order of the kernel's xdp_to_skb_metadata and the program's metadata >> >> >> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP >> >> >> supports the program's metadata now. afaict, it can only work now if there is >> >> >> some sort of contract between them or the AF_XDP currently does not use the >> >> >> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it >> >> >> should be a no op if there is no program's metadata? This behavior could also >> >> >> be configurable through setsockopt? >> >> > >> >> > Agreed on all of the above. For now it seems like the safest thing to >> >> > do is to put xdp_to_skb_metadata last to allow af_xdp to properly >> >> > locate btf_id. >> >> > Let's see if Toke disagrees :-) >> >> >> >> As I replied to Martin, I'm not sure it's worth the complexity to >> >> logically split the SKB metadata from the program's own metadata (as >> >> opposed to just reusing the existing data_meta pointer)? >> > >> > I'd gladly keep my current requirement where it's either or, but not both :-) >> > We can relax it later if required? >> >> So the way I've been thinking about it is simply that the skb_metadata >> would live in the same place at the data_meta pointer (including >> adjusting that pointer to accommodate it), and just overriding the >> existing program metadata, if any exists. But looking at it now, I guess >> having the split makes it easier for a program to write its own custom >> metadata and still use the skb metadata. See below about the ordering. >> >> >> However, if we do, the layout that makes most sense to me is putting the >> >> skb metadata before the program metadata, like: >> >> >> >> -------------- >> >> | skb_metadata >> >> -------------- >> >> | data_meta >> >> -------------- >> >> | data >> >> -------------- >> >> >> >> Not sure if that's what you meant? :) >> > >> > I was suggesting the other way around: |custom meta|skb_metadata|data| >> > (but, as Martin points out, consuming skb_metadata in the kernel >> > becomes messier) >> > >> > af_xdp can check whether skb_metdata is present by looking at data - >> > offsetof(struct skb_metadata, btf_id). >> > progs that know how to handle custom metadata, will look at data - >> > sizeof(skb_metadata) >> > >> > Otherwise, if it's the other way around, how do we find skb_metadata >> > in a redirected frame? >> > Let's say we have |skb_metadata|custom meta|data|, how does the final >> > program find skb_metadata? >> > All the progs have to agree on the sizeof(tc/custom meta), right? >> >> Erm, maybe I'm missing something here, but skb_metadata is fixed size, >> right? So if the "skb_metadata is present" flag is set, we know that the >> sizeof(skb_metadata) bytes before the data_meta pointer contains the >> metadata, and if the flag is not set, we know those bytes are not valid >> metadata. >> >> For AF_XDP, we'd need to transfer the flag as well, and it could apply >> the same logic (getting the size from the vmlinux BTF). >> >> By this logic, the BTF_ID should be the *first* entry of struct >> skb_metadata, since that will be the field AF_XDP programs can find >> right off the bat, no? > > The problem with AF_XDP is that, IIUC, it doesn't have a data_meta > pointer in the userspace. > > You get an rx descriptor where the address points to the 'data': > | 256 bytes headroom where metadata can go | data | Ah, I was missing the bit where the data pointer actually points at data, not the start of the buf. Oops, my bad! > So you have (at most) 256 bytes of headroom, some of that might be the > metadata, but you really don't know where it starts. But you know it > definitely ends where the data begins. > > So if we have the following, we can locate skb_metadata: > | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | > data - sizeof(skb_metadata) will get you there > > But if it's the other way around, the program has to know > sizeof(custom metadata) to locate skb_metadata: > | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | > > Am I missing something here? Hmm, so one could argue that the only way AF_XDP can consume custom metadata today is if it knows out of band what the size of it is. And if it knows that, it can just skip over it to go back to the skb_metadata, no? The only problem left then is if there were multiple XDP programs called in sequence (whether before a redirect, or by libxdp chaining or tail calls), and the first one resized the metadata area without the last one knowing about it. For this, we could add a CLOBBER_PROGRAM_META flag to the skb_metadata helper which if set will ensure that the program metadata length is reset to 0? -Toke
" On Thu, Nov 10, 2022 at 3:58 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 11/10/22 10:52 AM, Stanislav Fomichev wrote: > >>> Oh, that seems even better than returning it from > >>> bpf_xdp_metadata_export_to_skb. > >>> bpf_xdp_metadata_export_to_skb can return true/false and the rest goes > >>> via default verifier ctx resolution mechanism.. > >>> (returning ptr from a kfunc seems to be a bit complicated right now) > > What is the complication in returning ptr from a kfunc? I want to see if it can > be solved because it will be an easier API to use instead of calling another > kfunc to get the ptr. At least for returning a pointer to the basic c types like int/long, I was hitting the following: commit eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Author: Benjamin Tissoires <benjamin.tissoires@redhat.com> AuthorDate: Tue Sep 6 17:13:02 2022 +0200 Commit: Alexei Starovoitov <ast@kernel.org> CommitDate: Wed Sep 7 11:05:17 2022 -0700 bpf/verifier: allow kfunc to return an allocated mem Where I had to add an extra "const int rdonly_buf_size" argument to the kfunc to make the verifier happy. But I haven't really looked at what would happen with the pointers to structs (or why, at all, we have this restriction). > >> See my response to John in the other thread about mixing stable UAPI (in > >> xdp_md) and unstable BTF structures in the xdp_md struct: I think this > >> is confusing and would prefer a kfunc. > > hmm... right, it will be one of the first considering the current __bpf_md_ptr() > usages are only exposing another struct in uapi, eg. __bpf_md_ptr(struct > bpf_sock *, sk). > > A kfunc will also be fine. >
On 11/10/22 3:52 PM, Stanislav Fomichev wrote: > On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: >> >> Skipping to the last bit: >> >>>>>>> } else { >>>>>>> use kfuncs >>>>>>> } >>>>>>> >>>>>>> 5. Support the case where we keep program's metadata and kernel's >>>>>>> xdp_to_skb_metadata >>>>>>> - skb_metadata_import_from_xdp() will "consume" it by mem-moving the >>>>>>> rest of the metadata over it and adjusting the headroom >>>>>> >>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's >>>>>> metadata. xdp prog should usually work in this order also: read/write headers, >>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return >>>>>> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the >>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. >>>>>> >>>>>> For the kernel and xdp prog, I don't think it matters where the >>>>>> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to >>>>>> be before xdp->data because of the current data_meta and data comparison usage >>>>>> in the xdp prog. >>>>>> >>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata >>>>>> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP >>>>>> supports the program's metadata now. afaict, it can only work now if there is >>>>>> some sort of contract between them or the AF_XDP currently does not use the >>>>>> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it >>>>>> should be a no op if there is no program's metadata? This behavior could also >>>>>> be configurable through setsockopt? >>>>> >>>>> Agreed on all of the above. For now it seems like the safest thing to >>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly >>>>> locate btf_id. >>>>> Let's see if Toke disagrees :-) >>>> >>>> As I replied to Martin, I'm not sure it's worth the complexity to >>>> logically split the SKB metadata from the program's own metadata (as >>>> opposed to just reusing the existing data_meta pointer)? >>> >>> I'd gladly keep my current requirement where it's either or, but not both :-) >>> We can relax it later if required? >> >> So the way I've been thinking about it is simply that the skb_metadata >> would live in the same place at the data_meta pointer (including >> adjusting that pointer to accommodate it), and just overriding the >> existing program metadata, if any exists. But looking at it now, I guess >> having the split makes it easier for a program to write its own custom >> metadata and still use the skb metadata. See below about the ordering. >> >>>> However, if we do, the layout that makes most sense to me is putting the >>>> skb metadata before the program metadata, like: >>>> >>>> -------------- >>>> | skb_metadata >>>> -------------- >>>> | data_meta >>>> -------------- >>>> | data >>>> -------------- >>>> Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this: | skb_metadata | custom metadata | data | >>>> Not sure if that's what you meant? :) >>> >>> I was suggesting the other way around: |custom meta|skb_metadata|data| >>> (but, as Martin points out, consuming skb_metadata in the kernel >>> becomes messier) >>> >>> af_xdp can check whether skb_metdata is present by looking at data - >>> offsetof(struct skb_metadata, btf_id). >>> progs that know how to handle custom metadata, will look at data - >>> sizeof(skb_metadata) >>> >>> Otherwise, if it's the other way around, how do we find skb_metadata >>> in a redirected frame? >>> Let's say we have |skb_metadata|custom meta|data|, how does the final >>> program find skb_metadata? >>> All the progs have to agree on the sizeof(tc/custom meta), right? >> >> Erm, maybe I'm missing something here, but skb_metadata is fixed size, >> right? So if the "skb_metadata is present" flag is set, we know that the >> sizeof(skb_metadata) bytes before the data_meta pointer contains the >> metadata, and if the flag is not set, we know those bytes are not valid >> metadata. right, so to get to the skb_metadata, it will be data_meta -= sizeof(skb_metadata); /* probably need alignment */ >> >> For AF_XDP, we'd need to transfer the flag as well, and it could apply >> the same logic (getting the size from the vmlinux BTF). >> >> By this logic, the BTF_ID should be the *first* entry of struct >> skb_metadata, since that will be the field AF_XDP programs can find >> right off the bat, no? > > The problem with AF_XDP is that, IIUC, it doesn't have a data_meta > pointer in the userspace. Yep. It is my understanding also. Missing data_meta pointer in the AF_XDP rx_desc is a potential problem. Having BTF_ID or not won't help. > > You get an rx descriptor where the address points to the 'data': > | 256 bytes headroom where metadata can go | data | > > So you have (at most) 256 bytes of headroom, some of that might be the > metadata, but you really don't know where it starts. But you know it > definitely ends where the data begins. > > So if we have the following, we can locate skb_metadata: > | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | > data - sizeof(skb_metadata) will get you there > > But if it's the other way around, the program has to know > sizeof(custom metadata) to locate skb_metadata: > | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | Right, this won't work if the AF_XDP user does not know how big the custom metadata is. The kernel then needs to swap the "skb_metadata" and "custom metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like this: | custom metadata | skb_metadata | data | However, since data_meta is missing from the rx_desc, may be we can safely assume the AF_XDP user always knows the size of the custom metadata or there is usually no "custom metadata" and no swap is needed?
On 11/10/22 4:10 PM, Toke Høiland-Jørgensen wrote: >> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta >> pointer in the userspace. >> >> You get an rx descriptor where the address points to the 'data': >> | 256 bytes headroom where metadata can go | data | > > Ah, I was missing the bit where the data pointer actually points at > data, not the start of the buf. Oops, my bad! > >> So you have (at most) 256 bytes of headroom, some of that might be the >> metadata, but you really don't know where it starts. But you know it >> definitely ends where the data begins. >> >> So if we have the following, we can locate skb_metadata: >> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | >> data - sizeof(skb_metadata) will get you there >> >> But if it's the other way around, the program has to know >> sizeof(custom metadata) to locate skb_metadata: >> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | >> >> Am I missing something here? > > Hmm, so one could argue that the only way AF_XDP can consume custom > metadata today is if it knows out of band what the size of it is. And if > it knows that, it can just skip over it to go back to the skb_metadata, > no? +1 I replied with a similar point in another email. I also think we can safely assume this. > > The only problem left then is if there were multiple XDP programs called > in sequence (whether before a redirect, or by libxdp chaining or tail > calls), and the first one resized the metadata area without the last one > knowing about it. For this, we could add a CLOBBER_PROGRAM_META flag to > the skb_metadata helper which if set will ensure that the program > metadata length is reset to 0? How is it different from the same xdp prog calling bpf_xdp_adjust_meta() and bpf_xdp_metadata_export_to_skb() multiple times. The earlier stored skb_metadata needs to be moved during the latter bpf_xdp_adjust_meta(). The latter bpf_xdp_metadata_export_to_skb() will overwrite the earlier skb_metadata.
On Thu, Nov 10, 2022 at 4:33 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 11/10/22 3:52 PM, Stanislav Fomichev wrote: > > On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: > >> > >> Skipping to the last bit: > >> > >>>>>>> } else { > >>>>>>> use kfuncs > >>>>>>> } > >>>>>>> > >>>>>>> 5. Support the case where we keep program's metadata and kernel's > >>>>>>> xdp_to_skb_metadata > >>>>>>> - skb_metadata_import_from_xdp() will "consume" it by mem-moving the > >>>>>>> rest of the metadata over it and adjusting the headroom > >>>>>> > >>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's > >>>>>> metadata. xdp prog should usually work in this order also: read/write headers, > >>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return > >>>>>> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the > >>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. > >>>>>> > >>>>>> For the kernel and xdp prog, I don't think it matters where the > >>>>>> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to > >>>>>> be before xdp->data because of the current data_meta and data comparison usage > >>>>>> in the xdp prog. > >>>>>> > >>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata > >>>>>> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP > >>>>>> supports the program's metadata now. afaict, it can only work now if there is > >>>>>> some sort of contract between them or the AF_XDP currently does not use the > >>>>>> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it > >>>>>> should be a no op if there is no program's metadata? This behavior could also > >>>>>> be configurable through setsockopt? > >>>>> > >>>>> Agreed on all of the above. For now it seems like the safest thing to > >>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly > >>>>> locate btf_id. > >>>>> Let's see if Toke disagrees :-) > >>>> > >>>> As I replied to Martin, I'm not sure it's worth the complexity to > >>>> logically split the SKB metadata from the program's own metadata (as > >>>> opposed to just reusing the existing data_meta pointer)? > >>> > >>> I'd gladly keep my current requirement where it's either or, but not both :-) > >>> We can relax it later if required? > >> > >> So the way I've been thinking about it is simply that the skb_metadata > >> would live in the same place at the data_meta pointer (including > >> adjusting that pointer to accommodate it), and just overriding the > >> existing program metadata, if any exists. But looking at it now, I guess > >> having the split makes it easier for a program to write its own custom > >> metadata and still use the skb metadata. See below about the ordering. > >> > >>>> However, if we do, the layout that makes most sense to me is putting the > >>>> skb metadata before the program metadata, like: > >>>> > >>>> -------------- > >>>> | skb_metadata > >>>> -------------- > >>>> | data_meta > >>>> -------------- > >>>> | data > >>>> -------------- > >>>> > > Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this: > > | skb_metadata | custom metadata | data | > > >>>> Not sure if that's what you meant? :) > >>> > >>> I was suggesting the other way around: |custom meta|skb_metadata|data| > >>> (but, as Martin points out, consuming skb_metadata in the kernel > >>> becomes messier) > >>> > >>> af_xdp can check whether skb_metdata is present by looking at data - > >>> offsetof(struct skb_metadata, btf_id). > >>> progs that know how to handle custom metadata, will look at data - > >>> sizeof(skb_metadata) > >>> > >>> Otherwise, if it's the other way around, how do we find skb_metadata > >>> in a redirected frame? > >>> Let's say we have |skb_metadata|custom meta|data|, how does the final > >>> program find skb_metadata? > >>> All the progs have to agree on the sizeof(tc/custom meta), right? > >> > >> Erm, maybe I'm missing something here, but skb_metadata is fixed size, > >> right? So if the "skb_metadata is present" flag is set, we know that the > >> sizeof(skb_metadata) bytes before the data_meta pointer contains the > >> metadata, and if the flag is not set, we know those bytes are not valid > >> metadata. > > right, so to get to the skb_metadata, it will be > data_meta -= sizeof(skb_metadata); /* probably need alignment */ > > >> > >> For AF_XDP, we'd need to transfer the flag as well, and it could apply > >> the same logic (getting the size from the vmlinux BTF). > >> > >> By this logic, the BTF_ID should be the *first* entry of struct > >> skb_metadata, since that will be the field AF_XDP programs can find > >> right off the bat, no? > > > The problem with AF_XDP is that, IIUC, it doesn't have a data_meta > > pointer in the userspace. > > Yep. It is my understanding also. Missing data_meta pointer in the AF_XDP > rx_desc is a potential problem. Having BTF_ID or not won't help. > > > > > You get an rx descriptor where the address points to the 'data': > > | 256 bytes headroom where metadata can go | data | > > > > So you have (at most) 256 bytes of headroom, some of that might be the > > metadata, but you really don't know where it starts. But you know it > > definitely ends where the data begins. > > > > So if we have the following, we can locate skb_metadata: > > | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | > > data - sizeof(skb_metadata) will get you there > > > > But if it's the other way around, the program has to know > > sizeof(custom metadata) to locate skb_metadata: > > | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | > > Right, this won't work if the AF_XDP user does not know how big the custom > metadata is. The kernel then needs to swap the "skb_metadata" and "custom > metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like > this: > | custom metadata | skb_metadata | data | > > However, since data_meta is missing from the rx_desc, may be we can safely > assume the AF_XDP user always knows the size of the custom metadata or there is > usually no "custom metadata" and no swap is needed? If we can assume they can share that info, can they also share more info on what kind of metadata they would prefer to get? If they can agree on the size, maybe they also can agree on the flows that need skb_metdata vs the flows that need a custom one? Seems like we can start with supporting either one, but not both and extend in the future once we have more understanding on whether it's actually needed or not? bpf_xdp_metadata_export_to_skb: adjust data meta, add uses-skb-metadata flag bpf_xdp_adjust_meta: unconditionally reset uses-skb-metadata flag
On 11/10/22 4:57 PM, Stanislav Fomichev wrote: > On Thu, Nov 10, 2022 at 4:33 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 11/10/22 3:52 PM, Stanislav Fomichev wrote: >>> On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: >>>> >>>> Skipping to the last bit: >>>> >>>>>>>>> } else { >>>>>>>>> use kfuncs >>>>>>>>> } >>>>>>>>> >>>>>>>>> 5. Support the case where we keep program's metadata and kernel's >>>>>>>>> xdp_to_skb_metadata >>>>>>>>> - skb_metadata_import_from_xdp() will "consume" it by mem-moving the >>>>>>>>> rest of the metadata over it and adjusting the headroom >>>>>>>> >>>>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's >>>>>>>> metadata. xdp prog should usually work in this order also: read/write headers, >>>>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return >>>>>>>> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the >>>>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. >>>>>>>> >>>>>>>> For the kernel and xdp prog, I don't think it matters where the >>>>>>>> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to >>>>>>>> be before xdp->data because of the current data_meta and data comparison usage >>>>>>>> in the xdp prog. >>>>>>>> >>>>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata >>>>>>>> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP >>>>>>>> supports the program's metadata now. afaict, it can only work now if there is >>>>>>>> some sort of contract between them or the AF_XDP currently does not use the >>>>>>>> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it >>>>>>>> should be a no op if there is no program's metadata? This behavior could also >>>>>>>> be configurable through setsockopt? >>>>>>> >>>>>>> Agreed on all of the above. For now it seems like the safest thing to >>>>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly >>>>>>> locate btf_id. >>>>>>> Let's see if Toke disagrees :-) >>>>>> >>>>>> As I replied to Martin, I'm not sure it's worth the complexity to >>>>>> logically split the SKB metadata from the program's own metadata (as >>>>>> opposed to just reusing the existing data_meta pointer)? >>>>> >>>>> I'd gladly keep my current requirement where it's either or, but not both :-) >>>>> We can relax it later if required? >>>> >>>> So the way I've been thinking about it is simply that the skb_metadata >>>> would live in the same place at the data_meta pointer (including >>>> adjusting that pointer to accommodate it), and just overriding the >>>> existing program metadata, if any exists. But looking at it now, I guess >>>> having the split makes it easier for a program to write its own custom >>>> metadata and still use the skb metadata. See below about the ordering. >>>> >>>>>> However, if we do, the layout that makes most sense to me is putting the >>>>>> skb metadata before the program metadata, like: >>>>>> >>>>>> -------------- >>>>>> | skb_metadata >>>>>> -------------- >>>>>> | data_meta >>>>>> -------------- >>>>>> | data >>>>>> -------------- >>>>>> >> >> Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this: >> >> | skb_metadata | custom metadata | data | >> >>>>>> Not sure if that's what you meant? :) >>>>> >>>>> I was suggesting the other way around: |custom meta|skb_metadata|data| >>>>> (but, as Martin points out, consuming skb_metadata in the kernel >>>>> becomes messier) >>>>> >>>>> af_xdp can check whether skb_metdata is present by looking at data - >>>>> offsetof(struct skb_metadata, btf_id). >>>>> progs that know how to handle custom metadata, will look at data - >>>>> sizeof(skb_metadata) >>>>> >>>>> Otherwise, if it's the other way around, how do we find skb_metadata >>>>> in a redirected frame? >>>>> Let's say we have |skb_metadata|custom meta|data|, how does the final >>>>> program find skb_metadata? >>>>> All the progs have to agree on the sizeof(tc/custom meta), right? >>>> >>>> Erm, maybe I'm missing something here, but skb_metadata is fixed size, >>>> right? So if the "skb_metadata is present" flag is set, we know that the >>>> sizeof(skb_metadata) bytes before the data_meta pointer contains the >>>> metadata, and if the flag is not set, we know those bytes are not valid >>>> metadata. >> >> right, so to get to the skb_metadata, it will be >> data_meta -= sizeof(skb_metadata); /* probably need alignment */ >> >>>> >>>> For AF_XDP, we'd need to transfer the flag as well, and it could apply >>>> the same logic (getting the size from the vmlinux BTF). >>>> >>>> By this logic, the BTF_ID should be the *first* entry of struct >>>> skb_metadata, since that will be the field AF_XDP programs can find >>>> right off the bat, no? > >>> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta >>> pointer in the userspace. >> >> Yep. It is my understanding also. Missing data_meta pointer in the AF_XDP >> rx_desc is a potential problem. Having BTF_ID or not won't help. >> >>> >>> You get an rx descriptor where the address points to the 'data': >>> | 256 bytes headroom where metadata can go | data | >>> >>> So you have (at most) 256 bytes of headroom, some of that might be the >>> metadata, but you really don't know where it starts. But you know it >>> definitely ends where the data begins. >>> >>> So if we have the following, we can locate skb_metadata: >>> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | >>> data - sizeof(skb_metadata) will get you there >>> >>> But if it's the other way around, the program has to know >>> sizeof(custom metadata) to locate skb_metadata: >>> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | >> >> Right, this won't work if the AF_XDP user does not know how big the custom >> metadata is. The kernel then needs to swap the "skb_metadata" and "custom >> metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like >> this: >> | custom metadata | skb_metadata | data | >> >> However, since data_meta is missing from the rx_desc, may be we can safely >> assume the AF_XDP user always knows the size of the custom metadata or there is >> usually no "custom metadata" and no swap is needed? > > If we can assume they can share that info, can they also share more > info on what kind of metadata they would prefer to get? > If they can agree on the size, maybe they also can agree on the flows > that need skb_metdata vs the flows that need a custom one? > > Seems like we can start with supporting either one, but not both and > extend in the future once we have more understanding on whether it's > actually needed or not? > > bpf_xdp_metadata_export_to_skb: adjust data meta, add uses-skb-metadata flag > bpf_xdp_adjust_meta: unconditionally reset uses-skb-metadata flag hmm... I am thinking: bpf_xdp_adjust_meta: move the existing (if any) skb_metadata and adjust xdp->data_meta. bpf_xdp_metadata_export_to_skb: If skb_metadata exists, overwrites the existing one. If not exists, gets headroom before xdp->data_meta and writes hints.
On 11/10/22 3:29 PM, Toke Høiland-Jørgensen wrote: >>> For the metadata consumed by the stack right now it's a bit >>> hypothetical, yeah. However, there's a bunch of metadata commonly >>> supported by hardware that the stack currently doesn't consume and that >>> hopefully this feature will end up making more accessible. My hope is >>> that the stack can also learn how to use this in the future, in which >>> case we may run out of space. So I think of that bit mostly as >>> future-proofing... >> >> ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later >> if it is indeed needed? The 'struct xdp_to_skb_metadata' is not in UAPI and >> doing it with CO-RE is to give us flexibility to make this kind of changes in >> the future. > > My worry is mostly that it'll be more painful to add it later than just > including it from the start, mostly because of AF_XDP users. But if we > do the randomisation thing (thus forcing AF_XDP users to deal with the > dynamic layout as well), it should be possible to add it later, and I > can live with that option as well... imo, considering we are trying to optimize unnecessary field initialization as below, it is sort of wasteful to always initialize the btf_id with the same value. It is better to add it in the future when there is a need. >>>>> We should probably also have a flag set on the xdp_frame so the stack >>>>> knows that the metadata area contains relevant-to-skb data, to guard >>>>> against an XDP program accidentally hitting the "magic number" (BTF_ID) >>>>> in unrelated stuff it puts into the metadata area. >>>> >>>> Yeah, I think having a flag is useful. The flag will be set at xdp_buff and >>>> then transfer to the xdp_frame? >>> >>> Yeah, exactly! >>> >>>>>> After re-reading patch 6, have another question. The 'void >>>>>> bpf_xdp_metadata_export_to_skb();' function signature. Should it at >>>>>> least return ok/err? or even return a 'struct xdp_to_skb_metadata *' >>>>>> pointer and the xdp prog can directly read (or even write) it? >>>>> >>>>> Hmm, I'm not sure returning a failure makes sense? Failure to read one >>>>> or more fields just means that those fields will not be populated? We >>>>> should probably have a flags field inside the metadata struct itself to >>>>> indicate which fields are set or not, but I'm not sure returning an >>>>> error value adds anything? Returning a pointer to the metadata field >>>>> might be convenient for users (it would just be an alias to the >>>>> data_meta pointer, but the verifier could know its size, so the program >>>>> doesn't have to bounds check it). >>>> >>>> If some hints are not available, those hints should be initialized to >>>> 0/CHECKSUM_NONE/...etc. >>> >>> The problem with that is that then we have to spend cycles writing >>> eight bytes of zeroes into the checksum field :) >> >> With a common 'struct xdp_to_skb_metadata', I am not sure how some of these zero >> writes can be avoided. If the xdp prog wants to optimize, it can call >> individual kfunc to get individual hints. > > Erm, we just... don't write those fields? Something like: > > void write_skb_meta(hw, ctx) { > struct xdp_skb_metadata meta = ctx->data_meta - sizeof(struct xdp_skb_metadata); > meta->valid_fields = 0; > > if (hw_has_timestamp) { > meta->timestamp = hw->timestamp; > meta->valid_fields |= FIELD_TIMESTAMP; > } /* otherwise meta->timestamp is just uninitialised */ > > if (hw_has_rxhash) { > meta->rxhash = hw->rxhash; > meta->valid_fields |= FIELD_RXHASH; > } /* otherwise meta->rxhash is just uninitialised */ > ...etc... > } Ah, got it. Make sense. My mind was stalled in the paradigm that a helper that needs to initialize the result.
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 11/10/22 4:10 PM, Toke Høiland-Jørgensen wrote: >>> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta >>> pointer in the userspace. >>> >>> You get an rx descriptor where the address points to the 'data': >>> | 256 bytes headroom where metadata can go | data | >> >> Ah, I was missing the bit where the data pointer actually points at >> data, not the start of the buf. Oops, my bad! >> >>> So you have (at most) 256 bytes of headroom, some of that might be the >>> metadata, but you really don't know where it starts. But you know it >>> definitely ends where the data begins. >>> >>> So if we have the following, we can locate skb_metadata: >>> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | >>> data - sizeof(skb_metadata) will get you there >>> >>> But if it's the other way around, the program has to know >>> sizeof(custom metadata) to locate skb_metadata: >>> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | >>> >>> Am I missing something here? >> >> Hmm, so one could argue that the only way AF_XDP can consume custom >> metadata today is if it knows out of band what the size of it is. And if >> it knows that, it can just skip over it to go back to the skb_metadata, >> no? > > +1 I replied with a similar point in another email. I also think we > can safely assume this. Great! >> >> The only problem left then is if there were multiple XDP programs called >> in sequence (whether before a redirect, or by libxdp chaining or tail >> calls), and the first one resized the metadata area without the last one >> knowing about it. For this, we could add a CLOBBER_PROGRAM_META flag to >> the skb_metadata helper which if set will ensure that the program >> metadata length is reset to 0? > > How is it different from the same xdp prog calling bpf_xdp_adjust_meta() and > bpf_xdp_metadata_export_to_skb() multiple times. The earlier stored > skb_metadata needs to be moved during the latter bpf_xdp_adjust_meta(). The > latter bpf_xdp_metadata_export_to_skb() will overwrite the earlier skb_metadata. Well, it would just be a convenience flag, so instead of doing: metalen = ctx->data - ctx->data_meta; if (metalen) xdp_adjust_meta(-metalen); bpf_xdp_metadata_export_to_skb(ctx); you could just do: bpf_xdp_metadata_export_to_skb(ctx, CLOBBER_PROGRAM_META); and the kernel would do the check+move for you. But, well, the couple of extra instructions to do the check in BPF is probably fine. (I'm talking here about a program that wants to make sure that any custom metadata that may have been added by an earlier program is removed before redirecting to an XSK socket; I expect we'd want to do something like this in the default program in libxdp). -Toke
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 11/10/22 4:57 PM, Stanislav Fomichev wrote: >> On Thu, Nov 10, 2022 at 4:33 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: >>> >>> On 11/10/22 3:52 PM, Stanislav Fomichev wrote: >>>> On Thu, Nov 10, 2022 at 3:14 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote: >>>>> >>>>> Skipping to the last bit: >>>>> >>>>>>>>>> } else { >>>>>>>>>> use kfuncs >>>>>>>>>> } >>>>>>>>>> >>>>>>>>>> 5. Support the case where we keep program's metadata and kernel's >>>>>>>>>> xdp_to_skb_metadata >>>>>>>>>> - skb_metadata_import_from_xdp() will "consume" it by mem-moving the >>>>>>>>>> rest of the metadata over it and adjusting the headroom >>>>>>>>> >>>>>>>>> I was thinking the kernel's xdp_to_skb_metadata is always before the program's >>>>>>>>> metadata. xdp prog should usually work in this order also: read/write headers, >>>>>>>>> write its own metadata, call bpf_xdp_metadata_export_to_skb(), and return >>>>>>>>> XDP_PASS/XDP_REDIRECT. When it is XDP_PASS, the kernel just needs to pop the >>>>>>>>> xdp_to_skb_metadata and pass the remaining program's metadata to the bpf-tc. >>>>>>>>> >>>>>>>>> For the kernel and xdp prog, I don't think it matters where the >>>>>>>>> xdp_to_skb_metadata is. However, the xdp->data_meta (program's metadata) has to >>>>>>>>> be before xdp->data because of the current data_meta and data comparison usage >>>>>>>>> in the xdp prog. >>>>>>>>> >>>>>>>>> The order of the kernel's xdp_to_skb_metadata and the program's metadata >>>>>>>>> probably only matters to the userspace AF_XDP. However, I don't see how AF_XDP >>>>>>>>> supports the program's metadata now. afaict, it can only work now if there is >>>>>>>>> some sort of contract between them or the AF_XDP currently does not use the >>>>>>>>> program's metadata. Either way, we can do the mem-moving only for AF_XDP and it >>>>>>>>> should be a no op if there is no program's metadata? This behavior could also >>>>>>>>> be configurable through setsockopt? >>>>>>>> >>>>>>>> Agreed on all of the above. For now it seems like the safest thing to >>>>>>>> do is to put xdp_to_skb_metadata last to allow af_xdp to properly >>>>>>>> locate btf_id. >>>>>>>> Let's see if Toke disagrees :-) >>>>>>> >>>>>>> As I replied to Martin, I'm not sure it's worth the complexity to >>>>>>> logically split the SKB metadata from the program's own metadata (as >>>>>>> opposed to just reusing the existing data_meta pointer)? >>>>>> >>>>>> I'd gladly keep my current requirement where it's either or, but not both :-) >>>>>> We can relax it later if required? >>>>> >>>>> So the way I've been thinking about it is simply that the skb_metadata >>>>> would live in the same place at the data_meta pointer (including >>>>> adjusting that pointer to accommodate it), and just overriding the >>>>> existing program metadata, if any exists. But looking at it now, I guess >>>>> having the split makes it easier for a program to write its own custom >>>>> metadata and still use the skb metadata. See below about the ordering. >>>>> >>>>>>> However, if we do, the layout that makes most sense to me is putting the >>>>>>> skb metadata before the program metadata, like: >>>>>>> >>>>>>> -------------- >>>>>>> | skb_metadata >>>>>>> -------------- >>>>>>> | data_meta >>>>>>> -------------- >>>>>>> | data >>>>>>> -------------- >>>>>>> >>> >>> Yeah, for the kernel and xdp prog (ie not AF_XDP), I meant this: >>> >>> | skb_metadata | custom metadata | data | >>> >>>>>>> Not sure if that's what you meant? :) >>>>>> >>>>>> I was suggesting the other way around: |custom meta|skb_metadata|data| >>>>>> (but, as Martin points out, consuming skb_metadata in the kernel >>>>>> becomes messier) >>>>>> >>>>>> af_xdp can check whether skb_metdata is present by looking at data - >>>>>> offsetof(struct skb_metadata, btf_id). >>>>>> progs that know how to handle custom metadata, will look at data - >>>>>> sizeof(skb_metadata) >>>>>> >>>>>> Otherwise, if it's the other way around, how do we find skb_metadata >>>>>> in a redirected frame? >>>>>> Let's say we have |skb_metadata|custom meta|data|, how does the final >>>>>> program find skb_metadata? >>>>>> All the progs have to agree on the sizeof(tc/custom meta), right? >>>>> >>>>> Erm, maybe I'm missing something here, but skb_metadata is fixed size, >>>>> right? So if the "skb_metadata is present" flag is set, we know that the >>>>> sizeof(skb_metadata) bytes before the data_meta pointer contains the >>>>> metadata, and if the flag is not set, we know those bytes are not valid >>>>> metadata. >>> >>> right, so to get to the skb_metadata, it will be >>> data_meta -= sizeof(skb_metadata); /* probably need alignment */ >>> >>>>> >>>>> For AF_XDP, we'd need to transfer the flag as well, and it could apply >>>>> the same logic (getting the size from the vmlinux BTF). >>>>> >>>>> By this logic, the BTF_ID should be the *first* entry of struct >>>>> skb_metadata, since that will be the field AF_XDP programs can find >>>>> right off the bat, no? > >>>> The problem with AF_XDP is that, IIUC, it doesn't have a data_meta >>>> pointer in the userspace. >>> >>> Yep. It is my understanding also. Missing data_meta pointer in the AF_XDP >>> rx_desc is a potential problem. Having BTF_ID or not won't help. >>> >>>> >>>> You get an rx descriptor where the address points to the 'data': >>>> | 256 bytes headroom where metadata can go | data | >>>> >>>> So you have (at most) 256 bytes of headroom, some of that might be the >>>> metadata, but you really don't know where it starts. But you know it >>>> definitely ends where the data begins. >>>> >>>> So if we have the following, we can locate skb_metadata: >>>> | 256-sizeof(skb_metadata) headroom | custom metadata | skb_metadata | data | >>>> data - sizeof(skb_metadata) will get you there >>>> >>>> But if it's the other way around, the program has to know >>>> sizeof(custom metadata) to locate skb_metadata: >>>> | 256-sizeof(skb_metadata) headroom | skb_metadata | custom metadata | data | >>> >>> Right, this won't work if the AF_XDP user does not know how big the custom >>> metadata is. The kernel then needs to swap the "skb_metadata" and "custom >>> metadata" + setting a flag in the AF_XDP rx_desc->options to make it looks like >>> this: >>> | custom metadata | skb_metadata | data | >>> >>> However, since data_meta is missing from the rx_desc, may be we can safely >>> assume the AF_XDP user always knows the size of the custom metadata or there is >>> usually no "custom metadata" and no swap is needed? >> >> If we can assume they can share that info, can they also share more >> info on what kind of metadata they would prefer to get? >> If they can agree on the size, maybe they also can agree on the flows >> that need skb_metdata vs the flows that need a custom one? >> >> Seems like we can start with supporting either one, but not both and >> extend in the future once we have more understanding on whether it's >> actually needed or not? >> >> bpf_xdp_metadata_export_to_skb: adjust data meta, add uses-skb-metadata flag >> bpf_xdp_adjust_meta: unconditionally reset uses-skb-metadata flag > hmm... I am thinking: > > bpf_xdp_adjust_meta: move the existing (if any) skb_metadata and adjust > xdp->data_meta. > > bpf_xdp_metadata_export_to_skb: If skb_metadata exists, overwrites the existing > one. If not exists, gets headroom before xdp->data_meta and writes hints. Yeah, +1 on this. For AF_XDP that means that if you only do bpf_xdp_metadata_export_to_skb() you'll get the skb metadata right before data, and if you add custom metadata yourself, that goes in-between so you'll need to have some way to communicate the length to the AF_XDP consumer. And a default program (like in libxdp) can just check the metadata pointer and make sure to remove any metadata before redirecting to userspace, like in the example from my other reply to Martin. -Toke
Martin KaFai Lau <martin.lau@linux.dev> writes: > On 11/10/22 3:29 PM, Toke Høiland-Jørgensen wrote: >>>> For the metadata consumed by the stack right now it's a bit >>>> hypothetical, yeah. However, there's a bunch of metadata commonly >>>> supported by hardware that the stack currently doesn't consume and that >>>> hopefully this feature will end up making more accessible. My hope is >>>> that the stack can also learn how to use this in the future, in which >>>> case we may run out of space. So I think of that bit mostly as >>>> future-proofing... >>> >>> ic. in this case, Can the btf_id be added to 'struct xdp_to_skb_metadata' later >>> if it is indeed needed? The 'struct xdp_to_skb_metadata' is not in UAPI and >>> doing it with CO-RE is to give us flexibility to make this kind of changes in >>> the future. >> >> My worry is mostly that it'll be more painful to add it later than just >> including it from the start, mostly because of AF_XDP users. But if we >> do the randomisation thing (thus forcing AF_XDP users to deal with the >> dynamic layout as well), it should be possible to add it later, and I >> can live with that option as well... > > imo, considering we are trying to optimize unnecessary field > initialization as below, it is sort of wasteful to always initialize > the btf_id with the same value. It is better to add it in the future > when there is a need. Okay, let's omit the BTF ID for now, and see what that looks like. I'll try to keep in mind to see if I can find any reasons why we'd need to add it back and make sure to complain before this lands if I find any :) -Toke
diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 0e629ceb087b..d4cd0938360b 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1673,7 +1673,9 @@ static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) static void veth_unroll_kfunc(const struct bpf_prog *prog, u32 func_id, struct bpf_patch *patch) { - if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) { + if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) { + return xdp_metadata_export_to_skb(prog, patch); + } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED)) { /* return true; */ bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 1)); } else if (func_id == xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) { diff --git a/include/linux/bpf_patch.h b/include/linux/bpf_patch.h index 81ff738eef8d..359c165ad68b 100644 --- a/include/linux/bpf_patch.h +++ b/include/linux/bpf_patch.h @@ -16,6 +16,8 @@ size_t bpf_patch_len(const struct bpf_patch *patch); int bpf_patch_err(const struct bpf_patch *patch); void __bpf_patch_append(struct bpf_patch *patch, struct bpf_insn insn); struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch); +void bpf_patch_resolve_jmp(struct bpf_patch *patch); +u32 bpf_patch_magles_registers(const struct bpf_patch *patch); #define bpf_patch_append(patch, ...) ({ \ struct bpf_insn insn[] = { __VA_ARGS__ }; \ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59c9fd55699d..dba857f212d7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4217,9 +4217,13 @@ static inline bool skb_metadata_differs(const struct sk_buff *skb_a, true : __skb_metadata_differs(skb_a, skb_b, len_a); } +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len); + static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { skb_shinfo(skb)->meta_len = meta_len; + if (meta_len) + skb_metadata_import_from_xdp(skb, meta_len); } static inline void skb_metadata_clear(struct sk_buff *skb) diff --git a/include/net/xdp.h b/include/net/xdp.h index 2a82a98f2f9f..8c97c6996172 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -411,6 +411,8 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE #define XDP_METADATA_KFUNC_xxx \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_EXPORT_TO_SKB, \ + bpf_xdp_metadata_export_to_skb) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP_SUPPORTED, \ bpf_xdp_metadata_rx_timestamp_supported) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ @@ -423,14 +425,25 @@ XDP_METADATA_KFUNC_xxx MAX_XDP_METADATA_KFUNC, }; +struct xdp_to_skb_metadata { + u32 magic; /* xdp_metadata_magic */ + u64 rx_timestamp; +} __randomize_layout; + +struct bpf_patch; + #ifdef CONFIG_DEBUG_INFO_BTF +extern u32 xdp_metadata_magic; extern struct btf_id_set8 xdp_metadata_kfunc_ids; static inline u32 xdp_metadata_kfunc_id(int id) { return xdp_metadata_kfunc_ids.pairs[id].id; } +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch); #else +#define xdp_metadata_magic 0 static inline u32 xdp_metadata_kfunc_id(int id) { return 0; } +static void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) { return 0; } #endif #endif /* __LINUX_NET_XDP_H__ */ diff --git a/kernel/bpf/bpf_patch.c b/kernel/bpf/bpf_patch.c index 82a10bf5624a..8f1fef74299c 100644 --- a/kernel/bpf/bpf_patch.c +++ b/kernel/bpf/bpf_patch.c @@ -49,3 +49,33 @@ struct bpf_insn *bpf_patch_data(const struct bpf_patch *patch) { return patch->insn; } + +void bpf_patch_resolve_jmp(struct bpf_patch *patch) +{ + int i; + + for (i = 0; i < patch->len; i++) { + if (BPF_CLASS(patch->insn[i].code) != BPF_JMP) + continue; + + if (BPF_SRC(patch->insn[i].code) != BPF_X) + continue; + + if (patch->insn[i].off != S16_MAX) + continue; + + patch->insn[i].off = patch->len - i - 1; + } +} + +u32 bpf_patch_magles_registers(const struct bpf_patch *patch) +{ + u32 mask = 0; + int i; + + for (i = 0; i < patch->len; i++) { + mask |= 1 << patch->insn[i].dst_reg; + } + + return mask; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e5c5ff35d5f..49f55f81e7f3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13902,6 +13902,24 @@ static int unroll_kfunc_call(struct bpf_verifier_env *env, */ bpf_patch_append(patch, BPF_MOV64_IMM(BPF_REG_0, 0)); } + + if (func_id != xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_EXPORT_TO_SKB)) { + u32 allowed = 0; + u32 mangled = bpf_patch_magles_registers(patch); + + /* See xdp_metadata_export_to_skb comment for the + * reason about these constraints. + */ + + allowed |= 1 << BPF_REG_0; + allowed |= 1 << BPF_REG_2; + allowed |= 1 << BPF_REG_3; + allowed |= 1 << BPF_REG_4; + allowed |= 1 << BPF_REG_5; + + WARN_ON_ONCE(mangled & ~allowed); + } + return bpf_patch_err(patch); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 42a35b59fb1e..37e3aef46525 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -72,6 +72,7 @@ #include <net/mptcp.h> #include <net/mctp.h> #include <net/page_pool.h> +#include <net/xdp.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -6672,3 +6673,27 @@ nodefer: __kfree_skb(skb); if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) smp_call_function_single_async(cpu, &sd->defer_csd); } + +void skb_metadata_import_from_xdp(struct sk_buff *skb, size_t len) +{ + struct xdp_to_skb_metadata *meta = (void *)(skb_mac_header(skb) - len); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + if (len != sizeof(struct xdp_to_skb_metadata)) + return; + + if (meta->magic != xdp_metadata_magic) + return; + + if (meta->rx_timestamp) { + *skb_hwtstamps(skb) = (struct skb_shared_hwtstamps){ + .hwtstamp = ns_to_ktime(meta->rx_timestamp), + }; + } +} +EXPORT_SYMBOL(skb_metadata_import_from_xdp); diff --git a/net/core/xdp.c b/net/core/xdp.c index 22f1e44700eb..8204fa05c5e9 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -653,12 +653,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); - /* Optional SKB info, currently missing: - * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) - * - RX ring dev queue index (skb_record_rx_queue) - */ - /* Until page_pool get SKB return path, release DMA here */ xdp_release_frame(xdpf); @@ -712,6 +706,13 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) return nxdpf; } +/* For the packets directed to the kernel, this kfunc exports XDP metadata + * into skb context. + */ +noinline void bpf_xdp_metadata_export_to_skb(const struct xdp_md *ctx) +{ +} + /* Indicates whether particular device supports rx_timestamp metadata. * This is an optional helper to support marking some branches as * "dead code" in the BPF programs. @@ -737,13 +738,104 @@ XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_SET8_END(xdp_metadata_kfunc_ids) +/* Make sure userspace doesn't depend on our layout by using + * different pseudo-generated magic value. + */ +u32 xdp_metadata_magic; + static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = { .owner = THIS_MODULE, .set = &xdp_metadata_kfunc_ids, }; +/* Since we're not actually doing a call but instead rewriting + * in place, we can only afford to use R0-R5 scratch registers. + * + * We reserve R1 for bpf_xdp_metadata_export_to_skb and let individual + * metadata kfuncs use only R0,R4-R5. + * + * The above also means we _cannot_ easily call any other helper/kfunc + * because there is no place for us to preserve our R1 argument; + * existing R6-R9 belong to the callee. + */ +void xdp_metadata_export_to_skb(const struct bpf_prog *prog, struct bpf_patch *patch) +{ + u32 func_id; + + /* + * The code below generates the following: + * + * void bpf_xdp_metadata_export_to_skb(struct xdp_md *ctx) + * { + * struct xdp_to_skb_metadata *meta; + * int ret; + * + * ret = bpf_xdp_adjust_meta(ctx, -sizeof(*meta)); + * if (!ret) + * return; + * + * meta = ctx->data_meta; + * meta->magic = xdp_metadata_magic; + * meta->rx_timestamp = bpf_xdp_metadata_rx_timestamp(ctx); + * } + * + */ + + bpf_patch_append(patch, + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_buff, data_meta)), + /* r3 = ((struct xdp_buff *)r1)->data; */ + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_buff, data)), + /* if (data_meta != data) return; + * + * data_meta > data: xdp_data_meta_unsupported() + * data_meta < data: already used, no need to touch + */ + BPF_JMP_REG(BPF_JNE, BPF_REG_2, BPF_REG_3, S16_MAX), + + /* r2 -= sizeof(struct xdp_to_skb_metadata); */ + BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, + sizeof(struct xdp_to_skb_metadata)), + /* r3 = ((struct xdp_buff *)r1)->data_hard_start; */ + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, + offsetof(struct xdp_buff, data_hard_start)), + /* r3 += sizeof(struct xdp_frame) */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, + sizeof(struct xdp_frame)), + /* if (data-sizeof(struct xdp_to_skb_metadata) < data_hard_start+sizeof(struct xdp_frame)) return; */ + BPF_JMP_REG(BPF_JLT, BPF_REG_2, BPF_REG_3, S16_MAX), + + /* ((struct xdp_buff *)r1)->data_meta = r2; */ + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, + offsetof(struct xdp_buff, data_meta)), + + /* *((struct xdp_to_skb_metadata *)r2)->magic = xdp_metadata_magic; */ + BPF_ST_MEM(BPF_W, BPF_REG_2, + offsetof(struct xdp_to_skb_metadata, magic), + xdp_metadata_magic), + ); + + /* r0 = bpf_xdp_metadata_rx_timestamp(ctx); */ + func_id = xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP); + prog->aux->xdp_kfunc_ndo->ndo_unroll_kfunc(prog, func_id, patch); + + bpf_patch_append(patch, + /* r2 = ((struct xdp_buff *)r1)->data_meta; */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, + offsetof(struct xdp_buff, data_meta)), + /* *((struct xdp_to_skb_metadata *)r2)->rx_timestamp = r0; */ + BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, + offsetof(struct xdp_to_skb_metadata, rx_timestamp)), + ); + + bpf_patch_resolve_jmp(patch); +} + static int __init xdp_metadata_init(void) { + xdp_metadata_magic = get_random_u32() | 1; return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set); } late_initcall(xdp_metadata_init);
Implement new bpf_xdp_metadata_export_to_skb kfunc which prepares compatible xdp metadata for kernel consumption. This kfunc should be called prior to bpf_redirect or (unless already called) when XDP_PASS'ing the frame into the kernel. The implementation currently maintains xdp_to_skb_metadata layout by calling bpf_xdp_metadata_rx_timestamp and placing small magic number. From skb_metdata_set, when we get expected magic number, we interpret metadata accordingly. Both magic number and struct layout are randomized to make sure it doesn't leak into the userspace. skb_metadata_set is amended with skb_metadata_import_from_xdp which tries to parse out the metadata and put it into skb. See the comment for r1 vs r2/r3/r4/r5 conventions. Cc: John Fastabend <john.fastabend@gmail.com> Cc: David Ahern <dsahern@gmail.com> Cc: Martin KaFai Lau <martin.lau@linux.dev> Cc: Jakub Kicinski <kuba@kernel.org> Cc: Willem de Bruijn <willemb@google.com> Cc: Jesper Dangaard Brouer <brouer@redhat.com> Cc: Anatoly Burakov <anatoly.burakov@intel.com> Cc: Alexander Lobakin <alexandr.lobakin@intel.com> Cc: Magnus Karlsson <magnus.karlsson@gmail.com> Cc: Maryam Tahhan <mtahhan@redhat.com> Cc: xdp-hints@xdp-project.net Cc: netdev@vger.kernel.org Signed-off-by: Stanislav Fomichev <sdf@google.com> --- drivers/net/veth.c | 4 +- include/linux/bpf_patch.h | 2 + include/linux/skbuff.h | 4 ++ include/net/xdp.h | 13 +++++ kernel/bpf/bpf_patch.c | 30 +++++++++++ kernel/bpf/verifier.c | 18 +++++++ net/core/skbuff.c | 25 +++++++++ net/core/xdp.c | 104 +++++++++++++++++++++++++++++++++++--- 8 files changed, 193 insertions(+), 7 deletions(-)