Message ID | edbca42217a73161903a50ba07ec63c5fa5fde00.1660761470.git.dxu@dxuuu.xyz (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Support direct writes to nf_conn:mark | expand |
On Wed, 17 Aug 2022 at 20:43, Daniel Xu <dxu@dxuuu.xyz> wrote: > > Support direct writes to nf_conn:mark from TC and XDP prog types. This > is useful when applications want to store per-connection metadata. This > is also particularly useful for applications that run both bpf and > iptables/nftables because the latter can trivially access this metadata. > > One example use case would be if a bpf prog is responsible for advanced > packet classification and iptables/nftables is later used for routing > due to pre-existing/legacy code. > > Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> > --- > include/net/netfilter/nf_conntrack_bpf.h | 18 +++++++++ > net/core/filter.c | 34 ++++++++++++++++ > net/netfilter/nf_conntrack_bpf.c | 50 ++++++++++++++++++++++++ > 3 files changed, 102 insertions(+) > > diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h > index a473b56842c5..0f584c2bd475 100644 > --- a/include/net/netfilter/nf_conntrack_bpf.h > +++ b/include/net/netfilter/nf_conntrack_bpf.h > @@ -3,6 +3,7 @@ > #ifndef _NF_CONNTRACK_BPF_H > #define _NF_CONNTRACK_BPF_H > > +#include <linux/bpf.h> > #include <linux/btf.h> > #include <linux/kconfig.h> > > @@ -10,6 +11,12 @@ > (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) > > extern int register_nf_conntrack_bpf(void); > +extern int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > + const struct btf *btf, > + const struct btf_type *t, int off, > + int size, enum bpf_access_type atype, > + u32 *next_btf_id, > + enum bpf_type_flag *flag); > > #else > > @@ -18,6 +25,17 @@ static inline int register_nf_conntrack_bpf(void) > return 0; > } > > +static inline int > +nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > + const struct btf *btf, > + const struct btf_type *t, int off, > + int size, enum bpf_access_type atype, > + u32 *next_btf_id, > + enum bpf_type_flag *flag) > +{ > + return -EACCES; > +} > + We should make it work when nf_conntrack is a kernel module as well, not just when it is compiled in. The rest of the stuff already works when it is a module. For that, you can have a global function pointer for this callback, protected by a mutex. register/unregister sets it/unsets it. Each time you call it requires mutex to be held during the call. Later when we have more modules that supply btf_struct_access callback for their module types we can generalize it, for now it should be ok to hardcode it for nf_conn. > #endif > > #endif /* _NF_CONNTRACK_BPF_H */ > diff --git a/net/core/filter.c b/net/core/filter.c > index 5669248aff25..d7b768fe9de7 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -18,6 +18,7 @@ > */ > > #include <linux/atomic.h> > +#include <linux/bpf_verifier.h> > #include <linux/module.h> > #include <linux/types.h> > #include <linux/mm.h> > @@ -55,6 +56,7 @@ > #include <net/sock_reuseport.h> > #include <net/busy_poll.h> > #include <net/tcp.h> > +#include <net/netfilter/nf_conntrack_bpf.h> > #include <net/xfrm.h> > #include <net/udp.h> > #include <linux/bpf_trace.h> > @@ -8710,6 +8712,21 @@ static bool tc_cls_act_is_valid_access(int off, int size, > return bpf_skb_is_valid_access(off, size, type, prog, info); > } > > +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, > + const struct btf *btf, > + const struct btf_type *t, int off, > + int size, enum bpf_access_type atype, > + u32 *next_btf_id, > + enum bpf_type_flag *flag) > +{ > + if (atype == BPF_READ) > + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, > + flag); > + > + return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype, > + next_btf_id, flag); > +} > + > static bool __is_valid_xdp_access(int off, int size) > { > if (off < 0 || off >= sizeof(struct xdp_md)) > @@ -8769,6 +8786,21 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, > } > EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); > > +static int xdp_btf_struct_access(struct bpf_verifier_log *log, > + const struct btf *btf, > + const struct btf_type *t, int off, > + int size, enum bpf_access_type atype, > + u32 *next_btf_id, > + enum bpf_type_flag *flag) > +{ > + if (atype == BPF_READ) > + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, > + flag); > + > + return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype, > + next_btf_id, flag); > +} > + > static bool sock_addr_is_valid_access(int off, int size, > enum bpf_access_type type, > const struct bpf_prog *prog, > @@ -10663,6 +10695,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { > .convert_ctx_access = tc_cls_act_convert_ctx_access, > .gen_prologue = tc_cls_act_prologue, > .gen_ld_abs = bpf_gen_ld_abs, > + .btf_struct_access = tc_cls_act_btf_struct_access, > }; > > const struct bpf_prog_ops tc_cls_act_prog_ops = { > @@ -10674,6 +10707,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { > .is_valid_access = xdp_is_valid_access, > .convert_ctx_access = xdp_convert_ctx_access, > .gen_prologue = bpf_noop_prologue, > + .btf_struct_access = xdp_btf_struct_access, > }; > > const struct bpf_prog_ops xdp_prog_ops = { > diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c > index 1cd87b28c9b0..8010cc542d17 100644 > --- a/net/netfilter/nf_conntrack_bpf.c > +++ b/net/netfilter/nf_conntrack_bpf.c > @@ -6,6 +6,7 @@ > * are exposed through to BPF programs is explicitly unstable. > */ > > +#include <linux/bpf_verifier.h> > #include <linux/bpf.h> > #include <linux/btf.h> > #include <linux/types.h> > @@ -15,6 +16,8 @@ > #include <net/netfilter/nf_conntrack_bpf.h> > #include <net/netfilter/nf_conntrack_core.h> > > +static const struct btf_type *nf_conn_type; > + > /* bpf_ct_opts - Options for CT lookup helpers > * > * Members: > @@ -184,6 +187,53 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, > return ct; > } > > +/* Check writes into `struct nf_conn` */ > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > + const struct btf *btf, > + const struct btf_type *t, int off, > + int size, enum bpf_access_type atype, > + u32 *next_btf_id, > + enum bpf_type_flag *flag) > +{ > + const struct btf_type *nct = READ_ONCE(nf_conn_type); > + s32 type_id; > + size_t end; > + > + if (!nct) { > + type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT); > + if (type_id < 0) > + return -EINVAL; > + > + nct = btf_type_by_id(btf, type_id); > + WRITE_ONCE(nf_conn_type, nct); Instead of this, why not just use BTF_ID_LIST_SINGLE to get the type_id and then match 't' to the result of btf_type_by_id? btf_type_by_id is not expensive. > + } > + > + if (t != nct) { > + bpf_log(log, "only read is supported\n"); > + return -EACCES; > + } > + > + switch (off) { > +#if defined(CONFIG_NF_CONNTRACK_MARK) > + case offsetof(struct nf_conn, mark): > + end = offsetofend(struct nf_conn, mark); > + break; > +#endif > + default: > + bpf_log(log, "no write support to nf_conn at off %d\n", off); > + return -EACCES; > + } > + > + if (off + size > end) { > + bpf_log(log, > + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", > + off, size, end); > + return -EACCES; > + } > + > + return NOT_INIT; > +} > + > __diag_push(); > __diag_ignore_all("-Wmissing-prototypes", > "Global functions as their definitions will be in nf_conntrack BTF"); > -- > 2.37.1 >
On Wed, Aug 17, 2022 at 11:43 AM Daniel Xu <dxu@dxuuu.xyz> wrote: > > +/* Check writes into `struct nf_conn` */ > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > + const struct btf *btf, > + const struct btf_type *t, int off, > + int size, enum bpf_access_type atype, > + u32 *next_btf_id, > + enum bpf_type_flag *flag) > +{ > + const struct btf_type *nct = READ_ONCE(nf_conn_type); > + s32 type_id; > + size_t end; > + > + if (!nct) { > + type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT); > + if (type_id < 0) > + return -EINVAL; > + > + nct = btf_type_by_id(btf, type_id); > + WRITE_ONCE(nf_conn_type, nct); > + } > + > + if (t != nct) { > + bpf_log(log, "only read is supported\n"); > + return -EACCES; > + } > + > + switch (off) { > +#if defined(CONFIG_NF_CONNTRACK_MARK) > + case offsetof(struct nf_conn, mark): > + end = offsetofend(struct nf_conn, mark); > + break; > +#endif > + default: > + bpf_log(log, "no write support to nf_conn at off %d\n", off); > + return -EACCES; > + } > + > + if (off + size > end) { > + bpf_log(log, > + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", > + off, size, end); > + return -EACCES; > + } > + > + return NOT_INIT; Took me a long time to realize that this is a copy-paste from net/ipv4/bpf_tcp_ca.c. It's not wrong, but misleading. When atype == BPF_READ the return value from btf_struct_access should only be error<0, SCALAR_VALUE, PTR_TO_BTF_ID. For atype == BPF_WRITE we should probably standardize on error<0, or 0. The NOT_INIT happens to be zero, but explicit 0 is cleaner to avoid confusion that this is somehow enum bpf_reg_type. Martin, since you've added this code in bpf_tcp_ca, wdyt?
On Wed, Aug 17, 2022 at 02:30:01PM -0700, Alexei Starovoitov wrote: > On Wed, Aug 17, 2022 at 11:43 AM Daniel Xu <dxu@dxuuu.xyz> wrote: > > > > +/* Check writes into `struct nf_conn` */ > > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > > + const struct btf *btf, > > + const struct btf_type *t, int off, > > + int size, enum bpf_access_type atype, > > + u32 *next_btf_id, > > + enum bpf_type_flag *flag) > > +{ > > + const struct btf_type *nct = READ_ONCE(nf_conn_type); > > + s32 type_id; > > + size_t end; > > + > > + if (!nct) { > > + type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT); > > + if (type_id < 0) > > + return -EINVAL; > > + > > + nct = btf_type_by_id(btf, type_id); > > + WRITE_ONCE(nf_conn_type, nct); > > + } > > + > > + if (t != nct) { > > + bpf_log(log, "only read is supported\n"); > > + return -EACCES; > > + } > > + > > + switch (off) { > > +#if defined(CONFIG_NF_CONNTRACK_MARK) > > + case offsetof(struct nf_conn, mark): > > + end = offsetofend(struct nf_conn, mark); > > + break; > > +#endif > > + default: > > + bpf_log(log, "no write support to nf_conn at off %d\n", off); > > + return -EACCES; > > + } > > + > > + if (off + size > end) { > > + bpf_log(log, > > + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", > > + off, size, end); > > + return -EACCES; > > + } > > + > > + return NOT_INIT; > > Took me a long time to realize that this is a copy-paste > from net/ipv4/bpf_tcp_ca.c. > It's not wrong, but misleading. > When atype == BPF_READ the return value from > btf_struct_access should only be error<0, SCALAR_VALUE, PTR_TO_BTF_ID. > For atype == BPF_WRITE we should probably standardize on > error<0, or 0. > > The NOT_INIT happens to be zero, but explicit 0 > is cleaner to avoid confusion that this is somehow enum bpf_reg_type. > > Martin, > since you've added this code in bpf_tcp_ca, wdyt? Yep, sgtm. This will be less confusing.
On Wed, Aug 17, 2022 at 09:48:31PM +0200, Kumar Kartikeya Dwivedi wrote: > On Wed, 17 Aug 2022 at 20:43, Daniel Xu <dxu@dxuuu.xyz> wrote: [...] > > diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h > > index a473b56842c5..0f584c2bd475 100644 > > --- a/include/net/netfilter/nf_conntrack_bpf.h > > +++ b/include/net/netfilter/nf_conntrack_bpf.h > > @@ -3,6 +3,7 @@ > > #ifndef _NF_CONNTRACK_BPF_H > > #define _NF_CONNTRACK_BPF_H > > > > +#include <linux/bpf.h> > > #include <linux/btf.h> > > #include <linux/kconfig.h> > > > > @@ -10,6 +11,12 @@ > > (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) > > > > extern int register_nf_conntrack_bpf(void); > > +extern int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > > + const struct btf *btf, > > + const struct btf_type *t, int off, > > + int size, enum bpf_access_type atype, > > + u32 *next_btf_id, > > + enum bpf_type_flag *flag); > > > > #else > > > > @@ -18,6 +25,17 @@ static inline int register_nf_conntrack_bpf(void) > > return 0; > > } > > > > +static inline int > > +nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > > + const struct btf *btf, > > + const struct btf_type *t, int off, > > + int size, enum bpf_access_type atype, > > + u32 *next_btf_id, > > + enum bpf_type_flag *flag) > > +{ > > + return -EACCES; > > +} > > + > > We should make it work when nf_conntrack is a kernel module as well, > not just when it is compiled in. The rest of the stuff already works > when it is a module. For that, you can have a global function pointer > for this callback, protected by a mutex. register/unregister sets > it/unsets it. Each time you call it requires mutex to be held during > the call. > > Later when we have more modules that supply btf_struct_access callback > for their module types we can generalize it, for now it should be ok > to hardcode it for nf_conn. Ok, will look into that. > > > #endif > > > > #endif /* _NF_CONNTRACK_BPF_H */ [...] > > > > +/* Check writes into `struct nf_conn` */ > > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > > + const struct btf *btf, > > + const struct btf_type *t, int off, > > + int size, enum bpf_access_type atype, > > + u32 *next_btf_id, > > + enum bpf_type_flag *flag) > > +{ > > + const struct btf_type *nct = READ_ONCE(nf_conn_type); > > + s32 type_id; > > + size_t end; > > + > > + if (!nct) { > > + type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT); > > + if (type_id < 0) > > + return -EINVAL; > > + > > + nct = btf_type_by_id(btf, type_id); > > + WRITE_ONCE(nf_conn_type, nct); > > Instead of this, why not just use BTF_ID_LIST_SINGLE to get the > type_id and then match 't' to the result of btf_type_by_id? > btf_type_by_id is not expensive. Ah yeah, good idea. Will fix. [...] Thanks, Daniel
On Wed, Aug 17, 2022 at 03:05:01PM -0700, Martin KaFai Lau wrote: > On Wed, Aug 17, 2022 at 02:30:01PM -0700, Alexei Starovoitov wrote: > > On Wed, Aug 17, 2022 at 11:43 AM Daniel Xu <dxu@dxuuu.xyz> wrote: > > > > > > +/* Check writes into `struct nf_conn` */ > > > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, > > > + const struct btf *btf, > > > + const struct btf_type *t, int off, > > > + int size, enum bpf_access_type atype, > > > + u32 *next_btf_id, > > > + enum bpf_type_flag *flag) > > > +{ > > > + const struct btf_type *nct = READ_ONCE(nf_conn_type); > > > + s32 type_id; > > > + size_t end; > > > + > > > + if (!nct) { > > > + type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT); > > > + if (type_id < 0) > > > + return -EINVAL; > > > + > > > + nct = btf_type_by_id(btf, type_id); > > > + WRITE_ONCE(nf_conn_type, nct); > > > + } > > > + > > > + if (t != nct) { > > > + bpf_log(log, "only read is supported\n"); > > > + return -EACCES; > > > + } > > > + > > > + switch (off) { > > > +#if defined(CONFIG_NF_CONNTRACK_MARK) > > > + case offsetof(struct nf_conn, mark): > > > + end = offsetofend(struct nf_conn, mark); > > > + break; > > > +#endif > > > + default: > > > + bpf_log(log, "no write support to nf_conn at off %d\n", off); > > > + return -EACCES; > > > + } > > > + > > > + if (off + size > end) { > > > + bpf_log(log, > > > + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", > > > + off, size, end); > > > + return -EACCES; > > > + } > > > + > > > + return NOT_INIT; > > > > Took me a long time to realize that this is a copy-paste > > from net/ipv4/bpf_tcp_ca.c. > > It's not wrong, but misleading. > > When atype == BPF_READ the return value from > > btf_struct_access should only be error<0, SCALAR_VALUE, PTR_TO_BTF_ID. > > For atype == BPF_WRITE we should probably standardize on > > error<0, or 0. > > > > The NOT_INIT happens to be zero, but explicit 0 > > is cleaner to avoid confusion that this is somehow enum bpf_reg_type. > > > > Martin, > > since you've added this code in bpf_tcp_ca, wdyt? > Yep, sgtm. This will be less confusing. Ok, will fix both occurrences. Thanks, Daniel
Daniel Xu <dxu@dxuuu.xyz> writes: > Support direct writes to nf_conn:mark from TC and XDP prog types. This > is useful when applications want to store per-connection metadata. This > is also particularly useful for applications that run both bpf and > iptables/nftables because the latter can trivially access this > metadata. Looking closer at the nf_conn definition, the mark field (and possibly secmark) seems to be the only field that is likely to be feasible to support direct writes to, as everything else either requires special handling (like status and timeout), or they are composite field that will require helpers anyway to use correctly. Which means we're in the process of creating an API where users have to call helpers to fill in all fields *except* this one field that happens to be directly writable. That seems like a really confusing and inconsistent API, so IMO it strengthens the case for just making a helper for this field as well, even though it adds a bit of overhead (and then solving the overhead issue in a more generic way such as by supporting clever inlining). -Toke
Hi Toke, On Thu, Aug 18, 2022 at 09:52:08PM +0200, Toke Høiland-Jørgensen wrote: > Daniel Xu <dxu@dxuuu.xyz> writes: > > > Support direct writes to nf_conn:mark from TC and XDP prog types. This > > is useful when applications want to store per-connection metadata. This > > is also particularly useful for applications that run both bpf and > > iptables/nftables because the latter can trivially access this > > metadata. > > Looking closer at the nf_conn definition, the mark field (and possibly > secmark) seems to be the only field that is likely to be feasible to > support direct writes to, as everything else either requires special > handling (like status and timeout), or they are composite field that > will require helpers anyway to use correctly. > > Which means we're in the process of creating an API where users have to > call helpers to fill in all fields *except* this one field that happens > to be directly writable. That seems like a really confusing and > inconsistent API, so IMO it strengthens the case for just making a > helper for this field as well, even though it adds a bit of overhead > (and then solving the overhead issue in a more generic way such as by > supporting clever inlining). > > -Toke I don't particularly have a strong opinion here. But to play devil's advocate: * It may be confusing now, but over time I expect to see more direct write support via BTF, especially b/c there is support for unstable helpers now. So perhaps in the future it will seem more sensible. * The unstable helpers do not have external documentation. Nor should they in my opinion as their unstableness + stale docs may lead to undesirable outcomes. So users of the unstable API already have to splunk through kernel code and/or selftests to figure out how to wield the APIs. All this to say there may not be an argument for discoverability. * Direct writes are slightly more ergnomic than using a helper. Thanks, Daniel
Daniel Xu <dxu@dxuuu.xyz> writes: > Hi Toke, > > On Thu, Aug 18, 2022 at 09:52:08PM +0200, Toke Høiland-Jørgensen wrote: >> Daniel Xu <dxu@dxuuu.xyz> writes: >> >> > Support direct writes to nf_conn:mark from TC and XDP prog types. This >> > is useful when applications want to store per-connection metadata. This >> > is also particularly useful for applications that run both bpf and >> > iptables/nftables because the latter can trivially access this >> > metadata. >> >> Looking closer at the nf_conn definition, the mark field (and possibly >> secmark) seems to be the only field that is likely to be feasible to >> support direct writes to, as everything else either requires special >> handling (like status and timeout), or they are composite field that >> will require helpers anyway to use correctly. >> >> Which means we're in the process of creating an API where users have to >> call helpers to fill in all fields *except* this one field that happens >> to be directly writable. That seems like a really confusing and >> inconsistent API, so IMO it strengthens the case for just making a >> helper for this field as well, even though it adds a bit of overhead >> (and then solving the overhead issue in a more generic way such as by >> supporting clever inlining). >> >> -Toke > > I don't particularly have a strong opinion here. But to play devil's > advocate: > > * It may be confusing now, but over time I expect to see more direct > write support via BTF, especially b/c there is support for unstable > helpers now. So perhaps in the future it will seem more sensible. Right, sure, for other structs. My point was that it doesn't look like this particular one (nf_conn) is likely to grow any other members we can access directly, so it'll be a weird one-off for that single field... > * The unstable helpers do not have external documentation. Nor should > they in my opinion as their unstableness + stale docs may lead to > undesirable outcomes. So users of the unstable API already have to > splunk through kernel code and/or selftests to figure out how to wield > the APIs. All this to say there may not be an argument for > discoverability. This I don't buy at all. Just because it's (supposedly) "unstable" is no excuse to design a bad API, or make it actively user-hostile by hiding things so users have to go browse kernel code to know how to use it. So in any case, we should definitely document everything. > * Direct writes are slightly more ergnomic than using a helper. This is true, and that's the main argument for doing it this way. The point of my previous email was that since it's only a single field, consistency weighs heavier than ergonomics :) -Toke
On Fri, Aug 19, 2022 at 6:05 AM Toke Høiland-Jørgensen <toke@kernel.org> wrote: > > Daniel Xu <dxu@dxuuu.xyz> writes: > > > Hi Toke, > > > > On Thu, Aug 18, 2022 at 09:52:08PM +0200, Toke Høiland-Jørgensen wrote: > >> Daniel Xu <dxu@dxuuu.xyz> writes: > >> > >> > Support direct writes to nf_conn:mark from TC and XDP prog types. This > >> > is useful when applications want to store per-connection metadata. This > >> > is also particularly useful for applications that run both bpf and > >> > iptables/nftables because the latter can trivially access this > >> > metadata. > >> > >> Looking closer at the nf_conn definition, the mark field (and possibly > >> secmark) seems to be the only field that is likely to be feasible to > >> support direct writes to, as everything else either requires special > >> handling (like status and timeout), or they are composite field that > >> will require helpers anyway to use correctly. > >> > >> Which means we're in the process of creating an API where users have to > >> call helpers to fill in all fields *except* this one field that happens > >> to be directly writable. That seems like a really confusing and > >> inconsistent API, so IMO it strengthens the case for just making a > >> helper for this field as well, even though it adds a bit of overhead > >> (and then solving the overhead issue in a more generic way such as by > >> supporting clever inlining). > >> > >> -Toke > > > > I don't particularly have a strong opinion here. But to play devil's > > advocate: > > > > * It may be confusing now, but over time I expect to see more direct > > write support via BTF, especially b/c there is support for unstable > > helpers now. So perhaps in the future it will seem more sensible. > > Right, sure, for other structs. My point was that it doesn't look like > this particular one (nf_conn) is likely to grow any other members we can > access directly, so it'll be a weird one-off for that single field... > > > * The unstable helpers do not have external documentation. Nor should > > they in my opinion as their unstableness + stale docs may lead to > > undesirable outcomes. So users of the unstable API already have to > > splunk through kernel code and/or selftests to figure out how to wield > > the APIs. All this to say there may not be an argument for > > discoverability. > > This I don't buy at all. Just because it's (supposedly) "unstable" is no They're unstable. Please don't start this 'but can we actually remove them' doubts. You're only confusing yourself and others. We tweaked kfuncs already. We removed tracepoints too after they were in a full kernel release. > excuse to design a bad API, or make it actively user-hostile by hiding 'bad API'? what? It's a direct field write. We do allow it in other data structures. > things so users have to go browse kernel code to know how to use it. So > in any case, we should definitely document everything. > > > * Direct writes are slightly more ergnomic than using a helper. > > This is true, and that's the main argument for doing it this way. The > point of my previous email was that since it's only a single field, > consistency weighs heavier than ergonomics :) I don't think the 'consistency' argument applies here. We already allow direct read of all fields. Also the field access is easier to handle with CO-RE.
diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index a473b56842c5..0f584c2bd475 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -3,6 +3,7 @@ #ifndef _NF_CONNTRACK_BPF_H #define _NF_CONNTRACK_BPF_H +#include <linux/bpf.h> #include <linux/btf.h> #include <linux/kconfig.h> @@ -10,6 +11,12 @@ (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) extern int register_nf_conntrack_bpf(void); +extern int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag); #else @@ -18,6 +25,17 @@ static inline int register_nf_conntrack_bpf(void) return 0; } +static inline int +nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + return -EACCES; +} + #endif #endif /* _NF_CONNTRACK_BPF_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 5669248aff25..d7b768fe9de7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,6 +18,7 @@ */ #include <linux/atomic.h> +#include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> @@ -55,6 +56,7 @@ #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> +#include <net/netfilter/nf_conntrack_bpf.h> #include <net/xfrm.h> #include <net/udp.h> #include <linux/bpf_trace.h> @@ -8710,6 +8712,21 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype, + next_btf_id, flag); +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -8769,6 +8786,21 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype, + next_btf_id, flag); +} + static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -10663,6 +10695,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10674,6 +10707,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 1cd87b28c9b0..8010cc542d17 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -6,6 +6,7 @@ * are exposed through to BPF programs is explicitly unstable. */ +#include <linux/bpf_verifier.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/types.h> @@ -15,6 +16,8 @@ #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> +static const struct btf_type *nf_conn_type; + /* bpf_ct_opts - Options for CT lookup helpers * * Members: @@ -184,6 +187,53 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, return ct; } +/* Check writes into `struct nf_conn` */ +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + const struct btf_type *nct = READ_ONCE(nf_conn_type); + s32 type_id; + size_t end; + + if (!nct) { + type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + + nct = btf_type_by_id(btf, type_id); + WRITE_ONCE(nf_conn_type, nct); + } + + if (t != nct) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + switch (off) { +#if defined(CONFIG_NF_CONNTRACK_MARK) + case offsetof(struct nf_conn, mark): + end = offsetofend(struct nf_conn, mark); + break; +#endif + default: + bpf_log(log, "no write support to nf_conn at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", + off, size, end); + return -EACCES; + } + + return NOT_INIT; +} + __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in nf_conntrack BTF");
Support direct writes to nf_conn:mark from TC and XDP prog types. This is useful when applications want to store per-connection metadata. This is also particularly useful for applications that run both bpf and iptables/nftables because the latter can trivially access this metadata. One example use case would be if a bpf prog is responsible for advanced packet classification and iptables/nftables is later used for routing due to pre-existing/legacy code. Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> --- include/net/netfilter/nf_conntrack_bpf.h | 18 +++++++++ net/core/filter.c | 34 ++++++++++++++++ net/netfilter/nf_conntrack_bpf.c | 50 ++++++++++++++++++++++++ 3 files changed, 102 insertions(+)