diff mbox series

[bpf-next,v2,3/4] bpf: Add support for writing to nf_conn:mark

Message ID edbca42217a73161903a50ba07ec63c5fa5fde00.1660761470.git.dxu@dxuuu.xyz (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Support direct writes to nf_conn:mark | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR pending PR summary
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 26 this patch: 26
netdev/cc_maintainers warning 15 maintainers not CCed: john.fastabend@gmail.com song@kernel.org sdf@google.com hawk@kernel.org martin.lau@linux.dev davem@davemloft.net kadlec@netfilter.org edumazet@google.com kpsingh@kernel.org coreteam@netfilter.org kuba@kernel.org jolsa@kernel.org pabeni@redhat.com haoluo@google.com yhs@fb.com
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 26 this patch: 26
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files WARNING: line length of 81 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-16
bpf/vmtest-bpf-next-VM_Test-3 pending Logs for Kernel LATEST on z15 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix

Commit Message

Daniel Xu Aug. 17, 2022, 6:43 p.m. UTC
Support direct writes to nf_conn:mark from TC and XDP prog types. This
is useful when applications want to store per-connection metadata. This
is also particularly useful for applications that run both bpf and
iptables/nftables because the latter can trivially access this metadata.

One example use case would be if a bpf prog is responsible for advanced
packet classification and iptables/nftables is later used for routing
due to pre-existing/legacy code.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 include/net/netfilter/nf_conntrack_bpf.h | 18 +++++++++
 net/core/filter.c                        | 34 ++++++++++++++++
 net/netfilter/nf_conntrack_bpf.c         | 50 ++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

Comments

Kumar Kartikeya Dwivedi Aug. 17, 2022, 7:48 p.m. UTC | #1
On Wed, 17 Aug 2022 at 20:43, Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> Support direct writes to nf_conn:mark from TC and XDP prog types. This
> is useful when applications want to store per-connection metadata. This
> is also particularly useful for applications that run both bpf and
> iptables/nftables because the latter can trivially access this metadata.
>
> One example use case would be if a bpf prog is responsible for advanced
> packet classification and iptables/nftables is later used for routing
> due to pre-existing/legacy code.
>
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> ---
>  include/net/netfilter/nf_conntrack_bpf.h | 18 +++++++++
>  net/core/filter.c                        | 34 ++++++++++++++++
>  net/netfilter/nf_conntrack_bpf.c         | 50 ++++++++++++++++++++++++
>  3 files changed, 102 insertions(+)
>
> diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
> index a473b56842c5..0f584c2bd475 100644
> --- a/include/net/netfilter/nf_conntrack_bpf.h
> +++ b/include/net/netfilter/nf_conntrack_bpf.h
> @@ -3,6 +3,7 @@
>  #ifndef _NF_CONNTRACK_BPF_H
>  #define _NF_CONNTRACK_BPF_H
>
> +#include <linux/bpf.h>
>  #include <linux/btf.h>
>  #include <linux/kconfig.h>
>
> @@ -10,6 +11,12 @@
>      (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
>
>  extern int register_nf_conntrack_bpf(void);
> +extern int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> +                                         const struct btf *btf,
> +                                         const struct btf_type *t, int off,
> +                                         int size, enum bpf_access_type atype,
> +                                         u32 *next_btf_id,
> +                                         enum bpf_type_flag *flag);
>
>  #else
>
> @@ -18,6 +25,17 @@ static inline int register_nf_conntrack_bpf(void)
>         return 0;
>  }
>
> +static inline int
> +nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> +                              const struct btf *btf,
> +                              const struct btf_type *t, int off,
> +                              int size, enum bpf_access_type atype,
> +                              u32 *next_btf_id,
> +                              enum bpf_type_flag *flag)
> +{
> +       return -EACCES;
> +}
> +

We should make it work when nf_conntrack is a kernel module as well,
not just when it is compiled in. The rest of the stuff already works
when it is a module. For that, you can have a global function pointer
for this callback, protected by a mutex. register/unregister sets
it/unsets it. Each time you call it requires mutex to be held during
the call.

Later when we have more modules that supply btf_struct_access callback
for their module types we can generalize it, for now it should be ok
to hardcode it for nf_conn.

>  #endif
>
>  #endif /* _NF_CONNTRACK_BPF_H */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 5669248aff25..d7b768fe9de7 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -18,6 +18,7 @@
>   */
>
>  #include <linux/atomic.h>
> +#include <linux/bpf_verifier.h>
>  #include <linux/module.h>
>  #include <linux/types.h>
>  #include <linux/mm.h>
> @@ -55,6 +56,7 @@
>  #include <net/sock_reuseport.h>
>  #include <net/busy_poll.h>
>  #include <net/tcp.h>
> +#include <net/netfilter/nf_conntrack_bpf.h>
>  #include <net/xfrm.h>
>  #include <net/udp.h>
>  #include <linux/bpf_trace.h>
> @@ -8710,6 +8712,21 @@ static bool tc_cls_act_is_valid_access(int off, int size,
>         return bpf_skb_is_valid_access(off, size, type, prog, info);
>  }
>
> +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
> +                                       const struct btf *btf,
> +                                       const struct btf_type *t, int off,
> +                                       int size, enum bpf_access_type atype,
> +                                       u32 *next_btf_id,
> +                                       enum bpf_type_flag *flag)
> +{
> +       if (atype == BPF_READ)
> +               return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
> +                                        flag);
> +
> +       return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype,
> +                                             next_btf_id, flag);
> +}
> +
>  static bool __is_valid_xdp_access(int off, int size)
>  {
>         if (off < 0 || off >= sizeof(struct xdp_md))
> @@ -8769,6 +8786,21 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog,
>  }
>  EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
>
> +static int xdp_btf_struct_access(struct bpf_verifier_log *log,
> +                                const struct btf *btf,
> +                                const struct btf_type *t, int off,
> +                                int size, enum bpf_access_type atype,
> +                                u32 *next_btf_id,
> +                                enum bpf_type_flag *flag)
> +{
> +       if (atype == BPF_READ)
> +               return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
> +                                        flag);
> +
> +       return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype,
> +                                             next_btf_id, flag);
> +}
> +
>  static bool sock_addr_is_valid_access(int off, int size,
>                                       enum bpf_access_type type,
>                                       const struct bpf_prog *prog,
> @@ -10663,6 +10695,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
>         .convert_ctx_access     = tc_cls_act_convert_ctx_access,
>         .gen_prologue           = tc_cls_act_prologue,
>         .gen_ld_abs             = bpf_gen_ld_abs,
> +       .btf_struct_access      = tc_cls_act_btf_struct_access,
>  };
>
>  const struct bpf_prog_ops tc_cls_act_prog_ops = {
> @@ -10674,6 +10707,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
>         .is_valid_access        = xdp_is_valid_access,
>         .convert_ctx_access     = xdp_convert_ctx_access,
>         .gen_prologue           = bpf_noop_prologue,
> +       .btf_struct_access      = xdp_btf_struct_access,
>  };
>
>  const struct bpf_prog_ops xdp_prog_ops = {
> diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
> index 1cd87b28c9b0..8010cc542d17 100644
> --- a/net/netfilter/nf_conntrack_bpf.c
> +++ b/net/netfilter/nf_conntrack_bpf.c
> @@ -6,6 +6,7 @@
>   * are exposed through to BPF programs is explicitly unstable.
>   */
>
> +#include <linux/bpf_verifier.h>
>  #include <linux/bpf.h>
>  #include <linux/btf.h>
>  #include <linux/types.h>
> @@ -15,6 +16,8 @@
>  #include <net/netfilter/nf_conntrack_bpf.h>
>  #include <net/netfilter/nf_conntrack_core.h>
>
> +static const struct btf_type *nf_conn_type;
> +
>  /* bpf_ct_opts - Options for CT lookup helpers
>   *
>   * Members:
> @@ -184,6 +187,53 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
>         return ct;
>  }
>
> +/* Check writes into `struct nf_conn` */
> +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> +                                  const struct btf *btf,
> +                                  const struct btf_type *t, int off,
> +                                  int size, enum bpf_access_type atype,
> +                                  u32 *next_btf_id,
> +                                  enum bpf_type_flag *flag)
> +{
> +       const struct btf_type *nct = READ_ONCE(nf_conn_type);
> +       s32 type_id;
> +       size_t end;
> +
> +       if (!nct) {
> +               type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT);
> +               if (type_id < 0)
> +                       return -EINVAL;
> +
> +               nct = btf_type_by_id(btf, type_id);
> +               WRITE_ONCE(nf_conn_type, nct);

Instead of this, why not just use BTF_ID_LIST_SINGLE to get the
type_id and then match 't' to the result of btf_type_by_id?
btf_type_by_id is not expensive.

> +       }
> +
> +       if (t != nct) {
> +               bpf_log(log, "only read is supported\n");
> +               return -EACCES;
> +       }
> +
> +       switch (off) {
> +#if defined(CONFIG_NF_CONNTRACK_MARK)
> +       case offsetof(struct nf_conn, mark):
> +               end = offsetofend(struct nf_conn, mark);
> +               break;
> +#endif
> +       default:
> +               bpf_log(log, "no write support to nf_conn at off %d\n", off);
> +               return -EACCES;
> +       }
> +
> +       if (off + size > end) {
> +               bpf_log(log,
> +                       "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
> +                       off, size, end);
> +               return -EACCES;
> +       }
> +
> +       return NOT_INIT;
> +}
> +
>  __diag_push();
>  __diag_ignore_all("-Wmissing-prototypes",
>                   "Global functions as their definitions will be in nf_conntrack BTF");
> --
> 2.37.1
>
Alexei Starovoitov Aug. 17, 2022, 9:30 p.m. UTC | #2
On Wed, Aug 17, 2022 at 11:43 AM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> +/* Check writes into `struct nf_conn` */
> +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> +                                  const struct btf *btf,
> +                                  const struct btf_type *t, int off,
> +                                  int size, enum bpf_access_type atype,
> +                                  u32 *next_btf_id,
> +                                  enum bpf_type_flag *flag)
> +{
> +       const struct btf_type *nct = READ_ONCE(nf_conn_type);
> +       s32 type_id;
> +       size_t end;
> +
> +       if (!nct) {
> +               type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT);
> +               if (type_id < 0)
> +                       return -EINVAL;
> +
> +               nct = btf_type_by_id(btf, type_id);
> +               WRITE_ONCE(nf_conn_type, nct);
> +       }
> +
> +       if (t != nct) {
> +               bpf_log(log, "only read is supported\n");
> +               return -EACCES;
> +       }
> +
> +       switch (off) {
> +#if defined(CONFIG_NF_CONNTRACK_MARK)
> +       case offsetof(struct nf_conn, mark):
> +               end = offsetofend(struct nf_conn, mark);
> +               break;
> +#endif
> +       default:
> +               bpf_log(log, "no write support to nf_conn at off %d\n", off);
> +               return -EACCES;
> +       }
> +
> +       if (off + size > end) {
> +               bpf_log(log,
> +                       "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
> +                       off, size, end);
> +               return -EACCES;
> +       }
> +
> +       return NOT_INIT;

Took me a long time to realize that this is a copy-paste
from net/ipv4/bpf_tcp_ca.c.
It's not wrong, but misleading.
When atype == BPF_READ the return value from
btf_struct_access should only be error<0, SCALAR_VALUE, PTR_TO_BTF_ID.
For atype == BPF_WRITE we should probably standardize on
error<0, or 0.

The NOT_INIT happens to be zero, but explicit 0
is cleaner to avoid confusion that this is somehow enum bpf_reg_type.

Martin,
since you've added this code in bpf_tcp_ca, wdyt?
Martin KaFai Lau Aug. 17, 2022, 10:05 p.m. UTC | #3
On Wed, Aug 17, 2022 at 02:30:01PM -0700, Alexei Starovoitov wrote:
> On Wed, Aug 17, 2022 at 11:43 AM Daniel Xu <dxu@dxuuu.xyz> wrote:
> >
> > +/* Check writes into `struct nf_conn` */
> > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> > +                                  const struct btf *btf,
> > +                                  const struct btf_type *t, int off,
> > +                                  int size, enum bpf_access_type atype,
> > +                                  u32 *next_btf_id,
> > +                                  enum bpf_type_flag *flag)
> > +{
> > +       const struct btf_type *nct = READ_ONCE(nf_conn_type);
> > +       s32 type_id;
> > +       size_t end;
> > +
> > +       if (!nct) {
> > +               type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT);
> > +               if (type_id < 0)
> > +                       return -EINVAL;
> > +
> > +               nct = btf_type_by_id(btf, type_id);
> > +               WRITE_ONCE(nf_conn_type, nct);
> > +       }
> > +
> > +       if (t != nct) {
> > +               bpf_log(log, "only read is supported\n");
> > +               return -EACCES;
> > +       }
> > +
> > +       switch (off) {
> > +#if defined(CONFIG_NF_CONNTRACK_MARK)
> > +       case offsetof(struct nf_conn, mark):
> > +               end = offsetofend(struct nf_conn, mark);
> > +               break;
> > +#endif
> > +       default:
> > +               bpf_log(log, "no write support to nf_conn at off %d\n", off);
> > +               return -EACCES;
> > +       }
> > +
> > +       if (off + size > end) {
> > +               bpf_log(log,
> > +                       "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
> > +                       off, size, end);
> > +               return -EACCES;
> > +       }
> > +
> > +       return NOT_INIT;
> 
> Took me a long time to realize that this is a copy-paste
> from net/ipv4/bpf_tcp_ca.c.
> It's not wrong, but misleading.
> When atype == BPF_READ the return value from
> btf_struct_access should only be error<0, SCALAR_VALUE, PTR_TO_BTF_ID.
> For atype == BPF_WRITE we should probably standardize on
> error<0, or 0.
> 
> The NOT_INIT happens to be zero, but explicit 0
> is cleaner to avoid confusion that this is somehow enum bpf_reg_type.
> 
> Martin,
> since you've added this code in bpf_tcp_ca, wdyt?
Yep, sgtm.  This will be less confusing.
Daniel Xu Aug. 18, 2022, 7:31 p.m. UTC | #4
On Wed, Aug 17, 2022 at 09:48:31PM +0200, Kumar Kartikeya Dwivedi wrote:
> On Wed, 17 Aug 2022 at 20:43, Daniel Xu <dxu@dxuuu.xyz> wrote:
[...]
> > diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
> > index a473b56842c5..0f584c2bd475 100644
> > --- a/include/net/netfilter/nf_conntrack_bpf.h
> > +++ b/include/net/netfilter/nf_conntrack_bpf.h
> > @@ -3,6 +3,7 @@
> >  #ifndef _NF_CONNTRACK_BPF_H
> >  #define _NF_CONNTRACK_BPF_H
> >
> > +#include <linux/bpf.h>
> >  #include <linux/btf.h>
> >  #include <linux/kconfig.h>
> >
> > @@ -10,6 +11,12 @@
> >      (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
> >
> >  extern int register_nf_conntrack_bpf(void);
> > +extern int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> > +                                         const struct btf *btf,
> > +                                         const struct btf_type *t, int off,
> > +                                         int size, enum bpf_access_type atype,
> > +                                         u32 *next_btf_id,
> > +                                         enum bpf_type_flag *flag);
> >
> >  #else
> >
> > @@ -18,6 +25,17 @@ static inline int register_nf_conntrack_bpf(void)
> >         return 0;
> >  }
> >
> > +static inline int
> > +nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> > +                              const struct btf *btf,
> > +                              const struct btf_type *t, int off,
> > +                              int size, enum bpf_access_type atype,
> > +                              u32 *next_btf_id,
> > +                              enum bpf_type_flag *flag)
> > +{
> > +       return -EACCES;
> > +}
> > +
> 
> We should make it work when nf_conntrack is a kernel module as well,
> not just when it is compiled in. The rest of the stuff already works
> when it is a module. For that, you can have a global function pointer
> for this callback, protected by a mutex. register/unregister sets
> it/unsets it. Each time you call it requires mutex to be held during
> the call.
> 
> Later when we have more modules that supply btf_struct_access callback
> for their module types we can generalize it, for now it should be ok
> to hardcode it for nf_conn.

Ok, will look into that.

> 
> >  #endif
> >
> >  #endif /* _NF_CONNTRACK_BPF_H */
[...]
> >
> > +/* Check writes into `struct nf_conn` */
> > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> > +                                  const struct btf *btf,
> > +                                  const struct btf_type *t, int off,
> > +                                  int size, enum bpf_access_type atype,
> > +                                  u32 *next_btf_id,
> > +                                  enum bpf_type_flag *flag)
> > +{
> > +       const struct btf_type *nct = READ_ONCE(nf_conn_type);
> > +       s32 type_id;
> > +       size_t end;
> > +
> > +       if (!nct) {
> > +               type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT);
> > +               if (type_id < 0)
> > +                       return -EINVAL;
> > +
> > +               nct = btf_type_by_id(btf, type_id);
> > +               WRITE_ONCE(nf_conn_type, nct);
> 
> Instead of this, why not just use BTF_ID_LIST_SINGLE to get the
> type_id and then match 't' to the result of btf_type_by_id?
> btf_type_by_id is not expensive.

Ah yeah, good idea. Will fix.

[...]

Thanks,
Daniel
Daniel Xu Aug. 18, 2022, 7:31 p.m. UTC | #5
On Wed, Aug 17, 2022 at 03:05:01PM -0700, Martin KaFai Lau wrote:
> On Wed, Aug 17, 2022 at 02:30:01PM -0700, Alexei Starovoitov wrote:
> > On Wed, Aug 17, 2022 at 11:43 AM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > >
> > > +/* Check writes into `struct nf_conn` */
> > > +int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
> > > +                                  const struct btf *btf,
> > > +                                  const struct btf_type *t, int off,
> > > +                                  int size, enum bpf_access_type atype,
> > > +                                  u32 *next_btf_id,
> > > +                                  enum bpf_type_flag *flag)
> > > +{
> > > +       const struct btf_type *nct = READ_ONCE(nf_conn_type);
> > > +       s32 type_id;
> > > +       size_t end;
> > > +
> > > +       if (!nct) {
> > > +               type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT);
> > > +               if (type_id < 0)
> > > +                       return -EINVAL;
> > > +
> > > +               nct = btf_type_by_id(btf, type_id);
> > > +               WRITE_ONCE(nf_conn_type, nct);
> > > +       }
> > > +
> > > +       if (t != nct) {
> > > +               bpf_log(log, "only read is supported\n");
> > > +               return -EACCES;
> > > +       }
> > > +
> > > +       switch (off) {
> > > +#if defined(CONFIG_NF_CONNTRACK_MARK)
> > > +       case offsetof(struct nf_conn, mark):
> > > +               end = offsetofend(struct nf_conn, mark);
> > > +               break;
> > > +#endif
> > > +       default:
> > > +               bpf_log(log, "no write support to nf_conn at off %d\n", off);
> > > +               return -EACCES;
> > > +       }
> > > +
> > > +       if (off + size > end) {
> > > +               bpf_log(log,
> > > +                       "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
> > > +                       off, size, end);
> > > +               return -EACCES;
> > > +       }
> > > +
> > > +       return NOT_INIT;
> > 
> > Took me a long time to realize that this is a copy-paste
> > from net/ipv4/bpf_tcp_ca.c.
> > It's not wrong, but misleading.
> > When atype == BPF_READ the return value from
> > btf_struct_access should only be error<0, SCALAR_VALUE, PTR_TO_BTF_ID.
> > For atype == BPF_WRITE we should probably standardize on
> > error<0, or 0.
> > 
> > The NOT_INIT happens to be zero, but explicit 0
> > is cleaner to avoid confusion that this is somehow enum bpf_reg_type.
> > 
> > Martin,
> > since you've added this code in bpf_tcp_ca, wdyt?
> Yep, sgtm.  This will be less confusing.

Ok, will fix both occurrences. 

Thanks,
Daniel
Toke Høiland-Jørgensen Aug. 18, 2022, 7:52 p.m. UTC | #6
Daniel Xu <dxu@dxuuu.xyz> writes:

> Support direct writes to nf_conn:mark from TC and XDP prog types. This
> is useful when applications want to store per-connection metadata. This
> is also particularly useful for applications that run both bpf and
> iptables/nftables because the latter can trivially access this
> metadata.

Looking closer at the nf_conn definition, the mark field (and possibly
secmark) seems to be the only field that is likely to be feasible to
support direct writes to, as everything else either requires special
handling (like status and timeout), or they are composite field that
will require helpers anyway to use correctly.

Which means we're in the process of creating an API where users have to
call helpers to fill in all fields *except* this one field that happens
to be directly writable. That seems like a really confusing and
inconsistent API, so IMO it strengthens the case for just making a
helper for this field as well, even though it adds a bit of overhead
(and then solving the overhead issue in a more generic way such as by
supporting clever inlining).

-Toke
Daniel Xu Aug. 18, 2022, 10:10 p.m. UTC | #7
Hi Toke,

On Thu, Aug 18, 2022 at 09:52:08PM +0200, Toke Høiland-Jørgensen wrote:
> Daniel Xu <dxu@dxuuu.xyz> writes:
> 
> > Support direct writes to nf_conn:mark from TC and XDP prog types. This
> > is useful when applications want to store per-connection metadata. This
> > is also particularly useful for applications that run both bpf and
> > iptables/nftables because the latter can trivially access this
> > metadata.
> 
> Looking closer at the nf_conn definition, the mark field (and possibly
> secmark) seems to be the only field that is likely to be feasible to
> support direct writes to, as everything else either requires special
> handling (like status and timeout), or they are composite field that
> will require helpers anyway to use correctly.
> 
> Which means we're in the process of creating an API where users have to
> call helpers to fill in all fields *except* this one field that happens
> to be directly writable. That seems like a really confusing and
> inconsistent API, so IMO it strengthens the case for just making a
> helper for this field as well, even though it adds a bit of overhead
> (and then solving the overhead issue in a more generic way such as by
> supporting clever inlining).
> 
> -Toke

I don't particularly have a strong opinion here. But to play devil's
advocate:

* It may be confusing now, but over time I expect to see more direct
  write support via BTF, especially b/c there is support for unstable
  helpers now. So perhaps in the future it will seem more sensible.

* The unstable helpers do not have external documentation. Nor should
  they in my opinion as their unstableness + stale docs may lead to
  undesirable outcomes. So users of the unstable API already have to
  splunk through kernel code and/or selftests to figure out how to wield
  the APIs. All this to say there may not be an argument for
  discoverability.

* Direct writes are slightly more ergnomic than using a helper.

Thanks,
Daniel
Toke Høiland-Jørgensen Aug. 19, 2022, 1:05 p.m. UTC | #8
Daniel Xu <dxu@dxuuu.xyz> writes:

> Hi Toke,
>
> On Thu, Aug 18, 2022 at 09:52:08PM +0200, Toke Høiland-Jørgensen wrote:
>> Daniel Xu <dxu@dxuuu.xyz> writes:
>> 
>> > Support direct writes to nf_conn:mark from TC and XDP prog types. This
>> > is useful when applications want to store per-connection metadata. This
>> > is also particularly useful for applications that run both bpf and
>> > iptables/nftables because the latter can trivially access this
>> > metadata.
>> 
>> Looking closer at the nf_conn definition, the mark field (and possibly
>> secmark) seems to be the only field that is likely to be feasible to
>> support direct writes to, as everything else either requires special
>> handling (like status and timeout), or they are composite field that
>> will require helpers anyway to use correctly.
>> 
>> Which means we're in the process of creating an API where users have to
>> call helpers to fill in all fields *except* this one field that happens
>> to be directly writable. That seems like a really confusing and
>> inconsistent API, so IMO it strengthens the case for just making a
>> helper for this field as well, even though it adds a bit of overhead
>> (and then solving the overhead issue in a more generic way such as by
>> supporting clever inlining).
>> 
>> -Toke
>
> I don't particularly have a strong opinion here. But to play devil's
> advocate:
>
> * It may be confusing now, but over time I expect to see more direct
>   write support via BTF, especially b/c there is support for unstable
>   helpers now. So perhaps in the future it will seem more sensible.

Right, sure, for other structs. My point was that it doesn't look like
this particular one (nf_conn) is likely to grow any other members we can
access directly, so it'll be a weird one-off for that single field...

> * The unstable helpers do not have external documentation. Nor should
>   they in my opinion as their unstableness + stale docs may lead to
>   undesirable outcomes. So users of the unstable API already have to
>   splunk through kernel code and/or selftests to figure out how to wield
>   the APIs. All this to say there may not be an argument for
>   discoverability.

This I don't buy at all. Just because it's (supposedly) "unstable" is no
excuse to design a bad API, or make it actively user-hostile by hiding
things so users have to go browse kernel code to know how to use it. So
in any case, we should definitely document everything.

> * Direct writes are slightly more ergnomic than using a helper.

This is true, and that's the main argument for doing it this way. The
point of my previous email was that since it's only a single field,
consistency weighs heavier than ergonomics :)

-Toke
Alexei Starovoitov Aug. 19, 2022, 4:39 p.m. UTC | #9
On Fri, Aug 19, 2022 at 6:05 AM Toke Høiland-Jørgensen <toke@kernel.org> wrote:
>
> Daniel Xu <dxu@dxuuu.xyz> writes:
>
> > Hi Toke,
> >
> > On Thu, Aug 18, 2022 at 09:52:08PM +0200, Toke Høiland-Jørgensen wrote:
> >> Daniel Xu <dxu@dxuuu.xyz> writes:
> >>
> >> > Support direct writes to nf_conn:mark from TC and XDP prog types. This
> >> > is useful when applications want to store per-connection metadata. This
> >> > is also particularly useful for applications that run both bpf and
> >> > iptables/nftables because the latter can trivially access this
> >> > metadata.
> >>
> >> Looking closer at the nf_conn definition, the mark field (and possibly
> >> secmark) seems to be the only field that is likely to be feasible to
> >> support direct writes to, as everything else either requires special
> >> handling (like status and timeout), or they are composite field that
> >> will require helpers anyway to use correctly.
> >>
> >> Which means we're in the process of creating an API where users have to
> >> call helpers to fill in all fields *except* this one field that happens
> >> to be directly writable. That seems like a really confusing and
> >> inconsistent API, so IMO it strengthens the case for just making a
> >> helper for this field as well, even though it adds a bit of overhead
> >> (and then solving the overhead issue in a more generic way such as by
> >> supporting clever inlining).
> >>
> >> -Toke
> >
> > I don't particularly have a strong opinion here. But to play devil's
> > advocate:
> >
> > * It may be confusing now, but over time I expect to see more direct
> >   write support via BTF, especially b/c there is support for unstable
> >   helpers now. So perhaps in the future it will seem more sensible.
>
> Right, sure, for other structs. My point was that it doesn't look like
> this particular one (nf_conn) is likely to grow any other members we can
> access directly, so it'll be a weird one-off for that single field...
>
> > * The unstable helpers do not have external documentation. Nor should
> >   they in my opinion as their unstableness + stale docs may lead to
> >   undesirable outcomes. So users of the unstable API already have to
> >   splunk through kernel code and/or selftests to figure out how to wield
> >   the APIs. All this to say there may not be an argument for
> >   discoverability.
>
> This I don't buy at all. Just because it's (supposedly) "unstable" is no

They're unstable. Please don't start this 'but can we actually remove
them' doubts. You're only confusing yourself and others.
We tweaked kfuncs already. We removed tracepoints too after they
were in a full kernel release.

> excuse to design a bad API, or make it actively user-hostile by hiding

'bad API'? what? It's a direct field write.
We do allow it in other data structures.

> things so users have to go browse kernel code to know how to use it. So
> in any case, we should definitely document everything.
>
> > * Direct writes are slightly more ergnomic than using a helper.
>
> This is true, and that's the main argument for doing it this way. The
> point of my previous email was that since it's only a single field,
> consistency weighs heavier than ergonomics :)

I don't think the 'consistency' argument applies here.
We already allow direct read of all fields.
Also the field access is easier to handle with CO-RE.
diff mbox series

Patch

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index a473b56842c5..0f584c2bd475 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -3,6 +3,7 @@ 
 #ifndef _NF_CONNTRACK_BPF_H
 #define _NF_CONNTRACK_BPF_H
 
+#include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/kconfig.h>
 
@@ -10,6 +11,12 @@ 
     (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
 
 extern int register_nf_conntrack_bpf(void);
+extern int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+					  const struct btf *btf,
+					  const struct btf_type *t, int off,
+					  int size, enum bpf_access_type atype,
+					  u32 *next_btf_id,
+					  enum bpf_type_flag *flag);
 
 #else
 
@@ -18,6 +25,17 @@  static inline int register_nf_conntrack_bpf(void)
 	return 0;
 }
 
+static inline int
+nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+			       const struct btf *btf,
+			       const struct btf_type *t, int off,
+			       int size, enum bpf_access_type atype,
+			       u32 *next_btf_id,
+			       enum bpf_type_flag *flag)
+{
+	return -EACCES;
+}
+
 #endif
 
 #endif /* _NF_CONNTRACK_BPF_H */
diff --git a/net/core/filter.c b/net/core/filter.c
index 5669248aff25..d7b768fe9de7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -18,6 +18,7 @@ 
  */
 
 #include <linux/atomic.h>
+#include <linux/bpf_verifier.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/mm.h>
@@ -55,6 +56,7 @@ 
 #include <net/sock_reuseport.h>
 #include <net/busy_poll.h>
 #include <net/tcp.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/xfrm.h>
 #include <net/udp.h>
 #include <linux/bpf_trace.h>
@@ -8710,6 +8712,21 @@  static bool tc_cls_act_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
+static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
+					const struct btf *btf,
+					const struct btf_type *t, int off,
+					int size, enum bpf_access_type atype,
+					u32 *next_btf_id,
+					enum bpf_type_flag *flag)
+{
+	if (atype == BPF_READ)
+		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+					 flag);
+
+	return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype,
+					      next_btf_id, flag);
+}
+
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -8769,6 +8786,21 @@  void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog,
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+				 const struct btf *btf,
+				 const struct btf_type *t, int off,
+				 int size, enum bpf_access_type atype,
+				 u32 *next_btf_id,
+				 enum bpf_type_flag *flag)
+{
+	if (atype == BPF_READ)
+		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+					 flag);
+
+	return nf_conntrack_btf_struct_access(log, btf, t, off, size, atype,
+					      next_btf_id, flag);
+}
+
 static bool sock_addr_is_valid_access(int off, int size,
 				      enum bpf_access_type type,
 				      const struct bpf_prog *prog,
@@ -10663,6 +10695,7 @@  const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
 	.gen_ld_abs		= bpf_gen_ld_abs,
+	.btf_struct_access	= tc_cls_act_btf_struct_access,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10674,6 +10707,7 @@  const struct bpf_verifier_ops xdp_verifier_ops = {
 	.is_valid_access	= xdp_is_valid_access,
 	.convert_ctx_access	= xdp_convert_ctx_access,
 	.gen_prologue		= bpf_noop_prologue,
+	.btf_struct_access	= xdp_btf_struct_access,
 };
 
 const struct bpf_prog_ops xdp_prog_ops = {
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 1cd87b28c9b0..8010cc542d17 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -6,6 +6,7 @@ 
  * are exposed through to BPF programs is explicitly unstable.
  */
 
+#include <linux/bpf_verifier.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/types.h>
@@ -15,6 +16,8 @@ 
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netfilter/nf_conntrack_core.h>
 
+static const struct btf_type *nf_conn_type;
+
 /* bpf_ct_opts - Options for CT lookup helpers
  *
  * Members:
@@ -184,6 +187,53 @@  static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
 	return ct;
 }
 
+/* Check writes into `struct nf_conn` */
+int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+				   const struct btf *btf,
+				   const struct btf_type *t, int off,
+				   int size, enum bpf_access_type atype,
+				   u32 *next_btf_id,
+				   enum bpf_type_flag *flag)
+{
+	const struct btf_type *nct = READ_ONCE(nf_conn_type);
+	s32 type_id;
+	size_t end;
+
+	if (!nct) {
+		type_id = btf_find_by_name_kind(btf, "nf_conn", BTF_KIND_STRUCT);
+		if (type_id < 0)
+			return -EINVAL;
+
+		nct = btf_type_by_id(btf, type_id);
+		WRITE_ONCE(nf_conn_type, nct);
+	}
+
+	if (t != nct) {
+		bpf_log(log, "only read is supported\n");
+		return -EACCES;
+	}
+
+	switch (off) {
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	case offsetof(struct nf_conn, mark):
+		end = offsetofend(struct nf_conn, mark);
+		break;
+#endif
+	default:
+		bpf_log(log, "no write support to nf_conn at off %d\n", off);
+		return -EACCES;
+	}
+
+	if (off + size > end) {
+		bpf_log(log,
+			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
+			off, size, end);
+		return -EACCES;
+	}
+
+	return NOT_INIT;
+}
+
 __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in nf_conntrack BTF");