diff mbox series

[1/2,bpf-next] bpf: expose net_device from xdp for metadata

Message ID 20221109215242.1279993-2-john.fastabend@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Expose netdev in XDP progs with BTF_ID | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1692 this patch: 1692
netdev/cc_maintainers warning 9 maintainers not CCed: kpsingh@kernel.org song@kernel.org pabeni@redhat.com haoluo@google.com yhs@fb.com edumazet@google.com jolsa@kernel.org martin.lau@linux.dev andrii@kernel.org
netdev/build_clang success Errors and warnings before: 169 this patch: 169
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1684 this patch: 1684
netdev/checkpatch fail ERROR: code indent should use tabs where possible WARNING: 'convertion' may be misspelled - perhaps 'conversion'? WARNING: line length of 81 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc

Commit Message

John Fastabend Nov. 9, 2022, 9:52 p.m. UTC
Allow xdp progs to read the net_device structure. Its useful to extract
info from the dev itself. Currently, our tracing tooling uses kprobes
to capture statistics and information about running net devices. We use
kprobes instead of other hooks tc/xdp because we need to collect
information about the interface not exposed through the xdp_md structures.
This has some down sides that we want to avoid by moving these into the
XDP hook itself. First, placing the kprobes in a generic function in
the kernel is after XDP so we miss redirects and such done by the
XDP networking program. And its needless overhead because we are
already paying the cost for calling the XDP program, calling yet
another prog is a waste. Better to do everything in one hook from
performance side.

Of course we could one-off each one of these fields, but that would
explode the xdp_md struct and then require writing convert_ctx_access
writers for each field. By using BTF we avoid writing field specific
convertion logic, BTF just knows how to read the fields, we don't
have to add many fields to xdp_md, and I don't have to get every
field we will use in the future correct.

For reference current examples in our code base use the ifindex,
ifname, qdisc stats, net_ns fields, among others. With this
patch we can now do the following,

        dev = ctx->rx_dev;
        net = dev->nd_net.net;

	uid.ifindex = dev->ifindex;
	memcpy(uid.ifname, dev->ifname, NAME);
        if (net)
		uid.inum = net->ns.inum;

to report the name, index and ns.inum which identifies an
interface in our system.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/uapi/linux/bpf.h       |  1 +
 net/core/filter.c              | 19 +++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 21 insertions(+)

Comments

Yonghong Song Nov. 10, 2022, 1:37 a.m. UTC | #1
On 11/9/22 1:52 PM, John Fastabend wrote:
> Allow xdp progs to read the net_device structure. Its useful to extract
> info from the dev itself. Currently, our tracing tooling uses kprobes
> to capture statistics and information about running net devices. We use
> kprobes instead of other hooks tc/xdp because we need to collect
> information about the interface not exposed through the xdp_md structures.
> This has some down sides that we want to avoid by moving these into the
> XDP hook itself. First, placing the kprobes in a generic function in
> the kernel is after XDP so we miss redirects and such done by the
> XDP networking program. And its needless overhead because we are
> already paying the cost for calling the XDP program, calling yet
> another prog is a waste. Better to do everything in one hook from
> performance side.
> 
> Of course we could one-off each one of these fields, but that would
> explode the xdp_md struct and then require writing convert_ctx_access
> writers for each field. By using BTF we avoid writing field specific
> convertion logic, BTF just knows how to read the fields, we don't
> have to add many fields to xdp_md, and I don't have to get every
> field we will use in the future correct.
> 
> For reference current examples in our code base use the ifindex,
> ifname, qdisc stats, net_ns fields, among others. With this
> patch we can now do the following,
> 
>          dev = ctx->rx_dev;
>          net = dev->nd_net.net;
> 
> 	uid.ifindex = dev->ifindex;
> 	memcpy(uid.ifname, dev->ifname, NAME);
>          if (net)
> 		uid.inum = net->ns.inum;
> 
> to report the name, index and ns.inum which identifies an
> interface in our system.

In
https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
Namhyung Kim wanted to access new perf data with a helper.
I proposed a helper bpf_get_kern_ctx() which will get
the kernel ctx struct from which the actual perf data
can be retrieved. The interface looks like
	void *bpf_get_kern_ctx(void *)
the input parameter needs to be a PTR_TO_CTX and
the verifer is able to return the corresponding kernel
ctx struct based on program type.

The following is really hacked demonstration with
some of change coming from my bpf_rcu_read_lock()
patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/

I modified your test to utilize the
bpf_get_kern_ctx() helper in your test_xdp_md.c.

With this single helper, we can cover the above perf
data use case and your use case and maybe others
to avoid new UAPI changes.


diff --git a/include/linux/bpf.h b/include/linux/bpf.h 
 

index 798aec816970..da5e3f79f8a4 100644 
 

--- a/include/linux/bpf.h 
 

+++ b/include/linux/bpf.h 
 

@@ -2113,6 +2113,7 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog 
*prog); 

  const struct btf_func_model * 
 

  bpf_jit_find_kfunc_model(const struct bpf_prog *prog, 
 

                          const struct bpf_insn *insn); 
 

+void *bpf_get_kern_ctx(void *); 
 

  struct bpf_core_ctx { 
 

         struct bpf_verifier_log *log; 
 

         const struct btf *btf; 
 

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c 
 

index 5579ff3a5b54..cd2832e73de3 100644 
 

--- a/kernel/bpf/btf.c 
 

+++ b/kernel/bpf/btf.c 
 

@@ -199,6 +199,7 @@ DEFINE_IDR(btf_idr); 
 

  DEFINE_SPINLOCK(btf_idr_lock); 
 


  enum btf_kfunc_hook {
+       BTF_KFUNC_HOOK_GENERIC,
         BTF_KFUNC_HOOK_XDP,
         BTF_KFUNC_HOOK_TC,
         BTF_KFUNC_HOOK_STRUCT_OPS,
@@ -6428,6 +6429,10 @@ static int btf_check_func_arg_match(struct 
bpf_verifier_env *env,
                         return -EINVAL;
                 }

+               /* HACK: pointer to void, skip the rest of processing */
+               if (t->type == 0)
+                       continue;
+
                 ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
                 ref_tname = btf_name_by_offset(btf, ref_t->name_off);

@@ -7499,6 +7504,8 @@ static u32 *__btf_kfunc_id_set_contains(const 
struct btf *btf,
  static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
  {
         switch (prog_type) {
+       case BPF_PROG_TYPE_UNSPEC:
+               return BTF_KFUNC_HOOK_GENERIC;
         case BPF_PROG_TYPE_XDP:
                 return BTF_KFUNC_HOOK_XDP;
         case BPF_PROG_TYPE_SCHED_CLS:
@@ -7527,6 +7534,11 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf,
                                u32 kfunc_btf_id)
  {
         enum btf_kfunc_hook hook;
+       u32 *kfunc_flags;
+
+       kfunc_flags = __btf_kfunc_id_set_contains(btf, 
BTF_KFUNC_HOOK_GENERIC, kfunc_btf_id);
+       if (kfunc_flags)
+               return kfunc_flags;

         hook = bpf_prog_type_to_kfunc_hook(prog_type);
         return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 283f55bbeb70..b18e3464acc3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1717,9 +1717,25 @@ static const struct btf_kfunc_id_set 
tracing_kfunc_set = {
         .set   = &tracing_btf_ids,
  };

+void *bpf_get_kern_ctx(void *ctx) {
+       return ctx;
+}
+
+BTF_SET8_START(generic_btf_ids)
+BTF_ID_FLAGS(func, bpf_get_kern_ctx)
+BTF_SET8_END(generic_btf_ids)
+
+static const struct btf_kfunc_id_set generic_kfunc_set = {
+       .owner = THIS_MODULE,
+       .set   = &generic_btf_ids,
+};
+
  static int __init kfunc_init(void)
  {
-       return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 
&tracing_kfunc_set);
+       int ret;
+
+       ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, 
&tracing_kfunc_set);
+       return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, 
&generic_kfunc_set);
  }

  late_initcall(kfunc_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d3b75aa0c54d..d8303abdb377 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7784,6 +7784,16 @@ static void mark_btf_func_reg_size(struct 
bpf_verifier_env *env, u32 regno,
         }
  }

+BTF_ID_LIST_SINGLE(bpf_get_kern_ctx_id, func, bpf_get_kern_ctx)
+BTF_ID_LIST_SINGLE(xdp_buff_btf_ids, struct, xdp_buff)
+
+static int get_kctx_btf_id(enum bpf_prog_type prog_type) {
+       if (prog_type == BPF_PROG_TYPE_XDP)
+               return *xdp_buff_btf_ids;
+       /* other program types */
+       return -EINVAL;
+}
+
  static int check_kfunc_call(struct bpf_verifier_env *env, struct 
bpf_insn *insn,
                             int *insn_idx_p)
  {
@@ -7853,7 +7863,20 @@ static int check_kfunc_call(struct 
bpf_verifier_env *env, struct bpf_insn *insn,
                 return -EINVAL;
         }

-       if (btf_type_is_scalar(t)) {
+       if (func_id == *bpf_get_kern_ctx_id) {
+               /* HACK: bpf_get_kern_ctx() kfunc */
+               int btf_id = get_kctx_btf_id(resolve_prog_type(env->prog));
+
+               if (btf_id > 0) {
+                       mark_reg_known_zero(env, regs, BPF_REG_0);
+                       regs[BPF_REG_0].btf = desc_btf;
+                       regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+                       regs[BPF_REG_0].btf_id = btf_id;
+               } else {
+                       verbose(env, "Cannot get kctx\n");
+                       return -EINVAL;
+               }
+       } else if (btf_type_is_scalar(t)) {
                 mark_reg_unknown(env, regs, BPF_REG_0);
                 mark_btf_func_reg_size(env, BPF_REG_0, t->size);
         } else if (btf_type_is_ptr(t)) {
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_md.c 
b/tools/testing/selftests/bpf/prog_tests/xdp_md.c
new file mode 100644
index 000000000000..facf3f3ab86f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_md.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "test_xdp_md.skel.h"
+
+void test_xdp_md(void)
+{
+       struct test_xdp_md *skel;
+       int err, prog_fd;
+       char buf[128];
+
+       LIBBPF_OPTS(bpf_test_run_opts, topts,
+               .data_in = &pkt_v4,
+               .data_size_in = sizeof(pkt_v4),
+               .data_out = buf,
+               .data_size_out = sizeof(buf),
+               .repeat = 1,
+       );
+
+       skel = test_xdp_md__open_and_load();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       prog_fd = bpf_program__fd(skel->progs.md_xdp);
+       err = bpf_prog_test_run_opts(prog_fd, &topts);
+       ASSERT_OK(err, "test_run");
+       ASSERT_EQ(topts.retval, XDP_PASS, "xdp_md test_run retval");
+
+       ASSERT_EQ(skel->bss->ifindex, 1, "xdp_md ifindex");
+       ASSERT_EQ(skel->bss->ifindex, skel->bss->ingress_ifindex, 
"xdp_md ingress_ifindex");
+       ASSERT_STREQ(skel->bss->name, "lo", "xdp_md name");
+       ASSERT_NEQ(skel->bss->inum, 0, "xdp_md inum");
+
+       test_xdp_md__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_md.c 
b/tools/testing/selftests/bpf/progs/test_xdp_md.c
new file mode 100644
index 000000000000..c6a7686ec9c6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_md.c
@@ -0,0 +1,28 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <string.h>
+
+#define        IFNAMSIZ 16
+
+int ifindex, ingress_ifindex;
+char name[IFNAMSIZ];
+unsigned int inum;
+
+extern void *bpf_get_kern_ctx(void *) __ksym;
+
+SEC("xdp")
+int md_xdp(struct xdp_md *ctx)
+{
+       struct xdp_buff *kctx = bpf_get_kern_ctx(ctx);
+       struct net_device *dev;
+
+       dev = kctx->rxq->dev;
+
+       inum = dev->nd_net.net->ns.inum;
+       memcpy(name, dev->name, IFNAMSIZ);
+       ingress_ifindex = dev->ifindex;
+       return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";

> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>   include/uapi/linux/bpf.h       |  1 +
>   net/core/filter.c              | 19 +++++++++++++++++++
>   tools/include/uapi/linux/bpf.h |  1 +
>   3 files changed, 21 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 94659f6b3395..50403eb3b6cf 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6123,6 +6123,7 @@ struct xdp_md {
>   	__u32 rx_queue_index;  /* rxq->queue_index  */
>   
>   	__u32 egress_ifindex;  /* txq->dev->ifindex */
> +	__bpf_md_ptr(struct net_device *, rx_dev); /* rxq->dev */
>   };
>   
>   /* DEVMAP map-value layout
> diff --git a/net/core/filter.c b/net/core/filter.c
> index bb0136e7a8e4..d445ffbea8f1 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -8686,6 +8686,8 @@ static bool __is_valid_xdp_access(int off, int size)
>   	return true;
>   }
>   
> +BTF_ID_LIST_SINGLE(btf_xdp_get_netdev_id, struct, net_device)
> +
>   static bool xdp_is_valid_access(int off, int size,
>   				enum bpf_access_type type,
>   				const struct bpf_prog *prog,
> @@ -8718,6 +8720,15 @@ static bool xdp_is_valid_access(int off, int size,
>   	case offsetof(struct xdp_md, data_end):
>   		info->reg_type = PTR_TO_PACKET_END;
>   		break;
> +	case offsetof(struct xdp_md, rx_dev):
> +		info->reg_type = PTR_TO_BTF_ID;
> +		info->btf_id = btf_xdp_get_netdev_id[0];
> +		info->btf = bpf_get_btf_vmlinux();
> +	        if (IS_ERR_OR_NULL(info->btf))
> +			return false;
> +		if (size != sizeof(u64))
> +			return false;
> +		return true;
>   	}
>   
>   	return __is_valid_xdp_access(off, size);
> @@ -9808,6 +9819,14 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
>   		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>   				      offsetof(struct net_device, ifindex));
>   		break;
> +	case offsetof(struct xdp_md, rx_dev):
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
> +				      si->dst_reg, si->src_reg,
> +				      offsetof(struct xdp_buff, rxq));
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
> +				      si->dst_reg, si->dst_reg,
> +				      offsetof(struct xdp_rxq_info, dev));
> +		break;
>   	}
>   
>   	return insn - insn_buf;
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 94659f6b3395..50403eb3b6cf 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -6123,6 +6123,7 @@ struct xdp_md {
>   	__u32 rx_queue_index;  /* rxq->queue_index  */
>   
>   	__u32 egress_ifindex;  /* txq->dev->ifindex */
> +	__bpf_md_ptr(struct net_device *, rx_dev); /* rxq->dev */
>   };
>   
>   /* DEVMAP map-value layout
John Fastabend Nov. 10, 2022, 2:17 a.m. UTC | #2
Yonghong Song wrote:
> 
> 
> On 11/9/22 1:52 PM, John Fastabend wrote:
> > Allow xdp progs to read the net_device structure. Its useful to extract
> > info from the dev itself. Currently, our tracing tooling uses kprobes
> > to capture statistics and information about running net devices. We use
> > kprobes instead of other hooks tc/xdp because we need to collect
> > information about the interface not exposed through the xdp_md structures.
> > This has some down sides that we want to avoid by moving these into the
> > XDP hook itself. First, placing the kprobes in a generic function in
> > the kernel is after XDP so we miss redirects and such done by the
> > XDP networking program. And its needless overhead because we are
> > already paying the cost for calling the XDP program, calling yet
> > another prog is a waste. Better to do everything in one hook from
> > performance side.
> > 
> > Of course we could one-off each one of these fields, but that would
> > explode the xdp_md struct and then require writing convert_ctx_access
> > writers for each field. By using BTF we avoid writing field specific
> > convertion logic, BTF just knows how to read the fields, we don't
> > have to add many fields to xdp_md, and I don't have to get every
> > field we will use in the future correct.
> > 
> > For reference current examples in our code base use the ifindex,
> > ifname, qdisc stats, net_ns fields, among others. With this
> > patch we can now do the following,
> > 
> >          dev = ctx->rx_dev;
> >          net = dev->nd_net.net;
> > 
> > 	uid.ifindex = dev->ifindex;
> > 	memcpy(uid.ifname, dev->ifname, NAME);
> >          if (net)
> > 		uid.inum = net->ns.inum;
> > 
> > to report the name, index and ns.inum which identifies an
> > interface in our system.
> 
> In
> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
> Namhyung Kim wanted to access new perf data with a helper.
> I proposed a helper bpf_get_kern_ctx() which will get
> the kernel ctx struct from which the actual perf data
> can be retrieved. The interface looks like
> 	void *bpf_get_kern_ctx(void *)
> the input parameter needs to be a PTR_TO_CTX and
> the verifer is able to return the corresponding kernel
> ctx struct based on program type.
> 
> The following is really hacked demonstration with
> some of change coming from my bpf_rcu_read_lock()
> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
> 
> I modified your test to utilize the
> bpf_get_kern_ctx() helper in your test_xdp_md.c.
> 
> With this single helper, we can cover the above perf
> data use case and your use case and maybe others
> to avoid new UAPI changes.

hmm I like the idea of just accessing the xdp_buff directly
instead of adding more fields. I'm less convinced of the
kfunc approach. What about a terminating field *self in the
xdp_md. Then we can use existing convert_ctx_access to make
it BPF inlined and no verifier changes needed.

Something like this quickly typed up and not compiled, but
I think shows what I'm thinking.

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 94659f6b3395..10ebd90d6677 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6123,6 +6123,10 @@ struct xdp_md {
        __u32 rx_queue_index;  /* rxq->queue_index  */
 
        __u32 egress_ifindex;  /* txq->dev->ifindex */
+       /* Last xdp_md entry, for new types add directly to xdp_buff and use
+        * BTF access. Reading this gives BTF access to xdp_buff.
+        */
+       __bpf_md_ptr(struct xdp_buff *, self);
 };
 
 /* DEVMAP map-value layout
diff --git a/net/core/filter.c b/net/core/filter.c
index bb0136e7a8e4..547e9576a918 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9808,6 +9808,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct net_device, ifindex));
                break;
+       case offsetof(struct xdp_md, self):
+               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, self),
+                                     si->dst_reg, si->src_reg,
+                                     offsetof(struct xdp_buff, 0));
+               break;
        }
 
        return insn - insn_buf;

Actually even that single insn conversion is a bit unnessary because
should be enough to just change the type to the correct BTF_ID in the
verifier and omit any instructions. But it wwould be a bit confusing
for C side. Might be a good use for passing 'cast' info through to
the verifier as an annotation so it could just do the BTF_ID cast for
us without any insns.
Toke Høiland-Jørgensen Nov. 10, 2022, 12:45 p.m. UTC | #3
John Fastabend <john.fastabend@gmail.com> writes:

> Yonghong Song wrote:
>> 
>> 
>> On 11/9/22 1:52 PM, John Fastabend wrote:
>> > Allow xdp progs to read the net_device structure. Its useful to extract
>> > info from the dev itself. Currently, our tracing tooling uses kprobes
>> > to capture statistics and information about running net devices. We use
>> > kprobes instead of other hooks tc/xdp because we need to collect
>> > information about the interface not exposed through the xdp_md structures.
>> > This has some down sides that we want to avoid by moving these into the
>> > XDP hook itself. First, placing the kprobes in a generic function in
>> > the kernel is after XDP so we miss redirects and such done by the
>> > XDP networking program. And its needless overhead because we are
>> > already paying the cost for calling the XDP program, calling yet
>> > another prog is a waste. Better to do everything in one hook from
>> > performance side.
>> > 
>> > Of course we could one-off each one of these fields, but that would
>> > explode the xdp_md struct and then require writing convert_ctx_access
>> > writers for each field. By using BTF we avoid writing field specific
>> > convertion logic, BTF just knows how to read the fields, we don't
>> > have to add many fields to xdp_md, and I don't have to get every
>> > field we will use in the future correct.
>> > 
>> > For reference current examples in our code base use the ifindex,
>> > ifname, qdisc stats, net_ns fields, among others. With this
>> > patch we can now do the following,
>> > 
>> >          dev = ctx->rx_dev;
>> >          net = dev->nd_net.net;
>> > 
>> > 	uid.ifindex = dev->ifindex;
>> > 	memcpy(uid.ifname, dev->ifname, NAME);
>> >          if (net)
>> > 		uid.inum = net->ns.inum;
>> > 
>> > to report the name, index and ns.inum which identifies an
>> > interface in our system.
>> 
>> In
>> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
>> Namhyung Kim wanted to access new perf data with a helper.
>> I proposed a helper bpf_get_kern_ctx() which will get
>> the kernel ctx struct from which the actual perf data
>> can be retrieved. The interface looks like
>> 	void *bpf_get_kern_ctx(void *)
>> the input parameter needs to be a PTR_TO_CTX and
>> the verifer is able to return the corresponding kernel
>> ctx struct based on program type.
>> 
>> The following is really hacked demonstration with
>> some of change coming from my bpf_rcu_read_lock()
>> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
>> 
>> I modified your test to utilize the
>> bpf_get_kern_ctx() helper in your test_xdp_md.c.
>> 
>> With this single helper, we can cover the above perf
>> data use case and your use case and maybe others
>> to avoid new UAPI changes.
>
> hmm I like the idea of just accessing the xdp_buff directly
> instead of adding more fields. I'm less convinced of the
> kfunc approach. What about a terminating field *self in the
> xdp_md. Then we can use existing convert_ctx_access to make
> it BPF inlined and no verifier changes needed.
>
> Something like this quickly typed up and not compiled, but
> I think shows what I'm thinking.
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 94659f6b3395..10ebd90d6677 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6123,6 +6123,10 @@ struct xdp_md {
>         __u32 rx_queue_index;  /* rxq->queue_index  */
>  
>         __u32 egress_ifindex;  /* txq->dev->ifindex */
> +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
> +        * BTF access. Reading this gives BTF access to xdp_buff.
> +        */
> +       __bpf_md_ptr(struct xdp_buff *, self);
>  };

xdp_md is UAPI; I really don't think it's a good idea to add "unstable"
BTF fields like this to it, that's just going to confuse people. Tying
this to a kfunc for conversion is more consistent with the whole "kfunc
and BTF are its own thing" expectation.

The kfunc doesn't actually have to execute any instructions either, it
can just be collapsed into a type conversion to BTF inside the verifier,
no?

-Toke
Yonghong Song Nov. 10, 2022, 4:46 p.m. UTC | #4
On 11/9/22 6:17 PM, John Fastabend wrote:
> Yonghong Song wrote:
>>
>>
>> On 11/9/22 1:52 PM, John Fastabend wrote:
>>> Allow xdp progs to read the net_device structure. Its useful to extract
>>> info from the dev itself. Currently, our tracing tooling uses kprobes
>>> to capture statistics and information about running net devices. We use
>>> kprobes instead of other hooks tc/xdp because we need to collect
>>> information about the interface not exposed through the xdp_md structures.
>>> This has some down sides that we want to avoid by moving these into the
>>> XDP hook itself. First, placing the kprobes in a generic function in
>>> the kernel is after XDP so we miss redirects and such done by the
>>> XDP networking program. And its needless overhead because we are
>>> already paying the cost for calling the XDP program, calling yet
>>> another prog is a waste. Better to do everything in one hook from
>>> performance side.
>>>
>>> Of course we could one-off each one of these fields, but that would
>>> explode the xdp_md struct and then require writing convert_ctx_access
>>> writers for each field. By using BTF we avoid writing field specific
>>> convertion logic, BTF just knows how to read the fields, we don't
>>> have to add many fields to xdp_md, and I don't have to get every
>>> field we will use in the future correct.
>>>
>>> For reference current examples in our code base use the ifindex,
>>> ifname, qdisc stats, net_ns fields, among others. With this
>>> patch we can now do the following,
>>>
>>>           dev = ctx->rx_dev;
>>>           net = dev->nd_net.net;
>>>
>>> 	uid.ifindex = dev->ifindex;
>>> 	memcpy(uid.ifname, dev->ifname, NAME);
>>>           if (net)
>>> 		uid.inum = net->ns.inum;
>>>
>>> to report the name, index and ns.inum which identifies an
>>> interface in our system.
>>
>> In
>> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
>> Namhyung Kim wanted to access new perf data with a helper.
>> I proposed a helper bpf_get_kern_ctx() which will get
>> the kernel ctx struct from which the actual perf data
>> can be retrieved. The interface looks like
>> 	void *bpf_get_kern_ctx(void *)
>> the input parameter needs to be a PTR_TO_CTX and
>> the verifer is able to return the corresponding kernel
>> ctx struct based on program type.
>>
>> The following is really hacked demonstration with
>> some of change coming from my bpf_rcu_read_lock()
>> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
>>
>> I modified your test to utilize the
>> bpf_get_kern_ctx() helper in your test_xdp_md.c.
>>
>> With this single helper, we can cover the above perf
>> data use case and your use case and maybe others
>> to avoid new UAPI changes.
> 
> hmm I like the idea of just accessing the xdp_buff directly
> instead of adding more fields. I'm less convinced of the
> kfunc approach. What about a terminating field *self in the
> xdp_md. Then we can use existing convert_ctx_access to make
> it BPF inlined and no verifier changes needed.
> 
> Something like this quickly typed up and not compiled, but
> I think shows what I'm thinking.
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 94659f6b3395..10ebd90d6677 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6123,6 +6123,10 @@ struct xdp_md {
>          __u32 rx_queue_index;  /* rxq->queue_index  */
>   
>          __u32 egress_ifindex;  /* txq->dev->ifindex */
> +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
> +        * BTF access. Reading this gives BTF access to xdp_buff.
> +        */
> +       __bpf_md_ptr(struct xdp_buff *, self);
>   };

This would be the first instance to have a kernel internal struct
in a uapi struct. Not sure whether this is a good idea or not.

>   
>   /* DEVMAP map-value layout
> diff --git a/net/core/filter.c b/net/core/filter.c
> index bb0136e7a8e4..547e9576a918 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -9808,6 +9808,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
>                  *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>                                        offsetof(struct net_device, ifindex));
>                  break;
> +       case offsetof(struct xdp_md, self):
> +               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, self),
> +                                     si->dst_reg, si->src_reg,
> +                                     offsetof(struct xdp_buff, 0));
> +               break;
>          }
>   
>          return insn - insn_buf;
> 
> Actually even that single insn conversion is a bit unnessary because
> should be enough to just change the type to the correct BTF_ID in the
> verifier and omit any instructions. But it wwould be a bit confusing
> for C side. Might be a good use for passing 'cast' info through to
> the verifier as an annotation so it could just do the BTF_ID cast for
> us without any insns.

We cannot change the context type to BTF_ID style which will be a
uapi violation.

The helper I proposed can be rewritten by verifier as
	r0 = r1
so we should not have overhead for this.
It cover all program types with known uapi ctx -> kern ctx
conversions. So there is no need to change existing uapi structs.
Also I except that most people probably won't use this kfunc.
The existing uapi fields might already serve most needs.

Internally we have another use case to access some 'struct sock' fields
but the uapi struct only has struct bpf_sock. Currently it is advised
to use bpf_probe_read_kernel(...) to get the needed information.
The proposed helper should help that too without uapi change.
Yonghong Song Nov. 10, 2022, 4:53 p.m. UTC | #5
On 11/10/22 4:45 AM, Toke Høiland-Jørgensen wrote:
> John Fastabend <john.fastabend@gmail.com> writes:
> 
>> Yonghong Song wrote:
>>>
>>>
>>> On 11/9/22 1:52 PM, John Fastabend wrote:
>>>> Allow xdp progs to read the net_device structure. Its useful to extract
>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
>>>> to capture statistics and information about running net devices. We use
>>>> kprobes instead of other hooks tc/xdp because we need to collect
>>>> information about the interface not exposed through the xdp_md structures.
>>>> This has some down sides that we want to avoid by moving these into the
>>>> XDP hook itself. First, placing the kprobes in a generic function in
>>>> the kernel is after XDP so we miss redirects and such done by the
>>>> XDP networking program. And its needless overhead because we are
>>>> already paying the cost for calling the XDP program, calling yet
>>>> another prog is a waste. Better to do everything in one hook from
>>>> performance side.
>>>>
>>>> Of course we could one-off each one of these fields, but that would
>>>> explode the xdp_md struct and then require writing convert_ctx_access
>>>> writers for each field. By using BTF we avoid writing field specific
>>>> convertion logic, BTF just knows how to read the fields, we don't
>>>> have to add many fields to xdp_md, and I don't have to get every
>>>> field we will use in the future correct.
>>>>
>>>> For reference current examples in our code base use the ifindex,
>>>> ifname, qdisc stats, net_ns fields, among others. With this
>>>> patch we can now do the following,
>>>>
>>>>           dev = ctx->rx_dev;
>>>>           net = dev->nd_net.net;
>>>>
>>>> 	uid.ifindex = dev->ifindex;
>>>> 	memcpy(uid.ifname, dev->ifname, NAME);
>>>>           if (net)
>>>> 		uid.inum = net->ns.inum;
>>>>
>>>> to report the name, index and ns.inum which identifies an
>>>> interface in our system.
>>>
>>> In
>>> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
>>> Namhyung Kim wanted to access new perf data with a helper.
>>> I proposed a helper bpf_get_kern_ctx() which will get
>>> the kernel ctx struct from which the actual perf data
>>> can be retrieved. The interface looks like
>>> 	void *bpf_get_kern_ctx(void *)
>>> the input parameter needs to be a PTR_TO_CTX and
>>> the verifer is able to return the corresponding kernel
>>> ctx struct based on program type.
>>>
>>> The following is really hacked demonstration with
>>> some of change coming from my bpf_rcu_read_lock()
>>> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
>>>
>>> I modified your test to utilize the
>>> bpf_get_kern_ctx() helper in your test_xdp_md.c.
>>>
>>> With this single helper, we can cover the above perf
>>> data use case and your use case and maybe others
>>> to avoid new UAPI changes.
>>
>> hmm I like the idea of just accessing the xdp_buff directly
>> instead of adding more fields. I'm less convinced of the
>> kfunc approach. What about a terminating field *self in the
>> xdp_md. Then we can use existing convert_ctx_access to make
>> it BPF inlined and no verifier changes needed.
>>
>> Something like this quickly typed up and not compiled, but
>> I think shows what I'm thinking.
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 94659f6b3395..10ebd90d6677 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -6123,6 +6123,10 @@ struct xdp_md {
>>          __u32 rx_queue_index;  /* rxq->queue_index  */
>>   
>>          __u32 egress_ifindex;  /* txq->dev->ifindex */
>> +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
>> +        * BTF access. Reading this gives BTF access to xdp_buff.
>> +        */
>> +       __bpf_md_ptr(struct xdp_buff *, self);
>>   };
> 
> xdp_md is UAPI; I really don't think it's a good idea to add "unstable"
> BTF fields like this to it, that's just going to confuse people. Tying
> this to a kfunc for conversion is more consistent with the whole "kfunc
> and BTF are its own thing" expectation.
> 
> The kfunc doesn't actually have to execute any instructions either, it
> can just be collapsed into a type conversion to BTF inside the verifier,
> no?

The kfunc execution can be replaced with a register move like
	r0 = r1  /* r1 is the ctx */
	/* r0 is the kctx */

> 
> -Toke
John Fastabend Nov. 10, 2022, 5:02 p.m. UTC | #6
Toke Høiland-Jørgensen wrote:
> John Fastabend <john.fastabend@gmail.com> writes:
> 
> > Yonghong Song wrote:
> >> 
> >> 
> >> On 11/9/22 1:52 PM, John Fastabend wrote:
> >> > Allow xdp progs to read the net_device structure. Its useful to extract
> >> > info from the dev itself. Currently, our tracing tooling uses kprobes
> >> > to capture statistics and information about running net devices. We use
> >> > kprobes instead of other hooks tc/xdp because we need to collect
> >> > information about the interface not exposed through the xdp_md structures.
> >> > This has some down sides that we want to avoid by moving these into the
> >> > XDP hook itself. First, placing the kprobes in a generic function in
> >> > the kernel is after XDP so we miss redirects and such done by the
> >> > XDP networking program. And its needless overhead because we are
> >> > already paying the cost for calling the XDP program, calling yet
> >> > another prog is a waste. Better to do everything in one hook from
> >> > performance side.
> >> > 
> >> > Of course we could one-off each one of these fields, but that would
> >> > explode the xdp_md struct and then require writing convert_ctx_access
> >> > writers for each field. By using BTF we avoid writing field specific
> >> > convertion logic, BTF just knows how to read the fields, we don't
> >> > have to add many fields to xdp_md, and I don't have to get every
> >> > field we will use in the future correct.
> >> > 
> >> > For reference current examples in our code base use the ifindex,
> >> > ifname, qdisc stats, net_ns fields, among others. With this
> >> > patch we can now do the following,
> >> > 
> >> >          dev = ctx->rx_dev;
> >> >          net = dev->nd_net.net;
> >> > 
> >> > 	uid.ifindex = dev->ifindex;
> >> > 	memcpy(uid.ifname, dev->ifname, NAME);
> >> >          if (net)
> >> > 		uid.inum = net->ns.inum;
> >> > 
> >> > to report the name, index and ns.inum which identifies an
> >> > interface in our system.
> >> 
> >> In
> >> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
> >> Namhyung Kim wanted to access new perf data with a helper.
> >> I proposed a helper bpf_get_kern_ctx() which will get
> >> the kernel ctx struct from which the actual perf data
> >> can be retrieved. The interface looks like
> >> 	void *bpf_get_kern_ctx(void *)
> >> the input parameter needs to be a PTR_TO_CTX and
> >> the verifer is able to return the corresponding kernel
> >> ctx struct based on program type.
> >> 
> >> The following is really hacked demonstration with
> >> some of change coming from my bpf_rcu_read_lock()
> >> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
> >> 
> >> I modified your test to utilize the
> >> bpf_get_kern_ctx() helper in your test_xdp_md.c.
> >> 
> >> With this single helper, we can cover the above perf
> >> data use case and your use case and maybe others
> >> to avoid new UAPI changes.
> >
> > hmm I like the idea of just accessing the xdp_buff directly
> > instead of adding more fields. I'm less convinced of the
> > kfunc approach. What about a terminating field *self in the
> > xdp_md. Then we can use existing convert_ctx_access to make
> > it BPF inlined and no verifier changes needed.
> >
> > Something like this quickly typed up and not compiled, but
> > I think shows what I'm thinking.
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 94659f6b3395..10ebd90d6677 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -6123,6 +6123,10 @@ struct xdp_md {
> >         __u32 rx_queue_index;  /* rxq->queue_index  */
> >  
> >         __u32 egress_ifindex;  /* txq->dev->ifindex */
> > +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
> > +        * BTF access. Reading this gives BTF access to xdp_buff.
> > +        */
> > +       __bpf_md_ptr(struct xdp_buff *, self);
> >  };
> 
> xdp_md is UAPI; I really don't think it's a good idea to add "unstable"
> BTF fields like this to it, that's just going to confuse people. Tying
> this to a kfunc for conversion is more consistent with the whole "kfunc
> and BTF are its own thing" expectation.

hmm from my side self here would be stable. Whats behind it is not,
but that seems fine to me.  Doing `ctx->self` feels more natural imo
then doing a call. A bunch more work but could do btf casts maybe
with annotations. I'm not sure its worth it though because only reason
I can think to do this would be for this self reference from ctx.

   struct xdp_buff *xdp = __btf (struct xdp_buff *)ctx;

C++ has 'this' as well but thats confusing from C side. Could have
a common syntax to do 'ctx->this' to get the pointer in BTF
format.

Maybe see what Yonghong thinks.

> 
> The kfunc doesn't actually have to execute any instructions either, it
> can just be collapsed into a type conversion to BTF inside the verifier,
> no?

Agree either implementation can be made that same underneath its just
a style question. I can probably do either but using the ctx keeps
the existing machinery to go through is_valid_access and so on.

Thanks.
John Fastabend Nov. 10, 2022, 10:58 p.m. UTC | #7
Yonghong Song wrote:
> 
> 
> On 11/9/22 6:17 PM, John Fastabend wrote:
> > Yonghong Song wrote:
> >>
> >>
> >> On 11/9/22 1:52 PM, John Fastabend wrote:
> >>> Allow xdp progs to read the net_device structure. Its useful to extract
> >>> info from the dev itself. Currently, our tracing tooling uses kprobes
> >>> to capture statistics and information about running net devices. We use
> >>> kprobes instead of other hooks tc/xdp because we need to collect
> >>> information about the interface not exposed through the xdp_md structures.
> >>> This has some down sides that we want to avoid by moving these into the
> >>> XDP hook itself. First, placing the kprobes in a generic function in
> >>> the kernel is after XDP so we miss redirects and such done by the
> >>> XDP networking program. And its needless overhead because we are
> >>> already paying the cost for calling the XDP program, calling yet
> >>> another prog is a waste. Better to do everything in one hook from
> >>> performance side.
> >>>
> >>> Of course we could one-off each one of these fields, but that would
> >>> explode the xdp_md struct and then require writing convert_ctx_access
> >>> writers for each field. By using BTF we avoid writing field specific
> >>> convertion logic, BTF just knows how to read the fields, we don't
> >>> have to add many fields to xdp_md, and I don't have to get every
> >>> field we will use in the future correct.
> >>>
> >>> For reference current examples in our code base use the ifindex,
> >>> ifname, qdisc stats, net_ns fields, among others. With this
> >>> patch we can now do the following,
> >>>
> >>>           dev = ctx->rx_dev;
> >>>           net = dev->nd_net.net;
> >>>
> >>> 	uid.ifindex = dev->ifindex;
> >>> 	memcpy(uid.ifname, dev->ifname, NAME);
> >>>           if (net)
> >>> 		uid.inum = net->ns.inum;
> >>>
> >>> to report the name, index and ns.inum which identifies an
> >>> interface in our system.
> >>
> >> In
> >> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
> >> Namhyung Kim wanted to access new perf data with a helper.
> >> I proposed a helper bpf_get_kern_ctx() which will get
> >> the kernel ctx struct from which the actual perf data
> >> can be retrieved. The interface looks like
> >> 	void *bpf_get_kern_ctx(void *)
> >> the input parameter needs to be a PTR_TO_CTX and
> >> the verifer is able to return the corresponding kernel
> >> ctx struct based on program type.
> >>
> >> The following is really hacked demonstration with
> >> some of change coming from my bpf_rcu_read_lock()
> >> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
> >>
> >> I modified your test to utilize the
> >> bpf_get_kern_ctx() helper in your test_xdp_md.c.
> >>
> >> With this single helper, we can cover the above perf
> >> data use case and your use case and maybe others
> >> to avoid new UAPI changes.
> > 
> > hmm I like the idea of just accessing the xdp_buff directly
> > instead of adding more fields. I'm less convinced of the
> > kfunc approach. What about a terminating field *self in the
> > xdp_md. Then we can use existing convert_ctx_access to make
> > it BPF inlined and no verifier changes needed.
> > 
> > Something like this quickly typed up and not compiled, but
> > I think shows what I'm thinking.
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 94659f6b3395..10ebd90d6677 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -6123,6 +6123,10 @@ struct xdp_md {
> >          __u32 rx_queue_index;  /* rxq->queue_index  */
> >   
> >          __u32 egress_ifindex;  /* txq->dev->ifindex */
> > +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
> > +        * BTF access. Reading this gives BTF access to xdp_buff.
> > +        */
> > +       __bpf_md_ptr(struct xdp_buff *, self);
> >   };
> 
> This would be the first instance to have a kernel internal struct
> in a uapi struct. Not sure whether this is a good idea or not.

We can use probe_read from some of the socket progs already but
sure.

> 
> >   
> >   /* DEVMAP map-value layout
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index bb0136e7a8e4..547e9576a918 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -9808,6 +9808,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
> >                  *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
> >                                        offsetof(struct net_device, ifindex));
> >                  break;
> > +       case offsetof(struct xdp_md, self):
> > +               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, self),
> > +                                     si->dst_reg, si->src_reg,
> > +                                     offsetof(struct xdp_buff, 0));
> > +               break;
> >          }
> >   
> >          return insn - insn_buf;
> > 
> > Actually even that single insn conversion is a bit unnessary because
> > should be enough to just change the type to the correct BTF_ID in the
> > verifier and omit any instructions. But it wwould be a bit confusing
> > for C side. Might be a good use for passing 'cast' info through to
> > the verifier as an annotation so it could just do the BTF_ID cast for
> > us without any insns.
> 
> We cannot change the context type to BTF_ID style which will be a
> uapi violation.

I don't think it would be uapi violation if user asks for it
by annotating the cast.

> 
> The helper I proposed can be rewritten by verifier as
> 	r0 = r1
> so we should not have overhead for this.

Agree other than reading the bpf asm where its a bit odd.

> It cover all program types with known uapi ctx -> kern ctx
> conversions. So there is no need to change existing uapi structs.
> Also I except that most people probably won't use this kfunc.
> The existing uapi fields might already serve most needs.

Maybe not sure missing some things we need.

> 
> Internally we have another use case to access some 'struct sock' fields
> but the uapi struct only has struct bpf_sock. Currently it is advised
> to use bpf_probe_read_kernel(...) to get the needed information.
> The proposed helper should help that too without uapi change.

Yep.

I'm fine doing it with bpf_get_kern_ctx() did you want me to code it
the rest of the way up and test it?

.John
John Fastabend Nov. 10, 2022, 11:11 p.m. UTC | #8
John Fastabend wrote:
> Yonghong Song wrote:
> > 
> > 
> > On 11/9/22 6:17 PM, John Fastabend wrote:
> > > Yonghong Song wrote:
> > >>
> > >>
> > >> On 11/9/22 1:52 PM, John Fastabend wrote:
> > >>> Allow xdp progs to read the net_device structure. Its useful to extract
> > >>> info from the dev itself. Currently, our tracing tooling uses kprobes
> > >>> to capture statistics and information about running net devices. We use
> > >>> kprobes instead of other hooks tc/xdp because we need to collect
> > >>> information about the interface not exposed through the xdp_md structures.
> > >>> This has some down sides that we want to avoid by moving these into the
> > >>> XDP hook itself. First, placing the kprobes in a generic function in
> > >>> the kernel is after XDP so we miss redirects and such done by the
> > >>> XDP networking program. And its needless overhead because we are
> > >>> already paying the cost for calling the XDP program, calling yet
> > >>> another prog is a waste. Better to do everything in one hook from
> > >>> performance side.
> > >>>
> > >>> Of course we could one-off each one of these fields, but that would
> > >>> explode the xdp_md struct and then require writing convert_ctx_access
> > >>> writers for each field. By using BTF we avoid writing field specific
> > >>> convertion logic, BTF just knows how to read the fields, we don't
> > >>> have to add many fields to xdp_md, and I don't have to get every
> > >>> field we will use in the future correct.
> > >>>
> > >>> For reference current examples in our code base use the ifindex,
> > >>> ifname, qdisc stats, net_ns fields, among others. With this
> > >>> patch we can now do the following,
> > >>>
> > >>>           dev = ctx->rx_dev;
> > >>>           net = dev->nd_net.net;
> > >>>
> > >>> 	uid.ifindex = dev->ifindex;
> > >>> 	memcpy(uid.ifname, dev->ifname, NAME);
> > >>>           if (net)
> > >>> 		uid.inum = net->ns.inum;
> > >>>
> > >>> to report the name, index and ns.inum which identifies an
> > >>> interface in our system.
> > >>
> > >> In
> > >> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
> > >> Namhyung Kim wanted to access new perf data with a helper.
> > >> I proposed a helper bpf_get_kern_ctx() which will get
> > >> the kernel ctx struct from which the actual perf data
> > >> can be retrieved. The interface looks like
> > >> 	void *bpf_get_kern_ctx(void *)
> > >> the input parameter needs to be a PTR_TO_CTX and
> > >> the verifer is able to return the corresponding kernel
> > >> ctx struct based on program type.
> > >>
> > >> The following is really hacked demonstration with
> > >> some of change coming from my bpf_rcu_read_lock()
> > >> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
> > >>
> > >> I modified your test to utilize the
> > >> bpf_get_kern_ctx() helper in your test_xdp_md.c.
> > >>
> > >> With this single helper, we can cover the above perf
> > >> data use case and your use case and maybe others
> > >> to avoid new UAPI changes.
> > > 
> > > hmm I like the idea of just accessing the xdp_buff directly
> > > instead of adding more fields. I'm less convinced of the
> > > kfunc approach. What about a terminating field *self in the
> > > xdp_md. Then we can use existing convert_ctx_access to make
> > > it BPF inlined and no verifier changes needed.
> > > 
> > > Something like this quickly typed up and not compiled, but
> > > I think shows what I'm thinking.
> > > 
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 94659f6b3395..10ebd90d6677 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -6123,6 +6123,10 @@ struct xdp_md {
> > >          __u32 rx_queue_index;  /* rxq->queue_index  */
> > >   
> > >          __u32 egress_ifindex;  /* txq->dev->ifindex */
> > > +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
> > > +        * BTF access. Reading this gives BTF access to xdp_buff.
> > > +        */
> > > +       __bpf_md_ptr(struct xdp_buff *, self);
> > >   };
> > 
> > This would be the first instance to have a kernel internal struct
> > in a uapi struct. Not sure whether this is a good idea or not.
> 
> We can use probe_read from some of the socket progs already but
> sure.
> 
> > 
> > >   
> > >   /* DEVMAP map-value layout
> > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > index bb0136e7a8e4..547e9576a918 100644
> > > --- a/net/core/filter.c
> > > +++ b/net/core/filter.c
> > > @@ -9808,6 +9808,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
> > >                  *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
> > >                                        offsetof(struct net_device, ifindex));
> > >                  break;
> > > +       case offsetof(struct xdp_md, self):
> > > +               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, self),
> > > +                                     si->dst_reg, si->src_reg,
> > > +                                     offsetof(struct xdp_buff, 0));
> > > +               break;
> > >          }
> > >   
> > >          return insn - insn_buf;
> > > 
> > > Actually even that single insn conversion is a bit unnessary because
> > > should be enough to just change the type to the correct BTF_ID in the
> > > verifier and omit any instructions. But it wwould be a bit confusing
> > > for C side. Might be a good use for passing 'cast' info through to
> > > the verifier as an annotation so it could just do the BTF_ID cast for
> > > us without any insns.
> > 
> > We cannot change the context type to BTF_ID style which will be a
> > uapi violation.
> 
> I don't think it would be uapi violation if user asks for it
> by annotating the cast.
> 
> > 
> > The helper I proposed can be rewritten by verifier as
> > 	r0 = r1
> > so we should not have overhead for this.
> 
> Agree other than reading the bpf asm where its a bit odd.
> 
> > It cover all program types with known uapi ctx -> kern ctx
> > conversions. So there is no need to change existing uapi structs.
> > Also I except that most people probably won't use this kfunc.
> > The existing uapi fields might already serve most needs.
> 
> Maybe not sure missing some things we need.
> 
> > 
> > Internally we have another use case to access some 'struct sock' fields
> > but the uapi struct only has struct bpf_sock. Currently it is advised
> > to use bpf_probe_read_kernel(...) to get the needed information.
> > The proposed helper should help that too without uapi change.
> 
> Yep.
> 
> I'm fine doing it with bpf_get_kern_ctx() did you want me to code it
> the rest of the way up and test it?
> 
> .John

Related I think. We also want to get kernel variable net_namespace_list,
this points to the network namespace lists. Based on above should
we do something like,

  void *bpf_get_kern_var(enum var_id);

then,

  net_ns_list = bpf_get_kern_var(__btf_net_namesapce_list);

would get us a ptr to the list? The other thought was to put it in the
xdp_md but from above seems better idea to get it through helper.
kernel test robot Nov. 11, 2022, 1:13 a.m. UTC | #9
Hi John,

I love your patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/John-Fastabend/bpf-expose-net_device-from-xdp-for-metadata/20221110-055550
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20221109215242.1279993-2-john.fastabend%40gmail.com
patch subject: [1/2 bpf-next] bpf: expose net_device from xdp for metadata
config: mips-bcm63xx_defconfig
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 463da45892e2d2a262277b91b96f5f8c05dc25d0)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install mips cross compiling tool for clang build
        # apt-get install binutils-mips-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/410d4baa8908e723a4b0ef7ca54e24462bb0ea16
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review John-Fastabend/bpf-expose-net_device-from-xdp-for-metadata/20221110-055550
        git checkout 410d4baa8908e723a4b0ef7ca54e24462bb0ea16
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash net/core/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> net/core/filter.c:8730:15: error: call to undeclared function 'bpf_get_btf_vmlinux'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   info->btf = bpf_get_btf_vmlinux();
                               ^
>> net/core/filter.c:8730:13: error: incompatible integer to pointer conversion assigning to 'struct btf *' from 'int' [-Wint-conversion]
                   info->btf = bpf_get_btf_vmlinux();
                             ^ ~~~~~~~~~~~~~~~~~~~~~
   2 errors generated.


vim +/bpf_get_btf_vmlinux +8730 net/core/filter.c

  8694	
  8695	static bool xdp_is_valid_access(int off, int size,
  8696					enum bpf_access_type type,
  8697					const struct bpf_prog *prog,
  8698					struct bpf_insn_access_aux *info)
  8699	{
  8700		if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
  8701			switch (off) {
  8702			case offsetof(struct xdp_md, egress_ifindex):
  8703				return false;
  8704			}
  8705		}
  8706	
  8707		if (type == BPF_WRITE) {
  8708			if (bpf_prog_is_dev_bound(prog->aux)) {
  8709				switch (off) {
  8710				case offsetof(struct xdp_md, rx_queue_index):
  8711					return __is_valid_xdp_access(off, size);
  8712				}
  8713			}
  8714			return false;
  8715		}
  8716	
  8717		switch (off) {
  8718		case offsetof(struct xdp_md, data):
  8719			info->reg_type = PTR_TO_PACKET;
  8720			break;
  8721		case offsetof(struct xdp_md, data_meta):
  8722			info->reg_type = PTR_TO_PACKET_META;
  8723			break;
  8724		case offsetof(struct xdp_md, data_end):
  8725			info->reg_type = PTR_TO_PACKET_END;
  8726			break;
  8727		case offsetof(struct xdp_md, rx_dev):
  8728			info->reg_type = PTR_TO_BTF_ID;
  8729			info->btf_id = btf_xdp_get_netdev_id[0];
> 8730			info->btf = bpf_get_btf_vmlinux();
  8731		        if (IS_ERR_OR_NULL(info->btf))
  8732				return false;
  8733			if (size != sizeof(u64))
  8734				return false;
  8735			return true;
  8736		}
  8737	
  8738		return __is_valid_xdp_access(off, size);
  8739	}
  8740
kernel test robot Nov. 11, 2022, 3:04 a.m. UTC | #10
Hi John,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/John-Fastabend/bpf-expose-net_device-from-xdp-for-metadata/20221110-055550
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20221109215242.1279993-2-john.fastabend%40gmail.com
patch subject: [1/2 bpf-next] bpf: expose net_device from xdp for metadata
config: i386-defconfig
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/410d4baa8908e723a4b0ef7ca54e24462bb0ea16
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review John-Fastabend/bpf-expose-net_device-from-xdp-for-metadata/20221110-055550
        git checkout 410d4baa8908e723a4b0ef7ca54e24462bb0ea16
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash net/core/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   net/core/filter.c: In function 'xdp_is_valid_access':
   net/core/filter.c:8730:29: error: implicit declaration of function 'bpf_get_btf_vmlinux' [-Werror=implicit-function-declaration]
    8730 |                 info->btf = bpf_get_btf_vmlinux();
         |                             ^~~~~~~~~~~~~~~~~~~
>> net/core/filter.c:8730:27: warning: assignment to 'struct btf *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
    8730 |                 info->btf = bpf_get_btf_vmlinux();
         |                           ^
   cc1: some warnings being treated as errors


vim +8730 net/core/filter.c

  8694	
  8695	static bool xdp_is_valid_access(int off, int size,
  8696					enum bpf_access_type type,
  8697					const struct bpf_prog *prog,
  8698					struct bpf_insn_access_aux *info)
  8699	{
  8700		if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
  8701			switch (off) {
  8702			case offsetof(struct xdp_md, egress_ifindex):
  8703				return false;
  8704			}
  8705		}
  8706	
  8707		if (type == BPF_WRITE) {
  8708			if (bpf_prog_is_dev_bound(prog->aux)) {
  8709				switch (off) {
  8710				case offsetof(struct xdp_md, rx_queue_index):
  8711					return __is_valid_xdp_access(off, size);
  8712				}
  8713			}
  8714			return false;
  8715		}
  8716	
  8717		switch (off) {
  8718		case offsetof(struct xdp_md, data):
  8719			info->reg_type = PTR_TO_PACKET;
  8720			break;
  8721		case offsetof(struct xdp_md, data_meta):
  8722			info->reg_type = PTR_TO_PACKET_META;
  8723			break;
  8724		case offsetof(struct xdp_md, data_end):
  8725			info->reg_type = PTR_TO_PACKET_END;
  8726			break;
  8727		case offsetof(struct xdp_md, rx_dev):
  8728			info->reg_type = PTR_TO_BTF_ID;
  8729			info->btf_id = btf_xdp_get_netdev_id[0];
> 8730			info->btf = bpf_get_btf_vmlinux();
  8731		        if (IS_ERR_OR_NULL(info->btf))
  8732				return false;
  8733			if (size != sizeof(u64))
  8734				return false;
  8735			return true;
  8736		}
  8737	
  8738		return __is_valid_xdp_access(off, size);
  8739	}
  8740
kernel test robot Nov. 11, 2022, 5:15 a.m. UTC | #11
Hi John,

I love your patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/intel-lab-lkp/linux/commits/John-Fastabend/bpf-expose-net_device-from-xdp-for-metadata/20221110-055550
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20221109215242.1279993-2-john.fastabend%40gmail.com
patch subject: [1/2 bpf-next] bpf: expose net_device from xdp for metadata
config: i386-defconfig
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/410d4baa8908e723a4b0ef7ca54e24462bb0ea16
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review John-Fastabend/bpf-expose-net_device-from-xdp-for-metadata/20221110-055550
        git checkout 410d4baa8908e723a4b0ef7ca54e24462bb0ea16
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   net/core/filter.c: In function 'xdp_is_valid_access':
>> net/core/filter.c:8730:29: error: implicit declaration of function 'bpf_get_btf_vmlinux' [-Werror=implicit-function-declaration]
    8730 |                 info->btf = bpf_get_btf_vmlinux();
         |                             ^~~~~~~~~~~~~~~~~~~
   net/core/filter.c:8730:27: warning: assignment to 'struct btf *' from 'int' makes pointer from integer without a cast [-Wint-conversion]
    8730 |                 info->btf = bpf_get_btf_vmlinux();
         |                           ^
   cc1: some warnings being treated as errors


vim +/bpf_get_btf_vmlinux +8730 net/core/filter.c

  8694	
  8695	static bool xdp_is_valid_access(int off, int size,
  8696					enum bpf_access_type type,
  8697					const struct bpf_prog *prog,
  8698					struct bpf_insn_access_aux *info)
  8699	{
  8700		if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
  8701			switch (off) {
  8702			case offsetof(struct xdp_md, egress_ifindex):
  8703				return false;
  8704			}
  8705		}
  8706	
  8707		if (type == BPF_WRITE) {
  8708			if (bpf_prog_is_dev_bound(prog->aux)) {
  8709				switch (off) {
  8710				case offsetof(struct xdp_md, rx_queue_index):
  8711					return __is_valid_xdp_access(off, size);
  8712				}
  8713			}
  8714			return false;
  8715		}
  8716	
  8717		switch (off) {
  8718		case offsetof(struct xdp_md, data):
  8719			info->reg_type = PTR_TO_PACKET;
  8720			break;
  8721		case offsetof(struct xdp_md, data_meta):
  8722			info->reg_type = PTR_TO_PACKET_META;
  8723			break;
  8724		case offsetof(struct xdp_md, data_end):
  8725			info->reg_type = PTR_TO_PACKET_END;
  8726			break;
  8727		case offsetof(struct xdp_md, rx_dev):
  8728			info->reg_type = PTR_TO_BTF_ID;
  8729			info->btf_id = btf_xdp_get_netdev_id[0];
> 8730			info->btf = bpf_get_btf_vmlinux();
  8731		        if (IS_ERR_OR_NULL(info->btf))
  8732				return false;
  8733			if (size != sizeof(u64))
  8734				return false;
  8735			return true;
  8736		}
  8737	
  8738		return __is_valid_xdp_access(off, size);
  8739	}
  8740
Yonghong Song Nov. 11, 2022, 6:28 a.m. UTC | #12
On 11/10/22 2:58 PM, John Fastabend wrote:
> Yonghong Song wrote:
>>
>>
>> On 11/9/22 6:17 PM, John Fastabend wrote:
>>> Yonghong Song wrote:
>>>>
>>>>
>>>> On 11/9/22 1:52 PM, John Fastabend wrote:
>>>>> Allow xdp progs to read the net_device structure. Its useful to extract
>>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
>>>>> to capture statistics and information about running net devices. We use
>>>>> kprobes instead of other hooks tc/xdp because we need to collect
>>>>> information about the interface not exposed through the xdp_md structures.
>>>>> This has some down sides that we want to avoid by moving these into the
>>>>> XDP hook itself. First, placing the kprobes in a generic function in
>>>>> the kernel is after XDP so we miss redirects and such done by the
>>>>> XDP networking program. And its needless overhead because we are
>>>>> already paying the cost for calling the XDP program, calling yet
>>>>> another prog is a waste. Better to do everything in one hook from
>>>>> performance side.
>>>>>
>>>>> Of course we could one-off each one of these fields, but that would
>>>>> explode the xdp_md struct and then require writing convert_ctx_access
>>>>> writers for each field. By using BTF we avoid writing field specific
>>>>> convertion logic, BTF just knows how to read the fields, we don't
>>>>> have to add many fields to xdp_md, and I don't have to get every
>>>>> field we will use in the future correct.
>>>>>
>>>>> For reference current examples in our code base use the ifindex,
>>>>> ifname, qdisc stats, net_ns fields, among others. With this
>>>>> patch we can now do the following,
>>>>>
>>>>>            dev = ctx->rx_dev;
>>>>>            net = dev->nd_net.net;
>>>>>
>>>>> 	uid.ifindex = dev->ifindex;
>>>>> 	memcpy(uid.ifname, dev->ifname, NAME);
>>>>>            if (net)
>>>>> 		uid.inum = net->ns.inum;
>>>>>
>>>>> to report the name, index and ns.inum which identifies an
>>>>> interface in our system.
>>>>
>>>> In
>>>> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
>>>> Namhyung Kim wanted to access new perf data with a helper.
>>>> I proposed a helper bpf_get_kern_ctx() which will get
>>>> the kernel ctx struct from which the actual perf data
>>>> can be retrieved. The interface looks like
>>>> 	void *bpf_get_kern_ctx(void *)
>>>> the input parameter needs to be a PTR_TO_CTX and
>>>> the verifer is able to return the corresponding kernel
>>>> ctx struct based on program type.
>>>>
>>>> The following is really hacked demonstration with
>>>> some of change coming from my bpf_rcu_read_lock()
>>>> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
>>>>
>>>> I modified your test to utilize the
>>>> bpf_get_kern_ctx() helper in your test_xdp_md.c.
>>>>
>>>> With this single helper, we can cover the above perf
>>>> data use case and your use case and maybe others
>>>> to avoid new UAPI changes.
>>>
>>> hmm I like the idea of just accessing the xdp_buff directly
>>> instead of adding more fields. I'm less convinced of the
>>> kfunc approach. What about a terminating field *self in the
>>> xdp_md. Then we can use existing convert_ctx_access to make
>>> it BPF inlined and no verifier changes needed.
>>>
>>> Something like this quickly typed up and not compiled, but
>>> I think shows what I'm thinking.
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 94659f6b3395..10ebd90d6677 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -6123,6 +6123,10 @@ struct xdp_md {
>>>           __u32 rx_queue_index;  /* rxq->queue_index  */
>>>    
>>>           __u32 egress_ifindex;  /* txq->dev->ifindex */
>>> +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
>>> +        * BTF access. Reading this gives BTF access to xdp_buff.
>>> +        */
>>> +       __bpf_md_ptr(struct xdp_buff *, self);
>>>    };
>>
>> This would be the first instance to have a kernel internal struct
>> in a uapi struct. Not sure whether this is a good idea or not.
> 
> We can use probe_read from some of the socket progs already but
> sure.
> 
>>
>>>    
>>>    /* DEVMAP map-value layout
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index bb0136e7a8e4..547e9576a918 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -9808,6 +9808,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
>>>                   *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>>>                                         offsetof(struct net_device, ifindex));
>>>                   break;
>>> +       case offsetof(struct xdp_md, self):
>>> +               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, self),
>>> +                                     si->dst_reg, si->src_reg,
>>> +                                     offsetof(struct xdp_buff, 0));
>>> +               break;
>>>           }
>>>    
>>>           return insn - insn_buf;
>>>
>>> Actually even that single insn conversion is a bit unnessary because
>>> should be enough to just change the type to the correct BTF_ID in the
>>> verifier and omit any instructions. But it wwould be a bit confusing
>>> for C side. Might be a good use for passing 'cast' info through to
>>> the verifier as an annotation so it could just do the BTF_ID cast for
>>> us without any insns.
>>
>> We cannot change the context type to BTF_ID style which will be a
>> uapi violation.
> 
> I don't think it would be uapi violation if user asks for it
> by annotating the cast.
> 
>>
>> The helper I proposed can be rewritten by verifier as
>> 	r0 = r1
>> so we should not have overhead for this.
> 
> Agree other than reading the bpf asm where its a bit odd.
> 
>> It cover all program types with known uapi ctx -> kern ctx
>> conversions. So there is no need to change existing uapi structs.
>> Also I except that most people probably won't use this kfunc.
>> The existing uapi fields might already serve most needs.
> 
> Maybe not sure missing some things we need.
> 
>>
>> Internally we have another use case to access some 'struct sock' fields
>> but the uapi struct only has struct bpf_sock. Currently it is advised
>> to use bpf_probe_read_kernel(...) to get the needed information.
>> The proposed helper should help that too without uapi change.
> 
> Yep.
> 
> I'm fine doing it with bpf_get_kern_ctx() did you want me to code it
> the rest of the way up and test it?

I have an off-line discussion with Martin. Martin has a use case
to get the btf_id from a pointer casting, something like
     void *p = ...
     struct t *pt = (struct t *)p; // having a kfunc to do this.
So I would like to generate the above helper to be something like
    bpf_get_kern_btf_id(...)
which will cover ctx case as well.

I should be able to have a RFC patch ready next week.

> 
> .John
Yonghong Song Nov. 11, 2022, 6:34 a.m. UTC | #13
On 11/10/22 3:11 PM, John Fastabend wrote:
> John Fastabend wrote:
>> Yonghong Song wrote:
>>>
>>>
>>> On 11/9/22 6:17 PM, John Fastabend wrote:
>>>> Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 11/9/22 1:52 PM, John Fastabend wrote:
>>>>>> Allow xdp progs to read the net_device structure. Its useful to extract
>>>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
>>>>>> to capture statistics and information about running net devices. We use
>>>>>> kprobes instead of other hooks tc/xdp because we need to collect
>>>>>> information about the interface not exposed through the xdp_md structures.
>>>>>> This has some down sides that we want to avoid by moving these into the
>>>>>> XDP hook itself. First, placing the kprobes in a generic function in
>>>>>> the kernel is after XDP so we miss redirects and such done by the
>>>>>> XDP networking program. And its needless overhead because we are
>>>>>> already paying the cost for calling the XDP program, calling yet
>>>>>> another prog is a waste. Better to do everything in one hook from
>>>>>> performance side.
>>>>>>
>>>>>> Of course we could one-off each one of these fields, but that would
>>>>>> explode the xdp_md struct and then require writing convert_ctx_access
>>>>>> writers for each field. By using BTF we avoid writing field specific
>>>>>> convertion logic, BTF just knows how to read the fields, we don't
>>>>>> have to add many fields to xdp_md, and I don't have to get every
>>>>>> field we will use in the future correct.
>>>>>>
>>>>>> For reference current examples in our code base use the ifindex,
>>>>>> ifname, qdisc stats, net_ns fields, among others. With this
>>>>>> patch we can now do the following,
>>>>>>
>>>>>>            dev = ctx->rx_dev;
>>>>>>            net = dev->nd_net.net;
>>>>>>
>>>>>> 	uid.ifindex = dev->ifindex;
>>>>>> 	memcpy(uid.ifname, dev->ifname, NAME);
>>>>>>            if (net)
>>>>>> 		uid.inum = net->ns.inum;
>>>>>>
>>>>>> to report the name, index and ns.inum which identifies an
>>>>>> interface in our system.
>>>>>
>>>>> In
>>>>> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
>>>>> Namhyung Kim wanted to access new perf data with a helper.
>>>>> I proposed a helper bpf_get_kern_ctx() which will get
>>>>> the kernel ctx struct from which the actual perf data
>>>>> can be retrieved. The interface looks like
>>>>> 	void *bpf_get_kern_ctx(void *)
>>>>> the input parameter needs to be a PTR_TO_CTX and
>>>>> the verifer is able to return the corresponding kernel
>>>>> ctx struct based on program type.
>>>>>
>>>>> The following is really hacked demonstration with
>>>>> some of change coming from my bpf_rcu_read_lock()
>>>>> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
>>>>>
>>>>> I modified your test to utilize the
>>>>> bpf_get_kern_ctx() helper in your test_xdp_md.c.
>>>>>
>>>>> With this single helper, we can cover the above perf
>>>>> data use case and your use case and maybe others
>>>>> to avoid new UAPI changes.
>>>>
>>>> hmm I like the idea of just accessing the xdp_buff directly
>>>> instead of adding more fields. I'm less convinced of the
>>>> kfunc approach. What about a terminating field *self in the
>>>> xdp_md. Then we can use existing convert_ctx_access to make
>>>> it BPF inlined and no verifier changes needed.
>>>>
>>>> Something like this quickly typed up and not compiled, but
>>>> I think shows what I'm thinking.
>>>>
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index 94659f6b3395..10ebd90d6677 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -6123,6 +6123,10 @@ struct xdp_md {
>>>>           __u32 rx_queue_index;  /* rxq->queue_index  */
>>>>    
>>>>           __u32 egress_ifindex;  /* txq->dev->ifindex */
>>>> +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
>>>> +        * BTF access. Reading this gives BTF access to xdp_buff.
>>>> +        */
>>>> +       __bpf_md_ptr(struct xdp_buff *, self);
>>>>    };
>>>
>>> This would be the first instance to have a kernel internal struct
>>> in a uapi struct. Not sure whether this is a good idea or not.
>>
>> We can use probe_read from some of the socket progs already but
>> sure.
>>
>>>
>>>>    
>>>>    /* DEVMAP map-value layout
>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>> index bb0136e7a8e4..547e9576a918 100644
>>>> --- a/net/core/filter.c
>>>> +++ b/net/core/filter.c
>>>> @@ -9808,6 +9808,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
>>>>                   *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>>>>                                         offsetof(struct net_device, ifindex));
>>>>                   break;
>>>> +       case offsetof(struct xdp_md, self):
>>>> +               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, self),
>>>> +                                     si->dst_reg, si->src_reg,
>>>> +                                     offsetof(struct xdp_buff, 0));
>>>> +               break;
>>>>           }
>>>>    
>>>>           return insn - insn_buf;
>>>>
>>>> Actually even that single insn conversion is a bit unnessary because
>>>> should be enough to just change the type to the correct BTF_ID in the
>>>> verifier and omit any instructions. But it wwould be a bit confusing
>>>> for C side. Might be a good use for passing 'cast' info through to
>>>> the verifier as an annotation so it could just do the BTF_ID cast for
>>>> us without any insns.
>>>
>>> We cannot change the context type to BTF_ID style which will be a
>>> uapi violation.
>>
>> I don't think it would be uapi violation if user asks for it
>> by annotating the cast.
>>
>>>
>>> The helper I proposed can be rewritten by verifier as
>>> 	r0 = r1
>>> so we should not have overhead for this.
>>
>> Agree other than reading the bpf asm where its a bit odd.
>>
>>> It cover all program types with known uapi ctx -> kern ctx
>>> conversions. So there is no need to change existing uapi structs.
>>> Also I except that most people probably won't use this kfunc.
>>> The existing uapi fields might already serve most needs.
>>
>> Maybe not sure missing some things we need.
>>
>>>
>>> Internally we have another use case to access some 'struct sock' fields
>>> but the uapi struct only has struct bpf_sock. Currently it is advised
>>> to use bpf_probe_read_kernel(...) to get the needed information.
>>> The proposed helper should help that too without uapi change.
>>
>> Yep.
>>
>> I'm fine doing it with bpf_get_kern_ctx() did you want me to code it
>> the rest of the way up and test it?
>>
>> .John
> 
> Related I think. We also want to get kernel variable net_namespace_list,
> this points to the network namespace lists. Based on above should
> we do something like,
> 
>    void *bpf_get_kern_var(enum var_id);
> 
> then,
> 
>    net_ns_list = bpf_get_kern_var(__btf_net_namesapce_list);
> 
> would get us a ptr to the list? The other thought was to put it in the
> xdp_md but from above seems better idea to get it through helper.

Sounds great. I guess my new proposed bpf_get_kern_btf_id() kfunc could
cover such a use case as well.
Jesper Dangaard Brouer Nov. 11, 2022, 10:51 a.m. UTC | #14
On 10/11/2022 18.02, John Fastabend wrote:
> Toke Høiland-Jørgensen wrote:
>> John Fastabend <john.fastabend@gmail.com> writes:
>>
>>> Yonghong Song wrote:
>>>>
>>>>
>>>> On 11/9/22 1:52 PM, John Fastabend wrote:
>>>>> Allow xdp progs to read the net_device structure. Its useful to extract
>>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
>>>>> to capture statistics and information about running net devices. We use
>>>>> kprobes instead of other hooks tc/xdp because we need to collect
>>>>> information about the interface not exposed through the xdp_md structures.
>>>>> This has some down sides that we want to avoid by moving these into the
>>>>> XDP hook itself. First, placing the kprobes in a generic function in
>>>>> the kernel is after XDP so we miss redirects and such done by the
>>>>> XDP networking program. And its needless overhead because we are
>>>>> already paying the cost for calling the XDP program, calling yet
>>>>> another prog is a waste. Better to do everything in one hook from
>>>>> performance side.
>>>>>
>>>>> Of course we could one-off each one of these fields, but that would
>>>>> explode the xdp_md struct and then require writing convert_ctx_access
>>>>> writers for each field. By using BTF we avoid writing field specific
>>>>> convertion logic, BTF just knows how to read the fields, we don't
>>>>> have to add many fields to xdp_md, and I don't have to get every
>>>>> field we will use in the future correct.
>>>>>
>>>>> For reference current examples in our code base use the ifindex,
>>>>> ifname, qdisc stats, net_ns fields, among others. With this
>>>>> patch we can now do the following,
>>>>>
>>>>>           dev = ctx->rx_dev;
>>>>>           net = dev->nd_net.net;
>>>>>
>>>>> 	uid.ifindex = dev->ifindex;
>>>>> 	memcpy(uid.ifname, dev->ifname, NAME);
>>>>>           if (net)
>>>>> 		uid.inum = net->ns.inum;
>>>>>
>>>>> to report the name, index and ns.inum which identifies an
>>>>> interface in our system.
>>>>
>>>> In
>>>> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
>>>> Namhyung Kim wanted to access new perf data with a helper.
>>>> I proposed a helper bpf_get_kern_ctx() which will get
>>>> the kernel ctx struct from which the actual perf data
>>>> can be retrieved. The interface looks like
>>>> 	void *bpf_get_kern_ctx(void *)
>>>> the input parameter needs to be a PTR_TO_CTX and
>>>> the verifer is able to return the corresponding kernel
>>>> ctx struct based on program type.
>>>>
>>>> The following is really hacked demonstration with
>>>> some of change coming from my bpf_rcu_read_lock()
>>>> patch set https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
>>>>
>>>> I modified your test to utilize the
>>>> bpf_get_kern_ctx() helper in your test_xdp_md.c.
>>>>
>>>> With this single helper, we can cover the above perf
>>>> data use case and your use case and maybe others
>>>> to avoid new UAPI changes.
>>>
>>> hmm I like the idea of just accessing the xdp_buff directly
>>> instead of adding more fields. I'm less convinced of the
>>> kfunc approach. What about a terminating field *self in the
>>> xdp_md. Then we can use existing convert_ctx_access to make
>>> it BPF inlined and no verifier changes needed.
>>>
>>> Something like this quickly typed up and not compiled, but
>>> I think shows what I'm thinking.
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 94659f6b3395..10ebd90d6677 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -6123,6 +6123,10 @@ struct xdp_md {
>>>          __u32 rx_queue_index;  /* rxq->queue_index  */
>>>   
>>>          __u32 egress_ifindex;  /* txq->dev->ifindex */
>>> +       /* Last xdp_md entry, for new types add directly to xdp_buff and use
>>> +        * BTF access. Reading this gives BTF access to xdp_buff.
>>> +        */
>>> +       __bpf_md_ptr(struct xdp_buff *, self);
>>>   };
>>
>> xdp_md is UAPI; I really don't think it's a good idea to add "unstable"
>> BTF fields like this to it, that's just going to confuse people. Tying
>> this to a kfunc for conversion is more consistent with the whole "kfunc
>> and BTF are its own thing" expectation.
> 
> hmm from my side self here would be stable. Whats behind it is not,
> but that seems fine to me.  Doing `ctx->self` feels more natural imo
> then doing a call. A bunch more work but could do btf casts maybe
> with annotations. I'm not sure its worth it though because only reason
> I can think to do this would be for this self reference from ctx.
> 
>     struct xdp_buff *xdp = __btf (struct xdp_buff *)ctx;
> 
> C++ has 'this' as well but thats confusing from C side. Could have
> a common syntax to do 'ctx->this' to get the pointer in BTF
> format.
> 
> Maybe see what Yonghong thinks.
> 
>>
>> The kfunc doesn't actually have to execute any instructions either, it
>> can just be collapsed into a type conversion to BTF inside the verifier,
>> no?
> 
> Agree either implementation can be made that same underneath its just
> a style question. I can probably do either but using the ctx keeps
> the existing machinery to go through is_valid_access and so on.
> 

What kind of access does the BPF-prog obtain with these different
proposals, e.g. read-only access to xdp_buff or also write access?

--Jesper
Yonghong Song Nov. 11, 2022, 3:15 p.m. UTC | #15
On 11/11/22 2:51 AM, Jesper Dangaard Brouer wrote:
> 
> 
> On 10/11/2022 18.02, John Fastabend wrote:
>> Toke Høiland-Jørgensen wrote:
>>> John Fastabend <john.fastabend@gmail.com> writes:
>>>
>>>> Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 11/9/22 1:52 PM, John Fastabend wrote:
>>>>>> Allow xdp progs to read the net_device structure. Its useful to 
>>>>>> extract
>>>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
>>>>>> to capture statistics and information about running net devices. 
>>>>>> We use
>>>>>> kprobes instead of other hooks tc/xdp because we need to collect
>>>>>> information about the interface not exposed through the xdp_md 
>>>>>> structures.
>>>>>> This has some down sides that we want to avoid by moving these 
>>>>>> into the
>>>>>> XDP hook itself. First, placing the kprobes in a generic function in
>>>>>> the kernel is after XDP so we miss redirects and such done by the
>>>>>> XDP networking program. And its needless overhead because we are
>>>>>> already paying the cost for calling the XDP program, calling yet
>>>>>> another prog is a waste. Better to do everything in one hook from
>>>>>> performance side.
>>>>>>
>>>>>> Of course we could one-off each one of these fields, but that would
>>>>>> explode the xdp_md struct and then require writing convert_ctx_access
>>>>>> writers for each field. By using BTF we avoid writing field specific
>>>>>> convertion logic, BTF just knows how to read the fields, we don't
>>>>>> have to add many fields to xdp_md, and I don't have to get every
>>>>>> field we will use in the future correct.
>>>>>>
>>>>>> For reference current examples in our code base use the ifindex,
>>>>>> ifname, qdisc stats, net_ns fields, among others. With this
>>>>>> patch we can now do the following,
>>>>>>
>>>>>>           dev = ctx->rx_dev;
>>>>>>           net = dev->nd_net.net;
>>>>>>
>>>>>>     uid.ifindex = dev->ifindex;
>>>>>>     memcpy(uid.ifname, dev->ifname, NAME);
>>>>>>           if (net)
>>>>>>         uid.inum = net->ns.inum;
>>>>>>
>>>>>> to report the name, index and ns.inum which identifies an
>>>>>> interface in our system.
>>>>>
>>>>> In
>>>>> https://lore.kernel.org/bpf/ad15b398-9069-4a0e-48cb-4bb651ec3088@meta.com/
>>>>> Namhyung Kim wanted to access new perf data with a helper.
>>>>> I proposed a helper bpf_get_kern_ctx() which will get
>>>>> the kernel ctx struct from which the actual perf data
>>>>> can be retrieved. The interface looks like
>>>>>     void *bpf_get_kern_ctx(void *)
>>>>> the input parameter needs to be a PTR_TO_CTX and
>>>>> the verifer is able to return the corresponding kernel
>>>>> ctx struct based on program type.
>>>>>
>>>>> The following is really hacked demonstration with
>>>>> some of change coming from my bpf_rcu_read_lock()
>>>>> patch set 
>>>>> https://lore.kernel.org/bpf/20221109211944.3213817-1-yhs@fb.com/
>>>>>
>>>>> I modified your test to utilize the
>>>>> bpf_get_kern_ctx() helper in your test_xdp_md.c.
>>>>>
>>>>> With this single helper, we can cover the above perf
>>>>> data use case and your use case and maybe others
>>>>> to avoid new UAPI changes.
>>>>
>>>> hmm I like the idea of just accessing the xdp_buff directly
>>>> instead of adding more fields. I'm less convinced of the
>>>> kfunc approach. What about a terminating field *self in the
>>>> xdp_md. Then we can use existing convert_ctx_access to make
>>>> it BPF inlined and no verifier changes needed.
>>>>
>>>> Something like this quickly typed up and not compiled, but
>>>> I think shows what I'm thinking.
>>>>
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index 94659f6b3395..10ebd90d6677 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -6123,6 +6123,10 @@ struct xdp_md {
>>>>          __u32 rx_queue_index;  /* rxq->queue_index  */
>>>>          __u32 egress_ifindex;  /* txq->dev->ifindex */
>>>> +       /* Last xdp_md entry, for new types add directly to xdp_buff 
>>>> and use
>>>> +        * BTF access. Reading this gives BTF access to xdp_buff.
>>>> +        */
>>>> +       __bpf_md_ptr(struct xdp_buff *, self);
>>>>   };
>>>
>>> xdp_md is UAPI; I really don't think it's a good idea to add "unstable"
>>> BTF fields like this to it, that's just going to confuse people. Tying
>>> this to a kfunc for conversion is more consistent with the whole "kfunc
>>> and BTF are its own thing" expectation.
>>
>> hmm from my side self here would be stable. Whats behind it is not,
>> but that seems fine to me.  Doing `ctx->self` feels more natural imo
>> then doing a call. A bunch more work but could do btf casts maybe
>> with annotations. I'm not sure its worth it though because only reason
>> I can think to do this would be for this self reference from ctx.
>>
>>     struct xdp_buff *xdp = __btf (struct xdp_buff *)ctx;
>>
>> C++ has 'this' as well but thats confusing from C side. Could have
>> a common syntax to do 'ctx->this' to get the pointer in BTF
>> format.
>>
>> Maybe see what Yonghong thinks.
>>
>>>
>>> The kfunc doesn't actually have to execute any instructions either, it
>>> can just be collapsed into a type conversion to BTF inside the verifier,
>>> no?
>>
>> Agree either implementation can be made that same underneath its just
>> a style question. I can probably do either but using the ctx keeps
>> the existing machinery to go through is_valid_access and so on.
>>
> 
> What kind of access does the BPF-prog obtain with these different
> proposals, e.g. read-only access to xdp_buff or also write access?

read-only access.

> 
> --Jesper
>
John Fastabend Nov. 13, 2022, 6:27 p.m. UTC | #16
Yonghong Song wrote:
> 
> 
> On 11/10/22 3:11 PM, John Fastabend wrote:
> > John Fastabend wrote:
> >> Yonghong Song wrote:
> >>>
> >>>
> >>> On 11/9/22 6:17 PM, John Fastabend wrote:
> >>>> Yonghong Song wrote:
> >>>>>
> >>>>>
> >>>>> On 11/9/22 1:52 PM, John Fastabend wrote:
> >>>>>> Allow xdp progs to read the net_device structure. Its useful to extract
> >>>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
> >>>>>> to capture statistics and information about running net devices. We use
> >>>>>> kprobes instead of other hooks tc/xdp because we need to collect
> >>>>>> information about the interface not exposed through the xdp_md structures.
> >>>>>> This has some down sides that we want to avoid by moving these into the
> >>>>>> XDP hook itself. First, placing the kprobes in a generic function in
> >>>>>> the kernel is after XDP so we miss redirects and such done by the
> >>>>>> XDP networking program. And its needless overhead because we are
> >>>>>> already paying the cost for calling the XDP program, calling yet
> >>>>>> another prog is a waste. Better to do everything in one hook from
> >>>>>> performance side.
> >>>>>>
> >>>>>> Of course we could one-off each one of these fields, but that would
> >>>>>> explode the xdp_md struct and then require writing convert_ctx_access
> >>>>>> writers for each field. By using BTF we avoid writing field specific
> >>>>>> convertion logic, BTF just knows how to read the fields, we don't
> >>>>>> have to add many fields to xdp_md, and I don't have to get every
> >>>>>> field we will use in the future correct.
> >>>>>>
> >>>>>> For reference current examples in our code base use the ifindex,
> >>>>>> ifname, qdisc stats, net_ns fields, among others. With this
> >>>>>> patch we can now do the following,
> >>>>>>
> >>>>>>            dev = ctx->rx_dev;
> >>>>>>            net = dev->nd_net.net;
> >>>>>>
> >>>>>> 	uid.ifindex = dev->ifindex;
> >>>>>> 	memcpy(uid.ifname, dev->ifname, NAME);
> >>>>>>            if (net)
> >>>>>> 		uid.inum = net->ns.inum;
> >>>>>>
> >>>>>> to report the name, index and ns.inum which identifies an
> >>>>>> interface in our system.
> >>>>>

[...]

> >> Yep.
> >>
> >> I'm fine doing it with bpf_get_kern_ctx() did you want me to code it
> >> the rest of the way up and test it?
> >>
> >> .John
> > 
> > Related I think. We also want to get kernel variable net_namespace_list,
> > this points to the network namespace lists. Based on above should
> > we do something like,
> > 
> >    void *bpf_get_kern_var(enum var_id);
> > 
> > then,
> > 
> >    net_ns_list = bpf_get_kern_var(__btf_net_namesapce_list);
> > 
> > would get us a ptr to the list? The other thought was to put it in the
> > xdp_md but from above seems better idea to get it through helper.
> 
> Sounds great. I guess my new proposed bpf_get_kern_btf_id() kfunc could
> cover such a use case as well.

Yes I think this should be good. The only catch is that we need to
get the kernel global var pointer net_namespace_list.

Then we can write iterators on network namespaces and net_devices
without having to do anything else. The usecase is to iterate
the network namespace and collect some subset of netdevices. Populate
a map with these and then keep it in sync from XDP with stats. We
already hook create/destroy paths so have built up maps that track
this and have some XDP stats but not everything we would want.

The other piece I would like to get out of the xdp ctx is the
rx descriptor of the device. I want to use this to pull out info
about the received buffer for debug mostly, but could also grab
some fields that are useful for us to track. That we can likely
do this,

  ctx->rxdesc

Recently had to debug an ugly hardware/driver bug where this would
have been very useful.

.John
Yonghong Song Nov. 14, 2022, 4:51 p.m. UTC | #17
On 11/13/22 10:27 AM, John Fastabend wrote:
> Yonghong Song wrote:
>>
>>
>> On 11/10/22 3:11 PM, John Fastabend wrote:
>>> John Fastabend wrote:
>>>> Yonghong Song wrote:
>>>>>
>>>>>
>>>>> On 11/9/22 6:17 PM, John Fastabend wrote:
>>>>>> Yonghong Song wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 11/9/22 1:52 PM, John Fastabend wrote:
>>>>>>>> Allow xdp progs to read the net_device structure. Its useful to extract
>>>>>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
>>>>>>>> to capture statistics and information about running net devices. We use
>>>>>>>> kprobes instead of other hooks tc/xdp because we need to collect
>>>>>>>> information about the interface not exposed through the xdp_md structures.
>>>>>>>> This has some down sides that we want to avoid by moving these into the
>>>>>>>> XDP hook itself. First, placing the kprobes in a generic function in
>>>>>>>> the kernel is after XDP so we miss redirects and such done by the
>>>>>>>> XDP networking program. And its needless overhead because we are
>>>>>>>> already paying the cost for calling the XDP program, calling yet
>>>>>>>> another prog is a waste. Better to do everything in one hook from
>>>>>>>> performance side.
>>>>>>>>
>>>>>>>> Of course we could one-off each one of these fields, but that would
>>>>>>>> explode the xdp_md struct and then require writing convert_ctx_access
>>>>>>>> writers for each field. By using BTF we avoid writing field specific
>>>>>>>> convertion logic, BTF just knows how to read the fields, we don't
>>>>>>>> have to add many fields to xdp_md, and I don't have to get every
>>>>>>>> field we will use in the future correct.
>>>>>>>>
>>>>>>>> For reference current examples in our code base use the ifindex,
>>>>>>>> ifname, qdisc stats, net_ns fields, among others. With this
>>>>>>>> patch we can now do the following,
>>>>>>>>
>>>>>>>>             dev = ctx->rx_dev;
>>>>>>>>             net = dev->nd_net.net;
>>>>>>>>
>>>>>>>> 	uid.ifindex = dev->ifindex;
>>>>>>>> 	memcpy(uid.ifname, dev->ifname, NAME);
>>>>>>>>             if (net)
>>>>>>>> 		uid.inum = net->ns.inum;
>>>>>>>>
>>>>>>>> to report the name, index and ns.inum which identifies an
>>>>>>>> interface in our system.
>>>>>>>
> 
> [...]
> 
>>>> Yep.
>>>>
>>>> I'm fine doing it with bpf_get_kern_ctx() did you want me to code it
>>>> the rest of the way up and test it?
>>>>
>>>> .John
>>>
>>> Related I think. We also want to get kernel variable net_namespace_list,
>>> this points to the network namespace lists. Based on above should
>>> we do something like,
>>>
>>>     void *bpf_get_kern_var(enum var_id);
>>>
>>> then,
>>>
>>>     net_ns_list = bpf_get_kern_var(__btf_net_namesapce_list);
>>>
>>> would get us a ptr to the list? The other thought was to put it in the
>>> xdp_md but from above seems better idea to get it through helper.
>>
>> Sounds great. I guess my new proposed bpf_get_kern_btf_id() kfunc could
>> cover such a use case as well.
> 
> Yes I think this should be good. The only catch is that we need to
> get the kernel global var pointer net_namespace_list.

Currently, the kernel supports percpu variable, but
not other global var like net_namespace_list. Currently, there is
an effort to add global var to BTF:
 
https://lore.kernel.org/bpf/20221104231103.752040-1-stephen.s.brennan@oracle.com/

> 
> Then we can write iterators on network namespaces and net_devices
> without having to do anything else. The usecase is to iterate
> the network namespace and collect some subset of netdevices. Populate
> a map with these and then keep it in sync from XDP with stats. We
> already hook create/destroy paths so have built up maps that track
> this and have some XDP stats but not everything we would want.

the net_namespace_list is defined as:
   struct list_head net_namespace_list;
So it is still difficult to iterate with bpf program. But we
could have a bpf_iter (similar to task, task_file, etc.)
for net namespaces and it can provide enough context
for the bpf program for each namespace to satisfy your
above need.

You can also with a bounded loop to traverse net_namespace_list
in the bpf program, but it may incur complicated codes...

> 
> The other piece I would like to get out of the xdp ctx is the
> rx descriptor of the device. I want to use this to pull out info
> about the received buffer for debug mostly, but could also grab
> some fields that are useful for us to track. That we can likely
> do this,
> 
>    ctx->rxdesc

I think it is possible. Adding rxdesc to xdp_buff as
     unsigned char *rxdesc;
or
     void *rxdesc;

and using bpf_get_kern_btf_id(kctx->rxdesc, expected_btf_id)
to get a btf id for rxdesc. Here we assume there is
a struct available for rxdesc in vmlinux.h.
Then you can trace through rxdesc with direct memory
access.

I have a RFC patch
   https://lore.kernel.org/bpf/20221114162328.622665-1-yhs@fb.com/
please help take a look.

> 
> Recently had to debug an ugly hardware/driver bug where this would
> have been very useful.
> 
> .John
John Fastabend Nov. 14, 2022, 6:23 p.m. UTC | #18
Yonghong Song wrote:
> 
> 
> On 11/13/22 10:27 AM, John Fastabend wrote:
> > Yonghong Song wrote:
> >>
> >>
> >> On 11/10/22 3:11 PM, John Fastabend wrote:
> >>> John Fastabend wrote:
> >>>> Yonghong Song wrote:
> >>>>>
> >>>>>
> >>>>> On 11/9/22 6:17 PM, John Fastabend wrote:
> >>>>>> Yonghong Song wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>> On 11/9/22 1:52 PM, John Fastabend wrote:
> >>>>>>>> Allow xdp progs to read the net_device structure. Its useful to extract
> >>>>>>>> info from the dev itself. Currently, our tracing tooling uses kprobes
> >>>>>>>> to capture statistics and information about running net devices. We use
> >>>>>>>> kprobes instead of other hooks tc/xdp because we need to collect
> >>>>>>>> information about the interface not exposed through the xdp_md structures.
> >>>>>>>> This has some down sides that we want to avoid by moving these into the
> >>>>>>>> XDP hook itself. First, placing the kprobes in a generic function in
> >>>>>>>> the kernel is after XDP so we miss redirects and such done by the
> >>>>>>>> XDP networking program. And its needless overhead because we are
> >>>>>>>> already paying the cost for calling the XDP program, calling yet
> >>>>>>>> another prog is a waste. Better to do everything in one hook from
> >>>>>>>> performance side.
> >>>>>>>>
> >>>>>>>> Of course we could one-off each one of these fields, but that would
> >>>>>>>> explode the xdp_md struct and then require writing convert_ctx_access
> >>>>>>>> writers for each field. By using BTF we avoid writing field specific
> >>>>>>>> convertion logic, BTF just knows how to read the fields, we don't
> >>>>>>>> have to add many fields to xdp_md, and I don't have to get every
> >>>>>>>> field we will use in the future correct.
> >>>>>>>>
> >>>>>>>> For reference current examples in our code base use the ifindex,
> >>>>>>>> ifname, qdisc stats, net_ns fields, among others. With this
> >>>>>>>> patch we can now do the following,
> >>>>>>>>
> >>>>>>>>             dev = ctx->rx_dev;
> >>>>>>>>             net = dev->nd_net.net;
> >>>>>>>>
> >>>>>>>> 	uid.ifindex = dev->ifindex;
> >>>>>>>> 	memcpy(uid.ifname, dev->ifname, NAME);
> >>>>>>>>             if (net)
> >>>>>>>> 		uid.inum = net->ns.inum;
> >>>>>>>>
> >>>>>>>> to report the name, index and ns.inum which identifies an
> >>>>>>>> interface in our system.
> >>>>>>>
> > 
> > [...]
> > 
> >>>> Yep.
> >>>>
> >>>> I'm fine doing it with bpf_get_kern_ctx() did you want me to code it
> >>>> the rest of the way up and test it?
> >>>>
> >>>> .John
> >>>
> >>> Related I think. We also want to get kernel variable net_namespace_list,
> >>> this points to the network namespace lists. Based on above should
> >>> we do something like,
> >>>
> >>>     void *bpf_get_kern_var(enum var_id);
> >>>
> >>> then,
> >>>
> >>>     net_ns_list = bpf_get_kern_var(__btf_net_namesapce_list);
> >>>
> >>> would get us a ptr to the list? The other thought was to put it in the
> >>> xdp_md but from above seems better idea to get it through helper.
> >>
> >> Sounds great. I guess my new proposed bpf_get_kern_btf_id() kfunc could
> >> cover such a use case as well.
> > 
> > Yes I think this should be good. The only catch is that we need to
> > get the kernel global var pointer net_namespace_list.
> 
> Currently, the kernel supports percpu variable, but
> not other global var like net_namespace_list. Currently, there is
> an effort to add global var to BTF:
>  
> https://lore.kernel.org/bpf/20221104231103.752040-1-stephen.s.brennan@oracle.com/
> 
> > 
> > Then we can write iterators on network namespaces and net_devices
> > without having to do anything else. The usecase is to iterate
> > the network namespace and collect some subset of netdevices. Populate
> > a map with these and then keep it in sync from XDP with stats. We
> > already hook create/destroy paths so have built up maps that track
> > this and have some XDP stats but not everything we would want.
> 
> the net_namespace_list is defined as:
>    struct list_head net_namespace_list;
> So it is still difficult to iterate with bpf program. But we
> could have a bpf_iter (similar to task, task_file, etc.)
> for net namespaces and it can provide enough context
> for the bpf program for each namespace to satisfy your
> above need.

Considered having bpf iter programs for net_namespace and then
net_device, but these are protected by RCU so figured rather
than create a bunch of iterators we could just use BPF directly.

> 
> You can also with a bounded loop to traverse net_namespace_list
> in the bpf program, but it may incur complicated codes...

This was going to be my first approach. I'll try to write a test
program that walks the net namespace and collect all the net
devs in a hashmap this would be more or less what our programs
do today.

Seems nicer to me to simply use native BPF codes vs iterators.
We already have mechanisms in Tetragon to run BPF code on
timers so could just wire it up there. If it doesn't work 
we can add the iterators.

> 
> > 
> > The other piece I would like to get out of the xdp ctx is the
> > rx descriptor of the device. I want to use this to pull out info
> > about the received buffer for debug mostly, but could also grab
> > some fields that are useful for us to track. That we can likely
> > do this,
> > 
> >    ctx->rxdesc
> 
> I think it is possible. Adding rxdesc to xdp_buff as
>      unsigned char *rxdesc;
> or
>      void *rxdesc;
> 
> and using bpf_get_kern_btf_id(kctx->rxdesc, expected_btf_id)
> to get a btf id for rxdesc. Here we assume there is
> a struct available for rxdesc in vmlinux.h.
> Then you can trace through rxdesc with direct memory
> access.

The trickest part here is that the rxdesc btf_id depends on 
what device we are attached to. So would need something to
resolve the btf_id from attached device.

> 
> I have a RFC patch
>    https://lore.kernel.org/bpf/20221114162328.622665-1-yhs@fb.com/
> please help take a look.

Will do thanks!
Jakub Kicinski Nov. 16, 2022, 7:46 p.m. UTC | #19
On Mon, 14 Nov 2022 10:23:00 -0800 John Fastabend wrote:
> > > The other piece I would like to get out of the xdp ctx is the
> > > rx descriptor of the device. I want to use this to pull out info
> > > about the received buffer for debug mostly, but could also grab
> > > some fields that are useful for us to track. That we can likely
> > > do this,
> > > 
> > >    ctx->rxdesc  
> > 
> > I think it is possible. Adding rxdesc to xdp_buff as
> >      unsigned char *rxdesc;
> > or
> >      void *rxdesc;

We should avoid having to add fields to structures just to expose 
them to BPF. Would the approach that Stan uses not work here?
Having the driver place the desc pointer in a well known location
on the stack and kfunc or some other magic resolve it?
 
> > and using bpf_get_kern_btf_id(kctx->rxdesc, expected_btf_id)
> > to get a btf id for rxdesc. Here we assume there is
> > a struct available for rxdesc in vmlinux.h.
> > Then you can trace through rxdesc with direct memory
> > access.  
> 
> The trickest part here is that the rxdesc btf_id depends on 
> what device we are attached to. So would need something to
> resolve the btf_id from attached device.

Right, driver needs to get involved one way or another, so it can
return "how to get to the descriptor given a xdp_buff pointer" 
as well as the btf_id or dynptr params.

(Sorry, I'm only catching up with the xdp hw field discussions
now so this may have already been discussed elsewhere..)
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 94659f6b3395..50403eb3b6cf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6123,6 +6123,7 @@  struct xdp_md {
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 
 	__u32 egress_ifindex;  /* txq->dev->ifindex */
+	__bpf_md_ptr(struct net_device *, rx_dev); /* rxq->dev */
 };
 
 /* DEVMAP map-value layout
diff --git a/net/core/filter.c b/net/core/filter.c
index bb0136e7a8e4..d445ffbea8f1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8686,6 +8686,8 @@  static bool __is_valid_xdp_access(int off, int size)
 	return true;
 }
 
+BTF_ID_LIST_SINGLE(btf_xdp_get_netdev_id, struct, net_device)
+
 static bool xdp_is_valid_access(int off, int size,
 				enum bpf_access_type type,
 				const struct bpf_prog *prog,
@@ -8718,6 +8720,15 @@  static bool xdp_is_valid_access(int off, int size,
 	case offsetof(struct xdp_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
+	case offsetof(struct xdp_md, rx_dev):
+		info->reg_type = PTR_TO_BTF_ID;
+		info->btf_id = btf_xdp_get_netdev_id[0];
+		info->btf = bpf_get_btf_vmlinux();
+	        if (IS_ERR_OR_NULL(info->btf))
+			return false;
+		if (size != sizeof(u64))
+			return false;
+		return true;
 	}
 
 	return __is_valid_xdp_access(off, size);
@@ -9808,6 +9819,14 @@  static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct net_device, ifindex));
 		break;
+	case offsetof(struct xdp_md, rx_dev):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, rxq));
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+				      si->dst_reg, si->dst_reg,
+				      offsetof(struct xdp_rxq_info, dev));
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 94659f6b3395..50403eb3b6cf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6123,6 +6123,7 @@  struct xdp_md {
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 
 	__u32 egress_ifindex;  /* txq->dev->ifindex */
+	__bpf_md_ptr(struct net_device *, rx_dev); /* rxq->dev */
 };
 
 /* DEVMAP map-value layout