diff mbox series

[v6,bpf-next,6/6] selftests/bpf: add test for bpf_seq_printf_btf helper

Message ID 1600883188-4831-7-git-send-email-alan.maguire@oracle.com (mailing list archive)
State New
Headers show
Series bpf: add helpers to support BTF-based kernel data display | expand

Commit Message

Alan Maguire Sept. 23, 2020, 5:46 p.m. UTC
Add a test verifying iterating over tasks and displaying BTF
representation of data succeeds.  Note here that we do not display
the task_struct itself, as it will overflow the PAGE_SIZE limit on seq
data; instead we write task->fs (a struct fs_struct).

Suggested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c  | 66 ++++++++++++++++++++++
 .../selftests/bpf/progs/bpf_iter_task_btf.c        | 49 ++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c

Comments

Alexei Starovoitov Sept. 25, 2020, 1:26 a.m. UTC | #1
On Wed, Sep 23, 2020 at 06:46:28PM +0100, Alan Maguire wrote:
> Add a test verifying iterating over tasks and displaying BTF
> representation of data succeeds.  Note here that we do not display
> the task_struct itself, as it will overflow the PAGE_SIZE limit on seq
> data; instead we write task->fs (a struct fs_struct).

Yeah. I've tried to print task_struct before reading above comment and
it took me long time to figure out what 'read failed: Argument list too long' means.
How can we improve usability of this helper?

We can bump:
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -88,8 +88,8 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
        mutex_lock(&seq->lock);

        if (!seq->buf) {
-               seq->size = PAGE_SIZE;
+               seq->size = PAGE_SIZE * 32;

to whatever number, but printing single task_struct needs ~800 lines and
~18kbytes. Humans can scroll through that much spam, but can we make it less
verbose by default somehow?
May be not in this patch set, but in the follow up?

> +SEC("iter/task")
> +int dump_task_fs_struct(struct bpf_iter__task *ctx)
> +{
> +	static const char fs_type[] = "struct fs_struct";
> +	struct seq_file *seq = ctx->meta->seq;
> +	struct task_struct *task = ctx->task;
> +	struct fs_struct *fs = (void *)0;
> +	static struct btf_ptr ptr = { };
> +	long ret;
> +
> +	if (task)
> +		fs = task->fs;
> +
> +	ptr.type = fs_type;
> +	ptr.ptr = fs;

imo the following is better:
       ptr.type_id = __builtin_btf_type_id(*fs, 1);
       ptr.ptr = fs;

> +
> +	if (ctx->meta->seq_num == 0)
> +		BPF_SEQ_PRINTF(seq, "Raw BTF fs_struct per task\n");
> +
> +	ret = bpf_seq_printf_btf(seq, &ptr, sizeof(ptr), 0);
> +	switch (ret) {
> +	case 0:
> +		tasks++;
> +		break;
> +	case -ERANGE:
> +		/* NULL task or task->fs, don't count it as an error. */
> +		break;
> +	default:
> +		seq_err = ret;
> +		break;
> +	}

Please add handling of E2BIG to this switch. Otherwise
printing large amount of tiny structs will overflow PAGE_SIZE and E2BIG
will be send to user space.
Like this:
@@ -40,6 +40,8 @@ int dump_task_fs_struct(struct bpf_iter__task *ctx)
        case -ERANGE:
                /* NULL task or task->fs, don't count it as an error. */
                break;
+       case -E2BIG:
+               return 1;

Also please change bpf_seq_read() like this:
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 30833bbf3019..8f10e30ea0b0 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -88,8 +88,8 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
        mutex_lock(&seq->lock);

        if (!seq->buf) {
-               seq->size = PAGE_SIZE;
-               seq->buf = kmalloc(seq->size, GFP_KERNEL);
+               seq->size = PAGE_SIZE << 3;
+               seq->buf = kvmalloc(seq->size, GFP_KERNEL);

So users can print task_struct by default.
Hopefully we will figure out how to deal with spam later.
Alan Maguire Sept. 28, 2020, 2:12 p.m. UTC | #2
On Thu, 24 Sep 2020, Alexei Starovoitov wrote:

> to whatever number, but printing single task_struct needs ~800 lines and
> ~18kbytes. Humans can scroll through that much spam, but can we make it less
> verbose by default somehow?
> May be not in this patch set, but in the follow up?
>

One approach that might work would be to devote 4 bits or so of
flag space to a "maximum depth" specifier; i.e. at depth 1,
only base types are displayed, no aggregate types like arrays,
structs and unions.  We've already got depth processing in the
code to figure out if possibly zeroed nested data needs to be
displayed, so it should hopefully be a simple follow-up.

One way to express it would be to use "..." to denote field(s)
were omitted. We could even use the number of "."s to denote
cases where multiple fields were omitted, giving a visual sense
of how much data was omitted.  So for example with 
BTF_F_MAX_DEPTH(1), task_struct looks like this:

(struct task_struct){
 .state = ()1,
 .stack = ( *)0x00000000029d1e6f,
 ...
 .flags = (unsigned int)4194560,
 ...
 .cpu = (unsigned int)36,
 .wakee_flips = (unsigned int)11,
 .wakee_flip_decay_ts = (long unsigned int)4294914874,
 .last_wakee = (struct task_struct *)0x000000006c7dfe6d,
 .recent_used_cpu = (int)19,
 .wake_cpu = (int)36,
 .prio = (int)120,
 .static_prio = (int)120,
 .normal_prio = (int)120,
 .sched_class = (struct sched_class *)0x00000000ad1561e6,
 ...
 .exec_start = (u64)674402577156,
 .sum_exec_runtime = (u64)5009664110,
 .vruntime = (u64)167038057,
 .prev_sum_exec_runtime = (u64)5009578167,
 .nr_migrations = (u64)54,
 .depth = (int)1,
 .parent = (struct sched_entity *)0x00000000cba60e7d,
 .cfs_rq = (struct cfs_rq *)0x0000000014f353ed,
 ...
 
...etc. What do you think?

> > +SEC("iter/task")
> > +int dump_task_fs_struct(struct bpf_iter__task *ctx)
> > +{
> > +	static const char fs_type[] = "struct fs_struct";
> > +	struct seq_file *seq = ctx->meta->seq;
> > +	struct task_struct *task = ctx->task;
> > +	struct fs_struct *fs = (void *)0;
> > +	static struct btf_ptr ptr = { };
> > +	long ret;
> > +
> > +	if (task)
> > +		fs = task->fs;
> > +
> > +	ptr.type = fs_type;
> > +	ptr.ptr = fs;
> 
> imo the following is better:
>        ptr.type_id = __builtin_btf_type_id(*fs, 1);
>        ptr.ptr = fs;
> 

I'm still seeing lookup failures using __builtin_btf_type_id(,1) -
whereas both __builtin_btf_type_id(,0) and Andrii's
suggestion of bpf_core_type_id_kernel() work. Not sure what's
going on - pahole is v1.17, clang is

clang version 12.0.0 (/mnt/src/llvm-project/clang 
7ab7b979d29e1e43701cf690f5cf1903740f50e3)

> > +
> > +	if (ctx->meta->seq_num == 0)
> > +		BPF_SEQ_PRINTF(seq, "Raw BTF fs_struct per task\n");
> > +
> > +	ret = bpf_seq_printf_btf(seq, &ptr, sizeof(ptr), 0);
> > +	switch (ret) {
> > +	case 0:
> > +		tasks++;
> > +		break;
> > +	case -ERANGE:
> > +		/* NULL task or task->fs, don't count it as an error. */
> > +		break;
> > +	default:
> > +		seq_err = ret;
> > +		break;
> > +	}
> 
> Please add handling of E2BIG to this switch. Otherwise
> printing large amount of tiny structs will overflow PAGE_SIZE and E2BIG
> will be send to user space.
> Like this:
> @@ -40,6 +40,8 @@ int dump_task_fs_struct(struct bpf_iter__task *ctx)
>         case -ERANGE:
>                 /* NULL task or task->fs, don't count it as an error. */
>                 break;
> +       case -E2BIG:
> +               return 1;
> 

Done.

> Also please change bpf_seq_read() like this:
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 30833bbf3019..8f10e30ea0b0 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -88,8 +88,8 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
>         mutex_lock(&seq->lock);
> 
>         if (!seq->buf) {
> -               seq->size = PAGE_SIZE;
> -               seq->buf = kmalloc(seq->size, GFP_KERNEL);
> +               seq->size = PAGE_SIZE << 3;
> +               seq->buf = kvmalloc(seq->size, GFP_KERNEL);
> 
> So users can print task_struct by default.
> Hopefully we will figure out how to deal with spam later.
> 

Thanks for all the help and suggestions! I didn't want to
attribute the patch bumping seq size in v7 to you without your 
permission, but it's all your work so if I need to respin let me
know if you'd like me to fix that. Thanks again!

Alan
Andrii Nakryiko Sept. 28, 2020, 5:51 p.m. UTC | #3
On Mon, Sep 28, 2020 at 7:14 AM Alan Maguire <alan.maguire@oracle.com> wrote:
>
>
>
> On Thu, 24 Sep 2020, Alexei Starovoitov wrote:
>
> > to whatever number, but printing single task_struct needs ~800 lines and
> > ~18kbytes. Humans can scroll through that much spam, but can we make it less
> > verbose by default somehow?
> > May be not in this patch set, but in the follow up?
> >
>
> One approach that might work would be to devote 4 bits or so of
> flag space to a "maximum depth" specifier; i.e. at depth 1,
> only base types are displayed, no aggregate types like arrays,
> structs and unions.  We've already got depth processing in the
> code to figure out if possibly zeroed nested data needs to be
> displayed, so it should hopefully be a simple follow-up.
>
> One way to express it would be to use "..." to denote field(s)
> were omitted. We could even use the number of "."s to denote
> cases where multiple fields were omitted, giving a visual sense
> of how much data was omitted.  So for example with
> BTF_F_MAX_DEPTH(1), task_struct looks like this:
>
> (struct task_struct){
>  .state = ()1,
>  .stack = ( *)0x00000000029d1e6f,
>  ...
>  .flags = (unsigned int)4194560,
>  ...
>  .cpu = (unsigned int)36,
>  .wakee_flips = (unsigned int)11,
>  .wakee_flip_decay_ts = (long unsigned int)4294914874,
>  .last_wakee = (struct task_struct *)0x000000006c7dfe6d,
>  .recent_used_cpu = (int)19,
>  .wake_cpu = (int)36,
>  .prio = (int)120,
>  .static_prio = (int)120,
>  .normal_prio = (int)120,
>  .sched_class = (struct sched_class *)0x00000000ad1561e6,
>  ...
>  .exec_start = (u64)674402577156,
>  .sum_exec_runtime = (u64)5009664110,
>  .vruntime = (u64)167038057,
>  .prev_sum_exec_runtime = (u64)5009578167,
>  .nr_migrations = (u64)54,
>  .depth = (int)1,
>  .parent = (struct sched_entity *)0x00000000cba60e7d,
>  .cfs_rq = (struct cfs_rq *)0x0000000014f353ed,
>  ...
>
> ...etc. What do you think?

It's not clear to me what exactly is omitted with ... ? Would it make
sense to still at least list a field name and "abbreviated" value.
E.g., for arrays:

.array_field = (int[16]){ ... },

Similarly for struct:

.struct_field = (struct my_struct){ ... },

? With just '...' I get a very strong and unsettling feeling of
missing out on the important stuff :)

>
> > > +SEC("iter/task")
> > > +int dump_task_fs_struct(struct bpf_iter__task *ctx)
> > > +{
> > > +   static const char fs_type[] = "struct fs_struct";
> > > +   struct seq_file *seq = ctx->meta->seq;
> > > +   struct task_struct *task = ctx->task;
> > > +   struct fs_struct *fs = (void *)0;
> > > +   static struct btf_ptr ptr = { };
> > > +   long ret;
> > > +
> > > +   if (task)
> > > +           fs = task->fs;
> > > +
> > > +   ptr.type = fs_type;
> > > +   ptr.ptr = fs;
> >
> > imo the following is better:
> >        ptr.type_id = __builtin_btf_type_id(*fs, 1);
> >        ptr.ptr = fs;
> >
>
> I'm still seeing lookup failures using __builtin_btf_type_id(,1) -
> whereas both __builtin_btf_type_id(,0) and Andrii's
> suggestion of bpf_core_type_id_kernel() work. Not sure what's
> going on - pahole is v1.17, clang is

bpf_core_type_id_kernel() is

__builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_TARGET)

BPF_TYPE_ID_TARGET is exactly 1. So I bet it's because of the type
capturing through typeof() and pointer casting/dereferencing, which
preserves type information properly. Regardless, just use the helper,
IMO.


>
> clang version 12.0.0 (/mnt/src/llvm-project/clang
> 7ab7b979d29e1e43701cf690f5cf1903740f50e3)
>

[...]
Alexei Starovoitov Sept. 29, 2020, 1:54 a.m. UTC | #4
On Mon, Sep 28, 2020 at 10:51:19AM -0700, Andrii Nakryiko wrote:
> On Mon, Sep 28, 2020 at 7:14 AM Alan Maguire <alan.maguire@oracle.com> wrote:
> >
> >
> >
> > On Thu, 24 Sep 2020, Alexei Starovoitov wrote:
> >
> > > to whatever number, but printing single task_struct needs ~800 lines and
> > > ~18kbytes. Humans can scroll through that much spam, but can we make it less
> > > verbose by default somehow?
> > > May be not in this patch set, but in the follow up?
> > >
> >
> > One approach that might work would be to devote 4 bits or so of
> > flag space to a "maximum depth" specifier; i.e. at depth 1,
> > only base types are displayed, no aggregate types like arrays,
> > structs and unions.  We've already got depth processing in the
> > code to figure out if possibly zeroed nested data needs to be
> > displayed, so it should hopefully be a simple follow-up.

That sounds great to me.

Would it be possible to specify the depth from the other side as well?
Like a lot of 'leaf' fields are struct list_head, struct lockdep_map,
atomic_t, struct callback_head, etc.
When printing a big struct I'm interested in the data that the
struct provides, but many small inner structs are not that useful.
So the data is at the top level and in few layers down,
but depth is different at different fields.
If I could tell printf to avoid printing the last depth I think
it will make the output more concise.
Whereas if I say print depth=2 from the top it will still print
'struct list_head' that happened to be at the top level.

> >
> > One way to express it would be to use "..." to denote field(s)
> > were omitted. We could even use the number of "."s to denote
> > cases where multiple fields were omitted, giving a visual sense
> > of how much data was omitted.  So for example with
> > BTF_F_MAX_DEPTH(1), task_struct looks like this:
> >
> > (struct task_struct){
> >  .state = ()1,
> >  .stack = ( *)0x00000000029d1e6f,
> >  ...
> >  .flags = (unsigned int)4194560,
> >  ...
> >  .cpu = (unsigned int)36,
> >  .wakee_flips = (unsigned int)11,
> >  .wakee_flip_decay_ts = (long unsigned int)4294914874,
> >  .last_wakee = (struct task_struct *)0x000000006c7dfe6d,
> >  .recent_used_cpu = (int)19,
> >  .wake_cpu = (int)36,
> >  .prio = (int)120,
> >  .static_prio = (int)120,
> >  .normal_prio = (int)120,
> >  .sched_class = (struct sched_class *)0x00000000ad1561e6,
> >  ...
> >  .exec_start = (u64)674402577156,
> >  .sum_exec_runtime = (u64)5009664110,
> >  .vruntime = (u64)167038057,
> >  .prev_sum_exec_runtime = (u64)5009578167,
> >  .nr_migrations = (u64)54,
> >  .depth = (int)1,
> >  .parent = (struct sched_entity *)0x00000000cba60e7d,
> >  .cfs_rq = (struct cfs_rq *)0x0000000014f353ed,
> >  ...
> >
> > ...etc. What do you think?
> 
> It's not clear to me what exactly is omitted with ... ? Would it make
> sense to still at least list a field name and "abbreviated" value.
> E.g., for arrays:
> 
> .array_field = (int[16]){ ... },
> 
> Similarly for struct:
> 
> .struct_field = (struct my_struct){ ... },

+1
Something like this would be great.

Another idea...
If there is only one field in the struct can we omit it?
Like instead of:
   .refs = (atomic_t){
    .counter = (int)2,
   },
print
   .refs = (atomic_t){ 2 },

From C point of view it is still a valid initializer and
it's not ambiguous which field being inited, since there is only
one field.
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index fe1a83b9..323c48a 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -7,6 +7,7 @@ 
 #include "bpf_iter_task.skel.h"
 #include "bpf_iter_task_stack.skel.h"
 #include "bpf_iter_task_file.skel.h"
+#include "bpf_iter_task_btf.skel.h"
 #include "bpf_iter_tcp4.skel.h"
 #include "bpf_iter_tcp6.skel.h"
 #include "bpf_iter_udp4.skel.h"
@@ -167,6 +168,69 @@  static void test_task_file(void)
 	bpf_iter_task_file__destroy(skel);
 }
 
+#define FSBUFSZ		8192
+
+static char fsbuf[FSBUFSZ];
+
+static void do_btf_read(struct bpf_program *prog)
+{
+	int iter_fd = -1, len = 0, bufleft = FSBUFSZ;
+	struct bpf_link *link;
+	char *buf = fsbuf;
+
+	link = bpf_program__attach_iter(prog, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		return;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	do {
+		len = read(iter_fd, buf, bufleft);
+		if (len > 0) {
+			buf += len;
+			bufleft -= len;
+		}
+	} while (len > 0);
+
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		goto free_link;
+
+	CHECK(strstr(fsbuf, "(struct fs_struct)") == NULL,
+	      "check for btf representation of fs_struct in iter data",
+	      "struct fs_struct not found");
+free_link:
+	if (iter_fd > 0)
+		close(iter_fd);
+	bpf_link__destroy(link);
+}
+
+static void test_task_btf(void)
+{
+	struct bpf_iter_task_btf__bss *bss;
+	struct bpf_iter_task_btf *skel;
+
+	skel = bpf_iter_task_btf__open_and_load();
+	if (CHECK(!skel, "bpf_iter_task_btf__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	bss = skel->bss;
+
+	do_btf_read(skel->progs.dump_task_fs_struct);
+
+	if (CHECK(bss->tasks == 0, "check if iterated over tasks",
+		  "no task iteration, did BPF program run?\n"))
+		goto cleanup;
+
+	CHECK(bss->seq_err != 0, "check for unexpected err",
+	      "bpf_seq_printf_btf returned %ld", bss->seq_err);
+
+cleanup:
+	bpf_iter_task_btf__destroy(skel);
+}
+
 static void test_tcp4(void)
 {
 	struct bpf_iter_tcp4 *skel;
@@ -957,6 +1021,8 @@  void test_bpf_iter(void)
 		test_task_stack();
 	if (test__start_subtest("task_file"))
 		test_task_file();
+	if (test__start_subtest("task_btf"))
+		test_task_btf();
 	if (test__start_subtest("tcp4"))
 		test_tcp4();
 	if (test__start_subtest("tcp6"))
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
new file mode 100644
index 0000000..88631a8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
@@ -0,0 +1,49 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020, Oracle and/or its affiliates. */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <errno.h>
+
+char _license[] SEC("license") = "GPL";
+
+long tasks = 0;
+long seq_err = 0;
+
+/* struct task_struct's BTF representation will overflow PAGE_SIZE so cannot
+ * be used here; instead dump a structure associated with each task.
+ */
+SEC("iter/task")
+int dump_task_fs_struct(struct bpf_iter__task *ctx)
+{
+	static const char fs_type[] = "struct fs_struct";
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	struct fs_struct *fs = (void *)0;
+	static struct btf_ptr ptr = { };
+	long ret;
+
+	if (task)
+		fs = task->fs;
+
+	ptr.type = fs_type;
+	ptr.ptr = fs;
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "Raw BTF fs_struct per task\n");
+
+	ret = bpf_seq_printf_btf(seq, &ptr, sizeof(ptr), 0);
+	switch (ret) {
+	case 0:
+		tasks++;
+		break;
+	case -ERANGE:
+		/* NULL task or task->fs, don't count it as an error. */
+		break;
+	default:
+		seq_err = ret;
+		break;
+	}
+
+	return 0;
+}