Message ID | 20220726051713.840431-2-kuifeng@fb.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Parameterize task iterators. | expand |
Context | Check | Description |
---|---|---|
bpf/vmtest-bpf-next-PR | success | PR summary |
bpf/vmtest-bpf-next-VM_Test-1 | success | Logs for Kernel LATEST on ubuntu-latest with gcc |
bpf/vmtest-bpf-next-VM_Test-2 | success | Logs for Kernel LATEST on ubuntu-latest with llvm-15 |
bpf/vmtest-bpf-next-VM_Test-3 | success | Logs for Kernel LATEST on z15 with gcc |
On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > Allow creating an iterator that loops through resources of one task/thread. > > People could only create iterators to loop through all resources of > files, vma, and tasks in the system, even though they were interested > in only the resources of a specific task or process. Passing the > additional parameters, people can now create an iterator to go > through all resources or only the resources of a task. > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > --- > include/linux/bpf.h | 4 ++ > include/uapi/linux/bpf.h | 23 ++++++++++ > kernel/bpf/task_iter.c | 81 +++++++++++++++++++++++++--------- > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > 4 files changed, 109 insertions(+), 22 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 11950029284f..c8d164404e20 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags); > > struct bpf_iter_aux_info { > struct bpf_map *map; > + struct { > + __u32 tid; should be just u32 ? > + u8 type; > + } task; > }; > SNIP > > /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > index 8c921799def4..7979aacb651e 100644 > --- a/kernel/bpf/task_iter.c > +++ b/kernel/bpf/task_iter.c > @@ -12,6 +12,8 @@ > > struct bpf_iter_seq_task_common { > struct pid_namespace *ns; > + u32 tid; > + u8 type; > }; > > struct bpf_iter_seq_task_info { > @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info { > u32 tid; > }; > > -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, > +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, > u32 *tid, > bool skip_if_dup_files) > { > struct task_struct *task = NULL; > struct pid *pid; > > + if (common->type == BPF_TASK_ITER_TID) { > + if (*tid) > + return NULL; I tested and this condition breaks it for fd iterations, not sure about the task and vma, because they share this function if bpf_seq_read is called with small buffer there will be multiple calls to task_file_seq_get_next and second one will stop in here, even if there are more files to be displayed for the task in filter it'd be nice to have some test for this ;-) or perhaps compare with the not filtered output SNIP > static const struct seq_operations task_seq_ops = { > .start = task_seq_start, > .next = task_seq_next, > @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info { > static struct file * > task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > { > - struct pid_namespace *ns = info->common.ns; > - u32 curr_tid = info->tid; > + u32 saved_tid = info->tid; > struct task_struct *curr_task; > unsigned int curr_fd = info->fd; > > @@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > curr_task = info->task; > curr_fd = info->fd; > } else { > - curr_task = task_seq_get_next(ns, &curr_tid, true); > + curr_task = task_seq_get_next(&info->common, &info->tid, true); > if (!curr_task) { > info->task = NULL; > - info->tid = curr_tid; > return NULL; > } nit, looks like we're missing proper indent in 
here > > - /* set info->task and info->tid */ > + /* set info->task */ > info->task = curr_task; > - if (curr_tid == info->tid) { > + if (saved_tid == info->tid) > curr_fd = info->fd; > - } else { > - info->tid = curr_tid; > + else SNIP
On Tue, Jul 26, 2022 at 02:13:17PM +0200, Jiri Olsa wrote: > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 4 ++ > > include/uapi/linux/bpf.h | 23 ++++++++++ > > kernel/bpf/task_iter.c | 81 +++++++++++++++++++++++++--------- > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 11950029284f..c8d164404e20 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags); > > > > struct bpf_iter_aux_info { > > struct bpf_map *map; > > + struct { > > + __u32 tid; > > should be just u32 ? > > > + u8 type; > > + } task; > > }; > > > > SNIP > > > > > /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > index 8c921799def4..7979aacb651e 100644 > > --- a/kernel/bpf/task_iter.c > > +++ b/kernel/bpf/task_iter.c > > @@ -12,6 +12,8 @@ > > > > struct bpf_iter_seq_task_common { > > struct pid_namespace *ns; > > + u32 tid; > > + u8 type; > > }; > > > > struct bpf_iter_seq_task_info { > > @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info { > > u32 tid; > > }; > > > > -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, > > +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, > > u32 *tid, > > bool skip_if_dup_files) > > { > > struct task_struct *task = NULL; > > struct pid *pid; > > > > + if (common->type == BPF_TASK_ITER_TID) { > > + if (*tid) > > + return NULL; > > I tested and this condition breaks it for fd iterations, not sure about > the task and vma, because they share this function > > if bpf_seq_read is called with small buffer there will be multiple calls > to task_file_seq_get_next and second one will stop in here, even if there > are more files to be displayed for the task in filter I mean there will be multiple calls of following sequence: bpf_seq_read task_file_seq_start task_seq_get_next and 2nd one will return NULL in task_seq_get_next, because info->tid is already set jirka > > it'd be nice to have some test for this ;-) or perhaps compare with the > not filtered output > > SNIP > > > static const struct seq_operations task_seq_ops = { > > .start = task_seq_start, > > .next = task_seq_next, > > @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info { > > static struct file * > > task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > > { > > - struct pid_namespace *ns = info->common.ns; > > - u32 curr_tid = info->tid; > > + u32 saved_tid = info->tid; > > struct task_struct *curr_task; > > unsigned int curr_fd = info->fd; > > > > @@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > > 
curr_task = info->task; > > curr_fd = info->fd; > > } else { > > - curr_task = task_seq_get_next(ns, &curr_tid, true); > > + curr_task = task_seq_get_next(&info->common, &info->tid, true); > > if (!curr_task) { > > info->task = NULL; > > - info->tid = curr_tid; > > return NULL; > > } > > nit, looks like we're missing proper indent in here > > > > > > - /* set info->task and info->tid */ > > + /* set info->task */ > > info->task = curr_task; > > - if (curr_tid == info->tid) { > > + if (saved_tid == info->tid) > > curr_fd = info->fd; > > - } else { > > - info->tid = curr_tid; > > + else > > SNIP
On Tue, 2022-07-26 at 15:19 +0200, Jiri Olsa wrote: > On Tue, Jul 26, 2022 at 02:13:17PM +0200, Jiri Olsa wrote: > > > -static struct task_struct *task_seq_get_next(struct > > > pid_namespace *ns, > > > +static struct task_struct *task_seq_get_next(struct > > > bpf_iter_seq_task_common *common, > > > u32 *tid, > > > bool > > > skip_if_dup_files) > > > { > > > struct task_struct *task = NULL; > > > struct pid *pid; > > > > > > + if (common->type == BPF_TASK_ITER_TID) { > > > + if (*tid) > > > + return NULL; > > > > I tested and this condition breaks it for fd iterations, not sure > > about > > the task and vma, because they share this function > > > > if bpf_seq_read is called with small buffer there will be multiple > > calls > > to task_file_seq_get_next and second one will stop in here, even if > > there > > are more files to be displayed for the task in filter > > I mean there will be multiple calls of following sequence: > > bpf_seq_read > task_file_seq_start > task_seq_get_next > > and 2nd one will return NULL in task_seq_get_next, > because info->tid is already set Ok! I got your point. I will fix it ASAP.
On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one > > task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were > > interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 4 ++ > > include/uapi/linux/bpf.h | 23 ++++++++++ > > kernel/bpf/task_iter.c | 81 +++++++++++++++++++++++++----- > > ---- > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 11950029284f..c8d164404e20 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user > > *pathname, int flags); > > > > struct bpf_iter_aux_info { > > struct bpf_map *map; > > + struct { > > + __u32 tid; > > should be just u32 ? Or, should change the following 'type' to __u8? > > > + u8 type; > > + } task; > > }; > > > > SNIP > > > > > /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > index 8c921799def4..7979aacb651e 100644 > > --- a/kernel/bpf/task_iter.c > > +++ b/kernel/bpf/task_iter.c > > @@ -12,6 +12,8 @@ > > > > struct bpf_iter_seq_task_common { > > struct pid_namespace *ns; > > + u32 tid; > > + u8 type; > > }; > > > > struct bpf_iter_seq_task_info { > > @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info { > > u32 tid; > > }; > > > > -static struct task_struct *task_seq_get_next(struct pid_namespace > > *ns, > > +static struct task_struct *task_seq_get_next(struct > > bpf_iter_seq_task_common *common, > > u32 *tid, > > bool > > skip_if_dup_files) > > { > > struct task_struct *task = NULL; > > struct pid *pid; > > > > + if (common->type == BPF_TASK_ITER_TID) { > > + if (*tid) > > + return NULL; > > I tested and this condition breaks it for fd iterations, not sure > about > the task and vma, because they share this function > > if bpf_seq_read is called with small buffer there will be multiple > calls > to task_file_seq_get_next and second one will stop in here, even if > there > are more files to be displayed for the task in filter > > it'd be nice to have some test for this ;-) or perhaps compare with > the > not filtered output > > SNIP > > > static const struct seq_operations task_seq_ops = { > > .start = task_seq_start, > > .next = task_seq_next, > > @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info { > > static struct file * > > task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > > { > > - struct pid_namespace *ns = info->common.ns; > > - u32 curr_tid = info->tid; > > + u32 saved_tid = info->tid; > > struct task_struct *curr_task; > > unsigned int curr_fd = info->fd; > > > > @@ -151,21 +179,18 @@ task_file_seq_get_next(struct > > bpf_iter_seq_task_file_info *info) > > curr_task = info->task; > > curr_fd = info->fd; > > } else { > > - curr_task = task_seq_get_next(ns, &curr_tid, > > true); > > + curr_task = task_seq_get_next(&info->common, &info- 
> > >tid, true); > > if (!curr_task) { > > info->task = NULL; > > - info->tid = curr_tid; > > return NULL; > > } > > nit, looks like we're missing proper indent in here Yes, we mixed spaces and tabs. Should I change these lines to tabs? > > > > > > - /* set info->task and info->tid */ > > + /* set info->task */ > > info->task = curr_task; > > - if (curr_tid == info->tid) { > > + if (saved_tid == info->tid) > > curr_fd = info->fd; > > - } else { > > - info->tid = curr_tid; > > + else > > SNIP
On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote: > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > > Allow creating an iterator that loops through resources of one > > > task/thread. > > > > > > People could only create iterators to loop through all resources of > > > files, vma, and tasks in the system, even though they were > > > interested > > > in only the resources of a specific task or process. Passing the > > > additional parameters, people can now create an iterator to go > > > through all resources or only the resources of a task. > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > --- > > > include/linux/bpf.h | 4 ++ > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > kernel/bpf/task_iter.c | 81 +++++++++++++++++++++++++----- > > > ---- > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > index 11950029284f..c8d164404e20 100644 > > > --- a/include/linux/bpf.h > > > +++ b/include/linux/bpf.h > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user > > > *pathname, int flags); > > > > > > struct bpf_iter_aux_info { > > > struct bpf_map *map; > > > + struct { > > > + __u32 tid; > > > > should be just u32 ? > > Or, should change the following 'type' to __u8? Would it be better to use a pidfd instead of a tid here? Unset pidfd would mean going over all tasks, and any fd > 0 implies attaching to a specific task (as is the convention in BPF land). Most of the new UAPIs working on processes are using pidfds (to work with a stable handle instead of a reusable ID). The iterator taking an fd also gives an opportunity to BPF LSMs to attach permissions/policies to it (once we have a file local storage map) e.g. 
whether creating a task iterator for that specific pidfd instance (backed by the struct file) would be allowed or not. You are using getpid in the selftest and keeping track of last_tgid in the iterator, so I guess you don't even need to extend pidfd_open to work on thread IDs right now for your use case (and fdtable and mm are shared for POSIX threads anyway, so for those two it won't make a difference). What is your opinion? > > [...]
On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote: > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > > > Allow creating an iterator that loops through resources of one > > > > task/thread. > > > > > > > > People could only create iterators to loop through all > > > > resources of > > > > files, vma, and tasks in the system, even though they were > > > > interested > > > > in only the resources of a specific task or process. Passing > > > > the > > > > additional parameters, people can now create an iterator to go > > > > through all resources or only the resources of a task. > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > --- > > > > include/linux/bpf.h | 4 ++ > > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > > kernel/bpf/task_iter.c | 81 +++++++++++++++++++++++++- > > > > ---- > > > > ---- > > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > index 11950029284f..c8d164404e20 100644 > > > > --- a/include/linux/bpf.h > > > > +++ b/include/linux/bpf.h > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user > > > > *pathname, int flags); > > > > > > > > struct bpf_iter_aux_info { > > > > struct bpf_map *map; > > > > + struct { > > > > + __u32 tid; > > > > > > should be just u32 ? > > > > Or, should change the following 'type' to __u8? > > Would it be better to use a pidfd instead of a tid here? Unset pidfd > would mean going over all tasks, and any fd > 0 implies attaching to > a > specific task (as is the convention in BPF land). Most of the new > UAPIs working on processes are using pidfds (to work with a stable > handle instead of a reusable ID). 
> The iterator taking an fd also gives an opportunity to BPF LSMs to > attach permissions/policies to it (once we have a file local storage > map) e.g. whether creating a task iterator for that specific pidfd > instance (backed by the struct file) would be allowed or not. > You are using getpid in the selftest and keeping track of last_tgid > in > the iterator, so I guess you don't even need to extend pidfd_open to > work on thread IDs right now for your use case (and fdtable and mm > are > shared for POSIX threads anyway, so for those two it won't make a > difference). > > What is your opinion? Do you mean removed both tid and type, and replace them with a pidfd? We can do that in uapi, struct bpf_link_info. But, the interal types, ex. bpf_iter_aux_info, still need to use tid or struct file to avoid getting file from the per-process fdtable. Is that what you mean?
On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote: > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote: > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > > > > Allow creating an iterator that loops through resources of one > > > > > task/thread. > > > > > > > > > > People could only create iterators to loop through all > > > > > resources of > > > > > files, vma, and tasks in the system, even though they were > > > > > interested > > > > > in only the resources of a specific task or process. Passing > > > > > the > > > > > additional parameters, people can now create an iterator to go > > > > > through all resources or only the resources of a task. > > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > > --- > > > > > include/linux/bpf.h | 4 ++ > > > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > kernel/bpf/task_iter.c | 81 +++++++++++++++++++++++++- > > > > > ---- > > > > > ---- > > > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > > index 11950029284f..c8d164404e20 100644 > > > > > --- a/include/linux/bpf.h > > > > > +++ b/include/linux/bpf.h > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user > > > > > *pathname, int flags); > > > > > > > > > > struct bpf_iter_aux_info { > > > > > struct bpf_map *map; > > > > > + struct { > > > > > + __u32 tid; > > > > > > > > should be just u32 ? > > > > > > Or, should change the following 'type' to __u8? > > > > Would it be better to use a pidfd instead of a tid here? Unset pidfd > > would mean going over all tasks, and any fd > 0 implies attaching to > > a > > specific task (as is the convention in BPF land). 
Most of the new > > UAPIs working on processes are using pidfds (to work with a stable > > handle instead of a reusable ID). > > The iterator taking an fd also gives an opportunity to BPF LSMs to > > attach permissions/policies to it (once we have a file local storage > > map) e.g. whether creating a task iterator for that specific pidfd > > instance (backed by the struct file) would be allowed or not. > > You are using getpid in the selftest and keeping track of last_tgid > > in > > the iterator, so I guess you don't even need to extend pidfd_open to > > work on thread IDs right now for your use case (and fdtable and mm > > are > > shared for POSIX threads anyway, so for those two it won't make a > > difference). > > > > What is your opinion? > > Do you mean removed both tid and type, and replace them with a pidfd? > We can do that in uapi, struct bpf_link_info. But, the interal types, > ex. bpf_iter_aux_info, still need to use tid or struct file to avoid > getting file from the per-process fdtable. Is that what you mean? > Yes, just for the UAPI, it is similar to taking map_fd for map iter. In bpf_link_info we should report just the tid, just like map iter reports map_id.
On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote: > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote: > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> > > > wrote: > > > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > > > > > Allow creating an iterator that loops through resources of > > > > > > one > > > > > > task/thread. > > > > > > > > > > > > People could only create iterators to loop through all > > > > > > resources of > > > > > > files, vma, and tasks in the system, even though they were > > > > > > interested > > > > > > in only the resources of a specific task or process. > > > > > > Passing > > > > > > the > > > > > > additional parameters, people can now create an iterator to > > > > > > go > > > > > > through all resources or only the resources of a task. > > > > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > > > --- > > > > > > include/linux/bpf.h | 4 ++ > > > > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > kernel/bpf/task_iter.c | 81 > > > > > > +++++++++++++++++++++++++- > > > > > > ---- > > > > > > ---- > > > > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > > > index 11950029284f..c8d164404e20 100644 > > > > > > --- a/include/linux/bpf.h > > > > > > +++ b/include/linux/bpf.h > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char > > > > > > __user > > > > > > *pathname, int flags); > > > > > > > > > > > > struct bpf_iter_aux_info { > > > > > > struct bpf_map *map; > > > > > > + struct { > > > > > > + __u32 tid; > > > > > > > > > > should be just u32 ? > > > > > > > > Or, should change the following 'type' to __u8? 
> > > > > > Would it be better to use a pidfd instead of a tid here? Unset > > > pidfd > > > would mean going over all tasks, and any fd > 0 implies attaching > > > to > > > a > > > specific task (as is the convention in BPF land). Most of the new > > > UAPIs working on processes are using pidfds (to work with a > > > stable > > > handle instead of a reusable ID). > > > The iterator taking an fd also gives an opportunity to BPF LSMs > > > to > > > attach permissions/policies to it (once we have a file local > > > storage > > > map) e.g. whether creating a task iterator for that specific > > > pidfd > > > instance (backed by the struct file) would be allowed or not. > > > You are using getpid in the selftest and keeping track of > > > last_tgid > > > in > > > the iterator, so I guess you don't even need to extend pidfd_open > > > to > > > work on thread IDs right now for your use case (and fdtable and > > > mm > > > are > > > shared for POSIX threads anyway, so for those two it won't make a > > > difference). > > > > > > What is your opinion? > > > > Do you mean removed both tid and type, and replace them with a > > pidfd? > > We can do that in uapi, struct bpf_link_info. But, the interal > > types, > > ex. bpf_iter_aux_info, still need to use tid or struct file to > > avoid > > getting file from the per-process fdtable. Is that what you mean? > > > > Yes, just for the UAPI, it is similar to taking map_fd for map iter. > In bpf_link_info we should report just the tid, just like map iter > reports map_id. It sounds good to me. One thing I need a clarification. You mentioned that a fd > 0 implies attaching to a specific task, however fd can be 0. So, it should be fd >= 0. So, it forces the user to initialize the value of pidfd to -1. So, for convenience, we still need a field like 'type' to make it easy to create iterators without a filter.
On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote: > > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote: > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote: > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> > > > > wrote: > > > > > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > > > > > > Allow creating an iterator that loops through resources of > > > > > > > one > > > > > > > task/thread. > > > > > > > > > > > > > > People could only create iterators to loop through all > > > > > > > resources of > > > > > > > files, vma, and tasks in the system, even though they were > > > > > > > interested > > > > > > > in only the resources of a specific task or process. > > > > > > > Passing > > > > > > > the > > > > > > > additional parameters, people can now create an iterator to > > > > > > > go > > > > > > > through all resources or only the resources of a task. 
> > > > > > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > > > > --- > > > > > > > include/linux/bpf.h | 4 ++ > > > > > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > kernel/bpf/task_iter.c | 81 > > > > > > > +++++++++++++++++++++++++- > > > > > > > ---- > > > > > > > ---- > > > > > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > > > > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > > > > index 11950029284f..c8d164404e20 100644 > > > > > > > --- a/include/linux/bpf.h > > > > > > > +++ b/include/linux/bpf.h > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char > > > > > > > __user > > > > > > > *pathname, int flags); > > > > > > > > > > > > > > struct bpf_iter_aux_info { > > > > > > > struct bpf_map *map; > > > > > > > + struct { > > > > > > > + __u32 tid; > > > > > > > > > > > > should be just u32 ? > > > > > > > > > > Or, should change the following 'type' to __u8? > > > > > > > > Would it be better to use a pidfd instead of a tid here? Unset > > > > pidfd > > > > would mean going over all tasks, and any fd > 0 implies attaching > > > > to > > > > a > > > > specific task (as is the convention in BPF land). Most of the new > > > > UAPIs working on processes are using pidfds (to work with a > > > > stable > > > > handle instead of a reusable ID). > > > > The iterator taking an fd also gives an opportunity to BPF LSMs > > > > to > > > > attach permissions/policies to it (once we have a file local > > > > storage > > > > map) e.g. whether creating a task iterator for that specific > > > > pidfd > > > > instance (backed by the struct file) would be allowed or not. 
> > > > You are using getpid in the selftest and keeping track of > > > > last_tgid > > > > in > > > > the iterator, so I guess you don't even need to extend pidfd_open > > > > to > > > > work on thread IDs right now for your use case (and fdtable and > > > > mm > > > > are > > > > shared for POSIX threads anyway, so for those two it won't make a > > > > difference). > > > > > > > > What is your opinion? > > > > > > Do you mean removed both tid and type, and replace them with a > > > pidfd? > > > We can do that in uapi, struct bpf_link_info. But, the interal > > > types, > > > ex. bpf_iter_aux_info, still need to use tid or struct file to > > > avoid > > > getting file from the per-process fdtable. Is that what you mean? > > > > > > > Yes, just for the UAPI, it is similar to taking map_fd for map iter. > > In bpf_link_info we should report just the tid, just like map iter > > reports map_id. > > It sounds good to me. > > One thing I need a clarification. You mentioned that a fd > 0 implies > attaching to a specific task, however fd can be 0. So, it should be fd > >= 0. So, it forces the user to initialize the value of pidfd to -1. > So, for convenience, we still need a field like 'type' to make it easy > to create iterators without a filter. > Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it is fine to rely on that assumption. For e.g. even for map_fd, bpf_map_elem iterator considers fd 0 to be unset. Then you don't need the type field. >
On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote: > On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote: > > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> > > > wrote: > > > > > > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi > > > > wrote: > > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> > > > > > wrote: > > > > > > > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee > > > > > > > wrote: > > > > > > > > Allow creating an iterator that loops through resources > > > > > > > > of > > > > > > > > one > > > > > > > > task/thread. > > > > > > > > > > > > > > > > People could only create iterators to loop through all > > > > > > > > resources of > > > > > > > > files, vma, and tasks in the system, even though they > > > > > > > > were > > > > > > > > interested > > > > > > > > in only the resources of a specific task or process. > > > > > > > > Passing > > > > > > > > the > > > > > > > > additional parameters, people can now create an > > > > > > > > iterator to > > > > > > > > go > > > > > > > > through all resources or only the resources of a task. 
> > > > > > > > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > > > > > --- > > > > > > > > include/linux/bpf.h | 4 ++ > > > > > > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > > kernel/bpf/task_iter.c | 81 > > > > > > > > +++++++++++++++++++++++++- > > > > > > > > ---- > > > > > > > > ---- > > > > > > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > > > > > > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > > > > > index 11950029284f..c8d164404e20 100644 > > > > > > > > --- a/include/linux/bpf.h > > > > > > > > +++ b/include/linux/bpf.h > > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char > > > > > > > > __user > > > > > > > > *pathname, int flags); > > > > > > > > > > > > > > > > struct bpf_iter_aux_info { > > > > > > > > struct bpf_map *map; > > > > > > > > + struct { > > > > > > > > + __u32 tid; > > > > > > > > > > > > > > should be just u32 ? > > > > > > > > > > > > Or, should change the following 'type' to __u8? > > > > > > > > > > Would it be better to use a pidfd instead of a tid here? > > > > > Unset > > > > > pidfd > > > > > would mean going over all tasks, and any fd > 0 implies > > > > > attaching > > > > > to > > > > > a > > > > > specific task (as is the convention in BPF land). Most of the > > > > > new > > > > > UAPIs working on processes are using pidfds (to work with a > > > > > stable > > > > > handle instead of a reusable ID). > > > > > The iterator taking an fd also gives an opportunity to BPF > > > > > LSMs > > > > > to > > > > > attach permissions/policies to it (once we have a file local > > > > > storage > > > > > map) e.g. whether creating a task iterator for that specific > > > > > pidfd > > > > > instance (backed by the struct file) would be allowed or not. 
> > > > > You are using getpid in the selftest and keeping track of > > > > > last_tgid > > > > > in > > > > > the iterator, so I guess you don't even need to extend > > > > > pidfd_open > > > > > to > > > > > work on thread IDs right now for your use case (and fdtable > > > > > and > > > > > mm > > > > > are > > > > > shared for POSIX threads anyway, so for those two it won't > > > > > make a > > > > > difference). > > > > > > > > > > What is your opinion? > > > > > > > > Do you mean removed both tid and type, and replace them with a > > > > pidfd? > > > > We can do that in uapi, struct bpf_link_info. But, the interal > > > > types, > > > > ex. bpf_iter_aux_info, still need to use tid or struct file to > > > > avoid > > > > getting file from the per-process fdtable. Is that what you > > > > mean? > > > > > > > > > > Yes, just for the UAPI, it is similar to taking map_fd for map > > > iter. > > > In bpf_link_info we should report just the tid, just like map > > > iter > > > reports map_id. > > > > It sounds good to me. > > > > One thing I need a clarification. You mentioned that a fd > 0 > > implies > > attaching to a specific task, however fd can be 0. So, it should be > > fd > > > = 0. So, it forces the user to initialize the value of pidfd to - > > > 1. > > So, for convenience, we still need a field like 'type' to make it > > easy > > to create iterators without a filter. > > > > Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it > is fine to rely on that assumption. For e.g. even for map_fd, > bpf_map_elem iterator considers fd 0 to be unset. Then you don't need > the type field. I just realize that pidfd may be meaningless for the bpf_link_info returned by bpf_obj_get_info_by_fd() since the origin fd might be closed already. So, I will always set it a value of 0.
On 7/28/22 9:40 AM, Kui-Feng Lee wrote: > On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote: >> On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote: >>> >>> On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote: >>>> On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> >>>> wrote: >>>>> >>>>> On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi >>>>> wrote: >>>>>> On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> >>>>>> wrote: >>>>>>> >>>>>>> On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: >>>>>>>> On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee >>>>>>>> wrote: >>>>>>>>> Allow creating an iterator that loops through resources >>>>>>>>> of >>>>>>>>> one >>>>>>>>> task/thread. >>>>>>>>> >>>>>>>>> People could only create iterators to loop through all >>>>>>>>> resources of >>>>>>>>> files, vma, and tasks in the system, even though they >>>>>>>>> were >>>>>>>>> interested >>>>>>>>> in only the resources of a specific task or process. >>>>>>>>> Passing >>>>>>>>> the >>>>>>>>> additional parameters, people can now create an >>>>>>>>> iterator to >>>>>>>>> go >>>>>>>>> through all resources or only the resources of a task. 
>>>>>>>>> >>>>>>>>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> >>>>>>>>> --- >>>>>>>>> include/linux/bpf.h | 4 ++ >>>>>>>>> include/uapi/linux/bpf.h | 23 ++++++++++ >>>>>>>>> kernel/bpf/task_iter.c | 81 >>>>>>>>> +++++++++++++++++++++++++- >>>>>>>>> ---- >>>>>>>>> ---- >>>>>>>>> tools/include/uapi/linux/bpf.h | 23 ++++++++++ >>>>>>>>> 4 files changed, 109 insertions(+), 22 deletions(-) >>>>>>>>> >>>>>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>>>>>>>> index 11950029284f..c8d164404e20 100644 >>>>>>>>> --- a/include/linux/bpf.h >>>>>>>>> +++ b/include/linux/bpf.h >>>>>>>>> @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char >>>>>>>>> __user >>>>>>>>> *pathname, int flags); >>>>>>>>> >>>>>>>>> struct bpf_iter_aux_info { >>>>>>>>> struct bpf_map *map; >>>>>>>>> + struct { >>>>>>>>> + __u32 tid; >>>>>>>> >>>>>>>> should be just u32 ? >>>>>>> >>>>>>> Or, should change the following 'type' to __u8? >>>>>> >>>>>> Would it be better to use a pidfd instead of a tid here? >>>>>> Unset >>>>>> pidfd >>>>>> would mean going over all tasks, and any fd > 0 implies >>>>>> attaching >>>>>> to >>>>>> a >>>>>> specific task (as is the convention in BPF land). Most of the >>>>>> new >>>>>> UAPIs working on processes are using pidfds (to work with a >>>>>> stable >>>>>> handle instead of a reusable ID). >>>>>> The iterator taking an fd also gives an opportunity to BPF >>>>>> LSMs >>>>>> to >>>>>> attach permissions/policies to it (once we have a file local >>>>>> storage >>>>>> map) e.g. whether creating a task iterator for that specific >>>>>> pidfd >>>>>> instance (backed by the struct file) would be allowed or not. 
>>>>>> You are using getpid in the selftest and keeping track of >>>>>> last_tgid >>>>>> in >>>>>> the iterator, so I guess you don't even need to extend >>>>>> pidfd_open >>>>>> to >>>>>> work on thread IDs right now for your use case (and fdtable >>>>>> and >>>>>> mm >>>>>> are >>>>>> shared for POSIX threads anyway, so for those two it won't >>>>>> make a >>>>>> difference). There is one problem here. The current pidfd_open syscall only supports thread-group leader, i.e., main thread, i.e., it won't support any non-main-thread tid's. Yes, thread-group leader and other threads should share the same vma and files in most of times, but it still possible different threads in the same process may have different files which is why in current task_iter.c we have: *tid = pid_nr_ns(pid, ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; goto retry; } else if (skip_if_dup_files && !thread_group_leader(task) && task->files == task->group_leader->files) { put_task_struct(task); task = NULL; ++*tid; goto retry; } Each thread (tid) will have some fields different from thread-group leader (tgid), e.g., comm and most (if not all) scheduling related fields. So it would be good to support for each tid from the start as it is not clear when pidfd_open will support non thread-group leader. If it worries wrap around, a reference to the task can be held when tid passed to the kernel at link create time. This is similar to pid is passed to the kernel at pidfd_open syscall. But in practice, I think taking the reference during read() should also fine. The race always exist anyway. Kumar, could you give more details about security concerns? I am not sure about the tight relationship between bpf_iter and security. bpf_iter just for iterating kernel data structures. >>>>>> >>>>>> What is your opinion? >>>>> >>>>> Do you mean removed both tid and type, and replace them with a >>>>> pidfd? >>>>> We can do that in uapi, struct bpf_link_info. 
But, the interal >>>>> types, >>>>> ex. bpf_iter_aux_info, still need to use tid or struct file to >>>>> avoid >>>>> getting file from the per-process fdtable. Is that what you >>>>> mean? >>>>> >>>> >>>> Yes, just for the UAPI, it is similar to taking map_fd for map >>>> iter. >>>> In bpf_link_info we should report just the tid, just like map >>>> iter >>>> reports map_id. >>> >>> It sounds good to me. >>> >>> One thing I need a clarification. You mentioned that a fd > 0 >>> implies >>> attaching to a specific task, however fd can be 0. So, it should be >>> fd >>>> = 0. So, it forces the user to initialize the value of pidfd to - >>>> 1. >>> So, for convenience, we still need a field like 'type' to make it >>> easy >>> to create iterators without a filter. >>> >> >> Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it >> is fine to rely on that assumption. For e.g. even for map_fd, >> bpf_map_elem iterator considers fd 0 to be unset. Then you don't need >> the type field. > > I just realize that pidfd may be meaningless for the bpf_link_info > returned by bpf_obj_get_info_by_fd() since the origin fd might be > closed already. So, I will always set it a value of 0. >
On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote: > > [...] > > There is one problem here. The current pidfd_open syscall > only supports thread-group leader, i.e., main thread, i.e., > it won't support any non-main-thread tid's. Yes, thread-group > leader and other threads should share the same vma and files > in most of times, but it still possible different threads > in the same process may have different files which is why > in current task_iter.c we have: > *tid = pid_nr_ns(pid, ns); > task = get_pid_task(pid, PIDTYPE_PID); > if (!task) { > ++*tid; > goto retry; > } else if (skip_if_dup_files && > !thread_group_leader(task) && > task->files == task->group_leader->files) { > put_task_struct(task); > task = NULL; > ++*tid; > goto retry; > } > > > Each thread (tid) will have some fields different from > thread-group leader (tgid), e.g., comm and most (if not all) > scheduling related fields. > > So it would be good to support for each tid from the start > as it is not clear when pidfd_open will support non > thread-group leader. I think this is just a missing feature, not a design limitation. If we just disable thread pidfd from waitid and pidfd_send_signal, I think it is ok to use it elsewhere. > > If it worries wrap around, a reference to the task > can be held when tid passed to the kernel at link > create time. This is similar to pid is passed to > the kernel at pidfd_open syscall. But in practice, > I think taking the reference during read() should > also fine. The race always exist anyway. > > Kumar, could you give more details about security > concerns? I am not sure about the tight relationship > between bpf_iter and security. bpf_iter just for > iterating kernel data structures. > There is no security 'concern' per se. But having an fd which is used to set up the iterator just gives a chance to a BPF LSM to easily isolate permissions to attach the iterator to a task represented by that fd. i.e.
the fd is an object to which this permission can be bound, the fd can be passed around (to share the same permission with others but it is limited to the task corresponding to it), etc. The permission management is even easier and faster once you have a file local storage map (which I plan to send soon). So you could have two pidfds, one which allows the process to attach the iter to it, and one which doesn't, without relying on the task's capabilities, the checks can become more fine grained, and the original task can even drop its capabilities (because they are bound to the fd instead), which allows privilege separation. It becomes a bit difficult when kernel APIs take IDs because then you don't have any subject (other than the task) to draw the permissions from. But anyway, all of this was just a suggestion (which is why I solicited opinions in the original reply). Feel free to drop/ignore if it is too much of a hassle to support (i.e. it is understandable you wouldn't want to spend time extending pidfd_open for this).
On Thu, Jul 28, 2022 at 9:24 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote: > > On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote: > > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > > > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote: > > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> > > > > > wrote: > > > > > > > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote: > > > > > > > > Allow creating an iterator that loops through resources of > > > > > > > > one > > > > > > > > task/thread. > > > > > > > > > > > > > > > > People could only create iterators to loop through all > > > > > > > > resources of > > > > > > > > files, vma, and tasks in the system, even though they were > > > > > > > > interested > > > > > > > > in only the resources of a specific task or process. > > > > > > > > Passing > > > > > > > > the > > > > > > > > additional parameters, people can now create an iterator to > > > > > > > > go > > > > > > > > through all resources or only the resources of a task. 
> > > > > > > > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > > > > > --- > > > > > > > > include/linux/bpf.h | 4 ++ > > > > > > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > > kernel/bpf/task_iter.c | 81 > > > > > > > > +++++++++++++++++++++++++- > > > > > > > > ---- > > > > > > > > ---- > > > > > > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > > 4 files changed, 109 insertions(+), 22 deletions(-) > > > > > > > > > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > > > > > index 11950029284f..c8d164404e20 100644 > > > > > > > > --- a/include/linux/bpf.h > > > > > > > > +++ b/include/linux/bpf.h > > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char > > > > > > > > __user > > > > > > > > *pathname, int flags); > > > > > > > > > > > > > > > > struct bpf_iter_aux_info { > > > > > > > > struct bpf_map *map; > > > > > > > > + struct { > > > > > > > > + __u32 tid; > > > > > > > > > > > > > > should be just u32 ? > > > > > > > > > > > > Or, should change the following 'type' to __u8? > > > > > > > > > > Would it be better to use a pidfd instead of a tid here? Unset > > > > > pidfd > > > > > would mean going over all tasks, and any fd > 0 implies attaching > > > > > to > > > > > a > > > > > specific task (as is the convention in BPF land). Most of the new > > > > > UAPIs working on processes are using pidfds (to work with a > > > > > stable > > > > > handle instead of a reusable ID). > > > > > The iterator taking an fd also gives an opportunity to BPF LSMs > > > > > to > > > > > attach permissions/policies to it (once we have a file local > > > > > storage > > > > > map) e.g. whether creating a task iterator for that specific > > > > > pidfd > > > > > instance (backed by the struct file) would be allowed or not. 
> > > > > You are using getpid in the selftest and keeping track of > > > > > last_tgid > > > > > in > > > > > the iterator, so I guess you don't even need to extend pidfd_open > > > > > to > > > > > work on thread IDs right now for your use case (and fdtable and > > > > > mm > > > > > are > > > > > shared for POSIX threads anyway, so for those two it won't make a > > > > > difference). > > > > > > > > > > What is your opinion? > > > > > > > > Do you mean removed both tid and type, and replace them with a > > > > pidfd? > > > > We can do that in uapi, struct bpf_link_info. But, the interal > > > > types, > > > > ex. bpf_iter_aux_info, still need to use tid or struct file to > > > > avoid > > > > getting file from the per-process fdtable. Is that what you mean? > > > > > > > > > > Yes, just for the UAPI, it is similar to taking map_fd for map iter. > > > In bpf_link_info we should report just the tid, just like map iter > > > reports map_id. > > > > It sounds good to me. > > > > One thing I need a clarification. You mentioned that a fd > 0 implies > > attaching to a specific task, however fd can be 0. So, it should be fd > > >= 0. So, it forces the user to initialize the value of pidfd to -1. > > So, for convenience, we still need a field like 'type' to make it easy > > to create iterators without a filter. > > > > Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it > is fine to rely on that assumption. For e.g. even for map_fd, > bpf_map_elem iterator considers fd 0 to be unset. Then you don't need > the type field. > FD 0 is STDIN, unless someone explicitly close(0). IMO using fd 0 for default behavior is fine. Hao > >
Hi Kumar, On Thu, Jul 28, 2022 at 10:53 AM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote: > > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote: > > > [...] > > > > There is one problem here. The current pidfd_open syscall > > only supports thread-group leader, i.e., main thread, i.e., > > it won't support any non-main-thread tid's. Yes, thread-group > > leader and other threads should share the same vma and files > > in most of times, but it still possible different threads > > in the same process may have different files which is why > > in current task_iter.c we have: > > *tid = pid_nr_ns(pid, ns); > > task = get_pid_task(pid, PIDTYPE_PID); > > if (!task) { > > ++*tid; > > goto retry; > > } else if (skip_if_dup_files && > > !thread_group_leader(task) && > > task->files == task->group_leader->files) { > > put_task_struct(task); > > task = NULL; > > ++*tid; > > goto retry; > > } > > > > > > Each thread (tid) will have some fields different from > > thread-group leader (tgid), e.g., comm and most (if not all) > > scheduling related fields. > > > > So it would be good to support for each tid from the start > > as it is not clear when pidfd_open will support non > > thread-group leader. > > I think this is just a missing feature, not a design limitation. If we > just disable thread pifdfd from waitid and pidfd_send_signal, I think > it is ok to use it elsewhere. > > > > > If it worries wrap around, a reference to the task > > can be held when tid passed to the kernel at link > > create time. This is similar to pid is passed to > > the kernel at pidfd_open syscall. But in practice, > > I think taking the reference during read() should > > also fine. The race always exist anyway. > > > > Kumar, could you give more details about security > > concerns? I am not sure about the tight relationship > > between bpf_iter and security. bpf_iter just for > > iterating kernel data structures. > > > > There is no security 'concern' per se. 
But having an fd which is used > to set up the iterator just gives a chance to a BPF LSM to easily > isolate permissions to attach the iterator to a task represented by > that fd. i.e. the fd is an object to which this permission can be > bound, the fd can be passed around (to share the same permission with > others but it is limited to the task corresponding to it), etc. The > permission management is even easier and faster once you have a file > local storage map (which I plan to send soon). > I like the idea of a file local storage map. I have several questions in mind, but don't want to digress from the discussion under Kui-Feng's patchset. It probably will be clear when seeing your change. Please cc me when you send it out. thanks. > So you could have two pidfds, one which allows the process to attach > the iter to it, and one which doesn't, without relying on the task's > capabilities, the checks can become more fine grained, and the > original task can even drop its capabilities (because they are bound > to the fd instead), which allows privilege separation. > > It becomes a bit difficult when kernel APIs take IDs because then you > don't have any subject (other than the task) to draw the permissions > from. > > But anyway, all of this was just a suggestion (which is why I > solicited opinions in the original reply). Feel free to drop/ignore if > it is too much of a hassle to support (i.e. it is understandable you > wouldn't want to spend time extending pidfd_open for this). On another thread, I was having a discussion with Tejun on FD vs ID for cgroup_iter. I am in favor of ID in general, because it's stateless and matches the info reported by bpf_link_info. This is nice from the userspace's perspective. Hao
On 7/28/22 10:52 AM, Kumar Kartikeya Dwivedi wrote: > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote: >>> [...] >> >> There is one problem here. The current pidfd_open syscall >> only supports thread-group leader, i.e., main thread, i.e., >> it won't support any non-main-thread tid's. Yes, thread-group >> leader and other threads should share the same vma and files >> in most of times, but it still possible different threads >> in the same process may have different files which is why >> in current task_iter.c we have: >> *tid = pid_nr_ns(pid, ns); >> task = get_pid_task(pid, PIDTYPE_PID); >> if (!task) { >> ++*tid; >> goto retry; >> } else if (skip_if_dup_files && >> !thread_group_leader(task) && >> task->files == task->group_leader->files) { >> put_task_struct(task); >> task = NULL; >> ++*tid; >> goto retry; >> } >> >> >> Each thread (tid) will have some fields different from >> thread-group leader (tgid), e.g., comm and most (if not all) >> scheduling related fields. >> >> So it would be good to support for each tid from the start >> as it is not clear when pidfd_open will support non >> thread-group leader. > > I think this is just a missing feature, not a design limitation. If we > just disable thread pifdfd from waitid and pidfd_send_signal, I think > it is ok to use it elsewhere. Yes, I agree this is a missing feature. It is desirable that the missing feature gets implemented in kernel or at least promising work is recognized before we use pidfd for this task structure. > >> >> If it worries wrap around, a reference to the task >> can be held when tid passed to the kernel at link >> create time. This is similar to pid is passed to >> the kernel at pidfd_open syscall. But in practice, >> I think taking the reference during read() should >> also fine. The race always exist anyway. >> >> Kumar, could you give more details about security >> concerns? I am not sure about the tight relationship >> between bpf_iter and security. 
bpf_iter just for >> iterating kernel data structures. >> > > There is no security 'concern' per se. But having an fd which is used > to set up the iterator just gives a chance to a BPF LSM to easily > isolate permissions to attach the iterator to a task represented by > that fd. i.e. the fd is an object to which this permission can be > bound, the fd can be passed around (to share the same permission with > others but it is limited to the task corresponding to it), etc. The > permission management is even easier and faster once you have a file > local storage map (which I plan to send soon). So you mean with fd, bpf_lsm could add a hook in bpf iterator to enforce policies about whether a particular task can be visited or not, right? In such cases, I think the policy should be against a task pointer and a bpf pointer, which have enough information for the policy to work. In typical use cases, user space gets a pid (the process own pid or another process id). Yes, we could create a pidfd with pidfd_open(). And this is pidfd can be manipulated with permissions, and passed to the bpf iter. The current pidfd creation looks like below: int pidfd_create(struct pid *pid, unsigned int flags) { int fd; if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) return -EINVAL; if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) return -EINVAL; fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), flags | O_RDWR | O_CLOEXEC); if (fd < 0) put_pid(pid); return fd; } I am not sure how security policy can be easily applied to such a fd. Probably user need to further manipulate fd with fcntl() and I think most users probably won't do that. 
The following are some bpf program lsm hooks: #ifdef CONFIG_BPF_SYSCALL LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) #endif /* CONFIG_BPF_SYSCALL */ I think task_struct ptr and prog ptr should be enough for the potential LSM hook. > > So you could have two pidfds, one which allows the process to attach > the iter to it, and one which doesn't, without relying on the task's > capabilities, the checks can become more fine grained, and the > original task can even drop its capabilities (because they are bound > to the fd instead), which allows privilege separation. > > It becomes a bit difficult when kernel APIs take IDs because then you > don't have any subject (other than the task) to draw the permissions > from. Maybe I missed something from security perspective. But from design perspective, I am okay with pidfd since it pushes the responsibility of pid -> task_struct conversion to user space. Although unlikely, it still has chances that inside the kernel tid->task_struct may actually use wrapped around tid... But since pidfd_open misses tid support, I hesitate to use pidfd for now. Maybe Daniel or Alexei can comment as well? > > But anyway, all of this was just a suggestion (which is why I > solicited opinions in the original reply). Feel free to drop/ignore if > it is too much of a hassle to support (i.e. it is understandable you > wouldn't want to spend time extending pidfd_open for this).
On Thu, Jul 28, 2022 at 03:54:01PM -0700, Yonghong Song wrote: > > > On 7/28/22 10:52 AM, Kumar Kartikeya Dwivedi wrote: > > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote: > > > > [...] > > > > > > There is one problem here. The current pidfd_open syscall > > > only supports thread-group leader, i.e., main thread, i.e., > > > it won't support any non-main-thread tid's. Yes, thread-group > > > leader and other threads should share the same vma and files > > > in most of times, but it still possible different threads > > > in the same process may have different files which is why > > > in current task_iter.c we have: > > > *tid = pid_nr_ns(pid, ns); > > > task = get_pid_task(pid, PIDTYPE_PID); > > > if (!task) { > > > ++*tid; > > > goto retry; > > > } else if (skip_if_dup_files && > > > !thread_group_leader(task) && > > > task->files == task->group_leader->files) { > > > put_task_struct(task); > > > task = NULL; > > > ++*tid; > > > goto retry; > > > } > > > > > > > > > Each thread (tid) will have some fields different from > > > thread-group leader (tgid), e.g., comm and most (if not all) > > > scheduling related fields. > > > > > > So it would be good to support for each tid from the start > > > as it is not clear when pidfd_open will support non > > > thread-group leader. > > > > I think this is just a missing feature, not a design limitation. If we > > just disable thread pifdfd from waitid and pidfd_send_signal, I think > > it is ok to use it elsewhere. > > Yes, I agree this is a missing feature. It is desirable > that the missing feature gets implemented in kernel or > at least promising work is recognized before we use pidfd > for this task structure. When I did pidfd_{open,getfd,send_signal}, CLONE_PIDFD, and the waitid support we simply didn't enable support for pidfd to refer to individual threads because there was no immediate use-case for it and we hade some concerns that I can't remember off the top of my head. 
Plus, we were quite happy that we got the pidfd stuff in so we limited the scope of what we wanted to do in the first iteration. But if there is a good use-case for this then by all means we should enable pidfds to refer to individual threads and I'm happy to route that upstream. But this needs to be solid work as that area of the kernel can be interesting technically and otherwise... There have been requests for this before already. I've added a wrapper pidfd_get_task() a while ago that encodes the PIDTYPE_TGID restriction. If you grep for it you should find all places that rely on pidfds (process_mrelease() and whatnot). All these places need to be able to deal with individual threads if you change that. But note, the mw (merge window) is coming up.
On Thu, 2022-07-28 at 19:00 +0200, Kumar Kartikeya Dwivedi wrote: > On Thu, 28 Jul 2022 at 18:40, Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote: > > > On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> > > > wrote: > > > > > > > > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi > > > > wrote: > > > > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> > > > > > wrote: > > > > > > > > > > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi > > > > > > wrote: > > > > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee > > > > > > > <kuifeng@fb.com> > > > > > > > wrote: > > > > > > > > > > > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote: > > > > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng > > > > > > > > > Lee > > > > > > > > > wrote: > > > > > > > > > > Allow creating an iterator that loops through > > > > > > > > > > resources > > > > > > > > > > of > > > > > > > > > > one > > > > > > > > > > task/thread. > > > > > > > > > > > > > > > > > > > > People could only create iterators to loop through > > > > > > > > > > all > > > > > > > > > > resources of > > > > > > > > > > files, vma, and tasks in the system, even though > > > > > > > > > > they > > > > > > > > > > were > > > > > > > > > > interested > > > > > > > > > > in only the resources of a specific task or > > > > > > > > > > process. > > > > > > > > > > Passing > > > > > > > > > > the > > > > > > > > > > additional parameters, people can now create an > > > > > > > > > > iterator to > > > > > > > > > > go > > > > > > > > > > through all resources or only the resources of a > > > > > > > > > > task. 
> > > > > > > > > > > > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > > > > > > > --- > > > > > > > > > > include/linux/bpf.h | 4 ++ > > > > > > > > > > include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > > > > kernel/bpf/task_iter.c | 81 > > > > > > > > > > +++++++++++++++++++++++++- > > > > > > > > > > ---- > > > > > > > > > > ---- > > > > > > > > > > tools/include/uapi/linux/bpf.h | 23 ++++++++++ > > > > > > > > > > 4 files changed, 109 insertions(+), 22 > > > > > > > > > > deletions(-) > > > > > > > > > > > > > > > > > > > > diff --git a/include/linux/bpf.h > > > > > > > > > > b/include/linux/bpf.h > > > > > > > > > > index 11950029284f..c8d164404e20 100644 > > > > > > > > > > --- a/include/linux/bpf.h > > > > > > > > > > +++ b/include/linux/bpf.h > > > > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const > > > > > > > > > > char > > > > > > > > > > __user > > > > > > > > > > *pathname, int flags); > > > > > > > > > > > > > > > > > > > > struct bpf_iter_aux_info { > > > > > > > > > > struct bpf_map *map; > > > > > > > > > > + struct { > > > > > > > > > > + __u32 tid; > > > > > > > > > > > > > > > > > > should be just u32 ? > > > > > > > > > > > > > > > > Or, should change the following 'type' to __u8? > > > > > > > > > > > > > > Would it be better to use a pidfd instead of a tid here? > > > > > > > Unset > > > > > > > pidfd > > > > > > > would mean going over all tasks, and any fd > 0 implies > > > > > > > attaching > > > > > > > to > > > > > > > a > > > > > > > specific task (as is the convention in BPF land). Most of > > > > > > > the > > > > > > > new > > > > > > > UAPIs working on processes are using pidfds (to work with > > > > > > > a > > > > > > > stable > > > > > > > handle instead of a reusable ID). 
> > > > > > > The iterator taking an fd also gives an opportunity to > > > > > > > BPF > > > > > > > LSMs > > > > > > > to > > > > > > > attach permissions/policies to it (once we have a file > > > > > > > local > > > > > > > storage > > > > > > > map) e.g. whether creating a task iterator for that > > > > > > > specific > > > > > > > pidfd > > > > > > > instance (backed by the struct file) would be allowed or > > > > > > > not. > > > > > > > You are using getpid in the selftest and keeping track of > > > > > > > last_tgid > > > > > > > in > > > > > > > the iterator, so I guess you don't even need to extend > > > > > > > pidfd_open > > > > > > > to > > > > > > > work on thread IDs right now for your use case (and > > > > > > > fdtable > > > > > > > and > > > > > > > mm > > > > > > > are > > > > > > > shared for POSIX threads anyway, so for those two it > > > > > > > won't > > > > > > > make a > > > > > > > difference). > > > > > > > > > > > > > > What is your opinion? > > > > > > > > > > > > Do you mean removed both tid and type, and replace them > > > > > > with a > > > > > > pidfd? > > > > > > We can do that in uapi, struct bpf_link_info. But, the > > > > > > interal > > > > > > types, > > > > > > ex. bpf_iter_aux_info, still need to use tid or struct file > > > > > > to > > > > > > avoid > > > > > > getting file from the per-process fdtable. Is that what > > > > > > you > > > > > > mean? > > > > > > > > > > > > > > > > Yes, just for the UAPI, it is similar to taking map_fd for > > > > > map > > > > > iter. > > > > > In bpf_link_info we should report just the tid, just like map > > > > > iter > > > > > reports map_id. > > > > > > > > It sounds good to me. > > > > > > > > One thing I need a clarification. You mentioned that a fd > 0 > > > > implies > > > > attaching to a specific task, however fd can be 0. So, it > > > > should be > > > > fd > > > > > = 0. So, it forces the user to initialize the value of pidfd > > > > > to - > > > > > 1. 
> > > > So, for convenience, we still need a field like 'type' to make > > > > it > > > > easy > > > > to create iterators without a filter. > > > > > > > > > Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so > > > it > > > is fine to rely on that assumption. For e.g. even for map_fd, > > > bpf_map_elem iterator considers fd 0 to be unset. Then you don't > > > need > > > the type field. > > > > I just realize that pidfd may be meaningless for the bpf_link_info > > returned by bpf_obj_get_info_by_fd() since the origin fd might be > > closed already. So, I will always set it a value of 0. > > > > For bpf_link_info, we should only be returning the tid of the task it > is attached to, you cannot report the pidfd in bpf_link_info > correctly (as you already realised). By default this would be 0, > which is also an invalid tid, but when pidfd is set it will be the > tid of the task it is attached to, so it works well. We have had a lot of discussions around using tid or pidfd. Kumar also mentioned removing 'type'. However, I have a feeling that we need to keep 'type' in struct bpf_link_info. I can imagine that we may want to create iterators of tasks in a cgroup or with other parameters in the future. 'type' will help us to tell the types of a parameter.
On Thu, 28 Jul 2022 at 21:11, Hao Luo <haoluo@google.com> wrote: > > Hi Kumar, > > On Thu, Jul 28, 2022 at 10:53 AM Kumar Kartikeya Dwivedi > <memxor@gmail.com> wrote: > > > > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote: > > > > [...] > > > > > > There is one problem here. The current pidfd_open syscall > > > only supports thread-group leader, i.e., main thread, i.e., > > > it won't support any non-main-thread tid's. Yes, thread-group > > > leader and other threads should share the same vma and files > > > in most of times, but it still possible different threads > > > in the same process may have different files which is why > > > in current task_iter.c we have: > > > *tid = pid_nr_ns(pid, ns); > > > task = get_pid_task(pid, PIDTYPE_PID); > > > if (!task) { > > > ++*tid; > > > goto retry; > > > } else if (skip_if_dup_files && > > > !thread_group_leader(task) && > > > task->files == task->group_leader->files) { > > > put_task_struct(task); > > > task = NULL; > > > ++*tid; > > > goto retry; > > > } > > > > > > > > > Each thread (tid) will have some fields different from > > > thread-group leader (tgid), e.g., comm and most (if not all) > > > scheduling related fields. > > > > > > So it would be good to support for each tid from the start > > > as it is not clear when pidfd_open will support non > > > thread-group leader. > > > > I think this is just a missing feature, not a design limitation. If we > > just disable thread pidfd from waitid and pidfd_send_signal, I think > > it is ok to use it elsewhere. > > > > > > > > If it worries wrap around, a reference to the task > > > can be held when tid passed to the kernel at link > > > create time. This is similar to pid is passed to > > > the kernel at pidfd_open syscall. But in practice, > > > I think taking the reference during read() should > > > also fine. The race always exist anyway. > > > > > > Kumar, could you give more details about security > > > concerns? 
I am not sure about the tight relationship > > > between bpf_iter and security. bpf_iter just for > > > iterating kernel data structures. > > > > > > > There is no security 'concern' per se. But having an fd which is used > > to set up the iterator just gives a chance to a BPF LSM to easily > > isolate permissions to attach the iterator to a task represented by > > that fd. i.e. the fd is an object to which this permission can be > > bound, the fd can be passed around (to share the same permission with > > others but it is limited to the task corresponding to it), etc. The > > permission management is even easier and faster once you have a file > > local storage map (which I plan to send soon). > > > > I like the idea of a file local storage map. I have several questions > in mind, but don't want to digress from the discussion under > Kui-Feng's patchset. It probably will be clear when seeing your > change. Please cc me when you send it out. thanks. > Thanks for the interest! I'll cc you when I send it out.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 11950029284f..c8d164404e20 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags); struct bpf_iter_aux_info { struct bpf_map *map; + struct { + __u32 tid; + u8 type; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ffcbf79a556b..991141cacb9e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -87,10 +87,33 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_task_iter_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + /* + * Parameters of task iterators. + */ + struct { + __u32 tid; + /* + * The type of the iterator. + * + * It can be one of enum bpf_task_iter_type. + * + * BPF_TASK_ITER_ALL (default) + * The iterator iterates over resources of every process. + * + * BPF_TASK_ITER_TID + * You should also set *tid* to iterate over one task. + */ + __u8 type; /* BPF_TASK_ITER_* */ + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..7979aacb651e 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -12,6 +12,8 @@ struct bpf_iter_seq_task_common { struct pid_namespace *ns; + u32 tid; + u8 type; }; struct bpf_iter_seq_task_info { @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->tid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_PID); + *tid = common->tid; + } + rcu_read_unlock(); + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +71,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); + if (!task) return NULL; @@ -73,7 +89,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +134,18 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + if (linfo->task.type == 
BPF_TASK_ITER_ALL && linfo->task.tid) + return -EINVAL; + + aux->task.type = linfo->task.type; + aux->task.tid = linfo->task.tid; + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info { static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +211,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +300,8 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->tid = aux->task.tid; return 0; } @@ -307,11 +340,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum 
bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. @@ -371,14 +403,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -430,9 +461,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -533,6 +567,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -551,6 +586,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -571,6 +607,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ffcbf79a556b..991141cacb9e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -87,10 
+87,33 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_task_iter_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + /* + * Parameters of task iterators. + */ + struct { + __u32 tid; + /* + * The type of the iterator. + * + * It can be one of enum bpf_task_iter_type. + * + * BPF_TASK_ITER_ALL (default) + * The iterator iterates over resources of every process. + * + * BPF_TASK_ITER_TID + * You should also set *tid* to iterate over one task. + */ + __u8 type; /* BPF_TASK_ITER_* */ + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */
Allow creating an iterator that loops through resources of one task/thread. Previously, people could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. By passing the additional parameters, people can now create an iterator to go through all resources or only the resources of one task. Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> --- include/linux/bpf.h | 4 ++ include/uapi/linux/bpf.h | 23 ++++++++++ kernel/bpf/task_iter.c | 81 +++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 23 ++++++++++ 4 files changed, 109 insertions(+), 22 deletions(-)