diff mbox series

[bpf-next,1/3] bpf: Parameterize task iterators.

Message ID 20220726051713.840431-2-kuifeng@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Parameterize task iterators. | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Kernel LATEST on z15 with gcc

Commit Message

Kui-Feng Lee July 26, 2022, 5:17 a.m. UTC
Allow creating an iterator that loops through resources of one task/thread.

Previously, people could only create iterators that loop through all the
resources of files, VMAs, and tasks in the system, even when they were
interested only in the resources of a specific task or process.  By
passing the additional parameters, people can now create an iterator to
go through all resources or only the resources of a single task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 include/linux/bpf.h            |  4 ++
 include/uapi/linux/bpf.h       | 23 ++++++++++
 kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++---------
 tools/include/uapi/linux/bpf.h | 23 ++++++++++
 4 files changed, 109 insertions(+), 22 deletions(-)

Comments

Jiri Olsa July 26, 2022, 12:13 p.m. UTC | #1
On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> Allow creating an iterator that loops through resources of one task/thread.
> 
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
> 
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>  include/linux/bpf.h            |  4 ++
>  include/uapi/linux/bpf.h       | 23 ++++++++++
>  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++---------
>  tools/include/uapi/linux/bpf.h | 23 ++++++++++
>  4 files changed, 109 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 11950029284f..c8d164404e20 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>  
>  struct bpf_iter_aux_info {
>  	struct bpf_map *map;
> +	struct {
> +		__u32	tid;

should be just u32 ?

> +		u8	type;
> +	} task;
>  };
>  

SNIP

>  
>  /* BPF syscall commands, see bpf(2) man-page for more details. */
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 8c921799def4..7979aacb651e 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -12,6 +12,8 @@
>  
>  struct bpf_iter_seq_task_common {
>  	struct pid_namespace *ns;
> +	u32	tid;
> +	u8	type;
>  };
>  
>  struct bpf_iter_seq_task_info {
> @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
>  	u32 tid;
>  };
>  
> -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
>  					     u32 *tid,
>  					     bool skip_if_dup_files)
>  {
>  	struct task_struct *task = NULL;
>  	struct pid *pid;
>  
> +	if (common->type == BPF_TASK_ITER_TID) {
> +		if (*tid)
> +			return NULL;

I tested and this condition breaks it for fd iterations, not sure about
the task and vma, because they share this function

if bpf_seq_read is called with small buffer there will be multiple calls
to task_file_seq_get_next and second one will stop in here, even if there
are more files to be displayed for the task in filter

it'd be nice to have some test for this ;-) or perhaps compare with the
not filtered output

SNIP

>  static const struct seq_operations task_seq_ops = {
>  	.start	= task_seq_start,
>  	.next	= task_seq_next,
> @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info {
>  static struct file *
>  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
>  {
> -	struct pid_namespace *ns = info->common.ns;
> -	u32 curr_tid = info->tid;
> +	u32 saved_tid = info->tid;
>  	struct task_struct *curr_task;
>  	unsigned int curr_fd = info->fd;
>  
> @@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
>  		curr_task = info->task;
>  		curr_fd = info->fd;
>  	} else {
> -                curr_task = task_seq_get_next(ns, &curr_tid, true);
> +		curr_task = task_seq_get_next(&info->common, &info->tid, true);
>                  if (!curr_task) {
>                          info->task = NULL;
> -                        info->tid = curr_tid;
>                          return NULL;
>                  }

nit, looks like we're missing proper indent in here


>  
> -                /* set info->task and info->tid */
> +		/* set info->task */
>  		info->task = curr_task;
> -		if (curr_tid == info->tid) {
> +		if (saved_tid == info->tid)
>  			curr_fd = info->fd;
> -		} else {
> -			info->tid = curr_tid;
> +		else

SNIP
Jiri Olsa July 26, 2022, 1:19 p.m. UTC | #2
On Tue, Jul 26, 2022 at 02:13:17PM +0200, Jiri Olsa wrote:
> On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > Allow creating an iterator that loops through resources of one task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > ---
> >  include/linux/bpf.h            |  4 ++
> >  include/uapi/linux/bpf.h       | 23 ++++++++++
> >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++---------
> >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> >  4 files changed, 109 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..c8d164404e20 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
> >  
> >  struct bpf_iter_aux_info {
> >  	struct bpf_map *map;
> > +	struct {
> > +		__u32	tid;
> 
> should be just u32 ?
> 
> > +		u8	type;
> > +	} task;
> >  };
> >  
> 
> SNIP
> 
> >  
> >  /* BPF syscall commands, see bpf(2) man-page for more details. */
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 8c921799def4..7979aacb651e 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -12,6 +12,8 @@
> >  
> >  struct bpf_iter_seq_task_common {
> >  	struct pid_namespace *ns;
> > +	u32	tid;
> > +	u8	type;
> >  };
> >  
> >  struct bpf_iter_seq_task_info {
> > @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
> >  	u32 tid;
> >  };
> >  
> > -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> > +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
> >  					     u32 *tid,
> >  					     bool skip_if_dup_files)
> >  {
> >  	struct task_struct *task = NULL;
> >  	struct pid *pid;
> >  
> > +	if (common->type == BPF_TASK_ITER_TID) {
> > +		if (*tid)
> > +			return NULL;
> 
> I tested and this condition breaks it for fd iterations, not sure about
> the task and vma, because they share this function
> 
> if bpf_seq_read is called with small buffer there will be multiple calls
> to task_file_seq_get_next and second one will stop in here, even if there
> are more files to be displayed for the task in filter

I mean there will be multiple calls of following sequence:

  bpf_seq_read
    task_file_seq_start
      task_seq_get_next

and 2nd one will return NULL in task_seq_get_next,
because info->tid is already set
 
jirka

> 
> it'd be nice to have some test for this ;-) or perhaps compare with the
> not filtered output
> 
> SNIP
> 
> >  static const struct seq_operations task_seq_ops = {
> >  	.start	= task_seq_start,
> >  	.next	= task_seq_next,
> > @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info {
> >  static struct file *
> >  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
> >  {
> > -	struct pid_namespace *ns = info->common.ns;
> > -	u32 curr_tid = info->tid;
> > +	u32 saved_tid = info->tid;
> >  	struct task_struct *curr_task;
> >  	unsigned int curr_fd = info->fd;
> >  
> > @@ -151,21 +179,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
> >  		curr_task = info->task;
> >  		curr_fd = info->fd;
> >  	} else {
> > -                curr_task = task_seq_get_next(ns, &curr_tid, true);
> > +		curr_task = task_seq_get_next(&info->common, &info->tid, true);
> >                  if (!curr_task) {
> >                          info->task = NULL;
> > -                        info->tid = curr_tid;
> >                          return NULL;
> >                  }
> 
> nit, looks like we're missing proper indent in here
> 
> 
> >  
> > -                /* set info->task and info->tid */
> > +		/* set info->task */
> >  		info->task = curr_task;
> > -		if (curr_tid == info->tid) {
> > +		if (saved_tid == info->tid)
> >  			curr_fd = info->fd;
> > -		} else {
> > -			info->tid = curr_tid;
> > +		else
> 
> SNIP
Kui-Feng Lee July 27, 2022, 6:39 a.m. UTC | #3
On Tue, 2022-07-26 at 15:19 +0200, Jiri Olsa wrote:
> On Tue, Jul 26, 2022 at 02:13:17PM +0200, Jiri Olsa wrote:
> > > -static struct task_struct *task_seq_get_next(struct
> > > pid_namespace *ns,
> > > +static struct task_struct *task_seq_get_next(struct
> > > bpf_iter_seq_task_common *common,
> > >                                              u32 *tid,
> > >                                              bool
> > > skip_if_dup_files)
> > >  {
> > >         struct task_struct *task = NULL;
> > >         struct pid *pid;
> > >  
> > > +       if (common->type == BPF_TASK_ITER_TID) {
> > > +               if (*tid)
> > > +                       return NULL;
> > 
> > I tested and this condition breaks it for fd iterations, not sure
> > about
> > the task and vma, because they share this function
> > 
> > if bpf_seq_read is called with small buffer there will be multiple
> > calls
> > to task_file_seq_get_next and second one will stop in here, even if
> > there
> > are more files to be displayed for the task in filter
> 
> I mean there will be multiple calls of following sequence:
> 
>   bpf_seq_read
>     task_file_seq_start
>       task_seq_get_next
> 
> and 2nd one will return NULL in task_seq_get_next,
> because info->tid is already set

Ok!  I got your point.  I will fix it ASAP.
Kui-Feng Lee July 27, 2022, 6:56 a.m. UTC | #4
On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > Allow creating an iterator that loops through resources of one
> > task/thread.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were
> > interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > ---
> >  include/linux/bpf.h            |  4 ++
> >  include/uapi/linux/bpf.h       | 23 ++++++++++
> >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-----
> > ----
> >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> >  4 files changed, 109 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 11950029284f..c8d164404e20 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > *pathname, int flags);
> >  
> >  struct bpf_iter_aux_info {
> >         struct bpf_map *map;
> > +       struct {
> > +               __u32   tid;
> 
> should be just u32 ?


Or, should change the following 'type' to __u8?

> 
> > +               u8      type;
> > +       } task;
> >  };
> >  
> 
> SNIP
> 
> >  
> >  /* BPF syscall commands, see bpf(2) man-page for more details. */
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 8c921799def4..7979aacb651e 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -12,6 +12,8 @@
> >  
> >  struct bpf_iter_seq_task_common {
> >         struct pid_namespace *ns;
> > +       u32     tid;
> > +       u8      type;
> >  };
> >  
> >  struct bpf_iter_seq_task_info {
> > @@ -22,18 +24,31 @@ struct bpf_iter_seq_task_info {
> >         u32 tid;
> >  };
> >  
> > -static struct task_struct *task_seq_get_next(struct pid_namespace
> > *ns,
> > +static struct task_struct *task_seq_get_next(struct
> > bpf_iter_seq_task_common *common,
> >                                              u32 *tid,
> >                                              bool
> > skip_if_dup_files)
> >  {
> >         struct task_struct *task = NULL;
> >         struct pid *pid;
> >  
> > +       if (common->type == BPF_TASK_ITER_TID) {
> > +               if (*tid)
> > +                       return NULL;
> 
> I tested and this condition breaks it for fd iterations, not sure
> about
> the task and vma, because they share this function
> 
> if bpf_seq_read is called with small buffer there will be multiple
> calls
> to task_file_seq_get_next and second one will stop in here, even if
> there
> are more files to be displayed for the task in filter
> 
> it'd be nice to have some test for this ;-) or perhaps compare with
> the
> not filtered output
> 
> SNIP
> 
> >  static const struct seq_operations task_seq_ops = {
> >         .start  = task_seq_start,
> >         .next   = task_seq_next,
> > @@ -137,8 +166,7 @@ struct bpf_iter_seq_task_file_info {
> >  static struct file *
> >  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
> >  {
> > -       struct pid_namespace *ns = info->common.ns;
> > -       u32 curr_tid = info->tid;
> > +       u32 saved_tid = info->tid;
> >         struct task_struct *curr_task;
> >         unsigned int curr_fd = info->fd;
> >  
> > @@ -151,21 +179,18 @@ task_file_seq_get_next(struct
> > bpf_iter_seq_task_file_info *info)
> >                 curr_task = info->task;
> >                 curr_fd = info->fd;
> >         } else {
> > -                curr_task = task_seq_get_next(ns, &curr_tid,
> > true);
> > +               curr_task = task_seq_get_next(&info->common, &info-
> > >tid, true);
> >                  if (!curr_task) {
> >                          info->task = NULL;
> > -                        info->tid = curr_tid;
> >                          return NULL;
> >                  }
> 
> nit, looks like we're missing proper indent in here

Yes, we mixed spaces and tabs.  Should I change these lines to tabs?
> 
> 
> >  
> > -                /* set info->task and info->tid */
> > +               /* set info->task */
> >                 info->task = curr_task;
> > -               if (curr_tid == info->tid) {
> > +               if (saved_tid == info->tid)
> >                         curr_fd = info->fd;
> > -               } else {
> > -                       info->tid = curr_tid;
> > +               else
> 
> SNIP
Kumar Kartikeya Dwivedi July 27, 2022, 8:19 a.m. UTC | #5
On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > Allow creating an iterator that loops through resources of one
> > > task/thread.
> > >
> > > People could only create iterators to loop through all resources of
> > > files, vma, and tasks in the system, even though they were
> > > interested
> > > in only the resources of a specific task or process.  Passing the
> > > additional parameters, people can now create an iterator to go
> > > through all resources or only the resources of a task.
> > >
> > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > ---
> > >  include/linux/bpf.h            |  4 ++
> > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-----
> > > ----
> > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index 11950029284f..c8d164404e20 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > *pathname, int flags);
> > >
> > >  struct bpf_iter_aux_info {
> > >         struct bpf_map *map;
> > > +       struct {
> > > +               __u32   tid;
> >
> > should be just u32 ?
>
> Or, should change the following 'type' to __u8?

Would it be better to use a pidfd instead of a tid here? Unset pidfd
would mean going over all tasks, and any fd > 0 implies attaching to a
specific task (as is the convention in BPF land). Most of the new
UAPIs working on processes are using pidfds (to work with a stable
handle instead of a reusable ID).
The iterator taking an fd also gives an opportunity to BPF LSMs to
attach permissions/policies to it (once we have a file local storage
map) e.g. whether creating a task iterator for that specific pidfd
instance (backed by the struct file) would be allowed or not.
You are using getpid in the selftest and keeping track of last_tgid in
the iterator, so I guess you don't even need to extend pidfd_open to
work on thread IDs right now for your use case (and fdtable and mm are
shared for POSIX threads anyway, so for those two it won't make a
difference).

What is your opinion?

>
> [...]
Kui-Feng Lee July 28, 2022, 5:25 a.m. UTC | #6
On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > Allow creating an iterator that loops through resources of one
> > > > task/thread.
> > > > 
> > > > People could only create iterators to loop through all
> > > > resources of
> > > > files, vma, and tasks in the system, even though they were
> > > > interested
> > > > in only the resources of a specific task or process.  Passing
> > > > the
> > > > additional parameters, people can now create an iterator to go
> > > > through all resources or only the resources of a task.
> > > > 
> > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > ---
> > > >  include/linux/bpf.h            |  4 ++
> > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-
> > > > ----
> > > > ----
> > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > 
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index 11950029284f..c8d164404e20 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > > *pathname, int flags);
> > > > 
> > > >  struct bpf_iter_aux_info {
> > > >         struct bpf_map *map;
> > > > +       struct {
> > > > +               __u32   tid;
> > > 
> > > should be just u32 ?
> > 
> > Or, should change the following 'type' to __u8?
> 
> Would it be better to use a pidfd instead of a tid here? Unset pidfd
> would mean going over all tasks, and any fd > 0 implies attaching to
> a
> specific task (as is the convention in BPF land). Most of the new
> UAPIs working on processes are using pidfds (to work with a stable
> handle instead of a reusable ID).
> The iterator taking an fd also gives an opportunity to BPF LSMs to
> attach permissions/policies to it (once we have a file local storage
> map) e.g. whether creating a task iterator for that specific pidfd
> instance (backed by the struct file) would be allowed or not.
> You are using getpid in the selftest and keeping track of last_tgid
> in
> the iterator, so I guess you don't even need to extend pidfd_open to
> work on thread IDs right now for your use case (and fdtable and mm
> are
> shared for POSIX threads anyway, so for those two it won't make a
> difference).
> 
> What is your opinion?

Do you mean removing both tid and type, and replacing them with a pidfd?
We can do that in the uapi, struct bpf_link_info.  But the internal types,
ex. bpf_iter_aux_info, still need to use a tid or struct file to avoid
getting the file from the per-process fdtable.  Is that what you mean?
Kumar Kartikeya Dwivedi July 28, 2022, 8:47 a.m. UTC | #7
On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > >
> > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > Allow creating an iterator that loops through resources of one
> > > > > task/thread.
> > > > >
> > > > > People could only create iterators to loop through all
> > > > > resources of
> > > > > files, vma, and tasks in the system, even though they were
> > > > > interested
> > > > > in only the resources of a specific task or process.  Passing
> > > > > the
> > > > > additional parameters, people can now create an iterator to go
> > > > > through all resources or only the resources of a task.
> > > > >
> > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > ---
> > > > >  include/linux/bpf.h            |  4 ++
> > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > >  kernel/bpf/task_iter.c         | 81 +++++++++++++++++++++++++-
> > > > > ----
> > > > > ----
> > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > index 11950029284f..c8d164404e20 100644
> > > > > --- a/include/linux/bpf.h
> > > > > +++ b/include/linux/bpf.h
> > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char __user
> > > > > *pathname, int flags);
> > > > >
> > > > >  struct bpf_iter_aux_info {
> > > > >         struct bpf_map *map;
> > > > > +       struct {
> > > > > +               __u32   tid;
> > > >
> > > > should be just u32 ?
> > >
> > > Or, should change the following 'type' to __u8?
> >
> > Would it be better to use a pidfd instead of a tid here? Unset pidfd
> > would mean going over all tasks, and any fd > 0 implies attaching to
> > a
> > specific task (as is the convention in BPF land). Most of the new
> > UAPIs working on processes are using pidfds (to work with a stable
> > handle instead of a reusable ID).
> > The iterator taking an fd also gives an opportunity to BPF LSMs to
> > attach permissions/policies to it (once we have a file local storage
> > map) e.g. whether creating a task iterator for that specific pidfd
> > instance (backed by the struct file) would be allowed or not.
> > You are using getpid in the selftest and keeping track of last_tgid
> > in
> > the iterator, so I guess you don't even need to extend pidfd_open to
> > work on thread IDs right now for your use case (and fdtable and mm
> > are
> > shared for POSIX threads anyway, so for those two it won't make a
> > difference).
> >
> > What is your opinion?
>
> Do you mean removed both tid and type, and replace them with a pidfd?
> We can do that in uapi, struct bpf_link_info.  But, the interal types,
> ex. bpf_iter_aux_info, still need to use tid or struct file to avoid
> getting file from the per-process fdtable.  Is that what you mean?
>

Yes, just for the UAPI, it is similar to taking map_fd for map iter.
In bpf_link_info we should report just the tid, just like map iter
reports map_id.
Kui-Feng Lee July 28, 2022, 3:16 p.m. UTC | #8
On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > wrote:
> > > > 
> > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > > Allow creating an iterator that loops through resources of
> > > > > > one
> > > > > > task/thread.
> > > > > > 
> > > > > > People could only create iterators to loop through all
> > > > > > resources of
> > > > > > files, vma, and tasks in the system, even though they were
> > > > > > interested
> > > > > > in only the resources of a specific task or process. 
> > > > > > Passing
> > > > > > the
> > > > > > additional parameters, people can now create an iterator to
> > > > > > go
> > > > > > through all resources or only the resources of a task.
> > > > > > 
> > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > ---
> > > > > >  include/linux/bpf.h            |  4 ++
> > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > +++++++++++++++++++++++++-
> > > > > > ----
> > > > > > ----
> > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > --- a/include/linux/bpf.h
> > > > > > +++ b/include/linux/bpf.h
> > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > __user
> > > > > > *pathname, int flags);
> > > > > > 
> > > > > >  struct bpf_iter_aux_info {
> > > > > >         struct bpf_map *map;
> > > > > > +       struct {
> > > > > > +               __u32   tid;
> > > > > 
> > > > > should be just u32 ?
> > > > 
> > > > Or, should change the following 'type' to __u8?
> > > 
> > > Would it be better to use a pidfd instead of a tid here? Unset
> > > pidfd
> > > would mean going over all tasks, and any fd > 0 implies attaching
> > > to
> > > a
> > > specific task (as is the convention in BPF land). Most of the new
> > > UAPIs working on processes are using pidfds (to work with a
> > > stable
> > > handle instead of a reusable ID).
> > > The iterator taking an fd also gives an opportunity to BPF LSMs
> > > to
> > > attach permissions/policies to it (once we have a file local
> > > storage
> > > map) e.g. whether creating a task iterator for that specific
> > > pidfd
> > > instance (backed by the struct file) would be allowed or not.
> > > You are using getpid in the selftest and keeping track of
> > > last_tgid
> > > in
> > > the iterator, so I guess you don't even need to extend pidfd_open
> > > to
> > > work on thread IDs right now for your use case (and fdtable and
> > > mm
> > > are
> > > shared for POSIX threads anyway, so for those two it won't make a
> > > difference).
> > > 
> > > What is your opinion?
> > 
> > Do you mean removed both tid and type, and replace them with a
> > pidfd?
> > We can do that in uapi, struct bpf_link_info.  But, the interal
> > types,
> > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > avoid
> > getting file from the per-process fdtable.  Is that what you mean?
> > 
> 
> Yes, just for the UAPI, it is similar to taking map_fd for map iter.
> In bpf_link_info we should report just the tid, just like map iter
> reports map_id.

It sounds good to me.

One thing I need clarified: you mentioned that fd > 0 implies attaching
to a specific task; however, fd can legitimately be 0, so the check
should be fd >= 0.  That would force the user to initialize the value of
pidfd to -1.  So, for convenience, we still need a field like 'type' to
make it easy to create iterators without a filter.
Kumar Kartikeya Dwivedi July 28, 2022, 4:22 p.m. UTC | #9
On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > >
> > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > > wrote:
> > > > >
> > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > > > Allow creating an iterator that loops through resources of
> > > > > > > one
> > > > > > > task/thread.
> > > > > > >
> > > > > > > People could only create iterators to loop through all
> > > > > > > resources of
> > > > > > > files, vma, and tasks in the system, even though they were
> > > > > > > interested
> > > > > > > in only the resources of a specific task or process.
> > > > > > > Passing
> > > > > > > the
> > > > > > > additional parameters, people can now create an iterator to
> > > > > > > go
> > > > > > > through all resources or only the resources of a task.
> > > > > > >
> > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > ---
> > > > > > >  include/linux/bpf.h            |  4 ++
> > > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > > +++++++++++++++++++++++++-
> > > > > > > ----
> > > > > > > ----
> > > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > >
> > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > --- a/include/linux/bpf.h
> > > > > > > +++ b/include/linux/bpf.h
> > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > > __user
> > > > > > > *pathname, int flags);
> > > > > > >
> > > > > > >  struct bpf_iter_aux_info {
> > > > > > >         struct bpf_map *map;
> > > > > > > +       struct {
> > > > > > > +               __u32   tid;
> > > > > >
> > > > > > should be just u32 ?
> > > > >
> > > > > Or, should change the following 'type' to __u8?
> > > >
> > > > Would it be better to use a pidfd instead of a tid here? Unset
> > > > pidfd
> > > > would mean going over all tasks, and any fd > 0 implies attaching
> > > > to
> > > > a
> > > > specific task (as is the convention in BPF land). Most of the new
> > > > UAPIs working on processes are using pidfds (to work with a
> > > > stable
> > > > handle instead of a reusable ID).
> > > > The iterator taking an fd also gives an opportunity to BPF LSMs
> > > > to
> > > > attach permissions/policies to it (once we have a file local
> > > > storage
> > > > map) e.g. whether creating a task iterator for that specific
> > > > pidfd
> > > > instance (backed by the struct file) would be allowed or not.
> > > > You are using getpid in the selftest and keeping track of
> > > > last_tgid
> > > > in
> > > > the iterator, so I guess you don't even need to extend pidfd_open
> > > > to
> > > > work on thread IDs right now for your use case (and fdtable and
> > > > mm
> > > > are
> > > > shared for POSIX threads anyway, so for those two it won't make a
> > > > difference).
> > > >
> > > > What is your opinion?
> > >
> > > Do you mean removed both tid and type, and replace them with a
> > > pidfd?
> > > We can do that in uapi, struct bpf_link_info.  But, the interal
> > > types,
> > > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > > avoid
> > > getting file from the per-process fdtable.  Is that what you mean?
> > >
> >
> > Yes, just for the UAPI, it is similar to taking map_fd for map iter.
> > In bpf_link_info we should report just the tid, just like map iter
> > reports map_id.
>
> It sounds good to me.
>
> One thing I need a clarification. You mentioned that a fd > 0 implies
> attaching to a specific task, however fd can be 0. So, it should be fd
> >= 0. So, it forces the user to initialize the value of pidfd to -1.
> So, for convenience, we still need a field like 'type' to make it easy
> to create iterators without a filter.
>

Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
is fine to rely on that assumption. For e.g. even for map_fd,
bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
the type field.

>
Kui-Feng Lee July 28, 2022, 4:40 p.m. UTC | #10
On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com>
> > > wrote:
> > > > 
> > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi
> > > > wrote:
> > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > > > wrote:
> > > > > > 
> > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee
> > > > > > > wrote:
> > > > > > > > Allow creating an iterator that loops through resources
> > > > > > > > of
> > > > > > > > one
> > > > > > > > task/thread.
> > > > > > > > 
> > > > > > > > People could only create iterators to loop through all
> > > > > > > > resources of
> > > > > > > > files, vma, and tasks in the system, even though they
> > > > > > > > were
> > > > > > > > interested
> > > > > > > > in only the resources of a specific task or process.
> > > > > > > > Passing
> > > > > > > > the
> > > > > > > > additional parameters, people can now create an
> > > > > > > > iterator to
> > > > > > > > go
> > > > > > > > through all resources or only the resources of a task.
> > > > > > > > 
> > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > > ---
> > > > > > > >  include/linux/bpf.h            |  4 ++
> > > > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > > > +++++++++++++++++++++++++-
> > > > > > > > ----
> > > > > > > > ----
> > > > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > > --- a/include/linux/bpf.h
> > > > > > > > +++ b/include/linux/bpf.h
> > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > > > __user
> > > > > > > > *pathname, int flags);
> > > > > > > > 
> > > > > > > >  struct bpf_iter_aux_info {
> > > > > > > >         struct bpf_map *map;
> > > > > > > > +       struct {
> > > > > > > > +               __u32   tid;
> > > > > > > 
> > > > > > > should be just u32 ?
> > > > > > 
> > > > > > Or, should change the following 'type' to __u8?
> > > > > 
> > > > > Would it be better to use a pidfd instead of a tid here?
> > > > > Unset
> > > > > pidfd
> > > > > would mean going over all tasks, and any fd > 0 implies
> > > > > attaching
> > > > > to
> > > > > a
> > > > > specific task (as is the convention in BPF land). Most of the
> > > > > new
> > > > > UAPIs working on processes are using pidfds (to work with a
> > > > > stable
> > > > > handle instead of a reusable ID).
> > > > > The iterator taking an fd also gives an opportunity to BPF
> > > > > LSMs
> > > > > to
> > > > > attach permissions/policies to it (once we have a file local
> > > > > storage
> > > > > map) e.g. whether creating a task iterator for that specific
> > > > > pidfd
> > > > > instance (backed by the struct file) would be allowed or not.
> > > > > You are using getpid in the selftest and keeping track of
> > > > > last_tgid
> > > > > in
> > > > > the iterator, so I guess you don't even need to extend
> > > > > pidfd_open
> > > > > to
> > > > > work on thread IDs right now for your use case (and fdtable
> > > > > and
> > > > > mm
> > > > > are
> > > > > shared for POSIX threads anyway, so for those two it won't
> > > > > make a
> > > > > difference).
> > > > > 
> > > > > What is your opinion?
> > > > 
> > > > Do you mean removing both tid and type, and replacing them with a
> > > > pidfd?
> > > > We can do that in uapi, struct bpf_link_info.  But, the internal
> > > > types,
> > > > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > > > avoid
> > > > getting file from the per-process fdtable.  Is that what you
> > > > mean?
> > > > 
> > > 
> > > Yes, just for the UAPI, it is similar to taking map_fd for map
> > > iter.
> > > In bpf_link_info we should report just the tid, just like map
> > > iter
> > > reports map_id.
> > 
> > It sounds good to me.
> > 
> > One thing I need a clarification. You mentioned that a fd > 0
> > implies
> > attaching to a specific task, however fd can be 0. So, it should be
> > fd
> > > = 0. So, it forces the user to initialize the value of pidfd to -
> > > 1.
> > So, for convenience, we still need a field like 'type' to make it
> > easy
> > to create iterators without a filter.
> > 
> 
> Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
> is fine to rely on that assumption. For e.g. even for map_fd,
> bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
> the type field.

I just realized that the pidfd may be meaningless for the bpf_link_info
returned by bpf_obj_get_info_by_fd() since the original fd might be
closed already.  So, I will always set it to a value of 0.
Yonghong Song July 28, 2022, 5:08 p.m. UTC | #11
On 7/28/22 9:40 AM, Kui-Feng Lee wrote:
> On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote:
>> On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
>>>
>>> On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
>>>> On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com>
>>>> wrote:
>>>>>
>>>>> On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi
>>>>> wrote:
>>>>>> On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
>>>>>>>> On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee
>>>>>>>> wrote:
>>>>>>>>> Allow creating an iterator that loops through resources
>>>>>>>>> of
>>>>>>>>> one
>>>>>>>>> task/thread.
>>>>>>>>>
>>>>>>>>> People could only create iterators to loop through all
>>>>>>>>> resources of
>>>>>>>>> files, vma, and tasks in the system, even though they
>>>>>>>>> were
>>>>>>>>> interested
>>>>>>>>> in only the resources of a specific task or process.
>>>>>>>>> Passing
>>>>>>>>> the
>>>>>>>>> additional parameters, people can now create an
>>>>>>>>> iterator to
>>>>>>>>> go
>>>>>>>>> through all resources or only the resources of a task.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
>>>>>>>>> ---
>>>>>>>>>   include/linux/bpf.h            |  4 ++
>>>>>>>>>   include/uapi/linux/bpf.h       | 23 ++++++++++
>>>>>>>>>   kernel/bpf/task_iter.c         | 81
>>>>>>>>> +++++++++++++++++++++++++-
>>>>>>>>> ----
>>>>>>>>> ----
>>>>>>>>>   tools/include/uapi/linux/bpf.h | 23 ++++++++++
>>>>>>>>>   4 files changed, 109 insertions(+), 22 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>>>>>>> index 11950029284f..c8d164404e20 100644
>>>>>>>>> --- a/include/linux/bpf.h
>>>>>>>>> +++ b/include/linux/bpf.h
>>>>>>>>> @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
>>>>>>>>> __user
>>>>>>>>> *pathname, int flags);
>>>>>>>>>
>>>>>>>>>   struct bpf_iter_aux_info {
>>>>>>>>>          struct bpf_map *map;
>>>>>>>>> +       struct {
>>>>>>>>> +               __u32   tid;
>>>>>>>>
>>>>>>>> should be just u32 ?
>>>>>>>
>>>>>>> Or, should change the following 'type' to __u8?
>>>>>>
>>>>>> Would it be better to use a pidfd instead of a tid here?
>>>>>> Unset
>>>>>> pidfd
>>>>>> would mean going over all tasks, and any fd > 0 implies
>>>>>> attaching
>>>>>> to
>>>>>> a
>>>>>> specific task (as is the convention in BPF land). Most of the
>>>>>> new
>>>>>> UAPIs working on processes are using pidfds (to work with a
>>>>>> stable
>>>>>> handle instead of a reusable ID).
>>>>>> The iterator taking an fd also gives an opportunity to BPF
>>>>>> LSMs
>>>>>> to
>>>>>> attach permissions/policies to it (once we have a file local
>>>>>> storage
>>>>>> map) e.g. whether creating a task iterator for that specific
>>>>>> pidfd
>>>>>> instance (backed by the struct file) would be allowed or not.
>>>>>> You are using getpid in the selftest and keeping track of
>>>>>> last_tgid
>>>>>> in
>>>>>> the iterator, so I guess you don't even need to extend
>>>>>> pidfd_open
>>>>>> to
>>>>>> work on thread IDs right now for your use case (and fdtable
>>>>>> and
>>>>>> mm
>>>>>> are
>>>>>> shared for POSIX threads anyway, so for those two it won't
>>>>>> make a
>>>>>> difference).

There is one problem here. The current pidfd_open syscall
only supports the thread-group leader, i.e., the main thread, i.e.,
it won't support any non-main-thread tids. Yes, the thread-group
leader and other threads should share the same vma and files
most of the time, but it is still possible that different threads
in the same process may have different files, which is why
in the current task_iter.c we have:
                 *tid = pid_nr_ns(pid, ns);
                 task = get_pid_task(pid, PIDTYPE_PID);
                 if (!task) {
                         ++*tid;
                         goto retry;
                 } else if (skip_if_dup_files && 
!thread_group_leader(task) &&
                            task->files == task->group_leader->files) {
                         put_task_struct(task);
                         task = NULL;
                         ++*tid;
                         goto retry;
                 }


Each thread (tid) will have some fields different from
thread-group leader (tgid), e.g., comm and most (if not all)
scheduling related fields.

So it would be good to support for each tid from the start
as it is not clear when pidfd_open will support non
thread-group leader.

If wrap-around is a concern, a reference to the task
can be held when the tid is passed to the kernel at link
creation time. This is similar to how the pid is passed to
the kernel in the pidfd_open syscall. But in practice,
I think taking the reference during read() should
also be fine. The race always exists anyway.

Kumar, could you give more details about security
concerns? I am not sure about the tight relationship
between bpf_iter and security. bpf_iter just for
iterating kernel data structures.

>>>>>>
>>>>>> What is your opinion?
>>>>>
>>>>> Do you mean removing both tid and type, and replacing them with a
>>>>> pidfd?
>>>>> We can do that in uapi, struct bpf_link_info.  But, the internal
>>>>> types,
>>>>> ex. bpf_iter_aux_info, still need to use tid or struct file to
>>>>> avoid
>>>>> getting file from the per-process fdtable.  Is that what you
>>>>> mean?
>>>>>
>>>>
>>>> Yes, just for the UAPI, it is similar to taking map_fd for map
>>>> iter.
>>>> In bpf_link_info we should report just the tid, just like map
>>>> iter
>>>> reports map_id.
>>>
>>> It sounds good to me.
>>>
>>> One thing I need a clarification. You mentioned that a fd > 0
>>> implies
>>> attaching to a specific task, however fd can be 0. So, it should be
>>> fd
>>>> = 0. So, it forces the user to initialize the value of pidfd to -
>>>> 1.
>>> So, for convenience, we still need a field like 'type' to make it
>>> easy
>>> to create iterators without a filter.
>>>
>>
>> Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
>> is fine to rely on that assumption. For e.g. even for map_fd,
>> bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
>> the type field.
> 
> I just realized that the pidfd may be meaningless for the bpf_link_info
> returned by bpf_obj_get_info_by_fd() since the original fd might be
> closed already.  So, I will always set it to a value of 0.
>
Kumar Kartikeya Dwivedi July 28, 2022, 5:52 p.m. UTC | #12
On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > [...]
>
> There is one problem here. The current pidfd_open syscall
> only supports thread-group leader, i.e., main thread, i.e.,
> it won't support any non-main-thread tid's. Yes, thread-group
> leader and other threads should share the same vma and files
> in most of times, but it still possible different threads
> in the same process may have different files which is why
> in current task_iter.c we have:
>                  *tid = pid_nr_ns(pid, ns);
>                  task = get_pid_task(pid, PIDTYPE_PID);
>                  if (!task) {
>                          ++*tid;
>                          goto retry;
>                  } else if (skip_if_dup_files &&
> !thread_group_leader(task) &&
>                             task->files == task->group_leader->files) {
>                          put_task_struct(task);
>                          task = NULL;
>                          ++*tid;
>                          goto retry;
>                  }
>
>
> Each thread (tid) will have some fields different from
> thread-group leader (tgid), e.g., comm and most (if not all)
> scheduling related fields.
>
> So it would be good to support for each tid from the start
> as it is not clear when pidfd_open will support non
> thread-group leader.

I think this is just a missing feature, not a design limitation. If we
just disable thread pidfds from waitid and pidfd_send_signal, I think
it is ok to use it elsewhere.

>
> If it worries wrap around, a reference to the task
> can be held when tid passed to the kernel at link
> create time. This is similar to pid is passed to
> the kernel at pidfd_open syscall. But in practice,
> I think taking the reference during read() should
> also fine. The race always exist anyway.
>
> Kumar, could you give more details about security
> concerns? I am not sure about the tight relationship
> between bpf_iter and security. bpf_iter just for
> iterating kernel data structures.
>

There is no security 'concern' per se. But having an fd which is used
to set up the iterator just gives a chance to a BPF LSM to easily
isolate permissions to attach the iterator to a task represented by
that fd. i.e. the fd is an object to which this permission can be
bound, the fd can be passed around (to share the same permission with
others but it is limited to the task corresponding to it), etc. The
permission management is even easier and faster once you have a file
local storage map (which I plan to send soon).

So you could have two pidfds, one which allows the process to attach
the iter to it, and one which doesn't, without relying on the task's
capabilities, the checks can become more fine grained, and the
original task can even drop its capabilities (because they are bound
to the fd instead), which allows privilege separation.

It becomes a bit difficult when kernel APIs take IDs because then you
don't have any subject (other than the task) to draw the permissions
from.

But anyway, all of this was just a suggestion (which is why I
solicited opinions in the original reply). Feel free to drop/ignore if
it is too much of a hassle to support (i.e. it is understandable you
wouldn't want to spend time extending pidfd_open for this).
Hao Luo July 28, 2022, 6:01 p.m. UTC | #13
On Thu, Jul 28, 2022 at 9:24 AM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com> wrote:
> >
> > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > > >
> > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi wrote:
> > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee <kuifeng@fb.com>
> > > > > wrote:
> > > > > >
> > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng Lee wrote:
> > > > > > > > Allow creating an iterator that loops through resources of
> > > > > > > > one
> > > > > > > > task/thread.
> > > > > > > >
> > > > > > > > People could only create iterators to loop through all
> > > > > > > > resources of
> > > > > > > > files, vma, and tasks in the system, even though they were
> > > > > > > > interested
> > > > > > > > in only the resources of a specific task or process.
> > > > > > > > Passing
> > > > > > > > the
> > > > > > > > additional parameters, people can now create an iterator to
> > > > > > > > go
> > > > > > > > through all resources or only the resources of a task.
> > > > > > > >
> > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > > ---
> > > > > > > >  include/linux/bpf.h            |  4 ++
> > > > > > > >  include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > > >  kernel/bpf/task_iter.c         | 81
> > > > > > > > +++++++++++++++++++++++++-
> > > > > > > > ----
> > > > > > > > ----
> > > > > > > >  tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > > >  4 files changed, 109 insertions(+), 22 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > > --- a/include/linux/bpf.h
> > > > > > > > +++ b/include/linux/bpf.h
> > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const char
> > > > > > > > __user
> > > > > > > > *pathname, int flags);
> > > > > > > >
> > > > > > > >  struct bpf_iter_aux_info {
> > > > > > > >         struct bpf_map *map;
> > > > > > > > +       struct {
> > > > > > > > +               __u32   tid;
> > > > > > >
> > > > > > > should be just u32 ?
> > > > > >
> > > > > > Or, should change the following 'type' to __u8?
> > > > >
> > > > > Would it be better to use a pidfd instead of a tid here? Unset
> > > > > pidfd
> > > > > would mean going over all tasks, and any fd > 0 implies attaching
> > > > > to
> > > > > a
> > > > > specific task (as is the convention in BPF land). Most of the new
> > > > > UAPIs working on processes are using pidfds (to work with a
> > > > > stable
> > > > > handle instead of a reusable ID).
> > > > > The iterator taking an fd also gives an opportunity to BPF LSMs
> > > > > to
> > > > > attach permissions/policies to it (once we have a file local
> > > > > storage
> > > > > map) e.g. whether creating a task iterator for that specific
> > > > > pidfd
> > > > > instance (backed by the struct file) would be allowed or not.
> > > > > You are using getpid in the selftest and keeping track of
> > > > > last_tgid
> > > > > in
> > > > > the iterator, so I guess you don't even need to extend pidfd_open
> > > > > to
> > > > > work on thread IDs right now for your use case (and fdtable and
> > > > > mm
> > > > > are
> > > > > shared for POSIX threads anyway, so for those two it won't make a
> > > > > difference).
> > > > >
> > > > > What is your opinion?
> > > >
> > > > Do you mean removing both tid and type, and replacing them with a
> > > > pidfd?
> > > > We can do that in uapi, struct bpf_link_info.  But, the internal
> > > > types,
> > > > ex. bpf_iter_aux_info, still need to use tid or struct file to
> > > > avoid
> > > > getting file from the per-process fdtable.  Is that what you mean?
> > > >
> > >
> > > Yes, just for the UAPI, it is similar to taking map_fd for map iter.
> > > In bpf_link_info we should report just the tid, just like map iter
> > > reports map_id.
> >
> > It sounds good to me.
> >
> > One thing I need a clarification. You mentioned that a fd > 0 implies
> > attaching to a specific task, however fd can be 0. So, it should be fd
> > >= 0. So, it forces the user to initialize the value of pidfd to -1.
> > So, for convenience, we still need a field like 'type' to make it easy
> > to create iterators without a filter.
> >
>
> Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so it
> is fine to rely on that assumption. For e.g. even for map_fd,
> bpf_map_elem iterator considers fd 0 to be unset. Then you don't need
> the type field.
>

FD 0 is STDIN, unless someone explicitly close(0). IMO using fd 0 for
default behavior is fine.

Hao

> >
Hao Luo July 28, 2022, 7:11 p.m. UTC | #14
Hi Kumar,

On Thu, Jul 28, 2022 at 10:53 AM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > > [...]
> >
> > There is one problem here. The current pidfd_open syscall
> > only supports thread-group leader, i.e., main thread, i.e.,
> > it won't support any non-main-thread tid's. Yes, thread-group
> > leader and other threads should share the same vma and files
> > in most of times, but it still possible different threads
> > in the same process may have different files which is why
> > in current task_iter.c we have:
> >                  *tid = pid_nr_ns(pid, ns);
> >                  task = get_pid_task(pid, PIDTYPE_PID);
> >                  if (!task) {
> >                          ++*tid;
> >                          goto retry;
> >                  } else if (skip_if_dup_files &&
> > !thread_group_leader(task) &&
> >                             task->files == task->group_leader->files) {
> >                          put_task_struct(task);
> >                          task = NULL;
> >                          ++*tid;
> >                          goto retry;
> >                  }
> >
> >
> > Each thread (tid) will have some fields different from
> > thread-group leader (tgid), e.g., comm and most (if not all)
> > scheduling related fields.
> >
> > So it would be good to support for each tid from the start
> > as it is not clear when pidfd_open will support non
> > thread-group leader.
>
> I think this is just a missing feature, not a design limitation. If we
> just disable thread pidfds from waitid and pidfd_send_signal, I think
> it is ok to use it elsewhere.
>
> >
> > If it worries wrap around, a reference to the task
> > can be held when tid passed to the kernel at link
> > create time. This is similar to pid is passed to
> > the kernel at pidfd_open syscall. But in practice,
> > I think taking the reference during read() should
> > also fine. The race always exist anyway.
> >
> > Kumar, could you give more details about security
> > concerns? I am not sure about the tight relationship
> > between bpf_iter and security. bpf_iter just for
> > iterating kernel data structures.
> >
>
> There is no security 'concern' per se. But having an fd which is used
> to set up the iterator just gives a chance to a BPF LSM to easily
> isolate permissions to attach the iterator to a task represented by
> that fd. i.e. the fd is an object to which this permission can be
> bound, the fd can be passed around (to share the same permission with
> others but it is limited to the task corresponding to it), etc. The
> permission management is even easier and faster once you have a file
> local storage map (which I plan to send soon).
>

I like the idea of a file local storage map. I have several questions
in mind, but don't want to digress from the discussion under
Kui-Feng's patchset. It probably will be clear when seeing your
change. Please cc me when you send it out. thanks.

> So you could have two pidfds, one which allows the process to attach
> the iter to it, and one which doesn't, without relying on the task's
> capabilities, the checks can become more fine grained, and the
> original task can even drop its capabilities (because they are bound
> to the fd instead), which allows privilege separation.
>
> It becomes a bit difficult when kernel APIs take IDs because then you
> don't have any subject (other than the task) to draw the permissions
> from.
>
> But anyway, all of this was just a suggestion (which is why I
> solicited opinions in the original reply). Feel free to drop/ignore if
> it is too much of a hassle to support (i.e. it is understandable you
> wouldn't want to spend time extending pidfd_open for this).

On another thread, I was having a discussion with Tejun on FD vs ID
for cgroup_iter. I am in favor of ID in general, because it's
stateless and matches the info reported by bpf_link_info. This is nice
from the userspace's perspective.

Hao
Yonghong Song July 28, 2022, 10:54 p.m. UTC | #15
On 7/28/22 10:52 AM, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
>>> [...]
>>
>> There is one problem here. The current pidfd_open syscall
>> only supports thread-group leader, i.e., main thread, i.e.,
>> it won't support any non-main-thread tid's. Yes, thread-group
>> leader and other threads should share the same vma and files
>> in most of times, but it still possible different threads
>> in the same process may have different files which is why
>> in current task_iter.c we have:
>>                   *tid = pid_nr_ns(pid, ns);
>>                   task = get_pid_task(pid, PIDTYPE_PID);
>>                   if (!task) {
>>                           ++*tid;
>>                           goto retry;
>>                   } else if (skip_if_dup_files &&
>> !thread_group_leader(task) &&
>>                              task->files == task->group_leader->files) {
>>                           put_task_struct(task);
>>                           task = NULL;
>>                           ++*tid;
>>                           goto retry;
>>                   }
>>
>>
>> Each thread (tid) will have some fields different from
>> thread-group leader (tgid), e.g., comm and most (if not all)
>> scheduling related fields.
>>
>> So it would be good to support for each tid from the start
>> as it is not clear when pidfd_open will support non
>> thread-group leader.
> 
> I think this is just a missing feature, not a design limitation. If we
> just disable thread pidfds from waitid and pidfd_send_signal, I think
> it is ok to use it elsewhere.

Yes, I agree this is a missing feature. It is desirable
that the missing feature gets implemented in kernel or
at least promising work is recognized before we use pidfd
for this task structure.

> 
>>
>> If it worries wrap around, a reference to the task
>> can be held when tid passed to the kernel at link
>> create time. This is similar to pid is passed to
>> the kernel at pidfd_open syscall. But in practice,
>> I think taking the reference during read() should
>> also fine. The race always exist anyway.
>>
>> Kumar, could you give more details about security
>> concerns? I am not sure about the tight relationship
>> between bpf_iter and security. bpf_iter just for
>> iterating kernel data structures.
>>
> 
> There is no security 'concern' per se. But having an fd which is used
> to set up the iterator just gives a chance to a BPF LSM to easily
> isolate permissions to attach the iterator to a task represented by
> that fd. i.e. the fd is an object to which this permission can be
> bound, the fd can be passed around (to share the same permission with
> others but it is limited to the task corresponding to it), etc. The
> permission management is even easier and faster once you have a file
> local storage map (which I plan to send soon).

So you mean with fd, bpf_lsm could add a hook in bpf iterator
to enforce policies about whether a particular task can be visited
or not, right? In such cases, I think the policy should be
against a task pointer and a bpf pointer, which have enough information
for the policy to work.

In typical use cases, user space gets a pid (the process own pid or
another process id). Yes, we could create a pidfd with pidfd_open().
And this is pidfd can be manipulated with permissions, and passed
to the bpf iter. The current pidfd creation looks like below:

int pidfd_create(struct pid *pid, unsigned int flags)
{
         int fd;

         if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
                 return -EINVAL;

         if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
                 return -EINVAL;

         fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
                               flags | O_RDWR | O_CLOEXEC);
         if (fd < 0)
                 put_pid(pid);

         return fd;
}

I am not sure how a security policy can be easily applied to such an
fd. Probably users need to further manipulate the fd with fcntl(), and I think
most users probably won't do that.

The following are some bpf program lsm hooks:

#ifdef CONFIG_BPF_SYSCALL
LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map)
LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map)
LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux)
LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux 
*aux)
#endif /* CONFIG_BPF_SYSCALL */

I think task_struct ptr and prog ptr should be enough for the
potential LSM hook.

> 
> So you could have two pidfds, one which allows the process to attach
> the iter to it, and one which doesn't, without relying on the task's
> capabilities, the checks can become more fine grained, and the
> original task can even drop its capabilities (because they are bound
> to the fd instead), which allows privilege separation.
> 
> It becomes a bit difficult when kernel APIs take IDs because then you
> don't have any subject (other than the task) to draw the permissions
> from.

Maybe I missed something from security perspective. But from design
perspective, I am okay with pidfd since it pushes the responsibility
of pid -> task_struct conversion to user space. Although unlikely,
it is still possible that, inside the kernel, the tid->task_struct
conversion may actually use a wrapped-around tid...

But since pidfd_open misses tid support, I hesitate to use pidfd
for now.

Maybe Daniel or Alexei can comment as well?

> 
> But anyway, all of this was just a suggestion (which is why I
> solicited opinions in the original reply). Feel free to drop/ignore if
> it is too much of a hassle to support (i.e. it is understandable you
> wouldn't want to spend time extending pidfd_open for this).
Christian Brauner July 29, 2022, 9:10 a.m. UTC | #16
On Thu, Jul 28, 2022 at 03:54:01PM -0700, Yonghong Song wrote:
> 
> 
> On 7/28/22 10:52 AM, Kumar Kartikeya Dwivedi wrote:
> > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > > > [...]
> > > 
> > > There is one problem here. The current pidfd_open syscall
> > > only supports thread-group leader, i.e., main thread, i.e.,
> > > it won't support any non-main-thread tid's. Yes, thread-group
> > > leader and other threads should share the same vma and files
> > > in most of times, but it still possible different threads
> > > in the same process may have different files which is why
> > > in current task_iter.c we have:
> > >                   *tid = pid_nr_ns(pid, ns);
> > >                   task = get_pid_task(pid, PIDTYPE_PID);
> > >                   if (!task) {
> > >                           ++*tid;
> > >                           goto retry;
> > >                   } else if (skip_if_dup_files &&
> > > !thread_group_leader(task) &&
> > >                              task->files == task->group_leader->files) {
> > >                           put_task_struct(task);
> > >                           task = NULL;
> > >                           ++*tid;
> > >                           goto retry;
> > >                   }
> > > 
> > > 
> > > Each thread (tid) will have some fields different from
> > > thread-group leader (tgid), e.g., comm and most (if not all)
> > > scheduling related fields.
> > > 
> > > So it would be good to support for each tid from the start
> > > as it is not clear when pidfd_open will support non
> > > thread-group leader.
> > 
> > I think this is just a missing feature, not a design limitation. If we
> > just disable thread pidfd from waitid and pidfd_send_signal, I think
> > it is ok to use it elsewhere.
> 
> Yes, I agree this is a missing feature. It is desirable
> that the missing feature gets implemented in kernel or
> at least promising work is recognized before we use pidfd
> for this task structure.

When I did pidfd_{open,getfd,send_signal}, CLONE_PIDFD, and the waitid
support we simply didn't enable support for pidfd to refer to individual
threads because there was no immediate use-case for it and we hade some
concerns that I can't remember off the top of my head. Plus, we were
quite happy that we got the pidfd stuff in so we limited the scope of
what we wanted to do in the first iteration.

But if there is a good use-case for this then by all means we should
enable pidfds to refer to individual threads and I'm happy to route that
upstream. But this needs to be solid work as that area of the kernel can
be interesting technically and otherwise...

There have been requests for this before already.

I've added a wrapper pidfd_get_task() a while ago that encodes the
PIDTYPE_TGID restriction. If you grep for it you should find all places
that rely on pidfds (process_mrelease() and whatnot). All these places
need to be able to deal with individual threads if you change that.

But note, the mw is coming up.
Kui-Feng Lee July 30, 2022, 2:46 a.m. UTC | #17
On Thu, 2022-07-28 at 19:00 +0200, Kumar Kartikeya Dwivedi wrote:
> On Thu, 28 Jul 2022 at 18:40, Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > On Thu, 2022-07-28 at 18:22 +0200, Kumar Kartikeya Dwivedi wrote:
> > > On Thu, 28 Jul 2022 at 17:16, Kui-Feng Lee <kuifeng@fb.com>
> > > wrote:
> > > > 
> > > > On Thu, 2022-07-28 at 10:47 +0200, Kumar Kartikeya Dwivedi
> > > > wrote:
> > > > > On Thu, 28 Jul 2022 at 07:25, Kui-Feng Lee <kuifeng@fb.com>
> > > > > wrote:
> > > > > > 
> > > > > > On Wed, 2022-07-27 at 10:19 +0200, Kumar Kartikeya Dwivedi
> > > > > > wrote:
> > > > > > > On Wed, 27 Jul 2022 at 09:01, Kui-Feng Lee
> > > > > > > <kuifeng@fb.com>
> > > > > > > wrote:
> > > > > > > > 
> > > > > > > > On Tue, 2022-07-26 at 14:13 +0200, Jiri Olsa wrote:
> > > > > > > > > On Mon, Jul 25, 2022 at 10:17:11PM -0700, Kui-Feng
> > > > > > > > > Lee
> > > > > > > > > wrote:
> > > > > > > > > > Allow creating an iterator that loops through
> > > > > > > > > > resources
> > > > > > > > > > of
> > > > > > > > > > one
> > > > > > > > > > task/thread.
> > > > > > > > > > 
> > > > > > > > > > People could only create iterators to loop through
> > > > > > > > > > all
> > > > > > > > > > resources of
> > > > > > > > > > files, vma, and tasks in the system, even though
> > > > > > > > > > they
> > > > > > > > > > were
> > > > > > > > > > interested
> > > > > > > > > > in only the resources of a specific task or
> > > > > > > > > > process.
> > > > > > > > > > Passing
> > > > > > > > > > the
> > > > > > > > > > additional parameters, people can now create an
> > > > > > > > > > iterator to
> > > > > > > > > > go
> > > > > > > > > > through all resources or only the resources of a
> > > > > > > > > > task.
> > > > > > > > > > 
> > > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > > > > > > > ---
> > > > > > > > > >   include/linux/bpf.h            |  4 ++
> > > > > > > > > >   include/uapi/linux/bpf.h       | 23 ++++++++++
> > > > > > > > > >   kernel/bpf/task_iter.c         | 81
> > > > > > > > > > +++++++++++++++++++++++++-
> > > > > > > > > > ----
> > > > > > > > > > ----
> > > > > > > > > >   tools/include/uapi/linux/bpf.h | 23 ++++++++++
> > > > > > > > > >   4 files changed, 109 insertions(+), 22
> > > > > > > > > > deletions(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/include/linux/bpf.h
> > > > > > > > > > b/include/linux/bpf.h
> > > > > > > > > > index 11950029284f..c8d164404e20 100644
> > > > > > > > > > --- a/include/linux/bpf.h
> > > > > > > > > > +++ b/include/linux/bpf.h
> > > > > > > > > > @@ -1718,6 +1718,10 @@ int bpf_obj_get_user(const
> > > > > > > > > > char
> > > > > > > > > > __user
> > > > > > > > > > *pathname, int flags);
> > > > > > > > > > 
> > > > > > > > > >   struct bpf_iter_aux_info {
> > > > > > > > > >          struct bpf_map *map;
> > > > > > > > > > +       struct {
> > > > > > > > > > +               __u32   tid;
> > > > > > > > > 
> > > > > > > > > should be just u32 ?
> > > > > > > > 
> > > > > > > > Or, should change the following 'type' to __u8?
> > > > > > > 
> > > > > > > Would it be better to use a pidfd instead of a tid here?
> > > > > > > Unset
> > > > > > > pidfd
> > > > > > > would mean going over all tasks, and any fd > 0 implies
> > > > > > > attaching
> > > > > > > to
> > > > > > > a
> > > > > > > specific task (as is the convention in BPF land). Most of
> > > > > > > the
> > > > > > > new
> > > > > > > UAPIs working on processes are using pidfds (to work with
> > > > > > > a
> > > > > > > stable
> > > > > > > handle instead of a reusable ID).
> > > > > > > The iterator taking an fd also gives an opportunity to
> > > > > > > BPF
> > > > > > > LSMs
> > > > > > > to
> > > > > > > attach permissions/policies to it (once we have a file
> > > > > > > local
> > > > > > > storage
> > > > > > > map) e.g. whether creating a task iterator for that
> > > > > > > specific
> > > > > > > pidfd
> > > > > > > instance (backed by the struct file) would be allowed or
> > > > > > > not.
> > > > > > > You are using getpid in the selftest and keeping track of
> > > > > > > last_tgid
> > > > > > > in
> > > > > > > the iterator, so I guess you don't even need to extend
> > > > > > > pidfd_open
> > > > > > > to
> > > > > > > work on thread IDs right now for your use case (and
> > > > > > > fdtable
> > > > > > > and
> > > > > > > mm
> > > > > > > are
> > > > > > > shared for POSIX threads anyway, so for those two it
> > > > > > > won't
> > > > > > > make a
> > > > > > > difference).
> > > > > > > 
> > > > > > > What is your opinion?
> > > > > > 
> > > > > > Do you mean removed both tid and type, and replace them
> > > > > > with a
> > > > > > pidfd?
> > > > > > We can do that in uapi, struct bpf_link_info.  But, the
> > > > > > interal
> > > > > > types,
> > > > > > ex. bpf_iter_aux_info, still need to use tid or struct file
> > > > > > to
> > > > > > avoid
> > > > > > getting file from the per-process fdtable.  Is that what
> > > > > > you
> > > > > > mean?
> > > > > > 
> > > > > 
> > > > > Yes, just for the UAPI, it is similar to taking map_fd for
> > > > > map
> > > > > iter.
> > > > > In bpf_link_info we should report just the tid, just like map
> > > > > iter
> > > > > reports map_id.
> > > > 
> > > > It sounds good to me.
> > > > 
> > > > One thing I need a clarification. You mentioned that a fd > 0
> > > > implies
> > > > attaching to a specific task, however fd can be 0. So, it
> > > > should be
> > > > fd
> > > > > = 0. So, it forces the user to initialize the value of pidfd
> > > > > to -
> > > > > 1.
> > > > So, for convenience, we still need a field like 'type' to make
> > > > it
> > > > easy
> > > > to create iterators without a filter.
> > > > 
> > > 
> > > Right, but in lots of BPF UAPI fields, fd 0 means fd is unset, so
> > > it
> > > is fine to rely on that assumption. For e.g. even for map_fd,
> > > bpf_map_elem iterator considers fd 0 to be unset. Then you don't
> > > need
> > > the type field.
> > 
> > I just realize that pidfd may be meaningless for the bpf_link_info
> > returned by bpf_obj_get_info_by_fd() since the origin fd might be
> > closed already.  So, I will always set it a value of 0.
> > 
> 
> For bpf_link_info, we should only be returning the tid of the task it
> is attached to, you cannot report the pidfd in bpf_link_info
> correctly (as you already realised). By default this would be 0,
> which is also an invalid tid, but when pidfd is set it will be the
> tid of the task it is attached to, so it works well.


We have had a lot of discussions around using tid or pidfd.
Kumar also mentioned removing 'type'.
However, I have a feeling that we need to keep 'type' in struct
bpf_link_info.  I can imagine that we may want to create iterators of
tasks in a cgroup, or with other parameters, in the future.  'type' will
help us to tell the type of a parameter.
Kumar Kartikeya Dwivedi Aug. 2, 2022, 2:25 p.m. UTC | #18
On Thu, 28 Jul 2022 at 21:11, Hao Luo <haoluo@google.com> wrote:
>
> Hi Kumar,
>
> On Thu, Jul 28, 2022 at 10:53 AM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > On Thu, 28 Jul 2022 at 19:08, Yonghong Song <yhs@fb.com> wrote:
> > > > [...]
> > >
> > > There is one problem here. The current pidfd_open syscall
> > > only supports thread-group leader, i.e., main thread, i.e.,
> > > it won't support any non-main-thread tid's. Yes, thread-group
> > > leader and other threads should share the same vma and files
> > > in most of times, but it still possible different threads
> > > in the same process may have different files which is why
> > > in current task_iter.c we have:
> > >                  *tid = pid_nr_ns(pid, ns);
> > >                  task = get_pid_task(pid, PIDTYPE_PID);
> > >                  if (!task) {
> > >                          ++*tid;
> > >                          goto retry;
> > >                  } else if (skip_if_dup_files &&
> > > !thread_group_leader(task) &&
> > >                             task->files == task->group_leader->files) {
> > >                          put_task_struct(task);
> > >                          task = NULL;
> > >                          ++*tid;
> > >                          goto retry;
> > >                  }
> > >
> > >
> > > Each thread (tid) will have some fields different from
> > > thread-group leader (tgid), e.g., comm and most (if not all)
> > > scheduling related fields.
> > >
> > > So it would be good to support for each tid from the start
> > > as it is not clear when pidfd_open will support non
> > > thread-group leader.
> >
> > I think this is just a missing feature, not a design limitation. If we
> > just disable thread pidfd from waitid and pidfd_send_signal, I think
> > it is ok to use it elsewhere.
> >
> > >
> > > If it worries wrap around, a reference to the task
> > > can be held when tid passed to the kernel at link
> > > create time. This is similar to pid is passed to
> > > the kernel at pidfd_open syscall. But in practice,
> > > I think taking the reference during read() should
> > > also fine. The race always exist anyway.
> > >
> > > Kumar, could you give more details about security
> > > concerns? I am not sure about the tight relationship
> > > between bpf_iter and security. bpf_iter just for
> > > iterating kernel data structures.
> > >
> >
> > There is no security 'concern' per se. But having an fd which is used
> > to set up the iterator just gives a chance to a BPF LSM to easily
> > isolate permissions to attach the iterator to a task represented by
> > that fd. i.e. the fd is an object to which this permission can be
> > bound, the fd can be passed around (to share the same permission with
> > others but it is limited to the task corresponding to it), etc. The
> > permission management is even easier and faster once you have a file
> > local storage map (which I plan to send soon).
> >
>
> I like the idea of a file local storage map. I have several questions
> in mind, but don't want to digress from the discussion under
> Kui-Feng's patchset. It probably will be clear when seeing your
> change. Please cc me when you send it out. thanks.
>

Thanks for the interest! I'll cc you when I send it out.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 11950029284f..c8d164404e20 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1718,6 +1718,10 @@  int bpf_obj_get_user(const char __user *pathname, int flags);
 
 struct bpf_iter_aux_info {
 	struct bpf_map *map;
+	struct {
+		__u32	tid;
+		u8	type;
+	} task;
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ffcbf79a556b..991141cacb9e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -87,10 +87,33 @@  struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_task_iter_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+	/*
+	 * Parameters of task iterators.
+	 */
+	struct {
+		__u32	tid;
+		/*
+		 * The type of the iterator.
+		 *
+		 * It can be one of enum bpf_task_iter_type.
+		 *
+		 * BPF_TASK_ITER_ALL (default)
+	 *	The iterator iterates over resources of every process.
+		 *
+		 * BPF_TASK_ITER_TID
+		 *	You should also set *tid* to iterate over one task.
+		 */
+		__u8	type;	/* BPF_TASK_ITER_* */
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..7979aacb651e 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -12,6 +12,8 @@ 
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
+	u32	tid;
+	u8	type;
 };
 
 struct bpf_iter_seq_task_info {
@@ -22,18 +24,31 @@  struct bpf_iter_seq_task_info {
 	u32 tid;
 };
 
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
 					     u32 *tid,
 					     bool skip_if_dup_files)
 {
 	struct task_struct *task = NULL;
 	struct pid *pid;
 
+	if (common->type == BPF_TASK_ITER_TID) {
+		if (*tid)
+			return NULL;
+		rcu_read_lock();
+		pid = find_pid_ns(common->tid, common->ns);
+		if (pid) {
+			task = get_pid_task(pid, PIDTYPE_PID);
+			*tid = common->tid;
+		}
+		rcu_read_unlock();
+		return task;
+	}
+
 	rcu_read_lock();
 retry:
-	pid = find_ge_pid(*tid, ns);
+	pid = find_ge_pid(*tid, common->ns);
 	if (pid) {
-		*tid = pid_nr_ns(pid, ns);
+		*tid = pid_nr_ns(pid, common->ns);
 		task = get_pid_task(pid, PIDTYPE_PID);
 		if (!task) {
 			++*tid;
@@ -56,7 +71,8 @@  static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_task_info *info = seq->private;
 	struct task_struct *task;
 
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+	task = task_seq_get_next(&info->common, &info->tid, false);
+
 	if (!task)
 		return NULL;
 
@@ -73,7 +89,8 @@  static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	++*pos;
 	++info->tid;
 	put_task_struct((struct task_struct *)v);
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+
+	task = task_seq_get_next(&info->common, &info->tid, false);
 	if (!task)
 		return NULL;
 
@@ -117,6 +134,18 @@  static void task_seq_stop(struct seq_file *seq, void *v)
 		put_task_struct((struct task_struct *)v);
 }
 
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+				union bpf_iter_link_info *linfo,
+				struct bpf_iter_aux_info *aux)
+{
+	if (linfo->task.type == BPF_TASK_ITER_ALL && linfo->task.tid)
+		return -EINVAL;
+
+	aux->task.type = linfo->task.type;
+	aux->task.tid = linfo->task.tid;
+	return 0;
+}
+
 static const struct seq_operations task_seq_ops = {
 	.start	= task_seq_start,
 	.next	= task_seq_next,
@@ -137,8 +166,7 @@  struct bpf_iter_seq_task_file_info {
 static struct file *
 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 	struct task_struct *curr_task;
 	unsigned int curr_fd = info->fd;
 
@@ -151,21 +179,18 @@  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 		curr_task = info->task;
 		curr_fd = info->fd;
 	} else {
-                curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
                 if (!curr_task) {
                         info->task = NULL;
-                        info->tid = curr_tid;
                         return NULL;
                 }
 
-                /* set info->task and info->tid */
+		/* set info->task */
 		info->task = curr_task;
-		if (curr_tid == info->tid) {
+		if (saved_tid == info->tid)
 			curr_fd = info->fd;
-		} else {
-			info->tid = curr_tid;
+		else
 			curr_fd = 0;
-		}
 	}
 
 	rcu_read_lock();
@@ -186,9 +211,15 @@  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 	/* the current task is done, go to the next task */
 	rcu_read_unlock();
 	put_task_struct(curr_task);
+
+	if (info->common.type == BPF_TASK_ITER_TID) {
+		info->task = NULL;
+		return NULL;
+	}
+
 	info->task = NULL;
 	info->fd = 0;
-	curr_tid = ++(info->tid);
+	saved_tid = ++(info->tid);
 	goto again;
 }
 
@@ -269,6 +300,8 @@  static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
 	struct bpf_iter_seq_task_common *common = priv_data;
 
 	common->ns = get_pid_ns(task_active_pid_ns(current));
+	common->type = aux->task.type;
+	common->tid = aux->task.tid;
 	return 0;
 }
 
@@ -307,11 +340,10 @@  enum bpf_task_vma_iter_find_op {
 static struct vm_area_struct *
 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
 	enum bpf_task_vma_iter_find_op op;
 	struct vm_area_struct *curr_vma;
 	struct task_struct *curr_task;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 
 	/* If this function returns a non-NULL vma, it holds a reference to
 	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +403,13 @@  task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 		}
 	} else {
 again:
-		curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
 		if (!curr_task) {
-			info->tid = curr_tid + 1;
+			info->tid++;
 			goto finish;
 		}
 
-		if (curr_tid != info->tid) {
-			info->tid = curr_tid;
+		if (saved_tid != info->tid) {
 			/* new task, process the first vma */
 			op = task_vma_iter_first_vma;
 		} else {
@@ -430,9 +461,12 @@  task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 	return curr_vma;
 
 next_task:
+	if (info->common.type == BPF_TASK_ITER_TID)
+		goto finish;
+
 	put_task_struct(curr_task);
 	info->task = NULL;
-	curr_tid++;
+	info->tid++;
 	goto again;
 
 finish:
@@ -533,6 +567,7 @@  static const struct bpf_iter_seq_info task_seq_info = {
 
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
@@ -551,6 +586,7 @@  static const struct bpf_iter_seq_info task_file_seq_info = {
 
 static struct bpf_iter_reg task_file_reg_info = {
 	.target			= "task_file",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
@@ -571,6 +607,7 @@  static const struct bpf_iter_seq_info task_vma_seq_info = {
 
 static struct bpf_iter_reg task_vma_reg_info = {
 	.target			= "task_vma",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ffcbf79a556b..991141cacb9e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -87,10 +87,33 @@  struct bpf_cgroup_storage_key {
 	__u32	attach_type;		/* program attach type (enum bpf_attach_type) */
 };
 
+enum bpf_task_iter_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+};
+
 union bpf_iter_link_info {
 	struct {
 		__u32	map_fd;
 	} map;
+	/*
+	 * Parameters of task iterators.
+	 */
+	struct {
+		__u32	tid;
+		/*
+		 * The type of the iterator.
+		 *
+		 * It can be one of enum bpf_task_iter_type.
+		 *
+		 * BPF_TASK_ITER_ALL (default)
+	 *	The iterator iterates over resources of every process.
+		 *
+		 * BPF_TASK_ITER_TID
+		 *	You should also set *tid* to iterate over one task.
+		 */
+		__u8	type;	/* BPF_TASK_ITER_* */
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */