[bpf-next,v8,1/5] bpf: Parameterize task iterators.

Message ID 20220829192317.486946-2-kuifeng@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Series: Parameterize task iterators.

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1697 this patch: 1697
netdev/cc_maintainers warning 8 maintainers not CCed: john.fastabend@gmail.com jolsa@kernel.org song@kernel.org brauner@kernel.org haoluo@google.com martin.lau@linux.dev kpsingh@kernel.org sdf@google.com
netdev/build_clang fail Errors and warnings before: 173 this patch: 176
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1691 this patch: 1691
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-PR fail merge-conflict

Commit Message

Kui-Feng Lee Aug. 29, 2022, 7:23 p.m. UTC
Allow creating an iterator that loops through resources of one
thread/process.

Previously, people could only create iterators that loop through all
resources of files, vma, and tasks in the system, even when they were
interested only in the resources of a specific task or process.  By
passing the additional parameters, people can now create an iterator
that goes through all resources or only the resources of one task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h            |  25 +++++
 include/uapi/linux/bpf.h       |   6 ++
 kernel/bpf/task_iter.c         | 184 +++++++++++++++++++++++++++++----
 tools/include/uapi/linux/bpf.h |   6 ++
 4 files changed, 199 insertions(+), 22 deletions(-)
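
For illustration only (not part of this patch): a minimal userspace
sketch of how the new parameters might be used, assuming libbpf's
bpf_program__attach_iter() and the "task" fields that this series adds
to union bpf_iter_link_info.  Exactly one of task.tid, task.pid, or
task.pid_fd should be set; bpf_iter_attach_task() below rejects
combinations.

#include <string.h>
#include <linux/bpf.h>
#include <bpf/libbpf.h>

/* Attach an iter/task (or iter/task_file, iter/task_vma) program so
 * that it visits only the threads of a single process (tgid).
 */
static struct bpf_link *attach_task_iter(struct bpf_program *prog, __u32 tgid)
{
	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	union bpf_iter_link_info linfo;

	memset(&linfo, 0, sizeof(linfo));
	linfo.task.pid = tgid;	/* or .tid / .pid_fd, but only one of them */

	opts.link_info = &linfo;
	opts.link_info_len = sizeof(linfo);

	return bpf_program__attach_iter(prog, &opts);
}

Leaving all three fields zero keeps the default BPF_TASK_ITER_ALL
behavior (iterate over every task in the system).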

Comments

Yonghong Song Aug. 30, 2022, 11:54 p.m. UTC | #1
On 8/29/22 12:23 PM, Kui-Feng Lee wrote:
> Allow creating an iterator that loops through resources of one
> thread/process.
> 
> People could only create iterators to loop through all resources of
> files, vma, and tasks in the system, even though they were interested
> in only the resources of a specific task or process.  Passing the
> additional parameters, people can now create an iterator to go
> through all resources or only the resources of a task.
> 
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> Acked-by: Yonghong Song <yhs@fb.com>
> ---
>   include/linux/bpf.h            |  25 +++++
>   include/uapi/linux/bpf.h       |   6 ++
>   kernel/bpf/task_iter.c         | 184 +++++++++++++++++++++++++++++----
>   tools/include/uapi/linux/bpf.h |   6 ++
>   4 files changed, 199 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 9c1674973e03..31ac2c1181f5 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
>   	extern int bpf_iter_ ## target(args);			\
>   	int __init bpf_iter_ ## target(args) { return 0; }
>   
> +/*
> + * The task type of iterators.
> + *
> + * For BPF task iterators, they can be parameterized with various
> + * parameters to visit only some of tasks.
> + *
> + * BPF_TASK_ITER_ALL (default)
> + *	Iterate over resources of every task.
> + *
> + * BPF_TASK_ITER_TID
> + *	Iterate over resources of a task/tid.
> + *
> + * BPF_TASK_ITER_TGID
> + *	Iterate over resources of every task of a process / task group.
> + */
> +enum bpf_iter_task_type {
> +	BPF_TASK_ITER_ALL = 0,
> +	BPF_TASK_ITER_TID,
> +	BPF_TASK_ITER_TGID,
> +};
> +
>   struct bpf_iter_aux_info {
>   	/* for map_elem iter */
>   	struct bpf_map *map;
> @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info {
>   		struct cgroup *start; /* starting cgroup */
>   		enum bpf_cgroup_iter_order order;
>   	} cgroup;
> +	struct {
> +		enum bpf_iter_task_type	type;
> +		u32 pid;
> +	} task;
>   };
>   
>   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 962960a98835..f212a19eda06 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -110,6 +110,12 @@ union bpf_iter_link_info {
>   		__u32	cgroup_fd;
>   		__u64	cgroup_id;
>   	} cgroup;
> +	/* Parameters of task iterators. */
> +	struct {
> +		__u32	tid;
> +		__u32	pid;
> +		__u32	pid_fd;
> +	} task;
>   };
>   
>   /* BPF syscall commands, see bpf(2) man-page for more details. */
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 8c921799def4..0bc7277d1ee1 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -12,6 +12,9 @@
>   
>   struct bpf_iter_seq_task_common {
>   	struct pid_namespace *ns;
> +	enum bpf_iter_task_type	type;
> +	u32 pid;
> +	u32 pid_visiting;
>   };
>   
>   struct bpf_iter_seq_task_info {
> @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info {
>   	u32 tid;
>   };
>   
> -static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
> +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
> +						   u32 *tid,
> +						   bool skip_if_dup_files)
> +{
> +	struct task_struct *task, *next_task;
> +	struct pid *pid;
> +	u32 saved_tid;
> +
> +	if (!*tid) {

Add a comment in the above to say that this is for the *very first* 
visit of tasks in the process.

> +		pid = find_pid_ns(common->pid, common->ns);
> +		if (pid)
> +			task = get_pid_task(pid, PIDTYPE_TGID);

'task' is not initialized, so it is possible task could hold a
garbage value here if !pid, right?

Also, if task is indeed NULL here, should we return NULL
first?

> +
> +		*tid = common->pid;
> +		common->pid_visiting = common->pid;
> +
> +		return task;
> +	}
> +
> +	/* The callers increase *tid by 1 once they want next task.
> +	 * However, next_thread() doesn't return tasks in incremental
> +	 * order of pids. We can not find next task by just finding a
> +	 * task whose pid is greater or equal to *tid.  pid_visiting
> +	 * remembers the pid value of the task returned last time. By
> +	 * comparing pid_visiting and *tid, we known if the caller
> +	 * wants the next task.
> +	 */
> +	if (*tid == common->pid_visiting) {
> +		pid = find_pid_ns(common->pid_visiting, common->ns);
> +		task = get_pid_task(pid, PIDTYPE_PID);
> +
> +		return task;
> +	}

I do not understand the above code. Why do we need it? It looks like
the code below tries to get the *next_task* and will return NULL
if a wrap-around happens (the tid again equals the tgid), right?

> +
> +retry:
> +	pid = find_pid_ns(common->pid_visiting, common->ns);
> +	if (!pid)
> +		return NULL;
> +
> +	task = get_pid_task(pid, PIDTYPE_PID);
> +	if (!task)
> +		return NULL;
> +
> +	next_task = next_thread(task);
> +	put_task_struct(task);
> +	if (!next_task)
> +		return NULL;
> +
> +	saved_tid = *tid;
> +	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
> +	if (*tid == common->pid) {
> +		/* Run out of tasks of a process.  The tasks of a
> +		 * thread_group are linked as circular linked list.
> +		 */
> +		*tid = saved_tid;
> +		return NULL;
> +	}
> +
> +	get_task_struct(next_task);
> +	common->pid_visiting = *tid;

We could do quite a bit of redundant work here if the following
condition is true. Basically, we get next_task and its tid and then
release it, but in the next iteration, from that tid, we try to get
the task again.

> +
> +	if (skip_if_dup_files && task->files == task->group_leader->files)
> +		goto retry;
> +
> +	return next_task;
> +}
> +
> +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
>   					     u32 *tid,
>   					     bool skip_if_dup_files)
>   {
>   	struct task_struct *task = NULL;
>   	struct pid *pid;
>   
> +	if (common->type == BPF_TASK_ITER_TID) {
> +		if (*tid && *tid != common->pid)
> +			return NULL;
> +		rcu_read_lock();
> +		pid = find_pid_ns(common->pid, common->ns);
> +		if (pid) {
> +			task = get_pid_task(pid, PIDTYPE_TGID);
> +			*tid = common->pid;
> +		}
> +		rcu_read_unlock();
> +
> +		return task;
> +	}
> +
> +	if (common->type == BPF_TASK_ITER_TGID) {
> +		rcu_read_lock();
> +		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
> +		rcu_read_unlock();
> +
> +		return task;
> +	}
> +
>   	rcu_read_lock();
>   retry:
> -	pid = find_ge_pid(*tid, ns);
> +	pid = find_ge_pid(*tid, common->ns);
>   	if (pid) {
> -		*tid = pid_nr_ns(pid, ns);
> +		*tid = pid_nr_ns(pid, common->ns);
>   		task = get_pid_task(pid, PIDTYPE_PID);
>   		if (!task) {
>   			++*tid;
> @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
>   	struct bpf_iter_seq_task_info *info = seq->private;
>   	struct task_struct *task;
>   
> -	task = task_seq_get_next(info->common.ns, &info->tid, false);
> +	task = task_seq_get_next(&info->common, &info->tid, false);
>   	if (!task)
>   		return NULL;
>   
> @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>   	++*pos;
>   	++info->tid;
>   	put_task_struct((struct task_struct *)v);
> -	task = task_seq_get_next(info->common.ns, &info->tid, false);
> +	task = task_seq_get_next(&info->common, &info->tid, false);
>   	if (!task)
>   		return NULL;
>   
> @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file *seq, void *v)
>   		put_task_struct((struct task_struct *)v);
>   }
>   
[...]
Kui-Feng Lee Aug. 31, 2022, 12:35 a.m. UTC | #2
On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote:
> 
> 
> On 8/29/22 12:23 PM, Kui-Feng Lee wrote:
> > Allow creating an iterator that loops through resources of one
> > thread/process.
> > 
> > People could only create iterators to loop through all resources of
> > files, vma, and tasks in the system, even though they were
> > interested
> > in only the resources of a specific task or process.  Passing the
> > additional parameters, people can now create an iterator to go
> > through all resources or only the resources of a task.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > Acked-by: Yonghong Song <yhs@fb.com>
> > ---
> >   include/linux/bpf.h            |  25 +++++
> >   include/uapi/linux/bpf.h       |   6 ++
> >   kernel/bpf/task_iter.c         | 184
> > +++++++++++++++++++++++++++++----
> >   tools/include/uapi/linux/bpf.h |   6 ++
> >   4 files changed, 199 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 9c1674973e03..31ac2c1181f5 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user
> > *pathname, int flags);
> >         extern int bpf_iter_ ## target(args);                   \
> >         int __init bpf_iter_ ## target(args) { return 0; }
> >   
> > +/*
> > + * The task type of iterators.
> > + *
> > + * For BPF task iterators, they can be parameterized with various
> > + * parameters to visit only some of tasks.
> > + *
> > + * BPF_TASK_ITER_ALL (default)
> > + *     Iterate over resources of every task.
> > + *
> > + * BPF_TASK_ITER_TID
> > + *     Iterate over resources of a task/tid.
> > + *
> > + * BPF_TASK_ITER_TGID
> > + *     Iterate over resources of every task of a process / task
> > group.
> > + */
> > +enum bpf_iter_task_type {
> > +       BPF_TASK_ITER_ALL = 0,
> > +       BPF_TASK_ITER_TID,
> > +       BPF_TASK_ITER_TGID,
> > +};
> > +
> >   struct bpf_iter_aux_info {
> >         /* for map_elem iter */
> >         struct bpf_map *map;
> > @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info {
> >                 struct cgroup *start; /* starting cgroup */
> >                 enum bpf_cgroup_iter_order order;
> >         } cgroup;
> > +       struct {
> > +               enum bpf_iter_task_type type;
> > +               u32 pid;
> > +       } task;
> >   };
> >   
> >   typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 962960a98835..f212a19eda06 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -110,6 +110,12 @@ union bpf_iter_link_info {
> >                 __u32   cgroup_fd;
> >                 __u64   cgroup_id;
> >         } cgroup;
> > +       /* Parameters of task iterators. */
> > +       struct {
> > +               __u32   tid;
> > +               __u32   pid;
> > +               __u32   pid_fd;
> > +       } task;
> >   };
> >   
> >   /* BPF syscall commands, see bpf(2) man-page for more details. */
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index 8c921799def4..0bc7277d1ee1 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -12,6 +12,9 @@
> >   
> >   struct bpf_iter_seq_task_common {
> >         struct pid_namespace *ns;
> > +       enum bpf_iter_task_type type;
> > +       u32 pid;
> > +       u32 pid_visiting;
> >   };
> >   
> >   struct bpf_iter_seq_task_info {
> > @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info {
> >         u32 tid;
> >   };
> >   
> > -static struct task_struct *task_seq_get_next(struct pid_namespace
> > *ns,
> > +static struct task_struct *task_group_seq_get_next(struct
> > bpf_iter_seq_task_common *common,
> > +                                                  u32 *tid,
> > +                                                  bool
> > skip_if_dup_files)
> > +{
> > +       struct task_struct *task, *next_task;
> > +       struct pid *pid;
> > +       u32 saved_tid;
> > +
> > +       if (!*tid) {
> 
> Add a comment in the above to say that this is for the *very first* 
> visit of tasks in the process.
> 
> > +               pid = find_pid_ns(common->pid, common->ns);
> > +               if (pid)
> > +                       task = get_pid_task(pid, PIDTYPE_TGID);
> 
> 'task' is not initialized, so it is possible task could hold a
> garbase value here if !pid, right?
> 
> Also if indeed task is NULL, here, should we return NULL here
> first?

yes, it should return earlier.

> 
> > +
> > +               *tid = common->pid;
> > +               common->pid_visiting = common->pid;
> > +
> > +               return task;
> > +       }
> > +
> > +       /* The callers increase *tid by 1 once they want next task.
> > +        * However, next_thread() doesn't return tasks in
> > incremental
> > +        * order of pids. We can not find next task by just finding
> > a
> > +        * task whose pid is greater or equal to *tid. 
> > pid_visiting
> > +        * remembers the pid value of the task returned last time.
> > By
> > +        * comparing pid_visiting and *tid, we known if the caller
> > +        * wants the next task.
> > +        */
> > +       if (*tid == common->pid_visiting) {
> > +               pid = find_pid_ns(common->pid_visiting, common-
> > >ns);
> > +               task = get_pid_task(pid, PIDTYPE_PID);
> > +
> > +               return task;
> > +       }
> 
> Do not understand the above code. Why we need it? Looks like
> the code below trying to get the *next_task* and will return NULL
> if wrap around happens(the tid again equals tgid), right?

The above code handles the case where the caller wants to visit the
same task again.  For example, task_file_seq_get_next() will call this
function several times to get the same task, and moves to the next task
by increasing info->tid.  The above code checks the value of *tid and
returns the same task if the value hasn't changed.

> 
> > +
> > +retry:
> > +       pid = find_pid_ns(common->pid_visiting, common->ns);
> > +       if (!pid)
> > +               return NULL;
> > +
> > +       task = get_pid_task(pid, PIDTYPE_PID);
> > +       if (!task)
> > +               return NULL;
> > +
> > +       next_task = next_thread(task);
> > +       put_task_struct(task);
> > +       if (!next_task)
> > +               return NULL;
> > +
> > +       saved_tid = *tid;
> > +       *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common-
> > >ns);
> > +       if (*tid == common->pid) {
> > +               /* Run out of tasks of a process.  The tasks of a
> > +                * thread_group are linked as circular linked list.
> > +                */
> > +               *tid = saved_tid;
> > +               return NULL;
> > +       }
> > +
> > +       get_task_struct(next_task);
> > +       common->pid_visiting = *tid;
> 
> We could do quite some redundant works here if the following
> condition is true. Basically, we get next_task and get a tid
> and release it, but in the next iteration, from tid, we try to get
> the task again.

Yes, I will move the 'retry' label and assign next_task to task to
avoid the redundant work.

> 
> > +
> > +       if (skip_if_dup_files && task->files == task->group_leader-
> > >files)
> > +               goto retry;
> > +
> > +       return next_task;
> > +}
> > +
> > +static struct task_struct *task_seq_get_next(struct
> > bpf_iter_seq_task_common *common,
> >                                              u32 *tid,
> >                                              bool
> > skip_if_dup_files)
> >   {
> >         struct task_struct *task = NULL;
> >         struct pid *pid;
> >   
> > +       if (common->type == BPF_TASK_ITER_TID) {
> > +               if (*tid && *tid != common->pid)
> > +                       return NULL;
> > +               rcu_read_lock();
> > +               pid = find_pid_ns(common->pid, common->ns);
> > +               if (pid) {
> > +                       task = get_pid_task(pid, PIDTYPE_TGID);
> > +                       *tid = common->pid;
> > +               }
> > +               rcu_read_unlock();
> > +
> > +               return task;
> > +       }
> > +
> > +       if (common->type == BPF_TASK_ITER_TGID) {
> > +               rcu_read_lock();
> > +               task = task_group_seq_get_next(common, tid,
> > skip_if_dup_files);
> > +               rcu_read_unlock();
> > +
> > +               return task;
> > +       }
> > +
> >         rcu_read_lock();
> >   retry:
> > -       pid = find_ge_pid(*tid, ns);
> > +       pid = find_ge_pid(*tid, common->ns);
> >         if (pid) {
> > -               *tid = pid_nr_ns(pid, ns);
> > +               *tid = pid_nr_ns(pid, common->ns);
> >                 task = get_pid_task(pid, PIDTYPE_PID);
> >                 if (!task) {
> >                         ++*tid;
> > @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file
> > *seq, loff_t *pos)
> >         struct bpf_iter_seq_task_info *info = seq->private;
> >         struct task_struct *task;
> >   
> > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > false);
> > +       task = task_seq_get_next(&info->common, &info->tid, false);
> >         if (!task)
> >                 return NULL;
> >   
> > @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq,
> > void *v, loff_t *pos)
> >         ++*pos;
> >         ++info->tid;
> >         put_task_struct((struct task_struct *)v);
> > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > false);
> > +       task = task_seq_get_next(&info->common, &info->tid, false);
> >         if (!task)
> >                 return NULL;
> >   
> > @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file
> > *seq, void *v)
> >                 put_task_struct((struct task_struct *)v);
> >   }
> >   
> [...]
Yonghong Song Aug. 31, 2022, 2:37 a.m. UTC | #3
On 8/30/22 5:35 PM, Kui-Feng Lee wrote:
> On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote:
>>
>>
>> On 8/29/22 12:23 PM, Kui-Feng Lee wrote:
>>> Allow creating an iterator that loops through resources of one
>>> thread/process.
>>>
>>> People could only create iterators to loop through all resources of
>>> files, vma, and tasks in the system, even though they were
>>> interested
>>> in only the resources of a specific task or process.  Passing the
>>> additional parameters, people can now create an iterator to go
>>> through all resources or only the resources of a task.
>>>
>>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
>>> Acked-by: Yonghong Song <yhs@fb.com>
>>> ---
>>>    include/linux/bpf.h            |  25 +++++
>>>    include/uapi/linux/bpf.h       |   6 ++
>>>    kernel/bpf/task_iter.c         | 184
>>> +++++++++++++++++++++++++++++----
>>>    tools/include/uapi/linux/bpf.h |   6 ++
>>>    4 files changed, 199 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 9c1674973e03..31ac2c1181f5 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user
>>> *pathname, int flags);
>>>          extern int bpf_iter_ ## target(args);                   \
>>>          int __init bpf_iter_ ## target(args) { return 0; }
>>>    
>>> +/*
>>> + * The task type of iterators.
>>> + *
>>> + * For BPF task iterators, they can be parameterized with various
>>> + * parameters to visit only some of tasks.
>>> + *
>>> + * BPF_TASK_ITER_ALL (default)
>>> + *     Iterate over resources of every task.
>>> + *
>>> + * BPF_TASK_ITER_TID
>>> + *     Iterate over resources of a task/tid.
>>> + *
>>> + * BPF_TASK_ITER_TGID
>>> + *     Iterate over resources of every task of a process / task
>>> group.
>>> + */
>>> +enum bpf_iter_task_type {
>>> +       BPF_TASK_ITER_ALL = 0,
>>> +       BPF_TASK_ITER_TID,
>>> +       BPF_TASK_ITER_TGID,
>>> +};
>>> +
>>>    struct bpf_iter_aux_info {
>>>          /* for map_elem iter */
>>>          struct bpf_map *map;
>>> @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info {
>>>                  struct cgroup *start; /* starting cgroup */
>>>                  enum bpf_cgroup_iter_order order;
>>>          } cgroup;
>>> +       struct {
>>> +               enum bpf_iter_task_type type;
>>> +               u32 pid;
>>> +       } task;
>>>    };
>>>    
>>>    typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 962960a98835..f212a19eda06 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -110,6 +110,12 @@ union bpf_iter_link_info {
>>>                  __u32   cgroup_fd;
>>>                  __u64   cgroup_id;
>>>          } cgroup;
>>> +       /* Parameters of task iterators. */
>>> +       struct {
>>> +               __u32   tid;
>>> +               __u32   pid;
>>> +               __u32   pid_fd;
>>> +       } task;
>>>    };
>>>    
>>>    /* BPF syscall commands, see bpf(2) man-page for more details. */
>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>>> index 8c921799def4..0bc7277d1ee1 100644
>>> --- a/kernel/bpf/task_iter.c
>>> +++ b/kernel/bpf/task_iter.c
>>> @@ -12,6 +12,9 @@
>>>    
>>>    struct bpf_iter_seq_task_common {
>>>          struct pid_namespace *ns;
>>> +       enum bpf_iter_task_type type;
>>> +       u32 pid;
>>> +       u32 pid_visiting;
>>>    };
>>>    
>>>    struct bpf_iter_seq_task_info {
>>> @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info {
>>>          u32 tid;
>>>    };
>>>    
>>> -static struct task_struct *task_seq_get_next(struct pid_namespace
>>> *ns,
>>> +static struct task_struct *task_group_seq_get_next(struct
>>> bpf_iter_seq_task_common *common,
>>> +                                                  u32 *tid,
>>> +                                                  bool
>>> skip_if_dup_files)
>>> +{
>>> +       struct task_struct *task, *next_task;
>>> +       struct pid *pid;
>>> +       u32 saved_tid;
>>> +
>>> +       if (!*tid) {
>>
>> Add a comment in the above to say that this is for the *very first*
>> visit of tasks in the process.
>>
>>> +               pid = find_pid_ns(common->pid, common->ns);
>>> +               if (pid)
>>> +                       task = get_pid_task(pid, PIDTYPE_TGID);
>>
>> 'task' is not initialized, so it is possible task could hold a
>> garbase value here if !pid, right?
>>
>> Also if indeed task is NULL, here, should we return NULL here
>> first?
> 
> yes, it should return earlier.
> 
>>
>>> +
>>> +               *tid = common->pid;
>>> +               common->pid_visiting = common->pid;
>>> +
>>> +               return task;
>>> +       }
>>> +
>>> +       /* The callers increase *tid by 1 once they want next task.
>>> +        * However, next_thread() doesn't return tasks in
>>> incremental
>>> +        * order of pids. We can not find next task by just finding
>>> a
>>> +        * task whose pid is greater or equal to *tid.
>>> pid_visiting
>>> +        * remembers the pid value of the task returned last time.
>>> By
>>> +        * comparing pid_visiting and *tid, we known if the caller
>>> +        * wants the next task.
>>> +        */
>>> +       if (*tid == common->pid_visiting) {
>>> +               pid = find_pid_ns(common->pid_visiting, common-
>>>> ns);
>>> +               task = get_pid_task(pid, PIDTYPE_PID);
>>> +
>>> +               return task;
>>> +       }
>>
>> Do not understand the above code. Why we need it? Looks like
>> the code below trying to get the *next_task* and will return NULL
>> if wrap around happens(the tid again equals tgid), right?
> 
> The above code is to handle the case that the caller want to visit the
> same task again.  For example, task_file_seq_get_next() will call this
> function several time to return the same task, and move to next task by
> increasing info->tid.  The above code checks the value of *tid to
> return the same task if the value doesn't change.

Could you explain when task_file_seq_get_next() will call this function
several times? IIUC, from the following code,

+static struct task_struct *task_seq_get_next(struct 
bpf_iter_seq_task_common *common,
  					     u32 *tid,
  					     bool skip_if_dup_files)
  {
  	struct task_struct *task = NULL;
  	struct pid *pid;

+	if (common->type == BPF_TASK_ITER_TID) {
+		if (*tid && *tid != common->pid)
+			return NULL;
+		rcu_read_lock();
+		pid = find_pid_ns(common->pid, common->ns);
+		if (pid) {
+			task = get_pid_task(pid, PIDTYPE_TGID);
+			*tid = common->pid;
+		}
+		rcu_read_unlock();
+
+		return task;
+	}
+
+	if (common->type == BPF_TASK_ITER_TGID) {
+		rcu_read_lock();
+		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+		rcu_read_unlock();
+
+		return task;
+	}
+

task_group_seq_get_next() is only called once per task_seq_get_next()
for BPF_TASK_ITER_TGID.
Maybe I missed something?



> 
>>
>>> +
>>> +retry:
>>> +       pid = find_pid_ns(common->pid_visiting, common->ns);
>>> +       if (!pid)
>>> +               return NULL;
>>> +
>>> +       task = get_pid_task(pid, PIDTYPE_PID);
>>> +       if (!task)
>>> +               return NULL;
>>> +
>>> +       next_task = next_thread(task);
>>> +       put_task_struct(task);
>>> +       if (!next_task)
>>> +               return NULL;
>>> +
>>> +       saved_tid = *tid;
>>> +       *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common-
>>>> ns);
>>> +       if (*tid == common->pid) {
>>> +               /* Run out of tasks of a process.  The tasks of a
>>> +                * thread_group are linked as circular linked list.
>>> +                */
>>> +               *tid = saved_tid;
>>> +               return NULL;
>>> +       }
>>> +
>>> +       get_task_struct(next_task);
>>> +       common->pid_visiting = *tid;
>>
>> We could do quite some redundant works here if the following
>> condition is true. Basically, we get next_task and get a tid
>> and release it, but in the next iteration, from tid, we try to get
>> the task again.
> 
> Yes, I will move 'retry' and move next_task to task to avoid the
> redundant work.
> 
>>
>>> +
>>> +       if (skip_if_dup_files && task->files == task->group_leader-
>>>> files)
>>> +               goto retry;
>>> +
>>> +       return next_task;
>>> +}
>>> +
>>> +static struct task_struct *task_seq_get_next(struct
>>> bpf_iter_seq_task_common *common,
>>>                                               u32 *tid,
>>>                                               bool
>>> skip_if_dup_files)
>>>    {
>>>          struct task_struct *task = NULL;
>>>          struct pid *pid;
>>>    
>>> +       if (common->type == BPF_TASK_ITER_TID) {
>>> +               if (*tid && *tid != common->pid)
>>> +                       return NULL;
>>> +               rcu_read_lock();
>>> +               pid = find_pid_ns(common->pid, common->ns);
>>> +               if (pid) {
>>> +                       task = get_pid_task(pid, PIDTYPE_TGID);
>>> +                       *tid = common->pid;
>>> +               }
>>> +               rcu_read_unlock();
>>> +
>>> +               return task;
>>> +       }
>>> +
>>> +       if (common->type == BPF_TASK_ITER_TGID) {
>>> +               rcu_read_lock();
>>> +               task = task_group_seq_get_next(common, tid,
>>> skip_if_dup_files);
>>> +               rcu_read_unlock();
>>> +
>>> +               return task;
>>> +       }
>>> +
>>>          rcu_read_lock();
>>>    retry:
>>> -       pid = find_ge_pid(*tid, ns);
>>> +       pid = find_ge_pid(*tid, common->ns);
>>>          if (pid) {
>>> -               *tid = pid_nr_ns(pid, ns);
>>> +               *tid = pid_nr_ns(pid, common->ns);
>>>                  task = get_pid_task(pid, PIDTYPE_PID);
>>>                  if (!task) {
>>>                          ++*tid;
>>> @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file
>>> *seq, loff_t *pos)
>>>          struct bpf_iter_seq_task_info *info = seq->private;
>>>          struct task_struct *task;
>>>    
>>> -       task = task_seq_get_next(info->common.ns, &info->tid,
>>> false);
>>> +       task = task_seq_get_next(&info->common, &info->tid, false);
>>>          if (!task)
>>>                  return NULL;
>>>    
>>> @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq,
>>> void *v, loff_t *pos)
>>>          ++*pos;
>>>          ++info->tid;
>>>          put_task_struct((struct task_struct *)v);
>>> -       task = task_seq_get_next(info->common.ns, &info->tid,
>>> false);
>>> +       task = task_seq_get_next(&info->common, &info->tid, false);
>>>          if (!task)
>>>                  return NULL;
>>>    
>>> @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file
>>> *seq, void *v)
>>>                  put_task_struct((struct task_struct *)v);
>>>    }
>>>    
>> [...]
>
Yonghong Song Aug. 31, 2022, 3:59 a.m. UTC | #4
On 8/30/22 7:37 PM, Yonghong Song wrote:
> 
> 
> On 8/30/22 5:35 PM, Kui-Feng Lee wrote:
>> On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote:
>>>
>>>
>>> On 8/29/22 12:23 PM, Kui-Feng Lee wrote:
>>>> Allow creating an iterator that loops through resources of one
>>>> thread/process.
>>>>
>>>> People could only create iterators to loop through all resources of
>>>> files, vma, and tasks in the system, even though they were
>>>> interested
>>>> in only the resources of a specific task or process.  Passing the
>>>> additional parameters, people can now create an iterator to go
>>>> through all resources or only the resources of a task.
>>>>
>>>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
>>>> Acked-by: Yonghong Song <yhs@fb.com>
>>>> ---
>>>>    include/linux/bpf.h            |  25 +++++
>>>>    include/uapi/linux/bpf.h       |   6 ++
>>>>    kernel/bpf/task_iter.c         | 184
>>>> +++++++++++++++++++++++++++++----
>>>>    tools/include/uapi/linux/bpf.h |   6 ++
>>>>    4 files changed, 199 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>>> index 9c1674973e03..31ac2c1181f5 100644
>>>> --- a/include/linux/bpf.h
>>>> +++ b/include/linux/bpf.h
>>>> @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user
>>>> *pathname, int flags);
>>>>          extern int bpf_iter_ ## target(args);                   \
>>>>          int __init bpf_iter_ ## target(args) { return 0; }
>>>> +/*
>>>> + * The task type of iterators.
>>>> + *
>>>> + * For BPF task iterators, they can be parameterized with various
>>>> + * parameters to visit only some of tasks.
>>>> + *
>>>> + * BPF_TASK_ITER_ALL (default)
>>>> + *     Iterate over resources of every task.
>>>> + *
>>>> + * BPF_TASK_ITER_TID
>>>> + *     Iterate over resources of a task/tid.
>>>> + *
>>>> + * BPF_TASK_ITER_TGID
>>>> + *     Iterate over resources of every task of a process / task
>>>> group.
>>>> + */
>>>> +enum bpf_iter_task_type {
>>>> +       BPF_TASK_ITER_ALL = 0,
>>>> +       BPF_TASK_ITER_TID,
>>>> +       BPF_TASK_ITER_TGID,
>>>> +};
>>>> +
>>>>    struct bpf_iter_aux_info {
>>>>          /* for map_elem iter */
>>>>          struct bpf_map *map;
>>>> @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info {
>>>>                  struct cgroup *start; /* starting cgroup */
>>>>                  enum bpf_cgroup_iter_order order;
>>>>          } cgroup;
>>>> +       struct {
>>>> +               enum bpf_iter_task_type type;
>>>> +               u32 pid;
>>>> +       } task;
>>>>    };
>>>>    typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index 962960a98835..f212a19eda06 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -110,6 +110,12 @@ union bpf_iter_link_info {
>>>>                  __u32   cgroup_fd;
>>>>                  __u64   cgroup_id;
>>>>          } cgroup;
>>>> +       /* Parameters of task iterators. */
>>>> +       struct {
>>>> +               __u32   tid;
>>>> +               __u32   pid;
>>>> +               __u32   pid_fd;
>>>> +       } task;
>>>>    };
>>>>    /* BPF syscall commands, see bpf(2) man-page for more details. */
>>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
>>>> index 8c921799def4..0bc7277d1ee1 100644
>>>> --- a/kernel/bpf/task_iter.c
>>>> +++ b/kernel/bpf/task_iter.c
>>>> @@ -12,6 +12,9 @@
>>>>    struct bpf_iter_seq_task_common {
>>>>          struct pid_namespace *ns;
>>>> +       enum bpf_iter_task_type type;
>>>> +       u32 pid;
>>>> +       u32 pid_visiting;
>>>>    };
>>>>    struct bpf_iter_seq_task_info {
>>>> @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info {
>>>>          u32 tid;
>>>>    };
>>>> -static struct task_struct *task_seq_get_next(struct pid_namespace
>>>> *ns,
>>>> +static struct task_struct *task_group_seq_get_next(struct
>>>> bpf_iter_seq_task_common *common,
>>>> +                                                  u32 *tid,
>>>> +                                                  bool
>>>> skip_if_dup_files)
>>>> +{
>>>> +       struct task_struct *task, *next_task;
>>>> +       struct pid *pid;
>>>> +       u32 saved_tid;
>>>> +
>>>> +       if (!*tid) {
>>>
>>> Add a comment in the above to say that this is for the *very first*
>>> visit of tasks in the process.
>>>
>>>> +               pid = find_pid_ns(common->pid, common->ns);
>>>> +               if (pid)
>>>> +                       task = get_pid_task(pid, PIDTYPE_TGID);
>>>
>>> 'task' is not initialized, so it is possible task could hold a
>>> garbase value here if !pid, right?
>>>
>>> Also if indeed task is NULL, here, should we return NULL here
>>> first?
>>
>> yes, it should return earlier.
>>
>>>
>>>> +
>>>> +               *tid = common->pid;
>>>> +               common->pid_visiting = common->pid;
>>>> +
>>>> +               return task;
>>>> +       }
>>>> +
>>>> +       /* The callers increase *tid by 1 once they want next task.
>>>> +        * However, next_thread() doesn't return tasks in
>>>> incremental
>>>> +        * order of pids. We can not find next task by just finding
>>>> a
>>>> +        * task whose pid is greater or equal to *tid.
>>>> pid_visiting
>>>> +        * remembers the pid value of the task returned last time.
>>>> By
>>>> +        * comparing pid_visiting and *tid, we known if the caller
>>>> +        * wants the next task.
>>>> +        */
>>>> +       if (*tid == common->pid_visiting) {
>>>> +               pid = find_pid_ns(common->pid_visiting, common-
>>>>> ns);
>>>> +               task = get_pid_task(pid, PIDTYPE_PID);
>>>> +
>>>> +               return task;
>>>> +       }
>>>
>>> Do not understand the above code. Why we need it? Looks like
>>> the code below trying to get the *next_task* and will return NULL
>>> if wrap around happens(the tid again equals tgid), right?
>>
>> The above code is to handle the case that the caller want to visit the
>> same task again.  For example, task_file_seq_get_next() will call this
>> function several time to return the same task, and move to next task by
>> increasing info->tid.  The above code checks the value of *tid to
>> return the same task if the value doesn't change.

Okay, I did a little more investigation. The above code is correct.
But the comment does not clearly describe why. The real
reason we need this is for subsequent iterations from task_seq_start().
Basically we have

    task_seq_next();
    // return to user space
    task_seq_start()
      task_group_seq_get_next()

In the above, task_group_seq_get_next() needs to retrieve the task which
is the to-be-visited task from the last task_seq_start/next/show/stop() run.

The comment can be as simple as below.

	/* If the control returns to user space and comes back to
	 * the kernel again, *tid and common->pid_visiting should
	 * be the same for task_seq_start() to pick up the correct
	 * task.
	 */
> 
> Could you explain when task_file_seq_get_next() will call this function
> several times? IIUC, from the following code,
> 
> +static struct task_struct *task_seq_get_next(struct 
> bpf_iter_seq_task_common *common,
>                            u32 *tid,
>                            bool skip_if_dup_files)
>   {
>       struct task_struct *task = NULL;
>       struct pid *pid;
> 
> +    if (common->type == BPF_TASK_ITER_TID) {
> +        if (*tid && *tid != common->pid)
> +            return NULL;
> +        rcu_read_lock();
> +        pid = find_pid_ns(common->pid, common->ns);
> +        if (pid) {
> +            task = get_pid_task(pid, PIDTYPE_TGID);
> +            *tid = common->pid;
> +        }
> +        rcu_read_unlock();
> +
> +        return task;
> +    }
> +
> +    if (common->type == BPF_TASK_ITER_TGID) {
> +        rcu_read_lock();
> +        task = task_group_seq_get_next(common, tid, skip_if_dup_files);
> +        rcu_read_unlock();
> +
> +        return task;
> +    }
> +
> 
> task_group_seq_get_next() is only called once per task_seq_get_next()
> for BPF_TASK_ITER_TGID.
> Maybe I missed something?
> 
> 
> 
[...]
Kui-Feng Lee Aug. 31, 2022, 4:39 p.m. UTC | #5
On Tue, 2022-08-30 at 19:37 -0700, Yonghong Song wrote:
> 
> 
> On 8/30/22 5:35 PM, Kui-Feng Lee wrote:
> > On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote:
> > > 
> > > 
> > > On 8/29/22 12:23 PM, Kui-Feng Lee wrote:
> > > > Allow creating an iterator that loops through resources of one
> > > > thread/process.
> > > > 
> > > > People could only create iterators to loop through all
> > > > resources of
> > > > files, vma, and tasks in the system, even though they were
> > > > interested
> > > > in only the resources of a specific task or process.  Passing
> > > > the
> > > > additional parameters, people can now create an iterator to go
> > > > through all resources or only the resources of a task.
> > > > 
> > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > > > Acked-by: Yonghong Song <yhs@fb.com>
> > > > ---
> > > >    include/linux/bpf.h            |  25 +++++
> > > >    include/uapi/linux/bpf.h       |   6 ++
> > > >    kernel/bpf/task_iter.c         | 184
> > > > +++++++++++++++++++++++++++++----
> > > >    tools/include/uapi/linux/bpf.h |   6 ++
> > > >    4 files changed, 199 insertions(+), 22 deletions(-)
> > > > 
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index 9c1674973e03..31ac2c1181f5 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user
> > > > *pathname, int flags);
> > > >          extern int bpf_iter_ ##
> > > > target(args);                   \
> > > >          int __init bpf_iter_ ## target(args) { return 0; }
> > > >    
> > > > +/*
> > > > + * The task type of iterators.
> > > > + *
> > > > + * For BPF task iterators, they can be parameterized with
> > > > various
> > > > + * parameters to visit only some of tasks.
> > > > + *
> > > > + * BPF_TASK_ITER_ALL (default)
> > > > + *     Iterate over resources of every task.
> > > > + *
> > > > + * BPF_TASK_ITER_TID
> > > > + *     Iterate over resources of a task/tid.
> > > > + *
> > > > + * BPF_TASK_ITER_TGID
> > > > + *     Iterate over resources of every task of a process /
> > > > task
> > > > group.
> > > > + */
> > > > +enum bpf_iter_task_type {
> > > > +       BPF_TASK_ITER_ALL = 0,
> > > > +       BPF_TASK_ITER_TID,
> > > > +       BPF_TASK_ITER_TGID,
> > > > +};
> > > > +
> > > >    struct bpf_iter_aux_info {
> > > >          /* for map_elem iter */
> > > >          struct bpf_map *map;
> > > > @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info {
> > > >                  struct cgroup *start; /* starting cgroup */
> > > >                  enum bpf_cgroup_iter_order order;
> > > >          } cgroup;
> > > > +       struct {
> > > > +               enum bpf_iter_task_type type;
> > > > +               u32 pid;
> > > > +       } task;
> > > >    };
> > > >    
> > > >    typedef int (*bpf_iter_attach_target_t)(struct bpf_prog
> > > > *prog,
> > > > diff --git a/include/uapi/linux/bpf.h
> > > > b/include/uapi/linux/bpf.h
> > > > index 962960a98835..f212a19eda06 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -110,6 +110,12 @@ union bpf_iter_link_info {
> > > >                  __u32   cgroup_fd;
> > > >                  __u64   cgroup_id;
> > > >          } cgroup;
> > > > +       /* Parameters of task iterators. */
> > > > +       struct {
> > > > +               __u32   tid;
> > > > +               __u32   pid;
> > > > +               __u32   pid_fd;
> > > > +       } task;
> > > >    };
> > > >    
> > > >    /* BPF syscall commands, see bpf(2) man-page for more
> > > > details. */
> > > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > > > index 8c921799def4..0bc7277d1ee1 100644
> > > > --- a/kernel/bpf/task_iter.c
> > > > +++ b/kernel/bpf/task_iter.c
> > > > @@ -12,6 +12,9 @@
> > > >    
> > > >    struct bpf_iter_seq_task_common {
> > > >          struct pid_namespace *ns;
> > > > +       enum bpf_iter_task_type type;
> > > > +       u32 pid;
> > > > +       u32 pid_visiting;
> > > >    };
> > > >    
> > > >    struct bpf_iter_seq_task_info {
> > > > @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info {
> > > >          u32 tid;
> > > >    };
> > > >    
> > > > -static struct task_struct *task_seq_get_next(struct
> > > > pid_namespace
> > > > *ns,
> > > > +static struct task_struct *task_group_seq_get_next(struct
> > > > bpf_iter_seq_task_common *common,
> > > > +                                                  u32 *tid,
> > > > +                                                  bool
> > > > skip_if_dup_files)
> > > > +{
> > > > +       struct task_struct *task, *next_task;
> > > > +       struct pid *pid;
> > > > +       u32 saved_tid;
> > > > +
> > > > +       if (!*tid) {
> > > 
> > > Add a comment in the above to say that this is for the *very
> > > first*
> > > visit of tasks in the process.
> > > 
> > > > +               pid = find_pid_ns(common->pid, common->ns);
> > > > +               if (pid)
> > > > +                       task = get_pid_task(pid, PIDTYPE_TGID);
> > > 
> > > 'task' is not initialized, so it is possible task could hold a
> > > garbase value here if !pid, right?
> > > 
> > > Also if indeed task is NULL, here, should we return NULL here
> > > first?
> > 
> > yes, it should return earlier.
> > 
> > > 
> > > > +
> > > > +               *tid = common->pid;
> > > > +               common->pid_visiting = common->pid;
> > > > +
> > > > +               return task;
> > > > +       }
> > > > +
> > > > +       /* The callers increase *tid by 1 once they want next
> > > > task.
> > > > +        * However, next_thread() doesn't return tasks in
> > > > incremental
> > > > +        * order of pids. We can not find next task by just
> > > > finding
> > > > a
> > > > +        * task whose pid is greater or equal to *tid.
> > > > pid_visiting
> > > > +        * remembers the pid value of the task returned last
> > > > time.
> > > > By
> > > > +        * comparing pid_visiting and *tid, we known if the
> > > > caller
> > > > +        * wants the next task.
> > > > +        */
> > > > +       if (*tid == common->pid_visiting) {
> > > > +               pid = find_pid_ns(common->pid_visiting, common-
> > > > > ns);
> > > > +               task = get_pid_task(pid, PIDTYPE_PID);
> > > > +
> > > > +               return task;
> > > > +       }
> > > 
> > > Do not understand the above code. Why we need it? Looks like
> > > the code below trying to get the *next_task* and will return NULL
> > > if wrap around happens(the tid again equals tgid), right?
> > 
> > The above code is to handle the case that the caller want to visit
> > the
> > same task again.  For example, task_file_seq_get_next() will call
> > this
> > function several time to return the same task, and move to next
> > task by
> > increasing info->tid.  The above code checks the value of *tid to
> > return the same task if the value doesn't change.
> 
> Could you explain when task_file_seq_get_next() will call this
> function
> several times? IIUC, from the following code,

Let's assume that a userspace program reads a BPF file iterator in
several reads instead of reading the whole content at once.

bpf_seq_read() will be called several times. On every return from
bpf_seq_read() except the last one, info->tid will be the tid of the
most recently visited task.

bpf_seq_read() will call seq->op->start() (a.k.a. task_file_seq_start()
in this case).  However, task_file_seq_start() resets info->task and
calls task_file_seq_get_next() to return the task whose tid is given
by *tid.  Since info->task is NULL and *tid is not 0,
task_file_seq_get_next() will call task_seq_get_next() to retrieve the
same task again.

Does that make sense?
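
A rough sketch of the flow described above (assuming a task_file
iterator parameterized with a pid, i.e. BPF_TASK_ITER_TGID; the
function names are the ones used in this patch):

  read(iter_fd, ...)                    /* first read from user space */
    bpf_seq_read()
      task_file_seq_start()             /* info->tid == 0 */
        task_file_seq_get_next()
          task_seq_get_next()
            task_group_seq_get_next()   /* !*tid: first task of the group */
      ... show() some files, the buffer fills up, return to user space ...

  read(iter_fd, ...)                    /* second read */
    bpf_seq_read()
      task_file_seq_start()             /* info->task was reset,
                                         * info->tid != 0
                                         */
        task_file_seq_get_next()
          task_seq_get_next()
            task_group_seq_get_next()   /* *tid == pid_visiting: the same
                                         * task is returned again
                                         */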


> 
> +static struct task_struct *task_seq_get_next(struct 
> bpf_iter_seq_task_common *common,
>                                              u32 *tid,
>                                              bool skip_if_dup_files)
>   {
>         struct task_struct *task = NULL;
>         struct pid *pid;
> 
> +       if (common->type == BPF_TASK_ITER_TID) {
> +               if (*tid && *tid != common->pid)
> +                       return NULL;
> +               rcu_read_lock();
> +               pid = find_pid_ns(common->pid, common->ns);
> +               if (pid) {
> +                       task = get_pid_task(pid, PIDTYPE_TGID);
> +                       *tid = common->pid;
> +               }
> +               rcu_read_unlock();
> +
> +               return task;
> +       }
> +
> +       if (common->type == BPF_TASK_ITER_TGID) {
> +               rcu_read_lock();
> +               task = task_group_seq_get_next(common, tid,
> skip_if_dup_files);
> +               rcu_read_unlock();
> +
> +               return task;
> +       }
> +
> 
> task_group_seq_get_next() is only called once per task_seq_get_next()
> for BPF_TASK_ITER_TGID.
> Maybe I missed something?
> 
> 
> 
> > 
> > > 
> > > > +
> > > > +retry:
> > > > +       pid = find_pid_ns(common->pid_visiting, common->ns);
> > > > +       if (!pid)
> > > > +               return NULL;
> > > > +
> > > > +       task = get_pid_task(pid, PIDTYPE_PID);
> > > > +       if (!task)
> > > > +               return NULL;
> > > > +
> > > > +       next_task = next_thread(task);
> > > > +       put_task_struct(task);
> > > > +       if (!next_task)
> > > > +               return NULL;
> > > > +
> > > > +       saved_tid = *tid;
> > > > +       *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common-
> > > > > ns);
> > > > +       if (*tid == common->pid) {
> > > > +               /* Run out of tasks of a process.  The tasks of
> > > > a
> > > > +                * thread_group are linked as circular linked
> > > > list.
> > > > +                */
> > > > +               *tid = saved_tid;
> > > > +               return NULL;
> > > > +       }
> > > > +
> > > > +       get_task_struct(next_task);
> > > > +       common->pid_visiting = *tid;
> > > 
> > > We could do quite some redundant works here if the following
> > > condition is true. Basically, we get next_task and get a tid
> > > and release it, but in the next iteration, from tid, we try to
> > > get
> > > the task again.
> > 
> > Yes, I will move 'retry' and move next_task to task to avoid the
> > redundant work.
> > 
> > > 
> > > > +
> > > > +       if (skip_if_dup_files && task->files == task-
> > > > >group_leader-
> > > > > files)
> > > > +               goto retry;
> > > > +
> > > > +       return next_task;
> > > > +}
> > > > +
> > > > +static struct task_struct *task_seq_get_next(struct
> > > > bpf_iter_seq_task_common *common,
> > > >                                               u32 *tid,
> > > >                                               bool
> > > > skip_if_dup_files)
> > > >    {
> > > >          struct task_struct *task = NULL;
> > > >          struct pid *pid;
> > > >    
> > > > +       if (common->type == BPF_TASK_ITER_TID) {
> > > > +               if (*tid && *tid != common->pid)
> > > > +                       return NULL;
> > > > +               rcu_read_lock();
> > > > +               pid = find_pid_ns(common->pid, common->ns);
> > > > +               if (pid) {
> > > > +                       task = get_pid_task(pid, PIDTYPE_TGID);
> > > > +                       *tid = common->pid;
> > > > +               }
> > > > +               rcu_read_unlock();
> > > > +
> > > > +               return task;
> > > > +       }
> > > > +
> > > > +       if (common->type == BPF_TASK_ITER_TGID) {
> > > > +               rcu_read_lock();
> > > > +               task = task_group_seq_get_next(common, tid,
> > > > skip_if_dup_files);
> > > > +               rcu_read_unlock();
> > > > +
> > > > +               return task;
> > > > +       }
> > > > +
> > > >          rcu_read_lock();
> > > >    retry:
> > > > -       pid = find_ge_pid(*tid, ns);
> > > > +       pid = find_ge_pid(*tid, common->ns);
> > > >          if (pid) {
> > > > -               *tid = pid_nr_ns(pid, ns);
> > > > +               *tid = pid_nr_ns(pid, common->ns);
> > > >                  task = get_pid_task(pid, PIDTYPE_PID);
> > > >                  if (!task) {
> > > >                          ++*tid;
> > > > @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file
> > > > *seq, loff_t *pos)
> > > >          struct bpf_iter_seq_task_info *info = seq->private;
> > > >          struct task_struct *task;
> > > >    
> > > > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > > > false);
> > > > +       task = task_seq_get_next(&info->common, &info->tid,
> > > > false);
> > > >          if (!task)
> > > >                  return NULL;
> > > >    
> > > > @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file
> > > > *seq,
> > > > void *v, loff_t *pos)
> > > >          ++*pos;
> > > >          ++info->tid;
> > > >          put_task_struct((struct task_struct *)v);
> > > > -       task = task_seq_get_next(info->common.ns, &info->tid,
> > > > false);
> > > > +       task = task_seq_get_next(&info->common, &info->tid,
> > > > false);
> > > >          if (!task)
> > > >                  return NULL;
> > > >    
> > > > @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file
> > > > *seq, void *v)
> > > >                  put_task_struct((struct task_struct *)v);
> > > >    }
> > > >    
> > > [...]
> >

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9c1674973e03..31ac2c1181f5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1730,6 +1730,27 @@  int bpf_obj_get_user(const char __user *pathname, int flags);
 	extern int bpf_iter_ ## target(args);			\
 	int __init bpf_iter_ ## target(args) { return 0; }
 
+/*
+ * The task type of iterators.
+ *
+ * For BPF task iterators, they can be parameterized with various
+ * parameters to visit only some of tasks.
+ *
+ * BPF_TASK_ITER_ALL (default)
+ *	Iterate over resources of every task.
+ *
+ * BPF_TASK_ITER_TID
+ *	Iterate over resources of a task/tid.
+ *
+ * BPF_TASK_ITER_TGID
+ *	Iterate over resources of every task of a process / task group.
+ */
+enum bpf_iter_task_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+	BPF_TASK_ITER_TGID,
+};
+
 struct bpf_iter_aux_info {
 	/* for map_elem iter */
 	struct bpf_map *map;
@@ -1739,6 +1760,10 @@  struct bpf_iter_aux_info {
 		struct cgroup *start; /* starting cgroup */
 		enum bpf_cgroup_iter_order order;
 	} cgroup;
+	struct {
+		enum bpf_iter_task_type	type;
+		u32 pid;
+	} task;
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 962960a98835..f212a19eda06 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -110,6 +110,12 @@  union bpf_iter_link_info {
 		__u32	cgroup_fd;
 		__u64	cgroup_id;
 	} cgroup;
+	/* Parameters of task iterators. */
+	struct {
+		__u32	tid;
+		__u32	pid;
+		__u32	pid_fd;
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..0bc7277d1ee1 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -12,6 +12,9 @@ 
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
+	enum bpf_iter_task_type	type;
+	u32 pid;
+	u32 pid_visiting;
 };
 
 struct bpf_iter_seq_task_info {
@@ -22,18 +25,107 @@  struct bpf_iter_seq_task_info {
 	u32 tid;
 };
 
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+						   u32 *tid,
+						   bool skip_if_dup_files)
+{
+	struct task_struct *task, *next_task;
+	struct pid *pid;
+	u32 saved_tid;
+
+	if (!*tid) {
+		pid = find_pid_ns(common->pid, common->ns);
+		if (pid)
+			task = get_pid_task(pid, PIDTYPE_TGID);
+
+		*tid = common->pid;
+		common->pid_visiting = common->pid;
+
+		return task;
+	}
+
+	/* The callers increase *tid by 1 once they want next task.
+	 * However, next_thread() doesn't return tasks in incremental
+	 * order of pids. We can not find next task by just finding a
+	 * task whose pid is greater or equal to *tid.  pid_visiting
+	 * remembers the pid value of the task returned last time. By
+	 * comparing pid_visiting and *tid, we known if the caller
+	 * wants the next task.
+	 */
+	if (*tid == common->pid_visiting) {
+		pid = find_pid_ns(common->pid_visiting, common->ns);
+		task = get_pid_task(pid, PIDTYPE_PID);
+
+		return task;
+	}
+
+retry:
+	pid = find_pid_ns(common->pid_visiting, common->ns);
+	if (!pid)
+		return NULL;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return NULL;
+
+	next_task = next_thread(task);
+	put_task_struct(task);
+	if (!next_task)
+		return NULL;
+
+	saved_tid = *tid;
+	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+	if (*tid == common->pid) {
+		/* We have run out of the tasks of this process: the
+		 * tasks of a thread_group are linked in a circular list.
+		 */
+		*tid = saved_tid;
+		return NULL;
+	}
+
+	get_task_struct(next_task);
+	common->pid_visiting = *tid;
+
+	if (skip_if_dup_files && task->files == task->group_leader->files)
+		goto retry;
+
+	return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
 					     u32 *tid,
 					     bool skip_if_dup_files)
 {
 	struct task_struct *task = NULL;
 	struct pid *pid;
 
+	if (common->type == BPF_TASK_ITER_TID) {
+		if (*tid && *tid != common->pid)
+			return NULL;
+		rcu_read_lock();
+		pid = find_pid_ns(common->pid, common->ns);
+		if (pid) {
+			task = get_pid_task(pid, PIDTYPE_TGID);
+			*tid = common->pid;
+		}
+		rcu_read_unlock();
+
+		return task;
+	}
+
+	if (common->type == BPF_TASK_ITER_TGID) {
+		rcu_read_lock();
+		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+		rcu_read_unlock();
+
+		return task;
+	}
+
 	rcu_read_lock();
 retry:
-	pid = find_ge_pid(*tid, ns);
+	pid = find_ge_pid(*tid, common->ns);
 	if (pid) {
-		*tid = pid_nr_ns(pid, ns);
+		*tid = pid_nr_ns(pid, common->ns);
 		task = get_pid_task(pid, PIDTYPE_PID);
 		if (!task) {
 			++*tid;
@@ -56,7 +148,7 @@  static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_task_info *info = seq->private;
 	struct task_struct *task;
 
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+	task = task_seq_get_next(&info->common, &info->tid, false);
 	if (!task)
 		return NULL;
 
@@ -73,7 +165,7 @@  static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	++*pos;
 	++info->tid;
 	put_task_struct((struct task_struct *)v);
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+	task = task_seq_get_next(&info->common, &info->tid, false);
 	if (!task)
 		return NULL;
 
@@ -117,6 +209,45 @@  static void task_seq_stop(struct seq_file *seq, void *v)
 		put_task_struct((struct task_struct *)v);
 }
 
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+				union bpf_iter_link_info *linfo,
+				struct bpf_iter_aux_info *aux)
+{
+	unsigned int flags;
+	struct pid_namespace *ns;
+	struct pid *pid;
+	pid_t tgid;
+
+	if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+		return -EINVAL;
+
+	aux->task.type = BPF_TASK_ITER_ALL;
+	if (linfo->task.tid != 0) {
+		aux->task.type = BPF_TASK_ITER_TID;
+		aux->task.pid = linfo->task.tid;
+	}
+	if (linfo->task.pid != 0) {
+		aux->task.type = BPF_TASK_ITER_TGID;
+		aux->task.pid = linfo->task.pid;
+	}
+	if (linfo->task.pid_fd != 0) {
+		aux->task.type = BPF_TASK_ITER_TGID;
+		ns = task_active_pid_ns(current);
+		if (IS_ERR(ns))
+			return PTR_ERR(ns);
+
+		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+		if (IS_ERR(pid))
+			return PTR_ERR(pid);
+
+		tgid = pid_nr_ns(pid, ns);
+		aux->task.pid = tgid;
+		put_pid(pid);
+	}
+
+	return 0;
+}
+
 static const struct seq_operations task_seq_ops = {
 	.start	= task_seq_start,
 	.next	= task_seq_next,
@@ -137,8 +268,7 @@  struct bpf_iter_seq_task_file_info {
 static struct file *
 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 	struct task_struct *curr_task;
 	unsigned int curr_fd = info->fd;
 
@@ -151,21 +281,18 @@  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 		curr_task = info->task;
 		curr_fd = info->fd;
 	} else {
-                curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
                 if (!curr_task) {
                         info->task = NULL;
-                        info->tid = curr_tid;
                         return NULL;
                 }
 
-                /* set info->task and info->tid */
+		/* set info->task */
 		info->task = curr_task;
-		if (curr_tid == info->tid) {
+		if (saved_tid == info->tid)
 			curr_fd = info->fd;
-		} else {
-			info->tid = curr_tid;
+		else
 			curr_fd = 0;
-		}
 	}
 
 	rcu_read_lock();
@@ -186,9 +313,15 @@  task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 	/* the current task is done, go to the next task */
 	rcu_read_unlock();
 	put_task_struct(curr_task);
+
+	if (info->common.type == BPF_TASK_ITER_TID) {
+		info->task = NULL;
+		return NULL;
+	}
+
 	info->task = NULL;
 	info->fd = 0;
-	curr_tid = ++(info->tid);
+	saved_tid = ++(info->tid);
 	goto again;
 }
 
@@ -269,6 +402,9 @@  static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
 	struct bpf_iter_seq_task_common *common = priv_data;
 
 	common->ns = get_pid_ns(task_active_pid_ns(current));
+	common->type = aux->task.type;
+	common->pid = aux->task.pid;
+
 	return 0;
 }
 
@@ -307,11 +443,10 @@  enum bpf_task_vma_iter_find_op {
 static struct vm_area_struct *
 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
 	enum bpf_task_vma_iter_find_op op;
 	struct vm_area_struct *curr_vma;
 	struct task_struct *curr_task;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 
 	/* If this function returns a non-NULL vma, it holds a reference to
 	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +506,13 @@  task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 		}
 	} else {
 again:
-		curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
 		if (!curr_task) {
-			info->tid = curr_tid + 1;
+			info->tid++;
 			goto finish;
 		}
 
-		if (curr_tid != info->tid) {
-			info->tid = curr_tid;
+		if (saved_tid != info->tid) {
 			/* new task, process the first vma */
 			op = task_vma_iter_first_vma;
 		} else {
@@ -430,9 +564,12 @@  task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 	return curr_vma;
 
 next_task:
+	if (info->common.type == BPF_TASK_ITER_TID)
+		goto finish;
+
 	put_task_struct(curr_task);
 	info->task = NULL;
-	curr_tid++;
+	info->tid++;
 	goto again;
 
 finish:
@@ -533,6 +670,7 @@  static const struct bpf_iter_seq_info task_seq_info = {
 
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
@@ -551,6 +689,7 @@  static const struct bpf_iter_seq_info task_file_seq_info = {
 
 static struct bpf_iter_reg task_file_reg_info = {
 	.target			= "task_file",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
@@ -571,6 +710,7 @@  static const struct bpf_iter_seq_info task_vma_seq_info = {
 
 static struct bpf_iter_reg task_vma_reg_info = {
 	.target			= "task_vma",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
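Also for illustration (not part of the diff): since the filtering happens entirely in the seq_file logic above, nothing changes on the BPF program side. A minimal iter/task program such as the sketch below behaves the same whether the link was created for all tasks, one tid, or one tgid; the program name dump_task is arbitrary, and vmlinux.h / BPF_SEQ_PRINTF come from the usual libbpf toolchain.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;

	/* The program is invoked once more with a NULL task at the end
	 * of the iteration; nothing to print in that case.
	 */
	if (!task)
		return 0;

	BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid);
	return 0;
}
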
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f4ba82a1eace..40935278eede 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -110,6 +110,12 @@  union bpf_iter_link_info {
 		__u32	cgroup_fd;
 		__u64	cgroup_id;
 	} cgroup;
+	/* Parameters of task iterators. */
+	struct {
+		__u32	tid;
+		__u32	pid;
+		__u32	pid_fd;
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
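
Finally, an illustrative sketch for the pid_fd variant, again not part of the patch and with a made-up helper name: user space obtains a pidfd with pidfd_open(2) and passes it in link_info. The raw syscall() form is used because not every libc ships a wrapper; since bpf_iter_attach_task() resolves the pidfd to a tgid at attach time, the descriptor can be closed as soon as the link exists.

#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <bpf/libbpf.h>

/* Attach an iter/task program that visits every thread of one process,
 * identified by a pidfd rather than a raw pid number.
 */
static struct bpf_link *attach_pidfd_iter(struct bpf_program *prog, pid_t tgid)
{
	union bpf_iter_link_info linfo;
	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	struct bpf_link *link;
	int pidfd;

	pidfd = syscall(SYS_pidfd_open, tgid, 0);
	if (pidfd < 0)
		return NULL;

	memset(&linfo, 0, sizeof(linfo));
	linfo.task.pid_fd = pidfd;
	opts.link_info = &linfo;
	opts.link_info_len = sizeof(linfo);

	link = bpf_program__attach_iter(prog, &opts);
	/* The kernel keeps only the resolved tgid, not the fd itself. */
	close(pidfd);
	return link;
}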