Message ID | 20220829192317.486946-2-kuifeng@fb.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Parameterize task iterators. | expand |
On 8/29/22 12:23 PM, Kui-Feng Lee wrote: > Allow creating an iterator that loops through resources of one > thread/process. > > People could only create iterators to loop through all resources of > files, vma, and tasks in the system, even though they were interested > in only the resources of a specific task or process. Passing the > additional parameters, people can now create an iterator to go > through all resources or only the resources of a task. > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > Acked-by: Yonghong Song <yhs@fb.com> > --- > include/linux/bpf.h | 25 +++++ > include/uapi/linux/bpf.h | 6 ++ > kernel/bpf/task_iter.c | 184 +++++++++++++++++++++++++++++---- > tools/include/uapi/linux/bpf.h | 6 ++ > 4 files changed, 199 insertions(+), 22 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 9c1674973e03..31ac2c1181f5 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); > extern int bpf_iter_ ## target(args); \ > int __init bpf_iter_ ## target(args) { return 0; } > > +/* > + * The task type of iterators. > + * > + * For BPF task iterators, they can be parameterized with various > + * parameters to visit only some of tasks. > + * > + * BPF_TASK_ITER_ALL (default) > + * Iterate over resources of every task. > + * > + * BPF_TASK_ITER_TID > + * Iterate over resources of a task/tid. > + * > + * BPF_TASK_ITER_TGID > + * Iterate over resources of every task of a process / task group. 
> + */ > +enum bpf_iter_task_type { > + BPF_TASK_ITER_ALL = 0, > + BPF_TASK_ITER_TID, > + BPF_TASK_ITER_TGID, > +}; > + > struct bpf_iter_aux_info { > /* for map_elem iter */ > struct bpf_map *map; > @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { > struct cgroup *start; /* starting cgroup */ > enum bpf_cgroup_iter_order order; > } cgroup; > + struct { > + enum bpf_iter_task_type type; > + u32 pid; > + } task; > }; > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 962960a98835..f212a19eda06 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -110,6 +110,12 @@ union bpf_iter_link_info { > __u32 cgroup_fd; > __u64 cgroup_id; > } cgroup; > + /* Parameters of task iterators. */ > + struct { > + __u32 tid; > + __u32 pid; > + __u32 pid_fd; > + } task; > }; > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > index 8c921799def4..0bc7277d1ee1 100644 > --- a/kernel/bpf/task_iter.c > +++ b/kernel/bpf/task_iter.c > @@ -12,6 +12,9 @@ > > struct bpf_iter_seq_task_common { > struct pid_namespace *ns; > + enum bpf_iter_task_type type; > + u32 pid; > + u32 pid_visiting; > }; > > struct bpf_iter_seq_task_info { > @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { > u32 tid; > }; > > -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, > +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, > + u32 *tid, > + bool skip_if_dup_files) > +{ > + struct task_struct *task, *next_task; > + struct pid *pid; > + u32 saved_tid; > + > + if (!*tid) { Add a comment in the above to say that this is for the *very first* visit of tasks in the process. 
> + pid = find_pid_ns(common->pid, common->ns); > + if (pid) > + task = get_pid_task(pid, PIDTYPE_TGID); 'task' is not initialized, so it is possible task could hold a garbage value here if !pid, right? Also if indeed task is NULL, here, should we return NULL here first? > + > + *tid = common->pid; > + common->pid_visiting = common->pid; > + > + return task; > + } > + > + /* The callers increase *tid by 1 once they want next task. > + * However, next_thread() doesn't return tasks in incremental > + * order of pids. We can not find next task by just finding a > + * task whose pid is greater or equal to *tid. pid_visiting > + * remembers the pid value of the task returned last time. By > + * comparing pid_visiting and *tid, we known if the caller > + * wants the next task. > + */ > + if (*tid == common->pid_visiting) { > + pid = find_pid_ns(common->pid_visiting, common->ns); > + task = get_pid_task(pid, PIDTYPE_PID); > + > + return task; > + } Do not understand the above code. Why do we need it? Looks like the code below is trying to get the *next_task* and will return NULL if wrap around happens (the tid again equals tgid), right? > + > +retry: > + pid = find_pid_ns(common->pid_visiting, common->ns); > + if (!pid) > + return NULL; > + > + task = get_pid_task(pid, PIDTYPE_PID); > + if (!task) > + return NULL; > + > + next_task = next_thread(task); > + put_task_struct(task); > + if (!next_task) > + return NULL; > + > + saved_tid = *tid; > + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); > + if (*tid == common->pid) { > + /* Run out of tasks of a process. The tasks of a > + * thread_group are linked as circular linked list. > + */ > + *tid = saved_tid; > + return NULL; > + } > + > + get_task_struct(next_task); > + common->pid_visiting = *tid; We could do quite some redundant work here if the following condition is true. Basically, we get next_task and get a tid and release it, but in the next iteration, from tid, we try to get the task again. 
> + > + if (skip_if_dup_files && task->files == task->group_leader->files) > + goto retry; > + > + return next_task; > +} > + > +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, > u32 *tid, > bool skip_if_dup_files) > { > struct task_struct *task = NULL; > struct pid *pid; > > + if (common->type == BPF_TASK_ITER_TID) { > + if (*tid && *tid != common->pid) > + return NULL; > + rcu_read_lock(); > + pid = find_pid_ns(common->pid, common->ns); > + if (pid) { > + task = get_pid_task(pid, PIDTYPE_TGID); > + *tid = common->pid; > + } > + rcu_read_unlock(); > + > + return task; > + } > + > + if (common->type == BPF_TASK_ITER_TGID) { > + rcu_read_lock(); > + task = task_group_seq_get_next(common, tid, skip_if_dup_files); > + rcu_read_unlock(); > + > + return task; > + } > + > rcu_read_lock(); > retry: > - pid = find_ge_pid(*tid, ns); > + pid = find_ge_pid(*tid, common->ns); > if (pid) { > - *tid = pid_nr_ns(pid, ns); > + *tid = pid_nr_ns(pid, common->ns); > task = get_pid_task(pid, PIDTYPE_PID); > if (!task) { > ++*tid; > @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) > struct bpf_iter_seq_task_info *info = seq->private; > struct task_struct *task; > > - task = task_seq_get_next(info->common.ns, &info->tid, false); > + task = task_seq_get_next(&info->common, &info->tid, false); > if (!task) > return NULL; > > @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) > ++*pos; > ++info->tid; > put_task_struct((struct task_struct *)v); > - task = task_seq_get_next(info->common.ns, &info->tid, false); > + task = task_seq_get_next(&info->common, &info->tid, false); > if (!task) > return NULL; > > @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file *seq, void *v) > put_task_struct((struct task_struct *)v); > } > [...]
On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote: > > > On 8/29/22 12:23 PM, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one > > thread/process. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were > > interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > Acked-by: Yonghong Song <yhs@fb.com> > > --- > > include/linux/bpf.h | 25 +++++ > > include/uapi/linux/bpf.h | 6 ++ > > kernel/bpf/task_iter.c | 184 > > +++++++++++++++++++++++++++++---- > > tools/include/uapi/linux/bpf.h | 6 ++ > > 4 files changed, 199 insertions(+), 22 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 9c1674973e03..31ac2c1181f5 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user > > *pathname, int flags); > > extern int bpf_iter_ ## target(args); \ > > int __init bpf_iter_ ## target(args) { return 0; } > > > > +/* > > + * The task type of iterators. > > + * > > + * For BPF task iterators, they can be parameterized with various > > + * parameters to visit only some of tasks. > > + * > > + * BPF_TASK_ITER_ALL (default) > > + * Iterate over resources of every task. > > + * > > + * BPF_TASK_ITER_TID > > + * Iterate over resources of a task/tid. > > + * > > + * BPF_TASK_ITER_TGID > > + * Iterate over resources of every task of a process / task > > group. 
> > + */ > > +enum bpf_iter_task_type { > > + BPF_TASK_ITER_ALL = 0, > > + BPF_TASK_ITER_TID, > > + BPF_TASK_ITER_TGID, > > +}; > > + > > struct bpf_iter_aux_info { > > /* for map_elem iter */ > > struct bpf_map *map; > > @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { > > struct cgroup *start; /* starting cgroup */ > > enum bpf_cgroup_iter_order order; > > } cgroup; > > + struct { > > + enum bpf_iter_task_type type; > > + u32 pid; > > + } task; > > }; > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index 962960a98835..f212a19eda06 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -110,6 +110,12 @@ union bpf_iter_link_info { > > __u32 cgroup_fd; > > __u64 cgroup_id; > > } cgroup; > > + /* Parameters of task iterators. */ > > + struct { > > + __u32 tid; > > + __u32 pid; > > + __u32 pid_fd; > > + } task; > > }; > > > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > index 8c921799def4..0bc7277d1ee1 100644 > > --- a/kernel/bpf/task_iter.c > > +++ b/kernel/bpf/task_iter.c > > @@ -12,6 +12,9 @@ > > > > struct bpf_iter_seq_task_common { > > struct pid_namespace *ns; > > + enum bpf_iter_task_type type; > > + u32 pid; > > + u32 pid_visiting; > > }; > > > > struct bpf_iter_seq_task_info { > > @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { > > u32 tid; > > }; > > > > -static struct task_struct *task_seq_get_next(struct pid_namespace > > *ns, > > +static struct task_struct *task_group_seq_get_next(struct > > bpf_iter_seq_task_common *common, > > + u32 *tid, > > + bool > > skip_if_dup_files) > > +{ > > + struct task_struct *task, *next_task; > > + struct pid *pid; > > + u32 saved_tid; > > + > > + if (!*tid) { > > Add a comment in the above to say that this is for the *very first* > visit of tasks in the process. 
> > > + pid = find_pid_ns(common->pid, common->ns); > > + if (pid) > > + task = get_pid_task(pid, PIDTYPE_TGID); > > 'task' is not initialized, so it is possible task could hold a > garbase value here if !pid, right? > > Also if indeed task is NULL, here, should we return NULL here > first? yes, it should return earlier. > > > + > > + *tid = common->pid; > > + common->pid_visiting = common->pid; > > + > > + return task; > > + } > > + > > + /* The callers increase *tid by 1 once they want next task. > > + * However, next_thread() doesn't return tasks in > > incremental > > + * order of pids. We can not find next task by just finding > > a > > + * task whose pid is greater or equal to *tid. > > pid_visiting > > + * remembers the pid value of the task returned last time. > > By > > + * comparing pid_visiting and *tid, we known if the caller > > + * wants the next task. > > + */ > > + if (*tid == common->pid_visiting) { > > + pid = find_pid_ns(common->pid_visiting, common- > > >ns); > > + task = get_pid_task(pid, PIDTYPE_PID); > > + > > + return task; > > + } > > Do not understand the above code. Why we need it? Looks like > the code below trying to get the *next_task* and will return NULL > if wrap around happens(the tid again equals tgid), right? The above code is to handle the case that the caller wants to visit the same task again. For example, task_file_seq_get_next() will call this function several times to return the same task, and move to next task by increasing info->tid. The above code checks the value of *tid to return the same task if the value doesn't change. 
> > > + > > +retry: > > + pid = find_pid_ns(common->pid_visiting, common->ns); > > + if (!pid) > > + return NULL; > > + > > + task = get_pid_task(pid, PIDTYPE_PID); > > + if (!task) > > + return NULL; > > + > > + next_task = next_thread(task); > > + put_task_struct(task); > > + if (!next_task) > > + return NULL; > > + > > + saved_tid = *tid; > > + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common- > > >ns); > > + if (*tid == common->pid) { > > + /* Run out of tasks of a process. The tasks of a > > + * thread_group are linked as circular linked list. > > + */ > > + *tid = saved_tid; > > + return NULL; > > + } > > + > > + get_task_struct(next_task); > > + common->pid_visiting = *tid; > > We could do quite some redundant works here if the following > condition is true. Basically, we get next_task and get a tid > and release it, but in the next iteration, from tid, we try to get > the task again. Yes, I will move 'retry' and move next_task to task to avoid the redundant work. > > > + > > + if (skip_if_dup_files && task->files == task->group_leader- > > >files) > > + goto retry; > > + > > + return next_task; > > +} > > + > > +static struct task_struct *task_seq_get_next(struct > > bpf_iter_seq_task_common *common, > > u32 *tid, > > bool > > skip_if_dup_files) > > { > > struct task_struct *task = NULL; > > struct pid *pid; > > > > + if (common->type == BPF_TASK_ITER_TID) { > > + if (*tid && *tid != common->pid) > > + return NULL; > > + rcu_read_lock(); > > + pid = find_pid_ns(common->pid, common->ns); > > + if (pid) { > > + task = get_pid_task(pid, PIDTYPE_TGID); > > + *tid = common->pid; > > + } > > + rcu_read_unlock(); > > + > > + return task; > > + } > > + > > + if (common->type == BPF_TASK_ITER_TGID) { > > + rcu_read_lock(); > > + task = task_group_seq_get_next(common, tid, > > skip_if_dup_files); > > + rcu_read_unlock(); > > + > > + return task; > > + } > > + > > rcu_read_lock(); > > retry: > > - pid = find_ge_pid(*tid, ns); > > + pid = find_ge_pid(*tid, 
common->ns); > > if (pid) { > > - *tid = pid_nr_ns(pid, ns); > > + *tid = pid_nr_ns(pid, common->ns); > > task = get_pid_task(pid, PIDTYPE_PID); > > if (!task) { > > ++*tid; > > @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file > > *seq, loff_t *pos) > > struct bpf_iter_seq_task_info *info = seq->private; > > struct task_struct *task; > > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + task = task_seq_get_next(&info->common, &info->tid, false); > > if (!task) > > return NULL; > > > > @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq, > > void *v, loff_t *pos) > > ++*pos; > > ++info->tid; > > put_task_struct((struct task_struct *)v); > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + task = task_seq_get_next(&info->common, &info->tid, false); > > if (!task) > > return NULL; > > > > @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file > > *seq, void *v) > > put_task_struct((struct task_struct *)v); > > } > > > [...]
On 8/30/22 5:35 PM, Kui-Feng Lee wrote: > On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote: >> >> >> On 8/29/22 12:23 PM, Kui-Feng Lee wrote: >>> Allow creating an iterator that loops through resources of one >>> thread/process. >>> >>> People could only create iterators to loop through all resources of >>> files, vma, and tasks in the system, even though they were >>> interested >>> in only the resources of a specific task or process. Passing the >>> additional parameters, people can now create an iterator to go >>> through all resources or only the resources of a task. >>> >>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> >>> Acked-by: Yonghong Song <yhs@fb.com> >>> --- >>> include/linux/bpf.h | 25 +++++ >>> include/uapi/linux/bpf.h | 6 ++ >>> kernel/bpf/task_iter.c | 184 >>> +++++++++++++++++++++++++++++---- >>> tools/include/uapi/linux/bpf.h | 6 ++ >>> 4 files changed, 199 insertions(+), 22 deletions(-) >>> >>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>> index 9c1674973e03..31ac2c1181f5 100644 >>> --- a/include/linux/bpf.h >>> +++ b/include/linux/bpf.h >>> @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user >>> *pathname, int flags); >>> extern int bpf_iter_ ## target(args); \ >>> int __init bpf_iter_ ## target(args) { return 0; } >>> >>> +/* >>> + * The task type of iterators. >>> + * >>> + * For BPF task iterators, they can be parameterized with various >>> + * parameters to visit only some of tasks. >>> + * >>> + * BPF_TASK_ITER_ALL (default) >>> + * Iterate over resources of every task. >>> + * >>> + * BPF_TASK_ITER_TID >>> + * Iterate over resources of a task/tid. >>> + * >>> + * BPF_TASK_ITER_TGID >>> + * Iterate over resources of every task of a process / task >>> group. 
>>> + */ >>> +enum bpf_iter_task_type { >>> + BPF_TASK_ITER_ALL = 0, >>> + BPF_TASK_ITER_TID, >>> + BPF_TASK_ITER_TGID, >>> +}; >>> + >>> struct bpf_iter_aux_info { >>> /* for map_elem iter */ >>> struct bpf_map *map; >>> @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { >>> struct cgroup *start; /* starting cgroup */ >>> enum bpf_cgroup_iter_order order; >>> } cgroup; >>> + struct { >>> + enum bpf_iter_task_type type; >>> + u32 pid; >>> + } task; >>> }; >>> >>> typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>> index 962960a98835..f212a19eda06 100644 >>> --- a/include/uapi/linux/bpf.h >>> +++ b/include/uapi/linux/bpf.h >>> @@ -110,6 +110,12 @@ union bpf_iter_link_info { >>> __u32 cgroup_fd; >>> __u64 cgroup_id; >>> } cgroup; >>> + /* Parameters of task iterators. */ >>> + struct { >>> + __u32 tid; >>> + __u32 pid; >>> + __u32 pid_fd; >>> + } task; >>> }; >>> >>> /* BPF syscall commands, see bpf(2) man-page for more details. */ >>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c >>> index 8c921799def4..0bc7277d1ee1 100644 >>> --- a/kernel/bpf/task_iter.c >>> +++ b/kernel/bpf/task_iter.c >>> @@ -12,6 +12,9 @@ >>> >>> struct bpf_iter_seq_task_common { >>> struct pid_namespace *ns; >>> + enum bpf_iter_task_type type; >>> + u32 pid; >>> + u32 pid_visiting; >>> }; >>> >>> struct bpf_iter_seq_task_info { >>> @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { >>> u32 tid; >>> }; >>> >>> -static struct task_struct *task_seq_get_next(struct pid_namespace >>> *ns, >>> +static struct task_struct *task_group_seq_get_next(struct >>> bpf_iter_seq_task_common *common, >>> + u32 *tid, >>> + bool >>> skip_if_dup_files) >>> +{ >>> + struct task_struct *task, *next_task; >>> + struct pid *pid; >>> + u32 saved_tid; >>> + >>> + if (!*tid) { >> >> Add a comment in the above to say that this is for the *very first* >> visit of tasks in the process. 
>> >>> + pid = find_pid_ns(common->pid, common->ns); >>> + if (pid) >>> + task = get_pid_task(pid, PIDTYPE_TGID); >> >> 'task' is not initialized, so it is possible task could hold a >> garbase value here if !pid, right? >> >> Also if indeed task is NULL, here, should we return NULL here >> first? > > yes, it should return earlier. > >> >>> + >>> + *tid = common->pid; >>> + common->pid_visiting = common->pid; >>> + >>> + return task; >>> + } >>> + >>> + /* The callers increase *tid by 1 once they want next task. >>> + * However, next_thread() doesn't return tasks in >>> incremental >>> + * order of pids. We can not find next task by just finding >>> a >>> + * task whose pid is greater or equal to *tid. >>> pid_visiting >>> + * remembers the pid value of the task returned last time. >>> By >>> + * comparing pid_visiting and *tid, we known if the caller >>> + * wants the next task. >>> + */ >>> + if (*tid == common->pid_visiting) { >>> + pid = find_pid_ns(common->pid_visiting, common- >>>> ns); >>> + task = get_pid_task(pid, PIDTYPE_PID); >>> + >>> + return task; >>> + } >> >> Do not understand the above code. Why we need it? Looks like >> the code below trying to get the *next_task* and will return NULL >> if wrap around happens(the tid again equals tgid), right? > > The above code is to handle the case that the caller want to visit the > same task again. For example, task_file_seq_get_next() will call this > function several time to return the same task, and move to next task by > increasing info->tid. The above code checks the value of *tid to > return the same task if the value doesn't change. Could you explain when task_file_seq_get_next() will call this function several times? 
IIUC, from the following code, +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + task_group_seq_get_next() is only called once per task_seq_get_next() for BPF_TASK_ITER_TGID. Maybe I missed something? > >> >>> + >>> +retry: >>> + pid = find_pid_ns(common->pid_visiting, common->ns); >>> + if (!pid) >>> + return NULL; >>> + >>> + task = get_pid_task(pid, PIDTYPE_PID); >>> + if (!task) >>> + return NULL; >>> + >>> + next_task = next_thread(task); >>> + put_task_struct(task); >>> + if (!next_task) >>> + return NULL; >>> + >>> + saved_tid = *tid; >>> + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common- >>>> ns); >>> + if (*tid == common->pid) { >>> + /* Run out of tasks of a process. The tasks of a >>> + * thread_group are linked as circular linked list. >>> + */ >>> + *tid = saved_tid; >>> + return NULL; >>> + } >>> + >>> + get_task_struct(next_task); >>> + common->pid_visiting = *tid; >> >> We could do quite some redundant works here if the following >> condition is true. Basically, we get next_task and get a tid >> and release it, but in the next iteration, from tid, we try to get >> the task again. > > Yes, I will move 'retry' and move next_task to task to avoid the > redundant work. 
> >> >>> + >>> + if (skip_if_dup_files && task->files == task->group_leader- >>>> files) >>> + goto retry; >>> + >>> + return next_task; >>> +} >>> + >>> +static struct task_struct *task_seq_get_next(struct >>> bpf_iter_seq_task_common *common, >>> u32 *tid, >>> bool >>> skip_if_dup_files) >>> { >>> struct task_struct *task = NULL; >>> struct pid *pid; >>> >>> + if (common->type == BPF_TASK_ITER_TID) { >>> + if (*tid && *tid != common->pid) >>> + return NULL; >>> + rcu_read_lock(); >>> + pid = find_pid_ns(common->pid, common->ns); >>> + if (pid) { >>> + task = get_pid_task(pid, PIDTYPE_TGID); >>> + *tid = common->pid; >>> + } >>> + rcu_read_unlock(); >>> + >>> + return task; >>> + } >>> + >>> + if (common->type == BPF_TASK_ITER_TGID) { >>> + rcu_read_lock(); >>> + task = task_group_seq_get_next(common, tid, >>> skip_if_dup_files); >>> + rcu_read_unlock(); >>> + >>> + return task; >>> + } >>> + >>> rcu_read_lock(); >>> retry: >>> - pid = find_ge_pid(*tid, ns); >>> + pid = find_ge_pid(*tid, common->ns); >>> if (pid) { >>> - *tid = pid_nr_ns(pid, ns); >>> + *tid = pid_nr_ns(pid, common->ns); >>> task = get_pid_task(pid, PIDTYPE_PID); >>> if (!task) { >>> ++*tid; >>> @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file >>> *seq, loff_t *pos) >>> struct bpf_iter_seq_task_info *info = seq->private; >>> struct task_struct *task; >>> >>> - task = task_seq_get_next(info->common.ns, &info->tid, >>> false); >>> + task = task_seq_get_next(&info->common, &info->tid, false); >>> if (!task) >>> return NULL; >>> >>> @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq, >>> void *v, loff_t *pos) >>> ++*pos; >>> ++info->tid; >>> put_task_struct((struct task_struct *)v); >>> - task = task_seq_get_next(info->common.ns, &info->tid, >>> false); >>> + task = task_seq_get_next(&info->common, &info->tid, false); >>> if (!task) >>> return NULL; >>> >>> @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file >>> *seq, void *v) >>> put_task_struct((struct 
task_struct *)v); >>> } >>> >> [...] >
On 8/30/22 7:37 PM, Yonghong Song wrote: > > > On 8/30/22 5:35 PM, Kui-Feng Lee wrote: >> On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote: >>> >>> >>> On 8/29/22 12:23 PM, Kui-Feng Lee wrote: >>>> Allow creating an iterator that loops through resources of one >>>> thread/process. >>>> >>>> People could only create iterators to loop through all resources of >>>> files, vma, and tasks in the system, even though they were >>>> interested >>>> in only the resources of a specific task or process. Passing the >>>> additional parameters, people can now create an iterator to go >>>> through all resources or only the resources of a task. >>>> >>>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> >>>> Acked-by: Yonghong Song <yhs@fb.com> >>>> --- >>>> include/linux/bpf.h | 25 +++++ >>>> include/uapi/linux/bpf.h | 6 ++ >>>> kernel/bpf/task_iter.c | 184 >>>> +++++++++++++++++++++++++++++---- >>>> tools/include/uapi/linux/bpf.h | 6 ++ >>>> 4 files changed, 199 insertions(+), 22 deletions(-) >>>> >>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >>>> index 9c1674973e03..31ac2c1181f5 100644 >>>> --- a/include/linux/bpf.h >>>> +++ b/include/linux/bpf.h >>>> @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user >>>> *pathname, int flags); >>>> extern int bpf_iter_ ## target(args); \ >>>> int __init bpf_iter_ ## target(args) { return 0; } >>>> +/* >>>> + * The task type of iterators. >>>> + * >>>> + * For BPF task iterators, they can be parameterized with various >>>> + * parameters to visit only some of tasks. >>>> + * >>>> + * BPF_TASK_ITER_ALL (default) >>>> + * Iterate over resources of every task. >>>> + * >>>> + * BPF_TASK_ITER_TID >>>> + * Iterate over resources of a task/tid. >>>> + * >>>> + * BPF_TASK_ITER_TGID >>>> + * Iterate over resources of every task of a process / task >>>> group. 
>>>> + */ >>>> +enum bpf_iter_task_type { >>>> + BPF_TASK_ITER_ALL = 0, >>>> + BPF_TASK_ITER_TID, >>>> + BPF_TASK_ITER_TGID, >>>> +}; >>>> + >>>> struct bpf_iter_aux_info { >>>> /* for map_elem iter */ >>>> struct bpf_map *map; >>>> @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { >>>> struct cgroup *start; /* starting cgroup */ >>>> enum bpf_cgroup_iter_order order; >>>> } cgroup; >>>> + struct { >>>> + enum bpf_iter_task_type type; >>>> + u32 pid; >>>> + } task; >>>> }; >>>> typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, >>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>>> index 962960a98835..f212a19eda06 100644 >>>> --- a/include/uapi/linux/bpf.h >>>> +++ b/include/uapi/linux/bpf.h >>>> @@ -110,6 +110,12 @@ union bpf_iter_link_info { >>>> __u32 cgroup_fd; >>>> __u64 cgroup_id; >>>> } cgroup; >>>> + /* Parameters of task iterators. */ >>>> + struct { >>>> + __u32 tid; >>>> + __u32 pid; >>>> + __u32 pid_fd; >>>> + } task; >>>> }; >>>> /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ >>>> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c >>>> index 8c921799def4..0bc7277d1ee1 100644 >>>> --- a/kernel/bpf/task_iter.c >>>> +++ b/kernel/bpf/task_iter.c >>>> @@ -12,6 +12,9 @@ >>>> struct bpf_iter_seq_task_common { >>>> struct pid_namespace *ns; >>>> + enum bpf_iter_task_type type; >>>> + u32 pid; >>>> + u32 pid_visiting; >>>> }; >>>> struct bpf_iter_seq_task_info { >>>> @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { >>>> u32 tid; >>>> }; >>>> -static struct task_struct *task_seq_get_next(struct pid_namespace >>>> *ns, >>>> +static struct task_struct *task_group_seq_get_next(struct >>>> bpf_iter_seq_task_common *common, >>>> + u32 *tid, >>>> + bool >>>> skip_if_dup_files) >>>> +{ >>>> + struct task_struct *task, *next_task; >>>> + struct pid *pid; >>>> + u32 saved_tid; >>>> + >>>> + if (!*tid) { >>> >>> Add a comment in the above to say that this is for the *very first* >>> visit of tasks in the process. >>> >>>> + pid = find_pid_ns(common->pid, common->ns); >>>> + if (pid) >>>> + task = get_pid_task(pid, PIDTYPE_TGID); >>> >>> 'task' is not initialized, so it is possible task could hold a >>> garbase value here if !pid, right? >>> >>> Also if indeed task is NULL, here, should we return NULL here >>> first? >> >> yes, it should return earlier. >> >>> >>>> + >>>> + *tid = common->pid; >>>> + common->pid_visiting = common->pid; >>>> + >>>> + return task; >>>> + } >>>> + >>>> + /* The callers increase *tid by 1 once they want next task. >>>> + * However, next_thread() doesn't return tasks in >>>> incremental >>>> + * order of pids. We can not find next task by just finding >>>> a >>>> + * task whose pid is greater or equal to *tid. >>>> pid_visiting >>>> + * remembers the pid value of the task returned last time. >>>> By >>>> + * comparing pid_visiting and *tid, we known if the caller >>>> + * wants the next task. 
>>>> + */ >>>> + if (*tid == common->pid_visiting) { >>>> + pid = find_pid_ns(common->pid_visiting, common- >>>>> ns); >>>> + task = get_pid_task(pid, PIDTYPE_PID); >>>> + >>>> + return task; >>>> + } >>> >>> Do not understand the above code. Why we need it? Looks like >>> the code below trying to get the *next_task* and will return NULL >>> if wrap around happens(the tid again equals tgid), right? >> >> The above code is to handle the case that the caller want to visit the >> same task again. For example, task_file_seq_get_next() will call this >> function several time to return the same task, and move to next task by >> increasing info->tid. The above code checks the value of *tid to >> return the same task if the value doesn't change. Okay, I did a little more investigation. The above code is correct. But the comment does not clearly describe why. The real reason we need this is for subsequent iterations from task_seq_start(). Basically we have task_seq_next(); // return to user space task_seq_start() task_group_seq_get_next() In above task_group_seq_get_next() needs to retrieve the task which is the to-be-visited task during last task_seq_start/next/show/stop() run. The comment can be as simple as below. /* If the control returns to user space and comes back to * the kernel again, *tid and common->pid_visiting should * be the same for task_seq_start() to pick up the correct * task. */ > > Could you explain when task_file_seq_get_next() will call this function > several times? 
IIUC, from the following code, > > +static struct task_struct *task_seq_get_next(struct > bpf_iter_seq_task_common *common, > u32 *tid, > bool skip_if_dup_files) > { > struct task_struct *task = NULL; > struct pid *pid; > > + if (common->type == BPF_TASK_ITER_TID) { > + if (*tid && *tid != common->pid) > + return NULL; > + rcu_read_lock(); > + pid = find_pid_ns(common->pid, common->ns); > + if (pid) { > + task = get_pid_task(pid, PIDTYPE_TGID); > + *tid = common->pid; > + } > + rcu_read_unlock(); > + > + return task; > + } > + > + if (common->type == BPF_TASK_ITER_TGID) { > + rcu_read_lock(); > + task = task_group_seq_get_next(common, tid, skip_if_dup_files); > + rcu_read_unlock(); > + > + return task; > + } > + > > task_group_seq_get_next() is only called once per task_seq_get_next() > for BPF_TASK_ITER_TGID. > Maybe I missed something? > > > [...]
On Tue, 2022-08-30 at 19:37 -0700, Yonghong Song wrote: > > > On 8/30/22 5:35 PM, Kui-Feng Lee wrote: > > On Tue, 2022-08-30 at 16:54 -0700, Yonghong Song wrote: > > > > > > > > > On 8/29/22 12:23 PM, Kui-Feng Lee wrote: > > > > Allow creating an iterator that loops through resources of one > > > > thread/process. > > > > > > > > People could only create iterators to loop through all > > > > resources of > > > > files, vma, and tasks in the system, even though they were > > > > interested > > > > in only the resources of a specific task or process. Passing > > > > the > > > > additional parameters, people can now create an iterator to go > > > > through all resources or only the resources of a task. > > > > > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > > > Acked-by: Yonghong Song <yhs@fb.com> > > > > --- > > > > include/linux/bpf.h | 25 +++++ > > > > include/uapi/linux/bpf.h | 6 ++ > > > > kernel/bpf/task_iter.c | 184 > > > > +++++++++++++++++++++++++++++---- > > > > tools/include/uapi/linux/bpf.h | 6 ++ > > > > 4 files changed, 199 insertions(+), 22 deletions(-) > > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > > > index 9c1674973e03..31ac2c1181f5 100644 > > > > --- a/include/linux/bpf.h > > > > +++ b/include/linux/bpf.h > > > > @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user > > > > *pathname, int flags); > > > > extern int bpf_iter_ ## > > > > target(args); \ > > > > int __init bpf_iter_ ## target(args) { return 0; } > > > > > > > > +/* > > > > + * The task type of iterators. > > > > + * > > > > + * For BPF task iterators, they can be parameterized with > > > > various > > > > + * parameters to visit only some of tasks. > > > > + * > > > > + * BPF_TASK_ITER_ALL (default) > > > > + * Iterate over resources of every task. > > > > + * > > > > + * BPF_TASK_ITER_TID > > > > + * Iterate over resources of a task/tid. 
> > > > + * > > > > + * BPF_TASK_ITER_TGID > > > > + * Iterate over resources of every task of a process / > > > > task > > > > group. > > > > + */ > > > > +enum bpf_iter_task_type { > > > > + BPF_TASK_ITER_ALL = 0, > > > > + BPF_TASK_ITER_TID, > > > > + BPF_TASK_ITER_TGID, > > > > +}; > > > > + > > > > struct bpf_iter_aux_info { > > > > /* for map_elem iter */ > > > > struct bpf_map *map; > > > > @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { > > > > struct cgroup *start; /* starting cgroup */ > > > > enum bpf_cgroup_iter_order order; > > > > } cgroup; > > > > + struct { > > > > + enum bpf_iter_task_type type; > > > > + u32 pid; > > > > + } task; > > > > }; > > > > > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog > > > > *prog, > > > > diff --git a/include/uapi/linux/bpf.h > > > > b/include/uapi/linux/bpf.h > > > > index 962960a98835..f212a19eda06 100644 > > > > --- a/include/uapi/linux/bpf.h > > > > +++ b/include/uapi/linux/bpf.h > > > > @@ -110,6 +110,12 @@ union bpf_iter_link_info { > > > > __u32 cgroup_fd; > > > > __u64 cgroup_id; > > > > } cgroup; > > > > + /* Parameters of task iterators. */ > > > > + struct { > > > > + __u32 tid; > > > > + __u32 pid; > > > > + __u32 pid_fd; > > > > + } task; > > > > }; > > > > > > > > /* BPF syscall commands, see bpf(2) man-page for more > > > > details. 
*/ > > > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > > > index 8c921799def4..0bc7277d1ee1 100644 > > > > --- a/kernel/bpf/task_iter.c > > > > +++ b/kernel/bpf/task_iter.c > > > > @@ -12,6 +12,9 @@ > > > > > > > > struct bpf_iter_seq_task_common { > > > > struct pid_namespace *ns; > > > > + enum bpf_iter_task_type type; > > > > + u32 pid; > > > > + u32 pid_visiting; > > > > }; > > > > > > > > struct bpf_iter_seq_task_info { > > > > @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { > > > > u32 tid; > > > > }; > > > > > > > > -static struct task_struct *task_seq_get_next(struct > > > > pid_namespace > > > > *ns, > > > > +static struct task_struct *task_group_seq_get_next(struct > > > > bpf_iter_seq_task_common *common, > > > > + u32 *tid, > > > > + bool > > > > skip_if_dup_files) > > > > +{ > > > > + struct task_struct *task, *next_task; > > > > + struct pid *pid; > > > > + u32 saved_tid; > > > > + > > > > + if (!*tid) { > > > > > > Add a comment in the above to say that this is for the *very > > > first* > > > visit of tasks in the process. > > > > > > > + pid = find_pid_ns(common->pid, common->ns); > > > > + if (pid) > > > > + task = get_pid_task(pid, PIDTYPE_TGID); > > > > > > 'task' is not initialized, so it is possible task could hold a > > > garbase value here if !pid, right? > > > > > > Also if indeed task is NULL, here, should we return NULL here > > > first? > > > > yes, it should return earlier. > > > > > > > > > + > > > > + *tid = common->pid; > > > > + common->pid_visiting = common->pid; > > > > + > > > > + return task; > > > > + } > > > > + > > > > + /* The callers increase *tid by 1 once they want next > > > > task. > > > > + * However, next_thread() doesn't return tasks in > > > > incremental > > > > + * order of pids. We can not find next task by just > > > > finding > > > > a > > > > + * task whose pid is greater or equal to *tid. > > > > pid_visiting > > > > + * remembers the pid value of the task returned last > > > > time. 
> > > > By > > > > + * comparing pid_visiting and *tid, we known if the > > > > caller > > > > + * wants the next task. > > > > + */ > > > > + if (*tid == common->pid_visiting) { > > > > + pid = find_pid_ns(common->pid_visiting, common- > > > > > ns); > > > > + task = get_pid_task(pid, PIDTYPE_PID); > > > > + > > > > + return task; > > > > + } > > > > > > Do not understand the above code. Why we need it? Looks like > > > the code below trying to get the *next_task* and will return NULL > > > if wrap around happens(the tid again equals tgid), right? > > > > The above code is to handle the case that the caller want to visit > > the > > same task again. For example, task_file_seq_get_next() will call > > this > > function several time to return the same task, and move to next > > task by > > increasing info->tid. The above code checks the value of *tid to > > return the same task if the value doesn't change. > > Could you explain when task_file_seq_get_next() will call this > function > several times? IIUC, from the following code, Let's assume that a userspace program reads from a BPF file iterator several times instead of reading the whole content at once. bpf_seq_read() will be called several times. Every time returning from bpf_seq_read except the last one, info->tid will be the tid of the latest visited task. bpf_seq_read() will call seq->op->start() (a.k.a. task_file_seq_start() in this case). However, task_file_seq_start() will reset info->task, and call task_file_seq_get_next() to return the task whose tid is given by *tid. Since info->task is NULL and *tid is not 0, task_file_seq_get_next() will call task_seq_get_next() to retrieve the same task again. Does that make sense? 
> > +static struct task_struct *task_seq_get_next(struct > bpf_iter_seq_task_common *common, > u32 *tid, > bool skip_if_dup_files) > { > struct task_struct *task = NULL; > struct pid *pid; > > + if (common->type == BPF_TASK_ITER_TID) { > + if (*tid && *tid != common->pid) > + return NULL; > + rcu_read_lock(); > + pid = find_pid_ns(common->pid, common->ns); > + if (pid) { > + task = get_pid_task(pid, PIDTYPE_TGID); > + *tid = common->pid; > + } > + rcu_read_unlock(); > + > + return task; > + } > + > + if (common->type == BPF_TASK_ITER_TGID) { > + rcu_read_lock(); > + task = task_group_seq_get_next(common, tid, > skip_if_dup_files); > + rcu_read_unlock(); > + > + return task; > + } > + > > task_group_seq_get_next() is only called once per task_seq_get_next() > for BPF_TASK_ITER_TGID. > Maybe I missed something? > > > > > > > > > > > > + > > > > +retry: > > > > + pid = find_pid_ns(common->pid_visiting, common->ns); > > > > + if (!pid) > > > > + return NULL; > > > > + > > > > + task = get_pid_task(pid, PIDTYPE_PID); > > > > + if (!task) > > > > + return NULL; > > > > + > > > > + next_task = next_thread(task); > > > > + put_task_struct(task); > > > > + if (!next_task) > > > > + return NULL; > > > > + > > > > + saved_tid = *tid; > > > > + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common- > > > > > ns); > > > > + if (*tid == common->pid) { > > > > + /* Run out of tasks of a process. The tasks of > > > > a > > > > + * thread_group are linked as circular linked > > > > list. > > > > + */ > > > > + *tid = saved_tid; > > > > + return NULL; > > > > + } > > > > + > > > > + get_task_struct(next_task); > > > > + common->pid_visiting = *tid; > > > > > > We could do quite some redundant works here if the following > > > condition is true. Basically, we get next_task and get a tid > > > and release it, but in the next iteration, from tid, we try to > > > get > > > the task again. > > > > Yes, I will move 'retry' and move next_task to task to avoid the > > redundant work. 
> > > > > > > > > + > > > > + if (skip_if_dup_files && task->files == task- > > > > >group_leader- > > > > > files) > > > > + goto retry; > > > > + > > > > + return next_task; > > > > +} > > > > + > > > > +static struct task_struct *task_seq_get_next(struct > > > > bpf_iter_seq_task_common *common, > > > > u32 *tid, > > > > bool > > > > skip_if_dup_files) > > > > { > > > > struct task_struct *task = NULL; > > > > struct pid *pid; > > > > > > > > + if (common->type == BPF_TASK_ITER_TID) { > > > > + if (*tid && *tid != common->pid) > > > > + return NULL; > > > > + rcu_read_lock(); > > > > + pid = find_pid_ns(common->pid, common->ns); > > > > + if (pid) { > > > > + task = get_pid_task(pid, PIDTYPE_TGID); > > > > + *tid = common->pid; > > > > + } > > > > + rcu_read_unlock(); > > > > + > > > > + return task; > > > > + } > > > > + > > > > + if (common->type == BPF_TASK_ITER_TGID) { > > > > + rcu_read_lock(); > > > > + task = task_group_seq_get_next(common, tid, > > > > skip_if_dup_files); > > > > + rcu_read_unlock(); > > > > + > > > > + return task; > > > > + } > > > > + > > > > rcu_read_lock(); > > > > retry: > > > > - pid = find_ge_pid(*tid, ns); > > > > + pid = find_ge_pid(*tid, common->ns); > > > > if (pid) { > > > > - *tid = pid_nr_ns(pid, ns); > > > > + *tid = pid_nr_ns(pid, common->ns); > > > > task = get_pid_task(pid, PIDTYPE_PID); > > > > if (!task) { > > > > ++*tid; > > > > @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file > > > > *seq, loff_t *pos) > > > > struct bpf_iter_seq_task_info *info = seq->private; > > > > struct task_struct *task; > > > > > > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > > > false); > > > > + task = task_seq_get_next(&info->common, &info->tid, > > > > false); > > > > if (!task) > > > > return NULL; > > > > > > > > @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file > > > > *seq, > > > > void *v, loff_t *pos) > > > > ++*pos; > > > > ++info->tid; > > > > put_task_struct((struct task_struct *)v); 
> > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > > > false); > > > > + task = task_seq_get_next(&info->common, &info->tid, > > > > false); > > > > if (!task) > > > > return NULL; > > > > > > > > @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file > > > > *seq, void *v) > > > > put_task_struct((struct task_struct *)v); > > > > } > > > > > > > [...] > >
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c1674973e03..31ac2c1181f5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1730,6 +1730,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1739,6 +1760,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 962960a98835..f212a19eda06 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. 
*/ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..0bc7277d1ee1 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -12,6 +12,9 @@ struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + u32 pid; + u32 pid_visiting; }; struct bpf_iter_seq_task_info { @@ -22,18 +25,107 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common, + u32 *tid, + bool skip_if_dup_files) +{ + struct task_struct *task, *next_task; + struct pid *pid; + u32 saved_tid; + + if (!*tid) { + pid = find_pid_ns(common->pid, common->ns); + if (pid) + task = get_pid_task(pid, PIDTYPE_TGID); + + *tid = common->pid; + common->pid_visiting = common->pid; + + return task; + } + + /* The callers increase *tid by 1 once they want next task. + * However, next_thread() doesn't return tasks in incremental + * order of pids. We can not find next task by just finding a + * task whose pid is greater or equal to *tid. pid_visiting + * remembers the pid value of the task returned last time. By + * comparing pid_visiting and *tid, we known if the caller + * wants the next task. + */ + if (*tid == common->pid_visiting) { + pid = find_pid_ns(common->pid_visiting, common->ns); + task = get_pid_task(pid, PIDTYPE_PID); + + return task; + } + +retry: + pid = find_pid_ns(common->pid_visiting, common->ns); + if (!pid) + return NULL; + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return NULL; + + next_task = next_thread(task); + put_task_struct(task); + if (!next_task) + return NULL; + + saved_tid = *tid; + *tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns); + if (*tid == common->pid) { + /* Run out of tasks of a process. The tasks of a + * thread_group are linked as circular linked list. 
+ */ + *tid = saved_tid; + return NULL; + } + + get_task_struct(next_task); + common->pid_visiting = *tid; + + if (skip_if_dup_files && task->files == task->group_leader->files) + goto retry; + + return next_task; +} + +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->pid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->pid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_TGID); + *tid = common->pid; + } + rcu_read_unlock(); + + return task; + } + + if (common->type == BPF_TASK_ITER_TGID) { + rcu_read_lock(); + task = task_group_seq_get_next(common, tid, skip_if_dup_files); + rcu_read_unlock(); + + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); if (!task) { ++*tid; @@ -56,7 +148,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -73,7 +165,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +209,45 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid_namespace *ns; + struct 
pid *pid; + pid_t tgid; + + if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1) + return -EINVAL; + + aux->task.type = BPF_TASK_ITER_ALL; + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.pid = linfo->task.tid; + } + if (linfo->task.pid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.pid = linfo->task.pid; + } + if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + ns = task_active_pid_ns(current); + if (IS_ERR(ns)) + return PTR_ERR(ns); + + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + tgid = pid_nr_ns(pid, ns); + aux->task.pid = tgid; + put_pid(pid); + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +268,7 @@ struct bpf_iter_seq_task_file_info { static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +281,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +313,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; 
info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +402,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + common->pid = aux->task.pid; + return 0; } @@ -307,11 +443,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. @@ -371,14 +506,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -430,9 +564,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -533,6 +670,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -551,6 +689,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + 
.attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -571,6 +710,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4ba82a1eace..40935278eede 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */