Message ID | 20220811001654.1316689-2-kuifeng@fb.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Parameterize task iterators. | expand |
On 8/10/22 5:16 PM, Kui-Feng Lee wrote: > Allow creating an iterator that loops through resources of one task/thread. > > People could only create iterators to loop through all resources of > files, vma, and tasks in the system, even though they were interested > in only the resources of a specific task or process. Passing the > additional parameters, people can now create an iterator to go > through all resources or only the resources of a task. > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > --- > include/linux/bpf.h | 29 ++++++++ > include/uapi/linux/bpf.h | 8 +++ > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- > tools/include/uapi/linux/bpf.h | 8 +++ > 4 files changed, 147 insertions(+), 24 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 11950029284f..6bbe53d06faa 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags); > extern int bpf_iter_ ## target(args); \ > int __init bpf_iter_ ## target(args) { return 0; } > > +/* > + * The task type of iterators. > + * > + * For BPF task iterators, they can be parameterized with various > + * parameters to visit only some of tasks. > + * > + * BPF_TASK_ITER_ALL (default) > + * Iterate over resources of every task. > + * > + * BPF_TASK_ITER_TID > + * Iterate over resources of a task/tid. > + * > + * BPF_TASK_ITER_TGID > + * Iterate over reosurces of evevry task of a process / task group. > + */ > +enum bpf_iter_task_type { > + BPF_TASK_ITER_ALL = 0, > + BPF_TASK_ITER_TID, > + BPF_TASK_ITER_TGID, > +}; > + > struct bpf_iter_aux_info { > struct bpf_map *map; > + struct { > + enum bpf_iter_task_type type; > + union { > + u32 tid; > + u32 tgid; > + u32 pid_fd; > + }; > + } task; > }; > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index ffcbf79a556b..6328aca0cf5c 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > struct { > __u32 map_fd; > } map; > + /* > + * Parameters of task iterators. > + */ The comment can be put into one line. > + struct { > + __u32 tid; > + __u32 tgid; > + __u32 pid_fd; The above is a max of kernel and user space terminologies. tid/pid are user space concept and tgid is a kernel space concept. In bpf uapi header, we have struct bpf_pidns_info { __u32 pid; __u32 tgid; }; which uses kernel terminologies. So I suggest the bpf_iter_link_info.task can also use pure kernel terminology pid/tgid/tgid_fd here. Alternative, using pure user space terminology can be tid/pid/pid_fd but seems the kernel terminology might be better since we already have precedence. > + } task; > }; > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > index 8c921799def4..f2e21efe075d 100644 > --- a/kernel/bpf/task_iter.c > +++ b/kernel/bpf/task_iter.c > @@ -12,6 +12,12 @@ > > struct bpf_iter_seq_task_common { > struct pid_namespace *ns; > + enum bpf_iter_task_type type; > + union { > + u32 tid; > + u32 tgid; > + u32 pid_fd; > + }; > }; > > struct bpf_iter_seq_task_info { > @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info { > u32 tid; > }; > > -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, > +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, > u32 *tid, > bool skip_if_dup_files) > { > struct task_struct *task = NULL; > struct pid *pid; > > + if (common->type == BPF_TASK_ITER_TID) { > + if (*tid && *tid != common->tid) > + return NULL; > + rcu_read_lock(); > + pid = find_pid_ns(common->tid, common->ns); > + if (pid) { > + task = get_pid_task(pid, PIDTYPE_PID); > + *tid = common->tid; > + } > + rcu_read_unlock(); > + return task; > + } > + > rcu_read_lock(); > retry: > - pid = find_ge_pid(*tid, ns); > + pid = find_ge_pid(*tid, common->ns); > if (pid) { > - *tid = pid_nr_ns(pid, ns); > + *tid = pid_nr_ns(pid, common->ns); > task = get_pid_task(pid, PIDTYPE_PID); > + This extra line is unnecessary. > if (!task) { > ++*tid; > goto retry; > - } else if (skip_if_dup_files && !thread_group_leader(task) && > - task->files == task->group_leader->files) { > + } else if ((skip_if_dup_files && !thread_group_leader(task) && > + task->files == task->group_leader->files) || > + (common->type == BPF_TASK_ITER_TGID && > + __task_pid_nr_ns(task, PIDTYPE_TGID, common->ns) != common->tgid)) { > put_task_struct(task); > task = NULL; > ++*tid; > @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) > struct bpf_iter_seq_task_info *info = seq->private; > struct task_struct *task; > > - task = task_seq_get_next(info->common.ns, &info->tid, false); > + task = task_seq_get_next(&info->common, &info->tid, false); > + Extra line? > if (!task) > return NULL; > > @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) > ++*pos; > ++info->tid; > put_task_struct((struct task_struct *)v); > - task = task_seq_get_next(info->common.ns, &info->tid, false); > + Extra line? > + task = task_seq_get_next(&info->common, &info->tid, false); > if (!task) > return NULL; > > @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file *seq, void *v) > put_task_struct((struct task_struct *)v); > } > > +static int bpf_iter_attach_task(struct bpf_prog *prog, > + union bpf_iter_link_info *linfo, > + struct bpf_iter_aux_info *aux) > +{ > + unsigned int flags; > + struct pid_namespace *ns; > + struct pid *pid; > + pid_t tgid; Follow reverse chrismas tree style? > + > + if (linfo->task.tid != 0) { > + aux->task.type = BPF_TASK_ITER_TID; > + aux->task.tid = linfo->task.tid; > + } else if (linfo->task.tgid != 0) { > + aux->task.type = BPF_TASK_ITER_TGID; > + aux->task.tgid = linfo->task.tgid; > + } else if (linfo->task.pid_fd != 0) { > + aux->task.type = BPF_TASK_ITER_TGID; > + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); > + if (IS_ERR(pid)) > + return PTR_ERR(pid); > + > + ns = task_active_pid_ns(current); > + if (IS_ERR(ns)) > + return PTR_ERR(ns); > + > + tgid = pid_nr_ns(pid, ns); > + if (tgid <= 0) > + return -EINVAL; Is it possible that tgid <= 0? I think no, so the above two lines are unnecessary. > + > + aux->task.tgid = tgid; We leaks the reference count for 'pid' here. We need to add put_pid(pid); to release the reference for pid. > + } else { > + aux->task.type = BPF_TASK_ITER_ALL; > + } What will happen if two or all of task.tid, task.tgid and task.pid_fd non-zero? Should we fail here? > + > + return 0; > +} > + > static const struct seq_operations task_seq_ops = { > .start = task_seq_start, > .next = task_seq_next, > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info { > static struct file * [...] > > @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op { > static struct vm_area_struct * > task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > { > - struct pid_namespace *ns = info->common.ns; > enum bpf_task_vma_iter_find_op op; > struct vm_area_struct *curr_vma; > struct task_struct *curr_task; > - u32 curr_tid = info->tid; > + u32 saved_tid = info->tid; > > /* If this function returns a non-NULL vma, it holds a reference to > * the task_struct, and holds read lock on vma->mm->mmap_lock. > @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > } > } else { > again: > - curr_task = task_seq_get_next(ns, &curr_tid, true); > + curr_task = task_seq_get_next(&info->common, &info->tid, true); > if (!curr_task) { > - info->tid = curr_tid + 1; > + info->tid++; > goto finish; > } > > - if (curr_tid != info->tid) { > - info->tid = curr_tid; > + if (saved_tid != info->tid) { > /* new task, process the first vma */ > op = task_vma_iter_first_vma; > } else { > @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > return curr_vma; > > next_task: > + if (info->common.type == BPF_TASK_ITER_TID) > + goto finish; > + > put_task_struct(curr_task); > info->task = NULL; > - curr_tid++; > + info->tid++; saved_tid = ++info->tid? > goto again; > > finish: [...]
On Wed, Aug 10, 2022 at 05:16:52PM -0700, Kui-Feng Lee wrote: SNIP > +static int bpf_iter_attach_task(struct bpf_prog *prog, > + union bpf_iter_link_info *linfo, > + struct bpf_iter_aux_info *aux) > +{ > + unsigned int flags; > + struct pid_namespace *ns; > + struct pid *pid; > + pid_t tgid; > + > + if (linfo->task.tid != 0) { > + aux->task.type = BPF_TASK_ITER_TID; > + aux->task.tid = linfo->task.tid; > + } else if (linfo->task.tgid != 0) { > + aux->task.type = BPF_TASK_ITER_TGID; > + aux->task.tgid = linfo->task.tgid; > + } else if (linfo->task.pid_fd != 0) { > + aux->task.type = BPF_TASK_ITER_TGID; > + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); > + if (IS_ERR(pid)) > + return PTR_ERR(pid); > + > + ns = task_active_pid_ns(current); > + if (IS_ERR(ns)) > + return PTR_ERR(ns); > + > + tgid = pid_nr_ns(pid, ns); > + if (tgid <= 0) > + return -EINVAL; > + > + aux->task.tgid = tgid; > + } else { > + aux->task.type = BPF_TASK_ITER_ALL; > + } > + > + return 0; > +} > + > static const struct seq_operations task_seq_ops = { > .start = task_seq_start, > .next = task_seq_next, > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info { > static struct file * > task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > { > - struct pid_namespace *ns = info->common.ns; > - u32 curr_tid = info->tid; > + u32 saved_tid = info->tid; we use 'curr_' prefix for other stuff in the function, like curr_task, curr_fd .. I think we should either change all of them or actually keep curr_tid, which seem better to me jirka > struct task_struct *curr_task; > unsigned int curr_fd = info->fd; > > @@ -151,21 +211,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > curr_task = info->task; > curr_fd = info->fd; > } else { > - curr_task = task_seq_get_next(ns, &curr_tid, true); > + curr_task = task_seq_get_next(&info->common, &info->tid, true); > if (!curr_task) { > info->task = NULL; > - info->tid = curr_tid; > return NULL; > } > > - /* set info->task and info->tid */ > + /* set info->task */ > info->task = curr_task; > - if (curr_tid == info->tid) { > + if (saved_tid == info->tid) > curr_fd = info->fd; > - } else { > - info->tid = curr_tid; > + else > curr_fd = 0; > - } > } > SNIP
On Sat, Aug 13, 2022 at 3:17 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 8/10/22 5:16 PM, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 29 ++++++++ > > include/uapi/linux/bpf.h | 8 +++ > > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- > > tools/include/uapi/linux/bpf.h | 8 +++ > > 4 files changed, 147 insertions(+), 24 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 11950029284f..6bbe53d06faa 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags); > > extern int bpf_iter_ ## target(args); \ > > int __init bpf_iter_ ## target(args) { return 0; } > > > > +/* > > + * The task type of iterators. > > + * > > + * For BPF task iterators, they can be parameterized with various > > + * parameters to visit only some of tasks. > > + * > > + * BPF_TASK_ITER_ALL (default) > > + * Iterate over resources of every task. > > + * > > + * BPF_TASK_ITER_TID > > + * Iterate over resources of a task/tid. > > + * > > + * BPF_TASK_ITER_TGID > > + * Iterate over reosurces of evevry task of a process / task group. > > + */ > > +enum bpf_iter_task_type { > > + BPF_TASK_ITER_ALL = 0, > > + BPF_TASK_ITER_TID, > > + BPF_TASK_ITER_TGID, > > +}; > > + > > struct bpf_iter_aux_info { > > struct bpf_map *map; > > + struct { > > + enum bpf_iter_task_type type; > > + union { > > + u32 tid; > > + u32 tgid; > > + u32 pid_fd; > > + }; > > + } task; > > }; > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index ffcbf79a556b..6328aca0cf5c 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > > struct { > > __u32 map_fd; > > } map; > > + /* > > + * Parameters of task iterators. > > + */ > > The comment can be put into one line. > > > + struct { > > + __u32 tid; > > + __u32 tgid; > > + __u32 pid_fd; > > The above is a max of kernel and user space terminologies. > tid/pid are user space concept and tgid is a kernel space > concept. > > In bpf uapi header, we have > > struct bpf_pidns_info { > __u32 pid; > __u32 tgid; > }; > > which uses kernel terminologies. > > So I suggest the bpf_iter_link_info.task can also > use pure kernel terminology pid/tgid/tgid_fd here. > > Alternative, using pure user space terminology > can be tid/pid/pid_fd but seems the kernel terminology > might be better since we already have precedence. Great catch and excellent point!
Hi Kui-Feng, On Wed, Aug 10, 2022 at 5:17 PM Kui-Feng Lee <kuifeng@fb.com> wrote: > > Allow creating an iterator that loops through resources of one task/thread. > > People could only create iterators to loop through all resources of > files, vma, and tasks in the system, even though they were interested > in only the resources of a specific task or process. Passing the > additional parameters, people can now create an iterator to go > through all resources or only the resources of a task. > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > --- > include/linux/bpf.h | 29 ++++++++ > include/uapi/linux/bpf.h | 8 +++ > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- > tools/include/uapi/linux/bpf.h | 8 +++ > 4 files changed, 147 insertions(+), 24 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 11950029284f..6bbe53d06faa 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags); > extern int bpf_iter_ ## target(args); \ > int __init bpf_iter_ ## target(args) { return 0; } > > +/* > + * The task type of iterators. > + * > + * For BPF task iterators, they can be parameterized with various > + * parameters to visit only some of tasks. > + * > + * BPF_TASK_ITER_ALL (default) > + * Iterate over resources of every task. > + * > + * BPF_TASK_ITER_TID > + * Iterate over resources of a task/tid. > + * > + * BPF_TASK_ITER_TGID > + * Iterate over reosurces of evevry task of a process / task group. typos: resources and every. > + */ > +enum bpf_iter_task_type { > + BPF_TASK_ITER_ALL = 0, > + BPF_TASK_ITER_TID, > + BPF_TASK_ITER_TGID, > +}; > + > struct bpf_iter_aux_info { > struct bpf_map *map; > + struct { > + enum bpf_iter_task_type type; > + union { > + u32 tid; > + u32 tgid; > + u32 pid_fd; > + }; > + } task; > }; > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index ffcbf79a556b..6328aca0cf5c 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > struct { > __u32 map_fd; > } map; > + /* > + * Parameters of task iterators. > + */ We could remove this particular comment. It is kind of obvious. > + struct { > + __u32 tid; > + __u32 tgid; > + __u32 pid_fd; > + } task; > }; > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > index 8c921799def4..f2e21efe075d 100644 > --- a/kernel/bpf/task_iter.c > +++ b/kernel/bpf/task_iter.c > @@ -12,6 +12,12 @@ > > struct bpf_iter_seq_task_common { > struct pid_namespace *ns; > + enum bpf_iter_task_type type; > + union { > + u32 tid; > + u32 tgid; > + u32 pid_fd; > + }; > }; > > struct bpf_iter_seq_task_info { > @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info { > u32 tid; > }; > > -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, > +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, > u32 *tid, > bool skip_if_dup_files) > { > struct task_struct *task = NULL; > struct pid *pid; > > + if (common->type == BPF_TASK_ITER_TID) { > + if (*tid && *tid != common->tid) > + return NULL; > + rcu_read_lock(); > + pid = find_pid_ns(common->tid, common->ns); > + if (pid) { > + task = get_pid_task(pid, PIDTYPE_PID); > + *tid = common->tid; > + } > + rcu_read_unlock(); nit: this is ok. But I think the commonly used pattern (e.g. proc_pid_lookup) is rcu_read_lock(); task = find_task_by_pid_ns(tid, ns); if (task) get_task_struct(task); rcu_read_unlock(); > + return task; > + } > + > rcu_read_lock(); > retry: > - pid = find_ge_pid(*tid, ns); > + pid = find_ge_pid(*tid, common->ns); > if (pid) { > - *tid = pid_nr_ns(pid, ns); > + *tid = pid_nr_ns(pid, common->ns); > task = get_pid_task(pid, PIDTYPE_PID); > + > if (!task) { > ++*tid; > goto retry; > - } else if (skip_if_dup_files && !thread_group_leader(task) && > - task->files == task->group_leader->files) { > + } else if ((skip_if_dup_files && !thread_group_leader(task) && > + task->files == task->group_leader->files) || > + (common->type == BPF_TASK_ITER_TGID && > + __task_pid_nr_ns(task, PIDTYPE_TGID, common->ns) != common->tgid)) { Use task_tgid_nr_ns instead of __task_pid_nr_ns? > put_task_struct(task); > task = NULL; > ++*tid; > @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) > struct bpf_iter_seq_task_info *info = seq->private; > struct task_struct *task; > > - task = task_seq_get_next(info->common.ns, &info->tid, false); > + task = task_seq_get_next(&info->common, &info->tid, false); > + > if (!task) > return NULL; > > @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) > ++*pos; > ++info->tid; > put_task_struct((struct task_struct *)v); > - task = task_seq_get_next(info->common.ns, &info->tid, false); > + > + task = task_seq_get_next(&info->common, &info->tid, false); > if (!task) > return NULL; > > @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file *seq, void *v) > put_task_struct((struct task_struct *)v); > } > > +static int bpf_iter_attach_task(struct bpf_prog *prog, > + union bpf_iter_link_info *linfo, > + struct bpf_iter_aux_info *aux) > +{ > + unsigned int flags; > + struct pid_namespace *ns; > + struct pid *pid; > + pid_t tgid; > + > + if (linfo->task.tid != 0) { > + aux->task.type = BPF_TASK_ITER_TID; > + aux->task.tid = linfo->task.tid; > + } else if (linfo->task.tgid != 0) { > + aux->task.type = BPF_TASK_ITER_TGID; > + aux->task.tgid = linfo->task.tgid; > + } else if (linfo->task.pid_fd != 0) { > + aux->task.type = BPF_TASK_ITER_TGID; > + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); > + if (IS_ERR(pid)) > + return PTR_ERR(pid); > + > + ns = task_active_pid_ns(current); > + if (IS_ERR(ns)) > + return PTR_ERR(ns); > + > + tgid = pid_nr_ns(pid, ns); > + if (tgid <= 0) > + return -EINVAL; Is this just pid_vnr? > + > + aux->task.tgid = tgid; > + } else { > + aux->task.type = BPF_TASK_ITER_ALL; > + } The same question as Yonghong has. Do we need to enforce that at most one of {tid, tgid, pid_fd} is non-zero? > + > + return 0; > +} > + > static const struct seq_operations task_seq_ops = { > .start = task_seq_start, > .next = task_seq_next, > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info { > static struct file * > task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > { > - struct pid_namespace *ns = info->common.ns; > - u32 curr_tid = info->tid; > + u32 saved_tid = info->tid; > struct task_struct *curr_task; > unsigned int curr_fd = info->fd; > > @@ -151,21 +211,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > curr_task = info->task; > curr_fd = info->fd; > } else { > - curr_task = task_seq_get_next(ns, &curr_tid, true); > + curr_task = task_seq_get_next(&info->common, &info->tid, true); > if (!curr_task) { > info->task = NULL; > - info->tid = curr_tid; > return NULL; > } > > - /* set info->task and info->tid */ > + /* set info->task */ > info->task = curr_task; > - if (curr_tid == info->tid) { > + if (saved_tid == info->tid) > curr_fd = info->fd; > - } else { > - info->tid = curr_tid; > + else > curr_fd = 0; > - } > } > > rcu_read_lock(); > @@ -186,9 +243,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > /* the current task is done, go to the next task */ > rcu_read_unlock(); > put_task_struct(curr_task); > + > + if (info->common.type == BPF_TASK_ITER_TID) { > + info->task = NULL; > + return NULL; > + } > + Do we need to set info->fd to 0? I am not sure if the caller reads info->fd anywhere. I think it would be good to do some refactoring on task_file_seq_get_next(). > info->task = NULL; > info->fd = 0; > - curr_tid = ++(info->tid); > + saved_tid = ++(info->tid); > goto again; > } > > @@ -269,6 +332,17 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) > struct bpf_iter_seq_task_common *common = priv_data; > > common->ns = get_pid_ns(task_active_pid_ns(current)); > + common->type = aux->task.type; > + switch (common->type) { > + case BPF_TASK_ITER_TID: > + common->tid = aux->task.tid; > + break; > + case BPF_TASK_ITER_TGID: > + common->tgid = aux->task.tgid; > + break; > + default: very nit: IMHO we could place a warning here. > + break; > + } > return 0; > } > > @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op { > static struct vm_area_struct * > task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > { > - struct pid_namespace *ns = info->common.ns; > enum bpf_task_vma_iter_find_op op; > struct vm_area_struct *curr_vma; > struct task_struct *curr_task; > - u32 curr_tid = info->tid; > + u32 saved_tid = info->tid; > Why do we need to directly operate on info->tid while other task iters (e.g. vma_iter) uses curr_tid? IMHO, prefer staying using curr_tid if possible, for two reasons: - consistent with other iters. - decouple refactoring changes from the changes that introduce new features > /* If this function returns a non-NULL vma, it holds a reference to > * the task_struct, and holds read lock on vma->mm->mmap_lock. > @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > } > } else { > again: > - curr_task = task_seq_get_next(ns, &curr_tid, true); > + curr_task = task_seq_get_next(&info->common, &info->tid, true); > if (!curr_task) { > - info->tid = curr_tid + 1; > + info->tid++; > goto finish; > } > > - if (curr_tid != info->tid) { > - info->tid = curr_tid; > + if (saved_tid != info->tid) { > /* new task, process the first vma */ > op = task_vma_iter_first_vma; > } else { > @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > return curr_vma; > > next_task: > + if (info->common.type == BPF_TASK_ITER_TID) > + goto finish; > + > put_task_struct(curr_task); > info->task = NULL; > - curr_tid++; > + info->tid++; > goto again; > > finish: > @@ -533,6 +608,7 @@ static const struct bpf_iter_seq_info task_seq_info = { > > static struct bpf_iter_reg task_reg_info = { > .target = "task", > + .attach_target = bpf_iter_attach_task, > .feature = BPF_ITER_RESCHED, > .ctx_arg_info_size = 1, > .ctx_arg_info = { > @@ -551,6 +627,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { > > static struct bpf_iter_reg task_file_reg_info = { > .target = "task_file", > + .attach_target = bpf_iter_attach_task, > .feature = BPF_ITER_RESCHED, > .ctx_arg_info_size = 2, > .ctx_arg_info = { > @@ -571,6 +648,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { > > static struct bpf_iter_reg task_vma_reg_info = { > .target = "task_vma", > + .attach_target = bpf_iter_attach_task, > .feature = BPF_ITER_RESCHED, > .ctx_arg_info_size = 2, > .ctx_arg_info = { > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index ffcbf79a556b..6328aca0cf5c 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > struct { > __u32 map_fd; > } map; > + /* > + * Parameters of task iterators. > + */ > + struct { > + __u32 tid; > + __u32 tgid; > + __u32 pid_fd; > + } task; > }; > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > -- > 2.30.2 >
On Sat, Aug 13, 2022 at 3:17 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 8/10/22 5:16 PM, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 29 ++++++++ > > include/uapi/linux/bpf.h | 8 +++ > > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- > > tools/include/uapi/linux/bpf.h | 8 +++ > > 4 files changed, 147 insertions(+), 24 deletions(-) > > [...] > > + struct { > > + __u32 tid; > > + __u32 tgid; > > + __u32 pid_fd; > > The above is a max of kernel and user space terminologies. > tid/pid are user space concept and tgid is a kernel space > concept. > > In bpf uapi header, we have > > struct bpf_pidns_info { > __u32 pid; > __u32 tgid; > }; > > which uses kernel terminologies. > > So I suggest the bpf_iter_link_info.task can also > use pure kernel terminology pid/tgid/tgid_fd here. Except tgid_fd is a made up terminology. It is called pidfd in documentation and even if pidfd gains add support for threads (tasks), it would still be created through pidfd_open() syscall, probably. So it seems better to stick to "pidfd" here. As for pid/tgid precedent. Are we referring to bpf_get_current_pid_tgid() BPF helper and struct bpf_pidns_info? Those are kernel-side BPF helper and kernel-side auxiliary type to describe return results of another in-kernel helper, so I think it's a bit less relevant here to set a precedent. On the other hand, if we look at user-space-facing perf_event subsystem UAPI, for example, it seems to be using pid/tid terminology (and so does getpid()/gettid() syscall, etc). So I wonder if for a user-space-facing API it's better to stick with user-space-facing terminology? > > Alternative, using pure user space terminology > can be tid/pid/pid_fd but seems the kernel terminology > might be better since we already have precedence. > > > > + } task; > > }; > > [...]
On Wed, Aug 10, 2022 at 5:17 PM Kui-Feng Lee <kuifeng@fb.com> wrote: > > Allow creating an iterator that loops through resources of one task/thread. > > People could only create iterators to loop through all resources of > files, vma, and tasks in the system, even though they were interested > in only the resources of a specific task or process. Passing the > additional parameters, people can now create an iterator to go > through all resources or only the resources of a task. > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > --- > include/linux/bpf.h | 29 ++++++++ > include/uapi/linux/bpf.h | 8 +++ > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- > tools/include/uapi/linux/bpf.h | 8 +++ > 4 files changed, 147 insertions(+), 24 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 11950029284f..6bbe53d06faa 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags); > extern int bpf_iter_ ## target(args); \ > int __init bpf_iter_ ## target(args) { return 0; } > > +/* > + * The task type of iterators. > + * > + * For BPF task iterators, they can be parameterized with various > + * parameters to visit only some of tasks. > + * > + * BPF_TASK_ITER_ALL (default) > + * Iterate over resources of every task. > + * > + * BPF_TASK_ITER_TID > + * Iterate over resources of a task/tid. > + * > + * BPF_TASK_ITER_TGID > + * Iterate over reosurces of evevry task of a process / task group. > + */ > +enum bpf_iter_task_type { > + BPF_TASK_ITER_ALL = 0, > + BPF_TASK_ITER_TID, > + BPF_TASK_ITER_TGID, > +}; > + > struct bpf_iter_aux_info { > struct bpf_map *map; > + struct { > + enum bpf_iter_task_type type; > + union { > + u32 tid; > + u32 tgid; > + u32 pid_fd; > + }; > + } task; You don't seem to use pid_fd in bpf_iter_aux_info at all, is that right? Drop it? And for tid/tgid, I'd use kernel-side terminology for this internal data structure and just have single u32 pid here. Then type determines whether you are iterating tasks or task leaders (processes), no ambiguity. > }; > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index ffcbf79a556b..6328aca0cf5c 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > struct { > __u32 map_fd; > } map; > + /* > + * Parameters of task iterators. > + */ > + struct { > + __u32 tid; > + __u32 tgid; > + __u32 pid_fd; > + } task; > }; > [...]
On Sat, Aug 13, 2022 at 3:17 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 8/10/22 5:16 PM, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 29 ++++++++ > > include/uapi/linux/bpf.h | 8 +++ > > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- > > tools/include/uapi/linux/bpf.h | 8 +++ > > 4 files changed, 147 insertions(+), 24 deletions(-) > > Btw, Yonghong, I tried to figure it out myself, but got lost in all the kernel functions that don't seem to be very well documented. Sorry for being lazy and throwing this at you :) Is it easy and efficient to iterate only processes using whatever kernel helpers we have at our disposal? E.g., if I wanted to write an iterator that would go only over processes (not individual threads, just task leaders of each different process) within a cgroup, is that possible? I see task iterator as consisting of two different parts (and that makes it a bit hard to define nice and clean interface, but if we can crack this, we'd get an elegant and powerful construct): 1. What entity to iterate: threads or processes? (I'm ignoring task_vma and task_files here, but one could task about files of each thread or files of each process, but it's less practical, probably) 2. What's the scope of objects to iterate: just a thread by tid, just a process by pid/pidfd, once cgroup iter lands, we'll be able to talk about threads or processes within a cgroup or cgroup hierarchy (this is where descendants_{pre,post}, cgroup_self_only and ancestors ordering comes in as well). Currently Kui-Feng is addressing first half of #2 (tid/pid/pidfd parameters), we can use cgroup iter's parameters to define the scope of tasks/processes by cgroup "filter" in a follow up (it naturally extends what we have in this patch set). So now I'm wondering if there is any benefit to also somehow specifying threads vs processes as entities to iterate? And if we do that, does kernel support efficient iteration of processes (as opposed to threads). To be clear, there is a lot of value in having just #2, but while we are all at this topic, I thought I'd clarify for myself #1 as well. Thanks!
On Sat, 2022-08-13 at 15:17 -0700, Yonghong Song wrote: > > > On 8/10/22 5:16 PM, Kui-Feng Lee wrote: > > Allow creating an iterator that loops through resources of one > > task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were > > interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 29 ++++++++ > > include/uapi/linux/bpf.h | 8 +++ > > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++-- > > ----- > > tools/include/uapi/linux/bpf.h | 8 +++ > > 4 files changed, 147 insertions(+), 24 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 11950029284f..6bbe53d06faa 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user > > *pathname, int flags); > > extern int bpf_iter_ ## target(args); \ > > int __init bpf_iter_ ## target(args) { return 0; } > > > > +/* > > + * The task type of iterators. > > + * > > + * For BPF task iterators, they can be parameterized with various > > + * parameters to visit only some of tasks. > > + * > > + * BPF_TASK_ITER_ALL (default) > > + * Iterate over resources of every task. > > + * > > + * BPF_TASK_ITER_TID > > + * Iterate over resources of a task/tid. > > + * > > + * BPF_TASK_ITER_TGID > > + * Iterate over reosurces of evevry task of a process / task > > group. > > + */ > > +enum bpf_iter_task_type { > > + BPF_TASK_ITER_ALL = 0, > > + BPF_TASK_ITER_TID, > > + BPF_TASK_ITER_TGID, > > +}; > > + > > struct bpf_iter_aux_info { > > struct bpf_map *map; > > + struct { > > + enum bpf_iter_task_type type; > > + union { > > + u32 tid; > > + u32 tgid; > > + u32 pid_fd; > > + }; > > + } task; > > }; > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index ffcbf79a556b..6328aca0cf5c 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > > struct { > > __u32 map_fd; > > } map; > > + /* > > + * Parameters of task iterators. > > + */ > > The comment can be put into one line. > > > + struct { > > + __u32 tid; > > + __u32 tgid; > > + __u32 pid_fd; > > The above is a max of kernel and user space terminologies. > tid/pid are user space concept and tgid is a kernel space > concept. > > In bpf uapi header, we have > > struct bpf_pidns_info { > __u32 pid; > __u32 tgid; > }; > > which uses kernel terminologies. > > So I suggest the bpf_iter_link_info.task can also > use pure kernel terminology pid/tgid/tgid_fd here. > > Alternative, using pure user space terminology > can be tid/pid/pid_fd but seems the kernel terminology > might be better since we already have precedence. > > > > + } task; > > }; > > > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > index 8c921799def4..f2e21efe075d 100644 > > --- a/kernel/bpf/task_iter.c > > +++ b/kernel/bpf/task_iter.c > > @@ -12,6 +12,12 @@ > > > > struct bpf_iter_seq_task_common { > > struct pid_namespace *ns; > > + enum bpf_iter_task_type type; > > + union { > > + u32 tid; > > + u32 tgid; > > + u32 pid_fd; > > + }; > > }; > > > > struct bpf_iter_seq_task_info { > > @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info { > > u32 tid; > > }; > > > > -static struct task_struct *task_seq_get_next(struct pid_namespace > > *ns, > > +static struct task_struct *task_seq_get_next(struct > > bpf_iter_seq_task_common *common, > > u32 *tid, > > bool > > skip_if_dup_files) > > { > > struct task_struct *task = NULL; > > struct pid *pid; > > > > + if (common->type == BPF_TASK_ITER_TID) { > > + if (*tid && *tid != common->tid) > > + return NULL; > > + rcu_read_lock(); > > + pid = find_pid_ns(common->tid, common->ns); > > + if (pid) { > > + task = get_pid_task(pid, PIDTYPE_PID); > > + *tid = common->tid; > > + } > > + rcu_read_unlock(); > > + return task; > > + } > > + > > rcu_read_lock(); > > retry: > > - pid = find_ge_pid(*tid, ns); > > + pid = find_ge_pid(*tid, common->ns); > > if (pid) { > > - *tid = pid_nr_ns(pid, ns); > > + *tid = pid_nr_ns(pid, common->ns); > > task = get_pid_task(pid, PIDTYPE_PID); > > + > > This extra line is unnecessary. > > > if (!task) { > > ++*tid; > > goto retry; > > - } else if (skip_if_dup_files && > > !thread_group_leader(task) && > > - task->files == task->group_leader- > > >files) { > > + } else if ((skip_if_dup_files && > > !thread_group_leader(task) && > > + task->files == task->group_leader- > > >files) || > > + (common->type == BPF_TASK_ITER_TGID && > > + __task_pid_nr_ns(task, PIDTYPE_TGID, > > common->ns) != common->tgid)) { > > put_task_struct(task); > > task = NULL; > > ++*tid; > > @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, > > loff_t *pos) > > struct bpf_iter_seq_task_info *info = seq->private; > > struct task_struct *task; > > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + task = task_seq_get_next(&info->common, &info->tid, false); > > + > > Extra line? > > > if (!task) > > return NULL; > > > > @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, > > void *v, loff_t *pos) > > ++*pos; > > ++info->tid; > > put_task_struct((struct task_struct *)v); > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + > > Extra line? > > > + task = task_seq_get_next(&info->common, &info->tid, false); > > if (!task) > > return NULL; > > > > @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file > > *seq, void *v) > > put_task_struct((struct task_struct *)v); > > } > > > > +static int bpf_iter_attach_task(struct bpf_prog *prog, > > + union bpf_iter_link_info *linfo, > > + struct bpf_iter_aux_info *aux) > > +{ > > + unsigned int flags; > > + struct pid_namespace *ns; > > + struct pid *pid; > > + pid_t tgid; > > Follow reverse chrismas tree style? > > > + > > + if (linfo->task.tid != 0) { > > + aux->task.type = BPF_TASK_ITER_TID; > > + aux->task.tid = linfo->task.tid; > > + } else if (linfo->task.tgid != 0) { > > + aux->task.type = BPF_TASK_ITER_TGID; > > + aux->task.tgid = linfo->task.tgid; > > + } else if (linfo->task.pid_fd != 0) { > > + aux->task.type = BPF_TASK_ITER_TGID; > > + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); > > + if (IS_ERR(pid)) > > + return PTR_ERR(pid); > > + > > + ns = task_active_pid_ns(current); > > + if (IS_ERR(ns)) > > + return PTR_ERR(ns); > > + > > + tgid = pid_nr_ns(pid, ns); > > + if (tgid <= 0) > > + return -EINVAL; > > Is it possible that tgid <= 0? I think no, so > the above two lines are unnecessary. > > > + > > + aux->task.tgid = tgid; > > We leaks the reference count for 'pid' here. > We need to add > put_pid(pid); > to release the reference for pid. > > > + } else { > > + aux->task.type = BPF_TASK_ITER_ALL; > > + } > > What will happen if two or all of task.tid, task.tgid and > task.pid_fd non-zero? Should we fail here? > > > + > > + return 0; > > +} > > + > > static const struct seq_operations task_seq_ops = { > > .start = task_seq_start, > > .next = task_seq_next, > > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info { > > static struct file * > [...] > > > > @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op { > > static struct vm_area_struct * > > task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > > { > > - struct pid_namespace *ns = info->common.ns; > > enum bpf_task_vma_iter_find_op op; > > struct vm_area_struct *curr_vma; > > struct task_struct *curr_task; > > - u32 curr_tid = info->tid; > > + u32 saved_tid = info->tid; > > > > /* If this function returns a non-NULL vma, it holds a > > reference to > > * the task_struct, and holds read lock on vma->mm- > > >mmap_lock. > > @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct > > bpf_iter_seq_task_vma_info *info) > > } > > } else { > > again: > > - curr_task = task_seq_get_next(ns, &curr_tid, true); > > + curr_task = task_seq_get_next(&info->common, &info- > > >tid, true); > > if (!curr_task) { > > - info->tid = curr_tid + 1; > > + info->tid++; > > goto finish; > > } > > > > - if (curr_tid != info->tid) { > > - info->tid = curr_tid; > > + if (saved_tid != info->tid) { > > /* new task, process the first vma */ > > op = task_vma_iter_first_vma; > > } else { > > @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct > > bpf_iter_seq_task_vma_info *info) > > return curr_vma; > > > > next_task: > > + if (info->common.type == BPF_TASK_ITER_TID) > > + goto finish; > > + > > put_task_struct(curr_task); > > info->task = NULL; > > - curr_tid++; > > + info->tid++; > > saved_tid = ++info->tid? saved_tid is the value of info->tid when entering this funciton. It is used to check if the current visiting task is the same one entering this function. For this purpose, updating saved_tid or not will not change the result. The value of info->tid will be different from saved_tid after info->tid++ anyway, and it will show that the current visiting task is not the one when entering this function. > > > goto again; > > > > finish: > [...]
On Sun, 2022-08-14 at 22:24 +0200, Jiri Olsa wrote: > On Wed, Aug 10, 2022 at 05:16:52PM -0700, Kui-Feng Lee wrote: > > SNIP > > > +static int bpf_iter_attach_task(struct bpf_prog *prog, > > + union bpf_iter_link_info *linfo, > > + struct bpf_iter_aux_info *aux) > > +{ > > + unsigned int flags; > > + struct pid_namespace *ns; > > + struct pid *pid; > > + pid_t tgid; > > + > > + if (linfo->task.tid != 0) { > > + aux->task.type = BPF_TASK_ITER_TID; > > + aux->task.tid = linfo->task.tid; > > + } else if (linfo->task.tgid != 0) { > > + aux->task.type = BPF_TASK_ITER_TGID; > > + aux->task.tgid = linfo->task.tgid; > > + } else if (linfo->task.pid_fd != 0) { > > + aux->task.type = BPF_TASK_ITER_TGID; > > + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); > > + if (IS_ERR(pid)) > > + return PTR_ERR(pid); > > + > > + ns = task_active_pid_ns(current); > > + if (IS_ERR(ns)) > > + return PTR_ERR(ns); > > + > > + tgid = pid_nr_ns(pid, ns); > > + if (tgid <= 0) > > + return -EINVAL; > > + > > + aux->task.tgid = tgid; > > + } else { > > + aux->task.type = BPF_TASK_ITER_ALL; > > + } > > + > > + return 0; > > +} > > + > > static const struct seq_operations task_seq_ops = { > > .start = task_seq_start, > > .next = task_seq_next, > > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info { > > static struct file * > > task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > > { > > - struct pid_namespace *ns = info->common.ns; > > - u32 curr_tid = info->tid; > > + u32 saved_tid = info->tid; > > we use 'curr_' prefix for other stuff in the function, like > curr_task, curr_fd .. I think we should either change all of > them or actually keep curr_tid, which seem better to me The purpose of the variable is changed, so I think 'curr_tid' is not feasible anymore. It was the tid of the task that we are visiting. But, now, it is the tid of the task that the iterator was visiting when/before entering the function. In this patch, info->tid is always the value of the current visiting task. > > jirka > > > struct task_struct *curr_task; > > unsigned int curr_fd = info->fd; > > > > @@ -151,21 +211,18 @@ task_file_seq_get_next(struct > > bpf_iter_seq_task_file_info *info) > > curr_task = info->task; > > curr_fd = info->fd; > > } else { > > - curr_task = task_seq_get_next(ns, &curr_tid, > > true); > > + curr_task = task_seq_get_next(&info->common, &info- > > >tid, true); > > if (!curr_task) { > > info->task = NULL; > > - info->tid = curr_tid; > > return NULL; > > } > > > > - /* set info->task and info->tid */ > > + /* set info->task */ > > info->task = curr_task; > > - if (curr_tid == info->tid) { > > + if (saved_tid == info->tid) > > curr_fd = info->fd; > > - } else { > > - info->tid = curr_tid; > > + else > > curr_fd = 0; > > - } > > } > > > > SNIP
On Mon, 2022-08-15 at 22:02 -0700, Andrii Nakryiko wrote: > On Wed, Aug 10, 2022 at 5:17 PM Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > Allow creating an iterator that loops through resources of one > > task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were > > interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 29 ++++++++ > > include/uapi/linux/bpf.h | 8 +++ > > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++--- > > ---- > > tools/include/uapi/linux/bpf.h | 8 +++ > > 4 files changed, 147 insertions(+), 24 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 11950029284f..6bbe53d06faa 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user > > *pathname, int flags); > > extern int bpf_iter_ ## target(args); \ > > int __init bpf_iter_ ## target(args) { return 0; } > > > > +/* > > + * The task type of iterators. > > + * > > + * For BPF task iterators, they can be parameterized with various > > + * parameters to visit only some of tasks. > > + * > > + * BPF_TASK_ITER_ALL (default) > > + * Iterate over resources of every task. > > + * > > + * BPF_TASK_ITER_TID > > + * Iterate over resources of a task/tid. > > + * > > + * BPF_TASK_ITER_TGID > > + * Iterate over reosurces of evevry task of a process / task > > group. > > + */ > > +enum bpf_iter_task_type { > > + BPF_TASK_ITER_ALL = 0, > > + BPF_TASK_ITER_TID, > > + BPF_TASK_ITER_TGID, > > +}; > > + > > struct bpf_iter_aux_info { > > struct bpf_map *map; > > + struct { > > + enum bpf_iter_task_type type; > > + union { > > + u32 tid; > > + u32 tgid; > > + u32 pid_fd; > > + }; > > + } task; > > You don't seem to use pid_fd in bpf_iter_aux_info at all, is that > right? Drop it? And for tid/tgid, I'd use kernel-side terminology for > this internal data structure and just have single u32 pid here. Then > type determines whether you are iterating tasks or task leaders > (processes), no ambiguity. Yes, it should be removed from struct bpf_iter_aux_info. Using just single u32 pid here is also fine to distinguish field names by the type of data instead of the purpose of data. > > > }; > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index ffcbf79a556b..6328aca0cf5c 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > > struct { > > __u32 map_fd; > > } map; > > + /* > > + * Parameters of task iterators. > > + */ > > + struct { > > + __u32 tid; > > + __u32 tgid; > > + __u32 pid_fd; > > + } task; > > }; > > > > [...]
On Mon, 2022-08-15 at 16:08 -0700, Hao Luo wrote: > !-------------------------------------------------------------------| > This Message Is From an External Sender > > > ------------------------------------------------------------------- > > ! > > Hi Kui-Feng, > > On Wed, Aug 10, 2022 at 5:17 PM Kui-Feng Lee <kuifeng@fb.com> wrote: > > > > Allow creating an iterator that loops through resources of one > > task/thread. > > > > People could only create iterators to loop through all resources of > > files, vma, and tasks in the system, even though they were > > interested > > in only the resources of a specific task or process. Passing the > > additional parameters, people can now create an iterator to go > > through all resources or only the resources of a task. > > > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > > --- > > include/linux/bpf.h | 29 ++++++++ > > include/uapi/linux/bpf.h | 8 +++ > > kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++--- > > ---- > > tools/include/uapi/linux/bpf.h | 8 +++ > > 4 files changed, 147 insertions(+), 24 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index 11950029284f..6bbe53d06faa 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user > > *pathname, int flags); > > extern int bpf_iter_ ## target(args); \ > > int __init bpf_iter_ ## target(args) { return 0; } > > > > +/* > > + * The task type of iterators. > > + * > > + * For BPF task iterators, they can be parameterized with various > > + * parameters to visit only some of tasks. > > + * > > + * BPF_TASK_ITER_ALL (default) > > + * Iterate over resources of every task. > > + * > > + * BPF_TASK_ITER_TID > > + * Iterate over resources of a task/tid. > > + * > > + * BPF_TASK_ITER_TGID > > + * Iterate over reosurces of evevry task of a process / task > > group. > > typos: resources and every. > > > + */ > > +enum bpf_iter_task_type { > > + BPF_TASK_ITER_ALL = 0, > > + BPF_TASK_ITER_TID, > > + BPF_TASK_ITER_TGID, > > +}; > > + > > struct bpf_iter_aux_info { > > struct bpf_map *map; > > + struct { > > + enum bpf_iter_task_type type; > > + union { > > + u32 tid; > > + u32 tgid; > > + u32 pid_fd; > > + }; > > + } task; > > }; > > > > typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index ffcbf79a556b..6328aca0cf5c 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > > struct { > > __u32 map_fd; > > } map; > > + /* > > + * Parameters of task iterators. > > + */ > > We could remove this particular comment. It is kind of obvious. > > > + struct { > > + __u32 tid; > > + __u32 tgid; > > + __u32 pid_fd; > > + } task; > > }; > > > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c > > index 8c921799def4..f2e21efe075d 100644 > > --- a/kernel/bpf/task_iter.c > > +++ b/kernel/bpf/task_iter.c > > @@ -12,6 +12,12 @@ > > > > struct bpf_iter_seq_task_common { > > struct pid_namespace *ns; > > + enum bpf_iter_task_type type; > > + union { > > + u32 tid; > > + u32 tgid; > > + u32 pid_fd; > > + }; > > }; > > > > struct bpf_iter_seq_task_info { > > @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info { > > u32 tid; > > }; > > > > -static struct task_struct *task_seq_get_next(struct pid_namespace > > *ns, > > +static struct task_struct *task_seq_get_next(struct > > bpf_iter_seq_task_common *common, > > u32 *tid, > > bool > > skip_if_dup_files) > > { > > struct task_struct *task = NULL; > > struct pid *pid; > > > > + if (common->type == BPF_TASK_ITER_TID) { > > + if (*tid && *tid != common->tid) > > + return NULL; > > + rcu_read_lock(); > > + pid = find_pid_ns(common->tid, common->ns); > > + if (pid) { > > + task = get_pid_task(pid, PIDTYPE_PID); > > + *tid = common->tid; > > + } > > + rcu_read_unlock(); > > nit: this is ok. But I think the commonly used pattern (e.g. > proc_pid_lookup) is > > rcu_read_lock(); > task = find_task_by_pid_ns(tid, ns); > if (task) > get_task_struct(task); > rcu_read_unlock(); > > > + return task; > > + } > > + > > rcu_read_lock(); > > retry: > > - pid = find_ge_pid(*tid, ns); > > + pid = find_ge_pid(*tid, common->ns); > > if (pid) { > > - *tid = pid_nr_ns(pid, ns); > > + *tid = pid_nr_ns(pid, common->ns); > > task = get_pid_task(pid, PIDTYPE_PID); > > + > > if (!task) { > > ++*tid; > > goto retry; > > - } else if (skip_if_dup_files && > > !thread_group_leader(task) && > > - task->files == task->group_leader- > > >files) { > > + } else if ((skip_if_dup_files && > > !thread_group_leader(task) && > > + task->files == task->group_leader- > > >files) || > > + (common->type == BPF_TASK_ITER_TGID && > > + __task_pid_nr_ns(task, PIDTYPE_TGID, > > common->ns) != common->tgid)) { > > Use task_tgid_nr_ns instead of __task_pid_nr_ns? > > > put_task_struct(task); > > task = NULL; > > ++*tid; > > @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, > > loff_t *pos) > > struct bpf_iter_seq_task_info *info = seq->private; > > struct task_struct *task; > > > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + task = task_seq_get_next(&info->common, &info->tid, false); > > + > > if (!task) > > return NULL; > > > > @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, > > void *v, loff_t *pos) > > ++*pos; > > ++info->tid; > > put_task_struct((struct task_struct *)v); > > - task = task_seq_get_next(info->common.ns, &info->tid, > > false); > > + > > + task = task_seq_get_next(&info->common, &info->tid, false); > > if (!task) > > return NULL; > > > > @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file > > *seq, void *v) > > put_task_struct((struct task_struct *)v); > > } > > > > +static int bpf_iter_attach_task(struct bpf_prog *prog, > > + union bpf_iter_link_info *linfo, > > + struct bpf_iter_aux_info *aux) > > +{ > > + unsigned int flags; > > + struct pid_namespace *ns; > > + struct pid *pid; > > + pid_t tgid; > > + > > + if (linfo->task.tid != 0) { > > + aux->task.type = BPF_TASK_ITER_TID; > > + aux->task.tid = linfo->task.tid; > > + } else if (linfo->task.tgid != 0) { > > + aux->task.type = BPF_TASK_ITER_TGID; > > + aux->task.tgid = linfo->task.tgid; > > + } else if (linfo->task.pid_fd != 0) { > > + aux->task.type = BPF_TASK_ITER_TGID; > > + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); > > + if (IS_ERR(pid)) > > + return PTR_ERR(pid); > > + > > + ns = task_active_pid_ns(current); > > + if (IS_ERR(ns)) > > + return PTR_ERR(ns); > > + > > + tgid = pid_nr_ns(pid, ns); > > + if (tgid <= 0) > > + return -EINVAL; > > Is this just pid_vnr? > > > + > > + aux->task.tgid = tgid; > > + } else { > > + aux->task.type = BPF_TASK_ITER_ALL; > > + } > > The same question as Yonghong has. Do we need to enforce that at most > one of {tid, tgid, pid_fd} is non-zero? > > > + > > + return 0; > > +} > > + > > static const struct seq_operations task_seq_ops = { > > .start = task_seq_start, > > .next = task_seq_next, > > @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info { > > static struct file * > > task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) > > { > > - struct pid_namespace *ns = info->common.ns; > > - u32 curr_tid = info->tid; > > + u32 saved_tid = info->tid; > > struct task_struct *curr_task; > > unsigned int curr_fd = info->fd; > > > > @@ -151,21 +211,18 @@ task_file_seq_get_next(struct > > bpf_iter_seq_task_file_info *info) > > curr_task = info->task; > > curr_fd = info->fd; > > } else { > > - curr_task = task_seq_get_next(ns, &curr_tid, > > true); > > + curr_task = task_seq_get_next(&info->common, &info- > > >tid, true); > > if (!curr_task) { > > info->task = NULL; > > - info->tid = curr_tid; > > return NULL; > > } > > > > - /* set info->task and info->tid */ > > + /* set info->task */ > > info->task = curr_task; > > - if (curr_tid == info->tid) { > > + if (saved_tid == info->tid) > > curr_fd = info->fd; > > - } else { > > - info->tid = curr_tid; > > + else > > curr_fd = 0; > > - } > > } > > > > rcu_read_lock(); > > @@ -186,9 +243,15 @@ task_file_seq_get_next(struct > > bpf_iter_seq_task_file_info *info) > > /* the current task is done, go to the next task */ > > rcu_read_unlock(); > > put_task_struct(curr_task); > > + > > + if (info->common.type == BPF_TASK_ITER_TID) { > > + info->task = NULL; > > + return NULL; > > + } > > + > > Do we need to set info->fd to 0? I am not sure if the caller reads > info->fd anywhere. I think it would be good to do some refactoring on > task_file_seq_get_next(). > > > info->task = NULL; > > info->fd = 0; > > - curr_tid = ++(info->tid); > > + saved_tid = ++(info->tid); > > goto again; > > } > > > > @@ -269,6 +332,17 @@ static int init_seq_pidns(void *priv_data, > > struct bpf_iter_aux_info *aux) > > struct bpf_iter_seq_task_common *common = priv_data; > > > > common->ns = get_pid_ns(task_active_pid_ns(current)); > > + common->type = aux->task.type; > > + switch (common->type) { > > + case BPF_TASK_ITER_TID: > > + common->tid = aux->task.tid; > > + break; > > + case BPF_TASK_ITER_TGID: > > + common->tgid = aux->task.tgid; > > + break; > > + default: > > very nit: IMHO we could place a warning here. > > > + break; > > + } > > return 0; > > } > > > > @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op { > > static struct vm_area_struct * > > task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) > > { > > - struct pid_namespace *ns = info->common.ns; > > enum bpf_task_vma_iter_find_op op; > > struct vm_area_struct *curr_vma; > > struct task_struct *curr_task; > > - u32 curr_tid = info->tid; > > + u32 saved_tid = info->tid; > > > > Why do we need to directly operate on info->tid while other task > iters > (e.g. vma_iter) uses curr_tid? IMHO, prefer staying using curr_tid if > possible, for two reasons: > - consistent with other iters. > - decouple refactoring changes from the changes that introduce new > features These are reasonable considerations. I chagned it to make it easy to maintained the value of info->tid. With curr_tid, we always put the curr_tid's value back to info->tid before leaving the function, and it is obfuscating and error-prone, IMHO. I may add another patch for the purpose of factoring. > > > /* If this function returns a non-NULL vma, it holds a > > reference to > > * the task_struct, and holds read lock on vma->mm- > > >mmap_lock. > > @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct > > bpf_iter_seq_task_vma_info *info) > > } > > } else { > > again: > > - curr_task = task_seq_get_next(ns, &curr_tid, true); > > + curr_task = task_seq_get_next(&info->common, &info- > > >tid, true); > > if (!curr_task) { > > - info->tid = curr_tid + 1; > > + info->tid++; > > goto finish; > > } > > > > - if (curr_tid != info->tid) { > > - info->tid = curr_tid; > > + if (saved_tid != info->tid) { > > /* new task, process the first vma */ > > op = task_vma_iter_first_vma; > > } else { > > @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct > > bpf_iter_seq_task_vma_info *info) > > return curr_vma; > > > > next_task: > > + if (info->common.type == BPF_TASK_ITER_TID) > > + goto finish; > > + > > put_task_struct(curr_task); > > info->task = NULL; > > - curr_tid++; > > + info->tid++; > > goto again; > > > > finish: > > @@ -533,6 +608,7 @@ static const struct bpf_iter_seq_info > > task_seq_info = { > > > > static struct bpf_iter_reg task_reg_info = { > > .target = "task", > > + .attach_target = bpf_iter_attach_task, > > .feature = BPF_ITER_RESCHED, > > .ctx_arg_info_size = 1, > > .ctx_arg_info = { > > @@ -551,6 +627,7 @@ static const struct bpf_iter_seq_info > > task_file_seq_info = { > > > > static struct bpf_iter_reg task_file_reg_info = { > > .target = "task_file", > > + .attach_target = bpf_iter_attach_task, > > .feature = BPF_ITER_RESCHED, > > .ctx_arg_info_size = 2, > > .ctx_arg_info = { > > @@ -571,6 +648,7 @@ static const struct bpf_iter_seq_info > > task_vma_seq_info = { > > > > static struct bpf_iter_reg task_vma_reg_info = { > > .target = "task_vma", > > + .attach_target = bpf_iter_attach_task, > > .feature = BPF_ITER_RESCHED, > > .ctx_arg_info_size = 2, > > .ctx_arg_info = { > > diff --git a/tools/include/uapi/linux/bpf.h > > b/tools/include/uapi/linux/bpf.h > > index ffcbf79a556b..6328aca0cf5c 100644 > > --- a/tools/include/uapi/linux/bpf.h > > +++ b/tools/include/uapi/linux/bpf.h > > @@ -91,6 +91,14 @@ union bpf_iter_link_info { > > struct { > > __u32 map_fd; > > } map; > > + /* > > + * Parameters of task iterators. > > + */ > > + struct { > > + __u32 tid; > > + __u32 tgid; > > + __u32 pid_fd; > > + } task; > > }; > > > > /* BPF syscall commands, see bpf(2) man-page for more details. */ > > -- > > 2.30.2 > >
On 8/15/22 9:42 PM, Andrii Nakryiko wrote: > On Sat, Aug 13, 2022 at 3:17 PM Yonghong Song <yhs@fb.com> wrote: >> >> >> >> On 8/10/22 5:16 PM, Kui-Feng Lee wrote: >>> Allow creating an iterator that loops through resources of one task/thread. >>> >>> People could only create iterators to loop through all resources of >>> files, vma, and tasks in the system, even though they were interested >>> in only the resources of a specific task or process. Passing the >>> additional parameters, people can now create an iterator to go >>> through all resources or only the resources of a task. >>> >>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> >>> --- >>> include/linux/bpf.h | 29 ++++++++ >>> include/uapi/linux/bpf.h | 8 +++ >>> kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- >>> tools/include/uapi/linux/bpf.h | 8 +++ >>> 4 files changed, 147 insertions(+), 24 deletions(-) >>> > > [...] > >>> + struct { >>> + __u32 tid; >>> + __u32 tgid; >>> + __u32 pid_fd; >> >> The above is a max of kernel and user space terminologies. >> tid/pid are user space concept and tgid is a kernel space >> concept. >> >> In bpf uapi header, we have >> >> struct bpf_pidns_info { >> __u32 pid; >> __u32 tgid; >> }; >> >> which uses kernel terminologies. >> >> So I suggest the bpf_iter_link_info.task can also >> use pure kernel terminology pid/tgid/tgid_fd here. > > Except tgid_fd is a made up terminology. It is called pidfd in > documentation and even if pidfd gains add support for threads (tasks), > it would still be created through pidfd_open() syscall, probably. So > it seems better to stick to "pidfd" here. > > As for pid/tgid precedent. Are we referring to > bpf_get_current_pid_tgid() BPF helper and struct bpf_pidns_info? Those > are kernel-side BPF helper and kernel-side auxiliary type to describe > return results of another in-kernel helper, so I think it's a bit less > relevant here to set a precedent. > > On the other hand, if we look at user-space-facing perf_event > subsystem UAPI, for example, it seems to be using pid/tid terminology > (and so does getpid()/gettid() syscall, etc). So I wonder if for a > user-space-facing API it's better to stick with user-space-facing > terminology? I don't have strong preferences here as long as all terminologies are consistent. user-space-facing API is okay. Currently, we only have pid_fd to traverse all tasks in a process. Based on an early discussion, it is possible that pidfd_open syscall might adapt to return a fd for a task in the future if necessary. So we might have tid_fd as well to traverse a single task. > >> >> Alternative, using pure user space terminology >> can be tid/pid/pid_fd but seems the kernel terminology >> might be better since we already have precedence. >> >> >>> + } task; >>> }; >>> > > [...]
On 8/15/22 10:25 PM, Andrii Nakryiko wrote: > On Sat, Aug 13, 2022 at 3:17 PM Yonghong Song <yhs@fb.com> wrote: >> >> >> >> On 8/10/22 5:16 PM, Kui-Feng Lee wrote: >>> Allow creating an iterator that loops through resources of one task/thread. >>> >>> People could only create iterators to loop through all resources of >>> files, vma, and tasks in the system, even though they were interested >>> in only the resources of a specific task or process. Passing the >>> additional parameters, people can now create an iterator to go >>> through all resources or only the resources of a task. >>> >>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> >>> --- >>> include/linux/bpf.h | 29 ++++++++ >>> include/uapi/linux/bpf.h | 8 +++ >>> kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- >>> tools/include/uapi/linux/bpf.h | 8 +++ >>> 4 files changed, 147 insertions(+), 24 deletions(-) >>> > > Btw, Yonghong, I tried to figure it out myself, but got lost in all > the kernel functions that don't seem to be very well documented. Sorry > for being lazy and throwing this at you :) > > Is it easy and efficient to iterate only processes using whatever > kernel helpers we have at our disposal? E.g., if I wanted to write an > iterator that would go only over processes (not individual threads, > just task leaders of each different process) within a cgroup, is that > possible? To traverse processes in a cgroup, the best location is in kernel/cgroup/cgroup.c where there exists a seq_ops to traverse all processes in cgroup.procs file. If we try to implement a bpf based iterator, we could reuse some codes in that file. > > I see task iterator as consisting of two different parts (and that > makes it a bit hard to define nice and clean interface, but if we can > crack this, we'd get an elegant and powerful construct): > > 1. What entity to iterate: threads or processes? (I'm ignoring > task_vma and task_files here, but one could task about files of each > thread or files of each process, but it's less practical, probably) > > 2. What's the scope of objects to iterate: just a thread by tid, just > a process by pid/pidfd, once cgroup iter lands, we'll be able to talk > about threads or processes within a cgroup or cgroup hierarchy (this > is where descendants_{pre,post}, cgroup_self_only and ancestors > ordering comes in as well). > > Currently Kui-Feng is addressing first half of #2 (tid/pid/pidfd > parameters), we can use cgroup iter's parameters to define the scope > of tasks/processes by cgroup "filter" in a follow up (it naturally > extends what we have in this patch set). For #2 as well, it is also possible to have a complete new seq_ops if the traversal is only once. That is why in Kui-Feng's patch, there are a few special case w.r.t. TID. But current approach is also okay. > > So now I'm wondering if there is any benefit to also somehow > specifying threads vs processes as entities to iterate? And if we do > that, does kernel support efficient iteration of processes (as opposed > to threads). IIUC, I didn't find an efficient way to traverse processes only. The current pid_ns.idr records all tasks so traversing processes have to skip intermediate non-main-thread tasks. > > > To be clear, there is a lot of value in having just #2, but while we > are all at this topic, I thought I'd clarify for myself #1 as well. > > Thanks!
On Wed, Aug 17, 2022 at 9:31 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 8/15/22 10:25 PM, Andrii Nakryiko wrote: > > On Sat, Aug 13, 2022 at 3:17 PM Yonghong Song <yhs@fb.com> wrote: > >> > >> > >> > >> On 8/10/22 5:16 PM, Kui-Feng Lee wrote: > >>> Allow creating an iterator that loops through resources of one task/thread. > >>> > >>> People could only create iterators to loop through all resources of > >>> files, vma, and tasks in the system, even though they were interested > >>> in only the resources of a specific task or process. Passing the > >>> additional parameters, people can now create an iterator to go > >>> through all resources or only the resources of a task. > >>> > >>> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> > >>> --- > >>> include/linux/bpf.h | 29 ++++++++ > >>> include/uapi/linux/bpf.h | 8 +++ > >>> kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- > >>> tools/include/uapi/linux/bpf.h | 8 +++ > >>> 4 files changed, 147 insertions(+), 24 deletions(-) > >>> > > > > Btw, Yonghong, I tried to figure it out myself, but got lost in all > > the kernel functions that don't seem to be very well documented. Sorry > > for being lazy and throwing this at you :) > > > > Is it easy and efficient to iterate only processes using whatever > > kernel helpers we have at our disposal? E.g., if I wanted to write an > > iterator that would go only over processes (not individual threads, > > just task leaders of each different process) within a cgroup, is that > > possible? > > To traverse processes in a cgroup, the best location is in > kernel/cgroup/cgroup.c where there exists a seq_ops to > traverse all processes in cgroup.procs file. If we try > to implement a bpf based iterator, we could reuse some > codes in that file. > yep > > > > I see task iterator as consisting of two different parts (and that > > makes it a bit hard to define nice and clean interface, but if we can > > crack this, we'd get an elegant and powerful construct): > > > > 1. What entity to iterate: threads or processes? (I'm ignoring > > task_vma and task_files here, but one could task about files of each > > thread or files of each process, but it's less practical, probably) > > > > 2. What's the scope of objects to iterate: just a thread by tid, just > > a process by pid/pidfd, once cgroup iter lands, we'll be able to talk > > about threads or processes within a cgroup or cgroup hierarchy (this > > is where descendants_{pre,post}, cgroup_self_only and ancestors > > ordering comes in as well). > > > > Currently Kui-Feng is addressing first half of #2 (tid/pid/pidfd > > parameters), we can use cgroup iter's parameters to define the scope > > of tasks/processes by cgroup "filter" in a follow up (it naturally > > extends what we have in this patch set). > > For #2 as well, it is also possible to have a complete new seq_ops > if the traversal is only once. That is why in Kui-Feng's patch, > there are a few special case w.r.t. TID. But current approach > is also okay. > sounds good! > > > > So now I'm wondering if there is any benefit to also somehow > > specifying threads vs processes as entities to iterate? And if we do > > that, does kernel support efficient iteration of processes (as opposed > > to threads). > > IIUC, I didn't find an efficient way to traverse processes only. > The current pid_ns.idr records all tasks so traversing processes > have to skip intermediate non-main-thread tasks. > I see, too bad, but thanks for checking! > > > > > > To be clear, there is a lot of value in having just #2, but while we > > are all at this topic, I thought I'd clarify for myself #1 as well. > > > > Thanks!
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 11950029284f..6bbe53d06faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1716,8 +1716,37 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over reosurces of evevry task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { struct bpf_map *map; + struct { + enum bpf_iter_task_type type; + union { + u32 tid; + u32 tgid; + u32 pid_fd; + }; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ffcbf79a556b..6328aca0cf5c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -91,6 +91,14 @@ union bpf_iter_link_info { struct { __u32 map_fd; } map; + /* + * Parameters of task iterators. + */ + struct { + __u32 tid; + __u32 tgid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..f2e21efe075d 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -12,6 +12,12 @@ struct bpf_iter_seq_task_common { struct pid_namespace *ns; + enum bpf_iter_task_type type; + union { + u32 tid; + u32 tgid; + u32 pid_fd; + }; }; struct bpf_iter_seq_task_info { @@ -22,24 +28,40 @@ struct bpf_iter_seq_task_info { u32 tid; }; -static struct task_struct *task_seq_get_next(struct pid_namespace *ns, +static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common, u32 *tid, bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; + if (common->type == BPF_TASK_ITER_TID) { + if (*tid && *tid != common->tid) + return NULL; + rcu_read_lock(); + pid = find_pid_ns(common->tid, common->ns); + if (pid) { + task = get_pid_task(pid, PIDTYPE_PID); + *tid = common->tid; + } + rcu_read_unlock(); + return task; + } + rcu_read_lock(); retry: - pid = find_ge_pid(*tid, ns); + pid = find_ge_pid(*tid, common->ns); if (pid) { - *tid = pid_nr_ns(pid, ns); + *tid = pid_nr_ns(pid, common->ns); task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { ++*tid; goto retry; - } else if (skip_if_dup_files && !thread_group_leader(task) && - task->files == task->group_leader->files) { + } else if ((skip_if_dup_files && !thread_group_leader(task) && + task->files == task->group_leader->files) || + (common->type == BPF_TASK_ITER_TGID && + __task_pid_nr_ns(task, PIDTYPE_TGID, common->ns) != common->tgid)) { put_task_struct(task); task = NULL; ++*tid; @@ -56,7 +78,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid, false); + task = task_seq_get_next(&info->common, &info->tid, false); + if (!task) return NULL; @@ -73,7 +96,8 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid, false); + + task = task_seq_get_next(&info->common, &info->tid, false); if (!task) return NULL; @@ -117,6 +141,43 @@ static void task_seq_stop(struct seq_file *seq, void *v) put_task_struct((struct task_struct *)v); } +static int bpf_iter_attach_task(struct bpf_prog *prog, + union bpf_iter_link_info *linfo, + struct bpf_iter_aux_info *aux) +{ + unsigned int flags; + struct pid_namespace *ns; + struct pid *pid; + pid_t tgid; + + if (linfo->task.tid != 0) { + aux->task.type = BPF_TASK_ITER_TID; + aux->task.tid = linfo->task.tid; + } else if (linfo->task.tgid != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + aux->task.tgid = linfo->task.tgid; + } else if (linfo->task.pid_fd != 0) { + aux->task.type = BPF_TASK_ITER_TGID; + pid = pidfd_get_pid(linfo->task.pid_fd, &flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + ns = task_active_pid_ns(current); + if (IS_ERR(ns)) + return PTR_ERR(ns); + + tgid = pid_nr_ns(pid, ns); + if (tgid <= 0) + return -EINVAL; + + aux->task.tgid = tgid; + } else { + aux->task.type = BPF_TASK_ITER_ALL; + } + + return 0; +} + static const struct seq_operations task_seq_ops = { .start = task_seq_start, .next = task_seq_next, @@ -137,8 +198,7 @@ struct bpf_iter_seq_task_file_info { static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { - struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; struct task_struct *curr_task; unsigned int curr_fd = info->fd; @@ -151,21 +211,18 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) curr_task = info->task; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { info->task = NULL; - info->tid = curr_tid; return NULL; } - /* set info->task and info->tid */ + /* set info->task */ info->task = curr_task; - if (curr_tid == info->tid) { + if (saved_tid == info->tid) curr_fd = info->fd; - } else { - info->tid = curr_tid; + else curr_fd = 0; - } } rcu_read_lock(); @@ -186,9 +243,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) /* the current task is done, go to the next task */ rcu_read_unlock(); put_task_struct(curr_task); + + if (info->common.type == BPF_TASK_ITER_TID) { + info->task = NULL; + return NULL; + } + info->task = NULL; info->fd = 0; - curr_tid = ++(info->tid); + saved_tid = ++(info->tid); goto again; } @@ -269,6 +332,17 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) struct bpf_iter_seq_task_common *common = priv_data; common->ns = get_pid_ns(task_active_pid_ns(current)); + common->type = aux->task.type; + switch (common->type) { + case BPF_TASK_ITER_TID: + common->tid = aux->task.tid; + break; + case BPF_TASK_ITER_TGID: + common->tgid = aux->task.tgid; + break; + default: + break; + } return 0; } @@ -307,11 +381,10 @@ enum bpf_task_vma_iter_find_op { static struct vm_area_struct * task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) { - struct pid_namespace *ns = info->common.ns; enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; - u32 curr_tid = info->tid; + u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to * the task_struct, and holds read lock on vma->mm->mmap_lock. @@ -371,14 +444,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) } } else { again: - curr_task = task_seq_get_next(ns, &curr_tid, true); + curr_task = task_seq_get_next(&info->common, &info->tid, true); if (!curr_task) { - info->tid = curr_tid + 1; + info->tid++; goto finish; } - if (curr_tid != info->tid) { - info->tid = curr_tid; + if (saved_tid != info->tid) { /* new task, process the first vma */ op = task_vma_iter_first_vma; } else { @@ -430,9 +502,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) return curr_vma; next_task: + if (info->common.type == BPF_TASK_ITER_TID) + goto finish; + put_task_struct(curr_task); info->task = NULL; - curr_tid++; + info->tid++; goto again; finish: @@ -533,6 +608,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { @@ -551,6 +627,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { @@ -571,6 +648,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = { static struct bpf_iter_reg task_vma_reg_info = { .target = "task_vma", + .attach_target = bpf_iter_attach_task, .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ffcbf79a556b..6328aca0cf5c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -91,6 +91,14 @@ union bpf_iter_link_info { struct { __u32 map_fd; } map; + /* + * Parameters of task iterators. + */ + struct { + __u32 tid; + __u32 tgid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */
Allow creating an iterator that loops through resources of one task/thread. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. Signed-off-by: Kui-Feng Lee <kuifeng@fb.com> --- include/linux/bpf.h | 29 ++++++++ include/uapi/linux/bpf.h | 8 +++ kernel/bpf/task_iter.c | 126 ++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 8 +++ 4 files changed, 147 insertions(+), 24 deletions(-)