Message ID | 20220826165238.30915-5-mkoutny@suse.com (mailing list archive) |
---|---|
State | Not Applicable |
Delegated to: | BPF |
Headers | show |
Series | Honor cgroup namespace when resolving cgroup id | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Not a local patch |
bpf/vmtest-bpf-next-VM_Test-6 | success | Logs for test_maps on s390x with gcc |
bpf/vmtest-bpf-next-PR | fail | PR summary |
bpf/vmtest-bpf-next-VM_Test-9 | success | Logs for test_progs on s390x with gcc |
bpf/vmtest-bpf-next-VM_Test-12 | success | Logs for test_progs_no_alu32 on s390x with gcc |
bpf/vmtest-bpf-next-VM_Test-15 | success | Logs for test_verifier on s390x with gcc |
bpf/vmtest-bpf-next-VM_Test-4 | success | Logs for llvm-toolchain |
bpf/vmtest-bpf-next-VM_Test-5 | success | Logs for set-matrix |
bpf/vmtest-bpf-next-VM_Test-2 | success | Logs for build for x86_64 with gcc |
bpf/vmtest-bpf-next-VM_Test-3 | success | Logs for build for x86_64 with llvm-16 |
bpf/vmtest-bpf-next-VM_Test-1 | success | Logs for build for s390x with gcc |
bpf/vmtest-bpf-next-VM_Test-7 | success | Logs for test_maps on x86_64 with gcc |
bpf/vmtest-bpf-next-VM_Test-8 | success | Logs for test_maps on x86_64 with llvm-16 |
bpf/vmtest-bpf-next-VM_Test-16 | success | Logs for test_verifier on x86_64 with gcc |
bpf/vmtest-bpf-next-VM_Test-17 | success | Logs for test_verifier on x86_64 with llvm-16 |
bpf/vmtest-bpf-next-VM_Test-10 | success | Logs for test_progs on x86_64 with gcc |
bpf/vmtest-bpf-next-VM_Test-11 | success | Logs for test_progs on x86_64 with llvm-16 |
bpf/vmtest-bpf-next-VM_Test-13 | success | Logs for test_progs_no_alu32 on x86_64 with gcc |
bpf/vmtest-bpf-next-VM_Test-14 | success | Logs for test_progs_no_alu32 on x86_64 with llvm-16 |
Hi there! Thanks for following up with this series! On Fri, Aug 26, 2022 at 9:53 AM Michal Koutný <mkoutny@suse.com> wrote: > > The iterator with BPF_CGROUP_ITER_ANCESTORS_UP can traverse up across a > cgroup namespace level, which may be surprising within a non-init cgroup > namespace. > > Introduce and use a new cgroup_parent_ns() helper that stops according > to cgroup namespace boundary. With BPF_CGROUP_ITER_ANCESTORS_UP. We use > the cgroup namespace of the iterator caller, not that one of the creator > (might be different, the former is relevant). > > Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") > Signed-off-by: Michal Koutný <mkoutny@suse.com> > --- > include/linux/cgroup.h | 3 +++ > kernel/bpf/cgroup_iter.c | 9 ++++++--- > kernel/cgroup/cgroup.c | 32 +++++++++++++++++++++++--------- > 3 files changed, 32 insertions(+), 12 deletions(-) > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index b6a9528374a8..b63a80e03fae 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -858,6 +858,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, > int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, > struct cgroup_namespace *ns); > > +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp, > + struct cgroup_namespace *ns); > + > #else /* !CONFIG_CGROUPS */ > > static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } > diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c > index c69bce2f4403..06ee4a0c5870 100644 > --- a/kernel/bpf/cgroup_iter.c > +++ b/kernel/bpf/cgroup_iter.c > @@ -104,6 +104,7 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) > { > struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; > struct cgroup_iter_priv *p = seq->private; > + struct cgroup *parent; > > ++*pos; > if (p->terminate) > @@ -113,9 +114,11 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) > return css_next_descendant_pre(curr, p->start_css); > else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) > return css_next_descendant_post(curr, p->start_css); > - else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) > - return curr->parent; > - else /* BPF_CGROUP_ITER_SELF_ONLY */ > + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) { > + parent = cgroup_parent_ns(curr->cgroup, > + current->nsproxy->cgroup_ns); > + return parent ? &parent->self : NULL; > + } else /* BPF_CGROUP_ITER_SELF_ONLY */ > return NULL; > } > > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index c0377726031f..d60b5dfbbbc9 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -1417,11 +1417,11 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, > } > > /* > - * look up cgroup associated with current task's cgroup namespace on the > + * look up cgroup associated with given cgroup namespace on the > * specified hierarchy > */ > -static struct cgroup * > -current_cgns_cgroup_from_root(struct cgroup_root *root) > +static struct cgroup *cgns_cgroup_from_root(struct cgroup_root *root, > + struct cgroup_namespace *ns) > { > struct cgroup *res = NULL; > struct css_set *cset; > @@ -1430,7 +1430,7 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) > > rcu_read_lock(); > > - cset = current->nsproxy->cgroup_ns->root_cset; > + cset = ns->root_cset; > res = __cset_cgroup_from_root(cset, root); > > rcu_read_unlock(); > @@ -1852,15 +1852,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, > int len = 0; > char *buf = NULL; > struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); > - struct cgroup *ns_cgroup; > + struct cgroup *root_cgroup; > > buf = kmalloc(PATH_MAX, GFP_KERNEL); > if (!buf) > return -ENOMEM; > > spin_lock_irq(&css_set_lock); > - ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); > - len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); > + root_cgroup = cgns_cgroup_from_root(kf_cgroot, current->nsproxy->cgroup_ns); > + len = kernfs_path_from_node(kf_node, root_cgroup->kn, buf, PATH_MAX); > spin_unlock_irq(&css_set_lock); > > if (len >= PATH_MAX) > @@ -2330,6 +2330,18 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, > } > EXPORT_SYMBOL_GPL(cgroup_path_ns); > > +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp, > + struct cgroup_namespace *ns) > +{ > + struct cgroup *root_cgrp; > + > + spin_lock_irq(&css_set_lock); > + root_cgrp = cgns_cgroup_from_root(cgrp->root, ns); > + spin_unlock_irq(&css_set_lock); > + > + return cgrp == root_cgrp ? NULL : cgroup_parent(cgrp); I understand that currently cgroup_iter is the only user of this, but for future use cases, is it safe to assume that cgrp will always be inside ns? Would it be safer to do something like: struct cgroup *parent = cgroup_parent(cgrp); if (!parent) return NULL; return cgroup_is_descendant(parent, root_cgrp) ? parent : NULL; > +} > + > /** > * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy > * @task: target task > @@ -6031,7 +6043,8 @@ struct cgroup *cgroup_get_from_id(u64 id) > goto out; > > spin_lock_irq(&css_set_lock); > - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); > + root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root, > + current->nsproxy->cgroup_ns); > spin_unlock_irq(&css_set_lock); > if (!cgroup_is_descendant(cgrp, root_cgrp)) { > cgroup_put(cgrp); > @@ -6612,7 +6625,8 @@ struct cgroup *cgroup_get_from_path(const char *path) > struct cgroup *root_cgrp; > > spin_lock_irq(&css_set_lock); > - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); > + root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root, > + current->nsproxy->cgroup_ns); > kn = kernfs_walk_and_get(root_cgrp->kn, path); > spin_unlock_irq(&css_set_lock); > if (!kn) > -- > 2.37.0 >
On Fri, Aug 26, 2022 at 10:41:37AM -0700, Yosry Ahmed <yosryahmed@google.com> wrote: > I understand that currently cgroup_iter is the only user of this, but > for future use cases, is it safe to assume that cgrp will always be > inside ns? Would it be safer to do something like: I preferred the simpler root_cgrp comparison to avoid pointer arithmetics in cgroup_is_descendant. But I also made the assumption of cgrp in ns. Thanks, I'll likely adjust cgroup_path_ns to make it more robust for an external cgrp. I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF cgroup iterator, exposes it via bpffs and than a process B in a narrowed cgroup ns (which excludes the origin cgroup) wants to traverse the iterator, should it fail straight ahead (regardless of iter order)? The alternative would be to allow self-dereference but prohibit any iterator moves (regardless of order). Thanks, Michal
On Mon, Aug 29, 2022 at 6:00 AM Michal Koutný <mkoutny@suse.com> wrote: > > On Fri, Aug 26, 2022 at 10:41:37AM -0700, Yosry Ahmed <yosryahmed@google.com> wrote: > > I understand that currently cgroup_iter is the only user of this, but > > for future use cases, is it safe to assume that cgrp will always be > > inside ns? Would it be safer to do something like: > > I preferred the simpler root_cgrp comparison to avoid pointer > arithmetics in cgroup_is_descendant. But I also made the assumption of > cgrp in ns. > > Thanks, I'll likely adjust cgroup_path_ns to make it more robust for > an external cgrp. > Great, thanks! > > I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF > cgroup iterator, exposes it via bpffs and than a process B in a narrowed > cgroup ns (which excludes the origin cgroup) wants to traverse the > iterator, should it fail straight ahead (regardless of iter order)? > The alternative would be to allow self-dereference but prohibit any > iterator moves (regardless of order). > imo it should fail straight ahead, but maybe others (Tejun? Hao?) have other opinions here. > > Thanks, > Michal
On Mon, Aug 29, 2022 at 10:30:45AM -0700, Yosry Ahmed wrote: > > I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF > > cgroup iterator, exposes it via bpffs and than a process B in a narrowed > > cgroup ns (which excludes the origin cgroup) wants to traverse the > > iterator, should it fail straight ahead (regardless of iter order)? > > The alternative would be to allow self-dereference but prohibit any > > iterator moves (regardless of order). > > > > imo it should fail straight ahead, but maybe others (Tejun? Hao?) have > other opinions here. Yeah, I'd prefer it to fail right away as that's simple and gives us the most choices for the future. Thanks.
On Mon, Aug 29, 2022 at 10:49 AM Tejun Heo <tj@kernel.org> wrote: > > On Mon, Aug 29, 2022 at 10:30:45AM -0700, Yosry Ahmed wrote: > > > I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF > > > cgroup iterator, exposes it via bpffs and than a process B in a narrowed > > > cgroup ns (which excludes the origin cgroup) wants to traverse the > > > iterator, should it fail straight ahead (regardless of iter order)? > > > The alternative would be to allow self-dereference but prohibit any > > > iterator moves (regardless of order). > > > > > > > imo it should fail straight ahead, but maybe others (Tejun? Hao?) have > > other opinions here. > > Yeah, I'd prefer it to fail right away as that's simple and gives us the > most choices for the future. > Thanks Michal for fixing the cgroup iter use case! I agree that failing straight ahead is better. I don't envision a use case that wants the alternative. > Thanks. > > -- > tejun
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b6a9528374a8..b63a80e03fae 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -858,6 +858,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp, + struct cgroup_namespace *ns); + #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index c69bce2f4403..06ee4a0c5870 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -104,6 +104,7 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; struct cgroup_iter_priv *p = seq->private; + struct cgroup *parent; ++*pos; if (p->terminate) @@ -113,9 +114,11 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_pre(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(curr, p->start_css); - else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) - return curr->parent; - else /* BPF_CGROUP_ITER_SELF_ONLY */ + else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) { + parent = cgroup_parent_ns(curr->cgroup, + current->nsproxy->cgroup_ns); + return parent ? &parent->self : NULL; + } else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c0377726031f..d60b5dfbbbc9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1417,11 +1417,11 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } /* - * look up cgroup associated with current task's cgroup namespace on the + * look up cgroup associated with given cgroup namespace on the * specified hierarchy */ -static struct cgroup * -current_cgns_cgroup_from_root(struct cgroup_root *root) +static struct cgroup *cgns_cgroup_from_root(struct cgroup_root *root, + struct cgroup_namespace *ns) { struct cgroup *res = NULL; struct css_set *cset; @@ -1430,7 +1430,7 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_lock(); - cset = current->nsproxy->cgroup_ns->root_cset; + cset = ns->root_cset; res = __cset_cgroup_from_root(cset, root); rcu_read_unlock(); @@ -1852,15 +1852,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); - struct cgroup *ns_cgroup; + struct cgroup *root_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); - ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); - len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + root_cgroup = cgns_cgroup_from_root(kf_cgroot, current->nsproxy->cgroup_ns); + len = kernfs_path_from_node(kf_node, root_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) @@ -2330,6 +2330,18 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, } EXPORT_SYMBOL_GPL(cgroup_path_ns); +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp, + struct cgroup_namespace *ns) +{ + struct cgroup *root_cgrp; + + spin_lock_irq(&css_set_lock); + root_cgrp = cgns_cgroup_from_root(cgrp->root, ns); + spin_unlock_irq(&css_set_lock); + + return cgrp == root_cgrp ? NULL : cgroup_parent(cgrp); +} + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -6031,7 +6043,8 @@ struct cgroup *cgroup_get_from_id(u64 id) goto out; spin_lock_irq(&css_set_lock); - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); + root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root, + current->nsproxy->cgroup_ns); spin_unlock_irq(&css_set_lock); if (!cgroup_is_descendant(cgrp, root_cgrp)) { cgroup_put(cgrp); @@ -6612,7 +6625,8 @@ struct cgroup *cgroup_get_from_path(const char *path) struct cgroup *root_cgrp; spin_lock_irq(&css_set_lock); - root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root); + root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root, + current->nsproxy->cgroup_ns); kn = kernfs_walk_and_get(root_cgrp->kn, path); spin_unlock_irq(&css_set_lock); if (!kn)
The iterator with BPF_CGROUP_ITER_ANCESTORS_UP can traverse up across a cgroup namespace level, which may be surprising within a non-init cgroup namespace. Introduce and use a new cgroup_parent_ns() helper that stops according to cgroup namespace boundary. With BPF_CGROUP_ITER_ANCESTORS_UP. We use the cgroup namespace of the iterator caller, not that one of the creator (might be different, the former is relevant). Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Signed-off-by: Michal Koutný <mkoutny@suse.com> --- include/linux/cgroup.h | 3 +++ kernel/bpf/cgroup_iter.c | 9 ++++++--- kernel/cgroup/cgroup.c | 32 +++++++++++++++++++++++--------- 3 files changed, 32 insertions(+), 12 deletions(-)