diff mbox series

[4/4] cgroup/bpf: Honor cgroup NS in cgroup_iter for ancestors

Message ID 20220826165238.30915-5-mkoutny@suse.com (mailing list archive)
State Not Applicable
Delegated to: BPF
Headers show
Series Honor cgroup namespace when resolving cgroup id | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs_no_alu32 on x86_64 with llvm-16

Commit Message

Michal Koutný Aug. 26, 2022, 4:52 p.m. UTC
The iterator with BPF_CGROUP_ITER_ANCESTORS_UP can traverse up across a
cgroup namespace level, which may be surprising within a non-init cgroup
namespace.

Introduce and use a new cgroup_parent_ns() helper that stops according
to cgroup namespace boundary. With BPF_CGROUP_ITER_ANCESTORS_UP. We use
the cgroup namespace of the iterator caller, not that one of the creator
(might be different, the former is relevant).

Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter")
Signed-off-by: Michal Koutný <mkoutny@suse.com>
---
 include/linux/cgroup.h   |  3 +++
 kernel/bpf/cgroup_iter.c |  9 ++++++---
 kernel/cgroup/cgroup.c   | 32 +++++++++++++++++++++++---------
 3 files changed, 32 insertions(+), 12 deletions(-)

Comments

Yosry Ahmed Aug. 26, 2022, 5:41 p.m. UTC | #1
Hi there!

Thanks for following up with this series!

On Fri, Aug 26, 2022 at 9:53 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> The iterator with BPF_CGROUP_ITER_ANCESTORS_UP can traverse up across a
> cgroup namespace level, which may be surprising within a non-init cgroup
> namespace.
>
> Introduce and use a new cgroup_parent_ns() helper that stops according
> to cgroup namespace boundary. With BPF_CGROUP_ITER_ANCESTORS_UP. We use
> the cgroup namespace of the iterator caller, not that one of the creator
> (might be different, the former is relevant).
>
> Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter")
> Signed-off-by: Michal Koutný <mkoutny@suse.com>
> ---
>  include/linux/cgroup.h   |  3 +++
>  kernel/bpf/cgroup_iter.c |  9 ++++++---
>  kernel/cgroup/cgroup.c   | 32 +++++++++++++++++++++++---------
>  3 files changed, 32 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index b6a9528374a8..b63a80e03fae 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -858,6 +858,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
>  int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
>                    struct cgroup_namespace *ns);
>
> +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
> +                               struct cgroup_namespace *ns);
> +
>  #else /* !CONFIG_CGROUPS */
>
>  static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
> diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
> index c69bce2f4403..06ee4a0c5870 100644
> --- a/kernel/bpf/cgroup_iter.c
> +++ b/kernel/bpf/cgroup_iter.c
> @@ -104,6 +104,7 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>  {
>         struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
>         struct cgroup_iter_priv *p = seq->private;
> +       struct cgroup *parent;
>
>         ++*pos;
>         if (p->terminate)
> @@ -113,9 +114,11 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>                 return css_next_descendant_pre(curr, p->start_css);
>         else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
>                 return css_next_descendant_post(curr, p->start_css);
> -       else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
> -               return curr->parent;
> -       else  /* BPF_CGROUP_ITER_SELF_ONLY */
> +       else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) {
> +               parent = cgroup_parent_ns(curr->cgroup,
> +                                         current->nsproxy->cgroup_ns);
> +               return parent ? &parent->self : NULL;
> +       } else  /* BPF_CGROUP_ITER_SELF_ONLY */
>                 return NULL;
>  }
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index c0377726031f..d60b5dfbbbc9 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1417,11 +1417,11 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
>  }
>
>  /*
> - * look up cgroup associated with current task's cgroup namespace on the
> + * look up cgroup associated with given cgroup namespace on the
>   * specified hierarchy
>   */
> -static struct cgroup *
> -current_cgns_cgroup_from_root(struct cgroup_root *root)
> +static struct cgroup *cgns_cgroup_from_root(struct cgroup_root *root,
> +                                           struct cgroup_namespace *ns)
>  {
>         struct cgroup *res = NULL;
>         struct css_set *cset;
> @@ -1430,7 +1430,7 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
>
>         rcu_read_lock();
>
> -       cset = current->nsproxy->cgroup_ns->root_cset;
> +       cset = ns->root_cset;
>         res = __cset_cgroup_from_root(cset, root);
>
>         rcu_read_unlock();
> @@ -1852,15 +1852,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
>         int len = 0;
>         char *buf = NULL;
>         struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
> -       struct cgroup *ns_cgroup;
> +       struct cgroup *root_cgroup;
>
>         buf = kmalloc(PATH_MAX, GFP_KERNEL);
>         if (!buf)
>                 return -ENOMEM;
>
>         spin_lock_irq(&css_set_lock);
> -       ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
> -       len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
> +       root_cgroup = cgns_cgroup_from_root(kf_cgroot, current->nsproxy->cgroup_ns);
> +       len = kernfs_path_from_node(kf_node, root_cgroup->kn, buf, PATH_MAX);
>         spin_unlock_irq(&css_set_lock);
>
>         if (len >= PATH_MAX)
> @@ -2330,6 +2330,18 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
>  }
>  EXPORT_SYMBOL_GPL(cgroup_path_ns);
>
> +struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
> +                                  struct cgroup_namespace *ns)
> +{
> +       struct cgroup *root_cgrp;
> +
> +       spin_lock_irq(&css_set_lock);
> +       root_cgrp = cgns_cgroup_from_root(cgrp->root, ns);
> +       spin_unlock_irq(&css_set_lock);
> +
> +       return cgrp == root_cgrp ? NULL : cgroup_parent(cgrp);

I understand that currently cgroup_iter is the only user of this, but
for future use cases, is it safe to assume that cgrp will always be
inside ns? Would it be safer to do something like:

struct cgroup *parent = cgroup_parent(cgrp);

if (!parent)
    return NULL;

return cgroup_is_descendant(parent, root_cgrp) ? parent : NULL;


> +}
> +
>  /**
>   * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
>   * @task: target task
> @@ -6031,7 +6043,8 @@ struct cgroup *cgroup_get_from_id(u64 id)
>                 goto out;
>
>         spin_lock_irq(&css_set_lock);
> -       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
> +       root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
> +                                         current->nsproxy->cgroup_ns);
>         spin_unlock_irq(&css_set_lock);
>         if (!cgroup_is_descendant(cgrp, root_cgrp)) {
>                 cgroup_put(cgrp);
> @@ -6612,7 +6625,8 @@ struct cgroup *cgroup_get_from_path(const char *path)
>         struct cgroup *root_cgrp;
>
>         spin_lock_irq(&css_set_lock);
> -       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
> +       root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
> +                                         current->nsproxy->cgroup_ns);
>         kn = kernfs_walk_and_get(root_cgrp->kn, path);
>         spin_unlock_irq(&css_set_lock);
>         if (!kn)
> --
> 2.37.0
>
Michal Koutný Aug. 29, 2022, 12:59 p.m. UTC | #2
On Fri, Aug 26, 2022 at 10:41:37AM -0700, Yosry Ahmed <yosryahmed@google.com> wrote:
> I understand that currently cgroup_iter is the only user of this, but
> for future use cases, is it safe to assume that cgrp will always be
> inside ns? Would it be safer to do something like:

I preferred the simpler root_cgrp comparison to avoid pointer
arithmetics in cgroup_is_descendant. But I also made the assumption of
cgrp in ns.

Thanks, I'll likely adjust cgroup_path_ns to make it more robust for
an external cgrp.


I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
cgroup iterator, exposes it via bpffs and than a process B in a narrowed
cgroup ns (which excludes the origin cgroup) wants to traverse the
iterator, should it fail straight ahead (regardless of iter order)?
The alternative would be to allow self-dereference but prohibit any
iterator moves (regardless of order).


Thanks,
Michal
Yosry Ahmed Aug. 29, 2022, 5:30 p.m. UTC | #3
On Mon, Aug 29, 2022 at 6:00 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Fri, Aug 26, 2022 at 10:41:37AM -0700, Yosry Ahmed <yosryahmed@google.com> wrote:
> > I understand that currently cgroup_iter is the only user of this, but
> > for future use cases, is it safe to assume that cgrp will always be
> > inside ns? Would it be safer to do something like:
>
> I preferred the simpler root_cgrp comparison to avoid pointer
> arithmetics in cgroup_is_descendant. But I also made the assumption of
> cgrp in ns.
>
> Thanks, I'll likely adjust cgroup_path_ns to make it more robust for
> an external cgrp.
>

Great, thanks!

>
> I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
> cgroup iterator, exposes it via bpffs and than a process B in a narrowed
> cgroup ns (which excludes the origin cgroup) wants to traverse the
> iterator, should it fail straight ahead (regardless of iter order)?
> The alternative would be to allow self-dereference but prohibit any
> iterator moves (regardless of order).
>

imo it should fail straight ahead, but maybe others (Tejun? Hao?) have
other opinions here.

>
> Thanks,
> Michal
Tejun Heo Aug. 29, 2022, 5:49 p.m. UTC | #4
On Mon, Aug 29, 2022 at 10:30:45AM -0700, Yosry Ahmed wrote:
> > I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
> > cgroup iterator, exposes it via bpffs and than a process B in a narrowed
> > cgroup ns (which excludes the origin cgroup) wants to traverse the
> > iterator, should it fail straight ahead (regardless of iter order)?
> > The alternative would be to allow self-dereference but prohibit any
> > iterator moves (regardless of order).
> >
> 
> imo it should fail straight ahead, but maybe others (Tejun? Hao?) have
> other opinions here.

Yeah, I'd prefer it to fail right away as that's simple and gives us the
most choices for the future.

Thanks.
Hao Luo Aug. 29, 2022, 6:02 p.m. UTC | #5
On Mon, Aug 29, 2022 at 10:49 AM Tejun Heo <tj@kernel.org> wrote:
>
> On Mon, Aug 29, 2022 at 10:30:45AM -0700, Yosry Ahmed wrote:
> > > I'd like to clarify, if a process A in a broad cgroup ns sets up a BPF
> > > cgroup iterator, exposes it via bpffs and than a process B in a narrowed
> > > cgroup ns (which excludes the origin cgroup) wants to traverse the
> > > iterator, should it fail straight ahead (regardless of iter order)?
> > > The alternative would be to allow self-dereference but prohibit any
> > > iterator moves (regardless of order).
> > >
> >
> > imo it should fail straight ahead, but maybe others (Tejun? Hao?) have
> > other opinions here.
>
> Yeah, I'd prefer it to fail right away as that's simple and gives us the
> most choices for the future.
>

Thanks Michal for fixing the cgroup iter use case! I agree that
failing straight ahead is better. I don't envision a use case that
wants the alternative.

> Thanks.
>
> --
> tejun
diff mbox series

Patch

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b6a9528374a8..b63a80e03fae 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -858,6 +858,9 @@  struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 		   struct cgroup_namespace *ns);
 
+struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
+				struct cgroup_namespace *ns);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index c69bce2f4403..06ee4a0c5870 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -104,6 +104,7 @@  static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v;
 	struct cgroup_iter_priv *p = seq->private;
+	struct cgroup *parent;
 
 	++*pos;
 	if (p->terminate)
@@ -113,9 +114,11 @@  static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return css_next_descendant_pre(curr, p->start_css);
 	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
 		return css_next_descendant_post(curr, p->start_css);
-	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
-		return curr->parent;
-	else  /* BPF_CGROUP_ITER_SELF_ONLY */
+	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) {
+		parent = cgroup_parent_ns(curr->cgroup,
+					  current->nsproxy->cgroup_ns);
+		return parent ? &parent->self : NULL;
+	} else  /* BPF_CGROUP_ITER_SELF_ONLY */
 		return NULL;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c0377726031f..d60b5dfbbbc9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1417,11 +1417,11 @@  static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
 }
 
 /*
- * look up cgroup associated with current task's cgroup namespace on the
+ * look up cgroup associated with given cgroup namespace on the
  * specified hierarchy
  */
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static struct cgroup *cgns_cgroup_from_root(struct cgroup_root *root,
+					    struct cgroup_namespace *ns)
 {
 	struct cgroup *res = NULL;
 	struct css_set *cset;
@@ -1430,7 +1430,7 @@  current_cgns_cgroup_from_root(struct cgroup_root *root)
 
 	rcu_read_lock();
 
-	cset = current->nsproxy->cgroup_ns->root_cset;
+	cset = ns->root_cset;
 	res = __cset_cgroup_from_root(cset, root);
 
 	rcu_read_unlock();
@@ -1852,15 +1852,15 @@  int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	int len = 0;
 	char *buf = NULL;
 	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
-	struct cgroup *ns_cgroup;
+	struct cgroup *root_cgroup;
 
 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
 
 	spin_lock_irq(&css_set_lock);
-	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
-	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+	root_cgroup = cgns_cgroup_from_root(kf_cgroot, current->nsproxy->cgroup_ns);
+	len = kernfs_path_from_node(kf_node, root_cgroup->kn, buf, PATH_MAX);
 	spin_unlock_irq(&css_set_lock);
 
 	if (len >= PATH_MAX)
@@ -2330,6 +2330,18 @@  int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 }
 EXPORT_SYMBOL_GPL(cgroup_path_ns);
 
+struct cgroup *cgroup_parent_ns(struct cgroup *cgrp,
+				   struct cgroup_namespace *ns)
+{
+	struct cgroup *root_cgrp;
+
+	spin_lock_irq(&css_set_lock);
+	root_cgrp = cgns_cgroup_from_root(cgrp->root, ns);
+	spin_unlock_irq(&css_set_lock);
+
+	return cgrp == root_cgrp ? NULL : cgroup_parent(cgrp);
+}
+
 /**
  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
  * @task: target task
@@ -6031,7 +6043,8 @@  struct cgroup *cgroup_get_from_id(u64 id)
 		goto out;
 
 	spin_lock_irq(&css_set_lock);
-	root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+	root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
+					  current->nsproxy->cgroup_ns);
 	spin_unlock_irq(&css_set_lock);
 	if (!cgroup_is_descendant(cgrp, root_cgrp)) {
 		cgroup_put(cgrp);
@@ -6612,7 +6625,8 @@  struct cgroup *cgroup_get_from_path(const char *path)
 	struct cgroup *root_cgrp;
 
 	spin_lock_irq(&css_set_lock);
-	root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+	root_cgrp = cgns_cgroup_from_root(&cgrp_dfl_root,
+					  current->nsproxy->cgroup_ns);
 	kn = kernfs_walk_and_get(root_cgrp->kn, path);
 	spin_unlock_irq(&css_set_lock);
 	if (!kn)