diff mbox series

[RFC,bpf-next,1/9] bpf: introduce CGROUP_SUBSYS_RSTAT program type

Message ID 20220510001807.4132027-2-yosryahmed@google.com (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series bpf: cgroup hierarchical stats collection | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail merge-conflict
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next

Commit Message

Yosry Ahmed May 10, 2022, 12:17 a.m. UTC
This patch introduces a new bpf program type CGROUP_SUBSYS_RSTAT,
with new corresponding link and attach types.

The main purpose of these programs is to allow BPF programs to collect
and maintain hierarchical cgroup stats easily and efficiently by making
use of the rstat framework in the kernel.

Those programs attach to a cgroup subsystem. They typically contain logic
to aggregate per-cpu and per-cgroup stats collected by other BPF programs.

Currently, only rstat flusher programs can be attached to cgroup
subsystems, but this can be extended later if a use-case arises.

See the selftest in the final patch for a practical example.

Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
 include/linux/bpf-cgroup-subsys.h |  30 ++++++
 include/linux/bpf_types.h         |   2 +
 include/linux/cgroup-defs.h       |   4 +
 include/uapi/linux/bpf.h          |  12 +++
 kernel/bpf/Makefile               |   1 +
 kernel/bpf/cgroup_subsys.c        | 166 ++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c              |   6 ++
 kernel/cgroup/cgroup.c            |   1 +
 tools/include/uapi/linux/bpf.h    |  12 +++
 9 files changed, 234 insertions(+)
 create mode 100644 include/linux/bpf-cgroup-subsys.h
 create mode 100644 kernel/bpf/cgroup_subsys.c

Comments

Yosry Ahmed May 10, 2022, 6:07 p.m. UTC | #1
On Mon, May 9, 2022 at 5:18 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> This patch introduces a new bpf program type CGROUP_SUBSYS_RSTAT,
> with new corresponding link and attach types.
>
> The main purpose of these programs is to allow BPF programs to collect
> and maintain hierarchical cgroup stats easily and efficiently by making
> use of the rstat framework in the kernel.
>
> Those programs attach to a cgroup subsystem. They typically contain logic
> to aggregate per-cpu and per-cgroup stats collected by other BPF programs.
>
> Currently, only rstat flusher programs can be attached to cgroup
> subsystems, but this can be extended later if a use-case arises.
>
> See the selftest in the final patch for a practical example.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>  include/linux/bpf-cgroup-subsys.h |  30 ++++++
>  include/linux/bpf_types.h         |   2 +
>  include/linux/cgroup-defs.h       |   4 +
>  include/uapi/linux/bpf.h          |  12 +++
>  kernel/bpf/Makefile               |   1 +
>  kernel/bpf/cgroup_subsys.c        | 166 ++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c              |   6 ++
>  kernel/cgroup/cgroup.c            |   1 +
>  tools/include/uapi/linux/bpf.h    |  12 +++
>  9 files changed, 234 insertions(+)
>  create mode 100644 include/linux/bpf-cgroup-subsys.h
>  create mode 100644 kernel/bpf/cgroup_subsys.c
>
> diff --git a/include/linux/bpf-cgroup-subsys.h b/include/linux/bpf-cgroup-subsys.h
> new file mode 100644
> index 000000000000..4dcde06b5599
> --- /dev/null
> +++ b/include/linux/bpf-cgroup-subsys.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright 2022 Google LLC.
> + */
> +#ifndef _BPF_CGROUP_SUBSYS_H_
> +#define _BPF_CGROUP_SUBSYS_H_
> +
> +#include <linux/bpf.h>
> +
> +struct cgroup_subsys_bpf {
> +       /* Head of the list of BPF rstat flushers attached to this subsystem */
> +       struct list_head rstat_flushers;
> +       spinlock_t flushers_lock;
> +};
> +
> +struct bpf_subsys_rstat_flusher {
> +       struct bpf_prog *prog;
> +       /* List of BPF rtstat flushers, anchored at subsys->bpf */
> +       struct list_head list;
> +};
> +
> +struct bpf_cgroup_subsys_link {
> +       struct bpf_link link;
> +       struct cgroup_subsys *ss;
> +};
> +
> +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> +                                 struct bpf_prog *prog);
> +

In the next version I will make sure everything here is also defined
for when CONFIG_BPF_SYSCALL is not set, and move the structs that can
be moved to the cc file there.

> +#endif  // _BPF_CGROUP_SUBSYS_H_
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 3e24ad0c4b3c..854ee958b0e4 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
>               struct bpf_sysctl, struct bpf_sysctl_kern)
>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
>               struct bpf_sockopt, struct bpf_sockopt_kern)
> +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, cgroup_subsys_rstat,
> +             struct bpf_rstat_ctx, struct bpf_rstat_ctx)
>  #endif
>  #ifdef CONFIG_BPF_LIRC_MODE2
>  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 1bfcfb1af352..3bd6eed1fa13 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -20,6 +20,7 @@
>  #include <linux/u64_stats_sync.h>
>  #include <linux/workqueue.h>
>  #include <linux/bpf-cgroup-defs.h>
> +#include <linux/bpf-cgroup-subsys.h>
>  #include <linux/psi_types.h>
>
>  #ifdef CONFIG_CGROUPS
> @@ -706,6 +707,9 @@ struct cgroup_subsys {
>          * specifies the mask of subsystems that this one depends on.
>          */
>         unsigned int depends_on;
> +
> +       /* used to store bpf programs.*/
> +       struct cgroup_subsys_bpf bpf;
>  };
>
>  extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d14b10b85e51..0f4855fa85db 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -952,6 +952,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_LSM,
>         BPF_PROG_TYPE_SK_LOOKUP,
>         BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> +       BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
>  };
>
>  enum bpf_attach_type {
> @@ -998,6 +999,7 @@ enum bpf_attach_type {
>         BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
>         BPF_PERF_EVENT,
>         BPF_TRACE_KPROBE_MULTI,
> +       BPF_CGROUP_SUBSYS_RSTAT,
>         __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -1013,6 +1015,7 @@ enum bpf_link_type {
>         BPF_LINK_TYPE_XDP = 6,
>         BPF_LINK_TYPE_PERF_EVENT = 7,
>         BPF_LINK_TYPE_KPROBE_MULTI = 8,
> +       BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
>
>         MAX_BPF_LINK_TYPE,
>  };
> @@ -1482,6 +1485,9 @@ union bpf_attr {
>                                  */
>                                 __u64           bpf_cookie;
>                         } perf_event;
> +                       struct {
> +                               __u64           name;
> +                       } cgroup_subsys;
>                         struct {
>                                 __u32           flags;
>                                 __u32           cnt;
> @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
>         __u32 minor;
>  };
>
> +struct bpf_rstat_ctx {
> +       __u64 cgroup_id;
> +       __u64 parent_cgroup_id; /* 0 if root */
> +       __s32 cpu;
> +};
> +
>  struct bpf_raw_tracepoint_args {
>         __u64 args[0];
>  };
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index c1a9be6a4b9f..6caf4a61e543 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -25,6 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
>  obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
>  endif
>  obj-$(CONFIG_CGROUP_BPF) += cgroup.o
> +obj-$(CONFIG_CGROUP_BPF) += cgroup_subsys.o

In the next version I will replace this with:
ifeq ($(CONFIG_CGROUP),y)
obj-$(CONFIG_BPF_SYSCALL) += cgroup_subsys.o
endif

, as this program type doesn't attach to cgroups and does not depend
on CONFIG_CGROUP_BPF, only CONFIG_CGROUP and CONFIG_BPF_SYSCALL.

>  ifeq ($(CONFIG_INET),y)
>  obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
>  endif
> diff --git a/kernel/bpf/cgroup_subsys.c b/kernel/bpf/cgroup_subsys.c
> new file mode 100644
> index 000000000000..9673ce6aa84a
> --- /dev/null
> +++ b/kernel/bpf/cgroup_subsys.c
> @@ -0,0 +1,166 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Functions to manage eBPF programs attached to cgroup subsystems
> + *
> + * Copyright 2022 Google LLC.
> + */
> +
> +#include <linux/bpf-cgroup-subsys.h>
> +#include <linux/filter.h>
> +
> +#include "../cgroup/cgroup-internal.h"
> +
> +
> +static int cgroup_subsys_bpf_attach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> +{
> +       struct bpf_subsys_rstat_flusher *rstat_flusher;
> +
> +       rstat_flusher = kmalloc(sizeof(*rstat_flusher), GFP_KERNEL);
> +       if (!rstat_flusher)
> +               return -ENOMEM;
> +       rstat_flusher->prog = prog;
> +
> +       spin_lock(&ss->bpf.flushers_lock);
> +       list_add(&rstat_flusher->list, &ss->bpf.rstat_flushers);
> +       spin_unlock(&ss->bpf.flushers_lock);
> +
> +       return 0;
> +}
> +
> +static void cgroup_subsys_bpf_detach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> +{
> +       struct bpf_subsys_rstat_flusher *rstat_flusher = NULL;
> +
> +       spin_lock(&ss->bpf.flushers_lock);
> +       list_for_each_entry(rstat_flusher, &ss->bpf.rstat_flushers, list)
> +               if (rstat_flusher->prog == prog)
> +                       break;
> +
> +       if (rstat_flusher) {
> +               list_del(&rstat_flusher->list);
> +               bpf_prog_put(rstat_flusher->prog);
> +               kfree(rstat_flusher);
> +       }
> +       spin_unlock(&ss->bpf.flushers_lock);
> +}
> +
> +static void bpf_cgroup_subsys_link_release(struct bpf_link *link)
> +{
> +       struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> +                                                      struct bpf_cgroup_subsys_link,
> +                                                      link);
> +       if (ss_link->ss) {
> +               cgroup_subsys_bpf_detach(ss_link->ss, ss_link->link.prog);
> +               ss_link->ss = NULL;
> +       }
> +}
> +
> +static int bpf_cgroup_subsys_link_detach(struct bpf_link *link)
> +{
> +       bpf_cgroup_subsys_link_release(link);
> +       return 0;
> +}
> +
> +static void bpf_cgroup_subsys_link_dealloc(struct bpf_link *link)
> +{
> +       struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> +                                                      struct bpf_cgroup_subsys_link,
> +                                                      link);
> +       kfree(ss_link);
> +}
> +
> +static const struct bpf_link_ops bpf_cgroup_subsys_link_lops = {
> +       .detach = bpf_cgroup_subsys_link_detach,
> +       .release = bpf_cgroup_subsys_link_release,
> +       .dealloc = bpf_cgroup_subsys_link_dealloc,
> +};
> +
> +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> +                                 struct bpf_prog *prog)
> +{
> +       struct bpf_link_primer link_primer;
> +       struct bpf_cgroup_subsys_link *link;
> +       struct cgroup_subsys *ss, *attach_ss = NULL;
> +       const char __user *ss_name_user;
> +       char ss_name[MAX_CGROUP_TYPE_NAMELEN];
> +       int ssid, err;
> +
> +       if (attr->link_create.target_fd || attr->link_create.flags)
> +               return -EINVAL;
> +
> +       ss_name_user = u64_to_user_ptr(attr->link_create.cgroup_subsys.name);
> +       if (strncpy_from_user(ss_name, ss_name_user, sizeof(ss_name) - 1) < 0)
> +               return -EFAULT;
> +
> +       for_each_subsys(ss, ssid)
> +               if (!strcmp(ss_name, ss->name) ||
> +                   !strcmp(ss_name, ss->legacy_name))
> +                       attach_ss = ss;
> +
> +       if (!attach_ss)
> +               return -EINVAL;
> +
> +       link = kzalloc(sizeof(*link), GFP_USER);
> +       if (!link)
> +               return -ENOMEM;
> +
> +       bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP_SUBSYS,
> +                     &bpf_cgroup_subsys_link_lops,
> +                     prog);
> +       link->ss = attach_ss;
> +
> +       err = bpf_link_prime(&link->link, &link_primer);
> +       if (err) {
> +               kfree(link);
> +               return err;
> +       }
> +
> +       err = cgroup_subsys_bpf_attach(attach_ss, prog);
> +       if (err) {
> +               bpf_link_cleanup(&link_primer);
> +               return err;
> +       }
> +
> +       return bpf_link_settle(&link_primer);
> +}
> +
> +static const struct bpf_func_proto *
> +cgroup_subsys_rstat_func_proto(enum bpf_func_id func_id,
> +                              const struct bpf_prog *prog)
> +{
> +       return bpf_base_func_proto(func_id);
> +}
> +
> +static bool cgroup_subsys_rstat_is_valid_access(int off, int size,
> +                                          enum bpf_access_type type,
> +                                          const struct bpf_prog *prog,
> +                                          struct bpf_insn_access_aux *info)
> +{
> +       if (type == BPF_WRITE)
> +               return false;
> +
> +       if (off < 0 || off + size > sizeof(struct bpf_rstat_ctx))
> +               return false;
> +       /* The verifier guarantees that size > 0 */
> +       if (off % size != 0)
> +               return false;
> +
> +       switch (off) {
> +       case offsetof(struct bpf_rstat_ctx, cgroup_id):
> +               return size == sizeof(__u64);
> +       case offsetof(struct bpf_rstat_ctx, parent_cgroup_id):
> +               return size == sizeof(__u64);
> +       case offsetof(struct bpf_rstat_ctx, cpu):
> +               return size == sizeof(__s32);
> +       default:
> +               return false;
> +       }
> +}
> +
> +const struct bpf_prog_ops cgroup_subsys_rstat_prog_ops = {
> +};
> +
> +const struct bpf_verifier_ops cgroup_subsys_rstat_verifier_ops = {
> +       .get_func_proto         = cgroup_subsys_rstat_func_proto,
> +       .is_valid_access        = cgroup_subsys_rstat_is_valid_access,
> +};
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index cdaa1152436a..48149c54d969 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3,6 +3,7 @@
>   */
>  #include <linux/bpf.h>
>  #include <linux/bpf-cgroup.h>
> +#include <linux/bpf-cgroup-subsys.h>
>  #include <linux/bpf_trace.h>
>  #include <linux/bpf_lirc.h>
>  #include <linux/bpf_verifier.h>
> @@ -3194,6 +3195,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
>                 return BPF_PROG_TYPE_SK_LOOKUP;
>         case BPF_XDP:
>                 return BPF_PROG_TYPE_XDP;
> +       case BPF_CGROUP_SUBSYS_RSTAT:
> +               return BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT;
>         default:
>                 return BPF_PROG_TYPE_UNSPEC;
>         }
> @@ -4341,6 +4344,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
>                 else
>                         ret = bpf_kprobe_multi_link_attach(attr, prog);
>                 break;
> +       case BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT:
> +               ret = cgroup_subsys_bpf_link_attach(attr, prog);
> +               break;
>         default:
>                 ret = -EINVAL;
>         }
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index adb820e98f24..7b1448013009 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -5745,6 +5745,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
>
>         idr_init(&ss->css_idr);
>         INIT_LIST_HEAD(&ss->cfts);
> +       INIT_LIST_HEAD(&ss->bpf.rstat_flushers);
>
>         /* Create the root cgroup state for this subsystem */
>         ss->root = &cgrp_dfl_root;
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index d14b10b85e51..0f4855fa85db 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -952,6 +952,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_LSM,
>         BPF_PROG_TYPE_SK_LOOKUP,
>         BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> +       BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
>  };
>
>  enum bpf_attach_type {
> @@ -998,6 +999,7 @@ enum bpf_attach_type {
>         BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
>         BPF_PERF_EVENT,
>         BPF_TRACE_KPROBE_MULTI,
> +       BPF_CGROUP_SUBSYS_RSTAT,
>         __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -1013,6 +1015,7 @@ enum bpf_link_type {
>         BPF_LINK_TYPE_XDP = 6,
>         BPF_LINK_TYPE_PERF_EVENT = 7,
>         BPF_LINK_TYPE_KPROBE_MULTI = 8,
> +       BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
>
>         MAX_BPF_LINK_TYPE,
>  };
> @@ -1482,6 +1485,9 @@ union bpf_attr {
>                                  */
>                                 __u64           bpf_cookie;
>                         } perf_event;
> +                       struct {
> +                               __u64           name;
> +                       } cgroup_subsys;
>                         struct {
>                                 __u32           flags;
>                                 __u32           cnt;
> @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
>         __u32 minor;
>  };
>
> +struct bpf_rstat_ctx {
> +       __u64 cgroup_id;
> +       __u64 parent_cgroup_id; /* 0 if root */
> +       __s32 cpu;
> +};
> +
>  struct bpf_raw_tracepoint_args {
>         __u64 args[0];
>  };
> --
> 2.36.0.512.ge40c2bad7a-goog
>
Tejun Heo May 10, 2022, 6:44 p.m. UTC | #2
Hello,

On Tue, May 10, 2022 at 12:17:59AM +0000, Yosry Ahmed wrote:
> @@ -706,6 +707,9 @@ struct cgroup_subsys {
>  	 * specifies the mask of subsystems that this one depends on.
>  	 */
>  	unsigned int depends_on;
> +
> +	/* used to store bpf programs.*/
> +	struct cgroup_subsys_bpf bpf;
>  };

Care to elaborate on rationales around associating this with a specific
cgroup_subsys rather than letting it walk cgroups and access whatever csses
as needed? I don't think it's a wrong approach or anything but I can think
of plenty of things that would be interesting without being associated with
a specific subsystem - even all the cpu usage statistics are built into the
cgroup core and given how e.g. systemd uses cgroup to organize the
applications in the system whether resource control is active or not, there
are a lot of info one can gather about those without being associated with a
specific subsystem.

Thanks.
Yosry Ahmed May 10, 2022, 7:21 p.m. UTC | #3
On Tue, May 10, 2022 at 11:07 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Mon, May 9, 2022 at 5:18 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > This patch introduces a new bpf program type CGROUP_SUBSYS_RSTAT,
> > with new corresponding link and attach types.
> >
> > The main purpose of these programs is to allow BPF programs to collect
> > and maintain hierarchical cgroup stats easily and efficiently by making
> > use of the rstat framework in the kernel.
> >
> > Those programs attach to a cgroup subsystem. They typically contain logic
> > to aggregate per-cpu and per-cgroup stats collected by other BPF programs.
> >
> > Currently, only rstat flusher programs can be attached to cgroup
> > subsystems, but this can be extended later if a use-case arises.
> >
> > See the selftest in the final patch for a practical example.
> >
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> >  include/linux/bpf-cgroup-subsys.h |  30 ++++++
> >  include/linux/bpf_types.h         |   2 +
> >  include/linux/cgroup-defs.h       |   4 +
> >  include/uapi/linux/bpf.h          |  12 +++
> >  kernel/bpf/Makefile               |   1 +
> >  kernel/bpf/cgroup_subsys.c        | 166 ++++++++++++++++++++++++++++++
> >  kernel/bpf/syscall.c              |   6 ++
> >  kernel/cgroup/cgroup.c            |   1 +
> >  tools/include/uapi/linux/bpf.h    |  12 +++
> >  9 files changed, 234 insertions(+)
> >  create mode 100644 include/linux/bpf-cgroup-subsys.h
> >  create mode 100644 kernel/bpf/cgroup_subsys.c
> >
> > diff --git a/include/linux/bpf-cgroup-subsys.h b/include/linux/bpf-cgroup-subsys.h
> > new file mode 100644
> > index 000000000000..4dcde06b5599
> > --- /dev/null
> > +++ b/include/linux/bpf-cgroup-subsys.h
> > @@ -0,0 +1,30 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright 2022 Google LLC.
> > + */
> > +#ifndef _BPF_CGROUP_SUBSYS_H_
> > +#define _BPF_CGROUP_SUBSYS_H_
> > +
> > +#include <linux/bpf.h>
> > +
> > +struct cgroup_subsys_bpf {
> > +       /* Head of the list of BPF rstat flushers attached to this subsystem */
> > +       struct list_head rstat_flushers;
> > +       spinlock_t flushers_lock;
> > +};
> > +
> > +struct bpf_subsys_rstat_flusher {
> > +       struct bpf_prog *prog;
> > +       /* List of BPF rtstat flushers, anchored at subsys->bpf */
> > +       struct list_head list;
> > +};
> > +
> > +struct bpf_cgroup_subsys_link {
> > +       struct bpf_link link;
> > +       struct cgroup_subsys *ss;
> > +};
> > +
> > +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> > +                                 struct bpf_prog *prog);
> > +
>
> In the next version I will make sure everything here is also defined
> for when CONFIG_BPF_SYSCALL is not set, and move the structs that can
> be moved to the cc file there.
>
> > +#endif  // _BPF_CGROUP_SUBSYS_H_
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index 3e24ad0c4b3c..854ee958b0e4 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
> >               struct bpf_sysctl, struct bpf_sysctl_kern)
> >  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
> >               struct bpf_sockopt, struct bpf_sockopt_kern)
> > +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, cgroup_subsys_rstat,
> > +             struct bpf_rstat_ctx, struct bpf_rstat_ctx)
> >  #endif
> >  #ifdef CONFIG_BPF_LIRC_MODE2
> >  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
> > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> > index 1bfcfb1af352..3bd6eed1fa13 100644
> > --- a/include/linux/cgroup-defs.h
> > +++ b/include/linux/cgroup-defs.h
> > @@ -20,6 +20,7 @@
> >  #include <linux/u64_stats_sync.h>
> >  #include <linux/workqueue.h>
> >  #include <linux/bpf-cgroup-defs.h>
> > +#include <linux/bpf-cgroup-subsys.h>
> >  #include <linux/psi_types.h>
> >
> >  #ifdef CONFIG_CGROUPS
> > @@ -706,6 +707,9 @@ struct cgroup_subsys {
> >          * specifies the mask of subsystems that this one depends on.
> >          */
> >         unsigned int depends_on;
> > +
> > +       /* used to store bpf programs.*/
> > +       struct cgroup_subsys_bpf bpf;
> >  };
> >
> >  extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index d14b10b85e51..0f4855fa85db 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -952,6 +952,7 @@ enum bpf_prog_type {
> >         BPF_PROG_TYPE_LSM,
> >         BPF_PROG_TYPE_SK_LOOKUP,
> >         BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> > +       BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
> >  };
> >
> >  enum bpf_attach_type {
> > @@ -998,6 +999,7 @@ enum bpf_attach_type {
> >         BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> >         BPF_PERF_EVENT,
> >         BPF_TRACE_KPROBE_MULTI,
> > +       BPF_CGROUP_SUBSYS_RSTAT,
> >         __MAX_BPF_ATTACH_TYPE
> >  };
> >
> > @@ -1013,6 +1015,7 @@ enum bpf_link_type {
> >         BPF_LINK_TYPE_XDP = 6,
> >         BPF_LINK_TYPE_PERF_EVENT = 7,
> >         BPF_LINK_TYPE_KPROBE_MULTI = 8,
> > +       BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
> >
> >         MAX_BPF_LINK_TYPE,
> >  };
> > @@ -1482,6 +1485,9 @@ union bpf_attr {
> >                                  */
> >                                 __u64           bpf_cookie;
> >                         } perf_event;
> > +                       struct {
> > +                               __u64           name;
> > +                       } cgroup_subsys;
> >                         struct {
> >                                 __u32           flags;
> >                                 __u32           cnt;
> > @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
> >         __u32 minor;
> >  };
> >
> > +struct bpf_rstat_ctx {
> > +       __u64 cgroup_id;
> > +       __u64 parent_cgroup_id; /* 0 if root */
> > +       __s32 cpu;
> > +};
> > +
> >  struct bpf_raw_tracepoint_args {
> >         __u64 args[0];
> >  };
> > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> > index c1a9be6a4b9f..6caf4a61e543 100644
> > --- a/kernel/bpf/Makefile
> > +++ b/kernel/bpf/Makefile
> > @@ -25,6 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
> >  obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
> >  endif
> >  obj-$(CONFIG_CGROUP_BPF) += cgroup.o
> > +obj-$(CONFIG_CGROUP_BPF) += cgroup_subsys.o
>
> In the next version I will replace this with:
> ifeq ($(CONFIG_CGROUP),y)
> obj-$(CONFIG_BPF_SYSCALL) += cgroup_subsys.o
> endif
>
> , as this program type doesn't attach to cgroups and does not depend
> on CONFIG_CGROUP_BPF, only CONFIG_CGROUP and CONFIG_BPF_SYSCALL.

On second thought it might be simpler and cleaner to leave this code
under CONFIG_CGROUP_BPF.

>
> >  ifeq ($(CONFIG_INET),y)
> >  obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
> >  endif
> > diff --git a/kernel/bpf/cgroup_subsys.c b/kernel/bpf/cgroup_subsys.c
> > new file mode 100644
> > index 000000000000..9673ce6aa84a
> > --- /dev/null
> > +++ b/kernel/bpf/cgroup_subsys.c
> > @@ -0,0 +1,166 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Functions to manage eBPF programs attached to cgroup subsystems
> > + *
> > + * Copyright 2022 Google LLC.
> > + */
> > +
> > +#include <linux/bpf-cgroup-subsys.h>
> > +#include <linux/filter.h>
> > +
> > +#include "../cgroup/cgroup-internal.h"
> > +
> > +
> > +static int cgroup_subsys_bpf_attach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> > +{
> > +       struct bpf_subsys_rstat_flusher *rstat_flusher;
> > +
> > +       rstat_flusher = kmalloc(sizeof(*rstat_flusher), GFP_KERNEL);
> > +       if (!rstat_flusher)
> > +               return -ENOMEM;
> > +       rstat_flusher->prog = prog;
> > +
> > +       spin_lock(&ss->bpf.flushers_lock);
> > +       list_add(&rstat_flusher->list, &ss->bpf.rstat_flushers);
> > +       spin_unlock(&ss->bpf.flushers_lock);
> > +
> > +       return 0;
> > +}
> > +
> > +static void cgroup_subsys_bpf_detach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> > +{
> > +       struct bpf_subsys_rstat_flusher *rstat_flusher = NULL;
> > +
> > +       spin_lock(&ss->bpf.flushers_lock);
> > +       list_for_each_entry(rstat_flusher, &ss->bpf.rstat_flushers, list)
> > +               if (rstat_flusher->prog == prog)
> > +                       break;
> > +
> > +       if (rstat_flusher) {
> > +               list_del(&rstat_flusher->list);
> > +               bpf_prog_put(rstat_flusher->prog);
> > +               kfree(rstat_flusher);
> > +       }
> > +       spin_unlock(&ss->bpf.flushers_lock);
> > +}
> > +
> > +static void bpf_cgroup_subsys_link_release(struct bpf_link *link)
> > +{
> > +       struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> > +                                                      struct bpf_cgroup_subsys_link,
> > +                                                      link);
> > +       if (ss_link->ss) {
> > +               cgroup_subsys_bpf_detach(ss_link->ss, ss_link->link.prog);
> > +               ss_link->ss = NULL;
> > +       }
> > +}
> > +
> > +static int bpf_cgroup_subsys_link_detach(struct bpf_link *link)
> > +{
> > +       bpf_cgroup_subsys_link_release(link);
> > +       return 0;
> > +}
> > +
> > +static void bpf_cgroup_subsys_link_dealloc(struct bpf_link *link)
> > +{
> > +       struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> > +                                                      struct bpf_cgroup_subsys_link,
> > +                                                      link);
> > +       kfree(ss_link);
> > +}
> > +
> > +static const struct bpf_link_ops bpf_cgroup_subsys_link_lops = {
> > +       .detach = bpf_cgroup_subsys_link_detach,
> > +       .release = bpf_cgroup_subsys_link_release,
> > +       .dealloc = bpf_cgroup_subsys_link_dealloc,
> > +};
> > +
> > +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> > +                                 struct bpf_prog *prog)
> > +{
> > +       struct bpf_link_primer link_primer;
> > +       struct bpf_cgroup_subsys_link *link;
> > +       struct cgroup_subsys *ss, *attach_ss = NULL;
> > +       const char __user *ss_name_user;
> > +       char ss_name[MAX_CGROUP_TYPE_NAMELEN];
> > +       int ssid, err;
> > +
> > +       if (attr->link_create.target_fd || attr->link_create.flags)
> > +               return -EINVAL;
> > +
> > +       ss_name_user = u64_to_user_ptr(attr->link_create.cgroup_subsys.name);
> > +       if (strncpy_from_user(ss_name, ss_name_user, sizeof(ss_name) - 1) < 0)
> > +               return -EFAULT;
> > +
> > +       for_each_subsys(ss, ssid)
> > +               if (!strcmp(ss_name, ss->name) ||
> > +                   !strcmp(ss_name, ss->legacy_name))
> > +                       attach_ss = ss;
> > +
> > +       if (!attach_ss)
> > +               return -EINVAL;
> > +
> > +       link = kzalloc(sizeof(*link), GFP_USER);
> > +       if (!link)
> > +               return -ENOMEM;
> > +
> > +       bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP_SUBSYS,
> > +                     &bpf_cgroup_subsys_link_lops,
> > +                     prog);
> > +       link->ss = attach_ss;
> > +
> > +       err = bpf_link_prime(&link->link, &link_primer);
> > +       if (err) {
> > +               kfree(link);
> > +               return err;
> > +       }
> > +
> > +       err = cgroup_subsys_bpf_attach(attach_ss, prog);
> > +       if (err) {
> > +               bpf_link_cleanup(&link_primer);
> > +               return err;
> > +       }
> > +
> > +       return bpf_link_settle(&link_primer);
> > +}
> > +
> > +static const struct bpf_func_proto *
> > +cgroup_subsys_rstat_func_proto(enum bpf_func_id func_id,
> > +                              const struct bpf_prog *prog)
> > +{
> > +       return bpf_base_func_proto(func_id);
> > +}
> > +
> > +static bool cgroup_subsys_rstat_is_valid_access(int off, int size,
> > +                                          enum bpf_access_type type,
> > +                                          const struct bpf_prog *prog,
> > +                                          struct bpf_insn_access_aux *info)
> > +{
> > +       if (type == BPF_WRITE)
> > +               return false;
> > +
> > +       if (off < 0 || off + size > sizeof(struct bpf_rstat_ctx))
> > +               return false;
> > +       /* The verifier guarantees that size > 0 */
> > +       if (off % size != 0)
> > +               return false;
> > +
> > +       switch (off) {
> > +       case offsetof(struct bpf_rstat_ctx, cgroup_id):
> > +               return size == sizeof(__u64);
> > +       case offsetof(struct bpf_rstat_ctx, parent_cgroup_id):
> > +               return size == sizeof(__u64);
> > +       case offsetof(struct bpf_rstat_ctx, cpu):
> > +               return size == sizeof(__s32);
> > +       default:
> > +               return false;
> > +       }
> > +}
> > +
> > +const struct bpf_prog_ops cgroup_subsys_rstat_prog_ops = {
> > +};
> > +
> > +const struct bpf_verifier_ops cgroup_subsys_rstat_verifier_ops = {
> > +       .get_func_proto         = cgroup_subsys_rstat_func_proto,
> > +       .is_valid_access        = cgroup_subsys_rstat_is_valid_access,
> > +};
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index cdaa1152436a..48149c54d969 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -3,6 +3,7 @@
> >   */
> >  #include <linux/bpf.h>
> >  #include <linux/bpf-cgroup.h>
> > +#include <linux/bpf-cgroup-subsys.h>
> >  #include <linux/bpf_trace.h>
> >  #include <linux/bpf_lirc.h>
> >  #include <linux/bpf_verifier.h>
> > @@ -3194,6 +3195,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
> >                 return BPF_PROG_TYPE_SK_LOOKUP;
> >         case BPF_XDP:
> >                 return BPF_PROG_TYPE_XDP;
> > +       case BPF_CGROUP_SUBSYS_RSTAT:
> > +               return BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT;
> >         default:
> >                 return BPF_PROG_TYPE_UNSPEC;
> >         }
> > @@ -4341,6 +4344,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
> >                 else
> >                         ret = bpf_kprobe_multi_link_attach(attr, prog);
> >                 break;
> > +       case BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT:
> > +               ret = cgroup_subsys_bpf_link_attach(attr, prog);
> > +               break;
> >         default:
> >                 ret = -EINVAL;
> >         }
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index adb820e98f24..7b1448013009 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -5745,6 +5745,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
> >
> >         idr_init(&ss->css_idr);
> >         INIT_LIST_HEAD(&ss->cfts);
> > +       INIT_LIST_HEAD(&ss->bpf.rstat_flushers);
> >
> >         /* Create the root cgroup state for this subsystem */
> >         ss->root = &cgrp_dfl_root;
> > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > index d14b10b85e51..0f4855fa85db 100644
> > --- a/tools/include/uapi/linux/bpf.h
> > +++ b/tools/include/uapi/linux/bpf.h
> > @@ -952,6 +952,7 @@ enum bpf_prog_type {
> >         BPF_PROG_TYPE_LSM,
> >         BPF_PROG_TYPE_SK_LOOKUP,
> >         BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> > +       BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
> >  };
> >
> >  enum bpf_attach_type {
> > @@ -998,6 +999,7 @@ enum bpf_attach_type {
> >         BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> >         BPF_PERF_EVENT,
> >         BPF_TRACE_KPROBE_MULTI,
> > +       BPF_CGROUP_SUBSYS_RSTAT,
> >         __MAX_BPF_ATTACH_TYPE
> >  };
> >
> > @@ -1013,6 +1015,7 @@ enum bpf_link_type {
> >         BPF_LINK_TYPE_XDP = 6,
> >         BPF_LINK_TYPE_PERF_EVENT = 7,
> >         BPF_LINK_TYPE_KPROBE_MULTI = 8,
> > +       BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
> >
> >         MAX_BPF_LINK_TYPE,
> >  };
> > @@ -1482,6 +1485,9 @@ union bpf_attr {
> >                                  */
> >                                 __u64           bpf_cookie;
> >                         } perf_event;
> > +                       struct {
> > +                               __u64           name;
> > +                       } cgroup_subsys;
> >                         struct {
> >                                 __u32           flags;
> >                                 __u32           cnt;
> > @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
> >         __u32 minor;
> >  };
> >
> > +struct bpf_rstat_ctx {
> > +       __u64 cgroup_id;
> > +       __u64 parent_cgroup_id; /* 0 if root */
> > +       __s32 cpu;
> > +};
> > +
> >  struct bpf_raw_tracepoint_args {
> >         __u64 args[0];
> >  };
> > --
> > 2.36.0.512.ge40c2bad7a-goog
> >
Yosry Ahmed May 10, 2022, 7:34 p.m. UTC | #4
On Tue, May 10, 2022 at 11:44 AM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, May 10, 2022 at 12:17:59AM +0000, Yosry Ahmed wrote:
> > @@ -706,6 +707,9 @@ struct cgroup_subsys {
> >        * specifies the mask of subsystems that this one depends on.
> >        */
> >       unsigned int depends_on;
> > +
> > +     /* used to store bpf programs.*/
> > +     struct cgroup_subsys_bpf bpf;
> >  };
>
> Care to elaborate on rationales around associating this with a specific
> cgroup_subsys rather than letting it walk cgroups and access whatever csses
> as needed? I don't think it's a wrong approach or anything but I can think
> of plenty of things that would be interesting without being associated with
> a specific subsystem - even all the cpu usage statistics are built to in the
> cgroup core and given how e.g. systemd uses cgroup to organize the
> applications in the system whether resource control is active or not, there
> are a lot of info one can gather about those without being associated with a
> specific subsystem.

Hi Tejun,

Thanks so much for taking the time to look into this!

The rationale behind associating this work with cgroup_subsys is that
usually the stats are associated with a resource (e.g. memory, cpu,
etc). For example, if the memory controller is only enabled for a
subtree in a big hierarchy, it would be more efficient to only run BPF
rstat programs for those cgroups, not the entire hierarchy. It
provides a way to control what part of the hierarchy you want to
collect stats for. This is also semantically similar to the
css_rstat_flush() callback.

However, I do see your point about the benefits of collecting stats
that are not associated with any controller. I think there are
multiple options here, and I would love to hear what you prefer:
1. In addition to subsystems, support an "all" or "cgroup" attach
point that loads BPF rstat flush programs that will run for all
cgroups.
2. Simplify the interface so that all BPF rstat flush programs run for
all cgroups, and add the subsystem association later if a need arises.
3. Instead of attaching BPF programs to a subsystem, attach them to a
cgroup. This gives more flexibility, but also makes lifetime handling
of programs more complicated and error-prone. I can also see most use
cases (including ours) attaching programs to the root cgroup anyway.
In this case, we waste space by storing pointers to the same program
in every cgroup, and have unnecessary complexity in the code.

Let me know what you think!

>
> Thanks.
>
> --
> tejun
Tejun Heo May 10, 2022, 7:59 p.m. UTC | #5
Hello,

On Tue, May 10, 2022 at 12:34:42PM -0700, Yosry Ahmed wrote:
> The rationale behind associating this work with cgroup_subsys is that
> usually the stats are associated with a resource (e.g. memory, cpu,
> etc). For example, if the memory controller is only enabled for a
> subtree in a big hierarchy, it would be more efficient to only run BPF
> rstat programs for those cgroups, not the entire hierarchy. It
> provides a way to control what part of the hierarchy you want to
> collect stats for. This is also semantically similar to the
> css_rstat_flush() callback.

Hmm... one major point of rstat is not having to worry about these things
because we iterate what's been active rather than what exists. Now, this
isn't entirely true because we share the same updated list for all sources.
This is a trade-off which makes sense because 1. the number of cgroups to
iterate each cycle is generally really low anyway 2. different controllers
often get enabled together. If the balance tilts towards "we're walking too
many due to the sharing of updated list across different sources", the
solution would be splitting the updated list so that we make the walk finer
grained.

Note that the above doesn't really affect the conceptual model. It's purely
an optimization decision. Tying these things to a cgroup_subsys does affect
the conceptual model and, in this case, the userland API for a performance
consideration which can be solved otherwise.

So, let's please keep this simple and in the (unlikely) case that the
overhead becomes an issue, solve it from rstat operation side.

Thanks.
Yosry Ahmed May 10, 2022, 8:43 p.m. UTC | #6
On Tue, May 10, 2022 at 12:59 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, May 10, 2022 at 12:34:42PM -0700, Yosry Ahmed wrote:
> > The rationale behind associating this work with cgroup_subsys is that
> > usually the stats are associated with a resource (e.g. memory, cpu,
> > etc). For example, if the memory controller is only enabled for a
> > subtree in a big hierarchy, it would be more efficient to only run BPF
> > rstat programs for those cgroups, not the entire hierarchy. It
> > provides a way to control what part of the hierarchy you want to
> > collect stats for. This is also semantically similar to the
> > css_rstat_flush() callback.
>
> Hmm... one major point of rstat is not having to worry about these things
> because we iterate what's been active rather than what exists. Now, this
> isn't entirely true because we share the same updated list for all sources.
> This is a trade-off which makes sense because 1. the number of cgroups to
> iterate each cycle is generally really low anyway 2. different controllers
> often get enabled together. If the balance tilts towards "we're walking too
> many due to the sharing of updated list across different sources", the
> solution would be splitting the updated list so that we make the walk finer
> grained.
>
> Note that the above doesn't really affect the conceptual model. It's purely
> an optimization decision. Tying these things to a cgroup_subsys does affect
> the conceptual model and, in this case, the userland API for a performance
> consideration which can be solved otherwise.
>
> So, let's please keep this simple and in the (unlikely) case that the
> overhead becomes an issue, solve it from rstat operation side.
>
> Thanks.

I assume if we do this optimization, and have separate updated lists
for controllers, we will still have a "core" updated list that is not
tied to any controller. Is this correct?

If yes, then we can make the interface controller-agnostic (a global
list of BPF flushers). If we do the optimization later, we tie BPF
stats to the "core" updated list. We can even extend the userland
interface then to allow for controller-specific BPF stats if found
useful.

If not, and there will only be controller-specific updated lists then,
then we might need to maintain a "core" updated list just for the sake
of BPF programs, which I don't think would be favorable.

What do you think? Either way, I will try to document our discussion
outcome in the commit message (and maybe the code), so that
if-and-when this optimization is made, we can come back to it.


>
> --
> tejun
Tejun Heo May 10, 2022, 9:01 p.m. UTC | #7
Hello,

On Tue, May 10, 2022 at 01:43:46PM -0700, Yosry Ahmed wrote:
> I assume if we do this optimization, and have separate updated lists
> for controllers, we will still have a "core" updated list that is not
> tied to any controller. Is this correct?

Or we can create a dedicated updated list for the bpf progs, or even
multiple for groups of them and so on.

> If yes, then we can make the interface controller-agnostic (a global
> list of BPF flushers). If we do the optimization later, we tie BPF
> stats to the "core" updated list. We can even extend the userland
> interface then to allow for controller-specific BPF stats if found
> useful.

We'll need that anyway as cpustats are tied to the cgroup themselves rather
than the cpu controller.

> If not, and there will only be controller-specific updated lists then,
> then we might need to maintain a "core" updated list just for the sake
> of BPF programs, which I don't think would be favorable.

If needed, that's fine actually.

> What do you think? Either-way, I will try to document our discussion
> outcome in the commit message (and maybe the code), so that
> if-and-when this optimization is made, we can come back to it.

So, the main focus is keeping the userspace interface as simple as possible
and solving performance issues on the rstat side. If we need however many
updated lists to do that, that's all fine. FWIW, the experience up until now
has been consistent with the assumptions that the current implementation
makes and I haven't seen any real-world cases where the shared updated
lists are problematic.

Thanks.
Yosry Ahmed May 10, 2022, 9:55 p.m. UTC | #8
On Tue, May 10, 2022 at 2:01 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, May 10, 2022 at 01:43:46PM -0700, Yosry Ahmed wrote:
> > I assume if we do this optimization, and have separate updated lists
> > for controllers, we will still have a "core" updated list that is not
> > tied to any controller. Is this correct?
>
> Or we can create a dedicated updated list for the bpf progs, or even
> multiple for groups of them and so on.
>
> > If yes, then we can make the interface controller-agnostic (a global
> > list of BPF flushers). If we do the optimization later, we tie BPF
> > stats to the "core" updated list. We can even extend the userland
> > interface then to allow for controller-specific BPF stats if found
> > useful.
>
> We'll need that anyway as cpustats are tied to the cgroup themselves rather
> than the cpu controller.
>
> > If not, and there will only be controller-specific updated lists then,
> > then we might need to maintain a "core" updated list just for the sake
> > of BPF programs, which I don't think would be favorable.
>
> If needed, that's fine actually.
>
> > What do you think? Either-way, I will try to document our discussion
> > outcome in the commit message (and maybe the code), so that
> > if-and-when this optimization is made, we can come back to it.
>
> So, the main focus is keeping the userspace interface as simple as possible
> and solving performance issues on the rstat side. If we need however many
> updated lists to do that, that's all fine. FWIW, the experience up until now
> has been consistent with the assumptions that the current implementation
> makes and I haven't seen real any world cases where the shared updated list
> are problematic.
>

Thanks again for your insights and time!

That's great to hear. I am all in for making the userspace interface
simpler. I will rework this patch series so that the BPF programs just
attach to "rstat" and send a V1.
Any other concerns you have that you think I should address in V1?

> Thanks.
>
> --
> tejun
Tejun Heo May 10, 2022, 10:09 p.m. UTC | #9
Hello,

On Tue, May 10, 2022 at 02:55:32PM -0700, Yosry Ahmed wrote:
> That's great to hear. I am all in for making the userspace interface
> simpler. I will rework this patch series so that the BPF programs just
> attach to "rstat" and send a V1.
> Any other concerns you have that you think I should address in V1?

Not that I can think of right now but my bpf side insight is really limited,
so it might be worthwhile to wait for some bpf folks to chime in?

Thanks.
Yosry Ahmed May 10, 2022, 10:10 p.m. UTC | #10
On Tue, May 10, 2022 at 3:09 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, May 10, 2022 at 02:55:32PM -0700, Yosry Ahmed wrote:
> > That's great to hear. I am all in for making the userspace interface
> > simpler. I will rework this patch series so that the BPF programs just
> > attach to "rstat" and send a V1.
> > Any other concerns you have that you think I should address in V1?
>
> Not that I can think of right now but my bpf side insight is really limited,
> so it might be worthwhile to wait for some bpf folks to chime in?
>

Sounds good. Will wait for feedback on the BPF side of things before I
send a V1.

> Thanks.
>
> --
> tejun
diff mbox series

Patch

diff --git a/include/linux/bpf-cgroup-subsys.h b/include/linux/bpf-cgroup-subsys.h
new file mode 100644
index 000000000000..4dcde06b5599
--- /dev/null
+++ b/include/linux/bpf-cgroup-subsys.h
@@ -0,0 +1,30 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2022 Google LLC.
+ */
+#ifndef _BPF_CGROUP_SUBSYS_H_
+#define _BPF_CGROUP_SUBSYS_H_
+
+#include <linux/bpf.h>
+
+struct cgroup_subsys_bpf {
+	/* Head of the list of BPF rstat flushers attached to this subsystem */
+	struct list_head rstat_flushers;
+	spinlock_t flushers_lock;
+};
+
+struct bpf_subsys_rstat_flusher {
+	struct bpf_prog *prog;
+	/* List of BPF rstat flushers, anchored at subsys->bpf */
+	struct list_head list;
+};
+
+/* bpf_link connecting a BPF program to a cgroup subsystem. */
+struct bpf_cgroup_subsys_link {
+	struct bpf_link link;
+	/* Subsystem attached to; NULL once the link has been released. */
+	struct cgroup_subsys *ss;
+};
+
+int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
+				  struct bpf_prog *prog);
+
+#endif  // _BPF_CGROUP_SUBSYS_H_
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 3e24ad0c4b3c..854ee958b0e4 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -56,6 +56,8 @@  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
 	      struct bpf_sysctl, struct bpf_sysctl_kern)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
 	      struct bpf_sockopt, struct bpf_sockopt_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, cgroup_subsys_rstat,
+	      struct bpf_rstat_ctx, struct bpf_rstat_ctx)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1bfcfb1af352..3bd6eed1fa13 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -20,6 +20,7 @@ 
 #include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup-defs.h>
+#include <linux/bpf-cgroup-subsys.h>
 #include <linux/psi_types.h>
 
 #ifdef CONFIG_CGROUPS
@@ -706,6 +707,9 @@  struct cgroup_subsys {
 	 * specifies the mask of subsystems that this one depends on.
 	 */
 	unsigned int depends_on;
+
+	/* Used to store BPF programs attached to this subsystem. */
+	struct cgroup_subsys_bpf bpf;
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d14b10b85e51..0f4855fa85db 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -952,6 +952,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+	BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
 };
 
 enum bpf_attach_type {
@@ -998,6 +999,7 @@  enum bpf_attach_type {
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
+	BPF_CGROUP_SUBSYS_RSTAT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1013,6 +1015,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_XDP = 6,
 	BPF_LINK_TYPE_PERF_EVENT = 7,
 	BPF_LINK_TYPE_KPROBE_MULTI = 8,
+	BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -1482,6 +1485,9 @@  union bpf_attr {
 				 */
 				__u64		bpf_cookie;
 			} perf_event;
+			struct {
+				__u64		name;
+			} cgroup_subsys;
 			struct {
 				__u32		flags;
 				__u32		cnt;
@@ -6324,6 +6330,12 @@  struct bpf_cgroup_dev_ctx {
 	__u32 minor;
 };
 
+struct bpf_rstat_ctx {
+	__u64 cgroup_id;
+	__u64 parent_cgroup_id; /* 0 if root */
+	__s32 cpu;
+};
+
 struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index c1a9be6a4b9f..6caf4a61e543 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -25,6 +25,7 @@  ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
 obj-$(CONFIG_CGROUP_BPF) += cgroup.o
+obj-$(CONFIG_CGROUP_BPF) += cgroup_subsys.o
 ifeq ($(CONFIG_INET),y)
 obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
 endif
diff --git a/kernel/bpf/cgroup_subsys.c b/kernel/bpf/cgroup_subsys.c
new file mode 100644
index 000000000000..9673ce6aa84a
--- /dev/null
+++ b/kernel/bpf/cgroup_subsys.c
@@ -0,0 +1,166 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Functions to manage eBPF programs attached to cgroup subsystems
+ *
+ * Copyright 2022 Google LLC.
+ */
+
+#include <linux/bpf-cgroup-subsys.h>
+#include <linux/filter.h>
+
+#include "../cgroup/cgroup-internal.h"
+
+
+/* Register @prog as an rstat flusher for subsystem @ss.
+ *
+ * No reference on @prog is taken here; presumably the caller's bpf_link
+ * keeps @prog alive for the lifetime of the attachment — TODO confirm
+ * against the detach/release path.
+ */
+static int cgroup_subsys_bpf_attach(struct cgroup_subsys *ss, struct bpf_prog *prog)
+{
+	struct bpf_subsys_rstat_flusher *rstat_flusher;
+
+	rstat_flusher = kmalloc(sizeof(*rstat_flusher), GFP_KERNEL);
+	if (!rstat_flusher)
+		return -ENOMEM;
+	rstat_flusher->prog = prog;
+
+	spin_lock(&ss->bpf.flushers_lock);
+	list_add(&rstat_flusher->list, &ss->bpf.rstat_flushers);
+	spin_unlock(&ss->bpf.flushers_lock);
+
+	return 0;
+}
+
+/* Remove @prog from @ss's rstat flusher list, if present. */
+static void cgroup_subsys_bpf_detach(struct cgroup_subsys *ss, struct bpf_prog *prog)
+{
+	struct bpf_subsys_rstat_flusher *iter, *found = NULL;
+
+	spin_lock(&ss->bpf.flushers_lock);
+	/* After a match-less full traversal, a list_for_each_entry() cursor
+	 * points at a bogus container_of() of the list head, never NULL, so
+	 * track the match explicitly instead of testing the cursor.
+	 */
+	list_for_each_entry(iter, &ss->bpf.rstat_flushers, list)
+		if (iter->prog == prog) {
+			found = iter;
+			break;
+		}
+	if (found) {
+		list_del(&found->list);
+		/* The prog reference is owned by the bpf_link and is dropped
+		 * by bpf_link_free() after ->release(), so no bpf_prog_put()
+		 * here (a put here would be a double drop).
+		 */
+		kfree(found);
+	}
+	spin_unlock(&ss->bpf.flushers_lock);
+}
+
+/* bpf_link ->release: detach the prog from its subsystem exactly once. */
+static void bpf_cgroup_subsys_link_release(struct bpf_link *link)
+{
+	struct bpf_cgroup_subsys_link *ss_link = container_of(link,
+						       struct bpf_cgroup_subsys_link,
+						       link);
+	if (ss_link->ss) {
+		cgroup_subsys_bpf_detach(ss_link->ss, ss_link->link.prog);
+		/* Clear ss so a subsequent ->detach/->release is a no-op. */
+		ss_link->ss = NULL;
+	}
+}
+
+/* bpf_link ->detach: user-requested detach is the same as release. */
+static int bpf_cgroup_subsys_link_detach(struct bpf_link *link)
+{
+	bpf_cgroup_subsys_link_release(link);
+	return 0;
+}
+
+/* bpf_link ->dealloc: free the link container itself. */
+static void bpf_cgroup_subsys_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_cgroup_subsys_link *ss_link = container_of(link,
+						       struct bpf_cgroup_subsys_link,
+						       link);
+	kfree(ss_link);
+}
+
+static const struct bpf_link_ops bpf_cgroup_subsys_link_lops = {
+	.detach = bpf_cgroup_subsys_link_detach,
+	.release = bpf_cgroup_subsys_link_release,
+	.dealloc = bpf_cgroup_subsys_link_dealloc,
+};
+
+/* Create a bpf_link attaching @prog to the cgroup subsystem named by
+ * attr->link_create.cgroup_subsys.name (matched against both the current
+ * and legacy subsystem names).
+ *
+ * Returns a new link fd on success, negative errno on failure.
+ */
+int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
+				  struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct bpf_cgroup_subsys_link *link;
+	struct cgroup_subsys *ss, *attach_ss = NULL;
+	const char __user *ss_name_user;
+	char ss_name[MAX_CGROUP_TYPE_NAMELEN];
+	int ssid, err;
+
+	if (attr->link_create.target_fd || attr->link_create.flags)
+		return -EINVAL;
+
+	ss_name_user = u64_to_user_ptr(attr->link_create.cgroup_subsys.name);
+	if (strncpy_from_user(ss_name, ss_name_user, sizeof(ss_name) - 1) < 0)
+		return -EFAULT;
+	/* strncpy_from_user() does not NUL-terminate on truncation, and the
+	 * last array byte is otherwise uninitialized stack; terminate it so
+	 * the strcmp() below cannot run off the end of the buffer.
+	 */
+	ss_name[sizeof(ss_name) - 1] = '\0';
+
+	for_each_subsys(ss, ssid)
+		if (!strcmp(ss_name, ss->name) ||
+		    !strcmp(ss_name, ss->legacy_name)) {
+			attach_ss = ss;
+			break;
+		}
+
+	if (!attach_ss)
+		return -EINVAL;
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP_SUBSYS,
+		      &bpf_cgroup_subsys_link_lops,
+		      prog);
+	link->ss = attach_ss;
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		return err;
+	}
+
+	err = cgroup_subsys_bpf_attach(attach_ss, prog);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		return err;
+	}
+
+	return bpf_link_settle(&link_primer);
+}
+
+/* Expose only the base (always-available) BPF helpers to
+ * CGROUP_SUBSYS_RSTAT programs.
+ */
+static const struct bpf_func_proto *
+cgroup_subsys_rstat_func_proto(enum bpf_func_id func_id,
+			       const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id);
+}
+
+/* Context-access checker for CGROUP_SUBSYS_RSTAT programs: bpf_rstat_ctx
+ * is read-only, and each field must be read with an aligned load of
+ * exactly the field's size.
+ */
+static bool cgroup_subsys_rstat_is_valid_access(int off, int size,
+					   enum bpf_access_type type,
+					   const struct bpf_prog *prog,
+					   struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	/* Reject accesses outside the context struct. */
+	if (off < 0 || off + size > sizeof(struct bpf_rstat_ctx))
+		return false;
+	/* The verifier guarantees that size > 0 */
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_rstat_ctx, cgroup_id):
+		return size == sizeof(__u64);
+	case offsetof(struct bpf_rstat_ctx, parent_cgroup_id):
+		return size == sizeof(__u64);
+	case offsetof(struct bpf_rstat_ctx, cpu):
+		return size == sizeof(__s32);
+	default:
+		return false;
+	}
+}
+
+const struct bpf_prog_ops cgroup_subsys_rstat_prog_ops = {
+};
+
+const struct bpf_verifier_ops cgroup_subsys_rstat_verifier_ops = {
+	.get_func_proto         = cgroup_subsys_rstat_func_proto,
+	.is_valid_access        = cgroup_subsys_rstat_is_valid_access,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cdaa1152436a..48149c54d969 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3,6 +3,7 @@ 
  */
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
+#include <linux/bpf-cgroup-subsys.h>
 #include <linux/bpf_trace.h>
 #include <linux/bpf_lirc.h>
 #include <linux/bpf_verifier.h>
@@ -3194,6 +3195,8 @@  attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_SK_LOOKUP;
 	case BPF_XDP:
 		return BPF_PROG_TYPE_XDP;
+	case BPF_CGROUP_SUBSYS_RSTAT:
+		return BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT;
 	default:
 		return BPF_PROG_TYPE_UNSPEC;
 	}
@@ -4341,6 +4344,9 @@  static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		else
 			ret = bpf_kprobe_multi_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT:
+		ret = cgroup_subsys_bpf_link_attach(attr, prog);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index adb820e98f24..7b1448013009 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5745,6 +5745,9 @@  static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	idr_init(&ss->css_idr);
 	INIT_LIST_HEAD(&ss->cfts);
+	INIT_LIST_HEAD(&ss->bpf.rstat_flushers);
+	/* flushers_lock guards bpf.rstat_flushers and is taken by the
+	 * attach/detach paths; it must be initialized here. */
+	spin_lock_init(&ss->bpf.flushers_lock);
 
 	/* Create the root cgroup state for this subsystem */
 	ss->root = &cgrp_dfl_root;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d14b10b85e51..0f4855fa85db 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -952,6 +952,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+	BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
 };
 
 enum bpf_attach_type {
@@ -998,6 +999,7 @@  enum bpf_attach_type {
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
+	BPF_CGROUP_SUBSYS_RSTAT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1013,6 +1015,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_XDP = 6,
 	BPF_LINK_TYPE_PERF_EVENT = 7,
 	BPF_LINK_TYPE_KPROBE_MULTI = 8,
+	BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -1482,6 +1485,9 @@  union bpf_attr {
 				 */
 				__u64		bpf_cookie;
 			} perf_event;
+			struct {
+				__u64		name;
+			} cgroup_subsys;
 			struct {
 				__u32		flags;
 				__u32		cnt;
@@ -6324,6 +6330,12 @@  struct bpf_cgroup_dev_ctx {
 	__u32 minor;
 };
 
+struct bpf_rstat_ctx {
+	__u64 cgroup_id;
+	__u64 parent_cgroup_id; /* 0 if root */
+	__s32 cpu;
+};
+
 struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };