diff mbox series

[14/19] mm: Introduce a cgroup for pinned memory

Message ID c7b5e502d1a3b9b8f6e96cbf9ca553b143c327e0.1675669136.git-series.apopple@nvidia.com (mailing list archive)
State New
Headers show
Series mm: Introduce a cgroup to limit the amount of locked and pinned memory | expand

Commit Message

Alistair Popple Feb. 6, 2023, 7:47 a.m. UTC
If too much memory in a system is pinned or locked it can lead to
problems such as performance degradation or in the worst case
out-of-memory errors as such memory cannot be moved or paged out.

In order to prevent users without CAP_IPC_LOCK from causing these
issues the amount of memory that can be pinned is typically limited by
RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
between tasks and the enforcement of these limits is inconsistent
between in-kernel users of pinned memory such as mlock() and device
drivers which may also pin pages with pin_user_pages().

To allow for a single limit to be set introduce a cgroup controller
which can be used to limit the number of pages being pinned by all
tasks in the cgroup.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Cc: cgroups@vger.kernel.org
Cc: linux-mm@kvack.org
---
 MAINTAINERS                   |   7 +-
 include/linux/cgroup.h        |  20 +++-
 include/linux/cgroup_subsys.h |   4 +-
 mm/Kconfig                    |  11 +-
 mm/Makefile                   |   1 +-
 mm/pins_cgroup.c              | 273 +++++++++++++++++++++++++++++++++++-
 6 files changed, 316 insertions(+)
 create mode 100644 mm/pins_cgroup.c

Comments

Yosry Ahmed Feb. 6, 2023, 9:01 p.m. UTC | #1
On Sun, Feb 5, 2023 at 11:49 PM Alistair Popple <apopple@nvidia.com> wrote:
>
> If too much memory in a system is pinned or locked it can lead to
> problems such as performance degradation or in the worst case
> out-of-memory errors as such memory cannot be moved or paged out.
>
> In order to prevent users without CAP_IPC_LOCK from causing these
> issues the amount of memory that can be pinned is typically limited by
> RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> between tasks and the enforcement of these limits is inconsistent
> between in-kernel users of pinned memory such as mlock() and device
> drivers which may also pin pages with pin_user_pages().
>
> To allow for a single limit to be set introduce a cgroup controller
> which can be used to limit the number of pages being pinned by all
> tasks in the cgroup.
>
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Zefan Li <lizefan.x@bytedance.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: linux-kernel@vger.kernel.org
> Cc: cgroups@vger.kernel.org
> Cc: linux-mm@kvack.org
> ---
>  MAINTAINERS                   |   7 +-
>  include/linux/cgroup.h        |  20 +++-
>  include/linux/cgroup_subsys.h |   4 +-
>  mm/Kconfig                    |  11 +-
>  mm/Makefile                   |   1 +-
>  mm/pins_cgroup.c              | 273 +++++++++++++++++++++++++++++++++++-
>  6 files changed, 316 insertions(+)
>  create mode 100644 mm/pins_cgroup.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f781f93..f8526e2 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -5381,6 +5381,13 @@ F:       tools/testing/selftests/cgroup/memcg_protection.m
>  F:     tools/testing/selftests/cgroup/test_kmem.c
>  F:     tools/testing/selftests/cgroup/test_memcontrol.c
>
> +CONTROL GROUP - PINNED AND LOCKED MEMORY
> +M:     Alistair Popple <apopple@nvidia.com>
> +L:     cgroups@vger.kernel.org
> +L:     linux-mm@kvack.org
> +S:     Maintained
> +F:     mm/pins_cgroup.c
> +
>  CORETEMP HARDWARE MONITORING DRIVER
>  M:     Fenghua Yu <fenghua.yu@intel.com>
>  L:     linux-hwmon@vger.kernel.org
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 3410aec..d98de25 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -857,4 +857,24 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
>
>  #endif /* CONFIG_CGROUP_BPF */
>
> +#ifdef CONFIG_CGROUP_PINS
> +
> +struct pins_cgroup *get_pins_cg(struct task_struct *task);
> +void put_pins_cg(struct pins_cgroup *cg);
> +void pins_uncharge(struct pins_cgroup *pins, int num);
> +bool pins_try_charge(struct pins_cgroup *pins, int num);
> +
> +#else /* CONFIG_CGROUP_PINS */
> +
> +static inline struct pins_cgroup *get_pins_cg(struct task_struct *task)
> +{
> +       return NULL;
> +}
> +
> +static inline void put_pins_cg(struct pins_cgroup *cg) {}
> +static inline void pins_uncharge(struct pins_cgroup *pins, int num) {}
> +static inline bool pins_try_charge(struct pins_cgroup *pins, int num) { return 1; }
> +
> +#endif /* CONFIG_CGROUP_PINS */
> +
>  #endif /* _LINUX_CGROUP_H */
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 4452354..c1b4aab 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -65,6 +65,10 @@ SUBSYS(rdma)
>  SUBSYS(misc)
>  #endif
>
> +#if IS_ENABLED(CONFIG_CGROUP_PINS)
> +SUBSYS(pins)
> +#endif
> +
>  /*
>   * The following subsystems are not supported on the default hierarchy.
>   */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index ff7b209..4472043 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1183,6 +1183,17 @@ config LRU_GEN_STATS
>           This option has a per-memcg and per-node memory overhead.
>  # }
>
> +config CGROUP_PINS
> +    bool "Cgroup controller for pinned and locked memory"
> +    depends on CGROUPS
> +    default y
> +    help
> +      Having too much memory pinned or locked can lead to system
> +      instability due to increased likelihood of encountering
> +      out-of-memory conditions. Select this option to enable a cgroup
> +      controller which can be used to limit the overall number of
> +      pages procceses in a cgroup may lock or have pinned by drivers.
> +
>  source "mm/damon/Kconfig"
>
>  endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index 8e105e5..81db189 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -138,3 +138,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
>  obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
>  obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
>  obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> +obj-$(CONFIG_CGROUP_PINS) += pins_cgroup.o
> diff --git a/mm/pins_cgroup.c b/mm/pins_cgroup.c
> new file mode 100644
> index 0000000..2d8c6c7
> --- /dev/null
> +++ b/mm/pins_cgroup.c
> @@ -0,0 +1,273 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Controller for cgroups limiting number of pages pinned for FOLL_LONGETERM.
> + *
> + * Copyright (C) 2022 Alistair Popple <apopple@nvidia.com>
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/threads.h>
> +#include <linux/atomic.h>
> +#include <linux/cgroup.h>
> +#include <linux/slab.h>
> +#include <linux/sched/task.h>
> +
> +#define PINS_MAX (-1ULL)
> +#define PINS_MAX_STR "max"
> +
> +struct pins_cgroup {
> +       struct cgroup_subsys_state      css;
> +
> +       atomic64_t                      counter;
> +       atomic64_t                      limit;
> +
> +       struct cgroup_file              events_file;
> +       atomic64_t                      events_limit;
> +};
> +
> +static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
> +{
> +       return container_of(css, struct pins_cgroup, css);
> +}
> +
> +static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
> +{
> +       return css_pins(pins->css.parent);
> +}
> +
> +struct pins_cgroup *get_pins_cg(struct task_struct *task)

I think you'll need a version of this that gets multiple refs to the
pins cgroup. I will add more details in the patch that actually hooks
the accounting.

> +{
> +       return css_pins(task_get_css(task, pins_cgrp_id));
> +}
> +
> +void put_pins_cg(struct pins_cgroup *cg)
> +{
> +       css_put(&cg->css);
> +}
> +
> +static struct cgroup_subsys_state *
> +pins_css_alloc(struct cgroup_subsys_state *parent)
> +{
> +       struct pins_cgroup *pins;
> +
> +       pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
> +       if (!pins)
> +               return ERR_PTR(-ENOMEM);
> +
> +       atomic64_set(&pins->counter, 0);
> +       atomic64_set(&pins->limit, PINS_MAX);
> +       atomic64_set(&pins->events_limit, 0);
> +       return &pins->css;
> +}
> +
> +static void pins_css_free(struct cgroup_subsys_state *css)
> +{
> +       kfree(css_pins(css));
> +}
> +
> +/**
> + * pins_cancel - uncharge the local pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to cancel
> + *
> + * This function will WARN if the pin count goes under 0, because such a case is
> + * a bug in the pins controller proper.
> + */
> +void pins_cancel(struct pins_cgroup *pins, int num)
> +{
> +       /*
> +        * A negative count (or overflow for that matter) is invalid,
> +        * and indicates a bug in the `pins` controller proper.
> +        */
> +       WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
> +}
> +
> +/**
> + * pins_uncharge - hierarchically uncharge the pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to uncharge
> + */
> +void pins_uncharge(struct pins_cgroup *pins, int num)
> +{
> +       struct pins_cgroup *p;
> +
> +       for (p = pins; parent_pins(p); p = parent_pins(p))
> +               pins_cancel(p, num);
> +}
> +
> +/**
> + * pins_charge - hierarchically charge the pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to charge
> + *
> + * This function does *not* follow the pin limit set. It cannot fail and the new
> + * pin count may exceed the limit. This is only used for reverting failed
> + * attaches, where there is no other way out than violating the limit.
> + */
> +static void pins_charge(struct pins_cgroup *pins, int num)
> +{
> +       struct pins_cgroup *p;
> +
> +       for (p = pins; parent_pins(p); p = parent_pins(p))
> +               atomic64_add(num, &p->counter);
> +}
> +
> +/**
> + * pins_try_charge - hierarchically try to charge the pin count
> + * @pins: the pin cgroup state
> + * @num: the number of pins to charge
> + *
> + * This function follows the set limit. It will fail if the charge would cause
> + * the new value to exceed the hierarchical limit. Returns true if the charge
> + * succeeded, false otherwise.
> + */
> +bool pins_try_charge(struct pins_cgroup *pins, int num)
> +{
> +       struct pins_cgroup *p, *q;
> +
> +       for (p = pins; parent_pins(p); p = parent_pins(p)) {
> +               uint64_t new = atomic64_add_return(num, &p->counter);
> +               uint64_t limit = atomic64_read(&p->limit);
> +
> +               if (limit != PINS_MAX && new > limit)
> +                       goto revert;
> +       }
> +
> +       return true;
> +
> +revert:
> +       for (q = pins; q != p; q = parent_pins(q))
> +               pins_cancel(q, num);
> +       pins_cancel(p, num);
> +
> +       return false;
> +}
> +
> +static int pins_can_attach(struct cgroup_taskset *tset)
> +{
> +       struct cgroup_subsys_state *dst_css;
> +       struct task_struct *task;
> +
> +       cgroup_taskset_for_each(task, dst_css, tset) {
> +               struct pins_cgroup *pins = css_pins(dst_css);
> +               struct cgroup_subsys_state *old_css;
> +               struct pins_cgroup *old_pins;
> +
> +               old_css = task_css(task, pins_cgrp_id);
> +               old_pins = css_pins(old_css);
> +
> +               pins_charge(pins, task->mm->locked_vm);

Why are we not using pins_try_charge() here, and failing attachment if
we cannot charge?

The comment above pins_charge() states that it is only used when
cancelling attachment, but this is not the case here, right?

> +               pins_uncharge(old_pins, task->mm->locked_vm);
> +       }
> +
> +       return 0;
> +}
> +
> +static void pins_cancel_attach(struct cgroup_taskset *tset)
> +{
> +       struct cgroup_subsys_state *dst_css;
> +       struct task_struct *task;
> +
> +       cgroup_taskset_for_each(task, dst_css, tset) {
> +               struct pins_cgroup *pins = css_pins(dst_css);
> +               struct cgroup_subsys_state *old_css;
> +               struct pins_cgroup *old_pins;
> +
> +               old_css = task_css(task, pins_cgrp_id);
> +               old_pins = css_pins(old_css);
> +
> +               pins_charge(old_pins, task->mm->locked_vm);
> +               pins_uncharge(pins, task->mm->locked_vm);
> +       }
> +}
> +
> +
> +static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
> +                             size_t nbytes, loff_t off)
> +{
> +       struct cgroup_subsys_state *css = of_css(of);
> +       struct pins_cgroup *pins = css_pins(css);
> +       uint64_t limit;
> +       int err;
> +
> +       buf = strstrip(buf);
> +       if (!strcmp(buf, PINS_MAX_STR)) {
> +               limit = PINS_MAX;
> +               goto set_limit;
> +       }
> +
> +       err = kstrtoll(buf, 0, &limit);
> +       if (err)
> +               return err;
> +
> +       if (limit < 0 || limit >= PINS_MAX)
> +               return -EINVAL;
> +
> +set_limit:
> +       /*
> +        * Limit updates don't need to be mutex'd, since it isn't
> +        * critical that any racing fork()s follow the new limit.
> +        */
> +       atomic64_set(&pins->limit, limit);
> +       return nbytes;
> +}
> +
> +static int pins_max_show(struct seq_file *sf, void *v)
> +{
> +       struct cgroup_subsys_state *css = seq_css(sf);
> +       struct pins_cgroup *pins = css_pins(css);
> +       uint64_t limit = atomic64_read(&pins->limit);
> +
> +       if (limit >= PINS_MAX)
> +               seq_printf(sf, "%s\n", PINS_MAX_STR);
> +       else
> +               seq_printf(sf, "%lld\n", limit);
> +
> +       return 0;
> +}
> +
> +static s64 pins_current_read(struct cgroup_subsys_state *css,
> +                            struct cftype *cft)
> +{
> +       struct pins_cgroup *pins = css_pins(css);
> +
> +       return atomic64_read(&pins->counter);
> +}
> +
> +static int pins_events_show(struct seq_file *sf, void *v)
> +{
> +       struct pins_cgroup *pins = css_pins(seq_css(sf));
> +
> +       seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pins->events_limit));
> +       return 0;
> +}
> +
> +static struct cftype pins_files[] = {
> +       {
> +               .name = "max",
> +               .write = pins_max_write,
> +               .seq_show = pins_max_show,
> +               .flags = CFTYPE_NOT_ON_ROOT,
> +       },
> +       {
> +               .name = "current",
> +               .read_s64 = pins_current_read,
> +               .flags = CFTYPE_NOT_ON_ROOT,
> +       },
> +       {
> +               .name = "events",
> +               .seq_show = pins_events_show,
> +               .file_offset = offsetof(struct pins_cgroup, events_file),
> +               .flags = CFTYPE_NOT_ON_ROOT,
> +       },
> +       { }     /* terminate */
> +};
> +
> +struct cgroup_subsys pins_cgrp_subsys = {
> +       .css_alloc = pins_css_alloc,
> +       .css_free = pins_css_free,
> +       .legacy_cftypes = pins_files,
> +       .dfl_cftypes = pins_files,
> +       .can_attach = pins_can_attach,
> +       .cancel_attach = pins_cancel_attach,
> +};
> --
> git-series 0.9.1
>
Tejun Heo Feb. 6, 2023, 9:14 p.m. UTC | #2
On Mon, Feb 06, 2023 at 06:47:51PM +1100, Alistair Popple wrote:
> If too much memory in a system is pinned or locked it can lead to
> problems such as performance degradation or in the worst case
> out-of-memory errors as such memory cannot be moved or paged out.
> 
> In order to prevent users without CAP_IPC_LOCK from causing these
> issues the amount of memory that can be pinned is typically limited by
> RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> between tasks and the enforcement of these limits is inconsistent
> between in-kernel users of pinned memory such as mlock() and device
> drivers which may also pin pages with pin_user_pages().
> 
> To allow for a single limit to be set introduce a cgroup controller
> which can be used to limit the number of pages being pinned by all
> tasks in the cgroup.

As I wrote before, I think this might fit better as a part of memcg than as
its own controller.

Thanks.
Yosry Ahmed Feb. 6, 2023, 10:32 p.m. UTC | #3
On Mon, Feb 6, 2023 at 1:14 PM Tejun Heo <tj@kernel.org> wrote:
>
> On Mon, Feb 06, 2023 at 06:47:51PM +1100, Alistair Popple wrote:
> > If too much memory in a system is pinned or locked it can lead to
> > problems such as performance degradation or in the worst case
> > out-of-memory errors as such memory cannot be moved or paged out.
> >
> > In order to prevent users without CAP_IPC_LOCK from causing these
> > issues the amount of memory that can be pinned is typically limited by
> > RLIMIT_MEMLOCK. However this is inflexible as limits can't be shared
> > between tasks and the enforcement of these limits is inconsistent
> > between in-kernel users of pinned memory such as mlock() and device
> > drivers which may also pin pages with pin_user_pages().
> >
> > To allow for a single limit to be set introduce a cgroup controller
> > which can be used to limit the number of pages being pinned by all
> > tasks in the cgroup.
>
> As I wrote before, I think this might fit better as a part of memcg than as
> its own controller.

I guess it boils down to which we want:
(a) Limit the amount of memory processes in a cgroup can be pinned/locked.
(b) Limit the amount of memory charged to a cgroup that can be pinned/locked.

The proposal is doing (a), I suppose if this was part of memcg it
would be (b), right?

I am not saying it should be one or the other, I am just making sure
my understanding is clear.

>
> Thanks.
>
> --
> tejun
Tejun Heo Feb. 6, 2023, 10:36 p.m. UTC | #4
On Mon, Feb 06, 2023 at 02:32:10PM -0800, Yosry Ahmed wrote:
> I guess it boils down to which we want:
> (a) Limit the amount of memory processes in a cgroup can be pinned/locked.
> (b) Limit the amount of memory charged to a cgroup that can be pinned/locked.
> 
> The proposal is doing (a), I suppose if this was part of memcg it
> would be (b), right?
> 
> I am not saying it should be one or the other, I am just making sure
> my understanding is clear.

I don't quite understand what the distinction would mean in practice. It's
just odd to put locked memory in a separate controller from interface POV.

Thanks.
Yosry Ahmed Feb. 6, 2023, 10:39 p.m. UTC | #5
On Mon, Feb 6, 2023 at 2:36 PM Tejun Heo <tj@kernel.org> wrote:
>
> On Mon, Feb 06, 2023 at 02:32:10PM -0800, Yosry Ahmed wrote:
> > I guess it boils down to which we want:
> > (a) Limit the amount of memory processes in a cgroup can be pinned/locked.
> > (b) Limit the amount of memory charged to a cgroup that can be pinned/locked.
> >
> > The proposal is doing (a), I suppose if this was part of memcg it
> > would be (b), right?
> >
> > I am not saying it should be one or the other, I am just making sure
> > my understanding is clear.
>
> I don't quite understand what the distinction would mean in practice. It's
> just odd to put locked memory in a separate controller from interface POV.

Assume we have 2 cgroups, A and B. A process in cgroup A creates a
tmpfs file and writes to it, so the memory is now charged to cgroup A.
Now imagine a process in cgroup B tries to lock this memory.
- With (a) the amount of locked memory will count toward's cgroup A's
limit, because cgroup A is charged for the memory.
- With (b) the amount of locked memory will count toward's cgroup B's
limit, because a process in cgroup B is locking the memory.

I agree that it is confusing from an interface POV.

>
> Thanks.
>
> --
> tejun
Tejun Heo Feb. 6, 2023, 11:25 p.m. UTC | #6
On Mon, Feb 06, 2023 at 02:39:17PM -0800, Yosry Ahmed wrote:
> On Mon, Feb 6, 2023 at 2:36 PM Tejun Heo <tj@kernel.org> wrote:
> >
> > On Mon, Feb 06, 2023 at 02:32:10PM -0800, Yosry Ahmed wrote:
> > > I guess it boils down to which we want:
> > > (a) Limit the amount of memory processes in a cgroup can be pinned/locked.
> > > (b) Limit the amount of memory charged to a cgroup that can be pinned/locked.
> > >
> > > The proposal is doing (a), I suppose if this was part of memcg it
> > > would be (b), right?
> > >
> > > I am not saying it should be one or the other, I am just making sure
> > > my understanding is clear.
> >
> > I don't quite understand what the distinction would mean in practice. It's
> > just odd to put locked memory in a separate controller from interface POV.
> 
> Assume we have 2 cgroups, A and B. A process in cgroup A creates a
> tmpfs file and writes to it, so the memory is now charged to cgroup A.
> Now imagine a process in cgroup B tries to lock this memory.
> - With (a) the amount of locked memory will count toward's cgroup A's
> limit, because cgroup A is charged for the memory.
> - With (b) the amount of locked memory will count toward's cgroup B's
> limit, because a process in cgroup B is locking the memory.
> 
> I agree that it is confusing from an interface POV.

Oh yeah, that's confusing. I'd go with (a) for consistency with the rest of
memcg - locked memory should fit inside e.g. memory.max. The problem with
shared memory accounting exists for non-locked memory as well and prolly
best to handle the same way rather than handling differently.

Thanks.
Yosry Ahmed Feb. 6, 2023, 11:34 p.m. UTC | #7
On Mon, Feb 6, 2023 at 3:25 PM Tejun Heo <tj@kernel.org> wrote:
>
> On Mon, Feb 06, 2023 at 02:39:17PM -0800, Yosry Ahmed wrote:
> > On Mon, Feb 6, 2023 at 2:36 PM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > On Mon, Feb 06, 2023 at 02:32:10PM -0800, Yosry Ahmed wrote:
> > > > I guess it boils down to which we want:
> > > > (a) Limit the amount of memory processes in a cgroup can be pinned/locked.
> > > > (b) Limit the amount of memory charged to a cgroup that can be pinned/locked.
> > > >
> > > > The proposal is doing (a), I suppose if this was part of memcg it
> > > > would be (b), right?
> > > >
> > > > I am not saying it should be one or the other, I am just making sure
> > > > my understanding is clear.
> > >
> > > I don't quite understand what the distinction would mean in practice. It's
> > > just odd to put locked memory in a separate controller from interface POV.
> >
> > Assume we have 2 cgroups, A and B. A process in cgroup A creates a
> > tmpfs file and writes to it, so the memory is now charged to cgroup A.
> > Now imagine a process in cgroup B tries to lock this memory.
> > - With (a) the amount of locked memory will count toward's cgroup A's
> > limit, because cgroup A is charged for the memory.
> > - With (b) the amount of locked memory will count toward's cgroup B's
> > limit, because a process in cgroup B is locking the memory.
> >
> > I agree that it is confusing from an interface POV.
>
> Oh yeah, that's confusing. I'd go with (a) for consistency with the rest of
> memcg - locked memory should fit inside e.g. memory.max. The problem with
> shared memory accounting exists for non-locked memory as well and prolly
> best to handle the same way rather than handling differently.

+Michal Hocko +Roman Gushchin +Shakeel Butt for visibility with memcg.

>
> Thanks.
>
> --
> tejun
Jason Gunthorpe Feb. 6, 2023, 11:40 p.m. UTC | #8
On Mon, Feb 06, 2023 at 01:25:33PM -1000, Tejun Heo wrote:
> On Mon, Feb 06, 2023 at 02:39:17PM -0800, Yosry Ahmed wrote:
> > On Mon, Feb 6, 2023 at 2:36 PM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > On Mon, Feb 06, 2023 at 02:32:10PM -0800, Yosry Ahmed wrote:
> > > > I guess it boils down to which we want:
> > > > (a) Limit the amount of memory processes in a cgroup can be pinned/locked.
> > > > (b) Limit the amount of memory charged to a cgroup that can be pinned/locked.
> > > >
> > > > The proposal is doing (a), I suppose if this was part of memcg it
> > > > would be (b), right?
> > > >
> > > > I am not saying it should be one or the other, I am just making sure
> > > > my understanding is clear.
> > >
> > > I don't quite understand what the distinction would mean in practice. It's
> > > just odd to put locked memory in a separate controller from interface POV.
> > 
> > Assume we have 2 cgroups, A and B. A process in cgroup A creates a
> > tmpfs file and writes to it, so the memory is now charged to cgroup A.
> > Now imagine a process in cgroup B tries to lock this memory.
> > - With (a) the amount of locked memory will count toward's cgroup A's
> > limit, because cgroup A is charged for the memory.
> > - With (b) the amount of locked memory will count toward's cgroup B's
> > limit, because a process in cgroup B is locking the memory.
> > 
> > I agree that it is confusing from an interface POV.
> 
> Oh yeah, that's confusing. I'd go with (a) for consistency with the rest of
> memcg - locked memory should fit inside e.g. memory.max. The problem with
> shared memory accounting exists for non-locked memory as well and prolly
> best to handle the same way rather than handling differently.

(a) kind of destroys the point of this as a sandboxing tool

It is not so harmful to use memory that someone else has been charged
with allocating.

But it is harmful to pin memory if someone else is charged for the
pin. It means it is unpredictable how much memory a sandbox can
actually lock down.

Plus we have the double accounting problem, if 1000 processes in
different cgroups open the tmpfs and all pin the memory then cgroup A
will be charged 1000x for the memory and hit its limit, possibly
creating a DOS from less priv to more priv

Jason
Tejun Heo Feb. 7, 2023, 12:32 a.m. UTC | #9
Hello,

On Mon, Feb 06, 2023 at 07:40:55PM -0400, Jason Gunthorpe wrote:
> (a) kind of destroys the point of this as a sandboxing tool
> 
> It is not so harmful to use memory that someone else has been charged
> with allocating.
> 
> But it is harmful to pin memory if someone else is charged for the
> pin. It means it is unpredictable how much memory a sandbox can
> actually lock down.
> 
> Plus we have the double accounting problem, if 1000 processes in
> different cgroups open the tmpfs and all pin the memory then cgroup A
> will be charged 1000x for the memory and hit its limit, possibly
> creating a DOS from less priv to more priv

Let's hear what memcg people think about it. I'm not a fan of disassociating
the ownership and locker of the same page but it is true that actively
increasing locked consumption on a remote cgroup is awkward too.

Thanks.
Waiman Long Feb. 7, 2023, 1 a.m. UTC | #10
On 2/6/23 17:39, Yosry Ahmed wrote:
> On Mon, Feb 6, 2023 at 2:36 PM Tejun Heo <tj@kernel.org> wrote:
>> On Mon, Feb 06, 2023 at 02:32:10PM -0800, Yosry Ahmed wrote:
>>> I guess it boils down to which we want:
>>> (a) Limit the amount of memory processes in a cgroup can be pinned/locked.
>>> (b) Limit the amount of memory charged to a cgroup that can be pinned/locked.
>>>
>>> The proposal is doing (a), I suppose if this was part of memcg it
>>> would be (b), right?
>>>
>>> I am not saying it should be one or the other, I am just making sure
>>> my understanding is clear.
>> I don't quite understand what the distinction would mean in practice. It's
>> just odd to put locked memory in a separate controller from interface POV.
> Assume we have 2 cgroups, A and B. A process in cgroup A creates a
> tmpfs file and writes to it, so the memory is now charged to cgroup A.
> Now imagine a process in cgroup B tries to lock this memory.
> - With (a) the amount of locked memory will count toward's cgroup A's
> limit, because cgroup A is charged for the memory.
> - With (b) the amount of locked memory will count toward's cgroup B's
> limit, because a process in cgroup B is locking the memory.
>
> I agree that it is confusing from an interface POV.

If it should not be part of the memcg, does it make sense to make it a 
resource in the existing misc controller? I believe we don't want a 
proliferation of new cgroup controllers.

Cheers,
Longman
Tejun Heo Feb. 7, 2023, 1:03 a.m. UTC | #11
On Mon, Feb 06, 2023 at 08:00:54PM -0500, Waiman Long wrote:
> If it should not be part of the memcg, does it make sense to make it a
> resource in the existing misc controller? I believe we don't want a
> proliferation of new cgroup controllers.

Yeah, if it's gonna be an independent knob, I suppose so, but I really think
the locked accounting should be tied to the page, which mostly likely would
mean that it'd be tied to the page ownership too making its natural place
memcg.

Thanks.
Alistair Popple Feb. 7, 2023, 1:50 a.m. UTC | #12
Tejun Heo <tj@kernel.org> writes:

> On Mon, Feb 06, 2023 at 08:00:54PM -0500, Waiman Long wrote:
>> If it should not be part of the memcg, does it make sense to make it a
>> resource in the existing misc controller? I believe we don't want a
>> proliferation of new cgroup controllers.
>
> Yeah, if it's gonna be an independent knob, I suppose so, but I really think
> the locked accounting should be tied to the page, which mostly likely would
> mean that it'd be tied to the page ownership too making its natural place
> memcg.

Yes, I think it might be possible. I looked briefly at doing it when
initially developing the series but I would like to resolve the question
of independent knob vs. memcg before heading too far down either path.

> Thanks.
Jason Gunthorpe Feb. 7, 2023, 12:19 p.m. UTC | #13
On Mon, Feb 06, 2023 at 02:32:37PM -1000, Tejun Heo wrote:
> Hello,
> 
> On Mon, Feb 06, 2023 at 07:40:55PM -0400, Jason Gunthorpe wrote:
> > (a) kind of destroys the point of this as a sandboxing tool
> > 
> > It is not so harmful to use memory that someone else has been charged
> > with allocating.
> > 
> > But it is harmful to pin memory if someone else is charged for the
> > pin. It means it is unpredictable how much memory a sandbox can
> > actually lock down.
> > 
> > Plus we have the double accounting problem, if 1000 processes in
> > different cgroups open the tmpfs and all pin the memory then cgroup A
> > will be charged 1000x for the memory and hit its limit, possibly
> > creating a DOS from less priv to more priv
> 
> Let's hear what memcg people think about it. I'm not a fan of disassociating
> the ownership and locker of the same page but it is true that actively
> increasing locked consumption on a remote cgroup is awkward too.

The main purpose of all this is to support libvirt, so they need to
support (a) too.

(b) is what we have now and most closely emulates the way the RLIMIT
works.

Jason
Michal Hocko Feb. 15, 2023, 7 p.m. UTC | #14
On Mon 06-02-23 14:32:37, Tejun Heo wrote:
> Hello,
> 
> On Mon, Feb 06, 2023 at 07:40:55PM -0400, Jason Gunthorpe wrote:
> > (a) kind of destroys the point of this as a sandboxing tool
> > 
> > It is not so harmful to use memory that someone else has been charged
> > with allocating.
> > 
> > But it is harmful to pin memory if someone else is charged for the
> > pin. It means it is unpredictable how much memory a sandbox can
> > actually lock down.
> > 
> > Plus we have the double accounting problem, if 1000 processes in
> > different cgroups open the tmpfs and all pin the memory then cgroup A
> > will be charged 1000x for the memory and hit its limit, possibly
> > creating a DOS from less priv to more priv
> 
> Let's hear what memcg people think about it. I'm not a fan of disassociating
> the ownership and locker of the same page but it is true that actively
> increasing locked consumption on a remote cgroup is awkward too.

One thing that is not really clear to me is whether those pins do
actually have any "ownership". The interface itself doesn't talk about
anything like that and so it seems perfectly fine to unpin from a
completely different context then pinning. If there is no enforcement
then Tejun is right and relying on memcg ownership is likely the only
reliable way to use for tracking. The downside is sharing obviously but
this is the same problem we already do deal with with shared pages.

Another thing that is not really clear to me is how the limit is
actually going to be used in practice. As there is no concept of a
reclaim for pins then I can imagine that it would be quite easy to
reach the hard limit and essentially DoS any further use of pins. Cross
cgroup pinning would make it even worse because it could become a DoS
vector very easily. Practically speaking what tends to be a corner case
in the memcg limit world would be norm for pin based limit.

Or am I misunderstanding something here?
Jason Gunthorpe Feb. 15, 2023, 7:07 p.m. UTC | #15
On Wed, Feb 15, 2023 at 08:00:22PM +0100, Michal Hocko wrote:
> On Mon 06-02-23 14:32:37, Tejun Heo wrote:
> > Hello,
> > 
> > On Mon, Feb 06, 2023 at 07:40:55PM -0400, Jason Gunthorpe wrote:
> > > (a) kind of destroys the point of this as a sandboxing tool
> > > 
> > > It is not so harmful to use memory that someone else has been charged
> > > with allocating.
> > > 
> > > But it is harmful to pin memory if someone else is charged for the
> > > pin. It means it is unpredictable how much memory a sandbox can
> > > actually lock down.
> > > 
> > > Plus we have the double accounting problem, if 1000 processes in
> > > different cgroups open the tmpfs and all pin the memory then cgroup A
> > > will be charged 1000x for the memory and hit its limit, possibly
> > > creating a DOS from less priv to more priv
> > 
> > Let's hear what memcg people think about it. I'm not a fan of disassociating
> > the ownership and locker of the same page but it is true that actively
> > increasing locked consumption on a remote cgroup is awkward too.
> 
> One thing that is not really clear to me is whether those pins do
> actually have any "ownership".

In most cases the ownship traces back to a file descriptor. When the
file is closed the pin goes away.

> The interface itself doesn't talk about
> anything like that and so it seems perfectly fine to unpin from a
> completely different context then pinning. 

Yes, concievably the close of the FD can be in a totally different
process with a different cgroup.

> If there is no enforcement then Tejun is right and relying on memcg
> ownership is likely the only reliable way to use for tracking. The
> downside is sharing obviously but this is the same problem we
> already do deal with with shared pages.

I think this does not work well because the owner in a memcg sense is
unrelated to the file descriptor which is the true owner.

So we can get cases where the pin is charged to the wrong cgroup which
is effectively fatal for sandboxing, IMHO.

> Another thing that is not really clear to me is how the limit is
> actually going to be used in practice. As there is no concept of a
> reclaim for pins then I can imagine that it would be quite easy to
> reach the hard limit and essentially DoS any further use of pins. 

Yes, that is the purpose. It is to sandbox pin users to put some limit
on the effect they have on the full machine.

It replaces the rlimit mess that was doing the same thing.

> Cross cgroup pinning would make it even worse because it could
> become a DoS vector very easily. Practically speaking what tends to
> be a corner case in the memcg limit world would be norm for pin
> based limit.

This is why the cgroup charged for the pin must be tightly linked to
some cgroup that is obviously connected to the creator of the FD
owning the pin.

Jason
Michal Hocko Feb. 16, 2023, 8:04 a.m. UTC | #16
On Wed 15-02-23 15:07:05, Jason Gunthorpe wrote:
> On Wed, Feb 15, 2023 at 08:00:22PM +0100, Michal Hocko wrote:
> > On Mon 06-02-23 14:32:37, Tejun Heo wrote:
> > > Hello,
> > > 
> > > On Mon, Feb 06, 2023 at 07:40:55PM -0400, Jason Gunthorpe wrote:
> > > > (a) kind of destroys the point of this as a sandboxing tool
> > > > 
> > > > It is not so harmful to use memory that someone else has been charged
> > > > with allocating.
> > > > 
> > > > But it is harmful to pin memory if someone else is charged for the
> > > > pin. It means it is unpredictable how much memory a sandbox can
> > > > actually lock down.
> > > > 
> > > > Plus we have the double accounting problem, if 1000 processes in
> > > > different cgroups open the tmpfs and all pin the memory then cgroup A
> > > > will be charged 1000x for the memory and hit its limit, possibly
> > > > creating a DOS from less priv to more priv
> > > 
> > > Let's hear what memcg people think about it. I'm not a fan of disassociating
> > > the ownership and locker of the same page but it is true that actively
> > > increasing locked consumption on a remote cgroup is awkward too.
> > 
> > One thing that is not really clear to me is whether those pins do
> > actually have any "ownership".
> 
> In most cases the ownship traces back to a file descriptor. When the
> file is closed the pin goes away.

This assumes a specific use of {un}pin_user_page*, right? IIUC the
cgroup charging is meant to be used from vm_account but that doesn't
really tell anything about the lifetime nor the ownership. Maybe this is
just a matter of documentation update...

> > The interface itself doesn't talk about
> > anything like that and so it seems perfectly fine to unpin from a
> > completely different context then pinning. 
> 
> Yes, concievably the close of the FD can be in a totally different
> process with a different cgroup.

Wouldn't you get an unbalanced charges then? How can admin recover that
situation?

> > If there is no enforcement then Tejun is right and relying on memcg
> > ownership is likely the only reliable way to use for tracking. The
> > downside is sharing obviously but this is the same problem we
> > already do deal with with shared pages.
> 
> I think this does not work well because the owner in a memcg sense is
> unrelated to the file descriptor which is the true owner.
> 
> So we can get cases where the pin is charged to the wrong cgroup which
> is effectively fatal for sandboxing, IMHO.

OK, I see. This makes it really much more complicated then.
 
> > Another thing that is not really clear to me is how the limit is
> > actually going to be used in practice. As there is no concept of a
> > reclaim for pins then I can imagine that it would be quite easy to
> > reach the hard limit and essentially DoS any further use of pins. 
> 
> Yes, that is the purpose. It is to sandbox pin users to put some limit
> on the effect they have on the full machine.
> 
> It replaces the rlimit mess that was doing the same thing.

arguably rlimit has a concept of the owner at least AFAICS. I do realize
this is not really great wrt a high level resource control though.

> > Cross cgroup pinning would make it even worse because it could
> > become a DoS vector very easily. Practically speaking what tends to
> > be a corner case in the memcg limit world would be norm for pin
> > based limit.
> 
> This is why the cgroup charged for the pin must be tightly linked to
> some cgroup that is obviously connected to the creator of the FD
> owning the pin.

The problem I can see is that the fd is just too fluid for tracking. You
can pass fd over to a different cgroup context and then all the tracking
just loses any trail to an owner.

I can see how the underlying memcg tracking information is not really
feasible for your usecases but I am really worried that it is just too
easy to misaccount without any other proper ownership tracking.
Jason Gunthorpe Feb. 16, 2023, 12:45 p.m. UTC | #17
On Thu, Feb 16, 2023 at 09:04:03AM +0100, Michal Hocko wrote:

> > In most cases the ownship traces back to a file descriptor. When the
> > file is closed the pin goes away.
> 
> This assumes a specific use of {un}pin_user_page*, right? IIUC the
> cgroup charging is meant to be used from vm_account but that doesn't
> really tell anything about the lifetime nor the ownership. Maybe this is
> just a matter of documentation update...

Yes documentation.

> > > The interface itself doesn't talk about
> > > anything like that and so it seems perfectly fine to unpin from a
> > > completely different context then pinning. 
> > 
> > Yes, concievably the close of the FD can be in a totally different
> > process with a different cgroup.
> 
> Wouldn't you get an unbalanced charges then? How can admin recover that
> situation?

No, the approach in this patch series captures the cgroup that was
charged and stores it in the FD until uncharge.

This is the same as we do today for rlimit. The user/process that is
charged is captured and the uncharge always applies to user/process
that was charged, not the user/process that happens to be associated
with the uncharging context.

cgroup just add another option so it is user/process/cgroup that can
hold the charge.

It is conceptually similar to how each struct page has the memcg that
its allocation was charged to - we just record this in the FD not the
page.

> > > Another thing that is not really clear to me is how the limit is
> > > actually going to be used in practice. As there is no concept of a
> > > reclaim for pins then I can imagine that it would be quite easy to
> > > reach the hard limit and essentially DoS any further use of pins. 
> > 
> > Yes, that is the purpose. It is to sandbox pin users to put some limit
> > on the effect they have on the full machine.
> > 
> > It replaces the rlimit mess that was doing the same thing.
> 
> arguably rlimit has a concept of the owner at least AFAICS. I do realize
> this is not really great wrt a high level resource control though.

rlimit uses either the user or the process as the "owner". In this
model we view a cgroup as the "owner". The lifetime logic is all the
same, you figure out the owner (cgroup/user/process) when the charge
is made and record it, when the uncharge comes the recorded owner is
uncharged.

It never allows unbalanced charge/uncharge because that would be a
security problem even for rlimit cases today.

Jason
Tejun Heo Feb. 21, 2023, 4:51 p.m. UTC | #18
Hello,

On Thu, Feb 16, 2023 at 08:45:38AM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 16, 2023 at 09:04:03AM +0100, Michal Hocko wrote:
> 
> > > In most cases the ownship traces back to a file descriptor. When the
> > > file is closed the pin goes away.
> > 
> > This assumes a specific use of {un}pin_user_page*, right? IIUC the
> > cgroup charging is meant to be used from vm_account but that doesn't
> > really tell anything about the lifetime nor the ownership. Maybe this is
> > just a matter of documentation update...
> 
> Yes documentation.
> 
> > > > The interface itself doesn't talk about
> > > > anything like that and so it seems perfectly fine to unpin from a
> > > > completely different context then pinning. 
> > > 
> > > Yes, concievably the close of the FD can be in a totally different
> > > process with a different cgroup.
> > 
> > Wouldn't you get an unbalanced charges then? How can admin recover that
> > situation?
> 
> No, the approach in this patch series captures the cgroup that was
> charged and stores it in the FD until uncharge.
> 
> This is the same as we do today for rlimit. The user/process that is
> charged is captured and the uncharge always applies to user/process
> that was charged, not the user/process that happens to be associated
> with the uncharging context.
> 
> cgroup just add another option so it is user/process/cgroup that can
> hold the charge.
> 
> It is conceptually similar to how each struct page has the memcg that
> its allocation was charged to - we just record this in the FD not the
> page.

I don't have a lot of context here but it looks like the problem here is
that the newly proposed controller is introducing an ownership discrepancy.
If a memory which is pinned by a cgroup is to be charged to the owner of the
fd, the memory which isn't pinned should be charged to the memcg of the same
cgroup, right? It makes little sense to me to separate the owner of the
memory page and the pinner of it. They should be one and the same.

Alistair, can you please elaborate how these pages are allocated, charged
and used?

Thanks.
Jason Gunthorpe Feb. 21, 2023, 5:25 p.m. UTC | #19
On Tue, Feb 21, 2023 at 06:51:48AM -1000, Tejun Heo wrote:
> cgroup, right? It makes little sense to me to separate the owner of the
> memory page and the pinner of it. They should be one and the same.

The owner and pinner are not always the same entity or we could just
use the page's cgroup.

Jason
Tejun Heo Feb. 21, 2023, 5:29 p.m. UTC | #20
On Tue, Feb 21, 2023 at 01:25:59PM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 21, 2023 at 06:51:48AM -1000, Tejun Heo wrote:
> > cgroup, right? It makes little sense to me to separate the owner of the
> > memory page and the pinner of it. They should be one and the same.
> 
> The owner and pinner are not always the same entity or we could just
> use the page's cgroup.

Yeah, so, what I'm trying to say is that that might be the source of the
problem. Is the current page ownership attribution correct given that the fd
for whatever reason is determining the pinning ownership or should the page
ownership be attributed the same way too? If they indeed need to differ,
that probably would need pretty strong justifications.

Thanks.
Jason Gunthorpe Feb. 21, 2023, 5:51 p.m. UTC | #21
On Tue, Feb 21, 2023 at 07:29:18AM -1000, Tejun Heo wrote:
> On Tue, Feb 21, 2023 at 01:25:59PM -0400, Jason Gunthorpe wrote:
> > On Tue, Feb 21, 2023 at 06:51:48AM -1000, Tejun Heo wrote:
> > > cgroup, right? It makes little sense to me to separate the owner of the
> > > memory page and the pinner of it. They should be one and the same.
> > 
> > The owner and pinner are not always the same entity or we could just
> > use the page's cgroup.
> 
> Yeah, so, what I'm trying to say is that that might be the source of the
> problem. Is the current page ownership attribution correct 

It should be correct.

This mechanism is driven by pin_user_page(), (as it is the only API
that can actually create a pin) so the cgroup owner of the page is
broadly related to the "owner" of the VMA's inode.

The owner of the pin is the caller of pin_user_page(), which is
initated by some FD/proces that is not necessarily related to the
VMA's inode.

Eg concretely, something like io_uring will do something like:
  buffer = mmap()     <- Charge memcg for the pages
  fd = io_uring_setup(..)
  io_uring_register(fd,xx,buffer,..);   <- Charge the pincg for the pin

If mmap is a private anonymous VMA created by the same process then it
is likely the pages will have the same cgroup as io_uring_register and
the FD.

Otherwise the page cgroup is unconstrained. MAP_SHARED mappings will
have the page cgroup point at whatever cgroup was first to allocate
the page for the VMA's inode.

AFAIK there are few real use cases to establish a pin on MAP_SHARED
mappings outside your cgroup. However, it is possible, the APIs allow
it, and for security sandbox purposes we can't allow a process inside
a cgroup to triger a charge on a different cgroup. That breaks the
sandbox goal.

If memcg could support multiple owners then it would be logical that
the pinner would be one of the memcg owners.

> for whatever reason is determining the pinning ownership or should the page
> ownership be attributed the same way too? If they indeed need to differ,
> that probably would need pretty strong justifications.

It is inherent to how pin_user_pages() works. It is an API that
establishs pins on existing pages. There is nothing about it that says
who the page's memcg owner is.

I don't think we can do anything about this without breaking things.

Jason
Tejun Heo Feb. 21, 2023, 6:07 p.m. UTC | #22
Hello,

On Tue, Feb 21, 2023 at 01:51:12PM -0400, Jason Gunthorpe wrote:
> > Yeah, so, what I'm trying to say is that that might be the source of the
> > problem. Is the current page ownership attribution correct 
> 
> It should be correct.
> 
> This mechanism is driven by pin_user_page(), (as it is the only API
> that can actually create a pin) so the cgroup owner of the page is
> broadly related to the "owner" of the VMA's inode.
> 
> The owner of the pin is the caller of pin_user_page(), which is
> initated by some FD/proces that is not necessarily related to the
> VMA's inode.
> 
> Eg concretely, something like io_uring will do something like:
>   buffer = mmap()     <- Charge memcg for the pages
>   fd = io_uring_setup(..)
>   io_uring_register(fd,xx,buffer,..);   <- Charge the pincg for the pin
> 
> If mmap is a private anonymous VMA created by the same process then it
> is likely the pages will have the same cgroup as io_uring_register and
> the FD.
> 
> Otherwise the page cgroup is unconstrained. MAP_SHARED mappings will
> have the page cgroup point at whatever cgroup was first to allocate
> the page for the VMA's inode.
> 
> AFAIK there are few real use cases to establish a pin on MAP_SHARED
> mappings outside your cgroup. However, it is possible, the APIs allow
> it, and for security sandbox purposes we can't allow a process inside
> a cgroup to triger a charge on a different cgroup. That breaks the
> sandbox goal.

It seems broken anyway. Please consider the following scenario:

1. A is a tiny cgroup which only does streaming IOs and has memory.high of
   128M which is more than sufficient for IO window. The last file it
   streamed happened to be F which was about 256M.

2. B is an a lot larger cgroup w/ pin limit way above 256M. B pins the
   entirety of F.

3. A now tries to stream another file but F is almost fully occupying its
   memory allowance and can't be evicted. A keeps thrashing due to lack of
   memory and isolation is completely broken.

This stems directly from page ownership and pin accounting discrepancy.

> If memcg could support multiple owners then it would be logical that
> the pinner would be one of the memcg owners.
> 
> > for whatever reason is determining the pinning ownership or should the page
> > ownership be attributed the same way too? If they indeed need to differ,
> > that probably would need pretty strong justifications.
> 
> It is inherent to how pin_user_pages() works. It is an API that
> establishs pins on existing pages. There is nothing about it that says
> who the page's memcg owner is.
> 
> I don't think we can do anything about this without breaking things.

That's a discrepancy in an internal interface and we don't wanna codify
something like that into userspace interface. Semantially, it seems like if
pin_user_pages() wanna charge pinning to the cgroup associated with an fd
(or whatever), it should also claim the ownership of the pages themselves. I
have no idea how feasiable that'd be from memcg POV tho. Given that this
would be a fairly cold path (in most cases, the ownership should already
match), maybe it won't be too bad?

Thanks.
Jason Gunthorpe Feb. 21, 2023, 7:26 p.m. UTC | #23
On Tue, Feb 21, 2023 at 08:07:13AM -1000, Tejun Heo wrote:
> > AFAIK there are few real use cases to establish a pin on MAP_SHARED
> > mappings outside your cgroup. However, it is possible, the APIs allow
> > it, and for security sandbox purposes we can't allow a process inside
> > a cgroup to triger a charge on a different cgroup. That breaks the
> > sandbox goal.
> 
> It seems broken anyway. Please consider the following scenario:

Yes, this is broken like this already today - memcg doesn't work
entirely perfectly for MAP_SHARED scenarios, IMHO.

> > > for whatever reason is determining the pinning ownership or should the page
> > > ownership be attributed the same way too? If they indeed need to differ,
> > > that probably would need pretty strong justifications.
> > 
> > It is inherent to how pin_user_pages() works. It is an API that
> > establishs pins on existing pages. There is nothing about it that says
> > who the page's memcg owner is.
> > 
> > I don't think we can do anything about this without breaking things.
> 
> That's a discrepancy in an internal interface and we don't wanna codify
> something like that into userspace interface. Semantially, it seems like if
> pin_user_pages() wanna charge pinning to the cgroup associated with an fd
> (or whatever), it should also claim the ownership of the pages
> themselves.

Multiple cgroup can pin the same page, so it is not as simple as just
transfering ownership, we need multi-ownership and to really fix the
memcg limitations with MAP_SHARED without an API impact.

You are right that pinning is really just a special case of
allocation, but there is a reason the memcg was left with weak support
for MAP_SHARED and changing that may be more than just hard but an
infeasible trade off..

At least I don't have a good idea how to even approach building a
reasonable datstructure that can track the number of
charges per-cgroup per page. :\

Jason
Tejun Heo Feb. 21, 2023, 7:45 p.m. UTC | #24
Hello,

On Tue, Feb 21, 2023 at 03:26:33PM -0400, Jason Gunthorpe wrote:
> On Tue, Feb 21, 2023 at 08:07:13AM -1000, Tejun Heo wrote:
> > > AFAIK there are few real use cases to establish a pin on MAP_SHARED
> > > mappings outside your cgroup. However, it is possible, the APIs allow
> > > it, and for security sandbox purposes we can't allow a process inside
> > > a cgroup to triger a charge on a different cgroup. That breaks the
> > > sandbox goal.
> > 
> > It seems broken anyway. Please consider the following scenario:
> 
> Yes, this is broken like this already today - memcg doesn't work
> entirely perfectly for MAP_SHARED scenarios, IMHO.

It is far from perfect but the existing behavior isn't that broken. e.g. in
the same scenario, without pinning, even if the larger cgroup keeps using
the same page, the smaller cgroup should be able to evict the pages as they
are not pinned and the cgroup is under heavy reclaim pressure. The larger
cgroup will refault them back in and end up owning those pages.

memcg can't capture the case of the same pages being actively shared by
multiple cgroups concurrently (I think those cases should be handled by
pushing them to the common parent as discussed elswhere but that's a
separate topic) but it can converge when page usage transfers across cgroups
if needed. Disassociating ownership and pinning will break that in an
irreversible way.

> > > > for whatever reason is determining the pinning ownership or should the page
> > > > ownership be attributed the same way too? If they indeed need to differ,
> > > > that probably would need pretty strong justifications.
> > > 
> > > It is inherent to how pin_user_pages() works. It is an API that
> > > establishs pins on existing pages. There is nothing about it that says
> > > who the page's memcg owner is.
> > > 
> > > I don't think we can do anything about this without breaking things.
> > 
> > That's a discrepancy in an internal interface and we don't wanna codify
> > something like that into userspace interface. Semantially, it seems like if
> > pin_user_pages() wanna charge pinning to the cgroup associated with an fd
> > (or whatever), it should also claim the ownership of the pages
> > themselves.
> 
> Multiple cgroup can pin the same page, so it is not as simple as just
> transfering ownership, we need multi-ownership and to really fix the
> memcg limitations with MAP_SHARED without an API impact.
> 
> You are right that pinning is really just a special case of
> allocation, but there is a reason the memcg was left with weak support
> for MAP_SHARED and changing that may be more than just hard but an
> infeasible trade off..
> 
> At least I don't have a good idea how to even approach building a
> reasonable datstructure that can track the number of
> charges per-cgroup per page. :\

As I wrote above, I don't think the problem here is the case of pages being
shared by multiple cgroups concurrently. We can leave that problem for
another thread. However, if we want to support accounting and control of
pinned memory, we really shouldn't introduce a fundmental discrepancy like
the owner and pinner disagreeing with each other. At least conceptually, the
solution is rather straight-forward - whoever pins a page should also claim
the ownership of it.

Thanks.
Tejun Heo Feb. 21, 2023, 7:49 p.m. UTC | #25
On Tue, Feb 21, 2023 at 09:45:15AM -1000, Tejun Heo wrote:
> > Multiple cgroup can pin the same page, so it is not as simple as just
> > transfering ownership, we need multi-ownership and to really fix the
> > memcg limitations with MAP_SHARED without an API impact.
> > 
> > You are right that pinning is really just a special case of
> > allocation, but there is a reason the memcg was left with weak support
> > for MAP_SHARED and changing that may be more than just hard but an
> > infeasible trade off..
> > 
> > At least I don't have a good idea how to even approach building a
> > reasonable datstructure that can track the number of
> > charges per-cgroup per page. :\
> 
> As I wrote above, I don't think the problem here is the case of pages being
> shared by multiple cgroups concurrently. We can leave that problem for
> another thread. However, if we want to support accounting and control of
> pinned memory, we really shouldn't introduce a fundmental discrepancy like
> the owner and pinner disagreeing with each other. At least conceptually, the
> solution is rather straight-forward - whoever pins a page should also claim
> the ownership of it.

Ah, sorry, I missed the part about multiple cgroups pinning the same page.
Yeah, I can't think of a good answer for that.

Thanks.
Jason Gunthorpe Feb. 21, 2023, 7:57 p.m. UTC | #26
On Tue, Feb 21, 2023 at 09:45:15AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Tue, Feb 21, 2023 at 03:26:33PM -0400, Jason Gunthorpe wrote:
> > On Tue, Feb 21, 2023 at 08:07:13AM -1000, Tejun Heo wrote:
> > > > AFAIK there are few real use cases to establish a pin on MAP_SHARED
> > > > mappings outside your cgroup. However, it is possible, the APIs allow
> > > > it, and for security sandbox purposes we can't allow a process inside
> > > > a cgroup to triger a charge on a different cgroup. That breaks the
> > > > sandbox goal.
> > > 
> > > It seems broken anyway. Please consider the following scenario:
> > 
> > Yes, this is broken like this already today - memcg doesn't work
> > entirely perfectly for MAP_SHARED scenarios, IMHO.
> 
> It is far from perfect but the existing behavior isn't that broken. e.g. in
> the same scenario, without pinning, even if the larger cgroup keeps using
> the same page, the smaller cgroup should be able to evict the pages as they
> are not pinned and the cgroup is under heavy reclaim pressure. The larger
> cgroup will refault them back in and end up owning those pages.
> 
> memcg can't capture the case of the same pages being actively shared by
> multiple cgroups concurrently (I think those cases should be handled by
> pushing them to the common parent as discussed elswhere but that's a
> separate topic) but it can converge when page usage transfers across cgroups
> if needed. Disassociating ownership and pinning will break that in an
> irreversible way.

It is already disassociated - memcg is broken as you describe today
with pin_user_pages().

If you want to fix that, then we need to redefine how memcg works with
pin_user_pages() and I'm open to ideas..

> the owner and pinner disagreeing with each other. At least
> conceptually, the solution is rather straight-forward - whoever pins
> a page should also claim the ownership of it.

If the answer is pinner is owner, then multi-pinners must mean
multi-owner too. We probably can't block multi-pinner without causing
uAPI problems.

You are not wrong on any of these remarks, but this looses sight of
the point - it is take the existing broken RLIMIT scheme and make it
incrementally better by being the same broken scheme just with
cgroups.

If we eventually fix everything so memcg can do multi-pinners/owners
then would it be reasonable to phase out the new pincg at that time?

Jason
Alistair Popple Feb. 22, 2023, 11:38 a.m. UTC | #27
Jason Gunthorpe <jgg@nvidia.com> writes:

>> the owner and pinner disagreeing with each other. At least
>> conceptually, the solution is rather straight-forward - whoever pins
>> a page should also claim the ownership of it.
>
> If the answer is pinner is owner, then multi-pinners must mean
> multi-owner too. We probably can't block multi-pinner without causing
> uAPI problems.

It seems the problem is how to track multiple pinners of the page. At
the moment memcg ownership is stored in folio->memcg_data which
basically points to the owning struct mem_cgroup *.

For pinning this series already introduces this data structure:

struct vm_account {
	struct task_struct *task;
	struct mm_struct *mm;
	struct user_struct *user;
	enum vm_account_flags flags;
};

We could modify it to something like:

struct vm_account {
       struct list_head possible_pinners;
       struct mem_cgroup *memcg;
       [...]
};

When a page is pinned the first pinner takes ownership and stores it's
memcg there, updating memcg_data to point to it. This would require a
new page_memcg_data_flags but I think we have one left. Subsequent
pinners create a vm_account and add it to the pinners list.

When a driver unpins a page we scan the pinners list and assign
ownership to the next driver pinning the page by updating memcg_data and
removing the vm_account from the list.

The problem with this approach is each pinner (ie. struct vm_account)
may cover different subsets of pages. Drivers have to store a list of
pinned pages somewhere, so we could query drivers or store the list of
pinned pages in the vm_account. That seems like a fair bit of overhead
though and would make unpinning expensive as we'd have to traverse
several lists.

We'd also have to ensure possible owners had enough free memory in the
owning memcg to accept having the page charged when another pinner
unpins. That could be done by reserving space during pinning.

And of course it only works for pin_user_pages() - other users don't
always have a list of pages conveniently available although I suppose
they could walk the rmap, but again that adds overhead. So not sure it's
a great solution but figured I'd leave it here in case it triggers other
ideas.

> You are not wrong on any of these remarks, but this looses sight of
> the point - it is take the existing broken RLIMIT scheme and make it
> incrementally better by being the same broken scheme just with
> cgroups.

Right. RLIMIT_MEMLOCK is pretty broken because most uses enforce it
against a specific task so it can be easily bypassed. The aim here was
to make it at least possible to enforce a meaningful limit.

> If we eventually fix everything so memcg can do multi-pinners/owners
> then would it be reasonable to phase out the new pincg at that time?
>
> Jason
Jason Gunthorpe Feb. 22, 2023, 12:57 p.m. UTC | #28
On Wed, Feb 22, 2023 at 10:38:25PM +1100, Alistair Popple wrote:
> When a driver unpins a page we scan the pinners list and assign
> ownership to the next driver pinning the page by updating memcg_data and
> removing the vm_account from the list.

I don't see how this works with just the data structure you outlined??
Every unique page needs its own list_head in the vm_account, it is
doable just incredibly costly.

Jason
Alistair Popple Feb. 22, 2023, 10:59 p.m. UTC | #29
Jason Gunthorpe <jgg@nvidia.com> writes:

> On Wed, Feb 22, 2023 at 10:38:25PM +1100, Alistair Popple wrote:
>> When a driver unpins a page we scan the pinners list and assign
>> ownership to the next driver pinning the page by updating memcg_data and
>> removing the vm_account from the list.
>
> I don't see how this works with just the data structure you outlined??
> Every unique page needs its own list_head in the vm_account, it is
> doable just incredibly costly.

The idea was every driver already needs to allocate a pages array to
pass to pin_user_pages(), and by necessity drivers have to keep a
reference to the contents of that in one form or another. So
conceptually the equivalent of:

struct vm_account {
       struct list_head possible_pinners;
       struct mem_cgroup *memcg;
       struct pages **pages;
       [...]
};

Unpinnig involves finding a new owner by traversing the list of
page->memcg_data->possible_pinners and iterating over *pages[] to figure
out if that vm_account actually has this page pinned or not and could
own it.

Agree this is costly though. And I don't think all drivers keep the
array around so "iterating over *pages[]" may need to be a callback.

> Jason
Christoph Hellwig Feb. 23, 2023, 12:05 a.m. UTC | #30
On Thu, Feb 23, 2023 at 09:59:35AM +1100, Alistair Popple wrote:
> The idea was every driver already needs to allocate a pages array to
> pass to pin_user_pages(), and by necessity drivers have to keep a
> reference to the contents of that in one form or another. So
> conceptually the equivalent of:
> 
> struct vm_account {
>        struct list_head possible_pinners;
>        struct mem_cgroup *memcg;
>        struct pages **pages;
>        [...]
> };
> 
> Unpinnig involves finding a new owner by traversing the list of
> page->memcg_data->possible_pinners and iterating over *pages[] to figure
> out if that vm_account actually has this page pinned or not and could
> own it.
> 
> Agree this is costly though. And I don't think all drivers keep the
> array around so "iterating over *pages[]" may need to be a callback.

Is pinning in this context referring to FOLL_LONGTERM pins or any
FOLL_PIN?  In the latter case block based direct I/O does not keep
the pages array around, and also is absolutely not willing to pay
for the overhead.

For FOLL_LONGTERM the schemes sounds vaguely reasonable to me.
Alistair Popple Feb. 23, 2023, 12:35 a.m. UTC | #31
Christoph Hellwig <hch@infradead.org> writes:

> On Thu, Feb 23, 2023 at 09:59:35AM +1100, Alistair Popple wrote:
>> The idea was every driver already needs to allocate a pages array to
>> pass to pin_user_pages(), and by necessity drivers have to keep a
>> reference to the contents of that in one form or another. So
>> conceptually the equivalent of:
>> 
>> struct vm_account {
>>        struct list_head possible_pinners;
>>        struct mem_cgroup *memcg;
>>        struct pages **pages;
>>        [...]
>> };
>> 
>> Unpinnig involves finding a new owner by traversing the list of
>> page->memcg_data->possible_pinners and iterating over *pages[] to figure
>> out if that vm_account actually has this page pinned or not and could
>> own it.
>> 
>> Agree this is costly though. And I don't think all drivers keep the
>> array around so "iterating over *pages[]" may need to be a callback.
>
> Is pinning in this context referring to FOLL_LONGTERM pins or any
> FOLL_PIN?  In the latter case block based direct I/O does not keep
> the pages array around, and also is absolutely not willing to pay
> for the overhead.

Good point. I was primarily targeting FOLL_LONGTERM users. I'm not too
familiar with block based direct I/O but from what I can tell it
currently doesn't respect any kind of RLIMIT anyway so I guess the
requirment to limit pinned pages there isn't so revelant.
Jason Gunthorpe Feb. 23, 2023, 1:53 a.m. UTC | #32
On Thu, Feb 23, 2023 at 09:59:35AM +1100, Alistair Popple wrote:
> 
> Jason Gunthorpe <jgg@nvidia.com> writes:
> 
> > On Wed, Feb 22, 2023 at 10:38:25PM +1100, Alistair Popple wrote:
> >> When a driver unpins a page we scan the pinners list and assign
> >> ownership to the next driver pinning the page by updating memcg_data and
> >> removing the vm_account from the list.
> >
> > I don't see how this works with just the data structure you outlined??
> > Every unique page needs its own list_head in the vm_account, it is
> > doable just incredibly costly.
> 
> The idea was every driver already needs to allocate a pages array to
> pass to pin_user_pages(), and by necessity drivers have to keep a
> reference to the contents of that in one form or another. So
> conceptually the equivalent of:
> 
> struct vm_account {
>        struct list_head possible_pinners;
>        struct mem_cgroup *memcg;
>        struct pages **pages;
>        [...]
> };
> 
> Unpinnig involves finding a new owner by traversing the list of
> page->memcg_data->possible_pinners and iterating over *pages[] to figure
> out if that vm_account actually has this page pinned or not and could
> own it.

Oh, you are focusing on Tejun's DOS scenario. 

The DOS problem is to prevent a pin users in cgroup A from keeping
memory charged to cgroup B that it isn't using any more.

cgroup B doesn't need to be pinning the memory, it could just be
normal VMAs and "isn't using anymore" means it has unmapped all the
VMAs.

Solving that problem means figuring out when every cgroup stops using
the memory - pinning or not. That seems to be very costly.

AFAIK this problem also already exists today as the memcg of a page
doesn't change while it is pinned. So maybe we don't need to address
it.

Arguably the pins are not the problem. If we want to treat the pin
like allocation then we simply charge the non-owning memcg's for the
pin as though it was an allocation. Eg go over every page and if the
owning memcg is not the current memcg then charge the current memcg
for an allocation of the MAP_SHARED memory. Undoing this is trivial
enoug.

This doesn't fix the DOS problem but it does sort of harmonize the pin
accounting with the memcg by multi-accounting every pin of a
MAP_SHARED page.

The other drawback is that this isn't the same thing as the current
rlimit. The rlimit is largely restricting the creation of unmovable
memory.

Though, AFAICT memcg seems to bundle unmovable memory (eg GFP_KERNEL)
along with movable user pages so it would be self-consistent.

I'm unclear if this is OK for libvirt..

> Agree this is costly though. And I don't think all drivers keep the
> array around so "iterating over *pages[]" may need to be a callback.

I think searching lists of pages is not reasonable. Things like VFIO &
KVM use cases effectively pin 90% of all system memory, that is
potentially TB of page lists that might need linear searching!

Jason
Daniel P. Berrangé Feb. 23, 2023, 9:12 a.m. UTC | #33
On Wed, Feb 22, 2023 at 09:53:56PM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 23, 2023 at 09:59:35AM +1100, Alistair Popple wrote:
> > 
> > Jason Gunthorpe <jgg@nvidia.com> writes:
> > 
> > > On Wed, Feb 22, 2023 at 10:38:25PM +1100, Alistair Popple wrote:
> > >> When a driver unpins a page we scan the pinners list and assign
> > >> ownership to the next driver pinning the page by updating memcg_data and
> > >> removing the vm_account from the list.
> > >
> > > I don't see how this works with just the data structure you outlined??
> > > Every unique page needs its own list_head in the vm_account, it is
> > > doable just incredibly costly.
> > 
> > The idea was every driver already needs to allocate a pages array to
> > pass to pin_user_pages(), and by necessity drivers have to keep a
> > reference to the contents of that in one form or another. So
> > conceptually the equivalent of:
> > 
> > struct vm_account {
> >        struct list_head possible_pinners;
> >        struct mem_cgroup *memcg;
> >        struct pages **pages;
> >        [...]
> > };
> > 
> > Unpinnig involves finding a new owner by traversing the list of
> > page->memcg_data->possible_pinners and iterating over *pages[] to figure
> > out if that vm_account actually has this page pinned or not and could
> > own it.
> 
> Oh, you are focusing on Tejun's DOS scenario. 
> 
> The DOS problem is to prevent a pin users in cgroup A from keeping
> memory charged to cgroup B that it isn't using any more.
> 
> cgroup B doesn't need to be pinning the memory, it could just be
> normal VMAs and "isn't using anymore" means it has unmapped all the
> VMAs.
> 
> Solving that problem means figuring out when every cgroup stops using
> the memory - pinning or not. That seems to be very costly.
> 
> AFAIK this problem also already exists today as the memcg of a page
> doesn't change while it is pinned. So maybe we don't need to address
> it.
> 
> Arguably the pins are not the problem. If we want to treat the pin
> like allocation then we simply charge the non-owning memcg's for the
> pin as though it was an allocation. Eg go over every page and if the
> owning memcg is not the current memcg then charge the current memcg
> for an allocation of the MAP_SHARED memory. Undoing this is trivial
> enoug.
> 
> This doesn't fix the DOS problem but it does sort of harmonize the pin
> accounting with the memcg by multi-accounting every pin of a
> MAP_SHARED page.
> 
> The other drawback is that this isn't the same thing as the current
> rlimit. The rlimit is largely restricting the creation of unmovable
> memory.
> 
> Though, AFAICT memcg seems to bundle unmovable memory (eg GFP_KERNEL)
> along with movable user pages so it would be self-consistent.
> 
> I'm unclear if this is OK for libvirt..

I'm not sure what exact scenario you're thinking of when talking
about two distinct cgroups and its impact on libvirt. None the less
here's a rough summary of libvirt's approach to cgroups and memory

On the libvirt side, we create 1 single cgroup per VM, in which lives
at least the QEMU process, but possibly some additional per-VM helper
processes (swtpm for TPM, sometimes slirp/passt for NIC, etc).

Potentially there are externally managed processes that are handling
some resources on behalf of the VM. These might be a single centralized
daemon handling work for many VMs, or might be per VM services. Either
way, since they are externally managed, their setup and usage of cgroups
is completely opaque to libvirt.

Libvirt is only concerned with the 1 cgroup per VM that it creates and
manages. Its goal is to protect the host OS from a misbehaving guest
OS/compromised QEMU.

The memory limits we can set on VMs are somewhat limited. In general
we prefer to avoid setting any hard per-VM memory cap by default.
QEMU's worst case memory usage is incredibly hard to predict, because
of an incredibly broad range of possible configurations and opaque
behaviour/usage from ELF libraries it uses. Every time anyone has
tried hard memory caps, we've ended up with VMs being incorrectly
killed because they genuinely wanted more memory than anticipated
by the algorithm.

To protect the host OS, I tend to suggest mgmt apps/admins set a
hard memory limit acrosss all VMs in aggregate eg at /machine.slice,
instead of per-VM. This aims to makes it possible to ensure that
the host OS always has some memory reserved for its own system
services, while allowing the individual VMs to battle it out
between themselves.

We do still have to apply some tuning for VFIO, around what amount
of memory it is allowed to lock, but that is not so bad as we just
need to allow it to lock guest RAM which is known + an finite extra
amount, so don't need to take account of all of QEMU's memory
allocations in general. This is all still just in context of 1
cgroup though, as least as far as libvirt is aware. Any other
cgroups involved are opaque to libvirt, and not our concern as long
as QEMU's cgroup is preventing QEMU's misbehaviour as configured.

With regards,
Daniel
T.J. Mercier Feb. 23, 2023, 5:18 p.m. UTC | #34
On Wed, Feb 22, 2023 at 5:54 PM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Feb 23, 2023 at 09:59:35AM +1100, Alistair Popple wrote:
> >
> > Jason Gunthorpe <jgg@nvidia.com> writes:
> >
> > > On Wed, Feb 22, 2023 at 10:38:25PM +1100, Alistair Popple wrote:
> > >> When a driver unpins a page we scan the pinners list and assign
> > >> ownership to the next driver pinning the page by updating memcg_data and
> > >> removing the vm_account from the list.
> > >
> > > I don't see how this works with just the data structure you outlined??
> > > Every unique page needs its own list_head in the vm_account, it is
> > > doable just incredibly costly.
> >
> > The idea was every driver already needs to allocate a pages array to
> > pass to pin_user_pages(), and by necessity drivers have to keep a
> > reference to the contents of that in one form or another. So
> > conceptually the equivalent of:
> >
> > struct vm_account {
> >        struct list_head possible_pinners;
> >        struct mem_cgroup *memcg;
> >        struct pages **pages;
> >        [...]
> > };
> >
> > Unpinnig involves finding a new owner by traversing the list of
> > page->memcg_data->possible_pinners and iterating over *pages[] to figure
> > out if that vm_account actually has this page pinned or not and could
> > own it.
>
> Oh, you are focusing on Tejun's DOS scenario.
>
> The DOS problem is to prevent a pin users in cgroup A from keeping
> memory charged to cgroup B that it isn't using any more.
>
> cgroup B doesn't need to be pinning the memory, it could just be
> normal VMAs and "isn't using anymore" means it has unmapped all the
> VMAs.
>
> Solving that problem means figuring out when every cgroup stops using
> the memory - pinning or not. That seems to be very costly.
>
This is the current behavior of accounting for memfds, and I suspect
any kind of shared memory.

If cgroup A creates a memfd, maps and faults in pages, shares the
memfd with cgroup B and then A unmaps and closes the memfd, then
cgroup A is still charged for the pages it faulted in.

FWIW this is also the behavior I was trying to use to attribute
dma-buffers to their original allocators. Whoever touches it first
gets charged as long as the memory is alive somewhere.

Can't we do the same thing for pins?

> AFAIK this problem also already exists today as the memcg of a page
> doesn't change while it is pinned. So maybe we don't need to address
> it.
>
> Arguably the pins are not the problem. If we want to treat the pin
> like allocation then we simply charge the non-owning memcg's for the
> pin as though it was an allocation. Eg go over every page and if the
> owning memcg is not the current memcg then charge the current memcg
> for an allocation of the MAP_SHARED memory. Undoing this is trivial
> enoug.
>
> This doesn't fix the DOS problem but it does sort of harmonize the pin
> accounting with the memcg by multi-accounting every pin of a
> MAP_SHARED page.
>
> The other drawback is that this isn't the same thing as the current
> rlimit. The rlimit is largely restricting the creation of unmovable
> memory.
>
> Though, AFAICT memcg seems to bundle unmovable memory (eg GFP_KERNEL)
> along with movable user pages so it would be self-consistent.
>
> I'm unclear if this is OK for libvirt..
>
> > Agree this is costly though. And I don't think all drivers keep the
> > array around so "iterating over *pages[]" may need to be a callback.
>
> I think searching lists of pages is not reasonable. Things like VFIO &
> KVM use cases effectively pin 90% of all system memory, that is
> potentially TB of page lists that might need linear searching!
>
> Jason
Jason Gunthorpe Feb. 23, 2023, 5:28 p.m. UTC | #35
On Thu, Feb 23, 2023 at 09:18:23AM -0800, T.J. Mercier wrote:

> > Solving that problem means figuring out when every cgroup stops using
> > the memory - pinning or not. That seems to be very costly.
> >
> This is the current behavior of accounting for memfds, and I suspect
> any kind of shared memory.
> 
> If cgroup A creates a memfd, maps and faults in pages, shares the
> memfd with cgroup B and then A unmaps and closes the memfd, then
> cgroup A is still charged for the pages it faulted in.

As we discussed, as long as the memory is swappable then eventually
memory pressure on cgroup A will evict the memfd pages and then cgroup
B will swap it in and be charged for it.
 
> FWIW this is also the behavior I was trying to use to attribute
> dma-buffers to their original allocators. Whoever touches it first
> gets charged as long as the memory is alive somewhere.
>
> Can't we do the same thing for pins?

If pins are tracked independently from memcg then definately not,
a process in cgroup A should never be able to make a charge on cgroup
B as a matter of security.

If pins are part of the memcg then we can't always turn the pin
request in to a NOP - the current cgroup always has to be charged for
the memory. Otherwise what is the point from a security perspective?

Jason
Jason Gunthorpe Feb. 23, 2023, 5:31 p.m. UTC | #36
On Thu, Feb 23, 2023 at 09:12:19AM +0000, Daniel P. Berrangé wrote:

> The memory limits we can set on VMs are somewhat limited. In general
> we prefer to avoid setting any hard per-VM memory cap by default.

So if you don't use hard limits on the memcg..

But to do this with a hard limit:
 
> We do still have to apply some tuning for VFIO, around what amount
> of memory it is allowed to lock, but that is not so bad as we just
> need to allow it to lock guest RAM which is known + an finite extra
> amount, so don't need to take account of all of QEMU's memory
> allocations in general.

Will need its own controller, right?

Jason
Yosry Ahmed Feb. 23, 2023, 6:03 p.m. UTC | #37
On Thu, Feb 23, 2023 at 9:28 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Feb 23, 2023 at 09:18:23AM -0800, T.J. Mercier wrote:
>
> > > Solving that problem means figuring out when every cgroup stops using
> > > the memory - pinning or not. That seems to be very costly.
> > >
> > This is the current behavior of accounting for memfds, and I suspect
> > any kind of shared memory.
> >
> > If cgroup A creates a memfd, maps and faults in pages, shares the
> > memfd with cgroup B and then A unmaps and closes the memfd, then
> > cgroup A is still charged for the pages it faulted in.
>
> As we discussed, as long as the memory is swappable then eventually
> memory pressure on cgroup A will evict the memfd pages and then cgroup
> B will swap it in and be charged for it.

I am not familiar with memfd, but based on
mem_cgroup_swapin_charge_folio() it seems like if cgroup B swapped in
the pages they will remain charged to cgroup A, unless cgroup A is
removed/offlined. Am I missing something?

>
> > FWIW this is also the behavior I was trying to use to attribute
> > dma-buffers to their original allocators. Whoever touches it first
> > gets charged as long as the memory is alive somewhere.
> >
> > Can't we do the same thing for pins?
>
> If pins are tracked independently from memcg then definately not,
> a process in cgroup A should never be able to make a charge on cgroup
> B as a matter of security.
>
> If pins are part of the memcg then we can't always turn the pin
> request in to a NOP - the current cgroup always has to be charged for
> the memory. Otherwise what is the point from a security perspective?
>
> Jason
Jason Gunthorpe Feb. 23, 2023, 6:10 p.m. UTC | #38
On Thu, Feb 23, 2023 at 10:03:50AM -0800, Yosry Ahmed wrote:
> On Thu, Feb 23, 2023 at 9:28 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > On Thu, Feb 23, 2023 at 09:18:23AM -0800, T.J. Mercier wrote:
> >
> > > > Solving that problem means figuring out when every cgroup stops using
> > > > the memory - pinning or not. That seems to be very costly.
> > > >
> > > This is the current behavior of accounting for memfds, and I suspect
> > > any kind of shared memory.
> > >
> > > If cgroup A creates a memfd, maps and faults in pages, shares the
> > > memfd with cgroup B and then A unmaps and closes the memfd, then
> > > cgroup A is still charged for the pages it faulted in.
> >
> > As we discussed, as long as the memory is swappable then eventually
> > memory pressure on cgroup A will evict the memfd pages and then cgroup
> > B will swap it in and be charged for it.
> 
> I am not familiar with memfd, but based on
> mem_cgroup_swapin_charge_folio() it seems like if cgroup B swapped in
> the pages they will remain charged to cgroup A, unless cgroup A is
> removed/offlined. Am I missing something?

Ah, I don't know, Tejun said:

"but it can converge when page usage transfers across cgroups
if needed."

Which I assumed was swap related but I don't know how convergence
works.

Jason
Yosry Ahmed Feb. 23, 2023, 6:14 p.m. UTC | #39
On Thu, Feb 23, 2023 at 10:11 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Feb 23, 2023 at 10:03:50AM -0800, Yosry Ahmed wrote:
> > On Thu, Feb 23, 2023 at 9:28 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > On Thu, Feb 23, 2023 at 09:18:23AM -0800, T.J. Mercier wrote:
> > >
> > > > > Solving that problem means figuring out when every cgroup stops using
> > > > > the memory - pinning or not. That seems to be very costly.
> > > > >
> > > > This is the current behavior of accounting for memfds, and I suspect
> > > > any kind of shared memory.
> > > >
> > > > If cgroup A creates a memfd, maps and faults in pages, shares the
> > > > memfd with cgroup B and then A unmaps and closes the memfd, then
> > > > cgroup A is still charged for the pages it faulted in.
> > >
> > > As we discussed, as long as the memory is swappable then eventually
> > > memory pressure on cgroup A will evict the memfd pages and then cgroup
> > > B will swap it in and be charged for it.
> >
> > I am not familiar with memfd, but based on
> > mem_cgroup_swapin_charge_folio() it seems like if cgroup B swapped in
> > the pages they will remain charged to cgroup A, unless cgroup A is
> > removed/offlined. Am I missing something?
>
> Ah, I don't know, Tejun said:
>
> "but it can converge when page usage transfers across cgroups
> if needed."
>
> Which I assumed was swap related but I don't know how convergence
> works.

I believe that's the case for file-backed pages, but I do not believe
it's the case for swap-backed pages.

>
> Jason
Tejun Heo Feb. 23, 2023, 6:15 p.m. UTC | #40
On Thu, Feb 23, 2023 at 02:10:56PM -0400, Jason Gunthorpe wrote:
> > I am not familiar with memfd, but based on
> > mem_cgroup_swapin_charge_folio() it seems like if cgroup B swapped in
> > the pages they will remain charged to cgroup A, unless cgroup A is
> > removed/offlined. Am I missing something?
> 
> Ah, I don't know, Tejun said:
> 
> "but it can converge when page usage transfers across cgroups
> if needed."
> 
> Which I assumed was swap related but I don't know how convergence
> works.

That'd work for pagecache. For swap-backed, I think Yosry is right. Is
MAP_SHARED | MAP_ANONYMOUS a concern? Such mappings can only be shared
through forking, so it's not a common thing to be shared across different
resource domains.

Thanks.
Jason Gunthorpe Feb. 23, 2023, 6:17 p.m. UTC | #41
On Thu, Feb 23, 2023 at 08:15:17AM -1000, Tejun Heo wrote:
> On Thu, Feb 23, 2023 at 02:10:56PM -0400, Jason Gunthorpe wrote:
> > > I am not familiar with memfd, but based on
> > > mem_cgroup_swapin_charge_folio() it seems like if cgroup B swapped in
> > > the pages they will remain charged to cgroup A, unless cgroup A is
> > > removed/offlined. Am I missing something?
> > 
> > Ah, I don't know, Tejun said:
> > 
> > "but it can converge when page usage transfers across cgroups
> > if needed."
> > 
> > Which I assumed was swap related but I don't know how convergence
> > works.
> 
> That'd work for pagecache. For swap-backed, I think Yosry is right. Is
> MAP_SHARED | MAP_ANONYMOUS a concern? Such mappings can only be shared
> through forking, so it's not a common thing to be shared across different
> resource domains.

Isn't memfd also in the same boat?

Jason
Tejun Heo Feb. 23, 2023, 6:22 p.m. UTC | #42
On Thu, Feb 23, 2023 at 02:17:18PM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 23, 2023 at 08:15:17AM -1000, Tejun Heo wrote:
> > On Thu, Feb 23, 2023 at 02:10:56PM -0400, Jason Gunthorpe wrote:
> > > > I am not familiar with memfd, but based on
> > > > mem_cgroup_swapin_charge_folio() it seems like if cgroup B swapped in
> > > > the pages they will remain charged to cgroup A, unless cgroup A is
> > > > removed/offlined. Am I missing something?
> > > 
> > > Ah, I don't know, Tejun said:
> > > 
> > > "but it can converge when page usage transfers across cgroups
> > > if needed."
> > > 
> > > Which I assumed was swap related but I don't know how convergence
> > > works.
> > 
> > That'd work for pagecache. For swap-backed, I think Yosry is right. Is
> > MAP_SHARED | MAP_ANONYMOUS a concern? Such mappings can only be shared
> > through forking, so it's not a common thing to be shared across different
> > resource domains.
> 
> Isn't memfd also in the same boat?

I see. Yeah, that's looks like named shared anon. The first one
instantiating a page would always be the owner.

Thanks.
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index f781f93..f8526e2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5381,6 +5381,13 @@  F:	tools/testing/selftests/cgroup/memcg_protection.m
 F:	tools/testing/selftests/cgroup/test_kmem.c
 F:	tools/testing/selftests/cgroup/test_memcontrol.c
 
+CONTROL GROUP - PINNED AND LOCKED MEMORY
+M:	Alistair Popple <apopple@nvidia.com>
+L:	cgroups@vger.kernel.org
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	mm/pins_cgroup.c
+
 CORETEMP HARDWARE MONITORING DRIVER
 M:	Fenghua Yu <fenghua.yu@intel.com>
 L:	linux-hwmon@vger.kernel.org
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3410aec..d98de25 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -857,4 +857,24 @@  static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 #endif /* CONFIG_CGROUP_BPF */
 
+#ifdef CONFIG_CGROUP_PINS
+
+struct pins_cgroup *get_pins_cg(struct task_struct *task);
+void put_pins_cg(struct pins_cgroup *cg);
+void pins_uncharge(struct pins_cgroup *pins, int num);
+bool pins_try_charge(struct pins_cgroup *pins, int num);
+
+#else /* CONFIG_CGROUP_PINS */
+
+static inline struct pins_cgroup *get_pins_cg(struct task_struct *task)
+{
+	return NULL;
+}
+
+static inline void put_pins_cg(struct pins_cgroup *cg) {}
+static inline void pins_uncharge(struct pins_cgroup *pins, int num) {}
+static inline bool pins_try_charge(struct pins_cgroup *pins, int num) { return 1; }
+
+#endif /* CONFIG_CGROUP_PINS */
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 4452354..c1b4aab 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,6 +65,10 @@  SUBSYS(rdma)
 SUBSYS(misc)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_PINS)
+SUBSYS(pins)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209..4472043 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1183,6 +1183,17 @@  config LRU_GEN_STATS
 	  This option has a per-memcg and per-node memory overhead.
 # }
 
+config CGROUP_PINS
+    bool "Cgroup controller for pinned and locked memory"
+    depends on CGROUPS
+    default y
+    help
+      Having too much memory pinned or locked can lead to system
+      instability due to increased likelihood of encountering
+      out-of-memory conditions. Select this option to enable a cgroup
+      controller which can be used to limit the overall number of
+      pages procceses in a cgroup may lock or have pinned by drivers.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5..81db189 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -138,3 +138,4 @@  obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_CGROUP_PINS) += pins_cgroup.o
diff --git a/mm/pins_cgroup.c b/mm/pins_cgroup.c
new file mode 100644
index 0000000..2d8c6c7
--- /dev/null
+++ b/mm/pins_cgroup.c
@@ -0,0 +1,273 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Controller for cgroups limiting number of pages pinned for FOLL_LONGETERM.
+ *
+ * Copyright (C) 2022 Alistair Popple <apopple@nvidia.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+
+#define PINS_MAX (-1ULL)
+#define PINS_MAX_STR "max"
+
+struct pins_cgroup {
+	struct cgroup_subsys_state	css;
+
+	atomic64_t			counter;
+	atomic64_t			limit;
+
+	struct cgroup_file		events_file;
+	atomic64_t			events_limit;
+};
+
+static struct pins_cgroup *css_pins(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct pins_cgroup, css);
+}
+
+static struct pins_cgroup *parent_pins(struct pins_cgroup *pins)
+{
+	return css_pins(pins->css.parent);
+}
+
+struct pins_cgroup *get_pins_cg(struct task_struct *task)
+{
+	return css_pins(task_get_css(task, pins_cgrp_id));
+}
+
+void put_pins_cg(struct pins_cgroup *cg)
+{
+	css_put(&cg->css);
+}
+
+static struct cgroup_subsys_state *
+pins_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct pins_cgroup *pins;
+
+	pins = kzalloc(sizeof(struct pins_cgroup), GFP_KERNEL);
+	if (!pins)
+		return ERR_PTR(-ENOMEM);
+
+	atomic64_set(&pins->counter, 0);
+	atomic64_set(&pins->limit, PINS_MAX);
+	atomic64_set(&pins->events_limit, 0);
+	return &pins->css;
+}
+
+static void pins_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_pins(css));
+}
+
+/**
+ * pins_cancel - uncharge the local pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to cancel
+ *
+ * This function will WARN if the pin count goes under 0, because such a case is
+ * a bug in the pins controller proper.
+ */
+void pins_cancel(struct pins_cgroup *pins, int num)
+{
+	/*
+	 * A negative count (or overflow for that matter) is invalid,
+	 * and indicates a bug in the `pins` controller proper.
+	 */
+	WARN_ON_ONCE(atomic64_add_negative(-num, &pins->counter));
+}
+
+/**
+ * pins_uncharge - hierarchically uncharge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to uncharge
+ */
+void pins_uncharge(struct pins_cgroup *pins, int num)
+{
+	struct pins_cgroup *p;
+
+	for (p = pins; parent_pins(p); p = parent_pins(p))
+		pins_cancel(p, num);
+}
+
+/**
+ * pins_charge - hierarchically charge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to charge
+ *
+ * This function does *not* follow the pin limit set. It cannot fail and the new
+ * pin count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pins_charge(struct pins_cgroup *pins, int num)
+{
+	struct pins_cgroup *p;
+
+	for (p = pins; parent_pins(p); p = parent_pins(p))
+		atomic64_add(num, &p->counter);
+}
+
+/**
+ * pins_try_charge - hierarchically try to charge the pin count
+ * @pins: the pin cgroup state
+ * @num: the number of pins to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns true if the charge
+ * succeeded, false otherwise.
+ */
+bool pins_try_charge(struct pins_cgroup *pins, int num)
+{
+	struct pins_cgroup *p, *q;
+
+	for (p = pins; parent_pins(p); p = parent_pins(p)) {
+		uint64_t new = atomic64_add_return(num, &p->counter);
+		uint64_t limit = atomic64_read(&p->limit);
+
+		if (limit != PINS_MAX && new > limit)
+			goto revert;
+	}
+
+	return true;
+
+revert:
+	for (q = pins; q != p; q = parent_pins(q))
+		pins_cancel(q, num);
+	pins_cancel(p, num);
+
+	return false;
+}
+
+static int pins_can_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *dst_css;
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, dst_css, tset) {
+		struct pins_cgroup *pins = css_pins(dst_css);
+		struct cgroup_subsys_state *old_css;
+		struct pins_cgroup *old_pins;
+
+		old_css = task_css(task, pins_cgrp_id);
+		old_pins = css_pins(old_css);
+
+		pins_charge(pins, task->mm->locked_vm);
+		pins_uncharge(old_pins, task->mm->locked_vm);
+	}
+
+	return 0;
+}
+
+static void pins_cancel_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *dst_css;
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, dst_css, tset) {
+		struct pins_cgroup *pins = css_pins(dst_css);
+		struct cgroup_subsys_state *old_css;
+		struct pins_cgroup *old_pins;
+
+		old_css = task_css(task, pins_cgrp_id);
+		old_pins = css_pins(old_css);
+
+		pins_charge(old_pins, task->mm->locked_vm);
+		pins_uncharge(pins, task->mm->locked_vm);
+	}
+}
+
+
+static ssize_t pins_max_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct pins_cgroup *pins = css_pins(css);
+	uint64_t limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PINS_MAX_STR)) {
+		limit = PINS_MAX;
+		goto set_limit;
+	}
+
+	err = kstrtoll(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit < 0 || limit >= PINS_MAX)
+		return -EINVAL;
+
+set_limit:
+	/*
+	 * Limit updates don't need to be mutex'd, since it isn't
+	 * critical that any racing fork()s follow the new limit.
+	 */
+	atomic64_set(&pins->limit, limit);
+	return nbytes;
+}
+
+static int pins_max_show(struct seq_file *sf, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(sf);
+	struct pins_cgroup *pins = css_pins(css);
+	uint64_t limit = atomic64_read(&pins->limit);
+
+	if (limit >= PINS_MAX)
+		seq_printf(sf, "%s\n", PINS_MAX_STR);
+	else
+		seq_printf(sf, "%lld\n", limit);
+
+	return 0;
+}
+
+static s64 pins_current_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	struct pins_cgroup *pins = css_pins(css);
+
+	return atomic64_read(&pins->counter);
+}
+
+static int pins_events_show(struct seq_file *sf, void *v)
+{
+	struct pins_cgroup *pins = css_pins(seq_css(sf));
+
+	seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pins->events_limit));
+	return 0;
+}
+
+static struct cftype pins_files[] = {
+	{
+		.name = "max",
+		.write = pins_max_write,
+		.seq_show = pins_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.read_s64 = pins_current_read,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "events",
+		.seq_show = pins_events_show,
+		.file_offset = offsetof(struct pins_cgroup, events_file),
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys pins_cgrp_subsys = {
+	.css_alloc = pins_css_alloc,
+	.css_free = pins_css_free,
+	.legacy_cftypes = pins_files,
+	.dfl_cftypes = pins_files,
+	.can_attach = pins_can_attach,
+	.cancel_attach = pins_cancel_attach,
+};