diff mbox series

mm, memcg: cg2 memory{.swap,}.peak write handlers

Message ID 20240715203625.1462309-2-davidf@vimeo.com (mailing list archive)
State New
Headers show
Series mm, memcg: cg2 memory{.swap,}.peak write handlers | expand

Commit Message

David Finkel July 15, 2024, 8:36 p.m. UTC
Other mechanisms for querying the peak memory usage of either a process
or v1 memory cgroup allow for resetting the high watermark. Restore
parity with those mechanisms.

For example:
 - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
   the high watermark.
 - writing "5" to the clear_refs pseudo-file in a process's proc
   directory resets the peak RSS.

This change copies the cgroup v1 behavior so any write to the
memory.peak and memory.swap.peak pseudo-files resets the high watermark
to the current usage.

This behavior is particularly useful for work scheduling systems that
need to track memory usage of worker processes/cgroups per-work-item.
Since memory can't be squeezed like CPU can (the OOM-killer has
opinions), these systems need to track the peak memory usage to compute
system/container fullness when bin-packing work items.

Signed-off-by: David Finkel <davidf@vimeo.com>
---
 Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
 mm/memcontrol.c                               | 23 ++++++
 .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
 3 files changed, 99 insertions(+), 16 deletions(-)

Comments

David Finkel July 15, 2024, 8:42 p.m. UTC | #1
Note: this is a simple rebase of a patch I sent a few months ago,
which received two acks before the thread petered out:
https://www.spinics.net/lists/cgroups/msg40602.html

Thanks,

On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
>
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms.
>
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
>
> This change copies the cgroup v1 behavior so any write to the
> memory.peak and memory.swap.peak pseudo-files reset the high watermark
> to the current usage.
>
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.
>
> Signed-off-by: David Finkel <davidf@vimeo.com>
> ---
>  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
>  mm/memcontrol.c                               | 23 ++++++
>  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
>  3 files changed, 99 insertions(+), 16 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 8fbb0519d556..201d8e5d9f82 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
>         reclaim induced by memory.reclaim.
>
>    memory.peak
> -       A read-only single value file which exists on non-root
> -       cgroups.
> +       A read-write single value file which exists on non-root cgroups.
> +
> +       The max memory usage recorded for the cgroup and its descendants since
> +       either the creation of the cgroup or the most recent reset.
>
> -       The max memory usage recorded for the cgroup and its
> -       descendants since the creation of the cgroup.
> +       Any non-empty write to this file resets it to the current memory usage.
> +       All content written is completely ignored.
>
>    memory.oom.group
>         A read-write single value file which exists on non-root
> @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
>         Healthy workloads are not expected to reach this limit.
>
>    memory.swap.peak
> -       A read-only single value file which exists on non-root
> -       cgroups.
> +       A read-write single value file which exists on non-root cgroups.
> +
> +       The max swap usage recorded for the cgroup and its descendants since
> +       the creation of the cgroup or the most recent reset.
>
> -       The max swap usage recorded for the cgroup and its
> -       descendants since the creation of the cgroup.
> +       Any non-empty write to this file resets it to the current swap usage.
> +       All content written is completely ignored.
>
>    memory.swap.max
>         A read-write single value file which exists on non-root
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8f2f1bb18c9c..abfa547615d6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -25,6 +25,7 @@
>   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
>   */
>
> +#include <linux/cgroup-defs.h>
>  #include <linux/page_counter.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
> @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
>         return (u64)memcg->memory.watermark * PAGE_SIZE;
>  }
>
> +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> +                                char *buf, size_t nbytes, loff_t off)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +
> +       page_counter_reset_watermark(&memcg->memory);
> +
> +       return nbytes;
> +}
> +
>  static int memory_min_show(struct seq_file *m, void *v)
>  {
>         return seq_puts_memcg_tunable(m,
> @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
>                 .name = "peak",
>                 .flags = CFTYPE_NOT_ON_ROOT,
>                 .read_u64 = memory_peak_read,
> +               .write = memory_peak_write,
>         },
>         {
>                 .name = "min",
> @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
>         return (u64)memcg->swap.watermark * PAGE_SIZE;
>  }
>
> +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> +                                char *buf, size_t nbytes, loff_t off)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +
> +       page_counter_reset_watermark(&memcg->swap);
> +
> +       return nbytes;
> +}
> +
>  static int swap_high_show(struct seq_file *m, void *v)
>  {
>         return seq_puts_memcg_tunable(m,
> @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
>                 .name = "swap.peak",
>                 .flags = CFTYPE_NOT_ON_ROOT,
>                 .read_u64 = swap_peak_read,
> +               .write = swap_peak_write,
>         },
>         {
>                 .name = "swap.events",
> diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> index 41ae8047b889..681972de673b 100644
> --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
>  /*
>   * This test create a memory cgroup, allocates
>   * some anonymous memory and some pagecache
> - * and check memory.current and some memory.stat values.
> + * and checks memory.current, memory.peak, and some memory.stat values.
>   */
> -static int test_memcg_current(const char *root)
> +static int test_memcg_current_peak(const char *root)
>  {
>         int ret = KSFT_FAIL;
> -       long current;
> +       long current, peak, peak_reset;
>         char *memcg;
>
>         memcg = cg_name(root, "memcg_test");
> @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
>         if (current != 0)
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak != 0)
> +               goto cleanup;
> +
>         if (cg_run(memcg, alloc_anon_50M_check, NULL))
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(50))
> +               goto cleanup;
> +
> +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> +       if (peak_reset != 0)
> +               goto cleanup;
> +
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak > MB(30))
> +               goto cleanup;
> +
>         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(50))
> +               goto cleanup;
> +
>         ret = KSFT_PASS;
>
>  cleanup:
> @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
>
>  /*
>   * This test checks that memory.swap.max limits the amount of
> - * anonymous memory which can be swapped out.
> + * anonymous memory which can be swapped out. Additionally, it verifies that
> + * memory.swap.peak reflects the high watermark and can be reset.
>   */
> -static int test_memcg_swap_max(const char *root)
> +static int test_memcg_swap_max_peak(const char *root)
>  {
>         int ret = KSFT_FAIL;
>         char *memcg;
> -       long max;
> +       long max, peak;
>
>         if (!is_swap_enabled())
>                 return KSFT_SKIP;
> @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
>                 goto cleanup;
>         }
>
> +       if (cg_read_long(memcg, "memory.swap.peak"))
> +               goto cleanup;
> +
> +       if (cg_read_long(memcg, "memory.peak"))
> +               goto cleanup;
> +
>         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
>                 goto cleanup;
>
> @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
>         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(29))
> +               goto cleanup;
> +
> +       peak = cg_read_long(memcg, "memory.swap.peak");
> +       if (peak < MB(29))
> +               goto cleanup;
> +
> +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> +               goto cleanup;
> +
> +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> +               goto cleanup;
> +
> +
> +       if (cg_write(memcg, "memory.peak", "\n"))
> +               goto cleanup;
> +
> +       if (cg_read_long(memcg, "memory.peak"))
> +               goto cleanup;
> +
>         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
>                 goto cleanup;
>
> @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
>         if (max <= 0)
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(29))
> +               goto cleanup;
> +
> +       peak = cg_read_long(memcg, "memory.swap.peak");
> +       if (peak < MB(19))
> +               goto cleanup;
> +
>         ret = KSFT_PASS;
>
>  cleanup:
> @@ -1295,7 +1351,7 @@ struct memcg_test {
>         const char *name;
>  } tests[] = {
>         T(test_memcg_subtree_control),
> -       T(test_memcg_current),
> +       T(test_memcg_current_peak),
>         T(test_memcg_min),
>         T(test_memcg_low),
>         T(test_memcg_high),
> @@ -1303,7 +1359,7 @@ struct memcg_test {
>         T(test_memcg_max),
>         T(test_memcg_reclaim),
>         T(test_memcg_oom_events),
> -       T(test_memcg_swap_max),
> +       T(test_memcg_swap_max_peak),
>         T(test_memcg_sock),
>         T(test_memcg_oom_group_leaf_events),
>         T(test_memcg_oom_group_parent_events),
> --
> 2.40.1
>
David Finkel July 15, 2024, 8:46 p.m. UTC | #2
[Fixing Shakeel's email address (I didn't notice it changed when
comparing my previous "git send-email" command line and
get_maintainer.pl output)]

On Mon, Jul 15, 2024 at 4:42 PM David Finkel <davidf@vimeo.com> wrote:
>
> Note: this is a simple rebase of a patch I sent a few months ago,
> which received two acks before the thread petered out:
> https://www.spinics.net/lists/cgroups/msg40602.html
>
> Thanks,
>
> On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> >
> > Other mechanisms for querying the peak memory usage of either a process
> > or v1 memory cgroup allow for resetting the high watermark. Restore
> > parity with those mechanisms.
> >
> > For example:
> >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> >    the high watermark.
> >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> >    directory resets the peak RSS.
> >
> > This change copies the cgroup v1 behavior so any write to the
> > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > to the current usage.
> >
> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.
> >
> > Signed-off-by: David Finkel <davidf@vimeo.com>
> > ---
> >  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
> >  mm/memcontrol.c                               | 23 ++++++
> >  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
> >  3 files changed, 99 insertions(+), 16 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > index 8fbb0519d556..201d8e5d9f82 100644
> > --- a/Documentation/admin-guide/cgroup-v2.rst
> > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
> >         reclaim induced by memory.reclaim.
> >
> >    memory.peak
> > -       A read-only single value file which exists on non-root
> > -       cgroups.
> > +       A read-write single value file which exists on non-root cgroups.
> > +
> > +       The max memory usage recorded for the cgroup and its descendants since
> > +       either the creation of the cgroup or the most recent reset.
> >
> > -       The max memory usage recorded for the cgroup and its
> > -       descendants since the creation of the cgroup.
> > +       Any non-empty write to this file resets it to the current memory usage.
> > +       All content written is completely ignored.
> >
> >    memory.oom.group
> >         A read-write single value file which exists on non-root
> > @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
> >         Healthy workloads are not expected to reach this limit.
> >
> >    memory.swap.peak
> > -       A read-only single value file which exists on non-root
> > -       cgroups.
> > +       A read-write single value file which exists on non-root cgroups.
> > +
> > +       The max swap usage recorded for the cgroup and its descendants since
> > +       the creation of the cgroup or the most recent reset.
> >
> > -       The max swap usage recorded for the cgroup and its
> > -       descendants since the creation of the cgroup.
> > +       Any non-empty write to this file resets it to the current swap usage.
> > +       All content written is completely ignored.
> >
> >    memory.swap.max
> >         A read-write single value file which exists on non-root
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 8f2f1bb18c9c..abfa547615d6 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -25,6 +25,7 @@
> >   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
> >   */
> >
> > +#include <linux/cgroup-defs.h>
> >  #include <linux/page_counter.h>
> >  #include <linux/memcontrol.h>
> >  #include <linux/cgroup.h>
> > @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
> >         return (u64)memcg->memory.watermark * PAGE_SIZE;
> >  }
> >
> > +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> > +                                char *buf, size_t nbytes, loff_t off)
> > +{
> > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > +
> > +       page_counter_reset_watermark(&memcg->memory);
> > +
> > +       return nbytes;
> > +}
> > +
> >  static int memory_min_show(struct seq_file *m, void *v)
> >  {
> >         return seq_puts_memcg_tunable(m,
> > @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
> >                 .name = "peak",
> >                 .flags = CFTYPE_NOT_ON_ROOT,
> >                 .read_u64 = memory_peak_read,
> > +               .write = memory_peak_write,
> >         },
> >         {
> >                 .name = "min",
> > @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
> >         return (u64)memcg->swap.watermark * PAGE_SIZE;
> >  }
> >
> > +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> > +                                char *buf, size_t nbytes, loff_t off)
> > +{
> > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > +
> > +       page_counter_reset_watermark(&memcg->swap);
> > +
> > +       return nbytes;
> > +}
> > +
> >  static int swap_high_show(struct seq_file *m, void *v)
> >  {
> >         return seq_puts_memcg_tunable(m,
> > @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
> >                 .name = "swap.peak",
> >                 .flags = CFTYPE_NOT_ON_ROOT,
> >                 .read_u64 = swap_peak_read,
> > +               .write = swap_peak_write,
> >         },
> >         {
> >                 .name = "swap.events",
> > diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> > index 41ae8047b889..681972de673b 100644
> > --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> > +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> > @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
> >  /*
> >   * This test create a memory cgroup, allocates
> >   * some anonymous memory and some pagecache
> > - * and check memory.current and some memory.stat values.
> > + * and checks memory.current, memory.peak, and some memory.stat values.
> >   */
> > -static int test_memcg_current(const char *root)
> > +static int test_memcg_current_peak(const char *root)
> >  {
> >         int ret = KSFT_FAIL;
> > -       long current;
> > +       long current, peak, peak_reset;
> >         char *memcg;
> >
> >         memcg = cg_name(root, "memcg_test");
> > @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
> >         if (current != 0)
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak != 0)
> > +               goto cleanup;
> > +
> >         if (cg_run(memcg, alloc_anon_50M_check, NULL))
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(50))
> > +               goto cleanup;
> > +
> > +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> > +       if (peak_reset != 0)
> > +               goto cleanup;
> > +
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak > MB(30))
> > +               goto cleanup;
> > +
> >         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(50))
> > +               goto cleanup;
> > +
> >         ret = KSFT_PASS;
> >
> >  cleanup:
> > @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
> >
> >  /*
> >   * This test checks that memory.swap.max limits the amount of
> > - * anonymous memory which can be swapped out.
> > + * anonymous memory which can be swapped out. Additionally, it verifies that
> > + * memory.swap.peak reflects the high watermark and can be reset.
> >   */
> > -static int test_memcg_swap_max(const char *root)
> > +static int test_memcg_swap_max_peak(const char *root)
> >  {
> >         int ret = KSFT_FAIL;
> >         char *memcg;
> > -       long max;
> > +       long max, peak;
> >
> >         if (!is_swap_enabled())
> >                 return KSFT_SKIP;
> > @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
> >                 goto cleanup;
> >         }
> >
> > +       if (cg_read_long(memcg, "memory.swap.peak"))
> > +               goto cleanup;
> > +
> > +       if (cg_read_long(memcg, "memory.peak"))
> > +               goto cleanup;
> > +
> >         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
> >                 goto cleanup;
> >
> > @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
> >         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(29))
> > +               goto cleanup;
> > +
> > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > +       if (peak < MB(29))
> > +               goto cleanup;
> > +
> > +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> > +               goto cleanup;
> > +
> > +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> > +               goto cleanup;
> > +
> > +
> > +       if (cg_write(memcg, "memory.peak", "\n"))
> > +               goto cleanup;
> > +
> > +       if (cg_read_long(memcg, "memory.peak"))
> > +               goto cleanup;
> > +
> >         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
> >                 goto cleanup;
> >
> > @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
> >         if (max <= 0)
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(29))
> > +               goto cleanup;
> > +
> > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > +       if (peak < MB(19))
> > +               goto cleanup;
> > +
> >         ret = KSFT_PASS;
> >
> >  cleanup:
> > @@ -1295,7 +1351,7 @@ struct memcg_test {
> >         const char *name;
> >  } tests[] = {
> >         T(test_memcg_subtree_control),
> > -       T(test_memcg_current),
> > +       T(test_memcg_current_peak),
> >         T(test_memcg_min),
> >         T(test_memcg_low),
> >         T(test_memcg_high),
> > @@ -1303,7 +1359,7 @@ struct memcg_test {
> >         T(test_memcg_max),
> >         T(test_memcg_reclaim),
> >         T(test_memcg_oom_events),
> > -       T(test_memcg_swap_max),
> > +       T(test_memcg_swap_max_peak),
> >         T(test_memcg_sock),
> >         T(test_memcg_oom_group_leaf_events),
> >         T(test_memcg_oom_group_parent_events),
> > --
> > 2.40.1
> >
>
>
> --
> David Finkel
> Senior Principal Software Engineer, Core Services
Michal Hocko July 16, 2024, 7:20 a.m. UTC | #3
On Mon 15-07-24 16:46:36, David Finkel wrote:
> > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > >
> > > Other mechanisms for querying the peak memory usage of either a process
> > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > parity with those mechanisms.
> > >
> > > For example:
> > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > >    the high watermark.
> > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > >    directory resets the peak RSS.
> > >
> > > This change copies the cgroup v1 behavior so any write to the
> > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > to the current usage.
> > >
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions),

I do not understand the OOM-killer reference here. Why does it matter?
Could you explain please?

> > > these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.

Could you elaborate some more on how you are using this please? I expect
you recycle memcgs for different runs of workers and reset peak
consumptions before a new run and record it after it is done. The thing
which is not really clear to me is how the peak value really helps if it
can vary a lot among different runs. But maybe I misunderstand.

> > >
> > > Signed-off-by: David Finkel <davidf@vimeo.com>
> > > ---
> > >  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
> > >  mm/memcontrol.c                               | 23 ++++++
> > >  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
> > >  3 files changed, 99 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > > index 8fbb0519d556..201d8e5d9f82 100644
> > > --- a/Documentation/admin-guide/cgroup-v2.rst
> > > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > > @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
> > >         reclaim induced by memory.reclaim.
> > >
> > >    memory.peak
> > > -       A read-only single value file which exists on non-root
> > > -       cgroups.
> > > +       A read-write single value file which exists on non-root cgroups.
> > > +
> > > +       The max memory usage recorded for the cgroup and its descendants since
> > > +       either the creation of the cgroup or the most recent reset.
> > >
> > > -       The max memory usage recorded for the cgroup and its
> > > -       descendants since the creation of the cgroup.
> > > +       Any non-empty write to this file resets it to the current memory usage.
> > > +       All content written is completely ignored.
> > >
> > >    memory.oom.group
> > >         A read-write single value file which exists on non-root
> > > @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
> > >         Healthy workloads are not expected to reach this limit.
> > >
> > >    memory.swap.peak
> > > -       A read-only single value file which exists on non-root
> > > -       cgroups.
> > > +       A read-write single value file which exists on non-root cgroups.
> > > +
> > > +       The max swap usage recorded for the cgroup and its descendants since
> > > +       the creation of the cgroup or the most recent reset.
> > >
> > > -       The max swap usage recorded for the cgroup and its
> > > -       descendants since the creation of the cgroup.
> > > +       Any non-empty write to this file resets it to the current swap usage.
> > > +       All content written is completely ignored.
> > >
> > >    memory.swap.max
> > >         A read-write single value file which exists on non-root
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 8f2f1bb18c9c..abfa547615d6 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -25,6 +25,7 @@
> > >   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
> > >   */
> > >
> > > +#include <linux/cgroup-defs.h>
> > >  #include <linux/page_counter.h>
> > >  #include <linux/memcontrol.h>
> > >  #include <linux/cgroup.h>
> > > @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
> > >         return (u64)memcg->memory.watermark * PAGE_SIZE;
> > >  }
> > >
> > > +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> > > +                                char *buf, size_t nbytes, loff_t off)
> > > +{
> > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > +
> > > +       page_counter_reset_watermark(&memcg->memory);
> > > +
> > > +       return nbytes;
> > > +}
> > > +
> > >  static int memory_min_show(struct seq_file *m, void *v)
> > >  {
> > >         return seq_puts_memcg_tunable(m,
> > > @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
> > >                 .name = "peak",
> > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > >                 .read_u64 = memory_peak_read,
> > > +               .write = memory_peak_write,
> > >         },
> > >         {
> > >                 .name = "min",
> > > @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
> > >         return (u64)memcg->swap.watermark * PAGE_SIZE;
> > >  }
> > >
> > > +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> > > +                                char *buf, size_t nbytes, loff_t off)
> > > +{
> > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > +
> > > +       page_counter_reset_watermark(&memcg->swap);
> > > +
> > > +       return nbytes;
> > > +}
> > > +
> > >  static int swap_high_show(struct seq_file *m, void *v)
> > >  {
> > >         return seq_puts_memcg_tunable(m,
> > > @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
> > >                 .name = "swap.peak",
> > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > >                 .read_u64 = swap_peak_read,
> > > +               .write = swap_peak_write,
> > >         },
> > >         {
> > >                 .name = "swap.events",
> > > diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > index 41ae8047b889..681972de673b 100644
> > > --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> > > +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
> > >  /*
> > >   * This test create a memory cgroup, allocates
> > >   * some anonymous memory and some pagecache
> > > - * and check memory.current and some memory.stat values.
> > > + * and checks memory.current, memory.peak, and some memory.stat values.
> > >   */
> > > -static int test_memcg_current(const char *root)
> > > +static int test_memcg_current_peak(const char *root)
> > >  {
> > >         int ret = KSFT_FAIL;
> > > -       long current;
> > > +       long current, peak, peak_reset;
> > >         char *memcg;
> > >
> > >         memcg = cg_name(root, "memcg_test");
> > > @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
> > >         if (current != 0)
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak != 0)
> > > +               goto cleanup;
> > > +
> > >         if (cg_run(memcg, alloc_anon_50M_check, NULL))
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(50))
> > > +               goto cleanup;
> > > +
> > > +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> > > +       if (peak_reset != 0)
> > > +               goto cleanup;
> > > +
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak > MB(30))
> > > +               goto cleanup;
> > > +
> > >         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(50))
> > > +               goto cleanup;
> > > +
> > >         ret = KSFT_PASS;
> > >
> > >  cleanup:
> > > @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
> > >
> > >  /*
> > >   * This test checks that memory.swap.max limits the amount of
> > > - * anonymous memory which can be swapped out.
> > > + * anonymous memory which can be swapped out. Additionally, it verifies that
> > > + * memory.swap.peak reflects the high watermark and can be reset.
> > >   */
> > > -static int test_memcg_swap_max(const char *root)
> > > +static int test_memcg_swap_max_peak(const char *root)
> > >  {
> > >         int ret = KSFT_FAIL;
> > >         char *memcg;
> > > -       long max;
> > > +       long max, peak;
> > >
> > >         if (!is_swap_enabled())
> > >                 return KSFT_SKIP;
> > > @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
> > >                 goto cleanup;
> > >         }
> > >
> > > +       if (cg_read_long(memcg, "memory.swap.peak"))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_read_long(memcg, "memory.peak"))
> > > +               goto cleanup;
> > > +
> > >         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
> > >                 goto cleanup;
> > >
> > > @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
> > >         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(29))
> > > +               goto cleanup;
> > > +
> > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > +       if (peak < MB(29))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> > > +               goto cleanup;
> > > +
> > > +
> > > +       if (cg_write(memcg, "memory.peak", "\n"))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_read_long(memcg, "memory.peak"))
> > > +               goto cleanup;
> > > +
> > >         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
> > >                 goto cleanup;
> > >
> > > @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
> > >         if (max <= 0)
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(29))
> > > +               goto cleanup;
> > > +
> > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > +       if (peak < MB(19))
> > > +               goto cleanup;
> > > +
> > >         ret = KSFT_PASS;
> > >
> > >  cleanup:
> > > @@ -1295,7 +1351,7 @@ struct memcg_test {
> > >         const char *name;
> > >  } tests[] = {
> > >         T(test_memcg_subtree_control),
> > > -       T(test_memcg_current),
> > > +       T(test_memcg_current_peak),
> > >         T(test_memcg_min),
> > >         T(test_memcg_low),
> > >         T(test_memcg_high),
> > > @@ -1303,7 +1359,7 @@ struct memcg_test {
> > >         T(test_memcg_max),
> > >         T(test_memcg_reclaim),
> > >         T(test_memcg_oom_events),
> > > -       T(test_memcg_swap_max),
> > > +       T(test_memcg_swap_max_peak),
> > >         T(test_memcg_sock),
> > >         T(test_memcg_oom_group_leaf_events),
> > >         T(test_memcg_oom_group_parent_events),
> > > --
> > > 2.40.1
> > >
> >
> >
> > --
> > David Finkel
> > Senior Principal Software Engineer, Core Services
> 
> 
> 
> -- 
> David Finkel
> Senior Principal Software Engineer, Core Services
David Finkel July 16, 2024, 12:47 p.m. UTC | #4
On Tue, Jul 16, 2024 at 3:20 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 15-07-24 16:46:36, David Finkel wrote:
> > > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > > >
> > > > Other mechanisms for querying the peak memory usage of either a process
> > > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > > parity with those mechanisms.
> > > >
> > > > For example:
> > > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > > >    the high watermark.
> > > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > > >    directory resets the peak RSS.
> > > >
> > > > This change copies the cgroup v1 behavior so any write to the
> > > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > > to the current usage.
> > > >
> > > > This behavior is particularly useful for work scheduling systems that
> > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > opinions),
>
> I do not understand the OOM-killer reference here. Why does it matter?
> Could you explain please?

Sure, we're attempting to bin-pack work based on past items of the same type.
With CPU, we can provision for the mean CPU-time per-wall-time to get
a loose "cores"
concept that we use for binpacking. With CPU, if we end up with a bit
of contention,
everything just gets a bit slower while the scheduler arbitrates among cgroups.

However, with memory, you only have so much physical memory for the outer memcg.
If we pack things too tightly on memory, the OOM-killer is going to kill
something to free up memory. In some cases that's fine, but provisioning for the
peak memory for that "type" of work-item mostly avoids this issue.

My apologies. I should have reworded this sentence before resending.
(there was a question about it last time, too)


>
> > > > these systems need to track the peak memory usage to compute
> > > > system/container fullness when binpacking workitems.
>
> Could you elaborate some more on how you are using this please? I expect
> you recycle memcgs for different runs of workers and reset peak
> consumptions before a new run and record it after it is done. The thing
> which is not really clear to me is how the peak value really helps if it
> can vary a lot among different runs. But maybe I misunderstand.
>

That's mostly correct. The workers are long-lived and will handle many
work-items over their lifetimes (to amortize startup overheads).
The particular system that uses this classifies work in "queues", which
can be loosely assumed to use the same resources between runs, since
they're doing similar work.

To mitigate the effect of outliers, we take a high quantile of the peak memory
consumed by work items within a queue when estimating the memory dimension
to binpack future work-items.

> > > >
> > > > Signed-off-by: David Finkel <davidf@vimeo.com>
> > > > ---
> > > >  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
> > > >  mm/memcontrol.c                               | 23 ++++++
> > > >  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
> > > >  3 files changed, 99 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > > > index 8fbb0519d556..201d8e5d9f82 100644
> > > > --- a/Documentation/admin-guide/cgroup-v2.rst
> > > > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > > > @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
> > > >         reclaim induced by memory.reclaim.
> > > >
> > > >    memory.peak
> > > > -       A read-only single value file which exists on non-root
> > > > -       cgroups.
> > > > +       A read-write single value file which exists on non-root cgroups.
> > > > +
> > > > +       The max memory usage recorded for the cgroup and its descendants since
> > > > +       either the creation of the cgroup or the most recent reset.
> > > >
> > > > -       The max memory usage recorded for the cgroup and its
> > > > -       descendants since the creation of the cgroup.
> > > > +       Any non-empty write to this file resets it to the current memory usage.
> > > > +       All content written is completely ignored.
> > > >
> > > >    memory.oom.group
> > > >         A read-write single value file which exists on non-root
> > > > @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
> > > >         Healthy workloads are not expected to reach this limit.
> > > >
> > > >    memory.swap.peak
> > > > -       A read-only single value file which exists on non-root
> > > > -       cgroups.
> > > > +       A read-write single value file which exists on non-root cgroups.
> > > > +
> > > > +       The max swap usage recorded for the cgroup and its descendants since
> > > > +       the creation of the cgroup or the most recent reset.
> > > >
> > > > -       The max swap usage recorded for the cgroup and its
> > > > -       descendants since the creation of the cgroup.
> > > > +       Any non-empty write to this file resets it to the current swap usage.
> > > > +       All content written is completely ignored.
> > > >
> > > >    memory.swap.max
> > > >         A read-write single value file which exists on non-root
> > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > index 8f2f1bb18c9c..abfa547615d6 100644
> > > > --- a/mm/memcontrol.c
> > > > +++ b/mm/memcontrol.c
> > > > @@ -25,6 +25,7 @@
> > > >   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
> > > >   */
> > > >
> > > > +#include <linux/cgroup-defs.h>
> > > >  #include <linux/page_counter.h>
> > > >  #include <linux/memcontrol.h>
> > > >  #include <linux/cgroup.h>
> > > > @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
> > > >         return (u64)memcg->memory.watermark * PAGE_SIZE;
> > > >  }
> > > >
> > > > +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> > > > +                                char *buf, size_t nbytes, loff_t off)
> > > > +{
> > > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > > +
> > > > +       page_counter_reset_watermark(&memcg->memory);
> > > > +
> > > > +       return nbytes;
> > > > +}
> > > > +
> > > >  static int memory_min_show(struct seq_file *m, void *v)
> > > >  {
> > > >         return seq_puts_memcg_tunable(m,
> > > > @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
> > > >                 .name = "peak",
> > > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > > >                 .read_u64 = memory_peak_read,
> > > > +               .write = memory_peak_write,
> > > >         },
> > > >         {
> > > >                 .name = "min",
> > > > @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
> > > >         return (u64)memcg->swap.watermark * PAGE_SIZE;
> > > >  }
> > > >
> > > > +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> > > > +                                char *buf, size_t nbytes, loff_t off)
> > > > +{
> > > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > > +
> > > > +       page_counter_reset_watermark(&memcg->swap);
> > > > +
> > > > +       return nbytes;
> > > > +}
> > > > +
> > > >  static int swap_high_show(struct seq_file *m, void *v)
> > > >  {
> > > >         return seq_puts_memcg_tunable(m,
> > > > @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
> > > >                 .name = "swap.peak",
> > > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > > >                 .read_u64 = swap_peak_read,
> > > > +               .write = swap_peak_write,
> > > >         },
> > > >         {
> > > >                 .name = "swap.events",
> > > > diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > > index 41ae8047b889..681972de673b 100644
> > > > --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> > > > +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > > @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
> > > >  /*
> > > >   * This test create a memory cgroup, allocates
> > > >   * some anonymous memory and some pagecache
> > > > - * and check memory.current and some memory.stat values.
> > > > + * and checks memory.current, memory.peak, and some memory.stat values.
> > > >   */
> > > > -static int test_memcg_current(const char *root)
> > > > +static int test_memcg_current_peak(const char *root)
> > > >  {
> > > >         int ret = KSFT_FAIL;
> > > > -       long current;
> > > > +       long current, peak, peak_reset;
> > > >         char *memcg;
> > > >
> > > >         memcg = cg_name(root, "memcg_test");
> > > > @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
> > > >         if (current != 0)
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak != 0)
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_run(memcg, alloc_anon_50M_check, NULL))
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(50))
> > > > +               goto cleanup;
> > > > +
> > > > +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> > > > +       if (peak_reset != 0)
> > > > +               goto cleanup;
> > > > +
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak > MB(30))
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(50))
> > > > +               goto cleanup;
> > > > +
> > > >         ret = KSFT_PASS;
> > > >
> > > >  cleanup:
> > > > @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
> > > >
> > > >  /*
> > > >   * This test checks that memory.swap.max limits the amount of
> > > > - * anonymous memory which can be swapped out.
> > > > + * anonymous memory which can be swapped out. Additionally, it verifies that
> > > > + * memory.swap.peak reflects the high watermark and can be reset.
> > > >   */
> > > > -static int test_memcg_swap_max(const char *root)
> > > > +static int test_memcg_swap_max_peak(const char *root)
> > > >  {
> > > >         int ret = KSFT_FAIL;
> > > >         char *memcg;
> > > > -       long max;
> > > > +       long max, peak;
> > > >
> > > >         if (!is_swap_enabled())
> > > >                 return KSFT_SKIP;
> > > > @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
> > > >                 goto cleanup;
> > > >         }
> > > >
> > > > +       if (cg_read_long(memcg, "memory.swap.peak"))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_read_long(memcg, "memory.peak"))
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
> > > >                 goto cleanup;
> > > >
> > > > @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
> > > >         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(29))
> > > > +               goto cleanup;
> > > > +
> > > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > > +       if (peak < MB(29))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> > > > +               goto cleanup;
> > > > +
> > > > +
> > > > +       if (cg_write(memcg, "memory.peak", "\n"))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_read_long(memcg, "memory.peak"))
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
> > > >                 goto cleanup;
> > > >
> > > > @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
> > > >         if (max <= 0)
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(29))
> > > > +               goto cleanup;
> > > > +
> > > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > > +       if (peak < MB(19))
> > > > +               goto cleanup;
> > > > +
> > > >         ret = KSFT_PASS;
> > > >
> > > >  cleanup:
> > > > @@ -1295,7 +1351,7 @@ struct memcg_test {
> > > >         const char *name;
> > > >  } tests[] = {
> > > >         T(test_memcg_subtree_control),
> > > > -       T(test_memcg_current),
> > > > +       T(test_memcg_current_peak),
> > > >         T(test_memcg_min),
> > > >         T(test_memcg_low),
> > > >         T(test_memcg_high),
> > > > @@ -1303,7 +1359,7 @@ struct memcg_test {
> > > >         T(test_memcg_max),
> > > >         T(test_memcg_reclaim),
> > > >         T(test_memcg_oom_events),
> > > > -       T(test_memcg_swap_max),
> > > > +       T(test_memcg_swap_max_peak),
> > > >         T(test_memcg_sock),
> > > >         T(test_memcg_oom_group_leaf_events),
> > > >         T(test_memcg_oom_group_parent_events),
> > > > --
> > > > 2.40.1
> > > >
> > >
> > >
> > > --
> > > David Finkel
> > > Senior Principal Software Engineer, Core Services
> >
> >
> >
> > --
> > David Finkel
> > Senior Principal Software Engineer, Core Services
>
> --
> Michal Hocko
> SUSE Labs

Thanks!
Michal Hocko July 16, 2024, 1:19 p.m. UTC | #5
On Tue 16-07-24 08:47:59, David Finkel wrote:
> On Tue, Jul 16, 2024 at 3:20 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Mon 15-07-24 16:46:36, David Finkel wrote:
> > > > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > > > >
> > > > > Other mechanisms for querying the peak memory usage of either a process
> > > > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > > > parity with those mechanisms.
> > > > >
> > > > > For example:
> > > > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > > > >    the high watermark.
> > > > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > > > >    directory resets the peak RSS.
> > > > >
> > > > > This change copies the cgroup v1 behavior so any write to the
> > > > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > > > to the current usage.
> > > > >
> > > > > This behavior is particularly useful for work scheduling systems that
> > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > opinions),
> >
> > I do not understand the OOM-killer reference here. Why does it matter?
> > Could you explain please?
> 
> Sure, we're attempting to bin-packing work based on past items of the same type.
> With CPU, we can provision for the mean CPU-time per-wall-time to get
> a lose "cores"
> concept that we use for binpacking. With CPU, if we end up with a bit
> of contention,
> everything just gets a bit slower while the schedule arbitrates among cgroups.
> 
> However, with memory, you only have so much physical memory for the outer memcg.
> If we pack things too tightly on memory, the OOM-killer is going to kill
> something to free up memory. In some cases that's fine, but provisioning for the
> peak memory for that "type" of work-item mostly avoids this issue.

It is still not clear to me how the memory reclaim falls into that. Are
your workloads mostly unreclaimable (e.g. anon mostly consumers without
any swap)? Why I am asking? Well, if the workload's memory is
reclaimable then the peak memory consumption is largely misleading
because an unknown portion of that memory consumption is hidden by the
reclaimed portion of it. This is not really specific to the write
handlers to reset the value though so I do not want to digress this
patch too much. I do not have objections to the patch itself. Clarifying
the usecase with your followup here would be nice.

Thanks for the clarification!
David Finkel July 16, 2024, 1:39 p.m. UTC | #6
On Tue, Jul 16, 2024 at 9:19 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-07-24 08:47:59, David Finkel wrote:
> > On Tue, Jul 16, 2024 at 3:20 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Mon 15-07-24 16:46:36, David Finkel wrote:
> > > > > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > > > > >
> > > > > > Other mechanisms for querying the peak memory usage of either a process
> > > > > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > > > > parity with those mechanisms.
> > > > > >
> > > > > > For example:
> > > > > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > > > > >    the high watermark.
> > > > > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > > > > >    directory resets the peak RSS.
> > > > > >
> > > > > > This change copies the cgroup v1 behavior so any write to the
> > > > > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > > > > to the current usage.
> > > > > >
> > > > > > This behavior is particularly useful for work scheduling systems that
> > > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > > opinions),
> > >
> > > I do not understand the OOM-killer reference here. Why does it matter?
> > > Could you explain please?
> >
> > Sure, we're attempting to bin-packing work based on past items of the same type.
> > With CPU, we can provision for the mean CPU-time per-wall-time to get
> > a lose "cores"
> > concept that we use for binpacking. With CPU, if we end up with a bit
> > of contention,
> > everything just gets a bit slower while the schedule arbitrates among cgroups.
> >
> > However, with memory, you only have so much physical memory for the outer memcg.
> > If we pack things too tightly on memory, the OOM-killer is going to kill
> > something to free up memory. In some cases that's fine, but provisioning for the
> > peak memory for that "type" of work-item mostly avoids this issue.
>
> It is still not clear to me how the memory reclaim falls into that. Are
> your workloads mostly unreclaimable (e.g. anon mostly consumers without
> any swap)? Why I am asking? Well, if the workload's memory is
> reclaimable then the peak memory consumption is largely misleading
> because an unknown portion of that memory consumption is hidden by the
> reclaimed portion of it. This is not really specific to the write
> handlers to reset the value though so I do not want to digress this
> patch too much. I do not have objections to the patch itself. Clarifying
> the usecase with your followup here would be nice.

Thanks, I'm happy to clarify things!

That's a good point about peak-RSS being unreliable if the memory's reclaimable.

The memory is mostly unreclaimable. It's almost all anonymous mmap,
with a few local files that would be resident in buffercache. (but
generally aren't mmaped)
We don't run with swap enabled on the systems for a few reasons.
In particular, kubernetes disallows swap, which ties our hands, but
even if it didn't,
demand paging from disk tends to stall any useful work, so we'd rather
see the OOM-killer invoked, anyway.

(we actually have some plans for disabling OOM-kills in these cgroups
and letting the userspace process
managing these memcgs handle work-throttling and worker-killing when
there are OOM-conditions, but that's another story :) )

>
> Thanks for the clarification!
> --
> Michal Hocko
> SUSE Labs
Michal Hocko July 16, 2024, 1:48 p.m. UTC | #7
On Mon 15-07-24 16:36:26, David Finkel wrote:
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms.
> 
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
> 
> This change copies the cgroup v1 behavior so any write to the
> memory.peak and memory.swap.peak pseudo-files reset the high watermark
> to the current usage.
> 
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.
> 
> Signed-off-by: David Finkel <davidf@vimeo.com>

As mentioned down the email thread, I consider usefulness of peak value
rather limited. It is misleading when memory is reclaimed. But
fundamentally I do not oppose to unifying the write behavior to reset
values.

The changelog could use some of the clarifications down the thread.

Acked-by: Michal Hocko <mhocko@suse.com>
David Finkel July 16, 2024, 1:54 p.m. UTC | #8
On Tue, Jul 16, 2024 at 9:48 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 15-07-24 16:36:26, David Finkel wrote:
> > Other mechanisms for querying the peak memory usage of either a process
> > or v1 memory cgroup allow for resetting the high watermark. Restore
> > parity with those mechanisms.
> >
> > For example:
> >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> >    the high watermark.
> >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> >    directory resets the peak RSS.
> >
> > This change copies the cgroup v1 behavior so any write to the
> > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > to the current usage.
> >
> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.
> >
> > Signed-off-by: David Finkel <davidf@vimeo.com>
>
> As mentioned down the email thread, I consider usefulness of peak value
> rather limited. It is misleading when memory is reclaimed. But
> fundamentally I do not oppose to unifying the write behavior to reset
> values.
>
> The chnagelog could use some of the clarifications down the thread.

Sure, I can spend some time rewording the changelog this afternoon, and
re-mail it in a few hours.

>
> Acked-by: Michal Hocko <mhocko@suse.com>
>

Thank you!
> --
> Michal Hocko
> SUSE Labs
Tejun Heo July 16, 2024, 4:44 p.m. UTC | #9
Hello,

On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
...
> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.

Swap still has a bad rep but there's nothing drastically worse about it than
page cache. ie. If you're under memory pressure, you get thrashing one way
or another. If there's no swap, the system is just memlocking anon memory
even when they are a lot colder than page cache, so I'm skeptical that no
swap + mostly anon + kernel OOM kills is a good strategy in general
especially given that the system behavior is not very predictable under OOM
conditions.

> As mentioned down the email thread, I consider usefulness of peak value
> rather limited. It is misleading when memory is reclaimed. But
> fundamentally I do not oppose to unifying the write behavior to reset
> values.

The removal of resets was intentional. The problem was that it wasn't clear
who owned those counters and there's no way of telling who reset what when.
It was easy to accidentally end up with multiple entities that think they
can get timed measurement by resetting.

So, in general, I don't think this is a great idea. There are shortcomings
to how memory.peak behaves in that its meaningfulness quickly declines over
time. This is expected and the rationale behind adding memory.peak, IIRC,
was that it was difficult to tell the memory usage of a short-lived cgroup.

If we want to allow peak measurement of time periods, I wonder whether we
could do something similar to pressure triggers - ie. let users register
watchers so that each user can define their own watch periods. This is more
involved but more useful and less error-inducing than adding reset to a
single counter.

Johannes, what do you think?

Thanks.
Roman Gushchin July 16, 2024, 5:01 p.m. UTC | #10
On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> 
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.
> 
> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
> 
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.

It's definitely a better user interface and I totally agree with you regarding
the shortcomings of the proposed interface with a global reset. But if you let
users to register a (potentially large) number of watchers, it might be quite
bad for the overall performance, isn't it? To mitigate it, we'll need to reduce
the accuracy of peak values. And then the question is why not just poll it
periodically from userspace?
David Finkel July 16, 2024, 5:10 p.m. UTC | #11
On Tue, Jul 16, 2024 at 12:44 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions), these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.
>
> Swap still has bad reps but there's nothing drastically worse about it than
> page cache. ie. If you're under memory pressure, you get thrashing one way
> or another. If there's no swap, the system is just memlocking anon memory
> even when they are a lot colder than page cache, so I'm skeptical that no
> swap + mostly anon + kernel OOM kills is a good strategy in general
> especially given that the system behavior is not very predictable under OOM
> conditions.

The reason we need peak memory information is to let us schedule work in a way
that we generally avoid OOM conditions.
For the workloads I work on, we generally have very little in the
page-cache, since
the data isn't stored locally most of the time, but streamed from
other storage/database
systems. For those cases, demand-paging will cause large variations in
servicing time,
and we'd rather restart the process than have unpredictable latency.
The same is true for the batch/queue-work system I wrote this patch to support.
We keep very little data on the local disk, so the page cache is
relatively small.


>
> > As mentioned down the email thread, I consider usefulness of peak value
> > rather limited. It is misleading when memory is reclaimed. But
> > fundamentally I do not oppose to unifying the write behavior to reset
> > values.
>
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.
>
> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
>
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.

I appreciate the ownership issues with the current resetting interface
in the other locations.
However, this peak RSS data is not used by all that many applications
(as evidenced by
the fact that the memory.peak file was only added a bit over a year ago).
I think there are enough cases where ownership is enforced externally
that mirroring the
existing interface to cgroup2 is sufficient.

I do think a more stateful interface would be nice, but I don't know
whether I have enough
knowledge of memcg to implement that in a reasonable amount of time.

Ownership aside, I think being able to reset the high watermark of a
process makes it
significantly more useful. Creating new cgroups and moving processes
around is significantly
heavier-weight.

Thanks,

>
> Johannes, what do you think?
>
> Thanks.
>
> --
> tejun
David Finkel July 16, 2024, 5:20 p.m. UTC | #12
On Tue, Jul 16, 2024 at 1:01 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > ...
> >
> > The removal of resets was intentional. The problem was that it wasn't clear
> > who owned those counters and there's no way of telling who reset what when.
> > It was easy to accidentally end up with multiple entities that think they
> > can get timed measurement by resetting.
> >
> > So, in general, I don't think this is a great idea. There are shortcomings
> > to how memory.peak behaves in that its meaningfulness quickly declines over
> > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > was that it was difficult to tell the memory usage of a short-lived cgroup.
> >
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
>
> It's definitely a better user interface and I totally agree with you regarding
> the shortcomings of the proposed interface with a global reset. But if you let
> users register a (potentially large) number of watchers, it might be quite
> bad for the overall performance, isn't it? To mitigate it, we'll need to reduce
> the accuracy of peak values. And then the question is why not just poll it
> periodically from userspace?

FWIW, as a stop-gap, we did implement periodic polling from userspace for
the system that motivated this change, but that is unlikely to catch
memory-usage
spikes that have shorter timescales than the polling interval. For
now, we're keeping
it on cgroups v1, but that's looking like a long-term untenable position.


Thanks,
Michal Hocko July 16, 2024, 6 p.m. UTC | #13
On Tue 16-07-24 06:44:11, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions), these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.
> 
> Swap still has bad reps but there's nothing drastically worse about it than
> page cache. ie. If you're under memory pressure, you get thrashing one way
> or another. If there's no swap, the system is just memlocking anon memory
> even when they are a lot colder than page cache, so I'm skeptical that no
> swap + mostly anon + kernel OOM kills is a good strategy in general
> especially given that the system behavior is not very predictable under OOM
> conditions.

Completely agree on this!

> > As mentioned down the email thread, I consider usefulness of peak value
> > rather limited. It is misleading when memory is reclaimed. But
> > fundamentally I do not oppose to unifying the write behavior to reset
> > values.
> 
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.

yes, I understand and agree with you. Generally speaking peak value is
of a very limited value. On the other hand we already have it in v2 and
if it allows a reliable way to scale the workload (which seems to be the
case here) then resetting the value sounds like a cheaper option than
tearing down the memcg and creating it again (with all the dead cgroups
headache that might follow). The reset interface doesn't cost much from
the maintenance POV and if somebody wants to use it they surely need to
find a way to coordinate.

> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
> 
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.

I would rather not get back to that unless we have many more users that
really need that. Absolute value of the memory consumption is a long
living concept that doesn't make much sense most of the time. People
just tend to still use it because it is much simpler to compare two different
values rather than something as dynamic as PSI-like metrics.
Tejun Heo July 16, 2024, 7:48 p.m. UTC | #14
Hello,

On Tue, Jul 16, 2024 at 01:10:14PM -0400, David Finkel wrote:
> > Swap still has bad reps but there's nothing drastically worse about it than
> > page cache. ie. If you're under memory pressure, you get thrashing one way
> > or another. If there's no swap, the system is just memlocking anon memory
> > even when they are a lot colder than page cache, so I'm skeptical that no
> > swap + mostly anon + kernel OOM kills is a good strategy in general
> > especially given that the system behavior is not very predictable under OOM
> > conditions.
> 
> The reason we need peak memory information is to let us schedule work in a
> way that we generally avoid OOM conditions. For the workloads I work on,
> we generally have very little in the page-cache, since the data isn't
> stored locally most of the time, but streamed from other storage/database
> systems. For those cases, demand-paging will cause large variations in
> servicing time, and we'd rather restart the process than have
> unpredictable latency. The same is true for the batch/queue-work system I
> wrote this patch to support. We keep very little data on the local disk,
> so the page cache is relatively small.

You can detect these conditions more reliably and *earlier* using PSI
triggers with swap enabled than hard allocations and OOM kills. Then, you
can take whatever decision you want to take including killing the job
without worrying about the whole system severely suffering. You can even do
things like freezing the cgroup and taking backtraces and collecting other
debug info to better understand why the memory usage is blowing up.

There are of course multiple ways to go about things but I think it's useful
to note that hard alloc based on peak usage + OOM kills likely isn't the
best way here.

...
> I appreciate the ownership issues with the current resetting interface in
> the other locations. However, this peak RSS data is not used by all that
> many applications (as evidenced by the fact that the memory.peak file was
> only added a bit over a year ago). I think there are enough cases where
> ownership is enforced externally that mirroring the existing interface to
> cgroup2 is sufficient.

It's fairly new addition and its utility is limited, so it's not that widely
used. Adding reset makes it more useful but in a way which can be
detrimental in the long term.

> I do think a more stateful interface would be nice, but I don't know
> whether I have enough knowledge of memcg to implement that in a reasonable
> amount of time.

Right, this probably isn't trivial.

> Ownership aside, I think being able to reset the high watermark of a
> process makes it significantly more useful. Creating new cgroups and
> moving processes around is significantly heavier-weight.

Yeah, the setup / teardown cost can be non-trivial for short lived cgroups.
I agree that having some way of measuring peak in different time intervals
can be useful.

Thanks.
Tejun Heo July 16, 2024, 7:53 p.m. UTC | #15
Hello,

On Tue, Jul 16, 2024 at 05:01:15PM +0000, Roman Gushchin wrote:
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> 
> It's definitely a better user interface and I totally agree with you regarding
> the shortcomings of the proposed interface with a global reset. But if you let
> users register a (potentially large) number of watchers, it might be quite
> bad for the overall performance, isn't it? To mitigate it, we'll need to reduce
> the accuracy of peak values. And then the question is why not just poll it
> periodically from userspace?

I haven't thought in detail but it's the same problem that PSI triggers
have, right? PSI seems to have found a reasonable trade-off across accuracy,
overhead and usage restrictions? Maybe peak usage isn't as widely useful and
it doesn't justify to engineer something as sophisticated. I don't know.

Thanks.
Tejun Heo July 16, 2024, 8 p.m. UTC | #16
Hello,

On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
...
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> 
> I would rather not get back to that unless we have many more users that
> really need that. Absolute value of the memory consumption is a long
> living concept that doesn't make much sense most of the time. People
> just tend to still use it because it is much simpler to compare two different
> values rather than something as dynamic as PSI similar metrics.

The initial justification for adding memory.peak was that it's mostly to
monitor short lived cgroups. Adding reset would make it used more widely,
which isn't necessarily a bad thing and people most likely will find ways to
use it creatively. I'm mostly worried that that's going to create a mess
down the road. Yeah, so, it's not widely useful now but adding reset makes
it more useful and in a way which can potentially paint us into a corner.

But then again, maybe this is really niche, future usages will still remain
very restricted, and adding something more akin to PSI triggers is way
over-engineering it.

Thanks.
David Finkel July 16, 2024, 8:18 p.m. UTC | #17
On Tue, Jul 16, 2024 at 3:48 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, Jul 16, 2024 at 01:10:14PM -0400, David Finkel wrote:
> > > Swap still has bad reps but there's nothing drastically worse about it than
> > > page cache. ie. If you're under memory pressure, you get thrashing one way
> > > or another. If there's no swap, the system is just memlocking anon memory
> > > even when they are a lot colder than page cache, so I'm skeptical that no
> > > swap + mostly anon + kernel OOM kills is a good strategy in general
> > > especially given that the system behavior is not very predictable under OOM
> > > conditions.
> >
> > The reason we need peak memory information is to let us schedule work in a
> > way that we generally avoid OOM conditions. For the workloads I work on,
> > we generally have very little in the page-cache, since the data isn't
> > stored locally most of the time, but streamed from other storage/database
> > systems. For those cases, demand-paging will cause large variations in
> > servicing time, and we'd rather restart the process than have
> > unpredictable latency. The same is true for the batch/queue-work system I
> > wrote this patch to support. We keep very little data on the local disk,
> > so the page cache is relatively small.
>
> You can detect these conditions more reliably and *earlier* using PSI
> triggers with swap enabled than hard allocations and OOM kills. Then, you
> can take whatever decision you want to take including killing the job
> without worrying about the whole system severely suffering. You can even do
> things like freezing the cgroup and taking backtraces and collecting other
> debug info to better understand why the memory usage is blowing up.
>
> There are of course multiple ways to go about things but I think it's useful
> to note that hard alloc based on peak usage + OOM kills likely isn't the
> best way here.

To be clear, my goal with peak memory tracking is to bin-pack in a way
that I don't encounter OOMs. I'd prefer to have a bit of headroom and
avoid OOMs if I can.

PSI does seem like a wonderful tool, and I do intend to use it, but
since it's a reactive
signal and doesn't provide absolute values for the total memory usage
that we'd need to
figure out in our central scheduler which work can cohabitate (and how
many instances),
it complements memory.peak rather than replacing my need for it.

FWIW, at the moment, we have some (partially broken) OOM-detection,
which does make
sense to swap out for PSI tracking/trigger-watching that takes care of
scaling down workers
when there's resource-pressure.
(Thanks for pointing out that PSI is generally a better signal than
OOMs for memory pressure)


Thanks again,

>
> ...
> > I appreciate the ownership issues with the current resetting interface in
> > the other locations. However, this peak RSS data is not used by all that
> > many applications (as evidenced by the fact that the memory.peak file was
> > only added a bit over a year ago). I think there are enough cases where
> > ownership is enforced externally that mirroring the existing interface to
> > cgroup2 is sufficient.
>
> It's fairly new addition and its utility is limited, so it's not that widely
> used. Adding reset makes it more useful but in a way which can be
> detrimental in the long term.
>
> > I do think a more stateful interface would be nice, but I don't know
> > whether I have enough knowledge of memcg to implement that in a reasonable
> > amount of time.
>
> Right, this probably isn't trivial.
>
> > Ownership aside, I think being able to reset the high watermark of a
> > process makes it significantly more useful. Creating new cgroups and
> > moving processes around is significantly heavier-weight.
>
> Yeah, the setup / teardown cost can be non-trivial for short lived cgroups.
> I agree that having some way of measuring peak in different time intervals
> can be useful.
>
> Thanks.
>
> --
> tejun
David Finkel July 16, 2024, 10:06 p.m. UTC | #18
On Tue, Jul 16, 2024 at 4:00 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> ...
> > > If we want to allow peak measurement of time periods, I wonder whether we
> > > could do something similar to pressure triggers - ie. let users register
> > > watchers so that each user can define their own watch periods. This is more
> > > involved but more useful and less error-inducing than adding reset to a
> > > single counter.
> >
> > I would rather not get back to that unless we have many more users that
> > really need that. Absolute value of the memory consumption is a long
> > living concept that doesn't make much sense most of the time. People
> > just tend to still use it because it is much simpler to compare two different
> > values rather than something as dynamic as PSI similar metrics.
>
> The initial justification for adding memory.peak was that it's mostly to
> monitor short lived cgroups. Adding reset would make it used more widely,
> which isn't necessarily a bad thing and people most likely will find ways to
> use it creatively. I'm mostly worried that that's going to create a mess
> down the road. Yeah, so, it's not widely useful now but adding reset makes
> it more useful and in a way which can potentially paint us into a corner.

This is a pretty low-overhead feature as-is, but we can reduce it further by
changing it so instead of resetting the watermark on any non-empty string,
we reset only on one particular string.

I'm thinking of something like "global_reset\n", so if we do something like the
PSI interface later, users can write "fd_local_reset\n", and get that
nicer behavior.

This also has the benefit of allowing "echo global_reset >
/sys/fs/cgroup/.../memory.peak" to do the right thing.
(better names welcome)

>
> But then again, maybe this is really niche, future usages will still remain
> very restricted, and adding something more akin to PSI triggers is way
> over-engineering it.

Yeah, I think this is niche enough that it isn't worth over-engineering.
There is some value to keeping broad compatibility for things moving
from cgroups v1, too.

>
> Thanks.
>
> --
> tejun


Thanks again,
Michal Hocko July 17, 2024, 6:23 a.m. UTC | #19
On Tue 16-07-24 10:00:10, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> ...
> > > If we want to allow peak measurement of time periods, I wonder whether we
> > > could do something similar to pressure triggers - ie. let users register
> > > watchers so that each user can define their own watch periods. This is more
> > > involved but more useful and less error-inducing than adding reset to a
> > > single counter.
> > 
> > I would rather not get back to that unless we have many more users that
> > really need that. Absolute value of the memory consumption is a long
> > living concept that doesn't make much sense most of the time. People
> > just tend to still use it because it is much simpler to compare two different
> > values rather than something as dynamic as PSI similar metrics.
> 
> The initial justification for adding memory.peak was that it's mostly to
> monitor short lived cgroups. Adding reset would make it used more widely,
> which isn't necessarily a bad thing and people most likely will find ways to
> use it creatively. I'm mostly worried that that's going to create a mess
> down the road. Yeah, so, it's not widely useful now but adding reset makes
> it more useful and in a way which can potentially paint us into a corner.

I really fail to see how this would cause problems with future
maintainability. It is not like we are trying to deprecate this
memory.peak.

I am also not sure this makes it so much more attractive that people
would start using it just because they can reset the value. It makes
sense to extend our documentation and actually describe pitfalls.
Michal Hocko July 17, 2024, 6:26 a.m. UTC | #20
On Tue 16-07-24 18:06:17, David Finkel wrote:
> On Tue, Jul 16, 2024 at 4:00 PM Tejun Heo <tj@kernel.org> wrote:
> >
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> > ...
> > > > If we want to allow peak measurement of time periods, I wonder whether we
> > > > could do something similar to pressure triggers - ie. let users register
> > > > watchers so that each user can define their own watch periods. This is more
> > > > involved but more useful and less error-inducing than adding reset to a
> > > > single counter.
> > >
> > > I would rather not get back to that unless we have many more users that
> > > really need that. Absolute value of the memory consumption is a long
> > > living concept that doesn't make much sense most of the time. People
> > > just tend to still use it because it is much simpler to compare two different
> > > values rather than something as dynamic as PSI similar metrics.
> >
> > The initial justification for adding memory.peak was that it's mostly to
> > monitor short lived cgroups. Adding reset would make it used more widely,
> > which isn't necessarily a bad thing and people most likely will find ways to
> > use it creatively. I'm mostly worried that that's going to create a mess
> > down the road. Yeah, so, it's not widely useful now but adding reset makes
> > it more useful and in a way which can potentially paint us into a corner.
> 
> This is a pretty low-overhead feature as-is, but we can reduce it further by
> changing it so instead of resetting the watermark on any non-empty string,
> we reset only on one particular string.
> 
> I'm thinking of something like "global_reset\n", so if we do something like the
> PSI interface later, users can write "fd_local_reset\n", and get that
> nicer behavior.
> 
> This also has the benefit of allowing "echo global_reset >
> /sys/fs/cgroup/.../memory.peak" to do the right thing.
> (better names welcome)

This would be a different behavior than in v1 and therefore confusing
for those who rely on this in v1 already. So I wouldn't overengineer it
and keep the semantic as simple as possible. If we decide to add PSI
triggers they are completely independent on peak value because that is
reclaim based interface which by definition makes peak value very
dubious.
David Finkel July 17, 2024, 2:24 p.m. UTC | #21
On Wed, Jul 17, 2024 at 2:26 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-07-24 18:06:17, David Finkel wrote:
> > On Tue, Jul 16, 2024 at 4:00 PM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > Hello,
> > >
> > > On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> > > ...
> > > > > If we want to allow peak measurement of time periods, I wonder whether we
> > > > > could do something similar to pressure triggers - ie. let users register
> > > > > watchers so that each user can define their own watch periods. This is more
> > > > > involved but more useful and less error-inducing than adding reset to a
> > > > > single counter.
> > > >
> > > > I would rather not get back to that unless we have many more users that
> > > > really need that. Absolute value of the memory consumption is a long
> > > > living concept that doesn't make much sense most of the time. People
> > > > just tend to still use it because it is much simpler to compare two different
> > > > values rather than something as dynamic as PSI similar metrics.
> > >
> > > The initial justification for adding memory.peak was that it's mostly to
> > > monitor short lived cgroups. Adding reset would make it used more widely,
> > > which isn't necessarily a bad thing and people most likely will find ways to
> > > use it creatively. I'm mostly worried that that's going to create a mess
> > > down the road. Yeah, so, it's not widely useful now but adding reset makes
> > > it more useful and in a way which can potentially paint us into a corner.
> >
> > This is a pretty low-overhead feature as-is, but we can reduce it further by
> > changing it so instead of resetting the watermark on any non-empty string,
> > we reset only on one particular string.
> >
> > I'm thinking of something like "global_reset\n", so if we do something like the
> > PSI interface later, users can write "fd_local_reset\n", and get that
> > nicer behavior.
> >
> > This also has the benefit of allowing "echo global_reset >
> > /sys/fs/cgroup/.../memory.peak" to do the right thing.
> > (better names welcome)
>
> This would be a different behavior than in v1 and therefore confusing
> for those who rely on this in v1 already. So I wouldn't overengineer it
> and keep the semantic as simple as possible. If we decide to add PSI
> triggers they are completely independent on peak value because that is
> reclaim based interface which by definition makes peak value very
> dubious.

That's fair.

My only thought is that "write any non-empty string", is a very wide interface
to support, and limits other possible behaviors later.

FWIW, I have patches with and without this narrowed interface ready to re-mail
pending the outcome of this discussion. (both include additional
use-case info in the changelog)

Keeping the "all non-empty writes" behavior:
https://github.com/dfinkel/linux/commit/5edb21882a88693024a95bbd76b4a8d4561348da

With the narrowed interface:
https://github.com/dfinkel/linux/commit/f341c8c0cf5fbdcf7af30e20b334e532df74906d

> --
> Michal Hocko
> SUSE Labs


Thanks,
Michal Hocko July 17, 2024, 3:46 p.m. UTC | #22
On Wed 17-07-24 10:24:07, David Finkel wrote:
> On Wed, Jul 17, 2024 at 2:26 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Tue 16-07-24 18:06:17, David Finkel wrote:
[...]
> > > I'm thinking of something like "global_reset\n", so if we do something like the
> > > PSI interface later, users can write "fd_local_reset\n", and get that
> > > nicer behavior.
> > >
> > > This also has the benefit of allowing "echo global_reset >
> > > /sys/fs/cgroup/.../memory.peak" to do the right thing.
> > > (better names welcome)
> >
> > This would be a different behavior than in v1 and therefore confusing
> > for those who rely on this in v1 already. So I wouldn't overengineer it
> > and keep the semantic as simple as possible. If we decide to add PSI
> > triggers they are completely independent on peak value because that is
> > reclaim based interface which by definition makes peak value very
> > dubious.
> 
> That's fair.
> 
> My only thought is that "write any non-empty string", is a very wide interface
> to support, and limits other possible behaviors later.

yes, that ship has sailed a long time ago.
Johannes Weiner July 17, 2024, 5:04 p.m. UTC | #23
On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions), these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.
> 
> Swap still has bad reps but there's nothing drastically worse about it than
> page cache. ie. If you're under memory pressure, you get thrashing one way
> or another. If there's no swap, the system is just memlocking anon memory
> even when they are a lot colder than page cache, so I'm skeptical that no
> swap + mostly anon + kernel OOM kills is a good strategy in general
> especially given that the system behavior is not very predictable under OOM
> conditions.
> 
> > As mentioned down the email thread, I consider usefulness of peak value
> > rather limited. It is misleading when memory is reclaimed. But
> > fundamentally I do not oppose to unifying the write behavior to reset
> > values.
> 
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.
> 
> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
> 
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.
> 
> Johannes, what do you think?

I'm also not a fan of the ability to reset globally.

I seem to remember a scheme we discussed some time ago to do local
state tracking without having the overhead in the page counter
fastpath. The new data that needs to be tracked is a pc->local_peak
(in the page_counter) and an fd->peak (in the watcher's file state).

1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.

2. Somebody opens the memory.peak. Initialize fd->peak = -1.

3. If they write, set fd->peak = pc->local_peak = usage.

4. Usage grows.

5. They read(). A conventional reader has fd->peak == -1, so we return
   pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).

6. Usage drops.

7. New watcher opens and writes. Bring up all existing watchers'
   fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
   Then set the new fd->peak = pc->local_peak = current usage as in 3.

8. See 5. again for read() from each watcher.

This way all fd's can arbitrarily start tracking new local peaks with
write(). The operation in the charging fast path is cheap. The write()
is O(existing_watchers), which seems reasonable. It's fully backward
compatible with conventional open() + read() users.
David Finkel July 17, 2024, 8:14 p.m. UTC | #24
On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > ...
> > > > This behavior is particularly useful for work scheduling systems that
> > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > opinions), these systems need to track the peak memory usage to compute
> > > > system/container fullness when binpacking workitems.
> >
> > Swap still has bad reps but there's nothing drastically worse about it than
> > page cache. ie. If you're under memory pressure, you get thrashing one way
> > or another. If there's no swap, the system is just memlocking anon memory
> > even when they are a lot colder than page cache, so I'm skeptical that no
> > swap + mostly anon + kernel OOM kills is a good strategy in general
> > especially given that the system behavior is not very predictable under OOM
> > conditions.
> >
> > > As mentioned down the email thread, I consider usefulness of peak value
> > > rather limited. It is misleading when memory is reclaimed. But
> > > fundamentally I do not oppose to unifying the write behavior to reset
> > > values.
> >
> > The removal of resets was intentional. The problem was that it wasn't clear
> > who owned those counters and there's no way of telling who reset what when.
> > It was easy to accidentally end up with multiple entities that think they
> > can get timed measurement by resetting.
> >
> > So, in general, I don't think this is a great idea. There are shortcomings
> > to how memory.peak behaves in that its meaningfulness quickly declines over
> > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > was that it was difficult to tell the memory usage of a short-lived cgroup.
> >
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> >
> > Johannes, what do you think?
>
> I'm also not a fan of the ability to reset globally.
>
> I seem to remember a scheme we discussed some time ago to do local
> state tracking without having the overhead in the page counter
> fastpath. The new data that needs to be tracked is a pc->local_peak
> (in the page_counter) and an fd->peak (in the watcher's file state).
>
> 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
>
> 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
>
> 3. If they write, set fd->peak = pc->local_peak = usage.
>
> 4. Usage grows.
>
> 5. They read(). A conventional reader has fd->peak == -1, so we return
>    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
>
> 6. Usage drops.
>
> 7. New watcher opens and writes. Bring up all existing watchers'
>    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
>    Then set the new fd->peak = pc->local_peak = current usage as in 3.
>
> 8. See 5. again for read() from each watcher.
>
> This way all fd's can arbitrarily start tracking new local peaks with
> write(). The operation in the charging fast path is cheap. The write()
> is O(existing_watchers), which seems reasonable. It's fully backward
> compatible with conventional open() + read() users.

That scheme seems viable, but it's a lot more work to implement and maintain
than a simple global reset.

Since that scheme maintains a separate pc->local_peak, it's not mutually
exclusive with implementing a global reset now. (as long as we reserve a
way to distinguish the different kinds of writes).

As discussed on other sub-threads, this might be too niche to be worth
the significant complexity of avoiding a global reset. (especially when
users would likely be moving from cgroups v1 which does have a global reset)

Thanks,
Johannes Weiner July 17, 2024, 8:44 p.m. UTC | #25
On Wed, Jul 17, 2024 at 04:14:07PM -0400, David Finkel wrote:
> On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > > Hello,
> > >
> > > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > > ...
> > > > > This behavior is particularly useful for work scheduling systems that
> > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > opinions), these systems need to track the peak memory usage to compute
> > > > > system/container fullness when binpacking workitems.
> > >
> > > Swap still has bad reps but there's nothing drastically worse about it than
> > > page cache. ie. If you're under memory pressure, you get thrashing one way
> > > or another. If there's no swap, the system is just memlocking anon memory
> > > even when they are a lot colder than page cache, so I'm skeptical that no
> > > swap + mostly anon + kernel OOM kills is a good strategy in general
> > > especially given that the system behavior is not very predictable under OOM
> > > conditions.
> > >
> > > > As mentioned down the email thread, I consider usefulness of peak value
> > > > rather limited. It is misleading when memory is reclaimed. But
> > > > fundamentally I do not oppose to unifying the write behavior to reset
> > > > values.
> > >
> > > The removal of resets was intentional. The problem was that it wasn't clear
> > > who owned those counters and there's no way of telling who reset what when.
> > > It was easy to accidentally end up with multiple entities that think they
> > > can get timed measurement by resetting.
> > >
> > > So, in general, I don't think this is a great idea. There are shortcomings
> > > to how memory.peak behaves in that its meaningfulness quickly declines over
> > > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > > was that it was difficult to tell the memory usage of a short-lived cgroup.
> > >
> > > If we want to allow peak measurement of time periods, I wonder whether we
> > > could do something similar to pressure triggers - ie. let users register
> > > watchers so that each user can define their own watch periods. This is more
> > > involved but more useful and less error-inducing than adding reset to a
> > > single counter.
> > >
> > > Johannes, what do you think?
> >
> > I'm also not a fan of the ability to reset globally.
> >
> > I seem to remember a scheme we discussed some time ago to do local
> > state tracking without having the overhead in the page counter
> > fastpath. The new data that needs to be tracked is a pc->local_peak
> > (in the page_counter) and an fd->peak (in the watcher's file state).
> >
> > 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
> >
> > 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
> >
> > 3. If they write, set fd->peak = pc->local_peak = usage.
> >
> > 4. Usage grows.
> >
> > 5. They read(). A conventional reader has fd->peak == -1, so we return
> >    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
> >
> > 6. Usage drops.
> >
> > 7. New watcher opens and writes. Bring up all existing watchers'
> >    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
> >    Then set the new fd->peak = pc->local_peak = current usage as in 3.
> >
> > 8. See 5. again for read() from each watcher.
> >
> > This way all fd's can arbitrarily start tracking new local peaks with
> > write(). The operation in the charging fast path is cheap. The write()
> > is O(existing_watchers), which seems reasonable. It's fully backward
> > compatible with conventional open() + read() users.
> 
> That scheme seems viable, but it's a lot more work to implement and maintain
> than a simple global reset.
> 
> Since that scheme maintains a separate pc->local_peak, it's not mutually
> exclusive with implementing a global reset now. (as long as we reserve a
> way to distinguish the different kinds of writes).
> 
> As discussed on other sub-threads, this might be too niche to be worth
> the significant complexity of avoiding a global reset. (especially when
> users would likely be moving from cgroups v1 which does have a global reset)

The problem is that once global resetting is allowed, it makes the
number reported in memory.peak unreliable for everyone. You just don't
know, and can't tell, if somebody wrote to it recently. It's not too
much of a leap to say this breaks the existing interface contract.

You have to decide whether the above is worth implementing. But my
take is that the downsides of the simpler solution outweigh its
benefits.
David Finkel July 17, 2024, 9:13 p.m. UTC | #26
On Wed, Jul 17, 2024 at 4:44 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Wed, Jul 17, 2024 at 04:14:07PM -0400, David Finkel wrote:
> > On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > > > Hello,
> > > >
> > > > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > > > ...
> > > > > > This behavior is particularly useful for work scheduling systems that
> > > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > > opinions), these systems need to track the peak memory usage to compute
> > > > > > system/container fullness when binpacking workitems.
> > > >
> > > > Swap still has bad reps but there's nothing drastically worse about it than
> > > > page cache. ie. If you're under memory pressure, you get thrashing one way
> > > > or another. If there's no swap, the system is just memlocking anon memory
> > > > even when they are a lot colder than page cache, so I'm skeptical that no
> > > > swap + mostly anon + kernel OOM kills is a good strategy in general
> > > > especially given that the system behavior is not very predictable under OOM
> > > > conditions.
> > > >
> > > > > As mentioned down the email thread, I consider usefulness of peak value
> > > > > rather limited. It is misleading when memory is reclaimed. But
> > > > > fundamentally I do not oppose to unifying the write behavior to reset
> > > > > values.
> > > >
> > > > The removal of resets was intentional. The problem was that it wasn't clear
> > > > who owned those counters and there's no way of telling who reset what when.
> > > > It was easy to accidentally end up with multiple entities that think they
> > > > can get timed measurement by resetting.
> > > >
> > > > So, in general, I don't think this is a great idea. There are shortcomings
> > > > to how memory.peak behaves in that its meaningfulness quickly declines over
> > > > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > > > was that it was difficult to tell the memory usage of a short-lived cgroup.
> > > >
> > > > If we want to allow peak measurement of time periods, I wonder whether we
> > > > could do something similar to pressure triggers - ie. let users register
> > > > watchers so that each user can define their own watch periods. This is more
> > > > involved but more useful and less error-inducing than adding reset to a
> > > > single counter.
> > > >
> > > > Johannes, what do you think?
> > >
> > > I'm also not a fan of the ability to reset globally.
> > >
> > > I seem to remember a scheme we discussed some time ago to do local
> > > state tracking without having the overhead in the page counter
> > > fastpath. The new data that needs to be tracked is a pc->local_peak
> > > (in the page_counter) and an fd->peak (in the watcher's file state).
> > >
> > > 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
> > >
> > > 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
> > >
> > > 3. If they write, set fd->peak = pc->local_peak = usage.
> > >
> > > 4. Usage grows.
> > >
> > > 5. They read(). A conventional reader has fd->peak == -1, so we return
> > >    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
> > >
> > > 6. Usage drops.
> > >
> > > 7. New watcher opens and writes. Bring up all existing watchers'
> > >    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
> > >    Then set the new fd->peak = pc->local_peak = current usage as in 3.
> > >
> > > 8. See 5. again for read() from each watcher.
> > >
> > > This way all fd's can arbitrarily start tracking new local peaks with
> > > write(). The operation in the charging fast path is cheap. The write()
> > > is O(existing_watchers), which seems reasonable. It's fully backward
> > > compatible with conventional open() + read() users.
> >
> > That scheme seems viable, but it's a lot more work to implement and maintain
> > than a simple global reset.
> >
> > Since that scheme maintains a separate pc->local_peak, it's not mutually
> > exclusive with implementing a global reset now. (as long as we reserve a
> > way to distinguish the different kinds of writes).
> >
> > As discussed on other sub-threads, this might be too niche to be worth
> > the significant complexity of avoiding a global reset. (especially when
> > users would likely be moving from cgroups v1 which does have a global reset)
>
> The problem is that once global resetting is allowed, it makes the
> number reported in memory.peak unreliable for everyone. You just don't
> know, and can't tell, if somebody wrote to it recently. It's not too
> much of a leap to say this breaks the existing interface contract.

It does make it hard to tell when it was reset; however, it also allows some
very powerful commandline interactions that aren't possible if you need to
keep a persistent fd open.

I have run things in cgroups to measure peak memory and CPU-time for
things that have subprocesses. If I needed to keep a persistent fd open
in order to reset the high watermark, it would have been far less useful.

Honestly, I don't see a ton of value in tracking the peak memory if I
can't reset it.
It's not my use-case, but, there are a lot of cases where process-startup uses
a lot more memory than the steady-state, so the sysadmin might want to
measure that startup peak and any later peaks separately.

In my use-case, I do have a long-lived process managing the cgroups
for its workers, so I could keep an fd around and reset it as necessary.
However, I do sometimes shell into the relevant k8s container and poke
at the cgroups with a shell, and having to dup that managing processes'
FD somehow to check the high watermark while debugging would be
rather annoying. (although definitely not a dealbreaker)

>
> You have to decide whether the above is worth implementing. But my
> take is that the downsides of the simpler solution outweigh its
> benefits.

There are a few parts to my reticence to implement something
more complicated.
 1) Correctly cleaning up when one of those FDs gets closed can
     be subtle
 2) It's a lot of code, in some very sensitive portions of the kernel,
     so I'd need to test that code a lot more than I do for slapping
     a new entrypoint on the existing watermark reset of the
     page_counter.
 3) For various reasons, the relevant workload runs on
     Google Kubernetes Engine with their Container Optimised OS.
     If the patch is simple enough, I can request that Google
     cherry-pick the relevant commit, so we don't have to wait
     over a year for the next LTS kernel to roll out before we
     can switch to cgroups v2.

It would be a nice personal challenge to implement the solution
you suggest, but it's definitely not something I'd knock out in the
next couple days.

Thanks,
Waiman Long July 17, 2024, 11:48 p.m. UTC | #27
On 7/17/24 17:13, David Finkel wrote:
> On Wed, Jul 17, 2024 at 4:44 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>> On Wed, Jul 17, 2024 at 04:14:07PM -0400, David Finkel wrote:
>>> On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>>>> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
>>>>> Hello,
>>>>>
>>>>> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
>>>>> ...
>>>>>>> This behavior is particularly useful for work scheduling systems that
>>>>>>> need to track memory usage of worker processes/cgroups per-work-item.
>>>>>>> Since memory can't be squeezed like CPU can (the OOM-killer has
>>>>>>> opinions), these systems need to track the peak memory usage to compute
>>>>>>> system/container fullness when binpacking workitems.
>>>>> Swap still has bad reps but there's nothing drastically worse about it than
>>>>> page cache. ie. If you're under memory pressure, you get thrashing one way
>>>>> or another. If there's no swap, the system is just memlocking anon memory
>>>>> even when they are a lot colder than page cache, so I'm skeptical that no
>>>>> swap + mostly anon + kernel OOM kills is a good strategy in general
>>>>> especially given that the system behavior is not very predictable under OOM
>>>>> conditions.
>>>>>
>>>>>> As mentioned down the email thread, I consider usefulness of peak value
>>>>>> rather limited. It is misleading when memory is reclaimed. But
>>>>>> fundamentally I do not oppose to unifying the write behavior to reset
>>>>>> values.
>>>>> The removal of resets was intentional. The problem was that it wasn't clear
>>>>> who owned those counters and there's no way of telling who reset what when.
>>>>> It was easy to accidentally end up with multiple entities that think they
>>>>> can get timed measurement by resetting.
>>>>>
>>>>> So, in general, I don't think this is a great idea. There are shortcomings
>>>>> to how memory.peak behaves in that its meaningfulness quickly declines over
>>>>> time. This is expected and the rationale behind adding memory.peak, IIRC,
>>>>> was that it was difficult to tell the memory usage of a short-lived cgroup.
>>>>>
>>>>> If we want to allow peak measurement of time periods, I wonder whether we
>>>>> could do something similar to pressure triggers - ie. let users register
>>>>> watchers so that each user can define their own watch periods. This is more
>>>>> involved but more useful and less error-inducing than adding reset to a
>>>>> single counter.
>>>>>
>>>>> Johannes, what do you think?
>>>> I'm also not a fan of the ability to reset globally.
>>>>
>>>> I seem to remember a scheme we discussed some time ago to do local
>>>> state tracking without having the overhead in the page counter
>>>> fastpath. The new data that needs to be tracked is a pc->local_peak
>>>> (in the page_counter) and an fd->peak (in the watcher's file state).
>>>>
>>>> 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
>>>>
>>>> 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
>>>>
>>>> 3. If they write, set fd->peak = pc->local_peak = usage.
>>>>
>>>> 4. Usage grows.
>>>>
>>>> 5. They read(). A conventional reader has fd->peak == -1, so we return
>>>>     pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
>>>>
>>>> 6. Usage drops.
>>>>
>>>> 7. New watcher opens and writes. Bring up all existing watchers'
>>>>     fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
>>>>     Then set the new fd->peak = pc->local_peak = current usage as in 3.
>>>>
>>>> 8. See 5. again for read() from each watcher.
>>>>
>>>> This way all fd's can arbitrarily start tracking new local peaks with
>>>> write(). The operation in the charging fast path is cheap. The write()
>>>> is O(existing_watchers), which seems reasonable. It's fully backward
>>>> compatible with conventional open() + read() users.
>>> That scheme seems viable, but it's a lot more work to implement and maintain
>>> than a simple global reset.
>>>
>>> Since that scheme maintains a separate pc->local_peak, it's not mutually
>>> exclusive with implementing a global reset now. (as long as we reserve a
>>> way to distinguish the different kinds of writes).
>>>
>>> As discussed on other sub-threads, this might be too niche to be worth
>>> the significant complexity of avoiding a global reset. (especially when
>>> users would likely be moving from cgroups v1 which does have a global reset)
>> The problem is that once global resetting is allowed, it makes the
>> number reported in memory.peak unreliable for everyone. You just don't
>> know, and can't tell, if somebody wrote to it recently. It's not too
>> much of a leap to say this breaks the existing interface contract.
> It does make it hard to tell when it was reset, however, it also allows some
> very powerful commandline interactions that aren't possible if you need to
> keep a persistent fd open.
>
> I have run things in cgroups to measure peak memory and CPU-time for
> things that have subprocesses. If I needed to keep a persistent fd open
> in order to reset the high watermark, it would have been far less useful.
>
> Honestly, I don't see a ton of value in tracking the peak memory if I
> can't reset it.
> It's not my use-case, but, there are a lot of cases where process-startup uses
> a lot more memory than the steady-state, so the sysadmin might want to
> measure that startup peak and any later peaks separately.
>
> In my use-case, I do have a long-lived process managing the cgroups
> for its workers, so I could keep an fd around and reset it as necessary.
> However, I do sometimes shell into the relevant k8s container and poke
> at the cgroups with a shell, and having to dup that managing processes'
> FD somehow to check the high watermark while debugging would be
> rather annoying. (although definitely not a dealbreaker)
>
>> You have to decide whether the above is worth implementing. But my
>> take is that the downsides of the simpler solution outweigh its
>> benefits.
> There are a few parts to my reticence to implement something
> more complicated.
>   1) Correctly cleaning up when one of those FDs gets closed can
>       be subtle
>   2) It's a lot of code, in some very sensitive portions of the kernel,
>       so I'd need to test that code a lot more than I do for slapping
>       a new entrypoint on the existing watermark reset of the
>       page_counter.
>   3) For various reasons, the relevant workload runs on
>       Google Kubernetes Engine with their Container Optimised OS.
>       If the patch is simple enough, I can request that Google
>       cherry-pick the relevant commit, so we don't have to wait
>       over a year for the next LTS kernel to roll out before we
>       can switch to cgroups v2.
>
> It would be a nice personal challenge to implement the solution
> you suggest, but it's definitely not something I'd knock out in the
> next couple days.

How about letting .peak shows two numbers? The first one is the peak 
since the creation of the cgroup and cannot be reset. The second one is 
a local maximum that can be reset to 0. We just need to keep track of one 
more counter that should be simple enough to implement.

Cheers,
Longman
Tejun Heo July 18, 2024, 1:24 a.m. UTC | #28
On Wed, Jul 17, 2024 at 07:48:40PM -0400, Waiman Long wrote:
...
> How about letting .peak shows two numbers? The first one is the peak since
> the creation of the cgroup and cannot be reset. The second one is a local
> maximum that can be reset to 0. We just to keep track of one more counter
> that should be simple enough to implement.

What Johannes suggested seems to hit all the marks - it's efficient and
relatively simple, the overhead is only on the users of the facility, and
flexible in a straightforward manner. I have a hard time buying the argument
that it's more difficult to use - the benefit to cost ratio seems pretty
clear. Given that, I'm not sure why we'd want to add something fishy that
can lead to long-term problems.

Thanks.
Roman Gushchin July 18, 2024, 2:17 a.m. UTC | #29
On Wed, Jul 17, 2024 at 03:24:49PM -1000, Tejun Heo wrote:
> On Wed, Jul 17, 2024 at 07:48:40PM -0400, Waiman Long wrote:
> ...
> > How about letting .peak shows two numbers? The first one is the peak since
> > the creation of the cgroup and cannot be reset. The second one is a local
> > maximum that can be reset to 0. We just to keep track of one more counter
> > that should be simple enough to implement.
> 
> What Johannes suggested seems to hit all the marks - it's efficient and
> relatively simple, the overhead is only on the users of the facility, and
> flexible in a straightforward manner. I have a hard time buying the argument
> that it's more difficult to use - the benefit to cost ratio seems pretty
> clear. Given that, I'm not sure why we'd want to add something fishy that
> can lead to longterm problems.

+1 to that. I really like Johannes's suggestion.

Thanks
Waiman Long July 18, 2024, 2:22 a.m. UTC | #30
On 7/17/24 21:24, Tejun Heo wrote:
> On Wed, Jul 17, 2024 at 07:48:40PM -0400, Waiman Long wrote:
> ...
>> How about letting .peak shows two numbers? The first one is the peak since
>> the creation of the cgroup and cannot be reset. The second one is a local
>> maximum that can be reset to 0. We just to keep track of one more counter
>> that should be simple enough to implement.
> What Johannes suggested seems to hit all the marks - it's efficient and
> relatively simple, the overhead is only on the users of the facility, and
> flexible in a straightforward manner. I have a hard time buying the argument
> that it's more difficult to use - the benefit to cost ratio seems pretty
> clear. Given that, I'm not sure why we'd want to add something fishy that
> can lead to longterm problems.

On second thought, it is a change in the user interface that may break 
existing apps that use the peak file. So it is probably better to go 
with Johannes' suggestion.

Thanks,
Longman
Michal Hocko July 18, 2024, 7:21 a.m. UTC | #31
On Wed 17-07-24 16:44:53, Johannes Weiner wrote:
[...]
> The problem is that once global resetting is allowed, it makes the
> number reported in memory.peak unreliable for everyone. You just don't
> know, and can't tell, if somebody wrote to it recently. It's not too
> much of a leap to say this breaks the existing interface contract.

I do not remember any bug reports from v1 where there was a max usage
misreported because of uncoordinated value resetting. So while you are
right that this is theoretically possible I am not convinced this is a
real problem in practice.

On the other hand it seems there is a wider agreement this shouldn't be
added to v2 and I do respect that.
 
> You have to decide whether the above is worth implementing. But my
> take is that the downsides of the simpler solution outweigh its
> benefits.

While this seems quite elegant I am not convinced this is really worth
the additional code for a metric like peak memory consumption which is a
very limited metric in a presence of memory reclaim.

Thanks!
David Finkel July 18, 2024, 9:49 p.m. UTC | #32
On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > ...
> > > > This behavior is particularly useful for work scheduling systems that
> > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > opinions), these systems need to track the peak memory usage to compute
> > > > system/container fullness when binpacking workitems.
> >
> > Swap still has bad reps but there's nothing drastically worse about it than
> > page cache. ie. If you're under memory pressure, you get thrashing one way
> > or another. If there's no swap, the system is just memlocking anon memory
> > even when they are a lot colder than page cache, so I'm skeptical that no
> > swap + mostly anon + kernel OOM kills is a good strategy in general
> > especially given that the system behavior is not very predictable under OOM
> > conditions.
> >
> > > As mentioned down the email thread, I consider usefulness of peak value
> > > rather limited. It is misleading when memory is reclaimed. But
> > > fundamentally I do not oppose to unifying the write behavior to reset
> > > values.
> >
> > The removal of resets was intentional. The problem was that it wasn't clear
> > who owned those counters and there's no way of telling who reset what when.
> > It was easy to accidentally end up with multiple entities that think they
> > can get timed measurement by resetting.
> >
> > So, in general, I don't think this is a great idea. There are shortcomings
> > to how memory.peak behaves in that its meaningfulness quickly declines over
> > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > was that it was difficult to tell the memory usage of a short-lived cgroup.
> >
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> >
> > Johannes, what do you think?
>
> I'm also not a fan of the ability to reset globally.
>
> I seem to remember a scheme we discussed some time ago to do local
> state tracking without having the overhead in the page counter
> fastpath. The new data that needs to be tracked is a pc->local_peak
> (in the page_counter) and an fd->peak (in the watcher's file state).
>
> 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
>
> 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
>
> 3. If they write, set fd->peak = pc->local_peak = usage.
>
> 4. Usage grows.
>
> 5. They read(). A conventional reader has fd->peak == -1, so we return
>    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
>
> 6. Usage drops.
>
> 7. New watcher opens and writes. Bring up all existing watchers'
>    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
>    Then set the new fd->peak = pc->local_peak = current usage as in 3.
>
> 8. See 5. again for read() from each watcher.
>
> This way all fd's can arbitrarily start tracking new local peaks with
> write(). The operation in the charging fast path is cheap. The write()
> is O(existing_watchers), which seems reasonable. It's fully backward
> compatible with conventional open() + read() users.

I spent some time today attempting to implement this.
Here's a branch on github that compiles, and I think is close to what you
described, but is definitely still a WIP:

https://github.com/torvalds/linux/compare/master...dfinkel:linux:memcg2_memory_peak_fd_session

Since there seems to be significant agreement that this approach is better
long-term as a kernel interface, if that continues, I can factor out some of
the changes so it supports both memory.peak and memory.swap.peak,
fix the tests, and clean up any style issues tomorrow.

Also, If there are opinions on whether the cgroup_lock is a reasonable way
of handling this synchronization, or if I should add a more appropriate spinlock
or mutex onto either the pagecounter or the memcg, I'm all ears.

(I can mail the WIP change as-is if that's prefered to github)
Waiman Long July 19, 2024, 3:23 a.m. UTC | #33
On 7/18/24 17:49, David Finkel wrote:
> I spent some time today attempting to implement this.
> Here's a branch on github that compiles, and I think is close to what you
> described, but is definitely still a WIP:
>
> https://github.com/torvalds/linux/compare/master...dfinkel:linux:memcg2_memory_peak_fd_session
>
> Since there seems to be significant agreement that this approach is better
> long-term as a kernel interface, if that continues, I can factor out some of
> the changes so it supports both memory.peak and memory.swap.peak,
> fix the tests, and clean up any style issues tomorrow.
>
> Also, If there are opinions on whether the cgroup_lock is a reasonable way
> of handling this synchronization, or if I should add a more appropriate spinlock
> or mutex onto either the pagecounter or the memcg, I'm all ears.

cgroup_lock() should only be used by the cgroup core code, though there 
are exceptions.

You may or may not need lock protection depending on what data you want 
to protect and if there is any chance that concurrent race may screw 
thing up. If lock protection is really needed, add your own lock to 
protect the data. Since your critical sections seem to be pretty short, 
a regular spinlock will be enough.

Cheers,
Longman
David Finkel July 22, 2024, 3:18 p.m. UTC | #34
On Thu, Jul 18, 2024 at 11:23 PM Waiman Long <longman@redhat.com> wrote:
>
>
> On 7/18/24 17:49, David Finkel wrote:
> > I spent some time today attempting to implement this.
> > Here's a branch on github that compiles, and I think is close to what you
> > described, but is definitely still a WIP:
> >
> > https://github.com/torvalds/linux/compare/master...dfinkel:linux:memcg2_memory_peak_fd_session
> >
> > Since there seems to be significant agreement that this approach is better
> > long-term as a kernel interface, if that continues, I can factor out some of
> > the changes so it supports both memory.peak and memory.swap.peak,
> > fix the tests, and clean up any style issues tomorrow.
> >
> > Also, If there are opinions on whether the cgroup_lock is a reasonable way
> > of handling this synchronization, or if I should add a more appropriate spinlock
> > or mutex onto either the pagecounter or the memcg, I'm all ears.
>
> cgroup_lock() should only be used by the cgroup core code, though there
> are exceptions.
>
> You may or may not need lock protection depending on what data you want
> to protect and if there is any chance that concurrent race may screw
> thing up. If lock protection is really needed, add your own lock to
> protect the data. Since your critical sections seem to be pretty short,
> a regular spinlock will be enough.

Thanks, using cgroup_lock() created a deadlock, and using a spinlock
resolved that.

>
> Cheers,
> Longman
>
diff mbox series

Patch

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8fbb0519d556..201d8e5d9f82 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1322,11 +1322,13 @@  PAGE_SIZE multiple when read back.
 	reclaim induced by memory.reclaim.
 
   memory.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max memory usage recorded for the cgroup and its descendants since
+	either the creation of the cgroup or the most recent reset.
 
-	The max memory usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	Any non-empty write to this file resets it to the current memory usage.
+	All content written is completely ignored.
 
   memory.oom.group
 	A read-write single value file which exists on non-root
@@ -1652,11 +1654,13 @@  PAGE_SIZE multiple when read back.
 	Healthy workloads are not expected to reach this limit.
 
   memory.swap.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max swap usage recorded for the cgroup and its descendants since
+	the creation of the cgroup or the most recent reset.
 
-	The max swap usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	Any non-empty write to this file resets it to the current swap usage.
+	All content written is completely ignored.
 
   memory.swap.max
 	A read-write single value file which exists on non-root
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f2f1bb18c9c..abfa547615d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@ 
  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  */
 
+#include <linux/cgroup-defs.h>
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
@@ -6915,6 +6916,16 @@  static u64 memory_peak_read(struct cgroup_subsys_state *css,
 	return (u64)memcg->memory.watermark * PAGE_SIZE;
 }
 
+static ssize_t memory_peak_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	page_counter_reset_watermark(&memcg->memory);
+
+	return nbytes;
+}
+
 static int memory_min_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -7232,6 +7243,7 @@  static struct cftype memory_files[] = {
 		.name = "peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = memory_peak_read,
+		.write = memory_peak_write,
 	},
 	{
 		.name = "min",
@@ -8201,6 +8213,16 @@  static u64 swap_peak_read(struct cgroup_subsys_state *css,
 	return (u64)memcg->swap.watermark * PAGE_SIZE;
 }
 
+static ssize_t swap_peak_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	page_counter_reset_watermark(&memcg->swap);
+
+	return nbytes;
+}
+
 static int swap_high_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -8283,6 +8305,7 @@  static struct cftype swap_files[] = {
 		.name = "swap.peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = swap_peak_read,
+		.write = swap_peak_write,
 	},
 	{
 		.name = "swap.events",
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 41ae8047b889..681972de673b 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -161,12 +161,12 @@  static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
 /*
  * This test create a memory cgroup, allocates
  * some anonymous memory and some pagecache
- * and check memory.current and some memory.stat values.
+ * and checks memory.current, memory.peak, and some memory.stat values.
  */
-static int test_memcg_current(const char *root)
+static int test_memcg_current_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long current;
+	long current, peak, peak_reset;
 	char *memcg;
 
 	memcg = cg_name(root, "memcg_test");
@@ -180,12 +180,32 @@  static int test_memcg_current(const char *root)
 	if (current != 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak != 0)
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
+	peak_reset = cg_write(memcg, "memory.peak", "\n");
+	if (peak_reset != 0)
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak > MB(30))
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
@@ -817,13 +837,14 @@  static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
 
 /*
  * This test checks that memory.swap.max limits the amount of
- * anonymous memory which can be swapped out.
+ * anonymous memory which can be swapped out. Additionally, it verifies that
+ * memory.swap.peak reflects the high watermark and can be reset.
  */
-static int test_memcg_swap_max(const char *root)
+static int test_memcg_swap_max_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
 	char *memcg;
-	long max;
+	long max, peak;
 
 	if (!is_swap_enabled())
 		return KSFT_SKIP;
@@ -840,6 +861,12 @@  static int test_memcg_swap_max(const char *root)
 		goto cleanup;
 	}
 
+	if (cg_read_long(memcg, "memory.swap.peak"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak"))
+		goto cleanup;
+
 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
 		goto cleanup;
 
@@ -862,6 +889,27 @@  static int test_memcg_swap_max(const char *root)
 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.peak", "\n"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
+		goto cleanup;
+
+
+	if (cg_write(memcg, "memory.peak", "\n"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak"))
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
 		goto cleanup;
 
@@ -869,6 +917,14 @@  static int test_memcg_swap_max(const char *root)
 	if (max <= 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(19))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
@@ -1295,7 +1351,7 @@  struct memcg_test {
 	const char *name;
 } tests[] = {
 	T(test_memcg_subtree_control),
-	T(test_memcg_current),
+	T(test_memcg_current_peak),
 	T(test_memcg_min),
 	T(test_memcg_low),
 	T(test_memcg_high),
@@ -1303,7 +1359,7 @@  struct memcg_test {
 	T(test_memcg_max),
 	T(test_memcg_reclaim),
 	T(test_memcg_oom_events),
-	T(test_memcg_swap_max),
+	T(test_memcg_swap_max_peak),
 	T(test_memcg_sock),
 	T(test_memcg_oom_group_leaf_events),
 	T(test_memcg_oom_group_parent_events),