[V2] cgroup/rstat: Avoid thundering herd problem by kswapd across NUMA nodes

Message ID 171923011608.1500238.3591002573732683639.stgit@firesoul (mailing list archive)
State New
Series [V2] cgroup/rstat: Avoid thundering herd problem by kswapd across NUMA nodes

Commit Message

Jesper Dangaard Brouer June 24, 2024, 11:55 a.m. UTC
Avoid lock contention on the global cgroup rstat lock caused by kswapd
starting on all NUMA nodes simultaneously. At Cloudflare, we observed
massive issues due to kswapd and the specific mem_cgroup_flush_stats()
call inlined in shrink_node, which takes the rstat lock.

On our 12 NUMA node machines, each with a kswapd kthread per NUMA node,
we noted severe lock contention on the rstat lock. This contention
causes 12 CPUs to waste cycles spinning every time kswapd runs.
Fleet-wide stats (/proc/N/schedstat) for kthreads revealed that we are
burning an average of 20,000 CPU cores fleet-wide on kswapd, primarily
due to spinning on the rstat lock.

To help reviewers follow the code: when the Per-CPU-Pages (PCP) freelist
is empty, __alloc_pages_slowpath calls wake_all_kswapds(), causing all
kswapdN threads to wake up simultaneously. Each kswapd thread invokes
shrink_node (via balance_pgdat), which triggers the cgroup rstat flush
operation as part of its work. Waking all kswapd threads at the same time
thus results in kernel self-induced rstat lock contention. A detail worth
leveraging: balance_pgdat() has a NULL target_mem_cgroup, which causes
mem_cgroup_flush_stats() to flush with root_mem_cgroup.

To resolve the kswapd issue, we generalize the "stats_flush_ongoing"
concept to apply to all users of cgroup rstat, not just memcg. The
memcg-specific version of this concept was removed in commit 7d7ef0a4686a
("mm: memcg: restore subtree stats flushing"). If a root-cgroup rstat
flush is already in progress, the new flush is skipped. This is effective
because kswapd operates on the root tree, sufficiently mitigating the
thundering herd problem.

This lowers contention on the global rstat lock, although only for
root-cgroup flushes; flushing cgroup subtrees can still lead to lock
contention.
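
To make the skip idea concrete outside the kernel tree, below is a minimal
userspace model (a sketch only: the pthread setup, the usleep() stand-in
and the thread count are illustrative and not part of the patch; the
actual change is the small diff below):

/*
 * Minimal userspace model of the skip logic (illustrative only): one
 * thread wins the atomic exchange and performs the expensive "flush";
 * concurrent callers observe the flag and return early.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>

static atomic_int root_flush_ongoing;

static void expensive_flush(void)
{
	usleep(10000);	/* stand-in for cgroup_rstat_flush_locked() */
}

static void *flusher(void *arg)
{
	/* Skip if someone else is already flushing the whole tree. */
	if (atomic_load(&root_flush_ongoing))
		return NULL;

	/* Try to become the ongoing flusher; losers simply return. */
	if (atomic_exchange(&root_flush_ongoing, 1))
		return NULL;

	printf("thread %ld does the flush\n", (long)arg);
	expensive_flush();
	atomic_store(&root_flush_ongoing, 0);
	return NULL;
}

int main(void)
{
	pthread_t t[12];	/* models 12 kswapd threads on a 12-node machine */
	long i;

	for (i = 0; i < 12; i++)
		pthread_create(&t[i], NULL, flusher, (void *)i);
	for (i = 0; i < 12; i++)
		pthread_join(t[i], NULL);
	return 0;
}

Built with gcc -pthread, typically only one of the twelve threads performs
the expensive flush while the others return immediately, which is the
behaviour this patch aims for on the global rstat lock.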

Fixes: 7d7ef0a4686a ("mm: memcg: restore subtree stats flushing").
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
V1: https://lore.kernel.org/all/171898037079.1222367.13467317484793748519.stgit@firesoul/
RFC: https://lore.kernel.org/all/171895533185.1084853.3033751561302228252.stgit@firesoul/

 include/linux/cgroup.h |    5 +++++
 kernel/cgroup/rstat.c  |   25 +++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

Comments

Yosry Ahmed June 24, 2024, 12:46 p.m. UTC | #1
On Mon, Jun 24, 2024 at 4:55 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>
> Avoid lock contention on the global cgroup rstat lock caused by kswapd
> starting on all NUMA nodes simultaneously. At Cloudflare, we observed
> massive issues due to kswapd and the specific mem_cgroup_flush_stats()
> call inlined in shrink_node, which takes the rstat lock.
>
> On our 12 NUMA node machines, each with a kswapd kthread per NUMA node,
> we noted severe lock contention on the rstat lock. This contention
> causes 12 CPUs to waste cycles spinning every time kswapd runs.
> Fleet-wide stats (/proc/N/schedstat) for kthreads revealed that we are
> burning an average of 20,000 CPU cores fleet-wide on kswapd, primarily
> due to spinning on the rstat lock.
>
> To help reviewer follow code: When the Per-CPU-Pages (PCP) freelist is
> empty, __alloc_pages_slowpath calls wake_all_kswapds(), causing all
> kswapdN threads to wake up simultaneously. The kswapd thread invokes
> shrink_node (via balance_pgdat) triggering the cgroup rstat flush
> operation as part of its work. This results in kernel self-induced rstat
> lock contention by waking up all kswapd threads simultaneously.
> Leveraging this detail: balance_pgdat() have NULL value in
> target_mem_cgroup, this cause mem_cgroup_flush_stats() to do flush with
> root_mem_cgroup.
>
> To resolve the kswapd issue, we generalized the "stats_flush_ongoing"
> concept to apply to all users of cgroup rstat, not just memcg. This
> concept was originally reverted in commit 7d7ef0a4686a ("mm: memcg:
> restore subtree stats flushing"). If there is an ongoing rstat flush,
> limited to the root cgroup, the flush is skipped. This is effective as
> kswapd operates on the root tree, sufficiently mitigating the thundering
> herd problem.
>
> This lowers contention on the global rstat lock, although limited to the
> root cgroup. Flushing cgroup subtree's can still lead to lock contention.
>
> Fixes: 7d7ef0a4686a ("mm: memcg: restore subtree stats flushing").
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> ---
> V1: https://lore.kernel.org/all/171898037079.1222367.13467317484793748519.stgit@firesoul/
> RFC: https://lore.kernel.org/all/171895533185.1084853.3033751561302228252.stgit@firesoul/
>
>  include/linux/cgroup.h |    5 +++++
>  kernel/cgroup/rstat.c  |   25 +++++++++++++++++++++++++
>  2 files changed, 30 insertions(+)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 2150ca60394b..ad41cca5c3b6 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -499,6 +499,11 @@ static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
>         return NULL;
>  }
>
> +static inline bool cgroup_is_root(struct cgroup *cgrp)
> +{
> +       return cgroup_parent(cgrp) == NULL;
> +}
> +
>  /**
>   * cgroup_is_descendant - test ancestry
>   * @cgrp: the cgroup to be tested
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index fb8b49437573..2591840b6dc1 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -11,6 +11,7 @@
>
>  static DEFINE_SPINLOCK(cgroup_rstat_lock);
>  static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
> +static atomic_t root_rstat_flush_ongoing = ATOMIC_INIT(0);
>
>  static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>
> @@ -350,8 +351,25 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
>  {
>         might_sleep();
>
> +       /*
> +        * This avoids thundering herd problem on global rstat lock. When an
> +        * ongoing flush of the entire tree is in progress, then skip flush.
> +        */
> +       if (atomic_read(&root_rstat_flush_ongoing))
> +               return;
> +
> +       /* Grab right to be ongoing flusher, return if loosing race */
> +       if (cgroup_is_root(cgrp) &&
> +           atomic_xchg(&root_rstat_flush_ongoing, 1))
> +               return;
> +

I am assuming this supersedes your other patch titled "[PATCH RFC]
cgroup/rstat: avoid thundering herd problem on root cgrp", so I will
only respond here.

I have two comments:
- There is no reason why this should be limited to the root cgroup. We
can keep track of the cgroup being flushed, and use
cgroup_is_descendant() to find out if the cgroup we want to flush is a
descendant of it. We can use a pointer and cmpxchg primitives instead
of the atomic here IIUC (see the rough sketch further below).

- More importantly, I am not a fan of skipping the flush if there is
an ongoing one. For all we know, the ongoing flush could have just
started and the stats have not been flushed yet. This is another
example of non-deterministic behavior that could be difficult to
debug.

I tried a similar approach before where we sleep and wait for the
ongoing flush to complete instead, without contending on the lock,
using completions [1]. Although that patch has a lot of complexity, I
think just using completions to wait for the ongoing flush may be the
right way to go, assuming it also helps with the problem you are
facing.

[1]https://lore.kernel.org/lkml/20230913073846.1528938-4-yosryahmed@google.com/
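
A rough, hypothetical userspace sketch of the descendant-tracking idea
from the first comment above (the struct, the names and the plain-C
atomics are purely illustrative, not a proposed kernel patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct cg {
	struct cg *parent;
};

/* Pointer to the cgroup whose subtree is currently being flushed. */
static struct cg *_Atomic ongoing_flusher;

/* Walk the parent chain; a cgroup counts as its own descendant. */
static bool cg_is_descendant(struct cg *cg, struct cg *ancestor)
{
	for (; cg; cg = cg->parent)
		if (cg == ancestor)
			return true;
	return false;
}

static void do_flush(struct cg *cg)
{
	/* stand-in for taking cgroup_rstat_lock and flushing cg's subtree */
	(void)cg;
}

static void flush(struct cg *cg)
{
	struct cg *cur = atomic_load(&ongoing_flusher);
	struct cg *expected = NULL;

	/* An ongoing flush of an ancestor already covers this subtree. */
	if (cur && cg_is_descendant(cg, cur))
		return;		/* or wait for that flush to finish instead */

	if (atomic_compare_exchange_strong(&ongoing_flusher, &expected, cg)) {
		do_flush(cg);
		atomic_store(&ongoing_flusher, NULL);
	} else {
		/* Raced with an unrelated flusher: flush without publishing. */
		do_flush(cg);
	}
}

int main(void)
{
	struct cg root = { .parent = NULL };
	struct cg child = { .parent = &root };

	flush(&root);
	flush(&child);
	return 0;
}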

>         __cgroup_rstat_lock(cgrp, -1);
> +
>         cgroup_rstat_flush_locked(cgrp);
> +
> +       if (cgroup_is_root(cgrp))
> +               atomic_set(&root_rstat_flush_ongoing, 0);
> +
>         __cgroup_rstat_unlock(cgrp, -1);
>  }
>
> @@ -362,13 +380,20 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
>   * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
>   * paired with cgroup_rstat_flush_release().
>   *
> + * Current invariant, not called with root cgrp.
> + *
>   * This function may block.
>   */
>  void cgroup_rstat_flush_hold(struct cgroup *cgrp)
>         __acquires(&cgroup_rstat_lock)
>  {
>         might_sleep();
> +
>         __cgroup_rstat_lock(cgrp, -1);
> +
> +       if (atomic_read(&root_rstat_flush_ongoing))
> +               return;
> +
>         cgroup_rstat_flush_locked(cgrp);
>  }
>
>
>
Shakeel Butt June 24, 2024, 5:32 p.m. UTC | #2
On Mon, Jun 24, 2024 at 05:46:05AM GMT, Yosry Ahmed wrote:
> On Mon, Jun 24, 2024 at 4:55 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> 
[...]
> I am assuming this supersedes your other patch titled "[PATCH RFC]
> cgroup/rstat: avoid thundering herd problem on root cgrp", so I will
> only respond here.
> 
> I have two comments:
> - There is no reason why this should be limited to the root cgroup. We
> can keep track of the cgroup being flushed, and use
> cgroup_is_descendant() to find out if the cgroup we want to flush is a
> descendant of it. We can use a pointer and cmpxchg primitives instead
> of the atomic here IIUC.
> 
> - More importantly, I am not a fan of skipping the flush if there is
> an ongoing one. For all we know, the ongoing flush could have just
> started and the stats have not been flushed yet. This is another
> example of non deterministic behavior that could be difficult to
> debug.

Even with the flush, there will almost always be per-cpu updates which
will be missed. This cannot be fixed unless we block the stats updaters as
well (which is not going to happen). So, we are already ok with this
level of non-determinism. Why would skipping the flush be worse? One may
argue 'the time window is smaller', but this still does not cap the amount
of updates. So, unless there is concrete data that skipping the flush
is detrimental to the users of stats, I don't see an issue in the
presence of the periodic flusher.

> 
> I tried a similar approach before where we sleep and wait for the
> ongoing flush to complete instead, without contending on the lock,
> using completions [1]. Although that patch has a lot of complexity,

We can definitely add complexity but only if there are no simple good
enough mitigations.
Yosry Ahmed June 24, 2024, 5:40 p.m. UTC | #3
On Mon, Jun 24, 2024 at 10:32 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Mon, Jun 24, 2024 at 05:46:05AM GMT, Yosry Ahmed wrote:
> > On Mon, Jun 24, 2024 at 4:55 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> >
> [...]
> > I am assuming this supersedes your other patch titled "[PATCH RFC]
> > cgroup/rstat: avoid thundering herd problem on root cgrp", so I will
> > only respond here.
> >
> > I have two comments:
> > - There is no reason why this should be limited to the root cgroup. We
> > can keep track of the cgroup being flushed, and use
> > cgroup_is_descendant() to find out if the cgroup we want to flush is a
> > descendant of it. We can use a pointer and cmpxchg primitives instead
> > of the atomic here IIUC.
> >
> > - More importantly, I am not a fan of skipping the flush if there is
> > an ongoing one. For all we know, the ongoing flush could have just
> > started and the stats have not been flushed yet. This is another
> > example of non deterministic behavior that could be difficult to
> > debug.
>
> Even with the flush, there will almost always per-cpu updates which will
> be missed. This can not be fixed unless we block the stats updaters as
> well (which is not going to happen). So, we are already ok with this
> level of non-determinism. Why skipping flushing would be worse? One may
> argue 'time window is smaller' but this still does not cap the amount of
> updates. So, unless there is concrete data that this skipping flushing
> is detrimental to the users of stats, I don't see an issue in the
> presense of periodic flusher.

As you mentioned, the updates that happen during the flush are
unavoidable anyway, and the window is small. On the other hand, we
should be able to maintain the current behavior that at least all the
stat updates that happened *before* the call to cgroup_rstat_flush()
are flushed after the call.

The main concern here is that the stats read *after* an event occurs
should reflect the system state at that time. For example, a proactive
reclaimer reading the stats after writing to memory.reclaim should
observe the system state after the reclaim operation happened.

Please see [1] for more details about why this is important, which was
the rationale for removing stats_flush_ongoing in the first place.

[1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/

>
> >
> > I tried a similar approach before where we sleep and wait for the
> > ongoing flush to complete instead, without contending on the lock,
> > using completions [1]. Although that patch has a lot of complexity,
>
> We can definitely add complexity but only if there are no simple good
> enough mitigations.

I agree that my patch was complicated. I am hoping we can have a
simpler version here that just waits for ongoing flushers if the
cgroup is a descendant of the cgroup already being flushed.
Shakeel Butt June 24, 2024, 7:29 p.m. UTC | #4
On Mon, Jun 24, 2024 at 10:40:48AM GMT, Yosry Ahmed wrote:
> On Mon, Jun 24, 2024 at 10:32 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Mon, Jun 24, 2024 at 05:46:05AM GMT, Yosry Ahmed wrote:
> > > On Mon, Jun 24, 2024 at 4:55 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> > >
> > [...]
> > > I am assuming this supersedes your other patch titled "[PATCH RFC]
> > > cgroup/rstat: avoid thundering herd problem on root cgrp", so I will
> > > only respond here.
> > >
> > > I have two comments:
> > > - There is no reason why this should be limited to the root cgroup. We
> > > can keep track of the cgroup being flushed, and use
> > > cgroup_is_descendant() to find out if the cgroup we want to flush is a
> > > descendant of it. We can use a pointer and cmpxchg primitives instead
> > > of the atomic here IIUC.
> > >
> > > - More importantly, I am not a fan of skipping the flush if there is
> > > an ongoing one. For all we know, the ongoing flush could have just
> > > started and the stats have not been flushed yet. This is another
> > > example of non deterministic behavior that could be difficult to
> > > debug.
> >
> > Even with the flush, there will almost always per-cpu updates which will
> > be missed. This can not be fixed unless we block the stats updaters as
> > well (which is not going to happen). So, we are already ok with this
> > level of non-determinism. Why skipping flushing would be worse? One may
> > argue 'time window is smaller' but this still does not cap the amount of
> > updates. So, unless there is concrete data that this skipping flushing
> > is detrimental to the users of stats, I don't see an issue in the
> > presense of periodic flusher.
> 
> As you mentioned, the updates that happen during the flush are
> unavoidable anyway, and the window is small. On the other hand, we
> should be able to maintain the current behavior that at least all the
> stat updates that happened *before* the call to cgroup_rstat_flush()
> are flushed after the call.
> 
> The main concern here is that the stats read *after* an event occurs
> should reflect the system state at that time. For example, a proactive
> reclaimer reading the stats after writing to memory.reclaim should
> observe the system state after the reclaim operation happened.

What about the in-kernel users like kswapd? I don't see any before or
after events for the in-kernel users.

> 
> Please see [1] for more details about why this is important, which was
> the rationale for removing stats_flush_ongoing in the first place.
> 
> [1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/
>
Yosry Ahmed June 24, 2024, 7:37 p.m. UTC | #5
On Mon, Jun 24, 2024 at 12:29 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Mon, Jun 24, 2024 at 10:40:48AM GMT, Yosry Ahmed wrote:
> > On Mon, Jun 24, 2024 at 10:32 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Mon, Jun 24, 2024 at 05:46:05AM GMT, Yosry Ahmed wrote:
> > > > On Mon, Jun 24, 2024 at 4:55 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> > > >
> > > [...]
> > > > I am assuming this supersedes your other patch titled "[PATCH RFC]
> > > > cgroup/rstat: avoid thundering herd problem on root cgrp", so I will
> > > > only respond here.
> > > >
> > > > I have two comments:
> > > > - There is no reason why this should be limited to the root cgroup. We
> > > > can keep track of the cgroup being flushed, and use
> > > > cgroup_is_descendant() to find out if the cgroup we want to flush is a
> > > > descendant of it. We can use a pointer and cmpxchg primitives instead
> > > > of the atomic here IIUC.
> > > >
> > > > - More importantly, I am not a fan of skipping the flush if there is
> > > > an ongoing one. For all we know, the ongoing flush could have just
> > > > started and the stats have not been flushed yet. This is another
> > > > example of non deterministic behavior that could be difficult to
> > > > debug.
> > >
> > > Even with the flush, there will almost always per-cpu updates which will
> > > be missed. This can not be fixed unless we block the stats updaters as
> > > well (which is not going to happen). So, we are already ok with this
> > > level of non-determinism. Why skipping flushing would be worse? One may
> > > argue 'time window is smaller' but this still does not cap the amount of
> > > updates. So, unless there is concrete data that this skipping flushing
> > > is detrimental to the users of stats, I don't see an issue in the
> > > presense of periodic flusher.
> >
> > As you mentioned, the updates that happen during the flush are
> > unavoidable anyway, and the window is small. On the other hand, we
> > should be able to maintain the current behavior that at least all the
> > stat updates that happened *before* the call to cgroup_rstat_flush()
> > are flushed after the call.
> >
> > The main concern here is that the stats read *after* an event occurs
> > should reflect the system state at that time. For example, a proactive
> > reclaimer reading the stats after writing to memory.reclaim should
> > observe the system state after the reclaim operation happened.
>
> What about the in-kernel users like kswapd? I don't see any before or
> after events for the in-kernel users.

The example I can think of off the top of my head is the cache trim
mode scenario I mentioned when discussing your patch (i.e. not
realizing that file memory had already been reclaimed). There is also
a heuristic in zswap that may write back more (or fewer) pages than it
should to the swap device if the stats are significantly stale.

I did not take a closer look to find more examples, but I think we
need to respect this condition at least for userspace readers.
Shakeel Butt June 24, 2024, 8:18 p.m. UTC | #6
On Mon, Jun 24, 2024 at 12:37:30PM GMT, Yosry Ahmed wrote:
> On Mon, Jun 24, 2024 at 12:29 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Mon, Jun 24, 2024 at 10:40:48AM GMT, Yosry Ahmed wrote:
> > > On Mon, Jun 24, 2024 at 10:32 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > >
> > > > On Mon, Jun 24, 2024 at 05:46:05AM GMT, Yosry Ahmed wrote:
> > > > > On Mon, Jun 24, 2024 at 4:55 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> > > > >
> > > > [...]
> > > > > I am assuming this supersedes your other patch titled "[PATCH RFC]
> > > > > cgroup/rstat: avoid thundering herd problem on root cgrp", so I will
> > > > > only respond here.
> > > > >
> > > > > I have two comments:
> > > > > - There is no reason why this should be limited to the root cgroup. We
> > > > > can keep track of the cgroup being flushed, and use
> > > > > cgroup_is_descendant() to find out if the cgroup we want to flush is a
> > > > > descendant of it. We can use a pointer and cmpxchg primitives instead
> > > > > of the atomic here IIUC.
> > > > >
> > > > > - More importantly, I am not a fan of skipping the flush if there is
> > > > > an ongoing one. For all we know, the ongoing flush could have just
> > > > > started and the stats have not been flushed yet. This is another
> > > > > example of non deterministic behavior that could be difficult to
> > > > > debug.
> > > >
> > > > Even with the flush, there will almost always per-cpu updates which will
> > > > be missed. This can not be fixed unless we block the stats updaters as
> > > > well (which is not going to happen). So, we are already ok with this
> > > > level of non-determinism. Why skipping flushing would be worse? One may
> > > > argue 'time window is smaller' but this still does not cap the amount of
> > > > updates. So, unless there is concrete data that this skipping flushing
> > > > is detrimental to the users of stats, I don't see an issue in the
> > > > presense of periodic flusher.
> > >
> > > As you mentioned, the updates that happen during the flush are
> > > unavoidable anyway, and the window is small. On the other hand, we
> > > should be able to maintain the current behavior that at least all the
> > > stat updates that happened *before* the call to cgroup_rstat_flush()
> > > are flushed after the call.
> > >
> > > The main concern here is that the stats read *after* an event occurs
> > > should reflect the system state at that time. For example, a proactive
> > > reclaimer reading the stats after writing to memory.reclaim should
> > > observe the system state after the reclaim operation happened.
> >
> > What about the in-kernel users like kswapd? I don't see any before or
> > after events for the in-kernel users.
> 
> The example I can think of off the top of my head is the cache trim
> mode scenario I mentioned when discussing your patch (i.e. not
> realizing that file memory had already been reclaimed).

Kswapd has a kind of cache trim failure mode where it decides to skip the
cache trim heuristic. Also, for global reclaim there are a couple more
conditions in play as well.

> There is also
> a heuristic in zswap that may writeback more (or less) pages that it
> should to the swap device if the stats are significantly stale.
> 

Is this the ratio of MEMCG_ZSWAP_B and MEMCG_ZSWAPPED in
zswap_shrinker_count()? There is already a target memcg flush in that
function, and I don't expect a root memcg flush from there.

> I did not take a closer look to find more examples, but I think we
> need to respect this condition at least for userspace readers.
Yosry Ahmed June 24, 2024, 9:43 p.m. UTC | #7
On Mon, Jun 24, 2024 at 1:18 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Mon, Jun 24, 2024 at 12:37:30PM GMT, Yosry Ahmed wrote:
> > On Mon, Jun 24, 2024 at 12:29 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Mon, Jun 24, 2024 at 10:40:48AM GMT, Yosry Ahmed wrote:
> > > > On Mon, Jun 24, 2024 at 10:32 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > > >
> > > > > On Mon, Jun 24, 2024 at 05:46:05AM GMT, Yosry Ahmed wrote:
> > > > > > On Mon, Jun 24, 2024 at 4:55 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> > > > > >
> > > > > [...]
> > > > > > I am assuming this supersedes your other patch titled "[PATCH RFC]
> > > > > > cgroup/rstat: avoid thundering herd problem on root cgrp", so I will
> > > > > > only respond here.
> > > > > >
> > > > > > I have two comments:
> > > > > > - There is no reason why this should be limited to the root cgroup. We
> > > > > > can keep track of the cgroup being flushed, and use
> > > > > > cgroup_is_descendant() to find out if the cgroup we want to flush is a
> > > > > > descendant of it. We can use a pointer and cmpxchg primitives instead
> > > > > > of the atomic here IIUC.
> > > > > >
> > > > > > - More importantly, I am not a fan of skipping the flush if there is
> > > > > > an ongoing one. For all we know, the ongoing flush could have just
> > > > > > started and the stats have not been flushed yet. This is another
> > > > > > example of non deterministic behavior that could be difficult to
> > > > > > debug.
> > > > >
> > > > > Even with the flush, there will almost always per-cpu updates which will
> > > > > be missed. This can not be fixed unless we block the stats updaters as
> > > > > well (which is not going to happen). So, we are already ok with this
> > > > > level of non-determinism. Why skipping flushing would be worse? One may
> > > > > argue 'time window is smaller' but this still does not cap the amount of
> > > > > updates. So, unless there is concrete data that this skipping flushing
> > > > > is detrimental to the users of stats, I don't see an issue in the
> > > > > presense of periodic flusher.
> > > >
> > > > As you mentioned, the updates that happen during the flush are
> > > > unavoidable anyway, and the window is small. On the other hand, we
> > > > should be able to maintain the current behavior that at least all the
> > > > stat updates that happened *before* the call to cgroup_rstat_flush()
> > > > are flushed after the call.
> > > >
> > > > The main concern here is that the stats read *after* an event occurs
> > > > should reflect the system state at that time. For example, a proactive
> > > > reclaimer reading the stats after writing to memory.reclaim should
> > > > observe the system state after the reclaim operation happened.
> > >
> > > What about the in-kernel users like kswapd? I don't see any before or
> > > after events for the in-kernel users.
> >
> > The example I can think of off the top of my head is the cache trim
> > mode scenario I mentioned when discussing your patch (i.e. not
> > realizing that file memory had already been reclaimed).
>
> Kswapd has some kind of cache trim failure mode where it decides to skip
> cache trim heuristic. Also for global reclaim there are couple more
> condition in play as well.

I was mostly concerned about entering cache trim mode when we
shouldn't, not vice versa, as I explained in the other thread. Anyway,
I think the problem of the stats missing recent updates is more
pronounced with userspace reads.

>
> > There is also
> > a heuristic in zswap that may writeback more (or less) pages that it
> > should to the swap device if the stats are significantly stale.
> >
>
> Is this the ratio of MEMCG_ZSWAP_B and MEMCG_ZSWAPPED in
> zswap_shrinker_count()? There is already a target memcg flush in that
> function and I don't expect root memcg flush from there.

I was thinking of the generic approach I suggested, where we can avoid
contending on the lock if the cgroup is a descendant of the cgroup
being flushed, regardless of whether or not it's the root memcg. I
think this would be more beneficial than just focusing on root
flushes.
Shakeel Butt June 24, 2024, 10:17 p.m. UTC | #8
On Mon, Jun 24, 2024 at 02:43:02PM GMT, Yosry Ahmed wrote:
[...]
> >
> > > There is also
> > > a heuristic in zswap that may writeback more (or less) pages that it
> > > should to the swap device if the stats are significantly stale.
> > >
> >
> > Is this the ratio of MEMCG_ZSWAP_B and MEMCG_ZSWAPPED in
> > zswap_shrinker_count()? There is already a target memcg flush in that
> > function and I don't expect root memcg flush from there.
> 
> I was thinking of the generic approach I suggested, where we can avoid
> contending on the lock if the cgroup is a descendant of the cgroup
> being flushed, regardless of whether or not it's the root memcg. I
> think this would be more beneficial than just focusing on root
> flushes.

Yes I agree with this but what about skipping the flush in this case?
Are you ok with that?
Shakeel Butt June 25, 2024, 12:24 a.m. UTC | #9
On Mon, Jun 24, 2024 at 03:21:22PM GMT, Yosry Ahmed wrote:
> On Mon, Jun 24, 2024 at 3:17 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Mon, Jun 24, 2024 at 02:43:02PM GMT, Yosry Ahmed wrote:
> > [...]
> > > >
> > > > > There is also
> > > > > a heuristic in zswap that may writeback more (or less) pages that it
> > > > > should to the swap device if the stats are significantly stale.
> > > > >
> > > >
> > > > Is this the ratio of MEMCG_ZSWAP_B and MEMCG_ZSWAPPED in
> > > > zswap_shrinker_count()? There is already a target memcg flush in that
> > > > function and I don't expect root memcg flush from there.
> > >
> > > I was thinking of the generic approach I suggested, where we can avoid
> > > contending on the lock if the cgroup is a descendant of the cgroup
> > > being flushed, regardless of whether or not it's the root memcg. I
> > > think this would be more beneficial than just focusing on root
> > > flushes.
> >
> > Yes I agree with this but what about skipping the flush in this case?
> > Are you ok with that?
> 
> Sorry if I am confused, but IIUC this patch affects all root flushes,
> even for userspace reads, right? In this case I think it's not okay to
> skip the flush without waiting for the ongoing flush.

So, we differentiate between userspace and in-kernel users. For
userspace, we should not skip the flush, and for in-kernel users we can
skip it if the memcg being flushed is an ancestor of the given memcg. Is
that what you are saying?
Jesper Dangaard Brouer June 25, 2024, 3:32 p.m. UTC | #10
On 25/06/2024 11.28, Yosry Ahmed wrote:
> On Mon, Jun 24, 2024 at 5:24 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>>
>> On Mon, Jun 24, 2024 at 03:21:22PM GMT, Yosry Ahmed wrote:
>>> On Mon, Jun 24, 2024 at 3:17 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>>>>
>>>> On Mon, Jun 24, 2024 at 02:43:02PM GMT, Yosry Ahmed wrote:
>>>> [...]
>>>>>>
>>>>>>> There is also
>>>>>>> a heuristic in zswap that may writeback more (or less) pages that it
>>>>>>> should to the swap device if the stats are significantly stale.
>>>>>>>
>>>>>>
>>>>>> Is this the ratio of MEMCG_ZSWAP_B and MEMCG_ZSWAPPED in
>>>>>> zswap_shrinker_count()? There is already a target memcg flush in that
>>>>>> function and I don't expect root memcg flush from there.
>>>>>
>>>>> I was thinking of the generic approach I suggested, where we can avoid
>>>>> contending on the lock if the cgroup is a descendant of the cgroup
>>>>> being flushed, regardless of whether or not it's the root memcg. I
>>>>> think this would be more beneficial than just focusing on root
>>>>> flushes.
>>>>
>>>> Yes I agree with this but what about skipping the flush in this case?
>>>> Are you ok with that?
>>>
>>> Sorry if I am confused, but IIUC this patch affects all root flushes,
>>> even for userspace reads, right? In this case I think it's not okay to
>>> skip the flush without waiting for the ongoing flush.
>>
>> So, we differentiate between userspace and in-kernel users. For
>> userspace, we should not skip flush and for in-kernel users, we can skip
>> if flushing memcg is the ancestor of the given memcg. Is that what you
>> are saying?
> 
> Basically, I prefer that we don't skip flushing at all and keep
> userspace and in-kernel users the same. We can use completions to make
> other overlapping flushers sleep instead of spin on the lock.
> 

I think there are good reasons for skipping flushes for userspace when 
reading these stats. More below.

I'm looking at kernel code to spot cases where the flush MUST be
completed before returning.  There are clearly cases where we don't need
100% accurate stats, as evidenced by mem_cgroup_flush_stats_ratelimited()
and mem_cgroup_flush_stats(), which use memcg_vmstats_needs_flush().

The cgroup_rstat_exit() call seems to depend on cgroup_rstat_flush()
being strict/accurate, because it needs to free the percpu resources.

The obj_cgroup_may_zswap() function has a comment that says it needs to
get accurate stats for charging.

These were the two cases I found; do you know of others?


> A proof of concept is basically something like:
> 
> void cgroup_rstat_flush(cgroup)
> {
>      if (cgroup_is_descendant(cgroup, READ_ONCE(cgroup_under_flush))) {
>          wait_for_completion_interruptible(&cgroup_under_flush->completion);
>          return;
>      }

This feels like what we would achieve by changing this to a mutex.

> 
>      __cgroup_rstat_lock(cgrp, -1);
>      reinit_completion(&cgroup->completion);
>      /* Any overlapping flush requests after this write will not spin
> on the lock */
>      WRITE_ONCE(cgroup_under_flush, cgroup);
> 
>      cgroup_rstat_flush_locked(cgrp);
>      complete_all(&cgroup->completion);
>      __cgroup_rstat_unlock(cgrp, -1);
> }
> 
> There may be missing barriers or chances to reduce the window between
> __cgroup_rstat_lock and WRITE_ONCE(), but that's what I have in mind.
> I think it's not too complicated, but we need to check if it fixes the
> problem.
> 
> If this is not preferable, then yeah, let's at least keep the
> userspace behavior intact. This makes sure we don't affect userspace
> negatively, and we can change it later as we please.

I don't think userspace reading these stats needs to be 100% accurate.
We are only reading io.stat, memory.stat and cpu.stat every 53
seconds. Reading cpu.stat prints stats divided by NSEC_PER_USEC (1000).

If userspace is reading these very often, then it will be killing the
system, as flushing disables IRQs.

On my prod system the flush of the root cgroup can take 35 ms, which is
not good, but this inaccuracy should not matter for userspace.

Please educate me on why we need accurate userspace stats?


--Jesper
Yosry Ahmed June 25, 2024, 4 p.m. UTC | #11
[..]
> >
> > Basically, I prefer that we don't skip flushing at all and keep
> > userspace and in-kernel users the same. We can use completions to make
> > other overlapping flushers sleep instead of spin on the lock.
> >
>
> I think there are good reasons for skipping flushes for userspace when
> reading these stats. More below.
>
> I'm looking at kernel code to spot cases where the flush MUST to be
> completed before returning.  There are clearly cases where we don't need
> 100% accurate stats, evident by mem_cgroup_flush_stats_ratelimited() and
> mem_cgroup_flush_stats() that use memcg_vmstats_needs_flush().
>
> The cgroup_rstat_exit() call seems to depend on cgroup_rstat_flush()
> being strict/accurate, because need to free the percpu resources.

Yeah I think this one cannot be skipped.

>
> The obj_cgroup_may_zswap() have a comments that says it needs to get
> accurate stats for charging.

This one needs to be somewhat accurate to respect memcg limits. I am
not sure how much inaccuracy we can tolerate.

>
> These were the two cases, I found, do you know of others?

Nothing that screamed at me, but as I mentioned, the non-deterministic
nature of this makes me uncomfortable and feels to me like a potential
way to get subtle bugs.

>
>
> > A proof of concept is basically something like:
> >
> > void cgroup_rstat_flush(cgroup)
> > {
> >      if (cgroup_is_descendant(cgroup, READ_ONCE(cgroup_under_flush))) {
> >          wait_for_completion_interruptible(&cgroup_under_flush->completion);
> >          return;
> >      }
>
> This feels like what we would achieve by changing this to a mutex.

The main difference is that whoever is holding the lock still cannot
sleep, while waiters can (and more importantly, they don't disable
interrupts). This is essentially a middle ground between a mutex and a
spinlock. I think this dodges the priority inversion problem Shakeel
described, because a low priority job holding the lock cannot sleep.

Is there an existing locking primitive that can achieve this?
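
A toy userspace model of those semantics (purely illustrative; the kernel
side would use a completion rather than pthreads, and all names here are
made up): overlapping flushers sleep until the ongoing flush completes
instead of piling up on the lock.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t flush_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t flush_done = PTHREAD_COND_INITIALIZER;
static bool flush_ongoing;

static void do_flush(void)
{
	/* stand-in for cgroup_rstat_flush_locked() under cgroup_rstat_lock */
}

static void flush(void)
{
	pthread_mutex_lock(&flush_mutex);
	if (flush_ongoing) {
		/* Overlapping flushers sleep here until the flush completes. */
		while (flush_ongoing)
			pthread_cond_wait(&flush_done, &flush_mutex);
		pthread_mutex_unlock(&flush_mutex);
		return;
	}
	flush_ongoing = true;
	pthread_mutex_unlock(&flush_mutex);

	do_flush();	/* only the winning caller runs the expensive flush */

	pthread_mutex_lock(&flush_mutex);
	flush_ongoing = false;
	pthread_cond_broadcast(&flush_done);
	pthread_mutex_unlock(&flush_mutex);
}

int main(void)
{
	flush();
	return 0;
}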

>
> >
> >      __cgroup_rstat_lock(cgrp, -1);
> >      reinit_completion(&cgroup->completion);
> >      /* Any overlapping flush requests after this write will not spin
> > on the lock */
> >      WRITE_ONCE(cgroup_under_flush, cgroup);
> >
> >      cgroup_rstat_flush_locked(cgrp);
> >      complete_all(&cgroup->completion);
> >      __cgroup_rstat_unlock(cgrp, -1);
> > }
> >
> > There may be missing barriers or chances to reduce the window between
> > __cgroup_rstat_lock and WRITE_ONCE(), but that's what I have in mind.
> > I think it's not too complicated, but we need to check if it fixes the
> > problem.
> >
> > If this is not preferable, then yeah, let's at least keep the
> > userspace behavior intact. This makes sure we don't affect userspace
> > negatively, and we can change it later as we please.
>
> I don't think userspace reading these stats need to be 100% accurate.
> We are only reading the io.stat, memory.stat and cpu.stat every 53
> seconds. Reading cpu.stat print stats divided by NSEC_PER_USEC (1000).
>
> If userspace is reading these very often, then they will be killing the
> system as it disables IRQs.
>
> On my prod system the flush of root cgroup can take 35 ms, which is not
> good, but this inaccuracy should not matter for userspace.
>
> Please educate me on why we need accurate userspace stats?

My point is not about accuracy, although I think it's a reasonable
argument on its own (a lot of things could change in a short amount of
time, which is why I prefer magnitude-based ratelimiting).

My point is about logical ordering. If a userspace program reads the
stats *after* an event occurs, it expects to get a snapshot of the
system state after that event. Two examples are:

- A proactive reclaimer reading the stats after a reclaim attempt to
check if it needs to reclaim more memory or fallback.
- A userspace OOM killer reading the stats after a usage spike to
decide which workload to kill.

I listed such examples with more detail in [1], when I removed
stats_flush_ongoing from the memcg code.

[1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/
Shakeel Butt June 25, 2024, 4:21 p.m. UTC | #12
On Tue, Jun 25, 2024 at 09:00:03AM GMT, Yosry Ahmed wrote:
[...]
> 
> My point is not about accuracy, although I think it's a reasonable
> argument on its own (a lot of things could change in a short amount of
> time, which is why I prefer magnitude-based ratelimiting).
> 
> My point is about logical ordering. If a userspace program reads the
> stats *after* an event occurs, it expects to get a snapshot of the
> system state after that event. Two examples are:
> 
> - A proactive reclaimer reading the stats after a reclaim attempt to
> check if it needs to reclaim more memory or fallback.
> - A userspace OOM killer reading the stats after a usage spike to
> decide which workload to kill.
> 
> I listed such examples with more detail in [1], when I removed
> stats_flush_ongoing from the memcg code.
> 
> [1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/

You are kind of arbitrarily adding restrictions and rules here. Why not
follow the rules of a well-established and battle-tested stats infra
used by everyone, i.e. vmstats? There is no sync flush and there are
frequent async flushes. I think that is what Jesper wants as well.

Shakeel
Yosry Ahmed June 25, 2024, 8:45 p.m. UTC | #13
On Tue, Jun 25, 2024 at 9:21 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Jun 25, 2024 at 09:00:03AM GMT, Yosry Ahmed wrote:
> [...]
> >
> > My point is not about accuracy, although I think it's a reasonable
> > argument on its own (a lot of things could change in a short amount of
> > time, which is why I prefer magnitude-based ratelimiting).
> >
> > My point is about logical ordering. If a userspace program reads the
> > stats *after* an event occurs, it expects to get a snapshot of the
> > system state after that event. Two examples are:
> >
> > - A proactive reclaimer reading the stats after a reclaim attempt to
> > check if it needs to reclaim more memory or fallback.
> > - A userspace OOM killer reading the stats after a usage spike to
> > decide which workload to kill.
> >
> > I listed such examples with more detail in [1], when I removed
> > stats_flush_ongoing from the memcg code.
> >
> > [1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/
>
> You are kind of arbitrarily adding restrictions and rules here. Why not
> follow the rules of a well established and battle tested stats infra
> used by everyone i.e. vmstats? There is no sync flush and there are
> frequent async flushes. I think that is what Jesper wants as well.

That's how the memcg stats worked from before rstat until the
introduction of stats_flush_ongoing, AFAICT. We saw an actual
behavioral change when we were moving from a pre-rstat kernel to a
kernel with stats_flush_ongoing. This was the rationale when I removed
stats_flush_ongoing in [1]. It's not a new argument, I am just
reiterating what we discussed back then.

We saw an actual change in the proactive reclaimer as sometimes the
stats read after the reclaim attempt did not reflect the actual state
of the system. Sometimes the proactive reclaimer would back off when
it shouldn't, because it thinks it didn't reclaim memory when it
actually did.

Upon further investigation, we realized that this could also affect
the userspace OOM killer, because it uses the memcg stats to figure
out which memcg will free most memory if it was killed (by looking at
the anon stats, among others). If a memory usage spike occurs, and we
read stats from before the spike, we may kill the wrong memcg.

So as you said, we can experiment with in-kernel flushers, but let's
keep userspace flushing consistent.

Taking a step back, I just want to clarify that my arguments for the
flushing changes, whether it's in this patch or your ratelimiting
patch, are from a purely technical perspective. I am making
suggestions that I believe may be better. I am not trying to stop any
progress in this area or stand in the way. The only thing I really
don't want is affecting userspace flushers as I described above.

[1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/
Shakeel Butt June 25, 2024, 9:20 p.m. UTC | #14
On Tue, Jun 25, 2024 at 01:45:00PM GMT, Yosry Ahmed wrote:
> On Tue, Jun 25, 2024 at 9:21 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Tue, Jun 25, 2024 at 09:00:03AM GMT, Yosry Ahmed wrote:
> > [...]
> > >
> > > My point is not about accuracy, although I think it's a reasonable
> > > argument on its own (a lot of things could change in a short amount of
> > > time, which is why I prefer magnitude-based ratelimiting).
> > >
> > > My point is about logical ordering. If a userspace program reads the
> > > stats *after* an event occurs, it expects to get a snapshot of the
> > > system state after that event. Two examples are:
> > >
> > > - A proactive reclaimer reading the stats after a reclaim attempt to
> > > check if it needs to reclaim more memory or fallback.
> > > - A userspace OOM killer reading the stats after a usage spike to
> > > decide which workload to kill.
> > >
> > > I listed such examples with more detail in [1], when I removed
> > > stats_flush_ongoing from the memcg code.
> > >
> > > [1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/
> >
> > You are kind of arbitrarily adding restrictions and rules here. Why not
> > follow the rules of a well established and battle tested stats infra
> > used by everyone i.e. vmstats? There is no sync flush and there are
> > frequent async flushes. I think that is what Jesper wants as well.
> 
> That's how the memcg stats worked previously since before rstat and
> until the introduction of stats_flush_ongoing AFAICT. We saw an actual
> behavioral change when we were moving from a pre-rstat kernel to a
> kernel with stats_flush_ongoing. This was the rationale when I removed
> stats_flush_ongoing in [1]. It's not a new argument, I am just
> reiterating what we discussed back then.

In my reply above, I am not arguing to go back to the older
stats_flush_ongoing situation. Rather I am discussing what should be the
best eventual solution. From the vmstats infra, we can learn that with
frequent async flushes and no sync flush, users are fine with the
'non-determinism'. Of course cgroup stats are different from vmstats,
i.e. they are hierarchical, but I think we can try out this approach and
see if it works or not.

BTW it seems like this topic should be discussed face-to-face over VC or
at LPC. What do you folks think?

Shakeel
Yosry Ahmed June 25, 2024, 9:24 p.m. UTC | #15
On Tue, Jun 25, 2024 at 2:20 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Jun 25, 2024 at 01:45:00PM GMT, Yosry Ahmed wrote:
> > On Tue, Jun 25, 2024 at 9:21 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Tue, Jun 25, 2024 at 09:00:03AM GMT, Yosry Ahmed wrote:
> > > [...]
> > > >
> > > > My point is not about accuracy, although I think it's a reasonable
> > > > argument on its own (a lot of things could change in a short amount of
> > > > time, which is why I prefer magnitude-based ratelimiting).
> > > >
> > > > My point is about logical ordering. If a userspace program reads the
> > > > stats *after* an event occurs, it expects to get a snapshot of the
> > > > system state after that event. Two examples are:
> > > >
> > > > - A proactive reclaimer reading the stats after a reclaim attempt to
> > > > check if it needs to reclaim more memory or fallback.
> > > > - A userspace OOM killer reading the stats after a usage spike to
> > > > decide which workload to kill.
> > > >
> > > > I listed such examples with more detail in [1], when I removed
> > > > stats_flush_ongoing from the memcg code.
> > > >
> > > > [1]https://lore.kernel.org/lkml/20231129032154.3710765-6-yosryahmed@google.com/
> > >
> > > You are kind of arbitrarily adding restrictions and rules here. Why not
> > > follow the rules of a well established and battle tested stats infra
> > > used by everyone i.e. vmstats? There is no sync flush and there are
> > > frequent async flushes. I think that is what Jesper wants as well.
> >
> > That's how the memcg stats worked previously since before rstat and
> > until the introduction of stats_flush_ongoing AFAICT. We saw an actual
> > behavioral change when we were moving from a pre-rstat kernel to a
> > kernel with stats_flush_ongoing. This was the rationale when I removed
> > stats_flush_ongoing in [1]. It's not a new argument, I am just
> > reiterating what we discussed back then.
>
> In my reply above, I am not arguing to go back to the older
> stats_flush_ongoing situation. Rather I am discussing what should be the
> best eventual solution. From the vmstats infra, we can learn that
> frequent async flushes along with no sync flush, users are fine with the
> 'non-determinism'. Of course cgroup stats are different from vmstats
> i.e. are hierarchical but I think we can try out this approach and see
> if this works or not.

If we do not do sync flushing, then the same problem that happened
with stats_flush_ongoing could occur again, right? Userspace could
read the stats after an event, and get a snapshot of the system before
that event.

Perhaps this is fine for vmstats if it has always been like that (I
have no idea), or if no users make assumptions about this. But for
cgroup stats, we have use cases that rely on this behavior.

>
> BTW it seems like this topic should be discussed be discussed
> face-to-face over vc or LPC. What do you folks thing?

I am not going to be at LPC, but I am happy to discuss this over VC.
Christoph Lameter (Ampere) June 25, 2024, 10:35 p.m. UTC | #16
On Tue, 25 Jun 2024, Yosry Ahmed wrote:

>> In my reply above, I am not arguing to go back to the older
>> stats_flush_ongoing situation. Rather I am discussing what should be the
>> best eventual solution. From the vmstats infra, we can learn that
>> frequent async flushes along with no sync flush, users are fine with the
>> 'non-determinism'. Of course cgroup stats are different from vmstats
>> i.e. are hierarchical but I think we can try out this approach and see
>> if this works or not.
>
> If we do not do sync flushing, then the same problem that happened
> with stats_flush_ongoing could occur again, right? Userspace could
> read the stats after an event, and get a snapshot of the system before
> that event.
>
> Perhaps this is fine for vmstats if it has always been like that (I
> have no idea), or if no users make assumptions about this. But for
> cgroup stats, we have use cases that rely on this behavior.

vmstat updates are triggered initially as needed by the shepherd task and 
there is no requirement that this is triggered simultaneously. We
could actually randomize the intervals in vmstat_update() a bit if this 
will help.
Yosry Ahmed June 25, 2024, 10:59 p.m. UTC | #17
On Tue, Jun 25, 2024 at 3:35 PM Christoph Lameter (Ampere) <cl@linux.com> wrote:
>
> On Tue, 25 Jun 2024, Yosry Ahmed wrote:
>
> >> In my reply above, I am not arguing to go back to the older
> >> stats_flush_ongoing situation. Rather I am discussing what should be the
> >> best eventual solution. From the vmstats infra, we can learn that
> >> frequent async flushes along with no sync flush, users are fine with the
> >> 'non-determinism'. Of course cgroup stats are different from vmstats
> >> i.e. are hierarchical but I think we can try out this approach and see
> >> if this works or not.
> >
> > If we do not do sync flushing, then the same problem that happened
> > with stats_flush_ongoing could occur again, right? Userspace could
> > read the stats after an event, and get a snapshot of the system before
> > that event.
> >
> > Perhaps this is fine for vmstats if it has always been like that (I
> > have no idea), or if no users make assumptions about this. But for
> > cgroup stats, we have use cases that rely on this behavior.
>
> vmstat updates are triggered initially as needed by the shepherd task and
> there is no requirement that this is triggered simultaenously. We
> could actually randomize the intervals in vmstat_update() a bit if this
> will help.

The problem is that for cgroup stats, the behavior has been that a
userspace read will trigger a flush (i.e. propagating updates). We
have use cases that depend on this. If we switch to the vmstat model
where updates are triggered independently from user reads, it
constitutes a behavioral change.
Jesper Dangaard Brouer June 26, 2024, 9:35 p.m. UTC | #18
On 26/06/2024 00.59, Yosry Ahmed wrote:
> On Tue, Jun 25, 2024 at 3:35 PM Christoph Lameter (Ampere) <cl@linux.com> wrote:
>>
>> On Tue, 25 Jun 2024, Yosry Ahmed wrote:
>>
>>>> In my reply above, I am not arguing to go back to the older
>>>> stats_flush_ongoing situation. Rather I am discussing what should be the
>>>> best eventual solution. From the vmstats infra, we can learn that
>>>> frequent async flushes along with no sync flush, users are fine with the
>>>> 'non-determinism'. Of course cgroup stats are different from vmstats
>>>> i.e. are hierarchical but I think we can try out this approach and see
>>>> if this works or not.
>>>
>>> If we do not do sync flushing, then the same problem that happened
>>> with stats_flush_ongoing could occur again, right? Userspace could
>>> read the stats after an event, and get a snapshot of the system before
>>> that event.
>>>
>>> Perhaps this is fine for vmstats if it has always been like that (I
>>> have no idea), or if no users make assumptions about this. But for
>>> cgroup stats, we have use cases that rely on this behavior.
>>
>> vmstat updates are triggered initially as needed by the shepherd task and
>> there is no requirement that this is triggered simultaenously. We
>> could actually randomize the intervals in vmstat_update() a bit if this
>> will help.
> 
> The problem is that for cgroup stats, the behavior has been that a
> userspace read will trigger a flush (i.e. propagating updates). We
> have use cases that depend on this. If we switch to the vmstat model
> where updates are triggered independently from user reads, it
> constitutes a behavioral change.

I implemented a variant using completions as Yosry asked for:
 
https://lore.kernel.org/all/171943668946.1638606.1320095353103578332.stgit@firesoul/

--Jesper
Yosry Ahmed June 26, 2024, 10:07 p.m. UTC | #19
On Wed, Jun 26, 2024 at 2:35 PM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>
>
>
> On 26/06/2024 00.59, Yosry Ahmed wrote:
> > On Tue, Jun 25, 2024 at 3:35 PM Christoph Lameter (Ampere) <cl@linux.com> wrote:
> >>
> >> On Tue, 25 Jun 2024, Yosry Ahmed wrote:
> >>
> >>>> In my reply above, I am not arguing to go back to the older
> >>>> stats_flush_ongoing situation. Rather I am discussing what should be the
> >>>> best eventual solution. From the vmstats infra, we can learn that
> >>>> frequent async flushes along with no sync flush, users are fine with the
> >>>> 'non-determinism'. Of course cgroup stats are different from vmstats
> >>>> i.e. are hierarchical but I think we can try out this approach and see
> >>>> if this works or not.
> >>>
> >>> If we do not do sync flushing, then the same problem that happened
> >>> with stats_flush_ongoing could occur again, right? Userspace could
> >>> read the stats after an event, and get a snapshot of the system before
> >>> that event.
> >>>
> >>> Perhaps this is fine for vmstats if it has always been like that (I
> >>> have no idea), or if no users make assumptions about this. But for
> >>> cgroup stats, we have use cases that rely on this behavior.
> >>
> >> vmstat updates are triggered initially as needed by the shepherd task and
> >> there is no requirement that this is triggered simultaenously. We
> >> could actually randomize the intervals in vmstat_update() a bit if this
> >> will help.
> >
> > The problem is that for cgroup stats, the behavior has been that a
> > userspace read will trigger a flush (i.e. propagating updates). We
> > have use cases that depend on this. If we switch to the vmstat model
> > where updates are triggered independently from user reads, it
> > constitutes a behavioral change.
>
> I implemented a variant using completions as Yosry asked for:
>
> https://lore.kernel.org/all/171943668946.1638606.1320095353103578332.stgit@firesoul/

Thanks! I will take a look at this a little bit later. I am wondering
if you could verify if that solution fixes the problem with kswapd
flushing?
Jesper Dangaard Brouer June 27, 2024, 9:21 a.m. UTC | #20
On 27/06/2024 00.07, Yosry Ahmed wrote:
> On Wed, Jun 26, 2024 at 2:35 PM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>>
>> On 26/06/2024 00.59, Yosry Ahmed wrote:
>>> On Tue, Jun 25, 2024 at 3:35 PM Christoph Lameter (Ampere) <cl@linux.com> wrote:
>>>>
>>>> On Tue, 25 Jun 2024, Yosry Ahmed wrote:
>>>>
[...]
>>
>> I implemented a variant using completions as Yosry asked for:
>>
>> [V3] https://lore.kernel.org/all/171943668946.1638606.1320095353103578332.stgit@firesoul/
> 
> Thanks! I will take a look at this a little bit later. I am wondering
> if you could verify if that solution fixes the problem with kswapd
> flushing?

I will deploy V3 on some production metals and report back in that thread.

For this patch V2, I already have some results that show it solves the
kswapd lock contention. Attaching grafana screenshot comparing two
machines without/with this V2 patch. Green (16m1253) without patch, and
Yellow line (16m1254) with patched kernel.  These machines have 12 NUMA
nodes and thus 12 kswapd threads, and CPU time is summed for these threads.

Zooming in with perf record we can also see the lock contention is gone.
  - sudo perf record -g -p $(pgrep -d, kswapd) -F 499 sleep 60
  - sudo perf report --no-children --call-graph graph,0.01,callee --sort symbol


On a machine (16m1254) with this V2 patch:

  Samples: 7K of event 'cycles:P', Event count (approx.): 61228473670
    Overhead  Symbol
  +    8.28%  [k] mem_cgroup_css_rstat_flush
  +    6.69%  [k] xfs_perag_get_tag
  +    6.51%  [k] radix_tree_next_chunk
  +    5.09%  [k] queued_spin_lock_slowpath
  +    3.94%  [k] srso_alias_safe_ret
  +    3.62%  [k] srso_alias_return_thunk
  +    3.11%  [k] super_cache_count
  +    2.96%  [k] mem_cgroup_iter
  +    2.95%  [k] down_read_trylock
  +    2.48%  [k] shrink_lruvec
  +    2.12%  [k] isolate_lru_folios
  +    1.76%  [k] dentry_lru_isolate
  +    1.74%  [k] radix_tree_gang_lookup_tag


On a machine (16m1253) without patch:

  Samples: 65K of event 'cycles:P', Event count (approx.): 492125554022
    Overhead  Symbol
  +   55.84%  [k] queued_spin_lock_slowpath
    - 55.80% queued_spin_lock_slowpath
       + 53.10% __cgroup_rstat_lock
       + 2.63% evict
  +    7.06%  [k] mem_cgroup_css_rstat_flush
  +    2.07%  [k] page_vma_mapped_walk
  +    1.76%  [k] invalid_folio_referenced_vma
  +    1.72%  [k] srso_alias_safe_ret
  +    1.37%  [k] shrink_lruvec
  +    1.23%  [k] srso_alias_return_thunk
  +    1.17%  [k] down_read_trylock
  +    0.98%  [k] perf_adjust_freq_unthr_context
  +    0.97%  [k] super_cache_count

I think this (clearly) shows that the patch works and eliminates kswapd
lock contention.

--Jesper
Yosry Ahmed June 27, 2024, 10:36 a.m. UTC | #21
On Thu, Jun 27, 2024 at 2:21 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>
>
> On 27/06/2024 00.07, Yosry Ahmed wrote:
> > On Wed, Jun 26, 2024 at 2:35 PM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> >>
> >> On 26/06/2024 00.59, Yosry Ahmed wrote:
> >>> On Tue, Jun 25, 2024 at 3:35 PM Christoph Lameter (Ampere) <cl@linux.com> wrote:
> >>>>
> >>>> On Tue, 25 Jun 2024, Yosry Ahmed wrote:
> >>>>
> [...]
> >>
> >> I implemented a variant using completions as Yosry asked for:
> >>
> >> [V3] https://lore.kernel.org/all/171943668946.1638606.1320095353103578332.stgit@firesoul/
> >
> > Thanks! I will take a look at this a little bit later. I am wondering
> > if you could verify if that solution fixes the problem with kswapd
> > flushing?
>
> I will deploy V3 on some production metals and report back in that thread.
>
> For this patch V2, I already have some results that show it solves the
> kswapd lock contention. Attaching grafana screenshot comparing two
> machines without/with this V2 patch. Green (16m1253) without patch, and
> Yellow line (16m1254) with patched kernel.  These machines have 12 NUMA
> nodes and thus 12 kswapd threads, and CPU time is summed for these threads.

Thanks for the data! Looking forward to whether v3 also fixes the
problem. I think it should, especially with the timeout, but let's see
:)

>
> Zooming in with perf record we can also see the lock contention is gone.
>   - sudo perf record -g -p $(pgrep -d, kswapd) -F 499 sleep 60
>   - sudo perf report --no-children  --call-graph graph,0.01,callee
> --sort symbol
>
>
> On a machine (16m1254) with this V2 patch:
>
>   Samples: 7K of event 'cycles:P', Event count (approx.): 61228473670
>     Overhead  Symbol
>   +    8.28%  [k] mem_cgroup_css_rstat_flush
>   +    6.69%  [k] xfs_perag_get_tag
>   +    6.51%  [k] radix_tree_next_chunk
>   +    5.09%  [k] queued_spin_lock_slowpath
>   +    3.94%  [k] srso_alias_safe_ret
>   +    3.62%  [k] srso_alias_return_thunk
>   +    3.11%  [k] super_cache_count
>   +    2.96%  [k] mem_cgroup_iter
>   +    2.95%  [k] down_read_trylock
>   +    2.48%  [k] shrink_lruvec
>   +    2.12%  [k] isolate_lru_folios
>   +    1.76%  [k] dentry_lru_isolate
>   +    1.74%  [k] radix_tree_gang_lookup_tag
>
>
> On a machine (16m1253) without patch:
>
>   Samples: 65K of event 'cycles:P', Event count (approx.): 492125554022
>     Overhead  SymbolCoverage]
>   +   55.84%  [k] queued_spin_lock_slowpath
>     - 55.80% queued_spin_lock_slowpath
>        + 53.10% __cgroup_rstat_lock
>        + 2.63% evict
>   +    7.06%  [k] mem_cgroup_css_rstat_flush
>   +    2.07%  [k] page_vma_mapped_walk
>   +    1.76%  [k] invalid_folio_referenced_vma
>   +    1.72%  [k] srso_alias_safe_ret
>   +    1.37%  [k] shrink_lruvec
>   +    1.23%  [k] srso_alias_return_thunk
>   +    1.17%  [k] down_read_trylock
>   +    0.98%  [k] perf_adjust_freq_unthr_context
>   +    0.97%  [k] super_cache_count
>
> I think this (clearly) shows that the patch works and eliminates kswapd
> lock contention.
>
> --Jesper

Patch

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2150ca60394b..ad41cca5c3b6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -499,6 +499,11 @@  static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
 	return NULL;
 }
 
+static inline bool cgroup_is_root(struct cgroup *cgrp)
+{
+	return cgroup_parent(cgrp) == NULL;
+}
+
 /**
  * cgroup_is_descendant - test ancestry
  * @cgrp: the cgroup to be tested
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index fb8b49437573..2591840b6dc1 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@ 
 
 static DEFINE_SPINLOCK(cgroup_rstat_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+static atomic_t root_rstat_flush_ongoing = ATOMIC_INIT(0);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
@@ -350,8 +351,25 @@  __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 {
 	might_sleep();
 
+	/*
+	 * This avoids thundering herd problem on global rstat lock. When an
+	 * ongoing flush of the entire tree is in progress, then skip flush.
+	 */
+	if (atomic_read(&root_rstat_flush_ongoing))
+		return;
+
+	/* Grab right to be ongoing flusher, return if losing race */
+	if (cgroup_is_root(cgrp) &&
+	    atomic_xchg(&root_rstat_flush_ongoing, 1))
+		return;
+
 	__cgroup_rstat_lock(cgrp, -1);
+
 	cgroup_rstat_flush_locked(cgrp);
+
+	if (cgroup_is_root(cgrp))
+		atomic_set(&root_rstat_flush_ongoing, 0);
+
 	__cgroup_rstat_unlock(cgrp, -1);
 }
 
@@ -362,13 +380,20 @@  __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
  * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
  * paired with cgroup_rstat_flush_release().
  *
+ * Current invariant, not called with root cgrp.
+ *
  * This function may block.
  */
 void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 	__acquires(&cgroup_rstat_lock)
 {
 	might_sleep();
+
 	__cgroup_rstat_lock(cgrp, -1);
+
+	if (atomic_read(&root_rstat_flush_ongoing))
+		return;
+
 	cgroup_rstat_flush_locked(cgrp);
 }