
[v1,2/3] cgroup/rstat: convert cgroup_rstat_lock back to mutex

Message ID 171328989335.3930751.3091577850420501533.stgit@firesoul (mailing list archive)
State New
Series cgroup/rstat: global cgroup_rstat_lock changes

Commit Message

Jesper Dangaard Brouer April 16, 2024, 5:51 p.m. UTC
Since kernel v4.18, cgroup_rstat_lock has been an IRQ-disabling spinlock,
as introduced by commit 0fa294fb1985 ("cgroup: Replace cgroup_rstat_mutex
with a spinlock").

Despite efforts in cgroup_rstat_flush_locked() to yield the lock when
necessary during the collection of per-CPU stats, this approach has led
to several scaling issues observed in production environments. Holding
this IRQ lock has caused starvation of other critical kernel functions,
such as softirq (e.g., timers and netstack). Although kernel v6.8
introduced optimizations in this area, we continue to observe instances
where the spin_lock is held for 64-128 ms in production.

This patch converts cgroup_rstat_lock back to being a mutex lock. This
change is made possible thanks to the significant effort by Yosry Ahmed
to eliminate all atomic context use-cases through multiple commits,
ending in 0a2dc6ac3329 ("cgroup: remove cgroup_rstat_flush_atomic()"),
included in kernel v6.5.

After this patch, lock contention will be less obvious: converting this
to a mutex avoids multiple CPUs spinning while waiting for the lock, but
it does not remove the lock contention. It is recommended to use the
tracepoints to diagnose this.

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
 kernel/cgroup/rstat.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

Comments

Yosry Ahmed April 18, 2024, 2:19 a.m. UTC | #1
On Tue, Apr 16, 2024 at 10:51 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>
> Since kernel v4.18, cgroup_rstat_lock has been an IRQ-disabling spinlock,
> as introduced by commit 0fa294fb1985 ("cgroup: Replace cgroup_rstat_mutex
> with a spinlock").
>
> Despite efforts in cgroup_rstat_flush_locked() to yield the lock when
> necessary during the collection of per-CPU stats, this approach has led
> to several scaling issues observed in production environments. Holding
> this IRQ lock has caused starvation of other critical kernel functions,
> such as softirq (e.g., timers and netstack). Although kernel v6.8
> introduced optimizations in this area, we continue to observe instances
> where the spin_lock is held for 64-128 ms in production.
>
> This patch converts cgroup_rstat_lock back to being a mutex lock. This
> change is made possible thanks to the significant effort by Yosry Ahmed
> to eliminate all atomic context use-cases through multiple commits,
> ending in 0a2dc6ac3329 ("cgroup: remove cgroup_rstat_flush_atomic()"),
> included in kernel v6.5.
>
> After this patch lock contention will be less obvious, as converting this
> to a mutex avoids multiple CPUs spinning while waiting for the lock, but
> it doesn't remove the lock contention. It is recommended to use the
> tracepoints to diagnose this.

I will keep the high-level conversation about using the mutex here in
the cover letter thread, but I am wondering why we are keeping the
lock dropping logic here with the mutex?

If this is to reduce lock contention, why does it depend on
need_resched()? spin_needbreak() is a good indicator for lock
contention, but need_resched() isn't, right?

Also, how was this tested?

When I did previous changes to the flushing logic I used to make sure
that userspace read latency was not impacted, as well as in-kernel
flushers (e.g. reclaim). We should make sure there are no regressions
on both fronts.

>
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> ---
>  kernel/cgroup/rstat.c |   10 +++++-----
>  1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index ff68c904e647..a90d68a7c27f 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -9,7 +9,7 @@
>
>  #include <trace/events/cgroup.h>
>
> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
> +static DEFINE_MUTEX(cgroup_rstat_lock);
>  static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
>
>  static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
> @@ -238,10 +238,10 @@ static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
>  {
>         bool contended;
>
> -       contended = !spin_trylock_irq(&cgroup_rstat_lock);
> +       contended = !mutex_trylock(&cgroup_rstat_lock);
>         if (contended) {
>                 trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
> -               spin_lock_irq(&cgroup_rstat_lock);
> +               mutex_lock(&cgroup_rstat_lock);
>         }
>         trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
>  }
> @@ -250,7 +250,7 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
>         __releases(&cgroup_rstat_lock)
>  {
>         trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
> -       spin_unlock_irq(&cgroup_rstat_lock);
> +       mutex_unlock(&cgroup_rstat_lock);
>  }
>
>  /* see cgroup_rstat_flush() */
> @@ -278,7 +278,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
>                 }
>
>                 /* play nice and yield if necessary */
> -               if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
> +               if (need_resched()) {
>                         __cgroup_rstat_unlock(cgrp, cpu);
>                         if (!cond_resched())
>                                 cpu_relax();
Jesper Dangaard Brouer April 18, 2024, 9:02 a.m. UTC | #2
On 18/04/2024 04.19, Yosry Ahmed wrote:
> On Tue, Apr 16, 2024 at 10:51 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>>
>> Since kernel v4.18, cgroup_rstat_lock has been an IRQ-disabling spinlock,
>> as introduced by commit 0fa294fb1985 ("cgroup: Replace cgroup_rstat_mutex
>> with a spinlock").
>>
>> Despite efforts in cgroup_rstat_flush_locked() to yield the lock when
>> necessary during the collection of per-CPU stats, this approach has led
>> to several scaling issues observed in production environments. Holding
>> this IRQ lock has caused starvation of other critical kernel functions,
>> such as softirq (e.g., timers and netstack). Although kernel v6.8
>> introduced optimizations in this area, we continue to observe instances
>> where the spin_lock is held for 64-128 ms in production.
>>
>> This patch converts cgroup_rstat_lock back to being a mutex lock. This
>> change is made possible thanks to the significant effort by Yosry Ahmed
>> to eliminate all atomic context use-cases through multiple commits,
>> ending in 0a2dc6ac3329 ("cgroup: remove cgroup_rstat_flush_atomic()"),
>> included in kernel v6.5.
>>
>> After this patch lock contention will be less obvious, as converting this
>> to a mutex avoids multiple CPUs spinning while waiting for the lock, but
>> it doesn't remove the lock contention. It is recommended to use the
>> tracepoints to diagnose this.
> 
> I will keep the high-level conversation about using the mutex here in
> the cover letter thread, but I am wondering why we are keeping the
> lock dropping logic here with the mutex?
> 

I agree that yielding the mutex in the loop makes less sense.
Especially since the raw_spin_unlock_irqrestore(cpu_lock, flags) call
will be a preemption point for my softirq.  But I kept it because we
are running a CONFIG_PREEMPT_VOLUNTARY kernel, so I still worried that
there was no sched point for other userspace processes while holding
the mutex; I don't fully know the scheduling implications of holding a
mutex.


> If this is to reduce lock contention, why does it depend on
> need_resched()? spin_needbreak() is a good indicator for lock
> contention, but need_resched() isn't, right?
>

As I said, I'm unsure of the semantics of holding a mutex.


> Also, how was this tested?
> 

I tested this in a testlab, prior to posting upstream, with parallel
readers of the stat files.  As I said in the other mail, I plan to
experiment with these patches (2+3) in production, as
micro-benchmarking will not reveal the corner cases we care about.
With BPF-based measurements of the lock congestion time, I hope we can
catch production issues at a time scale that happens prior to
user-visible impacts.


> When I did previous changes to the flushing logic I used to make sure
> that userspace read latency was not impacted, as well as in-kernel
> flushers (e.g. reclaim). We should make sure there are no regressions
> on both fronts.
> 

Agree, we should consider both userspace readers and in-kernel flushers.
Maybe these need separate handling, as they have separate needs.

>>
>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>> ---
>>   kernel/cgroup/rstat.c |   10 +++++-----
>>   1 file changed, 5 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>> index ff68c904e647..a90d68a7c27f 100644
>> --- a/kernel/cgroup/rstat.c
>> +++ b/kernel/cgroup/rstat.c
>> @@ -9,7 +9,7 @@
>>
>>   #include <trace/events/cgroup.h>
>>
>> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
>> +static DEFINE_MUTEX(cgroup_rstat_lock);
>>   static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
>>
>>   static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>> @@ -238,10 +238,10 @@ static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
>>   {
>>          bool contended;
>>
>> -       contended = !spin_trylock_irq(&cgroup_rstat_lock);
>> +       contended = !mutex_trylock(&cgroup_rstat_lock);
>>          if (contended) {
>>                  trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
>> -               spin_lock_irq(&cgroup_rstat_lock);
>> +               mutex_lock(&cgroup_rstat_lock);
>>          }
>>          trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
>>   }
>> @@ -250,7 +250,7 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
>>          __releases(&cgroup_rstat_lock)
>>   {
>>          trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
>> -       spin_unlock_irq(&cgroup_rstat_lock);
>> +       mutex_unlock(&cgroup_rstat_lock);
>>   }
>>
>>   /* see cgroup_rstat_flush() */
>> @@ -278,7 +278,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
>>                  }
>>
>>                  /* play nice and yield if necessary */
>> -               if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
>> +               if (need_resched()) {
>>                          __cgroup_rstat_unlock(cgrp, cpu);
>>                          if (!cond_resched())
>>                                  cpu_relax();
Shakeel Butt April 18, 2024, 2:49 p.m. UTC | #3
On Thu, Apr 18, 2024 at 11:02:06AM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 18/04/2024 04.19, Yosry Ahmed wrote:
[...]
> > 
> > I will keep the high-level conversation about using the mutex here in
> > the cover letter thread, but I am wondering why we are keeping the
> > lock dropping logic here with the mutex?
> > 
> 
> I agree that yielding the mutex in the loop makes less sense.
> Especially since the raw_spin_unlock_irqrestore(cpu_lock, flags) call
> will be a preemption point for my softirq.   But I kept it because, we
> are running a CONFIG_PREEMPT_VOLUNTARY kernel, so I still worried that
> there was no sched point for other userspace processes while holding the
> mutex, but I don't fully know the sched implication when holding a mutex.
> 

Are the softirqs you are interested in raised from the same CPU or a
remote CPU? What about a local_softirq_pending() check in addition to
the need_resched() and spin_needbreak() checks? If the softirq can only
be raised on the local CPU, then convert the spin_lock to a non-irq one
(Please correct me if I am wrong, but on return from hard irq, when not
within a bh- or irq-disabled spin_lock section, the kernel will run the
pending softirqs, right?). Did you get the chance to test these two
changes or something similar in your prod environment?
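
For concreteness, the combined check suggested above would look roughly
like the sketch below. The loop body and the __cgroup_rstat_unlock() /
__cgroup_rstat_lock() yield sequence are taken from the existing
cgroup_rstat_flush_locked(); only the local_softirq_pending() test
(from <linux/interrupt.h>) is added, and the sketch assumes the
pre-mutex spinlock baseline where spin_needbreak() still applies. An
untested illustration, not a posted patch:

		/* play nice and yield if necessary, also when softirqs
		 * are pending on this CPU (sketch of the suggestion above)
		 */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock) ||
		    local_softirq_pending()) {
			__cgroup_rstat_unlock(cgrp, cpu);
			if (!cond_resched())
				cpu_relax();
			__cgroup_rstat_lock(cgrp, cpu);
		}
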
Yosry Ahmed April 18, 2024, 8:38 p.m. UTC | #4
On Thu, Apr 18, 2024 at 2:02 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
>
>
>
> On 18/04/2024 04.19, Yosry Ahmed wrote:
> > On Tue, Apr 16, 2024 at 10:51 AM Jesper Dangaard Brouer <hawk@kernel.org> wrote:
> >>
> >> Since kernel v4.18, cgroup_rstat_lock has been an IRQ-disabling spinlock,
> >> as introduced by commit 0fa294fb1985 ("cgroup: Replace cgroup_rstat_mutex
> >> with a spinlock").
> >>
> >> Despite efforts in cgroup_rstat_flush_locked() to yield the lock when
> >> necessary during the collection of per-CPU stats, this approach has led
> >> to several scaling issues observed in production environments. Holding
> >> this IRQ lock has caused starvation of other critical kernel functions,
> >> such as softirq (e.g., timers and netstack). Although kernel v6.8
> >> introduced optimizations in this area, we continue to observe instances
> >> where the spin_lock is held for 64-128 ms in production.
> >>
> >> This patch converts cgroup_rstat_lock back to being a mutex lock. This
> >> change is made possible thanks to the significant effort by Yosry Ahmed
> >> to eliminate all atomic context use-cases through multiple commits,
> >> ending in 0a2dc6ac3329 ("cgroup: remove cgroup_rstat_flush_atomic()"),
> >> included in kernel v6.5.
> >>
> >> After this patch lock contention will be less obvious, as converting this
> >> to a mutex avoids multiple CPUs spinning while waiting for the lock, but
> >> it doesn't remove the lock contention. It is recommended to use the
> >> tracepoints to diagnose this.
> >
> > I will keep the high-level conversation about using the mutex here in
> > the cover letter thread, but I am wondering why we are keeping the
> > lock dropping logic here with the mutex?
> >
>
> I agree that yielding the mutex in the loop makes less sense.
> Especially since the raw_spin_unlock_irqrestore(cpu_lock, flags) call
> will be a preemption point for my softirq.   But I kept it because, we
> are running a CONFIG_PREEMPT_VOLUNTARY kernel, so I still worried that
> there was no sched point for other userspace processes while holding the
> mutex, but I don't fully know the sched implication when holding a mutex.

I guess dropping the lock before rescheduling would be preferable in
this case, since we do not need to keep holding the lock for
correctness.

>
> > If this is to reduce lock contention, why does it depend on
> > need_resched()? spin_needbreak() is a good indicator for lock
> > contention, but need_resched() isn't, right?
> >
>
> As I said, I'm unsure of the semantics of holding a mutex.
>
>
> > Also, how was this tested?
> >
>
> I tested this in a testlab, prior to posting upstream, with parallel
> reader of the stat files.

I believe high concurrency is a key point here. CC'ing Wei, who
reported regressions on previous attempts of mine to address the lock
contention from userspace.

> As I said in other mail, I plan to experiment
> with these patches(2+3) in production, as micro-benchmarking will not
> reveal the corner cases we care about.

Right, but micro-benchmarking should give us a signal about
regressions. It was very useful for me when working with this code
before to use synthetic benchmarks with high concurrency of userspace
reads and/or kernel flushers.
Yosry Ahmed April 18, 2024, 8:39 p.m. UTC | #5
On Thu, Apr 18, 2024 at 7:49 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Apr 18, 2024 at 11:02:06AM +0200, Jesper Dangaard Brouer wrote:
> >
> >
> > On 18/04/2024 04.19, Yosry Ahmed wrote:
> [...]
> > >
> > > I will keep the high-level conversation about using the mutex here in
> > > the cover letter thread, but I am wondering why we are keeping the
> > > lock dropping logic here with the mutex?
> > >
> >
> > I agree that yielding the mutex in the loop makes less sense.
> > Especially since the raw_spin_unlock_irqrestore(cpu_lock, flags) call
> > will be a preemption point for my softirq.   But I kept it because, we
> > are running a CONFIG_PREEMPT_VOLUNTARY kernel, so I still worried that
> > there was no sched point for other userspace processes while holding the
> > mutex, but I don't fully know the sched implication when holding a mutex.
> >
>
> Are the softirqs you are interested in, raised from the same cpu or
> remote cpu? What about local_softirq_pending() check in addition to
> need_resched() and spin_needbreak() checks? If softirq can only be
> raised on local cpu then convert the spin_lock to non-irq one (Please
> correct me if I am wrong but on return from hard irq and not within bh
> or irq disabled spin_lock, the kernel will run the pending softirqs,
> right?). Did you get the chance to test these two changes or something
> similar in your prod environment?

I tried making the spinlock a non-irq lock before, but Tejun objected [1].

Perhaps we could experiment with always dropping the lock at CPU
boundaries instead?

[1] https://lore.kernel.org/lkml/ZBz%2FV5a7%2F6PZeM7S@slm.duckdns.org/
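
Concretely, "always dropping the lock at CPU boundaries" would mean
making the per-CPU yield in cgroup_rstat_flush_locked() unconditional
instead of gating it on need_resched(). A rough, untested sketch of that
experiment, reusing only the helpers already in this patch (nothing new
is introduced; the need_resched() check is simply dropped):

		/* play nice: always yield at each CPU boundary, so waiters
		 * (and, with a spinlock, IRQs/softirqs) get a window after
		 * every CPU is flushed
		 */
		__cgroup_rstat_unlock(cgrp, cpu);
		if (!cond_resched())
			cpu_relax();
		__cgroup_rstat_lock(cgrp, cpu);
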
Jesper Dangaard Brouer April 19, 2024, 1:15 p.m. UTC | #6
On 18/04/2024 22.39, Yosry Ahmed wrote:
> On Thu, Apr 18, 2024 at 7:49 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>>
>> On Thu, Apr 18, 2024 at 11:02:06AM +0200, Jesper Dangaard Brouer wrote:
>>>
>>>
>>> On 18/04/2024 04.19, Yosry Ahmed wrote:
>> [...]
>>>>
>>>> I will keep the high-level conversation about using the mutex here in
>>>> the cover letter thread, but I am wondering why we are keeping the
>>>> lock dropping logic here with the mutex?
>>>>
>>>
>>> I agree that yielding the mutex in the loop makes less sense.
>>> Especially since the raw_spin_unlock_irqrestore(cpu_lock, flags) call
>>> will be a preemption point for my softirq.   But I kept it because, we
>>> are running a CONFIG_PREEMPT_VOLUNTARY kernel, so I still worried that
>>> there was no sched point for other userspace processes while holding the
>>> mutex, but I don't fully know the sched implication when holding a mutex.
>>>
>>
>> Are the softirqs you are interested in, raised from the same cpu or
>> remote cpu? What about local_softirq_pending() check in addition to
>> need_resched() and spin_needbreak() checks? If softirq can only be
>> raised on local cpu then convert the spin_lock to non-irq one (Please
>> correct me if I am wrong but on return from hard irq and not within bh
>> or irq disabled spin_lock, the kernel will run the pending softirqs,
>> right?). Did you get the chance to test these two changes or something
>> similar in your prod environment?
> 
> I tried making the spinlock a non-irq lock before, but Tejun objected [1].
> [1] https://lore.kernel.org/lkml/ZBz%2FV5a7%2F6PZeM7S@slm.duckdns.org/
> 

After reading [1], I think using a mutex is a better approach (than 
non-irq spinlock).


> Perhaps we could experiment with always dropping the lock at CPU
> boundaries instead?
> 

I don't think this will be enough (always dropping the lock at CPU
boundaries).  My measured "lock-hold" times show that IRQs (and
softirqs) are blocked for too long, when looking at prod with my new
cgroup tracepoint script [2]. When contention occurs, I see many Yields
happening, with the same magnitude as Contended, but I still see events
with long "lock-hold" times even though yields are high.

  [2] https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt

Example output:

  12:46:56 High Lock-contention: wait: 739 usec (0 ms) on CPU:56 comm:kswapd7
  12:46:56 Long lock-hold time: 6381 usec (6 ms) on CPU:27 comm:kswapd3
  12:46:56 Long lock-hold time: 18905 usec (18 ms) on CPU:100 comm:kworker/u261:12

  12:46:56  time elapsed: 36 sec (interval = 1 sec)
   Flushes(2051) 15/interval (avg 56/sec)
   Locks(44464) 1340/interval (avg 1235/sec)
   Yields(42413) 1325/interval (avg 1178/sec)
   Contended(42112) 1322/interval (avg 1169/sec)

There are 15 reported flushes/sec, but locks are yielded quickly.

More problematically (for softirq latency), we see a "Long lock-hold time"
reaching 18 ms.  For network RX softirq I need latency below 0.5 ms, to
avoid RX-ring HW queue overflows.


--Jesper
p.s. I'm seeing a pattern with kswapdN contending on this lock.

@stack[697, kswapd3]:
         __cgroup_rstat_lock+107
         __cgroup_rstat_lock+107
         cgroup_rstat_flush_locked+851
         cgroup_rstat_flush+35
         shrink_node+226
         balance_pgdat+807
         kswapd+521
         kthread+228
         ret_from_fork+48
         ret_from_fork_asm+27

@stack[698, kswapd4]:
         __cgroup_rstat_lock+107
         __cgroup_rstat_lock+107
         cgroup_rstat_flush_locked+851
         cgroup_rstat_flush+35
         shrink_node+226
         balance_pgdat+807
         kswapd+521
         kthread+228
         ret_from_fork+48
         ret_from_fork_asm+27

@stack[699, kswapd5]:
         __cgroup_rstat_lock+107
         __cgroup_rstat_lock+107
         cgroup_rstat_flush_locked+851
         cgroup_rstat_flush+35
         shrink_node+226
         balance_pgdat+807
         kswapd+521
         kthread+228
         ret_from_fork+48
         ret_from_fork_asm+27
Shakeel Butt April 19, 2024, 4:11 p.m. UTC | #7
On Fri, Apr 19, 2024 at 03:15:01PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 18/04/2024 22.39, Yosry Ahmed wrote:
> > On Thu, Apr 18, 2024 at 7:49 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > 
> > > On Thu, Apr 18, 2024 at 11:02:06AM +0200, Jesper Dangaard Brouer wrote:
> > > > 
> > > > 
> > > > On 18/04/2024 04.19, Yosry Ahmed wrote:
> > > [...]
> > > > > 
> > > > > I will keep the high-level conversation about using the mutex here in
> > > > > the cover letter thread, but I am wondering why we are keeping the
> > > > > lock dropping logic here with the mutex?
> > > > > 
> > > > 
> > > > I agree that yielding the mutex in the loop makes less sense.
> > > > Especially since the raw_spin_unlock_irqrestore(cpu_lock, flags) call
> > > > will be a preemption point for my softirq.   But I kept it because, we
> > > > are running a CONFIG_PREEMPT_VOLUNTARY kernel, so I still worried that
> > > > there was no sched point for other userspace processes while holding the
> > > > mutex, but I don't fully know the sched implication when holding a mutex.
> > > > 
> > > 
> > > Are the softirqs you are interested in, raised from the same cpu or
> > > remote cpu? What about local_softirq_pending() check in addition to
> > > need_resched() and spin_needbreak() checks? If softirq can only be
> > > raised on local cpu then convert the spin_lock to non-irq one (Please
> > > correct me if I am wrong but on return from hard irq and not within bh
> > > or irq disabled spin_lock, the kernel will run the pending softirqs,
> > > right?). Did you get the chance to test these two changes or something
> > > similar in your prod environment?
> > 
> > I tried making the spinlock a non-irq lock before, but Tejun objected [1].
> > [1] https://lore.kernel.org/lkml/ZBz%2FV5a7%2F6PZeM7S@slm.duckdns.org/
> > 
> 
> After reading [1], I think using a mutex is a better approach (than non-irq
> spinlock).
> 
> 
> > Perhaps we could experiment with always dropping the lock at CPU
> > boundaries instead?
> > 
> 
> I don't think this will be enough (always dropping the lock at CPU
> boundaries).  My measured "lock-hold" times that is blocking IRQ (and
> softirq) for too long.  When looking at prod with my new cgroup
> tracepoint script[2]. When contention occurs, I see many Yields
> happening and with same magnitude as Contended. But still see events
> with long "lock-hold" times, even-though yields are high.
> 
>  [2] https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
> 
> Example output:
> 
>  12:46:56 High Lock-contention: wait: 739 usec (0 ms) on CPU:56 comm:kswapd7
>  12:46:56 Long lock-hold time: 6381 usec (6 ms) on CPU:27 comm:kswapd3
>  12:46:56 Long lock-hold time: 18905 usec (18 ms) on CPU:100
> comm:kworker/u261:12
> 
>  12:46:56  time elapsed: 36 sec (interval = 1 sec)
>   Flushes(2051) 15/interval (avg 56/sec)
>   Locks(44464) 1340/interval (avg 1235/sec)
>   Yields(42413) 1325/interval (avg 1178/sec)
>   Contended(42112) 1322/interval (avg 1169/sec)
> 
> There is reported 15 flushes/sec, but locks are yielded quickly.
> 
> More problematically (for softirq latency) we see a Long lock-hold time
> reaching 18 ms.  For network RX softirq I need lower than 0.5ms latency,
> to avoid RX-ring HW queue overflows.
> 
> 
> --Jesper
> p.s. I'm seeing a pattern with kswapdN contending on this lock.
> 
> @stack[697, kswapd3]:
>         __cgroup_rstat_lock+107
>         __cgroup_rstat_lock+107
>         cgroup_rstat_flush_locked+851
>         cgroup_rstat_flush+35
>         shrink_node+226
>         balance_pgdat+807
>         kswapd+521
>         kthread+228
>         ret_from_fork+48
>         ret_from_fork_asm+27
> 
> @stack[698, kswapd4]:
>         __cgroup_rstat_lock+107
>         __cgroup_rstat_lock+107
>         cgroup_rstat_flush_locked+851
>         cgroup_rstat_flush+35
>         shrink_node+226
>         balance_pgdat+807
>         kswapd+521
>         kthread+228
>         ret_from_fork+48
>         ret_from_fork_asm+27
> 
> @stack[699, kswapd5]:
>         __cgroup_rstat_lock+107
>         __cgroup_rstat_lock+107
>         cgroup_rstat_flush_locked+851
>         cgroup_rstat_flush+35
>         shrink_node+226
>         balance_pgdat+807
>         kswapd+521
>         kthread+228
>         ret_from_fork+48
>         ret_from_fork_asm+27
> 

Can you simply replace mem_cgroup_flush_stats() in
prepare_scan_control() with the ratelimited version and see if the issue
still persists for your production traffic?

Also were you able to get which specific stats are getting the most
updates?
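
A sketch of the first experiment suggested above (the ratelimited
flush). This assumes the v6.8-era call site, where prepare_scan_control()
in mm/vmscan.c flushes via mem_cgroup_flush_stats(sc->target_mem_cgroup),
and that mem_cgroup_flush_stats_ratelimited() takes the same argument;
treat the exact line as an assumption to be checked against the tree
being tested:

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ ... @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
-	mem_cgroup_flush_stats(sc->target_mem_cgroup);
+	/* experiment: tolerate slightly stale stats in reclaim heuristics */
+	mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup);
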
Yosry Ahmed April 19, 2024, 7:21 p.m. UTC | #8
[..]
> > > Perhaps we could experiment with always dropping the lock at CPU
> > > boundaries instead?
> > >
> >
> > I don't think this will be enough (always dropping the lock at CPU
> > boundaries).  My measured "lock-hold" times that is blocking IRQ (and
> > softirq) for too long.  When looking at prod with my new cgroup
> > tracepoint script[2]. When contention occurs, I see many Yields
> > happening and with same magnitude as Contended. But still see events
> > with long "lock-hold" times, even-though yields are high.
> >
> >  [2] https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
> >
> > Example output:
> >
> >  12:46:56 High Lock-contention: wait: 739 usec (0 ms) on CPU:56 comm:kswapd7
> >  12:46:56 Long lock-hold time: 6381 usec (6 ms) on CPU:27 comm:kswapd3
> >  12:46:56 Long lock-hold time: 18905 usec (18 ms) on CPU:100
> > comm:kworker/u261:12
> >
> >  12:46:56  time elapsed: 36 sec (interval = 1 sec)
> >   Flushes(2051) 15/interval (avg 56/sec)
> >   Locks(44464) 1340/interval (avg 1235/sec)
> >   Yields(42413) 1325/interval (avg 1178/sec)
> >   Contended(42112) 1322/interval (avg 1169/sec)
> >
> > There is reported 15 flushes/sec, but locks are yielded quickly.
> >
> > More problematically (for softirq latency) we see a Long lock-hold time
> > reaching 18 ms.  For network RX softirq I need lower than 0.5ms latency,
> > to avoid RX-ring HW queue overflows.

Here we are measuring yields against contention, but the main problem
here is IRQ serving latency, which doesn't have to correlate with
contention, right?

Perhaps contention is causing us to yield the lock every nth cpu
boundary, but apparently this is not enough for IRQ serving latency.
Dropping the lock on each boundary should improve IRQ serving latency,
regardless of the presence of contention.

Let's focus on one problem at a time ;)

> >
> >
> > --Jesper
> > p.s. I'm seeing a pattern with kswapdN contending on this lock.
> >
> > @stack[697, kswapd3]:
> >         __cgroup_rstat_lock+107
> >         __cgroup_rstat_lock+107
> >         cgroup_rstat_flush_locked+851
> >         cgroup_rstat_flush+35
> >         shrink_node+226
> >         balance_pgdat+807
> >         kswapd+521
> >         kthread+228
> >         ret_from_fork+48
> >         ret_from_fork_asm+27
> >
> > @stack[698, kswapd4]:
> >         __cgroup_rstat_lock+107
> >         __cgroup_rstat_lock+107
> >         cgroup_rstat_flush_locked+851
> >         cgroup_rstat_flush+35
> >         shrink_node+226
> >         balance_pgdat+807
> >         kswapd+521
> >         kthread+228
> >         ret_from_fork+48
> >         ret_from_fork_asm+27
> >
> > @stack[699, kswapd5]:
> >         __cgroup_rstat_lock+107
> >         __cgroup_rstat_lock+107
> >         cgroup_rstat_flush_locked+851
> >         cgroup_rstat_flush+35
> >         shrink_node+226
> >         balance_pgdat+807
> >         kswapd+521
> >         kthread+228
> >         ret_from_fork+48
> >         ret_from_fork_asm+27
> >
>
> Can you simply replace mem_cgroup_flush_stats() in
> prepare_scan_control() with the ratelimited version and see if the issue
> still persists for your production traffic?

With thresholding, the fact that we reach cgroup_rstat_flush() means
that there is a high magnitude of pending updates. I think Jesper
mentioned 128 CPUs before; that means 128 * 64 (MEMCG_CHARGE_BATCH) =
8192 page-sized updates, which could be over 33 MB with a 4K page size.

I am not sure if it's fine to ignore such updates in shrink_node(),
especially since it is sometimes called in a loop, so I imagine we may
want to see what changed after the last iteration.

>
> Also were you able to get which specific stats are getting the most
> updates?

This, on the other hand, would be very interesting. I think it is very
possible that we don't actually have 33 MBs of updates, but rather we
keep adding and subtracting from the same stat until we reach the
threshold. This could especially be true for hot stats like slab
allocations.
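
For context on the thresholding being discussed, here is a heavily
simplified sketch of the update counting (an assumption loosely modeled
on memcg_rstat_updated() and the flush-threshold check in
mm/memcontrol.c, not verbatim kernel code). The point is that each
update contributes its magnitude, so repeated +x/-x churn on the same
hot stat still drives the counter toward the flush threshold even when
the net value barely changes:

	#define MEMCG_CHARGE_BATCH	64

	static atomic64_t stats_updates;	/* per-memcg in reality */

	static void memcg_stat_updated_sketch(int val)
	{
		/* accumulate the magnitude of the update, not the net value */
		atomic64_add(abs(val), &stats_updates);
	}

	static bool memcg_needs_flush_sketch(void)
	{
		/* with ~128 online CPUs this is the "128 * 64" figure above */
		return atomic64_read(&stats_updates) >
		       MEMCG_CHARGE_BATCH * num_online_cpus();
	}
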

Patch

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ff68c904e647..a90d68a7c27f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,7 +9,7 @@ 
 
 #include <trace/events/cgroup.h>
 
-static DEFINE_SPINLOCK(cgroup_rstat_lock);
+static DEFINE_MUTEX(cgroup_rstat_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -238,10 +238,10 @@  static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
 {
 	bool contended;
 
-	contended = !spin_trylock_irq(&cgroup_rstat_lock);
+	contended = !mutex_trylock(&cgroup_rstat_lock);
 	if (contended) {
 		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
-		spin_lock_irq(&cgroup_rstat_lock);
+		mutex_lock(&cgroup_rstat_lock);
 	}
 	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
 }
@@ -250,7 +250,7 @@  static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
 	__releases(&cgroup_rstat_lock)
 {
 	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
-	spin_unlock_irq(&cgroup_rstat_lock);
+	mutex_unlock(&cgroup_rstat_lock);
 }
 
 /* see cgroup_rstat_flush() */
@@ -278,7 +278,7 @@  static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 		}
 
 		/* play nice and yield if necessary */
-		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
+		if (need_resched()) {
 			__cgroup_rstat_unlock(cgrp, cpu);
 			if (!cond_resched())
 				cpu_relax();