diff mbox series

[v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints

Message ID 171457225108.4159924.12821205549807669839.stgit@firesoul (mailing list archive)
State Not Applicable
Headers show
Series [v1] cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Jesper Dangaard Brouer May 1, 2024, 2:04 p.m. UTC
This closely resembles helpers added for the global cgroup_rstat_lock in
commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.

Based on production workloads, we observe the fast-path "update" function
cgroup_rstat_updated() is invoked around 3 million times per sec, while the
"flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
can see periodic spikes of 700 invocations/sec.

For this reason, the tracepoints are split into normal and fastpath
versions for this per-CPU lock. Making it feasible for production to
continuously monitor the non-fastpath tracepoint to detect lock contention
issues. The reason for monitoring is that lock disables IRQs which can
disturb e.g. softirq processing on the local CPUs involved. When the
global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
this per CPU lock becomes the next bottleneck that can introduce latency
variations.

A practical bpftrace script for monitoring contention latency:

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
     @start[tid]=nsecs; @cnt[probe]=count()}
   tracepoint:cgroup:cgroup_rstat_cpu_locked {
     if (args->contended) {
       @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
     @cnt[probe]=count()}
   interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
---
 include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
 kernel/cgroup/rstat.c         |   70 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 108 insertions(+), 18 deletions(-)

Comments

Waiman Long May 1, 2024, 2:24 p.m. UTC | #1
On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
> This closely resembles helpers added for the global cgroup_rstat_lock in
> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>
> Based on production workloads, we observe the fast-path "update" function
> cgroup_rstat_updated() is invoked around 3 million times per sec, while the
> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
> can see periodic spikes of 700 invocations/sec.
>
> For this reason, the tracepoints are split into normal and fastpath
> versions for this per-CPU lock. Making it feasible for production to
> continuously monitor the non-fastpath tracepoint to detect lock contention
> issues. The reason for monitoring is that lock disables IRQs which can
> disturb e.g. softirq processing on the local CPUs involved. When the
> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
> this per CPU lock becomes the next bottleneck that can introduce latency
> variations.
>
> A practical bpftrace script for monitoring contention latency:
>
>   bpftrace -e '
>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>       @start[tid]=nsecs; @cnt[probe]=count()}
>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>       if (args->contended) {
>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>       @cnt[probe]=count()}
>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'

This is a per-cpu lock. So the only possible contention involves only 2 
CPUs - a local CPU invoking cgroup_rstat_updated(). A flusher CPU doing 
cgroup_rstat_flush_locked() calling into cgroup_rstat_updated_list(). 
With recent commits to reduce the percpu lock hold time, I doubt lock 
contention on the percpu lock will have a great impact on latency. So do 
we really need such an elaborate scheme to monitor this? BTW, the 
additional code will also add to the worst case latency.

Cheers,
Longman

>
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> ---
>   include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
>   kernel/cgroup/rstat.c         |   70 ++++++++++++++++++++++++++++++++++-------
>   2 files changed, 108 insertions(+), 18 deletions(-)
>
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index 13f375800135..0b95865a90f3 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>   
>   DECLARE_EVENT_CLASS(cgroup_rstat,
>   
> -	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>   
> -	TP_ARGS(cgrp, cpu_in_loop, contended),
> +	TP_ARGS(cgrp, cpu, contended),
>   
>   	TP_STRUCT__entry(
>   		__field(	int,		root			)
>   		__field(	int,		level			)
>   		__field(	u64,		id			)
> -		__field(	int,		cpu_in_loop		)
> +		__field(	int,		cpu			)
>   		__field(	bool,		contended		)
>   	),
>   
> @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>   		__entry->root = cgrp->root->hierarchy_id;
>   		__entry->id = cgroup_id(cgrp);
>   		__entry->level = cgrp->level;
> -		__entry->cpu_in_loop = cpu_in_loop;
> +		__entry->cpu = cpu;
>   		__entry->contended = contended;
>   	),
>   
> -	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
> +	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
>   		  __entry->root, __entry->id, __entry->level,
> -		  __entry->cpu_in_loop, __entry->contended)
> +		  __entry->cpu, __entry->contended)
>   );
>   
> +/* Related to global: cgroup_rstat_lock */
>   DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>   
>   	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>   	TP_ARGS(cgrp, cpu, contended)
>   );
>   
> +/* Related to per CPU: cgroup_rstat_cpu_lock */
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
>   #endif /* _TRACE_CGROUP_H */
>   
>   /* This part must be outside protection */
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 52e3b0ed1cee..fb8b49437573 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>   	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>   }
>   
> +/*
> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + *
> + * This makes it easier to diagnose locking issues and contention in
> + * production environments. The parameter @fast_path determine the
> + * tracepoints being added, allowing us to diagnose "flush" related
> + * operations without handling high-frequency fast-path "update" events.
> + */
> +static __always_inline
> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
> +				     struct cgroup *cgrp, const bool fast_path)
> +{
> +	unsigned long flags;
> +	bool contended;
> +
> +	/*
> +	 * The _irqsave() is needed because cgroup_rstat_lock is
> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> +	 * this lock with the _irq() suffix only disables interrupts on
> +	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> +	 * interrupts on both configurations. The _irqsave() ensures
> +	 * that interrupts are always disabled and later restored.
> +	 */
> +	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
> +	if (contended) {
> +		if (fast_path)
> +			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
> +		else
> +			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
> +
> +		raw_spin_lock_irqsave(cpu_lock, flags);
> +	}
> +
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
> +	else
> +		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
> +
> +	return flags;
> +}
> +
> +static __always_inline
> +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
> +			      struct cgroup *cgrp, unsigned long flags,
> +			      const bool fast_path)
> +{
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
> +	else
> +		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
> +
> +	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +}
> +
>   /**
>    * cgroup_rstat_updated - keep track of updated rstat_cpu
>    * @cgrp: target cgroup
> @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
>   		return;
>   
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>   
>   	/* put @cgrp and all ancestors on the corresponding updated lists */
>   	while (true) {
> @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   		cgrp = parent;
>   	}
>   
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
>   }
>   
>   /**
> @@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	struct cgroup *head = NULL, *parent, *child;
>   	unsigned long flags;
>   
> -	/*
> -	 * The _irqsave() is needed because cgroup_rstat_lock is
> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> -	 * this lock with the _irq() suffix only disables interrupts on
> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> -	 * interrupts on both configurations. The _irqsave() ensures
> -	 * that interrupts are always disabled and later restored.
> -	 */
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
>   
>   	/* Return NULL if this subtree is not on-list */
>   	if (!rstatc->updated_next)
> @@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	if (child != root)
>   		head = cgroup_rstat_push_children(head, child, cpu);
>   unlock_ret:
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
>   	return head;
>   }
>   
>
>
Jesper Dangaard Brouer May 1, 2024, 5:22 p.m. UTC | #2
On 01/05/2024 16.24, Waiman Long wrote:
> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>> This closely resembles helpers added for the global cgroup_rstat_lock in
>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>
>> Based on production workloads, we observe the fast-path "update" function
>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>> while the
>> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
>> can see periodic spikes of 700 invocations/sec.
>>
>> For this reason, the tracepoints are split into normal and fastpath
>> versions for this per-CPU lock. Making it feasible for production to
>> continuously monitor the non-fastpath tracepoint to detect lock 
>> contention
>> issues. The reason for monitoring is that lock disables IRQs which can
>> disturb e.g. softirq processing on the local CPUs involved. When the
>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
>> this per CPU lock becomes the next bottleneck that can introduce latency
>> variations.
>>
>> A practical bpftrace script for monitoring contention latency:
>>
>>   bpftrace -e '
>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>       if (args->contended) {
>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>       @cnt[probe]=count()}
>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>> clear(@cnt);}'
> 
> This is a per-cpu lock. So the only possible contention involves only 2 
> CPUs - a local CPU invoking cgroup_rstat_updated(). A flusher CPU doing 
> cgroup_rstat_flush_locked() calling into cgroup_rstat_updated_list(). 
> With recent commits to reduce the percpu lock hold time, I doubt lock 
> contention on the percpu lock will have a great impact on latency. 

I do appriciate your recent changes to reduce the percpu lock hold time.
These tracepoints allow me to measure and differentiate the percpu lock
hold time vs. the flush time.

In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
upto 29 ms, which is time spend after obtaining the lock (runtime under
lock).  I was expecting to see "High Lock-contention wait" [L82] which
is the time waiting for obtaining the lock.

This is why I'm adding these tracepoints, as they allow me to digg
deeper, to understand where this high runtime variations originate from.


Data:

  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 comm:kswapd4
  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 comm:kswapd4
  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 comm:kswapd4
  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 comm:kswapd6
  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
comm:kworker/u261:12
  16:52:09  time elapsed: 80 sec (interval = 1 sec)
   Flushes(5033) 294/interval (avg 62/sec)
   Locks(53374) 1748/interval (avg 667/sec)
   Yields(48341) 1454/interval (avg 604/sec)
   Contended(48104) 1450/interval (avg 601/sec)


> So do 
> we really need such an elaborate scheme to monitor this? BTW, the 
> additional code will also add to the worst case latency.

Hmm, I designed this code to have minimal impact, as tracepoints are
no-ops until activated.  I really doubt this code will change the latency.


[1] 
https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt

[L100] 
https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100

[L82] 
https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82

>>
>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>

More data, the histogram of time spend under the lock have some strange
variation issues with a group in 4ms to 65ms area. Investigating what
can be causeing this... which next step depend in these tracepoints.

@lock_cnt: 759146

@locked_ns:
[1K, 2K)             499 | 
      |
[2K, 4K)          206928 
|@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 
      |
[8K, 16K)          64453 |@@@@@@@@@@@@@@@@ 
      |
[16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 
      |
[32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@ 
      |
[64K, 128K)        38359 |@@@@@@@@@ 
      |
[128K, 256K)       46597 |@@@@@@@@@@@ 
      |
[256K, 512K)       32466 |@@@@@@@@ 
      |
[512K, 1M)          3945 | 
      |
[1M, 2M)             642 | 
      |
[2M, 4M)             750 | 
      |
[4M, 8M)            1932 | 
      |
[8M, 16M)           2114 | 
      |
[16M, 32M)          1039 | 
      |
[32M, 64M)           108 | 
      |




>> ---
>>   include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
>>   kernel/cgroup/rstat.c         |   70 
>> ++++++++++++++++++++++++++++++++++-------
>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/trace/events/cgroup.h 
>> b/include/trace/events/cgroup.h
>> index 13f375800135..0b95865a90f3 100644
>> --- a/include/trace/events/cgroup.h
>> +++ b/include/trace/events/cgroup.h
>> @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>>   DECLARE_EVENT_CLASS(cgroup_rstat,
>> -    TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> -    TP_ARGS(cgrp, cpu_in_loop, contended),
>> +    TP_ARGS(cgrp, cpu, contended),
>>       TP_STRUCT__entry(
>>           __field(    int,        root            )
>>           __field(    int,        level            )
>>           __field(    u64,        id            )
>> -        __field(    int,        cpu_in_loop        )
>> +        __field(    int,        cpu            )
>>           __field(    bool,        contended        )
>>       ),
>> @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>>           __entry->root = cgrp->root->hierarchy_id;
>>           __entry->id = cgroup_id(cgrp);
>>           __entry->level = cgrp->level;
>> -        __entry->cpu_in_loop = cpu_in_loop;
>> +        __entry->cpu = cpu;
>>           __entry->contended = contended;
>>       ),
>> -    TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock 
>> contended:%d",
>> +    TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
>>             __entry->root, __entry->id, __entry->level,
>> -          __entry->cpu_in_loop, __entry->contended)
>> +          __entry->cpu, __entry->contended)
>>   );
>> +/* Related to global: cgroup_rstat_lock */
>>   DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>>       TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>>       TP_ARGS(cgrp, cpu, contended)
>>   );
>> +/* Related to per CPU: cgroup_rstat_cpu_lock */
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>> +
>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>> +
>> +    TP_ARGS(cgrp, cpu, contended)
>> +);
>> +
>>   #endif /* _TRACE_CGROUP_H */
>>   /* This part must be outside protection */
>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>> index 52e3b0ed1cee..fb8b49437573 100644
>> --- a/kernel/cgroup/rstat.c
>> +++ b/kernel/cgroup/rstat.c
>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>   }
>> +/*
>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>> + *
>> + * This makes it easier to diagnose locking issues and contention in
>> + * production environments. The parameter @fast_path determine the
>> + * tracepoints being added, allowing us to diagnose "flush" related
>> + * operations without handling high-frequency fast-path "update" events.
>> + */
>> +static __always_inline
>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
>> +                     struct cgroup *cgrp, const bool fast_path)
>> +{
>> +    unsigned long flags;
>> +    bool contended;
>> +
>> +    /*
>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>> +     * this lock with the _irq() suffix only disables interrupts on
>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>> +     * interrupts on both configurations. The _irqsave() ensures
>> +     * that interrupts are always disabled and later restored.
>> +     */
>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>> +    if (contended) {
>> +        if (fast_path)
>> +            trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, 
>> contended);
>> +        else
>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
>> +
>> +        raw_spin_lock_irqsave(cpu_lock, flags);
>> +    }
>> +
>> +    if (fast_path)
>> +        trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
>> +    else
>> +        trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
>> +
>> +    return flags;
>> +}
>> +
>> +static __always_inline
>> +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
>> +                  struct cgroup *cgrp, unsigned long flags,
>> +                  const bool fast_path)
>> +{
>> +    if (fast_path)
>> +        trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
>> +    else
>> +        trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
>> +
>> +    raw_spin_unlock_irqrestore(cpu_lock, flags);
>> +}
>> +
>>   /**
>>    * cgroup_rstat_updated - keep track of updated rstat_cpu
>>    * @cgrp: target cgroup
>> @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup 
>> *cgrp, int cpu)
>>       if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
>>           return;
>> -    raw_spin_lock_irqsave(cpu_lock, flags);
>> +    flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>>       /* put @cgrp and all ancestors on the corresponding updated 
>> lists */
>>       while (true) {
>> @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup 
>> *cgrp, int cpu)
>>           cgrp = parent;
>>       }
>> -    raw_spin_unlock_irqrestore(cpu_lock, flags);
>> +    _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
>>   }
>>   /**
>> @@ -153,15 +207,7 @@ static struct cgroup 
>> *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>>       struct cgroup *head = NULL, *parent, *child;
>>       unsigned long flags;
>> -    /*
>> -     * The _irqsave() is needed because cgroup_rstat_lock is
>> -     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>> -     * this lock with the _irq() suffix only disables interrupts on
>> -     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>> -     * interrupts on both configurations. The _irqsave() ensures
>> -     * that interrupts are always disabled and later restored.
>> -     */
>> -    raw_spin_lock_irqsave(cpu_lock, flags);
>> +    flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
>>       /* Return NULL if this subtree is not on-list */
>>       if (!rstatc->updated_next)
>> @@ -198,7 +244,7 @@ static struct cgroup 
>> *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>>       if (child != root)
>>           head = cgroup_rstat_push_children(head, child, cpu);
>>   unlock_ret:
>> -    raw_spin_unlock_irqrestore(cpu_lock, flags);
>> +    _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
>>       return head;
>>   }
>>
>>
>
Jesper Dangaard Brouer May 2, 2024, 11:23 a.m. UTC | #3
On 01/05/2024 20.41, Waiman Long wrote:
> On 5/1/24 13:22, Jesper Dangaard Brouer wrote:
>>
>>
>> On 01/05/2024 16.24, Waiman Long wrote:
>>> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>>>> This closely resembles helpers added for the global 
>>>> cgroup_rstat_lock in
>>>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>>>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>>>
>>>> Based on production workloads, we observe the fast-path "update" 
>>>> function
>>>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>>>> while the
>>>> "flush" function cgroup_rstat_flush_locked(), walking each possible 
>>>> CPU,
>>>> can see periodic spikes of 700 invocations/sec.
>>>>
>>>> For this reason, the tracepoints are split into normal and fastpath
>>>> versions for this per-CPU lock. Making it feasible for production to
>>>> continuously monitor the non-fastpath tracepoint to detect lock 
>>>> contention
>>>> issues. The reason for monitoring is that lock disables IRQs which can
>>>> disturb e.g. softirq processing on the local CPUs involved. When the
>>>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a 
>>>> mutex),
>>>> this per CPU lock becomes the next bottleneck that can introduce 
>>>> latency
>>>> variations.
>>>>
>>>> A practical bpftrace script for monitoring contention latency:
>>>>
>>>>   bpftrace -e '
>>>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>>>       if (args->contended) {
>>>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>>>       @cnt[probe]=count()}
>>>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>>>> clear(@cnt);}'
>>>
>>> This is a per-cpu lock. So the only possible contention involves only 
>>> 2 CPUs - a local CPU invoking cgroup_rstat_updated(). A flusher CPU 
>>> doing cgroup_rstat_flush_locked() calling into 
>>> cgroup_rstat_updated_list(). With recent commits to reduce the percpu 
>>> lock hold time, I doubt lock contention on the percpu lock will have 
>>> a great impact on latency. 
>>
>> I do appriciate your recent changes to reduce the percpu lock hold time.
>> These tracepoints allow me to measure and differentiate the percpu lock
>> hold time vs. the flush time.
>>
>> In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
>> upto 29 ms, which is time spend after obtaining the lock (runtime under
>> lock).  I was expecting to see "High Lock-contention wait" [L82] which
>> is the time waiting for obtaining the lock.
>>
>> This is why I'm adding these tracepoints, as they allow me to digg
>> deeper, to understand where this high runtime variations originate from.
>>
>>
>> Data:
>>
>>  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 comm:kswapd4
>>  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 comm:kswapd4
>>  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 comm:kswapd4
>>  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 comm:kswapd6
>>  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
>> comm:kworker/u261:12
> That lock hold time is much higher than I would have expected.
>>  16:52:09 time elapsed: 80 sec (interval = 1 sec)
>>   Flushes(5033) 294/interval (avg 62/sec)
>>   Locks(53374) 1748/interval (avg 667/sec)
>>   Yields(48341) 1454/interval (avg 604/sec)
>>   Contended(48104) 1450/interval (avg 601/sec)
>>
>>
>>> So do we really need such an elaborate scheme to monitor this? BTW, 
>>> the additional code will also add to the worst case latency.
>>
>> Hmm, I designed this code to have minimal impact, as tracepoints are
>> no-ops until activated.  I really doubt this code will change the 
>> latency.
>>
>>
>> [1] 
>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
>>
>> [L100] 
>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100
>>
>> [L82] 
>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82
>>
>>>>
>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>
>> More data, the histogram of time spend under the lock have some strange
>> variation issues with a group in 4ms to 65ms area. Investigating what
>> can be causeing this... which next step depend in these tracepoints.
>>
>> @lock_cnt: 759146
>>
>> @locked_ns:
>> [1K, 2K)             499 |      |
>> [2K, 4K)          206928 
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>> [64K, 128K)        38359 |@@@@@@@@@      |
>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>> [256K, 512K)       32466 |@@@@@@@@      |
>> [512K, 1M)          3945 |      |
>> [1M, 2M)             642 |      |
>> [2M, 4M)             750 |      |
>> [4M, 8M)            1932 |      |
>> [8M, 16M)           2114 |      |
>> [16M, 32M)          1039 |      |
>> [32M, 64M)           108 |      |
>>
>>
>>
>>
>>>> ---
>>>>   include/trace/events/cgroup.h |   56 
>>>> +++++++++++++++++++++++++++++----
>>>>   kernel/cgroup/rstat.c         |   70 
>>>> ++++++++++++++++++++++++++++++++++-------
>>>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/include/trace/events/cgroup.h 
>>>> b/include/trace/events/cgroup.h
>>>> index 13f375800135..0b95865a90f3 100644
>>>> --- a/include/trace/events/cgroup.h
[...]
>>>> +++ b/include/trace/events/cgroup.h >>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>>>> +
>>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>>> +
>>>> +    TP_ARGS(cgrp, cpu, contended)
>>>> +);
>>>> +
>>>>   #endif /* _TRACE_CGROUP_H */
>>>>   /* This part must be outside protection */
>>>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>>>> index 52e3b0ed1cee..fb8b49437573 100644
>>>> --- a/kernel/cgroup/rstat.c
>>>> +++ b/kernel/cgroup/rstat.c
>>>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>>>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>>>   }
>>>> +/*
>>>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>>>> + *
>>>> + * This makes it easier to diagnose locking issues and contention in
>>>> + * production environments. The parameter @fast_path determine the
>>>> + * tracepoints being added, allowing us to diagnose "flush" related
>>>> + * operations without handling high-frequency fast-path "update" 
>>>> events.
>>>> + */
>>>> +static __always_inline
>>>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int 
>>>> cpu,
>>>> +                     struct cgroup *cgrp, const bool fast_path)
>>>> +{
>>>> +    unsigned long flags;
>>>> +    bool contended;
>>>> +
>>>> +    /*
>>>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>>>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>>> +     * this lock with the _irq() suffix only disables interrupts on
>>>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>>> +     * interrupts on both configurations. The _irqsave() ensures
>>>> +     * that interrupts are always disabled and later restored.
>>>> +     */
>>>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>>> +    if (contended) {
>>>> +        if (fast_path)
>>>> + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
>>>> +        else
>>>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, 
>>>> contended);
>>>> +
>>>> +        raw_spin_lock_irqsave(cpu_lock, flags);
> 
> Could you do a local_irq_save() before calling trace_cgroup*() and 
> raw_spin_lock()? Would that help in eliminating this high lock hold time?
> 

Nope it will not eliminating high lock *hold* time, because the hold
start timestamp is first taken *AFTER* obtaining the lock.

It could help the contended "wait-time" measurement, but my prod
measurements show this isn't an issues.

> You can also do a local_irq_save() first before the trylock. That will 
> eliminate the duplicated irq_restore() and irq_save() when there is 
> contention.

I wrote the code like this on purpose ;-)
My issue with this code/lock is it cause latency issues for softirq 
NET_RX. So, when I detect a "contended" lock event, I do want a 
irq_restore() as that will allow networking/do_softirq() to run before I 
start waiting for the lock (with IRQ disabled).


> If not, there may be NMIs mixed in.
> 

NMIs are definitely on my list of things to investigate.
These AMD CPUs also have other types of interrupts that needs a close look.

The easier explaination is that the lock isn't "yielded" on every cycle
through the for each CPU loop.

Lets look at the data I provided above:

 >>   Flushes(5033) 294/interval (avg 62/sec)
 >>   Locks(53374) 1748/interval (avg 667/sec)
 >>   Yields(48341) 1454/interval (avg 604/sec)
 >>   Contended(48104) 1450/interval (avg 601/sec)

In this 1 second sample, we have 294 flushes, and more yields 1454,
great but the factor is not 128 (num-of-CPUs) but closer to 5.  Thus, on
average we hold the lock for (128/5) 25.6 CPUs-walks.

We have spoken about releasing the lock on for_each CPU before... it
will likely solve this long hold time, but IMHO a mutex is still the
better solution.

--Jesper
Waiman Long May 2, 2024, 6:19 p.m. UTC | #4
On 5/2/24 07:23, Jesper Dangaard Brouer wrote:
>
>
> On 01/05/2024 20.41, Waiman Long wrote:
>> On 5/1/24 13:22, Jesper Dangaard Brouer wrote:
>>>
>>>
>>> On 01/05/2024 16.24, Waiman Long wrote:
>>>> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>>>>> This closely resembles helpers added for the global 
>>>>> cgroup_rstat_lock in
>>>>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>>>>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>>>>
>>>>> Based on production workloads, we observe the fast-path "update" 
>>>>> function
>>>>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>>>>> while the
>>>>> "flush" function cgroup_rstat_flush_locked(), walking each 
>>>>> possible CPU,
>>>>> can see periodic spikes of 700 invocations/sec.
>>>>>
>>>>> For this reason, the tracepoints are split into normal and fastpath
>>>>> versions for this per-CPU lock. Making it feasible for production to
>>>>> continuously monitor the non-fastpath tracepoint to detect lock 
>>>>> contention
>>>>> issues. The reason for monitoring is that lock disables IRQs which 
>>>>> can
>>>>> disturb e.g. softirq processing on the local CPUs involved. When the
>>>>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a 
>>>>> mutex),
>>>>> this per CPU lock becomes the next bottleneck that can introduce 
>>>>> latency
>>>>> variations.
>>>>>
>>>>> A practical bpftrace script for monitoring contention latency:
>>>>>
>>>>>   bpftrace -e '
>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>>>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>>>>       if (args->contended) {
>>>>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>>>>       @cnt[probe]=count()}
>>>>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>>>>> clear(@cnt);}'
>>>>
>>>> This is a per-cpu lock. So the only possible contention involves 
>>>> only 2 CPUs - a local CPU invoking cgroup_rstat_updated(). A 
>>>> flusher CPU doing cgroup_rstat_flush_locked() calling into 
>>>> cgroup_rstat_updated_list(). With recent commits to reduce the 
>>>> percpu lock hold time, I doubt lock contention on the percpu lock 
>>>> will have a great impact on latency. 
>>>
>>> I do appriciate your recent changes to reduce the percpu lock hold 
>>> time.
>>> These tracepoints allow me to measure and differentiate the percpu lock
>>> hold time vs. the flush time.
>>>
>>> In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
>>> upto 29 ms, which is time spend after obtaining the lock (runtime under
>>> lock).  I was expecting to see "High Lock-contention wait" [L82] which
>>> is the time waiting for obtaining the lock.
>>>
>>> This is why I'm adding these tracepoints, as they allow me to digg
>>> deeper, to understand where this high runtime variations originate 
>>> from.
>>>
>>>
>>> Data:
>>>
>>>  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 
>>> comm:kswapd4
>>>  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 
>>> comm:kswapd4
>>>  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 
>>> comm:kswapd4
>>>  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 
>>> comm:kswapd6
>>>  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
>>> comm:kworker/u261:12
>> That lock hold time is much higher than I would have expected.
>>>  16:52:09 time elapsed: 80 sec (interval = 1 sec)
>>>   Flushes(5033) 294/interval (avg 62/sec)
>>>   Locks(53374) 1748/interval (avg 667/sec)
>>>   Yields(48341) 1454/interval (avg 604/sec)
>>>   Contended(48104) 1450/interval (avg 601/sec)
>>>
>>>
>>>> So do we really need such an elaborate scheme to monitor this? BTW, 
>>>> the additional code will also add to the worst case latency.
>>>
>>> Hmm, I designed this code to have minimal impact, as tracepoints are
>>> no-ops until activated.  I really doubt this code will change the 
>>> latency.
>>>
>>>
>>> [1] 
>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
>>>
>>> [L100] 
>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100
>>>
>>> [L82] 
>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82
>>>
>>>>>
>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>
>>> More data, the histogram of time spend under the lock have some strange
>>> variation issues with a group in 4ms to 65ms area. Investigating what
>>> can be causeing this... which next step depend in these tracepoints.
>>>
>>> @lock_cnt: 759146
>>>
>>> @locked_ns:
>>> [1K, 2K)             499 |      |
>>> [2K, 4K)          206928 
>>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
>>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>>> [64K, 128K)        38359 |@@@@@@@@@      |
>>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>>> [256K, 512K)       32466 |@@@@@@@@      |
>>> [512K, 1M)          3945 |      |
>>> [1M, 2M)             642 |      |
>>> [2M, 4M)             750 |      |
>>> [4M, 8M)            1932 |      |
>>> [8M, 16M)           2114 |      |
>>> [16M, 32M)          1039 |      |
>>> [32M, 64M)           108 |      |
>>>
>>>
>>>
>>>
>>>>> ---
>>>>>   include/trace/events/cgroup.h |   56 
>>>>> +++++++++++++++++++++++++++++----
>>>>>   kernel/cgroup/rstat.c         |   70 
>>>>> ++++++++++++++++++++++++++++++++++-------
>>>>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/include/trace/events/cgroup.h 
>>>>> b/include/trace/events/cgroup.h
>>>>> index 13f375800135..0b95865a90f3 100644
>>>>> --- a/include/trace/events/cgroup.h
> [...]
>>>>> +++ b/include/trace/events/cgroup.h >>>> 
>>>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>>>>> +
>>>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>>>> +
>>>>> +    TP_ARGS(cgrp, cpu, contended)
>>>>> +);
>>>>> +
>>>>>   #endif /* _TRACE_CGROUP_H */
>>>>>   /* This part must be outside protection */
>>>>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>>>>> index 52e3b0ed1cee..fb8b49437573 100644
>>>>> --- a/kernel/cgroup/rstat.c
>>>>> +++ b/kernel/cgroup/rstat.c
>>>>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>>>>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>>>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>>>>   }
>>>>> +/*
>>>>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>>>>> + *
>>>>> + * This makes it easier to diagnose locking issues and contention in
>>>>> + * production environments. The parameter @fast_path determine the
>>>>> + * tracepoints being added, allowing us to diagnose "flush" related
>>>>> + * operations without handling high-frequency fast-path "update" 
>>>>> events.
>>>>> + */
>>>>> +static __always_inline
>>>>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, 
>>>>> int cpu,
>>>>> +                     struct cgroup *cgrp, const bool fast_path)
>>>>> +{
>>>>> +    unsigned long flags;
>>>>> +    bool contended;
>>>>> +
>>>>> +    /*
>>>>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>>>>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>>>> +     * this lock with the _irq() suffix only disables interrupts on
>>>>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>>>> +     * interrupts on both configurations. The _irqsave() ensures
>>>>> +     * that interrupts are always disabled and later restored.
>>>>> +     */
>>>>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>>>> +    if (contended) {
>>>>> +        if (fast_path)
>>>>> + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, 
>>>>> contended);
>>>>> +        else
>>>>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, 
>>>>> contended);
>>>>> +
>>>>> +        raw_spin_lock_irqsave(cpu_lock, flags);
>>
>> Could you do a local_irq_save() before calling trace_cgroup*() and 
>> raw_spin_lock()? Would that help in eliminating this high lock hold 
>> time?
>>
>
> Nope it will not eliminating high lock *hold* time, because the hold
> start timestamp is first taken *AFTER* obtaining the lock.
>
> It could help the contended "wait-time" measurement, but my prod
> measurements show this isn't an issues.

Right.


>
>> You can also do a local_irq_save() first before the trylock. That 
>> will eliminate the duplicated irq_restore() and irq_save() when there 
>> is contention.
>
> I wrote the code like this on purpose ;-)
> My issue with this code/lock is it cause latency issues for softirq 
> NET_RX. So, when I detect a "contended" lock event, I do want a 
> irq_restore() as that will allow networking/do_softirq() to run before 
> I start waiting for the lock (with IRQ disabled).
>
Assuming the time taken by the tracing code is negligible, we are 
talking about disabling IRQ almost immediate after enabling it. The 
trylock time should be relatively short so the additional delay due to 
irq disabled for the whole period is insignificant.
>
>> If not, there may be NMIs mixed in.
>>
>
> NMIs are definitely on my list of things to investigate.
> These AMD CPUs also have other types of interrupts that needs a close 
> look.
>
> The easier explaination is that the lock isn't "yielded" on every cycle
> through the for each CPU loop.
>
> Lets look at the data I provided above:
>
> >>   Flushes(5033) 294/interval (avg 62/sec)
> >>   Locks(53374) 1748/interval (avg 667/sec)
> >>   Yields(48341) 1454/interval (avg 604/sec)
> >>   Contended(48104) 1450/interval (avg 601/sec)
>
> In this 1 second sample, we have 294 flushes, and more yields 1454,
> great but the factor is not 128 (num-of-CPUs) but closer to 5. Thus, on
> average we hold the lock for (128/5) 25.6 CPUs-walks.
>
> We have spoken about releasing the lock on for_each CPU before... it
> will likely solve this long hold time, but IMHO a mutex is still the
> better solution.

I may have mistakenly thinking the lock hold time refers to just the 
cpu_lock. Your reported times here are about the cgroup_rstat_lock. 
Right? If so, the numbers make sense to me.

Cheers,
Longman
Shakeel Butt May 2, 2024, 7:44 p.m. UTC | #5
On Wed, May 01, 2024 at 07:22:26PM +0200, Jesper Dangaard Brouer wrote:
> 
[...]
> 
> More data, the histogram of time spend under the lock have some strange
> variation issues with a group in 4ms to 65ms area. Investigating what
> can be causeing this... which next step depend in these tracepoints.
> 
> @lock_cnt: 759146
> 
> @locked_ns:
> [1K, 2K)             499 |      |
> [2K, 4K)          206928
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
> [64K, 128K)        38359 |@@@@@@@@@      |
> [128K, 256K)       46597 |@@@@@@@@@@@      |
> [256K, 512K)       32466 |@@@@@@@@      |
> [512K, 1M)          3945 |      |
> [1M, 2M)             642 |      |
> [2M, 4M)             750 |      |
> [4M, 8M)            1932 |      |
> [8M, 16M)           2114 |      |
> [16M, 32M)          1039 |      |
> [32M, 64M)           108 |      |
> 

Am I understanding correctly that 1K is 1 microsecond and 1M is 1
millisecond? Is it possible to further divide this table into update
side and flush side?
Jesper Dangaard Brouer May 3, 2024, 12:58 p.m. UTC | #6
On 02/05/2024 21.44, Shakeel Butt wrote:
> On Wed, May 01, 2024 at 07:22:26PM +0200, Jesper Dangaard Brouer wrote:
>>
> [...]
>>
>> More data, the histogram of time spend under the lock have some strange
>> variation issues with a group in 4ms to 65ms area. Investigating what
>> can be causeing this... which next step depend in these tracepoints.
>>
>> @lock_cnt: 759146
>>
>> @locked_ns:
>> [1K, 2K)             499 |      |
>> [2K, 4K)          206928
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>> [64K, 128K)        38359 |@@@@@@@@@      |
>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>> [256K, 512K)       32466 |@@@@@@@@      |
>> [512K, 1M)          3945 |      |
>> [1M, 2M)             642 |      |
>> [2M, 4M)             750 |      |
>> [4M, 8M)            1932 |      |
>> [8M, 16M)           2114 |      |
>> [16M, 32M)          1039 |      |
>> [32M, 64M)           108 |      |
>>
> 
> Am I understanding correctly that 1K is 1 microsecond and 1M is 1
> millisecond? 

Correct.

> Is it possible to further divide this table into update
> side and flush side?
>

This is *only* flush side.

You question indicate, that we are talking past each-other ;-)

Measurements above is with (recently) accepted tracepoints (e.g. not the
proposed tracepoints in this patch).  I'm arguing with existing
tracepoint that I'm seeing this data, and arguing I need per-CPU
tracepoints to dig deeper into this (as proposed in this patch).

The "update side" can only be measured once we apply this patch.

This morning I got 6 prod machines booted with new kernels, that contain 
this proposed per-CPU lock tracepoint patch.  And 3 of these machines 
have the Mutex lock change also.  No data to share yet...

--Jesper
Jesper Dangaard Brouer May 3, 2024, 2 p.m. UTC | #7
On 02/05/2024 20.19, Waiman Long wrote:
> On 5/2/24 07:23, Jesper Dangaard Brouer wrote:
>>
>>
>> On 01/05/2024 20.41, Waiman Long wrote:
>>> On 5/1/24 13:22, Jesper Dangaard Brouer wrote:
>>>>
>>>>
>>>> On 01/05/2024 16.24, Waiman Long wrote:
>>>>> On 5/1/24 10:04, Jesper Dangaard Brouer wrote:
>>>>>> This closely resembles helpers added for the global 
>>>>>> cgroup_rstat_lock in
>>>>>> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
>>>>>> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
>>>>>>
>>>>>> Based on production workloads, we observe the fast-path "update" 
>>>>>> function
>>>>>> cgroup_rstat_updated() is invoked around 3 million times per sec, 
>>>>>> while the
>>>>>> "flush" function cgroup_rstat_flush_locked(), walking each 
>>>>>> possible CPU,
>>>>>> can see periodic spikes of 700 invocations/sec.
>>>>>>
>>>>>> For this reason, the tracepoints are split into normal and fastpath
>>>>>> versions for this per-CPU lock. Making it feasible for production to
>>>>>> continuously monitor the non-fastpath tracepoint to detect lock 
>>>>>> contention
>>>>>> issues. The reason for monitoring is that lock disables IRQs which 
>>>>>> can
>>>>>> disturb e.g. softirq processing on the local CPUs involved. When the
>>>>>> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a 
>>>>>> mutex),
>>>>>> this per CPU lock becomes the next bottleneck that can introduce 
>>>>>> latency
>>>>>> variations.
>>>>>>
>>>>>> A practical bpftrace script for monitoring contention latency:
>>>>>>
>>>>>>   bpftrace -e '
>>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>>>>>       @start[tid]=nsecs; @cnt[probe]=count()}
>>>>>>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>>>>>       if (args->contended) {
>>>>>>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>>>>>>       @cnt[probe]=count()}
>>>>>>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); 
>>>>>> clear(@cnt);}'
>>>>>
>>>>> This is a per-cpu lock. So the only possible contention involves 
>>>>> only 2 CPUs - a local CPU invoking cgroup_rstat_updated(). A 
>>>>> flusher CPU doing cgroup_rstat_flush_locked() calling into 
>>>>> cgroup_rstat_updated_list(). With recent commits to reduce the 
>>>>> percpu lock hold time, I doubt lock contention on the percpu lock 
>>>>> will have a great impact on latency. 
>>>>
>>>> I do appriciate your recent changes to reduce the percpu lock hold 
>>>> time.
>>>> These tracepoints allow me to measure and differentiate the percpu lock
>>>> hold time vs. the flush time.
>>>>
>>>> In production (using [1]) I'm seeing "Long lock-hold time" [L100] e.g.
>>>> upto 29 ms, which is time spend after obtaining the lock (runtime under
>>>> lock).  I was expecting to see "High Lock-contention wait" [L82] which
>>>> is the time waiting for obtaining the lock.
>>>>
>>>> This is why I'm adding these tracepoints, as they allow me to digg
>>>> deeper, to understand where this high runtime variations originate 
>>>> from.
>>>>
>>>>
>>>> Data:
>>>>
>>>>  16:52:09 Long lock-hold time: 14950 usec (14 ms) on CPU:34 
>>>> comm:kswapd4
>>>>  16:52:09 Long lock-hold time: 14821 usec (14 ms) on CPU:34 
>>>> comm:kswapd4
>>>>  16:52:09 Long lock-hold time: 11299 usec (11 ms) on CPU:98 
>>>> comm:kswapd4
>>>>  16:52:09 Long lock-hold time: 17237 usec (17 ms) on CPU:113 
>>>> comm:kswapd6
>>>>  16:52:09 Long lock-hold time: 29000 usec (29 ms) on CPU:36 
>>>> comm:kworker/u261:12
>>> That lock hold time is much higher than I would have expected.
>>>>  16:52:09 time elapsed: 80 sec (interval = 1 sec)
>>>>   Flushes(5033) 294/interval (avg 62/sec)
>>>>   Locks(53374) 1748/interval (avg 667/sec)
>>>>   Yields(48341) 1454/interval (avg 604/sec)
>>>>   Contended(48104) 1450/interval (avg 601/sec)
>>>>
>>>>
>>>>> So do we really need such an elaborate scheme to monitor this? BTW, 
>>>>> the additional code will also add to the worst case latency.
>>>>
>>>> Hmm, I designed this code to have minimal impact, as tracepoints are
>>>> no-ops until activated.  I really doubt this code will change the 
>>>> latency.
>>>>
>>>>
>>>> [1] 
>>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt
>>>>
>>>> [L100] 
>>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L100
>>>>
>>>> [L82] 
>>>> https://github.com/xdp-project/xdp-project/blob/master/areas/latency/cgroup_rstat_tracepoint.bt#L82
>>>>
>>>>>>
>>>>>> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
>>>>
>>>> More data, the histogram of time spend under the lock have some strange
>>>> variation issues with a group in 4ms to 65ms area. Investigating what
>>>> can be causeing this... which next step depend in these tracepoints.
>>>>
>>>> @lock_cnt: 759146
>>>>
>>>> @locked_ns:
>>>> [1K, 2K)             499 |      |
>>>> [2K, 4K)          206928 
>>>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>>> [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
>>>> [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
>>>> [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
>>>> [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
>>>> [64K, 128K)        38359 |@@@@@@@@@      |
>>>> [128K, 256K)       46597 |@@@@@@@@@@@      |
>>>> [256K, 512K)       32466 |@@@@@@@@      |
>>>> [512K, 1M)          3945 |      |
>>>> [1M, 2M)             642 |      |
>>>> [2M, 4M)             750 |      |
>>>> [4M, 8M)            1932 |      |
>>>> [8M, 16M)           2114 |      |
>>>> [16M, 32M)          1039 |      |
>>>> [32M, 64M)           108 |      |
>>>>
>>>>
>>>>
>>>>
>>>>>> ---
>>>>>>   include/trace/events/cgroup.h |   56 
>>>>>> +++++++++++++++++++++++++++++----
>>>>>>   kernel/cgroup/rstat.c         |   70 
>>>>>> ++++++++++++++++++++++++++++++++++-------
>>>>>>   2 files changed, 108 insertions(+), 18 deletions(-)
>>>>>>
>>>>>> diff --git a/include/trace/events/cgroup.h 
>>>>>> b/include/trace/events/cgroup.h
>>>>>> index 13f375800135..0b95865a90f3 100644
>>>>>> --- a/include/trace/events/cgroup.h
>> [...]
>>>>>> +++ b/include/trace/events/cgroup.h >>>> 
>>>>>> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
>>>>>> +
>>>>>> +    TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>>>>>> +
>>>>>> +    TP_ARGS(cgrp, cpu, contended)
>>>>>> +);
>>>>>> +
>>>>>>   #endif /* _TRACE_CGROUP_H */
>>>>>>   /* This part must be outside protection */
>>>>>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>>>>>> index 52e3b0ed1cee..fb8b49437573 100644
>>>>>> --- a/kernel/cgroup/rstat.c
>>>>>> +++ b/kernel/cgroup/rstat.c
>>>>>> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu 
>>>>>> *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>>>>>>       return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>>>>>>   }
>>>>>> +/*
>>>>>> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>>>>>> + *
>>>>>> + * This makes it easier to diagnose locking issues and contention in
>>>>>> + * production environments. The parameter @fast_path determine the
>>>>>> + * tracepoints being added, allowing us to diagnose "flush" related
>>>>>> + * operations without handling high-frequency fast-path "update" 
>>>>>> events.
>>>>>> + */
>>>>>> +static __always_inline
>>>>>> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, 
>>>>>> int cpu,
>>>>>> +                     struct cgroup *cgrp, const bool fast_path)
>>>>>> +{
>>>>>> +    unsigned long flags;
>>>>>> +    bool contended;
>>>>>> +
>>>>>> +    /*
>>>>>> +     * The _irqsave() is needed because cgroup_rstat_lock is
>>>>>> +     * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>>>>>> +     * this lock with the _irq() suffix only disables interrupts on
>>>>>> +     * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>>>>>> +     * interrupts on both configurations. The _irqsave() ensures
>>>>>> +     * that interrupts are always disabled and later restored.
>>>>>> +     */
>>>>>> +    contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>>>>> +    if (contended) {
>>>>>> +        if (fast_path)
>>>>>> + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, 
>>>>>> contended);
>>>>>> +        else
>>>>>> +            trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, 
>>>>>> contended);
>>>>>> +
>>>>>> +        raw_spin_lock_irqsave(cpu_lock, flags);
>>>
>>> Could you do a local_irq_save() before calling trace_cgroup*() and 
>>> raw_spin_lock()? Would that help in eliminating this high lock hold 
>>> time?
>>>
>>
>> Nope it will not eliminating high lock *hold* time, because the hold
>> start timestamp is first taken *AFTER* obtaining the lock.
>>
>> It could help the contended "wait-time" measurement, but my prod
>> measurements show this isn't an issues.
> 
> Right.
> 
> 
>>
>>> You can also do a local_irq_save() first before the trylock. That 
>>> will eliminate the duplicated irq_restore() and irq_save() when there 
>>> is contention.
>>
>> I wrote the code like this on purpose ;-)
>> My issue with this code/lock is it cause latency issues for softirq 
>> NET_RX. So, when I detect a "contended" lock event, I do want a 
>> irq_restore() as that will allow networking/do_softirq() to run before 
>> I start waiting for the lock (with IRQ disabled).
>>
> Assuming the time taken by the tracing code is negligible, we are 
> talking about disabling IRQ almost immediate after enabling it. The 
> trylock time should be relatively short so the additional delay due to 
> irq disabled for the whole period is insignificant.
>>
>>> If not, there may be NMIs mixed in.
>>>
>>
>> NMIs are definitely on my list of things to investigate.
>> These AMD CPUs also have other types of interrupts that needs a close 
>> look.
>>
>> The easier explaination is that the lock isn't "yielded" on every cycle
>> through the for each CPU loop.
>>
>> Lets look at the data I provided above:
>>
>> >>   Flushes(5033) 294/interval (avg 62/sec)
>> >>   Locks(53374) 1748/interval (avg 667/sec)
>> >>   Yields(48341) 1454/interval (avg 604/sec)
>> >>   Contended(48104) 1450/interval (avg 601/sec)
>>
>> In this 1 second sample, we have 294 flushes, and more yields 1454,
>> great but the factor is not 128 (num-of-CPUs) but closer to 5. Thus, on
>> average we hold the lock for (128/5) 25.6 CPUs-walks.
>>
>> We have spoken about releasing the lock on for_each CPU before... it
>> will likely solve this long hold time, but IMHO a mutex is still the
>> better solution.
> 
> I may have mistakenly thinking the lock hold time refers to just the 
> cpu_lock. Your reported times here are about the cgroup_rstat_lock. 
> Right? If so, the numbers make sense to me.
> 

True, my reported number here are about the cgroup_rstat_lock.
Glad to hear, we are more aligned then :-)

Given I just got some prod machines online with this patch
cgroup_rstat_cpu_lock tracepoints, I can give you some early results,
about hold-time for the cgroup_rstat_cpu_lock.

 From this oneliner bpftrace commands:

   sudo bpftrace -e '
          tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
            @start[tid]=nsecs; @cnt[probe]=count()}
          tracepoint:cgroup:cgroup_rstat_cpu_locked {
            $now=nsecs;
            if (args->contended) {
              @wait_per_cpu_ns=hist($now-@start[tid]); delete(@start[tid]);}
            @cnt[probe]=count(); @locked[tid]=$now}
          tracepoint:cgroup:cgroup_rstat_cpu_unlock {
            $now=nsecs;
            @locked_per_cpu_ns=hist($now-@locked[tid]); 
delete(@locked[tid]);
            @cnt[probe]=count()}
          interval:s:1 {time("%H:%M:%S "); print(@wait_per_cpu_ns);
            print(@locked_per_cpu_ns); print(@cnt); clear(@cnt);}'

Results from one 1 sec period:

13:39:55 @wait_per_cpu_ns:
[512, 1K)              3 | 
      |
[1K, 2K)              12 |@ 
      |
[2K, 4K)             390 
|@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K)              70 |@@@@@@@@@ 
      |
[8K, 16K)             24 |@@@ 
      |
[16K, 32K)           183 |@@@@@@@@@@@@@@@@@@@@@@@@ 
      |
[32K, 64K)            11 |@ 
      |

@locked_per_cpu_ns:
[256, 512)         75592 |@ 
      |
[512, 1K)        2537357 
|@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1K, 2K)          528615 |@@@@@@@@@@ 
      |
[2K, 4K)          168519 |@@@ 
      |
[4K, 8K)          162039 |@@@ 
      |
[8K, 16K)         100730 |@@ 
      |
[16K, 32K)         42276 | 
      |
[32K, 64K)          1423 | 
      |
[64K, 128K)           89 | 
      |

  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_lock_contended]: 3 /sec
  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_unlock]: 3200  /sec
  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_locked]: 3200  /sec


So, we see "flush-code-path" per-CPU-holding @locked_per_cpu_ns isn't
exceeding 128 usec.

My latency requirements, to avoid RX-queue overflow, with 1024 slots,
running at 25 Gbit/s, is 27.6 usec with small packets, and 500 usec
(0.5ms) with MTU size packets.  This is very close to my latency
requirements.

--Jesper
Waiman Long May 3, 2024, 2:30 p.m. UTC | #8
On 5/3/24 10:00, Jesper Dangaard Brouer wrote:
>> I may have mistakenly thinking the lock hold time refers to just the 
>> cpu_lock. Your reported times here are about the cgroup_rstat_lock. 
>> Right? If so, the numbers make sense to me.
>>
>
> True, my reported number here are about the cgroup_rstat_lock.
> Glad to hear, we are more aligned then 
Shakeel Butt May 3, 2024, 6:11 p.m. UTC | #9
On Fri, May 03, 2024 at 02:58:56PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 02/05/2024 21.44, Shakeel Butt wrote:
> > On Wed, May 01, 2024 at 07:22:26PM +0200, Jesper Dangaard Brouer wrote:
> > > 
> > [...]
> > > 
> > > More data, the histogram of time spend under the lock have some strange
> > > variation issues with a group in 4ms to 65ms area. Investigating what
> > > can be causeing this... which next step depend in these tracepoints.
> > > 
> > > @lock_cnt: 759146
> > > 
> > > @locked_ns:
> > > [1K, 2K)             499 |      |
> > > [2K, 4K)          206928
> > > |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> > > [4K, 8K)          147904 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> > > [8K, 16K)          64453 |@@@@@@@@@@@@@@@@      |
> > > [16K, 32K)        135467 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      |
> > > [32K, 64K)         75943 |@@@@@@@@@@@@@@@@@@@      |
> > > [64K, 128K)        38359 |@@@@@@@@@      |
> > > [128K, 256K)       46597 |@@@@@@@@@@@      |
> > > [256K, 512K)       32466 |@@@@@@@@      |
> > > [512K, 1M)          3945 |      |
> > > [1M, 2M)             642 |      |
> > > [2M, 4M)             750 |      |
> > > [4M, 8M)            1932 |      |
> > > [8M, 16M)           2114 |      |
> > > [16M, 32M)          1039 |      |
> > > [32M, 64M)           108 |      |
> > > 
> > 
> > Am I understanding correctly that 1K is 1 microsecond and 1M is 1
> > millisecond?
> 
> Correct.
> 
> > Is it possible to further divide this table into update
> > side and flush side?
> > 
> 
> This is *only* flush side.
> 
> You question indicate, that we are talking past each-other ;-)
> 
> Measurements above is with (recently) accepted tracepoints (e.g. not the
> proposed tracepoints in this patch).  I'm arguing with existing
> tracepoint that I'm seeing this data, and arguing I need per-CPU
> tracepoints to dig deeper into this (as proposed in this patch).

Ah my mistake, I just assumed that the data shown is with the given
patchset.

> 
> The "update side" can only be measured once we apply this patch.
> 
> This morning I got 6 prod machines booted with new kernels, that contain
> this proposed per-CPU lock tracepoint patch.  And 3 of these machines have
> the Mutex lock change also.  No data to share yet...
> 

Eagerly waiting for the results. Also I don't have any concerns with
these new traces.

> --Jesper
Shakeel Butt May 3, 2024, 7:18 p.m. UTC | #10
On Fri, May 03, 2024 at 04:00:20PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
[...]
> > 
> > I may have mistakenly thinking the lock hold time refers to just the
> > cpu_lock. Your reported times here are about the cgroup_rstat_lock.
> > Right? If so, the numbers make sense to me.
> > 
> 
> True, my reported number here are about the cgroup_rstat_lock.
> Glad to hear, we are more aligned then :-)
> 
> Given I just got some prod machines online with this patch
> cgroup_rstat_cpu_lock tracepoints, I can give you some early results,
> about hold-time for the cgroup_rstat_cpu_lock.

Oh you have already shared the preliminary data.

> 
> From this oneliner bpftrace commands:
> 
>   sudo bpftrace -e '
>          tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>            @start[tid]=nsecs; @cnt[probe]=count()}
>          tracepoint:cgroup:cgroup_rstat_cpu_locked {
>            $now=nsecs;
>            if (args->contended) {
>              @wait_per_cpu_ns=hist($now-@start[tid]); delete(@start[tid]);}
>            @cnt[probe]=count(); @locked[tid]=$now}
>          tracepoint:cgroup:cgroup_rstat_cpu_unlock {
>            $now=nsecs;
>            @locked_per_cpu_ns=hist($now-@locked[tid]); delete(@locked[tid]);
>            @cnt[probe]=count()}
>          interval:s:1 {time("%H:%M:%S "); print(@wait_per_cpu_ns);
>            print(@locked_per_cpu_ns); print(@cnt); clear(@cnt);}'
> 
> Results from one 1 sec period:
> 
> 13:39:55 @wait_per_cpu_ns:
> [512, 1K)              3 |      |
> [1K, 2K)              12 |@      |
> [2K, 4K)             390
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [4K, 8K)              70 |@@@@@@@@@      |
> [8K, 16K)             24 |@@@      |
> [16K, 32K)           183 |@@@@@@@@@@@@@@@@@@@@@@@@      |
> [32K, 64K)            11 |@      |
> 
> @locked_per_cpu_ns:
> [256, 512)         75592 |@      |
> [512, 1K)        2537357
> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [1K, 2K)          528615 |@@@@@@@@@@      |
> [2K, 4K)          168519 |@@@      |
> [4K, 8K)          162039 |@@@      |
> [8K, 16K)         100730 |@@      |
> [16K, 32K)         42276 |      |
> [32K, 64K)          1423 |      |
> [64K, 128K)           89 |      |
> 
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_lock_contended]: 3 /sec
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_unlock]: 3200  /sec
>  @cnt[tracepoint:cgroup:cgroup_rstat_cpu_locked]: 3200  /sec
> 
> 
> So, we see "flush-code-path" per-CPU-holding @locked_per_cpu_ns isn't
> exceeding 128 usec.

Hmm 128 usec is actually unexpectedly high. How does the cgroup
hierarchy on your system looks like? How many cgroups have actual
workloads running? Can the network softirqs run on any cpus or smaller
set of cpus? I am assuming these softirqs are processing packets from
any or all cgroups and thus have larger cgroup update tree. I wonder if
you comment out MEMCG_SOCK stat update and still see the same holding
time.

> 
> My latency requirements, to avoid RX-queue overflow, with 1024 slots,
> running at 25 Gbit/s, is 27.6 usec with small packets, and 500 usec
> (0.5ms) with MTU size packets.  This is very close to my latency
> requirements.
> 
> --Jesper
>
Jesper Dangaard Brouer May 6, 2024, 12:03 p.m. UTC | #11
On 03/05/2024 21.18, Shakeel Butt wrote:
> On Fri, May 03, 2024 at 04:00:20PM +0200, Jesper Dangaard Brouer wrote:
>>
>>
> [...]
>>>
>>> I may have mistakenly thinking the lock hold time refers to just the
>>> cpu_lock. Your reported times here are about the cgroup_rstat_lock.
>>> Right? If so, the numbers make sense to me.
>>>
>>
>> True, my reported number here are about the cgroup_rstat_lock.
>> Glad to hear, we are more aligned then :-)
>>
>> Given I just got some prod machines online with this patch
>> cgroup_rstat_cpu_lock tracepoints, I can give you some early results,
>> about hold-time for the cgroup_rstat_cpu_lock.
> 
> Oh you have already shared the preliminary data.
> 
>>
>>  From this oneliner bpftrace commands:
>>
>>    sudo bpftrace -e '
>>           tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>>             @start[tid]=nsecs; @cnt[probe]=count()}
>>           tracepoint:cgroup:cgroup_rstat_cpu_locked {
>>             $now=nsecs;
>>             if (args->contended) {
>>               @wait_per_cpu_ns=hist($now-@start[tid]); delete(@start[tid]);}
>>             @cnt[probe]=count(); @locked[tid]=$now}
>>           tracepoint:cgroup:cgroup_rstat_cpu_unlock {
>>             $now=nsecs;
>>             @locked_per_cpu_ns=hist($now-@locked[tid]); delete(@locked[tid]);
>>             @cnt[probe]=count()}
>>           interval:s:1 {time("%H:%M:%S "); print(@wait_per_cpu_ns);
>>             print(@locked_per_cpu_ns); print(@cnt); clear(@cnt);}'
>>
>> Results from one 1 sec period:
>>
>> 13:39:55 @wait_per_cpu_ns:
>> [512, 1K)              3 |      |
>> [1K, 2K)              12 |@      |
>> [2K, 4K)             390
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [4K, 8K)              70 |@@@@@@@@@      |
>> [8K, 16K)             24 |@@@      |
>> [16K, 32K)           183 |@@@@@@@@@@@@@@@@@@@@@@@@      |
>> [32K, 64K)            11 |@      |
>>
>> @locked_per_cpu_ns:
>> [256, 512)         75592 |@      |
>> [512, 1K)        2537357
>> |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>> [1K, 2K)          528615 |@@@@@@@@@@      |
>> [2K, 4K)          168519 |@@@      |
>> [4K, 8K)          162039 |@@@      |
>> [8K, 16K)         100730 |@@      |
>> [16K, 32K)         42276 |      |
>> [32K, 64K)          1423 |      |
>> [64K, 128K)           89 |      |
>>
>>   @cnt[tracepoint:cgroup:cgroup_rstat_cpu_lock_contended]: 3 /sec
>>   @cnt[tracepoint:cgroup:cgroup_rstat_cpu_unlock]: 3200  /sec
>>   @cnt[tracepoint:cgroup:cgroup_rstat_cpu_locked]: 3200  /sec
>>
>>
>> So, we see "flush-code-path" per-CPU-holding @locked_per_cpu_ns isn't
>> exceeding 128 usec.
> 
> Hmm 128 usec is actually unexpectedly high. 

> How does the cgroup hierarchy on your system looks like? 
I didn't design this, so hopefully my co-workers can help me out here? 
(To @Daniel or @Jon)

My low level view is that, there are 17 top-level directories in 
/sys/fs/cgroup/.
There are 649 cgroups (counting occurrence of memory.stat).
There are two directories that contain the major part.
  - /sys/fs/cgroup/system.slice = 379
  - /sys/fs/cgroup/production.slice = 233
  - (production.slice have directory two levels)
  - remaining 37

We are open to changing this if you have any advice?
(@Daniel and @Jon are actually working on restructuring this)

> How many cgroups have actual workloads running?
Do you have a command line trick to determine this?


> Can the network softirqs run on any cpus or smaller
> set of cpus? I am assuming these softirqs are processing packets from
> any or all cgroups and thus have larger cgroup update tree. 

Softirq and specifically NET_RX is running half of the cores (e.g. 64).
(I'm looking at restructuring this allocation)

> I wonder if
> you comment out MEMCG_SOCK stat update and still see the same holding
> time.
>

It doesn't look like MEMCG_SOCK is used.

I deduct you are asking:
  - What is the update count for different types of mod_memcg_state() calls?

// Dumped via BTF info
enum memcg_stat_item {
         MEMCG_SWAP = 43,
         MEMCG_SOCK = 44,
         MEMCG_PERCPU_B = 45,
         MEMCG_VMALLOC = 46,
         MEMCG_KMEM = 47,
         MEMCG_ZSWAP_B = 48,
         MEMCG_ZSWAPPED = 49,
         MEMCG_NR_STAT = 50,
};

sudo bpftrace -e 'kfunc:vmlinux:__mod_memcg_state{@[args->idx]=count()} 
END{printf("\nEND time elapsed: %d sec\n", elapsed / 1000000000);}'
Attaching 2 probes...
^C
END time elapsed: 99 sec

@[45]: 17996
@[46]: 18603
@[43]: 61858
@[47]: 21398919

It seems clear that MEMCG_KMEM = 47 is the main "user".
  - 21398919/99 = 216150 calls per sec

Could someone explain to me what this MEMCG_KMEM is used for?


>>
>> My latency requirements, to avoid RX-queue overflow, with 1024 slots,
>> running at 25 Gbit/s, is 27.6 usec with small packets, and 500 usec
>> (0.5ms) with MTU size packets.  This is very close to my latency
>> requirements.
>>
>> --Jesper
>>
Shakeel Butt May 6, 2024, 4:22 p.m. UTC | #12
On Mon, May 06, 2024 at 02:03:47PM +0200, Jesper Dangaard Brouer wrote:
> 
> 
> On 03/05/2024 21.18, Shakeel Butt wrote:
[...]
> > 
> > Hmm 128 usec is actually unexpectedly high.
> 
> > How does the cgroup hierarchy on your system looks like?
> I didn't design this, so hopefully my co-workers can help me out here? (To
> @Daniel or @Jon)
> 
> My low level view is that, there are 17 top-level directories in
> /sys/fs/cgroup/.
> There are 649 cgroups (counting occurrence of memory.stat).
> There are two directories that contain the major part.
>  - /sys/fs/cgroup/system.slice = 379
>  - /sys/fs/cgroup/production.slice = 233
>  - (production.slice have directory two levels)
>  - remaining 37
> 
> We are open to changing this if you have any advice?
> (@Daniel and @Jon are actually working on restructuring this)
> 
> > How many cgroups have actual workloads running?
> Do you have a command line trick to determine this?
> 

The rstat infra maintains a per-cpu cgroup update tree to only flush
stats of cgroups which have seen updates. So, even if you have large
number of cgroups but the workload is active in small number of cgroups,
the update tree should be much smaller. That is the reason I asked these
questions. I don't have any advise yet. At the I am trying to understand
the usage and then hopefully work on optimizing those.

> 
> > Can the network softirqs run on any cpus or smaller
> > set of cpus? I am assuming these softirqs are processing packets from
> > any or all cgroups and thus have larger cgroup update tree.
> 
> Softirq and specifically NET_RX is running half of the cores (e.g. 64).
> (I'm looking at restructuring this allocation)
> 
> > I wonder if
> > you comment out MEMCG_SOCK stat update and still see the same holding
> > time.
> > 
> 
> It doesn't look like MEMCG_SOCK is used.
> 
> I deduct you are asking:
>  - What is the update count for different types of mod_memcg_state() calls?
> 
> // Dumped via BTF info
> enum memcg_stat_item {
>         MEMCG_SWAP = 43,
>         MEMCG_SOCK = 44,
>         MEMCG_PERCPU_B = 45,
>         MEMCG_VMALLOC = 46,
>         MEMCG_KMEM = 47,
>         MEMCG_ZSWAP_B = 48,
>         MEMCG_ZSWAPPED = 49,
>         MEMCG_NR_STAT = 50,
> };
> 
> sudo bpftrace -e 'kfunc:vmlinux:__mod_memcg_state{@[args->idx]=count()}
> END{printf("\nEND time elapsed: %d sec\n", elapsed / 1000000000);}'
> Attaching 2 probes...
> ^C
> END time elapsed: 99 sec
> 
> @[45]: 17996
> @[46]: 18603
> @[43]: 61858
> @[47]: 21398919
> 
> It seems clear that MEMCG_KMEM = 47 is the main "user".
>  - 21398919/99 = 216150 calls per sec
> 
> Could someone explain to me what this MEMCG_KMEM is used for?
> 

MEMCG_KMEM is the kernel memory charged to a cgroup. It also contains
the untyped kernel memory which is not included in kernel_stack,
pagetables, percpu, vmalloc, slab e.t.c.

The reason I asked about MEMCG_SOCK was that it might be causing larger
update trees (more cgroups) on CPUs processing the NET_RX.

Anyways did the mutex change helped your production workload regarding
latencies?
Ivan Babrou May 6, 2024, 4:28 p.m. UTC | #13
On Mon, May 6, 2024 at 9:22 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Mon, May 06, 2024 at 02:03:47PM +0200, Jesper Dangaard Brouer wrote:
> >
> >
> > On 03/05/2024 21.18, Shakeel Butt wrote:
> [...]
> > >
> > > Hmm 128 usec is actually unexpectedly high.
> >
> > > How does the cgroup hierarchy on your system looks like?
> > I didn't design this, so hopefully my co-workers can help me out here? (To
> > @Daniel or @Jon)
> >
> > My low level view is that, there are 17 top-level directories in
> > /sys/fs/cgroup/.
> > There are 649 cgroups (counting occurrence of memory.stat).
> > There are two directories that contain the major part.
> >  - /sys/fs/cgroup/system.slice = 379
> >  - /sys/fs/cgroup/production.slice = 233
> >  - (production.slice have directory two levels)
> >  - remaining 37
> >
> > We are open to changing this if you have any advice?
> > (@Daniel and @Jon are actually working on restructuring this)
> >
> > > How many cgroups have actual workloads running?
> > Do you have a command line trick to determine this?
> >
>
> The rstat infra maintains a per-cpu cgroup update tree to only flush
> stats of cgroups which have seen updates. So, even if you have large
> number of cgroups but the workload is active in small number of cgroups,
> the update tree should be much smaller. That is the reason I asked these
> questions. I don't have any advise yet. At the I am trying to understand
> the usage and then hopefully work on optimizing those.
>
> >
> > > Can the network softirqs run on any cpus or smaller
> > > set of cpus? I am assuming these softirqs are processing packets from
> > > any or all cgroups and thus have larger cgroup update tree.
> >
> > Softirq and specifically NET_RX is running half of the cores (e.g. 64).
> > (I'm looking at restructuring this allocation)
> >
> > > I wonder if
> > > you comment out MEMCG_SOCK stat update and still see the same holding
> > > time.
> > >
> >
> > It doesn't look like MEMCG_SOCK is used.
> >
> > I deduct you are asking:
> >  - What is the update count for different types of mod_memcg_state() calls?
> >
> > // Dumped via BTF info
> > enum memcg_stat_item {
> >         MEMCG_SWAP = 43,
> >         MEMCG_SOCK = 44,
> >         MEMCG_PERCPU_B = 45,
> >         MEMCG_VMALLOC = 46,
> >         MEMCG_KMEM = 47,
> >         MEMCG_ZSWAP_B = 48,
> >         MEMCG_ZSWAPPED = 49,
> >         MEMCG_NR_STAT = 50,
> > };
> >
> > sudo bpftrace -e 'kfunc:vmlinux:__mod_memcg_state{@[args->idx]=count()}
> > END{printf("\nEND time elapsed: %d sec\n", elapsed / 1000000000);}'
> > Attaching 2 probes...
> > ^C
> > END time elapsed: 99 sec
> >
> > @[45]: 17996
> > @[46]: 18603
> > @[43]: 61858
> > @[47]: 21398919
> >
> > It seems clear that MEMCG_KMEM = 47 is the main "user".
> >  - 21398919/99 = 216150 calls per sec
> >
> > Could someone explain to me what this MEMCG_KMEM is used for?
> >
>
> MEMCG_KMEM is the kernel memory charged to a cgroup. It also contains
> the untyped kernel memory which is not included in kernel_stack,
> pagetables, percpu, vmalloc, slab e.t.c.
>
> The reason I asked about MEMCG_SOCK was that it might be causing larger
> update trees (more cgroups) on CPUs processing the NET_RX.

We pass cgroup.memory=nosocket in the kernel cmdline:

* https://lore.kernel.org/lkml/CABWYdi0G7cyNFbndM-ELTDAR3x4Ngm0AehEp5aP0tfNkXUE+Uw@mail.gmail.com/

> Anyways did the mutex change helped your production workload regarding
> latencies?
Shakeel Butt May 6, 2024, 7:45 p.m. UTC | #14
On Mon, May 06, 2024 at 09:28:41AM -0700, Ivan Babrou wrote:
> On Mon, May 6, 2024 at 9:22 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
[...]
> >
> > The reason I asked about MEMCG_SOCK was that it might be causing larger
> > update trees (more cgroups) on CPUs processing the NET_RX.
> 
> We pass cgroup.memory=nosocket in the kernel cmdline:
> 
> * https://lore.kernel.org/lkml/CABWYdi0G7cyNFbndM-ELTDAR3x4Ngm0AehEp5aP0tfNkXUE+Uw@mail.gmail.com/
> 

Ah another thing we should fix for you folks. Is it possible to repro
the issue on the latest linus tree and report back? I will be busy for
next 2 weeks but will get back to this after that.
Jesper Dangaard Brouer May 6, 2024, 7:54 p.m. UTC | #15
On 06/05/2024 18.22, Shakeel Butt wrote:
> Anyways did the mutex change helped your production workload regarding
> latencies?

Yes, see attached NET_RX softirq latency improvement with mutex patch[1].
Showing two grafana heatmaps over softirq wait (no-able-to run) time.
It is very clear that most of the latency spikes are gone with mutex 
patch applied.

--Jesper

[1] 
https://lore.kernel.org/all/171328989335.3930751.3091577850420501533.stgit@firesoul/
Jesper Dangaard Brouer May 14, 2024, 5:18 a.m. UTC | #16
Hi Tejun,

Could we please apply this for-6.10, to avoid splitting tracepoint 
changes over multiple kernel versions?

--Jesper
P.s. Cloudflare is already running with this applied in production.


On 01/05/2024 16.04, Jesper Dangaard Brouer wrote:
> This closely resembles helpers added for the global cgroup_rstat_lock in
> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
> 
> Based on production workloads, we observe the fast-path "update" function
> cgroup_rstat_updated() is invoked around 3 million times per sec, while the
> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
> can see periodic spikes of 700 invocations/sec.
> 
> For this reason, the tracepoints are split into normal and fastpath
> versions for this per-CPU lock. Making it feasible for production to
> continuously monitor the non-fastpath tracepoint to detect lock contention
> issues. The reason for monitoring is that lock disables IRQs which can
> disturb e.g. softirq processing on the local CPUs involved. When the
> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
> this per CPU lock becomes the next bottleneck that can introduce latency
> variations.
> 
> A practical bpftrace script for monitoring contention latency:
> 
>   bpftrace -e '
>     tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>       @start[tid]=nsecs; @cnt[probe]=count()}
>     tracepoint:cgroup:cgroup_rstat_cpu_locked {
>       if (args->contended) {
>         @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>       @cnt[probe]=count()}
>     interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'
> 
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> ---
>   include/trace/events/cgroup.h |   56 +++++++++++++++++++++++++++++----
>   kernel/cgroup/rstat.c         |   70 ++++++++++++++++++++++++++++++++++-------
>   2 files changed, 108 insertions(+), 18 deletions(-)
> 
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index 13f375800135..0b95865a90f3 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>   
>   DECLARE_EVENT_CLASS(cgroup_rstat,
>   
> -	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
>   
> -	TP_ARGS(cgrp, cpu_in_loop, contended),
> +	TP_ARGS(cgrp, cpu, contended),
>   
>   	TP_STRUCT__entry(
>   		__field(	int,		root			)
>   		__field(	int,		level			)
>   		__field(	u64,		id			)
> -		__field(	int,		cpu_in_loop		)
> +		__field(	int,		cpu			)
>   		__field(	bool,		contended		)
>   	),
>   
> @@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>   		__entry->root = cgrp->root->hierarchy_id;
>   		__entry->id = cgroup_id(cgrp);
>   		__entry->level = cgrp->level;
> -		__entry->cpu_in_loop = cpu_in_loop;
> +		__entry->cpu = cpu;
>   		__entry->contended = contended;
>   	),
>   
> -	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
> +	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
>   		  __entry->root, __entry->id, __entry->level,
> -		  __entry->cpu_in_loop, __entry->contended)
> +		  __entry->cpu, __entry->contended)
>   );
>   
> +/* Related to global: cgroup_rstat_lock */
>   DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>   
>   	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> @@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>   	TP_ARGS(cgrp, cpu, contended)
>   );
>   
> +/* Related to per CPU: cgroup_rstat_cpu_lock */
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
> +DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
> +
> +	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +
> +	TP_ARGS(cgrp, cpu, contended)
> +);
> +
>   #endif /* _TRACE_CGROUP_H */
>   
>   /* This part must be outside protection */
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 52e3b0ed1cee..fb8b49437573 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
>   	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
>   }
>   
> +/*
> + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + *
> + * This makes it easier to diagnose locking issues and contention in
> + * production environments. The parameter @fast_path determine the
> + * tracepoints being added, allowing us to diagnose "flush" related
> + * operations without handling high-frequency fast-path "update" events.
> + */
> +static __always_inline
> +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
> +				     struct cgroup *cgrp, const bool fast_path)
> +{
> +	unsigned long flags;
> +	bool contended;
> +
> +	/*
> +	 * The _irqsave() is needed because cgroup_rstat_lock is
> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> +	 * this lock with the _irq() suffix only disables interrupts on
> +	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> +	 * interrupts on both configurations. The _irqsave() ensures
> +	 * that interrupts are always disabled and later restored.
> +	 */
> +	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
> +	if (contended) {
> +		if (fast_path)
> +			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
> +		else
> +			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
> +
> +		raw_spin_lock_irqsave(cpu_lock, flags);
> +	}
> +
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
> +	else
> +		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
> +
> +	return flags;
> +}
> +
> +static __always_inline
> +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
> +			      struct cgroup *cgrp, unsigned long flags,
> +			      const bool fast_path)
> +{
> +	if (fast_path)
> +		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
> +	else
> +		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
> +
> +	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +}
> +
>   /**
>    * cgroup_rstat_updated - keep track of updated rstat_cpu
>    * @cgrp: target cgroup
> @@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
>   		return;
>   
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>   
>   	/* put @cgrp and all ancestors on the corresponding updated lists */
>   	while (true) {
> @@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
>   		cgrp = parent;
>   	}
>   
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
>   }
>   
>   /**
> @@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	struct cgroup *head = NULL, *parent, *child;
>   	unsigned long flags;
>   
> -	/*
> -	 * The _irqsave() is needed because cgroup_rstat_lock is
> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> -	 * this lock with the _irq() suffix only disables interrupts on
> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> -	 * interrupts on both configurations. The _irqsave() ensures
> -	 * that interrupts are always disabled and later restored.
> -	 */
> -	raw_spin_lock_irqsave(cpu_lock, flags);
> +	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
>   
>   	/* Return NULL if this subtree is not on-list */
>   	if (!rstatc->updated_next)
> @@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
>   	if (child != root)
>   		head = cgroup_rstat_push_children(head, child, cpu);
>   unlock_ret:
> -	raw_spin_unlock_irqrestore(cpu_lock, flags);
> +	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
>   	return head;
>   }
>   
> 
>
Tejun Heo May 14, 2024, 5:55 a.m. UTC | #17
On Tue, May 14, 2024 at 07:18:18AM +0200, Jesper Dangaard Brouer wrote:
> Hi Tejun,
> 
> Could we please apply this for-6.10, to avoid splitting tracepoint changes
> over multiple kernel versions?

Yeah, I can. Waiman, would that be okay?

Thanks.
Waiman Long May 14, 2024, 4:59 p.m. UTC | #18
On 5/14/24 01:55, Tejun Heo wrote:
> On Tue, May 14, 2024 at 07:18:18AM +0200, Jesper Dangaard Brouer wrote:
>> Hi Tejun,
>>
>> Could we please apply this for-6.10, to avoid splitting tracepoint changes
>> over multiple kernel versions?
> Yeah, I can. Waiman, would that be okay?

I am OK with this commit as it shouldn't impose that much of a 
performance cost if the tracepoints are activated.

Cheers,
Longman
Tejun Heo May 15, 2024, 4:58 p.m. UTC | #19
On Wed, May 01, 2024 at 04:04:11PM +0200, Jesper Dangaard Brouer wrote:
> This closely resembles helpers added for the global cgroup_rstat_lock in
> commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
> tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.
> 
> Based on production workloads, we observe the fast-path "update" function
> cgroup_rstat_updated() is invoked around 3 million times per sec, while the
> "flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
> can see periodic spikes of 700 invocations/sec.
> 
> For this reason, the tracepoints are split into normal and fastpath
> versions for this per-CPU lock. Making it feasible for production to
> continuously monitor the non-fastpath tracepoint to detect lock contention
> issues. The reason for monitoring is that lock disables IRQs which can
> disturb e.g. softirq processing on the local CPUs involved. When the
> global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
> this per CPU lock becomes the next bottleneck that can introduce latency
> variations.
> 
> A practical bpftrace script for monitoring contention latency:
> 
>  bpftrace -e '
>    tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
>      @start[tid]=nsecs; @cnt[probe]=count()}
>    tracepoint:cgroup:cgroup_rstat_cpu_locked {
>      if (args->contended) {
>        @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
>      @cnt[probe]=count()}
>    interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'
> 
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>

Applied to cgroup/for-6.10.

Thanks.
diff mbox series

Patch

diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index 13f375800135..0b95865a90f3 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,15 @@  DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
 
 DECLARE_EVENT_CLASS(cgroup_rstat,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
 
-	TP_ARGS(cgrp, cpu_in_loop, contended),
+	TP_ARGS(cgrp, cpu, contended),
 
 	TP_STRUCT__entry(
 		__field(	int,		root			)
 		__field(	int,		level			)
 		__field(	u64,		id			)
-		__field(	int,		cpu_in_loop		)
+		__field(	int,		cpu			)
 		__field(	bool,		contended		)
 	),
 
@@ -222,15 +222,16 @@  DECLARE_EVENT_CLASS(cgroup_rstat,
 		__entry->root = cgrp->root->hierarchy_id;
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
-		__entry->cpu_in_loop = cpu_in_loop;
+		__entry->cpu = cpu;
 		__entry->contended = contended;
 	),
 
-	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
+	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
 		  __entry->root, __entry->id, __entry->level,
-		  __entry->cpu_in_loop, __entry->contended)
+		  __entry->cpu, __entry->contended)
 );
 
+/* Related to global: cgroup_rstat_lock */
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
 
 	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
@@ -252,6 +253,49 @@  DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 	TP_ARGS(cgrp, cpu, contended)
 );
 
+/* Related to per CPU: cgroup_rstat_cpu_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 52e3b0ed1cee..fb8b49437573 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -19,6 +19,60 @@  static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
 	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
 }
 
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determine the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+				     struct cgroup *cgrp, const bool fast_path)
+{
+	unsigned long flags;
+	bool contended;
+
+	/*
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
+	 */
+	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+	if (contended) {
+		if (fast_path)
+			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+		else
+			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+		raw_spin_lock_irqsave(cpu_lock, flags);
+	}
+
+	if (fast_path)
+		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+	else
+		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+	return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+			      struct cgroup *cgrp, unsigned long flags,
+			      const bool fast_path)
+{
+	if (fast_path)
+		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+	else
+		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
 /**
  * cgroup_rstat_updated - keep track of updated rstat_cpu
  * @cgrp: target cgroup
@@ -44,7 +98,7 @@  __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
 		return;
 
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
 	/* put @cgrp and all ancestors on the corresponding updated lists */
 	while (true) {
@@ -72,7 +126,7 @@  __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 		cgrp = parent;
 	}
 
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
 }
 
 /**
@@ -153,15 +207,7 @@  static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	struct cgroup *head = NULL, *parent, *child;
 	unsigned long flags;
 
-	/*
-	 * The _irqsave() is needed because cgroup_rstat_lock is
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-	 * this lock with the _irq() suffix only disables interrupts on
-	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-	 * interrupts on both configurations. The _irqsave() ensures
-	 * that interrupts are always disabled and later restored.
-	 */
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
@@ -198,7 +244,7 @@  static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	if (child != root)
 		head = cgroup_rstat_push_children(head, child, cpu);
 unlock_ret:
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
 	return head;
 }