
[3/4,v2] cgroup: separate rstat locks for subsystems

Message ID 20250227215543.49928-4-inwardvessel@gmail.com (mailing list archive)
State: New
Series: cgroup: separate rstat trees

Commit Message

JP Kobryn Feb. 27, 2025, 9:55 p.m. UTC
From: JP Kobryn <inwardvessel@gmail.com>

Let the existing locks be dedicated to the base stats and rename them as
such. Also add new rstat locks for each enabled subsystem. When handling
cgroup subsystem states, distinguish between formal subsystems (memory,
io, etc) and the base stats subsystem state (represented by
cgroup::self) to decide on which locks to take. This change is made to
prevent contention between subsystems when updating/flushing stats.

Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
---
 kernel/cgroup/rstat.c | 93 +++++++++++++++++++++++++++++++++----------
 1 file changed, 72 insertions(+), 21 deletions(-)

Comments

Shakeel Butt Feb. 27, 2025, 10:52 p.m. UTC | #1
On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel wrote:
> From: JP Kobryn <inwardvessel@gmail.com>
> 
> Let the existing locks be dedicated to the base stats and rename them as
> such. Also add new rstat locks for each enabled subsystem. When handling
> cgroup subsystem states, distinguish between formal subsystems (memory,
> io, etc) and the base stats subsystem state (represented by
> cgroup::self) to decide on which locks to take. This change is made to
> prevent contention between subsystems when updating/flushing stats.
> 
> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>

Couple of nits below otherwise:

Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>

> ---
>  kernel/cgroup/rstat.c | 93 +++++++++++++++++++++++++++++++++----------
>  1 file changed, 72 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 88908ef9212d..b3eaefc1fd07 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -9,8 +9,12 @@
>  
>  #include <trace/events/cgroup.h>
>  
> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
> -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
> +static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
> +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
> +
> +static spinlock_t cgroup_rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
> +static DEFINE_PER_CPU(raw_spinlock_t,
> +		cgroup_rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);
>  

The names of these locks are too long and you had to go multi-line.
Please just reduce the size of the names. These are local to this file,
so maybe you can drop the cgroup_rstat_ prefix from them or keep just the rstat_ part.
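For example, something along these lines would keep each definition on a
single line (names purely illustrative):

static DEFINE_SPINLOCK(rstat_base_lock);
static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);

static spinlock_t rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
static DEFINE_PER_CPU(raw_spinlock_t, rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);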

>  static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>  
> @@ -20,8 +24,13 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(
>  	return per_cpu_ptr(css->rstat_cpu, cpu);
>  }
>  
> +static inline bool is_base_css(struct cgroup_subsys_state *css)
> +{
> +	return css->ss == NULL;
> +}
> +
>  /*
> - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + * Helper functions for rstat per CPU locks.
>   *
>   * This makes it easier to diagnose locking issues and contention in
>   * production environments. The parameter @fast_path determine the
> @@ -36,12 +45,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
>  	bool contended;
>  
>  	/*
> -	 * The _irqsave() is needed because cgroup_rstat_lock is
> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> -	 * this lock with the _irq() suffix only disables interrupts on
> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> -	 * interrupts on both configurations. The _irqsave() ensures
> -	 * that interrupts are always disabled and later restored.
> +	 * The _irqsave() is needed because the locks used for flushing are
> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
> +	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
> +	 * kernel. The raw_spinlock_t below disables interrupts on both
> +	 * configurations. The _irqsave() ensures that interrupts are always
> +	 * disabled and later restored.
>  	 */
>  	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>  	if (contended) {
> @@ -87,7 +96,7 @@ __bpf_kfunc void cgroup_rstat_updated(
>  		struct cgroup_subsys_state *css, int cpu)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> +	raw_spinlock_t *cpu_lock;
>  	unsigned long flags;
>  
>  	/*
> @@ -101,6 +110,12 @@ __bpf_kfunc void cgroup_rstat_updated(
>  	if (data_race(cgroup_rstat_cpu(css, cpu)->updated_next))
>  		return;
>  
> +	if (is_base_css(css))
> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> +	else
> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> +			css->ss->id;

Use the array index here like cgroup_rstat_subsys_cpu_lock[css->ss->id].
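I.e. something like this, equivalent to the pointer arithmetic above but
easier to read:

	cpu_lock = per_cpu_ptr(&cgroup_rstat_subsys_cpu_lock[css->ss->id], cpu);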

> +
>  	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>  
>  	/* put @css and all ancestors on the corresponding updated lists */
> @@ -208,11 +223,17 @@ static struct cgroup_subsys_state *cgroup_rstat_updated_list(
>  		struct cgroup_subsys_state *root, int cpu)
>  {
>  	struct cgroup *cgrp = root->cgroup;
> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
>  	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
>  	struct cgroup_subsys_state *head = NULL, *parent, *child;
> +	raw_spinlock_t *cpu_lock;
>  	unsigned long flags;
>  
> +	if (is_base_css(root))
> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> +	else
> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> +			root->ss->id;

Same here.

> +
>  	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, false);
>  
>  	/* Return NULL if this subtree is not on-list */
> @@ -315,7 +336,7 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>  	struct cgroup *cgrp = css->cgroup;
>  	int cpu;
>  
> -	lockdep_assert_held(&cgroup_rstat_lock);
> +	lockdep_assert_held(&lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		struct cgroup_subsys_state *pos;
> @@ -356,12 +377,18 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>  __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>  
>  	might_sleep();
>  
> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> +	__cgroup_rstat_lock(lock, cgrp, -1);
> +	cgroup_rstat_flush_locked(css, lock);
> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>  }
>  
>  /**
> @@ -376,10 +403,16 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>  void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>  
>  	might_sleep();
> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> +	__cgroup_rstat_lock(lock, cgrp, -1);
> +	cgroup_rstat_flush_locked(css, lock);
>  }
>  
>  /**
> @@ -389,7 +422,14 @@ void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>  void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
> +
> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>  }
>  
>  int cgroup_rstat_init(struct cgroup_subsys_state *css)
> @@ -435,10 +475,21 @@ void cgroup_rstat_exit(struct cgroup_subsys_state *css)
>  
>  void __init cgroup_rstat_boot(void)
>  {
> -	int cpu;
> +	struct cgroup_subsys *ss;
> +	int cpu, ssid;
>  
> -	for_each_possible_cpu(cpu)
> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> +	for_each_subsys(ss, ssid) {
> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> +	}
> +
> +	for_each_possible_cpu(cpu) {
> +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
> +
> +		for_each_subsys(ss, ssid) {
> +			raw_spin_lock_init(
> +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);

Same here.

> +		}
> +	}
>  }
>  
>  /*
> -- 
> 2.43.5
>
JP Kobryn Feb. 28, 2025, 4:07 p.m. UTC | #2
On 2/27/25 2:52 PM, Shakeel Butt wrote:
> On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel wrote:
>> From: JP Kobryn <inwardvessel@gmail.com>
>>
>> Let the existing locks be dedicated to the base stats and rename them as
>> such. Also add new rstat locks for each enabled subsystem. When handling
>> cgroup subsystem states, distinguish between formal subsystems (memory,
>> io, etc) and the base stats subsystem state (represented by
>> cgroup::self) to decide on which locks to take. This change is made to
>> prevent contention between subsystems when updating/flushing stats.
>>
>> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
> 
> Couple of nits below otherwise:
> 
> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
> 
>> ---
>>   kernel/cgroup/rstat.c | 93 +++++++++++++++++++++++++++++++++----------
>>   1 file changed, 72 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>> index 88908ef9212d..b3eaefc1fd07 100644
>> --- a/kernel/cgroup/rstat.c
>> +++ b/kernel/cgroup/rstat.c
>> @@ -9,8 +9,12 @@
>>   
>>   #include <trace/events/cgroup.h>
>>   
>> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
>> -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
>> +static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
>> +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
>> +
>> +static spinlock_t cgroup_rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
>> +static DEFINE_PER_CPU(raw_spinlock_t,
>> +		cgroup_rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);
>>   
> 
> The names of these locks are too long and you had to go multi-line.
> Please just reduce the size of the names. These are local to this file,
> so maybe you can drop the cgroup_rstat_ prefix from them or keep just the rstat_ part.

Agreed. Will do in next rev.

> 
>>   static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>>   
>> @@ -20,8 +24,13 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(
>>   	return per_cpu_ptr(css->rstat_cpu, cpu);
>>   }
>>   
>> +static inline bool is_base_css(struct cgroup_subsys_state *css)
>> +{
>> +	return css->ss == NULL;
>> +}
>> +
>>   /*
>> - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>> + * Helper functions for rstat per CPU locks.
>>    *
>>    * This makes it easier to diagnose locking issues and contention in
>>    * production environments. The parameter @fast_path determine the
>> @@ -36,12 +45,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
>>   	bool contended;
>>   
>>   	/*
>> -	 * The _irqsave() is needed because cgroup_rstat_lock is
>> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>> -	 * this lock with the _irq() suffix only disables interrupts on
>> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>> -	 * interrupts on both configurations. The _irqsave() ensures
>> -	 * that interrupts are always disabled and later restored.
>> +	 * The _irqsave() is needed because the locks used for flushing are
>> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
>> +	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
>> +	 * kernel. The raw_spinlock_t below disables interrupts on both
>> +	 * configurations. The _irqsave() ensures that interrupts are always
>> +	 * disabled and later restored.
>>   	 */
>>   	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>   	if (contended) {
>> @@ -87,7 +96,7 @@ __bpf_kfunc void cgroup_rstat_updated(
>>   		struct cgroup_subsys_state *css, int cpu)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
>> +	raw_spinlock_t *cpu_lock;
>>   	unsigned long flags;
>>   
>>   	/*
>> @@ -101,6 +110,12 @@ __bpf_kfunc void cgroup_rstat_updated(
>>   	if (data_race(cgroup_rstat_cpu(css, cpu)->updated_next))
>>   		return;
>>   
>> +	if (is_base_css(css))
>> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
>> +	else
>> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
>> +			css->ss->id;
> 
> Use the array index here like cgroup_rstat_subsys_cpu_lock[css->ss->id].

Good call. I see the arithmetic may be more appropriate in cases where
the per-cpu array/buffer is dynamic. I'll use the index notation in v3.

> 
>> +
>>   	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>>   
>>   	/* put @css and all ancestors on the corresponding updated lists */
>> @@ -208,11 +223,17 @@ static struct cgroup_subsys_state *cgroup_rstat_updated_list(
>>   		struct cgroup_subsys_state *root, int cpu)
>>   {
>>   	struct cgroup *cgrp = root->cgroup;
>> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
>>   	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
>>   	struct cgroup_subsys_state *head = NULL, *parent, *child;
>> +	raw_spinlock_t *cpu_lock;
>>   	unsigned long flags;
>>   
>> +	if (is_base_css(root))
>> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
>> +	else
>> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
>> +			root->ss->id;
> 
> Same here.
> 
>> +
>>   	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, false);
>>   
>>   	/* Return NULL if this subtree is not on-list */
>> @@ -315,7 +336,7 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>>   	struct cgroup *cgrp = css->cgroup;
>>   	int cpu;
>>   
>> -	lockdep_assert_held(&cgroup_rstat_lock);
>> +	lockdep_assert_held(&lock);
>>   
>>   	for_each_possible_cpu(cpu) {
>>   		struct cgroup_subsys_state *pos;
>> @@ -356,12 +377,18 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>>   __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> +	spinlock_t *lock;
>> +
>> +	if (is_base_css(css))
>> +		lock = &cgroup_rstat_base_lock;
>> +	else
>> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>>   
>>   	might_sleep();
>>   
>> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
>> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
>> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
>> +	__cgroup_rstat_lock(lock, cgrp, -1);
>> +	cgroup_rstat_flush_locked(css, lock);
>> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>>   }
>>   
>>   /**
>> @@ -376,10 +403,16 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>>   void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> +	spinlock_t *lock;
>> +
>> +	if (is_base_css(css))
>> +		lock = &cgroup_rstat_base_lock;
>> +	else
>> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>>   
>>   	might_sleep();
>> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
>> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
>> +	__cgroup_rstat_lock(lock, cgrp, -1);
>> +	cgroup_rstat_flush_locked(css, lock);
>>   }
>>   
>>   /**
>> @@ -389,7 +422,14 @@ void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>>   void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
>> +	spinlock_t *lock;
>> +
>> +	if (is_base_css(css))
>> +		lock = &cgroup_rstat_base_lock;
>> +	else
>> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>> +
>> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>>   }
>>   
>>   int cgroup_rstat_init(struct cgroup_subsys_state *css)
>> @@ -435,10 +475,21 @@ void cgroup_rstat_exit(struct cgroup_subsys_state *css)
>>   
>>   void __init cgroup_rstat_boot(void)
>>   {
>> -	int cpu;
>> +	struct cgroup_subsys *ss;
>> +	int cpu, ssid;
>>   
>> -	for_each_possible_cpu(cpu)
>> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
>> +	for_each_subsys(ss, ssid) {
>> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
>> +	}
>> +
>> +	for_each_possible_cpu(cpu) {
>> +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
>> +
>> +		for_each_subsys(ss, ssid) {
>> +			raw_spin_lock_init(
>> +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
> 
> Same here.
> 
>> +		}
>> +	}
>>   }
>>   
>>   /*
>> -- 
>> 2.43.5
>>
JP Kobryn Feb. 28, 2025, 5:37 p.m. UTC | #3
On 2/27/25 1:55 PM, inwardvessel wrote:
> From: JP Kobryn <inwardvessel@gmail.com>
> 
> Let the existing locks be dedicated to the base stats and rename them as
> such. Also add new rstat locks for each enabled subsystem. When handling
> cgroup subsystem states, distinguish between formal subsystems (memory,
> io, etc) and the base stats subsystem state (represented by
> cgroup::self) to decide on which locks to take. This change is made to
> prevent contention between subsystems when updating/flushing stats.
> 
> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
> ---
>   kernel/cgroup/rstat.c | 93 +++++++++++++++++++++++++++++++++----------
>   1 file changed, 72 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 88908ef9212d..b3eaefc1fd07 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -9,8 +9,12 @@
>   
>   #include <trace/events/cgroup.h>
>   
> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
> -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
> +static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
> +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
> +
> +static spinlock_t cgroup_rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
> +static DEFINE_PER_CPU(raw_spinlock_t,
> +		cgroup_rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);
>   
>   static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>   
> @@ -20,8 +24,13 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(
>   	return per_cpu_ptr(css->rstat_cpu, cpu);
>   }
>   
> +static inline bool is_base_css(struct cgroup_subsys_state *css)
> +{
> +	return css->ss == NULL;
> +}
> +
>   /*
> - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + * Helper functions for rstat per CPU locks.
>    *
>    * This makes it easier to diagnose locking issues and contention in
>    * production environments. The parameter @fast_path determine the
> @@ -36,12 +45,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
>   	bool contended;
>   
>   	/*
> -	 * The _irqsave() is needed because cgroup_rstat_lock is
> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> -	 * this lock with the _irq() suffix only disables interrupts on
> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> -	 * interrupts on both configurations. The _irqsave() ensures
> -	 * that interrupts are always disabled and later restored.
> +	 * The _irqsave() is needed because the locks used for flushing are
> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
> +	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
> +	 * kernel. The raw_spinlock_t below disables interrupts on both
> +	 * configurations. The _irqsave() ensures that interrupts are always
> +	 * disabled and later restored.
>   	 */
>   	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>   	if (contended) {
> @@ -87,7 +96,7 @@ __bpf_kfunc void cgroup_rstat_updated(
>   		struct cgroup_subsys_state *css, int cpu)
>   {
>   	struct cgroup *cgrp = css->cgroup;
> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> +	raw_spinlock_t *cpu_lock;
>   	unsigned long flags;
>   
>   	/*
> @@ -101,6 +110,12 @@ __bpf_kfunc void cgroup_rstat_updated(
>   	if (data_race(cgroup_rstat_cpu(css, cpu)->updated_next))
>   		return;
>   
> +	if (is_base_css(css))
> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> +	else
> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> +			css->ss->id;
> +
>   	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>   
>   	/* put @css and all ancestors on the corresponding updated lists */
> @@ -208,11 +223,17 @@ static struct cgroup_subsys_state *cgroup_rstat_updated_list(
>   		struct cgroup_subsys_state *root, int cpu)
>   {
>   	struct cgroup *cgrp = root->cgroup;
> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
>   	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
>   	struct cgroup_subsys_state *head = NULL, *parent, *child;
> +	raw_spinlock_t *cpu_lock;
>   	unsigned long flags;
>   
> +	if (is_base_css(root))
> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> +	else
> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> +			root->ss->id;
> +
>   	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, false);
>   
>   	/* Return NULL if this subtree is not on-list */
> @@ -315,7 +336,7 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>   	struct cgroup *cgrp = css->cgroup;
>   	int cpu;
>   
> -	lockdep_assert_held(&cgroup_rstat_lock);
> +	lockdep_assert_held(&lock);

I need to remove the ampersand since the variable is already a pointer.
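I.e.:

	lockdep_assert_held(lock);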

>   
>   	for_each_possible_cpu(cpu) {
>   		struct cgroup_subsys_state *pos;
> @@ -356,12 +377,18 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>   __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>   {
>   	struct cgroup *cgrp = css->cgroup;
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>   
>   	might_sleep();
>   
> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> +	__cgroup_rstat_lock(lock, cgrp, -1);
> +	cgroup_rstat_flush_locked(css, lock);
> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>   }
>   
>   /**
> @@ -376,10 +403,16 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>   void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>   {
>   	struct cgroup *cgrp = css->cgroup;
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>   
>   	might_sleep();
> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> +	__cgroup_rstat_lock(lock, cgrp, -1);
> +	cgroup_rstat_flush_locked(css, lock);
>   }
>   
>   /**
> @@ -389,7 +422,14 @@ void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>   void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
>   {
>   	struct cgroup *cgrp = css->cgroup;
> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
> +
> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>   }
>   
>   int cgroup_rstat_init(struct cgroup_subsys_state *css)
> @@ -435,10 +475,21 @@ void cgroup_rstat_exit(struct cgroup_subsys_state *css)
>   
>   void __init cgroup_rstat_boot(void)
>   {
> -	int cpu;
> +	struct cgroup_subsys *ss;
> +	int cpu, ssid;
>   
> -	for_each_possible_cpu(cpu)
> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> +	for_each_subsys(ss, ssid) {
> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> +	}
> +
> +	for_each_possible_cpu(cpu) {
> +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
> +
> +		for_each_subsys(ss, ssid) {
> +			raw_spin_lock_init(
> +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
> +		}
> +	}
>   }
>   
>   /*
Yosry Ahmed Feb. 28, 2025, 7:20 p.m. UTC | #4
On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel wrote:
> From: JP Kobryn <inwardvessel@gmail.com>
> 
> Let the existing locks be dedicated to the base stats and rename them as
> such. Also add new rstat locks for each enabled subsystem. When handling
> cgroup subsystem states, distinguish between formal subsystems (memory,
> io, etc) and the base stats subsystem state (represented by
> cgroup::self) to decide on which locks to take. This change is made to
> prevent contention between subsystems when updating/flushing stats.
> 
> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
> ---
>  kernel/cgroup/rstat.c | 93 +++++++++++++++++++++++++++++++++----------
>  1 file changed, 72 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 88908ef9212d..b3eaefc1fd07 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -9,8 +9,12 @@
>  
>  #include <trace/events/cgroup.h>
>  
> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
> -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
> +static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
> +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
> +
> +static spinlock_t cgroup_rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
> +static DEFINE_PER_CPU(raw_spinlock_t,
> +		cgroup_rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);
>  
>  static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>  
> @@ -20,8 +24,13 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(
>  	return per_cpu_ptr(css->rstat_cpu, cpu);
>  }
>  
> +static inline bool is_base_css(struct cgroup_subsys_state *css)
> +{
> +	return css->ss == NULL;
> +}
> +
>  /*
> - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + * Helper functions for rstat per CPU locks.
>   *
>   * This makes it easier to diagnose locking issues and contention in
>   * production environments. The parameter @fast_path determine the
> @@ -36,12 +45,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
>  	bool contended;
>  
>  	/*
> -	 * The _irqsave() is needed because cgroup_rstat_lock is
> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> -	 * this lock with the _irq() suffix only disables interrupts on
> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> -	 * interrupts on both configurations. The _irqsave() ensures
> -	 * that interrupts are always disabled and later restored.
> +	 * The _irqsave() is needed because the locks used for flushing are
> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
> +	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
> +	 * kernel. The raw_spinlock_t below disables interrupts on both
> +	 * configurations. The _irqsave() ensures that interrupts are always
> +	 * disabled and later restored.
>  	 */
>  	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>  	if (contended) {
> @@ -87,7 +96,7 @@ __bpf_kfunc void cgroup_rstat_updated(
>  		struct cgroup_subsys_state *css, int cpu)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> +	raw_spinlock_t *cpu_lock;
>  	unsigned long flags;
>  
>  	/*
> @@ -101,6 +110,12 @@ __bpf_kfunc void cgroup_rstat_updated(
>  	if (data_race(cgroup_rstat_cpu(css, cpu)->updated_next))
>  		return;
>  
> +	if (is_base_css(css))
> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> +	else
> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> +			css->ss->id;
> +

Maybe wrap this in a macro or function since it's used more than once.
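Something along these lines, as a rough sketch (helper name purely
illustrative):

static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu)
{
	if (is_base_css(css))
		return per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);

	return per_cpu_ptr(&cgroup_rstat_subsys_cpu_lock[css->ss->id], cpu);
}

Then cgroup_rstat_updated() and cgroup_rstat_updated_list() could both just
call it.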

>  	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>  
>  	/* put @css and all ancestors on the corresponding updated lists */
> @@ -208,11 +223,17 @@ static struct cgroup_subsys_state *cgroup_rstat_updated_list(
>  		struct cgroup_subsys_state *root, int cpu)
>  {
>  	struct cgroup *cgrp = root->cgroup;
> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
>  	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
>  	struct cgroup_subsys_state *head = NULL, *parent, *child;
> +	raw_spinlock_t *cpu_lock;
>  	unsigned long flags;
>  
> +	if (is_base_css(root))
> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> +	else
> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> +			root->ss->id;
> +
>  	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, false);
>  
>  	/* Return NULL if this subtree is not on-list */
> @@ -315,7 +336,7 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>  	struct cgroup *cgrp = css->cgroup;
>  	int cpu;
>  
> -	lockdep_assert_held(&cgroup_rstat_lock);
> +	lockdep_assert_held(&lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		struct cgroup_subsys_state *pos;
> @@ -356,12 +377,18 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>  __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];

Same here. Also, instead of passing locks around, can we just pass
the css to __cgroup_rstat_lock/unlock() and have them find the correct
lock and use it directly?
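The same could be done for the flush-side lock; a rough sketch (helper name
purely illustrative):

static spinlock_t *ss_rstat_lock(struct cgroup_subsys_state *css)
{
	if (is_base_css(css))
		return &cgroup_rstat_base_lock;

	return &cgroup_rstat_subsys_lock[css->ss->id];
}

If __cgroup_rstat_lock/unlock() took the css, they could call this
internally instead of every caller picking the lock.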

>  
>  	might_sleep();
>  
> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> +	__cgroup_rstat_lock(lock, cgrp, -1);
> +	cgroup_rstat_flush_locked(css, lock);
> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>  }
>  
>  /**
> @@ -376,10 +403,16 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>  void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>  
>  	might_sleep();
> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> +	__cgroup_rstat_lock(lock, cgrp, -1);
> +	cgroup_rstat_flush_locked(css, lock);
>  }
>  
>  /**
> @@ -389,7 +422,14 @@ void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>  void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
>  {
>  	struct cgroup *cgrp = css->cgroup;
> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> +	spinlock_t *lock;
> +
> +	if (is_base_css(css))
> +		lock = &cgroup_rstat_base_lock;
> +	else
> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
> +
> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>  }
>  
>  int cgroup_rstat_init(struct cgroup_subsys_state *css)
> @@ -435,10 +475,21 @@ void cgroup_rstat_exit(struct cgroup_subsys_state *css)
>  
>  void __init cgroup_rstat_boot(void)
>  {
> -	int cpu;
> +	struct cgroup_subsys *ss;
> +	int cpu, ssid;
>  
> -	for_each_possible_cpu(cpu)
> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> +	for_each_subsys(ss, ssid) {
> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> +	}
> +
> +	for_each_possible_cpu(cpu) {
> +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
> +
> +		for_each_subsys(ss, ssid) {
> +			raw_spin_lock_init(
> +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
> +		}
> +	}
>  }
>  
>  /*
> -- 
> 2.43.5
> 
>
kernel test robot March 1, 2025, 11 p.m. UTC | #5
Hi inwardvessel,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/net]
[also build test ERROR on bpf-next/master bpf/master linus/master v6.14-rc4]
[cannot apply to tj-cgroup/for-next next-20250228]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/inwardvessel/cgroup-move-cgroup_rstat-from-cgroup-to-cgroup_subsys_state/20250228-055819
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git net
patch link:    https://lore.kernel.org/r/20250227215543.49928-4-inwardvessel%40gmail.com
patch subject: [PATCH 3/4 v2] cgroup: separate rstat locks for subsystems
config: hexagon-randconfig-001-20250302 (https://download.01.org/0day-ci/archive/20250302/202503020616.SJVwhuOV-lkp@intel.com/config)
compiler: clang version 21.0.0git (https://github.com/llvm/llvm-project 14170b16028c087ca154878f5ed93d3089a965c6)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250302/202503020616.SJVwhuOV-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503020616.SJVwhuOV-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/cgroup/rstat.c:339:2: error: member reference base type 'spinlock_t *' (aka 'struct spinlock *') is not a structure or union
     339 |         lockdep_assert_held(&lock);
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/lockdep.h:285:17: note: expanded from macro 'lockdep_assert_held'
     285 |         lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)
         |         ~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/lockdep.h:252:52: note: expanded from macro 'lockdep_is_held'
     252 | #define lockdep_is_held(lock)           lock_is_held(&(lock)->dep_map)
         |                                                             ^ ~~~~~~~
   include/linux/lockdep.h:279:32: note: expanded from macro 'lockdep_assert'
     279 |         do { WARN_ON(debug_locks && !(cond)); } while (0)
         |              ~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~
   include/asm-generic/bug.h:123:25: note: expanded from macro 'WARN_ON'
     123 |         int __ret_warn_on = !!(condition);                              \
         |                                ^~~~~~~~~
   1 error generated.


vim +339 kernel/cgroup/rstat.c

   330	
   331	/* see cgroup_rstat_flush() */
   332	static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
   333			spinlock_t *lock)
   334		__releases(lock) __acquires(lock)
   335	{
   336		struct cgroup *cgrp = css->cgroup;
   337		int cpu;
   338	
 > 339		lockdep_assert_held(&lock);
   340	
   341		for_each_possible_cpu(cpu) {
   342			struct cgroup_subsys_state *pos;
   343	
   344			pos = cgroup_rstat_updated_list(css, cpu);
   345			for (; pos; pos = pos->rstat_flush_next) {
   346				if (!pos->ss)
   347					cgroup_base_stat_flush(pos->cgroup, cpu);
   348				else
   349					pos->ss->css_rstat_flush(pos, cpu);
   350	
   351				bpf_rstat_flush(pos->cgroup, cgroup_parent(pos->cgroup), cpu);
   352			}
   353	
   354			/* play nice and yield if necessary */
   355			if (need_resched() || spin_needbreak(lock)) {
   356				__cgroup_rstat_unlock(lock, cgrp, cpu);
   357				if (!cond_resched())
   358					cpu_relax();
   359				__cgroup_rstat_lock(lock, cgrp, cpu);
   360			}
   361		}
   362	}
   363
Michal Koutný March 3, 2025, 3:22 p.m. UTC | #6
On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
> From: JP Kobryn <inwardvessel@gmail.com>
...
> +static inline bool is_base_css(struct cgroup_subsys_state *css)
> +{
> +	return css->ss == NULL;
> +}

Similar predicate is also used in cgroup.c (various cgroup vs subsys
lifecycle functions, e.g. css_free_rwork_fn()). I think it'd better
unified, i.e. open code the predicate here or use the helper in both
cases (css_is_cgroup() or similar).

>  void __init cgroup_rstat_boot(void)
>  {
> -	int cpu;
> +	struct cgroup_subsys *ss;
> +	int cpu, ssid;
>  
> -	for_each_possible_cpu(cpu)
> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> +	for_each_subsys(ss, ssid) {
> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> +	}

Hm, with this loop I realize it may be worth putting this lock into
struct cgroup_subsys_state and initializing them in
cgroup_init_subsys() to keep all per-subsys data in one pack.

> +
> +	for_each_possible_cpu(cpu) {
> +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
> +
> +		for_each_subsys(ss, ssid) {
> +			raw_spin_lock_init(
> +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
> +		}

Similar here, and keep cgroup_rstat_boot() for the base locks only.
Yosry Ahmed March 3, 2025, 6:29 p.m. UTC | #7
On Mon, Mar 03, 2025 at 04:22:42PM +0100, Michal Koutný wrote:
> On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
> > From: JP Kobryn <inwardvessel@gmail.com>
> ...
> > +static inline bool is_base_css(struct cgroup_subsys_state *css)
> > +{
> > +	return css->ss == NULL;
> > +}
> 
> Similar predicate is also used in cgroup.c (various cgroup vs subsys
> lifecycle functions, e.g. css_free_rwork_fn()). I think it'd better
> unified, i.e. open code the predicate here or use the helper in both
> cases (css_is_cgroup() or similar).
> 
> >  void __init cgroup_rstat_boot(void)
> >  {
> > -	int cpu;
> > +	struct cgroup_subsys *ss;
> > +	int cpu, ssid;
> >  
> > -	for_each_possible_cpu(cpu)
> > -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> > +	for_each_subsys(ss, ssid) {
> > +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> > +	}
> 
> Hm, with this loop I realize it may be worth putting this lock into
> struct cgroup_subsys_state and initializing them in
> cgroup_init_subsys() to keep all per-subsys data in one pack.

I thought about this, but this would have unnecessary memory overhead as
we only need one lock per-subsystem. So having a lock in every single
css is wasteful.

Maybe we can put the lock in struct cgroup_subsys? Then we can still
initialize them in cgroup_init_subsys().
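A rough sketch of that (field name purely illustrative):

struct cgroup_subsys {
	/* ... existing fields ... */
	spinlock_t rstat_ss_lock;	/* serializes rstat flushes for this subsystem */
};

/* in cgroup_init_subsys() */
spin_lock_init(&ss->rstat_ss_lock);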

> 
> > +
> > +	for_each_possible_cpu(cpu) {
> > +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
> > +
> > +		for_each_subsys(ss, ssid) {
> > +			raw_spin_lock_init(
> > +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
> > +		}
> 
> Similar here, and keep cgroup_rstat_boot() for the base locks only.

I think it will be confusing to have cgroup_rstat_boot() only initialize
some of the locks.

I think if we initialize the subsys locks in cgroup_init_subsys(), then
we should open code initializing the base locks in cgroup_init(), and
remove cgroup_rstat_boot().

Alternatively, we can make cgroup_rstat_boot() take in a subsys and
initialize its lock. If we pass NULL, then it initialize the base locks.
In this case we can call cgroup_rstat_boot() for each subsystem that has
an rstat callback in cgroup_init() (or cgroup_init_subsys()), and then
once for the base locks.
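
A rough sketch of that alternative (purely illustrative):

void __init cgroup_rstat_boot(struct cgroup_subsys *ss)
{
	int cpu;

	if (ss) {
		spin_lock_init(&cgroup_rstat_subsys_lock[ss->id]);
		for_each_possible_cpu(cpu)
			raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_subsys_cpu_lock[ss->id], cpu));
	} else {
		for_each_possible_cpu(cpu)
			raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
	}
}

with the base non-percpu lock still statically initialized via
DEFINE_SPINLOCK().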

WDYT?
Shakeel Butt March 3, 2025, 6:40 p.m. UTC | #8
On Mon, Mar 03, 2025 at 06:29:53PM +0000, Yosry Ahmed wrote:
> On Mon, Mar 03, 2025 at 04:22:42PM +0100, Michal Koutný wrote:
> > On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
> > > From: JP Kobryn <inwardvessel@gmail.com>
> > ...
> > > +static inline bool is_base_css(struct cgroup_subsys_state *css)
> > > +{
> > > +	return css->ss == NULL;
> > > +}
> > 
> > Similar predicate is also used in cgroup.c (various cgroup vs subsys
> > lifecycle functions, e.g. css_free_rwork_fn()). I think it'd better
> > unified, i.e. open code the predicate here or use the helper in both
> > cases (css_is_cgroup() or similar).
> > 
> > >  void __init cgroup_rstat_boot(void)
> > >  {
> > > -	int cpu;
> > > +	struct cgroup_subsys *ss;
> > > +	int cpu, ssid;
> > >  
> > > -	for_each_possible_cpu(cpu)
> > > -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> > > +	for_each_subsys(ss, ssid) {
> > > +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> > > +	}
> > 
> > Hm, with this loop I realize it may be worth putting this lock into
> > struct cgroup_subsys_state and initializing them in
> > cgroup_init_subsys() to keep all per-subsys data in one pack.
> 
> I thought about this, but this would have unnecessary memory overhead as
> we only need one lock per-subsystem. So having a lock in every single
> css is wasteful.
> 
> Maybe we can put the lock in struct cgroup_subsys? Then we can still
> initialize them in cgroup_init_subsys().
> 

Actually, one of the things I was thinking about is whether we can just
not have a per-subsystem lock at all. At the moment, it is protecting
the rstat_flush_next field (today it is in cgroup; in JP's series it is
in css). What if we make it per-cpu? Then we don't need the per-subsystem
lock at all. Let me know if I missed something else that is being
protected by this lock.

This would help the case where there are multiple flushers of the same
subsystem's stats, possibly for different parts of the cgroup tree. They
would still compete on the per-cpu locks, but that would still be better
than a subsystem-level lock.
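
A rough sketch of what that could mean (purely illustrative): move the
flush linkage into the per-cpu bookkeeping, e.g.

struct cgroup_rstat_cpu {
	/* ... existing fields ... */
	struct cgroup_subsys_state *rstat_flush_next;
};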
Michal Koutný March 3, 2025, 6:49 p.m. UTC | #9
On Mon, Mar 03, 2025 at 06:29:53PM +0000, Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> I thought about this, but this would have unnecessary memory overhead as
> we only need one lock per-subsystem. So having a lock in every single
> css is wasteful.
> 
> Maybe we can put the lock in struct cgroup_subsys? Then we can still
> initialize them in cgroup_init_subsys().

Ah, yes, muscle memory, of course I had struct cgroup_subsys\> in mind.

> I think it will be confusing to have cgroup_rstat_boot() only initialize
> some of the locks.
> 
> I think if we initialize the subsys locks in cgroup_init_subsys(), then
> we should open code initializing the base locks in cgroup_init(), and
> remove cgroup_rstat_boot().

Can this work?

DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock) =
	__RAW_SPIN_LOCK_INITIALIZER(cgroup_rstat_base_cpu_lock);

(I see other places in the kernel that assign into the per-cpu definition
but I have no idea whether that expands and links to what's expected.
Neglecting the fact that the lock initializer is apparently not for
external use.)

> Alternatively, we can make cgroup_rstat_boot() take in a subsys and
> initialize its lock. If we pass NULL, then it initialize the base locks.
> In this case we can call cgroup_rstat_boot() for each subsystem that has
> an rstat callback in cgroup_init() (or cgroup_init_subsys()), and then
> once for the base locks.
> 
> WDYT?

Such calls from cgroup_init_subsys() are fine too.
JP Kobryn March 3, 2025, 7:23 p.m. UTC | #10
On 3/3/25 10:40 AM, Shakeel Butt wrote:
> On Mon, Mar 03, 2025 at 06:29:53PM +0000, Yosry Ahmed wrote:
>> On Mon, Mar 03, 2025 at 04:22:42PM +0100, Michal Koutný wrote:
>>> On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
>>>> From: JP Kobryn <inwardvessel@gmail.com>
>>> ...
>>>> +static inline bool is_base_css(struct cgroup_subsys_state *css)
>>>> +{
>>>> +	return css->ss == NULL;
>>>> +}
>>>
>>> Similar predicate is also used in cgroup.c (various cgroup vs subsys
>>> lifecycle functions, e.g. css_free_rwork_fn()). I think it'd better
>>> unified, i.e. open code the predicate here or use the helper in both
>>> cases (css_is_cgroup() or similar).
>>>
>>>>   void __init cgroup_rstat_boot(void)
>>>>   {
>>>> -	int cpu;
>>>> +	struct cgroup_subsys *ss;
>>>> +	int cpu, ssid;
>>>>   
>>>> -	for_each_possible_cpu(cpu)
>>>> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
>>>> +	for_each_subsys(ss, ssid) {
>>>> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
>>>> +	}
>>>
>>> Hm, with this loop I realize it may be worth putting this lock into
>>> struct cgroup_subsys_state and initializing them in
>>> cgroup_init_subsys() to keep all per-subsys data in one pack.
>>
>> I thought about this, but this would have unnecessary memory overhead as
>> we only need one lock per-subsystem. So having a lock in every single
>> css is wasteful.
>>
>> Maybe we can put the lock in struct cgroup_subsys? Then we can still
>> initialize them in cgroup_init_subsys().
>>
> 
> Actually, one of the things I was thinking about is whether we can just
> not have a per-subsystem lock at all. At the moment, it is protecting
> the rstat_flush_next field (today it is in cgroup; in JP's series it is
> in css). What if we make it per-cpu? Then we don't need the per-subsystem
> lock at all. Let me know if I missed something else that is being
> protected by this lock.
> 
> This would help the case where there are multiple flushers of the same
> subsystem's stats, possibly for different parts of the cgroup tree. They
> would still compete on the per-cpu locks, but that would still be better
> than a subsystem-level lock.

Right, the trade-off would mean one subsystem flushing could contend for
a cpu where a different subsystem is updating and vice versa.
Shakeel Butt March 3, 2025, 7:39 p.m. UTC | #11
On Mon, Mar 03, 2025 at 11:23:49AM -0800, JP Kobryn wrote:
> 
> 
> On 3/3/25 10:40 AM, Shakeel Butt wrote:
> > On Mon, Mar 03, 2025 at 06:29:53PM +0000, Yosry Ahmed wrote:
> > > On Mon, Mar 03, 2025 at 04:22:42PM +0100, Michal Koutný wrote:
> > > > On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
> > > > > From: JP Kobryn <inwardvessel@gmail.com>
> > > > ...
> > > > > +static inline bool is_base_css(struct cgroup_subsys_state *css)
> > > > > +{
> > > > > +	return css->ss == NULL;
> > > > > +}
> > > > 
> > > > Similar predicate is also used in cgroup.c (various cgroup vs subsys
> > > > lifecycle functions, e.g. css_free_rwork_fn()). I think it'd better
> > > > unified, i.e. open code the predicate here or use the helper in both
> > > > cases (css_is_cgroup() or similar).
> > > > 
> > > > >   void __init cgroup_rstat_boot(void)
> > > > >   {
> > > > > -	int cpu;
> > > > > +	struct cgroup_subsys *ss;
> > > > > +	int cpu, ssid;
> > > > > -	for_each_possible_cpu(cpu)
> > > > > -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> > > > > +	for_each_subsys(ss, ssid) {
> > > > > +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> > > > > +	}
> > > > 
> > > > Hm, with this loop I realize it may be worth putting this lock into
> > > > struct cgroup_subsys_state and initializing them in
> > > > cgroup_init_subsys() to keep all per-subsys data in one pack.
> > > 
> > > I thought about this, but this would have unnecessary memory overhead as
> > > we only need one lock per-subsystem. So having a lock in every single
> > > css is wasteful.
> > > 
> > > Maybe we can put the lock in struct cgroup_subsys? Then we can still
> > > initialize them in cgroup_init_subsys().
> > > 
> > 
> > Actually, one of the things I was thinking about is whether we can just
> > not have a per-subsystem lock at all. At the moment, it is protecting
> > the rstat_flush_next field (today it is in cgroup; in JP's series it is
> > in css). What if we make it per-cpu? Then we don't need the per-subsystem
> > lock at all. Let me know if I missed something else that is being
> > protected by this lock.
> > 
> > This would help the case where there are multiple flushers of the same
> > subsystem's stats, possibly for different parts of the cgroup tree. They
> > would still compete on the per-cpu locks, but that would still be better
> > than a subsystem-level lock.
> 
> Right, the trade-off would mean one subsystem flushing could contend for
> a cpu where a different subsystem is updating and vice versa.
> 

I meant we keep the per-subsystem per-cpu locks but remove the
per-subsystem lock (i.e. cgroup_rstat_subsys_lock) if we make
rstat_flush_next per-cpu. In that case two memory stats readers will not
compete on the memory subsystem lock, but they will compete on the
per-cpu memory locks. Though we will have to experiment to see if this
really helps or not.
Yosry Ahmed March 3, 2025, 7:50 p.m. UTC | #12
On Mon, Mar 03, 2025 at 10:40:45AM -0800, Shakeel Butt wrote:
> On Mon, Mar 03, 2025 at 06:29:53PM +0000, Yosry Ahmed wrote:
> > On Mon, Mar 03, 2025 at 04:22:42PM +0100, Michal Koutný wrote:
> > > On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
> > > > From: JP Kobryn <inwardvessel@gmail.com>
> > > ...
> > > > +static inline bool is_base_css(struct cgroup_subsys_state *css)
> > > > +{
> > > > +	return css->ss == NULL;
> > > > +}
> > > 
> > > Similar predicate is also used in cgroup.c (various cgroup vs subsys
> > > lifecycle functions, e.g. css_free_rwork_fn()). I think it'd better
> > > unified, i.e. open code the predicate here or use the helper in both
> > > cases (css_is_cgroup() or similar).
> > > 
> > > >  void __init cgroup_rstat_boot(void)
> > > >  {
> > > > -	int cpu;
> > > > +	struct cgroup_subsys *ss;
> > > > +	int cpu, ssid;
> > > >  
> > > > -	for_each_possible_cpu(cpu)
> > > > -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> > > > +	for_each_subsys(ss, ssid) {
> > > > +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> > > > +	}
> > > 
> > > Hm, with this loop I realize it may be worth putting this lock into
> > > struct cgroup_subsys_state and initializing them in
> > > cgroup_init_subsys() to keep all per-subsys data in one pack.
> > 
> > I thought about this, but this would have unnecessary memory overhead as
> > we only need one lock per-subsystem. So having a lock in every single
> > css is wasteful.
> > 
> > Maybe we can put the lock in struct cgroup_subsys? Then we can still
> > initialize them in cgroup_init_subsys().
> > 
> 
> Actually, one of the things I was thinking about is whether we can just
> not have a per-subsystem lock at all. At the moment, it is protecting
> the rstat_flush_next field (today it is in cgroup; in JP's series it is
> in css). What if we make it per-cpu? Then we don't need the per-subsystem
> lock at all. Let me know if I missed something else that is being
> protected by this lock.

I think it protects more than that. I remember looking into this before,
and the thing I remember is the stats themselves. Looking at
mem_cgroup_stat_aggregate(), we aggregate the stats into the per-cgroup
counters non-atomically. This is only protected by the rstat lock
(currently global, per-subsystem with the series) AFAICT.

Not sure if only the memory subsystem has this dependency or if others
do as well. I remember looking into switching these counters to atomics
to remove the global lock, but it performed worse IIRC.

I remember also looking into partitioning the lock into a per-cgroup (or
per-css now?) lock, and only holding locks of the parent and child
cgroups as we flush each cgroup. I don't remember if I actually
implemented this, but it introduces complexity.

Perhaps we can defer the locking into the subsystem, if only the memory
controller requires it. In this case mem_cgroup_css_rstat_flush() (or
mem_cgroup_stat_aggregate()) can hold a memcg-specific spinlock only
while aggregating the stats.
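
A rough sketch of that last option, assuming a new memcg-private lock
(name purely illustrative):

static DEFINE_SPINLOCK(memcg_stats_flush_lock);

static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	spin_lock(&memcg_stats_flush_lock);
	/* existing non-atomic aggregation of per-cpu deltas goes here */
	spin_unlock(&memcg_stats_flush_lock);
}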

There could be other things protected by the lock, but that's what I
remember.
Shakeel Butt March 3, 2025, 8:09 p.m. UTC | #13
On Mon, Mar 03, 2025 at 07:50:20PM +0000, Yosry Ahmed wrote:
> On Mon, Mar 03, 2025 at 10:40:45AM -0800, Shakeel Butt wrote:
> > On Mon, Mar 03, 2025 at 06:29:53PM +0000, Yosry Ahmed wrote:
> > > On Mon, Mar 03, 2025 at 04:22:42PM +0100, Michal Koutný wrote:
> > > > On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
> > > > > From: JP Kobryn <inwardvessel@gmail.com>
> > > > ...
> > > > > +static inline bool is_base_css(struct cgroup_subsys_state *css)
> > > > > +{
> > > > > +	return css->ss == NULL;
> > > > > +}
> > > > 
> > > > Similar predicate is also used in cgroup.c (various cgroup vs subsys
> > > > lifecycle functions, e.g. css_free_rwork_fn()). I think it'd better
> > > > unified, i.e. open code the predicate here or use the helper in both
> > > > cases (css_is_cgroup() or similar).
> > > > 
> > > > >  void __init cgroup_rstat_boot(void)
> > > > >  {
> > > > > -	int cpu;
> > > > > +	struct cgroup_subsys *ss;
> > > > > +	int cpu, ssid;
> > > > >  
> > > > > -	for_each_possible_cpu(cpu)
> > > > > -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> > > > > +	for_each_subsys(ss, ssid) {
> > > > > +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> > > > > +	}
> > > > 
> > > > Hm, with this loop I realize it may be worth putting this lock into
> > > > struct cgroup_subsys_state and initializing them in
> > > > cgroup_init_subsys() to keep all per-subsys data in one pack.
> > > 
> > > I thought about this, but this would have unnecessary memory overhead as
> > > we only need one lock per-subsystem. So having a lock in every single
> > > css is wasteful.
> > > 
> > > Maybe we can put the lock in struct cgroup_subsys? Then we can still
> > > initialize them in cgroup_init_subsys().
> > > 
> > 
> > Actually, one of the things I was thinking about is whether we can just
> > not have a per-subsystem lock at all. At the moment, it is protecting
> > the rstat_flush_next field (today it is in cgroup; in JP's series it is
> > in css). What if we make it per-cpu? Then we don't need the per-subsystem
> > lock at all. Let me know if I missed something else that is being
> > protected by this lock.
> 
> I think it protects more than that. I remember looking into this before,
> and the thing I remember is the stats themselves. Looking at
> mem_cgroup_stat_aggregate(), we aggregate the stats into the per-cgroup
> counters non-atomically. This is only protected by the rstat lock
> (currently global, per-subsystem with the series) AFAICT.
> 
> Not sure if only the memory subsystem has this dependency or if others
> do as well. I remember looking into switching these counters to atomics
> to remove the global lock, but it performed worse IIRC.
> 
> I remember also looking into partitioning the lock into a per-cgroup (or
> per-css now?) lock, and only holding locks of the parent and child
> cgroups as we flush each cgroup. I don't remember if I actually
> implemented this, but it introduces complexity.
> 
> Perhaps we can defer the locking into the subsystem, if only the memory
> controller requires it. In this case mem_cgroup_css_rstat_flush() (or
> mem_cgroup_stat_aggregate()) can hold a memcg-specific spinlock only
> while aggregating the stats.
> 
> There could be other things protected by the lock, but that's what I
> remember.

Thanks for reminding me. It does not seem straightforward or simple.
Lower priority then.
kernel test robot March 3, 2025, 11:49 p.m. UTC | #14
Hi inwardvessel,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/net]
[also build test ERROR on bpf-next/master bpf/master linus/master v6.14-rc5]
[cannot apply to tj-cgroup/for-next next-20250303]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/inwardvessel/cgroup-move-cgroup_rstat-from-cgroup-to-cgroup_subsys_state/20250228-055819
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git net
patch link:    https://lore.kernel.org/r/20250227215543.49928-4-inwardvessel%40gmail.com
patch subject: [PATCH 3/4 v2] cgroup: separate rstat locks for subsystems
config: loongarch-allyesconfig (https://download.01.org/0day-ci/archive/20250304/202503040736.kEeH1wt6-lkp@intel.com/config)
compiler: loongarch64-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250304/202503040736.kEeH1wt6-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503040736.kEeH1wt6-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/loongarch/include/asm/bug.h:61,
                    from include/linux/bug.h:5,
                    from include/linux/thread_info.h:13,
                    from include/asm-generic/current.h:6,
                    from ./arch/loongarch/include/generated/asm/current.h:1,
                    from include/linux/sched.h:12,
                    from include/linux/cgroup.h:12,
                    from kernel/cgroup/cgroup-internal.h:5,
                    from kernel/cgroup/rstat.c:2:
   kernel/cgroup/rstat.c: In function 'cgroup_rstat_flush_locked':
>> include/linux/lockdep.h:252:61: error: 'lock' is a pointer; did you mean to use '->'?
     252 | #define lockdep_is_held(lock)           lock_is_held(&(lock)->dep_map)
         |                                                             ^~
   include/asm-generic/bug.h:123:32: note: in definition of macro 'WARN_ON'
     123 |         int __ret_warn_on = !!(condition);                              \
         |                                ^~~~~~~~~
   include/linux/lockdep.h:285:9: note: in expansion of macro 'lockdep_assert'
     285 |         lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)
         |         ^~~~~~~~~~~~~~
   include/linux/lockdep.h:285:24: note: in expansion of macro 'lockdep_is_held'
     285 |         lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)
         |                        ^~~~~~~~~~~~~~~
   kernel/cgroup/rstat.c:339:9: note: in expansion of macro 'lockdep_assert_held'
     339 |         lockdep_assert_held(&lock);
         |         ^~~~~~~~~~~~~~~~~~~


vim +252 include/linux/lockdep.h

f607c668577481 Peter Zijlstra 2009-07-20  251  
f8319483f57f1c Peter Zijlstra 2016-11-30 @252  #define lockdep_is_held(lock)		lock_is_held(&(lock)->dep_map)
f8319483f57f1c Peter Zijlstra 2016-11-30  253  #define lockdep_is_held_type(lock, r)	lock_is_held_type(&(lock)->dep_map, (r))
f607c668577481 Peter Zijlstra 2009-07-20  254
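
The error points at the lockdep_assert_held() call in
cgroup_rstat_flush_locked(): the lock now arrives as a spinlock_t *
parameter, so taking its address again hands lockdep a spinlock_t **.
The likely fix (my reading of the report, not a posted change) is to
drop the extra '&':

	/* lock is already a spinlock_t * here */
	lockdep_assert_held(lock);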
JP Kobryn March 6, 2025, 9:36 p.m. UTC | #15
On 3/3/25 10:29 AM, Yosry Ahmed wrote:
> On Mon, Mar 03, 2025 at 04:22:42PM +0100, Michal Koutný wrote:
>> On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel <inwardvessel@gmail.com> wrote:
>>> From: JP Kobryn <inwardvessel@gmail.com>
>> ...
>>> +static inline bool is_base_css(struct cgroup_subsys_state *css)
>>> +{
>>> +	return css->ss == NULL;
>>> +}
>>
>> Similar predicate is also used in cgroup.c (various cgroup vs subsys
>> lifecycle functions, e.g. css_free_rwork_fn()). I think it'd be better
>> unified, i.e. open code the predicate here or use the helper in both
>> cases (css_is_cgroup() or similar).

Sure. Thanks for pointing that one out.
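
Something along these lines, shared between cgroup.c and rstat.c (the
name follows the css_is_cgroup() suggestion above; the final form is
still open):

	/* true for a cgroup's own (base) css, i.e. cgroup::self */
	static inline bool css_is_cgroup(struct cgroup_subsys_state *css)
	{
		return css->ss == NULL;
	}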

>>
>>>   void __init cgroup_rstat_boot(void)
>>>   {
>>> -	int cpu;
>>> +	struct cgroup_subsys *ss;
>>> +	int cpu, ssid;
>>>   
>>> -	for_each_possible_cpu(cpu)
>>> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
>>> +	for_each_subsys(ss, ssid) {
>>> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
>>> +	}
>>
>> Hm, with this loop I realize it may be worth putting this lock into
>> struct cgroup_subsys_state and initializing them in
>> cgroup_init_subsys() to keep all per-subsys data in one pack.

Will give this a go in next rev.

> 
> I thought about this, but this would have unnecessary memory overhead as
> we only need one lock per-subsystem. So having a lock in every single
> css is wasteful.
> 
> Maybe we can put the lock in struct cgroup_subsys? Then we can still
> initialize them in cgroup_init_subsys().
> 
>>
>>> +
>>> +	for_each_possible_cpu(cpu) {
>>> +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
>>> +
>>> +		for_each_subsys(ss, ssid) {
>>> +			raw_spin_lock_init(
>>> +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
>>> +		}
>>
>> Similar here, and keep cgroup_rstat_boot() for the base locks only.
> 
> I think it will be confusing to have cgroup_rstat_boot() only initialize
> some of the locks.
> 
> I think if we initialize the subsys locks in cgroup_init_subsys(), then
> we should open code initializing the base locks in cgroup_init(), and
> remove cgroup_rstat_boot().
> 
> Alternatively, we can make cgroup_rstat_boot() take in a subsys and
> initialize its lock. If we pass NULL, then it initialize the base locks.
> In this case we can call cgroup_rstat_boot() for each subsystem that has
> an rstat callback in cgroup_init() (or cgroup_init_subsys()), and then
> once for the base locks.
> 
> WDYT?

I like this alternative idea of adjusting cgroup_rstat_boot() so it can
accept a subsys ref. Let's see how it looks when v3 goes out.
JP Kobryn March 6, 2025, 9:47 p.m. UTC | #16
On 2/28/25 11:20 AM, Yosry Ahmed wrote:
> On Thu, Feb 27, 2025 at 01:55:42PM -0800, inwardvessel wrote:
>> From: JP Kobryn <inwardvessel@gmail.com>
>>
>> Let the existing locks be dedicated to the base stats and rename them as
>> such. Also add new rstat locks for each enabled subsystem. When handling
>> cgroup subsystem states, distinguish between formal subsystems (memory,
>> io, etc) and the base stats subsystem state (represented by
>> cgroup::self) to decide on which locks to take. This change is made to
>> prevent contention between subsystems when updating/flushing stats.
>>
>> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
>> ---
>>   kernel/cgroup/rstat.c | 93 +++++++++++++++++++++++++++++++++----------
>>   1 file changed, 72 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
>> index 88908ef9212d..b3eaefc1fd07 100644
>> --- a/kernel/cgroup/rstat.c
>> +++ b/kernel/cgroup/rstat.c
>> @@ -9,8 +9,12 @@
>>   
>>   #include <trace/events/cgroup.h>
>>   
>> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
>> -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
>> +static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
>> +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
>> +
>> +static spinlock_t cgroup_rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
>> +static DEFINE_PER_CPU(raw_spinlock_t,
>> +		cgroup_rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);
>>   
>>   static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>>   
>> @@ -20,8 +24,13 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(
>>   	return per_cpu_ptr(css->rstat_cpu, cpu);
>>   }
>>   
>> +static inline bool is_base_css(struct cgroup_subsys_state *css)
>> +{
>> +	return css->ss == NULL;
>> +}
>> +
>>   /*
>> - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
>> + * Helper functions for rstat per CPU locks.
>>    *
>>    * This makes it easier to diagnose locking issues and contention in
>>    * production environments. The parameter @fast_path determine the
>> @@ -36,12 +45,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
>>   	bool contended;
>>   
>>   	/*
>> -	 * The _irqsave() is needed because cgroup_rstat_lock is
>> -	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
>> -	 * this lock with the _irq() suffix only disables interrupts on
>> -	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
>> -	 * interrupts on both configurations. The _irqsave() ensures
>> -	 * that interrupts are always disabled and later restored.
>> +	 * The _irqsave() is needed because the locks used for flushing are
>> +	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
>> +	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
>> +	 * kernel. The raw_spinlock_t below disables interrupts on both
>> +	 * configurations. The _irqsave() ensures that interrupts are always
>> +	 * disabled and later restored.
>>   	 */
>>   	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
>>   	if (contended) {
>> @@ -87,7 +96,7 @@ __bpf_kfunc void cgroup_rstat_updated(
>>   		struct cgroup_subsys_state *css, int cpu)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
>> +	raw_spinlock_t *cpu_lock;
>>   	unsigned long flags;
>>   
>>   	/*
>> @@ -101,6 +110,12 @@ __bpf_kfunc void cgroup_rstat_updated(
>>   	if (data_race(cgroup_rstat_cpu(css, cpu)->updated_next))
>>   		return;
>>   
>> +	if (is_base_css(css))
>> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
>> +	else
>> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
>> +			css->ss->id;
>> +
> 
> Maybe wrap this in a macro or function since it's used more than once.
> 
>>   	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>>   
>>   	/* put @css and all ancestors on the corresponding updated lists */
>> @@ -208,11 +223,17 @@ static struct cgroup_subsys_state *cgroup_rstat_updated_list(
>>   		struct cgroup_subsys_state *root, int cpu)
>>   {
>>   	struct cgroup *cgrp = root->cgroup;
>> -	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
>>   	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
>>   	struct cgroup_subsys_state *head = NULL, *parent, *child;
>> +	raw_spinlock_t *cpu_lock;
>>   	unsigned long flags;
>>   
>> +	if (is_base_css(root))
>> +		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
>> +	else
>> +		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
>> +			root->ss->id;
>> +
>>   	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, false);
>>   
>>   	/* Return NULL if this subtree is not on-list */
>> @@ -315,7 +336,7 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>>   	struct cgroup *cgrp = css->cgroup;
>>   	int cpu;
>>   
>> -	lockdep_assert_held(&cgroup_rstat_lock);
>> +	lockdep_assert_held(&lock);
>>   
>>   	for_each_possible_cpu(cpu) {
>>   		struct cgroup_subsys_state *pos;
>> @@ -356,12 +377,18 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
>>   __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> +	spinlock_t *lock;
>> +
>> +	if (is_base_css(css))
>> +		lock = &cgroup_rstat_base_lock;
>> +	else
>> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
> 
> Same here. Also, instead of passing locks around, can we just pass
> the css to __cgroup_rstat_lock/unlock() and have them find the correct
> lock and use it directly?

I like this and will take that approach in v3. It will allow the lock
selection code you previously commented on to exist in just one place.
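
Something like the helpers below would keep the selection in one place
and let __cgroup_rstat_lock/unlock() take the css directly (a sketch of
the suggestion above; the names are placeholders until v3):

	static spinlock_t *rstat_lock_for(struct cgroup_subsys_state *css)
	{
		if (is_base_css(css))
			return &cgroup_rstat_base_lock;
		return &cgroup_rstat_subsys_lock[css->ss->id];
	}

	static raw_spinlock_t *rstat_cpu_lock_for(struct cgroup_subsys_state *css,
			int cpu)
	{
		if (is_base_css(css))
			return per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
		return per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + css->ss->id;
	}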

> 
>>   
>>   	might_sleep();
>>   
>> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
>> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
>> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
>> +	__cgroup_rstat_lock(lock, cgrp, -1);
>> +	cgroup_rstat_flush_locked(css, lock);
>> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>>   }
>>   
>>   /**
>> @@ -376,10 +403,16 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
>>   void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> +	spinlock_t *lock;
>> +
>> +	if (is_base_css(css))
>> +		lock = &cgroup_rstat_base_lock;
>> +	else
>> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>>   
>>   	might_sleep();
>> -	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
>> -	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
>> +	__cgroup_rstat_lock(lock, cgrp, -1);
>> +	cgroup_rstat_flush_locked(css, lock);
>>   }
>>   
>>   /**
>> @@ -389,7 +422,14 @@ void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
>>   void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
>>   {
>>   	struct cgroup *cgrp = css->cgroup;
>> -	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
>> +	spinlock_t *lock;
>> +
>> +	if (is_base_css(css))
>> +		lock = &cgroup_rstat_base_lock;
>> +	else
>> +		lock = &cgroup_rstat_subsys_lock[css->ss->id];
>> +
>> +	__cgroup_rstat_unlock(lock, cgrp, -1);
>>   }
>>   
>>   int cgroup_rstat_init(struct cgroup_subsys_state *css)
>> @@ -435,10 +475,21 @@ void cgroup_rstat_exit(struct cgroup_subsys_state *css)
>>   
>>   void __init cgroup_rstat_boot(void)
>>   {
>> -	int cpu;
>> +	struct cgroup_subsys *ss;
>> +	int cpu, ssid;
>>   
>> -	for_each_possible_cpu(cpu)
>> -		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
>> +	for_each_subsys(ss, ssid) {
>> +		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
>> +	}
>> +
>> +	for_each_possible_cpu(cpu) {
>> +		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
>> +
>> +		for_each_subsys(ss, ssid) {
>> +			raw_spin_lock_init(
>> +					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
>> +		}
>> +	}
>>   }
>>   
>>   /*
>> -- 
>> 2.43.5
>>
>>
JP Kobryn March 10, 2025, 5:59 p.m. UTC | #17
On 3/3/25 10:49 AM, Michal Koutný wrote:
> On Mon, Mar 03, 2025 at 06:29:53PM +0000, Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>> I thought about this, but this would have unnecessary memory overhead as
>> we only need one lock per-subsystem. So having a lock in every single
>> css is wasteful.
>>
>> Maybe we can put the lock in struct cgroup_subsys? Then we can still
>> initialize them in cgroup_init_subsys().
> 
> Ah, yes, muscle memory, of course I had struct cgroup_subsys\> in mind.
> 
>> I think it will be confusing to have cgroup_rstat_boot() only initialize
>> some of the locks.
>>
>> I think if we initialize the subsys locks in cgroup_init_subsys(), then
>> we should open code initializing the base locks in cgroup_init(), and
>> remove cgroup_rstat_boot().
> 
> Can this work?
> 
> DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock) =
> 	__RAW_SPIN_LOCK_INITIALIZER(cgroup_rstat_base_cpu_lock);
> 
> (I see other places in kernel that assign into the per-cpu definition
> but I have no idea whether that expands and links to what's
> expected. Neglecting the fact that the lock initializer is apparently
> not for external use.)

I gave this a try. Using lockdep fields to verify, it expanded as
intended:
[    1.442498] [ss_rstat_init] cpu:0, lock.magic:dead4ead, lock.owner_cpu:-1, lock.owner:ffffffffffffffff
[    1.443027] [ss_rstat_init] cpu:1, lock.magic:dead4ead, lock.owner_cpu:-1, lock.owner:ffffffffffffffff
...

Unless anyone has objections to using the double-underscore macro, I will
use this in v3.

> 
>> Alternatively, we can make cgroup_rstat_boot() take in a subsys and
>> initialize its lock. If we pass NULL, then it initialize the base locks.
>> In this case we can call cgroup_rstat_boot() for each subsystem that has
>> an rstat callback in cgroup_init() (or cgroup_init_subsys()), and then
>> once for the base locks.
>>
>> WDYT?
> 
> Such calls from cgroup_init_subsys() are fine too.

Using the lock initializer macro above simplifies this situation. I'm
currently working with these changes that should appear in v3:

move global subsys locks to ss struct:
	struct cgroup_subsys {
		...
		spinlock_t lock;
		raw_spinlock_t __percpu *percpu_lock;
	};

change:
	cgroup_rstat_boot(void)
to:
	ss_rstat_init(struct cgroup_subsys *ss)

... and only use ss_rstat_init(ss) to initialize the new subsystem
lock fields, since the locks for base stats are now initialized at
definition.

Note that these changes also have the added benefit of not allocating the
per-cpu lock for subsystems that do not participate in rstat; previously
that was wasted static memory.
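
Under those assumptions, ss_rstat_init() might look roughly like the
sketch below (my reading of the plan above, not the posted v3; the
css_rstat_flush check is a guess at how non-participating subsystems
get skipped):

	int __init ss_rstat_init(struct cgroup_subsys *ss)
	{
		int cpu;

		/* subsystems that never flush rstat do not need the locks */
		if (!ss->css_rstat_flush)
			return 0;

		ss->percpu_lock = alloc_percpu(raw_spinlock_t);
		if (!ss->percpu_lock)
			return -ENOMEM;

		spin_lock_init(&ss->lock);
		for_each_possible_cpu(cpu)
			raw_spin_lock_init(per_cpu_ptr(ss->percpu_lock, cpu));

		return 0;
	}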
Michal Koutný March 11, 2025, 1:49 p.m. UTC | #18
On Mon, Mar 10, 2025 at 10:59:30AM -0700, JP Kobryn <inwardvessel@gmail.com> wrote:
> > DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock) =
> > 	__RAW_SPIN_LOCK_INITIALIZER(cgroup_rstat_base_cpu_lock);
> > 
> > (I see other places in kernel that assign into the per-cpu definition
> > but I have no idea whether that expands and links to what's
> > expected. Neglecting the fact that the lock initializer is apparently
> > not for external use.)
> 
> I gave this a try. Using lockdep fields to verify, it expanded as
> intended:
> [    1.442498] [ss_rstat_init] cpu:0, lock.magic:dead4ead,
> lock.owner_cpu:-1, lock.owner:ffffffffffffffff
> [    1.443027] [ss_rstat_init] cpu:1, lock.magic:dead4ead,
> lock.owner_cpu:-1, lock.owner:ffffffffffffffff
> ...
> 
> Unless anyone has objections to using the double-underscore macro, I will
> use this in v3.

Actually, I do have an objection (to the underscored macro).
It may work today, but it could break subtly in the future.

Maybe add a separate patch that introduces a proper (non-underscore)
initializer (or per-cpu wrapper initializer) macro, so that the people
Cc'd on it can evaluate whether that is wise.
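
For example, a small wrapper next to the existing
__RAW_SPIN_LOCK_INITIALIZER definition would keep the double-underscore
form out of rstat.c (the macro below is purely illustrative; no such
macro exists today):

	#define RAW_SPIN_LOCK_INIT(lockname)	__RAW_SPIN_LOCK_INITIALIZER(lockname)

	static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock) =
		RAW_SPIN_LOCK_INIT(cgroup_rstat_base_cpu_lock);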

Michal
diff mbox series

Patch

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 88908ef9212d..b3eaefc1fd07 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,8 +9,12 @@ 
 
 #include <trace/events/cgroup.h>
 
-static DEFINE_SPINLOCK(cgroup_rstat_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
+
+static spinlock_t cgroup_rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
+static DEFINE_PER_CPU(raw_spinlock_t,
+		cgroup_rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
@@ -20,8 +24,13 @@  static struct cgroup_rstat_cpu *cgroup_rstat_cpu(
 	return per_cpu_ptr(css->rstat_cpu, cpu);
 }
 
+static inline bool is_base_css(struct cgroup_subsys_state *css)
+{
+	return css->ss == NULL;
+}
+
 /*
- * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ * Helper functions for rstat per CPU locks.
  *
  * This makes it easier to diagnose locking issues and contention in
  * production environments. The parameter @fast_path determine the
@@ -36,12 +45,12 @@  unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
 	bool contended;
 
 	/*
-	 * The _irqsave() is needed because cgroup_rstat_lock is
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-	 * this lock with the _irq() suffix only disables interrupts on
-	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-	 * interrupts on both configurations. The _irqsave() ensures
-	 * that interrupts are always disabled and later restored.
+	 * The _irqsave() is needed because the locks used for flushing are
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
+	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
+	 * kernel. The raw_spinlock_t below disables interrupts on both
+	 * configurations. The _irqsave() ensures that interrupts are always
+	 * disabled and later restored.
 	 */
 	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
 	if (contended) {
@@ -87,7 +96,7 @@  __bpf_kfunc void cgroup_rstat_updated(
 		struct cgroup_subsys_state *css, int cpu)
 {
 	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+	raw_spinlock_t *cpu_lock;
 	unsigned long flags;
 
 	/*
@@ -101,6 +110,12 @@  __bpf_kfunc void cgroup_rstat_updated(
 	if (data_race(cgroup_rstat_cpu(css, cpu)->updated_next))
 		return;
 
+	if (is_base_css(css))
+		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
+	else
+		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
+			css->ss->id;
+
 	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
 	/* put @css and all ancestors on the corresponding updated lists */
@@ -208,11 +223,17 @@  static struct cgroup_subsys_state *cgroup_rstat_updated_list(
 		struct cgroup_subsys_state *root, int cpu)
 {
 	struct cgroup *cgrp = root->cgroup;
-	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
 	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
 	struct cgroup_subsys_state *head = NULL, *parent, *child;
+	raw_spinlock_t *cpu_lock;
 	unsigned long flags;
 
+	if (is_base_css(root))
+		cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
+	else
+		cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
+			root->ss->id;
+
 	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, false);
 
 	/* Return NULL if this subtree is not on-list */
@@ -315,7 +336,7 @@  static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
 	struct cgroup *cgrp = css->cgroup;
 	int cpu;
 
-	lockdep_assert_held(&cgroup_rstat_lock);
+	lockdep_assert_held(&lock);
 
 	for_each_possible_cpu(cpu) {
 		struct cgroup_subsys_state *pos;
@@ -356,12 +377,18 @@  static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
 __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	spinlock_t *lock;
+
+	if (is_base_css(css))
+		lock = &cgroup_rstat_base_lock;
+	else
+		lock = &cgroup_rstat_subsys_lock[css->ss->id];
 
 	might_sleep();
 
-	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
-	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
-	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
+	__cgroup_rstat_lock(lock, cgrp, -1);
+	cgroup_rstat_flush_locked(css, lock);
+	__cgroup_rstat_unlock(lock, cgrp, -1);
 }
 
 /**
@@ -376,10 +403,16 @@  __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
 void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	spinlock_t *lock;
+
+	if (is_base_css(css))
+		lock = &cgroup_rstat_base_lock;
+	else
+		lock = &cgroup_rstat_subsys_lock[css->ss->id];
 
 	might_sleep();
-	__cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
-	cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
+	__cgroup_rstat_lock(lock, cgrp, -1);
+	cgroup_rstat_flush_locked(css, lock);
 }
 
 /**
@@ -389,7 +422,14 @@  void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
 void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
-	__cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
+	spinlock_t *lock;
+
+	if (is_base_css(css))
+		lock = &cgroup_rstat_base_lock;
+	else
+		lock = &cgroup_rstat_subsys_lock[css->ss->id];
+
+	__cgroup_rstat_unlock(lock, cgrp, -1);
 }
 
 int cgroup_rstat_init(struct cgroup_subsys_state *css)
@@ -435,10 +475,21 @@  void cgroup_rstat_exit(struct cgroup_subsys_state *css)
 
 void __init cgroup_rstat_boot(void)
 {
-	int cpu;
+	struct cgroup_subsys *ss;
+	int cpu, ssid;
 
-	for_each_possible_cpu(cpu)
-		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+	for_each_subsys(ss, ssid) {
+		spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
+	}
+
+	for_each_possible_cpu(cpu) {
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
+
+		for_each_subsys(ss, ssid) {
+			raw_spin_lock_init(
+					per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
+		}
+	}
 }
 
 /*