
[1/2] pcpcntr: add group allocation/free

Message ID: 20230821202829.2163744-2-mjguzik@gmail.com (mailing list archive)
State: New
Series: execve scalability issues, part 1

Commit Message

Mateusz Guzik Aug. 21, 2023, 8:28 p.m. UTC
Allocations and frees are globally serialized on the pcpu lock (and the
CPU hotplug lock if enabled, which is the case on Debian).

At least one frequent consumer allocates 4 back-to-back counters (and
frees them in the same manner), exacerbating the problem.

While this does not fully remedy scalability issues, it is a step
towards that goal and provides immediate relief.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 include/linux/percpu_counter.h | 19 ++++++++---
 lib/percpu_counter.c           | 61 ++++++++++++++++++++++++----------
 2 files changed, 57 insertions(+), 23 deletions(-)
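
For illustration, a consumer which currently initializes several counters
back to back could switch to the batch API along these lines (hypothetical
struct and function names, not part of this patch):

	struct hypothetical_stats {
		struct percpu_counter events[4];
	};

	static int hypothetical_stats_init(struct hypothetical_stats *s)
	{
		/* one percpu allocation (and one pass under the hotplug
		 * lock) instead of four */
		return percpu_counter_init_many(s->events, 0, GFP_KERNEL, 4);
	}

	static void hypothetical_stats_destroy(struct hypothetical_stats *s)
	{
		percpu_counter_destroy_many(s->events, 4);
	}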

Comments

Vegard Nossum Aug. 22, 2023, 1:37 p.m. UTC | #1
Testing out a review style with very detailed comments. Let me know if
you hate it. Review notes:

On 8/21/23 22:28, Mateusz Guzik wrote:
> Allocations and frees are globally serialized on the pcpu lock (and the
> CPU hotplug lock if enabled, which is the case on Debian).
> 
> At least one frequent consumer allocates 4 back-to-back counters (and
> frees them in the same manner), exacerbating the problem.
> 
> While this does not fully remedy scalability issues, it is a step
> towards that goal and provides immediate relief.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>   include/linux/percpu_counter.h | 19 ++++++++---
>   lib/percpu_counter.c           | 61 ++++++++++++++++++++++++----------
>   2 files changed, 57 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 75b73c83bc9d..ff5850b07124 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -30,17 +30,26 @@ struct percpu_counter {
>   
>   extern int percpu_counter_batch;
>   
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> -			  struct lock_class_key *key);
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> +			  struct lock_class_key *key, u32 count);

renaming and adding a u32 count argument

>   
> -#define percpu_counter_init(fbc, value, gfp)				\
> +#define percpu_counter_init_many(fbc, value, gfp, count)		\

adding a count argument

>   	({								\
>   		static struct lock_class_key __key;			\
>   									\
> -		__percpu_counter_init(fbc, value, gfp, &__key);		\
> +		__percpu_counter_init_many(fbc, value, gfp, &__key, count);\

renaming and passing count along

>   	})
>   
> -void percpu_counter_destroy(struct percpu_counter *fbc);
> +
> +#define percpu_counter_init(fbc, value, gfp)				\
> +	percpu_counter_init_many(fbc, value, gfp, 1)
> +
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count);
> +static inline void percpu_counter_destroy(struct percpu_counter *fbc)
> +{
> +	percpu_counter_destroy_many(fbc, 1);
> +}
> +

wrappers for the count == 1 case

>   void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
>   void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
>   			      s32 batch);
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index 5004463c4f9f..2a33cf23df55 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -151,48 +151,73 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
>   }
>   EXPORT_SYMBOL(__percpu_counter_sum);
>   
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> -			  struct lock_class_key *key)
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> +			  struct lock_class_key *key, u32 count)
>   {
>   	unsigned long flags __maybe_unused;
> +	s32 __percpu *counters;
> +	u32 i;
>   
> -	raw_spin_lock_init(&fbc->lock);
> -	lockdep_set_class(&fbc->lock, key);
> -	fbc->count = amount;
> -	fbc->counters = alloc_percpu_gfp(s32, gfp);
> -	if (!fbc->counters)
> +	counters = __alloc_percpu_gfp(sizeof(*counters) * count,
> +				      sizeof(*counters), gfp);

The second argument here is the alignment. I see other callers using
__alignof__(type), which is what alloc_percpu_gfp() does as well. In
practice I think it shouldn't matter, but for clarity/consistency maybe
this should be __alignof__ as well?
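
i.e. something along the lines of:

	counters = __alloc_percpu_gfp(sizeof(*counters) * count,
				      __alignof__(*counters), gfp);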

Presumably multiplication overflow is not an issue here as it is with
kmalloc and friends since the count can't be controlled by userspace.

> +	if (!counters) {
> +		fbc[0].counters = NULL;
>   		return -ENOMEM;
> +	}

Checked that __alloc_percpu_gfp() returns NULL on failure.

Checked that nothing else before this in the function needs cleanup.

In the old code, fbc->count would have gotten initialized but it
shouldn't matter here, I think, as long as the counter is never activated.

I'm not sure why only fbc[0].counters is set to NULL; should this happen
for all of the "count" members? [PS: percpu_counter_destroy_many() below
has a check for fbc[0].counters]

>   
> -	debug_percpu_counter_activate(fbc);
> +	for (i = 0; i < count; i++) {
> +		raw_spin_lock_init(&fbc[i].lock);
> +		lockdep_set_class(&fbc[i].lock, key);
> +#ifdef CONFIG_HOTPLUG_CPU
> +		INIT_LIST_HEAD(&fbc[i].list);
> +#endif
> +		fbc[i].count = amount;
> +		fbc[i].counters = &counters[i];
> +
> +		debug_percpu_counter_activate(&fbc[i]);

Checked that this can't return an error.

> +	}
>   
>   #ifdef CONFIG_HOTPLUG_CPU
> -	INIT_LIST_HEAD(&fbc->list);
>   	spin_lock_irqsave(&percpu_counters_lock, flags);
> -	list_add(&fbc->list, &percpu_counters);
> +	for (i = 0; i < count; i++) {
> +		list_add(&fbc[i].list, &percpu_counters);
> +	}
>   	spin_unlock_irqrestore(&percpu_counters_lock, flags);
>   #endif

each counter is added to the list while the spinlock is held

>   	return 0;

Nothing here can fail after the initial allocation so no cleanup/error
handling is needed before returning.

>   }
> -EXPORT_SYMBOL(__percpu_counter_init);
> +EXPORT_SYMBOL(__percpu_counter_init_many);
>   
> -void percpu_counter_destroy(struct percpu_counter *fbc)
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count)
>   {
>   	unsigned long flags __maybe_unused;
> +	u32 i;
>   
> -	if (!fbc->counters)
> +	if (WARN_ON_ONCE(!fbc))
>   		return;

This change is misleading, but correct; the WARN_ON_ONCE() is newly
added and the old check is modified below:

>   
> -	debug_percpu_counter_deactivate(fbc);
> +	if (!fbc[0].counters)
> +		return;

(this explains why only fbc[0] was NULL-ed out above in the allocation
function...)

> +
> +	for (i = 0; i < count; i++) {
> +		debug_percpu_counter_deactivate(&fbc[i]);
> +	}

Double checked that _activate() was not called in the cases where we
would return early from this function.

>   
>   #ifdef CONFIG_HOTPLUG_CPU
>   	spin_lock_irqsave(&percpu_counters_lock, flags);
> -	list_del(&fbc->list);
> +	for (i = 0; i < count; i++) {
> +		list_del(&fbc[i].list);
> +	}
>   	spin_unlock_irqrestore(&percpu_counters_lock, flags);
>   #endif
> -	free_percpu(fbc->counters);
> -	fbc->counters = NULL;
> +
> +	free_percpu(fbc[0].counters);
> +
> +	for (i = 0; i < count; i++) {
> +		fbc[i].counters = NULL;
> +	}
>   }

Looks correct to me; fbc[0].counters is the actual allocation so only
that gets passed to free_percpu().

> -EXPORT_SYMBOL(percpu_counter_destroy);
> +EXPORT_SYMBOL(percpu_counter_destroy_many);
>   
>   int percpu_counter_batch __read_mostly = 32;
>   EXPORT_SYMBOL(percpu_counter_batch);

Reviewed-by: Vegard Nossum <vegard.nossum@oracle.com>

In summary, my only slight concern is sizeof(*counters) being passed as
the alignment to __alloc_percpu_gfp() when maybe it would be more
appropriate to pass __alignof__() -- not that it makes a difference at
runtime since both are 4 for s32.

One other thing: I find it a bit odd that the "amount" parameter
(initial value) is s64 when the counters themselves are s32. Maybe just
a leftover from an old version?


Vegard
Mateusz Guzik Aug. 22, 2023, 2:06 p.m. UTC | #2
On 8/22/23, Vegard Nossum <vegard.nossum@oracle.com> wrote:
>
> Testing out a review style with very detailed comments. Let me know if
> you hate it. Review notes:
>

I do; it's very noisy and I don't think it adds any value. ;)

If something like this becomes the norm I'm confident people are going
to start skimming responses to their mail, as opposed to reading them.

But then again, I've had serious disagreements with review folks in the past, so...

> On 8/21/23 22:28, Mateusz Guzik wrote:
>> +	counters = __alloc_percpu_gfp(sizeof(*counters) * count,
>> +				      sizeof(*counters), gfp);
>
> The second argument here is the alignment. I see other callers using
> __alignof__(type), which is what alloc_percpu_gfp() does as well. In
> practice I think it shouldn't matter, but for clarity/consistency maybe
> this should be __alignof__ as well?
>

Yeah, I neglected to patch it up after whipping out a PoC.

> Presumably multiplication overflow is not an issue here as it is with
> kmalloc and friends since the count can't be controlled by userspace.
>

I wanted to assert on the count being sensible to begin with, but
there is no general "only assert with debug enabled" macro. Perhaps a
warn_once + bail out would be preferred?
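
e.g. something like this at the top of the init function (just a sketch):

	if (WARN_ON_ONCE(count == 0))
		return -EINVAL;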

>> +	if (!counters) {
>> +		fbc[0].counters = NULL;
>>   		return -ENOMEM;
>> +	}
>
> Checked that __alloc_percpu_gfp() returns NULL on failure.
>
> Checked that nothing else before this in the function needs cleanup.
>
> In the old code, fbc->count would have gotten initialized but it
> shouldn't matter here, I think, as long as the counter is never activated.
>
> I'm not sure why only fbc[0].counters is set to NULL, should this happen
> for all the "count" members? [PS: percpu_counter_destroy_many() below
> has a check for fbc[0].counters]
>

Some consumers looked fishy to me in that they zero the counter prior to
calling the init func.

I added the NULL assignment so that a call to destroy does nothing.

> In summary, my only slight concern is sizeof(*counters) being passed as
> the alignment to __alloc_percpu_gfp() when maybe it would be more
> appropriate to pass __alignof__() -- not that it makes a difference at
> runtime since both are 4 for s32.
>

Agreed, will patch later.

> One other thing: I find it a bit odd that the "amount" parameter
> (initial value) is s64 when the counters themselves are s32. Maybe just
> a leftover from an old version?
>

I don't know the original authors' reasoning, but it seems clear to me
that the per-CPU counts were left int-sized to reduce memory usage and to
limit the deviation between the central counter and the real state.
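
For reference, the SMP variant of the struct keeps the central count as
s64 next to the s32 per-CPU deltas (include/linux/percpu_counter.h,
roughly):

	struct percpu_counter {
		raw_spinlock_t lock;
		s64 count;
	#ifdef CONFIG_HOTPLUG_CPU
		struct list_head list;	/* all percpu_counters are on a list */
	#endif
		s32 __percpu *counters;
	};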
Dennis Zhou Aug. 22, 2023, 5:02 p.m. UTC | #3
Hi Mateusz,

On Mon, Aug 21, 2023 at 10:28:28PM +0200, Mateusz Guzik wrote:
> Allocations and frees are globally serialized on the pcpu lock (and the
> CPU hotplug lock if enabled, which is the case on Debian).
> 
> At least one frequent consumer allocates 4 back-to-back counters (and
> frees them in the same manner), exacerbating the problem.
> 
> While this does not fully remedy scalability issues, it is a step
> towards that goal and provides immediate relief.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>

As I mentioned yesterday, I like this approach because instead of making
percpu smarter, we're batching against the higher level inits which I'm
assuming will have additional synchronization requirements.

Below are mostly style changes to conform and a few nits wrt naming.

> ---
>  include/linux/percpu_counter.h | 19 ++++++++---
>  lib/percpu_counter.c           | 61 ++++++++++++++++++++++++----------
>  2 files changed, 57 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 75b73c83bc9d..ff5850b07124 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -30,17 +30,26 @@ struct percpu_counter {
>  
>  extern int percpu_counter_batch;
>  
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> -			  struct lock_class_key *key);
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> +			  struct lock_class_key *key, u32 count);

1. Can we move count to before the lock_class_key?
2. count is an overloaded term between percpu_counters and
percpu_refcount. Maybe `nr` or `nr_counters` is better?
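
i.e. a prototype along the lines of:

	int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
				       gfp_t gfp, u32 nr_counters,
				       struct lock_class_key *key);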

>  
> -#define percpu_counter_init(fbc, value, gfp)				\
> +#define percpu_counter_init_many(fbc, value, gfp, count)		\
>  	({								\
>  		static struct lock_class_key __key;			\
>  									\
> -		__percpu_counter_init(fbc, value, gfp, &__key);		\
> +		__percpu_counter_init_many(fbc, value, gfp, &__key, count);\
>  	})
>  
> -void percpu_counter_destroy(struct percpu_counter *fbc);
> +
> +#define percpu_counter_init(fbc, value, gfp)				\
> +	percpu_counter_init_many(fbc, value, gfp, 1)
> +
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count);
> +static inline void percpu_counter_destroy(struct percpu_counter *fbc)
> +{
> +	percpu_counter_destroy_many(fbc, 1);
> +}
> +
>  void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
>  void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
>  			      s32 batch);
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index 5004463c4f9f..2a33cf23df55 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -151,48 +151,73 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
>  }
>  EXPORT_SYMBOL(__percpu_counter_sum);
>  
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> -			  struct lock_class_key *key)
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> +			  struct lock_class_key *key, u32 count)
>  {
>  	unsigned long flags __maybe_unused;
> +	s32 __percpu *counters;
> +	u32 i;
>  
> -	raw_spin_lock_init(&fbc->lock);
> -	lockdep_set_class(&fbc->lock, key);
> -	fbc->count = amount;
> -	fbc->counters = alloc_percpu_gfp(s32, gfp);
> -	if (!fbc->counters)
> +	counters = __alloc_percpu_gfp(sizeof(*counters) * count,
> +				      sizeof(*counters), gfp);


So while a bit moot, I think it'd be nice to clarify what we should do
here wrt alignment and batch allocation. There has been a case in the
past where alignment > size.

This is from my batch api implementation:
+       /*
+        * When allocating a batch we need to align the size so that each
+        * element in the batch will have the appropriate alignment.
+        */
+       size = ALIGN(size, align);

So I think some variation of:
    element_size = ALIGN(sizeof(*counters), __alignof__(*counters));
    counters = __alloc_percpu_gfp(nr * element_size, __alignof__(*counters), gfp);

While this isn't necessary here for s32's, I think it would be nice to
just set the precedent so we preserve alignment asks for future users if
they use this as a template.

> +	if (!counters) {
> +		fbc[0].counters = NULL;
>  		return -ENOMEM;
> +	}
>  
> -	debug_percpu_counter_activate(fbc);
> +	for (i = 0; i < count; i++) {
> +		raw_spin_lock_init(&fbc[i].lock);
> +		lockdep_set_class(&fbc[i].lock, key);
> +#ifdef CONFIG_HOTPLUG_CPU
> +		INIT_LIST_HEAD(&fbc[i].list);
> +#endif
> +		fbc[i].count = amount;
> +		fbc[i].counters = &counters[i];

This would have to become some (void *) math tho with element_size.
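
e.g. something like:

	fbc[i].counters = (void __percpu *)counters + i * element_size;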

> +
> +		debug_percpu_counter_activate(&fbc[i]);
> +	}
>  
>  #ifdef CONFIG_HOTPLUG_CPU
> -	INIT_LIST_HEAD(&fbc->list);
>  	spin_lock_irqsave(&percpu_counters_lock, flags);
> -	list_add(&fbc->list, &percpu_counters);
> +	for (i = 0; i < count; i++) {
> +		list_add(&fbc[i].list, &percpu_counters);
> +	}

nit: we don't add {} for single line loops.

>  	spin_unlock_irqrestore(&percpu_counters_lock, flags);
>  #endif
>  	return 0;
>  }
> -EXPORT_SYMBOL(__percpu_counter_init);
> +EXPORT_SYMBOL(__percpu_counter_init_many);
>  
> -void percpu_counter_destroy(struct percpu_counter *fbc)
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count)
>  {
>  	unsigned long flags __maybe_unused;
> +	u32 i;
>  
> -	if (!fbc->counters)
> +	if (WARN_ON_ONCE(!fbc))
>  		return;
>  
> -	debug_percpu_counter_deactivate(fbc);
> +	if (!fbc[0].counters)
> +		return;
> +
> +	for (i = 0; i < count; i++) {
> +		debug_percpu_counter_deactivate(&fbc[i]);
> +	}
>  

nit: no {}.

>  #ifdef CONFIG_HOTPLUG_CPU
>  	spin_lock_irqsave(&percpu_counters_lock, flags);
> -	list_del(&fbc->list);
> +	for (i = 0; i < count; i++) {
> +		list_del(&fbc[i].list);
> +	}

nit: no {}.

>  	spin_unlock_irqrestore(&percpu_counters_lock, flags);
>  #endif
> -	free_percpu(fbc->counters);
> -	fbc->counters = NULL;
> +
> +	free_percpu(fbc[0].counters);
> +
> +	for (i = 0; i < count; i++) {
> +		fbc[i].counters = NULL;
> +	}

nit: no {}.

>  }
> -EXPORT_SYMBOL(percpu_counter_destroy);
> +EXPORT_SYMBOL(percpu_counter_destroy_many);
>  
>  int percpu_counter_batch __read_mostly = 32;
>  EXPORT_SYMBOL(percpu_counter_batch);
> -- 
> 2.39.2
> 

Thanks,
Dennis

Patch

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 75b73c83bc9d..ff5850b07124 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -30,17 +30,26 @@  struct percpu_counter {
 
 extern int percpu_counter_batch;
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key);
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
+			  struct lock_class_key *key, u32 count);
 
-#define percpu_counter_init(fbc, value, gfp)				\
+#define percpu_counter_init_many(fbc, value, gfp, count)		\
 	({								\
 		static struct lock_class_key __key;			\
 									\
-		__percpu_counter_init(fbc, value, gfp, &__key);		\
+		__percpu_counter_init_many(fbc, value, gfp, &__key, count);\
 	})
 
-void percpu_counter_destroy(struct percpu_counter *fbc);
+
+#define percpu_counter_init(fbc, value, gfp)				\
+	percpu_counter_init_many(fbc, value, gfp, 1)
+
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count);
+static inline void percpu_counter_destroy(struct percpu_counter *fbc)
+{
+	percpu_counter_destroy_many(fbc, 1);
+}
+
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 5004463c4f9f..2a33cf23df55 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -151,48 +151,73 @@  s64 __percpu_counter_sum(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key)
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
+			  struct lock_class_key *key, u32 count)
 {
 	unsigned long flags __maybe_unused;
+	s32 __percpu *counters;
+	u32 i;
 
-	raw_spin_lock_init(&fbc->lock);
-	lockdep_set_class(&fbc->lock, key);
-	fbc->count = amount;
-	fbc->counters = alloc_percpu_gfp(s32, gfp);
-	if (!fbc->counters)
+	counters = __alloc_percpu_gfp(sizeof(*counters) * count,
+				      sizeof(*counters), gfp);
+	if (!counters) {
+		fbc[0].counters = NULL;
 		return -ENOMEM;
+	}
 
-	debug_percpu_counter_activate(fbc);
+	for (i = 0; i < count; i++) {
+		raw_spin_lock_init(&fbc[i].lock);
+		lockdep_set_class(&fbc[i].lock, key);
+#ifdef CONFIG_HOTPLUG_CPU
+		INIT_LIST_HEAD(&fbc[i].list);
+#endif
+		fbc[i].count = amount;
+		fbc[i].counters = &counters[i];
+
+		debug_percpu_counter_activate(&fbc[i]);
+	}
 
 #ifdef CONFIG_HOTPLUG_CPU
-	INIT_LIST_HEAD(&fbc->list);
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_add(&fbc->list, &percpu_counters);
+	for (i = 0; i < count; i++) {
+		list_add(&fbc[i].list, &percpu_counters);
+	}
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(__percpu_counter_init_many);
 
-void percpu_counter_destroy(struct percpu_counter *fbc)
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count)
 {
 	unsigned long flags __maybe_unused;
+	u32 i;
 
-	if (!fbc->counters)
+	if (WARN_ON_ONCE(!fbc))
 		return;
 
-	debug_percpu_counter_deactivate(fbc);
+	if (!fbc[0].counters)
+		return;
+
+	for (i = 0; i < count; i++) {
+		debug_percpu_counter_deactivate(&fbc[i]);
+	}
 
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_del(&fbc->list);
+	for (i = 0; i < count; i++) {
+		list_del(&fbc[i].list);
+	}
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
-	free_percpu(fbc->counters);
-	fbc->counters = NULL;
+
+	free_percpu(fbc[0].counters);
+
+	for (i = 0; i < count; i++) {
+		fbc[i].counters = NULL;
+	}
 }
-EXPORT_SYMBOL(percpu_counter_destroy);
+EXPORT_SYMBOL(percpu_counter_destroy_many);
 
 int percpu_counter_batch __read_mostly = 32;
 EXPORT_SYMBOL(percpu_counter_batch);