diff mbox series

[v2,3/3] mm/slub: Use percpu partial free counter

Message ID 1597061872-58724-4-git-send-email-xlpang@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series mm/slub: Fix count_partial() problem | expand

Commit Message

Xunlei Pang Aug. 10, 2020, 12:17 p.m. UTC
The only concern of introducing partial counter is that,
partial_free_objs may cause atomic operation contention
in case of same SLUB concurrent __slab_free().

This patch changes it to be a percpu counter to avoid that.

Co-developed-by: Wen Yang <wenyang@linux.alibaba.com>
Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com>
---
 mm/slab.h |  2 +-
 mm/slub.c | 38 +++++++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 8 deletions(-)

Comments

Shu Ming March 2, 2021, 5:56 a.m. UTC | #1
On Mon, Aug 10, 2020 at 8:22 PM Xunlei Pang <xlpang@linux.alibaba.com> wrote:

>  static inline void
> @@ -2429,12 +2439,12 @@ static unsigned long partial_counter(struct kmem_cache_node *n,
>         unsigned long ret = 0;
>
>         if (item == PARTIAL_FREE) {
> -               ret = atomic_long_read(&n->partial_free_objs);
> +               ret = get_partial_free(n);
>         } else if (item == PARTIAL_TOTAL) {
>                 ret = atomic_long_read(&n->partial_total_objs);
>         } else if (item == PARTIAL_INUSE) {
>                 ret = atomic_long_read(&n->partial_total_objs) -
> -               atomic_long_read(&n->partial_free_objs);
> +                               get_partial_free(n);

Is it "ret = get_partial_free(n);" above?


>                 if ((long)ret < 0)
>                         ret = 0;
>         }
Christoph Lameter March 2, 2021, 9:14 a.m. UTC | #2
On Mon, 10 Aug 2020, Xunlei Pang wrote:

>
> diff --git a/mm/slab.h b/mm/slab.h
> index c85e2fa..a709a70 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -616,7 +616,7 @@ struct kmem_cache_node {
>  #ifdef CONFIG_SLUB
>  	unsigned long nr_partial;
>  	struct list_head partial;
> -	atomic_long_t partial_free_objs;
> +	atomic_long_t __percpu *partial_free_objs;

A percpu counter is never atomic. Just use unsigned long and use this_cpu
operations for this thing. That should cut down further on the overhead.

> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1775,11 +1775,21 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
>  /*
>   * Management of partially allocated slabs.
>   */
> +static inline long get_partial_free(struct kmem_cache_node *n)
> +{
> +	long nr = 0;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));

this_cpu_read(*n->partial_free_objs)

>  static inline void
>  __update_partial_free(struct kmem_cache_node *n, long delta)
>  {
> -	atomic_long_add(delta, &n->partial_free_objs);
> +	atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));

this_cpu_add()

and so on.
Xunlei Pang March 3, 2021, 1:46 p.m. UTC | #3
On 3/2/21 5:14 PM, Christoph Lameter wrote:
> On Mon, 10 Aug 2020, Xunlei Pang wrote:
> 
>>
>> diff --git a/mm/slab.h b/mm/slab.h
>> index c85e2fa..a709a70 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -616,7 +616,7 @@ struct kmem_cache_node {
>>  #ifdef CONFIG_SLUB
>>  	unsigned long nr_partial;
>>  	struct list_head partial;
>> -	atomic_long_t partial_free_objs;
>> +	atomic_long_t __percpu *partial_free_objs;
> 
> A percpu counter is never atomic. Just use unsigned long and use this_cpu
> operations for this thing. That should cut down further on the overhead.
> 
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -1775,11 +1775,21 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
>>  /*
>>   * Management of partially allocated slabs.
>>   */
>> +static inline long get_partial_free(struct kmem_cache_node *n)
>> +{
>> +	long nr = 0;
>> +	int cpu;
>> +
>> +	for_each_possible_cpu(cpu)
>> +		nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));
> 
> this_cpu_read(*n->partial_free_objs)
> 
>>  static inline void
>>  __update_partial_free(struct kmem_cache_node *n, long delta)
>>  {
>> -	atomic_long_add(delta, &n->partial_free_objs);
>> +	atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
> 
> this_cpu_add()
> 
> and so on.
> 

Thanks, I changed them both to use "unsigned long", and will send v3 out
after our internal performance regression test passes.
Matthew Wilcox March 3, 2021, 2:26 p.m. UTC | #4
On Tue, Mar 02, 2021 at 10:14:53AM +0100, Christoph Lameter wrote:
> On Mon, 10 Aug 2020, Xunlei Pang wrote:
> > -	atomic_long_t partial_free_objs;
> > +	atomic_long_t __percpu *partial_free_objs;
> 
> A percpu counter is never atomic. Just use unsigned long and use this_cpu
> operations for this thing. That should cut down further on the overhead.

What about allocations from interrupt context?  Should this be a local_t
instead?
Christoph Lameter March 3, 2021, 7:15 p.m. UTC | #5
On Wed, 3 Mar 2021, Matthew Wilcox wrote:

> On Tue, Mar 02, 2021 at 10:14:53AM +0100, Christoph Lameter wrote:
> > On Mon, 10 Aug 2020, Xunlei Pang wrote:
> > > -	atomic_long_t partial_free_objs;
> > > +	atomic_long_t __percpu *partial_free_objs;
> >
> > A percpu counter is never atomic. Just use unsigned long and use this_cpu
> > operations for this thing. That should cut down further on the overhead.
>
> What about allocations from interrupt context?  Should this be a local_t
> instead?

Can this be allocated in an interrupt context?

And I am not sure how local_t relates to that? Percpu counters can be used
in an interrupt context without the overhead of the address calculations
that are required by a local_t.
Matthew Wilcox March 3, 2021, 7:30 p.m. UTC | #6
On Wed, Mar 03, 2021 at 08:15:58PM +0100, Christoph Lameter wrote:
> On Wed, 3 Mar 2021, Matthew Wilcox wrote:
> 
> > On Tue, Mar 02, 2021 at 10:14:53AM +0100, Christoph Lameter wrote:
> > > On Mon, 10 Aug 2020, Xunlei Pang wrote:
> > > > -	atomic_long_t partial_free_objs;
> > > > +	atomic_long_t __percpu *partial_free_objs;
> > >
> > > A percpu counter is never atomic. Just use unsigned long and use this_cpu
> > > operations for this thing. That should cut down further on the overhead.
> >
> > What about allocations from interrupt context?  Should this be a local_t
> > instead?
> 
> Can this be allocated in an interrupt context?
> 
> And I am not sure how local_t relates to that? Percpu counters can be used
> in an interrupt context without the overhead of the address calculations
> that are required by a local_t.

As I understand the patch, this counts the number of partially free slabs.
So if we start to free an object from a completely full slab in process
context, as "load x, add one to x, store x" and take an interrupt
between loading x and adding one to x, that interrupt handler might
free a different object from another completely full slab.  that would
also load the same x, add one to it and store x, but then the process
context would add one to the old x, overwriting the updated value from
interrupt context.

it's not the likeliest of races, and i don't know how important it is
that these counters remain accurate.  but using a local_t instead of
a percpu long would fix the problem.  i don't know why you think that
a local_t needs "address calculations".  perhaps you've misremembered
what a local_t is?
Christoph Lameter March 3, 2021, 7:55 p.m. UTC | #7
On Wed, 3 Mar 2021, Matthew Wilcox wrote:

> > Can this be allocated in an interrupt context?
> >
> > And I am not sure how local_t relates to that? Percpu counters can be used
> > in an interrupt context without the overhead of the address calculations
> > that are required by a local_t.
>
> As I understand the patch, this counts the number of partially free slabs.
> So if we start to free an object from a completely full slab in process
> context, as "load x, add one to x, store x" and take an interrupt
> between loading x and adding one to x, that interrupt handler might
> free a different object from another completely full slab.  that would
> also load the same x, add one to it and store x, but then the process
> context would add one to the old x, overwriting the updated value from
> interrupt context.

this_cpu operations are "atomic" vs. preemption but on some platforms not
vs interrupts. That could be an issue in kmem_cache_free(). This would
need a modification to the relevant this_cpu ops so that interrupts are
disabled on those platforms.

Like this_cpu_inc_irq() or so?


> it's not the likeliest of races, and i don't know how important it is
> that these counters remain accurate.  but using a local_t instead of
> a percpu long would fix the problem.  i don't know why you think that
> a local_t needs "address calculations".  perhaps you've misremembered
> what a local_t is?

local_t does not include the address arithmetic that the this_cpu
operation can implicitly perform on x86 f.e. with an segment register or
maybe another register on another platform thereby avoiding the need to
disable preemption or interrupts.

Therefore a manual calculation of the target address for a local_t
operation needs to be done beforehand which usually means disabling
interrupts and/or preemption for the code segment. Otherwise we may end up
on a different processor due to scheduler or other interruptions and use
the percpu counter value of a different processor which could be racy.
Matthew Wilcox March 3, 2021, 8:16 p.m. UTC | #8
On Wed, Mar 03, 2021 at 08:55:48PM +0100, Christoph Lameter wrote:
> On Wed, 3 Mar 2021, Matthew Wilcox wrote:
> 
> > > Can this be allocated in an interrupt context?
> > >
> > > And I am not sure how local_t relates to that? Percpu counters can be used
> > > in an interrupt context without the overhead of the address calculations
> > > that are required by a local_t.
> >
> > As I understand the patch, this counts the number of partially free slabs.
> > So if we start to free an object from a completely full slab in process
> > context, as "load x, add one to x, store x" and take an interrupt
> > between loading x and adding one to x, that interrupt handler might
> > free a different object from another completely full slab.  that would
> > also load the same x, add one to it and store x, but then the process
> > context would add one to the old x, overwriting the updated value from
> > interrupt context.
> 
> this_cpu operations are "atomic" vs. preemption but on some platforms not
> vs interrupts. That could be an issue in kmem_cache_free(). This would
> need a modification to the relevant this_cpu ops so that interrupts are
> disabled on those platforms.

Hmmmm ... re-reading the documentation, it says that this_cpu_x is
atomic against interrupts:

These operations can be used without worrying about
preemption and interrupts::
[...]
        this_cpu_add(pcp, val)
        this_cpu_inc(pcp)
...
diff mbox series

Patch

diff --git a/mm/slab.h b/mm/slab.h
index c85e2fa..a709a70 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -616,7 +616,7 @@  struct kmem_cache_node {
 #ifdef CONFIG_SLUB
 	unsigned long nr_partial;
 	struct list_head partial;
-	atomic_long_t partial_free_objs;
+	atomic_long_t __percpu *partial_free_objs;
 	atomic_long_t partial_total_objs;
 #ifdef CONFIG_SLUB_DEBUG
 	atomic_long_t nr_slabs;
diff --git a/mm/slub.c b/mm/slub.c
index 25a4421..f6fc60b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1775,11 +1775,21 @@  static void discard_slab(struct kmem_cache *s, struct page *page)
 /*
  * Management of partially allocated slabs.
  */
+static inline long get_partial_free(struct kmem_cache_node *n)
+{
+	long nr = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));
+
+	return nr;
+}
 
 static inline void
 __update_partial_free(struct kmem_cache_node *n, long delta)
 {
-	atomic_long_add(delta, &n->partial_free_objs);
+	atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
 }
 
 static inline void
@@ -2429,12 +2439,12 @@  static unsigned long partial_counter(struct kmem_cache_node *n,
 	unsigned long ret = 0;
 
 	if (item == PARTIAL_FREE) {
-		ret = atomic_long_read(&n->partial_free_objs);
+		ret = get_partial_free(n);
 	} else if (item == PARTIAL_TOTAL) {
 		ret = atomic_long_read(&n->partial_total_objs);
 	} else if (item == PARTIAL_INUSE) {
 		ret = atomic_long_read(&n->partial_total_objs) -
-		atomic_long_read(&n->partial_free_objs);
+				get_partial_free(n);
 		if ((long)ret < 0)
 			ret = 0;
 	}
@@ -3390,19 +3400,28 @@  static inline int calculate_order(unsigned int size)
 	return -ENOSYS;
 }
 
-static void
+static int
 init_kmem_cache_node(struct kmem_cache_node *n)
 {
+	int cpu;
+
 	n->nr_partial = 0;
 	spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
-	atomic_long_set(&n->partial_free_objs, 0);
+
+	n->partial_free_objs = alloc_percpu(atomic_long_t);
+	if (!n->partial_free_objs)
+		return -ENOMEM;
+	for_each_possible_cpu(cpu)
+		atomic_long_set(per_cpu_ptr(n->partial_free_objs, cpu), 0);
 	atomic_long_set(&n->partial_total_objs, 0);
 #ifdef CONFIG_SLUB_DEBUG
 	atomic_long_set(&n->nr_slabs, 0);
 	atomic_long_set(&n->total_objects, 0);
 	INIT_LIST_HEAD(&n->full);
 #endif
+
+	return 0;
 }
 
 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
@@ -3463,7 +3482,7 @@  static void early_kmem_cache_node_alloc(int node)
 	page->inuse = 1;
 	page->frozen = 0;
 	kmem_cache_node->node[node] = n;
-	init_kmem_cache_node(n);
+	BUG_ON(init_kmem_cache_node(n) < 0);
 	inc_slabs_node(kmem_cache_node, node, page->objects);
 
 	/*
@@ -3481,6 +3500,7 @@  static void free_kmem_cache_nodes(struct kmem_cache *s)
 
 	for_each_kmem_cache_node(s, node, n) {
 		s->node[node] = NULL;
+		free_percpu(n->partial_free_objs);
 		kmem_cache_free(kmem_cache_node, n);
 	}
 }
@@ -3511,7 +3531,11 @@  static int init_kmem_cache_nodes(struct kmem_cache *s)
 			return 0;
 		}
 
-		init_kmem_cache_node(n);
+		if (init_kmem_cache_node(n) < 0) {
+			free_kmem_cache_nodes(s);
+			return 0;
+		}
+
 		s->node[node] = n;
 	}
 	return 1;