
[v3] mm/vmalloc: lock contention optimization under multi-threading

Message ID 20240221032905.11392-1-rulin.huang@intel.com (mailing list archive)
State New
Series [v3] mm/vmalloc: lock contention optimization under multi-threading

Commit Message

Huang, Rulin Feb. 21, 2024, 3:29 a.m. UTC
When allocating a new memory area where the mapping address range is
known, the vmap_area lock is acquired twice.
The first acquisition occurs in alloc_vmap_area(), when the vmap area
is inserted into the vm mapping red-black tree. The second occurs in
setup_vmalloc_vm(), when the properties of the vm, such as its flags
and address, are updated.
Combine these two operations in alloc_vmap_area(); this eliminates the
need to take the lock twice and improves scalability when the
vmap_area lock is contended.
With this change, tested on an Intel Icelake platform (160 vCPUs,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot were measured with stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
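
For reference, a minimal before/after sketch of the locking pattern
(based on the hunks in this patch, not a verbatim excerpt):

Before, two acquisitions per allocation:

	/* in alloc_vmap_area() */
	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	/* later, in setup_vmalloc_vm() */
	spin_lock(&vmap_area_lock);
	setup_vmalloc_vm_locked(vm, va, flags, caller);
	spin_unlock(&vmap_area_lock);

After, a single acquisition; the vm fields are filled in while the
area is not yet visible to other CPUs (as in __get_vm_area_node()):

	setup_vmalloc_vm(area, va, flags, caller);	/* lock no longer taken here */
	insert_vmap_area_with_lock(va);			/* the only acquisition */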

Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
Reviewed-by: King Colin <colin.king@intel.com>
Signed-off-by: rulinhuang <rulin.huang@intel.com>
---
V1 -> V2: Avoided the partial initialization issue of vm and 
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
---
 mm/vmalloc.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)


base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d

Comments

Uladzislau Rezki Feb. 21, 2024, 8:36 a.m. UTC | #1
On Tue, Feb 20, 2024 at 10:29:05PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, the vmap_area lock is acquired twice.
> The first acquisition occurs in alloc_vmap_area(), when the vmap area
> is inserted into the vm mapping red-black tree. The second occurs in
> setup_vmalloc_vm(), when the properties of the vm, such as its flags
> and address, are updated.
> Combine these two operations in alloc_vmap_area(); this eliminates the
> need to take the lock twice and improves scalability when the
> vmap_area lock is contended.
> With this change, tested on an Intel Icelake platform (160 vCPUs,
> kernel v6.7), a 6% performance improvement and a 7% reduction in the
> overall spinlock hotspot were measured with stress-ng/pthread
> (https://github.com/ColinIanKing/stress-ng), which stress-tests thread
> creation.
> 
> Reviewed-by: Chen Tim C <tim.c.chen@intel.com>
> Reviewed-by: King Colin <colin.king@intel.com>
> Signed-off-by: rulinhuang <rulin.huang@intel.com>
> ---
> V1 -> V2: Avoided the partial initialization issue of vm and 
> separated insert_vmap_area() from alloc_vmap_area()
> V2 -> V3: Rebased on 6.8-rc5
> ---
>  mm/vmalloc.c | 36 +++++++++++++++++++++---------------
>  1 file changed, 21 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c17..768e45f2ed94 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	va->vm = NULL;
>  	va->flags = va_flags;
>  
> -	spin_lock(&vmap_area_lock);
> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> -	spin_unlock(&vmap_area_lock);
> -
>  	BUG_ON(!IS_ALIGNED(va->va_start, align));
>  	BUG_ON(va->va_start < vstart);
>  	BUG_ON(va->va_end > vend);
>  
>  	ret = kasan_populate_vmalloc(addr, size);
>  	if (ret) {
> -		free_vmap_area(va);
> +		/*
> +		 * Insert/Merge it back to the free tree/list.
> +		 */
> +		spin_lock(&free_vmap_area_lock);
> +		merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
> +		spin_unlock(&free_vmap_area_lock);
>  		return ERR_PTR(ret);
>  	}
>  
> @@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	return ERR_PTR(-EBUSY);
>  }
>  
> +static inline void insert_vmap_area_with_lock(struct vmap_area *va)
> +{
> +	spin_lock(&vmap_area_lock);
> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +	spin_unlock(&vmap_area_lock);
> +}
> +
>  int register_vmap_purge_notifier(struct notifier_block *nb)
>  {
>  	return blocking_notifier_chain_register(&vmap_notify_list, nb);
> @@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
>  		return ERR_CAST(va);
>  	}
>  
> +	insert_vmap_area_with_lock(va);
> +
>  	vaddr = vmap_block_vaddr(va->va_start, 0);
>  	spin_lock_init(&vb->lock);
>  	vb->va = va;
> @@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
>  		if (IS_ERR(va))
>  			return NULL;
>  
> +		insert_vmap_area_with_lock(va);
> +
>  		addr = va->va_start;
>  		mem = (void *)addr;
>  	}
> @@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
>  	}
>  }
>  
> -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> +static inline void setup_vmalloc_vm(struct vm_struct *vm,
>  	struct vmap_area *va, unsigned long flags, const void *caller)
>  {
>  	vm->flags = flags;
> @@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
>  	va->vm = vm;
>  }
>  
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> -			      unsigned long flags, const void *caller)
> -{
> -	spin_lock(&vmap_area_lock);
> -	setup_vmalloc_vm_locked(vm, va, flags, caller);
> -	spin_unlock(&vmap_area_lock);
> -}
> -
>  static void clear_vm_uninitialized_flag(struct vm_struct *vm)
>  {
>  	/*
> @@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>  
>  	setup_vmalloc_vm(area, va, flags, caller);
>  
> +	insert_vmap_area_with_lock(va);
> +
>  	/*
>  	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
>  	 * best-effort approach, as they can be mapped outside of vmalloc code.
> @@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  	for (area = 0; area < nr_vms; area++) {
>  		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
>  
> -		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> +		setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
>  				 pcpu_get_vm_areas);
>  	}
>  	spin_unlock(&vmap_area_lock);
> 
> base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
> -- 
> 2.43.0
> 
Spreading insert_vmap_area_with_lock() across several callers, like
__get_vm_area_node(), new_vmap_block(), vm_map_ram(), etc., is not a
good approach, simply because it changes the behaviour and people
might miss this point.
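
To illustrate the risk with a hypothetical caller (not code from the
patch), every future user of alloc_vmap_area() now has to remember an
extra step:

	va = alloc_vmap_area(size, align, vstart, vend, node, gfp_mask, 0);
	if (IS_ERR(va))
		return NULL;

	/*
	 * Easy to forget after this change: without this call the area
	 * is never inserted into the busy tree/list, so lookups such as
	 * find_vmap_area() will not see it.
	 */
	insert_vmap_area_with_lock(va);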

Could you please re-spin it on mm-unstable, because the vmalloc code
has changed a lot? From my side I can check and help you find a better
way to fix it, because v3 should be improved anyway.

Apparently I have not seen your messages for some reason; I do not
understand why. I started to get emails with the topic below:

"Bounce probe for linux-kernel@vger.kernel.org (no action required)"

--
Uladzislau Rezki
Huang, Rulin Feb. 22, 2024, 12:10 p.m. UTC | #2
Hi Uladzislau and Andrew, we have rebased it (patch v4) on the
mm-unstable branch and remeasured. Could you kindly help confirm
whether this is the right base to work on?
Compared with the previous result on kernel v6.7, where we saw a 5%
performance gain on Intel Icelake (160 vCPUs), we only saw a 0.6% gain
on this commit base. But we think our modification still has some
significance. On the one hand, it does remove one critical section. On
the other hand, we see a 4% performance gain on Intel Sapphire Rapids
(224 vCPUs), which suggests a larger improvement is likely as core
counts grow into the hundreds or even thousands.
Thank you again for your comments.
Uladzislau Rezki Feb. 22, 2024, 12:52 p.m. UTC | #3
Hello, Rulinhuang!

> Hi Uladzislau and Andrew, we have rebased it (patch v4) on the
> mm-unstable branch and remeasured. Could you kindly help confirm
> whether this is the right base to work on?
> Compared with the previous result on kernel v6.7, where we saw a 5%
> performance gain on Intel Icelake (160 vCPUs), we only saw a 0.6% gain
> on this commit base. But we think our modification still has some
> significance. On the one hand, it does remove one critical section. On
> the other hand, we see a 4% performance gain on Intel Sapphire Rapids
> (224 vCPUs), which suggests a larger improvement is likely as core
> counts grow into the hundreds or even thousands.
> Thank you again for your comments.
>
According to the patch, that was a correct rebase. Right, the small
delta on your 160 CPUs is because of removing a contention. On bigger
systems the impact is larger, as you point out with your 224 vCPU
results, where you see a 4% perf improvement.

So we should fix it. But the way it is fixed is not optimal from my
point of view, because the patch in question spreads the internals of
alloc_vmap_area(), such as inserting the busy area, across many parts
now.

--
Uladzislau Rezki
Baoquan He Feb. 22, 2024, 3:36 p.m. UTC | #4
On 02/22/24 at 01:52pm, Uladzislau Rezki wrote:
> Hello, Rulinhuang!
> 
> > Hi Uladzislau and Andrew, we have rebased it (patch v4) on the
> > mm-unstable branch and remeasured. Could you kindly help confirm
> > whether this is the right base to work on?
> > Compared with the previous result on kernel v6.7, where we saw a 5%
> > performance gain on Intel Icelake (160 vCPUs), we only saw a 0.6% gain
> > on this commit base. But we think our modification still has some
> > significance. On the one hand, it does remove one critical section. On
> > the other hand, we see a 4% performance gain on Intel Sapphire Rapids
> > (224 vCPUs), which suggests a larger improvement is likely as core
> > counts grow into the hundreds or even thousands.
> > Thank you again for your comments.
> >
> According to the patch, that was a correct rebase. Right, the small
> delta on your 160 CPUs is because of removing a contention. On bigger
> systems the impact is larger, as you point out with your 224 vCPU
> results, where you see a 4% perf improvement.
>
> So we should fix it. But the way it is fixed is not optimal from my
> point of view, because the patch in question spreads the internals of
> alloc_vmap_area(), such as inserting the busy area, across many parts
> now.

I happened to walk into this thread and came up with a draft patch.
Please help check whether it looks OK.

From 0112e39b3a8454a288e1bcece220c4599bac5326 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Thu, 22 Feb 2024 23:26:59 +0800
Subject: [PATCH] mm/vmalloc.c: avoid repeatedly acquiring the lock unnecessarily
Content-type: text/plain

By moving setup_vmalloc_vm() into alloc_vmap_area(), we can avoid
acquiring the busy lock a second time within a short interval.

Signed-off-by: Baoquan He <bhe@redhat.com>
---
 mm/vmalloc.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aeee71349157..6bda3c06b484 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1848,7 +1848,10 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 				unsigned long align,
 				unsigned long vstart, unsigned long vend,
 				int node, gfp_t gfp_mask,
-				unsigned long va_flags)
+				unsigned long va_flags,
+				struct vm_struct *vm,
+				unsigned long vm_flags,
+				const void *caller)
 {
 	struct vmap_node *vn;
 	struct vmap_area *va;
@@ -1915,6 +1918,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 
 	spin_lock(&vn->busy.lock);
 	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+	if (!(va_flags & VMAP_RAM) && vm)
+		setup_vmalloc_vm(vm, va, vm_flags, caller);
 	spin_unlock(&vn->busy.lock);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -2947,7 +2952,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
 }
 
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
 	struct vmap_area *va, unsigned long flags, const void *caller)
 {
 	vm->flags = flags;
@@ -2957,16 +2962,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 	va->vm = vm;
 }
 
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
-			      unsigned long flags, const void *caller)
-{
-	struct vmap_node *vn = addr_to_node(va->va_start);
-
-	spin_lock(&vn->busy.lock);
-	setup_vmalloc_vm_locked(vm, va, flags, caller);
-	spin_unlock(&vn->busy.lock);
-}
-
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 {
 	/*
@@ -3009,8 +3004,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 		return NULL;
 	}
 
-	setup_vmalloc_vm(area, va, flags, caller);
-
 	/*
 	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
 	 * best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4586,7 +4579,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 
 		spin_lock(&vn->busy.lock);
 		insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
-		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+		setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
 				 pcpu_get_vm_areas);
 		spin_unlock(&vn->busy.lock);
 	}
Huang, Rulin Feb. 23, 2024, 1:09 p.m. UTC | #5
> 
> Hello, Rulinhuang!
> 
> > Hi Uladzislau and Andrew, we have rebased it (patch v4) on the
> > mm-unstable branch and remeasured. Could you kindly help confirm
> > whether this is the right base to work on?
> > Compared with the previous result on kernel v6.7, where we saw a 5%
> > performance gain on Intel Icelake (160 vCPUs), we only saw a 0.6% gain
> > on this commit base. But we think our modification still has some
> > significance. On the one hand, it does remove one critical section. On
> > the other hand, we see a 4% performance gain on Intel Sapphire Rapids
> > (224 vCPUs), which suggests a larger improvement is likely as core
> > counts grow into the hundreds or even thousands.
> > Thank you again for your comments.
> >
> According to the patch, that was a correct rebase. Right, the small
> delta on your 160 CPUs is because of removing a contention. On bigger
> systems the impact is larger, as you point out with your 224 vCPU
> results, where you see a 4% perf improvement.
>
> So we should fix it. But the way it is fixed is not optimal from my
> point of view, because the patch in question spreads the internals of
> alloc_vmap_area(), such as inserting the busy area, across many parts
> now.
> 
> --
> Uladzislau Rezki

Our modification in patch v5 not only achieves the original effect but
also avoids splitting alloc_vmap_area(): setup_vmalloc_vm() is called
without the lock held, so the critical section is not lengthened.
Without splitting alloc_vmap_area(), putting setup_vmalloc_vm()
directly into it is all we can think of.
Regarding Baoquan's changes, we think that:
We prefer that setup_vmalloc_vm() not be placed inside the critical
section, since there is no need to lengthen it.
We prefer checking (vm_data) rather than
(!(va_flags & VMAP_RAM) && vm); that is enough to determine the
condition for the assignment, and the extra judgement on va_flags
seems roundabout (see the sketch below).
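
To make the preference concrete, a minimal sketch of the shape we have
in mind (names borrowed from Baoquan's draft above, not the actual v5
code; only the pointer is checked, without the va_flags test):

	/* Fill in the vm fields before the area becomes visible ... */
	if (vm)
		setup_vmalloc_vm(vm, va, vm_flags, caller);

	/* ... so the busy-tree insertion stays a short critical section. */
	spin_lock(&vn->busy.lock);
	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
	spin_unlock(&vn->busy.lock);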
Hi Uladzislau, could you please let us know if you have any better 
suggestions on the modification scheme?
Thank you for your advice!

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..768e45f2ed94 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1630,17 +1630,18 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->vm = NULL;
 	va->flags = va_flags;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
-
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
 	BUG_ON(va->va_end > vend);
 
 	ret = kasan_populate_vmalloc(addr, size);
 	if (ret) {
-		free_vmap_area(va);
+		/*
+		 * Insert/Merge it back to the free tree/list.
+		 */
+		spin_lock(&free_vmap_area_lock);
+		merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+		spin_unlock(&free_vmap_area_lock);
 		return ERR_PTR(ret);
 	}
 
@@ -1669,6 +1670,13 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 	return ERR_PTR(-EBUSY);
 }
 
+static inline void insert_vmap_area_with_lock(struct vmap_area *va)
+{
+	spin_lock(&vmap_area_lock);
+	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+	spin_unlock(&vmap_area_lock);
+}
+
 int register_vmap_purge_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2045,6 +2053,8 @@  static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 		return ERR_CAST(va);
 	}
 
+	insert_vmap_area_with_lock(va);
+
 	vaddr = vmap_block_vaddr(va->va_start, 0);
 	spin_lock_init(&vb->lock);
 	vb->va = va;
@@ -2398,6 +2408,8 @@  void *vm_map_ram(struct page **pages, unsigned int count, int node)
 		if (IS_ERR(va))
 			return NULL;
 
+		insert_vmap_area_with_lock(va);
+
 		addr = va->va_start;
 		mem = (void *)addr;
 	}
@@ -2538,7 +2550,7 @@  static void vmap_init_free_space(void)
 	}
 }
 
-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
 	struct vmap_area *va, unsigned long flags, const void *caller)
 {
 	vm->flags = flags;
@@ -2548,14 +2560,6 @@  static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 	va->vm = vm;
 }
 
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
-			      unsigned long flags, const void *caller)
-{
-	spin_lock(&vmap_area_lock);
-	setup_vmalloc_vm_locked(vm, va, flags, caller);
-	spin_unlock(&vmap_area_lock);
-}
-
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
 {
 	/*
@@ -2600,6 +2604,8 @@  static struct vm_struct *__get_vm_area_node(unsigned long size,
 
 	setup_vmalloc_vm(area, va, flags, caller);
 
+	insert_vmap_area_with_lock(va);
+
 	/*
 	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
 	 * best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4166,7 +4172,7 @@  struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	for (area = 0; area < nr_vms; area++) {
 		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
 
-		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+		setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
 				 pcpu_get_vm_areas);
 	}
 	spin_unlock(&vmap_area_lock);