diff mbox series

[v2,9/9] mm: vmalloc: Set nr_nodes/node_size based on CPU-cores

Message ID 20230829081142.3619-10-urezki@gmail.com (mailing list archive)
State New
Headers show
Series Mitigate a vmap lock contention v2 | expand

Commit Message

Uladzislau Rezki Aug. 29, 2023, 8:11 a.m. UTC
The density ratio is set to 2, i.e. two users (CPUs) per node.
For example, if there are 6 cores in a system, "nr_nodes"
is 3.

On 64-bit systems the "node_size" also scales with the number of
possible CPUs. Its upper limit is hard-coded to SZ_4M.

For 32-bit and single/dual-core systems, access to the global
vmap heap is not balanced. Such small systems do not suffer
from lock contention because they have so few CPU cores.

Test on AMD Ryzen Threadripper 3970X 32-Core Processor:
sudo ./test_vmalloc.sh run_test_mask=127 nr_threads=64

<default perf>
 94.17%     0.90%  [kernel]    [k] _raw_spin_lock
 93.27%    93.05%  [kernel]    [k] native_queued_spin_lock_slowpath
 74.69%     0.25%  [kernel]    [k] __vmalloc_node_range
 72.64%     0.01%  [kernel]    [k] __get_vm_area_node
 72.04%     0.89%  [kernel]    [k] alloc_vmap_area
 42.17%     0.00%  [kernel]    [k] vmalloc
 32.53%     0.00%  [kernel]    [k] __vmalloc_node
 24.91%     0.25%  [kernel]    [k] vfree
 24.32%     0.01%  [kernel]    [k] remove_vm_area
 22.63%     0.21%  [kernel]    [k] find_unlink_vmap_area
 15.51%     0.00%  [unknown]   [k] 0xffffffffc09a74ac
 14.35%     0.00%  [kernel]    [k] ret_from_fork_asm
 14.35%     0.00%  [kernel]    [k] ret_from_fork
 14.35%     0.00%  [kernel]    [k] kthread
<default perf>
   vs
<patch-series perf>
 74.32%     2.42%  [kernel]    [k] __vmalloc_node_range
 69.58%     0.01%  [kernel]    [k] vmalloc
 54.21%     1.17%  [kernel]    [k] __alloc_pages_bulk
 48.13%    47.91%  [kernel]    [k] clear_page_orig
 43.60%     0.01%  [unknown]   [k] 0xffffffffc082f16f
 32.06%     0.00%  [kernel]    [k] ret_from_fork_asm
 32.06%     0.00%  [kernel]    [k] ret_from_fork
 32.06%     0.00%  [kernel]    [k] kthread
 31.30%     0.00%  [unknown]   [k] 0xffffffffc082f889
 22.98%     4.16%  [kernel]    [k] vfree
 14.36%     0.28%  [kernel]    [k] __get_vm_area_node
 13.43%     3.35%  [kernel]    [k] alloc_vmap_area
 10.86%     0.04%  [kernel]    [k] remove_vm_area
  8.89%     2.75%  [kernel]    [k] _raw_spin_lock
  7.19%     0.00%  [unknown]   [k] 0xffffffffc082fba3
  6.65%     1.37%  [kernel]    [k] free_unref_page
  6.13%     6.11%  [kernel]    [k] native_queued_spin_lock_slowpath
<patch-series perf>

This comparison confirms that the native_queued_spin_lock_slowpath
bottleneck is negligible with the patch series applied.

The throughput is ~16x higher (first run: default kernel, second run: patch series):

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=127 nr_threads=64
Run the test with following parameters: run_test_mask=127 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    24m3.305s
user    0m0.361s
sys     0m0.013s
urezki@pc638:~$

urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=127 nr_threads=64
Run the test with following parameters: run_test_mask=127 nr_threads=64
Done.
Check the kernel ring buffer to see the summary.

real    1m28.382s
user    0m0.014s
sys     0m0.026s
urezki@pc638:~$

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

Comments

Baoquan He Sept. 15, 2023, 1:03 p.m. UTC | #1
On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
......
> real    1m28.382s
> user    0m0.014s
> sys     0m0.026s
> urezki@pc638:~$
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  mm/vmalloc.c | 26 ++++++++++++++++++++++++++
>  1 file changed, 26 insertions(+)

LGTM,

Reviewed-by: Baoquan He <bhe@redhat.com>

> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 9cce012aecdb..08990f630c21 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -796,6 +796,9 @@ struct vmap_node {
>  	atomic_t fill_in_progress;
>  };
>  
> +#define MAX_NODES U8_MAX
> +#define MAX_NODE_SIZE SZ_4M
> +
>  static struct vmap_node *nodes, snode;
>  static __read_mostly unsigned int nr_nodes = 1;
>  static __read_mostly unsigned int node_size = 1;
> @@ -4803,11 +4806,24 @@ static void vmap_init_free_space(void)
>  	}
>  }
>  
> +static unsigned int calculate_nr_nodes(void)
> +{
> +	unsigned int nr_cpus;
> +
> +	nr_cpus = num_present_cpus();
> +	if (nr_cpus <= 1)
> +		nr_cpus = num_possible_cpus();
> +
> +	/* Density factor. Two users per a node. */
> +	return clamp_t(unsigned int, nr_cpus >> 1, 1, MAX_NODES);
> +}
> +
>  static void vmap_init_nodes(void)
>  {
>  	struct vmap_node *vn;
>  	int i;
>  
> +	nr_nodes = calculate_nr_nodes();
>  	nodes = &snode;
>  
>  	if (nr_nodes > 1) {
> @@ -4830,6 +4846,16 @@ static void vmap_init_nodes(void)
>  		INIT_LIST_HEAD(&vn->free.head);
>  		spin_lock_init(&vn->free.lock);
>  	}
> +
> +	/*
> +	 * Scale a node size to number of CPUs. Each power of two
> +	 * value doubles a node size. A high-threshold limit is set
> +	 * to 4M.
> +	 */
> +#if BITS_PER_LONG == 64
> +	if (nr_nodes > 1)
> +		node_size = min(SZ_64K << fls(num_possible_cpus()), SZ_4M);
> +#endif
>  }
>  
>  void __init vmalloc_init(void)
> -- 
> 2.30.2
>
Uladzislau Rezki Sept. 15, 2023, 6:31 p.m. UTC | #2
On Fri, Sep 15, 2023 at 09:03:29PM +0800, Baoquan He wrote:
> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> ......
> > real    1m28.382s
> > user    0m0.014s
> > sys     0m0.026s
> > urezki@pc638:~$
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  mm/vmalloc.c | 26 ++++++++++++++++++++++++++
> >  1 file changed, 26 insertions(+)
> 
> LGTM,
> 
> Reviewed-by: Baoquan He <bhe@redhat.com>
> 
Applied. Thank you!

--
Uladzislau Rezki
diff mbox series

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9cce012aecdb..08990f630c21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -796,6 +796,9 @@  struct vmap_node {
 	atomic_t fill_in_progress;
 };
 
+#define MAX_NODES U8_MAX
+#define MAX_NODE_SIZE SZ_4M
+
 static struct vmap_node *nodes, snode;
 static __read_mostly unsigned int nr_nodes = 1;
 static __read_mostly unsigned int node_size = 1;
@@ -4803,11 +4806,24 @@  static void vmap_init_free_space(void)
 	}
 }
 
+static unsigned int calculate_nr_nodes(void)
+{
+	unsigned int nr_cpus;
+
+	nr_cpus = num_present_cpus();
+	if (nr_cpus <= 1)
+		nr_cpus = num_possible_cpus();
+
+	/* Density factor. Two users per a node. */
+	return clamp_t(unsigned int, nr_cpus >> 1, 1, MAX_NODES);
+}
+
 static void vmap_init_nodes(void)
 {
 	struct vmap_node *vn;
 	int i;
 
+	nr_nodes = calculate_nr_nodes();
 	nodes = &snode;
 
 	if (nr_nodes > 1) {
@@ -4830,6 +4846,16 @@  static void vmap_init_nodes(void)
 		INIT_LIST_HEAD(&vn->free.head);
 		spin_lock_init(&vn->free.lock);
 	}
+
+	/*
+	 * Scale a node size to number of CPUs. Each power of two
+	 * value doubles a node size. A high-threshold limit is set
+	 * to 4M.
+	 */
+#if BITS_PER_LONG == 64
+	if (nr_nodes > 1)
+		node_size = min(SZ_64K << fls(num_possible_cpus()), SZ_4M);
+#endif
 }
 
 void __init vmalloc_init(void)