diff mbox series

[v2,4/9] mm: vmalloc: Remove global vmap_area_root rb-tree

Message ID 20230829081142.3619-5-urezki@gmail.com (mailing list archive)
State New
Headers show
Series Mitigate a vmap lock contention v2 | expand

Commit Message

Uladzislau Rezki Aug. 29, 2023, 8:11 a.m. UTC
Store allocated objects in separate nodes. A va->va_start
address is converted into a correct node where it should
be placed and resided. An addr_to_node() function is used
to do a proper address conversion to determine a node that
contains a VA.

Such an approach balances VAs across nodes; as a result, access
becomes scalable. Number of nodes in a system depends on number
of CPUs divided by two. The density factor in this case is 1/2.

Please note:

1. As of now allocated VAs are bound to a node-0. It means the
   patch does not give any difference comparing with a current
   behavior;

2. The global vmap_area_lock, vmap_area_root are removed as there
   is no need in it anymore. The vmap_area_list is still kept and
   is _empty_. It is exported for a kexec only;

3. The vmallocinfo and vread() have to be reworked to be able to
   handle multiple nodes.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 209 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 161 insertions(+), 48 deletions(-)

Comments

kernel test robot Aug. 29, 2023, 2:30 p.m. UTC | #1
Hi Uladzislau,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on linus/master v6.5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Uladzislau-Rezki-Sony/mm-vmalloc-Add-va_alloc-helper/20230829-161248
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20230829081142.3619-5-urezki%40gmail.com
patch subject: [PATCH v2 4/9] mm: vmalloc: Remove global vmap_area_root rb-tree
config: csky-randconfig-r024-20230829 (https://download.01.org/0day-ci/archive/20230829/202308292228.RRrGUYyB-lkp@intel.com/config)
compiler: csky-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20230829/202308292228.RRrGUYyB-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202308292228.RRrGUYyB-lkp@intel.com/

All warnings (new ones prefixed by >>):

   mm/vmalloc.c: In function 'vmap_init_free_space':
>> mm/vmalloc.c:4506:45: warning: ordered comparison of pointer with integer zero [-Wextra]
    4506 |                 if (busy->addr - vmap_start > 0) {
         |                                             ^


vim +4506 mm/vmalloc.c

  4491	
  4492	static void vmap_init_free_space(void)
  4493	{
  4494		unsigned long vmap_start = 1;
  4495		const unsigned long vmap_end = ULONG_MAX;
  4496		struct vmap_area *free;
  4497		struct vm_struct *busy;
  4498	
  4499		/*
  4500		 *     B     F     B     B     B     F
  4501		 * -|-----|.....|-----|-----|-----|.....|-
  4502		 *  |           The KVA space           |
  4503		 *  |<--------------------------------->|
  4504		 */
  4505		for (busy = vmlist; busy; busy = busy->next) {
> 4506			if (busy->addr - vmap_start > 0) {
  4507				free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
  4508				if (!WARN_ON_ONCE(!free)) {
  4509					free->va_start = vmap_start;
  4510					free->va_end = (unsigned long) busy->addr;
  4511	
  4512					insert_vmap_area_augment(free, NULL,
  4513						&free_vmap_area_root,
  4514							&free_vmap_area_list);
  4515				}
  4516			}
  4517	
  4518			vmap_start = (unsigned long) busy->addr + busy->size;
  4519		}
  4520	
  4521		if (vmap_end - vmap_start > 0) {
  4522			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
  4523			if (!WARN_ON_ONCE(!free)) {
  4524				free->va_start = vmap_start;
  4525				free->va_end = vmap_end;
  4526	
  4527				insert_vmap_area_augment(free, NULL,
  4528					&free_vmap_area_root,
  4529						&free_vmap_area_list);
  4530			}
  4531		}
  4532	}
  4533
Uladzislau Rezki Aug. 30, 2023, 2:48 p.m. UTC | #2
On Tue, Aug 29, 2023 at 10:30:19PM +0800, kernel test robot wrote:
> Hi Uladzislau,
> 
> kernel test robot noticed the following build warnings:
> 
> [auto build test WARNING on akpm-mm/mm-everything]
> [also build test WARNING on linus/master v6.5]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Uladzislau-Rezki-Sony/mm-vmalloc-Add-va_alloc-helper/20230829-161248
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link:    https://lore.kernel.org/r/20230829081142.3619-5-urezki%40gmail.com
> patch subject: [PATCH v2 4/9] mm: vmalloc: Remove global vmap_area_root rb-tree
> config: csky-randconfig-r024-20230829 (https://download.01.org/0day-ci/archive/20230829/202308292228.RRrGUYyB-lkp@intel.com/config)
> compiler: csky-linux-gcc (GCC) 13.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20230829/202308292228.RRrGUYyB-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202308292228.RRrGUYyB-lkp@intel.com/
> 
> All warnings (new ones prefixed by >>):
> 
>    mm/vmalloc.c: In function 'vmap_init_free_space':
> >> mm/vmalloc.c:4506:45: warning: ordered comparison of pointer with integer zero [-Wextra]
>     4506 |                 if (busy->addr - vmap_start > 0) {
>          |                                             ^
>
Right. I will fix it. We should cast the busy->addr to unsigned long.

--
Uladzislau Rezki
Baoquan He Sept. 7, 2023, 2:17 a.m. UTC | #3
Add Kazu and Lianbo to CC, and kexec mailing list

On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> Store allocated objects in a separate nodes. A va->va_start
> address is converted into a correct node where it should
> be placed and resided. An addr_to_node() function is used
> to do a proper address conversion to determine a node that
> contains a VA.
> 
> Such approach balances VAs across nodes as a result an access
> becomes scalable. Number of nodes in a system depends on number
> of CPUs divided by two. The density factor in this case is 1/2.
> 
> Please note:
> 
> 1. As of now allocated VAs are bound to a node-0. It means the
>    patch does not give any difference comparing with a current
>    behavior;
> 
> 2. The global vmap_area_lock, vmap_area_root are removed as there
>    is no need in it anymore. The vmap_area_list is still kept and
>    is _empty_. It is exported for a kexec only;

I haven't taken a test, while accessing all nodes' busy tree to get
va of the lowest address could severely impact kcore reading efficiency
on system with many vmap nodes. People doing live debugging via
/proc/kcore will get a little surprise.

An empty vmap_area_list will break the makedumpfile utility, and the
Crash utility could be impacted too. I checked the makedumpfile code;
it relies on vmap_area_list to deduce the vmalloc_start value.

> 
> 3. The vmallocinfo and vread() have to be reworked to be able to
>    handle multiple nodes.
Baoquan He Sept. 7, 2023, 9:38 a.m. UTC | #4
On 09/07/23 at 10:17am, Baoquan He wrote:
> Add Kazu and Lianbo to CC, and kexec mailing list
> 
> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > Store allocated objects in a separate nodes. A va->va_start
> > address is converted into a correct node where it should
> > be placed and resided. An addr_to_node() function is used
> > to do a proper address conversion to determine a node that
> > contains a VA.
> > 
> > Such approach balances VAs across nodes as a result an access
> > becomes scalable. Number of nodes in a system depends on number
> > of CPUs divided by two. The density factor in this case is 1/2.
> > 
> > Please note:
> > 
> > 1. As of now allocated VAs are bound to a node-0. It means the
> >    patch does not give any difference comparing with a current
> >    behavior;
> > 
> > 2. The global vmap_area_lock, vmap_area_root are removed as there
> >    is no need in it anymore. The vmap_area_list is still kept and
> >    is _empty_. It is exported for a kexec only;
> 
> I haven't taken a test, while accessing all nodes' busy tree to get
> va of the lowest address could severely impact kcore reading efficiency
> on system with many vmap nodes. People doing live debugging via
> /proc/kcore will get a little surprise.
> 
> Empty vmap_area_list will break makedumpfile utility, Crash utility
> could be impactd too. I checked makedumpfile code, it relys on
> vmap_area_list to deduce the vmalloc_start value. 

Except for the empty vmap_area_list, this patch looks good to me.

We may need to think of another way to export the vmalloc_start value or
deduce it in the makedumpfile/Crash utilities, and then remove the useless
vmap_area_list. I am not sure if we should remove vmap_area_list in this
patch because the empty value will cause breakage anyway. Otherwise,

Reviewed-by: Baoquan He <bhe@redhat.com>

> 
> > 
> > 3. The vmallocinfo and vread() have to be reworked to be able to
> >    handle multiple nodes.
>
Uladzislau Rezki Sept. 7, 2023, 9:39 a.m. UTC | #5
On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
> Add Kazu and Lianbo to CC, and kexec mailing list
> 
> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > Store allocated objects in a separate nodes. A va->va_start
> > address is converted into a correct node where it should
> > be placed and resided. An addr_to_node() function is used
> > to do a proper address conversion to determine a node that
> > contains a VA.
> > 
> > Such approach balances VAs across nodes as a result an access
> > becomes scalable. Number of nodes in a system depends on number
> > of CPUs divided by two. The density factor in this case is 1/2.
> > 
> > Please note:
> > 
> > 1. As of now allocated VAs are bound to a node-0. It means the
> >    patch does not give any difference comparing with a current
> >    behavior;
> > 
> > 2. The global vmap_area_lock, vmap_area_root are removed as there
> >    is no need in it anymore. The vmap_area_list is still kept and
> >    is _empty_. It is exported for a kexec only;
> 
> I haven't taken a test, while accessing all nodes' busy tree to get
> va of the lowest address could severely impact kcore reading efficiency
> on system with many vmap nodes. People doing live debugging via
> /proc/kcore will get a little surprise.
> 
>
> Empty vmap_area_list will break makedumpfile utility, Crash utility
> could be impactd too. I checked makedumpfile code, it relys on
> vmap_area_list to deduce the vmalloc_start value. 
>
That is the remaining part, and I hope to fix it in v3. The problem here
is that we cannot allow access to vmap internals from outside. This is
just not correct, i.e. you are not allowed to access the list directly.

--
Uladzislau Rezki
Uladzislau Rezki Sept. 7, 2023, 9:40 a.m. UTC | #6
On Thu, Sep 07, 2023 at 05:38:07PM +0800, Baoquan He wrote:
> On 09/07/23 at 10:17am, Baoquan He wrote:
> > Add Kazu and Lianbo to CC, and kexec mailing list
> > 
> > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > Store allocated objects in a separate nodes. A va->va_start
> > > address is converted into a correct node where it should
> > > be placed and resided. An addr_to_node() function is used
> > > to do a proper address conversion to determine a node that
> > > contains a VA.
> > > 
> > > Such approach balances VAs across nodes as a result an access
> > > becomes scalable. Number of nodes in a system depends on number
> > > of CPUs divided by two. The density factor in this case is 1/2.
> > > 
> > > Please note:
> > > 
> > > 1. As of now allocated VAs are bound to a node-0. It means the
> > >    patch does not give any difference comparing with a current
> > >    behavior;
> > > 
> > > 2. The global vmap_area_lock, vmap_area_root are removed as there
> > >    is no need in it anymore. The vmap_area_list is still kept and
> > >    is _empty_. It is exported for a kexec only;
> > 
> > I haven't taken a test, while accessing all nodes' busy tree to get
> > va of the lowest address could severely impact kcore reading efficiency
> > on system with many vmap nodes. People doing live debugging via
> > /proc/kcore will get a little surprise.
> > 
> > Empty vmap_area_list will break makedumpfile utility, Crash utility
> > could be impactd too. I checked makedumpfile code, it relys on
> > vmap_area_list to deduce the vmalloc_start value. 
> 
> Except of the empty vmap_area_list, this patch looks good to me.
> 
> We may need think of another way to export the vmalloc_start value or
> deduce it in makedumpfile/Crash utility. And then remove the useless
> vmap_area_list. I am not sure if we should remove vmap_area_list in this
> patch because the empty value will cause breakage anyway. Otherwise,
> 
> Reviewed-by: Baoquan He <bhe@redhat.com>
> 
Thanks for the review!

--
Uladzislau Rezki
Baoquan He Sept. 7, 2023, 9:58 a.m. UTC | #7
On 09/07/23 at 11:39am, Uladzislau Rezki wrote:
> On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
> > Add Kazu and Lianbo to CC, and kexec mailing list
> > 
> > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > Store allocated objects in a separate nodes. A va->va_start
> > > address is converted into a correct node where it should
> > > be placed and resided. An addr_to_node() function is used
> > > to do a proper address conversion to determine a node that
> > > contains a VA.
> > > 
> > > Such approach balances VAs across nodes as a result an access
> > > becomes scalable. Number of nodes in a system depends on number
> > > of CPUs divided by two. The density factor in this case is 1/2.
> > > 
> > > Please note:
> > > 
> > > 1. As of now allocated VAs are bound to a node-0. It means the
> > >    patch does not give any difference comparing with a current
> > >    behavior;
> > > 
> > > 2. The global vmap_area_lock, vmap_area_root are removed as there
> > >    is no need in it anymore. The vmap_area_list is still kept and
> > >    is _empty_. It is exported for a kexec only;
> > 
> > I haven't taken a test, while accessing all nodes' busy tree to get
> > va of the lowest address could severely impact kcore reading efficiency
> > on system with many vmap nodes. People doing live debugging via
> > /proc/kcore will get a little surprise.
> > 
> >
> > Empty vmap_area_list will break makedumpfile utility, Crash utility
> > could be impactd too. I checked makedumpfile code, it relys on
> > vmap_area_list to deduce the vmalloc_start value. 
> >
> It is left part and i hope i fix it in v3. The problem here is
> we can not give an opportunity to access to vmap internals from
> outside. This is just not correct, i.e. you are not allowed to
> access the list directly.

Right. Thanks for the fix in v3, that is a relief of makedumpfile and
crash.

Hi Kazu,

Meanwhile, I am thinking if we should evaluate the necessity of
vmap_area_list in makedumpfile and Crash. In makedumpfile, we just use
vmap_area_list to deduce VMALLOC_START. Wondering if we can export
VMALLOC_START directly. Surely, the lowest va->va_start in vmap_area_list
is a tighter low boundary of vmalloc area and can reduce unnecessary
scanning below the lowest va. Not sure if this is the reason people
decided to export vmap_area_list.

Thanks
Baoquan
HAGIO KAZUHITO(萩尾 一仁) Sept. 8, 2023, 1:51 a.m. UTC | #8
On 2023/09/07 18:58, Baoquan He wrote:
> On 09/07/23 at 11:39am, Uladzislau Rezki wrote:
>> On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
>>> Add Kazu and Lianbo to CC, and kexec mailing list
>>>
>>> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
>>>> Store allocated objects in a separate nodes. A va->va_start
>>>> address is converted into a correct node where it should
>>>> be placed and resided. An addr_to_node() function is used
>>>> to do a proper address conversion to determine a node that
>>>> contains a VA.
>>>>
>>>> Such approach balances VAs across nodes as a result an access
>>>> becomes scalable. Number of nodes in a system depends on number
>>>> of CPUs divided by two. The density factor in this case is 1/2.
>>>>
>>>> Please note:
>>>>
>>>> 1. As of now allocated VAs are bound to a node-0. It means the
>>>>     patch does not give any difference comparing with a current
>>>>     behavior;
>>>>
>>>> 2. The global vmap_area_lock, vmap_area_root are removed as there
>>>>     is no need in it anymore. The vmap_area_list is still kept and
>>>>     is _empty_. It is exported for a kexec only;
>>>
>>> I haven't taken a test, while accessing all nodes' busy tree to get
>>> va of the lowest address could severely impact kcore reading efficiency
>>> on system with many vmap nodes. People doing live debugging via
>>> /proc/kcore will get a little surprise.
>>>
>>>
>>> Empty vmap_area_list will break makedumpfile utility, Crash utility
>>> could be impactd too. I checked makedumpfile code, it relys on
>>> vmap_area_list to deduce the vmalloc_start value.
>>>
>> It is left part and i hope i fix it in v3. The problem here is
>> we can not give an opportunity to access to vmap internals from
>> outside. This is just not correct, i.e. you are not allowed to
>> access the list directly.
> 
> Right. Thanks for the fix in v3, that is a relief of makedumpfile and
> crash.
> 
> Hi Kazu,
> 
> Meanwhile, I am thinking if we should evaluate the necessity of
> vmap_area_list in makedumpfile and Crash. In makedumpfile, we just use
> vmap_area_list to deduce VMALLOC_START. Wondering if we can export
> VMALLOC_START directly. Surely, the lowest va->va_start in vmap_area_list
> is a tighter low boundary of vmalloc area and can reduce unnecessary
> scanning below the lowest va. Not sure if this is the reason people
> decided to export vmap_area_list.

The kernel commit acd99dbf5402 introduced the original vmlist entry to 
vmcoreinfo, but there is no information about why it did not export 
VMALLOC_START directly.

If VMALLOC_START is exported directly to vmcoreinfo, I think it would be 
enough for makedumpfile.

Thanks,
Kazu
Baoquan He Sept. 8, 2023, 4:43 a.m. UTC | #9
On 09/08/23 at 01:51am, HAGIO KAZUHITO(萩尾 一仁) wrote:
> On 2023/09/07 18:58, Baoquan He wrote:
> > On 09/07/23 at 11:39am, Uladzislau Rezki wrote:
> >> On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
> >>> Add Kazu and Lianbo to CC, and kexec mailing list
> >>>
> >>> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> >>>> Store allocated objects in a separate nodes. A va->va_start
> >>>> address is converted into a correct node where it should
> >>>> be placed and resided. An addr_to_node() function is used
> >>>> to do a proper address conversion to determine a node that
> >>>> contains a VA.
> >>>>
> >>>> Such approach balances VAs across nodes as a result an access
> >>>> becomes scalable. Number of nodes in a system depends on number
> >>>> of CPUs divided by two. The density factor in this case is 1/2.
> >>>>
> >>>> Please note:
> >>>>
> >>>> 1. As of now allocated VAs are bound to a node-0. It means the
> >>>>     patch does not give any difference comparing with a current
> >>>>     behavior;
> >>>>
> >>>> 2. The global vmap_area_lock, vmap_area_root are removed as there
> >>>>     is no need in it anymore. The vmap_area_list is still kept and
> >>>>     is _empty_. It is exported for a kexec only;
> >>>
> >>> I haven't taken a test, while accessing all nodes' busy tree to get
> >>> va of the lowest address could severely impact kcore reading efficiency
> >>> on system with many vmap nodes. People doing live debugging via
> >>> /proc/kcore will get a little surprise.
> >>>
> >>>
> >>> Empty vmap_area_list will break makedumpfile utility, Crash utility
> >>> could be impactd too. I checked makedumpfile code, it relys on
> >>> vmap_area_list to deduce the vmalloc_start value.
> >>>
> >> It is left part and i hope i fix it in v3. The problem here is
> >> we can not give an opportunity to access to vmap internals from
> >> outside. This is just not correct, i.e. you are not allowed to
> >> access the list directly.
> > 
> > Right. Thanks for the fix in v3, that is a relief of makedumpfile and
> > crash.
> > 
> > Hi Kazu,
> > 
> > Meanwhile, I am thinking if we should evaluate the necessity of
> > vmap_area_list in makedumpfile and Crash. In makedumpfile, we just use
> > vmap_area_list to deduce VMALLOC_START. Wondering if we can export
> > VMALLOC_START directly. Surely, the lowest va->va_start in vmap_area_list
> > is a tighter low boundary of vmalloc area and can reduce unnecessary
> > scanning below the lowest va. Not sure if this is the reason people
> > decided to export vmap_area_list.
> 
> The kernel commit acd99dbf5402 introduced the original vmlist entry to 
> vmcoreinfo, but there is no information about why it did not export 
> VMALLOC_START directly.
> 
> If VMALLOC_START is exported directly to vmcoreinfo, I think it would be 
> enough for makedumpfile.

Thanks for confirmation, Kazu.

Then, below draft patch should be enough to export VMALLOC_START
instead, and remove vmap_area_list. In order to get the base address of
vmalloc area, constructing a vmap_area_list from multiple busy-tree
seems not worth.

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 599e8d3bcbc3..3cb1ea09ff26 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates
 the kernel start address. Used to convert a virtual address from the
 direct kernel map to a physical address.
 
-vmap_area_list
---------------
+VMALLOC_START
+-------------
 
-Stores the virtual area list. makedumpfile gets the vmalloc start value
-from this variable and its value is necessary for vmalloc translation.
+Stores the base address of vmalloc area. makedumpfile gets this value and
+its value is necessary for vmalloc translation.
 
 mem_map
 -------
diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
index 66cde752cd74..2a24199a9b81 100644
--- a/arch/arm64/kernel/crash_core.c
+++ b/arch/arm64/kernel/crash_core.c
@@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void)
 	/* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */
 	vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR);
 	vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c
index 55f1d7856b54..5c39cedd2c5c 100644
--- a/arch/riscv/kernel/crash_core.c
+++ b/arch/riscv/kernel/crash_core.c
@@ -9,7 +9,6 @@ void arch_crash_save_vmcoreinfo(void)
 	VMCOREINFO_NUMBER(phys_ram_base);
 
 	vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..91810b4e9510 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
 /*
  *	Internals.  Don't use..
  */
-extern struct list_head vmap_area_list;
 extern __init void vm_area_add_early(struct vm_struct *vm);
 extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 03a7932cde0a..91af87930770 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -617,7 +617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
 #endif
 	VMCOREINFO_SYMBOL(_stext);
-	VMCOREINFO_SYMBOL(vmap_area_list);
+	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 
 #ifndef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(mem_map);
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index b4cac76ea5e9..8a689b4ff4f9 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -89,7 +89,6 @@ static struct test_item test_items[] = {
 	ITEM_DATA(kallsyms_test_var_data_static),
 	ITEM_DATA(kallsyms_test_var_bss),
 	ITEM_DATA(kallsyms_test_var_data),
-	ITEM_DATA(vmap_area_list),
 #endif
 };
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..8c6686176ebd 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL(follow_pfn);
 
-LIST_HEAD(vmap_area_list);
-
 void vfree(const void *addr)
 {
 	kfree(addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 50d8239b82df..0a02633a9566 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -729,8 +729,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 
 
 static DEFINE_SPINLOCK(free_vmap_area_lock);
-/* Export for kexec only */
-LIST_HEAD(vmap_area_list);
+
 static bool vmap_initialized __read_mostly;
 
 /*
HAGIO KAZUHITO(萩尾 一仁) Sept. 8, 2023, 5:01 a.m. UTC | #10
On 2023/09/08 13:43, Baoquan He wrote:
> On 09/08/23 at 01:51am, HAGIO KAZUHITO(萩尾 一仁) wrote:
>> On 2023/09/07 18:58, Baoquan He wrote:
>>> On 09/07/23 at 11:39am, Uladzislau Rezki wrote:
>>>> On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
>>>>> Add Kazu and Lianbo to CC, and kexec mailing list
>>>>>
>>>>> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
>>>>>> Store allocated objects in a separate nodes. A va->va_start
>>>>>> address is converted into a correct node where it should
>>>>>> be placed and resided. An addr_to_node() function is used
>>>>>> to do a proper address conversion to determine a node that
>>>>>> contains a VA.
>>>>>>
>>>>>> Such approach balances VAs across nodes as a result an access
>>>>>> becomes scalable. Number of nodes in a system depends on number
>>>>>> of CPUs divided by two. The density factor in this case is 1/2.
>>>>>>
>>>>>> Please note:
>>>>>>
>>>>>> 1. As of now allocated VAs are bound to a node-0. It means the
>>>>>>      patch does not give any difference comparing with a current
>>>>>>      behavior;
>>>>>>
>>>>>> 2. The global vmap_area_lock, vmap_area_root are removed as there
>>>>>>      is no need in it anymore. The vmap_area_list is still kept and
>>>>>>      is _empty_. It is exported for a kexec only;
>>>>>
>>>>> I haven't taken a test, while accessing all nodes' busy tree to get
>>>>> va of the lowest address could severely impact kcore reading efficiency
>>>>> on system with many vmap nodes. People doing live debugging via
>>>>> /proc/kcore will get a little surprise.
>>>>>
>>>>>
>>>>> Empty vmap_area_list will break makedumpfile utility, Crash utility
>>>>> could be impactd too. I checked makedumpfile code, it relys on
>>>>> vmap_area_list to deduce the vmalloc_start value.
>>>>>
>>>> It is left part and i hope i fix it in v3. The problem here is
>>>> we can not give an opportunity to access to vmap internals from
>>>> outside. This is just not correct, i.e. you are not allowed to
>>>> access the list directly.
>>>
>>> Right. Thanks for the fix in v3, that is a relief of makedumpfile and
>>> crash.
>>>
>>> Hi Kazu,
>>>
>>> Meanwhile, I am thinking if we should evaluate the necessity of
>>> vmap_area_list in makedumpfile and Crash. In makedumpfile, we just use
>>> vmap_area_list to deduce VMALLOC_START. Wondering if we can export
>>> VMALLOC_START directly. Surely, the lowest va->va_start in vmap_area_list
>>> is a tighter low boundary of vmalloc area and can reduce unnecessary
>>> scanning below the lowest va. Not sure if this is the reason people
>>> decided to export vmap_area_list.
>>
>> The kernel commit acd99dbf5402 introduced the original vmlist entry to
>> vmcoreinfo, but there is no information about why it did not export
>> VMALLOC_START directly.
>>
>> If VMALLOC_START is exported directly to vmcoreinfo, I think it would be
>> enough for makedumpfile.
> 
> Thanks for confirmation, Kazu.
> 
> Then, below draft patch should be enough to export VMALLOC_START
> instead, and remove vmap_area_list. 

also the following entries can be removed.

         VMCOREINFO_OFFSET(vmap_area, va_start);
         VMCOREINFO_OFFSET(vmap_area, list);

Thanks,
Kazu

In order to get the base address of
> vmalloc area, constructing a vmap_area_list from multiple busy-tree
> seems not worth.
> 
> diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> index 599e8d3bcbc3..3cb1ea09ff26 100644
> --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
> +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> @@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates
>   the kernel start address. Used to convert a virtual address from the
>   direct kernel map to a physical address.
>   
> -vmap_area_list
> ---------------
> +VMALLOC_START
> +-------------
>   
> -Stores the virtual area list. makedumpfile gets the vmalloc start value
> -from this variable and its value is necessary for vmalloc translation.
> +Stores the base address of vmalloc area. makedumpfile gets this value and
> +its value is necessary for vmalloc translation.
>   
>   mem_map
>   -------
> diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
> index 66cde752cd74..2a24199a9b81 100644
> --- a/arch/arm64/kernel/crash_core.c
> +++ b/arch/arm64/kernel/crash_core.c
> @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void)
>   	/* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */
>   	vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR);
>   	vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END);
> -	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
>   	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
>   	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
>   	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
> diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c
> index 55f1d7856b54..5c39cedd2c5c 100644
> --- a/arch/riscv/kernel/crash_core.c
> +++ b/arch/riscv/kernel/crash_core.c
> @@ -9,7 +9,6 @@ void arch_crash_save_vmcoreinfo(void)
>   	VMCOREINFO_NUMBER(phys_ram_base);
>   
>   	vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET);
> -	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
>   	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
>   	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
>   	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index c720be70c8dd..91810b4e9510 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
>   /*
>    *	Internals.  Don't use..
>    */
> -extern struct list_head vmap_area_list;
>   extern __init void vm_area_add_early(struct vm_struct *vm);
>   extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
>   
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 03a7932cde0a..91af87930770 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -617,7 +617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
>   	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
>   #endif
>   	VMCOREINFO_SYMBOL(_stext);
> -	VMCOREINFO_SYMBOL(vmap_area_list);
> +	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
>   
>   #ifndef CONFIG_NUMA
>   	VMCOREINFO_SYMBOL(mem_map);
> diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
> index b4cac76ea5e9..8a689b4ff4f9 100644
> --- a/kernel/kallsyms_selftest.c
> +++ b/kernel/kallsyms_selftest.c
> @@ -89,7 +89,6 @@ static struct test_item test_items[] = {
>   	ITEM_DATA(kallsyms_test_var_data_static),
>   	ITEM_DATA(kallsyms_test_var_bss),
>   	ITEM_DATA(kallsyms_test_var_data),
> -	ITEM_DATA(vmap_area_list),
>   #endif
>   };
>   
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 7f9e9e5a0e12..8c6686176ebd 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
>   }
>   EXPORT_SYMBOL(follow_pfn);
>   
> -LIST_HEAD(vmap_area_list);
> -
>   void vfree(const void *addr)
>   {
>   	kfree(addr);
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 50d8239b82df..0a02633a9566 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -729,8 +729,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
>   
>   
>   static DEFINE_SPINLOCK(free_vmap_area_lock);
> -/* Export for kexec only */
> -LIST_HEAD(vmap_area_list);
> +
>   static bool vmap_initialized __read_mostly;
>   
>   /*
Baoquan He Sept. 8, 2023, 6:44 a.m. UTC | #11
On 09/08/23 at 05:01am, HAGIO KAZUHITO(萩尾 一仁) wrote:
> On 2023/09/08 13:43, Baoquan He wrote:
> > On 09/08/23 at 01:51am, HAGIO KAZUHITO(萩尾 一仁) wrote:
> >> On 2023/09/07 18:58, Baoquan He wrote:
> >>> On 09/07/23 at 11:39am, Uladzislau Rezki wrote:
> >>>> On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
> >>>>> Add Kazu and Lianbo to CC, and kexec mailing list
> >>>>>
> >>>>> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> >>>>>> Store allocated objects in a separate nodes. A va->va_start
> >>>>>> address is converted into a correct node where it should
> >>>>>> be placed and resided. An addr_to_node() function is used
> >>>>>> to do a proper address conversion to determine a node that
> >>>>>> contains a VA.
> >>>>>>
> >>>>>> Such approach balances VAs across nodes as a result an access
> >>>>>> becomes scalable. Number of nodes in a system depends on number
> >>>>>> of CPUs divided by two. The density factor in this case is 1/2.
> >>>>>>
> >>>>>> Please note:
> >>>>>>
> >>>>>> 1. As of now allocated VAs are bound to a node-0. It means the
> >>>>>>      patch does not give any difference comparing with a current
> >>>>>>      behavior;
> >>>>>>
> >>>>>> 2. The global vmap_area_lock, vmap_area_root are removed as there
> >>>>>>      is no need in it anymore. The vmap_area_list is still kept and
> >>>>>>      is _empty_. It is exported for a kexec only;
> >>>>>
> >>>>> I haven't taken a test, while accessing all nodes' busy tree to get
> >>>>> va of the lowest address could severely impact kcore reading efficiency
> >>>>> on system with many vmap nodes. People doing live debugging via
> >>>>> /proc/kcore will get a little surprise.
> >>>>>
> >>>>>
> >>>>> Empty vmap_area_list will break makedumpfile utility, Crash utility
> >>>>> could be impactd too. I checked makedumpfile code, it relys on
> >>>>> vmap_area_list to deduce the vmalloc_start value.
> >>>>>
> >>>> It is left part and i hope i fix it in v3. The problem here is
> >>>> we can not give an opportunity to access to vmap internals from
> >>>> outside. This is just not correct, i.e. you are not allowed to
> >>>> access the list directly.
> >>>
> >>> Right. Thanks for the fix in v3, that is a relief of makedumpfile and
> >>> crash.
> >>>
> >>> Hi Kazu,
> >>>
> >>> Meanwhile, I am thinking if we should evaluate the necessity of
> >>> vmap_area_list in makedumpfile and Crash. In makedumpfile, we just use
> >>> vmap_area_list to deduce VMALLOC_START. Wondering if we can export
> >>> VMALLOC_START directly. Surely, the lowest va->va_start in vmap_area_list
> >>> is a tighter low boundary of vmalloc area and can reduce unnecessary
> >>> scanning below the lowest va. Not sure if this is the reason people
> >>> decided to export vmap_area_list.
> >>
> >> The kernel commit acd99dbf5402 introduced the original vmlist entry to
> >> vmcoreinfo, but there is no information about why it did not export
> >> VMALLOC_START directly.
> >>
> >> If VMALLOC_START is exported directly to vmcoreinfo, I think it would be
> >> enough for makedumpfile.
> > 
> > Thanks for confirmation, Kazu.
> > 
> > Then, below draft patch should be enough to export VMALLOC_START
> > instead, and remove vmap_area_list. 
> 
> also the following entries can be removed.
> 
>          VMCOREINFO_OFFSET(vmap_area, va_start);
>          VMCOREINFO_OFFSET(vmap_area, list);

Right, they are useless now. I updated to remove them in below patch.

From a867fada34fd9e96528fcc5e72ae50b3b5685015 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Fri, 8 Sep 2023 11:53:22 +0800
Subject: [PATCH] mm/vmalloc: remove vmap_area_list
Content-type: text/plain

Earlier, vmap_area_list was exported to vmcoreinfo so that makedumpfile
could get the base address of the vmalloc area. Now vmap_area_list is
empty, so export VMALLOC_START to vmcoreinfo instead, and remove
vmap_area_list.

Signed-off-by: Baoquan He <bhe@redhat.com>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 ++++----
 arch/arm64/kernel/crash_core.c                 | 1 -
 arch/riscv/kernel/crash_core.c                 | 1 -
 include/linux/vmalloc.h                        | 1 -
 kernel/crash_core.c                            | 4 +---
 kernel/kallsyms_selftest.c                     | 1 -
 mm/nommu.c                                     | 2 --
 mm/vmalloc.c                                   | 3 +--
 8 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 599e8d3bcbc3..c11bd4b1ceb1 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates
 the kernel start address. Used to convert a virtual address from the
 direct kernel map to a physical address.
 
-vmap_area_list
---------------
+VMALLOC_START
+-------------
 
-Stores the virtual area list. makedumpfile gets the vmalloc start value
-from this variable and its value is necessary for vmalloc translation.
+Stores the base address of the vmalloc area. makedumpfile gets this value
+since it is necessary for vmalloc translation.
 
 mem_map
 -------
diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
index 66cde752cd74..2a24199a9b81 100644
--- a/arch/arm64/kernel/crash_core.c
+++ b/arch/arm64/kernel/crash_core.c
@@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void)
 	/* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */
 	vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR);
 	vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c
index 55f1d7856b54..5c39cedd2c5c 100644
--- a/arch/riscv/kernel/crash_core.c
+++ b/arch/riscv/kernel/crash_core.c
@@ -9,7 +9,6 @@ void arch_crash_save_vmcoreinfo(void)
 	VMCOREINFO_NUMBER(phys_ram_base);
 
 	vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET);
-	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
 	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..91810b4e9510 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
 /*
  *	Internals.  Don't use..
  */
-extern struct list_head vmap_area_list;
 extern __init void vm_area_add_early(struct vm_struct *vm);
 extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 03a7932cde0a..a9faaf7e5f7d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -617,7 +617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
 #endif
 	VMCOREINFO_SYMBOL(_stext);
-	VMCOREINFO_SYMBOL(vmap_area_list);
+	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
 
 #ifndef CONFIG_NUMA
 	VMCOREINFO_SYMBOL(mem_map);
@@ -658,8 +658,6 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(free_area, free_list);
 	VMCOREINFO_OFFSET(list_head, next);
 	VMCOREINFO_OFFSET(list_head, prev);
-	VMCOREINFO_OFFSET(vmap_area, va_start);
-	VMCOREINFO_OFFSET(vmap_area, list);
 	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
 	log_buf_vmcoreinfo_setup();
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index b4cac76ea5e9..8a689b4ff4f9 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -89,7 +89,6 @@ static struct test_item test_items[] = {
 	ITEM_DATA(kallsyms_test_var_data_static),
 	ITEM_DATA(kallsyms_test_var_bss),
 	ITEM_DATA(kallsyms_test_var_data),
-	ITEM_DATA(vmap_area_list),
 #endif
 };
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..8c6686176ebd 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 }
 EXPORT_SYMBOL(follow_pfn);
 
-LIST_HEAD(vmap_area_list);
-
 void vfree(const void *addr)
 {
 	kfree(addr);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 50d8239b82df..0a02633a9566 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -729,8 +729,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 
 
 static DEFINE_SPINLOCK(free_vmap_area_lock);
-/* Export for kexec only */
-LIST_HEAD(vmap_area_list);
+
 static bool vmap_initialized __read_mostly;
 
 /*
Uladzislau Rezki Sept. 8, 2023, 11:25 a.m. UTC | #12
On Fri, Sep 08, 2023 at 02:44:56PM +0800, Baoquan He wrote:
> On 09/08/23 at 05:01am, HAGIO KAZUHITO(萩尾 一仁) wrote:
> > On 2023/09/08 13:43, Baoquan He wrote:
> > > On 09/08/23 at 01:51am, HAGIO KAZUHITO(萩尾 一仁) wrote:
> > >> On 2023/09/07 18:58, Baoquan He wrote:
> > >>> On 09/07/23 at 11:39am, Uladzislau Rezki wrote:
> > >>>> On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
> > >>>>> Add Kazu and Lianbo to CC, and kexec mailing list
> > >>>>>
> > >>>>> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > >>>>>> Store allocated objects in a separate nodes. A va->va_start
> > >>>>>> address is converted into a correct node where it should
> > >>>>>> be placed and resided. An addr_to_node() function is used
> > >>>>>> to do a proper address conversion to determine a node that
> > >>>>>> contains a VA.
> > >>>>>>
> > >>>>>> Such approach balances VAs across nodes as a result an access
> > >>>>>> becomes scalable. Number of nodes in a system depends on number
> > >>>>>> of CPUs divided by two. The density factor in this case is 1/2.
> > >>>>>>
> > >>>>>> Please note:
> > >>>>>>
> > >>>>>> 1. As of now allocated VAs are bound to a node-0. It means the
> > >>>>>>      patch does not give any difference comparing with a current
> > >>>>>>      behavior;
> > >>>>>>
> > >>>>>> 2. The global vmap_area_lock, vmap_area_root are removed as there
> > >>>>>>      is no need in it anymore. The vmap_area_list is still kept and
> > >>>>>>      is _empty_. It is exported for a kexec only;
> > >>>>>
> > >>>>> I haven't taken a test, while accessing all nodes' busy tree to get
> > >>>>> va of the lowest address could severely impact kcore reading efficiency
> > >>>>> on system with many vmap nodes. People doing live debugging via
> > >>>>> /proc/kcore will get a little surprise.
> > >>>>>
> > >>>>>
> > >>>>> Empty vmap_area_list will break makedumpfile utility, Crash utility
> > >>>>> could be impactd too. I checked makedumpfile code, it relys on
> > >>>>> vmap_area_list to deduce the vmalloc_start value.
> > >>>>>
> > >>>> It is left part and i hope i fix it in v3. The problem here is
> > >>>> we can not give an opportunity to access to vmap internals from
> > >>>> outside. This is just not correct, i.e. you are not allowed to
> > >>>> access the list directly.
> > >>>
> > >>> Right. Thanks for the fix in v3, that is a relief of makedumpfile and
> > >>> crash.
> > >>>
> > >>> Hi Kazu,
> > >>>
> > >>> Meanwhile, I am thinking if we should evaluate the necessity of
> > >>> vmap_area_list in makedumpfile and Crash. In makedumpfile, we just use
> > >>> vmap_area_list to deduce VMALLOC_START. Wondering if we can export
> > >>> VMALLOC_START directly. Surely, the lowest va->va_start in vmap_area_list
> > >>> is a tighter low boundary of vmalloc area and can reduce unnecessary
> > >>> scanning below the lowest va. Not sure if this is the reason people
> > >>> decided to export vmap_area_list.
> > >>
> > >> The kernel commit acd99dbf5402 introduced the original vmlist entry to
> > >> vmcoreinfo, but there is no information about why it did not export
> > >> VMALLOC_START directly.
> > >>
> > >> If VMALLOC_START is exported directly to vmcoreinfo, I think it would be
> > >> enough for makedumpfile.
> > > 
> > > Thanks for confirmation, Kazu.
> > > 
> > > Then, below draft patch should be enough to export VMALLOC_START
> > > instead, and remove vmap_area_list. 
> > 
> > also the following entries can be removed.
> > 
> >          VMCOREINFO_OFFSET(vmap_area, va_start);
> >          VMCOREINFO_OFFSET(vmap_area, list);
> 
> Right, they are useless now. I updated to remove them in below patch.
> 
> From a867fada34fd9e96528fcc5e72ae50b3b5685015 Mon Sep 17 00:00:00 2001
> From: Baoquan He <bhe@redhat.com>
> Date: Fri, 8 Sep 2023 11:53:22 +0800
> Subject: [PATCH] mm/vmalloc: remove vmap_area_list
> Content-type: text/plain
> 
> Earlier, vmap_area_list is exported to vmcoreinfo so that makedumpfile
> get the base address of vmalloc area. Now, vmap_area_list is empty, so
> export VMALLOC_START to vmcoreinfo instead, and remove vmap_area_list.
> 
> Signed-off-by: Baoquan He <bhe@redhat.com>
> ---
>  Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 ++++----
>  arch/arm64/kernel/crash_core.c                 | 1 -
>  arch/riscv/kernel/crash_core.c                 | 1 -
>  include/linux/vmalloc.h                        | 1 -
>  kernel/crash_core.c                            | 4 +---
>  kernel/kallsyms_selftest.c                     | 1 -
>  mm/nommu.c                                     | 2 --
>  mm/vmalloc.c                                   | 3 +--
>  8 files changed, 6 insertions(+), 15 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> index 599e8d3bcbc3..c11bd4b1ceb1 100644
> --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
> +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> @@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates
>  the kernel start address. Used to convert a virtual address from the
>  direct kernel map to a physical address.
>  
> -vmap_area_list
> ---------------
> +VMALLOC_START
> +-------------
>  
> -Stores the virtual area list. makedumpfile gets the vmalloc start value
> -from this variable and its value is necessary for vmalloc translation.
> +Stores the base address of vmalloc area. makedumpfile gets this value
> +since is necessary for vmalloc translation.
>  
>  mem_map
>  -------
> diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
> index 66cde752cd74..2a24199a9b81 100644
> --- a/arch/arm64/kernel/crash_core.c
> +++ b/arch/arm64/kernel/crash_core.c
> @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void)
>  	/* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */
>  	vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR);
>  	vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END);
> -	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
>  	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
>  	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
>  	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
> diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c
> index 55f1d7856b54..5c39cedd2c5c 100644
> --- a/arch/riscv/kernel/crash_core.c
> +++ b/arch/riscv/kernel/crash_core.c
> @@ -9,7 +9,6 @@ void arch_crash_save_vmcoreinfo(void)
>  	VMCOREINFO_NUMBER(phys_ram_base);
>  
>  	vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET);
> -	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
>  	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
>  	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
>  	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index c720be70c8dd..91810b4e9510 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
>  /*
>   *	Internals.  Don't use..
>   */
> -extern struct list_head vmap_area_list;
>  extern __init void vm_area_add_early(struct vm_struct *vm);
>  extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
>  
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 03a7932cde0a..a9faaf7e5f7d 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -617,7 +617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
>  	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
>  #endif
>  	VMCOREINFO_SYMBOL(_stext);
> -	VMCOREINFO_SYMBOL(vmap_area_list);
> +	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
>  
>  #ifndef CONFIG_NUMA
>  	VMCOREINFO_SYMBOL(mem_map);
> @@ -658,8 +658,6 @@ static int __init crash_save_vmcoreinfo_init(void)
>  	VMCOREINFO_OFFSET(free_area, free_list);
>  	VMCOREINFO_OFFSET(list_head, next);
>  	VMCOREINFO_OFFSET(list_head, prev);
> -	VMCOREINFO_OFFSET(vmap_area, va_start);
> -	VMCOREINFO_OFFSET(vmap_area, list);
>  	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
>  	log_buf_vmcoreinfo_setup();
>  	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
> diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
> index b4cac76ea5e9..8a689b4ff4f9 100644
> --- a/kernel/kallsyms_selftest.c
> +++ b/kernel/kallsyms_selftest.c
> @@ -89,7 +89,6 @@ static struct test_item test_items[] = {
>  	ITEM_DATA(kallsyms_test_var_data_static),
>  	ITEM_DATA(kallsyms_test_var_bss),
>  	ITEM_DATA(kallsyms_test_var_data),
> -	ITEM_DATA(vmap_area_list),
>  #endif
>  };
>  
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 7f9e9e5a0e12..8c6686176ebd 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
>  }
>  EXPORT_SYMBOL(follow_pfn);
>  
> -LIST_HEAD(vmap_area_list);
> -
>  void vfree(const void *addr)
>  {
>  	kfree(addr);
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 50d8239b82df..0a02633a9566 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -729,8 +729,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
>  
>  
>  static DEFINE_SPINLOCK(free_vmap_area_lock);
> -/* Export for kexec only */
> -LIST_HEAD(vmap_area_list);
> +
>  static bool vmap_initialized __read_mostly;
>  
>  /*
> -- 
> 2.41.0
> 
I appreciate your great input. This patch can go in as a standalone
change with a slight commit-message update, or I can take it and send
it out as part of v3.

Either way i am totally fine. What do you prefer?

Thank you!

--
Uladzislau Rezki
Baoquan He Sept. 8, 2023, 11:38 a.m. UTC | #13
On 09/08/23 at 01:25pm, Uladzislau Rezki wrote:
> On Fri, Sep 08, 2023 at 02:44:56PM +0800, Baoquan He wrote:
> > On 09/08/23 at 05:01am, HAGIO KAZUHITO(萩尾 一仁) wrote:
> > > On 2023/09/08 13:43, Baoquan He wrote:
> > > > On 09/08/23 at 01:51am, HAGIO KAZUHITO(萩尾 一仁) wrote:
> > > >> On 2023/09/07 18:58, Baoquan He wrote:
> > > >>> On 09/07/23 at 11:39am, Uladzislau Rezki wrote:
> > > >>>> On Thu, Sep 07, 2023 at 10:17:39AM +0800, Baoquan He wrote:
> > > >>>>> Add Kazu and Lianbo to CC, and kexec mailing list
> > > >>>>>
> > > >>>>> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > >>>>>> Store allocated objects in a separate nodes. A va->va_start
> > > >>>>>> address is converted into a correct node where it should
> > > >>>>>> be placed and resided. An addr_to_node() function is used
> > > >>>>>> to do a proper address conversion to determine a node that
> > > >>>>>> contains a VA.
> > > >>>>>>
> > > >>>>>> Such approach balances VAs across nodes as a result an access
> > > >>>>>> becomes scalable. Number of nodes in a system depends on number
> > > >>>>>> of CPUs divided by two. The density factor in this case is 1/2.
> > > >>>>>>
> > > >>>>>> Please note:
> > > >>>>>>
> > > >>>>>> 1. As of now allocated VAs are bound to a node-0. It means the
> > > >>>>>>      patch does not give any difference comparing with a current
> > > >>>>>>      behavior;
> > > >>>>>>
> > > >>>>>> 2. The global vmap_area_lock, vmap_area_root are removed as there
> > > >>>>>>      is no need in it anymore. The vmap_area_list is still kept and
> > > >>>>>>      is _empty_. It is exported for a kexec only;
> > > >>>>>
> > > >>>>> I haven't taken a test, while accessing all nodes' busy tree to get
> > > >>>>> va of the lowest address could severely impact kcore reading efficiency
> > > >>>>> on system with many vmap nodes. People doing live debugging via
> > > >>>>> /proc/kcore will get a little surprise.
> > > >>>>>
> > > >>>>>
> > > >>>>> Empty vmap_area_list will break makedumpfile utility, Crash utility
> > > >>>>> could be impactd too. I checked makedumpfile code, it relys on
> > > >>>>> vmap_area_list to deduce the vmalloc_start value.
> > > >>>>>
> > > >>>> It is left part and i hope i fix it in v3. The problem here is
> > > >>>> we can not give an opportunity to access to vmap internals from
> > > >>>> outside. This is just not correct, i.e. you are not allowed to
> > > >>>> access the list directly.
> > > >>>
> > > >>> Right. Thanks for the fix in v3, that is a relief of makedumpfile and
> > > >>> crash.
> > > >>>
> > > >>> Hi Kazu,
> > > >>>
> > > >>> Meanwhile, I am thinking if we should evaluate the necessity of
> > > >>> vmap_area_list in makedumpfile and Crash. In makedumpfile, we just use
> > > >>> vmap_area_list to deduce VMALLOC_START. Wondering if we can export
> > > >>> VMALLOC_START directly. Surely, the lowest va->va_start in vmap_area_list
> > > >>> is a tighter low boundary of vmalloc area and can reduce unnecessary
> > > >>> scanning below the lowest va. Not sure if this is the reason people
> > > >>> decided to export vmap_area_list.
> > > >>
> > > >> The kernel commit acd99dbf5402 introduced the original vmlist entry to
> > > >> vmcoreinfo, but there is no information about why it did not export
> > > >> VMALLOC_START directly.
> > > >>
> > > >> If VMALLOC_START is exported directly to vmcoreinfo, I think it would be
> > > >> enough for makedumpfile.
> > > > 
> > > > Thanks for confirmation, Kazu.
> > > > 
> > > > Then, below draft patch should be enough to export VMALLOC_START
> > > > instead, and remove vmap_area_list. 
> > > 
> > > also the following entries can be removed.
> > > 
> > >          VMCOREINFO_OFFSET(vmap_area, va_start);
> > >          VMCOREINFO_OFFSET(vmap_area, list);
> > 
> > Right, they are useless now. I updated to remove them in below patch.
> > 
> > From a867fada34fd9e96528fcc5e72ae50b3b5685015 Mon Sep 17 00:00:00 2001
> > From: Baoquan He <bhe@redhat.com>
> > Date: Fri, 8 Sep 2023 11:53:22 +0800
> > Subject: [PATCH] mm/vmalloc: remove vmap_area_list
> > Content-type: text/plain
> > 
> > Earlier, vmap_area_list is exported to vmcoreinfo so that makedumpfile
> > get the base address of vmalloc area. Now, vmap_area_list is empty, so
> > export VMALLOC_START to vmcoreinfo instead, and remove vmap_area_list.
> > 
> > Signed-off-by: Baoquan He <bhe@redhat.com>
> > ---
> >  Documentation/admin-guide/kdump/vmcoreinfo.rst | 8 ++++----
> >  arch/arm64/kernel/crash_core.c                 | 1 -
> >  arch/riscv/kernel/crash_core.c                 | 1 -
> >  include/linux/vmalloc.h                        | 1 -
> >  kernel/crash_core.c                            | 4 +---
> >  kernel/kallsyms_selftest.c                     | 1 -
> >  mm/nommu.c                                     | 2 --
> >  mm/vmalloc.c                                   | 3 +--
> >  8 files changed, 6 insertions(+), 15 deletions(-)
> > 
> > diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> > index 599e8d3bcbc3..c11bd4b1ceb1 100644
> > --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
> > +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
> > @@ -65,11 +65,11 @@ Defines the beginning of the text section. In general, _stext indicates
> >  the kernel start address. Used to convert a virtual address from the
> >  direct kernel map to a physical address.
> >  
> > -vmap_area_list
> > ---------------
> > +VMALLOC_START
> > +-------------
> >  
> > -Stores the virtual area list. makedumpfile gets the vmalloc start value
> > -from this variable and its value is necessary for vmalloc translation.
> > +Stores the base address of vmalloc area. makedumpfile gets this value
> > +since it is necessary for vmalloc translation.
> >  
> >  mem_map
> >  -------
> > diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
> > index 66cde752cd74..2a24199a9b81 100644
> > --- a/arch/arm64/kernel/crash_core.c
> > +++ b/arch/arm64/kernel/crash_core.c
> > @@ -23,7 +23,6 @@ void arch_crash_save_vmcoreinfo(void)
> >  	/* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */
> >  	vmcoreinfo_append_str("NUMBER(MODULES_VADDR)=0x%lx\n", MODULES_VADDR);
> >  	vmcoreinfo_append_str("NUMBER(MODULES_END)=0x%lx\n", MODULES_END);
> > -	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
> >  	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
> >  	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
> >  	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
> > diff --git a/arch/riscv/kernel/crash_core.c b/arch/riscv/kernel/crash_core.c
> > index 55f1d7856b54..5c39cedd2c5c 100644
> > --- a/arch/riscv/kernel/crash_core.c
> > +++ b/arch/riscv/kernel/crash_core.c
> > @@ -9,7 +9,6 @@ void arch_crash_save_vmcoreinfo(void)
> >  	VMCOREINFO_NUMBER(phys_ram_base);
> >  
> >  	vmcoreinfo_append_str("NUMBER(PAGE_OFFSET)=0x%lx\n", PAGE_OFFSET);
> > -	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
> >  	vmcoreinfo_append_str("NUMBER(VMALLOC_END)=0x%lx\n", VMALLOC_END);
> >  	vmcoreinfo_append_str("NUMBER(VMEMMAP_START)=0x%lx\n", VMEMMAP_START);
> >  	vmcoreinfo_append_str("NUMBER(VMEMMAP_END)=0x%lx\n", VMEMMAP_END);
> > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> > index c720be70c8dd..91810b4e9510 100644
> > --- a/include/linux/vmalloc.h
> > +++ b/include/linux/vmalloc.h
> > @@ -253,7 +253,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
> >  /*
> >   *	Internals.  Don't use..
> >   */
> > -extern struct list_head vmap_area_list;
> >  extern __init void vm_area_add_early(struct vm_struct *vm);
> >  extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
> >  
> > diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> > index 03a7932cde0a..a9faaf7e5f7d 100644
> > --- a/kernel/crash_core.c
> > +++ b/kernel/crash_core.c
> > @@ -617,7 +617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
> >  	VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir);
> >  #endif
> >  	VMCOREINFO_SYMBOL(_stext);
> > -	VMCOREINFO_SYMBOL(vmap_area_list);
> > +	vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", VMALLOC_START);
> >  
> >  #ifndef CONFIG_NUMA
> >  	VMCOREINFO_SYMBOL(mem_map);
> > @@ -658,8 +658,6 @@ static int __init crash_save_vmcoreinfo_init(void)
> >  	VMCOREINFO_OFFSET(free_area, free_list);
> >  	VMCOREINFO_OFFSET(list_head, next);
> >  	VMCOREINFO_OFFSET(list_head, prev);
> > -	VMCOREINFO_OFFSET(vmap_area, va_start);
> > -	VMCOREINFO_OFFSET(vmap_area, list);
> >  	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
> >  	log_buf_vmcoreinfo_setup();
> >  	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
> > diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
> > index b4cac76ea5e9..8a689b4ff4f9 100644
> > --- a/kernel/kallsyms_selftest.c
> > +++ b/kernel/kallsyms_selftest.c
> > @@ -89,7 +89,6 @@ static struct test_item test_items[] = {
> >  	ITEM_DATA(kallsyms_test_var_data_static),
> >  	ITEM_DATA(kallsyms_test_var_bss),
> >  	ITEM_DATA(kallsyms_test_var_data),
> > -	ITEM_DATA(vmap_area_list),
> >  #endif
> >  };
> >  
> > diff --git a/mm/nommu.c b/mm/nommu.c
> > index 7f9e9e5a0e12..8c6686176ebd 100644
> > --- a/mm/nommu.c
> > +++ b/mm/nommu.c
> > @@ -131,8 +131,6 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
> >  }
> >  EXPORT_SYMBOL(follow_pfn);
> >  
> > -LIST_HEAD(vmap_area_list);
> > -
> >  void vfree(const void *addr)
> >  {
> >  	kfree(addr);
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 50d8239b82df..0a02633a9566 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -729,8 +729,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
> >  
> >  
> >  static DEFINE_SPINLOCK(free_vmap_area_lock);
> > -/* Export for kexec only */
> > -LIST_HEAD(vmap_area_list);
> > +
> >  static bool vmap_initialized __read_mostly;
> >  
> >  /*
> > -- 
> > 2.41.0
> > 
> Appreciate for your great input. This patch can go as standalone
> with slight commit message updating or i can take it and send it
> out as part of v3.
> 
> Either way i am totally fine. What do you prefer?

Maybe take it together with this patchset in v3. Thanks.
Uladzislau Rezki Sept. 8, 2023, 1:23 p.m. UTC | #14
> > > 
> > Appreciate for your great input. This patch can go as standalone
> > with slight commit message updating or i can take it and send it
> > out as part of v3.
> > 
> > Either way i am totally fine. What do you prefer?
> 
> Maybe take it together with this patchset in v3. Thanks.
> 
OK, i will deliver it with v3. Again, thank you very much.

--
Uladzislau Rezki
Baoquan He Sept. 11, 2023, 2:38 a.m. UTC | #15
On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> Store allocated objects in a separate nodes. A va->va_start
> address is converted into a correct node where it should
> be placed and resided. An addr_to_node() function is used
> to do a proper address conversion to determine a node that
> contains a VA.
> 
> Such approach balances VAs across nodes as a result an access
> becomes scalable. Number of nodes in a system depends on number
> of CPUs divided by two. The density factor in this case is 1/2.
> 
> Please note:
> 
> 1. As of now allocated VAs are bound to a node-0. It means the
>    patch does not give any difference comparing with a current
>    behavior;
> 
> 2. The global vmap_area_lock, vmap_area_root are removed as there
>    is no need in it anymore. The vmap_area_list is still kept and
>    is _empty_. It is exported for a kexec only;
> 
> 3. The vmallocinfo and vread() have to be reworked to be able to
>    handle multiple nodes.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  mm/vmalloc.c | 209 +++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 161 insertions(+), 48 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index b7deacca1483..ae0368c314ff 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
>  #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
>  
>  
> -static DEFINE_SPINLOCK(vmap_area_lock);
>  static DEFINE_SPINLOCK(free_vmap_area_lock);
>  /* Export for kexec only */
>  LIST_HEAD(vmap_area_list);
> -static struct rb_root vmap_area_root = RB_ROOT;
>  static bool vmap_initialized __read_mostly;
>  
>  static struct rb_root purge_vmap_area_root = RB_ROOT;
> @@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
>   */
>  static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
>  
> +/*
> + * An effective vmap-node logic. Users make use of nodes instead
> + * of a global heap. It allows to balance an access and mitigate
> + * contention.
> + */
> +struct rb_list {
> +	struct rb_root root;
> +	struct list_head head;
> +	spinlock_t lock;
> +};
> +
> +struct vmap_node {
> +	/* Bookkeeping data of this node. */
> +	struct rb_list busy;
> +};
> +
> +static struct vmap_node *nodes, snode;
> +static __read_mostly unsigned int nr_nodes = 1;
> +static __read_mostly unsigned int node_size = 1;

It could be better if calling these global variables a meaningful name,
e.g vmap_nodes, static_vmap_nodes, nr_vmap_nodes. When I use vim+cscope
to reference them, it gives me a super long list. Aside from that, a
simple name often makes me mistake it as a local variable. A weak
opinion.


> +
> +static inline unsigned int
> +addr_to_node_id(unsigned long addr)
> +{
> +	return (addr / node_size) % nr_nodes;
> +}
> +
> +static inline struct vmap_node *
> +addr_to_node(unsigned long addr)
> +{
> +	return &nodes[addr_to_node_id(addr)];
> +}
> +
>  static __always_inline unsigned long
>  va_size(struct vmap_area *va)
>  {
Uladzislau Rezki Sept. 11, 2023, 4:53 p.m. UTC | #16
On Mon, Sep 11, 2023 at 10:38:29AM +0800, Baoquan He wrote:
> On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > Store allocated objects in a separate nodes. A va->va_start
> > address is converted into a correct node where it should
> > be placed and resided. An addr_to_node() function is used
> > to do a proper address conversion to determine a node that
> > contains a VA.
> > 
> > Such approach balances VAs across nodes as a result an access
> > becomes scalable. Number of nodes in a system depends on number
> > of CPUs divided by two. The density factor in this case is 1/2.
> > 
> > Please note:
> > 
> > 1. As of now allocated VAs are bound to a node-0. It means the
> >    patch does not give any difference comparing with a current
> >    behavior;
> > 
> > 2. The global vmap_area_lock, vmap_area_root are removed as there
> >    is no need in it anymore. The vmap_area_list is still kept and
> >    is _empty_. It is exported for a kexec only;
> > 
> > 3. The vmallocinfo and vread() have to be reworked to be able to
> >    handle multiple nodes.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  mm/vmalloc.c | 209 +++++++++++++++++++++++++++++++++++++++------------
> >  1 file changed, 161 insertions(+), 48 deletions(-)
> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index b7deacca1483..ae0368c314ff 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
> >  #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
> >  
> >  
> > -static DEFINE_SPINLOCK(vmap_area_lock);
> >  static DEFINE_SPINLOCK(free_vmap_area_lock);
> >  /* Export for kexec only */
> >  LIST_HEAD(vmap_area_list);
> > -static struct rb_root vmap_area_root = RB_ROOT;
> >  static bool vmap_initialized __read_mostly;
> >  
> >  static struct rb_root purge_vmap_area_root = RB_ROOT;
> > @@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
> >   */
> >  static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
> >  
> > +/*
> > + * An effective vmap-node logic. Users make use of nodes instead
> > + * of a global heap. It allows to balance an access and mitigate
> > + * contention.
> > + */
> > +struct rb_list {
> > +	struct rb_root root;
> > +	struct list_head head;
> > +	spinlock_t lock;
> > +};
> > +
> > +struct vmap_node {
> > +	/* Bookkeeping data of this node. */
> > +	struct rb_list busy;
> > +};
> > +
> > +static struct vmap_node *nodes, snode;
> > +static __read_mostly unsigned int nr_nodes = 1;
> > +static __read_mostly unsigned int node_size = 1;
> 
> It could be better if calling these global variables a meaningful name,
> e.g vmap_nodes, static_vmap_nodes, nr_vmap_nodes. When I use vim+cscope
> to reference them, it gives me a super long list. Aside from that, a
> simple name often makes me mistake it as a local virable. A weak
> opinion.
> 
I am OK to add "vmap_" prefix:

vmap_nodes;
vmap_nr_nodes;
vmap_node_size;
..

If you are not OK with that, feel free to propose other variants.

Thank you!

--
Uladzislau Rezki
Baoquan He Sept. 12, 2023, 1:19 p.m. UTC | #17
On 09/11/23 at 06:53pm, Uladzislau Rezki wrote:
> On Mon, Sep 11, 2023 at 10:38:29AM +0800, Baoquan He wrote:
> > On 08/29/23 at 10:11am, Uladzislau Rezki (Sony) wrote:
> > > Store allocated objects in a separate nodes. A va->va_start
> > > address is converted into a correct node where it should
> > > be placed and resided. An addr_to_node() function is used
> > > to do a proper address conversion to determine a node that
> > > contains a VA.
> > > 
> > > Such approach balances VAs across nodes as a result an access
> > > becomes scalable. Number of nodes in a system depends on number
> > > of CPUs divided by two. The density factor in this case is 1/2.
> > > 
> > > Please note:
> > > 
> > > 1. As of now allocated VAs are bound to a node-0. It means the
> > >    patch does not give any difference comparing with a current
> > >    behavior;
> > > 
> > > 2. The global vmap_area_lock, vmap_area_root are removed as there
> > >    is no need in it anymore. The vmap_area_list is still kept and
> > >    is _empty_. It is exported for a kexec only;
> > > 
> > > 3. The vmallocinfo and vread() have to be reworked to be able to
> > >    handle multiple nodes.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > ---
> > >  mm/vmalloc.c | 209 +++++++++++++++++++++++++++++++++++++++------------
> > >  1 file changed, 161 insertions(+), 48 deletions(-)
> > > 
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index b7deacca1483..ae0368c314ff 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -728,11 +728,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
> > >  #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
> > >  
> > >  
> > > -static DEFINE_SPINLOCK(vmap_area_lock);
> > >  static DEFINE_SPINLOCK(free_vmap_area_lock);
> > >  /* Export for kexec only */
> > >  LIST_HEAD(vmap_area_list);
> > > -static struct rb_root vmap_area_root = RB_ROOT;
> > >  static bool vmap_initialized __read_mostly;
> > >  
> > >  static struct rb_root purge_vmap_area_root = RB_ROOT;
> > > @@ -772,6 +770,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
> > >   */
> > >  static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
> > >  
> > > +/*
> > > + * An effective vmap-node logic. Users make use of nodes instead
> > > + * of a global heap. It allows to balance an access and mitigate
> > > + * contention.
> > > + */
> > > +struct rb_list {
> > > +	struct rb_root root;
> > > +	struct list_head head;
> > > +	spinlock_t lock;
> > > +};
> > > +
> > > +struct vmap_node {
> > > +	/* Bookkeeping data of this node. */
> > > +	struct rb_list busy;
> > > +};
> > > +
> > > +static struct vmap_node *nodes, snode;
> > > +static __read_mostly unsigned int nr_nodes = 1;
> > > +static __read_mostly unsigned int node_size = 1;
> > 
> > It could be better if calling these global variables a meaningful name,
> > e.g vmap_nodes, static_vmap_nodes, nr_vmap_nodes. When I use vim+cscope
> > to reference them, it gives me a super long list. Aside from that, a
> > simple name often makes me mistake it as a local virable. A weak
> > opinion.
> > 
> I am OK to add "vmap_" prefix:
> 
> vmap_nodes;
> vmap_nr_nodes;
           ~ nr_vmap_nodes?
> vmap_node_size;
> ..
> 
> If you are not OK with that, feel free to propose other variants.

Other than the nr_nodes one, others look good to me, thanks a lot.
diff mbox series

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b7deacca1483..ae0368c314ff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -728,11 +728,9 @@  EXPORT_SYMBOL(vmalloc_to_pfn);
 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
 
 
-static DEFINE_SPINLOCK(vmap_area_lock);
 static DEFINE_SPINLOCK(free_vmap_area_lock);
 /* Export for kexec only */
 LIST_HEAD(vmap_area_list);
-static struct rb_root vmap_area_root = RB_ROOT;
 static bool vmap_initialized __read_mostly;
 
 static struct rb_root purge_vmap_area_root = RB_ROOT;
@@ -772,6 +770,38 @@  static struct rb_root free_vmap_area_root = RB_ROOT;
  */
 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
 
+/*
+ * An effective vmap-node logic. Users make use of nodes instead
+ * of a global heap. It allows to balance an access and mitigate
+ * contention.
+ */
+struct rb_list {
+	struct rb_root root;
+	struct list_head head;
+	spinlock_t lock;
+};
+
+struct vmap_node {
+	/* Bookkeeping data of this node. */
+	struct rb_list busy;
+};
+
+static struct vmap_node *nodes, snode;
+static __read_mostly unsigned int nr_nodes = 1;
+static __read_mostly unsigned int node_size = 1;
+
+static inline unsigned int
+addr_to_node_id(unsigned long addr)
+{
+	return (addr / node_size) % nr_nodes;
+}
+
+static inline struct vmap_node *
+addr_to_node(unsigned long addr)
+{
+	return &nodes[addr_to_node_id(addr)];
+}
+
 static __always_inline unsigned long
 va_size(struct vmap_area *va)
 {
@@ -803,10 +833,11 @@  unsigned long vmalloc_nr_pages(void)
 }
 
 /* Look up the first VA which satisfies addr < va_end, NULL if none. */
-static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
+static struct vmap_area *
+find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
 {
 	struct vmap_area *va = NULL;
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n = root->rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -1552,12 +1583,14 @@  __alloc_vmap_area(struct rb_root *root, struct list_head *head,
  */
 static void free_vmap_area(struct vmap_area *va)
 {
+	struct vmap_node *vn = addr_to_node(va->va_start);
+
 	/*
 	 * Remove from the busy tree/list.
 	 */
-	spin_lock(&vmap_area_lock);
-	unlink_va(va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	spin_lock(&vn->busy.lock);
+	unlink_va(va, &vn->busy.root);
+	spin_unlock(&vn->busy.lock);
 
 	/*
 	 * Insert/Merge it back to the free tree/list.
@@ -1600,6 +1633,7 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 				int node, gfp_t gfp_mask,
 				unsigned long va_flags)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	unsigned long freed;
 	unsigned long addr;
@@ -1645,9 +1679,11 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->vm = NULL;
 	va->flags = va_flags;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	vn = addr_to_node(va->va_start);
+
+	spin_lock(&vn->busy.lock);
+	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+	spin_unlock(&vn->busy.lock);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1871,26 +1907,61 @@  static void free_unmap_vmap_area(struct vmap_area *va)
 
 struct vmap_area *find_vmap_area(unsigned long addr)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
+	int i, j;
+
+	/*
+	 * An addr_to_node_id(addr) converts an address to a node index
+	 * where a VA is located. If VA spans several zones and passed
+	 * addr is not the same as va->va_start, which is not common, we
+	 * may need to scan extra nodes. See an example:
+	 *
+	 *      <--va-->
+	 * -|-----|-----|-----|-----|-
+	 *     1     2     0     1
+	 *
+	 * VA resides in node 1 whereas it spans 1 and 2. If passed
+	 * addr is within a second node we should do extra work. We
+	 * should mention that it is rare and is a corner case; on
+	 * the other hand it has to be covered.
+	 */
+	i = j = addr_to_node_id(addr);
+	do {
+		vn = &nodes[i];
 
-	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+		spin_lock(&vn->busy.lock);
+		va = __find_vmap_area(addr, &vn->busy.root);
+		spin_unlock(&vn->busy.lock);
 
-	return va;
+		if (va)
+			return va;
+	} while ((i = (i + 1) % nr_nodes) != j);
+
+	return NULL;
 }
 
 static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
+	int i, j;
 
-	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr, &vmap_area_root);
-	if (va)
-		unlink_va(va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	i = j = addr_to_node_id(addr);
+	do {
+		vn = &nodes[i];
 
-	return va;
+		spin_lock(&vn->busy.lock);
+		va = __find_vmap_area(addr, &vn->busy.root);
+		if (va)
+			unlink_va(va, &vn->busy.root);
+		spin_unlock(&vn->busy.lock);
+
+		if (va)
+			return va;
+	} while ((i = (i + 1) % nr_nodes) != j);
+
+	return NULL;
 }
 
 /*** Per cpu kva allocator ***/
@@ -2092,6 +2163,7 @@  static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 
 static void free_vmap_block(struct vmap_block *vb)
 {
+	struct vmap_node *vn;
 	struct vmap_block *tmp;
 	struct xarray *xa;
 
@@ -2099,9 +2171,10 @@  static void free_vmap_block(struct vmap_block *vb)
 	tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
 	BUG_ON(tmp != vb);
 
-	spin_lock(&vmap_area_lock);
-	unlink_va(vb->va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	vn = addr_to_node(vb->va->va_start);
+	spin_lock(&vn->busy.lock);
+	unlink_va(vb->va, &vn->busy.root);
+	spin_unlock(&vn->busy.lock);
 
 	free_vmap_area_noflush(vb->va);
 	kfree_rcu(vb, rcu_head);
@@ -2525,9 +2598,11 @@  static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 			      unsigned long flags, const void *caller)
 {
-	spin_lock(&vmap_area_lock);
+	struct vmap_node *vn = addr_to_node(va->va_start);
+
+	spin_lock(&vn->busy.lock);
 	setup_vmalloc_vm_locked(vm, va, flags, caller);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 }
 
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -3711,6 +3786,7 @@  static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
  */
 long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *vm;
 	char *vaddr;
@@ -3724,8 +3800,11 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 	remains = count;
 
-	spin_lock(&vmap_area_lock);
-	va = find_vmap_area_exceed_addr((unsigned long)addr);
+	/* Hooked to node_0 so far. */
+	vn = addr_to_node(0);
+	spin_lock(&vn->busy.lock);
+
+	va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
 	if (!va)
 		goto finished_zero;
 
@@ -3733,7 +3812,7 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	if ((unsigned long)addr + remains <= va->va_start)
 		goto finished_zero;
 
-	list_for_each_entry_from(va, &vmap_area_list, list) {
+	list_for_each_entry_from(va, &vn->busy.head, list) {
 		size_t copied;
 
 		if (remains == 0)
@@ -3792,12 +3871,12 @@  long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	}
 
 finished_zero:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 	/* zero-fill memory holes */
 	return count - remains + zero_iter(iter, remains);
 finished:
 	/* Nothing remains, or We couldn't copy/zero everything. */
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 
 	return count - remains;
 }
@@ -4131,14 +4210,15 @@  struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	}
 
 	/* insert all vm's */
-	spin_lock(&vmap_area_lock);
 	for (area = 0; area < nr_vms; area++) {
-		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+		struct vmap_node *vn = addr_to_node(vas[area]->va_start);
 
+		spin_lock(&vn->busy.lock);
+		insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
 		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
 				 pcpu_get_vm_areas);
+		spin_unlock(&vn->busy.lock);
 	}
-	spin_unlock(&vmap_area_lock);
 
 	/*
 	 * Mark allocated areas as accessible. Do it now as a best-effort
@@ -4261,25 +4341,26 @@  bool vmalloc_dump_obj(void *object)
 
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
-	__acquires(&vmap_purge_lock)
-	__acquires(&vmap_area_lock)
 {
+	struct vmap_node *vn = addr_to_node(0);
+
 	mutex_lock(&vmap_purge_lock);
-	spin_lock(&vmap_area_lock);
+	spin_lock(&vn->busy.lock);
 
-	return seq_list_start(&vmap_area_list, *pos);
+	return seq_list_start(&vn->busy.head, *pos);
 }
 
 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 {
-	return seq_list_next(p, &vmap_area_list, pos);
+	struct vmap_node *vn = addr_to_node(0);
+	return seq_list_next(p, &vn->busy.head, pos);
 }
 
 static void s_stop(struct seq_file *m, void *p)
-	__releases(&vmap_area_lock)
-	__releases(&vmap_purge_lock)
 {
-	spin_unlock(&vmap_area_lock);
+	struct vmap_node *vn = addr_to_node(0);
+
+	spin_unlock(&vn->busy.lock);
 	mutex_unlock(&vmap_purge_lock);
 }
 
@@ -4322,9 +4403,11 @@  static void show_purge_info(struct seq_file *m)
 
 static int s_show(struct seq_file *m, void *p)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *v;
 
+	vn = addr_to_node(0);
 	va = list_entry(p, struct vmap_area, list);
 
 	if (!va->vm) {
@@ -4375,7 +4458,7 @@  static int s_show(struct seq_file *m, void *p)
 	 * As a final step, dump "unpurged" areas.
 	 */
 final:
-	if (list_is_last(&va->list, &vmap_area_list))
+	if (list_is_last(&va->list, &vn->busy.head))
 		show_purge_info(m);
 
 	return 0;
@@ -4406,7 +4489,8 @@  static void vmap_init_free_space(void)
 {
 	unsigned long vmap_start = 1;
 	const unsigned long vmap_end = ULONG_MAX;
-	struct vmap_area *busy, *free;
+	struct vmap_area *free;
+	struct vm_struct *busy;
 
 	/*
 	 *     B     F     B     B     B     F
@@ -4414,12 +4498,12 @@  static void vmap_init_free_space(void)
 	 *  |           The KVA space           |
 	 *  |<--------------------------------->|
 	 */
-	list_for_each_entry(busy, &vmap_area_list, list) {
-		if (busy->va_start - vmap_start > 0) {
+	for (busy = vmlist; busy; busy = busy->next) {
+		if (busy->addr - vmap_start > 0) {
 			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
 			if (!WARN_ON_ONCE(!free)) {
 				free->va_start = vmap_start;
-				free->va_end = busy->va_start;
+				free->va_end = (unsigned long) busy->addr;
 
 				insert_vmap_area_augment(free, NULL,
 					&free_vmap_area_root,
@@ -4427,7 +4511,7 @@  static void vmap_init_free_space(void)
 			}
 		}
 
-		vmap_start = busy->va_end;
+		vmap_start = (unsigned long) busy->addr + busy->size;
 	}
 
 	if (vmap_end - vmap_start > 0) {
@@ -4443,9 +4527,31 @@  static void vmap_init_free_space(void)
 	}
 }
 
+static void vmap_init_nodes(void)
+{
+	struct vmap_node *vn;
+	int i;
+
+	nodes = &snode;
+
+	if (nr_nodes > 1) {
+		vn = kmalloc_array(nr_nodes, sizeof(*vn), GFP_NOWAIT);
+		if (vn)
+			nodes = vn;
+	}
+
+	for (i = 0; i < nr_nodes; i++) {
+		vn = &nodes[i];
+		vn->busy.root = RB_ROOT;
+		INIT_LIST_HEAD(&vn->busy.head);
+		spin_lock_init(&vn->busy.lock);
+	}
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
+	struct vmap_node *vn;
 	struct vm_struct *tmp;
 	int i;
 
@@ -4467,6 +4573,11 @@  void __init vmalloc_init(void)
 		xa_init(&vbq->vmap_blocks);
 	}
 
+	/*
+	 * Setup nodes before importing vmlist.
+	 */
+	vmap_init_nodes();
+
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
 		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
@@ -4476,7 +4587,9 @@  void __init vmalloc_init(void)
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
 		va->vm = tmp;
-		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+		vn = addr_to_node(va->va_start);
+		insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
 	}
 
 	/*