[2/6] mm/memblock: make full utilization of numa info

Message ID 1551011649-30103-3-git-send-email-kernelfans@gmail.com (mailing list archive)
State New, archived
Series make memblock allocator utilize the node's fallback info

Commit Message

Pingfan Liu Feb. 24, 2019, 12:34 p.m. UTC
There are NUMA machines with memory-less nodes. When allocating memory for
a memory-less node, the memblock allocator falls back to 'Node 0' instead of
fully utilizing the nearest node. This hurts performance, especially for the
per-cpu sections. Suppress this defect by building the full node fallback
info for the memblock allocator, like what has already been done for the
page allocator.
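
For illustration (hypothetical node IDs; this example is not part of the
patch): assume node 2 has CPUs but no memory, and its nearest node with
memory is node 3.

	/*
	 * Hypothetical caller, not part of the patch: the call site does not
	 * change, only where the memory ends up.
	 */
	void *buf = memblock_alloc_try_nid(PAGE_SIZE, SMP_CACHE_BYTES,
					   0, MEMBLOCK_ALLOC_ACCESSIBLE,
					   2 /* memory-less node */);
	/*
	 * Before: memblock_alloc_internal() retries with NUMA_NO_NODE, so the
	 *         allocation typically lands on node 0.
	 * After:  it walks node_fallback[2], e.g. { 3, 0, 1, NUMA_NO_NODE, ... },
	 *         so node 3, the nearest node with memory, is tried first.
	 */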

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ingo Molnar <mingo@redhat.com>
CC: Borislav Petkov <bp@alien8.de>
CC: "H. Peter Anvin" <hpa@zytor.com>
CC: Dave Hansen <dave.hansen@linux.intel.com>
CC: Vlastimil Babka <vbabka@suse.cz>
CC: Mike Rapoport <rppt@linux.vnet.ibm.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Mel Gorman <mgorman@suse.de>
CC: Joonsoo Kim <iamjoonsoo.kim@lge.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: Andi Kleen <ak@linux.intel.com>
CC: Petr Tesarik <ptesarik@suse.cz>
CC: Michal Hocko <mhocko@suse.com>
CC: Stephen Rothwell <sfr@canb.auug.org.au>
CC: Jonathan Corbet <corbet@lwn.net>
CC: Nicholas Piggin <npiggin@gmail.com>
CC: Daniel Vacek <neelx@redhat.com>
CC: linux-kernel@vger.kernel.org
---
 include/linux/memblock.h |  3 +++
 mm/memblock.c            | 68 ++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 5 deletions(-)

Comments

kernel test robot Feb. 25, 2019, 7:07 a.m. UTC | #1
Hi Pingfan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.0-rc4]
[cannot apply to next-20190222]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Pingfan-Liu/mm-numa-extract-the-code-of-building-node-fall-back-list/20190225-143613
config: i386-tinyconfig (attached as .config)
compiler: gcc-8 (Debian 8.2.0-20) 8.2.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   ld: mm/memblock.o: in function `memblock_build_node_order':
>> memblock.c:(.init.text+0x310): undefined reference to `build_node_order'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Feb. 25, 2019, 7:59 a.m. UTC | #2
Hi Pingfan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.0-rc4]
[cannot apply to next-20190222]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Pingfan-Liu/mm-numa-extract-the-code-of-building-node-fall-back-list/20190225-143613
config: i386-randconfig-a1-201908 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   ld: mm/memblock.o: in function `memblock_build_node_order':
>> mm/memblock.c:1364: undefined reference to `build_node_order'

vim +1364 mm/memblock.c

  1342	
  1343	/*
  1344	 * build_node_order() relies on cpumask_of_node(), hence arch should set up
  1345	 * cpumask before calling this func.
  1346	 */
  1347	void __init memblock_build_node_order(void)
  1348	{
  1349		int nid, i;
  1350		nodemask_t used_mask;
  1351	
  1352		node_fallback = memblock_alloc(MAX_NUMNODES * sizeof(int *),
  1353			sizeof(int *));
  1354		for_each_online_node(nid) {
  1355			node_fallback[nid] = memblock_alloc(
  1356				num_online_nodes() * sizeof(int), sizeof(int));
  1357			for (i = 0; i < num_online_nodes(); i++)
  1358				node_fallback[nid][i] = NUMA_NO_NODE;
  1359		}
  1360	
  1361		for_each_online_node(nid) {
  1362			nodes_clear(used_mask);
  1363			node_set(nid, used_mask);
> 1364			build_node_order(node_fallback[nid], num_online_nodes(),
  1365				nid, &used_mask);
  1366		}
  1367	}
  1368	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
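
Both link failures are from i386 configs without NUMA support, where
build_node_order() (added to mm/page_alloc.c earlier in this series) is
presumably not compiled in, so the new call in mm/memblock.c cannot be
resolved. A minimal sketch of one way to address this (an assumption, not
something posted in this series) is to make the new memblock hooks NUMA-only,
for example in include/linux/memblock.h:

	/*
	 * Sketch only, assuming build_node_order() is defined under
	 * CONFIG_NUMA: real hooks for NUMA builds, empty stubs otherwise,
	 * so UMA configs such as i386-tinyconfig still link.
	 */
	#ifdef CONFIG_NUMA
	extern int build_node_order(int *node_order_array, int sz,
		int local_node, nodemask_t *used_mask);
	void memblock_build_node_order(void);
	#else
	static inline void memblock_build_node_order(void) { }
	#endif

The definitions in mm/memblock.c would need the same #ifdef CONFIG_NUMA
guard.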
Dave Hansen Feb. 25, 2019, 3:34 p.m. UTC | #3
On 2/24/19 4:34 AM, Pingfan Liu wrote:
> +/*
> + * build_node_order() relies on cpumask_of_node(), hence arch should 
> + * set up cpumask before calling this func.
> + */

Whenever I see comments like this, I wonder what happens if the arch
doesn't do this?  Do we just crash in early boot in wonderful new ways?
 Or do we get a nice message telling us?

> +void __init memblock_build_node_order(void)
> +{
> +	int nid, i;
> +	nodemask_t used_mask;
> +
> +	node_fallback = memblock_alloc(MAX_NUMNODES * sizeof(int *),
> +		sizeof(int *));
> +	for_each_online_node(nid) {
> +		node_fallback[nid] = memblock_alloc(
> +			num_online_nodes() * sizeof(int), sizeof(int));
> +		for (i = 0; i < num_online_nodes(); i++)
> +			node_fallback[nid][i] = NUMA_NO_NODE;
> +	}
> +
> +	for_each_online_node(nid) {
> +		nodes_clear(used_mask);
> +		node_set(nid, used_mask);
> +		build_node_order(node_fallback[nid], num_online_nodes(),
> +			nid, &used_mask);
> +	}
> +}

This doesn't get used until patch 6 as far as I can tell.  Was there a
reason to define it here?
Pingfan Liu Feb. 26, 2019, 5:40 a.m. UTC | #4
On Mon, Feb 25, 2019 at 11:34 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 2/24/19 4:34 AM, Pingfan Liu wrote:
> > +/*
> > + * build_node_order() relies on cpumask_of_node(), hence arch should
> > + * set up cpumask before calling this func.
> > + */
>
> Whenever I see comments like this, I wonder what happens if the arch
> doesn't do this?  Do we just crash in early boot in wonderful new ways?
>  Or do we get a nice message telling us?
>
If the arch doesn't do this, this function will crash. It is a shame, but
a little hard to work around: since this function is called at the early
boot stage, things like cpumask_of_node(cpu_to_node(cpu)) cannot work
reliably, and we lack an abstract interface to get such information from
all archs. So I leave this to the arch developers.
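
For reference, a hypothetical guard (not in this patch, and it only catches
configurations where an unset per-node cpumask is observable as a NULL
pointer) that would turn the crash into a message; the helper name is made
up for this sketch:

	/*
	 * Hypothetical helper, not part of this patch: print a message and
	 * return false instead of letting build_node_order() crash when
	 * cpumask_of_node() is not usable yet.
	 */
	static bool __init node_cpumasks_look_sane(void)
	{
		int nid;

		for_each_online_node(nid) {
			if (!cpumask_of_node(nid)) {
				pr_warn("memblock: node %d cpumask not set up\n", nid);
				return false;
			}
		}
		return true;
	}

memblock_build_node_order() could then simply return early when this is
false, instead of crashing.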

> > +void __init memblock_build_node_order(void)
> > +{
> > +     int nid, i;
> > +     nodemask_t used_mask;
> > +
> > +     node_fallback = memblock_alloc(MAX_NUMNODES * sizeof(int *),
> > +             sizeof(int *));
> > +     for_each_online_node(nid) {
> > +             node_fallback[nid] = memblock_alloc(
> > +                     num_online_nodes() * sizeof(int), sizeof(int));
> > +             for (i = 0; i < num_online_nodes(); i++)
> > +                     node_fallback[nid][i] = NUMA_NO_NODE;
> > +     }
> > +
> > +     for_each_online_node(nid) {
> > +             nodes_clear(used_mask);
> > +             node_set(nid, used_mask);
> > +             build_node_order(node_fallback[nid], num_online_nodes(),
> > +                     nid, &used_mask);
> > +     }
> > +}
>
> This doesn't get used until patch 6 as far as I can tell.  Was there a
> reason to define it here?
>
Yes, it does not get used until patch 6. Patch 6 has two groups of
prerequisites, [1-2] and [3-5]. Do you think reordering the patches and
moving [3-5] ahead of [1-2] is a better choice?

Thanks and regards,
Pingfan
Mike Rapoport Feb. 26, 2019, 11:58 a.m. UTC | #5
On Sun, Feb 24, 2019 at 08:34:05PM +0800, Pingfan Liu wrote:
> There are NUMA machines with memory-less nodes. When allocating memory for
> a memory-less node, the memblock allocator falls back to 'Node 0' instead of
> fully utilizing the nearest node. This hurts performance, especially for the
> per-cpu sections. Suppress this defect by building the full node fallback
> info for the memblock allocator, like what has already been done for the
> page allocator.

Is it really necessary to build full node fallback info for memblock and
then rebuild it again for the page allocator?

I think it should be possible to split parts of build_all_zonelists_init()
that do not touch per-cpu areas into a separate function and call that
function after topology detection. Then it would be possible to use
local_memory_node() when calling memblock.
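
A rough sketch of that direction (an assumption about the shape of the
change, not something posted in this series): once the zonelists for the
online nodes exist, memblock_alloc_internal() could redirect a memory-less
nid to its nearest node with memory instead of carrying its own table:

	/*
	 * Sketch of the suggestion above: assumes the split-out part of
	 * build_all_zonelists_init() has already run, so that
	 * local_memory_node() returns the nearest node with memory.
	 */
	if (nid != NUMA_NO_NODE) {
		int mem_nid = local_memory_node(nid);

		alloc = memblock_find_in_range_node(size, align, min_addr,
						    max_addr, mem_nid, flags);
		if (alloc && !memblock_reserve(alloc, size))
			goto done;
	}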
 
> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> CC: Thomas Gleixner <tglx@linutronix.de>
> CC: Ingo Molnar <mingo@redhat.com>
> CC: Borislav Petkov <bp@alien8.de>
> CC: "H. Peter Anvin" <hpa@zytor.com>
> CC: Dave Hansen <dave.hansen@linux.intel.com>
> CC: Vlastimil Babka <vbabka@suse.cz>
> CC: Mike Rapoport <rppt@linux.vnet.ibm.com>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: Mel Gorman <mgorman@suse.de>
> CC: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> CC: Andy Lutomirski <luto@kernel.org>
> CC: Andi Kleen <ak@linux.intel.com>
> CC: Petr Tesarik <ptesarik@suse.cz>
> CC: Michal Hocko <mhocko@suse.com>
> CC: Stephen Rothwell <sfr@canb.auug.org.au>
> CC: Jonathan Corbet <corbet@lwn.net>
> CC: Nicholas Piggin <npiggin@gmail.com>
> CC: Daniel Vacek <neelx@redhat.com>
> CC: linux-kernel@vger.kernel.org
> ---
>  include/linux/memblock.h |  3 +++
>  mm/memblock.c            | 68 ++++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 66 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 64c41cf..ee999c5 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -342,6 +342,9 @@ void *memblock_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align,
>  void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
>  			     phys_addr_t min_addr, phys_addr_t max_addr,
>  			     int nid);
> > +extern int build_node_order(int *node_order_array, int sz,
> +	int local_node, nodemask_t *used_mask);
> +void memblock_build_node_order(void);
> 
>  static inline void * __init memblock_alloc(phys_addr_t size,  phys_addr_t align)
>  {
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 022d4cb..cf78850 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -1338,6 +1338,47 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
>  	return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
>  }
> 
> +static int **node_fallback __initdata;
> +
> +/*
> + * build_node_order() relies on cpumask_of_node(), hence arch should set up
> + * cpumask before calling this func.
> + */
> +void __init memblock_build_node_order(void)
> +{
> +	int nid, i;
> +	nodemask_t used_mask;
> +
> +	node_fallback = memblock_alloc(MAX_NUMNODES * sizeof(int *),
> +		sizeof(int *));
> +	for_each_online_node(nid) {
> +		node_fallback[nid] = memblock_alloc(
> +			num_online_nodes() * sizeof(int), sizeof(int));
> +		for (i = 0; i < num_online_nodes(); i++)
> +			node_fallback[nid][i] = NUMA_NO_NODE;
> +	}
> +
> +	for_each_online_node(nid) {
> +		nodes_clear(used_mask);
> +		node_set(nid, used_mask);
> +		build_node_order(node_fallback[nid], num_online_nodes(),
> +			nid, &used_mask);
> +	}
> +}
> +
> +static void __init memblock_free_node_order(void)
> +{
> +	int nid;
> +
> +	if (!node_fallback)
> +		return;
> +	for_each_online_node(nid)
> +		memblock_free(__pa(node_fallback[nid]),
> +			num_online_nodes() * sizeof(int));
> +	memblock_free(__pa(node_fallback), MAX_NUMNODES * sizeof(int *));
> +	node_fallback = NULL;
> +}
> +
>  /**
>   * memblock_alloc_internal - allocate boot memory block
>   * @size: size of memory block to be allocated in bytes
> @@ -1370,6 +1411,7 @@ static void * __init memblock_alloc_internal(
>  {
>  	phys_addr_t alloc;
>  	void *ptr;
> +	int node;
>  	enum memblock_flags flags = choose_memblock_flags();
> 
>  	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
> @@ -1397,11 +1439,26 @@ static void * __init memblock_alloc_internal(
>  		goto done;
> 
>  	if (nid != NUMA_NO_NODE) {
> -		alloc = memblock_find_in_range_node(size, align, min_addr,
> -						    max_addr, NUMA_NO_NODE,
> -						    flags);
> -		if (alloc && !memblock_reserve(alloc, size))
> -			goto done;
> +		if (!node_fallback) {
> +			alloc = memblock_find_in_range_node(size, align,
> +					min_addr, max_addr,
> +					NUMA_NO_NODE, flags);
> +			if (alloc && !memblock_reserve(alloc, size))
> +				goto done;
> +		} else {
> +			int i;
> +			for (i = 0; i < num_online_nodes(); i++) {
> +				node = node_fallback[nid][i];
> +				/* fallback list has all memory nodes */
> +				if (node == NUMA_NO_NODE)
> +					break;
> +				alloc = memblock_find_in_range_node(size,
> +						align, min_addr, max_addr,
> +						node, flags);
> +				if (alloc && !memblock_reserve(alloc, size))
> +					goto done;
> +			}
> +		}
>  	}
> 
>  	if (min_addr) {
> @@ -1969,6 +2026,7 @@ unsigned long __init memblock_free_all(void)
> 
>  	reset_all_zones_managed_pages();
> 
> +	memblock_free_node_order();
>  	pages = free_low_memory_core_early();
>  	totalram_pages_add(pages);
> 
> -- 
> 2.7.4
>
Dave Hansen Feb. 26, 2019, 12:37 p.m. UTC | #6
On 2/25/19 9:40 PM, Pingfan Liu wrote:
>> This doesn't get used until patch 6 as far as I can tell.  Was there a
>> reason to define it here?
>>
> Yes, it does not get used until patch 6. Patch 6 has two groups of
> prerequisites, [1-2] and [3-5]. Do you think reordering the patches and
> moving [3-5] ahead of [1-2] is a better choice?

I'd rather that you just introduce the code along with its first user.
Pingfan Liu Feb. 27, 2019, 9:23 a.m. UTC | #7
On Tue, Feb 26, 2019 at 7:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
>
> On Sun, Feb 24, 2019 at 08:34:05PM +0800, Pingfan Liu wrote:
> > There are NUMA machines with memory-less nodes. When allocating memory for
> > a memory-less node, the memblock allocator falls back to 'Node 0' instead of
> > fully utilizing the nearest node. This hurts performance, especially for the
> > per-cpu sections. Suppress this defect by building the full node fallback
> > info for the memblock allocator, like what has already been done for the
> > page allocator.
>
> Is it really necessary to build full node fallback info for memblock and
> then rebuild it again for the page allocator?
>
Do you mean building the full node fallback info once and sharing it between
the memblock and page allocators? If so, then node online/offline is the
corner case that blocks this design.

> I think it should be possible to split parts of build_all_zonelists_init()
> that do not touch per-cpu areas into a separate function and call that
> function after topology detection. Then it would be possible to use
> local_memory_node() when calling memblock.
>
Yes, this is one way, but it may come at the higher cost of changing more
code. I will try it.
Thank you for your suggestion.

Best regards,
Pingfan
> > Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > CC: Thomas Gleixner <tglx@linutronix.de>
> > CC: Ingo Molnar <mingo@redhat.com>
> > CC: Borislav Petkov <bp@alien8.de>
> > CC: "H. Peter Anvin" <hpa@zytor.com>
> > CC: Dave Hansen <dave.hansen@linux.intel.com>
> > CC: Vlastimil Babka <vbabka@suse.cz>
> > CC: Mike Rapoport <rppt@linux.vnet.ibm.com>
> > CC: Andrew Morton <akpm@linux-foundation.org>
> > CC: Mel Gorman <mgorman@suse.de>
> > CC: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > CC: Andy Lutomirski <luto@kernel.org>
> > CC: Andi Kleen <ak@linux.intel.com>
> > CC: Petr Tesarik <ptesarik@suse.cz>
> > CC: Michal Hocko <mhocko@suse.com>
> > CC: Stephen Rothwell <sfr@canb.auug.org.au>
> > CC: Jonathan Corbet <corbet@lwn.net>
> > CC: Nicholas Piggin <npiggin@gmail.com>
> > CC: Daniel Vacek <neelx@redhat.com>
> > CC: linux-kernel@vger.kernel.org
> > ---
> >  include/linux/memblock.h |  3 +++
> >  mm/memblock.c            | 68 ++++++++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 66 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> > index 64c41cf..ee999c5 100644
> > --- a/include/linux/memblock.h
> > +++ b/include/linux/memblock.h
> > @@ -342,6 +342,9 @@ void *memblock_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align,
> >  void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
> >                            phys_addr_t min_addr, phys_addr_t max_addr,
> >                            int nid);
> > > +extern int build_node_order(int *node_order_array, int sz,
> > +     int local_node, nodemask_t *used_mask);
> > +void memblock_build_node_order(void);
> >
> >  static inline void * __init memblock_alloc(phys_addr_t size,  phys_addr_t align)
> >  {
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index 022d4cb..cf78850 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -1338,6 +1338,47 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
> >       return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
> >  }
> >
> > +static int **node_fallback __initdata;
> > +
> > +/*
> > + * build_node_order() relies on cpumask_of_node(), hence arch should set up
> > + * cpumask before calling this func.
> > + */
> > +void __init memblock_build_node_order(void)
> > +{
> > +     int nid, i;
> > +     nodemask_t used_mask;
> > +
> > +     node_fallback = memblock_alloc(MAX_NUMNODES * sizeof(int *),
> > +             sizeof(int *));
> > +     for_each_online_node(nid) {
> > +             node_fallback[nid] = memblock_alloc(
> > +                     num_online_nodes() * sizeof(int), sizeof(int));
> > +             for (i = 0; i < num_online_nodes(); i++)
> > +                     node_fallback[nid][i] = NUMA_NO_NODE;
> > +     }
> > +
> > +     for_each_online_node(nid) {
> > +             nodes_clear(used_mask);
> > +             node_set(nid, used_mask);
> > +             build_node_order(node_fallback[nid], num_online_nodes(),
> > +                     nid, &used_mask);
> > +     }
> > +}
> > +
> > +static void __init memblock_free_node_order(void)
> > +{
> > +     int nid;
> > +
> > +     if (!node_fallback)
> > +             return;
> > +     for_each_online_node(nid)
> > +             memblock_free(__pa(node_fallback[nid]),
> > +                     num_online_nodes() * sizeof(int));
> > +     memblock_free(__pa(node_fallback), MAX_NUMNODES * sizeof(int *));
> > +     node_fallback = NULL;
> > +}
> > +
> >  /**
> >   * memblock_alloc_internal - allocate boot memory block
> >   * @size: size of memory block to be allocated in bytes
> > @@ -1370,6 +1411,7 @@ static void * __init memblock_alloc_internal(
> >  {
> >       phys_addr_t alloc;
> >       void *ptr;
> > +     int node;
> >       enum memblock_flags flags = choose_memblock_flags();
> >
> >       if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
> > @@ -1397,11 +1439,26 @@ static void * __init memblock_alloc_internal(
> >               goto done;
> >
> >       if (nid != NUMA_NO_NODE) {
> > -             alloc = memblock_find_in_range_node(size, align, min_addr,
> > -                                                 max_addr, NUMA_NO_NODE,
> > -                                                 flags);
> > -             if (alloc && !memblock_reserve(alloc, size))
> > -                     goto done;
> > +             if (!node_fallback) {
> > +                     alloc = memblock_find_in_range_node(size, align,
> > +                                     min_addr, max_addr,
> > +                                     NUMA_NO_NODE, flags);
> > +                     if (alloc && !memblock_reserve(alloc, size))
> > +                             goto done;
> > +             } else {
> > +                     int i;
> > +                     for (i = 0; i < num_online_nodes(); i++) {
> > +                             node = node_fallback[nid][i];
> > +                             /* fallback list has all memory nodes */
> > +                             if (node == NUMA_NO_NODE)
> > +                                     break;
> > +                             alloc = memblock_find_in_range_node(size,
> > +                                             align, min_addr, max_addr,
> > +                                             node, flags);
> > +                             if (alloc && !memblock_reserve(alloc, size))
> > +                                     goto done;
> > +                     }
> > +             }
> >       }
> >
> >       if (min_addr) {
> > @@ -1969,6 +2026,7 @@ unsigned long __init memblock_free_all(void)
> >
> >       reset_all_zones_managed_pages();
> >
> > +     memblock_free_node_order();
> >       pages = free_low_memory_core_early();
> >       totalram_pages_add(pages);
> >
> > --
> > 2.7.4
> >
>
> --
> Sincerely yours,
> Mike.
>

Patch

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 64c41cf..ee999c5 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -342,6 +342,9 @@  void *memblock_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align,
 void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
 			     phys_addr_t min_addr, phys_addr_t max_addr,
 			     int nid);
+extern int build_node_order(int *node_order_array, int sz,
+	int local_node, nodemask_t *used_mask);
+void memblock_build_node_order(void);
 
 static inline void * __init memblock_alloc(phys_addr_t size,  phys_addr_t align)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index 022d4cb..cf78850 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1338,6 +1338,47 @@  phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
 	return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 }
 
+static int **node_fallback __initdata;
+
+/*
+ * build_node_order() relies on cpumask_of_node(), hence arch should set up
+ * cpumask before calling this func.
+ */
+void __init memblock_build_node_order(void)
+{
+	int nid, i;
+	nodemask_t used_mask;
+
+	node_fallback = memblock_alloc(MAX_NUMNODES * sizeof(int *),
+		sizeof(int *));
+	for_each_online_node(nid) {
+		node_fallback[nid] = memblock_alloc(
+			num_online_nodes() * sizeof(int), sizeof(int));
+		for (i = 0; i < num_online_nodes(); i++)
+			node_fallback[nid][i] = NUMA_NO_NODE;
+	}
+
+	for_each_online_node(nid) {
+		nodes_clear(used_mask);
+		node_set(nid, used_mask);
+		build_node_order(node_fallback[nid], num_online_nodes(),
+			nid, &used_mask);
+	}
+}
+
+static void __init memblock_free_node_order(void)
+{
+	int nid;
+
+	if (!node_fallback)
+		return;
+	for_each_online_node(nid)
+		memblock_free(__pa(node_fallback[nid]),
+			num_online_nodes() * sizeof(int));
+	memblock_free(__pa(node_fallback), MAX_NUMNODES * sizeof(int *));
+	node_fallback = NULL;
+}
+
 /**
  * memblock_alloc_internal - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
@@ -1370,6 +1411,7 @@  static void * __init memblock_alloc_internal(
 {
 	phys_addr_t alloc;
 	void *ptr;
+	int node;
 	enum memblock_flags flags = choose_memblock_flags();
 
 	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
@@ -1397,11 +1439,26 @@  static void * __init memblock_alloc_internal(
 		goto done;
 
 	if (nid != NUMA_NO_NODE) {
-		alloc = memblock_find_in_range_node(size, align, min_addr,
-						    max_addr, NUMA_NO_NODE,
-						    flags);
-		if (alloc && !memblock_reserve(alloc, size))
-			goto done;
+		if (!node_fallback) {
+			alloc = memblock_find_in_range_node(size, align,
+					min_addr, max_addr,
+					NUMA_NO_NODE, flags);
+			if (alloc && !memblock_reserve(alloc, size))
+				goto done;
+		} else {
+			int i;
+			for (i = 0; i < num_online_nodes(); i++) {
+				node = node_fallback[nid][i];
+				/* fallback list has all memory nodes */
+				if (node == NUMA_NO_NODE)
+					break;
+				alloc = memblock_find_in_range_node(size,
+						align, min_addr, max_addr,
+						node, flags);
+				if (alloc && !memblock_reserve(alloc, size))
+					goto done;
+			}
+		}
 	}
 
 	if (min_addr) {
@@ -1969,6 +2026,7 @@  unsigned long __init memblock_free_all(void)
 
 	reset_all_zones_managed_pages();
 
+	memblock_free_node_order();
 	pages = free_low_memory_core_early();
 	totalram_pages_add(pages);