diff mbox

[RFC,v2,10/25] x86: NUMA: Move numa code and make it generic

Message ID 1490716413-19796-11-git-send-email-vijay.kilari@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Vijay Kilari March 28, 2017, 3:53 p.m. UTC
From: Vijaya Kumar K <Vijaya.Kumar@cavium.com>

Move code from xen/arch/x86/numa.c to xen/common/numa.c
so that it can be used by other archs.
A few generic static functions in x86/numa.c are made
non-static in common/numa.c.

The generic contents of header file asm-x86/numa.h
are moved to xen/numa.h.

Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com>
---
 xen/arch/x86/numa.c        | 456 ------------------------------------------
 xen/arch/x86/srat.c        |   7 +
 xen/common/Makefile        |   1 +
 xen/common/numa.c          | 488 +++++++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/numa.h |  15 --
 xen/include/xen/numa.h     |  18 ++
 6 files changed, 514 insertions(+), 471 deletions(-)

Comments

Julien Grall May 8, 2017, 4:41 p.m. UTC | #1
Hi Vijay,

On 28/03/17 16:53, vijay.kilari@gmail.com wrote:
> diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
> index 3bdab9a..33c6806 100644
> --- a/xen/arch/x86/numa.c
> +++ b/xen/arch/x86/numa.c
> @@ -10,286 +10,13 @@
>  #include <xen/ctype.h>
>  #include <xen/nodemask.h>
>  #include <xen/numa.h>
> -#include <xen/keyhandler.h>
>  #include <xen/time.h>
>  #include <xen/smp.h>
>  #include <xen/pfn.h>
>  #include <asm/acpi.h>
> -#include <xen/sched.h>
> -#include <xen/softirq.h>
> -
> -static int numa_setup(char *s);
> -custom_param("numa", numa_setup);
> -
> -struct node_data node_data[MAX_NUMNODES];
> -
> -/* Mapping from pdx to node id */
> -unsigned int memnode_shift;
> -static typeof(*memnodemap) _memnodemap[64];
> -unsigned long memnodemapsize;
> -uint8_t *memnodemap;
> -
> -nodeid_t __read_mostly cpu_to_node[NR_CPUS] = {
> -    [0 ... NR_CPUS-1] = NUMA_NO_NODE
> -};
> -/*
> - * Keep BIOS's CPU2node information, should not be used for memory allocaion
> - */
> -nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
> -    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
> -};

Why is this moved in this patch from here to x86/srat.c?

[...]

> diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
> index 7cf4771..2cc87a3 100644
> --- a/xen/arch/x86/srat.c
> +++ b/xen/arch/x86/srat.c
> @@ -27,6 +27,13 @@ static nodemask_t __initdata memory_nodes_parsed;
>  static nodemask_t __initdata processor_nodes_parsed;
>  static struct node __initdata nodes[MAX_NUMNODES];
>
> +/*
> + * Keep BIOS's CPU2node information, should not be used for memory allocaion
> + */
> +nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
> +    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
> +};
> +

This does not belong to this patch...

>  struct pxm2node {
>  	unsigned int pxm;
>  	nodeid_t node;

[...]

> diff --git a/xen/common/numa.c b/xen/common/numa.c
> new file mode 100644
> index 0000000..207ebd8
> --- /dev/null
> +++ b/xen/common/numa.c
> @@ -0,0 +1,488 @@
> +/*
> + * Common NUMA handling functions for x86 and arm.
> + * Original code extracted from arch/x86/numa.c
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms and conditions of the GNU General Public
> + * License, version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <xen/mm.h>
> +#include <xen/string.h>
> +#include <xen/init.h>
> +#include <xen/ctype.h>
> +#include <xen/nodemask.h>
> +#include <xen/numa.h>
> +#include <xen/keyhandler.h>
> +#include <xen/time.h>
> +#include <xen/smp.h>
> +#include <xen/pfn.h>
> +#include <asm/acpi.h>
> +#include <xen/sched.h>
> +#include <xen/softirq.h>

Whilst you are moving this into a new file, please order the includes.

[...]

> +static unsigned int __init extract_lsb_from_nodes(const struct node *nodes,
> +                                                  int numnodes)
> +{
> +    unsigned int i, nodes_used = 0;
> +    unsigned long spdx, epdx;
> +    unsigned long bitfield = 0, memtop = 0;
> +
> +    for ( i = 0; i < numnodes; i++ )
> +    {
> +        spdx = paddr_to_pdx(nodes[i].start);
> +        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
> +        if ( spdx >= epdx )
> +            continue;
> +        bitfield |= spdx;
> +        nodes_used++;
> +        if ( epdx > memtop )
> +            memtop = epdx;
> +    }
> +    if ( nodes_used <= 1 )
> +        i = BITS_PER_LONG - 1;
> +    else
> +        i = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
> +

It is interesting to see that a newline was added in the process of moving
the code.

> +    memnodemapsize = (memtop >> i) + 1;

[....]

> diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
> index 922fbd8..eed40af 100644
> --- a/xen/include/xen/numa.h
> +++ b/xen/include/xen/numa.h
> @@ -14,6 +14,21 @@
>
>  #define MAX_NUMNODES    (1 << NODES_SHIFT)
>
> +struct node {
> +    paddr_t start;
> +    paddr_t end;
> +};
> +
> +extern int compute_memnode_shift(struct node *nodes, int numnodes,
> +                                 nodeid_t *nodeids, unsigned int *shift);
> +extern void numa_init_array(void);
> +extern bool_t srat_disabled(void);
> +extern void numa_set_node(int cpu, nodeid_t node);
> +extern nodeid_t acpi_setup_node(unsigned int pxm);
> +extern void srat_detect_node(int cpu);
> +extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
> +extern void init_cpu_to_node(void);

Can you please be consistent with this file and drop the unnecessary
"extern".

> +
>  #define vcpu_to_node(v) (cpu_to_node((v)->processor))
>
>  #define domain_to_node(d) \
> @@ -23,4 +38,7 @@
>  bool is_numa_off(void);
>  bool get_acpi_numa(void);
>  void set_acpi_numa(bool val);
> +int get_numa_fake(void);
> +extern int numa_emulation(uint64_t start_pfn, uint64_t end_pfn);
> +extern void numa_dummy_init(uint64_t start_pfn, uint64_t end_pfn);

Ditto.

>  #endif /* _XEN_NUMA_H */
>
Julien Grall May 8, 2017, 4:51 p.m. UTC | #2
On 28/03/17 16:53, vijay.kilari@gmail.com wrote:
> diff --git a/xen/common/numa.c b/xen/common/numa.c
> new file mode 100644
> index 0000000..207ebd8
> --- /dev/null
> +++ b/xen/common/numa.c
> @@ -0,0 +1,488 @@
> +/*
> + * Common NUMA handling functions for x86 and arm.
> + * Original code extracted from arch/x86/numa.c
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms and conditions of the GNU General Public
> + * License, version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <xen/mm.h>
> +#include <xen/string.h>
> +#include <xen/init.h>
> +#include <xen/ctype.h>
> +#include <xen/nodemask.h>
> +#include <xen/numa.h>
> +#include <xen/keyhandler.h>
> +#include <xen/time.h>
> +#include <xen/smp.h>
> +#include <xen/pfn.h>
> +#include <asm/acpi.h>
> +#include <xen/sched.h>
> +#include <xen/softirq.h>
> +
> +static int numa_setup(char *s);
> +custom_param("numa", numa_setup);
> +
> +struct node_data node_data[MAX_NUMNODES];
> +
> +/* Mapping from pdx to node id */
> +unsigned int memnode_shift;
> +static typeof(*memnodemap) _memnodemap[64];

Also, you move the hardcoded 64 here. But have you checked it is valid 
for ARM?

Regardless of that, this sounds like something that should be turned into a
define and requires a comment.

Cheers,
Vijay Kilari May 9, 2017, 7:36 a.m. UTC | #3
On Mon, May 8, 2017 at 10:11 PM, Julien Grall <julien.grall@arm.com> wrote:
> Hi Vijay,
>
>
> On 28/03/17 16:53, vijay.kilari@gmail.com wrote:
>>
>> diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
>> index 3bdab9a..33c6806 100644
>> --- a/xen/arch/x86/numa.c
>> +++ b/xen/arch/x86/numa.c
>> @@ -10,286 +10,13 @@
>>  #include <xen/ctype.h>
>>  #include <xen/nodemask.h>
>>  #include <xen/numa.h>
>> -#include <xen/keyhandler.h>
>>  #include <xen/time.h>
>>  #include <xen/smp.h>
>>  #include <xen/pfn.h>
>>  #include <asm/acpi.h>
>> -#include <xen/sched.h>
>> -#include <xen/softirq.h>
>> -
>> -static int numa_setup(char *s);
>> -custom_param("numa", numa_setup);
>> -
>> -struct node_data node_data[MAX_NUMNODES];
>> -
>> -/* Mapping from pdx to node id */
>> -unsigned int memnode_shift;
>> -static typeof(*memnodemap) _memnodemap[64];
>> -unsigned long memnodemapsize;
>> -uint8_t *memnodemap;
>> -
>> -nodeid_t __read_mostly cpu_to_node[NR_CPUS] = {
>> -    [0 ... NR_CPUS-1] = NUMA_NO_NODE
>> -};
>> -/*
>> - * Keep BIOS's CPU2node information, should not be used for memory
>> allocaion
>> - */
>> -nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
>> -    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>> -};
>
>
> Why this is moved in this patch from here to x86/srat.c?

This is x86 specific. I will make a separate patch for this
move.

>
> [...]
>
>> diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
>> index 7cf4771..2cc87a3 100644
>> --- a/xen/arch/x86/srat.c
>> +++ b/xen/arch/x86/srat.c
>> @@ -27,6 +27,13 @@ static nodemask_t __initdata memory_nodes_parsed;
>>  static nodemask_t __initdata processor_nodes_parsed;
>>  static struct node __initdata nodes[MAX_NUMNODES];
>>
>> +/*
>> + * Keep BIOS's CPU2node information, should not be used for memory
>> allocaion
>> + */
>> +nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
>> +    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>> +};
>> +
>
>
> This does not belong to this patch...
Ok
>
>>  struct pxm2node {
>>         unsigned int pxm;
>>         nodeid_t node;
>
>
> [...]
>
>
>> diff --git a/xen/common/numa.c b/xen/common/numa.c
>> new file mode 100644
>> index 0000000..207ebd8
>> --- /dev/null
>> +++ b/xen/common/numa.c
>> @@ -0,0 +1,488 @@
>> +/*
>> + * Common NUMA handling functions for x86 and arm.
>> + * Original code extracted from arch/x86/numa.c
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms and conditions of the GNU General Public
>> + * License, version 2, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <xen/mm.h>
>> +#include <xen/string.h>
>> +#include <xen/init.h>
>> +#include <xen/ctype.h>
>> +#include <xen/nodemask.h>
>> +#include <xen/numa.h>
>> +#include <xen/keyhandler.h>
>> +#include <xen/time.h>
>> +#include <xen/smp.h>
>> +#include <xen/pfn.h>
>> +#include <asm/acpi.h>
>> +#include <xen/sched.h>
>> +#include <xen/softirq.h>
>
>
> Whilst you are moving this in a newfile, please order the includes.

I understand that you don't like any code changes in a code movement
patch.

>
> [...]
>
>> +static unsigned int __init extract_lsb_from_nodes(const struct node
>> *nodes,
>> +                                                  int numnodes)
>> +{
>> +    unsigned int i, nodes_used = 0;
>> +    unsigned long spdx, epdx;
>> +    unsigned long bitfield = 0, memtop = 0;
>> +
>> +    for ( i = 0; i < numnodes; i++ )
>> +    {
>> +        spdx = paddr_to_pdx(nodes[i].start);
>> +        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
>> +        if ( spdx >= epdx )
>> +            continue;
>> +        bitfield |= spdx;
>> +        nodes_used++;
>> +        if ( epdx > memtop )
>> +            memtop = epdx;
>> +    }
>> +    if ( nodes_used <= 1 )
>> +        i = BITS_PER_LONG - 1;
>> +    else
>> +        i = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
>> +
>
>
> It is interesting to see that newline was added in the process of moving the
> code.

OK.
>
>> +    memnodemapsize = (memtop >> i) + 1;
>
>
> [....]
>
>> diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
>> index 922fbd8..eed40af 100644
>> --- a/xen/include/xen/numa.h
>> +++ b/xen/include/xen/numa.h
>> @@ -14,6 +14,21 @@
>>
>>  #define MAX_NUMNODES    (1 << NODES_SHIFT)
>>
>> +struct node {
>> +    paddr_t start;
>> +    paddr_t end;
>> +};
>> +
>> +extern int compute_memnode_shift(struct node *nodes, int numnodes,
>> +                                 nodeid_t *nodeids, unsigned int *shift);
>> +extern void numa_init_array(void);
>> +extern bool_t srat_disabled(void);
>> +extern void numa_set_node(int cpu, nodeid_t node);
>> +extern nodeid_t acpi_setup_node(unsigned int pxm);
>> +extern void srat_detect_node(int cpu);
>> +extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t
>> end);
>> +extern void init_cpu_to_node(void);
>
>
> Can you please be consistent with this file and drop the unecessary
> "extern".

I see that none of the externs are required here. I will drop them.

>
>> +
>>  #define vcpu_to_node(v) (cpu_to_node((v)->processor))
>>
>>  #define domain_to_node(d) \
>> @@ -23,4 +38,7 @@
>>  bool is_numa_off(void);
>>  bool get_acpi_numa(void);
>>  void set_acpi_numa(bool val);
>> +int get_numa_fake(void);
>> +extern int numa_emulation(uint64_t start_pfn, uint64_t end_pfn);
>> +extern void numa_dummy_init(uint64_t start_pfn, uint64_t end_pfn);
>
>
> Ditto.
>
>
>>  #endif /* _XEN_NUMA_H */
>>
>
> --
> Julien Grall
Vijay Kilari May 9, 2017, 7:39 a.m. UTC | #4
On Mon, May 8, 2017 at 10:21 PM, Julien Grall <julien.grall@arm.com> wrote:
> On 28/03/17 16:53, vijay.kilari@gmail.com wrote:
>>
>> diff --git a/xen/common/numa.c b/xen/common/numa.c
>> new file mode 100644
>> index 0000000..207ebd8
>> --- /dev/null
>> +++ b/xen/common/numa.c
>> @@ -0,0 +1,488 @@
>> +/*
>> + * Common NUMA handling functions for x86 and arm.
>> + * Original code extracted from arch/x86/numa.c
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms and conditions of the GNU General Public
>> + * License, version 2, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <xen/mm.h>
>> +#include <xen/string.h>
>> +#include <xen/init.h>
>> +#include <xen/ctype.h>
>> +#include <xen/nodemask.h>
>> +#include <xen/numa.h>
>> +#include <xen/keyhandler.h>
>> +#include <xen/time.h>
>> +#include <xen/smp.h>
>> +#include <xen/pfn.h>
>> +#include <asm/acpi.h>
>> +#include <xen/sched.h>
>> +#include <xen/softirq.h>
>> +
>> +static int numa_setup(char *s);
>> +custom_param("numa", numa_setup);
>> +
>> +struct node_data node_data[MAX_NUMNODES];
>> +
>> +/* Mapping from pdx to node id */
>> +unsigned int memnode_shift;
>> +static typeof(*memnodemap) _memnodemap[64];
>
>
> Also, you move the hardcoded 64 here. But have you checked it is valid for
> ARM?
>
> Regardless that, this sounds like something that should be turned into a
> define and require a comment.

64 is good enough. This _memnodemap is used when NUMA has failed or is off,
in which case memnode_shift is 63 (BITS_PER_LONG - 1).

So all the phys_to_nid() conversions will index within the limits of _memnodemap[].

>
> Cheers,
>
> --
> Julien Grall
Julien Grall May 9, 2017, 8:23 a.m. UTC | #5
On 05/09/2017 08:36 AM, Vijay Kilari wrote:
> On Mon, May 8, 2017 at 10:11 PM, Julien Grall <julien.grall@arm.com> wrote:
>> Hi Vijay,
>>
>>
>> On 28/03/17 16:53, vijay.kilari@gmail.com wrote:
>>>
>>> diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
>>> index 3bdab9a..33c6806 100644
>>> --- a/xen/arch/x86/numa.c
>>> +++ b/xen/arch/x86/numa.c
>>> @@ -10,286 +10,13 @@
>>>  #include <xen/ctype.h>
>>>  #include <xen/nodemask.h>
>>>  #include <xen/numa.h>
>>> -#include <xen/keyhandler.h>
>>>  #include <xen/time.h>
>>>  #include <xen/smp.h>
>>>  #include <xen/pfn.h>
>>>  #include <asm/acpi.h>
>>> -#include <xen/sched.h>
>>> -#include <xen/softirq.h>
>>> -
>>> -static int numa_setup(char *s);
>>> -custom_param("numa", numa_setup);
>>> -
>>> -struct node_data node_data[MAX_NUMNODES];
>>> -
>>> -/* Mapping from pdx to node id */
>>> -unsigned int memnode_shift;
>>> -static typeof(*memnodemap) _memnodemap[64];
>>> -unsigned long memnodemapsize;
>>> -uint8_t *memnodemap;
>>> -
>>> -nodeid_t __read_mostly cpu_to_node[NR_CPUS] = {
>>> -    [0 ... NR_CPUS-1] = NUMA_NO_NODE
>>> -};
>>> -/*
>>> - * Keep BIOS's CPU2node information, should not be used for memory
>>> allocaion
>>> - */
>>> -nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
>>> -    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>>> -};
>>
>>
>> Why this is moved in this patch from here to x86/srat.c?
>
> This is x86 specific. I will make a separate patch for this
> move.

But x86/numa.c is x86 specific.... So why do you move it????

>
>>
>> [...]
>>
>>> diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
>>> index 7cf4771..2cc87a3 100644
>>> --- a/xen/arch/x86/srat.c
>>> +++ b/xen/arch/x86/srat.c
>>> @@ -27,6 +27,13 @@ static nodemask_t __initdata memory_nodes_parsed;
>>>  static nodemask_t __initdata processor_nodes_parsed;
>>>  static struct node __initdata nodes[MAX_NUMNODES];
>>>
>>> +/*
>>> + * Keep BIOS's CPU2node information, should not be used for memory
>>> allocaion
>>> + */
>>> +nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
>>> +    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>>> +};
>>> +
>>
>>
>> This does not belong to this patch...
> Ok
>>
>>>  struct pxm2node {
>>>         unsigned int pxm;
>>>         nodeid_t node;
>>
>>
>> [...]
>>
>>
>>> diff --git a/xen/common/numa.c b/xen/common/numa.c
>>> new file mode 100644
>>> index 0000000..207ebd8
>>> --- /dev/null
>>> +++ b/xen/common/numa.c
>>> @@ -0,0 +1,488 @@
>>> +/*
>>> + * Common NUMA handling functions for x86 and arm.
>>> + * Original code extracted from arch/x86/numa.c
>>> + *
>>> + * This program is free software; you can redistribute it and/or
>>> + * modify it under the terms and conditions of the GNU General Public
>>> + * License, version 2, as published by the Free Software Foundation.
>>> + *
>>> + * This program is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> + * GNU General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU General Public License
>>> + * along with this program; If not, see <http://www.gnu.org/licenses/>.
>>> + */
>>> +
>>> +#include <xen/mm.h>
>>> +#include <xen/string.h>
>>> +#include <xen/init.h>
>>> +#include <xen/ctype.h>
>>> +#include <xen/nodemask.h>
>>> +#include <xen/numa.h>
>>> +#include <xen/keyhandler.h>
>>> +#include <xen/time.h>
>>> +#include <xen/smp.h>
>>> +#include <xen/pfn.h>
>>> +#include <asm/acpi.h>
>>> +#include <xen/sched.h>
>>> +#include <xen/softirq.h>
>>
>>
>> Whilst you are moving this in a newfile, please order the includes.
>
> I understand that you don't like any code changes in code movement
> patch.

Are you saying you blindly copied the headers without even checking they 
are necessary?

Surely, you only added the ones necessary, which means it would be OK to
sort them, as a missing one would be caught by compilation.

Cheers,
Julien Grall May 9, 2017, 8:26 a.m. UTC | #6
On 05/09/2017 08:39 AM, Vijay Kilari wrote:
> On Mon, May 8, 2017 at 10:21 PM, Julien Grall <julien.grall@arm.com> wrote:
>> On 28/03/17 16:53, vijay.kilari@gmail.com wrote:
>>> +static int numa_setup(char *s);
>>> +custom_param("numa", numa_setup);
>>> +
>>> +struct node_data node_data[MAX_NUMNODES];
>>> +
>>> +/* Mapping from pdx to node id */
>>> +unsigned int memnode_shift;
>>> +static typeof(*memnodemap) _memnodemap[64];
>>
>>
>> Also, you move the hardcoded 64 here. But have you checked it is valid for
>> ARM?
>>
>> Regardless that, this sounds like something that should be turned into a
>> define and require a comment.
>
> 64 is good enough. This _memnodemap is used in case of NUMA failed or off,
> in which case memnode_shift is 63 (BITS_PER_LONG -1).

If it is based on BITS_PER_LONG, then you should use BITS_PER_LONG (via a
proper define) rather than hardcoding it.
diff mbox

Patch

diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
index 3bdab9a..33c6806 100644
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -10,286 +10,13 @@ 
 #include <xen/ctype.h>
 #include <xen/nodemask.h>
 #include <xen/numa.h>
-#include <xen/keyhandler.h>
 #include <xen/time.h>
 #include <xen/smp.h>
 #include <xen/pfn.h>
 #include <asm/acpi.h>
-#include <xen/sched.h>
-#include <xen/softirq.h>
-
-static int numa_setup(char *s);
-custom_param("numa", numa_setup);
-
-struct node_data node_data[MAX_NUMNODES];
-
-/* Mapping from pdx to node id */
-unsigned int memnode_shift;
-static typeof(*memnodemap) _memnodemap[64];
-unsigned long memnodemapsize;
-uint8_t *memnodemap;
-
-nodeid_t __read_mostly cpu_to_node[NR_CPUS] = {
-    [0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
-/*
- * Keep BIOS's CPU2node information, should not be used for memory allocaion
- */
-nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
-    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-cpumask_t __read_mostly node_to_cpumask[MAX_NUMNODES];
 
 nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
 
-static bool numa_off = 0;
-static bool acpi_numa = 1;
-
-bool is_numa_off(void)
-{
-    return numa_off;
-}
-
-bool get_acpi_numa(void)
-{
-    return acpi_numa;
-}
-
-void set_acpi_numa(bool_t val)
-{
-    acpi_numa = val;
-}
-
-bool srat_disabled(void)
-{
-    return numa_off || acpi_numa == 0;
-}
-
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 0 if OK
- * -EINVAL if memnodmap[] too small (of shift too small)
- * OR if node overlap or lost ram (shift too big)
- */
-static int __init populate_memnodemap(const struct node *nodes, int numnodes,
-                                      unsigned int shift, nodeid_t *nodeids)
-{
-    unsigned long spdx, epdx;
-    int i, res = -EINVAL;
-
-    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
-    for ( i = 0; i < numnodes; i++ )
-    {
-        spdx = paddr_to_pdx(nodes[i].start);
-        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
-        if ( spdx >= epdx )
-            continue;
-        if ( (epdx >> shift) >= memnodemapsize )
-            return 0;
-        do {
-            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
-                return -EINVAL;
-
-            if ( !nodeids )
-                memnodemap[spdx >> shift] = i;
-            else
-                memnodemap[spdx >> shift] = nodeids[i];
-
-            spdx += (1UL << shift);
-        } while ( spdx < epdx );
-        res = 0;
-    }
-
-    return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
-    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
-    unsigned long mfn = alloc_boot_pages(size, 1);
-
-    if ( !mfn )
-    {
-        printk(KERN_ERR
-               "NUMA: Unable to allocate Memory to Node hash map\n");
-        memnodemapsize = 0;
-        return -ENOMEM;
-    }
-
-    memnodemap = mfn_to_virt(mfn);
-    mfn <<= PAGE_SHIFT;
-    size <<= PAGE_SHIFT;
-    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
-           mfn, mfn + size);
-    memnodemapsize = size / sizeof(*memnodemap);
-
-    return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static unsigned int __init extract_lsb_from_nodes(const struct node *nodes,
-                                                  int numnodes)
-{
-    unsigned int i, nodes_used = 0;
-    unsigned long spdx, epdx;
-    unsigned long bitfield = 0, memtop = 0;
-
-    for ( i = 0; i < numnodes; i++ )
-    {
-        spdx = paddr_to_pdx(nodes[i].start);
-        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
-        if ( spdx >= epdx )
-            continue;
-        bitfield |= spdx;
-        nodes_used++;
-        if ( epdx > memtop )
-            memtop = epdx;
-    }
-    if ( nodes_used <= 1 )
-        i = BITS_PER_LONG - 1;
-    else
-        i = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
-    memnodemapsize = (memtop >> i) + 1;
-
-    return i;
-}
-
-int __init compute_memnode_shift(struct node *nodes, int numnodes,
-                                 nodeid_t *nodeids, unsigned int *shift)
-{
-    *shift = extract_lsb_from_nodes(nodes, numnodes);
-
-    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
-        memnodemap = _memnodemap;
-    else if ( allocate_cachealigned_memnodemap() )
-        return -ENOMEM;
-
-    printk(KERN_DEBUG "NUMA: Using %u for the hash shift.\n", *shift);
-
-    if ( populate_memnodemap(nodes, numnodes, *shift, nodeids) )
-    {
-        printk(KERN_INFO "Your memory is not aligned you need to "
-               "rebuild your hypervisor with a bigger NODEMAPSIZE "
-               "shift=%u\n", *shift);
-        return -EINVAL;
-    }
-
-    return 0;
-}
-/* initialize NODE_DATA given nodeid and start/end */
-void __init setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end)
-{
-    unsigned long start_pfn, end_pfn;
-
-    start_pfn = start >> PAGE_SHIFT;
-    end_pfn = end >> PAGE_SHIFT;
-
-    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
-    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
-
-    node_set_online(nodeid);
-}
-
-void __init numa_init_array(void)
-{
-    int rr, i;
-
-    /* There are unfortunately some poorly designed mainboards around
-       that only connect memory to a single CPU. This breaks the 1:1 cpu->node
-       mapping. To avoid this fill in the mapping for all possible
-       CPUs, as the number of CPUs is not known yet.
-       We round robin the existing nodes. */
-    rr = first_node(node_online_map);
-    for ( i = 0; i < nr_cpu_ids; i++ )
-    {
-        if ( cpu_to_node[i] != NUMA_NO_NODE )
-            continue;
-        numa_set_node(i, rr);
-        rr = next_node(rr, node_online_map);
-        if ( rr == MAX_NUMNODES )
-            rr = first_node(node_online_map);
-    }
-}
-
-#ifdef CONFIG_NUMA_EMU
-static int __initdata numa_fake = 0;
-static int get_numa_fake(void)
-{
-    return numa_fake;
-}
-
-/* Numa emulation */
-static int __init numa_emulation(uint64_t start_pfn, uint64_t end_pfn)
-{
-    int i;
-    struct node nodes[MAX_NUMNODES];
-    uint64_t sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / get_numa_fake();
-
-    /* Kludge needed for the hash function */
-    if ( hweight64(sz) > 1 )
-    {
-        uint64_t x = 1;
-        while ( (x << 1) < sz )
-            x <<= 1;
-        if ( x < sz / 2 )
-            printk(KERN_ERR
-                   "Numa emulation unbalanced. Complain to maintainer\n");
-        sz = x;
-    }
-
-    memset(&nodes,0,sizeof(nodes));
-    for ( i = 0; i < get_numa_fake(); i++ )
-    {
-        nodes[i].start = (start_pfn << PAGE_SHIFT) + i * sz;
-        if ( i == get_numa_fake() - 1 )
-            sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
-        nodes[i].end = nodes[i].start + sz;
-        printk(KERN_INFO
-               "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
-               i, nodes[i].start, nodes[i].end,
-               (nodes[i].end - nodes[i].start) >> 20);
-        node_set_online(i);
-    }
-    if ( compute_memnode_shift(nodes, get_numa_fake(), NULL, &memnode_shift) )
-    {
-        memnode_shift = 0;
-        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
-        return -1;
-    }
-    for_each_online_node ( i )
-        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-    numa_init_array();
-
-    return 0;
-}
-#endif
-
-static void __init numa_dummy_init(unsigned long start_pfn, unsigned long end_pfn)
-{
-    int i;
-
-    printk(KERN_INFO "%s\n",
-           is_numa_off() ? "NUMA turned off" : "No NUMA configuration found");
-
-    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
-           (uint64_t)start_pfn << PAGE_SHIFT,
-           (uint64_t)end_pfn << PAGE_SHIFT);
-    /* setup dummy node covering all memory */
-    memnode_shift = BITS_PER_LONG - 1;
-    memnodemap = _memnodemap;
-    nodes_clear(node_online_map);
-    node_set_online(0);
-    for ( i = 0; i < nr_cpu_ids; i++ )
-        numa_set_node(i, 0);
-    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
-    setup_node_bootmem(0, (paddr_t)start_pfn << PAGE_SHIFT,
-                    (paddr_t)end_pfn << PAGE_SHIFT);
-}
-
 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
 #ifdef CONFIG_NUMA_EMU
@@ -306,43 +33,6 @@  void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
     numa_dummy_init(start_pfn, end_pfn);
 }
 
-void numa_add_cpu(int cpu)
-{
-    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
-}
-
-void numa_set_node(int cpu, nodeid_t node)
-{
-    cpu_to_node[cpu] = node;
-}
-
-/* [numa=off] */
-static int __init numa_setup(char *opt)
-{
-    if ( !strncmp(opt,"off",3) )
-        numa_off = 1;
-    if ( !strncmp(opt,"on",2) )
-        numa_off = 0;
-#ifdef CONFIG_NUMA_EMU
-    if ( !strncmp(opt, "fake=", 5) )
-    {
-        numa_off = 0;
-        numa_fake = simple_strtoul(opt+5,NULL,0);
-        if ( numa_fake >= MAX_NUMNODES )
-            numa_fake = MAX_NUMNODES;
-    }
-#endif
-#ifdef CONFIG_ACPI_NUMA
-    if ( !strncmp(opt,"noacpi",6) )
-    {
-        numa_off = 0;
-        acpi_numa = 0;
-    }
-#endif
-
-    return 1;
-}
-
 /*
  * Setup early cpu_to_node.
  *
@@ -391,149 +81,3 @@  unsigned int __init arch_get_dma_bitsize(void)
                  flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1)
                  + PAGE_SHIFT, 32);
 }
-
-static void dump_numa(unsigned char key)
-{
-    s_time_t now = NOW();
-    unsigned int i, j, n;
-    int err;
-    struct domain *d;
-    struct page_info *page;
-    unsigned int page_num_node[MAX_NUMNODES];
-    const struct vnuma_info *vnuma;
-
-    printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
-           (uint32_t)(now >> 32), (uint32_t)now);
-
-    for_each_online_node ( i )
-    {
-        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);
-
-        printk("NODE%u start->%lu size->%lu free->%lu\n",
-               i, node_start_pfn(i), node_spanned_pages(i),
-               avail_node_heap_pages(i));
-        /* sanity check phys_to_nid() */
-        if ( phys_to_nid(pa) != i )
-            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
-                   pa, phys_to_nid(pa), i);
-    }
-
-    j = cpumask_first(&cpu_online_map);
-    n = 0;
-    for_each_online_cpu ( i )
-    {
-        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
-        {
-            if ( n > 1 )
-                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
-            else
-                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
-            j = i;
-            n = 1;
-        }
-        else
-            ++n;
-    }
-    if ( n > 1 )
-        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
-    else
-        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
-
-    rcu_read_lock(&domlist_read_lock);
-
-    printk("Memory location of each domain:\n");
-    for_each_domain ( d )
-    {
-        process_pending_softirqs();
-
-        printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);
-
-        for_each_online_node ( i )
-            page_num_node[i] = 0;
-
-        spin_lock(&d->page_alloc_lock);
-        page_list_for_each(page, &d->page_list)
-        {
-            i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT);
-            page_num_node[i]++;
-        }
-        spin_unlock(&d->page_alloc_lock);
-
-        for_each_online_node ( i )
-            printk("    Node %u: %u\n", i, page_num_node[i]);
-
-        if ( !read_trylock(&d->vnuma_rwlock) )
-            continue;
-
-        if ( !d->vnuma )
-        {
-            read_unlock(&d->vnuma_rwlock);
-            continue;
-        }
-
-        vnuma = d->vnuma;
-        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
-               vnuma->nr_vnodes, d->max_vcpus);
-        for ( i = 0; i < vnuma->nr_vnodes; i++ )
-        {
-            unsigned int start_cpu = ~0U;
-
-            err = snprintf(keyhandler_scratch, 12, "%3u",
-                    vnuma->vnode_to_pnode[i]);
-            if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
-                strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch));
-
-            printk("       %3u: pnode %s,", i, keyhandler_scratch);
-
-            printk(" vcpus ");
-
-            for ( j = 0; j < d->max_vcpus; j++ )
-            {
-                if ( !(j & 0x3f) )
-                    process_pending_softirqs();
-
-                if ( vnuma->vcpu_to_vnode[j] == i )
-                {
-                    if ( start_cpu == ~0U )
-                    {
-                        printk("%d", j);
-                        start_cpu = j;
-                    }
-                }
-                else if ( start_cpu != ~0U )
-                {
-                    if ( j - 1 != start_cpu )
-                        printk("-%d ", j - 1);
-                    else
-                        printk(" ");
-                    start_cpu = ~0U;
-                }
-            }
-
-            if ( start_cpu != ~0U  && start_cpu != j - 1 )
-                printk("-%d", j - 1);
-
-            printk("\n");
-
-            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
-            {
-                if ( vnuma->vmemrange[j].nid == i )
-                    printk("           %016"PRIx64" - %016"PRIx64"\n",
-                           vnuma->vmemrange[j].start,
-                           vnuma->vmemrange[j].end);
-            }
-        }
-
-        read_unlock(&d->vnuma_rwlock);
-    }
-
-    rcu_read_unlock(&domlist_read_lock);
-}
-
-static int __init register_numa_trigger(void)
-{
-    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
-    return 0;
-}
-__initcall(register_numa_trigger);
-
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 7cf4771..2cc87a3 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -27,6 +27,13 @@  static nodemask_t __initdata memory_nodes_parsed;
 static nodemask_t __initdata processor_nodes_parsed;
 static struct node __initdata nodes[MAX_NUMNODES];
 
+/*
+ * Keep BIOS's CPU2node information, should not be used for memory allocation
+ */
+nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
+    [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
 struct pxm2node {
 	unsigned int pxm;
 	nodeid_t node;
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 0fed30b..4b17344 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -63,6 +63,7 @@  obj-y += wait.o
 obj-bin-y += warning.init.o
 obj-$(CONFIG_XENOPROF) += xenoprof.o
 obj-y += xmalloc_tlsf.o
+obj-$(CONFIG_NUMA) += numa.o
 
 obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma unlzo unlz4 earlycpio,$(n).init.o)
 
diff --git a/xen/common/numa.c b/xen/common/numa.c
new file mode 100644
index 0000000..207ebd8
--- /dev/null
+++ b/xen/common/numa.c
@@ -0,0 +1,488 @@ 
+/*
+ * Common NUMA handling functions for x86 and arm.
+ * Original code extracted from arch/x86/numa.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/mm.h>
+#include <xen/string.h>
+#include <xen/init.h>
+#include <xen/ctype.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+#include <xen/keyhandler.h>
+#include <xen/time.h>
+#include <xen/smp.h>
+#include <xen/pfn.h>
+#include <asm/acpi.h>
+#include <xen/sched.h>
+#include <xen/softirq.h>
+
+static int numa_setup(char *s);
+custom_param("numa", numa_setup);
+
+struct node_data node_data[MAX_NUMNODES];
+
+/* Mapping from pdx to node id */
+unsigned int memnode_shift;
+static typeof(*memnodemap) _memnodemap[64];
+unsigned long memnodemapsize;
+uint8_t *memnodemap;
+
+nodeid_t __read_mostly cpu_to_node[NR_CPUS] = {
+    [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+cpumask_t __read_mostly node_to_cpumask[MAX_NUMNODES];
+
+static bool numa_off = 0;
+static bool acpi_numa = 1;
+
+bool is_numa_off(void)
+{
+    return numa_off;
+}
+
+bool get_acpi_numa(void)
+{
+    return acpi_numa;
+}
+
+void set_acpi_numa(bool val)
+{
+    acpi_numa = val; /* backing flag is bool; keep the parameter type in sync */
+}
+
+bool srat_disabled(void)
+{
+    return numa_off || acpi_numa == 0;
+}
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 0 if OK
+ * -EINVAL if memnodemap[] too small (or shift too small)
+ * OR if node overlap or lost ram (shift too big)
+ */
+static int __init populate_memnodemap(const struct node *nodes, int numnodes,
+                                      unsigned int shift, nodeid_t *nodeids)
+{
+    unsigned long spdx, epdx;
+    int i, res = -EINVAL; /* stays -EINVAL if no node covers any memory */
+
+    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
+    for ( i = 0; i < numnodes; i++ )
+    {
+        spdx = paddr_to_pdx(nodes[i].start);
+        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
+        if ( spdx >= epdx ) /* empty node: nothing to map */
+            continue;
+        if ( (epdx >> shift) >= memnodemapsize ) /* map too small for shift */
+            return -EINVAL;
+        do {
+            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE ) /* slot taken */
+                return -EINVAL;
+
+            if ( !nodeids ) /* no id table: use the array index as node id */
+                memnodemap[spdx >> shift] = i;
+            else
+                memnodemap[spdx >> shift] = nodeids[i];
+
+            spdx += (1UL << shift);
+        } while ( spdx < epdx );
+        res = 0;
+    }
+
+    return res;
+}
+
+static int __init allocate_cachealigned_memnodemap(void)
+{
+    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
+    unsigned long mfn = alloc_boot_pages(size, 1);
+
+    if ( !mfn )
+    {
+        printk(KERN_ERR
+               "NUMA: Unable to allocate Memory to Node hash map\n");
+        memnodemapsize = 0;
+        return -ENOMEM;
+    }
+
+    memnodemap = mfn_to_virt(mfn);
+    mfn <<= PAGE_SHIFT;
+    size <<= PAGE_SHIFT;
+    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+           mfn, mfn + size);
+    memnodemapsize = size / sizeof(*memnodemap);
+
+    return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static unsigned int __init extract_lsb_from_nodes(const struct node *nodes,
+                                                  int numnodes)
+{
+    unsigned int i, nodes_used = 0;
+    unsigned long spdx, epdx;
+    unsigned long bitfield = 0, memtop = 0;
+
+    for ( i = 0; i < numnodes; i++ )
+    {
+        spdx = paddr_to_pdx(nodes[i].start);
+        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
+        if ( spdx >= epdx )
+            continue;
+        bitfield |= spdx;
+        nodes_used++;
+        if ( epdx > memtop )
+            memtop = epdx;
+    }
+    if ( nodes_used <= 1 )
+        i = BITS_PER_LONG - 1;
+    else
+        i = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
+
+    memnodemapsize = (memtop >> i) + 1;
+
+    return i;
+}
+
+int __init compute_memnode_shift(struct node *nodes, int numnodes,
+                                 nodeid_t *nodeids, unsigned int *shift)
+{
+    *shift = extract_lsb_from_nodes(nodes, numnodes);
+
+    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
+        memnodemap = _memnodemap;
+    else if ( allocate_cachealigned_memnodemap() )
+        return -ENOMEM;
+
+    printk(KERN_DEBUG "NUMA: Using %u for the hash shift.\n", *shift);
+
+    if ( populate_memnodemap(nodes, numnodes, *shift, nodeids) )
+    {
+        printk(KERN_INFO "Your memory is not aligned you need to "
+               "rebuild your hypervisor with a bigger NODEMAPSIZE "
+               "shift=%u\n", *shift);
+        return -EINVAL;
+    }
+
+    return 0;
+}
+/* initialize NODE_DATA given nodeid and start/end */
+void __init setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end)
+{
+    unsigned long start_pfn, end_pfn;
+
+    start_pfn = start >> PAGE_SHIFT;
+    end_pfn = end >> PAGE_SHIFT;
+
+    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+    node_set_online(nodeid);
+}
+
+void __init numa_init_array(void)
+{
+    int rr, i;
+
+    /* There are unfortunately some poorly designed mainboards around
+       that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+       mapping. To avoid this fill in the mapping for all possible
+       CPUs, as the number of CPUs is not known yet.
+       We round robin the existing nodes. */
+    rr = first_node(node_online_map);
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        if ( cpu_to_node[i] != NUMA_NO_NODE )
+            continue;
+        numa_set_node(i, rr);
+        rr = next_node(rr, node_online_map);
+        if ( rr == MAX_NUMNODES )
+            rr = first_node(node_online_map);
+    }
+}
+
+#ifdef CONFIG_NUMA_EMU
+static int __initdata numa_fake = 0;
+
+int __init get_numa_fake(void)
+{
+    return numa_fake; /* numa_fake is __initdata: only valid during boot */
+}
+
+/* Numa emulation */
+int __init numa_emulation(uint64_t start_pfn, uint64_t end_pfn)
+{
+    int i;
+    struct node nodes[MAX_NUMNODES];
+    uint64_t sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / get_numa_fake();
+
+    /* Kludge needed for the hash function */
+    if ( hweight64(sz) > 1 )
+    {
+        uint64_t x = 1;
+        while ( (x << 1) < sz )
+            x <<= 1;
+        if ( x < sz / 2 )
+            printk(KERN_ERR
+                   "Numa emulation unbalanced. Complain to maintainer\n");
+        sz = x;
+    }
+
+    memset(&nodes,0,sizeof(nodes));
+    for ( i = 0; i < get_numa_fake(); i++ )
+    {
+        nodes[i].start = (start_pfn << PAGE_SHIFT) + i * sz;
+        if ( i == get_numa_fake() - 1 )
+            sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
+        nodes[i].end = nodes[i].start + sz;
+        printk(KERN_INFO
+               "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
+               i, nodes[i].start, nodes[i].end,
+               (nodes[i].end - nodes[i].start) >> 20);
+        node_set_online(i);
+    }
+
+    if ( compute_memnode_shift(nodes, get_numa_fake(), NULL, &memnode_shift) )
+    {
+        memnode_shift = 0;
+        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+        return -1;
+    }
+    for_each_online_node ( i )
+        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+    numa_init_array();
+
+    return 0;
+}
+#endif
+
+void __init numa_dummy_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+    int i;
+
+    printk(KERN_INFO "%s\n",
+           is_numa_off() ? "NUMA turned off" : "No NUMA configuration found");
+
+    printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
+           (uint64_t)start_pfn << PAGE_SHIFT,
+           (uint64_t)end_pfn << PAGE_SHIFT);
+    /* setup dummy node covering all memory */
+    memnode_shift = BITS_PER_LONG - 1;
+    memnodemap = _memnodemap;
+    nodes_clear(node_online_map);
+    node_set_online(0);
+    for ( i = 0; i < nr_cpu_ids; i++ )
+        numa_set_node(i, 0);
+    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
+    setup_node_bootmem(0, (paddr_t)start_pfn << PAGE_SHIFT,
+                    (paddr_t)end_pfn << PAGE_SHIFT);
+}
+
+void numa_add_cpu(int cpu)
+{
+    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+void numa_set_node(int cpu, nodeid_t node)
+{
+    cpu_to_node[cpu] = node;
+}
+
+/* [numa=off] */
+static int __init numa_setup(char *opt)
+{
+    if ( !strncmp(opt,"off",3) )
+        numa_off = 1;
+    if ( !strncmp(opt,"on",2) )
+        numa_off = 0;
+#ifdef CONFIG_NUMA_EMU
+    if ( !strncmp(opt, "fake=", 5) )
+    {
+        numa_off = 0;
+        numa_fake = simple_strtoul(opt+5,NULL,0);
+        if ( numa_fake >= MAX_NUMNODES )
+            numa_fake = MAX_NUMNODES;
+    }
+#endif
+#ifdef CONFIG_ACPI_NUMA
+    if ( !strncmp(opt,"noacpi",6) )
+    {
+        numa_off = 0;
+        acpi_numa = 0;
+    }
+#endif
+
+    return 1;
+}
+
+static void dump_numa(unsigned char key)
+{
+    s_time_t now = NOW();
+    unsigned int i, j, n;
+    int err;
+    struct domain *d;
+    struct page_info *page;
+    unsigned int page_num_node[MAX_NUMNODES];
+    const struct vnuma_info *vnuma;
+
+    printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
+           (uint32_t)(now >> 32), (uint32_t)now);
+
+    for_each_online_node ( i )
+    {
+        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);
+
+        printk("NODE%u start->%lu size->%lu free->%lu\n",
+               i, node_start_pfn(i), node_spanned_pages(i),
+               avail_node_heap_pages(i));
+        /* sanity check phys_to_nid() */
+        if ( phys_to_nid(pa) != i )
+            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
+                   pa, phys_to_nid(pa), i);
+    }
+
+    j = cpumask_first(&cpu_online_map);
+    n = 0;
+    for_each_online_cpu ( i )
+    {
+        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
+        {
+            if ( n > 1 )
+                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
+            else
+                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
+            j = i;
+            n = 1;
+        }
+        else
+            ++n;
+    }
+    if ( n > 1 )
+        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
+    else
+        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
+
+    rcu_read_lock(&domlist_read_lock);
+
+    printk("Memory location of each domain:\n");
+    for_each_domain ( d )
+    {
+        process_pending_softirqs();
+
+        printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);
+
+        for_each_online_node ( i )
+            page_num_node[i] = 0;
+
+        spin_lock(&d->page_alloc_lock);
+        page_list_for_each(page, &d->page_list)
+        {
+            i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT);
+            page_num_node[i]++;
+        }
+        spin_unlock(&d->page_alloc_lock);
+
+        for_each_online_node ( i )
+            printk("    Node %u: %u\n", i, page_num_node[i]);
+
+        if ( !read_trylock(&d->vnuma_rwlock) )
+            continue;
+
+        if ( !d->vnuma )
+        {
+            read_unlock(&d->vnuma_rwlock);
+            continue;
+        }
+
+        vnuma = d->vnuma;
+        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
+               vnuma->nr_vnodes, d->max_vcpus);
+        for ( i = 0; i < vnuma->nr_vnodes; i++ )
+        {
+            unsigned int start_cpu = ~0U;
+
+            err = snprintf(keyhandler_scratch, 12, "%3u",
+                    vnuma->vnode_to_pnode[i]);
+            if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
+                strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch));
+
+            printk("       %3u: pnode %s,", i, keyhandler_scratch);
+
+            printk(" vcpus ");
+
+            for ( j = 0; j < d->max_vcpus; j++ )
+            {
+                if ( !(j & 0x3f) )
+                    process_pending_softirqs();
+
+                if ( vnuma->vcpu_to_vnode[j] == i )
+                {
+                    if ( start_cpu == ~0U )
+                    {
+                        printk("%d", j);
+                        start_cpu = j;
+                    }
+                }
+                else if ( start_cpu != ~0U )
+                {
+                    if ( j - 1 != start_cpu )
+                        printk("-%d ", j - 1);
+                    else
+                        printk(" ");
+                    start_cpu = ~0U;
+                }
+            }
+
+            if ( start_cpu != ~0U  && start_cpu != j - 1 )
+                printk("-%d", j - 1);
+
+            printk("\n");
+
+            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
+            {
+                if ( vnuma->vmemrange[j].nid == i )
+                    printk("           %016"PRIx64" - %016"PRIx64"\n",
+                           vnuma->vmemrange[j].start,
+                           vnuma->vmemrange[j].end);
+            }
+        }
+
+        read_unlock(&d->vnuma_rwlock);
+    }
+
+    rcu_read_unlock(&domlist_read_lock);
+}
+
+static int __init register_numa_trigger(void)
+{
+    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
+    return 0;
+}
+__initcall(register_numa_trigger);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h
index 7237ad1..421e8b7 100644
--- a/xen/include/asm-x86/numa.h
+++ b/xen/include/asm-x86/numa.h
@@ -17,27 +17,12 @@  extern cpumask_t     node_to_cpumask[];
 #define node_to_first_cpu(node)  (__ffs(node_to_cpumask[node]))
 #define node_to_cpumask(node)    (node_to_cpumask[node])
 
-struct node {
-    paddr_t start;
-    paddr_t end;
-};
-
-extern int compute_memnode_shift(struct node *nodes, int numnodes,
-                                 nodeid_t *nodeids, unsigned int *shift);
 extern nodeid_t pxm_to_node(unsigned int pxm);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
 extern void numa_add_cpu(int cpu);
-extern void numa_init_array(void);
-extern bool srat_disabled(void);
-extern void numa_set_node(int cpu, nodeid_t node);
-extern nodeid_t acpi_setup_node(unsigned int pxm);
-extern void srat_detect_node(int cpu);
-
-extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
 extern nodeid_t apicid_to_node[];
-extern void init_cpu_to_node(void);
 
 /* Simple perfect hash to map pdx to node numbers */
 extern unsigned int memnode_shift;
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
index 922fbd8..eed40af 100644
--- a/xen/include/xen/numa.h
+++ b/xen/include/xen/numa.h
@@ -14,6 +14,21 @@ 
 
 #define MAX_NUMNODES    (1 << NODES_SHIFT)
 
+struct node {
+    paddr_t start;
+    paddr_t end;
+};
+
+extern int compute_memnode_shift(struct node *nodes, int numnodes,
+                                 nodeid_t *nodeids, unsigned int *shift);
+extern void numa_init_array(void);
+extern bool srat_disabled(void);
+extern void numa_set_node(int cpu, nodeid_t node);
+extern nodeid_t acpi_setup_node(unsigned int pxm);
+extern void srat_detect_node(int cpu);
+extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
+extern void init_cpu_to_node(void);
+
 #define vcpu_to_node(v) (cpu_to_node((v)->processor))
 
 #define domain_to_node(d) \
@@ -23,4 +38,7 @@ 
 bool is_numa_off(void);
 bool get_acpi_numa(void);
 void set_acpi_numa(bool val);
+int get_numa_fake(void);
+extern int numa_emulation(uint64_t start_pfn, uint64_t end_pfn);
+extern void numa_dummy_init(unsigned long start_pfn, unsigned long end_pfn);
 #endif /* _XEN_NUMA_H */