diff mbox series

[v3,2/6] xen/x86: move generically usable NUMA code from x86 to common

Message ID 20220822025810.2240707-3-wei.chen@arm.com (mailing list archive)
State Superseded
Headers show
Series Device tree based NUMA support for Arm - Part#2 | expand

Commit Message

Wei Chen Aug. 22, 2022, 2:58 a.m. UTC
There are some codes in x86/numa.c can be shared by common
architectures to implememnt NUMA support. Just like some
variables and functions to check and store NUMA memory map.
And some variables and functions to do NUMA initialization.

In this patch, we move them to common/numa.c and xen/numa.h
and use the CONFIG_NUMA to gate them for non-NUMA supported
architectures. As the target header file is Xen-style, so
we trim some spaces and replace tabs for the codes that has
been moved to xen/numa.h at the same time.

As acpi_scan_nodes has been used in a common function, it
doesn't make sense to use acpi_xxx in common code, so we
rename it to numa_scan_nodes in this patch too. After that
if we still use CONFIG_ACPI_NUMA in to gate numa_scan_nodes
in numa_initmem_init, that doesn't make sense. As CONFIG_NUMA
will be selected by CONFIG_ACPI_NUMA for x86. So, we replace
CONFIG_ACPI_NUMA by CONFIG_NUMA to gate numa_scan_nodes.

As arch_numa_disabled has been implememnted for ACPI NUMA,
we can rename srat_disabled to numa_disabled and move it
to common code as well.

Signed-off-by: Wei Chen <wei.chen@arm.com>
---
v2 -> v3:
 1. Remove acpi.h from common/numa.c.
 2. Rename acpi_scan_nodes to numa_scan_nodes.
 3. Replace u8 by uint8_t for memnodemap.
 4. Use unsigned int for memnode_shift and adjust related functions
    (compute_hash_shift, populate_memnodemap) to use correct types for
    return values or parameters.
 5. Use nodeid_t for nodeid and node numbers.
 6. Use __read_mostly and __ro_after_init for appropriate variables.
 7. Adjust the __read_mostly and __initdata location for some variables.
 8. convert from plain int to unsigned for cpuid and other proper variables.
 9. Use __attribute_pure__ instead of __attribute__((pure)).
10. Replace CONFIG_ACPI_NUMA by CONFIG_NUMA in numa_initmem_init.
11. Add const for some functions' parameters.
12. Move srat_disabled to common code with new name numa_disabled.
13. Fix some spaces code-style for numa_emulation.
14. Change from int to unsigned int for numa_fake.
v1 -> v2:
1. New patch in v2.
---
 xen/arch/x86/include/asm/acpi.h  |   1 -
 xen/arch/x86/include/asm/numa.h  |  57 +---
 xen/arch/x86/include/asm/setup.h |   1 -
 xen/arch/x86/numa.c              | 430 +-----------------------------
 xen/arch/x86/smpboot.c           |   2 +-
 xen/arch/x86/srat.c              |   8 +-
 xen/common/Makefile              |   1 +
 xen/common/numa.c                | 440 +++++++++++++++++++++++++++++++
 xen/include/xen/numa.h           |  66 +++++
 9 files changed, 514 insertions(+), 492 deletions(-)
 create mode 100644 xen/common/numa.c

Comments

Jan Beulich Aug. 25, 2022, 10:58 a.m. UTC | #1
On 22.08.2022 04:58, Wei Chen wrote:
> --- /dev/null
> +++ b/xen/common/numa.c
> @@ -0,0 +1,440 @@
> +/*
> + * Generic VM initialization for NUMA setups.
> + * Copyright 2002,2003 Andi Kleen, SuSE Labs.
> + * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
> + */
> +
> +#include <xen/init.h>
> +#include <xen/keyhandler.h>
> +#include <xen/mm.h>
> +#include <xen/nodemask.h>
> +#include <xen/numa.h>
> +#include <xen/param.h>
> +#include <xen/sched.h>
> +#include <xen/softirq.h>
> +
> +struct node_data __ro_after_init node_data[MAX_NUMNODES];
> +
> +/* Mapping from pdx to node id */
> +unsigned int __ro_after_init memnode_shift;
> +unsigned long __ro_after_init memnodemapsize;
> +uint8_t *__ro_after_init memnodemap;
> +static uint8_t __ro_after_init _memnodemap[64];
> +
> +nodeid_t __ro_after_init cpu_to_node[NR_CPUS] = {

I don't think this can be __ro_after_init, or you'll break CPU
hotplug.

> +    [0 ... NR_CPUS-1] = NUMA_NO_NODE
> +};
> +
> +cpumask_t __ro_after_init node_to_cpumask[MAX_NUMNODES];

Same here.

> +nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
> +
> +bool __read_mostly numa_off;

This, otoh, can be, or have I missed a place where it's written by a
non-__init function?

> +bool numa_disabled(void)
> +{
> +    return numa_off || arch_numa_disabled(false);
> +}
> +
> +/*
> + * Given a shift value, try to populate memnodemap[]
> + * Returns :
> + * 1 if OK
> + * 0 if memnodmap[] too small (of shift too small)
> + * -1 if node overlap or lost ram (shift too big)
> + */
> +static int __init populate_memnodemap(const struct node *nodes,
> +                                      nodeid_t numnodes, unsigned int shift,

I don't think you can use nodeid_t for a variable holding a node count.
Think of what would happen if there were 256 nodes, the IDs of which
all fit in nodeid_t. (Same again further down.)

> +                                      nodeid_t *nodeids)
> +{
> +    unsigned long spdx, epdx;
> +    nodeid_t i;

This is likely inefficient for a loop counter variable. Note how you
use "unsigned int" in e.g. extract_lsb_from_nodes().

> +unsigned int __init compute_hash_shift(const struct node *nodes,
> +                                       nodeid_t numnodes, nodeid_t *nodeids)
> +{
> +    unsigned int shift;
> +
> +    shift = extract_lsb_from_nodes(nodes, numnodes);
> +    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
> +        memnodemap = _memnodemap;
> +    else if ( allocate_cachealigned_memnodemap() )
> +        return -1;

With this the function can't very well have "unsigned int" return type.

> +void __init numa_init_array(void)
> +{
> +    int rr, i;

"unsigned int" for i and perhaps nodeid_t for rr?

> +static int __init numa_emulation(unsigned long start_pfn,
> +                                 unsigned long end_pfn)
> +{
> +    unsigned int i;
> +    struct node nodes[MAX_NUMNODES];
> +    uint64_t sz = pfn_to_paddr(end_pfn - start_pfn) / numa_fake;
> +
> +    /* Kludge needed for the hash function */
> +    if ( hweight64(sz) > 1 )
> +    {
> +        u64 x = 1;

uint64_t and a blank line between declaration(s) and statement(s)
please.

> +        while ( (x << 1) < sz )
> +            x <<= 1;
> +        if ( x < sz / 2 )
> +            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
> +        sz = x;
> +    }
> +
> +    memset(&nodes, 0, sizeof(nodes));
> +    for ( i = 0; i < numa_fake; i++ )
> +    {
> +        nodes[i].start = pfn_to_paddr(start_pfn) + i * sz;
> +        if ( i == numa_fake - 1 )
> +            sz = pfn_to_paddr(end_pfn) - nodes[i].start;
> +        nodes[i].end = nodes[i].start + sz;
> +        printk(KERN_INFO "Faking node %u at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
> +               i, nodes[i].start, nodes[i].end,
> +               (nodes[i].end - nodes[i].start) >> 20);
> +        node_set_online(i);
> +    }
> +    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> +    if ( memnode_shift < 0 )

Does the compiler not warn here, comparing an unsigned value for being
negative?

> --- a/xen/include/xen/numa.h
> +++ b/xen/include/xen/numa.h
> @@ -18,4 +18,70 @@
>    (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \
>     ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE)
>  
> +/* The following content can be used when NUMA feature is enabled */
> +#ifdef CONFIG_NUMA
> +
> +extern nodeid_t      cpu_to_node[NR_CPUS];
> +extern cpumask_t     node_to_cpumask[];
> +
> +#define cpu_to_node(cpu)        (cpu_to_node[cpu])
> +#define parent_node(node)       (node)
> +#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node]))
> +#define node_to_cpumask(node)   (node_to_cpumask[node])

Please could you take the opportunity and drop unnecessary parentheses
from here? Afaict only parent_node() need them to be kept.

> +struct node {
> +    paddr_t start, end;
> +};
> +
> +extern unsigned int compute_hash_shift(const struct node *nodes,
> +                                       nodeid_t numnodes, nodeid_t *nodeids);
> +
> +#define VIRTUAL_BUG_ON(x)
> +
> +extern bool numa_off;
> +extern void numa_add_cpu(unsigned int cpu);

Please can you have variable and function declarations visually separated,
by adding a blank line between them?

> +extern void numa_init_array(void);
> +extern void numa_set_node(unsigned int cpu, nodeid_t node);
> +extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
> +extern int  numa_scan_nodes(paddr_t start, paddr_t end);
> +
> +extern int arch_numa_setup(const char *opt);
> +extern bool arch_numa_disabled(bool init_as_disable);
> +extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
> +
> +static inline void clear_node_cpumask(unsigned int cpu)
> +{
> +    cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
> +}
> +
> +/* Simple perfect hash to map pdx to node numbers */
> +extern unsigned int memnode_shift;
> +extern unsigned long memnodemapsize;
> +extern uint8_t *memnodemap;
> +
> +struct node_data {
> +    unsigned long node_start_pfn;
> +    unsigned long node_spanned_pages;
> +};
> +
> +extern struct node_data node_data[];
> +
> +static inline __attribute_pure__ nodeid_t phys_to_nid(paddr_t addr)

Nit: The conventional place for attributes is between return type
and function (or object) name.

> +{
> +    nodeid_t nid;
> +    VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
> +    nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
> +    VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
> +    return nid;
> +}
> +
> +#define NODE_DATA(nid)          (&(node_data[nid]))

Again please take the opportunity and drop the unnecessary inner
parentheses.

> +#define node_start_pfn(nid)     (NODE_DATA(nid)->node_start_pfn)
> +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
> +#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
> +                                NODE_DATA(nid)->node_spanned_pages)

Pleae correct indentation here - it was correct originally (except
for the fact that it was using hard tabs).

Jan
Wei Chen Aug. 29, 2022, 9:49 a.m. UTC | #2
Hi Jan,

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 2022年8月25日 18:58
> To: Wei Chen <Wei.Chen@arm.com>
> Cc: nd <nd@arm.com>; Andrew Cooper <andrew.cooper3@citrix.com>; Roger Pau
> Monné <roger.pau@citrix.com>; Wei Liu <wl@xen.org>; George Dunlap
> <george.dunlap@citrix.com>; Julien Grall <julien@xen.org>; Stefano
> Stabellini <sstabellini@kernel.org>; xen-devel@lists.xenproject.org
> Subject: Re: [PATCH v3 2/6] xen/x86: move generically usable NUMA code
> from x86 to common
> 
> On 22.08.2022 04:58, Wei Chen wrote:
> > --- /dev/null
> > +++ b/xen/common/numa.c
> > @@ -0,0 +1,440 @@
> > +/*
> > + * Generic VM initialization for NUMA setups.
> > + * Copyright 2002,2003 Andi Kleen, SuSE Labs.
> > + * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
> > + */
> > +
> > +#include <xen/init.h>
> > +#include <xen/keyhandler.h>
> > +#include <xen/mm.h>
> > +#include <xen/nodemask.h>
> > +#include <xen/numa.h>
> > +#include <xen/param.h>
> > +#include <xen/sched.h>
> > +#include <xen/softirq.h>
> > +
> > +struct node_data __ro_after_init node_data[MAX_NUMNODES];
> > +
> > +/* Mapping from pdx to node id */
> > +unsigned int __ro_after_init memnode_shift;
> > +unsigned long __ro_after_init memnodemapsize;
> > +uint8_t *__ro_after_init memnodemap;
> > +static uint8_t __ro_after_init _memnodemap[64];
> > +
> > +nodeid_t __ro_after_init cpu_to_node[NR_CPUS] = {
> 
> I don't think this can be __ro_after_init, or you'll break CPU
> hotplug.
> 

Yes, this will cause problem in cpu_add. I will __read_mostly for it
in next version.

> > +    [0 ... NR_CPUS-1] = NUMA_NO_NODE
> > +};
> > +
> > +cpumask_t __ro_after_init node_to_cpumask[MAX_NUMNODES];
> 
> Same here.
> 

Ok.

> > +nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
> > +
> > +bool __read_mostly numa_off;
> 
> This, otoh, can be, or have I missed a place where it's written by a
> non-__init function?
> 

I think yes, it will be used in numa_disabled and numa_disabled will
be called in cpu_add.

> > +bool numa_disabled(void)
> > +{
> > +    return numa_off || arch_numa_disabled(false);
> > +}
> > +
> > +/*
> > + * Given a shift value, try to populate memnodemap[]
> > + * Returns :
> > + * 1 if OK
> > + * 0 if memnodmap[] too small (of shift too small)
> > + * -1 if node overlap or lost ram (shift too big)
> > + */
> > +static int __init populate_memnodemap(const struct node *nodes,
> > +                                      nodeid_t numnodes, unsigned int
> shift,
> 
> I don't think you can use nodeid_t for a variable holding a node count.
> Think of what would happen if there were 256 nodes, the IDs of which
> all fit in nodeid_t. (Same again further down.)
> 

If we use u8 as nodeid_t, why there will be 256 nodes to here?
And the MAX_NUMNODES has been limited to 64 (using NODES_SHIFT or
CONFIG_NR_NUMA_NODES). If we allow 256 nodes, we have to update MAX_NUMNODES
and nodeid_t first I think?

> > +                                      nodeid_t *nodeids)
> > +{
> > +    unsigned long spdx, epdx;
> > +    nodeid_t i;
> 
> This is likely inefficient for a loop counter variable. Note how you
> use "unsigned int" in e.g. extract_lsb_from_nodes().
> 

Did you mean u8 for "i" will cause something like unalignment, and will
cause loop inefficient. If yes, I will use unsigned int for "i" in next
version.

> > +unsigned int __init compute_hash_shift(const struct node *nodes,
> > +                                       nodeid_t numnodes, nodeid_t
> *nodeids)
> > +{
> > +    unsigned int shift;
> > +
> > +    shift = extract_lsb_from_nodes(nodes, numnodes);
> > +    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
> > +        memnodemap = _memnodemap;
> > +    else if ( allocate_cachealigned_memnodemap() )
> > +        return -1;
> 
> With this the function can't very well have "unsigned int" return type.
> 

Oh, yes, I had only thought a negative shift will be strange and hadn't
noticed this return value. I will restore the return value.

> > +void __init numa_init_array(void)
> > +{
> > +    int rr, i;
> 
> "unsigned int" for i and perhaps nodeid_t for rr?
> 

Yes, I will do it.

> > +static int __init numa_emulation(unsigned long start_pfn,
> > +                                 unsigned long end_pfn)
> > +{
> > +    unsigned int i;
> > +    struct node nodes[MAX_NUMNODES];
> > +    uint64_t sz = pfn_to_paddr(end_pfn - start_pfn) / numa_fake;
> > +
> > +    /* Kludge needed for the hash function */
> > +    if ( hweight64(sz) > 1 )
> > +    {
> > +        u64 x = 1;
> 
> uint64_t and a blank line between declaration(s) and statement(s)
> please.
> 

OK.

> > +        while ( (x << 1) < sz )
> > +            x <<= 1;
> > +        if ( x < sz / 2 )
> > +            printk(KERN_ERR "Numa emulation unbalanced. Complain to
> maintainer\n");
> > +        sz = x;
> > +    }
> > +
> > +    memset(&nodes, 0, sizeof(nodes));
> > +    for ( i = 0; i < numa_fake; i++ )
> > +    {
> > +        nodes[i].start = pfn_to_paddr(start_pfn) + i * sz;
> > +        if ( i == numa_fake - 1 )
> > +            sz = pfn_to_paddr(end_pfn) - nodes[i].start;
> > +        nodes[i].end = nodes[i].start + sz;
> > +        printk(KERN_INFO "Faking node %u at %"PRIx64"-%"PRIx64"
> (%"PRIu64"MB)\n",
> > +               i, nodes[i].start, nodes[i].end,
> > +               (nodes[i].end - nodes[i].start) >> 20);
> > +        node_set_online(i);
> > +    }
> > +    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
> > +    if ( memnode_shift < 0 )
> 
> Does the compiler not warn here, comparing an unsigned value for being
> negative?

It's strange, I haven't seen warnings for this kind of comparison.

> 
> > --- a/xen/include/xen/numa.h
> > +++ b/xen/include/xen/numa.h
> > @@ -18,4 +18,70 @@
> >    (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \
> >     ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE)
> >
> > +/* The following content can be used when NUMA feature is enabled */
> > +#ifdef CONFIG_NUMA
> > +
> > +extern nodeid_t      cpu_to_node[NR_CPUS];
> > +extern cpumask_t     node_to_cpumask[];
> > +
> > +#define cpu_to_node(cpu)        (cpu_to_node[cpu])
> > +#define parent_node(node)       (node)
> > +#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node]))
> > +#define node_to_cpumask(node)   (node_to_cpumask[node])
> 
> Please could you take the opportunity and drop unnecessary parentheses
> from here? Afaict only parent_node() need them to be kept.
>

OK.

> > +struct node {
> > +    paddr_t start, end;
> > +};
> > +
> > +extern unsigned int compute_hash_shift(const struct node *nodes,
> > +                                       nodeid_t numnodes, nodeid_t
> *nodeids);
> > +
> > +#define VIRTUAL_BUG_ON(x)
> > +
> > +extern bool numa_off;
> > +extern void numa_add_cpu(unsigned int cpu);
> 
> Please can you have variable and function declarations visually separated,
> by adding a blank line between them?
>

Sure. I will do it.
 
> > +extern void numa_init_array(void);
> > +extern void numa_set_node(unsigned int cpu, nodeid_t node);
> > +extern void numa_initmem_init(unsigned long start_pfn, unsigned long
> end_pfn);
> > +extern int  numa_scan_nodes(paddr_t start, paddr_t end);
> > +
> > +extern int arch_numa_setup(const char *opt);
> > +extern bool arch_numa_disabled(bool init_as_disable);
> > +extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t
> end);
> > +
> > +static inline void clear_node_cpumask(unsigned int cpu)
> > +{
> > +    cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
> > +}
> > +
> > +/* Simple perfect hash to map pdx to node numbers */
> > +extern unsigned int memnode_shift;
> > +extern unsigned long memnodemapsize;
> > +extern uint8_t *memnodemap;
> > +
> > +struct node_data {
> > +    unsigned long node_start_pfn;
> > +    unsigned long node_spanned_pages;
> > +};
> > +
> > +extern struct node_data node_data[];
> > +
> > +static inline __attribute_pure__ nodeid_t phys_to_nid(paddr_t addr)
> 
> Nit: The conventional place for attributes is between return type
> and function (or object) name.
>

Ok.
 
> > +{
> > +    nodeid_t nid;
> > +    VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >=
> memnodemapsize);
> > +    nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
> > +    VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
> > +    return nid;
> > +}
> > +
> > +#define NODE_DATA(nid)          (&(node_data[nid]))
> 
> Again please take the opportunity and drop the unnecessary inner
> parentheses.
> 

Ok.

> > +#define node_start_pfn(nid)     (NODE_DATA(nid)->node_start_pfn)
> > +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
> > +#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
> > +                                NODE_DATA(nid)->node_spanned_pages)
> 
> Pleae correct indentation here - it was correct originally (except
> for the fact that it was using hard tabs).
>

Ok.

Cheers,
Wei Chen
 
> Jan
Jan Beulich Sept. 6, 2022, 7:07 a.m. UTC | #3
On 29.08.2022 11:49, Wei Chen wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: 2022年8月25日 18:58
>>
>> On 22.08.2022 04:58, Wei Chen wrote:
>>> +nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
>>> +
>>> +bool __read_mostly numa_off;
>>
>> This, otoh, can be, or have I missed a place where it's written by a
>> non-__init function?
>>
> 
> I think yes, it will be used in numa_disabled and numa_disabled will
> be called in cpu_add.

In the original code I cannot spot such a path - can you please point
out how exactly you see numa_disabled() reachable from cpu_add()? I'm
clearly overlooking something ...

>>> +bool numa_disabled(void)
>>> +{
>>> +    return numa_off || arch_numa_disabled(false);
>>> +}
>>> +
>>> +/*
>>> + * Given a shift value, try to populate memnodemap[]
>>> + * Returns :
>>> + * 1 if OK
>>> + * 0 if memnodmap[] too small (of shift too small)
>>> + * -1 if node overlap or lost ram (shift too big)
>>> + */
>>> +static int __init populate_memnodemap(const struct node *nodes,
>>> +                                      nodeid_t numnodes, unsigned int
>> shift,
>>
>> I don't think you can use nodeid_t for a variable holding a node count.
>> Think of what would happen if there were 256 nodes, the IDs of which
>> all fit in nodeid_t. (Same again further down.)
>>
> 
> If we use u8 as nodeid_t, why there will be 256 nodes to here?
> And the MAX_NUMNODES has been limited to 64 (using NODES_SHIFT or
> CONFIG_NR_NUMA_NODES). If we allow 256 nodes, we have to update MAX_NUMNODES
> and nodeid_t first I think?

Well, when writing the reply I did forget about MAX_NUMNODES, so yes,
with that the value can't be larger than 255. Nevertheless I don't
think a count-of-nodes value should be expressed with nodeid_t. It
should be simply "unsigned int".

>>> +                                      nodeid_t *nodeids)
>>> +{
>>> +    unsigned long spdx, epdx;
>>> +    nodeid_t i;
>>
>> This is likely inefficient for a loop counter variable. Note how you
>> use "unsigned int" in e.g. extract_lsb_from_nodes().
>>
> 
> Did you mean u8 for "i" will cause something like unalignment, and will
> cause loop inefficient. If yes, I will use unsigned int for "i" in next
> version.

There's no issue with mis-alignment afaics, but there still is the
inefficiency issue requiring the loop variable to be zero-extended
before being usable as an array. Both x86-64 and aarch64 have the
zero-extension as a side effect when moving 32-bit quantities (from
memory or between registers), and arm wouldn't require any zero-
extension at all then.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/include/asm/acpi.h b/xen/arch/x86/include/asm/acpi.h
index 9a9cc4c240..5c2dd5da2d 100644
--- a/xen/arch/x86/include/asm/acpi.h
+++ b/xen/arch/x86/include/asm/acpi.h
@@ -102,7 +102,6 @@  extern unsigned long acpi_wakeup_address;
 #define ARCH_HAS_POWER_INIT	1
 
 extern s8 acpi_numa;
-extern int acpi_scan_nodes(u64 start, u64 end);
 #define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
 
 extern struct acpi_sleep_info acpi_sinfo;
diff --git a/xen/arch/x86/include/asm/numa.h b/xen/arch/x86/include/asm/numa.h
index f41fe20f9b..6c87942d43 100644
--- a/xen/arch/x86/include/asm/numa.h
+++ b/xen/arch/x86/include/asm/numa.h
@@ -9,72 +9,17 @@  typedef u8 nodeid_t;
 
 extern int srat_rev;
 
-extern nodeid_t      cpu_to_node[NR_CPUS];
-extern cpumask_t     node_to_cpumask[];
-
-#define cpu_to_node(cpu)		(cpu_to_node[cpu])
-#define parent_node(node)		(node)
-#define node_to_first_cpu(node)  (__ffs(node_to_cpumask[node]))
-#define node_to_cpumask(node)    (node_to_cpumask[node])
-
-struct node { 
-	paddr_t start, end;
-};
-
-extern int compute_hash_shift(struct node *nodes, int numnodes,
-			      nodeid_t *nodeids);
 extern nodeid_t pxm_to_node(unsigned int pxm);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
-#define VIRTUAL_BUG_ON(x) 
 
-extern void numa_add_cpu(int cpu);
-extern void numa_init_array(void);
-extern bool numa_off;
-
-extern int arch_numa_setup(const char *opt);
-extern bool arch_numa_disabled(bool init_as_disable);
-extern bool srat_disabled(void);
-extern void numa_set_node(int cpu, nodeid_t node);
+extern bool numa_disabled(void);
 extern nodeid_t setup_node(unsigned int pxm);
 extern void srat_detect_node(int cpu);
 
-extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
 extern nodeid_t apicid_to_node[];
 extern void init_cpu_to_node(void);
 
-static inline void clear_node_cpumask(int cpu)
-{
-	cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
-}
-
-/* Simple perfect hash to map pdx to node numbers */
-extern int memnode_shift; 
-extern unsigned long memnodemapsize;
-extern u8 *memnodemap;
-
-struct node_data {
-    unsigned long node_start_pfn;
-    unsigned long node_spanned_pages;
-};
-
-extern struct node_data node_data[];
-
-static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr)
-{ 
-	nodeid_t nid;
-	VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
-	nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; 
-	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
-	return nid; 
-} 
-
-#define NODE_DATA(nid)		(&(node_data[nid]))
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
-#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
-				 NODE_DATA(nid)->node_spanned_pages)
 #define arch_want_default_dmazone() (num_online_nodes() > 1)
 
 extern int valid_numa_range(paddr_t start, paddr_t end, nodeid_t node);
diff --git a/xen/arch/x86/include/asm/setup.h b/xen/arch/x86/include/asm/setup.h
index 21037b7f31..ae470ea12f 100644
--- a/xen/arch/x86/include/asm/setup.h
+++ b/xen/arch/x86/include/asm/setup.h
@@ -20,7 +20,6 @@  void early_time_init(void);
 
 void set_nr_cpu_ids(unsigned int max_cpus);
 
-void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
 void arch_init_memory(void);
 void subarch_init_memory(void);
 
diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
index 3d3081c967..9a9090e99a 100644
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -4,20 +4,11 @@ 
  * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
  */ 
 
-#include <xen/mm.h>
-#include <xen/string.h>
 #include <xen/init.h>
-#include <xen/ctype.h>
+#include <xen/mm.h>
 #include <xen/nodemask.h>
 #include <xen/numa.h>
-#include <xen/keyhandler.h>
-#include <xen/param.h>
-#include <xen/time.h>
-#include <xen/smp.h>
-#include <xen/pfn.h>
 #include <asm/acpi.h>
-#include <xen/sched.h>
-#include <xen/softirq.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
@@ -26,28 +17,13 @@ 
 /* from proto.h */
 #define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
 
-struct node_data node_data[MAX_NUMNODES];
-
-/* Mapping from pdx to node id */
-int memnode_shift;
-static typeof(*memnodemap) _memnodemap[64];
-unsigned long memnodemapsize;
-u8 *memnodemap;
-
-nodeid_t cpu_to_node[NR_CPUS] __read_mostly = {
-    [0 ... NR_CPUS-1] = NUMA_NO_NODE
-};
 /*
  * Keep BIOS's CPU2node information, should not be used for memory allocaion
  */
 nodeid_t apicid_to_node[MAX_LOCAL_APIC] = {
     [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
-cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
-nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
-
-bool numa_off;
 s8 acpi_numa = 0;
 
 int __init arch_numa_setup(const char *opt)
@@ -72,267 +48,6 @@  bool arch_numa_disabled(bool init_as_disable)
     return acpi_numa <= 0;
 }
 
-bool srat_disabled(void)
-{
-    return numa_off || arch_numa_disabled(false);
-}
-
-/*
- * Given a shift value, try to populate memnodemap[]
- * Returns :
- * 1 if OK
- * 0 if memnodmap[] too small (of shift too small)
- * -1 if node overlap or lost ram (shift too big)
- */
-static int __init populate_memnodemap(const struct node *nodes,
-                                      int numnodes, int shift, nodeid_t *nodeids)
-{
-    unsigned long spdx, epdx;
-    int i, res = -1;
-
-    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
-    for ( i = 0; i < numnodes; i++ )
-    {
-        spdx = paddr_to_pdx(nodes[i].start);
-        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
-        if ( spdx >= epdx )
-            continue;
-        if ( (epdx >> shift) >= memnodemapsize )
-            return 0;
-        do {
-            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
-                return -1;
-
-            if ( !nodeids )
-                memnodemap[spdx >> shift] = i;
-            else
-                memnodemap[spdx >> shift] = nodeids[i];
-
-            spdx += (1UL << shift);
-        } while ( spdx < epdx );
-        res = 1;
-    }
-
-    return res;
-}
-
-static int __init allocate_cachealigned_memnodemap(void)
-{
-    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
-    unsigned long mfn = mfn_x(alloc_boot_pages(size, 1));
-
-    memnodemap = mfn_to_virt(mfn);
-    mfn <<= PAGE_SHIFT;
-    size <<= PAGE_SHIFT;
-    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
-           mfn, mfn + size);
-    memnodemapsize = size / sizeof(*memnodemap);
-
-    return 0;
-}
-
-/*
- * The LSB of all start and end addresses in the node map is the value of the
- * maximum possible shift.
- */
-static int __init extract_lsb_from_nodes(const struct node *nodes,
-                                         int numnodes)
-{
-    int i, nodes_used = 0;
-    unsigned long spdx, epdx;
-    unsigned long bitfield = 0, memtop = 0;
-
-    for ( i = 0; i < numnodes; i++ )
-    {
-        spdx = paddr_to_pdx(nodes[i].start);
-        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
-        if ( spdx >= epdx )
-            continue;
-        bitfield |= spdx;
-        nodes_used++;
-        if ( epdx > memtop )
-            memtop = epdx;
-    }
-    if ( nodes_used <= 1 )
-        i = BITS_PER_LONG - 1;
-    else
-        i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
-    memnodemapsize = (memtop >> i) + 1;
-    return i;
-}
-
-int __init compute_hash_shift(struct node *nodes, int numnodes,
-                              nodeid_t *nodeids)
-{
-    int shift;
-
-    shift = extract_lsb_from_nodes(nodes, numnodes);
-    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
-        memnodemap = _memnodemap;
-    else if ( allocate_cachealigned_memnodemap() )
-        return -1;
-    printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);
-
-    if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
-    {
-        printk(KERN_INFO "Your memory is not aligned you need to "
-               "rebuild your hypervisor with a bigger NODEMAPSIZE "
-               "shift=%d\n", shift);
-        return -1;
-    }
-
-    return shift;
-}
-/* initialize NODE_DATA given nodeid and start/end */
-void __init setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end)
-{
-    unsigned long start_pfn = paddr_to_pfn(start);
-    unsigned long end_pfn = paddr_to_pfn(end);
-
-    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
-    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
-
-    node_set_online(nodeid);
-} 
-
-void __init numa_init_array(void)
-{
-    int rr, i;
-
-    /* There are unfortunately some poorly designed mainboards around
-       that only connect memory to a single CPU. This breaks the 1:1 cpu->node
-       mapping. To avoid this fill in the mapping for all possible
-       CPUs, as the number of CPUs is not known yet.
-       We round robin the existing nodes. */
-    rr = first_node(node_online_map);
-    for ( i = 0; i < nr_cpu_ids; i++ )
-    {
-        if ( cpu_to_node[i] != NUMA_NO_NODE )
-            continue;
-        numa_set_node(i, rr);
-        rr = cycle_node(rr, node_online_map);
-    }
-}
-
-#ifdef CONFIG_NUMA_EMU
-static int numa_fake __initdata = 0;
-
-/* Numa emulation */
-static int __init numa_emulation(unsigned long start_pfn,
-                                 unsigned long end_pfn)
-{
-    int i;
-    struct node nodes[MAX_NUMNODES];
-    uint64_t sz = pfn_to_paddr(end_pfn - start_pfn) / numa_fake;
-
-    /* Kludge needed for the hash function */
-    if ( hweight64(sz) > 1 )
-    {
-        u64 x = 1;
-        while ( (x << 1) < sz )
-            x <<= 1;
-        if ( x < sz/2 )
-            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
-        sz = x;
-    }
-
-    memset(&nodes,0,sizeof(nodes));
-    for ( i = 0; i < numa_fake; i++ )
-    {
-        nodes[i].start = pfn_to_paddr(start_pfn) + i * sz;
-        if ( i == numa_fake - 1 )
-            sz = pfn_to_paddr(end_pfn) - nodes[i].start;
-        nodes[i].end = nodes[i].start + sz;
-        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
-               i,
-               nodes[i].start, nodes[i].end,
-               (nodes[i].end - nodes[i].start) >> 20);
-        node_set_online(i);
-    }
-    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
-    if ( memnode_shift < 0 )
-    {
-        memnode_shift = 0;
-        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
-        return -1;
-    }
-    for_each_online_node ( i )
-        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-    numa_init_array();
-
-    return 0;
-}
-#endif
-
-void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{ 
-    int i;
-    paddr_t start = pfn_to_paddr(start_pfn);
-    paddr_t end = pfn_to_paddr(end_pfn);
-
-#ifdef CONFIG_NUMA_EMU
-    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
-        return;
-#endif
-
-#ifdef CONFIG_ACPI_NUMA
-    if ( !numa_off && !acpi_scan_nodes(start, end) )
-        return;
-#endif
-
-    printk(KERN_INFO "%s\n",
-           numa_off ? "NUMA turned off" : "No NUMA configuration found");
-
-    printk(KERN_INFO "Faking a node at %"PRIpaddr"-%"PRIpaddr"\n",
-           start, end);
-    /* setup dummy node covering all memory */
-    memnode_shift = BITS_PER_LONG - 1;
-    memnodemap = _memnodemap;
-    /* Dummy node only uses 1 slot in reality */
-    memnodemap[0] = 0;
-    memnodemapsize = 1;
-
-    nodes_clear(node_online_map);
-    node_set_online(0);
-    for ( i = 0; i < nr_cpu_ids; i++ )
-        numa_set_node(i, 0);
-    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
-    setup_node_bootmem(0, start, end);
-}
-
-void numa_add_cpu(int cpu)
-{
-    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
-} 
-
-void numa_set_node(int cpu, nodeid_t node)
-{
-    cpu_to_node[cpu] = node;
-}
-
-/* [numa=off] */
-static int __init cf_check numa_setup(const char *opt)
-{
-    if ( !strncmp(opt, "off", 3) )
-        numa_off = true;
-    else if ( !strncmp(opt, "on", 2) )
-        numa_off = false;
-#ifdef CONFIG_NUMA_EMU
-    else if ( !strncmp(opt, "fake=", 5) )
-    {
-        numa_off = false;
-        numa_fake = simple_strtoul(opt + 5, NULL, 0);
-        if ( numa_fake >= MAX_NUMNODES )
-            numa_fake = MAX_NUMNODES;
-    }
-#endif
-    else
-        return arch_numa_setup(opt);
-
-    return 0;
-} 
-custom_param("numa", numa_setup);
-
 /*
  * Setup early cpu_to_node.
  *
@@ -381,146 +96,3 @@  unsigned int __init arch_get_dma_bitsize(void)
                  flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1)
                  + PAGE_SHIFT, 32);
 }
-
-static void cf_check dump_numa(unsigned char key)
-{
-    s_time_t now = NOW();
-    unsigned int i, j, n;
-    struct domain *d;
-    struct page_info *page;
-    unsigned int page_num_node[MAX_NUMNODES];
-    const struct vnuma_info *vnuma;
-
-    printk("'%c' pressed -> dumping numa info (now = %"PRI_stime")\n", key,
-           now);
-
-    for_each_online_node ( i )
-    {
-        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);
-
-        printk("NODE%u start->%lu size->%lu free->%lu\n",
-               i, node_start_pfn(i), node_spanned_pages(i),
-               avail_node_heap_pages(i));
-        /* sanity check phys_to_nid() */
-        if ( phys_to_nid(pa) != i )
-            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
-                   pa, phys_to_nid(pa), i);
-    }
-
-    j = cpumask_first(&cpu_online_map);
-    n = 0;
-    for_each_online_cpu ( i )
-    {
-        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
-        {
-            if ( n > 1 )
-                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
-            else
-                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
-            j = i;
-            n = 1;
-        }
-        else
-            ++n;
-    }
-    if ( n > 1 )
-        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
-    else
-        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
-
-    rcu_read_lock(&domlist_read_lock);
-
-    printk("Memory location of each domain:\n");
-    for_each_domain ( d )
-    {
-        process_pending_softirqs();
-
-        printk("Domain %u (total: %u):\n", d->domain_id, domain_tot_pages(d));
-
-        for_each_online_node ( i )
-            page_num_node[i] = 0;
-
-        spin_lock(&d->page_alloc_lock);
-        page_list_for_each(page, &d->page_list)
-        {
-            i = phys_to_nid(page_to_maddr(page));
-            page_num_node[i]++;
-        }
-        spin_unlock(&d->page_alloc_lock);
-
-        for_each_online_node ( i )
-            printk("    Node %u: %u\n", i, page_num_node[i]);
-
-        if ( !read_trylock(&d->vnuma_rwlock) )
-            continue;
-
-        if ( !d->vnuma )
-        {
-            read_unlock(&d->vnuma_rwlock);
-            continue;
-        }
-
-        vnuma = d->vnuma;
-        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
-               vnuma->nr_vnodes, d->max_vcpus);
-        for ( i = 0; i < vnuma->nr_vnodes; i++ )
-        {
-            unsigned int start_cpu = ~0U;
-
-            if ( vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
-                printk("       %3u: pnode ???,", i);
-            else
-                printk("       %3u: pnode %3u,", i, vnuma->vnode_to_pnode[i]);
-
-            printk(" vcpus ");
-
-            for ( j = 0; j < d->max_vcpus; j++ )
-            {
-                if ( !(j & 0x3f) )
-                    process_pending_softirqs();
-
-                if ( vnuma->vcpu_to_vnode[j] == i )
-                {
-                    if ( start_cpu == ~0U )
-                    {
-                        printk("%d", j);
-                        start_cpu = j;
-                    }
-                }
-                else if ( start_cpu != ~0U )
-                {
-                    if ( j - 1 != start_cpu )
-                        printk("-%d ", j - 1);
-                    else
-                        printk(" ");
-                    start_cpu = ~0U;
-                }
-            }
-
-            if ( start_cpu != ~0U  && start_cpu != j - 1 )
-                printk("-%d", j - 1);
-
-            printk("\n");
-
-            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
-            {
-                if ( vnuma->vmemrange[j].nid == i )
-                    printk("           %016"PRIx64" - %016"PRIx64"\n",
-                           vnuma->vmemrange[j].start,
-                           vnuma->vmemrange[j].end);
-            }
-        }
-
-        read_unlock(&d->vnuma_rwlock);
-    }
-
-    rcu_read_unlock(&domlist_read_lock);
-}
-
-static int __init cf_check register_numa_trigger(void)
-{
-    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
-    return 0;
-}
-__initcall(register_numa_trigger);
-
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index b46fd9ab18..9df08e9366 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -1350,7 +1350,7 @@  int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm)
 
     x86_acpiid_to_apicid[acpi_id] = apic_id;
 
-    if ( !srat_disabled() )
+    if ( !numa_disabled() )
     {
         nodeid_t node = setup_node(pxm);
 
diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index b62a152911..4c7f0c547e 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -238,7 +238,7 @@  acpi_numa_x2apic_affinity_init(const struct acpi_srat_x2apic_cpu_affinity *pa)
 	unsigned pxm;
 	nodeid_t node;
 
-	if (srat_disabled())
+	if (numa_disabled())
 		return;
 	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
 		bad_srat();
@@ -274,7 +274,7 @@  acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
 	unsigned pxm;
 	nodeid_t node;
 
-	if (srat_disabled())
+	if (numa_disabled())
 		return;
 	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
 		bad_srat();
@@ -313,7 +313,7 @@  acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
 	nodeid_t node;
 	unsigned int i;
 
-	if (srat_disabled())
+	if (numa_disabled())
 		return;
 	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
 		bad_srat();
@@ -520,7 +520,7 @@  void __init srat_parse_regions(paddr_t addr)
 }
 
 /* Use the information discovered above to actually set up the nodes. */
-int __init acpi_scan_nodes(paddr_t start, paddr_t end)
+int __init numa_scan_nodes(paddr_t start, paddr_t end)
 {
 	int i;
 	nodemask_t all_nodes_parsed;
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 3baf83d527..9a3a12b12d 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -26,6 +26,7 @@  obj-$(CONFIG_MEM_ACCESS) += mem_access.o
 obj-y += memory.o
 obj-y += multicall.o
 obj-y += notifier.o
+obj-$(CONFIG_NUMA) += numa.o
 obj-y += page_alloc.o
 obj-$(CONFIG_HAS_PDX) += pdx.o
 obj-$(CONFIG_PERF_COUNTERS) += perfc.o
diff --git a/xen/common/numa.c b/xen/common/numa.c
new file mode 100644
index 0000000000..0bc5113753
--- /dev/null
+++ b/xen/common/numa.c
@@ -0,0 +1,440 @@ 
+/*
+ * Generic VM initialization for NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
+ */
+
+#include <xen/init.h>
+#include <xen/keyhandler.h>
+#include <xen/mm.h>
+#include <xen/nodemask.h>
+#include <xen/numa.h>
+#include <xen/param.h>
+#include <xen/sched.h>
+#include <xen/softirq.h>
+
+struct node_data __ro_after_init node_data[MAX_NUMNODES];
+
+/* Mapping from pdx to node id */
+unsigned int __ro_after_init memnode_shift;
+unsigned long __ro_after_init memnodemapsize;
+uint8_t *__ro_after_init memnodemap;
+static uint8_t __ro_after_init _memnodemap[64];
+
+nodeid_t __ro_after_init cpu_to_node[NR_CPUS] = {
+    [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+
+cpumask_t __ro_after_init node_to_cpumask[MAX_NUMNODES];
+
+nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
+
+bool __read_mostly numa_off;
+
+bool numa_disabled(void)
+{
+    return numa_off || arch_numa_disabled(false);
+}
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 1 if OK
+ * 0 if memnodmap[] too small (of shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init populate_memnodemap(const struct node *nodes,
+                                      nodeid_t numnodes, unsigned int shift,
+                                      nodeid_t *nodeids)
+{
+    unsigned long spdx, epdx;
+    nodeid_t i;
+    int res = -1;
+
+    memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
+    for ( i = 0; i < numnodes; i++ )
+    {
+        spdx = paddr_to_pdx(nodes[i].start);
+        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
+        if ( spdx >= epdx )
+            continue;
+        if ( (epdx >> shift) >= memnodemapsize )
+            return 0;
+        do {
+            if ( memnodemap[spdx >> shift] != NUMA_NO_NODE )
+                return -1;
+
+            if ( !nodeids )
+                memnodemap[spdx >> shift] = i;
+            else
+                memnodemap[spdx >> shift] = nodeids[i];
+
+            spdx += (1UL << shift);
+        } while ( spdx < epdx );
+        res = 1;
+    }
+
+    return res;
+}
+
+static int __init allocate_cachealigned_memnodemap(void)
+{
+    unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
+    unsigned long mfn = mfn_x(alloc_boot_pages(size, 1));
+
+    memnodemap = mfn_to_virt(mfn);
+    mfn <<= PAGE_SHIFT;
+    size <<= PAGE_SHIFT;
+    printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+           mfn, mfn + size);
+    memnodemapsize = size / sizeof(*memnodemap);
+
+    return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static unsigned int __init extract_lsb_from_nodes(const struct node *nodes,
+                                                  nodeid_t numnodes)
+{
+    unsigned int i;
+    nodeid_t nodes_used = 0;
+    unsigned long spdx, epdx;
+    unsigned long bitfield = 0, memtop = 0;
+
+    for ( i = 0; i < numnodes; i++ )
+    {
+        spdx = paddr_to_pdx(nodes[i].start);
+        epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
+        if ( spdx >= epdx )
+            continue;
+        bitfield |= spdx;
+        nodes_used++;
+        if ( epdx > memtop )
+            memtop = epdx;
+    }
+    if ( nodes_used <= 1 )
+        i = BITS_PER_LONG - 1;
+    else
+        i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+    memnodemapsize = (memtop >> i) + 1;
+    return i;
+}
+
+unsigned int __init compute_hash_shift(const struct node *nodes,
+                                       nodeid_t numnodes, nodeid_t *nodeids)
+{
+    unsigned int shift;
+
+    shift = extract_lsb_from_nodes(nodes, numnodes);
+    if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) )
+        memnodemap = _memnodemap;
+    else if ( allocate_cachealigned_memnodemap() )
+        return -1;
+    printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);
+
+    if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 )
+    {
+        printk(KERN_INFO "Your memory is not aligned you need to "
+               "rebuild your hypervisor with a bigger NODEMAPSIZE "
+               "shift=%d\n", shift);
+        return -1;
+    }
+
+    return shift;
+}
+
+/* initialize NODE_DATA given nodeid and start/end */
+void __init setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end)
+{
+    unsigned long start_pfn = paddr_to_pfn(start);
+    unsigned long end_pfn = paddr_to_pfn(end);
+
+    NODE_DATA(nodeid)->node_start_pfn = start_pfn;
+    NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
+
+    node_set_online(nodeid);
+}
+
+void __init numa_init_array(void)
+{
+    int rr, i;
+
+    /*
+     * There are unfortunately some poorly designed mainboards around
+     * that only connect memory to a single CPU. This breaks the 1:1 cpu->node
+     * mapping. To avoid this fill in the mapping for all possible
+     * CPUs, as the number of CPUs is not known yet.
+     * We round robin the existing nodes.
+     */
+    rr = first_node(node_online_map);
+    for ( i = 0; i < nr_cpu_ids; i++ )
+    {
+        if ( cpu_to_node[i] != NUMA_NO_NODE )
+            continue;
+        numa_set_node(i, rr);
+        rr = cycle_node(rr, node_online_map);
+    }
+}
+
+#ifdef CONFIG_NUMA_EMU
+static unsigned int __initdata numa_fake;
+
+/* Numa emulation */
+static int __init numa_emulation(unsigned long start_pfn,
+                                 unsigned long end_pfn)
+{
+    unsigned int i;
+    struct node nodes[MAX_NUMNODES];
+    uint64_t sz = pfn_to_paddr(end_pfn - start_pfn) / numa_fake;
+
+    /* Kludge needed for the hash function */
+    if ( hweight64(sz) > 1 )
+    {
+        u64 x = 1;
+        while ( (x << 1) < sz )
+            x <<= 1;
+        if ( x < sz / 2 )
+            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
+        sz = x;
+    }
+
+    memset(&nodes, 0, sizeof(nodes));
+    for ( i = 0; i < numa_fake; i++ )
+    {
+        nodes[i].start = pfn_to_paddr(start_pfn) + i * sz;
+        if ( i == numa_fake - 1 )
+            sz = pfn_to_paddr(end_pfn) - nodes[i].start;
+        nodes[i].end = nodes[i].start + sz;
+        printk(KERN_INFO "Faking node %u at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
+               i, nodes[i].start, nodes[i].end,
+               (nodes[i].end - nodes[i].start) >> 20);
+        node_set_online(i);
+    }
+    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
+    if ( memnode_shift < 0 )
+    {
+        memnode_shift = 0;
+        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
+        return -1;
+    }
+    for_each_online_node ( i )
+        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+    numa_init_array();
+
+    return 0;
+}
+#endif
+
+void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+    unsigned int i;
+    paddr_t start = pfn_to_paddr(start_pfn);
+    paddr_t end = pfn_to_paddr(end_pfn);
+
+#ifdef CONFIG_NUMA_EMU
+    if ( numa_fake && !numa_emulation(start_pfn, end_pfn) )
+        return;
+#endif
+
+#ifdef CONFIG_NUMA
+    if ( !numa_off && !numa_scan_nodes(start, end) )
+        return;
+#endif
+
+    printk(KERN_INFO "%s\n",
+           numa_off ? "NUMA turned off" : "No NUMA configuration found");
+
+    printk(KERN_INFO "Faking a node at %"PRIpaddr"-%"PRIpaddr"\n",
+           start, end);
+    /* setup dummy node covering all memory */
+    memnode_shift = BITS_PER_LONG - 1;
+    memnodemap = _memnodemap;
+    /* Dummy node only uses 1 slot in reality */
+    memnodemap[0] = 0;
+    memnodemapsize = 1;
+
+    nodes_clear(node_online_map);
+    node_set_online(0);
+    for ( i = 0; i < nr_cpu_ids; i++ )
+        numa_set_node(i, 0);
+    cpumask_copy(&node_to_cpumask[0], cpumask_of(0));
+    setup_node_bootmem(0, start, end);
+}
+
+void numa_add_cpu(unsigned int cpu)
+{
+    cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+void numa_set_node(unsigned int cpu, nodeid_t node)
+{
+    cpu_to_node[cpu] = node;
+}
+
+/* [numa=off] */
+static int __init cf_check numa_setup(const char *opt)
+{
+    if ( !strncmp(opt, "off", 3) )
+        numa_off = true;
+    else if ( !strncmp(opt, "on", 2) )
+        numa_off = false;
+#ifdef CONFIG_NUMA_EMU
+    else if ( !strncmp(opt, "fake=", 5) )
+    {
+        numa_off = false;
+        numa_fake = simple_strtoul(opt + 5, NULL, 0);
+        if ( numa_fake >= MAX_NUMNODES )
+            numa_fake = MAX_NUMNODES;
+    }
+#endif
+    else
+        return arch_numa_setup(opt);
+
+    return 0;
+}
+custom_param("numa", numa_setup);
+
+static void cf_check dump_numa(unsigned char key)
+{
+    s_time_t now = NOW();
+    unsigned int i, j, n;
+    struct domain *d;
+    struct page_info *page;
+    unsigned int page_num_node[MAX_NUMNODES];
+    const struct vnuma_info *vnuma;
+
+    printk("'%c' pressed -> dumping numa info (now = %"PRI_stime")\n", key,
+           now);
+
+    for_each_online_node ( i )
+    {
+        paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1);
+
+        printk("NODE%u start->%lu size->%lu free->%lu\n",
+               i, node_start_pfn(i), node_spanned_pages(i),
+               avail_node_heap_pages(i));
+        /* sanity check phys_to_nid() */
+        if ( phys_to_nid(pa) != i )
+            printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n",
+                   pa, phys_to_nid(pa), i);
+    }
+
+    j = cpumask_first(&cpu_online_map);
+    n = 0;
+    for_each_online_cpu ( i )
+    {
+        if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] )
+        {
+            if ( n > 1 )
+                printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
+            else
+                printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
+            j = i;
+            n = 1;
+        }
+        else
+            ++n;
+    }
+    if ( n > 1 )
+        printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]);
+    else
+        printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]);
+
+    rcu_read_lock(&domlist_read_lock);
+
+    printk("Memory location of each domain:\n");
+    for_each_domain ( d )
+    {
+        process_pending_softirqs();
+
+        printk("Domain %u (total: %u):\n", d->domain_id, domain_tot_pages(d));
+
+        for_each_online_node ( i )
+            page_num_node[i] = 0;
+
+        spin_lock(&d->page_alloc_lock);
+        page_list_for_each(page, &d->page_list)
+        {
+            i = phys_to_nid(page_to_maddr(page));
+            page_num_node[i]++;
+        }
+        spin_unlock(&d->page_alloc_lock);
+
+        for_each_online_node ( i )
+            printk("    Node %u: %u\n", i, page_num_node[i]);
+
+        if ( !read_trylock(&d->vnuma_rwlock) )
+            continue;
+
+        if ( !d->vnuma )
+        {
+            read_unlock(&d->vnuma_rwlock);
+            continue;
+        }
+
+        vnuma = d->vnuma;
+        printk("     %u vnodes, %u vcpus, guest physical layout:\n",
+               vnuma->nr_vnodes, d->max_vcpus);
+        for ( i = 0; i < vnuma->nr_vnodes; i++ )
+        {
+            unsigned int start_cpu = ~0U;
+
+            if ( vnuma->vnode_to_pnode[i] == NUMA_NO_NODE )
+                printk("       %3u: pnode ???,", i);
+            else
+                printk("       %3u: pnode %3u,", i, vnuma->vnode_to_pnode[i]);
+
+            printk(" vcpus ");
+
+            for ( j = 0; j < d->max_vcpus; j++ )
+            {
+                if ( !(j & 0x3f) )
+                    process_pending_softirqs();
+
+                if ( vnuma->vcpu_to_vnode[j] == i )
+                {
+                    if ( start_cpu == ~0U )
+                    {
+                        printk("%d", j);
+                        start_cpu = j;
+                    }
+                }
+                else if ( start_cpu != ~0U )
+                {
+                    if ( j - 1 != start_cpu )
+                        printk("-%d ", j - 1);
+                    else
+                        printk(" ");
+                    start_cpu = ~0U;
+                }
+            }
+
+            if ( start_cpu != ~0U  && start_cpu != j - 1 )
+                printk("-%d", j - 1);
+
+            printk("\n");
+
+            for ( j = 0; j < vnuma->nr_vmemranges; j++ )
+            {
+                if ( vnuma->vmemrange[j].nid == i )
+                    printk("           %016"PRIx64" - %016"PRIx64"\n",
+                           vnuma->vmemrange[j].start,
+                           vnuma->vmemrange[j].end);
+            }
+        }
+
+        read_unlock(&d->vnuma_rwlock);
+    }
+
+    rcu_read_unlock(&domlist_read_lock);
+}
+
+static int __init cf_check register_numa_trigger(void)
+{
+    register_keyhandler('u', dump_numa, "dump NUMA info", 1);
+    return 0;
+}
+__initcall(register_numa_trigger);
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
index 7aef1a88dc..bae534bb04 100644
--- a/xen/include/xen/numa.h
+++ b/xen/include/xen/numa.h
@@ -18,4 +18,70 @@ 
   (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \
    ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE)
 
+/* The following content can be used when NUMA feature is enabled */
+#ifdef CONFIG_NUMA
+
+extern nodeid_t      cpu_to_node[NR_CPUS];
+extern cpumask_t     node_to_cpumask[];
+
+#define cpu_to_node(cpu)        (cpu_to_node[cpu])
+#define parent_node(node)       (node)
+#define node_to_first_cpu(node) (__ffs(node_to_cpumask[node]))
+#define node_to_cpumask(node)   (node_to_cpumask[node])
+
+struct node {
+    paddr_t start, end;
+};
+
+extern unsigned int compute_hash_shift(const struct node *nodes,
+                                       nodeid_t numnodes, nodeid_t *nodeids);
+
+#define VIRTUAL_BUG_ON(x)
+
+extern bool numa_off;
+extern void numa_add_cpu(unsigned int cpu);
+extern void numa_init_array(void);
+extern void numa_set_node(unsigned int cpu, nodeid_t node);
+extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int  numa_scan_nodes(paddr_t start, paddr_t end);
+
+extern int arch_numa_setup(const char *opt);
+extern bool arch_numa_disabled(bool init_as_disable);
+extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
+
+static inline void clear_node_cpumask(unsigned int cpu)
+{
+    cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
+}
+
+/* Simple perfect hash to map pdx to node numbers */
+extern unsigned int memnode_shift;
+extern unsigned long memnodemapsize;
+extern uint8_t *memnodemap;
+
+struct node_data {
+    unsigned long node_start_pfn;
+    unsigned long node_spanned_pages;
+};
+
+extern struct node_data node_data[];
+
+static inline __attribute_pure__ nodeid_t phys_to_nid(paddr_t addr)
+{
+    nodeid_t nid;
+    VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
+    nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
+    VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
+    return nid;
+}
+
+#define NODE_DATA(nid)          (&(node_data[nid]))
+
+#define node_start_pfn(nid)     (NODE_DATA(nid)->node_start_pfn)
+#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
+#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
+                                NODE_DATA(nid)->node_spanned_pages)
+
+#endif
+
 #endif /* _XEN_NUMA_H */