Message ID | 1486655834-9708-3-git-send-email-vijay.kilari@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
>>> On 09.02.17 at 16:56, <vijay.kilari@gmail.com> wrote: > From: Vijaya Kumar K <Vijaya.Kumar@cavium.com> > > Move common generic NUMA code to xen/common/numa.c from > xen/arch/x86/numa.c. Also move generic code in header file > xen/include/asm-x86/numa.h to xen/include/xen/numa.h > > This common code can be re-used later for ARM. > > Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com> I would have been nice if you Cc-ed the maintainers of the code you're moving. > --- /dev/null > +++ b/xen/common/numa.c > @@ -0,0 +1,342 @@ > +/* > + * Common NUMA handling functions for x86 and arm. > + * Original code extracted from arch/x86/numa.c > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; If not, see <http://www.gnu.org/licenses/>. > + */ > + > + > +#include <xen/mm.h> > +#include <xen/string.h> > +#include <xen/init.h> > +#include <xen/ctype.h> > +#include <xen/nodemask.h> > +#include <xen/numa.h> > +#include <xen/keyhandler.h> > +#include <xen/time.h> > +#include <xen/smp.h> > +#include <xen/pfn.h> > +#include <xen/sched.h> > +#include <xen/errno.h> > +#include <xen/softirq.h> > +#include <asm/setup.h> This last one would better not be included here. > +struct node_data node_data[MAX_NUMNODES]; > + > +/* Mapping from pdx to node id */ > +int memnode_shift; > +unsigned long memnodemapsize; > +u8 *memnodemap; > +typeof(*memnodemap) _memnodemap[64]; Careful with removing any "static" please. For the last one in particular you would need to change the name if it's really necessary to make non-static. Even better would be though to keep it static and provide suitable accessors. Also please sanitize types as you're moving stuff: memnode_shift looks like it really wants to be unsigned int, and u8 should really be uint8_t (as we're trying to phase out u8 & Co). This also applies to code further down. Jan
Hi, On 09/02/17 16:11, Jan Beulich wrote: >>>> On 09.02.17 at 16:56, <vijay.kilari@gmail.com> wrote: >> From: Vijaya Kumar K <Vijaya.Kumar@cavium.com> >> >> Move common generic NUMA code to xen/common/numa.c from >> xen/arch/x86/numa.c. Also move generic code in header file >> xen/include/asm-x86/numa.h to xen/include/xen/numa.h >> >> This common code can be re-used later for ARM. >> >> Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com> > > I would have been nice if you Cc-ed the maintainers of the code > you're moving. > >> --- /dev/null >> +++ b/xen/common/numa.c >> @@ -0,0 +1,342 @@ >> +/* >> + * Common NUMA handling functions for x86 and arm. >> + * Original code extracted from arch/x86/numa.c >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; either version 2 of the License, or >> + * (at your option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program; If not, see <http://www.gnu.org/licenses/>. >> + */ >> + >> + >> +#include <xen/mm.h> >> +#include <xen/string.h> >> +#include <xen/init.h> >> +#include <xen/ctype.h> >> +#include <xen/nodemask.h> >> +#include <xen/numa.h> >> +#include <xen/keyhandler.h> >> +#include <xen/time.h> >> +#include <xen/smp.h> >> +#include <xen/pfn.h> >> +#include <xen/sched.h> >> +#include <xen/errno.h> >> +#include <xen/softirq.h> >> +#include <asm/setup.h> > > This last one would better not be included here. > >> +struct node_data node_data[MAX_NUMNODES]; >> + >> +/* Mapping from pdx to node id */ >> +int memnode_shift; >> +unsigned long memnodemapsize; >> +u8 *memnodemap; >> +typeof(*memnodemap) _memnodemap[64]; > > Careful with removing any "static" please. For the last one in > particular you would need to change the name if it's really necessary > to make non-static. Even better would be though to keep it static > and provide suitable accessors. > > Also please sanitize types as you're moving stuff: memnode_shift > looks like it really wants to be unsigned int, and u8 should really > be uint8_t (as we're trying to phase out u8 & Co). This also applies > to code further down. I agree with the changes suggested. If possible I would prefer all those changes made in separate patches before the move. This would ease the review. Cheers,
Hello Vijay, On 09/02/17 15:56, vijay.kilari@gmail.com wrote: > From: Vijaya Kumar K <Vijaya.Kumar@cavium.com> > > Move common generic NUMA code to xen/common/numa.c from > xen/arch/x86/numa.c. Also move generic code in header file > xen/include/asm-x86/numa.h to xen/include/xen/numa.h > > This common code can be re-used later for ARM. > > Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com> > --- > xen/arch/x86/numa.c | 299 --------------------------------------- > xen/common/Makefile | 1 + > xen/common/numa.c | 342 +++++++++++++++++++++++++++++++++++++++++++++ > xen/include/asm-x86/numa.h | 47 ------- > xen/include/xen/numa.h | 54 +++++++ > 5 files changed, 397 insertions(+), 346 deletions(-) > > diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c > index 6f4d438..bc787e0 100644 > --- a/xen/arch/x86/numa.c > +++ b/xen/arch/x86/numa.c > @@ -25,27 +25,12 @@ custom_param("numa", numa_setup); > #define Dprintk(x...) > #endif > > -/* from proto.h */ > -#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1))) > - > -struct node_data node_data[MAX_NUMNODES]; > - > -/* Mapping from pdx to node id */ > -int memnode_shift; > -static typeof(*memnodemap) _memnodemap[64]; > -unsigned long memnodemapsize; > -u8 *memnodemap; > - > -nodeid_t cpu_to_node[NR_CPUS] __read_mostly = { > - [0 ... NR_CPUS-1] = NUMA_NO_NODE > -}; > /* > * Keep BIOS's CPU2node information, should not be used for memory allocaion > */ > nodeid_t apicid_to_node[MAX_LOCAL_APIC] = { > [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE > }; > -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; > > nodemask_t __read_mostly node_online_map = { { [0] = 1UL } }; Why this variable is not moved? [...] > void __init numa_init_array(void) Same question for this function. > { > int rr, i; > @@ -288,16 +145,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) > (u64)end_pfn << PAGE_SHIFT); > } > > -void numa_add_cpu(int cpu) > -{ > - cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); > -} > - > -void numa_set_node(int cpu, nodeid_t node) > -{ > - cpu_to_node[cpu] = node; > -} > - > /* [numa=off] */ > static __init int numa_setup(char *opt) Same question here. Everything in numa_setup and the fake NUMA looks valid for ARM too. [....] > diff --git a/xen/common/Makefile b/xen/common/Makefile > index 0fed30b..c1bd2ff 100644 > --- a/xen/common/Makefile > +++ b/xen/common/Makefile > @@ -63,6 +63,7 @@ obj-y += wait.o > obj-bin-y += warning.init.o > obj-$(CONFIG_XENOPROF) += xenoprof.o > obj-y += xmalloc_tlsf.o > +obj-y += numa.o This needs to be gated with CONFIG_NUMA. > > obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma unlzo unlz4 earlycpio,$(n).init.o) > > diff --git a/xen/common/numa.c b/xen/common/numa.c > new file mode 100644 > index 0000000..59dcb63 > --- /dev/null > +++ b/xen/common/numa.c > @@ -0,0 +1,342 @@ > +/* > + * Common NUMA handling functions for x86 and arm. > + * Original code extracted from arch/x86/numa.c > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. Xen is GPLv2 only. Please update to: "This program is free software; you can redistribute it and/or modify it under the terms and conditions of the GNU General Public License, version 2, as published by the Free Software Foundation." > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; If not, see <http://www.gnu.org/licenses/>. > + */ > + > + > +#include <xen/mm.h> > +#include <xen/string.h> > +#include <xen/init.h> > +#include <xen/ctype.h> > +#include <xen/nodemask.h> > +#include <xen/numa.h> > +#include <xen/keyhandler.h> > +#include <xen/time.h> > +#include <xen/smp.h> > +#include <xen/pfn.h> > +#include <xen/sched.h> > +#include <xen/errno.h> > +#include <xen/softirq.h> > +#include <asm/setup.h> > + > +struct node_data node_data[MAX_NUMNODES]; > + > +/* Mapping from pdx to node id */ Looking at this comment, it looks like the NUMA support should depend on HAS_PDX as this is not something that may be able on all the architecture. > +int memnode_shift; > +unsigned long memnodemapsize; > +u8 *memnodemap; > +typeof(*memnodemap) _memnodemap[64]; > + > +nodeid_t cpu_to_node[NR_CPUS] __read_mostly = { > + [0 ... NR_CPUS-1] = NUMA_NO_NODE > +}; > + > +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; [...] > +void numa_add_cpu(int cpu) > +{ > + cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); > +} > + > +void numa_set_node(int cpu, nodeid_t node) > +{ > + cpu_to_node[cpu] = node; > +} > + > +EXPORT_SYMBOL(node_to_cpumask); > +EXPORT_SYMBOL(memnode_shift); > +EXPORT_SYMBOL(memnodemap); > +EXPORT_SYMBOL(node_data); Those 4 lines are not part of the original code. Why did you add them? To ease review I would like to see this patch split multiple one: - multiple small to prepare the code (export function, change the type...) - a patch to move the code and only move it. No changes at all in it. [...] > diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h > index 2479238..61bcd8e 100644 > --- a/xen/include/asm-x86/numa.h > +++ b/xen/include/asm-x86/numa.h > @@ -17,64 +17,17 @@ extern cpumask_t node_to_cpumask[]; > #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) > #define node_to_cpumask(node) (node_to_cpumask[node]) > > -struct node { > - u64 start,end; > -}; > - > -extern int compute_hash_shift(struct node *nodes, int numnodes, > - nodeid_t *nodeids); > extern nodeid_t pxm_to_node(unsigned int pxm); > > #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) > -#define VIRTUAL_BUG_ON(x) > > -extern void numa_add_cpu(int cpu); > extern void numa_init_array(void); > extern bool_t numa_off; > > - Spurious change. > extern int srat_disabled(void); > -extern void numa_set_node(int cpu, nodeid_t node); > -extern nodeid_t setup_node(unsigned int pxm); > extern void srat_detect_node(int cpu); > > -extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end); > extern nodeid_t apicid_to_node[]; > -extern void init_cpu_to_node(void); > - > -static inline void clear_node_cpumask(int cpu) > -{ > - cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); > -} > - > -/* Simple perfect hash to map pdx to node numbers */ > -extern int memnode_shift; > -extern unsigned long memnodemapsize; > -extern u8 *memnodemap; > - > -struct node_data { > - unsigned long node_start_pfn; > - unsigned long node_spanned_pages; > -}; > - > -extern struct node_data node_data[]; > - > -static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr) > -{ > - nodeid_t nid; > - VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize); > - nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; > - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); > - return nid; > -} > - > -#define NODE_DATA(nid) (&(node_data[nid])) > - > -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) > -#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) > -#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ > - NODE_DATA(nid)->node_spanned_pages) > - > extern int valid_numa_range(u64 start, u64 end, nodeid_t node); > > void srat_parse_regions(u64 addr); > diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h > index 7aef1a8..dd33c92 100644 > --- a/xen/include/xen/numa.h > +++ b/xen/include/xen/numa.h > @@ -18,4 +18,58 @@ > (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \ > ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE) > > +struct node { > + u64 start,end; This contains hard tab. It looks like that asm-x86/numa.h add a mix hard tab/soft tab. Can you have a clean-up patch to drop them first? > +}; > + > +struct node_data { > + unsigned long node_start_pfn; > + unsigned long node_spanned_pages; > + nodeid_t node_id; > +}; > + > +#define NODE_DATA(nid) (&(node_data[nid])) Hard tab. > +#define VIRTUAL_BUG_ON(x) What is the point to have a BUG_ON that is a nop? > + > +#ifdef CONFIG_NUMA > +extern void init_cpu_to_node(void); > + > +static inline void clear_node_cpumask(int cpu) > +{ > + cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); Hard tab. > +} You move this function in xen/numa.h but this is never used in xen code. It would be better to drop it. > + > +/* Simple perfect hash to map pdx to node numbers */ > +extern int memnode_shift; > +extern unsigned long memnodemapsize; > +extern u8 *memnodemap; > +extern typeof(*memnodemap) _memnodemap[]; > + > +extern struct node_data node_data[]; > + > +static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr) > +{ > + nodeid_t nid; > + VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize); > + nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; > + VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); > + return nid; Hard tab in all this function. > +} > + > +#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) > +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) > +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ > + NODE_DATA(nid)->node_spanned_pages) Same for those 3 macros. > + > +#else > +#define init_cpu_to_node() do {} while (0) Please use static inline init_cpu_to_node(void) {} > +#define clear_node_cpumask(cpu) do {} while (0) Not point of having this one. > +#endif /* CONFIG_NUMA */ > + > +extern void numa_add_cpu(int cpu); > +extern nodeid_t setup_node(unsigned int pxm); > +extern void numa_set_node(int cpu, nodeid_t node); > +extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end); I am not sure to understand why those function are not protected by #ifdef CONFIFG_NUMA. > +extern int compute_hash_shift(struct node *nodes, int numnodes, The function name is a bit too generic. > + nodeid_t *nodeids); > #endif /* _XEN_NUMA_H */ > Cheers,
On Mon, Feb 20, 2017 at 6:07 PM, Julien Grall <julien.grall@arm.com> wrote: > Hello Vijay, > > > On 09/02/17 15:56, vijay.kilari@gmail.com wrote: >> >> From: Vijaya Kumar K <Vijaya.Kumar@cavium.com> >> >> Move common generic NUMA code to xen/common/numa.c from >> xen/arch/x86/numa.c. Also move generic code in header file >> xen/include/asm-x86/numa.h to xen/include/xen/numa.h >> >> This common code can be re-used later for ARM. >> >> Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com> >> --- >> xen/arch/x86/numa.c | 299 --------------------------------------- >> xen/common/Makefile | 1 + >> xen/common/numa.c | 342 >> +++++++++++++++++++++++++++++++++++++++++++++ >> xen/include/asm-x86/numa.h | 47 ------- >> xen/include/xen/numa.h | 54 +++++++ >> 5 files changed, 397 insertions(+), 346 deletions(-) >> >> diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c >> index 6f4d438..bc787e0 100644 >> --- a/xen/arch/x86/numa.c >> +++ b/xen/arch/x86/numa.c >> @@ -25,27 +25,12 @@ custom_param("numa", numa_setup); >> #define Dprintk(x...) >> #endif >> >> -/* from proto.h */ >> -#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1))) >> - >> -struct node_data node_data[MAX_NUMNODES]; >> - >> -/* Mapping from pdx to node id */ >> -int memnode_shift; >> -static typeof(*memnodemap) _memnodemap[64]; >> -unsigned long memnodemapsize; >> -u8 *memnodemap; >> - >> -nodeid_t cpu_to_node[NR_CPUS] __read_mostly = { >> - [0 ... NR_CPUS-1] = NUMA_NO_NODE >> -}; >> /* >> * Keep BIOS's CPU2node information, should not be used for memory >> allocaion >> */ >> nodeid_t apicid_to_node[MAX_LOCAL_APIC] = { >> [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE >> }; >> -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; >> >> nodemask_t __read_mostly node_online_map = { { [0] = 1UL } }; > > > Why this variable is not moved? Ok. Can be moved. > > [...] > >> void __init numa_init_array(void) > > > Same question for this function. Initially I was suspicious about the comment in this function and thought it might be valid of x86 alone. But looks like it is generic. I will have a look. > >> { >> int rr, i; >> @@ -288,16 +145,6 @@ void __init numa_initmem_init(unsigned long >> start_pfn, unsigned long end_pfn) >> (u64)end_pfn << PAGE_SHIFT); >> } >> >> -void numa_add_cpu(int cpu) >> -{ >> - cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); >> -} >> - >> -void numa_set_node(int cpu, nodeid_t node) >> -{ >> - cpu_to_node[cpu] = node; >> -} >> - >> /* [numa=off] */ >> static __init int numa_setup(char *opt) > > > Same question here. Everything in numa_setup and the fake NUMA looks > valid for ARM too. numa_setup() is taken in separate patch. fake numa case is not considered. for x86 it is under separate config CONFIG_NUMA_EMU. > > [....] > >> diff --git a/xen/common/Makefile b/xen/common/Makefile >> index 0fed30b..c1bd2ff 100644 >> --- a/xen/common/Makefile >> +++ b/xen/common/Makefile >> @@ -63,6 +63,7 @@ obj-y += wait.o >> obj-bin-y += warning.init.o >> obj-$(CONFIG_XENOPROF) += xenoprof.o >> obj-y += xmalloc_tlsf.o >> +obj-y += numa.o > > > This needs to be gated with CONFIG_NUMA. As it is shared with x86 and prior this changes it is not gated under any config for x86. > >> >> obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma unlzo >> unlz4 earlycpio,$(n).init.o) >> >> diff --git a/xen/common/numa.c b/xen/common/numa.c >> new file mode 100644 >> index 0000000..59dcb63 >> --- /dev/null >> +++ b/xen/common/numa.c >> @@ -0,0 +1,342 @@ >> +/* >> + * Common NUMA handling functions for x86 and arm. >> + * Original code extracted from arch/x86/numa.c >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; either version 2 of the License, or >> + * (at your option) any later version. > > > Xen is GPLv2 only. Please update to: > > "This program is free software; you can redistribute it and/or > modify it under the terms and conditions of the GNU General Public > License, version 2, as published by the Free Software Foundation." > > >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program; If not, see <http://www.gnu.org/licenses/>. >> + */ >> + >> + >> +#include <xen/mm.h> >> +#include <xen/string.h> >> +#include <xen/init.h> >> +#include <xen/ctype.h> >> +#include <xen/nodemask.h> >> +#include <xen/numa.h> >> +#include <xen/keyhandler.h> >> +#include <xen/time.h> >> +#include <xen/smp.h> >> +#include <xen/pfn.h> >> +#include <xen/sched.h> >> +#include <xen/errno.h> >> +#include <xen/softirq.h> >> +#include <asm/setup.h> >> + >> +struct node_data node_data[MAX_NUMNODES]; >> + >> +/* Mapping from pdx to node id */ > > > Looking at this comment, it looks like the NUMA support should depend on > HAS_PDX as this is not something that may be able on all the architecture. yes it uses pfn_to_pdx() while updating memnodemap. May be comment should be suffice. > >> +int memnode_shift; >> +unsigned long memnodemapsize; >> +u8 *memnodemap; >> +typeof(*memnodemap) _memnodemap[64]; >> + >> +nodeid_t cpu_to_node[NR_CPUS] __read_mostly = { >> + [0 ... NR_CPUS-1] = NUMA_NO_NODE >> +}; >> + >> +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; > > > [...] > >> +void numa_add_cpu(int cpu) >> +{ >> + cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); >> +} >> + >> +void numa_set_node(int cpu, nodeid_t node) >> +{ >> + cpu_to_node[cpu] = node; >> +} >> + >> +EXPORT_SYMBOL(node_to_cpumask); >> +EXPORT_SYMBOL(memnode_shift); >> +EXPORT_SYMBOL(memnodemap); >> +EXPORT_SYMBOL(node_data); > > > Those 4 lines are not part of the original code. Why did you add them? > > To ease review I would like to see this patch split multiple one: > - multiple small to prepare the code (export function, change the > type...) > - a patch to move the code and only move it. No changes at all in > it. OK > > [...] > >> diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h >> index 2479238..61bcd8e 100644 >> --- a/xen/include/asm-x86/numa.h >> +++ b/xen/include/asm-x86/numa.h >> @@ -17,64 +17,17 @@ extern cpumask_t node_to_cpumask[]; >> #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) >> #define node_to_cpumask(node) (node_to_cpumask[node]) >> >> -struct node { >> - u64 start,end; >> -}; >> - >> -extern int compute_hash_shift(struct node *nodes, int numnodes, >> - nodeid_t *nodeids); >> extern nodeid_t pxm_to_node(unsigned int pxm); >> >> #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) >> -#define VIRTUAL_BUG_ON(x) >> >> -extern void numa_add_cpu(int cpu); >> extern void numa_init_array(void); >> extern bool_t numa_off; >> >> - > > > Spurious change. > > >> extern int srat_disabled(void); >> -extern void numa_set_node(int cpu, nodeid_t node); >> -extern nodeid_t setup_node(unsigned int pxm); >> extern void srat_detect_node(int cpu); >> >> -extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end); >> extern nodeid_t apicid_to_node[]; >> -extern void init_cpu_to_node(void); >> - >> -static inline void clear_node_cpumask(int cpu) >> -{ >> - cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); >> -} >> - >> -/* Simple perfect hash to map pdx to node numbers */ >> -extern int memnode_shift; >> -extern unsigned long memnodemapsize; >> -extern u8 *memnodemap; >> - >> -struct node_data { >> - unsigned long node_start_pfn; >> - unsigned long node_spanned_pages; >> -}; >> - >> -extern struct node_data node_data[]; >> - >> -static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr) >> -{ >> - nodeid_t nid; >> - VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= >> memnodemapsize); >> - nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; >> - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); >> - return nid; >> -} >> - >> -#define NODE_DATA(nid) (&(node_data[nid])) >> - >> -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) >> -#define node_spanned_pages(nid) >> (NODE_DATA(nid)->node_spanned_pages) >> -#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ >> - NODE_DATA(nid)->node_spanned_pages) >> - >> extern int valid_numa_range(u64 start, u64 end, nodeid_t node); >> >> void srat_parse_regions(u64 addr); >> diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h >> index 7aef1a8..dd33c92 100644 >> --- a/xen/include/xen/numa.h >> +++ b/xen/include/xen/numa.h >> @@ -18,4 +18,58 @@ >> (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \ >> ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE) >> >> +struct node { >> + u64 start,end; > > > This contains hard tab. It looks like that asm-x86/numa.h add a mix hard > tab/soft tab. Can you have a clean-up patch to drop them first? OK. I will try. > >> +}; >> + >> +struct node_data { >> + unsigned long node_start_pfn; >> + unsigned long node_spanned_pages; >> + nodeid_t node_id; >> +}; >> + >> +#define NODE_DATA(nid) (&(node_data[nid])) > > > Hard tab. > >> +#define VIRTUAL_BUG_ON(x) > > > What is the point to have a BUG_ON that is a nop? yes.. that is part of original code. As part of clean up patch. I will drop it. > >> + >> +#ifdef CONFIG_NUMA >> +extern void init_cpu_to_node(void); >> + >> +static inline void clear_node_cpumask(int cpu) >> +{ >> + cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); > > > Hard tab. > >> +} > > > You move this function in xen/numa.h but this is never used in xen code. It > would be better to drop it. > >> + >> +/* Simple perfect hash to map pdx to node numbers */ >> +extern int memnode_shift; >> +extern unsigned long memnodemapsize; >> +extern u8 *memnodemap; >> +extern typeof(*memnodemap) _memnodemap[]; >> + >> +extern struct node_data node_data[]; >> + >> +static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr) >> +{ >> + nodeid_t nid; >> + VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= >> memnodemapsize); >> + nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; >> + VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); >> + return nid; > > > Hard tab in all this function. > >> +} >> + >> +#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) >> +#define node_spanned_pages(nid) >> (NODE_DATA(nid)->node_spanned_pages) >> +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ >> + NODE_DATA(nid)->node_spanned_pages) > > > Same for those 3 macros. > >> + >> +#else >> +#define init_cpu_to_node() do {} while (0) > > > Please use static inline init_cpu_to_node(void) {} > >> +#define clear_node_cpumask(cpu) do {} while (0) > > > Not point of having this one. > >> +#endif /* CONFIG_NUMA */ >> + >> +extern void numa_add_cpu(int cpu); >> +extern nodeid_t setup_node(unsigned int pxm); >> +extern void numa_set_node(int cpu, nodeid_t node); >> +extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end); > > > I am not sure to understand why those function are not protected by #ifdef > CONFIFG_NUMA. As these defined in xen/common/numa.c which is not under CONFIG_NUMA, I have kept them outside CONFIG_NUMA > >> +extern int compute_hash_shift(struct node *nodes, int numnodes, > > > The function name is a bit too generic. > >> + nodeid_t *nodeids); >> #endif /* _XEN_NUMA_H */ >> > > Cheers, > > -- > Julien Grall
Hello Vijay, On 22/02/17 10:04, Vijay Kilari wrote: > On Mon, Feb 20, 2017 at 6:07 PM, Julien Grall <julien.grall@arm.com> wrote: >> >>> { >>> int rr, i; >>> @@ -288,16 +145,6 @@ void __init numa_initmem_init(unsigned long >>> start_pfn, unsigned long end_pfn) >>> (u64)end_pfn << PAGE_SHIFT); >>> } >>> >>> -void numa_add_cpu(int cpu) >>> -{ >>> - cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); >>> -} >>> - >>> -void numa_set_node(int cpu, nodeid_t node) >>> -{ >>> - cpu_to_node[cpu] = node; >>> -} >>> - >>> /* [numa=off] */ >>> static __init int numa_setup(char *opt) >> >> >> Same question here. Everything in numa_setup and the fake NUMA looks >> valid for ARM too. > > numa_setup() is taken in separate patch. fake numa case is not considered. > for x86 it is under separate config CONFIG_NUMA_EMU. Why fake numa is not considered? This would help to test the series on non-NUMA platform. >> >> [....] >> >>> diff --git a/xen/common/Makefile b/xen/common/Makefile >>> index 0fed30b..c1bd2ff 100644 >>> --- a/xen/common/Makefile >>> +++ b/xen/common/Makefile >>> @@ -63,6 +63,7 @@ obj-y += wait.o >>> obj-bin-y += warning.init.o >>> obj-$(CONFIG_XENOPROF) += xenoprof.o >>> obj-y += xmalloc_tlsf.o >>> +obj-y += numa.o >> >> >> This needs to be gated with CONFIG_NUMA. > > As it is shared with x86 and prior this changes it is not gated under > any config for x86. Because x86 code assume that CONFIG_NUMA is always enabled. If you look at the .config CONFIG_NUMA will be set so you can gate it here. >> Looking at this comment, it looks like the NUMA support should depend on >> HAS_PDX as this is not something that may be able on all the architecture. > > yes it uses pfn_to_pdx() while updating memnodemap. > May be comment should be suffice. A comment may be missed, gated in the Kconfig will prevent a new architecture to use NUMA without knowing PDX is necessary. [...] >>> +#endif /* CONFIG_NUMA */ >>> + >>> +extern void numa_add_cpu(int cpu); >>> +extern nodeid_t setup_node(unsigned int pxm); >>> +extern void numa_set_node(int cpu, nodeid_t node); >>> +extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end); >> >> >> I am not sure to understand why those function are not protected by #ifdef >> CONFIFG_NUMA. > As these defined in xen/common/numa.c which is not under CONFIG_NUMA, > I have kept them outside CONFIG_NUMA xen/common/numa.c should be under CONFIG_NUMA. There is no reason to expose the code on non-NUMA platform. Regards,
Hi Jan, On Thu, Feb 9, 2017 at 9:41 PM, Jan Beulich <JBeulich@suse.com> wrote: >>>> On 09.02.17 at 16:56, <vijay.kilari@gmail.com> wrote: >> From: Vijaya Kumar K <Vijaya.Kumar@cavium.com> >> >> Move common generic NUMA code to xen/common/numa.c from >> xen/arch/x86/numa.c. Also move generic code in header file >> xen/include/asm-x86/numa.h to xen/include/xen/numa.h >> >> This common code can be re-used later for ARM. >> >> Signed-off-by: Vijaya Kumar K <Vijaya.Kumar@cavium.com> > > I would have been nice if you Cc-ed the maintainers of the code > you're moving. > >> --- /dev/null >> +++ b/xen/common/numa.c >> @@ -0,0 +1,342 @@ >> +/* >> + * Common NUMA handling functions for x86 and arm. >> + * Original code extracted from arch/x86/numa.c >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; either version 2 of the License, or >> + * (at your option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program; If not, see <http://www.gnu.org/licenses/>. >> + */ >> + >> + >> +#include <xen/mm.h> >> +#include <xen/string.h> >> +#include <xen/init.h> >> +#include <xen/ctype.h> >> +#include <xen/nodemask.h> >> +#include <xen/numa.h> >> +#include <xen/keyhandler.h> >> +#include <xen/time.h> >> +#include <xen/smp.h> >> +#include <xen/pfn.h> >> +#include <xen/sched.h> >> +#include <xen/errno.h> >> +#include <xen/softirq.h> >> +#include <asm/setup.h> > > This last one would better not be included here. > >> +struct node_data node_data[MAX_NUMNODES]; >> + >> +/* Mapping from pdx to node id */ >> +int memnode_shift; >> +unsigned long memnodemapsize; >> +u8 *memnodemap; >> +typeof(*memnodemap) _memnodemap[64]; > > Careful with removing any "static" please. For the last one in > particular you would need to change the name if it's really necessary > to make non-static. Even better would be though to keep it static > and provide suitable accessors. > > Also please sanitize types as you're moving stuff: memnode_shift > looks like it really wants to be unsigned int, and u8 should really > be uint8_t (as we're trying to phase out u8 & Co). This also applies > to code further down. You mean to change all occurrences of s/u8/uint8_t s/u32/uint32_t s/u64/uint64_t Also, I see that xen/arch/x86/srat.c coding style is not adhere to xen coding style. Shall I clean up before I move the code?
>>> Vijay Kilari <vijay.kilari@gmail.com> 02/27/17 12:43 PM >>> >On Thu, Feb 9, 2017 at 9:41 PM, Jan Beulich <JBeulich@suse.com> wrote: >>>>> On 09.02.17 at 16:56, <vijay.kilari@gmail.com> wrote: >>> +struct node_data node_data[MAX_NUMNODES]; >>> + >>> +/* Mapping from pdx to node id */ >>> +int memnode_shift; >>> +unsigned long memnodemapsize; >>> +u8 *memnodemap; >>> +typeof(*memnodemap) _memnodemap[64]; >> >> Careful with removing any "static" please. For the last one in >> particular you would need to change the name if it's really necessary >> to make non-static. Even better would be though to keep it static >> and provide suitable accessors. >> >> Also please sanitize types as you're moving stuff: memnode_shift >> looks like it really wants to be unsigned int, and u8 should really >> be uint8_t (as we're trying to phase out u8 & Co). This also applies >> to code further down. > >You mean to change all occurrences of >s/u8/uint8_t >s/u32/uint32_t >s/u64/uint64_t Yes. >Also, I see that xen/arch/x86/srat.c coding style is not adhere to xen >coding style. >Shall I clean up before I move the code? If you want to take the time - sure. All I'd like to ask for is that at least the file(s) you move the code _to_ end up with consistent style. Jan
diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c index 6f4d438..bc787e0 100644 --- a/xen/arch/x86/numa.c +++ b/xen/arch/x86/numa.c @@ -25,27 +25,12 @@ custom_param("numa", numa_setup); #define Dprintk(x...) #endif -/* from proto.h */ -#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1))) - -struct node_data node_data[MAX_NUMNODES]; - -/* Mapping from pdx to node id */ -int memnode_shift; -static typeof(*memnodemap) _memnodemap[64]; -unsigned long memnodemapsize; -u8 *memnodemap; - -nodeid_t cpu_to_node[NR_CPUS] __read_mostly = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; /* * Keep BIOS's CPU2node information, should not be used for memory allocaion */ nodeid_t apicid_to_node[MAX_LOCAL_APIC] = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; nodemask_t __read_mostly node_online_map = { { [0] = 1UL } }; @@ -57,134 +42,6 @@ int srat_disabled(void) return numa_off || acpi_numa < 0; } -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct node *nodes, - int numnodes, int shift, nodeid_t *nodeids) -{ - unsigned long spdx, epdx; - int i, res = -1; - - memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap)); - for ( i = 0; i < numnodes; i++ ) - { - spdx = paddr_to_pdx(nodes[i].start); - epdx = paddr_to_pdx(nodes[i].end - 1) + 1; - if ( spdx >= epdx ) - continue; - if ( (epdx >> shift) >= memnodemapsize ) - return 0; - do { - if ( memnodemap[spdx >> shift] != NUMA_NO_NODE ) - return -1; - - if ( !nodeids ) - memnodemap[spdx >> shift] = i; - else - memnodemap[spdx >> shift] = nodeids[i]; - - spdx += (1UL << shift); - } while ( spdx < epdx ); - res = 1; - } - - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap)); - unsigned long mfn = alloc_boot_pages(size, 1); - - if ( !mfn ) - { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - memnodemapsize = 0; - return -1; - } - - memnodemap = mfn_to_virt(mfn); - mfn <<= PAGE_SHIFT; - size <<= PAGE_SHIFT; - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - mfn, mfn + size); - memnodemapsize = size / sizeof(*memnodemap); - - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. - */ -static int __init extract_lsb_from_nodes(const struct node *nodes, - int numnodes) -{ - int i, nodes_used = 0; - unsigned long spdx, epdx; - unsigned long bitfield = 0, memtop = 0; - - for ( i = 0; i < numnodes; i++ ) - { - spdx = paddr_to_pdx(nodes[i].start); - epdx = paddr_to_pdx(nodes[i].end - 1) + 1; - if ( spdx >= epdx ) - continue; - bitfield |= spdx; - nodes_used++; - if ( epdx > memtop ) - memtop = epdx; - } - if ( nodes_used <= 1 ) - i = BITS_PER_LONG - 1; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i) + 1; - return i; -} - -int __init compute_hash_shift(struct node *nodes, int numnodes, - nodeid_t *nodeids) -{ - int shift; - - shift = extract_lsb_from_nodes(nodes, numnodes); - if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) ) - memnodemap = _memnodemap; - else if ( allocate_cachealigned_memnodemap() ) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - - if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 ) - { - printk(KERN_INFO "Your memory is not aligned you need to " - "rebuild your hypervisor with a bigger NODEMAPSIZE " - "shift=%d\n", shift); - return -1; - } - - return shift; -} -/* initialize NODE_DATA given nodeid and start/end */ -void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end) -{ - unsigned long start_pfn, end_pfn; - - start_pfn = start >> PAGE_SHIFT; - end_pfn = end >> PAGE_SHIFT; - - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; - - node_set_online(nodeid); -} - void __init numa_init_array(void) { int rr, i; @@ -288,16 +145,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) (u64)end_pfn << PAGE_SHIFT); } -void numa_add_cpu(int cpu) -{ - cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); -} - -void numa_set_node(int cpu, nodeid_t node) -{ - cpu_to_node[cpu] = node; -} - /* [numa=off] */ static __init int numa_setup(char *opt) { @@ -373,149 +220,3 @@ unsigned int __init arch_get_dma_bitsize(void) flsl(node_start_pfn(node) + node_spanned_pages(node) / 4 - 1) + PAGE_SHIFT, 32); } - -static void dump_numa(unsigned char key) -{ - s_time_t now = NOW(); - unsigned int i, j, n; - int err; - struct domain *d; - struct page_info *page; - unsigned int page_num_node[MAX_NUMNODES]; - const struct vnuma_info *vnuma; - - printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key, - (u32)(now>>32), (u32)now); - - for_each_online_node ( i ) - { - paddr_t pa = pfn_to_paddr(node_start_pfn(i) + 1); - - printk("NODE%u start->%lu size->%lu free->%lu\n", - i, node_start_pfn(i), node_spanned_pages(i), - avail_node_heap_pages(i)); - /* sanity check phys_to_nid() */ - if ( phys_to_nid(pa) != i ) - printk("phys_to_nid(%"PRIpaddr") -> %d should be %u\n", - pa, phys_to_nid(pa), i); - } - - j = cpumask_first(&cpu_online_map); - n = 0; - for_each_online_cpu ( i ) - { - if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] ) - { - if ( n > 1 ) - printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]); - else - printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]); - j = i; - n = 1; - } - else - ++n; - } - if ( n > 1 ) - printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]); - else - printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]); - - rcu_read_lock(&domlist_read_lock); - - printk("Memory location of each domain:\n"); - for_each_domain ( d ) - { - process_pending_softirqs(); - - printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages); - - for_each_online_node ( i ) - page_num_node[i] = 0; - - spin_lock(&d->page_alloc_lock); - page_list_for_each(page, &d->page_list) - { - i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT); - page_num_node[i]++; - } - spin_unlock(&d->page_alloc_lock); - - for_each_online_node ( i ) - printk(" Node %u: %u\n", i, page_num_node[i]); - - if ( !read_trylock(&d->vnuma_rwlock) ) - continue; - - if ( !d->vnuma ) - { - read_unlock(&d->vnuma_rwlock); - continue; - } - - vnuma = d->vnuma; - printk(" %u vnodes, %u vcpus, guest physical layout:\n", - vnuma->nr_vnodes, d->max_vcpus); - for ( i = 0; i < vnuma->nr_vnodes; i++ ) - { - unsigned int start_cpu = ~0U; - - err = snprintf(keyhandler_scratch, 12, "%3u", - vnuma->vnode_to_pnode[i]); - if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE ) - strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch)); - - printk(" %3u: pnode %s,", i, keyhandler_scratch); - - printk(" vcpus "); - - for ( j = 0; j < d->max_vcpus; j++ ) - { - if ( !(j & 0x3f) ) - process_pending_softirqs(); - - if ( vnuma->vcpu_to_vnode[j] == i ) - { - if ( start_cpu == ~0U ) - { - printk("%d", j); - start_cpu = j; - } - } - else if ( start_cpu != ~0U ) - { - if ( j - 1 != start_cpu ) - printk("-%d ", j - 1); - else - printk(" "); - start_cpu = ~0U; - } - } - - if ( start_cpu != ~0U && start_cpu != j - 1 ) - printk("-%d", j - 1); - - printk("\n"); - - for ( j = 0; j < vnuma->nr_vmemranges; j++ ) - { - if ( vnuma->vmemrange[j].nid == i ) - printk(" %016"PRIx64" - %016"PRIx64"\n", - vnuma->vmemrange[j].start, - vnuma->vmemrange[j].end); - } - } - - read_unlock(&d->vnuma_rwlock); - } - - rcu_read_unlock(&domlist_read_lock); -} - -static __init int register_numa_trigger(void) -{ - register_keyhandler('u', dump_numa, "dump NUMA info", 1); - return 0; -} -__initcall(register_numa_trigger); - diff --git a/xen/common/Makefile b/xen/common/Makefile index 0fed30b..c1bd2ff 100644 --- a/xen/common/Makefile +++ b/xen/common/Makefile @@ -63,6 +63,7 @@ obj-y += wait.o obj-bin-y += warning.init.o obj-$(CONFIG_XENOPROF) += xenoprof.o obj-y += xmalloc_tlsf.o +obj-y += numa.o obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma unlzo unlz4 earlycpio,$(n).init.o) diff --git a/xen/common/numa.c b/xen/common/numa.c new file mode 100644 index 0000000..59dcb63 --- /dev/null +++ b/xen/common/numa.c @@ -0,0 +1,342 @@ +/* + * Common NUMA handling functions for x86 and arm. + * Original code extracted from arch/x86/numa.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; If not, see <http://www.gnu.org/licenses/>. + */ + + +#include <xen/mm.h> +#include <xen/string.h> +#include <xen/init.h> +#include <xen/ctype.h> +#include <xen/nodemask.h> +#include <xen/numa.h> +#include <xen/keyhandler.h> +#include <xen/time.h> +#include <xen/smp.h> +#include <xen/pfn.h> +#include <xen/sched.h> +#include <xen/errno.h> +#include <xen/softirq.h> +#include <asm/setup.h> + +struct node_data node_data[MAX_NUMNODES]; + +/* Mapping from pdx to node id */ +int memnode_shift; +unsigned long memnodemapsize; +u8 *memnodemap; +typeof(*memnodemap) _memnodemap[64]; + +nodeid_t cpu_to_node[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = NUMA_NO_NODE +}; + +cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +/* + * Given a shift value, try to populate memnodemap[] + * Returns : + * 1 if OK + * 0 if memnodmap[] too small (of shift too small) + * -1 if node overlap or lost ram (shift too big) + */ +static int __init populate_memnodemap(const struct node *nodes, + int numnodes, int shift, + nodeid_t *nodeids) +{ + unsigned long spdx, epdx; + int i, res = -1; + + memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap)); + for ( i = 0; i < numnodes; i++ ) + { + spdx = paddr_to_pdx(nodes[i].start); + epdx = paddr_to_pdx(nodes[i].end - 1) + 1; + if ( spdx >= epdx ) + continue; + if ( (epdx >> shift) >= memnodemapsize ) + return 0; + do { + if ( memnodemap[spdx >> shift] != NUMA_NO_NODE ) + return -1; + + if ( !nodeids ) + memnodemap[spdx >> shift] = i; + else + memnodemap[spdx >> shift] = nodeids[i]; + + spdx += (1UL << shift); + } while ( spdx < epdx ); + res = 1; + } + + return res; +} + +static int __init allocate_cachealigned_memnodemap(void) +{ + unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap)); + unsigned long mfn; + + mfn = alloc_boot_pages(size, 1); + if ( !mfn ) + { + printk(KERN_ERR + "NUMA: Unable to allocate Memory to Node hash map\n"); + memnodemapsize = 0; + return -1; + } + + memnodemap = mfn_to_virt(mfn); + mfn <<= PAGE_SHIFT; + size <<= PAGE_SHIFT; + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + mfn, mfn + size); + memnodemapsize = size / sizeof(*memnodemap); + + return 0; +} + +/* + * The LSB of all start and end addresses in the node map is the value of the + * maximum possible shift. + */ +static int __init extract_lsb_from_nodes(const struct node *nodes, + int numnodes) +{ + int i, nodes_used = 0; + unsigned long spdx, epdx; + unsigned long bitfield = 0, memtop = 0; + + for ( i = 0; i < numnodes; i++ ) + { + spdx = paddr_to_pdx(nodes[i].start); + epdx = paddr_to_pdx(nodes[i].end - 1) + 1; + if ( spdx >= epdx ) + continue; + bitfield |= spdx; + nodes_used++; + if ( epdx > memtop ) + memtop = epdx; + } + if ( nodes_used <= 1 ) + i = BITS_PER_LONG - 1; + else + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + + memnodemapsize = (memtop >> i) + 1; + + return i; +} + +int __init compute_hash_shift(struct node *nodes, int numnodes, + nodeid_t *nodeids) +{ + int shift; + + shift = extract_lsb_from_nodes(nodes, numnodes); + if ( memnodemapsize <= ARRAY_SIZE(_memnodemap) ) + memnodemap = _memnodemap; + else if ( allocate_cachealigned_memnodemap() ) + return -1; + printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); + + if ( populate_memnodemap(nodes, numnodes, shift, nodeids) != 1 ) + { + printk(KERN_INFO "Your memory is not aligned you need to " + "rebuild your hypervisor with a bigger NODEMAPSIZE " + "shift=%d\n", shift); + return -1; + } + + return shift; +} + +/* initialize NODE_DATA given nodeid and start/end */ +void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end) +{ + unsigned long start_pfn, end_pfn; + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + NODE_DATA(nodeid)->node_id = nodeid; + NODE_DATA(nodeid)->node_start_pfn = start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; + + node_set_online(nodeid); +} + +void numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); +} + +void numa_set_node(int cpu, nodeid_t node) +{ + cpu_to_node[cpu] = node; +} + +EXPORT_SYMBOL(node_to_cpumask); +EXPORT_SYMBOL(memnode_shift); +EXPORT_SYMBOL(memnodemap); +EXPORT_SYMBOL(node_data); + +static void dump_numa(unsigned char key) +{ + s_time_t now = NOW(); + unsigned int i, j, n; + int err; + struct domain *d; + struct page_info *page; + unsigned int page_num_node[MAX_NUMNODES] = {0}; + const struct vnuma_info *vnuma; + + printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key, + (u32)(now>>32), (u32)now); + + for_each_online_node ( i ) + { + paddr_t pa = (paddr_t)(NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT; + printk("idx%d -> NODE%d start->%lu size->%lu free->%lu\n", + i, NODE_DATA(i)->node_id, + NODE_DATA(i)->node_start_pfn, + NODE_DATA(i)->node_spanned_pages, + avail_node_heap_pages(i)); + /* sanity check phys_to_nid() */ + printk("phys_to_nid(%"PRIpaddr") -> %d should be %d\n", pa, + phys_to_nid(pa), + NODE_DATA(i)->node_id); + } + + j = cpumask_first(&cpu_online_map); + n = 0; + for_each_online_cpu ( i ) + { + if ( i != j + n || cpu_to_node[j] != cpu_to_node[i] ) + { + if ( n > 1 ) + printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]); + else + printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]); + j = i; + n = 1; + } + else + ++n; + } + if ( n > 1 ) + printk("CPU%u...%u -> NODE%d\n", j, j + n - 1, cpu_to_node[j]); + else + printk("CPU%u -> NODE%d\n", j, cpu_to_node[j]); + + rcu_read_lock(&domlist_read_lock); + + printk("Memory location of each domain:\n"); + for_each_domain ( d ) + { + process_pending_softirqs(); + printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages); + + for_each_online_node ( i ) + page_num_node[i] = 0; + + spin_lock(&d->page_alloc_lock); + page_list_for_each(page, &d->page_list) + { + i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT); + page_num_node[i]++; + } + spin_unlock(&d->page_alloc_lock); + + for_each_online_node ( i ) + printk(" Node %u: %u\n", i, page_num_node[i]); + + if ( !read_trylock(&d->vnuma_rwlock) ) + continue; + + if ( !d->vnuma ) + { + read_unlock(&d->vnuma_rwlock); + continue; + } + + vnuma = d->vnuma; + printk(" %u vnodes, %u vcpus, guest physical layout:\n", + vnuma->nr_vnodes, d->max_vcpus); + for ( i = 0; i < vnuma->nr_vnodes; i++ ) + { + unsigned int start_cpu = ~0U; + + err = snprintf(keyhandler_scratch, 12, "%3u", + vnuma->vnode_to_pnode[i]); + if ( err < 0 || vnuma->vnode_to_pnode[i] == NUMA_NO_NODE ) + strlcpy(keyhandler_scratch, "???", sizeof(keyhandler_scratch)); + + printk(" %3u: pnode %s,", i, keyhandler_scratch); + + printk(" vcpus "); + + for ( j = 0; j < d->max_vcpus; j++ ) + { + if ( !(j & 0x3f) ) + process_pending_softirqs(); + + if ( vnuma->vcpu_to_vnode[j] == i ) + { + if ( start_cpu == ~0U ) + { + printk("%d", j); + start_cpu = j; + } + } + else if ( start_cpu != ~0U ) + { + if ( j - 1 != start_cpu ) + printk("-%d ", j - 1); + else + printk(" "); + start_cpu = ~0U; + } + } + + if ( start_cpu != ~0U && start_cpu != j - 1 ) + printk("-%d", j - 1); + + printk("\n"); + + for ( j = 0; j < vnuma->nr_vmemranges; j++ ) + { + if ( vnuma->vmemrange[j].nid == i ) + printk(" %016"PRIx64" - %016"PRIx64"\n", + vnuma->vmemrange[j].start, + vnuma->vmemrange[j].end); + } + } + + read_unlock(&d->vnuma_rwlock); + } + + rcu_read_unlock(&domlist_read_lock); +} + +static __init int register_numa_trigger(void) +{ + register_keyhandler('u', dump_numa, "dump NUMA info", 1); + return 0; +} +__initcall(register_numa_trigger); + diff --git a/xen/include/asm-x86/numa.h b/xen/include/asm-x86/numa.h index 2479238..61bcd8e 100644 --- a/xen/include/asm-x86/numa.h +++ b/xen/include/asm-x86/numa.h @@ -17,64 +17,17 @@ extern cpumask_t node_to_cpumask[]; #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) #define node_to_cpumask(node) (node_to_cpumask[node]) -struct node { - u64 start,end; -}; - -extern int compute_hash_shift(struct node *nodes, int numnodes, - nodeid_t *nodeids); extern nodeid_t pxm_to_node(unsigned int pxm); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -#define VIRTUAL_BUG_ON(x) -extern void numa_add_cpu(int cpu); extern void numa_init_array(void); extern bool_t numa_off; - extern int srat_disabled(void); -extern void numa_set_node(int cpu, nodeid_t node); -extern nodeid_t setup_node(unsigned int pxm); extern void srat_detect_node(int cpu); -extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end); extern nodeid_t apicid_to_node[]; -extern void init_cpu_to_node(void); - -static inline void clear_node_cpumask(int cpu) -{ - cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); -} - -/* Simple perfect hash to map pdx to node numbers */ -extern int memnode_shift; -extern unsigned long memnodemapsize; -extern u8 *memnodemap; - -struct node_data { - unsigned long node_start_pfn; - unsigned long node_spanned_pages; -}; - -extern struct node_data node_data[]; - -static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr) -{ - nodeid_t nid; - VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize); - nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); - return nid; -} - -#define NODE_DATA(nid) (&(node_data[nid])) - -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) -#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ - NODE_DATA(nid)->node_spanned_pages) - extern int valid_numa_range(u64 start, u64 end, nodeid_t node); void srat_parse_regions(u64 addr); diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h index 7aef1a8..dd33c92 100644 --- a/xen/include/xen/numa.h +++ b/xen/include/xen/numa.h @@ -18,4 +18,58 @@ (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \ ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE) +struct node { + u64 start,end; +}; + +struct node_data { + unsigned long node_start_pfn; + unsigned long node_spanned_pages; + nodeid_t node_id; +}; + +#define NODE_DATA(nid) (&(node_data[nid])) +#define VIRTUAL_BUG_ON(x) + +#ifdef CONFIG_NUMA +extern void init_cpu_to_node(void); + +static inline void clear_node_cpumask(int cpu) +{ + cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); +} + +/* Simple perfect hash to map pdx to node numbers */ +extern int memnode_shift; +extern unsigned long memnodemapsize; +extern u8 *memnodemap; +extern typeof(*memnodemap) _memnodemap[]; + +extern struct node_data node_data[]; + +static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr) +{ + nodeid_t nid; + VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize); + nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; + VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); + return nid; +} + +#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ + NODE_DATA(nid)->node_spanned_pages) + +#else +#define init_cpu_to_node() do {} while (0) +#define clear_node_cpumask(cpu) do {} while (0) +#endif /* CONFIG_NUMA */ + +extern void numa_add_cpu(int cpu); +extern nodeid_t setup_node(unsigned int pxm); +extern void numa_set_node(int cpu, nodeid_t node); +extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end); +extern int compute_hash_shift(struct node *nodes, int numnodes, + nodeid_t *nodeids); #endif /* _XEN_NUMA_H */