diff mbox series

[RFC] x86/sgx: Add trivial NUMA allocation

Message ID 20201216135031.21518-1-jarkko@kernel.org (mailing list archive)
State New, archived
Headers show
Series [RFC] x86/sgx: Add trivial NUMA allocation | expand

Commit Message

Jarkko Sakkinen Dec. 16, 2020, 1:50 p.m. UTC
Create a pointer array for each NUMA node with the references to the
contained EPC sections. Use this in __sgx_alloc_epc_page() to knock the
current NUMA node before the others.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 arch/x86/kernel/cpu/sgx/main.c | 66 +++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 5 deletions(-)

Comments

Dave Hansen Jan. 13, 2021, 12:24 a.m. UTC | #1
On 12/16/20 5:50 AM, Jarkko Sakkinen wrote:
> Create a pointer array for each NUMA node with the references to the
> contained EPC sections. Use this in __sgx_alloc_epc_page() to knock the
> current NUMA node before the others.

It makes it harder to comment when I'm not on cc.

Hint, hint... ;)

We need a bit more information here as well.  What's the relationship
between NUMA nodes and sections?  How does the BIOS tell us which NUMA
nodes a section is in?  Is it the same or different from normal RAM and
PMEM?

> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index c519fc5f6948..0da510763c47 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -13,6 +13,13 @@
>  #include "encl.h"
>  #include "encls.h"
>  
> +struct sgx_numa_node {
> +	struct sgx_epc_section *sections[SGX_MAX_EPC_SECTIONS];
> +	int nr_sections;
> +};

So, we have a 'NUMA node' structure already: pg_data_t.  Why don't we
just hang the epc sections off there?

> +static struct sgx_numa_node sgx_numa_nodes[MAX_NUMNODES];

Hmm...  Time to see if I can still do math.

#define SGX_MAX_EPC_SECTIONS            8

(sizeof(struct sgx_epc_section *) + sizeof(int)) * 8 * MAX_NUMNODES

CONFIG_NODES_SHIFT=10 (on Fedora)
#define MAX_NUMNODES (1 << NODES_SHIFT)

12*8*1024 = ~100k.  Yikes.  For *EVERY* system that enables SGX,
regardless of whether they are NUMA or not.

Trivial is great, this may be too trivial.

Adding a list_head to pg_data_t and sgx_epc_section would add
SGX_MAX_EPC_SECTIONS*sizeof(list_head)=192 bytes plus 16 bytes per
*present* NUMA node.

>  /**
>   * __sgx_alloc_epc_page() - Allocate an EPC page
>   *
> @@ -485,14 +511,19 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_sec
>   */
>  struct sgx_epc_page *__sgx_alloc_epc_page(void)
>  {
> -	struct sgx_epc_section *section;
>  	struct sgx_epc_page *page;
> +	int nid = numa_node_id();
>  	int i;
>  
> -	for (i = 0; i < sgx_nr_epc_sections; i++) {
> -		section = &sgx_epc_sections[i];
> +	page = __sgx_alloc_epc_page_from_node(nid);
> +	if (page)
> +		return page;
>  
> -		page = __sgx_alloc_epc_page_from_section(section);
> +	for (i = 0; i < sgx_nr_numa_nodes; i++) {
> +		if (i == nid)
> +			continue;

Yikes.  That's a horribly inefficient loop.  Consider if nodes 0 and
1023 were the only ones with EPC.  What would this loop do?  I think
it's a much better idea to keep a nodemask_t of nodes that have EPC.
Then, just do bitmap searches.

> +		page = __sgx_alloc_epc_page_from_node(i);
>  		if (page)
>  			return page;
>  	}
> @@ -661,11 +692,28 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
>  	       ((high & GENMASK_ULL(19, 0)) << 32);
>  }
>  
> +static int __init sgx_pfn_to_nid(unsigned long pfn)
> +{
> +	pg_data_t *pgdat;
> +	int nid;
> +
> +	for (nid = 0; nid < nr_node_ids; nid++) {
> +		pgdat = NODE_DATA(nid);
> +
> +		if (pfn >= pgdat->node_start_pfn &&
> +		    pfn < (pgdat->node_start_pfn + pgdat->node_spanned_pages))
> +			return nid;
> +	}
> +
> +	return 0;
> +}

I'm not positive this works.  I *thought* these ->node_start_pfn and
->node_spanned_pages are really only guaranteed to cover memory which is
managed by the kernel and has 'struct page' for it.

EPC doesn't have a 'struct page', so won't necessarily be covered by the
pgdat-> and zone-> ranges.  I *think* you may have to go all the way
back to the ACPI SRAT for this.

It would also be *possible* to have an SRAT constructed like this:

0->1GB System RAM - Node 0
1->2GB Reserved   - Node 1
2->3GB System RAM - Node 0

Where the 1->2GB is EPC.  The Node 0 pg_data_t would be:

	pgdat->node_start_pfn = 0
	pgdat->node_spanned_pages = 3GB
Jarkko Sakkinen Jan. 14, 2021, 5:54 p.m. UTC | #2
On Tue, Jan 12, 2021 at 04:24:01PM -0800, Dave Hansen wrote:
> On 12/16/20 5:50 AM, Jarkko Sakkinen wrote:
> > Create a pointer array for each NUMA node with the references to the
> > contained EPC sections. Use this in __sgx_alloc_epc_page() to knock the
> > current NUMA node before the others.
> 
> It makes it harder to comment when I'm not on cc.
> 
> Hint, hint... ;)
> 
> We need a bit more information here as well.  What's the relationship
> between NUMA nodes and sections?  How does the BIOS tell us which NUMA
> nodes a section is in?  Is it the same or different from normal RAM and
> PMEM?

How does it go with pmem?

> > diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> > index c519fc5f6948..0da510763c47 100644
> > --- a/arch/x86/kernel/cpu/sgx/main.c
> > +++ b/arch/x86/kernel/cpu/sgx/main.c
> > @@ -13,6 +13,13 @@
> >  #include "encl.h"
> >  #include "encls.h"
> >  
> > +struct sgx_numa_node {
> > +	struct sgx_epc_section *sections[SGX_MAX_EPC_SECTIONS];
> > +	int nr_sections;
> > +};
> 
> So, we have a 'NUMA node' structure already: pg_data_t.  Why don't we
> just hang the epc sections off there?
> 
> > +static struct sgx_numa_node sgx_numa_nodes[MAX_NUMNODES];
> 
> Hmm...  Time to see if I can still do math.
> 
> #define SGX_MAX_EPC_SECTIONS            8
> 
> (sizeof(struct sgx_epc_section *) + sizeof(int)) * 8 * MAX_NUMNODES
> 
> CONFIG_NODES_SHIFT=10 (on Fedora)
> #define MAX_NUMNODES (1 << NODES_SHIFT)
> 
> 12*8*1024 = ~100k.  Yikes.  For *EVERY* system that enables SGX,
> regardless of whether they are NUMA or not.
> 
> Trivial is great, this may be too trivial.
> 
> Adding a list_head to pg_data_t and sgx_epc_section would add
> SGX_MAX_EPC_SECTIONS*sizeof(list_head)=192 bytes plus 16 bytes per
> *present* NUMA node.

I'm hearing you.

> >  /**
> >   * __sgx_alloc_epc_page() - Allocate an EPC page
> >   *
> > @@ -485,14 +511,19 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_sec
> >   */
> >  struct sgx_epc_page *__sgx_alloc_epc_page(void)
> >  {
> > -	struct sgx_epc_section *section;
> >  	struct sgx_epc_page *page;
> > +	int nid = numa_node_id();
> >  	int i;
> >  
> > -	for (i = 0; i < sgx_nr_epc_sections; i++) {
> > -		section = &sgx_epc_sections[i];
> > +	page = __sgx_alloc_epc_page_from_node(nid);
> > +	if (page)
> > +		return page;
> >  
> > -		page = __sgx_alloc_epc_page_from_section(section);
> > +	for (i = 0; i < sgx_nr_numa_nodes; i++) {
> > +		if (i == nid)
> > +			continue;
> 
> Yikes.  That's a horribly inefficient loop.  Consider if nodes 0 and
> 1023 were the only ones with EPC.  What would this loop do?  I think
> it's a much better idea to keep a nodemask_t of nodes that have EPC.
> Then, just do bitmap searches.

OK, so we need to maintain nodemask_t containing nodes with EPC.

Probably also split the patch to a patch that builds nodemask_t,
and the one that uses it. They are separately reviewable clearly.
The allocation part can be considered sorted out now.
> 
> > +		page = __sgx_alloc_epc_page_from_node(i);
> >  		if (page)
> >  			return page;
> >  	}
> > @@ -661,11 +692,28 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
> >  	       ((high & GENMASK_ULL(19, 0)) << 32);
> >  }
> >  
> > +static int __init sgx_pfn_to_nid(unsigned long pfn)
> > +{
> > +	pg_data_t *pgdat;
> > +	int nid;
> > +
> > +	for (nid = 0; nid < nr_node_ids; nid++) {
> > +		pgdat = NODE_DATA(nid);
> > +
> > +		if (pfn >= pgdat->node_start_pfn &&
> > +		    pfn < (pgdat->node_start_pfn + pgdat->node_spanned_pages))
> > +			return nid;
> > +	}
> > +
> > +	return 0;
> > +}
> 
> I'm not positive this works.  I *thought* these ->node_start_pfn and
> ->node_spanned_pages are really only guaranteed to cover memory which is
> managed by the kernel and has 'struct page' for it.
> 
> EPC doesn't have a 'struct page', so won't necessarily be covered by the
> pgdat-> and zone-> ranges.  I *think* you may have to go all the way
> back to the ACPI SRAT for this.
> 
> It would also be *possible* to have an SRAT constructed like this:
> 
> 0->1GB System RAM - Node 0
> 1->2GB Reserved   - Node 1
> 2->3GB System RAM - Node 0
> 
> Where the 1->2GB is EPC.  The Node 0 pg_data_t would be:
> 
> 	pgdat->node_start_pfn = 0
> 	pgdat->node_spanned_pages = 3GB

If I've understood the current Linux memory architecture correctly.

- Memory is made available through mm/memory_hotplug.c, which is populated
  by drivers/acpi/acpi_memhotplug.c.
- drivers/acpi/numa/srat.c provides the conversion API from proximity node to
  logical node but I'm not *yet* sure how the interaction goes with memory
  hot plugging

I'm not sure if I'm following the idea of the alternative SRAT construction.
So are you saying that srat.c would somehow group pxm's with EPC to
specific node numbers?

/Jarkko
Dave Hansen Jan. 14, 2021, 6:35 p.m. UTC | #3
On 1/14/21 9:54 AM, Jarkko Sakkinen wrote:
> On Tue, Jan 12, 2021 at 04:24:01PM -0800, Dave Hansen wrote:
>> We need a bit more information here as well.  What's the relationship
>> between NUMA nodes and sections?  How does the BIOS tell us which NUMA
>> nodes a section is in?  Is it the same or different from normal RAM and
>> PMEM?
> 
> How does it go with pmem?

I just wanted to point out PMEM as being referred to by the SRAT, but as
something which is *not* "System RAM".  There might be some overlap in
NUMA for PMEM and NUMA for SGX memory since neither is enumerated as
"System RAM".

...
>> I'm not positive this works.  I *thought* these ->node_start_pfn and
>> ->node_spanned_pages are really only guaranteed to cover memory which is
>> managed by the kernel and has 'struct page' for it.
>>
>> EPC doesn't have a 'struct page', so won't necessarily be covered by the
>> pgdat-> and zone-> ranges.  I *think* you may have to go all the way
>> back to the ACPI SRAT for this.
>>
>> It would also be *possible* to have an SRAT constructed like this:
>>
>> 0->1GB System RAM - Node 0
>> 1->2GB Reserved   - Node 1
>> 2->3GB System RAM - Node 0
>>
>> Where the 1->2GB is EPC.  The Node 0 pg_data_t would be:
>>
>> 	pgdat->node_start_pfn = 0
>> 	pgdat->node_spanned_pages = 3GB
> 
> If I've understood the current Linux memory architecture correctly.
> 
> - Memory is made available through mm/memory_hotplug.c, which is populated
>   by drivers/acpi/acpi_memhotplug.c.
> - drivers/acpi/numa/srat.c provides the conversion API from proximity node to
>   logical node but I'm not *yet* sure how the interaction goes with memory
>   hot plugging
> 
> I'm not sure if I'm following the idea of the alternative SRAT construction.
> So are you saying that srat.c would somehow group pxm's with EPC to
> specific node numbers?

Basically, go look at the "SRAT:" messages in boot.  Are there SRAT
entries that cover all the EPC?  For instance, take this SRAT:

[    0.000000] ACPI: SRAT: Node 1 PXM 2 [mem 0x00000000-0xcfffffff]
[    0.000000] ACPI: SRAT: Node 1 PXM 2 [mem 0x100000000-0x82fffffff]
[    0.000000] ACPI: SRAT: Node 0 PXM 1 [mem 0x830000000-0xe2fffffff]

If EPC were at 0x100000000, we would be in good shape.  It is covered by
an SRAT entry that Linux parses as RAM.  But, if it were at 0xd0000000,
it would be in an SRAT "hole", uncovered by an SRAT entry.  In this
case, since 'Node 1" spans that hole the "Node 1" pgdat would span this
hole.  But, if some memory was removed from the system, "Node 1" might
no longer span that hole and EPC in this hole would not be assignable to
Node 1.

Please just make sure that there *ARE* SRAT entries that cover EPC
memory ranges.
Jarkko Sakkinen Jan. 15, 2021, 10:19 a.m. UTC | #4
On Thu, Jan 14, 2021 at 10:35:03AM -0800, Dave Hansen wrote:
> On 1/14/21 9:54 AM, Jarkko Sakkinen wrote:
> > On Tue, Jan 12, 2021 at 04:24:01PM -0800, Dave Hansen wrote:
> >> We need a bit more information here as well.  What's the relationship
> >> between NUMA nodes and sections?  How does the BIOS tell us which NUMA
> >> nodes a section is in?  Is it the same or different from normal RAM and
> >> PMEM?
> > 
> > How does it go with pmem?
> 
> I just wanted to point out PMEM as being referred to by the SRAT, but as
> something which is *not* "System RAM".  There might be some overlap in
> NUMA for PMEM and NUMA for SGX memory since neither is enumerated as
> "System RAM".

Right.

> ...
> >> I'm not positive this works.  I *thought* these ->node_start_pfn and
> >> ->node_spanned_pages are really only guaranteed to cover memory which is
> >> managed by the kernel and has 'struct page' for it.
> >>
> >> EPC doesn't have a 'struct page', so won't necessarily be covered by the
> >> pgdat-> and zone-> ranges.  I *think* you may have to go all the way
> >> back to the ACPI SRAT for this.
> >>
> >> It would also be *possible* to have an SRAT constructed like this:
> >>
> >> 0->1GB System RAM - Node 0
> >> 1->2GB Reserved   - Node 1
> >> 2->3GB System RAM - Node 0
> >>
> >> Where the 1->2GB is EPC.  The Node 0 pg_data_t would be:
> >>
> >> 	pgdat->node_start_pfn = 0
> >> 	pgdat->node_spanned_pages = 3GB
> > 
> > If I've understood the current Linux memory architecture correctly.
> > 
> > - Memory is made available through mm/memory_hotplug.c, which is populated
> >   by drivers/acpi/acpi_memhotplug.c.
> > - drivers/acpi/numa/srat.c provides the conversion API from proximity node to
> >   logical node but I'm not *yet* sure how the interaction goes with memory
> >   hot plugging
> > 
> > I'm not sure if I'm following the idea of the alternative SRAT construction.
> > So are you saying that srat.c would somehow group pxm's with EPC to
> > specific node numbers?
> 
> Basically, go look at the "SRAT:" messages in boot.  Are there SRAT
> entries that cover all the EPC?  For instance, take this SRAT:
> 
> [    0.000000] ACPI: SRAT: Node 1 PXM 2 [mem 0x00000000-0xcfffffff]
> [    0.000000] ACPI: SRAT: Node 1 PXM 2 [mem 0x100000000-0x82fffffff]
> [    0.000000] ACPI: SRAT: Node 0 PXM 1 [mem 0x830000000-0xe2fffffff]

Right!

> If EPC were at 0x100000000, we would be in good shape.  It is covered by
> an SRAT entry that Linux parses as RAM.  But, if it were at 0xd0000000,
> it would be in an SRAT "hole", uncovered by an SRAT entry.  In this
> case, since 'Node 1" spans that hole the "Node 1" pgdat would span this
> hole.  But, if some memory was removed from the system, "Node 1" might
> no longer span that hole and EPC in this hole would not be assignable to
> Node 1.
> 
> Please just make sure that there *ARE* SRAT entries that cover EPC
> memory ranges.

OK, I'm on the same page now, thanks.

/Jarkko
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index c519fc5f6948..0da510763c47 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -13,6 +13,13 @@ 
 #include "encl.h"
 #include "encls.h"
 
+struct sgx_numa_node {
+	struct sgx_epc_section *sections[SGX_MAX_EPC_SECTIONS];
+	int nr_sections;
+};
+
+static struct sgx_numa_node sgx_numa_nodes[MAX_NUMNODES];
+static int sgx_nr_numa_nodes;
 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
 static int sgx_nr_epc_sections;
 static struct task_struct *ksgxd_tsk;
@@ -473,6 +480,25 @@  static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_sec
 	return page;
 }
 
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
+{
+	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+	struct sgx_epc_section *section;
+	struct sgx_epc_page *page;
+	int i;
+
+	for (i = 0; i < node->nr_sections; i++) {
+		section = node->sections[i];
+
+		page = __sgx_alloc_epc_page_from_section(section);
+		if (page)
+			return page;
+	}
+
+	return NULL;
+}
+
+
 /**
  * __sgx_alloc_epc_page() - Allocate an EPC page
  *
@@ -485,14 +511,19 @@  static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_sec
  */
 struct sgx_epc_page *__sgx_alloc_epc_page(void)
 {
-	struct sgx_epc_section *section;
 	struct sgx_epc_page *page;
+	int nid = numa_node_id();
 	int i;
 
-	for (i = 0; i < sgx_nr_epc_sections; i++) {
-		section = &sgx_epc_sections[i];
+	page = __sgx_alloc_epc_page_from_node(nid);
+	if (page)
+		return page;
 
-		page = __sgx_alloc_epc_page_from_section(section);
+	for (i = 0; i < sgx_nr_numa_nodes; i++) {
+		if (i == nid)
+			continue;
+
+		page = __sgx_alloc_epc_page_from_node(i);
 		if (page)
 			return page;
 	}
@@ -661,11 +692,28 @@  static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
 	       ((high & GENMASK_ULL(19, 0)) << 32);
 }
 
+static int __init sgx_pfn_to_nid(unsigned long pfn)
+{
+	pg_data_t *pgdat;
+	int nid;
+
+	for (nid = 0; nid < nr_node_ids; nid++) {
+		pgdat = NODE_DATA(nid);
+
+		if (pfn >= pgdat->node_start_pfn &&
+		    pfn < (pgdat->node_start_pfn + pgdat->node_spanned_pages))
+			return nid;
+	}
+
+	return 0;
+}
+
 static bool __init sgx_page_cache_init(void)
 {
 	u32 eax, ebx, ecx, edx, type;
+	struct sgx_numa_node *node;
 	u64 pa, size;
-	int i;
+	int i, nid;
 
 	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
 		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
@@ -690,6 +738,14 @@  static bool __init sgx_page_cache_init(void)
 		}
 
 		sgx_nr_epc_sections++;
+
+		nid = sgx_pfn_to_nid(PFN_DOWN(pa));
+		node = &sgx_numa_nodes[nid];
+
+		node->sections[node->nr_sections] = &sgx_epc_sections[i];
+		node->nr_sections++;
+
+		sgx_nr_numa_nodes = max(sgx_nr_numa_nodes, nid + 1);
 	}
 
 	if (!sgx_nr_epc_sections) {