
[v3,5/5] x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()

Message ID 20210303150323.433207-6-jarkko@kernel.org (mailing list archive)
State New, archived
Series x86/sgx: NUMA

Commit Message

Jarkko Sakkinen March 3, 2021, 3:03 p.m. UTC
Background
==========

Each EPC section is covered by one or more SRAT entries that are associated
with one and only one PXM (NUMA node). The motivation behind this patch is to
provide the basic elements for building an allocation scheme based on this
premise.

Solution
========

Use phys_to_target_node() to associate each NUMA node with the EPC
sections contained within its range. In sgx_alloc_epc_page(), first try
to allocate from the NUMA node where the CPU is executing. If that
fails, fall back to the legacy allocation.
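
Condensed, the resulting __sgx_alloc_epc_page() hot path looks like the
sketch below (pieced together from the hunks further down; the final
unlock and return come from the previous patch in the series):

	struct sgx_epc_page *page;

	/* Try to allocate EPC from the current node, first: */
	page = __sgx_alloc_epc_page_from_node(numa_node_id());
	if (page)
		return page;

	/* Fall back to the global free list introduced in 4/5: */
	spin_lock(&sgx_free_page_list_lock);
	if (list_empty(&sgx_free_page_list)) {
		spin_unlock(&sgx_free_page_list_lock);
		return NULL;
	}
	page = list_first_entry(&sgx_free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->numa_list);
	list_del_init(&page->list);
	sgx_nr_free_pages--;
	spin_unlock(&sgx_free_page_list_lock);

	return page;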

Link: https://lore.kernel.org/lkml/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com/
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 arch/x86/Kconfig               |  1 +
 arch/x86/kernel/cpu/sgx/main.c | 84 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/sgx.h  |  9 ++++
 3 files changed, 94 insertions(+)

Comments

Dave Hansen March 4, 2021, 12:20 a.m. UTC | #1
What changed from the last patch?

On 3/3/21 7:03 AM, Jarkko Sakkinen wrote:
> Background
> ==========
> 
> Each EPC section is covered by one or more SRAT entries that are associated
> with one and only one PXM (NUMA node). The motivation behind this patch is to
> provide the basic elements for building an allocation scheme based on this
> premise.

Just like normal RAM, enclave memory (EPC) should be covered by entries
in the ACPI SRAT table.  These entries allow each EPC section to be
associated with a NUMA node.

Use this information to implement a simple NUMA-aware allocator for
enclave memory.

> Use phys_to_target_node() to associate each NUMA node with the EPC
> sections contained within its range. In sgx_alloc_epc_page(), first try
> to allocate from the NUMA node where the CPU is executing. If that
> fails, fall back to the legacy allocation.

By "legacy", you mean the one from the last patch? :)

> Link: https://lore.kernel.org/lkml/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com/
> Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> ---
>  arch/x86/Kconfig               |  1 +
>  arch/x86/kernel/cpu/sgx/main.c | 84 ++++++++++++++++++++++++++++++++++
>  arch/x86/kernel/cpu/sgx/sgx.h  |  9 ++++
>  3 files changed, 94 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index a5f6a3013138..7eb1e96cfe8a 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1940,6 +1940,7 @@ config X86_SGX
>  	depends on CRYPTO_SHA256=y
>  	select SRCU
>  	select MMU_NOTIFIER
> +	select NUMA_KEEP_MEMINFO if NUMA

This dependency is worth mentioning somewhere.  Why do we suddenly need
NUMA_KEEP_MEMINFO?

> +/* Nodes with one or more EPC sections. */
> +static nodemask_t sgx_numa_mask;
> +
> +/*
> + * Array with one list_head for each possible NUMA node.  Each
> + * list contains all the sgx_epc_section's which are on that

					   ^ no "'", please

> + * node.
> + */
> +static struct sgx_numa_node *sgx_numa_nodes;
> +
> +/*
> + * sgx_free_epc_page() uses this to find out the correct struct sgx_numa_node,
> + * to put the page in.
> + */
> +static int sgx_section_to_numa_node_id[SGX_MAX_EPC_SECTIONS];

If this is per-section, why not put it in struct sgx_epc_section?

>  /*
> @@ -434,6 +451,36 @@ static bool __init sgx_page_reclaimer_init(struct list_head *laundry)
>  	return true;
>  }
>  
> +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
> +{
> +	struct sgx_epc_page *page = NULL;
> +	struct sgx_numa_node *sgx_node;
> +
> +	if (WARN_ON_ONCE(nid < 0 || nid >= num_possible_nodes()))
> +		return NULL;

This has exactly one call-site which plumbs numa_node_id() in here
pretty directly.  Is this check worthwhile?

> +	if (!node_isset(nid, sgx_numa_mask))
> +		return NULL;
> +
> +	sgx_node = &sgx_numa_nodes[nid];
> +
> +	spin_lock(&sgx_free_page_list_lock);

The global lock protecting a per-node structure is a bit unsightly.

> +	if (list_empty(&sgx_node->free_page_list)) {
> +		spin_unlock(&sgx_free_page_list_lock);
> +		return NULL;
> +	}
> +
> +	page = list_first_entry(&sgx_node->free_page_list, struct sgx_epc_page, numa_list);
> +	list_del_init(&page->numa_list);
> +	list_del_init(&page->list);
> +	sgx_nr_free_pages--;
> +
> +	spin_unlock(&sgx_free_page_list_lock);
> +
> +	return page;
> +}
> +
>  /**
>   * __sgx_alloc_epc_page() - Allocate an EPC page
>   *
> @@ -446,8 +493,14 @@ static bool __init sgx_page_reclaimer_init(struct list_head *laundry)
>   */
>  struct sgx_epc_page *__sgx_alloc_epc_page(void)
>  {
> +	int current_nid = numa_node_id();
>  	struct sgx_epc_page *page;
>  
> +	/* Try to allocate EPC from the current node, first: */
> +	page = __sgx_alloc_epc_page_from_node(current_nid);
> +	if (page)
> +		return page;
> +
>  	spin_lock(&sgx_free_page_list_lock);
>  
>  	if (list_empty(&sgx_free_page_list)) {
> @@ -456,6 +509,7 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
>  	}
>  
>  	page = list_first_entry(&sgx_free_page_list, struct sgx_epc_page, list);
> +	list_del_init(&page->numa_list);
>  	list_del_init(&page->list);
>  	sgx_nr_free_pages--;

I would much rather prefer that this does what the real page allocator
does: keep the page on a single list.  That list is maintained
per-NUMA-node.  Allocations try local NUMA node structures, then fall
back to other structures (hopefully in a locality-aware fashion).

I wrote you the loop that I want to see this implement in an earlier
review.  This, basically:

	page = NULL;
	nid = numa_node_id();
	while (true) {
		page = __sgx_alloc_epc_page_from_node(nid);	
		if (page)
			break;

		nid = // ... some search here, next_node_in()...
		// check if we wrapped around:
		if (nid == numa_node_id())
			break;
	}

There's no global list.  You just walk around nodes trying to find one
with space.  If you wrap around, you stop.
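
Spelled out with next_node_in() -- an untested sketch of the shape only,
with the "CPU sits on a node without EPC" case handled by picking the next
EPC-bearing node as the starting point:

	struct sgx_epc_page *page;
	int nid_start, nid;

	if (node_isset(numa_node_id(), sgx_numa_mask))
		nid_start = numa_node_id();
	else
		nid_start = next_node_in(numa_node_id(), sgx_numa_mask);

	nid = nid_start;
	do {
		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;

		/* next_node_in() wraps around sgx_numa_mask: */
		nid = next_node_in(nid, sgx_numa_mask);
	} while (nid != nid_start);

	return NULL;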

Please implement this.  If you think it's a bad idea, or can't, let's
talk about it in advance.  Right now, it appears that my review comments
aren't being incorporated into newer versions.

>  void sgx_free_epc_page(struct sgx_epc_page *page)
>  {
> +	int nid = sgx_section_to_numa_node_id[page->section];
> +	struct sgx_numa_node *sgx_node = &sgx_numa_nodes[nid];
>  	int ret;
>  
>  	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
> @@ -575,7 +631,15 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
>  		return;
>  
>  	spin_lock(&sgx_free_page_list_lock);
> +
> +	/*   Enable NUMA local allocation in sgx_alloc_epc_page(). */
> +	if (!node_isset(nid, sgx_numa_mask)) {
> +		INIT_LIST_HEAD(&sgx_node->free_page_list);
> +		node_set(nid, sgx_numa_mask);
> +	}
> +
>  	list_add_tail(&page->list, &sgx_free_page_list);
> +	list_add_tail(&page->numa_list, &sgx_node->free_page_list);
>  	sgx_nr_free_pages++;
>  	spin_unlock(&sgx_free_page_list_lock);
>  }
> @@ -626,8 +690,28 @@ static bool __init sgx_page_cache_init(struct list_head *laundry)
>  {
>  	u32 eax, ebx, ecx, edx, type;
>  	u64 pa, size;
> +	int nid;
>  	int i;
>  
> +	nodes_clear(sgx_numa_mask);

Is this really required for a variable allocated in .bss?

> +	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);

This is what I was looking for here, thanks!

> +	/*
> +	 * Create NUMA node lookup table for sgx_free_epc_page() as the very
> +	 * first step, as it is used to populate the free list's during the
> +	 * initialization.
> +	 */
> +	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
> +		nid = numa_map_to_online_node(phys_to_target_node(pa));
> +		if (nid == NUMA_NO_NODE) {
> +			/* The physical address is already printed above. */
> +			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
> +			nid = 0;
> +		}
> +
> +		sgx_section_to_numa_node_id[i] = nid;
> +	}
> +
>  	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
>  		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
>  
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index 41ca045a574a..3a3c07fc0c8e 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -27,6 +27,7 @@ struct sgx_epc_page {
>  	unsigned int flags;
>  	struct sgx_encl_page *owner;
>  	struct list_head list;
> +	struct list_head numa_list;
>  };

I'll say it again, explicitly: Each sgx_epc_page should be on one and
only one free list: a per-NUMA-node list.

>  /*
> @@ -43,6 +44,14 @@ struct sgx_epc_section {
>  
>  extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
>  
> +/*
> + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
> + * the free page list local to the node is stored here.
> + */
> +struct sgx_numa_node {
> +	struct list_head free_page_list;
> +};

I think it's unconscionable to leave this protected by a global lock.
Please at least give us a per-node spinlock protecting this list.
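
Something along these lines, say (only a sketch -- the lock name is made
up, each page sits on a single per-node list, and the global
sgx_nr_free_pages counter would need its own treatment, e.g. becoming
atomic):

struct sgx_numa_node {
	struct list_head free_page_list;
	spinlock_t lock;
};

static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);
	if (!list_empty(&node->free_page_list)) {
		page = list_first_entry(&node->free_page_list,
					struct sgx_epc_page, list);
		list_del_init(&page->list);
	}
	spin_unlock(&node->lock);

	return page;
}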
Jarkko Sakkinen March 10, 2021, 11:30 a.m. UTC | #2
Weird. I did check my kernel.org mail last Thursday night but did not
get this. I was actually wondering about the lack of feedback.

Then on Monday I suddenly had a huge pile of email waiting for me, with a
bunch of emails from around the time you sent this one.

On Wed, Mar 03, 2021 at 04:20:03PM -0800, Dave Hansen wrote:
> What changed from the last patch?
> 
> On 3/3/21 7:03 AM, Jarkko Sakkinen wrote:
> > Background
> > ==========
> > 
> > Each EPC section is covered by one or more SRAT entries that are associated
> > with one and only one PXM (NUMA node). The motivation behind this patch is to
> > provide the basic elements for building an allocation scheme based on this
> > premise.
> 
> Just like normal RAM, enclave memory (EPC) should be covered by entries
> in the ACPI SRAT table.  These entries allow each EPC section to be
> associated with a NUMA node.
> 
> Use this information to implement a simple NUMA-aware allocator for
> enclave memory.
> 
> > Use phys_to_target_node() to associate each NUMA node with the EPC
> > sections contained within its range. In sgx_alloc_epc_page(), first try
> > to allocate from the NUMA node where the CPU is executing. If that
> > fails, fall back to the legacy allocation.
> 
> By "legacy", you mean the one from the last patch? :)
> 
> > Link: https://lore.kernel.org/lkml/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com/
> > Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> > ---
> >  arch/x86/Kconfig               |  1 +
> >  arch/x86/kernel/cpu/sgx/main.c | 84 ++++++++++++++++++++++++++++++++++
> >  arch/x86/kernel/cpu/sgx/sgx.h  |  9 ++++
> >  3 files changed, 94 insertions(+)
> > 
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index a5f6a3013138..7eb1e96cfe8a 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1940,6 +1940,7 @@ config X86_SGX
> >  	depends on CRYPTO_SHA256=y
> >  	select SRCU
> >  	select MMU_NOTIFIER
> > +	select NUMA_KEEP_MEMINFO if NUMA
> 
> This dependency is worth mentioning somewhere.  Why do we suddenly need
> NUMA_KEEP_MEMINFO?
> 
> > +/* Nodes with one or more EPC sections. */
> > +static nodemask_t sgx_numa_mask;
> > +
> > +/*
> > + * Array with one list_head for each possible NUMA node.  Each
> > + * list contains all the sgx_epc_section's which are on that
> 
> 					   ^ no "'", please
> 
> > + * node.
> > + */
> > +static struct sgx_numa_node *sgx_numa_nodes;
> > +
> > +/*
> > + * sgx_free_epc_page() uses this to find out the correct struct sgx_numa_node,
> > + * to put the page in.
> > + */
> > +static int sgx_section_to_numa_node_id[SGX_MAX_EPC_SECTIONS];
> 
> If this is per-section, why not put it in struct sgx_epc_section?

Because struct sgx_epc_page does not contain a pointer to
struct sgx_epc_section.

> 
> >  /*
> > @@ -434,6 +451,36 @@ static bool __init sgx_page_reclaimer_init(struct list_head *laundry)
> >  	return true;
> >  }
> >  
> > +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
> > +{
> > +	struct sgx_epc_page *page = NULL;
> > +	struct sgx_numa_node *sgx_node;
> > +
> > +	if (WARN_ON_ONCE(nid < 0 || nid >= num_possible_nodes()))
> > +		return NULL;
> 
> This has exactly one call-site which plumbs numa_node_id() in here
> pretty directly.  Is this check worthwhile?

Probably not.


> > +	if (!node_isset(nid, sgx_numa_mask))
> > +		return NULL;
> > +
> > +	sgx_node = &sgx_numa_nodes[nid];
> > +
> > +	spin_lock(&sgx_free_page_list_lock);
> 
> The global lock protecting a per-node structure is a bit unsightly.

The patch set could introduce an additional patch for changing the
locking scheme. It's logically a separate change.

> > +	if (list_empty(&sgx_node->free_page_list)) {
> > +		spin_unlock(&sgx_free_page_list_lock);
> > +		return NULL;
> > +	}
> > +
> > +	page = list_first_entry(&sgx_node->free_page_list, struct sgx_epc_page, numa_list);
> > +	list_del_init(&page->numa_list);
> > +	list_del_init(&page->list);
> > +	sgx_nr_free_pages--;
> > +
> > +	spin_unlock(&sgx_free_page_list_lock);
> > +
> > +	return page;
> > +}
> > +
> >  /**
> >   * __sgx_alloc_epc_page() - Allocate an EPC page
> >   *
> > @@ -446,8 +493,14 @@ static bool __init sgx_page_reclaimer_init(struct list_head *laundry)
> >   */
> >  struct sgx_epc_page *__sgx_alloc_epc_page(void)
> >  {
> > +	int current_nid = numa_node_id();
> >  	struct sgx_epc_page *page;
> >  
> > +	/* Try to allocate EPC from the current node, first: */
> > +	page = __sgx_alloc_epc_page_from_node(current_nid);
> > +	if (page)
> > +		return page;
> > +
> >  	spin_lock(&sgx_free_page_list_lock);
> >  
> >  	if (list_empty(&sgx_free_page_list)) {
> > @@ -456,6 +509,7 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
> >  	}
> >  
> >  	page = list_first_entry(&sgx_free_page_list, struct sgx_epc_page, list);
> > +	list_del_init(&page->numa_list);
> >  	list_del_init(&page->list);
> >  	sgx_nr_free_pages--;
> 
> I would much rather prefer that this does what the real page allocator
> does: keep the page on a single list.  That list is maintained
> per-NUMA-node.  Allocations try local NUMA node structures, then fall
> back to other structures (hopefully in a locality-aware fashion).
> 
> I wrote you the loop that I want to see this implement in an earlier
> review.  This, basically:
> 
> 	page = NULL;
> 	nid = numa_node_id();
> 	while (true) {
> 		page = __sgx_alloc_epc_page_from_node(nid);	
> 		if (page)
> 			break;
> 
> 		nid = // ... some search here, next_node_in()...
> 		// check if we wrapped around:
> 		if (nid == numa_node_id())
> 			break;
> 	}
> 
> There's no global list.  You just walk around nodes trying to find one
> with space.  If you wrap around, you stop.
> 
> Please implement this.  If you think it's a bad idea, or can't, let's
> talk about it in advance.  Right now, it appears that my review comments
> aren't being incorporated into newer versions.

How I interpreted your earlier comments is that the fallback was unfair, and
this version of the patch set does fix that.

I can buy the above allocation scheme, but I don't think this patch set
version is a step backwards. The things done to struct sgx_epc_section
are exactly what should be done to it.

Implementation-wise you are asking me to squash 4/5 and 5/5 into a single
patch, and remove the global list. It's a tiny iteration from this patch
version and I can do it.

> >  void sgx_free_epc_page(struct sgx_epc_page *page)
> >  {
> > +	int nid = sgx_section_to_numa_node_id[page->section];
> > +	struct sgx_numa_node *sgx_node = &sgx_numa_nodes[nid];
> >  	int ret;
> >  
> >  	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
> > @@ -575,7 +631,15 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
> >  		return;
> >  
> >  	spin_lock(&sgx_free_page_list_lock);
> > +
> > +	/*   Enable NUMA local allocation in sgx_alloc_epc_page(). */
> > +	if (!node_isset(nid, sgx_numa_mask)) {
> > +		INIT_LIST_HEAD(&sgx_node->free_page_list);
> > +		node_set(nid, sgx_numa_mask);
> > +	}
> > +
> >  	list_add_tail(&page->list, &sgx_free_page_list);
> > +	list_add_tail(&page->numa_list, &sgx_node->free_page_list);
> >  	sgx_nr_free_pages++;
> >  	spin_unlock(&sgx_free_page_list_lock);
> >  }
> > @@ -626,8 +690,28 @@ static bool __init sgx_page_cache_init(struct list_head *laundry)
> >  {
> >  	u32 eax, ebx, ecx, edx, type;
> >  	u64 pa, size;
> > +	int nid;
> >  	int i;
> >  
> > +	nodes_clear(sgx_numa_mask);
> 
> Is this really required for a variable allocated in .bss?

Probably not, I'll check what nodes_clear() does.

> > +	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
> 
> This is what I was looking for here, thanks!
> 
> > +	/*
> > +	 * Create NUMA node lookup table for sgx_free_epc_page() as the very
> > +	 * first step, as it is used to populate the free list's during the
> > +	 * initialization.
> > +	 */
> > +	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
> > +		nid = numa_map_to_online_node(phys_to_target_node(pa));
> > +		if (nid == NUMA_NO_NODE) {
> > +			/* The physical address is already printed above. */
> > +			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
> > +			nid = 0;
> > +		}
> > +
> > +		sgx_section_to_numa_node_id[i] = nid;
> > +	}
> > +
> >  	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
> >  		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
> >  
> > diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> > index 41ca045a574a..3a3c07fc0c8e 100644
> > --- a/arch/x86/kernel/cpu/sgx/sgx.h
> > +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> > @@ -27,6 +27,7 @@ struct sgx_epc_page {
> >  	unsigned int flags;
> >  	struct sgx_encl_page *owner;
> >  	struct list_head list;
> > +	struct list_head numa_list;
> >  };
> 
> I'll say it again, explicitly: Each sgx_epc_page should be on one and
> only one free list: a per-NUMA-node list.
> 
> >  /*
> > @@ -43,6 +44,14 @@ struct sgx_epc_section {
> >  
> >  extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> >  
> > +/*
> > + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
> > + * the free page list local to the node is stored here.
> > + */
> > +struct sgx_numa_node {
> > +	struct list_head free_page_list;
> > +};
> 
> I think it's unconscionable to leave this protected by a global lock.
> Please at least give us a per-node spinlock protecting this list.

I can do it but I'll add a separate commit for it. It's better to make
locking scheme changes that way (IMHO). Helps with bisection later on...

/Jarkko
Dave Hansen March 10, 2021, 3:44 p.m. UTC | #3
>>> + * node.
>>> + */
>>> +static struct sgx_numa_node *sgx_numa_nodes;
>>> +
>>> +/*
>>> + * sgx_free_epc_page() uses this to find out the correct struct sgx_numa_node,
>>> + * to put the page in.
>>> + */
>>> +static int sgx_section_to_numa_node_id[SGX_MAX_EPC_SECTIONS];
>>
>> If this is per-section, why not put it in struct sgx_epc_section?
> 
> Because struct sgx_epc_page does not contain a pointer to
> struct sgx_epc_section.

Currently, we have epc_page->section.  That's not changing.  But, with
the free list moving from sgx_epc_section to sgx_numa_node, we need a
way to get from page->node, not just page->section.

We can either add that to:

	struct sgx_epc_section {
		...
+		struct sgx_numa_node *node;
	}

so we can do epc_page->section->node to find the epc_page's free list,
or we could just do:

 struct sgx_epc_page {
- 	unsigned int section;
+ 	unsigned int node;
 	unsigned int flags;
 	struct sgx_encl_page *owner;
 	struct list_head list;
	struct list_head numa_list;
 };

and go from page->node directly.
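
Either way, the lookup in sgx_free_epc_page() then becomes direct --
a sketch, using the second option and the per-node lock suggested earlier:

	struct sgx_numa_node *node = &sgx_numa_nodes[page->node];

	spin_lock(&node->lock);
	list_add_tail(&page->list, &node->free_page_list);
	spin_unlock(&node->lock);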

>>>  	page = list_first_entry(&sgx_free_page_list, struct sgx_epc_page, list);
>>> +	list_del_init(&page->numa_list);
>>>  	list_del_init(&page->list);
>>>  	sgx_nr_free_pages--;
>>
>> I would much rather prefer that this does what the real page allocator
>> does: keep the page on a single list.  That list is maintained
>> per-NUMA-node.  Allocations try local NUMA node structures, then fall
>> back to other structures (hopefully in a locality-aware fashion).
>>
>> I wrote you the loop that I want to see this implement in an earlier
>> review.  This, basically:
>>
>> 	page = NULL;
>> 	nid = numa_node_id();
>> 	while (true) {
>> 		page = __sgx_alloc_epc_page_from_node(nid);	
>> 		if (page)
>> 			break;
>>
>> 		nid = // ... some search here, next_node_in()...
>> 		// check if we wrapped around:
>> 		if (nid == numa_node_id())
>> 			break;
>> 	}
>>
>> There's no global list.  You just walk around nodes trying to find one
>> with space.  If you wrap around, you stop.
>>
>> Please implement this.  If you think it's a bad idea, or can't, let's
>> talk about it in advance.  Right now, it appears that my review comments
>> aren't being incorporated into newer versions.
> 
> How I interpreted your earlier comments is that the fallback was unfair, and
> this version of the patch set does fix that.
> 
> I can buy the above allocation scheme, but I don't think this patch set
> version is a step backwards. The things done to struct sgx_epc_section
> are exactly what should be done to it.

To me, it's a step backwards.  It regresses in that it falls back to an
entirely non-NUMA aware allocation mechanism.  The global list is
actually likely to be even worse than the per-section searches because
it has a global lock as opposed to the per-section locks.  It also
has the overhead of managing two lists instead of one.

So, yes, it is *fair* in terms of NUMA node pressure.  But being fair in
a NUMA-aware allocator by simply not doing NUMA at all is a regression.

> Implementation-wise you are asking me to squash 4/5 and 5/5 into a single
> patch, and remove the global list. It's a tiny iteration from this patch
> version and I can do it.

Sounds good.
Jarkko Sakkinen March 10, 2021, 9:48 p.m. UTC | #4
On Wed, Mar 10, 2021 at 07:44:39AM -0800, Dave Hansen wrote:
> >>> + * node.
> >>> + */
> >>> +static struct sgx_numa_node *sgx_numa_nodes;
> >>> +
> >>> +/*
> >>> + * sgx_free_epc_page() uses this to find out the correct struct sgx_numa_node,
> >>> + * to put the page in.
> >>> + */
> >>> +static int sgx_section_to_numa_node_id[SGX_MAX_EPC_SECTIONS];
> >>
> >> If this is per-section, why not put it in struct sgx_epc_section?
> > 
> > Because struct sgx_epc_page does not contain a pointer to
> > struct sgx_epc_section.
> 
> Currently, we have epc_page->section.  That's not changing.  But, with
> the free list moving from sgx_epc_section to sgx_numa_node, we need a
> way to get from page->node, not just page->section.
> 
> We can either add that to:
> 
> 	struct sgx_epc_section {
> 		...
> +		struct sgx_numa_node *node;
> 	}
> 
> so we can do epc_page->section->node to find the epc_page's free list,
> or we could just do:
> 
>  struct sgx_epc_page {
> - 	unsigned int section;
> + 	unsigned int node;
>  	unsigned int flags;
>  	struct sgx_encl_page *owner;
>  	struct list_head list;
> 	struct list_head numa_list;
>  };
> 
> and go from page->node directly.

OK, I buy this, thanks.

> >>>  	page = list_first_entry(&sgx_free_page_list, struct sgx_epc_page, list);
> >>> +	list_del_init(&page->numa_list);
> >>>  	list_del_init(&page->list);
> >>>  	sgx_nr_free_pages--;
> >>
> >> I would much rather prefer that this does what the real page allocator
> >> does: keep the page on a single list.  That list is maintained
> >> per-NUMA-node.  Allocations try local NUMA node structures, then fall
> >> back to other structures (hopefully in a locality-aware fashion).
> >>
> >> I wrote you the loop that I want to see this implement in an earlier
> >> review.  This, basically:
> >>
> >> 	page = NULL;
> >> 	nid = numa_node_id();
> >> 	while (true) {
> >> 		page = __sgx_alloc_epc_page_from_node(nid);	
> >> 		if (page)
> >> 			break;
> >>
> >> 		nid = // ... some search here, next_node_in()...
> >> 		// check if we wrapped around:
> >> 		if (nid == numa_node_id())
> >> 			break;
> >> 	}
> >>
> >> There's no global list.  You just walk around nodes trying to find one
> >> with space.  If you wrap around, you stop.
> >>
> >> Please implement this.  If you think it's a bad idea, or can't, let's
> >> talk about it in advance.  Right now, it appears that my review comments
> >> aren't being incorporated into newer versions.
> > 
> > How I interpreted your earlier comments is that the fallback was unfair, and
> > this version of the patch set does fix that.
> > 
> > I can buy the above allocation scheme, but I don't think this patch set
> > version is a step backwards. The things done to struct sgx_epc_section
> > are exactly what should be done to it.
> 
> To me, it's a step backwards.  It regresses in that it falls back to an
> entirely non-NUMA aware allocation mechanism.  The global list is
> actually likely to be even worse than the per-section searches because
> it has a global lock as opposed to the per-section locks.  It also
> has the overhead of managing two lists instead of one.
> 
> So, yes, it is *fair* in terms of NUMA node pressure.  But being fair in
> a NUMA-aware allocator by simply not doing NUMA at all is a regression.

The code is now structured in a way that makes it trivial to remove the
global list and move to per-node lists only, i.e. the nasty section lists
have been wiped away. Refactoring the global list out is a trivial step.

Seen that way, this is a step forwards, even if having a global list would
be a step backwards :-)

> > Implementation-wise you are asking me to squash 4/5 and 5/5 into a single
> > patch, and remove the global list. It's a tiny iteration from this patch
> > version and I can do it.
> 
> Sounds good.

I'll digest your feedback and come up with a new version, which I'll
put out tomorrow.

PS. If you don't hear from me after you have given feedback on the next
version, please ping me privately. It looks like things are getting through
quickly again, but better safe than sorry...

/Jarkko

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a5f6a3013138..7eb1e96cfe8a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1940,6 +1940,7 @@  config X86_SGX
 	depends on CRYPTO_SHA256=y
 	select SRCU
 	select MMU_NOTIFIER
+	select NUMA_KEEP_MEMINFO if NUMA
 	help
 	  Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
 	  that can be used by applications to set aside private regions of code
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 58474480f5be..62cc0e1f0728 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -25,6 +25,23 @@  static DEFINE_SPINLOCK(sgx_reclaimer_lock);
 /* The free page list lock protected variables prepend the lock. */
 static unsigned long sgx_nr_free_pages;
 static LIST_HEAD(sgx_free_page_list);
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node.  Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+/*
+ * sgx_free_epc_page() uses this to find out the correct struct sgx_numa_node,
+ * to put the page in.
+ */
+static int sgx_section_to_numa_node_id[SGX_MAX_EPC_SECTIONS];
+
 static DEFINE_SPINLOCK(sgx_free_page_list_lock);
 
 /*
@@ -434,6 +451,36 @@  static bool __init sgx_page_reclaimer_init(struct list_head *laundry)
 	return true;
 }
 
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
+{
+	struct sgx_epc_page *page = NULL;
+	struct sgx_numa_node *sgx_node;
+
+	if (WARN_ON_ONCE(nid < 0 || nid >= num_possible_nodes()))
+		return NULL;
+
+	if (!node_isset(nid, sgx_numa_mask))
+		return NULL;
+
+	sgx_node = &sgx_numa_nodes[nid];
+
+	spin_lock(&sgx_free_page_list_lock);
+
+	if (list_empty(&sgx_node->free_page_list)) {
+		spin_unlock(&sgx_free_page_list_lock);
+		return NULL;
+	}
+
+	page = list_first_entry(&sgx_node->free_page_list, struct sgx_epc_page, numa_list);
+	list_del_init(&page->numa_list);
+	list_del_init(&page->list);
+	sgx_nr_free_pages--;
+
+	spin_unlock(&sgx_free_page_list_lock);
+
+	return page;
+}
+
 /**
  * __sgx_alloc_epc_page() - Allocate an EPC page
  *
@@ -446,8 +493,14 @@  static bool __init sgx_page_reclaimer_init(struct list_head *laundry)
  */
 struct sgx_epc_page *__sgx_alloc_epc_page(void)
 {
+	int current_nid = numa_node_id();
 	struct sgx_epc_page *page;
 
+	/* Try to allocate EPC from the current node, first: */
+	page = __sgx_alloc_epc_page_from_node(current_nid);
+	if (page)
+		return page;
+
 	spin_lock(&sgx_free_page_list_lock);
 
 	if (list_empty(&sgx_free_page_list)) {
@@ -456,6 +509,7 @@  struct sgx_epc_page *__sgx_alloc_epc_page(void)
 	}
 
 	page = list_first_entry(&sgx_free_page_list, struct sgx_epc_page, list);
+	list_del_init(&page->numa_list);
 	list_del_init(&page->list);
 	sgx_nr_free_pages--;
 
@@ -566,6 +620,8 @@  struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
  */
 void sgx_free_epc_page(struct sgx_epc_page *page)
 {
+	int nid = sgx_section_to_numa_node_id[page->section];
+	struct sgx_numa_node *sgx_node = &sgx_numa_nodes[nid];
 	int ret;
 
 	WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
@@ -575,7 +631,15 @@  void sgx_free_epc_page(struct sgx_epc_page *page)
 		return;
 
 	spin_lock(&sgx_free_page_list_lock);
+
+	/*   Enable NUMA local allocation in sgx_alloc_epc_page(). */
+	if (!node_isset(nid, sgx_numa_mask)) {
+		INIT_LIST_HEAD(&sgx_node->free_page_list);
+		node_set(nid, sgx_numa_mask);
+	}
+
 	list_add_tail(&page->list, &sgx_free_page_list);
+	list_add_tail(&page->numa_list, &sgx_node->free_page_list);
 	sgx_nr_free_pages++;
 	spin_unlock(&sgx_free_page_list_lock);
 }
@@ -626,8 +690,28 @@  static bool __init sgx_page_cache_init(struct list_head *laundry)
 {
 	u32 eax, ebx, ecx, edx, type;
 	u64 pa, size;
+	int nid;
 	int i;
 
+	nodes_clear(sgx_numa_mask);
+	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+
+	/*
+	 * Create NUMA node lookup table for sgx_free_epc_page() as the very
+	 * first step, as it is used to populate the free list's during the
+	 * initialization.
+	 */
+	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
+		nid = numa_map_to_online_node(phys_to_target_node(pa));
+		if (nid == NUMA_NO_NODE) {
+			/* The physical address is already printed above. */
+			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+			nid = 0;
+		}
+
+		sgx_section_to_numa_node_id[i] = nid;
+	}
+
 	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
 		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 41ca045a574a..3a3c07fc0c8e 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -27,6 +27,7 @@  struct sgx_epc_page {
 	unsigned int flags;
 	struct sgx_encl_page *owner;
 	struct list_head list;
+	struct list_head numa_list;
 };
 
 /*
@@ -43,6 +44,14 @@  struct sgx_epc_section {
 
 extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
 
+/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+	struct list_head free_page_list;
+};
+
 static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page)
 {
 	struct sgx_epc_section *section = &sgx_epc_sections[page->section];