diff mbox series

[v9,3/7] x86/sgx: Initial poison handling for dirty and free pages

Message ID 20211011185924.374213-4-tony.luck@intel.com (mailing list archive)
State New, archived
Headers show
Series [v9,1/7] x86/sgx: Add new sgx_epc_page flag bit to mark in-use pages | expand

Commit Message

Luck, Tony Oct. 11, 2021, 6:59 p.m. UTC
A memory controller patrol scrubber can report poison in a page
that isn't currently being used.

Add "poison" field in the sgx_epc_page that can be set for an
sgx_epc_page. Check for it:
1) When sanitizing dirty pages
2) When freeing epc pages

Poison is a new field separated from flags to avoid having to make
all updates to flags atomic, or integrate poison state changes into
some other locking scheme to protect flags.

In both cases place the poisoned page on a list of poisoned epc pages
to make sure it will not be reallocated.

Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/sgx/main.c | 14 +++++++++++++-
 arch/x86/kernel/cpu/sgx/sgx.h  |  3 ++-
 2 files changed, 15 insertions(+), 2 deletions(-)

Comments

Sean Christopherson Oct. 15, 2021, 11:07 p.m. UTC | #1
On Mon, Oct 11, 2021, Tony Luck wrote:
> A memory controller patrol scrubber can report poison in a page
> that isn't currently being used.
> 
> Add "poison" field in the sgx_epc_page that can be set for an
> sgx_epc_page. Check for it:
> 1) When sanitizing dirty pages
> 2) When freeing epc pages
> 
> Poison is a new field separated from flags to avoid having to make
> all updates to flags atomic, or integrate poison state changes into
> some other locking scheme to protect flags.

Explain why atomic would be needed.  I lived in this code for a few years and
still had to look at the source to remember that the reclaimer can set flags
without taking node->lock.

> In both cases place the poisoned page on a list of poisoned epc pages
> to make sure it will not be reallocated.
> 
> Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
> Tested-by: Reinette Chatre <reinette.chatre@intel.com>
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
>  arch/x86/kernel/cpu/sgx/main.c | 14 +++++++++++++-
>  arch/x86/kernel/cpu/sgx/sgx.h  |  3 ++-
>  2 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 09fa42690ff2..653bace26100 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -43,6 +43,7 @@ static nodemask_t sgx_numa_mask;
>  static struct sgx_numa_node *sgx_numa_nodes;
>  
>  static LIST_HEAD(sgx_dirty_page_list);
> +static LIST_HEAD(sgx_poison_page_list);
>  
>  /*
>   * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
> @@ -62,6 +63,12 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
>  
>  		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
>  
> +		if (page->poison) {

Does this need READ_ONCE (and WRITE_ONCE in the writer) to prevent reloading
page->poison since the sanitizer doesn't hold node->lock, i.e. page->poison can
be set any time?  Honest question, I'm terrible with memory ordering rules...

> +			list_del(&page->list);
> +			list_add(&page->list, &sgx_poison_page_list);

list_move()

> +			continue;
> +		}
> +
>  		ret = __eremove(sgx_get_epc_virt_addr(page));
>  		if (!ret) {
>  			/*
> @@ -626,7 +633,11 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
>  
>  	spin_lock(&node->lock);
>  
> -	list_add_tail(&page->list, &node->free_page_list);
> +	page->owner = NULL;
> +	if (page->poison)
> +		list_add(&page->list, &sgx_poison_page_list);

sgx_poison_page_list is a global list, whereas node->lock is, well, per node.
On a system with multiple EPCs, this could corrupt sgx_poison_page_list if
multiple poisoned pages from different nodes are freed simultaneously.

> +	else
> +		list_add_tail(&page->list, &node->free_page_list);
>  	sgx_nr_free_pages++;
>  	page->flags = 0;
>  
> @@ -658,6 +669,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
>  		section->pages[i].section = index;
>  		section->pages[i].flags = SGX_EPC_PAGE_IN_USE;
>  		section->pages[i].owner = NULL;
> +		section->pages[i].poison = 0;
>  		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
>  	}
>  
> diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
> index f9202d3d6278..a990a4c9a00f 100644
> --- a/arch/x86/kernel/cpu/sgx/sgx.h
> +++ b/arch/x86/kernel/cpu/sgx/sgx.h
> @@ -31,7 +31,8 @@
>  
>  struct sgx_epc_page {
>  	unsigned int section;
> -	unsigned int flags;
> +	u16 flags;
> +	u16 poison;
>  	struct sgx_encl_page *owner;
>  	struct list_head list;
>  };
> 
> -- 
> 2.31.1
>
Luck, Tony Oct. 15, 2021, 11:32 p.m. UTC | #2
On Fri, Oct 15, 2021 at 11:07:48PM +0000, Sean Christopherson wrote:
> On Mon, Oct 11, 2021, Tony Luck wrote:
> > A memory controller patrol scrubber can report poison in a page
> > that isn't currently being used.
> > 
> > Add "poison" field in the sgx_epc_page that can be set for an
> > sgx_epc_page. Check for it:
> > 1) When sanitizing dirty pages
> > 2) When freeing epc pages
> > 
> > Poison is a new field separated from flags to avoid having to make
> > all updates to flags atomic, or integrate poison state changes into
> > some other locking scheme to protect flags.
> 
> Explain why atomic would be needed.  I lived in this code for a few years and
> still had to look at the source to remember that the reclaimer can set flags
> without taking node->lock.

Will add explanation.

> 
> > In both cases place the poisoned page on a list of poisoned epc pages
> > to make sure it will not be reallocated.
> > 
> > Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
> > Tested-by: Reinette Chatre <reinette.chatre@intel.com>
> > Signed-off-by: Tony Luck <tony.luck@intel.com>
> > ---
> >  arch/x86/kernel/cpu/sgx/main.c | 14 +++++++++++++-
> >  arch/x86/kernel/cpu/sgx/sgx.h  |  3 ++-
> >  2 files changed, 15 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> > index 09fa42690ff2..653bace26100 100644
> > --- a/arch/x86/kernel/cpu/sgx/main.c
> > +++ b/arch/x86/kernel/cpu/sgx/main.c
> > @@ -43,6 +43,7 @@ static nodemask_t sgx_numa_mask;
> >  static struct sgx_numa_node *sgx_numa_nodes;
> >  
> >  static LIST_HEAD(sgx_dirty_page_list);
> > +static LIST_HEAD(sgx_poison_page_list);
> >  
> >  /*
> >   * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
> > @@ -62,6 +63,12 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
> >  
> >  		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
> >  
> > +		if (page->poison) {
> 
> Does this need READ_ONCE (and WRITE_ONCE in the writer) to prevent reloading
> page->poison since the sanitizer doesn't hold node->lock, i.e. page->poison can
> be set any time?  Honest question, I'm terrible with memory ordering rules...
> 

I think it's safe. I set page->poison in arch_memory_failure() while
holding node->lock in kthread context.  So not "at any time".

This particular read is done without holding the lock ... and is thus
racy. But there are a zillion other races early in boot before the EPC
pages get sanitized and moved to the free list. E.g. if an error is
reported before they are added to the sgx_epc_address_space xarray,
then all this code will just ignore the error as "not in Linux
controlled memory".

-Tony
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 09fa42690ff2..653bace26100 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -43,6 +43,7 @@  static nodemask_t sgx_numa_mask;
 static struct sgx_numa_node *sgx_numa_nodes;
 
 static LIST_HEAD(sgx_dirty_page_list);
+static LIST_HEAD(sgx_poison_page_list);
 
 /*
  * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
@@ -62,6 +63,12 @@  static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
 
 		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
 
+		if (page->poison) {
+			list_del(&page->list);
+			list_add(&page->list, &sgx_poison_page_list);
+			continue;
+		}
+
 		ret = __eremove(sgx_get_epc_virt_addr(page));
 		if (!ret) {
 			/*
@@ -626,7 +633,11 @@  void sgx_free_epc_page(struct sgx_epc_page *page)
 
 	spin_lock(&node->lock);
 
-	list_add_tail(&page->list, &node->free_page_list);
+	page->owner = NULL;
+	if (page->poison)
+		list_add(&page->list, &sgx_poison_page_list);
+	else
+		list_add_tail(&page->list, &node->free_page_list);
 	sgx_nr_free_pages++;
 	page->flags = 0;
 
@@ -658,6 +669,7 @@  static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
 		section->pages[i].section = index;
 		section->pages[i].flags = SGX_EPC_PAGE_IN_USE;
 		section->pages[i].owner = NULL;
+		section->pages[i].poison = 0;
 		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
 	}
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index f9202d3d6278..a990a4c9a00f 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -31,7 +31,8 @@ 
 
 struct sgx_epc_page {
 	unsigned int section;
-	unsigned int flags;
+	u16 flags;
+	u16 poison;
 	struct sgx_encl_page *owner;
 	struct list_head list;
 };