diff mbox series

x86/sgx: fix a NULL pointer

Message ID 20230717202938.94989-1-haitao.huang@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series x86/sgx: fix a NULL pointer | expand

Commit Message

Haitao Huang July 17, 2023, 8:29 p.m. UTC
Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
cgroup worker) may reclaim the SECS EPC page for an enclave and set
encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in
the SGX #PF handler without checking for NULL and reloading.

Fix this by checking if SECS is loaded before EAUG and load it if it was
reclaimed.

Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave")
Cc: stable@vger.kernel.org
Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
---
 arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++-----
 arch/x86/kernel/cpu/sgx/main.c |  4 ++++
 2 files changed, 24 insertions(+), 5 deletions(-)

Comments

Huang, Kai July 17, 2023, 10:42 p.m. UTC | #1
On Mon, 2023-07-17 at 13:29 -0700, Haitao Huang wrote:
> Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
> cgroup worker) may reclaim the SECS EPC page for an enclave and set
> encl->secs.epc_page to NULL. 
> 

As a bug fix, I don't think you need to mention "future EPC cgroup worker".

> But the SECS EPC page is used for EAUG in
> the SGX #PF handler without checking for NULL and reloading.
> 
> Fix this by checking if SECS is loaded before EAUG and load it if it was
							 ^
							 loading
> reclaimed.
> 
> Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave")
> Cc: stable@vger.kernel.org
> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> ---
>  arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++-----
>  arch/x86/kernel/cpu/sgx/main.c |  4 ++++
>  2 files changed, 24 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index 2a0e90fe2abc..2ab544da1664 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  	return epc_page;
>  }
>  
> +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
> +{
> +	struct sgx_epc_page *epc_page = encl->secs.epc_page;
> +
> +	if (!epc_page)
> +		epc_page = sgx_encl_eldu(&encl->secs, NULL);
> +
> +	return epc_page;
> +}
> +
>  static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
>  						  struct sgx_encl_page *entry)
>  {
> @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
>  		return entry;
>  	}
>  
> -	if (!(encl->secs.epc_page)) {
> -		epc_page = sgx_encl_eldu(&encl->secs, NULL);
> -		if (IS_ERR(epc_page))
> -			return ERR_CAST(epc_page);
> -	}
> +	epc_page = sgx_encl_load_secs(encl);
> +	if (IS_ERR(epc_page))
> +		return ERR_CAST(epc_page);
>  
>  	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
>  	if (IS_ERR(epc_page))
> @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  
>  	mutex_lock(&encl->lock);
>  
> +	epc_page = sgx_encl_load_secs(encl);
> +	if (IS_ERR(epc_page)) {
> +		if (PTR_ERR(epc_page) == -EBUSY)
> +			vmret =  VM_FAULT_NOPAGE;
> +		goto err_out_unlock;
> +	}
> +
>  	epc_page = sgx_alloc_epc_page(encl_page, false);
>  	if (IS_ERR(epc_page)) {
>  		if (PTR_ERR(epc_page) == -EBUSY)
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 166692f2d501..4662a364ce62 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
>  
>  	mutex_lock(&encl->lock);
>  
> +	/* Should not be possible */
> +	if (WARN_ON(!(encl->secs.epc_page)))
> +		goto out;
> +

This shouldn't be a mandatory part of this fix, no?

If there's good reason to do, then probably you should describe the reason in
the changelog.


>  	sgx_encl_ewb(epc_page, backing);
>  	encl_page->epc_page = NULL;
>  	encl->secs_child_cnt--;
Dave Hansen July 18, 2023, 2:27 p.m. UTC | #2
On 7/17/23 13:29, Haitao Huang wrote:
> Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
> cgroup worker) may reclaim the SECS EPC page for an enclave and set
> encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in
> the SGX #PF handler without checking for NULL and reloading.
> 
> Fix this by checking if SECS is loaded before EAUG and load it if it was
> reclaimed.

It would be nice to see a _bit_ more theory of the bug in here.

What is an SECS page and why is it special in a reclaim context?  Why is
this so hard to hit?  What led you to discover this issue now?  What is
EAUG?
Dave Hansen July 18, 2023, 2:30 p.m. UTC | #3
On 7/17/23 13:29, Haitao Huang wrote:
...
> @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
>  		return entry;
>  	}
>  
> -	if (!(encl->secs.epc_page)) {
> -		epc_page = sgx_encl_eldu(&encl->secs, NULL);
> -		if (IS_ERR(epc_page))
> -			return ERR_CAST(epc_page);
> -	}
> +	epc_page = sgx_encl_load_secs(encl);
> +	if (IS_ERR(epc_page))
> +		return ERR_CAST(epc_page);
>  
>  	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
>  	if (IS_ERR(epc_page))
> @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  
>  	mutex_lock(&encl->lock);
>  
> +	epc_page = sgx_encl_load_secs(encl);
> +	if (IS_ERR(epc_page)) {
> +		if (PTR_ERR(epc_page) == -EBUSY)
> +			vmret =  VM_FAULT_NOPAGE;
> +		goto err_out_unlock;
> +	}

Whenever I see one of these "make sure it isn't NULL", I always jump to
asking what *keeps* it from becoming NULL again.  In both cases here, I
think that's encl->lock.

A comment would be really nice here, maybe on sgx_encl_load_secs().  Maybe:

/*
 * Ensure the SECS page is not swapped out.  Must be called with
 * encl->lock to protect _____ and ensure the SECS page is not
 * swapped out again.
 */

> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 166692f2d501..4662a364ce62 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
>  
>  	mutex_lock(&encl->lock);
>  
> +	/* Should not be possible */
> +	if (WARN_ON(!(encl->secs.epc_page)))
> +		goto out;

That comment isn't super helpful.  We generally don't WARN_ON() things
that should happen.  *Why* is it not possible?
Jarkko Sakkinen July 18, 2023, 3:37 p.m. UTC | #4
On Mon Jul 17, 2023 at 11:29 PM EEST, Haitao Huang wrote:
> Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
> cgroup worker) may reclaim the SECS EPC page for an enclave and set
> encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in
> the SGX #PF handler without checking for NULL and reloading.
>
> Fix this by checking if SECS is loaded before EAUG and load it if it was
> reclaimed.
>
> Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an initialized enclave")
> Cc: stable@vger.kernel.org

Given that

	$ git describe --contains 5a90d2c3f5ef8
	v6.0-rc1~102^2~16

You could also describe this as:

Cc: stable@vger.kernel.org # v6.0+

> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> ---
>  arch/x86/kernel/cpu/sgx/encl.c | 25 ++++++++++++++++++++-----
>  arch/x86/kernel/cpu/sgx/main.c |  4 ++++
>  2 files changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index 2a0e90fe2abc..2ab544da1664 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -235,6 +235,16 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  	return epc_page;
>  }
>  
> +static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
> +{
> +	struct sgx_epc_page *epc_page = encl->secs.epc_page;
> +
> +	if (!epc_page)
> +		epc_page = sgx_encl_eldu(&encl->secs, NULL);
> +
> +	return epc_page;
> +}
> +
>  static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
>  						  struct sgx_encl_page *entry)
>  {
> @@ -248,11 +258,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
>  		return entry;
>  	}
>  
> -	if (!(encl->secs.epc_page)) {
> -		epc_page = sgx_encl_eldu(&encl->secs, NULL);
> -		if (IS_ERR(epc_page))
> -			return ERR_CAST(epc_page);
> -	}
> +	epc_page = sgx_encl_load_secs(encl);
> +	if (IS_ERR(epc_page))
> +		return ERR_CAST(epc_page);
>  
>  	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
>  	if (IS_ERR(epc_page))
> @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  
>  	mutex_lock(&encl->lock);
>  
> +	epc_page = sgx_encl_load_secs(encl);
> +	if (IS_ERR(epc_page)) {
> +		if (PTR_ERR(epc_page) == -EBUSY)
> +			vmret =  VM_FAULT_NOPAGE;
> +		goto err_out_unlock;
> +	}
> +
>  	epc_page = sgx_alloc_epc_page(encl_page, false);
>  	if (IS_ERR(epc_page)) {
>  		if (PTR_ERR(epc_page) == -EBUSY)
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 166692f2d501..4662a364ce62 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
>  
>  	mutex_lock(&encl->lock);
>  
> +	/* Should not be possible */
> +	if (WARN_ON(!(encl->secs.epc_page)))
> +		goto out;
> +
>  	sgx_encl_ewb(epc_page, backing);
>  	encl_page->epc_page = NULL;
>  	encl->secs_child_cnt--;
> -- 
> 2.25.1

Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>

BR, Jarkko
Haitao Huang July 18, 2023, 4:39 p.m. UTC | #5
On Tue, 18 Jul 2023 09:30:11 -0500, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 7/17/23 13:29, Haitao Huang wrote:
> ...
>> @@ -248,11 +258,9 @@ static struct sgx_encl_page  
>> *__sgx_encl_load_page(struct sgx_encl *encl,
>>  		return entry;
>>  	}
>>
>> -	if (!(encl->secs.epc_page)) {
>> -		epc_page = sgx_encl_eldu(&encl->secs, NULL);
>> -		if (IS_ERR(epc_page))
>> -			return ERR_CAST(epc_page);
>> -	}
>> +	epc_page = sgx_encl_load_secs(encl);
>> +	if (IS_ERR(epc_page))
>> +		return ERR_CAST(epc_page);
>>
>>  	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
>>  	if (IS_ERR(epc_page))
>> @@ -339,6 +347,13 @@ static vm_fault_t sgx_encl_eaug_page(struct  
>> vm_area_struct *vma,
>>
>>  	mutex_lock(&encl->lock);
>>
>> +	epc_page = sgx_encl_load_secs(encl);
>> +	if (IS_ERR(epc_page)) {
>> +		if (PTR_ERR(epc_page) == -EBUSY)
>> +			vmret =  VM_FAULT_NOPAGE;
>> +		goto err_out_unlock;
>> +	}
>
> Whenever I see one of these "make sure it isn't NULL", I always jump to
> asking what *keeps* it from becoming NULL again.  In both cases here, I
> think that's encl->lock.
>
Yes, encl->lock protects all enclave states, the xarray holding  
encl_pages, SECS, VAs, etc.

> A comment would be really nice here, maybe on sgx_encl_load_secs().   
> Maybe:
>
> /*
>  * Ensure the SECS page is not swapped out.  Must be called with
>  * encl->lock to protect _____ and ensure the SECS page is not
>  * swapped out again.
>  */
>
Thanks for the suggestion. Lock should be held for the duration of SECS  
usage.
So something like this?
/*
  * Ensure the SECS page is not swapped out.  Must be called with
  * encl->lock to protect the enclave states including SECS and
  * ensure the SECS page is not swapped out again while being used.
  */


>> diff --git a/arch/x86/kernel/cpu/sgx/main.c  
>> b/arch/x86/kernel/cpu/sgx/main.c
>> index 166692f2d501..4662a364ce62 100644
>> --- a/arch/x86/kernel/cpu/sgx/main.c
>> +++ b/arch/x86/kernel/cpu/sgx/main.c
>> @@ -257,6 +257,10 @@ static void sgx_reclaimer_write(struct  
>> sgx_epc_page *epc_page,
>>
>>  	mutex_lock(&encl->lock);
>>
>> +	/* Should not be possible */
>> +	if (WARN_ON(!(encl->secs.epc_page)))
>> +		goto out;
>
> That comment isn't super helpful.  We generally don't WARN_ON() things
> that should happen.  *Why* is it not possible?
>

When this part of code is reached, the reclaimer is holding at least one  
reclaimable EPC page to reclaim for the enclave and the code below only  
reclaims SECS when no reclaimable EPCs (number of SECS children being  
zero) of the enclave left. So it should not be possible.
I'll remove this change because this is really not needed for fixing the  
bug as Kai pointed out.

I added this for sanity check when implementing multiple EPC tracking  
lists for cgroups. At one point there were list corruption issues if  
moving EPCs between lists not managed well. With those straightened out,  
and clear definitions of EPC states for moving them from one list to  
another, I no longer see much value to keep this even in later cgroup  
patches.

Thanks
Haitao
Haitao Huang July 18, 2023, 6:11 p.m. UTC | #6
On Tue, 18 Jul 2023 09:27:49 -0500, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 7/17/23 13:29, Haitao Huang wrote:
>> Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
>> cgroup worker) may reclaim the SECS EPC page for an enclave and set
>> encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in
>> the SGX #PF handler without checking for NULL and reloading.
>>
>> Fix this by checking if SECS is loaded before EAUG and load it if it was
>> reclaimed.
>
> It would be nice to see a _bit_ more theory of the bug in here.
>
> What is an SECS page and why is it special in a reclaim context?  Why is
> this so hard to hit?  What led you to discover this issue now?  What is
> EAUG?

Let me know if this clarify things.

The SECS page holds global states of an enclave, and all reclaimable pages  
tracked by the SGX EPC reclaimer (ksgxd) are considered 'child' pages of  
the SECS page corresponding to that enclave.  The reclaimer only reclaims  
the SECS page when all its children are reclaimed. That can happen on  
system under high EPC pressure where multiple large enclaves demanding  
much more EPC page than physically available. In a rare case, the  
reclaimer may reclaim all EPC pages of an enclave and it SECS page,  
setting encl->secs.epc_page to NULL, right before the #PF handler get the  
chance to handle a #PF for that enclave. In that case, if that #PF happens  
to require kernel to invoke the EAUG instruction to add a new EPC page for  
the enclave, then a NULL pointer results as current code does not check if  
encl->secs.epc_page is NULL before using it.

The bug is easier to reproduce with the EPC cgroup implementation when a  
low EPC limit is set for a group of enclave hosting processes. Without the  
EPC cgroup it's hard to trigger the reclaimer to reclaim all child pages  
of an SECS page. And it'd also require a machine configured with large RAM  
relative to EPC so no OOM killer triggered before this happens.

Thanks
Haitao
Dave Hansen July 18, 2023, 6:53 p.m. UTC | #7
On 7/18/23 11:11, Haitao Huang wrote:
> On Tue, 18 Jul 2023 09:27:49 -0500, Dave Hansen <dave.hansen@intel.com>
> wrote:
> 
>> On 7/17/23 13:29, Haitao Huang wrote:
>>> Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
>>> cgroup worker) may reclaim the SECS EPC page for an enclave and set
>>> encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in
>>> the SGX #PF handler without checking for NULL and reloading.
>>>
>>> Fix this by checking if SECS is loaded before EAUG and load it if it was
>>> reclaimed.
>>
>> It would be nice to see a _bit_ more theory of the bug in here.
>>
>> What is an SECS page and why is it special in a reclaim context?  Why is
>> this so hard to hit?  What led you to discover this issue now?  What is
>> EAUG?
> 
> Let me know if this clarify things.
> 
> The SECS page holds global states of an enclave, and all reclaimable
> pages tracked by the SGX EPC reclaimer (ksgxd) are considered 'child'
> pages of the SECS page corresponding to that enclave.  The reclaimer
> only reclaims the SECS page when all its children are reclaimed. That
> can happen on system under high EPC pressure where multiple large
> enclaves demanding much more EPC page than physically available. In a
> rare case, the reclaimer may reclaim all EPC pages of an enclave and it
> SECS page, setting encl->secs.epc_page to NULL, right before the #PF
> handler get the chance to handle a #PF for that enclave. In that case,
> if that #PF happens to require kernel to invoke the EAUG instruction to
> add a new EPC page for the enclave, then a NULL pointer results as
> current code does not check if encl->secs.epc_page is NULL before using it.

Better, but that's *REALLY* verbose and really imprecise.  It doesn't
_require_ "high pressure".  It could literally happen at very, very low
pressures over a long period of time.  Please stick to the facts and
it'll actually simplify the description.

	The SECS page holds global enclave metadata.  It can only be
	reclaimed when there are no other enclave pages remaining.  At
	that point, virtually nothing can be done with the enclave until
	the SECS page is paged back in.

	An enclave can not run nor generate page faults without without
	a resident SECS page.  But it is still possible for a #PF for a
	non-SECS page to race with paging out the SECS page.

	Hitting this bug requires triggering that race.

> The bug is easier to reproduce with the EPC cgroup implementation when a
> low EPC limit is set for a group of enclave hosting processes. Without
> the EPC cgroup it's hard to trigger the reclaimer to reclaim all child
> pages of an SECS page. And it'd also require a machine configured with
> large RAM relative to EPC so no OOM killer triggered before this happens.

Isn't this the _normal_ case?  EPC is relatively tiny compared to RAM
normally.
Haitao Huang July 18, 2023, 8:32 p.m. UTC | #8
On Tue, 18 Jul 2023 13:53:47 -0500, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 7/18/23 11:11, Haitao Huang wrote:
>> On Tue, 18 Jul 2023 09:27:49 -0500, Dave Hansen <dave.hansen@intel.com>
>> wrote:
>>
>>> On 7/17/23 13:29, Haitao Huang wrote:
>>>> Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
>>>> cgroup worker) may reclaim the SECS EPC page for an enclave and set
>>>> encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in
>>>> the SGX #PF handler without checking for NULL and reloading.
>>>>
>>>> Fix this by checking if SECS is loaded before EAUG and load it if it  
>>>> was
>>>> reclaimed.
>>>
>>> It would be nice to see a _bit_ more theory of the bug in here.
>>>
>>> What is an SECS page and why is it special in a reclaim context?  Why  
>>> is
>>> this so hard to hit?  What led you to discover this issue now?  What is
>>> EAUG?
>>
>> Let me know if this clarify things.
>>
>> The SECS page holds global states of an enclave, and all reclaimable
>> pages tracked by the SGX EPC reclaimer (ksgxd) are considered 'child'
>> pages of the SECS page corresponding to that enclave.  The reclaimer
>> only reclaims the SECS page when all its children are reclaimed. That
>> can happen on system under high EPC pressure where multiple large
>> enclaves demanding much more EPC page than physically available. In a
>> rare case, the reclaimer may reclaim all EPC pages of an enclave and it
>> SECS page, setting encl->secs.epc_page to NULL, right before the #PF
>> handler get the chance to handle a #PF for that enclave. In that case,
>> if that #PF happens to require kernel to invoke the EAUG instruction to
>> add a new EPC page for the enclave, then a NULL pointer results as
>> current code does not check if encl->secs.epc_page is NULL before using  
>> it.
>
> Better, but that's *REALLY* verbose and really imprecise.  It doesn't
> _require_ "high pressure".  It could literally happen at very, very low
> pressures over a long period of time.

I don't quite get this part. In low pressure scenario, the reclaimer never  
need to reclaim all children of SECs. So it would not reclaim SECS no  
matter how long you run?

Ignore VA pages for now. Say for a system with 10 page EPC, 2 enclaves,  
each needs 5 pages non-SECS so total demand would be 12 pages. The ksgxd  
would only need to swap out 2 pages at the most to get one enclave fully  
loaded with 6 pages, and the other one with 4 pages. There is no chance  
the ksgxd would swap any one of two SECS pages.

We would need at least one enclave A of 10 pages total to squeeze out the  
other B completely. For that to happen B pretty much has to be sleeping  
all the time so the LRU based reclaiming would hit it but not pages of A.  
So no chance to hit #PF on pages of B still.

So some minimal pressure is needed to ensure SECS swapped. The higher the  
pressure the higher the chance to hit #PF while SECS is swapped.

> Please stick to the facts and
> it'll actually simplify the description.
>
> 	The SECS page holds global enclave metadata.  It can only be
> 	reclaimed when there are no other enclave pages remaining.  At
> 	that point, virtually nothing can be done with the enclave until
> 	the SECS page is paged back in.
>
> 	An enclave can not run nor generate page faults without without
> 	a resident SECS page.  But it is still possible for a #PF for a
> 	non-SECS page to race with paging out the SECS page.
>
> 	Hitting this bug requires triggering that race.
>
Thanks for the suggestion. I agree on those.

>> The bug is easier to reproduce with the EPC cgroup implementation when a
>> low EPC limit is set for a group of enclave hosting processes. Without
>> the EPC cgroup it's hard to trigger the reclaimer to reclaim all child
>> pages of an SECS page. And it'd also require a machine configured with
>> large RAM relative to EPC so no OOM killer triggered before this  
>> happens.
>
> Isn't this the _normal_ case?  EPC is relatively tiny compared to RAM
> normally.

I don't know what is perceived as normal here. But for this to happen, the  
swapping space should be able to hold content much bigger than EPC, if my  
reasoning above for high pressure required is correct. I tried 70  
concurrent sgx selftests instances on a server with 4G EPC, 512G RAM and  
no disk swapping, and encountered OOM first. Those selftests instance each  
demand about 8G EPC.

Thanks
Haitao
Dave Hansen July 18, 2023, 8:56 p.m. UTC | #9
On 7/18/23 13:32, Haitao Huang wrote:
...
> Ignore VA pages for now. Say for a system with 10 page EPC, 2 enclaves,
> each needs 5 pages non-SECS so total demand would be 12 pages. The ksgxd
> would only need to swap out 2 pages at the most to get one enclave fully
> loaded with 6 pages, and the other one with 4 pages. There is no chance
> the ksgxd would swap any one of two SECS pages.
> 
> We would need at least one enclave A of 10 pages total to squeeze out
> the other B completely. For that to happen B pretty much has to be
> sleeping all the time so the LRU based reclaiming would hit it but not
> pages of A. So no chance to hit #PF on pages of B still.
> 
> So some minimal pressure is needed to ensure SECS swapped. The higher
> the pressure the higher the chance to hit #PF while SECS is swapped.

What would the second-to-last non-SECS page be?  A thread control page?
VA page?

As long as *that* page can generate a page fault, then you only need two
pages for this scenario to happen:

1. Reclaimer takes encl->lock
2. #PF occurs from another thread, blocks on encl->lock
3. SECS is reclaimed
4. encl->lock released
5. #PF sees reclaimed SECS
Haitao Huang July 18, 2023, 9:22 p.m. UTC | #10
On Tue, 18 Jul 2023 15:56:27 -0500, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 7/18/23 13:32, Haitao Huang wrote:
> ...
>> Ignore VA pages for now. Say for a system with 10 page EPC, 2 enclaves,
>> each needs 5 pages non-SECS so total demand would be 12 pages. The ksgxd
>> would only need to swap out 2 pages at the most to get one enclave fully
>> loaded with 6 pages, and the other one with 4 pages. There is no chance
>> the ksgxd would swap any one of two SECS pages.
>>
>> We would need at least one enclave A of 10 pages total to squeeze out
>> the other B completely. For that to happen B pretty much has to be
>> sleeping all the time so the LRU based reclaiming would hit it but not
>> pages of A. So no chance to hit #PF on pages of B still.
>>
>> So some minimal pressure is needed to ensure SECS swapped. The higher
>> the pressure the higher the chance to hit #PF while SECS is swapped.
>
> What would the second-to-last non-SECS page be?  A thread control page?
> VA page?
>
> As long as *that* page can generate a page fault, then you only need two
> pages for this scenario to happen:
>
> 1. Reclaimer takes encl->lock
> 2. #PF occurs from another thread, blocks on encl->lock
> 3. SECS is reclaimed
> 4. encl->lock released
> 5. #PF sees reclaimed SECS
>
>
I agree this is the race. But for this to happen, that is at #1 you have  
only one non-SECS page left so #3 can happen. That means it is already  
high pressure because reclaimer has swapped all other non-SECS.
In my example of two enclaves of 5 non-EPC pages. #3 won't happen because  
you don't reach #1 with only one non-SECS left.

Thanks
Haitao
Dave Hansen July 18, 2023, 9:36 p.m. UTC | #11
On 7/18/23 14:22, Haitao Huang wrote:
> I agree this is the race. But for this to happen, that is at #1 you have
> only one non-SECS page left so #3 can happen. That means it is already
> high pressure 

I think our definitions of memory pressure differ.

Pressure is raised by allocations and dropped by reclaim.  This
raise->drop cycle is (or should be) time-limited and can't take forever.
 The reclaim either works in a short period of time or something dies.
If allocations are transient, pressure is transient.

Let's say a pressure blip (a one-time event) comes along and pages out
that second-to-last page.  That's pretty low pressure.  Years pass.  The
enclave never gets run.  Nothing pages the second-to-last page back in.
A second pressure blip comes along.  The SECS page gets paged out.

That's two pressure blips in, say 10 years.  Is that "high pressure"?

> because reclaimer has swapped all other non-SECS.
> In my example of two enclaves of 5 non-EPC pages. #3 won't happen
> because you don't reach #1 with only one non-SECS left.
Haitao Huang July 18, 2023, 9:57 p.m. UTC | #12
On Tue, 18 Jul 2023 16:36:53 -0500, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 7/18/23 14:22, Haitao Huang wrote:
>> I agree this is the race. But for this to happen, that is at #1 you have
>> only one non-SECS page left so #3 can happen. That means it is already
>> high pressure
>
> I think our definitions of memory pressure differ.
>
> Pressure is raised by allocations and dropped by reclaim.  This
> raise->drop cycle is (or should be) time-limited and can't take forever.
>  The reclaim either works in a short period of time or something dies.
> If allocations are transient, pressure is transient.
>
> Let's say a pressure blip (a one-time event) comes along and pages out
> that second-to-last page.  That's pretty low pressure.  Years pass.  The
> enclave never gets run.  Nothing pages the second-to-last page back in.
> A second pressure blip comes along.  The SECS page gets paged out.
>
> That's two pressure blips in, say 10 years.  Is that "high pressure"?

Okay, that explains. I would consider it still triggered by high pressure  
blips :-)

But I agree we can drop the mentioning of pressure altogether and just  
state the race so no confusions.
Thanks
Haitao
Dave Hansen July 18, 2023, 10:05 p.m. UTC | #13
On 7/18/23 14:57, Haitao Huang wrote:
> Okay, that explains. I would consider it still triggered by high
> pressure blips 
Haitao Huang July 18, 2023, 11:11 p.m. UTC | #14
On Tue, 18 Jul 2023 10:37:45 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
wrote:

> On Mon Jul 17, 2023 at 11:29 PM EEST, Haitao Huang wrote:
>> Under heavy load, the SGX EPC reclaimers (current ksgxd or future EPC
>> cgroup worker) may reclaim the SECS EPC page for an enclave and set
>> encl->secs.epc_page to NULL. But the SECS EPC page is used for EAUG in
>> the SGX #PF handler without checking for NULL and reloading.
>>
>> Fix this by checking if SECS is loaded before EAUG and load it if it was
>> reclaimed.
>>
>> Fixes: 5a90d2c3f5ef8 ("x86/sgx: Support adding of pages to an  
>> initialized enclave")
>> Cc: stable@vger.kernel.org
>
> Given that
>
> 	$ git describe --contains 5a90d2c3f5ef8
> 	v6.0-rc1~102^2~16
>
> You could also describe this as:
>
> Cc: stable@vger.kernel.org # v6.0+

Will add

>
...
>
> Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
>

Thank you.
Haitao
Haitao Huang July 19, 2023, 12:06 a.m. UTC | #15
On Tue, 18 Jul 2023 17:05:59 -0500, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 7/18/23 14:57, Haitao Huang wrote:
>> Okay, that explains. I would consider it still triggered by high
>> pressure blips 
Huang, Kai July 19, 2023, 12:14 a.m. UTC | #16
On Tue, 2023-07-18 at 16:57 -0500, Haitao Huang wrote:
> On Tue, 18 Jul 2023 16:36:53 -0500, Dave Hansen <dave.hansen@intel.com>  
> wrote:
> 
> > On 7/18/23 14:22, Haitao Huang wrote:
> > > I agree this is the race. But for this to happen, that is at #1 you have
> > > only one non-SECS page left so #3 can happen. That means it is already
> > > high pressure
> > 
> > I think our definitions of memory pressure differ.
> > 
> > Pressure is raised by allocations and dropped by reclaim.  This
> > raise->drop cycle is (or should be) time-limited and can't take forever.
> >  The reclaim either works in a short period of time or something dies.
> > If allocations are transient, pressure is transient.
> > 
> > Let's say a pressure blip (a one-time event) comes along and pages out
> > that second-to-last page.  That's pretty low pressure.  Years pass.  The
> > enclave never gets run.  Nothing pages the second-to-last page back in.
> > A second pressure blip comes along.  The SECS page gets paged out.
> > 
> > That's two pressure blips in, say 10 years.  Is that "high pressure"?
> 
> Okay, that explains. I would consider it still triggered by high pressure  
> blips :-)
> 
> But I agree we can drop the mentioning of pressure altogether and just  
> state the race so no confusions.

Also perhaps the patch title is too vague.  Adding more information doesn't hurt
I think, e.g., mentioning it is a fix for NULL pointer dereference in the EAUG
flow.
Dave Hansen July 19, 2023, 12:21 a.m. UTC | #17
On 7/18/23 17:14, Huang, Kai wrote:
> Also perhaps the patch title is too vague.  Adding more information doesn't hurt
> I think, e.g., mentioning it is a fix for NULL pointer dereference in the EAUG
> flow.

Yeah, let's say something like:

	x86/sgx: Resolve SECS reclaim vs. page fault race

please
Haitao Huang July 19, 2023, 1:53 p.m. UTC | #18
Hi Dave and Kai
On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen <dave.hansen@intel.com>  
wrote:

> On 7/18/23 17:14, Huang, Kai wrote:
>> Also perhaps the patch title is too vague.  Adding more information  
>> doesn't hurt
>> I think, e.g., mentioning it is a fix for NULL pointer dereference in  
>> the EAUG
>> flow.
>
> Yeah, let's say something like:
>
> 	x86/sgx: Resolve SECS reclaim vs. page fault race
>
The patch is not to resolve SECS vs #PF race though the race is a  
necessary condition to cause the NULL pointer. The same condition does not  
cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.

And the issue really is the NULL pointer not checked and fix was to reuse  
the same code to reload SECS in ELDU code path for EAUG code path


How about this:

x86/sgx:  Reload reclaimed SECS for EAUG on #PF

or

x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF

BR
Haitao
Huang, Kai July 21, 2023, 12:32 a.m. UTC | #19
On Wed, 2023-07-19 at 08:53 -0500, Haitao Huang wrote:
> Hi Dave and Kai
> On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen <dave.hansen@intel.com>  
> wrote:
> 
> > On 7/18/23 17:14, Huang, Kai wrote:
> > > Also perhaps the patch title is too vague.  Adding more information  
> > > doesn't hurt
> > > I think, e.g., mentioning it is a fix for NULL pointer dereference in  
> > > the EAUG
> > > flow.
> > 
> > Yeah, let's say something like:
> > 
> > 	x86/sgx: Resolve SECS reclaim vs. page fault race
> > 
> The patch is not to resolve SECS vs #PF race though the race is a  
> necessary condition to cause the NULL pointer. The same condition does not  
> cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.
> 
> And the issue really is the NULL pointer not checked and fix was to reuse  
> the same code to reload SECS in ELDU code path for EAUG code path
> 
> 
> How about this:
> 
> x86/sgx:  Reload reclaimed SECS for EAUG on #PF
> 
> or
> 
> x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF
> 

Perhaps you can add "EAUG" part to what Dave suggested?

	x86/sgx: Resolves SECS reclaim vs. page fault race on EAUG

(assuming Dave is fine with this :-))
Huang, Kai July 21, 2023, 12:52 a.m. UTC | #20
On Fri, 2023-07-21 at 00:32 +0000, Huang, Kai wrote:
> On Wed, 2023-07-19 at 08:53 -0500, Haitao Huang wrote:
> > Hi Dave and Kai
> > On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen <dave.hansen@intel.com>  
> > wrote:
> > 
> > > On 7/18/23 17:14, Huang, Kai wrote:
> > > > Also perhaps the patch title is too vague.  Adding more information  
> > > > doesn't hurt
> > > > I think, e.g., mentioning it is a fix for NULL pointer dereference in  
> > > > the EAUG
> > > > flow.
> > > 
> > > Yeah, let's say something like:
> > > 
> > > 	x86/sgx: Resolve SECS reclaim vs. page fault race
> > > 
> > The patch is not to resolve SECS vs #PF race though the race is a  
> > necessary condition to cause the NULL pointer. The same condition does not  
> > cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.
> > 
> > And the issue really is the NULL pointer not checked and fix was to reuse  
> > the same code to reload SECS in ELDU code path for EAUG code path
> > 
> > 
> > How about this:
> > 
> > x86/sgx:  Reload reclaimed SECS for EAUG on #PF
> > 
> > or
> > 
> > x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF
> > 
> 
> Perhaps you can add "EAUG" part to what Dave suggested?
> 
> 	x86/sgx: Resolves SECS reclaim vs. page fault race on EAUG
> 
> (assuming Dave is fine with this :-))

Btw, do you have a real call trace?  If you have, I think you can add that to
the changelog too because that catches people's eye immediately.
Haitao Huang July 26, 2023, 4:56 p.m. UTC | #21
On Thu, 20 Jul 2023 19:52:22 -0500, Huang, Kai <kai.huang@intel.com> wrote:

> On Fri, 2023-07-21 at 00:32 +0000, Huang, Kai wrote:
>> On Wed, 2023-07-19 at 08:53 -0500, Haitao Huang wrote:
>> > Hi Dave and Kai
>> > On Tue, 18 Jul 2023 19:21:54 -0500, Dave Hansen  
>> <dave.hansen@intel.com>
>> > wrote:
>> >
>> > > On 7/18/23 17:14, Huang, Kai wrote:
>> > > > Also perhaps the patch title is too vague.  Adding more  
>> information
>> > > > doesn't hurt
>> > > > I think, e.g., mentioning it is a fix for NULL pointer  
>> dereference in
>> > > > the EAUG
>> > > > flow.
>> > >
>> > > Yeah, let's say something like:
>> > >
>> > > 	x86/sgx: Resolve SECS reclaim vs. page fault race
>> > >
>> > The patch is not to resolve SECS vs #PF race though the race is a
>> > necessary condition to cause the NULL pointer. The same condition  
>> does not
>> > cause NULL pointer in the ELDU path of #PF, only in EAUG path of #PF.
>> >
>> > And the issue really is the NULL pointer not checked and fix was to  
>> reuse
>> > the same code to reload SECS in ELDU code path for EAUG code path
>> >
>> >
>> > How about this:
>> >
>> > x86/sgx:  Reload reclaimed SECS for EAUG on #PF
>> >
>> > or
>> >
>> > x86/sgx: Fix a NULL pointer to SECS used for EAUG on #PF
>> >
>>
>> Perhaps you can add "EAUG" part to what Dave suggested?
>>
>> 	x86/sgx: Resolves SECS reclaim vs. page fault race on EAUG
>>
>> (assuming Dave is fine with this :-))
Sure, I can use this too.

> Btw, do you have a real call trace?  If you have, I think you can add  
> that to
> the changelog too because that catches people's eye immediately.

Previously I was not able to reproduce without SGX cgroup patches. Now I  
managed to get a trace with a QEMU setup with small EPC (8M), large RAM  
(128G) and 128 vCPUs:

[ 1682.914263] BUG: kernel NULL pointer dereference, address:  
0000000000000000
[ 1682.922966] #PF: supervisor read access in kernel mode
[ 1682.929115] #PF: error_code(0x0000) - not-present page
[ 1682.935264] PGD 0 P4D 0
[ 1682.938383] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 1682.943620] CPU: 43 PID: 2681 Comm: test_sgx Not tainted  
6.3.0-rc4sgxcet #12
[ 1682.951989] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS  
rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[ 1682.965504] RIP: 0010:sgx_encl_eaug_page+0xc7/0x210
[ 1682.971359] Code: 25 49 8b 96 98 04 00 00 48 8d 40 48 48 89 42 08 48 89  
56 48 49 8d 96 98 04 00 00 48 89 56 50 49 89 86 98 04 00 00 49 8b 46 60  
<8b> 10 48 c1 e2 05 488
[ 1682.993330] RSP: 0000:ffffb2e64725bc00 EFLAGS: 00010246
[ 1682.999585] RAX: 0000000000000000 RBX: ffff987e5abac428 RCX:  
0000000000000000
[ 1683.008059] RDX: 0000000000000001 RSI: 0000000000000000 RDI:  
ffff987e61aee000
[ 1683.016533] RBP: ffffb2e64725bcf0 R08: 0000000000000000 R09:  
ffffb2e64725bb58
[ 1683.025008] R10: 0000000000000000 R11: 00007f3f5c418fff R12:  
ffff987e61aee020
[ 1683.033479] R13: ffff987e505bc080 R14: ffff987e61aee000 R15:  
ffffb2e6420fcb20
[ 1683.041949] FS:  00007f3f5cb48740(0000) GS:ffff989cfe8c0000(0000)  
knlGS:0000000000000000
[ 1683.051540] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1683.058478] CR2: 0000000000000000 CR3: 0000000115896002 CR4:  
0000000000770ee0
[ 1683.067018] DR0: 0000000000000000 DR1: 0000000000000000 DR2:  
0000000000000000
[ 1683.075539] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:  
0000000000000400
[ 1683.084085] PKRU: 55555554
[ 1683.087465] Call Trace:
[ 1683.090547]  <TASK>
[ 1683.093220]  ? __kmem_cache_alloc_node+0x16a/0x440
[ 1683.099034]  ? xa_load+0x6e/0xa0
[ 1683.103038]  sgx_vma_fault+0x119/0x230
[ 1683.107630]  __do_fault+0x36/0x140
[ 1683.111828]  do_fault+0x12f/0x400
[ 1683.115928]  __handle_mm_fault+0x728/0x1110
[ 1683.121050]  handle_mm_fault+0x105/0x310
[ 1683.125850]  do_user_addr_fault+0x1ee/0x750
[ 1683.130957]  ? __this_cpu_preempt_check+0x13/0x20
[ 1683.136667]  exc_page_fault+0x76/0x180
[ 1683.141265]  asm_exc_page_fault+0x27/0x30
[ 1683.146160] RIP: 0033:0x7ffc6496beea
[ 1683.150563] Code: 43 48 8b 4d 10 48 c7 c3 28 00 00 00 48 83 3c 19 00 75  
31 48 83 c3 08 48 81 fb 00 01 00 00 75 ec 48 8b 19 48 8d 0d 00 00 00 00  
<0f> 01 d7 48 8b 5d 101
[ 1683.172773] RSP: 002b:00007ffc64935b68 EFLAGS: 00000202
[ 1683.179138] RAX: 0000000000000003 RBX: 00007f3800000000 RCX:  
00007ffc6496beea
[ 1683.187675] RDX: 0000000000000000 RSI: 0000000000000000 RDI:  
0000000000000000
[ 1683.196200] RBP: 00007ffc64935b70 R08: 0000000000000000 R09:  
0000000000000000
[ 1683.204724] R10: 0000000000000000 R11: 0000000000000000 R12:  
0000000000000000
[ 1683.213310] R13: 0000000000000000 R14: 0000000000000000 R15:  
0000000000000000
[ 1683.221850]  </TASK>
[ 1683.224636] Modules linked in: isofs intel_rapl_msr intel_rapl_common  
binfmt_misc kvm_intel nls_iso8859_1 kvm ppdev irqbypass input_leds  
parport_pc joydev parport rapi
[ 1683.291173] CR2: 0000000000000000
[ 1683.295271] ---[ end trace 0000000000000000 ]---



I'll add this to the commit as well.

Thanks
Haitao
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 2a0e90fe2abc..2ab544da1664 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -235,6 +235,16 @@  static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	return epc_page;
 }
 
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+	struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+	if (!epc_page)
+		epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+	return epc_page;
+}
+
 static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
 						  struct sgx_encl_page *entry)
 {
@@ -248,11 +258,9 @@  static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
 		return entry;
 	}
 
-	if (!(encl->secs.epc_page)) {
-		epc_page = sgx_encl_eldu(&encl->secs, NULL);
-		if (IS_ERR(epc_page))
-			return ERR_CAST(epc_page);
-	}
+	epc_page = sgx_encl_load_secs(encl);
+	if (IS_ERR(epc_page))
+		return ERR_CAST(epc_page);
 
 	epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
 	if (IS_ERR(epc_page))
@@ -339,6 +347,13 @@  static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
 
 	mutex_lock(&encl->lock);
 
+	epc_page = sgx_encl_load_secs(encl);
+	if (IS_ERR(epc_page)) {
+		if (PTR_ERR(epc_page) == -EBUSY)
+			vmret =  VM_FAULT_NOPAGE;
+		goto err_out_unlock;
+	}
+
 	epc_page = sgx_alloc_epc_page(encl_page, false);
 	if (IS_ERR(epc_page)) {
 		if (PTR_ERR(epc_page) == -EBUSY)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 166692f2d501..4662a364ce62 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -257,6 +257,10 @@  static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
 
 	mutex_lock(&encl->lock);
 
+	/* Should not be possible */
+	if (WARN_ON(!(encl->secs.epc_page)))
+		goto out;
+
 	sgx_encl_ewb(epc_page, backing);
 	encl_page->epc_page = NULL;
 	encl->secs_child_cnt--;