diff mbox series

[v13,10/13] x86/sgx: Add sgx_einit() for initializing enclaves

Message ID 20180827185507.17087-11-jarkko.sakkinen@linux.intel.com (mailing list archive)
State Changes Requested, archived
Headers show
Series Intel SGX1 support | expand

Commit Message

Jarkko Sakkinen Aug. 27, 2018, 6:53 p.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

Add a function to perform ENCLS(EINIT), which initializes an enclave,
which can be used by a driver for running enclaves and VMMs.

Writing the LE hash MSRs is extraordinarily expensive, e.g. 3-4x slower
than normal MSRs, so we use a per-cpu cache to track the last known value
of the MSRs to avoid unnecessarily writing the MSRs with the current value.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/x86/include/asm/sgx.h      |  2 +
 arch/x86/kernel/cpu/intel_sgx.c | 86 +++++++++++++++++++++++++++++++--
 2 files changed, 85 insertions(+), 3 deletions(-)

Comments

Huang, Kai Aug. 27, 2018, 9:41 p.m. UTC | #1
On Mon, 2018-08-27 at 21:53 +0300, Jarkko Sakkinen wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> Add a function to perform ENCLS(EINIT), which initializes an enclave,
> which can be used by a driver for running enclaves and VMMs.
> 
> Writing the LE hash MSRs is extraordinarily expensive, e.g. 3-4x
> slower
> than normal MSRs, so we use a per-cpu cache to track the last known
> value
> of the MSRs to avoid unnecessarily writing the MSRs with the current
> value.
> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---
>  arch/x86/include/asm/sgx.h      |  2 +
>  arch/x86/kernel/cpu/intel_sgx.c | 86
> +++++++++++++++++++++++++++++++--
>  2 files changed, 85 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
> index baf30d49b71f..c15c156436be 100644
> --- a/arch/x86/include/asm/sgx.h
> +++ b/arch/x86/include/asm/sgx.h
> @@ -108,6 +108,8 @@ void sgx_free_page(struct sgx_epc_page *page);
>  void sgx_page_reclaimable(struct sgx_epc_page *page);
>  struct page *sgx_get_backing(struct file *file, pgoff_t index);
>  void sgx_put_backing(struct page *backing_page, bool write);
> +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken
> *token,
> +	      struct sgx_epc_page *secs_page, u64 lepubkeyhash[4]);
>  
>  #define ENCLS_FAULT_FLAG 0x40000000UL
>  #define ENCLS_FAULT_FLAG_ASM "$0x40000000"
> diff --git a/arch/x86/kernel/cpu/intel_sgx.c
> b/arch/x86/kernel/cpu/intel_sgx.c
> index 1046478a3ab9..fe25e6805680 100644
> --- a/arch/x86/kernel/cpu/intel_sgx.c
> +++ b/arch/x86/kernel/cpu/intel_sgx.c
> @@ -9,6 +9,7 @@
>  #include <linux/sched/signal.h>
>  #include <linux/shmem_fs.h>
>  #include <linux/slab.h>
> +#include <linux/suspend.h>
>  #include <asm/sgx.h>
>  #include <asm/sgx_pr.h>
>  
> @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);
>  static DEFINE_SPINLOCK(sgx_active_page_list_lock);
>  static struct task_struct *ksgxswapd_tsk;
>  static DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> +static struct notifier_block sgx_pm_notifier;
> +static u64 sgx_pm_cnt;
> +
> +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx MSRs
> for each
> + * CPU. The entries are initialized when they are first used by
> sgx_einit().
> + */
> +struct sgx_lepubkeyhash {
> +	u64 msrs[4];
> +	u64 pm_cnt;

May I ask why do we need pm_cnt here? In fact why do we need suspend
staff (namely, sgx_pm_cnt above, and related code in this patch) here
in this patch? From the patch commit message I don't see why we need PM
staff here. Please give comment why you need PM staff, or you may
consider to split the PM staff to another patch.

> +};
> +
> +static DEFINE_PER_CPU(struct sgx_lepubkeyhash *,
> sgx_lepubkeyhash_cache);
>  
>  /**
>   * sgx_reclaim_pages - reclaim EPC pages from the consumers
> @@ -328,6 +341,54 @@ void sgx_put_backing(struct page *backing_page,
> bool write)
>  }
>  EXPORT_SYMBOL_GPL(sgx_put_backing);
>  
> +/**
> + * sgx_einit - initialize an enclave
> + * @sigstruct:		a pointer to the SIGSTRUCT
> + * @token:		a pointer to the EINITTOKEN
> + * @secs_page:		a pointer to the SECS EPC page
> + * @lepubkeyhash:	the desired value for IA32_SGXLEPUBKEYHASHx
> MSRs
> + *
> + * Try to perform EINIT operation. If the MSRs are writable, they
> are updated
> + * according to @lepubkeyhash.
> + *
> + * Return:
> + *   0 on success,
> + *   -errno on failure
> + *   SGX error code if EINIT fails
> + */
> +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken
> *token,
> +	      struct sgx_epc_page *secs_page, u64 lepubkeyhash[4])
> +{
> +	struct sgx_lepubkeyhash __percpu *cache;
> +	bool cache_valid;
> +	int i, ret;
> +
> +	if (!sgx_lc_enabled)
> +		return __einit(sigstruct, token,
> sgx_epc_addr(secs_page));
> +
> +	cache = per_cpu(sgx_lepubkeyhash_cache, smp_processor_id());
> +	if (!cache) {
> +		cache = kzalloc(sizeof(struct sgx_lepubkeyhash),
> GFP_KERNEL);
> +		if (!cache)
> +			return -ENOMEM;
> +	}

It seems per-cpu variable is a pointer to struct sgx_lepubkeyhash, and
the actual structure is allocated at the first time the function is
called. May I ask when will it be freed? It seems the free is not in
this patch. Or I am misunderstanding something?

> +
> +	cache_valid = cache->pm_cnt == sgx_pm_cnt;
> +	cache->pm_cnt = sgx_pm_cnt;
> +	preempt_disable();
> +	for (i = 0; i < 4; i++) {
> +		if (cache_valid && lepubkeyhash[i] == cache-
> >msrs[i])
> +			continue;
> +
> +		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i,
> lepubkeyhash[i]);
> +		cache->msrs[i] = lepubkeyhash[i];
> +	}
> +	ret = __einit(sigstruct, token, sgx_epc_addr(secs_page));
> +	preempt_enable();
> +	return ret;
> +}
> +EXPORT_SYMBOL(sgx_einit);
> +
>  static __init int sgx_init_epc_bank(u64 addr, u64 size, unsigned
> long index,
>  				    struct sgx_epc_bank *bank)
>  {
> @@ -426,6 +487,15 @@ static __init int sgx_page_cache_init(void)
>  	return 0;
>  }
>  
> +static int sgx_pm_notifier_cb(struct notifier_block *nb, unsigned
> long action,
> +			      void *data)
> +{
> +	if (action == PM_SUSPEND_PREPARE || action ==
> PM_HIBERNATION_PREPARE)
> +		sgx_pm_cnt++;
> +
> +	return NOTIFY_DONE;
> +}
> +
>  static __init int sgx_init(void)
>  {
>  	struct task_struct *tsk;
> @@ -452,20 +522,30 @@ static __init int sgx_init(void)
>  	if (!(fc & FEATURE_CONTROL_SGX_LE_WR))
>  		pr_info("IA32_SGXLEPUBKEYHASHn MSRs are not
> writable\n");
>  
> -	ret = sgx_page_cache_init();
> +	sgx_pm_notifier.notifier_call = sgx_pm_notifier_cb;
> +	ret = register_pm_notifier(&sgx_pm_notifier);
>  	if (ret)
>  		return ret;
>  
> +	ret = sgx_page_cache_init();
> +	if (ret)
> +		goto out_pm;
> +
>  	tsk = kthread_run(ksgxswapd, NULL, "ksgxswapd");
>  	if (IS_ERR(tsk)) {
> -		sgx_page_cache_teardown();
> -		return PTR_ERR(tsk);
> +		ret = PTR_ERR(tsk);
> +		goto out_pcache;
>  	}
>  	ksgxswapd_tsk = tsk;
>  
>  	sgx_enabled = true;
>  	sgx_lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
>  	return 0;
> +out_pcache:
> +	sgx_page_cache_teardown();

I don't think this particular 2 lines of code of 'out_pcache' case
should be in this patch?

Thanks,
-Kai

> +out_pm:
> +	unregister_pm_notifier(&sgx_pm_notifier);
> +	return ret;
>  }
>  
>  arch_initcall(sgx_init);
Jarkko Sakkinen Aug. 28, 2018, 7:01 a.m. UTC | #2
On Mon, Aug 27, 2018 at 09:41:22PM +0000, Huang, Kai wrote:
> On Mon, 2018-08-27 at 21:53 +0300, Jarkko Sakkinen wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > 
> > Add a function to perform ENCLS(EINIT), which initializes an enclave,
> > which can be used by a driver for running enclaves and VMMs.
> > 
> > Writing the LE hash MSRs is extraordinarily expensive, e.g. 3-4x
> > slower
> > than normal MSRs, so we use a per-cpu cache to track the last known
> > value
> > of the MSRs to avoid unnecessarily writing the MSRs with the current
> > value.
> > 
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Co-developed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > ---
> >  arch/x86/include/asm/sgx.h      |  2 +
> >  arch/x86/kernel/cpu/intel_sgx.c | 86
> > +++++++++++++++++++++++++++++++--
> >  2 files changed, 85 insertions(+), 3 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
> > index baf30d49b71f..c15c156436be 100644
> > --- a/arch/x86/include/asm/sgx.h
> > +++ b/arch/x86/include/asm/sgx.h
> > @@ -108,6 +108,8 @@ void sgx_free_page(struct sgx_epc_page *page);
> >  void sgx_page_reclaimable(struct sgx_epc_page *page);
> >  struct page *sgx_get_backing(struct file *file, pgoff_t index);
> >  void sgx_put_backing(struct page *backing_page, bool write);
> > +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken
> > *token,
> > +	      struct sgx_epc_page *secs_page, u64 lepubkeyhash[4]);
> >  
> >  #define ENCLS_FAULT_FLAG 0x40000000UL
> >  #define ENCLS_FAULT_FLAG_ASM "$0x40000000"
> > diff --git a/arch/x86/kernel/cpu/intel_sgx.c
> > b/arch/x86/kernel/cpu/intel_sgx.c
> > index 1046478a3ab9..fe25e6805680 100644
> > --- a/arch/x86/kernel/cpu/intel_sgx.c
> > +++ b/arch/x86/kernel/cpu/intel_sgx.c
> > @@ -9,6 +9,7 @@
> >  #include <linux/sched/signal.h>
> >  #include <linux/shmem_fs.h>
> >  #include <linux/slab.h>
> > +#include <linux/suspend.h>
> >  #include <asm/sgx.h>
> >  #include <asm/sgx_pr.h>
> >  
> > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);
> >  static DEFINE_SPINLOCK(sgx_active_page_list_lock);
> >  static struct task_struct *ksgxswapd_tsk;
> >  static DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > +static struct notifier_block sgx_pm_notifier;
> > +static u64 sgx_pm_cnt;
> > +
> > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx MSRs
> > for each
> > + * CPU. The entries are initialized when they are first used by
> > sgx_einit().
> > + */
> > +struct sgx_lepubkeyhash {
> > +	u64 msrs[4];
> > +	u64 pm_cnt;
> 
> May I ask why do we need pm_cnt here? In fact why do we need suspend
> staff (namely, sgx_pm_cnt above, and related code in this patch) here
> in this patch? From the patch commit message I don't see why we need PM
> staff here. Please give comment why you need PM staff, or you may
> consider to split the PM staff to another patch.

Refining the commit message probably makes more sense because without PM
code sgx_einit() would be broken. The MSRs have been reset after waking
up.

Some kind of counter is required to keep track of the power cycle. When
going to sleep the sgx_pm_cnt is increased. sgx_einit() compares the
current value of the global count to the value in the cache entry to see
whether we are in a new power cycle.

This brings up one question though: how do we deal with VM host going
to sleep? VM guest would not be aware of this.

I think the best measure would be to add a new parameter to sgx_einit()
that enforces update of the MSRs. The driver can then set this parameter
in the case when sgx_einit() returns SGX_INVALID_LICENSE. This is
coherent because the driver requires writable MSRs. It would not be
coherent to do it directly in the core because KVM does not require
writable MSRs.

> 
> > +};
> > +
> > +static DEFINE_PER_CPU(struct sgx_lepubkeyhash *,
> > sgx_lepubkeyhash_cache);
> >  
> >  /**
> >   * sgx_reclaim_pages - reclaim EPC pages from the consumers
> > @@ -328,6 +341,54 @@ void sgx_put_backing(struct page *backing_page,
> > bool write)
> >  }
> >  EXPORT_SYMBOL_GPL(sgx_put_backing);
> >  
> > +/**
> > + * sgx_einit - initialize an enclave
> > + * @sigstruct:		a pointer to the SIGSTRUCT
> > + * @token:		a pointer to the EINITTOKEN
> > + * @secs_page:		a pointer to the SECS EPC page
> > + * @lepubkeyhash:	the desired value for IA32_SGXLEPUBKEYHASHx
> > MSRs
> > + *
> > + * Try to perform EINIT operation. If the MSRs are writable, they
> > are updated
> > + * according to @lepubkeyhash.
> > + *
> > + * Return:
> > + *   0 on success,
> > + *   -errno on failure
> > + *   SGX error code if EINIT fails
> > + */
> > +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken
> > *token,
> > +	      struct sgx_epc_page *secs_page, u64 lepubkeyhash[4])
> > +{
> > +	struct sgx_lepubkeyhash __percpu *cache;
> > +	bool cache_valid;
> > +	int i, ret;
> > +
> > +	if (!sgx_lc_enabled)
> > +		return __einit(sigstruct, token,
> > sgx_epc_addr(secs_page));
> > +
> > +	cache = per_cpu(sgx_lepubkeyhash_cache, smp_processor_id());
> > +	if (!cache) {
> > +		cache = kzalloc(sizeof(struct sgx_lepubkeyhash),
> > GFP_KERNEL);
> > +		if (!cache)
> > +			return -ENOMEM;
> > +	}
> 
> It seems per-cpu variable is a pointer to struct sgx_lepubkeyhash, and
> the actual structure is allocated at the first time the function is
> called. May I ask when will it be freed? It seems the free is not in
> this patch. Or I am misunderstanding something?

Well, it is part of the core. When the power goes of from the DRAM
banks, everything is wiped out :-)


> 
> > +
> > +	cache_valid = cache->pm_cnt == sgx_pm_cnt;
> > +	cache->pm_cnt = sgx_pm_cnt;
> > +	preempt_disable();
> > +	for (i = 0; i < 4; i++) {
> > +		if (cache_valid && lepubkeyhash[i] == cache-
> > >msrs[i])
> > +			continue;
> > +
> > +		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i,
> > lepubkeyhash[i]);
> > +		cache->msrs[i] = lepubkeyhash[i];
> > +	}
> > +	ret = __einit(sigstruct, token, sgx_epc_addr(secs_page));
> > +	preempt_enable();
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL(sgx_einit);
> > +
> >  static __init int sgx_init_epc_bank(u64 addr, u64 size, unsigned
> > long index,
> >  				    struct sgx_epc_bank *bank)
> >  {
> > @@ -426,6 +487,15 @@ static __init int sgx_page_cache_init(void)
> >  	return 0;
> >  }
> >  
> > +static int sgx_pm_notifier_cb(struct notifier_block *nb, unsigned
> > long action,
> > +			      void *data)
> > +{
> > +	if (action == PM_SUSPEND_PREPARE || action ==
> > PM_HIBERNATION_PREPARE)
> > +		sgx_pm_cnt++;
> > +
> > +	return NOTIFY_DONE;
> > +}
> > +
> >  static __init int sgx_init(void)
> >  {
> >  	struct task_struct *tsk;
> > @@ -452,20 +522,30 @@ static __init int sgx_init(void)
> >  	if (!(fc & FEATURE_CONTROL_SGX_LE_WR))
> >  		pr_info("IA32_SGXLEPUBKEYHASHn MSRs are not
> > writable\n");
> >  
> > -	ret = sgx_page_cache_init();
> > +	sgx_pm_notifier.notifier_call = sgx_pm_notifier_cb;
> > +	ret = register_pm_notifier(&sgx_pm_notifier);
> >  	if (ret)
> >  		return ret;
> >  
> > +	ret = sgx_page_cache_init();
> > +	if (ret)
> > +		goto out_pm;
> > +
> >  	tsk = kthread_run(ksgxswapd, NULL, "ksgxswapd");
> >  	if (IS_ERR(tsk)) {
> > -		sgx_page_cache_teardown();
> > -		return PTR_ERR(tsk);
> > +		ret = PTR_ERR(tsk);
> > +		goto out_pcache;
> >  	}
> >  	ksgxswapd_tsk = tsk;
> >  
> >  	sgx_enabled = true;
> >  	sgx_lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
> >  	return 0;
> > +out_pcache:
> > +	sgx_page_cache_teardown();
> 
> I don't think this particular 2 lines of code of 'out_pcache' case
> should be in this patch?

Yea, you are right. It cold be implemented already in the earlier patch
(note taken).

> Thanks,
> -Kai

/Jarkko
Huang, Kai Aug. 29, 2018, 7:33 a.m. UTC | #3
[snip..]

> > >
> > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);  static
> > > DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > >  static struct task_struct *ksgxswapd_tsk;  static
> > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > +static struct notifier_block sgx_pm_notifier; static u64
> > > +sgx_pm_cnt;
> > > +
> > > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx
> > > +MSRs
> > > for each
> > > + * CPU. The entries are initialized when they are first used by
> > > sgx_einit().
> > > + */
> > > +struct sgx_lepubkeyhash {
> > > +	u64 msrs[4];
> > > +	u64 pm_cnt;
> >
> > May I ask why do we need pm_cnt here? In fact why do we need suspend
> > staff (namely, sgx_pm_cnt above, and related code in this patch) here
> > in this patch? From the patch commit message I don't see why we need
> > PM staff here. Please give comment why you need PM staff, or you may
> > consider to split the PM staff to another patch.
> 
> Refining the commit message probably makes more sense because without PM
> code sgx_einit() would be broken. The MSRs have been reset after waking up.
> 
> Some kind of counter is required to keep track of the power cycle. When going
> to sleep the sgx_pm_cnt is increased. sgx_einit() compares the current value of
> the global count to the value in the cache entry to see whether we are in a new
> power cycle.

You mean reset to Intel default? I think we can also just reset the cached MSR values on each power cycle, which would be simpler, IMHO?

I think we definitely need some code to handle S3-S5, but should be in separate patches, since I think the major impact of S3-S5 is entire EPC being destroyed. I think keeping pm_cnt is not sufficient enough to handle such case?

> 
> This brings up one question though: how do we deal with VM host going to sleep?
> VM guest would not be aware of this.

IMO VM just gets "sudden loss of EPC" after suspend & resume in host. SGX driver and SDK should be able to handle "sudden loss of EPC", ie, co-working together to re-establish the missing enclaves.

Actually supporting "sudden loss of EPC" is a requirement to support live migration of VM w/ SGX. Internally long time ago we had a discussion and the decision was we should support SGX live migration given two facts:

1) losing platform-dependent is not important. For example, losing sealing key is not a problem, as we could get secrets provisioned again from remote. 2) Both windows & linux driver commit to support "sudden loss of EPC".

I don't think we have to support in very first upstream driver, but I think we need to support someday.

Sean, 

Would you be able to comment here?

> 
> I think the best measure would be to add a new parameter to sgx_einit() that
> enforces update of the MSRs. The driver can then set this parameter in the case
> when sgx_einit() returns SGX_INVALID_LICENSE. This is coherent because the
> driver requires writable MSRs. It would not be coherent to do it directly in the
> core because KVM does not require writable MSRs.

IMHO this is not required, as I mentioned above.

And 
[snip...]

Thanks,
-Kai
Sean Christopherson Aug. 29, 2018, 8:33 p.m. UTC | #4
On Wed, Aug 29, 2018 at 12:33:54AM -0700, Huang, Kai wrote:
> [snip..]
> 
> > > >
> > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);  static
> > > > DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > +sgx_pm_cnt;
> > > > +
> > > > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx
> > > > +MSRs
> > > > for each
> > > > + * CPU. The entries are initialized when they are first used by
> > > > sgx_einit().
> > > > + */
> > > > +struct sgx_lepubkeyhash {
> > > > +	u64 msrs[4];
> > > > +	u64 pm_cnt;
> > >
> > > May I ask why do we need pm_cnt here? In fact why do we need suspend
> > > staff (namely, sgx_pm_cnt above, and related code in this patch) here
> > > in this patch? From the patch commit message I don't see why we need
> > > PM staff here. Please give comment why you need PM staff, or you may
> > > consider to split the PM staff to another patch.
> > 
> > Refining the commit message probably makes more sense because without PM
> > code sgx_einit() would be broken. The MSRs have been reset after waking up.
> > 
> > Some kind of counter is required to keep track of the power cycle. When going
> > to sleep the sgx_pm_cnt is increased. sgx_einit() compares the current value of
> > the global count to the value in the cache entry to see whether we are in a new
> > power cycle.
> 
> You mean reset to Intel default? I think we can also just reset the cached MSR
> values on each power cycle, which would be simpler, IMHO?

Refresh my brain, does hardware reset the MSRs on a transition to S3 or lower?

> I think we definitely need some code to handle S3-S5, but should be in
> separate patches, since I think the major impact of S3-S5 is entire EPC
> being destroyed. I think keeping pm_cnt is not sufficient enough to handle
> such case?
> > 
> > This brings up one question though: how do we deal with VM host going to sleep?
> > VM guest would not be aware of this.
> 
> IMO VM just gets "sudden loss of EPC" after suspend & resume in host. SGX driver
> and SDK should be able to handle "sudden loss of EPC", ie, co-working together to
> re-establish the missing enclaves.
> 
> Actually supporting "sudden loss of EPC" is a requirement to support live migration
> of VM w/ SGX. Internally long time ago we had a discussion and the decision was we
> should support SGX live migration given two facts:
> 
> 1) losing platform-dependent is not important. For example, losing sealing key is
> not a problem, as we could get secrets provisioned again from remote. 2) Both windows
> & linux driver commit to support "sudden loss of EPC".
> 
> I don't think we have to support in very first upstream driver, but I think we need
> to support someday.

Actually, we can easily support this in the driver, at least for SGX1
hardware.  SGX2 isn't difficult to handle, but we've intentionally
postponed those patches until SGX1 support is in mainline[1].
Accesses to the EPC after it is lost will cause faults.  Userspace
EPC accesses, e.g. ERESUME, will get a SIGSEGV that the process
should interpret as an "I should restart my enclave" event.  The
SDK already does this.  In the driver, we just need to be aware of
this potential behavior and not freak out.  Specifically, SGX_INVD
needs to not WARN on faults that may have been due to a the EPC
being nuked.  I think we can even remove the sgx_encl_pm_notifier()
code altogether.

[1] SGX1 hardware signals a #GP on an access to an invalid EPC page.
    SGX2 signals a #PF with the PF_SGX error code bit set.  This is
    problematic because the kernel looks at the PTEs for CR2 and sees
    nothing wrong, so it thinks it should just restart the
    instruction, leading to an infinite fault loop.  Resolving this
    is fairly straightforward, but a complete fix requires propagating
    PF_SGX down to the ENCLS fixup handler, which means plumbing the
    error code through the fixup handlers or smushing PF_SGX into
    trapnr.  Since there is a parallel effort to plumb the error code
    through the handlers, https://lkml.org/lkml/2018/8/6/924, we opted
    to do this in a separate series.

> Sean, 
> 
> Would you be able to comment here?
> 
> > 
> > I think the best measure would be to add a new parameter to sgx_einit() that
> > enforces update of the MSRs. The driver can then set this parameter in the case
> > when sgx_einit() returns SGX_INVALID_LICENSE. This is coherent because the
> > driver requires writable MSRs. It would not be coherent to do it directly in the
> > core because KVM does not require writable MSRs.
> 
> IMHO this is not required, as I mentioned above.
> 
> And 
> [snip...]
> 
> Thanks,
> -Kai
Huang, Kai Aug. 29, 2018, 8:58 p.m. UTC | #5
> -----Original Message-----
> From: Christopherson, Sean J
> Sent: Thursday, August 30, 2018 8:34 AM
> To: Huang, Kai <kai.huang@intel.com>
> Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>; platform-driver-
> x86@vger.kernel.org; x86@kernel.org; nhorman@redhat.com; linux-
> kernel@vger.kernel.org; tglx@linutronix.de; suresh.b.siddha@intel.com; Ayoun,
> Serge <serge.ayoun@intel.com>; hpa@zytor.com; npmccallum@redhat.com;
> mingo@redhat.com; linux-sgx@vger.kernel.org; Hansen, Dave
> <dave.hansen@intel.com>
> Subject: Re: [PATCH v13 10/13] x86/sgx: Add sgx_einit() for initializing enclaves
> 
> On Wed, Aug 29, 2018 at 12:33:54AM -0700, Huang, Kai wrote:
> > [snip..]
> >
> > > > >
> > > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);
> > > > > static DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > > +sgx_pm_cnt;
> > > > > +
> > > > > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx
> > > > > +MSRs
> > > > > for each
> > > > > + * CPU. The entries are initialized when they are first used by
> > > > > sgx_einit().
> > > > > + */
> > > > > +struct sgx_lepubkeyhash {
> > > > > +	u64 msrs[4];
> > > > > +	u64 pm_cnt;
> > > >
> > > > May I ask why do we need pm_cnt here? In fact why do we need
> > > > suspend staff (namely, sgx_pm_cnt above, and related code in this
> > > > patch) here in this patch? From the patch commit message I don't
> > > > see why we need PM staff here. Please give comment why you need PM
> > > > staff, or you may consider to split the PM staff to another patch.
> > >
> > > Refining the commit message probably makes more sense because
> > > without PM code sgx_einit() would be broken. The MSRs have been reset
> after waking up.
> > >
> > > Some kind of counter is required to keep track of the power cycle.
> > > When going to sleep the sgx_pm_cnt is increased. sgx_einit()
> > > compares the current value of the global count to the value in the
> > > cache entry to see whether we are in a new power cycle.
> >
> > You mean reset to Intel default? I think we can also just reset the
> > cached MSR values on each power cycle, which would be simpler, IMHO?
> 
> Refresh my brain, does hardware reset the MSRs on a transition to S3 or lower?
> 
> > I think we definitely need some code to handle S3-S5, but should be in
> > separate patches, since I think the major impact of S3-S5 is entire
> > EPC being destroyed. I think keeping pm_cnt is not sufficient enough
> > to handle such case?
> > >
> > > This brings up one question though: how do we deal with VM host going to
> sleep?
> > > VM guest would not be aware of this.
> >
> > IMO VM just gets "sudden loss of EPC" after suspend & resume in host.
> > SGX driver and SDK should be able to handle "sudden loss of EPC", ie,
> > co-working together to re-establish the missing enclaves.
> >
> > Actually supporting "sudden loss of EPC" is a requirement to support
> > live migration of VM w/ SGX. Internally long time ago we had a
> > discussion and the decision was we should support SGX live migration given
> two facts:
> >
> > 1) losing platform-dependent is not important. For example, losing
> > sealing key is not a problem, as we could get secrets provisioned
> > again from remote. 2) Both windows & linux driver commit to support "sudden
> loss of EPC".
> >
> > I don't think we have to support in very first upstream driver, but I
> > think we need to support someday.
> 
> Actually, we can easily support this in the driver, at least for SGX1 hardware.

That's my guess too. Just want to check whether we are still on the same page :)

> SGX2 isn't difficult to handle, but we've intentionally postponed those patches
> until SGX1 support is in mainline[1].
> Accesses to the EPC after it is lost will cause faults.  Userspace EPC accesses, e.g.
> ERESUME, will get a SIGSEGV that the process should interpret as an "I should
> restart my enclave" event.  The SDK already does this.  In the driver, we just need
> to be aware of this potential behavior and not freak out.  Specifically, SGX_INVD
> needs to not WARN on faults that may have been due to a the EPC being nuked.
> I think we can even remove the sgx_encl_pm_notifier() code altogether.

Possibly we still need to do some cleanup, ie, all structures of enclaves, upon resume? 

Anyway I am just guessing.

Thanks,
-Kai

> 
> [1] SGX1 hardware signals a #GP on an access to an invalid EPC page.
>     SGX2 signals a #PF with the PF_SGX error code bit set.  This is
>     problematic because the kernel looks at the PTEs for CR2 and sees
>     nothing wrong, so it thinks it should just restart the
>     instruction, leading to an infinite fault loop.  Resolving this
>     is fairly straightforward, but a complete fix requires propagating
>     PF_SGX down to the ENCLS fixup handler, which means plumbing the
>     error code through the fixup handlers or smushing PF_SGX into
>     trapnr.  Since there is a parallel effort to plumb the error code
>     through the handlers, https://lkml.org/lkml/2018/8/6/924, we opted
>     to do this in a separate series.
> 
> > Sean,
> >
> > Would you be able to comment here?
> >
> > >
> > > I think the best measure would be to add a new parameter to
> > > sgx_einit() that enforces update of the MSRs. The driver can then
> > > set this parameter in the case when sgx_einit() returns
> > > SGX_INVALID_LICENSE. This is coherent because the driver requires
> > > writable MSRs. It would not be coherent to do it directly in the core because
> KVM does not require writable MSRs.
> >
> > IMHO this is not required, as I mentioned above.
> >
> > And
> > [snip...]
> >
> > Thanks,
> > -Kai
Sean Christopherson Aug. 29, 2018, 9:09 p.m. UTC | #6
On Wed, Aug 29, 2018 at 01:58:09PM -0700, Huang, Kai wrote:
> > -----Original Message-----
> > From: Christopherson, Sean J
> > Sent: Thursday, August 30, 2018 8:34 AM
> > To: Huang, Kai <kai.huang@intel.com>
> > Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>; platform-driver-
> > x86@vger.kernel.org; x86@kernel.org; nhorman@redhat.com; linux-
> > kernel@vger.kernel.org; tglx@linutronix.de; suresh.b.siddha@intel.com; Ayoun,
> > Serge <serge.ayoun@intel.com>; hpa@zytor.com; npmccallum@redhat.com;
> > mingo@redhat.com; linux-sgx@vger.kernel.org; Hansen, Dave
> > <dave.hansen@intel.com>
> > Subject: Re: [PATCH v13 10/13] x86/sgx: Add sgx_einit() for initializing enclaves
> > 
> > On Wed, Aug 29, 2018 at 12:33:54AM -0700, Huang, Kai wrote:
> > > [snip..]
> > >
> > > > > >
> > > > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);
> > > > > > static DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > > > +sgx_pm_cnt;
> > > > > > +
> > > > > > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx
> > > > > > +MSRs
> > > > > > for each
> > > > > > + * CPU. The entries are initialized when they are first used by
> > > > > > sgx_einit().
> > > > > > + */
> > > > > > +struct sgx_lepubkeyhash {
> > > > > > +	u64 msrs[4];
> > > > > > +	u64 pm_cnt;
> > > > >
> > > > > May I ask why do we need pm_cnt here? In fact why do we need
> > > > > suspend staff (namely, sgx_pm_cnt above, and related code in this
> > > > > patch) here in this patch? From the patch commit message I don't
> > > > > see why we need PM staff here. Please give comment why you need PM
> > > > > staff, or you may consider to split the PM staff to another patch.
> > > >
> > > > Refining the commit message probably makes more sense because
> > > > without PM code sgx_einit() would be broken. The MSRs have been reset
> > after waking up.
> > > >
> > > > Some kind of counter is required to keep track of the power cycle.
> > > > When going to sleep the sgx_pm_cnt is increased. sgx_einit()
> > > > compares the current value of the global count to the value in the
> > > > cache entry to see whether we are in a new power cycle.
> > >
> > > You mean reset to Intel default? I think we can also just reset the
> > > cached MSR values on each power cycle, which would be simpler, IMHO?
> > 
> > Refresh my brain, does hardware reset the MSRs on a transition to S3 or lower?
> > 
> > > I think we definitely need some code to handle S3-S5, but should be in
> > > separate patches, since I think the major impact of S3-S5 is entire
> > > EPC being destroyed. I think keeping pm_cnt is not sufficient enough
> > > to handle such case?
> > > >
> > > > This brings up one question though: how do we deal with VM host going to
> > sleep?
> > > > VM guest would not be aware of this.
> > >
> > > IMO VM just gets "sudden loss of EPC" after suspend & resume in host.
> > > SGX driver and SDK should be able to handle "sudden loss of EPC", ie,
> > > co-working together to re-establish the missing enclaves.
> > >
> > > Actually supporting "sudden loss of EPC" is a requirement to support
> > > live migration of VM w/ SGX. Internally long time ago we had a
> > > discussion and the decision was we should support SGX live migration given
> > two facts:
> > >
> > > 1) losing platform-dependent is not important. For example, losing
> > > sealing key is not a problem, as we could get secrets provisioned
> > > again from remote. 2) Both windows & linux driver commit to support "sudden
> > loss of EPC".
> > >
> > > I don't think we have to support in very first upstream driver, but I
> > > think we need to support someday.
> > 
> > Actually, we can easily support this in the driver, at least for SGX1 hardware.
> 
> That's my guess too. Just want to check whether we are still on the same page :)
> 
> > SGX2 isn't difficult to handle, but we've intentionally postponed those patches
> > until SGX1 support is in mainline[1].
> > Accesses to the EPC after it is lost will cause faults.  Userspace EPC accesses, e.g.
> > ERESUME, will get a SIGSEGV that the process should interpret as an "I should
> > restart my enclave" event.  The SDK already does this.  In the driver, we just need
> > to be aware of this potential behavior and not freak out.  Specifically, SGX_INVD
> > needs to not WARN on faults that may have been due to a the EPC being nuked.
> > I think we can even remove the sgx_encl_pm_notifier() code altogether.
> 
> Possibly we still need to do some cleanup, ie, all structures of enclaves, upon resume? 

Not for functional reasons.  The driver will automatically do the
cleanup via SGX_INVD when it next accesses the enclave's pages and
takes a fault, e.g. during reclaim.  Proactively reclaiming the EPC
pages would probably affect performance, though not necessarily in
a good way.  And I think it would be a beneficial to get the driver
out of the suspend/hibernate/resume paths, e.g. zapping all enclaves
could noticeably impact suspend/resume latency.
 
> Anyway I am just guessing.
> 
> Thanks,
> -Kai
> 
> > 
> > [1] SGX1 hardware signals a #GP on an access to an invalid EPC page.
> >     SGX2 signals a #PF with the PF_SGX error code bit set.  This is
> >     problematic because the kernel looks at the PTEs for CR2 and sees
> >     nothing wrong, so it thinks it should just restart the
> >     instruction, leading to an infinite fault loop.  Resolving this
> >     is fairly straightforward, but a complete fix requires propagating
> >     PF_SGX down to the ENCLS fixup handler, which means plumbing the
> >     error code through the fixup handlers or smushing PF_SGX into
> >     trapnr.  Since there is a parallel effort to plumb the error code
> >     through the handlers, https://lkml.org/lkml/2018/8/6/924, we opted
> >     to do this in a separate series.
> > 
> > > Sean,
> > >
> > > Would you be able to comment here?
> > >
> > > >
> > > > I think the best measure would be to add a new parameter to
> > > > sgx_einit() that enforces update of the MSRs. The driver can then
> > > > set this parameter in the case when sgx_einit() returns
> > > > SGX_INVALID_LICENSE. This is coherent because the driver requires
> > > > writable MSRs. It would not be coherent to do it directly in the core because
> > KVM does not require writable MSRs.
> > >
> > > IMHO this is not required, as I mentioned above.
> > >
> > > And
> > > [snip...]
> > >
> > > Thanks,
> > > -Kai
Huang, Kai Aug. 30, 2018, 1:45 a.m. UTC | #7
> > > > > Some kind of counter is required to keep track of the power cycle.
> > > > > When going to sleep the sgx_pm_cnt is increased. sgx_einit()
> > > > > compares the current value of the global count to the value in
> > > > > the cache entry to see whether we are in a new power cycle.
> > > >
> > > > You mean reset to Intel default? I think we can also just reset
> > > > the cached MSR values on each power cycle, which would be simpler,
> IMHO?
> > >
> > > Refresh my brain, does hardware reset the MSRs on a transition to S3 or
> lower?

Sorry I missed this one. To be honest I don't know. I checked the SDM and all I can find is:

"On reset, the default value is the digest of Intel's signing key."

Jarkko may know.

> > >
> > > > I think we definitely need some code to handle S3-S5, but should
> > > > be in separate patches, since I think the major impact of S3-S5 is
> > > > entire EPC being destroyed. I think keeping pm_cnt is not
> > > > sufficient enough to handle such case?
> > > > >
> > > > > This brings up one question though: how do we deal with VM host
> > > > > going to
> > > sleep?
> > > > > VM guest would not be aware of this.
> > > >
> > > > IMO VM just gets "sudden loss of EPC" after suspend & resume in host.
> > > > SGX driver and SDK should be able to handle "sudden loss of EPC",
> > > > ie, co-working together to re-establish the missing enclaves.
> > > >
> > > > Actually supporting "sudden loss of EPC" is a requirement to
> > > > support live migration of VM w/ SGX. Internally long time ago we
> > > > had a discussion and the decision was we should support SGX live
> > > > migration given
> > > two facts:
> > > >
> > > > 1) losing platform-dependent is not important. For example, losing
> > > > sealing key is not a problem, as we could get secrets provisioned
> > > > again from remote. 2) Both windows & linux driver commit to
> > > > support "sudden
> > > loss of EPC".
> > > >
> > > > I don't think we have to support in very first upstream driver,
> > > > but I think we need to support someday.
> > >
> > > Actually, we can easily support this in the driver, at least for SGX1 hardware.
> >
> > That's my guess too. Just want to check whether we are still on the
> > same page :)
> >
> > > SGX2 isn't difficult to handle, but we've intentionally postponed
> > > those patches until SGX1 support is in mainline[1].
> > > Accesses to the EPC after it is lost will cause faults.  Userspace EPC accesses,
> e.g.
> > > ERESUME, will get a SIGSEGV that the process should interpret as an
> > > "I should restart my enclave" event.  The SDK already does this.  In
> > > the driver, we just need to be aware of this potential behavior and
> > > not freak out.  Specifically, SGX_INVD needs to not WARN on faults that may
> have been due to a the EPC being nuked.
> > > I think we can even remove the sgx_encl_pm_notifier() code altogether.
> >
> > Possibly we still need to do some cleanup, ie, all structures of enclaves, upon
> resume?
> 
> Not for functional reasons.  The driver will automatically do the cleanup via
> SGX_INVD when it next accesses the enclave's pages and takes a fault, e.g.
> during reclaim.  Proactively reclaiming the EPC pages would probably affect
> performance, though not necessarily in a good way.  And I think it would be a
> beneficial to get the driver out of the suspend/hibernate/resume paths, e.g.
> zapping all enclaves could noticeably impact suspend/resume latency.

Sure.

Thanks,
-Kai
Jarkko Sakkinen Aug. 31, 2018, 12:17 p.m. UTC | #8
On Wed, Aug 29, 2018 at 07:33:54AM +0000, Huang, Kai wrote:
> [snip..]
> 
> > > >
> > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);  static
> > > > DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > +sgx_pm_cnt;
> > > > +
> > > > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx
> > > > +MSRs
> > > > for each
> > > > + * CPU. The entries are initialized when they are first used by
> > > > sgx_einit().
> > > > + */
> > > > +struct sgx_lepubkeyhash {
> > > > +	u64 msrs[4];
> > > > +	u64 pm_cnt;
> > >
> > > May I ask why do we need pm_cnt here? In fact why do we need suspend
> > > staff (namely, sgx_pm_cnt above, and related code in this patch) here
> > > in this patch? From the patch commit message I don't see why we need
> > > PM staff here. Please give comment why you need PM staff, or you may
> > > consider to split the PM staff to another patch.
> > 
> > Refining the commit message probably makes more sense because without PM
> > code sgx_einit() would be broken. The MSRs have been reset after waking up.
> > 
> > Some kind of counter is required to keep track of the power cycle. When going
> > to sleep the sgx_pm_cnt is increased. sgx_einit() compares the current value of
> > the global count to the value in the cache entry to see whether we are in a new
> > power cycle.
> 
> You mean reset to Intel default? I think we can also just reset the
> cached MSR values on each power cycle, which would be simpler, IMHO?

I don't really see that much difference in the complexity.

> I think we definitely need some code to handle S3-S5, but should be in
> separate patches, since I think the major impact of S3-S5 is entire
> EPC being destroyed. I think keeping pm_cnt is not sufficient enough
> to handle such case?

The driver has SGX_POWER_LOST_ENCLAVE for ioctls and it deletes the TCS
entries.

> > This brings up one question though: how do we deal with VM host going to sleep?
> > VM guest would not be aware of this.
> 
> IMO VM just gets "sudden loss of EPC" after suspend & resume in host.
> SGX driver and SDK should be able to handle "sudden loss of EPC", ie,
> co-working together to re-establish the missing enclaves.

This is not about EPC. It is already dealt by the driver. I'm concerned
about the MSR cache as it would mess up.

But I guess this logic is part of the KVM code anyway now that I think
more of it.

/Jarkko
Sean Christopherson Aug. 31, 2018, 5:43 p.m. UTC | #9
On Wed, Aug 29, 2018 at 06:45:29PM -0700, Huang, Kai wrote:
> > > > > > Some kind of counter is required to keep track of the power cycle.
> > > > > > When going to sleep the sgx_pm_cnt is increased. sgx_einit()
> > > > > > compares the current value of the global count to the value in
> > > > > > the cache entry to see whether we are in a new power cycle.
> > > > >
> > > > > You mean reset to Intel default? I think we can also just reset
> > > > > the cached MSR values on each power cycle, which would be simpler,
> > IMHO?
> > > >
> > > > Refresh my brain, does hardware reset the MSRs on a transition to S3 or
> > lower?
> 
> Sorry I missed this one. To be honest I don't know. I checked the SDM and all I can find is:
> 
> "On reset, the default value is the digest of Intel's signing key."

I confirmed the MSRs are reset any time the EPC is lost.  Not sure
what happens if the MSRs contained a non-Intel value but feature
control is locked with SGX launch control disabled.  I'll post an
update when I have an answer.
 
> Jarkko may know.
> 
> > > >
> > > > > I think we definitely need some code to handle S3-S5, but should
> > > > > be in separate patches, since I think the major impact of S3-S5 is
> > > > > entire EPC being destroyed. I think keeping pm_cnt is not
> > > > > sufficient enough to handle such case?
> > > > > >
> > > > > > This brings up one question though: how do we deal with VM host
> > > > > > going to
> > > > sleep?
> > > > > > VM guest would not be aware of this.
> > > > >
> > > > > IMO VM just gets "sudden loss of EPC" after suspend & resume in host.
> > > > > SGX driver and SDK should be able to handle "sudden loss of EPC",
> > > > > ie, co-working together to re-establish the missing enclaves.
> > > > >
> > > > > Actually supporting "sudden loss of EPC" is a requirement to
> > > > > support live migration of VM w/ SGX. Internally long time ago we
> > > > > had a discussion and the decision was we should support SGX live
> > > > > migration given
> > > > two facts:
> > > > >
> > > > > 1) losing platform-dependent is not important. For example, losing
> > > > > sealing key is not a problem, as we could get secrets provisioned
> > > > > again from remote. 2) Both windows & linux driver commit to
> > > > > support "sudden
> > > > loss of EPC".
> > > > >
> > > > > I don't think we have to support in very first upstream driver,
> > > > > but I think we need to support someday.
> > > >
> > > > Actually, we can easily support this in the driver, at least for SGX1 hardware.
> > >
> > > That's my guess too. Just want to check whether we are still on the
> > > same page :)
> > >
> > > > SGX2 isn't difficult to handle, but we've intentionally postponed
> > > > those patches until SGX1 support is in mainline[1].
> > > > Accesses to the EPC after it is lost will cause faults.  Userspace EPC accesses,
> > e.g.
> > > > ERESUME, will get a SIGSEGV that the process should interpret as an
> > > > "I should restart my enclave" event.  The SDK already does this.  In
> > > > the driver, we just need to be aware of this potential behavior and
> > > > not freak out.  Specifically, SGX_INVD needs to not WARN on faults that may
> > have been due to a the EPC being nuked.
> > > > I think we can even remove the sgx_encl_pm_notifier() code altogether.
> > >
> > > Possibly we still need to do some cleanup, ie, all structures of enclaves, upon
> > resume?
> > 
> > Not for functional reasons.  The driver will automatically do the cleanup via
> > SGX_INVD when it next accesses the enclave's pages and takes a fault, e.g.
> > during reclaim.  Proactively reclaiming the EPC pages would probably affect
> > performance, though not necessarily in a good way.  And I think it would be a
> > beneficial to get the driver out of the suspend/hibernate/resume paths, e.g.
> > zapping all enclaves could noticeably impact suspend/resume latency.
> 
> Sure.
> 
> Thanks,
> -Kai
>
Sean Christopherson Aug. 31, 2018, 6:15 p.m. UTC | #10
On Fri, Aug 31, 2018 at 03:17:03PM +0300, Jarkko Sakkinen wrote:
> On Wed, Aug 29, 2018 at 07:33:54AM +0000, Huang, Kai wrote:
> > [snip..]
> > 
> > > > >
> > > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);  static
> > > > > DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > > +sgx_pm_cnt;
> > > > > +
> > > > > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx
> > > > > +MSRs
> > > > > for each
> > > > > + * CPU. The entries are initialized when they are first used by
> > > > > sgx_einit().
> > > > > + */
> > > > > +struct sgx_lepubkeyhash {
> > > > > +	u64 msrs[4];
> > > > > +	u64 pm_cnt;
> > > >
> > > > May I ask why do we need pm_cnt here? In fact why do we need suspend
> > > > staff (namely, sgx_pm_cnt above, and related code in this patch) here
> > > > in this patch? From the patch commit message I don't see why we need
> > > > PM staff here. Please give comment why you need PM staff, or you may
> > > > consider to split the PM staff to another patch.
> > > 
> > > Refining the commit message probably makes more sense because without PM
> > > code sgx_einit() would be broken. The MSRs have been reset after waking up.
> > > 
> > > Some kind of counter is required to keep track of the power cycle. When going
> > > to sleep the sgx_pm_cnt is increased. sgx_einit() compares the current value of
> > > the global count to the value in the cache entry to see whether we are in a new
> > > power cycle.
> > 
> > You mean reset to Intel default? I think we can also just reset the
> > cached MSR values on each power cycle, which would be simpler, IMHO?
> 
> I don't really see that much difference in the complexity.

Tracking the validity of the cache means we're hosed if we miss any
condition that causes the MSRs to be reset.  I think we're better off
assuming the cache can be stale at any time, i.e. don't track power
cyles and instead handle EINIT failure due to INVALID_TOKEN by writing
the cache+MSRs with the desired hash and retrying EINIT.  EINIT is
interruptible and its latency is extremely variable in any case, e.g.
tens of thousands of cycles, so this rarely-hit "slow path" probably
wouldn't affect the worst case latency of EINIT.
 
> > I think we definitely need some code to handle S3-S5, but should be in
> > separate patches, since I think the major impact of S3-S5 is entire
> > EPC being destroyed. I think keeping pm_cnt is not sufficient enough
> > to handle such case?
> 
> The driver has SGX_POWER_LOST_ENCLAVE for ioctls and it deletes the TCS
> entries.
> 
> > > This brings up one question though: how do we deal with VM host going to sleep?
> > > VM guest would not be aware of this.
> > 
> > IMO VM just gets "sudden loss of EPC" after suspend & resume in host.
> > SGX driver and SDK should be able to handle "sudden loss of EPC", ie,
> > co-working together to re-establish the missing enclaves.
> 
> This is not about EPC. It is already dealt by the driver. I'm concerned
> about the MSR cache as it would mess up.
> 
> But I guess this logic is part of the KVM code anyway now that I think
> more of it.

Ya, I expect that VMMs will preserve VM's virtual pubkey MSRs, though
there might be scenarios where a VM has direct access to the hardware
MSRs.  This is probably a moot point since I don't think we want to
assume the kernel's cache is 100% accurate, regardless of environment.
Dr. Greg Aug. 31, 2018, 9:34 p.m. UTC | #11
On Fri, Aug 31, 2018 at 10:43:30AM -0700, Sean Christopherson wrote:

Good afternoon to everyone.

> > Sorry I missed this one. To be honest I don't know. I checked the
> > SDM and all I can find is:
> >
> > "On reset, the default value is the digest of Intel's signing key."

> I confirmed the MSRs are reset any time the EPC is lost.  Not sure
> what happens if the MSRs contained a non-Intel value but feature
> control is locked with SGX launch control disabled.  I'll post an
> update when I have an answer.

It was our interpretation from the SDM that the identity modulus
signature MSR's are 'trap-door' registers.  If flexible launch control
(FLC) is enabled the platform has one opportunity to write a new
signature value, after which the registers are locked from
modification until the next platform reset.

From a security architecture perspective it seemed that an FLC based
SGX implementation would use a modified version of TBOOT to securely
write that register once per platform boot/reset.  The architecture
that is being discussed where there is a need to continually check
whether or not the correct root signing key is loaded sounds a bit
clunky at best.

At worst it has potential security implications since it is the
reponsibility of the enclave launch control infrastructure to control
which enclaves are allowed to have the PROVISION_KEY attribute bit
set.

Have a good weekend.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Extensive interviews show that not one alcoholic has ever actually seen
 a pink elephant."
                                -- Yale University
                                   Center of Alcohol Studies
Jann Horn Sept. 3, 2018, 1:53 p.m. UTC | #12
On Mon, Sep 3, 2018 at 3:33 PM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> From: Sean Christopherson <sean.j.christopherson@intel.com>
>
> Add a function to perform ENCLS(EINIT), which initializes an enclave,
> which can be used by a driver for running enclaves and VMMs.
>
> Writing the LE hash MSRs is extraordinarily expensive, e.g. 3-4x slower
> than normal MSRs, so we use a per-cpu cache to track the last known value
> of the MSRs to avoid unnecessarily writing the MSRs with the current value.
>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
[...]
> +/**
> + * sgx_einit - initialize an enclave
> + * @sigstruct:         a pointer to the SIGSTRUCT
> + * @token:             a pointer to the EINITTOKEN
> + * @secs_page:         a pointer to the SECS EPC page
> + * @lepubkeyhash:      the desired value for IA32_SGXLEPUBKEYHASHx MSRs
> + *
> + * Try to perform EINIT operation. If the MSRs are writable, they are updated
> + * according to @lepubkeyhash.
> + *
> + * Return:
> + *   0 on success,
> + *   -errno on failure
> + *   SGX error code if EINIT fails
> + */
> +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
> +             struct sgx_epc_page *secs_page, u64 lepubkeyhash[4])
> +{
> +       struct sgx_lepubkeyhash __percpu *cache;
> +       bool cache_valid;
> +       int i, ret;
> +
> +       if (!sgx_lc_enabled)
> +               return __einit(sigstruct, token, sgx_epc_addr(secs_page));
> +
> +       cache = per_cpu(sgx_lepubkeyhash_cache, smp_processor_id());

At this point, preemption must be off, because smp_processor_id() is
called; I don't think it is off here? If you have hardware/emulation
on which you can test this, you may want to test your patches with
DEBUG_PREEMPT enabled.

> +       if (!cache) {
> +               cache = kzalloc(sizeof(struct sgx_lepubkeyhash), GFP_KERNEL);

But then here you do a GFP_KERNEL allocation, which can sleep.

Also: After "cache" has been allocated in this branch, when do you
store the reference to it? As far as I can tell, you never write to
sgx_lepubkeyhash_cache, and the allocation just leaks.

> +               if (!cache)
> +                       return -ENOMEM;
> +       }
> +
> +       cache_valid = cache->pm_cnt == sgx_pm_cnt;

The cache should probably not be treated as valid if it has just been
created and only contains zeroes, right?

> +       cache->pm_cnt = sgx_pm_cnt;

Can sgx_pm_cnt be modified concurrently? If so, please use at least
READ_ONCE() to document that and prevent the compiler from doing weird
stuff.

> +       preempt_disable();

And here you turn off preemption, but it should already have been off?

> +       for (i = 0; i < 4; i++) {
> +               if (cache_valid && lepubkeyhash[i] == cache->msrs[i])
> +                       continue;
> +
> +               wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
> +               cache->msrs[i] = lepubkeyhash[i];
> +       }
> +       ret = __einit(sigstruct, token, sgx_epc_addr(secs_page));
> +       preempt_enable();
> +       return ret;
> +}
> +EXPORT_SYMBOL(sgx_einit);
> +
Jarkko Sakkinen Sept. 3, 2018, 6:15 p.m. UTC | #13
On Thu, Aug 30, 2018 at 01:45:29AM +0000, Huang, Kai wrote:
> > > > Refresh my brain, does hardware reset the MSRs on a transition to S3 or
> > lower?
> 
> Sorry I missed this one. To be honest I don't know. I checked the SDM and all I can find is:
> 
> "On reset, the default value is the digest of Intel's signing key."
> 
> Jarkko may know.

I found this out by testing. The cached MSR values stop working after
waking up from S3. Have not found anything better from the SDM but it
is the behavior that I've observed on my GLK NUC at least.

/Jarkko
Jarkko Sakkinen Sept. 3, 2018, 7:19 p.m. UTC | #14
On Fri, Aug 31, 2018 at 11:15:09AM -0700, Sean Christopherson wrote:
> On Fri, Aug 31, 2018 at 03:17:03PM +0300, Jarkko Sakkinen wrote:
> > On Wed, Aug 29, 2018 at 07:33:54AM +0000, Huang, Kai wrote:
> > > [snip..]
> > > 
> > > > > >
> > > > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);  static
> > > > > > DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > > > +sgx_pm_cnt;
> > > > > > +
> > > > > > +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx
> > > > > > +MSRs
> > > > > > for each
> > > > > > + * CPU. The entries are initialized when they are first used by
> > > > > > sgx_einit().
> > > > > > + */
> > > > > > +struct sgx_lepubkeyhash {
> > > > > > +	u64 msrs[4];
> > > > > > +	u64 pm_cnt;
> > > > >
> > > > > May I ask why do we need pm_cnt here? In fact why do we need suspend
> > > > > staff (namely, sgx_pm_cnt above, and related code in this patch) here
> > > > > in this patch? From the patch commit message I don't see why we need
> > > > > PM staff here. Please give comment why you need PM staff, or you may
> > > > > consider to split the PM staff to another patch.
> > > > 
> > > > Refining the commit message probably makes more sense because without PM
> > > > code sgx_einit() would be broken. The MSRs have been reset after waking up.
> > > > 
> > > > Some kind of counter is required to keep track of the power cycle. When going
> > > > to sleep the sgx_pm_cnt is increased. sgx_einit() compares the current value of
> > > > the global count to the value in the cache entry to see whether we are in a new
> > > > power cycle.
> > > 
> > > You mean reset to Intel default? I think we can also just reset the
> > > cached MSR values on each power cycle, which would be simpler, IMHO?
> > 
> > I don't really see that much difference in the complexity.
> 
> Tracking the validity of the cache means we're hosed if we miss any
> condition that causes the MSRs to be reset.  I think we're better off
> assuming the cache can be stale at any time, i.e. don't track power
> cyles and instead handle EINIT failure due to INVALID_TOKEN by writing
> the cache+MSRs with the desired hash and retrying EINIT.  EINIT is
> interruptible and its latency is extremely variable in any case, e.g.
> tens of thousands of cycles, so this rarely-hit "slow path" probably
> wouldn't affect the worst case latency of EINIT.

Sounds a good refiniment. Pretty good solution to heal from host sleep
on the guest VM and then there is no need for driver changes.

/Jarkko
Jarkko Sakkinen Sept. 3, 2018, 7:27 p.m. UTC | #15
On Fri, Aug 31, 2018 at 04:34:45PM -0500, Dr. Greg wrote:
> On Fri, Aug 31, 2018 at 10:43:30AM -0700, Sean Christopherson wrote:
> 
> Good afternoon to everyone.
> 
> > > Sorry I missed this one. To be honest I don't know. I checked the
> > > SDM and all I can find is:
> > >
> > > "On reset, the default value is the digest of Intel's signing key."
> 
> > I confirmed the MSRs are reset any time the EPC is lost.  Not sure
> > what happens if the MSRs contained a non-Intel value but feature
> > control is locked with SGX launch control disabled.  I'll post an
> > update when I have an answer.
> 
> It was our interpretation from the SDM that the identity modulus
> signature MSR's are 'trap-door' registers.  If flexible launch control
> (FLC) is enabled the platform has one opportunity to write a new
> signature value, after which the registers are locked from
> modification until the next platform reset.

In the driver we support only MSRs that are left writable by the BIOS
before locking the feature control.

> From a security architecture perspective it seemed that an FLC based
> SGX implementation would use a modified version of TBOOT to securely
> write that register once per platform boot/reset.  The architecture
> that is being discussed where there is a need to continually check
> whether or not the correct root signing key is loaded sounds a bit
> clunky at best.
> 
> At worst it has potential security implications since it is the
> reponsibility of the enclave launch control infrastructure to control
> which enclaves are allowed to have the PROVISION_KEY attribute bit
> set.

Based on the previous feedback supporting read-only MSRs in the driver
is an unwanted feature i.e. the kernel must be able to decide what gets
lauched (i.e. no launch enclave).

> Have a good weekend.
> 
> Dr. Greg
> 
> As always,
> Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
> 4206 N. 19th Ave.           Specializing in information infra-structure
> Fargo, ND  58102            development.
> PH: 701-281-1686
> FAX: 701-281-3949           EMAIL: greg@enjellic.com
> ------------------------------------------------------------------------------
> "Extensive interviews show that not one alcoholic has ever actually seen
>  a pink elephant."
>                                 -- Yale University
>                                    Center of Alcohol Studies

/Jarkko
Huang, Kai Sept. 3, 2018, 11:45 p.m. UTC | #16
> -----Original Message-----
> From: linux-sgx-owner@vger.kernel.org [mailto:linux-sgx-
> owner@vger.kernel.org] On Behalf Of Jarkko Sakkinen
> Sent: Tuesday, September 4, 2018 7:19 AM
> To: Christopherson, Sean J <sean.j.christopherson@intel.com>
> Cc: Huang, Kai <kai.huang@intel.com>; platform-driver-x86@vger.kernel.org;
> x86@kernel.org; nhorman@redhat.com; linux-kernel@vger.kernel.org;
> tglx@linutronix.de; suresh.b.siddha@intel.com; Ayoun, Serge
> <serge.ayoun@intel.com>; hpa@zytor.com; npmccallum@redhat.com;
> mingo@redhat.com; linux-sgx@vger.kernel.org; Hansen, Dave
> <dave.hansen@intel.com>
> Subject: Re: [PATCH v13 10/13] x86/sgx: Add sgx_einit() for initializing enclaves
> 
> On Fri, Aug 31, 2018 at 11:15:09AM -0700, Sean Christopherson wrote:
> > On Fri, Aug 31, 2018 at 03:17:03PM +0300, Jarkko Sakkinen wrote:
> > > On Wed, Aug 29, 2018 at 07:33:54AM +0000, Huang, Kai wrote:
> > > > [snip..]
> > > >
> > > > > > >
> > > > > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);
> > > > > > > static DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > > > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > > > > +sgx_pm_cnt;
> > > > > > > +
> > > > > > > +/* The cache for the last known values of
> > > > > > > +IA32_SGXLEPUBKEYHASHx MSRs
> > > > > > > for each
> > > > > > > + * CPU. The entries are initialized when they are first
> > > > > > > + used by
> > > > > > > sgx_einit().
> > > > > > > + */
> > > > > > > +struct sgx_lepubkeyhash {
> > > > > > > +	u64 msrs[4];
> > > > > > > +	u64 pm_cnt;
> > > > > >
> > > > > > May I ask why do we need pm_cnt here? In fact why do we need
> > > > > > suspend staff (namely, sgx_pm_cnt above, and related code in
> > > > > > this patch) here in this patch? From the patch commit message
> > > > > > I don't see why we need PM staff here. Please give comment why
> > > > > > you need PM staff, or you may consider to split the PM staff to another
> patch.
> > > > >
> > > > > Refining the commit message probably makes more sense because
> > > > > without PM code sgx_einit() would be broken. The MSRs have been reset
> after waking up.
> > > > >
> > > > > Some kind of counter is required to keep track of the power
> > > > > cycle. When going to sleep the sgx_pm_cnt is increased.
> > > > > sgx_einit() compares the current value of the global count to
> > > > > the value in the cache entry to see whether we are in a new power cycle.
> > > >
> > > > You mean reset to Intel default? I think we can also just reset
> > > > the cached MSR values on each power cycle, which would be simpler,
> IMHO?
> > >
> > > I don't really see that much difference in the complexity.
> >
> > Tracking the validity of the cache means we're hosed if we miss any
> > condition that causes the MSRs to be reset.  I think we're better off
> > assuming the cache can be stale at any time, i.e. don't track power
> > cyles and instead handle EINIT failure due to INVALID_TOKEN by writing
> > the cache+MSRs with the desired hash and retrying EINIT.  EINIT is
> > interruptible and its latency is extremely variable in any case, e.g.
> > tens of thousands of cycles, so this rarely-hit "slow path" probably
> > wouldn't affect the worst case latency of EINIT.
> 
> Sounds a good refiniment. Pretty good solution to heal from host sleep on the
> guest VM and then there is no need for driver changes.

To me either way should be OK, keeping MSR cache or retrying EINIT, since EINIT should not be in performance critical path I think.

But INVALID_TOKEN is not only returned when MSRs are mismatched, so do you plan to check to rule out other cases that cause INVALID_TOKEN before retrying EINIT, or unconditionally  retry EINIT? And we should only retry once?

Thanks,
-Kai
> 
> /Jarkko
Jarkko Sakkinen Sept. 4, 2018, 9:55 a.m. UTC | #17
On Mon, Sep 03, 2018 at 03:53:24PM +0200, Jann Horn wrote:
> On Mon, Sep 3, 2018 at 3:33 PM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> >
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> >
> > Add a function to perform ENCLS(EINIT), which initializes an enclave,
> > which can be used by a driver for running enclaves and VMMs.
> >
> > Writing the LE hash MSRs is extraordinarily expensive, e.g. 3-4x slower
> > than normal MSRs, so we use a per-cpu cache to track the last known value
> > of the MSRs to avoid unnecessarily writing the MSRs with the current value.
> >
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Co-developed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> [...]
> > +/**
> > + * sgx_einit - initialize an enclave
> > + * @sigstruct:         a pointer to the SIGSTRUCT
> > + * @token:             a pointer to the EINITTOKEN
> > + * @secs_page:         a pointer to the SECS EPC page
> > + * @lepubkeyhash:      the desired value for IA32_SGXLEPUBKEYHASHx MSRs
> > + *
> > + * Try to perform EINIT operation. If the MSRs are writable, they are updated
> > + * according to @lepubkeyhash.
> > + *
> > + * Return:
> > + *   0 on success,
> > + *   -errno on failure
> > + *   SGX error code if EINIT fails
> > + */
> > +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
> > +             struct sgx_epc_page *secs_page, u64 lepubkeyhash[4])
> > +{
> > +       struct sgx_lepubkeyhash __percpu *cache;
> > +       bool cache_valid;
> > +       int i, ret;
> > +
> > +       if (!sgx_lc_enabled)
> > +               return __einit(sigstruct, token, sgx_epc_addr(secs_page));
> > +
> > +       cache = per_cpu(sgx_lepubkeyhash_cache, smp_processor_id());
> 
> At this point, preemption must be off, because smp_processor_id() is
> called; I don't think it is off here? If you have hardware/emulation
> on which you can test this, you may want to test your patches with
> DEBUG_PREEMPT enabled.

Yeah, it really should. Thanks for spotting.

> 
> > +       if (!cache) {
> > +               cache = kzalloc(sizeof(struct sgx_lepubkeyhash), GFP_KERNEL);
> 
> But then here you do a GFP_KERNEL allocation, which can sleep.

Yes, of course this would need to be moved outside of the region where
pre-emption is disabled.

> Also: After "cache" has been allocated in this branch, when do you
> store the reference to it? As far as I can tell, you never write to
> sgx_lepubkeyhash_cache, and the allocation just leaks.

I have assignment in my local tree but for some reason it was not
squashed to this commit :-/

> 
> > +               if (!cache)
> > +                       return -ENOMEM;
> > +       }
> > +
> > +       cache_valid = cache->pm_cnt == sgx_pm_cnt;
> 
> The cache should probably not be treated as valid if it has just been
> created and only contains zeroes, right?

The name of the local variable is probably a bit misleading but also
cached values are compared in the loop.

> 
> > +       cache->pm_cnt = sgx_pm_cnt;
> 
> Can sgx_pm_cnt be modified concurrently? If so, please use at least
> READ_ONCE() to document that and prevent the compiler from doing weird
> stuff.

No it cannot.

> 
> > +       preempt_disable();
> 
> And here you turn off preemption, but it should already have been off?

Yes.

I think Sean's suggestion to update cache on SGX_INVALID_TOKEN is way to
go in this and instead of fixing this I'll change the code to use that
as a measure to update the cache.

/Jarkko
Sean Christopherson Sept. 4, 2018, 2:54 p.m. UTC | #18
On Mon, Sep 03, 2018 at 04:45:14PM -0700, Huang, Kai wrote:
> > -----Original Message-----
> > From: linux-sgx-owner@vger.kernel.org [mailto:linux-sgx-
> > owner@vger.kernel.org] On Behalf Of Jarkko Sakkinen
> > Sent: Tuesday, September 4, 2018 7:19 AM
> > To: Christopherson, Sean J <sean.j.christopherson@intel.com>
> > Cc: Huang, Kai <kai.huang@intel.com>; platform-driver-x86@vger.kernel.org;
> > x86@kernel.org; nhorman@redhat.com; linux-kernel@vger.kernel.org;
> > tglx@linutronix.de; suresh.b.siddha@intel.com; Ayoun, Serge
> > <serge.ayoun@intel.com>; hpa@zytor.com; npmccallum@redhat.com;
> > mingo@redhat.com; linux-sgx@vger.kernel.org; Hansen, Dave
> > <dave.hansen@intel.com>
> > Subject: Re: [PATCH v13 10/13] x86/sgx: Add sgx_einit() for initializing enclaves
> > 
> > On Fri, Aug 31, 2018 at 11:15:09AM -0700, Sean Christopherson wrote:
> > > On Fri, Aug 31, 2018 at 03:17:03PM +0300, Jarkko Sakkinen wrote:
> > > > On Wed, Aug 29, 2018 at 07:33:54AM +0000, Huang, Kai wrote:
> > > > > [snip..]
> > > > >
> > > > > > > >
> > > > > > > > @@ -38,6 +39,18 @@ static LIST_HEAD(sgx_active_page_list);
> > > > > > > > static DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > > > > > > >  static struct task_struct *ksgxswapd_tsk;  static
> > > > > > > > DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > > > > > > > +static struct notifier_block sgx_pm_notifier; static u64
> > > > > > > > +sgx_pm_cnt;
> > > > > > > > +
> > > > > > > > +/* The cache for the last known values of
> > > > > > > > +IA32_SGXLEPUBKEYHASHx MSRs
> > > > > > > > for each
> > > > > > > > + * CPU. The entries are initialized when they are first
> > > > > > > > + used by
> > > > > > > > sgx_einit().
> > > > > > > > + */
> > > > > > > > +struct sgx_lepubkeyhash {
> > > > > > > > +	u64 msrs[4];
> > > > > > > > +	u64 pm_cnt;
> > > > > > >
> > > > > > > May I ask why do we need pm_cnt here? In fact why do we need
> > > > > > > suspend staff (namely, sgx_pm_cnt above, and related code in
> > > > > > > this patch) here in this patch? From the patch commit message
> > > > > > > I don't see why we need PM staff here. Please give comment why
> > > > > > > you need PM staff, or you may consider to split the PM staff to another
> > patch.
> > > > > >
> > > > > > Refining the commit message probably makes more sense because
> > > > > > without PM code sgx_einit() would be broken. The MSRs have been reset
> > after waking up.
> > > > > >
> > > > > > Some kind of counter is required to keep track of the power
> > > > > > cycle. When going to sleep the sgx_pm_cnt is increased.
> > > > > > sgx_einit() compares the current value of the global count to
> > > > > > the value in the cache entry to see whether we are in a new power cycle.
> > > > >
> > > > > You mean reset to Intel default? I think we can also just reset
> > > > > the cached MSR values on each power cycle, which would be simpler,
> > IMHO?
> > > >
> > > > I don't really see that much difference in the complexity.
> > >
> > > Tracking the validity of the cache means we're hosed if we miss any
> > > condition that causes the MSRs to be reset.  I think we're better off
> > > assuming the cache can be stale at any time, i.e. don't track power
> > > cyles and instead handle EINIT failure due to INVALID_TOKEN by writing
> > > the cache+MSRs with the desired hash and retrying EINIT.  EINIT is
> > > interruptible and its latency is extremely variable in any case, e.g.
> > > tens of thousands of cycles, so this rarely-hit "slow path" probably
> > > wouldn't affect the worst case latency of EINIT.
> > 
> > Sounds a good refiniment. Pretty good solution to heal from host sleep on the
> > guest VM and then there is no need for driver changes.
> 
> To me either way should be OK, keeping MSR cache or retrying EINIT, since EINIT should not be in performance critical path I think.
> 
> But INVALID_TOKEN is not only returned when MSRs are mismatched, so do you plan to check to rule out other cases that cause INVALID_TOKEN before retrying EINIT, or unconditionally  retry EINIT? And we should only retry once?

I don't see any value in trying to rule out specific causes of
INVALID_TOKEN, but we should only retry EINIT if ret==INVALID_TOKEN
and RDMSR(HASH0) != sgx_lepubkeyhash[0].  Only the first MSR needs to
be checked for validity as they're a package deal, i.e. they'll all be
valid or all be reset.  There shouldn't be a limit on retry attempts,
e.g. the MSRs could theoretically be reset between WRMSR and EINIT.

> 
> Thanks,
> -Kai
> > 
> > /Jarkko
Jarkko Sakkinen Sept. 4, 2018, 3:26 p.m. UTC | #19
On Mon, Sep 03, 2018 at 11:45:14PM +0000, Huang, Kai wrote:
> But INVALID_TOKEN is not only returned when MSRs are mismatched, so do
> you plan to check to rule out other cases that cause INVALID_TOKEN
> before retrying EINIT, or unconditionally  retry EINIT? And we should
> only retry once?

In the case of this error we will do wrmsrs and retry einit once.
This is how I understood it at least.

Q: Do we have to care about VMM sleep anyway? I mean if VMM always traps
EINIT it can write the MSRs with guest values if it has been sleeping.

If the answer is no, then retrying once should be a complete solution
and we don't need pm_cnt.

/Jarkko
Jarkko Sakkinen Sept. 4, 2018, 3:30 p.m. UTC | #20
On Tue, Sep 04, 2018 at 07:54:51AM -0700, Sean Christopherson wrote:
> I don't see any value in trying to rule out specific causes of
> INVALID_TOKEN, but we should only retry EINIT if ret==INVALID_TOKEN
> and RDMSR(HASH0) != sgx_lepubkeyhash[0].  Only the first MSR needs to
> be checked for validity as they're a package deal, i.e. they'll all be
> valid or all be reset.  There shouldn't be a limit on retry attempts,
> e.g. the MSRs could theoretically be reset between WRMSR and EINIT.

Why is doing rdmsrs necessary? With the INVALID_TOKEN error we know we
are out-of-sync i.e. have been sleeping and then one just needs to do
wrmsrs.

I think one retry should be enough given that VMM traps EINIT. One retry
is needed to take care of the guest itself (or host if we are running on
bare metal) having been in a sleep state.

/Jarkko
Andy Shevchenko Sept. 4, 2018, 4:05 p.m. UTC | #21
On Mon, Aug 27, 2018 at 9:58 PM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> From: Sean Christopherson <sean.j.christopherson@intel.com>
>
> Add a function to perform ENCLS(EINIT), which initializes an enclave,
> which can be used by a driver for running enclaves and VMMs.
>
> Writing the LE hash MSRs is extraordinarily expensive, e.g. 3-4x slower
> than normal MSRs, so we use a per-cpu cache to track the last known value
> of the MSRs to avoid unnecessarily writing the MSRs with the current value.
>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>

> +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
> +             struct sgx_epc_page *secs_page, u64 lepubkeyhash[4]);

This [4] doesn't make any sense in a C when used in function call
parameter list.

> +/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx MSRs for each
> + * CPU. The entries are initialized when they are first used by sgx_einit().
> + */
> +struct sgx_lepubkeyhash {
> +       u64 msrs[4];

4 is MAGIC.

> +       u64 pm_cnt;
> +};

> +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
> +             struct sgx_epc_page *secs_page, u64 lepubkeyhash[4])

Same two comments about [4].

> +{

> +       cache = per_cpu(sgx_lepubkeyhash_cache, smp_processor_id());

> +       if (!cache) {

How often it's being expected to happen?

> +               cache = kzalloc(sizeof(struct sgx_lepubkeyhash), GFP_KERNEL);
> +               if (!cache)
> +                       return -ENOMEM;
> +       }

> +       for (i = 0; i < 4; i++) {

Same MAGIC?

> +       }

> +}
Sean Christopherson Sept. 4, 2018, 4:35 p.m. UTC | #22
On Tue, Sep 04, 2018 at 06:30:21PM +0300, Jarkko Sakkinen wrote:
> On Tue, Sep 04, 2018 at 07:54:51AM -0700, Sean Christopherson wrote:
> > I don't see any value in trying to rule out specific causes of
> > INVALID_TOKEN, but we should only retry EINIT if ret==INVALID_TOKEN
> > and RDMSR(HASH0) != sgx_lepubkeyhash[0].  Only the first MSR needs to
> > be checked for validity as they're a package deal, i.e. they'll all be
> > valid or all be reset.  There shouldn't be a limit on retry attempts,
> > e.g. the MSRs could theoretically be reset between WRMSR and EINIT.
> 
> Why is doing rdmsrs necessary? With the INVALID_TOKEN error we know we
> are out-of-sync i.e. have been sleeping and then one just needs to do
> wrmsrs.

As Kai mentioned, INVALID_TOKEN is returned for other reasons, e.g. a
production enclave trying to use a debug token or reserved bits set in
the token.  And in the KVM case, the hash and token are provided by
the guest, so it's entirely possible the enclave/token is not signed
with the key specified in the hash.  RDMSR is relatively inexpensive
compared to the overall cost of EINIT.  Though of course EINIT failure
isn't exactly a fast path, so I'm ok if you want to opt for simplicity
and retry on INVALID_TOKEN without checking the MSRs, just make sure
to add a comment indicating we're intentionally not checking the MSRs.
 
> I think one retry should be enough given that VMM traps EINIT. One retry
> is needed to take care of the guest itself (or host if we are running on
> bare metal) having been in a sleep state.

Assuming we do RDMSR(hash0), that should be sufficient to prevent
infinite retry and it protects against the MSRs being lost between
WRMSR and EINIT during retry.  That being said, I'm ok retrying only
once, especially if you want to omit the RDMSR.  Disabling preemption
should prevent the kernel from suspending between WRMSR and EINIT,
I'm just being paranoid.
Huang, Kai Sept. 4, 2018, 10:13 p.m. UTC | #23
> -----Original Message-----
> From: platform-driver-x86-owner@vger.kernel.org [mailto:platform-driver-x86-
> owner@vger.kernel.org] On Behalf Of Sean Christopherson
> Sent: Wednesday, September 5, 2018 4:36 AM
> To: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> Cc: Huang, Kai <kai.huang@intel.com>; platform-driver-x86@vger.kernel.org;
> x86@kernel.org; nhorman@redhat.com; linux-kernel@vger.kernel.org;
> tglx@linutronix.de; suresh.b.siddha@intel.com; Ayoun, Serge
> <serge.ayoun@intel.com>; hpa@zytor.com; npmccallum@redhat.com;
> mingo@redhat.com; linux-sgx@vger.kernel.org; Hansen, Dave
> <dave.hansen@intel.com>
> Subject: Re: [PATCH v13 10/13] x86/sgx: Add sgx_einit() for initializing enclaves
> 
> On Tue, Sep 04, 2018 at 06:30:21PM +0300, Jarkko Sakkinen wrote:
> > On Tue, Sep 04, 2018 at 07:54:51AM -0700, Sean Christopherson wrote:
> > > I don't see any value in trying to rule out specific causes of
> > > INVALID_TOKEN, but we should only retry EINIT if ret==INVALID_TOKEN
> > > and RDMSR(HASH0) != sgx_lepubkeyhash[0].  Only the first MSR needs
> > > to be checked for validity as they're a package deal, i.e. they'll
> > > all be valid or all be reset.  There shouldn't be a limit on retry
> > > attempts, e.g. the MSRs could theoretically be reset between WRMSR and
> EINIT.
> >
> > Why is doing rdmsrs necessary? With the INVALID_TOKEN error we know we
> > are out-of-sync i.e. have been sleeping and then one just needs to do
> > wrmsrs.
> 
> As Kai mentioned, INVALID_TOKEN is returned for other reasons, e.g. a
> production enclave trying to use a debug token or reserved bits set in the token.
> And in the KVM case, the hash and token are provided by the guest, so it's
> entirely possible the enclave/token is not signed with the key specified in the
> hash.  RDMSR is relatively inexpensive compared to the overall cost of EINIT.
> Though of course EINIT failure isn't exactly a fast path, so I'm ok if you want to
> opt for simplicity and retry on INVALID_TOKEN without checking the MSRs, just
> make sure to add a comment indicating we're intentionally not checking the
> MSRs.
> 
> > I think one retry should be enough given that VMM traps EINIT. One
> > retry is needed to take care of the guest itself (or host if we are
> > running on bare metal) having been in a sleep state.
> 
> Assuming we do RDMSR(hash0), that should be sufficient to prevent infinite retry
> and 

IMHO probably we need to review this assumption w/ crypto guys, at least Intel internally.

Thanks,
-Kai

it protects against the MSRs being lost between WRMSR and EINIT during
> retry.  That being said, I'm ok retrying only once, especially if you want to omit
> the RDMSR.  Disabling preemption should prevent the kernel from suspending
> between WRMSR and EINIT, I'm just being paranoid.
Jarkko Sakkinen Sept. 5, 2018, 5:39 p.m. UTC | #24
On Tue, Sep 04, 2018 at 09:35:46AM -0700, Sean Christopherson wrote:
> On Tue, Sep 04, 2018 at 06:30:21PM +0300, Jarkko Sakkinen wrote:
> > On Tue, Sep 04, 2018 at 07:54:51AM -0700, Sean Christopherson wrote:
> > > I don't see any value in trying to rule out specific causes of
> > > INVALID_TOKEN, but we should only retry EINIT if ret==INVALID_TOKEN
> > > and RDMSR(HASH0) != sgx_lepubkeyhash[0].  Only the first MSR needs to
> > > be checked for validity as they're a package deal, i.e. they'll all be
> > > valid or all be reset.  There shouldn't be a limit on retry attempts,
> > > e.g. the MSRs could theoretically be reset between WRMSR and EINIT.
> > 
> > Why is doing rdmsrs necessary? With the INVALID_TOKEN error we know we
> > are out-of-sync i.e. have been sleeping and then one just needs to do
> > wrmsrs.
> 
> As Kai mentioned, INVALID_TOKEN is returned for other reasons, e.g. a
> production enclave trying to use a debug token or reserved bits set in
> the token.  And in the KVM case, the hash and token are provided by
> the guest, so it's entirely possible the enclave/token is not signed
> with the key specified in the hash.  RDMSR is relatively inexpensive
> compared to the overall cost of EINIT.  Though of course EINIT failure
> isn't exactly a fast path, so I'm ok if you want to opt for simplicity
> and retry on INVALID_TOKEN without checking the MSRs, just make sure
> to add a comment indicating we're intentionally not checking the MSRs.

Great!

> > I think one retry should be enough given that VMM traps EINIT. One retry
> > is needed to take care of the guest itself (or host if we are running on
> > bare metal) having been in a sleep state.
> 
> Assuming we do RDMSR(hash0), that should be sufficient to prevent
> infinite retry and it protects against the MSRs being lost between
> WRMSR and EINIT during retry.  That being said, I'm ok retrying only
> once, especially if you want to omit the RDMSR.  Disabling preemption
> should prevent the kernel from suspending between WRMSR and EINIT,
> I'm just being paranoid.

But they are in the same preempt-disabled-region already?

/Jarkko
diff mbox series

Patch

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index baf30d49b71f..c15c156436be 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -108,6 +108,8 @@  void sgx_free_page(struct sgx_epc_page *page);
 void sgx_page_reclaimable(struct sgx_epc_page *page);
 struct page *sgx_get_backing(struct file *file, pgoff_t index);
 void sgx_put_backing(struct page *backing_page, bool write);
+int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
+	      struct sgx_epc_page *secs_page, u64 lepubkeyhash[4]);
 
 #define ENCLS_FAULT_FLAG 0x40000000UL
 #define ENCLS_FAULT_FLAG_ASM "$0x40000000"
diff --git a/arch/x86/kernel/cpu/intel_sgx.c b/arch/x86/kernel/cpu/intel_sgx.c
index 1046478a3ab9..fe25e6805680 100644
--- a/arch/x86/kernel/cpu/intel_sgx.c
+++ b/arch/x86/kernel/cpu/intel_sgx.c
@@ -9,6 +9,7 @@ 
 #include <linux/sched/signal.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
+#include <linux/suspend.h>
 #include <asm/sgx.h>
 #include <asm/sgx_pr.h>
 
@@ -38,6 +39,18 @@  static LIST_HEAD(sgx_active_page_list);
 static DEFINE_SPINLOCK(sgx_active_page_list_lock);
 static struct task_struct *ksgxswapd_tsk;
 static DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
+static struct notifier_block sgx_pm_notifier;
+static u64 sgx_pm_cnt;
+
+/* The cache for the last known values of IA32_SGXLEPUBKEYHASHx MSRs for each
+ * CPU. The entries are initialized when they are first used by sgx_einit().
+ */
+struct sgx_lepubkeyhash {
+	u64 msrs[4];
+	u64 pm_cnt;
+};
+
+static DEFINE_PER_CPU(struct sgx_lepubkeyhash *, sgx_lepubkeyhash_cache);
 
 /**
  * sgx_reclaim_pages - reclaim EPC pages from the consumers
@@ -328,6 +341,54 @@  void sgx_put_backing(struct page *backing_page, bool write)
 }
 EXPORT_SYMBOL_GPL(sgx_put_backing);
 
+/**
+ * sgx_einit - initialize an enclave
+ * @sigstruct:		a pointer to the SIGSTRUCT
+ * @token:		a pointer to the EINITTOKEN
+ * @secs_page:		a pointer to the SECS EPC page
+ * @lepubkeyhash:	the desired value for IA32_SGXLEPUBKEYHASHx MSRs
+ *
+ * Try to perform EINIT operation. If the MSRs are writable, they are updated
+ * according to @lepubkeyhash.
+ *
+ * Return:
+ *   0 on success,
+ *   -errno on failure
+ *   SGX error code if EINIT fails
+ */
+int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
+	      struct sgx_epc_page *secs_page, u64 lepubkeyhash[4])
+{
+	struct sgx_lepubkeyhash __percpu *cache;
+	bool cache_valid;
+	int i, ret;
+
+	if (!sgx_lc_enabled)
+		return __einit(sigstruct, token, sgx_epc_addr(secs_page));
+
+	cache = per_cpu(sgx_lepubkeyhash_cache, smp_processor_id());
+	if (!cache) {
+		cache = kzalloc(sizeof(struct sgx_lepubkeyhash), GFP_KERNEL);
+		if (!cache)
+			return -ENOMEM;
+	}
+
+	cache_valid = cache->pm_cnt == sgx_pm_cnt;
+	cache->pm_cnt = sgx_pm_cnt;
+	preempt_disable();
+	for (i = 0; i < 4; i++) {
+		if (cache_valid && lepubkeyhash[i] == cache->msrs[i])
+			continue;
+
+		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+		cache->msrs[i] = lepubkeyhash[i];
+	}
+	ret = __einit(sigstruct, token, sgx_epc_addr(secs_page));
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL(sgx_einit);
+
 static __init int sgx_init_epc_bank(u64 addr, u64 size, unsigned long index,
 				    struct sgx_epc_bank *bank)
 {
@@ -426,6 +487,15 @@  static __init int sgx_page_cache_init(void)
 	return 0;
 }
 
+static int sgx_pm_notifier_cb(struct notifier_block *nb, unsigned long action,
+			      void *data)
+{
+	if (action == PM_SUSPEND_PREPARE || action == PM_HIBERNATION_PREPARE)
+		sgx_pm_cnt++;
+
+	return NOTIFY_DONE;
+}
+
 static __init int sgx_init(void)
 {
 	struct task_struct *tsk;
@@ -452,20 +522,30 @@  static __init int sgx_init(void)
 	if (!(fc & FEATURE_CONTROL_SGX_LE_WR))
 		pr_info("IA32_SGXLEPUBKEYHASHn MSRs are not writable\n");
 
-	ret = sgx_page_cache_init();
+	sgx_pm_notifier.notifier_call = sgx_pm_notifier_cb;
+	ret = register_pm_notifier(&sgx_pm_notifier);
 	if (ret)
 		return ret;
 
+	ret = sgx_page_cache_init();
+	if (ret)
+		goto out_pm;
+
 	tsk = kthread_run(ksgxswapd, NULL, "ksgxswapd");
 	if (IS_ERR(tsk)) {
-		sgx_page_cache_teardown();
-		return PTR_ERR(tsk);
+		ret = PTR_ERR(tsk);
+		goto out_pcache;
 	}
 	ksgxswapd_tsk = tsk;
 
 	sgx_enabled = true;
 	sgx_lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
 	return 0;
+out_pcache:
+	sgx_page_cache_teardown();
+out_pm:
+	unregister_pm_notifier(&sgx_pm_notifier);
+	return ret;
 }
 
 arch_initcall(sgx_init);