diff mbox

[intel-sgx-kernel-dev,v11,09/13] x86, sgx: basic routines for enclave page cache

Message ID 20180608171216.26521-10-jarkko.sakkinen@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jarkko Sakkinen June 8, 2018, 5:09 p.m. UTC
SGX has a set of data structures to maintain information about the enclaves
and their security properties. BIOS reserves a fixed size region of
physical memory for these structures by setting Processor Reserved Memory
Range Registers (PRMRR). This memory area is called Enclave Page Cache
(EPC).

This commit implements the basic routines to allocate and free pages from
different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
that gets woken up by sgx_alloc_page() when we run below the low watermark.
The swapper thread continues swapping pages up until it reaches the high
watermark.

Each subsystem that uses SGX must provide a set of callbacks for EPC
pages that are used to reclaim, block and write an EPC page. Kernel
takes the responsibility of maintaining LRU cache for them.

Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/x86/include/asm/sgx.h      |  67 +++++
 arch/x86/include/asm/sgx_arch.h | 224 ++++++++++++++++
 arch/x86/kernel/cpu/intel_sgx.c | 443 +++++++++++++++++++++++++++++++-
 3 files changed, 732 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/sgx_arch.h

Comments

Jethro Beekman June 8, 2018, 6:21 p.m. UTC | #1
On 2018-06-08 10:09, Jarkko Sakkinen wrote:
> +/*
> + * Writing the LE hash MSRs is extraordinarily expensive, e.g.
> + * 3-4x slower than normal MSRs, so we use a per-cpu cache to
> + * track the last known value of the MSRs to avoid unnecessarily
> + * writing the MSRs with the current value.  Because most Linux
> + * kernels will use an LE that is signed with a non-Intel key,

I don't think you can predict what most Linux kernels will be doing. I 
think not initializing the cache to the CPU's initial value is fine, but 
this particular argument shouldn't appear in the rationale.

> + * i.e. the first EINIT will need to write the MSRs regardless
> + * of the cache, the cache is intentionally left uninitialized
> + * during boot as initializing the cache would be pure overhead
> + * for the majority of systems.  Furthermore, the MSRs are per-cpu
> + * and the boot-time values aren't guaranteed to be identical
> + * across cpus, so we'd have to run code all all cpus to properly
> + * init the cache.  All in all, the complexity and overhead of
> + * initializing the cache is not justified.
> + */
> +static DEFINE_PER_CPU(u64 [4], sgx_le_pubkey_hash_cache);
Dave Hansen June 8, 2018, 6:24 p.m. UTC | #2
On 06/08/2018 10:09 AM, Jarkko Sakkinen wrote:
> SGX has a set of data structures to maintain information about the enclaves
> and their security properties. BIOS reserves a fixed size region of
> physical memory for these structures by setting Processor Reserved Memory
> Range Registers (PRMRR). This memory area is called Enclave Page Cache
> (EPC).
> 
> This commit implements the basic routines to allocate and free pages from
> different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
> that gets woken up by sgx_alloc_page() when we run below the low watermark.
> The swapper thread continues swapping pages up until it reaches the high
> watermark.

Yay!  A new memory manager in arch-specific code.

> Each subsystem that uses SGX must provide a set of callbacks for EPC
> pages that are used to reclaim, block and write an EPC page. Kernel
> takes the responsibility of maintaining LRU cache for them.

What does a "subsystem that uses SGX" mean?  Do we have one of those
already?

...
> +struct sgx_secs {
> +	uint64_t size;
> +	uint64_t base;
> +	uint32_t ssaframesize;
> +	uint32_t miscselect;
> +	uint8_t reserved1[SGX_SECS_RESERVED1_SIZE];
> +	uint64_t attributes;
> +	uint64_t xfrm;
> +	uint32_t mrenclave[8];
> +	uint8_t reserved2[SGX_SECS_RESERVED2_SIZE];
> +	uint32_t mrsigner[8];
> +	uint8_t	reserved3[SGX_SECS_RESERVED3_SIZE];
> +	uint16_t isvvprodid;
> +	uint16_t isvsvn;
> +	uint8_t reserved4[SGX_SECS_RESERVED4_SIZE];
> +};

This is a hardware structure, right?  Doesn't it need to be packed?

> +enum sgx_tcs_flags {
> +	SGX_TCS_DBGOPTIN	= 0x01, /* cleared on EADD */
> +};
> +
> +#define SGX_TCS_RESERVED_MASK 0xFFFFFFFFFFFFFFFEL

Would it be possible to separate out the SGX software structures from
SGX hardware?  It's hard to tell them apart.

> +#define SGX_NR_TO_SCAN	16
> +#define SGX_NR_LOW_PAGES 32
> +#define SGX_NR_HIGH_PAGES 64
> +
>  bool sgx_enabled __ro_after_init = false;
>  EXPORT_SYMBOL(sgx_enabled);
> +bool sgx_lc_enabled __ro_after_init;
> +EXPORT_SYMBOL(sgx_lc_enabled);
> +atomic_t sgx_nr_free_pages = ATOMIC_INIT(0);

Hmmm, global atomic.  Doesn't sound very scalable.

> +struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
> +EXPORT_SYMBOL(sgx_epc_banks);
> +int sgx_nr_epc_banks;
> +EXPORT_SYMBOL(sgx_nr_epc_banks);
> +LIST_HEAD(sgx_active_page_list);
> +EXPORT_SYMBOL(sgx_active_page_list);
> +DEFINE_SPINLOCK(sgx_active_page_list_lock);
> +EXPORT_SYMBOL(sgx_active_page_list_lock);

Hmmm, global spinlock protecting a page allocator linked list.  Sounds
even worse than at atomic.

Why is this OK?

> +static struct task_struct *ksgxswapd_tsk;
> +static DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> +
> +/*
> + * Writing the LE hash MSRs is extraordinarily expensive, e.g.
> + * 3-4x slower than normal MSRs, so we use a per-cpu cache to
> + * track the last known value of the MSRs to avoid unnecessarily
> + * writing the MSRs with the current value.  Because most Linux
> + * kernels will use an LE that is signed with a non-Intel key,
> + * i.e. the first EINIT will need to write the MSRs regardless
> + * of the cache, the cache is intentionally left uninitialized
> + * during boot as initializing the cache would be pure overhead
> + * for the majority of systems.  Furthermore, the MSRs are per-cpu
> + * and the boot-time values aren't guaranteed to be identical
> + * across cpus, so we'd have to run code all all cpus to properly
> + * init the cache.  All in all, the complexity and overhead of
> + * initializing the cache is not justified.
> + */
> +static DEFINE_PER_CPU(u64 [4], sgx_le_pubkey_hash_cache);

Justifying the design decisions is great for changelogs, not so great
for comments.  Also, looking at this, I have no idea what this has to do
with the "enclave page cache".

> +static void sgx_swap_cluster(void)
> +{
> +	struct sgx_epc_page *cluster[SGX_NR_TO_SCAN + 1];
> +	struct sgx_epc_page *epc_page;
> +	int i;
> +	int j;

This is rather free of comments or explanation of what this is doing,
how it is related to swapping as everyone else knows it

> +	memset(cluster, 0, sizeof(cluster));
> +
> +	for (i = 0, j = 0; i < SGX_NR_TO_SCAN; i++) {
> +		spin_lock(&sgx_active_page_list_lock);
> +		if (list_empty(&sgx_active_page_list)) {
> +			spin_unlock(&sgx_active_page_list_lock);
> +			break;
> +		}
> +		epc_page = list_first_entry(&sgx_active_page_list,
> +					    struct sgx_epc_page, list);
> +		if (!epc_page->impl->ops->get(epc_page)) {
> +			list_move_tail(&epc_page->list, &sgx_active_page_list);
> +			spin_unlock(&sgx_active_page_list_lock);
> +			continue;
> +		}
> +		list_del(&epc_page->list);
> +		spin_unlock(&sgx_active_page_list_lock);
>  
> -static __init bool sgx_is_enabled(void)
> +		if (epc_page->impl->ops->reclaim(epc_page)) {
> +			cluster[j++] = epc_page;
> +		} else {
> +			spin_lock(&sgx_active_page_list_lock);
> +			list_add_tail(&epc_page->list, &sgx_active_page_list);
> +			spin_unlock(&sgx_active_page_list_lock);
> +			epc_page->impl->ops->put(epc_page);
> +		}
> +	}
> +
> +	for (i = 0; cluster[i]; i++) {
> +		epc_page = cluster[i];
> +		epc_page->impl->ops->block(epc_page);
> +	}
> +
> +	for (i = 0; cluster[i]; i++) {
> +		epc_page = cluster[i];
> +		epc_page->impl->ops->write(epc_page);
> +		epc_page->impl->ops->put(epc_page);
> +		sgx_free_page(epc_page);
> +	}
> +}

This is also gloriously free of any superfluous comments.  Could you fix
that?

> +/**
> + * sgx_try_alloc_page - try to allocate an EPC page
> + * @impl:	implementation for the struct sgx_epc_page
> + *
> + * Try to grab a page from the free EPC page list. If there is a free page
> + * available, it is returned to the caller.
> + *
> + * Return:
> + *   a &struct sgx_epc_page instace,
> + *   NULL otherwise
> + */
> +struct sgx_epc_page *sgx_try_alloc_page(struct sgx_epc_page_impl *impl)
> +{
> +	struct sgx_epc_bank *bank;
> +	struct sgx_epc_page *page = NULL;
> +	int i;
> +
> +	for (i = 0; i < sgx_nr_epc_banks; i++) {
> +		bank = &sgx_epc_banks[i];

What's a bank?  How many banks does a system have?

> +		down_write(&bank->lock);
> +
> +		if (atomic_read(&bank->free_cnt))
> +			page = bank->pages[atomic_dec_return(&bank->free_cnt)];

Why is a semaphore getting used here?  I don't see any sleeping or
anything happening under this lock.

> +		up_write(&bank->lock);
> +
> +		if (page)
> +			break;
> +	}
> +
> +	if (page) {
> +		atomic_dec(&sgx_nr_free_pages);
> +		page->impl = impl;
> +	}
> +
> +	return page;
> +}
> +EXPORT_SYMBOL(sgx_try_alloc_page);
> +
> +/**
> + * sgx_alloc_page - allocate an EPC page
> + * @flags:	allocation flags
> + * @impl:	implementation for the struct sgx_epc_page
> + *
> + * Try to grab a page from the free EPC page list. If there is a free page
> + * available, it is returned to the caller. If called with SGX_ALLOC_ATOMIC,
> + * the function will return immediately if the list is empty. Otherwise, it
> + * will swap pages up until there is a free page available. Upon returning the
> + * low watermark is checked and ksgxswapd is waken up if we are below it.
> + *
> + * Return:
> + *   a &struct sgx_epc_page instace,
> + *   -ENOMEM if all pages are unreclaimable,
> + *   -EBUSY when called with SGX_ALLOC_ATOMIC and out of free pages
> + */
> +struct sgx_epc_page *sgx_alloc_page(struct sgx_epc_page_impl *impl,
> +				    unsigned int flags)
> +{
> +	struct sgx_epc_page *entry;
> +
> +	for ( ; ; ) {
> +		entry = sgx_try_alloc_page(impl);
> +		if (entry)
> +			break;
> +
> +		if (list_empty(&sgx_active_page_list))
> +			return ERR_PTR(-ENOMEM);

"active" pages in the VM are allocated/in-use pages.  This doesn't look
to be using the same terminology.

> +		if (flags & SGX_ALLOC_ATOMIC) {
> +			entry = ERR_PTR(-EBUSY);
> +			break;
> +		}
> +
> +		if (signal_pending(current)) {
> +			entry = ERR_PTR(-ERESTARTSYS);
> +			break;
> +		}
> +
> +		sgx_swap_cluster();
> +		schedule();

What's the schedule trying to do?  Is this the equivalent of "direct
reclaim"?  Why do we need this in addition to the ksgxswapd?

> +	}
> +
> +	if (atomic_read(&sgx_nr_free_pages) < SGX_NR_LOW_PAGES)
> +		wake_up(&ksgxswapd_waitq);
> +
> +	return entry;
> +}
> +EXPORT_SYMBOL(sgx_alloc_page);

Why aren't these _GPL exports?

> +/**
> + * sgx_free_page - free an EPC page
> + *
> + * @page:	any EPC page
> + *
> + * Remove an EPC page and insert it back to the list of free pages.
> + *
> + * Return: SGX error code
> + */
> +int sgx_free_page(struct sgx_epc_page *page)
> +{
> +	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
> +	int ret;
> +
> +	ret = sgx_eremove(page);
> +	if (ret) {
> +		pr_debug("EREMOVE returned %d\n", ret);
> +		return ret;
> +	}
> +
> +	down_read(&bank->lock);
> +	bank->pages[atomic_inc_return(&bank->free_cnt) - 1] = page;
> +	atomic_inc(&sgx_nr_free_pages);
> +	up_read(&bank->lock);
> +
> +	return 0;
> +}

bank->lock confuses me.  This seems to be writing to a bank, but only
needs a read lock.  Why?

> +/**
> + * sgx_get_page - pin an EPC page
> + * @page:	an EPC page
> + *
> + * Return: a pointer to the pinned EPC page
> + */
> +void *sgx_get_page(struct sgx_epc_page *page)
> +{
> +	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
> +
> +	if (IS_ENABLED(CONFIG_X86_64))
> +		return (void *)(bank->va + SGX_EPC_ADDR(page) - bank->pa);
> +
> +	return kmap_atomic_pfn(SGX_EPC_PFN(page));
> +}
> +EXPORT_SYMBOL(sgx_get_page);

This is odd.  Do you really want to detect 64-bit, or CONFIG_HIGHMEM?

> +struct page *sgx_get_backing(struct file *file, pgoff_t index)
> +{
> +	struct inode *inode = file->f_path.dentry->d_inode;
> +	struct address_space *mapping = inode->i_mapping;
> +	gfp_t gfpmask = mapping_gfp_mask(mapping);
> +
> +	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
> +}
> +EXPORT_SYMBOL(sgx_get_backing);

What does shmem have to do with all this?

> +void sgx_put_backing(struct page *backing_page, bool write)
> +{
> +	if (write)
> +		set_page_dirty(backing_page);
> +
> +	put_page(backing_page);
> +}
> +EXPORT_SYMBOL(sgx_put_backing);

I'm not a big fan of stuff getting added with no apparent user and no
explaination of what it is doing.  There's no way for me to assess
whether this is sane or not.

> +static __init int sgx_page_cache_init(void)
> +{
> +	struct task_struct *tsk;
> +	unsigned long size;
> +	unsigned int eax;
> +	unsigned int ebx;
> +	unsigned int ecx;
> +	unsigned int edx;
> +	unsigned long pa;
> +	int i;
> +	int ret;
> +
> +	for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
> +		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC_BANKS, &eax, &ebx,
> +			    &ecx, &edx);
> +		if (!(eax & 0xf))
> +			break;
> +
> +		pa   = ((u64)(ebx & 0xfffff) << 32) + (u64)(eax & 0xfffff000);
> +		size = ((u64)(edx & 0xfffff) << 32) + (u64)(ecx & 0xfffff000);

Please align these like I did ^

> +		pr_info("EPC bank 0x%lx-0x%lx\n", pa, pa + size);
> +
> +		ret = sgx_init_epc_bank(pa, size, i, &sgx_epc_banks[i]);
> +		if (ret) {
> +			sgx_page_cache_teardown();
> +			return ret;
> +		}
> +
> +		sgx_nr_epc_banks++;
> +	}

This is also rather sparsely commented.

> +static __init bool sgx_is_enabled(bool *lc_enabled)
>  {
>  	unsigned long fc;
>  
> @@ -41,12 +466,26 @@ static __init bool sgx_is_enabled(void)
>  	if (!(fc & FEATURE_CONTROL_SGX_ENABLE))
>  		return false;
>  
> +	*lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
> +
>  	return true;
>  }

I'm baffled why lc_enabled is connected to the enclave page cache.

>  static __init int sgx_init(void)
>  {
> -	sgx_enabled = sgx_is_enabled();
> +	bool lc_enabled;
> +	int ret;
> +
> +	if (!sgx_is_enabled(&lc_enabled))
> +		return 0;
> +
> +	ret = sgx_page_cache_init();
> +	if (ret)
> +		return ret;
> +
> +	sgx_enabled = true;
> +	sgx_lc_enabled = lc_enabled;
> +
>  	return 0;
>  }
>  
>
Andy Lutomirski June 10, 2018, 5:32 a.m. UTC | #3
On Fri, Jun 8, 2018 at 10:22 AM Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> SGX has a set of data structures to maintain information about the enclaves
> and their security properties. BIOS reserves a fixed size region of
> physical memory for these structures by setting Processor Reserved Memory
> Range Registers (PRMRR). This memory area is called Enclave Page Cache
> (EPC).
>

> +/**
> + * sgx_einit - EINIT an enclave with the appropriate LE pubkey hash
> + * @sigstruct:         a pointer to the enclave's sigstruct
> + * @token:             a pointer to the enclave's EINIT token
> + * @secs_page:         a pointer to the enclave's SECS EPC page
> + * @le_pubkey_hash:    the desired LE pubkey hash for EINIT
> + */
> +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
> +             struct sgx_epc_page *secs_page, u64 le_pubkey_hash[4])
> +{
> +       u64 __percpu *cache;
> +       void *secs;
> +       int i, ret;
> +
> +       secs = sgx_get_page(secs_page);
> +
> +       if (!sgx_lc_enabled) {

I'm confused.  What does this code path do?  It kind of looks like the
driver will load and just malfunction if we don't have write access to
the MSRs.  What is the intended behavior?

> +               ret = __einit(sigstruct, token, secs);
> +               goto out;
> +       }
> +
> +       cache = per_cpu(sgx_le_pubkey_hash_cache, smp_processor_id());
> +
> +       preempt_disable();
> +       for (i = 0; i < 4; i++) {
> +               if (le_pubkey_hash[i] == cache[i])
> +                       continue;
> +
> +               wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, le_pubkey_hash[i]);
> +               cache[i] = le_pubkey_hash[i];
> +       }
> +       ret = __einit(sigstruct, token, secs);
> +       preempt_enable();
> +
> +out:
> +       sgx_put_page(secs);
> +       return ret;
> +}
> +EXPORT_SYMBOL(sgx_einit);
> +
Sean Christopherson June 11, 2018, 3:12 p.m. UTC | #4
On Sat, 2018-06-09 at 22:32 -0700, Andy Lutomirski wrote:
> On Fri, Jun 8, 2018 at 10:22 AM Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> > 
> > 
> > SGX has a set of data structures to maintain information about the enclaves
> > and their security properties. BIOS reserves a fixed size region of
> > physical memory for these structures by setting Processor Reserved Memory
> > Range Registers (PRMRR). This memory area is called Enclave Page Cache
> > (EPC).
> > 
> > 
> > +/**
> > + * sgx_einit - EINIT an enclave with the appropriate LE pubkey hash
> > + * @sigstruct:         a pointer to the enclave's sigstruct
> > + * @token:             a pointer to the enclave's EINIT token
> > + * @secs_page:         a pointer to the enclave's SECS EPC page
> > + * @le_pubkey_hash:    the desired LE pubkey hash for EINIT
> > + */
> > +int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
> > +             struct sgx_epc_page *secs_page, u64 le_pubkey_hash[4])
> > +{
> > +       u64 __percpu *cache;
> > +       void *secs;
> > +       int i, ret;
> > +
> > +       secs = sgx_get_page(secs_page);
> > +
> > +       if (!sgx_lc_enabled) {
> I'm confused.  What does this code path do?  It kind of looks like the
> driver will load and just malfunction if we don't have write access to
> the MSRs.  What is the intended behavior?

The driver will also allow itself to load if the MSRs are read-only,
but only if the MSRs' pubkey hash matches that of its launch enclave,
i.e. the system has been pre-configured for the kernel's LE.  Whether
or not that is a valid scenario is probably a different discussion.

> > +               ret = __einit(sigstruct, token, secs);
> > +               goto out;
> > +       }
> > +
> > +       cache = per_cpu(sgx_le_pubkey_hash_cache, smp_processor_id());
> > +
> > +       preempt_disable();
> > +       for (i = 0; i < 4; i++) {
> > +               if (le_pubkey_hash[i] == cache[i])
> > +                       continue;
> > +
> > +               wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, le_pubkey_hash[i]);
> > +               cache[i] = le_pubkey_hash[i];
> > +       }
> > +       ret = __einit(sigstruct, token, secs);
> > +       preempt_enable();
> > +
> > +out:
> > +       sgx_put_page(secs);
> > +       return ret;
> > +}
> > +EXPORT_SYMBOL(sgx_einit);
> > +
Andy Lutomirski June 18, 2018, 9:33 p.m. UTC | #5
On Mon, Jun 18, 2018 at 8:11 AM Jethro Beekman <jethro@fortanix.com> wrote:
>
> On 2018-06-08 10:09, Jarkko Sakkinen wrote:
> > +/*
> > + * Writing the LE hash MSRs is extraordinarily expensive, e.g.
> > + * 3-4x slower than normal MSRs, so we use a per-cpu cache to
> > + * track the last known value of the MSRs to avoid unnecessarily
> > + * writing the MSRs with the current value.  Because most Linux
> > + * kernels will use an LE that is signed with a non-Intel key,
>
> I don't think you can predict what most Linux kernels will be doing. I
> think not initializing the cache to the CPU's initial value is fine, but
> this particular argument shouldn't appear in the rationale.

No, it's quite predictable.  Upstream Linux will not permit the
Intel-signed LE to be used unless Intel makes some major changes to
the way the launch process works.

>
> > + * i.e. the first EINIT will need to write the MSRs regardless
> > + * of the cache, the cache is intentionally left uninitialized
> > + * during boot as initializing the cache would be pure overhead
> > + * for the majority of systems.

But the comment does need changes.  How about saying that the cache
either contains all zeros or matches the MSRs?  All zeros is used
when, for whatever reason, we choose not to care what the current MSR
values are.  Leaving it genuinely uninitialized would be incorrect.
(And, of course, we're assuming that no one ever needs the MSRs
zeroed.)

When KVM host support goes in, even this won't be good enough if we
want to allow passthrough access to the MSRs because we will no longer
be able to guarantee that all zeros is invalid.  Instead we'd need an
actual flag saying that the cache is invalid.

--Andy
Jarkko Sakkinen June 19, 2018, 2:08 p.m. UTC | #6
On Fri, Jun 08, 2018 at 11:21:48AM -0700, Jethro Beekman wrote:
> On 2018-06-08 10:09, Jarkko Sakkinen wrote:
> > +/*
> > + * Writing the LE hash MSRs is extraordinarily expensive, e.g.
> > + * 3-4x slower than normal MSRs, so we use a per-cpu cache to
> > + * track the last known value of the MSRs to avoid unnecessarily
> > + * writing the MSRs with the current value.  Because most Linux
> > + * kernels will use an LE that is signed with a non-Intel key,
> 
> I don't think you can predict what most Linux kernels will be doing. I think
> not initializing the cache to the CPU's initial value is fine, but this
> particular argument shouldn't appear in the rationale.

Are you just referring to the last sentence or the whole paragraph?

/Jarkko
Jarkko Sakkinen June 19, 2018, 2:57 p.m. UTC | #7
On Fri, Jun 08, 2018 at 11:24:12AM -0700, Dave Hansen wrote:
> On 06/08/2018 10:09 AM, Jarkko Sakkinen wrote:
> > SGX has a set of data structures to maintain information about the enclaves
> > and their security properties. BIOS reserves a fixed size region of
> > physical memory for these structures by setting Processor Reserved Memory
> > Range Registers (PRMRR). This memory area is called Enclave Page Cache
> > (EPC).
> > 
> > This commit implements the basic routines to allocate and free pages from
> > different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
> > that gets woken up by sgx_alloc_page() when we run below the low watermark.
> > The swapper thread continues swapping pages up until it reaches the high
> > watermark.
> 
> Yay!  A new memory manager in arch-specific code.
> 
> > Each subsystem that uses SGX must provide a set of callbacks for EPC
> > pages that are used to reclaim, block and write an EPC page. Kernel
> > takes the responsibility of maintaining LRU cache for them.
> 
> What does a "subsystem that uses SGX" mean?  Do we have one of those
> already?

Driver and KVM.

> > +struct sgx_secs {
> > +	uint64_t size;
> > +	uint64_t base;
> > +	uint32_t ssaframesize;
> > +	uint32_t miscselect;
> > +	uint8_t reserved1[SGX_SECS_RESERVED1_SIZE];
> > +	uint64_t attributes;
> > +	uint64_t xfrm;
> > +	uint32_t mrenclave[8];
> > +	uint8_t reserved2[SGX_SECS_RESERVED2_SIZE];
> > +	uint32_t mrsigner[8];
> > +	uint8_t	reserved3[SGX_SECS_RESERVED3_SIZE];
> > +	uint16_t isvvprodid;
> > +	uint16_t isvsvn;
> > +	uint8_t reserved4[SGX_SECS_RESERVED4_SIZE];
> > +};
> 
> This is a hardware structure, right?  Doesn't it need to be packed?

Everything is aligned properly in this struct.

> > +enum sgx_tcs_flags {
> > +	SGX_TCS_DBGOPTIN	= 0x01, /* cleared on EADD */
> > +};
> > +
> > +#define SGX_TCS_RESERVED_MASK 0xFFFFFFFFFFFFFFFEL
> 
> Would it be possible to separate out the SGX software structures from
> SGX hardware?  It's hard to tell them apart.

How do you draw the line in the architectural structures?

> > +#define SGX_NR_TO_SCAN	16
> > +#define SGX_NR_LOW_PAGES 32
> > +#define SGX_NR_HIGH_PAGES 64
> > +
> >  bool sgx_enabled __ro_after_init = false;
> >  EXPORT_SYMBOL(sgx_enabled);
> > +bool sgx_lc_enabled __ro_after_init;
> > +EXPORT_SYMBOL(sgx_lc_enabled);
> > +atomic_t sgx_nr_free_pages = ATOMIC_INIT(0);
> 
> Hmmm, global atomic.  Doesn't sound very scalable.

We could potentially remove this completely as banks have 'free_cnt'
field and use the sum when needed as the value.

> > +struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
> > +EXPORT_SYMBOL(sgx_epc_banks);
> > +int sgx_nr_epc_banks;
> > +EXPORT_SYMBOL(sgx_nr_epc_banks);
> > +LIST_HEAD(sgx_active_page_list);
> > +EXPORT_SYMBOL(sgx_active_page_list);
> > +DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > +EXPORT_SYMBOL(sgx_active_page_list_lock);
> 
> Hmmm, global spinlock protecting a page allocator linked list.  Sounds
> even worse than at atomic.
> 
> Why is this OK?

Any suggestions what would be a better place in order to make a
fine grained granularity?

> > +static struct task_struct *ksgxswapd_tsk;
> > +static DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
> > +
> > +/*
> > + * Writing the LE hash MSRs is extraordinarily expensive, e.g.
> > + * 3-4x slower than normal MSRs, so we use a per-cpu cache to
> > + * track the last known value of the MSRs to avoid unnecessarily
> > + * writing the MSRs with the current value.  Because most Linux
> > + * kernels will use an LE that is signed with a non-Intel key,
> > + * i.e. the first EINIT will need to write the MSRs regardless
> > + * of the cache, the cache is intentionally left uninitialized
> > + * during boot as initializing the cache would be pure overhead
> > + * for the majority of systems.  Furthermore, the MSRs are per-cpu
> > + * and the boot-time values aren't guaranteed to be identical
> > + * across cpus, so we'd have to run code all all cpus to properly
> > + * init the cache.  All in all, the complexity and overhead of
> > + * initializing the cache is not justified.
> > + */
> > +static DEFINE_PER_CPU(u64 [4], sgx_le_pubkey_hash_cache);
> 
> Justifying the design decisions is great for changelogs, not so great
> for comments.  Also, looking at this, I have no idea what this has to do
> with the "enclave page cache".

We'll have to revisit comment, I see your point.

> > +static void sgx_swap_cluster(void)
> > +{
> > +	struct sgx_epc_page *cluster[SGX_NR_TO_SCAN + 1];
> > +	struct sgx_epc_page *epc_page;
> > +	int i;
> > +	int j;
> 
> This is rather free of comments or explanation of what this is doing,
> how it is related to swapping as everyone else knows it

I can document this function properly.

> > +	memset(cluster, 0, sizeof(cluster));
> > +
> > +	for (i = 0, j = 0; i < SGX_NR_TO_SCAN; i++) {
> > +		spin_lock(&sgx_active_page_list_lock);
> > +		if (list_empty(&sgx_active_page_list)) {
> > +			spin_unlock(&sgx_active_page_list_lock);
> > +			break;
> > +		}
> > +		epc_page = list_first_entry(&sgx_active_page_list,
> > +					    struct sgx_epc_page, list);
> > +		if (!epc_page->impl->ops->get(epc_page)) {
> > +			list_move_tail(&epc_page->list, &sgx_active_page_list);
> > +			spin_unlock(&sgx_active_page_list_lock);
> > +			continue;
> > +		}
> > +		list_del(&epc_page->list);
> > +		spin_unlock(&sgx_active_page_list_lock);
> >  
> > -static __init bool sgx_is_enabled(void)
> > +		if (epc_page->impl->ops->reclaim(epc_page)) {
> > +			cluster[j++] = epc_page;
> > +		} else {
> > +			spin_lock(&sgx_active_page_list_lock);
> > +			list_add_tail(&epc_page->list, &sgx_active_page_list);
> > +			spin_unlock(&sgx_active_page_list_lock);
> > +			epc_page->impl->ops->put(epc_page);
> > +		}
> > +	}
> > +
> > +	for (i = 0; cluster[i]; i++) {
> > +		epc_page = cluster[i];
> > +		epc_page->impl->ops->block(epc_page);
> > +	}
> > +
> > +	for (i = 0; cluster[i]; i++) {
> > +		epc_page = cluster[i];
> > +		epc_page->impl->ops->write(epc_page);
> > +		epc_page->impl->ops->put(epc_page);
> > +		sgx_free_page(epc_page);
> > +	}
> > +}
> 
> This is also gloriously free of any superfluous comments.  Could you fix
> that?

Yes.

> > +/**
> > + * sgx_try_alloc_page - try to allocate an EPC page
> > + * @impl:	implementation for the struct sgx_epc_page
> > + *
> > + * Try to grab a page from the free EPC page list. If there is a free page
> > + * available, it is returned to the caller.
> > + *
> > + * Return:
> > + *   a &struct sgx_epc_page instace,
> > + *   NULL otherwise
> > + */
> > +struct sgx_epc_page *sgx_try_alloc_page(struct sgx_epc_page_impl *impl)
> > +{
> > +	struct sgx_epc_bank *bank;
> > +	struct sgx_epc_page *page = NULL;
> > +	int i;
> > +
> > +	for (i = 0; i < sgx_nr_epc_banks; i++) {
> > +		bank = &sgx_epc_banks[i];
> 
> What's a bank?  How many banks does a system have?

AFAIK, UMA systems have one bank. NUMA have multiple. It is a physical
memory region reserved for enclave pages.

> > +		down_write(&bank->lock);
> > +
> > +		if (atomic_read(&bank->free_cnt))
> > +			page = bank->pages[atomic_dec_return(&bank->free_cnt)];
> 
> Why is a semaphore getting used here?  I don't see any sleeping or
> anything happening under this lock.

Should be changed to reader-writer spinlock, thanks.

> > +		up_write(&bank->lock);
> > +
> > +		if (page)
> > +			break;
> > +	}
> > +
> > +	if (page) {
> > +		atomic_dec(&sgx_nr_free_pages);
> > +		page->impl = impl;
> > +	}
> > +
> > +	return page;
> > +}
> > +EXPORT_SYMBOL(sgx_try_alloc_page);
> > +
> > +/**
> > + * sgx_alloc_page - allocate an EPC page
> > + * @flags:	allocation flags
> > + * @impl:	implementation for the struct sgx_epc_page
> > + *
> > + * Try to grab a page from the free EPC page list. If there is a free page
> > + * available, it is returned to the caller. If called with SGX_ALLOC_ATOMIC,
> > + * the function will return immediately if the list is empty. Otherwise, it
> > + * will swap pages up until there is a free page available. Upon returning the
> > + * low watermark is checked and ksgxswapd is waken up if we are below it.
> > + *
> > + * Return:
> > + *   a &struct sgx_epc_page instace,
> > + *   -ENOMEM if all pages are unreclaimable,
> > + *   -EBUSY when called with SGX_ALLOC_ATOMIC and out of free pages
> > + */
> > +struct sgx_epc_page *sgx_alloc_page(struct sgx_epc_page_impl *impl,
> > +				    unsigned int flags)
> > +{
> > +	struct sgx_epc_page *entry;
> > +
> > +	for ( ; ; ) {
> > +		entry = sgx_try_alloc_page(impl);
> > +		if (entry)
> > +			break;
> > +
> > +		if (list_empty(&sgx_active_page_list))
> > +			return ERR_PTR(-ENOMEM);
> 
> "active" pages in the VM are allocated/in-use pages.  This doesn't look
> to be using the same terminology.
> 
> > +		if (flags & SGX_ALLOC_ATOMIC) {
> > +			entry = ERR_PTR(-EBUSY);
> > +			break;
> > +		}
> > +
> > +		if (signal_pending(current)) {
> > +			entry = ERR_PTR(-ERESTARTSYS);
> > +			break;
> > +		}
> > +
> > +		sgx_swap_cluster();
> > +		schedule();
> 
> What's the schedule trying to do?  Is this the equivalent of "direct
> reclaim"?  Why do we need this in addition to the ksgxswapd?

It tries to direct reclaim. Ugh, that schedule() call does not make
much sense though...

> > +	}
> > +
> > +	if (atomic_read(&sgx_nr_free_pages) < SGX_NR_LOW_PAGES)
> > +		wake_up(&ksgxswapd_waitq);
> > +
> > +	return entry;
> > +}
> > +EXPORT_SYMBOL(sgx_alloc_page);
> 
> Why aren't these _GPL exports?

Source files a dual licensed.

> > +/**
> > + * sgx_free_page - free an EPC page
> > + *
> > + * @page:	any EPC page
> > + *
> > + * Remove an EPC page and insert it back to the list of free pages.
> > + *
> > + * Return: SGX error code
> > + */
> > +int sgx_free_page(struct sgx_epc_page *page)
> > +{
> > +	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
> > +	int ret;
> > +
> > +	ret = sgx_eremove(page);
> > +	if (ret) {
> > +		pr_debug("EREMOVE returned %d\n", ret);
> > +		return ret;
> > +	}
> > +
> > +	down_read(&bank->lock);
> > +	bank->pages[atomic_inc_return(&bank->free_cnt) - 1] = page;
> > +	atomic_inc(&sgx_nr_free_pages);
> > +	up_read(&bank->lock);
> > +
> > +	return 0;
> > +}
> 
> bank->lock confuses me.  This seems to be writing to a bank, but only
> needs a read lock.  Why?

It could be either way around:

1. Allow multiple threads that free a page to access the array.
2. Allow multiple threads that alloc a page to access the array.

> > +/**
> > + * sgx_get_page - pin an EPC page
> > + * @page:	an EPC page
> > + *
> > + * Return: a pointer to the pinned EPC page
> > + */
> > +void *sgx_get_page(struct sgx_epc_page *page)
> > +{
> > +	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
> > +
> > +	if (IS_ENABLED(CONFIG_X86_64))
> > +		return (void *)(bank->va + SGX_EPC_ADDR(page) - bank->pa);
> > +
> > +	return kmap_atomic_pfn(SGX_EPC_PFN(page));
> > +}
> > +EXPORT_SYMBOL(sgx_get_page);
> 
> This is odd.  Do you really want to detect 64-bit, or CONFIG_HIGHMEM?

For 32-bit (albeit not supported at this point) it makes sense to always
use kmap_atomic_pfn() as the virtua address area is very limited.

> > +struct page *sgx_get_backing(struct file *file, pgoff_t index)
> > +{
> > +	struct inode *inode = file->f_path.dentry->d_inode;
> > +	struct address_space *mapping = inode->i_mapping;
> > +	gfp_t gfpmask = mapping_gfp_mask(mapping);
> > +
> > +	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
> > +}
> > +EXPORT_SYMBOL(sgx_get_backing);
> 
> What does shmem have to do with all this?

Backing storage is an shmem file similarly is in drm.

> > +void sgx_put_backing(struct page *backing_page, bool write)
> > +{
> > +	if (write)
> > +		set_page_dirty(backing_page);
> > +
> > +	put_page(backing_page);
> > +}
> > +EXPORT_SYMBOL(sgx_put_backing);
> 
> I'm not a big fan of stuff getting added with no apparent user and no
> explaination of what it is doing.  There's no way for me to assess
> whether this is sane or not.

I'll add the documetation.

> > +static __init int sgx_page_cache_init(void)
> > +{
> > +	struct task_struct *tsk;
> > +	unsigned long size;
> > +	unsigned int eax;
> > +	unsigned int ebx;
> > +	unsigned int ecx;
> > +	unsigned int edx;
> > +	unsigned long pa;
> > +	int i;
> > +	int ret;
> > +
> > +	for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
> > +		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC_BANKS, &eax, &ebx,
> > +			    &ecx, &edx);
> > +		if (!(eax & 0xf))
> > +			break;
> > +
> > +		pa   = ((u64)(ebx & 0xfffff) << 32) + (u64)(eax & 0xfffff000);
> > +		size = ((u64)(edx & 0xfffff) << 32) + (u64)(ecx & 0xfffff000);
> 
> Please align these like I did ^
> 
> > +		pr_info("EPC bank 0x%lx-0x%lx\n", pa, pa + size);
> > +
> > +		ret = sgx_init_epc_bank(pa, size, i, &sgx_epc_banks[i]);
> > +		if (ret) {
> > +			sgx_page_cache_teardown();
> > +			return ret;
> > +		}
> > +
> > +		sgx_nr_epc_banks++;
> > +	}
> 
> This is also rather sparsely commented.
> 
> > +static __init bool sgx_is_enabled(bool *lc_enabled)
> >  {
> >  	unsigned long fc;
> >  
> > @@ -41,12 +466,26 @@ static __init bool sgx_is_enabled(void)
> >  	if (!(fc & FEATURE_CONTROL_SGX_ENABLE))
> >  		return false;
> >  
> > +	*lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
> > +
> >  	return true;
> >  }
> 
> I'm baffled why lc_enabled is connected to the enclave page cache.

KVM works only with writable MSRs. Driver works both with writable
and read-only MSRs.

Thanks, I'll try my best to deal with all this :-)

/Jarkko
Neil Horman June 19, 2018, 3:19 p.m. UTC | #8
On Tue, Jun 19, 2018 at 05:57:53PM +0300, Jarkko Sakkinen wrote:
> On Fri, Jun 08, 2018 at 11:24:12AM -0700, Dave Hansen wrote:
> > On 06/08/2018 10:09 AM, Jarkko Sakkinen wrote:
> > > SGX has a set of data structures to maintain information about the enclaves
> > > and their security properties. BIOS reserves a fixed size region of
> > > physical memory for these structures by setting Processor Reserved Memory
> > > Range Registers (PRMRR). This memory area is called Enclave Page Cache
> > > (EPC).
> > > 
> > > This commit implements the basic routines to allocate and free pages from
> > > different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
> > > that gets woken up by sgx_alloc_page() when we run below the low watermark.
> > > The swapper thread continues swapping pages up until it reaches the high
> > > watermark.
> > 
> > Yay!  A new memory manager in arch-specific code.
> > 
> > > Each subsystem that uses SGX must provide a set of callbacks for EPC
> > > pages that are used to reclaim, block and write an EPC page. Kernel
> > > takes the responsibility of maintaining LRU cache for them.
> > 
> > What does a "subsystem that uses SGX" mean?  Do we have one of those
> > already?
> 
> Driver and KVM.
> 
> > > +struct sgx_secs {
> > > +	uint64_t size;
> > > +	uint64_t base;
> > > +	uint32_t ssaframesize;
> > > +	uint32_t miscselect;
> > > +	uint8_t reserved1[SGX_SECS_RESERVED1_SIZE];
> > > +	uint64_t attributes;
> > > +	uint64_t xfrm;
> > > +	uint32_t mrenclave[8];
> > > +	uint8_t reserved2[SGX_SECS_RESERVED2_SIZE];
> > > +	uint32_t mrsigner[8];
> > > +	uint8_t	reserved3[SGX_SECS_RESERVED3_SIZE];
> > > +	uint16_t isvvprodid;
> > > +	uint16_t isvsvn;
> > > +	uint8_t reserved4[SGX_SECS_RESERVED4_SIZE];
> > > +};
> > 
> > This is a hardware structure, right?  Doesn't it need to be packed?
> 
> Everything is aligned properly in this struct.
> 
I don't think you can guarantee that.  I understand that the reserved size is
likely computed to turn those u8's into 32/64 byte regions, but the uint16t
isvvprodid and isvsvn might get padded to 32 or 64 bytes in software dependent
on the processor you build for.

And even so, its suceptible to being
misaligned if a new version of the hardware adds or removes elements.  Adding a
packed attribute seems like a safe approach (or at least a no-op in the current
state)

Neil
Dave Hansen June 19, 2018, 3:32 p.m. UTC | #9
On 06/19/2018 07:57 AM, Jarkko Sakkinen wrote:
> On Fri, Jun 08, 2018 at 11:24:12AM -0700, Dave Hansen wrote:
>>> Each subsystem that uses SGX must provide a set of callbacks for EPC
>>> pages that are used to reclaim, block and write an EPC page. Kernel
>>> takes the responsibility of maintaining LRU cache for them.
>>
>> What does a "subsystem that uses SGX" mean?  Do we have one of those
>> already?
> 
> Driver and KVM.

Could you just say "the SGX and driver both provide a set of callbacks"?

>>> +struct sgx_secs {
>>> +	uint64_t size;
>>> +	uint64_t base;
>>> +	uint32_t ssaframesize;
>>> +	uint32_t miscselect;
>>> +	uint8_t reserved1[SGX_SECS_RESERVED1_SIZE];
>>> +	uint64_t attributes;
>>> +	uint64_t xfrm;
>>> +	uint32_t mrenclave[8];
>>> +	uint8_t reserved2[SGX_SECS_RESERVED2_SIZE];
>>> +	uint32_t mrsigner[8];
>>> +	uint8_t	reserved3[SGX_SECS_RESERVED3_SIZE];
>>> +	uint16_t isvvprodid;
>>> +	uint16_t isvsvn;
>>> +	uint8_t reserved4[SGX_SECS_RESERVED4_SIZE];
>>> +};
>>
>> This is a hardware structure, right?  Doesn't it need to be packed?
> 
> Everything is aligned properly in this struct.

The compiler doesn't guarantee the way you have it laid out.  It might
work today, but it's subject to being changed.

>>> +enum sgx_tcs_flags {
>>> +	SGX_TCS_DBGOPTIN	= 0x01, /* cleared on EADD */
>>> +};
>>> +
>>> +#define SGX_TCS_RESERVED_MASK 0xFFFFFFFFFFFFFFFEL
>>
>> Would it be possible to separate out the SGX software structures from
>> SGX hardware?  It's hard to tell them apart.
> 
> How do you draw the line in the architectural structures?

I know then when I see them.

"SGX_TCS_DBGOPTIN" - Hardware
"SGX_NR_TO_SCAN" - Software

Please at least make an effort to do this.

>>> +#define SGX_NR_TO_SCAN	16
>>> +#define SGX_NR_LOW_PAGES 32
>>> +#define SGX_NR_HIGH_PAGES 64
>>> +
>>>  bool sgx_enabled __ro_after_init = false;
>>>  EXPORT_SYMBOL(sgx_enabled);
>>> +bool sgx_lc_enabled __ro_after_init;
>>> +EXPORT_SYMBOL(sgx_lc_enabled);
>>> +atomic_t sgx_nr_free_pages = ATOMIC_INIT(0);
>>
>> Hmmm, global atomic.  Doesn't sound very scalable.
> 
> We could potentially remove this completely as banks have 'free_cnt'
> field and use the sum when needed as the value.

That seems prudent.

>>> +struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
>>> +EXPORT_SYMBOL(sgx_epc_banks);
>>> +int sgx_nr_epc_banks;
>>> +EXPORT_SYMBOL(sgx_nr_epc_banks);
>>> +LIST_HEAD(sgx_active_page_list);
>>> +EXPORT_SYMBOL(sgx_active_page_list);
>>> +DEFINE_SPINLOCK(sgx_active_page_list_lock);
>>> +EXPORT_SYMBOL(sgx_active_page_list_lock);
>>
>> Hmmm, global spinlock protecting a page allocator linked list.  Sounds
>> even worse than at atomic.
>>
>> Why is this OK?
> 
> Any suggestions what would be a better place in order to make a
> fine grained granularity?

The bank seems a logical place.  Or, create a structure that actually
hangs off NUMA nodes.

BTW, do we *have* locality information for SGX banks?
>>> +/**
>>> + * sgx_try_alloc_page - try to allocate an EPC page
>>> + * @impl:	implementation for the struct sgx_epc_page
>>> + *
>>> + * Try to grab a page from the free EPC page list. If there is a free page
>>> + * available, it is returned to the caller.
>>> + *
>>> + * Return:
>>> + *   a &struct sgx_epc_page instace,
>>> + *   NULL otherwise
>>> + */
>>> +struct sgx_epc_page *sgx_try_alloc_page(struct sgx_epc_page_impl *impl)
>>> +{
>>> +	struct sgx_epc_bank *bank;
>>> +	struct sgx_epc_page *page = NULL;
>>> +	int i;
>>> +
>>> +	for (i = 0; i < sgx_nr_epc_banks; i++) {
>>> +		bank = &sgx_epc_banks[i];
>>
>> What's a bank?  How many banks does a system have?
> 
> AFAIK, UMA systems have one bank. NUMA have multiple. It is a physical
> memory region reserved for enclave pages.

That's great text to include near the structure definition for
sgx_epc_bank.

>>> +		down_write(&bank->lock);
>>> +
>>> +		if (atomic_read(&bank->free_cnt))
>>> +			page = bank->pages[atomic_dec_return(&bank->free_cnt)];
>>
>> Why is a semaphore getting used here?  I don't see any sleeping or
>> anything happening under this lock.
> 
> Should be changed to reader-writer spinlock, thanks.

Which also reminds me...  It would be nice to explicitly call out why
you need an atomic_t inside a lock-protected structure.

>>> +	}
>>> +
>>> +	if (atomic_read(&sgx_nr_free_pages) < SGX_NR_LOW_PAGES)
>>> +		wake_up(&ksgxswapd_waitq);
>>> +
>>> +	return entry;
>>> +}
>>> +EXPORT_SYMBOL(sgx_alloc_page);
>>
>> Why aren't these _GPL exports?
> 
> Source files a dual licensed.

Sounds like a great thing to ask your licensing or legal team about.

>>> +/**
>>> + * sgx_free_page - free an EPC page
>>> + *
>>> + * @page:	any EPC page
>>> + *
>>> + * Remove an EPC page and insert it back to the list of free pages.
>>> + *
>>> + * Return: SGX error code
>>> + */
>>> +int sgx_free_page(struct sgx_epc_page *page)
>>> +{
>>> +	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
>>> +	int ret;
>>> +
>>> +	ret = sgx_eremove(page);
>>> +	if (ret) {
>>> +		pr_debug("EREMOVE returned %d\n", ret);
>>> +		return ret;
>>> +	}
>>> +
>>> +	down_read(&bank->lock);
>>> +	bank->pages[atomic_inc_return(&bank->free_cnt) - 1] = page;
>>> +	atomic_inc(&sgx_nr_free_pages);
>>> +	up_read(&bank->lock);
>>> +
>>> +	return 0;
>>> +}
>>
>> bank->lock confuses me.  This seems to be writing to a bank, but only
>> needs a read lock.  Why?
> 
> It could be either way around:
> 
> 1. Allow multiple threads that free a page to access the array.
> 2. Allow multiple threads that alloc a page to access the array.

Whatever way you choose, please document the locking scheme.

>>> +/**
>>> + * sgx_get_page - pin an EPC page
>>> + * @page:	an EPC page
>>> + *
>>> + * Return: a pointer to the pinned EPC page
>>> + */
>>> +void *sgx_get_page(struct sgx_epc_page *page)
>>> +{
>>> +	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
>>> +
>>> +	if (IS_ENABLED(CONFIG_X86_64))
>>> +		return (void *)(bank->va + SGX_EPC_ADDR(page) - bank->pa);
>>> +
>>> +	return kmap_atomic_pfn(SGX_EPC_PFN(page));
>>> +}
>>> +EXPORT_SYMBOL(sgx_get_page);
>>
>> This is odd.  Do you really want to detect 64-bit, or CONFIG_HIGHMEM?
> 
> For 32-bit (albeit not supported at this point) it makes sense to always
> use kmap_atomic_pfn() as the virtua address area is very limited.

That makes no sense.  32-bit kernels have plenty of virtual address
space if not using highmem.

>>> +struct page *sgx_get_backing(struct file *file, pgoff_t index)
>>> +{
>>> +	struct inode *inode = file->f_path.dentry->d_inode;
>>> +	struct address_space *mapping = inode->i_mapping;
>>> +	gfp_t gfpmask = mapping_gfp_mask(mapping);
>>> +
>>> +	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
>>> +}
>>> +EXPORT_SYMBOL(sgx_get_backing);
>>
>> What does shmem have to do with all this?
> 
> Backing storage is an shmem file similarly is in drm.

That's something good to call out in the changelog: how shmem gets used
here.

>>> +static __init bool sgx_is_enabled(bool *lc_enabled)
>>>  {
>>>  	unsigned long fc;
>>>  
>>> @@ -41,12 +466,26 @@ static __init bool sgx_is_enabled(void)
>>>  	if (!(fc & FEATURE_CONTROL_SGX_ENABLE))
>>>  		return false;
>>>  
>>> +	*lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
>>> +
>>>  	return true;
>>>  }
>>
>> I'm baffled why lc_enabled is connected to the enclave page cache.
> 
> KVM works only with writable MSRs. Driver works both with writable
> and read-only MSRs.

Could you help with my confusion by documenting this a bit?
Jethro Beekman June 19, 2018, 3:44 p.m. UTC | #10
On 2018-06-19 07:08, Jarkko Sakkinen wrote:
> On Fri, Jun 08, 2018 at 11:21:48AM -0700, Jethro Beekman wrote:
>> On 2018-06-08 10:09, Jarkko Sakkinen wrote:
>>> +/*
>>> + * Writing the LE hash MSRs is extraordinarily expensive, e.g.
>>> + * 3-4x slower than normal MSRs, so we use a per-cpu cache to
>>> + * track the last known value of the MSRs to avoid unnecessarily
>>> + * writing the MSRs with the current value.  Because most Linux
>>> + * kernels will use an LE that is signed with a non-Intel key,
>>
>> I don't think you can predict what most Linux kernels will be doing. I think
>> not initializing the cache to the CPU's initial value is fine, but this
>> particular argument shouldn't appear in the rationale.
> 
> Are you just referring to the last sentence or the whole paragraph?

Just the last sentence.

> /Jarkko
Jethro
Sean Christopherson June 19, 2018, 3:59 p.m. UTC | #11
On Tue, Jun 19, 2018 at 05:57:53PM +0300, Jarkko Sakkinen wrote:
> On Fri, Jun 08, 2018 at 11:24:12AM -0700, Dave Hansen wrote:
> > On 06/08/2018 10:09 AM, Jarkko Sakkinen wrote:
> > > +static __init bool sgx_is_enabled(bool *lc_enabled)
> > >  {
> > >  	unsigned long fc;
> > >  
> > > @@ -41,12 +466,26 @@ static __init bool sgx_is_enabled(void)
> > >  	if (!(fc & FEATURE_CONTROL_SGX_ENABLE))
> > >  		return false;
> > >  
> > > +	*lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
> > > +
> > >  	return true;
> > >  }
> > 
> > I'm baffled why lc_enabled is connected to the enclave page cache.
> 
> KVM works only with writable MSRs. Driver works both with writable
> and read-only MSRs.

That's not true, KVM can/will support SGX regardless of whether or not
Launch Control (LC) is available and/or enabled.  KVM does need to
know whether or not LC is enabled.

Back to Dave's question, LC isn't connected to the EPC management,
the LC code should be split into a separate patch.
 
> Thanks, I'll try my best to deal with all this :-)
> 
> /Jarkko
Sean Christopherson June 20, 2018, 1:21 p.m. UTC | #12
On Fri, 2018-06-08 at 19:09 +0200, Jarkko Sakkinen wrote:
> SGX has a set of data structures to maintain information about the enclaves
> and their security properties. BIOS reserves a fixed size region of
> physical memory for these structures by setting Processor Reserved Memory
> Range Registers (PRMRR). This memory area is called Enclave Page Cache
> (EPC).
> 
> This commit implements the basic routines to allocate and free pages from
> different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
> that gets woken up by sgx_alloc_page() when we run below the low watermark.
> The swapper thread continues swapping pages up until it reaches the high
> watermark.
> 
> Each subsystem that uses SGX must provide a set of callbacks for EPC
> pages that are used to reclaim, block and write an EPC page. Kernel
> takes the responsibility of maintaining LRU cache for them.
> 
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---
>  arch/x86/include/asm/sgx.h      |  67 +++++
>  arch/x86/include/asm/sgx_arch.h | 224 ++++++++++++++++
>  arch/x86/kernel/cpu/intel_sgx.c | 443 +++++++++++++++++++++++++++++++-
>  3 files changed, 732 insertions(+), 2 deletions(-)
>  create mode 100644 arch/x86/include/asm/sgx_arch.h

...

> +struct sgx_pcmd {
> +	struct sgx_secinfo secinfo;
> +	uint64_t enclave_id;
> +	uint8_t reserved[40];
> +	uint8_t mac[16];
> +};

sgx_pcmd has a 128-byte alignment requirement.  I think it's
worth specifying here as sgx_pcmd is small enough that it could
be put on the stack, e.g. by KVM when trapping and executing
ELD* on behalf of a guest VM.

In fact, it probably makes sense to add alightment attributes
to all SGX structs for self-documentation purposes, even though
many of them will never be allocated statically or on the stack.

> +
> +#define SGX_MODULUS_SIZE 384
> +
> +struct sgx_sigstruct_header {
> +	uint64_t header1[2];
> +	uint32_t vendor;
> +	uint32_t date;
> +	uint64_t header2[2];
> +	uint32_t swdefined;
> +	uint8_t reserved1[84];
> +};
> +
> +struct sgx_sigstruct_body {
> +	uint32_t miscselect;
> +	uint32_t miscmask;
> +	uint8_t reserved2[20];
> +	uint64_t attributes;
> +	uint64_t xfrm;
> +	uint8_t attributemask[16];
> +	uint8_t mrenclave[32];
> +	uint8_t reserved3[32];
> +	uint16_t isvprodid;
> +	uint16_t isvsvn;
> +} __attribute__((__packed__));
> +
> +struct sgx_sigstruct {
> +	struct sgx_sigstruct_header header;
> +	uint8_t modulus[SGX_MODULUS_SIZE];
> +	uint32_t exponent;
> +	uint8_t signature[SGX_MODULUS_SIZE];
> +	struct sgx_sigstruct_body body;
> +	uint8_t reserved4[12];
> +	uint8_t q1[SGX_MODULUS_SIZE];
> +	uint8_t q2[SGX_MODULUS_SIZE];
> +};
> +
> +struct sgx_sigstruct_payload {
> +	struct sgx_sigstruct_header header;
> +	struct sgx_sigstruct_body body;
> +};
> +
> +struct sgx_einittoken_payload {
> +	uint32_t valid;
> +	uint32_t reserved1[11];
> +	uint64_t attributes;
> +	uint64_t xfrm;
> +	uint8_t mrenclave[32];
> +	uint8_t reserved2[32];
> +	uint8_t mrsigner[32];
> +	uint8_t reserved3[32];
> +};
> +
> +struct sgx_einittoken {
> +	struct sgx_einittoken_payload payload;
> +	uint8_t cpusvnle[16];
> +	uint16_t isvprodidle;
> +	uint16_t isvsvnle;
> +	uint8_t reserved2[24];
> +	uint32_t maskedmiscselectle;
> +	uint64_t maskedattributesle;
> +	uint64_t maskedxfrmle;
> +	uint8_t keyid[32];
> +	uint8_t mac[16];
> +};
> +
> +struct sgx_report {
> +	uint8_t cpusvn[16];
> +	uint32_t miscselect;
> +	uint8_t reserved1[28];
> +	uint64_t attributes;
> +	uint64_t xfrm;
> +	uint8_t mrenclave[32];
> +	uint8_t reserved2[32];
> +	uint8_t mrsigner[32];
> +	uint8_t reserved3[96];
> +	uint16_t isvprodid;
> +	uint16_t isvsvn;
> +	uint8_t reserved4[60];
> +	uint8_t reportdata[64];
> +	uint8_t keyid[32];
> +	uint8_t mac[16];
> +};
> +
> +struct sgx_targetinfo {
> +	uint8_t mrenclave[32];
> +	uint64_t attributes;
> +	uint64_t xfrm;
> +	uint8_t reserved1[4];
> +	uint32_t miscselect;
> +	uint8_t reserved2[456];
> +};
> +
> +struct sgx_keyrequest {
> +	uint16_t keyname;
> +	uint16_t keypolicy;
> +	uint16_t isvsvn;
> +	uint16_t reserved1;
> +	uint8_t cpusvn[16];
> +	uint64_t attributemask;
> +	uint64_t xfrmmask;
> +	uint8_t keyid[32];
> +	uint32_t miscmask;
> +	uint8_t reserved2[436];
> +};
Sean Christopherson June 20, 2018, 3:26 p.m. UTC | #13
On Fri, 2018-06-08 at 19:09 +0200, Jarkko Sakkinen wrote:
> SGX has a set of data structures to maintain information about the enclaves
> and their security properties. BIOS reserves a fixed size region of
> physical memory for these structures by setting Processor Reserved Memory
> Range Registers (PRMRR). This memory area is called Enclave Page Cache
> (EPC).
> 
> This commit implements the basic routines to allocate and free pages from
> different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
> that gets woken up by sgx_alloc_page() when we run below the low watermark.
> The swapper thread continues swapping pages up until it reaches the high
> watermark.
> 
> Each subsystem that uses SGX must provide a set of callbacks for EPC
> pages that are used to reclaim, block and write an EPC page. Kernel
> takes the responsibility of maintaining LRU cache for them.
> 
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---
>  arch/x86/include/asm/sgx.h      |  67 +++++
>  arch/x86/include/asm/sgx_arch.h | 224 ++++++++++++++++
>  arch/x86/kernel/cpu/intel_sgx.c | 443 +++++++++++++++++++++++++++++++-
>  3 files changed, 732 insertions(+), 2 deletions(-)
>  create mode 100644 arch/x86/include/asm/sgx_arch.h

...

> diff --git a/arch/x86/kernel/cpu/intel_sgx.c b/arch/x86/kernel/cpu/intel_sgx.c
> index db6b315334f4..ae2b5c5b455f 100644
> --- a/arch/x86/kernel/cpu/intel_sgx.c
> +++ b/arch/x86/kernel/cpu/intel_sgx.c
> @@ -14,14 +14,439 @@
>  #include <linux/freezer.h>
>  #include <linux/highmem.h>
>  #include <linux/kthread.h>
> +#include <linux/pagemap.h>
>  #include <linux/ratelimit.h>
>  #include <linux/sched/signal.h>
> +#include <linux/shmem_fs.h>
>  #include <linux/slab.h>
>  
> +#define SGX_NR_TO_SCAN	16
> +#define SGX_NR_LOW_PAGES 32
> +#define SGX_NR_HIGH_PAGES 64
> +
>  bool sgx_enabled __ro_after_init = false;
>  EXPORT_SYMBOL(sgx_enabled);
> +bool sgx_lc_enabled __ro_after_init;
> +EXPORT_SYMBOL(sgx_lc_enabled);
> +atomic_t sgx_nr_free_pages = ATOMIC_INIT(0);
> +EXPORT_SYMBOL(sgx_nr_free_pages);
> +struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
> +EXPORT_SYMBOL(sgx_epc_banks);
> +int sgx_nr_epc_banks;
> +EXPORT_SYMBOL(sgx_nr_epc_banks);
> +LIST_HEAD(sgx_active_page_list);
> +EXPORT_SYMBOL(sgx_active_page_list);
> +DEFINE_SPINLOCK(sgx_active_page_list_lock);
> +EXPORT_SYMBOL(sgx_active_page_list_lock);

I don't think we should be exporting anything other than sgx_enabled
and sgx_lc_enabled.  The only external use of a symbol that can't be
trivially (re)moved is in the driver's sgx_pm_suspend() in sgx_main.c,
which uses the sgx_active_page_list to invalidate enclaves.  And that
behavior seems unsafe, e.g. an enclave could theoretically have zero
pages on the active list and so could be missed in the suspend flow.

> +static struct task_struct *ksgxswapd_tsk;
> +static DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
Jarkko Sakkinen June 25, 2018, 7:36 a.m. UTC | #14
On Mon, 2018-06-18 at 14:33 -0700, Andy Lutomirski wrote:
> When KVM host support goes in, even this won't be good enough if we
> want to allow passthrough access to the MSRs because we will no longer
> be able to guarantee that all zeros is invalid.  Instead we'd need an
> actual flag saying that the cache is invalid.

I'm not sure if I understood this part. If it was pass-through, and
there was a flag, how that flag in the host would get updated?

/Jarkko
Jarkko Sakkinen June 25, 2018, 9:01 a.m. UTC | #15
On Tue, 2018-06-19 at 08:32 -0700, Dave Hansen wrote:
> > For 32-bit (albeit not supported at this point) it makes sense to always
> > use kmap_atomic_pfn() as the virtua address area is very limited.
> 
> That makes no sense.  32-bit kernels have plenty of virtual address
> space if not using highmem.

AFAIK the vmalloc area is located in the last 128MB that and ioremap()
of EPC would consume a big portion of that.

> > KVM works only with writable MSRs. Driver works both with writable
> > and read-only MSRs.
> 
> Could you help with my confusion by documenting this a bit?

Sure.

/Jarkko
Jarkko Sakkinen June 25, 2018, 9:14 a.m. UTC | #16
On Tue, 2018-06-19 at 08:59 -0700, Sean Christopherson wrote:
> On Tue, Jun 19, 2018 at 05:57:53PM +0300, Jarkko Sakkinen wrote:
> > On Fri, Jun 08, 2018 at 11:24:12AM -0700, Dave Hansen wrote:
> > > On 06/08/2018 10:09 AM, Jarkko Sakkinen wrote:
> > > > +static __init bool sgx_is_enabled(bool *lc_enabled)
> > > >  {
> > > >  	unsigned long fc;
> > > >  
> > > > @@ -41,12 +466,26 @@ static __init bool sgx_is_enabled(void)
> > > >  	if (!(fc & FEATURE_CONTROL_SGX_ENABLE))
> > > >  		return false;
> > > >  
> > > > +	*lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
> > > > +
> > > >  	return true;
> > > >  }
> > > 
> > > I'm baffled why lc_enabled is connected to the enclave page cache.
> > 
> > KVM works only with writable MSRs. Driver works both with writable
> > and read-only MSRs.
> 
> That's not true, KVM can/will support SGX regardless of whether or not
> Launch Control (LC) is available and/or enabled.  KVM does need to
> know whether or not LC is enabled.

I understood from earlier discussion with talks that this was
additional requirement for KVM but please shed some light.

Could this be dropped altogether from this patch set and then brought up
in the KVM patch set? Right now it is unused.

/Jarkko
Jarkko Sakkinen June 25, 2018, 9:21 a.m. UTC | #17
On Wed, 2018-06-20 at 06:21 -0700, Sean Christopherson wrote:
> On Fri, 2018-06-08 at 19:09 +0200, Jarkko Sakkinen wrote:
> > SGX has a set of data structures to maintain information about the enclaves
> > and their security properties. BIOS reserves a fixed size region of
> > physical memory for these structures by setting Processor Reserved Memory
> > Range Registers (PRMRR). This memory area is called Enclave Page Cache
> > (EPC).
> > 
> > This commit implements the basic routines to allocate and free pages from
> > different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
> > that gets woken up by sgx_alloc_page() when we run below the low watermark.
> > The swapper thread continues swapping pages up until it reaches the high
> > watermark.
> > 
> > Each subsystem that uses SGX must provide a set of callbacks for EPC
> > pages that are used to reclaim, block and write an EPC page. Kernel
> > takes the responsibility of maintaining LRU cache for them.
> > 
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > ---
> >  arch/x86/include/asm/sgx.h      |  67 +++++
> >  arch/x86/include/asm/sgx_arch.h | 224 ++++++++++++++++
> >  arch/x86/kernel/cpu/intel_sgx.c | 443 +++++++++++++++++++++++++++++++-
> >  3 files changed, 732 insertions(+), 2 deletions(-)
> >  create mode 100644 arch/x86/include/asm/sgx_arch.h
> 
> ...
> 
> > +struct sgx_pcmd {
> > +	struct sgx_secinfo secinfo;
> > +	uint64_t enclave_id;
> > +	uint8_t reserved[40];
> > +	uint8_t mac[16];
> > +};
> 
> sgx_pcmd has a 128-byte alignment requirement.  I think it's
> worth specifying here as sgx_pcmd is small enough that it could
> be put on the stack, e.g. by KVM when trapping and executing
> ELD* on behalf of a guest VM.
> 
> In fact, it probably makes sense to add alightment attributes
> to all SGX structs for self-documentation purposes, even though
> many of them will never be allocated statically or on the stack.

I agree with this. It also documents stuff so that you don't have
to look it up from the SDM.

Neil: this should also clear your concerns.

/Jarkko
Jarkko Sakkinen June 25, 2018, 9:21 a.m. UTC | #18
On Wed, 2018-06-20 at 08:26 -0700, Sean Christopherson wrote:
> >  bool sgx_enabled __ro_after_init = false;
> >  EXPORT_SYMBOL(sgx_enabled);
> > +bool sgx_lc_enabled __ro_after_init;
> > +EXPORT_SYMBOL(sgx_lc_enabled);
> > +atomic_t sgx_nr_free_pages = ATOMIC_INIT(0);
> > +EXPORT_SYMBOL(sgx_nr_free_pages);
> > +struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
> > +EXPORT_SYMBOL(sgx_epc_banks);
> > +int sgx_nr_epc_banks;
> > +EXPORT_SYMBOL(sgx_nr_epc_banks);
> > +LIST_HEAD(sgx_active_page_list);
> > +EXPORT_SYMBOL(sgx_active_page_list);
> > +DEFINE_SPINLOCK(sgx_active_page_list_lock);
> > +EXPORT_SYMBOL(sgx_active_page_list_lock);
> 
> I don't think we should be exporting anything other than sgx_enabled
> and sgx_lc_enabled.  The only external use of a symbol that can't be
> trivially (re)moved is in the driver's sgx_pm_suspend() in sgx_main.c,
> which uses the sgx_active_page_list to invalidate enclaves.  And that
> behavior seems unsafe, e.g. an enclave could theoretically have zero
> pages on the active list and so could be missed in the suspend flow.

Fully agreed. Just forgotten cruft...

/Jarkko
Neil Horman June 25, 2018, 4:14 p.m. UTC | #19
On Mon, Jun 25, 2018 at 12:21:22PM +0300, Jarkko Sakkinen wrote:
> On Wed, 2018-06-20 at 06:21 -0700, Sean Christopherson wrote:
> > On Fri, 2018-06-08 at 19:09 +0200, Jarkko Sakkinen wrote:
> > > SGX has a set of data structures to maintain information about the enclaves
> > > and their security properties. BIOS reserves a fixed size region of
> > > physical memory for these structures by setting Processor Reserved Memory
> > > Range Registers (PRMRR). This memory area is called Enclave Page Cache
> > > (EPC).
> > > 
> > > This commit implements the basic routines to allocate and free pages from
> > > different EPC banks. There is also a swapper thread ksgxswapd for EPC pages
> > > that gets woken up by sgx_alloc_page() when we run below the low watermark.
> > > The swapper thread continues swapping pages up until it reaches the high
> > > watermark.
> > > 
> > > Each subsystem that uses SGX must provide a set of callbacks for EPC
> > > pages that are used to reclaim, block and write an EPC page. Kernel
> > > takes the responsibility of maintaining LRU cache for them.
> > > 
> > > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > > ---
> > >  arch/x86/include/asm/sgx.h      |  67 +++++
> > >  arch/x86/include/asm/sgx_arch.h | 224 ++++++++++++++++
> > >  arch/x86/kernel/cpu/intel_sgx.c | 443 +++++++++++++++++++++++++++++++-
> > >  3 files changed, 732 insertions(+), 2 deletions(-)
> > >  create mode 100644 arch/x86/include/asm/sgx_arch.h
> > 
> > ...
> > 
> > > +struct sgx_pcmd {
> > > +	struct sgx_secinfo secinfo;
> > > +	uint64_t enclave_id;
> > > +	uint8_t reserved[40];
> > > +	uint8_t mac[16];
> > > +};
> > 
> > sgx_pcmd has a 128-byte alignment requirement.  I think it's
> > worth specifying here as sgx_pcmd is small enough that it could
> > be put on the stack, e.g. by KVM when trapping and executing
> > ELD* on behalf of a guest VM.
> > 
> > In fact, it probably makes sense to add alightment attributes
> > to all SGX structs for self-documentation purposes, even though
> > many of them will never be allocated statically or on the stack.
> 
> I agree with this. It also documents stuff so that you don't have
> to look it up from the SDM.
> 
> Neil: this should also clear your concerns.
> 
Agreed
Neil

> /Jarkko
diff mbox

Patch

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index a2f727f85b91..ae738e16ba6c 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -14,6 +14,7 @@ 
 #include <asm/asm.h>
 #include <linux/bitops.h>
 #include <linux/err.h>
+#include <linux/rwsem.h>
 #include <linux/types.h>
 
 #define SGX_CPUID 0x12
@@ -193,7 +194,73 @@  static inline int __emodt(struct sgx_secinfo *secinfo, void *epc)
 	return __encls_ret_2(EMODT, secinfo, epc);
 }
 
+#define SGX_MAX_EPC_BANKS 8
+
+#define SGX_EPC_BANK(epc_page) \
+	(&sgx_epc_banks[(unsigned long)(epc_page->desc) & ~PAGE_MASK])
+#define SGX_EPC_PFN(epc_page) PFN_DOWN((unsigned long)(epc_page->desc))
+#define SGX_EPC_ADDR(epc_page) ((unsigned long)(epc_page->desc) & PAGE_MASK)
+
+struct sgx_epc_page;
+
+struct sgx_epc_page_ops {
+	bool (*get)(struct sgx_epc_page *epc_page);
+	void (*put)(struct sgx_epc_page *epc_page);
+	bool (*reclaim)(struct sgx_epc_page *epc_page);
+	void (*block)(struct sgx_epc_page *epc_page);
+	void (*write)(struct sgx_epc_page *epc_page);
+};
+
+struct sgx_epc_page_impl {
+	const struct sgx_epc_page_ops *ops;
+};
+
+struct sgx_epc_page {
+	unsigned long desc;
+	struct sgx_epc_page_impl *impl;
+	struct list_head list;
+};
+
+struct sgx_epc_bank {
+	unsigned long pa;
+	unsigned long va;
+	unsigned long size;
+	struct sgx_epc_page *pages_data;
+	struct sgx_epc_page **pages;
+	atomic_t free_cnt;
+	struct rw_semaphore lock;
+};
+
 extern bool sgx_enabled;
+extern bool sgx_lc_enabled;
+extern atomic_t sgx_nr_free_pages;
+extern struct sgx_epc_bank sgx_epc_banks[];
+extern int sgx_nr_epc_banks;
+extern struct list_head sgx_active_page_list;
+extern struct spinlock sgx_active_page_list_lock;
+
+enum sgx_alloc_flags {
+	SGX_ALLOC_ATOMIC	= BIT(0),
+};
+
+struct sgx_epc_page *sgx_try_alloc_page(struct sgx_epc_page_impl *impl);
+struct sgx_epc_page *sgx_alloc_page(struct sgx_epc_page_impl *impl,
+				    unsigned int flags);
+int sgx_free_page(struct sgx_epc_page *page);
+void *sgx_get_page(struct sgx_epc_page *ptr);
+void sgx_put_page(void *epc_page_ptr);
+struct page *sgx_get_backing(struct file *file, pgoff_t index);
+void sgx_put_backing(struct page *backing_page, bool write);
+int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
+	      struct sgx_epc_page *secs_page, u64 le_pubkey_hash[4]);
+
+struct sgx_launch_request {
+	u8 mrenclave[32];
+	u8 mrsigner[32];
+	uint64_t attributes;
+	uint64_t xfrm;
+	struct sgx_einittoken token;
+};
 
 #define SGX_FN(name, params...)		\
 {					\
diff --git a/arch/x86/include/asm/sgx_arch.h b/arch/x86/include/asm/sgx_arch.h
new file mode 100644
index 000000000000..eb64bd92f84e
--- /dev/null
+++ b/arch/x86/include/asm/sgx_arch.h
@@ -0,0 +1,224 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-17 Intel Corporation.
+//
+// Authors:
+//
+// Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
+// Suresh Siddha <suresh.b.siddha@intel.com>
+
+#ifndef _ASM_X86_SGX_ARCH_H
+#define _ASM_X86_SGX_ARCH_H
+
+#include <linux/types.h>
+
+#define SGX_SSA_GPRS_SIZE		182
+#define SGX_SSA_MISC_EXINFO_SIZE	16
+
+enum sgx_misc {
+	SGX_MISC_EXINFO		= 0x01,
+};
+
+#define SGX_MISC_RESERVED_MASK 0xFFFFFFFFFFFFFFFEL
+
+enum sgx_attribute {
+	SGX_ATTR_DEBUG		= 0x02,
+	SGX_ATTR_MODE64BIT	= 0x04,
+	SGX_ATTR_PROVISIONKEY	= 0x10,
+	SGX_ATTR_EINITTOKENKEY	= 0x20,
+};
+
+#define SGX_ATTR_RESERVED_MASK 0xFFFFFFFFFFFFFFC9L
+
+#define SGX_SECS_RESERVED1_SIZE 24
+#define SGX_SECS_RESERVED2_SIZE 32
+#define SGX_SECS_RESERVED3_SIZE 96
+#define SGX_SECS_RESERVED4_SIZE 3836
+
+struct sgx_secs {
+	uint64_t size;
+	uint64_t base;
+	uint32_t ssaframesize;
+	uint32_t miscselect;
+	uint8_t reserved1[SGX_SECS_RESERVED1_SIZE];
+	uint64_t attributes;
+	uint64_t xfrm;
+	uint32_t mrenclave[8];
+	uint8_t reserved2[SGX_SECS_RESERVED2_SIZE];
+	uint32_t mrsigner[8];
+	uint8_t	reserved3[SGX_SECS_RESERVED3_SIZE];
+	uint16_t isvvprodid;
+	uint16_t isvsvn;
+	uint8_t reserved4[SGX_SECS_RESERVED4_SIZE];
+};
+
+enum sgx_tcs_flags {
+	SGX_TCS_DBGOPTIN	= 0x01, /* cleared on EADD */
+};
+
+#define SGX_TCS_RESERVED_MASK 0xFFFFFFFFFFFFFFFEL
+
+struct sgx_tcs {
+	uint64_t state;
+	uint64_t flags;
+	uint64_t ossa;
+	uint32_t cssa;
+	uint32_t nssa;
+	uint64_t oentry;
+	uint64_t aep;
+	uint64_t ofsbase;
+	uint64_t ogsbase;
+	uint32_t fslimit;
+	uint32_t gslimit;
+	uint64_t reserved[503];
+};
+
+struct sgx_pageinfo {
+	uint64_t linaddr;
+	uint64_t srcpge;
+	union {
+		uint64_t secinfo;
+		uint64_t pcmd;
+	};
+	uint64_t secs;
+} __attribute__((aligned(32)));
+
+
+#define SGX_SECINFO_PERMISSION_MASK	0x0000000000000007L
+#define SGX_SECINFO_PAGE_TYPE_MASK	0x000000000000FF00L
+#define SGX_SECINFO_RESERVED_MASK	0xFFFFFFFFFFFF00F8L
+
+enum sgx_page_type {
+	SGX_PAGE_TYPE_SECS	= 0x00,
+	SGX_PAGE_TYPE_TCS	= 0x01,
+	SGX_PAGE_TYPE_REG	= 0x02,
+	SGX_PAGE_TYPE_VA	= 0x03,
+	SGX_PAGE_TYPE_TRIM	= 0x04,
+};
+
+enum sgx_secinfo_flags {
+	SGX_SECINFO_R		= 0x01,
+	SGX_SECINFO_W		= 0x02,
+	SGX_SECINFO_X		= 0x04,
+	SGX_SECINFO_SECS	= (SGX_PAGE_TYPE_SECS << 8),
+	SGX_SECINFO_TCS		= (SGX_PAGE_TYPE_TCS << 8),
+	SGX_SECINFO_REG		= (SGX_PAGE_TYPE_REG << 8),
+	SGX_SECINFO_TRIM	= (SGX_PAGE_TYPE_TRIM << 8),
+};
+
+struct sgx_secinfo {
+	uint64_t flags;
+	uint64_t reserved[7];
+} __attribute__((aligned(64)));
+
+struct sgx_pcmd {
+	struct sgx_secinfo secinfo;
+	uint64_t enclave_id;
+	uint8_t reserved[40];
+	uint8_t mac[16];
+};
+
+#define SGX_MODULUS_SIZE 384
+
+struct sgx_sigstruct_header {
+	uint64_t header1[2];
+	uint32_t vendor;
+	uint32_t date;
+	uint64_t header2[2];
+	uint32_t swdefined;
+	uint8_t reserved1[84];
+};
+
+struct sgx_sigstruct_body {
+	uint32_t miscselect;
+	uint32_t miscmask;
+	uint8_t reserved2[20];
+	uint64_t attributes;
+	uint64_t xfrm;
+	uint8_t attributemask[16];
+	uint8_t mrenclave[32];
+	uint8_t reserved3[32];
+	uint16_t isvprodid;
+	uint16_t isvsvn;
+} __attribute__((__packed__));
+
+struct sgx_sigstruct {
+	struct sgx_sigstruct_header header;
+	uint8_t modulus[SGX_MODULUS_SIZE];
+	uint32_t exponent;
+	uint8_t signature[SGX_MODULUS_SIZE];
+	struct sgx_sigstruct_body body;
+	uint8_t reserved4[12];
+	uint8_t q1[SGX_MODULUS_SIZE];
+	uint8_t q2[SGX_MODULUS_SIZE];
+};
+
+struct sgx_sigstruct_payload {
+	struct sgx_sigstruct_header header;
+	struct sgx_sigstruct_body body;
+};
+
+struct sgx_einittoken_payload {
+	uint32_t valid;
+	uint32_t reserved1[11];
+	uint64_t attributes;
+	uint64_t xfrm;
+	uint8_t mrenclave[32];
+	uint8_t reserved2[32];
+	uint8_t mrsigner[32];
+	uint8_t reserved3[32];
+};
+
+struct sgx_einittoken {
+	struct sgx_einittoken_payload payload;
+	uint8_t cpusvnle[16];
+	uint16_t isvprodidle;
+	uint16_t isvsvnle;
+	uint8_t reserved2[24];
+	uint32_t maskedmiscselectle;
+	uint64_t maskedattributesle;
+	uint64_t maskedxfrmle;
+	uint8_t keyid[32];
+	uint8_t mac[16];
+};
+
+struct sgx_report {
+	uint8_t cpusvn[16];
+	uint32_t miscselect;
+	uint8_t reserved1[28];
+	uint64_t attributes;
+	uint64_t xfrm;
+	uint8_t mrenclave[32];
+	uint8_t reserved2[32];
+	uint8_t mrsigner[32];
+	uint8_t reserved3[96];
+	uint16_t isvprodid;
+	uint16_t isvsvn;
+	uint8_t reserved4[60];
+	uint8_t reportdata[64];
+	uint8_t keyid[32];
+	uint8_t mac[16];
+};
+
+struct sgx_targetinfo {
+	uint8_t mrenclave[32];
+	uint64_t attributes;
+	uint64_t xfrm;
+	uint8_t reserved1[4];
+	uint32_t miscselect;
+	uint8_t reserved2[456];
+};
+
+struct sgx_keyrequest {
+	uint16_t keyname;
+	uint16_t keypolicy;
+	uint16_t isvsvn;
+	uint16_t reserved1;
+	uint8_t cpusvn[16];
+	uint64_t attributemask;
+	uint64_t xfrmmask;
+	uint8_t keyid[32];
+	uint32_t miscmask;
+	uint8_t reserved2[436];
+};
+
+#endif /* _ASM_X86_SGX_ARCH_H */
diff --git a/arch/x86/kernel/cpu/intel_sgx.c b/arch/x86/kernel/cpu/intel_sgx.c
index db6b315334f4..ae2b5c5b455f 100644
--- a/arch/x86/kernel/cpu/intel_sgx.c
+++ b/arch/x86/kernel/cpu/intel_sgx.c
@@ -14,14 +14,439 @@ 
 #include <linux/freezer.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/pagemap.h>
 #include <linux/ratelimit.h>
 #include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 
+#define SGX_NR_TO_SCAN	16
+#define SGX_NR_LOW_PAGES 32
+#define SGX_NR_HIGH_PAGES 64
+
 bool sgx_enabled __ro_after_init = false;
 EXPORT_SYMBOL(sgx_enabled);
+bool sgx_lc_enabled __ro_after_init;
+EXPORT_SYMBOL(sgx_lc_enabled);
+atomic_t sgx_nr_free_pages = ATOMIC_INIT(0);
+EXPORT_SYMBOL(sgx_nr_free_pages);
+struct sgx_epc_bank sgx_epc_banks[SGX_MAX_EPC_BANKS];
+EXPORT_SYMBOL(sgx_epc_banks);
+int sgx_nr_epc_banks;
+EXPORT_SYMBOL(sgx_nr_epc_banks);
+LIST_HEAD(sgx_active_page_list);
+EXPORT_SYMBOL(sgx_active_page_list);
+DEFINE_SPINLOCK(sgx_active_page_list_lock);
+EXPORT_SYMBOL(sgx_active_page_list_lock);
+
+static struct task_struct *ksgxswapd_tsk;
+static DECLARE_WAIT_QUEUE_HEAD(ksgxswapd_waitq);
+
+/*
+ * Writing the LE hash MSRs is extraordinarily expensive, e.g.
+ * 3-4x slower than normal MSRs, so we use a per-cpu cache to
+ * track the last known value of the MSRs to avoid unnecessarily
+ * writing the MSRs with the current value.  Because most Linux
+ * kernels will use an LE that is signed with a non-Intel key,
+ * i.e. the first EINIT will need to write the MSRs regardless
+ * of the cache, the cache is intentionally left uninitialized
+ * during boot as initializing the cache would be pure overhead
+ * for the majority of systems.  Furthermore, the MSRs are per-cpu
+ * and the boot-time values aren't guaranteed to be identical
+ * across cpus, so we'd have to run code all all cpus to properly
+ * init the cache.  All in all, the complexity and overhead of
+ * initializing the cache is not justified.
+ */
+static DEFINE_PER_CPU(u64 [4], sgx_le_pubkey_hash_cache);
+
+static void sgx_swap_cluster(void)
+{
+	struct sgx_epc_page *cluster[SGX_NR_TO_SCAN + 1];
+	struct sgx_epc_page *epc_page;
+	int i;
+	int j;
+
+	memset(cluster, 0, sizeof(cluster));
+
+	for (i = 0, j = 0; i < SGX_NR_TO_SCAN; i++) {
+		spin_lock(&sgx_active_page_list_lock);
+		if (list_empty(&sgx_active_page_list)) {
+			spin_unlock(&sgx_active_page_list_lock);
+			break;
+		}
+		epc_page = list_first_entry(&sgx_active_page_list,
+					    struct sgx_epc_page, list);
+		if (!epc_page->impl->ops->get(epc_page)) {
+			list_move_tail(&epc_page->list, &sgx_active_page_list);
+			spin_unlock(&sgx_active_page_list_lock);
+			continue;
+		}
+		list_del(&epc_page->list);
+		spin_unlock(&sgx_active_page_list_lock);
 
-static __init bool sgx_is_enabled(void)
+		if (epc_page->impl->ops->reclaim(epc_page)) {
+			cluster[j++] = epc_page;
+		} else {
+			spin_lock(&sgx_active_page_list_lock);
+			list_add_tail(&epc_page->list, &sgx_active_page_list);
+			spin_unlock(&sgx_active_page_list_lock);
+			epc_page->impl->ops->put(epc_page);
+		}
+	}
+
+	for (i = 0; cluster[i]; i++) {
+		epc_page = cluster[i];
+		epc_page->impl->ops->block(epc_page);
+	}
+
+	for (i = 0; cluster[i]; i++) {
+		epc_page = cluster[i];
+		epc_page->impl->ops->write(epc_page);
+		epc_page->impl->ops->put(epc_page);
+		sgx_free_page(epc_page);
+	}
+}
+
+static int ksgxswapd(void *p)
+{
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		if (try_to_freeze())
+			continue;
+
+		wait_event_freezable(ksgxswapd_waitq, kthread_should_stop() ||
+				     atomic_read(&sgx_nr_free_pages) <
+				     SGX_NR_HIGH_PAGES);
+
+		if (atomic_read(&sgx_nr_free_pages) < SGX_NR_HIGH_PAGES)
+			sgx_swap_cluster();
+	}
+
+	pr_info("%s: done\n", __func__);
+	return 0;
+}
+
+/**
+ * sgx_try_alloc_page - try to allocate an EPC page
+ * @impl:	implementation for the struct sgx_epc_page
+ *
+ * Try to grab a page from the free EPC page list. If there is a free page
+ * available, it is returned to the caller.
+ *
+ * Return:
+ *   a &struct sgx_epc_page instace,
+ *   NULL otherwise
+ */
+struct sgx_epc_page *sgx_try_alloc_page(struct sgx_epc_page_impl *impl)
+{
+	struct sgx_epc_bank *bank;
+	struct sgx_epc_page *page = NULL;
+	int i;
+
+	for (i = 0; i < sgx_nr_epc_banks; i++) {
+		bank = &sgx_epc_banks[i];
+
+		down_write(&bank->lock);
+
+		if (atomic_read(&bank->free_cnt))
+			page = bank->pages[atomic_dec_return(&bank->free_cnt)];
+
+		up_write(&bank->lock);
+
+		if (page)
+			break;
+	}
+
+	if (page) {
+		atomic_dec(&sgx_nr_free_pages);
+		page->impl = impl;
+	}
+
+	return page;
+}
+EXPORT_SYMBOL(sgx_try_alloc_page);
+
+/**
+ * sgx_alloc_page - allocate an EPC page
+ * @flags:	allocation flags
+ * @impl:	implementation for the struct sgx_epc_page
+ *
+ * Try to grab a page from the free EPC page list. If there is a free page
+ * available, it is returned to the caller. If called with SGX_ALLOC_ATOMIC,
+ * the function will return immediately if the list is empty. Otherwise, it
+ * will swap pages up until there is a free page available. Upon returning the
+ * low watermark is checked and ksgxswapd is waken up if we are below it.
+ *
+ * Return:
+ *   a &struct sgx_epc_page instace,
+ *   -ENOMEM if all pages are unreclaimable,
+ *   -EBUSY when called with SGX_ALLOC_ATOMIC and out of free pages
+ */
+struct sgx_epc_page *sgx_alloc_page(struct sgx_epc_page_impl *impl,
+				    unsigned int flags)
+{
+	struct sgx_epc_page *entry;
+
+	for ( ; ; ) {
+		entry = sgx_try_alloc_page(impl);
+		if (entry)
+			break;
+
+		if (list_empty(&sgx_active_page_list))
+			return ERR_PTR(-ENOMEM);
+
+		if (flags & SGX_ALLOC_ATOMIC) {
+			entry = ERR_PTR(-EBUSY);
+			break;
+		}
+
+		if (signal_pending(current)) {
+			entry = ERR_PTR(-ERESTARTSYS);
+			break;
+		}
+
+		sgx_swap_cluster();
+		schedule();
+	}
+
+	if (atomic_read(&sgx_nr_free_pages) < SGX_NR_LOW_PAGES)
+		wake_up(&ksgxswapd_waitq);
+
+	return entry;
+}
+EXPORT_SYMBOL(sgx_alloc_page);
+
+/**
+ * sgx_free_page - free an EPC page
+ *
+ * @page:	any EPC page
+ *
+ * Remove an EPC page and insert it back to the list of free pages.
+ *
+ * Return: SGX error code
+ */
+int sgx_free_page(struct sgx_epc_page *page)
+{
+	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
+	int ret;
+
+	ret = sgx_eremove(page);
+	if (ret) {
+		pr_debug("EREMOVE returned %d\n", ret);
+		return ret;
+	}
+
+	down_read(&bank->lock);
+	bank->pages[atomic_inc_return(&bank->free_cnt) - 1] = page;
+	atomic_inc(&sgx_nr_free_pages);
+	up_read(&bank->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(sgx_free_page);
+
+/**
+ * sgx_get_page - pin an EPC page
+ * @page:	an EPC page
+ *
+ * Return: a pointer to the pinned EPC page
+ */
+void *sgx_get_page(struct sgx_epc_page *page)
+{
+	struct sgx_epc_bank *bank = SGX_EPC_BANK(page);
+
+	if (IS_ENABLED(CONFIG_X86_64))
+		return (void *)(bank->va + SGX_EPC_ADDR(page) - bank->pa);
+
+	return kmap_atomic_pfn(SGX_EPC_PFN(page));
+}
+EXPORT_SYMBOL(sgx_get_page);
+
+/**
+ * sgx_put_page - unpin an EPC page
+ * @ptr:	a pointer to the pinned EPC page
+ */
+void sgx_put_page(void *ptr)
+{
+	if (IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	kunmap_atomic(ptr);
+}
+EXPORT_SYMBOL(sgx_put_page);
+
+struct page *sgx_get_backing(struct file *file, pgoff_t index)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	gfp_t gfpmask = mapping_gfp_mask(mapping);
+
+	return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
+}
+EXPORT_SYMBOL(sgx_get_backing);
+
+void sgx_put_backing(struct page *backing_page, bool write)
+{
+	if (write)
+		set_page_dirty(backing_page);
+
+	put_page(backing_page);
+}
+EXPORT_SYMBOL(sgx_put_backing);
+
+/**
+ * sgx_einit - EINIT an enclave with the appropriate LE pubkey hash
+ * @sigstruct:		a pointer to the enclave's sigstruct
+ * @token:		a pointer to the enclave's EINIT token
+ * @secs_page:		a pointer to the enclave's SECS EPC page
+ * @le_pubkey_hash:	the desired LE pubkey hash for EINIT
+ */
+int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
+	      struct sgx_epc_page *secs_page, u64 le_pubkey_hash[4])
+{
+	u64 __percpu *cache;
+	void *secs;
+	int i, ret;
+
+	secs = sgx_get_page(secs_page);
+
+	if (!sgx_lc_enabled) {
+		ret = __einit(sigstruct, token, secs);
+		goto out;
+	}
+
+	cache = per_cpu(sgx_le_pubkey_hash_cache, smp_processor_id());
+
+	preempt_disable();
+	for (i = 0; i < 4; i++) {
+		if (le_pubkey_hash[i] == cache[i])
+			continue;
+
+		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, le_pubkey_hash[i]);
+		cache[i] = le_pubkey_hash[i];
+	}
+	ret = __einit(sigstruct, token, secs);
+	preempt_enable();
+
+out:
+	sgx_put_page(secs);
+	return ret;
+}
+EXPORT_SYMBOL(sgx_einit);
+
+static __init int sgx_init_epc_bank(unsigned long addr, unsigned long size,
+				    unsigned long index,
+				    struct sgx_epc_bank *bank)
+{
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	unsigned long i;
+	void *va;
+
+	if (IS_ENABLED(CONFIG_X86_64)) {
+		va = ioremap_cache(addr, size);
+		if (!va)
+			return -ENOMEM;
+	}
+
+	bank->pages_data = kzalloc(nr_pages * sizeof(struct sgx_epc_page),
+				   GFP_KERNEL);
+	if (!bank->pages_data) {
+		if (IS_ENABLED(CONFIG_X86_64))
+			iounmap(va);
+
+		return -ENOMEM;
+	}
+
+	bank->pages = kzalloc(nr_pages * sizeof(struct sgx_epc_page *),
+			      GFP_KERNEL);
+	if (!bank->pages) {
+		if (IS_ENABLED(CONFIG_X86_64))
+			iounmap(va);
+		kfree(bank->pages_data);
+		bank->pages_data = NULL;
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_pages; i++) {
+		bank->pages[i] = &bank->pages_data[i];
+		bank->pages[i]->desc = (addr + (i << PAGE_SHIFT)) | index;
+	}
+
+	bank->pa = addr;
+	bank->size = size;
+	if (IS_ENABLED(CONFIG_X86_64))
+		bank->va = (unsigned long)va;
+
+	atomic_set(&bank->free_cnt, nr_pages);
+	init_rwsem(&bank->lock);
+	atomic_add(nr_pages, &sgx_nr_free_pages);
+	return 0;
+}
+
+static __init void sgx_page_cache_teardown(void)
+{
+	struct sgx_epc_bank *bank;
+	int i;
+
+	for (i = 0; i < sgx_nr_epc_banks; i++) {
+		bank = &sgx_epc_banks[i];
+
+		if (IS_ENABLED(CONFIG_X86_64))
+			iounmap((void *)bank->va);
+
+		kfree(bank->pages);
+		kfree(bank->pages_data);
+	}
+
+	if (ksgxswapd_tsk) {
+		kthread_stop(ksgxswapd_tsk);
+		ksgxswapd_tsk = NULL;
+	}
+}
+
+static __init int sgx_page_cache_init(void)
+{
+	struct task_struct *tsk;
+	unsigned long size;
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+	unsigned long pa;
+	int i;
+	int ret;
+
+	for (i = 0; i < SGX_MAX_EPC_BANKS; i++) {
+		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC_BANKS, &eax, &ebx,
+			    &ecx, &edx);
+		if (!(eax & 0xf))
+			break;
+
+		pa = ((u64)(ebx & 0xfffff) << 32) + (u64)(eax & 0xfffff000);
+		size = ((u64)(edx & 0xfffff) << 32) + (u64)(ecx & 0xfffff000);
+
+		pr_info("EPC bank 0x%lx-0x%lx\n", pa, pa + size);
+
+		ret = sgx_init_epc_bank(pa, size, i, &sgx_epc_banks[i]);
+		if (ret) {
+			sgx_page_cache_teardown();
+			return ret;
+		}
+
+		sgx_nr_epc_banks++;
+	}
+
+	tsk = kthread_run(ksgxswapd, NULL, "ksgxswapd");
+	if (IS_ERR(tsk)) {
+		sgx_page_cache_teardown();
+		return PTR_ERR(tsk);
+	}
+	ksgxswapd_tsk = tsk;
+	return 0;
+}
+
+static __init bool sgx_is_enabled(bool *lc_enabled)
 {
 	unsigned long fc;
 
@@ -41,12 +466,26 @@  static __init bool sgx_is_enabled(void)
 	if (!(fc & FEATURE_CONTROL_SGX_ENABLE))
 		return false;
 
+	*lc_enabled = !!(fc & FEATURE_CONTROL_SGX_LE_WR);
+
 	return true;
 }
 
 static __init int sgx_init(void)
 {
-	sgx_enabled = sgx_is_enabled();
+	bool lc_enabled;
+	int ret;
+
+	if (!sgx_is_enabled(&lc_enabled))
+		return 0;
+
+	ret = sgx_page_cache_init();
+	if (ret)
+		return ret;
+
+	sgx_enabled = true;
+	sgx_lc_enabled = lc_enabled;
+
 	return 0;
 }