diff mbox series

[RFC,03/23] x86/sgx: Introduce virtual EPC for use by KVM guests

Message ID ace9d4cb10318370f6145aaced0cfa73dda36477.1609890536.git.kai.huang@intel.com (mailing list archive)
State New, archived
Headers show
Series KVM SGX virtualization support | expand

Commit Message

Huang, Kai Jan. 6, 2021, 1:55 a.m. UTC
From: Sean Christopherson <sean.j.christopherson@intel.com>

Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
EPC without an associated enclave.  The intended and only known use case
for raw EPC allocation is to expose EPC to a KVM guest, hence the
virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.

Modify sgx_init() to always try to initialize virtual EPC driver, even
when SGX driver is disabled due to SGX Launch Control is in locked mode,
or not present at all, since SGX virtualization allows to expose SGX to
guests that support non-LC configurations.

Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
/dev/sgx_virt_epc rather than in KVM. Doing so has two major advantages:

  - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
    just another memory backend for guests.

  - EPC management is wholly contained in the SGX subsystem, e.g. SGX
    does not have to export any symbols, changes to reclaim flows don't
    need to be routed through KVM, SGX's dirty laundry doesn't have to
    get aired out for the world to see, and so on and so forth.

The virtual EPC allocated to guests is currently not reclaimable, due to
oversubscription of EPC for KVM guests is not currently supported. Due
to the complications of handling reclaim conflicts between guest and
host, KVM EPC oversubscription is significantly more complex than basic
support for SGX virtualization.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
---
 arch/x86/Kconfig                 |  12 ++
 arch/x86/kernel/cpu/sgx/Makefile |   1 +
 arch/x86/kernel/cpu/sgx/main.c   |   5 +-
 arch/x86/kernel/cpu/sgx/virt.c   | 263 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/virt.h   |  14 ++
 5 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.c
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.h

Comments

Dave Hansen Jan. 6, 2021, 7:35 p.m. UTC | #1
On 1/5/21 5:55 PM, Kai Huang wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
> EPC without an associated enclave.  The intended and only known use case
> for raw EPC allocation is to expose EPC to a KVM guest, hence the
> virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.
> 
> Modify sgx_init() to always try to initialize virtual EPC driver, even
> when SGX driver is disabled due to SGX Launch Control is in locked mode,
> or not present at all, since SGX virtualization allows to expose SGX to
> guests that support non-LC configurations.

The grammar here is a bit off.  Here's a rewrite:

Modify sgx_init() to always try to initialize the virtual EPC driver,
even if the bare-metal SGX driver is disabled.  The bare-metal driver
might be disabled if SGX Launch Control is in locked mode, or not
supported in the hardware at all.  This allows (non-Linux) guests that
support non-LC configurations to use SGX.


> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> index 91d3dc784a29..7a25bf63adfb 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -3,3 +3,4 @@ obj-y += \
>  	encl.o \
>  	ioctl.o \
>  	main.o
> +obj-$(CONFIG_X86_SGX_VIRTUALIZATION)	+= virt.o
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 95aad183bb65..02993a327a1f 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -9,9 +9,11 @@
>  #include <linux/sched/mm.h>
>  #include <linux/sched/signal.h>
>  #include <linux/slab.h>
> +#include "arch.h"
>  #include "driver.h"
>  #include "encl.h"
>  #include "encls.h"
> +#include "virt.h"
>  
>  struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
>  static int sgx_nr_epc_sections;
> @@ -726,7 +728,8 @@ static void __init sgx_init(void)
>  	if (!sgx_page_reclaimer_init())
>  		goto err_page_cache;
>  
> -	ret = sgx_drv_init();
> +	/* Success if the native *or* virtual EPC driver initialized cleanly. */
> +	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
>  	if (ret)
>  		goto err_kthread;

FWIW, I hate that conditional.  But, I tried to write to to be something
more sane and failed.

> diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> new file mode 100644
> index 000000000000..d625551ccf25
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/virt.c
> @@ -0,0 +1,263 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*  Copyright(c) 2016-20 Intel Corporation. */
> +
> +#include <linux/miscdevice.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/sched/mm.h>
> +#include <linux/sched/signal.h>
> +#include <linux/slab.h>
> +#include <linux/xarray.h>
> +#include <asm/sgx.h>
> +#include <uapi/asm/sgx.h>
> +
> +#include "encls.h"
> +#include "sgx.h"
> +#include "virt.h"
> +
> +struct sgx_virt_epc {
> +	struct xarray page_array;
> +	struct mutex lock;
> +	struct mm_struct *mm;
> +};
> +
> +static struct mutex virt_epc_lock;
> +static struct list_head virt_epc_zombie_pages;

What does the lock protect?

What are zombie pages?

BTW, if zombies are SECS-only, shouldn't that be in the name rather than
"epc"?

> +static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
> +				struct vm_area_struct *vma, unsigned long addr)
> +{
> +	struct sgx_epc_page *epc_page;
> +	unsigned long index, pfn;
> +	int ret;
> +
> +	/* epc->lock must already have been hold */

	/* epc->lock must already be held */

Wouldn't this be better as:

WARN_ON(!mutex_is_locked(&epc->lock));

?


> +	/* Calculate index of EPC page in virtual EPC's page_array */
> +	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
> +
> +	epc_page = xa_load(&epc->page_array, index);
> +	if (epc_page)
> +		return 0;
> +
> +	epc_page = sgx_alloc_epc_page(epc, false);
> +	if (IS_ERR(epc_page))
> +		return PTR_ERR(epc_page);
> +
> +	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
> +	if (ret)
> +		goto err_free;
> +
> +	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
> +
> +	ret = vmf_insert_pfn(vma, addr, pfn);
> +	if (ret != VM_FAULT_NOPAGE) {
> +		ret = -EFAULT;
> +		goto err_delete;
> +	}
> +
> +	return 0;
> +
> +err_delete:
> +	xa_erase(&epc->page_array, index);
> +err_free:
> +	sgx_free_epc_page(epc_page);
> +	return ret;
> +}
> +
> +static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct sgx_virt_epc *epc = vma->vm_private_data;
> +	int ret;
> +
> +	mutex_lock(&epc->lock);
> +	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
> +	mutex_unlock(&epc->lock);
> +
> +	if (!ret)
> +		return VM_FAULT_NOPAGE;
> +
> +	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
> +		mmap_read_unlock(vma->vm_mm);
> +		return VM_FAULT_RETRY;
> +	}
> +
> +	return VM_FAULT_SIGBUS;
> +}
> +
> +const struct vm_operations_struct sgx_virt_epc_vm_ops = {
> +	.fault = sgx_virt_epc_fault,
> +};
> +
> +static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct sgx_virt_epc *epc = file->private_data;
> +
> +	if (!(vma->vm_flags & VM_SHARED))
> +		return -EINVAL;
> +
> +	/*
> +	 * Don't allow mmap() from child after fork(), since child and parent
> +	 * cannot map to the same EPC.
> +	 */
> +	if (vma->vm_mm != epc->mm)
> +		return -EINVAL;

I mentioned this below, but I'm not buying this logic.  I know it would
be *bad*, but I don't see why the kernel needs to keep it from happening.

> +	vma->vm_ops = &sgx_virt_epc_vm_ops;
> +	/* Don't copy VMA in fork() */
> +	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> +	vma->vm_private_data = file->private_data;
> +
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
> +{
> +	int ret;
> +
> +	if (!epc_page)
> +		return 0;

I always worry about these.  Why is passing NULL around OK?

> +	/*
> +	 * Explicitly EREMOVE virtual EPC page. Virtual EPC is only used by
> +	 * guest, and in normal condition guest should have done EREMOVE for
> +	 * all EPC pages before they are freed here. But it's possible guest
> +	 * is killed or crashed unnormally in which case EREMOVE has not been
	
				"abnormally"

I don't think "unnormally" is a word.  Also, this isn't just about
crashing or being killed.  The guest could simply have a bug.

> +	 * done. Do EREMOVE unconditionally here to cover both cases, because
> +	 * it's not possible to tell whether guest has done EREMOVE, since
> +	 * virtual EPC page status is not tracked. And it is fine to EREMOVE
> +	 * EPC page multiple times.
> +	 */

Surprise!  I dislike this comment.

	/*
	 * Take a previously guest-owned EPC page and return it to the
	 * general EPC page pool.
	 *
	 * Guests can not be trusted to have left this page in a good
	 * state, so run EREMOVE on the page unconditionally.  In the
	 * case that a guest properly EREMOVE'd this page, a
	 * superfluous EREMOVE is harmless.
	 */

> +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> +	if (ret) {
> +		/*
> +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> +		 * EREMOVE-ing an SECS still with child, in which case it can
> +		 * be handled by EREMOVE-ing the SECS again after all pages in
> +		 * virtual EPC have been EREMOVE-ed. See comments in below in
> +		 * sgx_virt_epc_release().
> +		 */
> +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> +		return ret;
> +	}

I find myself wondering what errors could cause the WARN_ON_ONCE() to be
hit.  The SDM indicates that it's only:

	SGX_ENCLAVE_ACT If there are still logical processors executing
			inside the enclave.

Should that be mentioned in the comment?

> +
> +	__sgx_free_epc_page(epc_page);
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_release(struct inode *inode, struct file *file)
> +{
> +	struct sgx_virt_epc *epc = file->private_data;

FWIW, I hate the "struct sgx_virt_epc *epc" name.  "epc" here is really
an instance

> +	struct sgx_epc_page *epc_page, *tmp, *entry;
> +	unsigned long index;
> +
> +	LIST_HEAD(secs_pages);
> +
> +	mmdrop(epc->mm);
> +
> +	xa_for_each(&epc->page_array, index, entry) {
> +		/*
> +		 * Virtual EPC pages are not tracked, so it's possible for
> +		 * EREMOVE to fail due to, e.g. a SECS page still has children
> +		 * if guest was shutdown unexpectedly. If it is the case, leave
> +		 * it in the xarray and retry EREMOVE below later.
> +		 */

I don't know what it is about the comments, but I cringe every time I
see an "i.e." or "e.g.".

I'd rewrite the comment as:

	/*
	 * Remove all normal, child pages.  sgx_virt_epc_free_page()
	 * will fail if EREMOVE fails, but this is OK and expected on
	 * SECS pages.  Those can only be EREMOVE'd *after* all their
	 * child pages. Retries below will clean them up.
 	 */

> +		if (sgx_virt_epc_free_page(entry))
> +			continue;
> +
> +		xa_erase(&epc->page_array, index);
> +	}
> +
> +	/*
> +	 * Retry all failed pages after iterating through the entire tree, at
> +	 * which point all children should be removed and the SECS pages can be
> +	 * nuked as well...unless userspace has exposed multiple instance of
> +	 * virtual EPC to a single VM.
> +	 */

I'm just a comment grouch today I guess.  That's a horrible run-on
sentence.  Let's just state the goal of the loop in the comment above it:

	Retry EREMOVE'ing pages.  This will clean up any SECS pages that
	only had children in this 'epc' area.

> +	xa_for_each(&epc->page_array, index, entry) {
> +		epc_page = entry;

Then, talk about the error condition here:

> +		/*
> +		 * Error here means that EREMOVE failed due to a SECS page
> +		 * still has child on *another* EPC instance.  Put it to a
> +		 * temporary SECS list which will be spliced to 'zombie page
> +		 * list' and will be EREMOVE-ed again when freeing another
> +		 * virtual EPC instance.
> +		 */

Surprise, I've got another rewrite:

		/*
		 * An EREMOVE failure here means that the SECS page
		 * still has children.  But, since all children in this
		 * 'sgx_virt_epc' have been removed, the SECS page must
		 * have a child on another instance.
		 */

> +		if (sgx_virt_epc_free_page(epc_page))
> +			list_add_tail(&epc_page->list, &secs_pages);

Why move these over to &secs_list here?  I think it's to avoid another
xa_for_each() below, but it's not clear.

> +		xa_erase(&epc->page_array, index);
> +	}
> +
> +	/*
> +	 * Third time's a charm.

This is confusing.  This section is *NOT* retrying a third time.  This
is a cute comment, but it's actually, logically different from the two
tries above.  I say remove it.  In fact, I'd even concentrate the
comment here to explain that this is a logically *TOALLY* disconnected
from what happened above.

>		  Try to EREMOVE zombie SECS pages from virtual
> +	 * EPC instances that were previously released, i.e. free SECS pages
> +	 * that were in limbo due to having children in *this* EPC instance.
> +	 */

This is as close as this code gets to telling me what a zombie page is.
 I don't think it gets close enough, or does it in the right spot.

I think it probably needs explicit discussion in the changelog.  I think
Sean explained this to me once, but I've forgotten by now.  The code
needs to be understandable without getting Sean on the phone anyway. :)

I'd probably just say:

	/*
	 * SECS pages are "pinned" by child pages, an unpinned once all
	 * children have been EREMOVE'd.  A child page in this instance
	 * may have pinned an SECS page encountered in an earlier
	 * release(), creating a zombie.  Since some children  were
	 * EREMOVE'd above, try to EREMOVE all zombies in the hopes that
	 * one was unpinned.
	 */

	
> +	mutex_lock(&virt_epc_lock);
> +	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
> +		/*
> +		 * Speculatively remove the page from the list of zombies, if
> +		 * the page is successfully EREMOVE it will be added to the
> +		 * list of free pages.  If EREMOVE fails, throw the page on the
> +		 * local list, which will be spliced on at the end.
> +		 */
> +		list_del(&epc_page->list);
> +
> +		if (sgx_virt_epc_free_page(epc_page))
> +			list_add_tail(&epc_page->list, &secs_pages);

I don't get this.  Couldn't you do without the unconditional list_del()
and instead just do:

		if (!sgx_virt_epc_free_page(epc_page))
			list_del(&epc_page->list);

Or does the free() code clobber the list_head?  If that's the case,
maybe you should say that explicitly.

> +	}
> +
> +	if (!list_empty(&secs_pages))
> +		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
> +	mutex_unlock(&virt_epc_lock);
> +
> +	kfree(epc);
> +
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_open(struct inode *inode, struct file *file)
> +{
> +	struct sgx_virt_epc *epc;
> +
> +	epc = kzalloc(sizeof(struct sgx_virt_epc), GFP_KERNEL);
> +	if (!epc)
> +		return -ENOMEM;
> +	/*
> +	 * Keep the current->mm to virtual EPC. It will be checked in
> +	 * sgx_virt_epc_mmap() to prevent, in case of fork, child being
> +	 * able to mmap() to the same virtual EPC pages.
> +	 */
> +	mmgrab(current->mm);
> +	epc->mm = current->mm;
> +	mutex_init(&epc->lock);
> +	xa_init(&epc->page_array);
> +
> +	file->private_data = epc;
> +
> +	return 0;
> +}

I understand why this made sense for regular enclaves, but I'm having a
harder time here.  If you mmap(fd, MAP_SHARED), fork(), and then pass
that mapping through to two different guests, you get to hold the
pieces, just like if you did the same with normal memory.

Why does the kernel need to enforce this policy?
Sean Christopherson Jan. 6, 2021, 8:35 p.m. UTC | #2
On Wed, Jan 06, 2021, Dave Hansen wrote:
> On 1/5/21 5:55 PM, Kai Huang wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> > index 95aad183bb65..02993a327a1f 100644
> > --- a/arch/x86/kernel/cpu/sgx/main.c
> > +++ b/arch/x86/kernel/cpu/sgx/main.c
> > @@ -9,9 +9,11 @@
> >  #include <linux/sched/mm.h>
> >  #include <linux/sched/signal.h>
> >  #include <linux/slab.h>
> > +#include "arch.h"
> >  #include "driver.h"
> >  #include "encl.h"
> >  #include "encls.h"
> > +#include "virt.h"
> >  
> >  struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> >  static int sgx_nr_epc_sections;
> > @@ -726,7 +728,8 @@ static void __init sgx_init(void)
> >  	if (!sgx_page_reclaimer_init())
> >  		goto err_page_cache;
> >  
> > -	ret = sgx_drv_init();
> > +	/* Success if the native *or* virtual EPC driver initialized cleanly. */
> > +	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
> >  	if (ret)
> >  		goto err_kthread;
> 
> FWIW, I hate that conditional.  But, I tried to write to to be something
> more sane and failed.

Heh, you're welcome :-D

> > diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> > new file mode 100644
> > index 000000000000..d625551ccf25
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/virt.c
> > @@ -0,0 +1,263 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*  Copyright(c) 2016-20 Intel Corporation. */
> > +
> > +#include <linux/miscdevice.h>
> > +#include <linux/mm.h>
> > +#include <linux/mman.h>
> > +#include <linux/sched/mm.h>
> > +#include <linux/sched/signal.h>
> > +#include <linux/slab.h>
> > +#include <linux/xarray.h>
> > +#include <asm/sgx.h>
> > +#include <uapi/asm/sgx.h>
> > +
> > +#include "encls.h"
> > +#include "sgx.h"
> > +#include "virt.h"
> > +
> > +struct sgx_virt_epc {
> > +	struct xarray page_array;
> > +	struct mutex lock;
> > +	struct mm_struct *mm;
> > +};
> > +
> > +static struct mutex virt_epc_lock;
> > +static struct list_head virt_epc_zombie_pages;
> 
> What does the lock protect?

Effectively, the list of zombie SECS pages.  Not sure why I used a generic name.

> What are zombie pages?

My own terminology for SECS pages whose virtual EPC has been destroyed but can't
be reclaimed due to them having child EPC pages in other virtual EPCs.

> BTW, if zombies are SECS-only, shouldn't that be in the name rather than
> "epc"?

I used the virt_epc prefix/namespace to tag it as a global list.  I've no
argument against something like zombie_secs_pages.

> > +static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
> > +				struct vm_area_struct *vma, unsigned long addr)
> > +{
> > +	struct sgx_epc_page *epc_page;
> > +	unsigned long index, pfn;
> > +	int ret;
> > +
> > +	/* epc->lock must already have been hold */
> 
> 	/* epc->lock must already be held */
> 
> Wouldn't this be better as:
> 
> WARN_ON(!mutex_is_locked(&epc->lock));
> 
> ?

Or just proper lockdep?
 
> > +	/* Calculate index of EPC page in virtual EPC's page_array */
> > +	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
> > +
> > +	epc_page = xa_load(&epc->page_array, index);
> > +	if (epc_page)
> > +		return 0;
> > +
> > +	epc_page = sgx_alloc_epc_page(epc, false);
> > +	if (IS_ERR(epc_page))
> > +		return PTR_ERR(epc_page);
> > +
> > +	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
> > +	if (ret)
> > +		goto err_free;
> > +
> > +	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
> > +
> > +	ret = vmf_insert_pfn(vma, addr, pfn);
> > +	if (ret != VM_FAULT_NOPAGE) {
> > +		ret = -EFAULT;
> > +		goto err_delete;
> > +	}
> > +
> > +	return 0;
> > +
> > +err_delete:
> > +	xa_erase(&epc->page_array, index);
> > +err_free:
> > +	sgx_free_epc_page(epc_page);
> > +	return ret;
> > +}
> > +
> > +static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	struct sgx_virt_epc *epc = vma->vm_private_data;
> > +	int ret;
> > +
> > +	mutex_lock(&epc->lock);
> > +	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
> > +	mutex_unlock(&epc->lock);
> > +
> > +	if (!ret)
> > +		return VM_FAULT_NOPAGE;
> > +
> > +	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
> > +		mmap_read_unlock(vma->vm_mm);
> > +		return VM_FAULT_RETRY;
> > +	}
> > +
> > +	return VM_FAULT_SIGBUS;
> > +}
> > +
> > +const struct vm_operations_struct sgx_virt_epc_vm_ops = {
> > +	.fault = sgx_virt_epc_fault,
> > +};
> > +
> > +static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	struct sgx_virt_epc *epc = file->private_data;
> > +
> > +	if (!(vma->vm_flags & VM_SHARED))
> > +		return -EINVAL;
> > +
> > +	/*
> > +	 * Don't allow mmap() from child after fork(), since child and parent
> > +	 * cannot map to the same EPC.
> > +	 */
> > +	if (vma->vm_mm != epc->mm)
> > +		return -EINVAL;
> 
> I mentioned this below, but I'm not buying this logic.  I know it would
> be *bad*, but I don't see why the kernel needs to keep it from happening.

There's no known use case (KVM doesn't support sharing a VM across multiple
mm structs), and supporting multiple mm structs is a nightmare; see the driver
for the amount of pain incurred.

And IIRC, supporting VMM (KVM) EPC oversubscription, which may or may not ever
happen, was borderline impossible if virtual EPC supports multiple mm structs as
the interaction between KVM and virtual EPC is a disaster in that case.

> > +	vma->vm_ops = &sgx_virt_epc_vm_ops;
> > +	/* Don't copy VMA in fork() */
> > +	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> > +	vma->vm_private_data = file->private_data;
> > +
> > +	return 0;
> > +}
> > +
> > +static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
> > +{
> > +	int ret;
> > +
> > +	if (!epc_page)
> > +		return 0;
> 
> I always worry about these.  Why is passing NULL around OK?

I suspect I did it to mimic kfree() behavior.  I don't _think_ the radix (now
xarray) usage will ever encounter a NULL entry.

> 
> > +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> > +	if (ret) {
> > +		/*
> > +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> > +		 * EREMOVE-ing an SECS still with child, in which case it can
> > +		 * be handled by EREMOVE-ing the SECS again after all pages in
> > +		 * virtual EPC have been EREMOVE-ed. See comments in below in
> > +		 * sgx_virt_epc_release().
> > +		 */
> > +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> > +		return ret;
> > +	}
> 
> I find myself wondering what errors could cause the WARN_ON_ONCE() to be
> hit.  The SDM indicates that it's only:
> 
> 	SGX_ENCLAVE_ACT If there are still logical processors executing
> 			inside the enclave.
> 
> Should that be mentioned in the comment?

And faults, which are also spliced into the return value by the ENCLS macros.
I do remember hitting this WARN when I broke things, though I can't remember
whether it was a fault or the SGX_ENCLAVE_ACT scenario.  Probably the latter?

> > +
> > +	__sgx_free_epc_page(epc_page);
> > +	return 0;
> > +}
> > +

...

> > +	xa_for_each(&epc->page_array, index, entry) {
> > +		epc_page = entry;
> 
> Then, talk about the error condition here:
> 
> > +		/*
> > +		 * Error here means that EREMOVE failed due to a SECS page
> > +		 * still has child on *another* EPC instance.  Put it to a
> > +		 * temporary SECS list which will be spliced to 'zombie page
> > +		 * list' and will be EREMOVE-ed again when freeing another
> > +		 * virtual EPC instance.
> > +		 */
> 
> Surprise, I've got another rewrite:
> 
> 		/*
> 		 * An EREMOVE failure here means that the SECS page
> 		 * still has children.  But, since all children in this
> 		 * 'sgx_virt_epc' have been removed, the SECS page must
> 		 * have a child on another instance.
> 		 */
> 
> > +		if (sgx_virt_epc_free_page(epc_page))
> > +			list_add_tail(&epc_page->list, &secs_pages);
> 
> Why move these over to &secs_list here?  I think it's to avoid another
> xa_for_each() below, but it's not clear.

Yes?  IIRC, the sole motivation is to make the list_split_tail() operation as
short as possible while holding the global virt_epc_lock.
 
> > +		xa_erase(&epc->page_array, index);
> > +	}
> > +

...
 	
> > +	mutex_lock(&virt_epc_lock);
> > +	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
> > +		/*
> > +		 * Speculatively remove the page from the list of zombies, if
> > +		 * the page is successfully EREMOVE it will be added to the
> > +		 * list of free pages.  If EREMOVE fails, throw the page on the
> > +		 * local list, which will be spliced on at the end.
> > +		 */
> > +		list_del(&epc_page->list);
> > +
> > +		if (sgx_virt_epc_free_page(epc_page))
> > +			list_add_tail(&epc_page->list, &secs_pages);
> 
> I don't get this.  Couldn't you do without the unconditional list_del()
> and instead just do:
> 
> 		if (!sgx_virt_epc_free_page(epc_page))
> 			list_del(&epc_page->list);
> 
> Or does the free() code clobber the list_head?  If that's the case,
> maybe you should say that explicitly.

More or less.  EPC pages need to be removed from their list before freeing, once
a page is freed it is owned by the allocator.  Deleting after freeing leads to
list corruption if a different thread allocates the page and adds it to a
different list.
 
> > +	}
> > +
> > +	if (!list_empty(&secs_pages))
> > +		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
> > +	mutex_unlock(&virt_epc_lock);
> > +
> > +	kfree(epc);
> > +
> > +	return 0;
> > +}
Huang, Kai Jan. 7, 2021, 12:47 a.m. UTC | #3
> > > +static struct mutex virt_epc_lock;
> > > +static struct list_head virt_epc_zombie_pages;
> > 
> > What does the lock protect?
> 
> Effectively, the list of zombie SECS pages.  Not sure why I used a generic name.
> 
> > What are zombie pages?
> 
> My own terminology for SECS pages whose virtual EPC has been destroyed but can't
> be reclaimed due to them having child EPC pages in other virtual EPCs.
> 
> > BTW, if zombies are SECS-only, shouldn't that be in the name rather than
> > "epc"?
> 
> I used the virt_epc prefix/namespace to tag it as a global list.  I've no
> argument against something like zombie_secs_pages.

I'll change to zombie_secs_pages, and lock name to zombie_secs_pages_lock,
respectively.


[...]

> > > +static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (!epc_page)
> > > +		return 0;
> > 
> > I always worry about these.  Why is passing NULL around OK?
> 
> I suspect I did it to mimic kfree() behavior.  I don't _think_ the radix (now
> xarray) usage will ever encounter a NULL entry.

I'll remove the NULL page check.

> 
> > 
> > > +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> > > +	if (ret) {
> > > +		/*
> > > +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> > > +		 * EREMOVE-ing an SECS still with child, in which case it can
> > > +		 * be handled by EREMOVE-ing the SECS again after all pages in
> > > +		 * virtual EPC have been EREMOVE-ed. See comments in below in
> > > +		 * sgx_virt_epc_release().
> > > +		 */
> > > +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> > > +		return ret;
> > > +	}
> > 
> > I find myself wondering what errors could cause the WARN_ON_ONCE() to be
> > hit.  The SDM indicates that it's only:
> > 
> > 	SGX_ENCLAVE_ACT If there are still logical processors executing
> > 			inside the enclave.
> > 
> > Should that be mentioned in the comment?
> 
> And faults, which are also spliced into the return value by the ENCLS macros.
> I do remember hitting this WARN when I broke things, though I can't remember
> whether it was a fault or the SGX_ENCLAVE_ACT scenario.  Probably the latter?

I'll add a comment saying that there should be no active logical processor
still running inside guest's enclave. We cannot handle SGX_ENCLAVE_ACT here
anyway.
Dave Hansen Jan. 7, 2021, 12:52 a.m. UTC | #4
On 1/6/21 4:47 PM, Kai Huang wrote:
>>>> +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
>>>> +	if (ret) {
>>>> +		/*
>>>> +		 * Only SGX_CHILD_PRESENT is expected, which is because of
>>>> +		 * EREMOVE-ing an SECS still with child, in which case it can
>>>> +		 * be handled by EREMOVE-ing the SECS again after all pages in
>>>> +		 * virtual EPC have been EREMOVE-ed. See comments in below in
>>>> +		 * sgx_virt_epc_release().
>>>> +		 */
>>>> +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
>>>> +		return ret;
>>>> +	}
>>> I find myself wondering what errors could cause the WARN_ON_ONCE() to be
>>> hit.  The SDM indicates that it's only:
>>>
>>> 	SGX_ENCLAVE_ACT If there are still logical processors executing
>>> 			inside the enclave.
>>>
>>> Should that be mentioned in the comment?
>> And faults, which are also spliced into the return value by the ENCLS macros.
>> I do remember hitting this WARN when I broke things, though I can't remember
>> whether it was a fault or the SGX_ENCLAVE_ACT scenario.  Probably the latter?
> I'll add a comment saying that there should be no active logical processor
> still running inside guest's enclave. We cannot handle SGX_ENCLAVE_ACT here
> anyway.

One more thing...

Could we dump out the *actual* error code with a WARN(), please?  If we
see a warning, I'd rather not have to disassemble the instructions and
check against register values to see whether the error code was sane.
Huang, Kai Jan. 7, 2021, 1:38 a.m. UTC | #5
On Wed, 6 Jan 2021 16:52:49 -0800 Dave Hansen wrote:
> On 1/6/21 4:47 PM, Kai Huang wrote:
> >>>> +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> >>>> +	if (ret) {
> >>>> +		/*
> >>>> +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> >>>> +		 * EREMOVE-ing an SECS still with child, in which case it can
> >>>> +		 * be handled by EREMOVE-ing the SECS again after all pages in
> >>>> +		 * virtual EPC have been EREMOVE-ed. See comments in below in
> >>>> +		 * sgx_virt_epc_release().
> >>>> +		 */
> >>>> +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> >>>> +		return ret;
> >>>> +	}
> >>> I find myself wondering what errors could cause the WARN_ON_ONCE() to be
> >>> hit.  The SDM indicates that it's only:
> >>>
> >>> 	SGX_ENCLAVE_ACT If there are still logical processors executing
> >>> 			inside the enclave.
> >>>
> >>> Should that be mentioned in the comment?
> >> And faults, which are also spliced into the return value by the ENCLS macros.
> >> I do remember hitting this WARN when I broke things, though I can't remember
> >> whether it was a fault or the SGX_ENCLAVE_ACT scenario.  Probably the latter?
> > I'll add a comment saying that there should be no active logical processor
> > still running inside guest's enclave. We cannot handle SGX_ENCLAVE_ACT here
> > anyway.
> 
> One more thing...
> 
> Could we dump out the *actual* error code with a WARN(), please?  If we
> see a warning, I'd rather not have to disassemble the instructions and
> check against register values to see whether the error code was sane.

Sure. But WARN_ONCE() should be used, right, instead of WARN()?
Huang, Kai Jan. 7, 2021, 1:42 a.m. UTC | #6
On Wed, 6 Jan 2021 11:35:41 -0800 Dave Hansen wrote:
> On 1/5/21 5:55 PM, Kai Huang wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > 
> > Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
> > EPC without an associated enclave.  The intended and only known use case
> > for raw EPC allocation is to expose EPC to a KVM guest, hence the
> > virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.
> > 
> > Modify sgx_init() to always try to initialize virtual EPC driver, even
> > when SGX driver is disabled due to SGX Launch Control is in locked mode,
> > or not present at all, since SGX virtualization allows to expose SGX to
> > guests that support non-LC configurations.
> 
> The grammar here is a bit off.  Here's a rewrite:
> 
> Modify sgx_init() to always try to initialize the virtual EPC driver,
> even if the bare-metal SGX driver is disabled.  The bare-metal driver
> might be disabled if SGX Launch Control is in locked mode, or not
> supported in the hardware at all.  This allows (non-Linux) guests that
> support non-LC configurations to use SGX.

Thanks. I'll use yours, except I want to change "bare-metal driver might be
disabled.." to "bare-metal driver will be disabled..".

I'll also use all your comments mentioned in your reply to this patch.

[...]

> > +
> > +static int sgx_virt_epc_release(struct inode *inode, struct file *file)
> > +{
> > +	struct sgx_virt_epc *epc = file->private_data;
> 
> FWIW, I hate the "struct sgx_virt_epc *epc" name.  "epc" here is really
> an instance
> 

How about "struct sgx_virt_epc *vepc" ?

[...]

> > +static int sgx_virt_epc_open(struct inode *inode, struct file *file)
> > +{
> > +	struct sgx_virt_epc *epc;
> > +
> > +	epc = kzalloc(sizeof(struct sgx_virt_epc), GFP_KERNEL);
> > +	if (!epc)
> > +		return -ENOMEM;
> > +	/*
> > +	 * Keep the current->mm to virtual EPC. It will be checked in
> > +	 * sgx_virt_epc_mmap() to prevent, in case of fork, child being
> > +	 * able to mmap() to the same virtual EPC pages.
> > +	 */
> > +	mmgrab(current->mm);
> > +	epc->mm = current->mm;
> > +	mutex_init(&epc->lock);
> > +	xa_init(&epc->page_array);
> > +
> > +	file->private_data = epc;
> > +
> > +	return 0;
> > +}
> 
> I understand why this made sense for regular enclaves, but I'm having a
> harder time here.  If you mmap(fd, MAP_SHARED), fork(), and then pass
> that mapping through to two different guests, you get to hold the
> pieces, just like if you did the same with normal memory.
> 
> Why does the kernel need to enforce this policy?

Does Sean's reply in another email satisfy you?
Dave Hansen Jan. 7, 2021, 5 a.m. UTC | #7
On 1/6/21 5:38 PM, Kai Huang wrote:
>> Could we dump out the *actual* error code with a WARN(), please?  If we
>> see a warning, I'd rather not have to disassemble the instructions and
>> check against register values to see whether the error code was sane.
> Sure. But WARN_ONCE() should be used, right, instead of WARN()?

Whatever will let you get a printf-format string out and only happens once.
Dave Hansen Jan. 7, 2021, 5:02 a.m. UTC | #8
On 1/6/21 5:42 PM, Kai Huang wrote:
>> I understand why this made sense for regular enclaves, but I'm having a
>> harder time here.  If you mmap(fd, MAP_SHARED), fork(), and then pass
>> that mapping through to two different guests, you get to hold the
>> pieces, just like if you did the same with normal memory.
>>
>> Why does the kernel need to enforce this policy?
> Does Sean's reply in another email satisfy you?

I'm not totally convinced.

Please give it a go in the changelog for the next one and try to
convince me that this is a good idea.  Focus on what the downsides will
be if the kernel does not enforce this policy.  What will break, and why
will it be bad?  Why is the kernel in the best position to thwart the
badness?
Jarkko Sakkinen Jan. 11, 2021, 11:38 p.m. UTC | #9
On Wed, Jan 06, 2021 at 02:55:20PM +1300, Kai Huang wrote:
> From: Sean Christopherson <sean.j.christopherson@intel.com>
> 
> Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
> EPC without an associated enclave.  The intended and only known use case
> for raw EPC allocation is to expose EPC to a KVM guest, hence the
> virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.
> 
> Modify sgx_init() to always try to initialize virtual EPC driver, even
> when SGX driver is disabled due to SGX Launch Control is in locked mode,
> or not present at all, since SGX virtualization allows to expose SGX to
> guests that support non-LC configurations.
> 
> Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
> /dev/sgx_virt_epc rather than in KVM. Doing so has two major advantages:
> 
>   - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
>     just another memory backend for guests.
> 
>   - EPC management is wholly contained in the SGX subsystem, e.g. SGX
>     does not have to export any symbols, changes to reclaim flows don't
>     need to be routed through KVM, SGX's dirty laundry doesn't have to
>     get aired out for the world to see, and so on and so forth.
> 
> The virtual EPC allocated to guests is currently not reclaimable, due to
> oversubscription of EPC for KVM guests is not currently supported. Due
> to the complications of handling reclaim conflicts between guest and
> host, KVM EPC oversubscription is significantly more complex than basic
> support for SGX virtualization.
> 
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Kai Huang <kai.huang@intel.com>
> Signed-off-by: Kai Huang <kai.huang@intel.com>

The commit message does not describe the code changes. It should
have an understandable explanation of fops. There is nothing about
the implementation right now.

/Jarkko

> ---
>  arch/x86/Kconfig                 |  12 ++
>  arch/x86/kernel/cpu/sgx/Makefile |   1 +
>  arch/x86/kernel/cpu/sgx/main.c   |   5 +-
>  arch/x86/kernel/cpu/sgx/virt.c   | 263 +++++++++++++++++++++++++++++++
>  arch/x86/kernel/cpu/sgx/virt.h   |  14 ++
>  5 files changed, 294 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/kernel/cpu/sgx/virt.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/virt.h
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 618d1aabccb8..a7318175509b 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1947,6 +1947,18 @@ config X86_SGX
>  
>  	  If unsure, say N.
>  
> +config X86_SGX_VIRTUALIZATION
> +	bool "Software Guard eXtensions (SGX) Virtualization"
> +	depends on X86_SGX && KVM_INTEL
> +	help
> +
> +	  Enables KVM guests to create SGX enclaves.
> +
> +	  This includes support to expose "raw" unreclaimable enclave memory to
> +	  guests via a device node, e.g. /dev/sgx_virt_epc.
> +
> +	  If unsure, say N.
> +
>  config EFI
>  	bool "EFI runtime service support"
>  	depends on ACPI
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> index 91d3dc784a29..7a25bf63adfb 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -3,3 +3,4 @@ obj-y += \
>  	encl.o \
>  	ioctl.o \
>  	main.o
> +obj-$(CONFIG_X86_SGX_VIRTUALIZATION)	+= virt.o
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 95aad183bb65..02993a327a1f 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -9,9 +9,11 @@
>  #include <linux/sched/mm.h>
>  #include <linux/sched/signal.h>
>  #include <linux/slab.h>
> +#include "arch.h"
>  #include "driver.h"
>  #include "encl.h"
>  #include "encls.h"
> +#include "virt.h"
>  
>  struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
>  static int sgx_nr_epc_sections;
> @@ -726,7 +728,8 @@ static void __init sgx_init(void)
>  	if (!sgx_page_reclaimer_init())
>  		goto err_page_cache;
>  
> -	ret = sgx_drv_init();
> +	/* Success if the native *or* virtual EPC driver initialized cleanly. */
> +	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
>  	if (ret)
>  		goto err_kthread;
>  
> diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> new file mode 100644
> index 000000000000..d625551ccf25
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/virt.c
> @@ -0,0 +1,263 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*  Copyright(c) 2016-20 Intel Corporation. */
> +
> +#include <linux/miscdevice.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/sched/mm.h>
> +#include <linux/sched/signal.h>
> +#include <linux/slab.h>
> +#include <linux/xarray.h>
> +#include <asm/sgx.h>
> +#include <uapi/asm/sgx.h>
> +
> +#include "encls.h"
> +#include "sgx.h"
> +#include "virt.h"
> +
> +struct sgx_virt_epc {
> +	struct xarray page_array;
> +	struct mutex lock;
> +	struct mm_struct *mm;
> +};
> +
> +static struct mutex virt_epc_lock;
> +static struct list_head virt_epc_zombie_pages;
> +
> +static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
> +				struct vm_area_struct *vma, unsigned long addr)
> +{
> +	struct sgx_epc_page *epc_page;
> +	unsigned long index, pfn;
> +	int ret;
> +
> +	/* epc->lock must already have been hold */
> +
> +	/* Calculate index of EPC page in virtual EPC's page_array */
> +	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
> +
> +	epc_page = xa_load(&epc->page_array, index);
> +	if (epc_page)
> +		return 0;
> +
> +	epc_page = sgx_alloc_epc_page(epc, false);
> +	if (IS_ERR(epc_page))
> +		return PTR_ERR(epc_page);
> +
> +	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
> +	if (ret)
> +		goto err_free;
> +
> +	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
> +
> +	ret = vmf_insert_pfn(vma, addr, pfn);
> +	if (ret != VM_FAULT_NOPAGE) {
> +		ret = -EFAULT;
> +		goto err_delete;
> +	}
> +
> +	return 0;
> +
> +err_delete:
> +	xa_erase(&epc->page_array, index);
> +err_free:
> +	sgx_free_epc_page(epc_page);
> +	return ret;
> +}
> +
> +static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct sgx_virt_epc *epc = vma->vm_private_data;
> +	int ret;
> +
> +	mutex_lock(&epc->lock);
> +	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
> +	mutex_unlock(&epc->lock);
> +
> +	if (!ret)
> +		return VM_FAULT_NOPAGE;
> +
> +	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
> +		mmap_read_unlock(vma->vm_mm);
> +		return VM_FAULT_RETRY;
> +	}
> +
> +	return VM_FAULT_SIGBUS;
> +}
> +
> +const struct vm_operations_struct sgx_virt_epc_vm_ops = {
> +	.fault = sgx_virt_epc_fault,
> +};
> +
> +static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct sgx_virt_epc *epc = file->private_data;
> +
> +	if (!(vma->vm_flags & VM_SHARED))
> +		return -EINVAL;
> +
> +	/*
> +	 * Don't allow mmap() from child after fork(), since child and parent
> +	 * cannot map to the same EPC.
> +	 */
> +	if (vma->vm_mm != epc->mm)
> +		return -EINVAL;
> +
> +	vma->vm_ops = &sgx_virt_epc_vm_ops;
> +	/* Don't copy VMA in fork() */
> +	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> +	vma->vm_private_data = file->private_data;
> +
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
> +{
> +	int ret;
> +
> +	if (!epc_page)
> +		return 0;
> +
> +	/*
> +	 * Explicitly EREMOVE virtual EPC page. Virtual EPC is only used by
> +	 * guest, and in normal condition guest should have done EREMOVE for
> +	 * all EPC pages before they are freed here. But it's possible guest
> +	 * is killed or crashed unnormally in which case EREMOVE has not been
> +	 * done. Do EREMOVE unconditionally here to cover both cases, because
> +	 * it's not possible to tell whether guest has done EREMOVE, since
> +	 * virtual EPC page status is not tracked. And it is fine to EREMOVE
> +	 * EPC page multiple times.
> +	 */
> +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> +	if (ret) {
> +		/*
> +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> +		 * EREMOVE-ing an SECS still with child, in which case it can
> +		 * be handled by EREMOVE-ing the SECS again after all pages in
> +		 * virtual EPC have been EREMOVE-ed. See comments in below in
> +		 * sgx_virt_epc_release().
> +		 */
> +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> +		return ret;
> +	}
> +
> +	__sgx_free_epc_page(epc_page);
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_release(struct inode *inode, struct file *file)
> +{
> +	struct sgx_virt_epc *epc = file->private_data;
> +	struct sgx_epc_page *epc_page, *tmp, *entry;
> +	unsigned long index;
> +
> +	LIST_HEAD(secs_pages);
> +
> +	mmdrop(epc->mm);
> +
> +	xa_for_each(&epc->page_array, index, entry) {
> +		/*
> +		 * Virtual EPC pages are not tracked, so it's possible for
> +		 * EREMOVE to fail due to, e.g. a SECS page still has children
> +		 * if guest was shutdown unexpectedly. If it is the case, leave
> +		 * it in the xarray and retry EREMOVE below later.
> +		 */
> +		if (sgx_virt_epc_free_page(entry))
> +			continue;
> +
> +		xa_erase(&epc->page_array, index);
> +	}
> +
> +	/*
> +	 * Retry all failed pages after iterating through the entire tree, at
> +	 * which point all children should be removed and the SECS pages can be
> +	 * nuked as well...unless userspace has exposed multiple instance of
> +	 * virtual EPC to a single VM.
> +	 */
> +	xa_for_each(&epc->page_array, index, entry) {
> +		epc_page = entry;
> +		/*
> +		 * Error here means that EREMOVE failed due to a SECS page
> +		 * still has child on *another* EPC instance.  Put it to a
> +		 * temporary SECS list which will be spliced to 'zombie page
> +		 * list' and will be EREMOVE-ed again when freeing another
> +		 * virtual EPC instance.
> +		 */
> +		if (sgx_virt_epc_free_page(epc_page))
> +			list_add_tail(&epc_page->list, &secs_pages);
> +
> +		xa_erase(&epc->page_array, index);
> +	}
> +
> +	/*
> +	 * Third time's a charm.  Try to EREMOVE zombie SECS pages from virtual
> +	 * EPC instances that were previously released, i.e. free SECS pages
> +	 * that were in limbo due to having children in *this* EPC instance.
> +	 */
> +	mutex_lock(&virt_epc_lock);
> +	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
> +		/*
> +		 * Speculatively remove the page from the list of zombies, if
> +		 * the page is successfully EREMOVE it will be added to the
> +		 * list of free pages.  If EREMOVE fails, throw the page on the
> +		 * local list, which will be spliced on at the end.
> +		 */
> +		list_del(&epc_page->list);
> +
> +		if (sgx_virt_epc_free_page(epc_page))
> +			list_add_tail(&epc_page->list, &secs_pages);
> +	}
> +
> +	if (!list_empty(&secs_pages))
> +		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
> +	mutex_unlock(&virt_epc_lock);
> +
> +	kfree(epc);
> +
> +	return 0;
> +}
> +
> +static int sgx_virt_epc_open(struct inode *inode, struct file *file)
> +{
> +	struct sgx_virt_epc *epc;
> +
> +	epc = kzalloc(sizeof(struct sgx_virt_epc), GFP_KERNEL);
> +	if (!epc)
> +		return -ENOMEM;
> +	/*
> +	 * Keep the current->mm to virtual EPC. It will be checked in
> +	 * sgx_virt_epc_mmap() to prevent, in case of fork, child being
> +	 * able to mmap() to the same virtual EPC pages.
> +	 */
> +	mmgrab(current->mm);
> +	epc->mm = current->mm;
> +	mutex_init(&epc->lock);
> +	xa_init(&epc->page_array);
> +
> +	file->private_data = epc;
> +
> +	return 0;
> +}
> +
> +static const struct file_operations sgx_virt_epc_fops = {
> +	.owner			= THIS_MODULE,
> +	.open			= sgx_virt_epc_open,
> +	.release		= sgx_virt_epc_release,
> +	.mmap			= sgx_virt_epc_mmap,
> +};
> +
> +static struct miscdevice sgx_virt_epc_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "sgx_virt_epc",
> +	.nodename = "sgx_virt_epc",
> +	.fops = &sgx_virt_epc_fops,
> +};
> +
> +int __init sgx_virt_epc_init(void)
> +{
> +	INIT_LIST_HEAD(&virt_epc_zombie_pages);
> +	mutex_init(&virt_epc_lock);
> +
> +	return misc_register(&sgx_virt_epc_dev);
> +}
> diff --git a/arch/x86/kernel/cpu/sgx/virt.h b/arch/x86/kernel/cpu/sgx/virt.h
> new file mode 100644
> index 000000000000..e5434541a122
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/virt.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +#ifndef _ASM_X86_SGX_VIRT_H
> +#define _ASM_X86_SGX_VIRT_H
> +
> +#ifdef CONFIG_X86_SGX_VIRTUALIZATION
> +int __init sgx_virt_epc_init(void);
> +#else
> +static inline int __init sgx_virt_epc_init(void)
> +{
> +	return -ENODEV;
> +}
> +#endif
> +
> +#endif /* _ASM_X86_SGX_VIRT_H */
> -- 
> 2.29.2
> 
>
Huang, Kai Jan. 12, 2021, 12:56 a.m. UTC | #10
On Tue, 12 Jan 2021 01:38:23 +0200 Jarkko Sakkinen wrote:
> On Wed, Jan 06, 2021 at 02:55:20PM +1300, Kai Huang wrote:
> > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > 
> > Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
> > EPC without an associated enclave.  The intended and only known use case
> > for raw EPC allocation is to expose EPC to a KVM guest, hence the
> > virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.
> > 
> > Modify sgx_init() to always try to initialize virtual EPC driver, even
> > when SGX driver is disabled due to SGX Launch Control is in locked mode,
> > or not present at all, since SGX virtualization allows to expose SGX to
> > guests that support non-LC configurations.
> > 
> > Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
> > /dev/sgx_virt_epc rather than in KVM. Doing so has two major advantages:
> > 
> >   - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
> >     just another memory backend for guests.
> > 
> >   - EPC management is wholly contained in the SGX subsystem, e.g. SGX
> >     does not have to export any symbols, changes to reclaim flows don't
> >     need to be routed through KVM, SGX's dirty laundry doesn't have to
> >     get aired out for the world to see, and so on and so forth.
> > 
> > The virtual EPC allocated to guests is currently not reclaimable, due to
> > oversubscription of EPC for KVM guests is not currently supported. Due
> > to the complications of handling reclaim conflicts between guest and
> > host, KVM EPC oversubscription is significantly more complex than basic
> > support for SGX virtualization.
> > 
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Co-developed-by: Kai Huang <kai.huang@intel.com>
> > Signed-off-by: Kai Huang <kai.huang@intel.com>
> 
> The commit message does not describe the code changes. It should
> have an understandable explanation of fops. There is nothing about
> the implementation right now.

Thanks for feedback. Does "understabdable explanation of fops" mean I
should add one sentence to say, for instance: "userspace hypervisor should open
the /dev/sgx_virt_epc, use mmap() to get a valid address range, and then use
that address range to create KVM memory region"?

Or should I include an example of how to use /dev/sgx_virt_epc in userspace, for
instance, below?

	fd = open("/dev/sgx_virt_epc", O_RDWR);
	void *addr = mmap(NULL, size, ..., fd);
	/* userspace hypervisor uses addr, size to create KVM memory slot */
	...

I dug the SGX driver side to understand what should I add, but in below commit I
don't see description of fops either:

	commit 3fe0778edac8628637e2fd23835996523b1a3372
	Author: Jarkko Sakkinen <jarkko@kernel.org>
	Date:   Fri Nov 13 00:01:22 2020 +0200

    	    x86/sgx: Add an SGX misc driver interface


> 
> /Jarkko
> 
> > ---
> >  arch/x86/Kconfig                 |  12 ++
> >  arch/x86/kernel/cpu/sgx/Makefile |   1 +
> >  arch/x86/kernel/cpu/sgx/main.c   |   5 +-
> >  arch/x86/kernel/cpu/sgx/virt.c   | 263 +++++++++++++++++++++++++++++++
> >  arch/x86/kernel/cpu/sgx/virt.h   |  14 ++
> >  5 files changed, 294 insertions(+), 1 deletion(-)
> >  create mode 100644 arch/x86/kernel/cpu/sgx/virt.c
> >  create mode 100644 arch/x86/kernel/cpu/sgx/virt.h
> > 
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 618d1aabccb8..a7318175509b 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1947,6 +1947,18 @@ config X86_SGX
> >  
> >  	  If unsure, say N.
> >  
> > +config X86_SGX_VIRTUALIZATION
> > +	bool "Software Guard eXtensions (SGX) Virtualization"
> > +	depends on X86_SGX && KVM_INTEL
> > +	help
> > +
> > +	  Enables KVM guests to create SGX enclaves.
> > +
> > +	  This includes support to expose "raw" unreclaimable enclave memory to
> > +	  guests via a device node, e.g. /dev/sgx_virt_epc.
> > +
> > +	  If unsure, say N.
> > +
> >  config EFI
> >  	bool "EFI runtime service support"
> >  	depends on ACPI
> > diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> > index 91d3dc784a29..7a25bf63adfb 100644
> > --- a/arch/x86/kernel/cpu/sgx/Makefile
> > +++ b/arch/x86/kernel/cpu/sgx/Makefile
> > @@ -3,3 +3,4 @@ obj-y += \
> >  	encl.o \
> >  	ioctl.o \
> >  	main.o
> > +obj-$(CONFIG_X86_SGX_VIRTUALIZATION)	+= virt.o
> > diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> > index 95aad183bb65..02993a327a1f 100644
> > --- a/arch/x86/kernel/cpu/sgx/main.c
> > +++ b/arch/x86/kernel/cpu/sgx/main.c
> > @@ -9,9 +9,11 @@
> >  #include <linux/sched/mm.h>
> >  #include <linux/sched/signal.h>
> >  #include <linux/slab.h>
> > +#include "arch.h"
> >  #include "driver.h"
> >  #include "encl.h"
> >  #include "encls.h"
> > +#include "virt.h"
> >  
> >  struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> >  static int sgx_nr_epc_sections;
> > @@ -726,7 +728,8 @@ static void __init sgx_init(void)
> >  	if (!sgx_page_reclaimer_init())
> >  		goto err_page_cache;
> >  
> > -	ret = sgx_drv_init();
> > +	/* Success if the native *or* virtual EPC driver initialized cleanly. */
> > +	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
> >  	if (ret)
> >  		goto err_kthread;
> >  
> > diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> > new file mode 100644
> > index 000000000000..d625551ccf25
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/virt.c
> > @@ -0,0 +1,263 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*  Copyright(c) 2016-20 Intel Corporation. */
> > +
> > +#include <linux/miscdevice.h>
> > +#include <linux/mm.h>
> > +#include <linux/mman.h>
> > +#include <linux/sched/mm.h>
> > +#include <linux/sched/signal.h>
> > +#include <linux/slab.h>
> > +#include <linux/xarray.h>
> > +#include <asm/sgx.h>
> > +#include <uapi/asm/sgx.h>
> > +
> > +#include "encls.h"
> > +#include "sgx.h"
> > +#include "virt.h"
> > +
> > +struct sgx_virt_epc {
> > +	struct xarray page_array;
> > +	struct mutex lock;
> > +	struct mm_struct *mm;
> > +};
> > +
> > +static struct mutex virt_epc_lock;
> > +static struct list_head virt_epc_zombie_pages;
> > +
> > +static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
> > +				struct vm_area_struct *vma, unsigned long addr)
> > +{
> > +	struct sgx_epc_page *epc_page;
> > +	unsigned long index, pfn;
> > +	int ret;
> > +
> > +	/* epc->lock must already have been hold */
> > +
> > +	/* Calculate index of EPC page in virtual EPC's page_array */
> > +	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
> > +
> > +	epc_page = xa_load(&epc->page_array, index);
> > +	if (epc_page)
> > +		return 0;
> > +
> > +	epc_page = sgx_alloc_epc_page(epc, false);
> > +	if (IS_ERR(epc_page))
> > +		return PTR_ERR(epc_page);
> > +
> > +	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
> > +	if (ret)
> > +		goto err_free;
> > +
> > +	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
> > +
> > +	ret = vmf_insert_pfn(vma, addr, pfn);
> > +	if (ret != VM_FAULT_NOPAGE) {
> > +		ret = -EFAULT;
> > +		goto err_delete;
> > +	}
> > +
> > +	return 0;
> > +
> > +err_delete:
> > +	xa_erase(&epc->page_array, index);
> > +err_free:
> > +	sgx_free_epc_page(epc_page);
> > +	return ret;
> > +}
> > +
> > +static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
> > +{
> > +	struct vm_area_struct *vma = vmf->vma;
> > +	struct sgx_virt_epc *epc = vma->vm_private_data;
> > +	int ret;
> > +
> > +	mutex_lock(&epc->lock);
> > +	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
> > +	mutex_unlock(&epc->lock);
> > +
> > +	if (!ret)
> > +		return VM_FAULT_NOPAGE;
> > +
> > +	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
> > +		mmap_read_unlock(vma->vm_mm);
> > +		return VM_FAULT_RETRY;
> > +	}
> > +
> > +	return VM_FAULT_SIGBUS;
> > +}
> > +
> > +const struct vm_operations_struct sgx_virt_epc_vm_ops = {
> > +	.fault = sgx_virt_epc_fault,
> > +};
> > +
> > +static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	struct sgx_virt_epc *epc = file->private_data;
> > +
> > +	if (!(vma->vm_flags & VM_SHARED))
> > +		return -EINVAL;
> > +
> > +	/*
> > +	 * Don't allow mmap() from child after fork(), since child and parent
> > +	 * cannot map to the same EPC.
> > +	 */
> > +	if (vma->vm_mm != epc->mm)
> > +		return -EINVAL;
> > +
> > +	vma->vm_ops = &sgx_virt_epc_vm_ops;
> > +	/* Don't copy VMA in fork() */
> > +	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
> > +	vma->vm_private_data = file->private_data;
> > +
> > +	return 0;
> > +}
> > +
> > +static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
> > +{
> > +	int ret;
> > +
> > +	if (!epc_page)
> > +		return 0;
> > +
> > +	/*
> > +	 * Explicitly EREMOVE virtual EPC page. Virtual EPC is only used by
> > +	 * guest, and in normal condition guest should have done EREMOVE for
> > +	 * all EPC pages before they are freed here. But it's possible guest
> > +	 * is killed or crashed unnormally in which case EREMOVE has not been
> > +	 * done. Do EREMOVE unconditionally here to cover both cases, because
> > +	 * it's not possible to tell whether guest has done EREMOVE, since
> > +	 * virtual EPC page status is not tracked. And it is fine to EREMOVE
> > +	 * EPC page multiple times.
> > +	 */
> > +	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
> > +	if (ret) {
> > +		/*
> > +		 * Only SGX_CHILD_PRESENT is expected, which is because of
> > +		 * EREMOVE-ing an SECS still with child, in which case it can
> > +		 * be handled by EREMOVE-ing the SECS again after all pages in
> > +		 * virtual EPC have been EREMOVE-ed. See comments in below in
> > +		 * sgx_virt_epc_release().
> > +		 */
> > +		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
> > +		return ret;
> > +	}
> > +
> > +	__sgx_free_epc_page(epc_page);
> > +	return 0;
> > +}
> > +
> > +static int sgx_virt_epc_release(struct inode *inode, struct file *file)
> > +{
> > +	struct sgx_virt_epc *epc = file->private_data;
> > +	struct sgx_epc_page *epc_page, *tmp, *entry;
> > +	unsigned long index;
> > +
> > +	LIST_HEAD(secs_pages);
> > +
> > +	mmdrop(epc->mm);
> > +
> > +	xa_for_each(&epc->page_array, index, entry) {
> > +		/*
> > +		 * Virtual EPC pages are not tracked, so it's possible for
> > +		 * EREMOVE to fail due to, e.g. a SECS page still has children
> > +		 * if guest was shutdown unexpectedly. If it is the case, leave
> > +		 * it in the xarray and retry EREMOVE below later.
> > +		 */
> > +		if (sgx_virt_epc_free_page(entry))
> > +			continue;
> > +
> > +		xa_erase(&epc->page_array, index);
> > +	}
> > +
> > +	/*
> > +	 * Retry all failed pages after iterating through the entire tree, at
> > +	 * which point all children should be removed and the SECS pages can be
> > +	 * nuked as well...unless userspace has exposed multiple instance of
> > +	 * virtual EPC to a single VM.
> > +	 */
> > +	xa_for_each(&epc->page_array, index, entry) {
> > +		epc_page = entry;
> > +		/*
> > +		 * Error here means that EREMOVE failed due to a SECS page
> > +		 * still has child on *another* EPC instance.  Put it to a
> > +		 * temporary SECS list which will be spliced to 'zombie page
> > +		 * list' and will be EREMOVE-ed again when freeing another
> > +		 * virtual EPC instance.
> > +		 */
> > +		if (sgx_virt_epc_free_page(epc_page))
> > +			list_add_tail(&epc_page->list, &secs_pages);
> > +
> > +		xa_erase(&epc->page_array, index);
> > +	}
> > +
> > +	/*
> > +	 * Third time's a charm.  Try to EREMOVE zombie SECS pages from virtual
> > +	 * EPC instances that were previously released, i.e. free SECS pages
> > +	 * that were in limbo due to having children in *this* EPC instance.
> > +	 */
> > +	mutex_lock(&virt_epc_lock);
> > +	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
> > +		/*
> > +		 * Speculatively remove the page from the list of zombies, if
> > +		 * the page is successfully EREMOVE it will be added to the
> > +		 * list of free pages.  If EREMOVE fails, throw the page on the
> > +		 * local list, which will be spliced on at the end.
> > +		 */
> > +		list_del(&epc_page->list);
> > +
> > +		if (sgx_virt_epc_free_page(epc_page))
> > +			list_add_tail(&epc_page->list, &secs_pages);
> > +	}
> > +
> > +	if (!list_empty(&secs_pages))
> > +		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
> > +	mutex_unlock(&virt_epc_lock);
> > +
> > +	kfree(epc);
> > +
> > +	return 0;
> > +}
> > +
> > +static int sgx_virt_epc_open(struct inode *inode, struct file *file)
> > +{
> > +	struct sgx_virt_epc *epc;
> > +
> > +	epc = kzalloc(sizeof(struct sgx_virt_epc), GFP_KERNEL);
> > +	if (!epc)
> > +		return -ENOMEM;
> > +	/*
> > +	 * Keep the current->mm to virtual EPC. It will be checked in
> > +	 * sgx_virt_epc_mmap() to prevent, in case of fork, child being
> > +	 * able to mmap() to the same virtual EPC pages.
> > +	 */
> > +	mmgrab(current->mm);
> > +	epc->mm = current->mm;
> > +	mutex_init(&epc->lock);
> > +	xa_init(&epc->page_array);
> > +
> > +	file->private_data = epc;
> > +
> > +	return 0;
> > +}
> > +
> > +static const struct file_operations sgx_virt_epc_fops = {
> > +	.owner			= THIS_MODULE,
> > +	.open			= sgx_virt_epc_open,
> > +	.release		= sgx_virt_epc_release,
> > +	.mmap			= sgx_virt_epc_mmap,
> > +};
> > +
> > +static struct miscdevice sgx_virt_epc_dev = {
> > +	.minor = MISC_DYNAMIC_MINOR,
> > +	.name = "sgx_virt_epc",
> > +	.nodename = "sgx_virt_epc",
> > +	.fops = &sgx_virt_epc_fops,
> > +};
> > +
> > +int __init sgx_virt_epc_init(void)
> > +{
> > +	INIT_LIST_HEAD(&virt_epc_zombie_pages);
> > +	mutex_init(&virt_epc_lock);
> > +
> > +	return misc_register(&sgx_virt_epc_dev);
> > +}
> > diff --git a/arch/x86/kernel/cpu/sgx/virt.h b/arch/x86/kernel/cpu/sgx/virt.h
> > new file mode 100644
> > index 000000000000..e5434541a122
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/virt.h
> > @@ -0,0 +1,14 @@
> > +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> > +#ifndef _ASM_X86_SGX_VIRT_H
> > +#define _ASM_X86_SGX_VIRT_H
> > +
> > +#ifdef CONFIG_X86_SGX_VIRTUALIZATION
> > +int __init sgx_virt_epc_init(void);
> > +#else
> > +static inline int __init sgx_virt_epc_init(void)
> > +{
> > +	return -ENODEV;
> > +}
> > +#endif
> > +
> > +#endif /* _ASM_X86_SGX_VIRT_H */
> > -- 
> > 2.29.2
> > 
> >
Jarkko Sakkinen Jan. 12, 2021, 1:50 a.m. UTC | #11
On Tue, Jan 12, 2021 at 01:56:54PM +1300, Kai Huang wrote:
> On Tue, 12 Jan 2021 01:38:23 +0200 Jarkko Sakkinen wrote:
> > On Wed, Jan 06, 2021 at 02:55:20PM +1300, Kai Huang wrote:
> > > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > > 
> > > Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
> > > EPC without an associated enclave.  The intended and only known use case
> > > for raw EPC allocation is to expose EPC to a KVM guest, hence the
> > > virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.
> > > 
> > > Modify sgx_init() to always try to initialize virtual EPC driver, even
> > > when SGX driver is disabled due to SGX Launch Control is in locked mode,
> > > or not present at all, since SGX virtualization allows to expose SGX to
> > > guests that support non-LC configurations.
> > > 
> > > Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
> > > /dev/sgx_virt_epc rather than in KVM. Doing so has two major advantages:
> > > 
> > >   - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
> > >     just another memory backend for guests.
> > > 
> > >   - EPC management is wholly contained in the SGX subsystem, e.g. SGX
> > >     does not have to export any symbols, changes to reclaim flows don't
> > >     need to be routed through KVM, SGX's dirty laundry doesn't have to
> > >     get aired out for the world to see, and so on and so forth.
> > > 
> > > The virtual EPC allocated to guests is currently not reclaimable, due to
> > > oversubscription of EPC for KVM guests is not currently supported. Due
> > > to the complications of handling reclaim conflicts between guest and
> > > host, KVM EPC oversubscription is significantly more complex than basic
> > > support for SGX virtualization.
> > > 
> > > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > > Co-developed-by: Kai Huang <kai.huang@intel.com>
> > > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > 
> > The commit message does not describe the code changes. It should
> > have an understandable explanation of fops. There is nothing about
> > the implementation right now.
> 
> Thanks for feedback. Does "understabdable explanation of fops" mean I
> should add one sentence to say, for instance: "userspace hypervisor should open
> the /dev/sgx_virt_epc, use mmap() to get a valid address range, and then use
> that address range to create KVM memory region"?
> 
> Or should I include an example of how to use /dev/sgx_virt_epc in userspace, for
> instance, below?
> 
> 	fd = open("/dev/sgx_virt_epc", O_RDWR);
> 	void *addr = mmap(NULL, size, ..., fd);
> 	/* userspace hypervisor uses addr, size to create KVM memory slot */
> 	...

I would suggest just to describe them in few sentences. Just write
how you understand them in one paragraph.

/Jarkko
Huang, Kai Jan. 12, 2021, 2:03 a.m. UTC | #12
On Tue, 12 Jan 2021 03:50:23 +0200 Jarkko Sakkinen wrote:
> On Tue, Jan 12, 2021 at 01:56:54PM +1300, Kai Huang wrote:
> > On Tue, 12 Jan 2021 01:38:23 +0200 Jarkko Sakkinen wrote:
> > > On Wed, Jan 06, 2021 at 02:55:20PM +1300, Kai Huang wrote:
> > > > From: Sean Christopherson <sean.j.christopherson@intel.com>
> > > > 
> > > > Add a misc device /dev/sgx_virt_epc to allow userspace to allocate "raw"
> > > > EPC without an associated enclave.  The intended and only known use case
> > > > for raw EPC allocation is to expose EPC to a KVM guest, hence the
> > > > virt_epc moniker, virt.{c,h} files and X86_SGX_VIRTUALIZATION Kconfig.
> > > > 
> > > > Modify sgx_init() to always try to initialize virtual EPC driver, even
> > > > when SGX driver is disabled due to SGX Launch Control is in locked mode,
> > > > or not present at all, since SGX virtualization allows to expose SGX to
> > > > guests that support non-LC configurations.
> > > > 
> > > > Implement the "raw" EPC allocation in the x86 core-SGX subsystem via
> > > > /dev/sgx_virt_epc rather than in KVM. Doing so has two major advantages:
> > > > 
> > > >   - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
> > > >     just another memory backend for guests.
> > > > 
> > > >   - EPC management is wholly contained in the SGX subsystem, e.g. SGX
> > > >     does not have to export any symbols, changes to reclaim flows don't
> > > >     need to be routed through KVM, SGX's dirty laundry doesn't have to
> > > >     get aired out for the world to see, and so on and so forth.
> > > > 
> > > > The virtual EPC allocated to guests is currently not reclaimable, due to
> > > > oversubscription of EPC for KVM guests is not currently supported. Due
> > > > to the complications of handling reclaim conflicts between guest and
> > > > host, KVM EPC oversubscription is significantly more complex than basic
> > > > support for SGX virtualization.
> > > > 
> > > > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > > > Co-developed-by: Kai Huang <kai.huang@intel.com>
> > > > Signed-off-by: Kai Huang <kai.huang@intel.com>
> > > 
> > > The commit message does not describe the code changes. It should
> > > have an understandable explanation of fops. There is nothing about
> > > the implementation right now.
> > 
> > Thanks for feedback. Does "understabdable explanation of fops" mean I
> > should add one sentence to say, for instance: "userspace hypervisor should open
> > the /dev/sgx_virt_epc, use mmap() to get a valid address range, and then use
> > that address range to create KVM memory region"?
> > 
> > Or should I include an example of how to use /dev/sgx_virt_epc in userspace, for
> > instance, below?
> > 
> > 	fd = open("/dev/sgx_virt_epc", O_RDWR);
> > 	void *addr = mmap(NULL, size, ..., fd);
> > 	/* userspace hypervisor uses addr, size to create KVM memory slot */
> > 	...
> 
> I would suggest just to describe them in few sentences. Just write
> how you understand them in one paragraph.

Will do. Thanks.
Huang, Kai Jan. 15, 2021, 2:07 p.m. UTC | #13
On Wed, 6 Jan 2021 21:02:54 -0800 Dave Hansen wrote:
> On 1/6/21 5:42 PM, Kai Huang wrote:
> >> I understand why this made sense for regular enclaves, but I'm having a
> >> harder time here.  If you mmap(fd, MAP_SHARED), fork(), and then pass
> >> that mapping through to two different guests, you get to hold the
> >> pieces, just like if you did the same with normal memory.
> >>
> >> Why does the kernel need to enforce this policy?
> > Does Sean's reply in another email satisfy you?
> 
> I'm not totally convinced.
> 
> Please give it a go in the changelog for the next one and try to
> convince me that this is a good idea.  Focus on what the downsides will
> be if the kernel does not enforce this policy.  What will break, and why
> will it be bad?  Why is the kernel in the best position to thwart the
> badness?

Hi Dave, Sean,

Sorry for late reply of this. I feel I should try again to get consensus here.

From virtual EPC's perspective, if we don't force this in kernel, then
*theoretically*, userspace can use fork() to make multiple VMs map to the
same physical EPC, which will potentially cause enclaves in all VMs to behave
abnormally. So to me, from this perspective, it's better to enforce in kernel
so that only first VM can use this virtual EPC instance, because EPC by
architectural design cannot be shared.

But as Sean said, KVM doesn't support VM across multiple mm structs. And if I
read code correctly, KVM doesn't support userspace to use fork() to create new
VM. For instance, when creating VM, KVM grabs current->mm and keeps it in
'struct kvm' for bookkeeping, and kvm_vcpu_ioctl() and kvm_device_ioctl() will
refuse to work if kvm->mm doesn't equal to current->mm. So in practice, I
believe w/o enforcing this in kernel, we should also have no problem here.

Sean, please correct me if I am wrong.

Dave, if above stands, do you think it is reasonable to keep current->mm in
epc->mm and enforce in sgx_virt_epc_mmap()?
Dave Hansen Jan. 15, 2021, 3:39 p.m. UTC | #14
On 1/15/21 6:07 AM, Kai Huang wrote:
>>From virtual EPC's perspective, if we don't force this in kernel, then
> *theoretically*, userspace can use fork() to make multiple VMs map to the
> same physical EPC, which will potentially cause enclaves in all VMs to behave
> abnormally. So to me, from this perspective, it's better to enforce in kernel
> so that only first VM can use this virtual EPC instance, because EPC by
> architectural design cannot be shared.
> 
> But as Sean said, KVM doesn't support VM across multiple mm structs. And if I
> read code correctly, KVM doesn't support userspace to use fork() to create new
> VM. For instance, when creating VM, KVM grabs current->mm and keeps it in
> 'struct kvm' for bookkeeping, and kvm_vcpu_ioctl() and kvm_device_ioctl() will
> refuse to work if kvm->mm doesn't equal to current->mm. So in practice, I
> believe w/o enforcing this in kernel, we should also have no problem here.
> 
> Sean, please correct me if I am wrong.
> 
> Dave, if above stands, do you think it is reasonable to keep current->mm in
> epc->mm and enforce in sgx_virt_epc_mmap()?

Everything you wrote above tells me the kernel should not be enforcing
the behavior.  You basically said that it's only a theoretical problem,
and old if someone goes and does something with KVM that's nobody can do
today.

You've 100% convinced me that having the kernel enforce this is
*un*reasonable.
Huang, Kai Jan. 15, 2021, 9:33 p.m. UTC | #15
On Fri, 15 Jan 2021 07:39:44 -0800 Dave Hansen wrote:
> On 1/15/21 6:07 AM, Kai Huang wrote:
> >>From virtual EPC's perspective, if we don't force this in kernel, then
> > *theoretically*, userspace can use fork() to make multiple VMs map to the
> > same physical EPC, which will potentially cause enclaves in all VMs to behave
> > abnormally. So to me, from this perspective, it's better to enforce in kernel
> > so that only first VM can use this virtual EPC instance, because EPC by
> > architectural design cannot be shared.
> > 
> > But as Sean said, KVM doesn't support VM across multiple mm structs. And if I
> > read code correctly, KVM doesn't support userspace to use fork() to create new
> > VM. For instance, when creating VM, KVM grabs current->mm and keeps it in
> > 'struct kvm' for bookkeeping, and kvm_vcpu_ioctl() and kvm_device_ioctl() will
> > refuse to work if kvm->mm doesn't equal to current->mm. So in practice, I
> > believe w/o enforcing this in kernel, we should also have no problem here.
> > 
> > Sean, please correct me if I am wrong.
> > 
> > Dave, if above stands, do you think it is reasonable to keep current->mm in
> > epc->mm and enforce in sgx_virt_epc_mmap()?
> 
> Everything you wrote above tells me the kernel should not be enforcing
> the behavior.  You basically said that it's only a theoretical problem,
> and old if someone goes and does something with KVM that's nobody can do
> today.
> 
> You've 100% convinced me that having the kernel enforce this is
> *un*reasonable.

Sean, I'll remove epc->mm, unless I see your further objection.

Thanks to you both.
Sean Christopherson Jan. 15, 2021, 9:45 p.m. UTC | #16
On Sat, Jan 16, 2021, Kai Huang wrote:
> On Fri, 15 Jan 2021 07:39:44 -0800 Dave Hansen wrote:
> > On 1/15/21 6:07 AM, Kai Huang wrote:
> > >>From virtual EPC's perspective, if we don't force this in kernel, then
> > > *theoretically*, userspace can use fork() to make multiple VMs map to the
> > > same physical EPC, which will potentially cause enclaves in all VMs to behave
> > > abnormally. So to me, from this perspective, it's better to enforce in kernel
> > > so that only first VM can use this virtual EPC instance, because EPC by
> > > architectural design cannot be shared.
> > > 
> > > But as Sean said, KVM doesn't support VM across multiple mm structs. And if I
> > > read code correctly, KVM doesn't support userspace to use fork() to create new
> > > VM. For instance, when creating VM, KVM grabs current->mm and keeps it in
> > > 'struct kvm' for bookkeeping, and kvm_vcpu_ioctl() and kvm_device_ioctl() will
> > > refuse to work if kvm->mm doesn't equal to current->mm. So in practice, I
> > > believe w/o enforcing this in kernel, we should also have no problem here.
> > > 
> > > Sean, please correct me if I am wrong.
> > > 
> > > Dave, if above stands, do you think it is reasonable to keep current->mm in
> > > epc->mm and enforce in sgx_virt_epc_mmap()?
> > 
> > Everything you wrote above tells me the kernel should not be enforcing
> > the behavior.  You basically said that it's only a theoretical problem,
> > and old if someone goes and does something with KVM that's nobody can do
> > today.
> > 
> > You've 100% convinced me that having the kernel enforce this is
> > *un*reasonable.
> 
> Sean, I'll remove epc->mm, unless I see your further objection.

It's probably ok.  I guess worst case scenario, to avoid the mm tracking
nightmare for oversubscription, you could prevent attaching KVM to a virtual EPC
if there is already a mm associated with the EPC, or if there are already EPC
pages "in" the virt EPC.
Huang, Kai Jan. 15, 2021, 10:30 p.m. UTC | #17
On Fri, 15 Jan 2021 13:45:21 -0800 Sean Christopherson wrote:
> On Sat, Jan 16, 2021, Kai Huang wrote:
> > On Fri, 15 Jan 2021 07:39:44 -0800 Dave Hansen wrote:
> > > On 1/15/21 6:07 AM, Kai Huang wrote:
> > > >>From virtual EPC's perspective, if we don't force this in kernel, then
> > > > *theoretically*, userspace can use fork() to make multiple VMs map to the
> > > > same physical EPC, which will potentially cause enclaves in all VMs to behave
> > > > abnormally. So to me, from this perspective, it's better to enforce in kernel
> > > > so that only first VM can use this virtual EPC instance, because EPC by
> > > > architectural design cannot be shared.
> > > > 
> > > > But as Sean said, KVM doesn't support VM across multiple mm structs. And if I
> > > > read code correctly, KVM doesn't support userspace to use fork() to create new
> > > > VM. For instance, when creating VM, KVM grabs current->mm and keeps it in
> > > > 'struct kvm' for bookkeeping, and kvm_vcpu_ioctl() and kvm_device_ioctl() will
> > > > refuse to work if kvm->mm doesn't equal to current->mm. So in practice, I
> > > > believe w/o enforcing this in kernel, we should also have no problem here.
> > > > 
> > > > Sean, please correct me if I am wrong.
> > > > 
> > > > Dave, if above stands, do you think it is reasonable to keep current->mm in
> > > > epc->mm and enforce in sgx_virt_epc_mmap()?
> > > 
> > > Everything you wrote above tells me the kernel should not be enforcing
> > > the behavior.  You basically said that it's only a theoretical problem,
> > > and old if someone goes and does something with KVM that's nobody can do
> > > today.
> > > 
> > > You've 100% convinced me that having the kernel enforce this is
> > > *un*reasonable.
> > 
> > Sean, I'll remove epc->mm, unless I see your further objection.
> 
> It's probably ok.  I guess worst case scenario, to avoid the mm tracking
> nightmare for oversubscription, you could prevent attaching KVM to a virtual EPC
> if there is already a mm associated with the EPC, or if there are already EPC
> pages "in" the virt EPC.

Since we are not 100% certain oversubscription will be upstreamed, I think it
makes sense to address when we do it. For now, let us just drop it. 

Makes sense? Thanks.
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 618d1aabccb8..a7318175509b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1947,6 +1947,18 @@  config X86_SGX
 
 	  If unsure, say N.
 
+config X86_SGX_VIRTUALIZATION
+	bool "Software Guard eXtensions (SGX) Virtualization"
+	depends on X86_SGX && KVM_INTEL
+	help
+
+	  Enables KVM guests to create SGX enclaves.
+
+	  This includes support to expose "raw" unreclaimable enclave memory to
+	  guests via a device node, e.g. /dev/sgx_virt_epc.
+
+	  If unsure, say N.
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..7a25bf63adfb 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@  obj-y += \
 	encl.o \
 	ioctl.o \
 	main.o
+obj-$(CONFIG_X86_SGX_VIRTUALIZATION)	+= virt.o
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 95aad183bb65..02993a327a1f 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -9,9 +9,11 @@ 
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include "arch.h"
 #include "driver.h"
 #include "encl.h"
 #include "encls.h"
+#include "virt.h"
 
 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
 static int sgx_nr_epc_sections;
@@ -726,7 +728,8 @@  static void __init sgx_init(void)
 	if (!sgx_page_reclaimer_init())
 		goto err_page_cache;
 
-	ret = sgx_drv_init();
+	/* Success if the native *or* virtual EPC driver initialized cleanly. */
+	ret = !!sgx_drv_init() & !!sgx_virt_epc_init();
 	if (ret)
 		goto err_kthread;
 
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..d625551ccf25
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,263 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*  Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+#include "virt.h"
+
+struct sgx_virt_epc {
+	struct xarray page_array;
+	struct mutex lock;
+	struct mm_struct *mm;
+};
+
+static struct mutex virt_epc_lock;
+static struct list_head virt_epc_zombie_pages;
+
+static int __sgx_virt_epc_fault(struct sgx_virt_epc *epc,
+				struct vm_area_struct *vma, unsigned long addr)
+{
+	struct sgx_epc_page *epc_page;
+	unsigned long index, pfn;
+	int ret;
+
+	/* epc->lock must already have been hold */
+
+	/* Calculate index of EPC page in virtual EPC's page_array */
+	index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+	epc_page = xa_load(&epc->page_array, index);
+	if (epc_page)
+		return 0;
+
+	epc_page = sgx_alloc_epc_page(epc, false);
+	if (IS_ERR(epc_page))
+		return PTR_ERR(epc_page);
+
+	ret = xa_err(xa_store(&epc->page_array, index, epc_page, GFP_KERNEL));
+	if (ret)
+		goto err_free;
+
+	pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+	ret = vmf_insert_pfn(vma, addr, pfn);
+	if (ret != VM_FAULT_NOPAGE) {
+		ret = -EFAULT;
+		goto err_delete;
+	}
+
+	return 0;
+
+err_delete:
+	xa_erase(&epc->page_array, index);
+err_free:
+	sgx_free_epc_page(epc_page);
+	return ret;
+}
+
+static vm_fault_t sgx_virt_epc_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct sgx_virt_epc *epc = vma->vm_private_data;
+	int ret;
+
+	mutex_lock(&epc->lock);
+	ret = __sgx_virt_epc_fault(epc, vma, vmf->address);
+	mutex_unlock(&epc->lock);
+
+	if (!ret)
+		return VM_FAULT_NOPAGE;
+
+	if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+		mmap_read_unlock(vma->vm_mm);
+		return VM_FAULT_RETRY;
+	}
+
+	return VM_FAULT_SIGBUS;
+}
+
+const struct vm_operations_struct sgx_virt_epc_vm_ops = {
+	.fault = sgx_virt_epc_fault,
+};
+
+static int sgx_virt_epc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sgx_virt_epc *epc = file->private_data;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	/*
+	 * Don't allow mmap() from child after fork(), since child and parent
+	 * cannot map to the same EPC.
+	 */
+	if (vma->vm_mm != epc->mm)
+		return -EINVAL;
+
+	vma->vm_ops = &sgx_virt_epc_vm_ops;
+	/* Don't copy VMA in fork() */
+	vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+	vma->vm_private_data = file->private_data;
+
+	return 0;
+}
+
+static int sgx_virt_epc_free_page(struct sgx_epc_page *epc_page)
+{
+	int ret;
+
+	if (!epc_page)
+		return 0;
+
+	/*
+	 * Explicitly EREMOVE virtual EPC page. Virtual EPC is only used by
+	 * guest, and in normal condition guest should have done EREMOVE for
+	 * all EPC pages before they are freed here. But it's possible guest
+	 * is killed or crashed unnormally in which case EREMOVE has not been
+	 * done. Do EREMOVE unconditionally here to cover both cases, because
+	 * it's not possible to tell whether guest has done EREMOVE, since
+	 * virtual EPC page status is not tracked. And it is fine to EREMOVE
+	 * EPC page multiple times.
+	 */
+	ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+	if (ret) {
+		/*
+		 * Only SGX_CHILD_PRESENT is expected, which is because of
+		 * EREMOVE-ing an SECS still with child, in which case it can
+		 * be handled by EREMOVE-ing the SECS again after all pages in
+		 * virtual EPC have been EREMOVE-ed. See comments in below in
+		 * sgx_virt_epc_release().
+		 */
+		WARN_ON_ONCE(ret != SGX_CHILD_PRESENT);
+		return ret;
+	}
+
+	__sgx_free_epc_page(epc_page);
+	return 0;
+}
+
+static int sgx_virt_epc_release(struct inode *inode, struct file *file)
+{
+	struct sgx_virt_epc *epc = file->private_data;
+	struct sgx_epc_page *epc_page, *tmp, *entry;
+	unsigned long index;
+
+	LIST_HEAD(secs_pages);
+
+	mmdrop(epc->mm);
+
+	xa_for_each(&epc->page_array, index, entry) {
+		/*
+		 * Virtual EPC pages are not tracked, so it's possible for
+		 * EREMOVE to fail due to, e.g. a SECS page still has children
+		 * if guest was shutdown unexpectedly. If it is the case, leave
+		 * it in the xarray and retry EREMOVE below later.
+		 */
+		if (sgx_virt_epc_free_page(entry))
+			continue;
+
+		xa_erase(&epc->page_array, index);
+	}
+
+	/*
+	 * Retry all failed pages after iterating through the entire tree, at
+	 * which point all children should be removed and the SECS pages can be
+	 * nuked as well...unless userspace has exposed multiple instance of
+	 * virtual EPC to a single VM.
+	 */
+	xa_for_each(&epc->page_array, index, entry) {
+		epc_page = entry;
+		/*
+		 * Error here means that EREMOVE failed due to a SECS page
+		 * still has child on *another* EPC instance.  Put it to a
+		 * temporary SECS list which will be spliced to 'zombie page
+		 * list' and will be EREMOVE-ed again when freeing another
+		 * virtual EPC instance.
+		 */
+		if (sgx_virt_epc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+
+		xa_erase(&epc->page_array, index);
+	}
+
+	/*
+	 * Third time's a charm.  Try to EREMOVE zombie SECS pages from virtual
+	 * EPC instances that were previously released, i.e. free SECS pages
+	 * that were in limbo due to having children in *this* EPC instance.
+	 */
+	mutex_lock(&virt_epc_lock);
+	list_for_each_entry_safe(epc_page, tmp, &virt_epc_zombie_pages, list) {
+		/*
+		 * Speculatively remove the page from the list of zombies, if
+		 * the page is successfully EREMOVE it will be added to the
+		 * list of free pages.  If EREMOVE fails, throw the page on the
+		 * local list, which will be spliced on at the end.
+		 */
+		list_del(&epc_page->list);
+
+		if (sgx_virt_epc_free_page(epc_page))
+			list_add_tail(&epc_page->list, &secs_pages);
+	}
+
+	if (!list_empty(&secs_pages))
+		list_splice_tail(&secs_pages, &virt_epc_zombie_pages);
+	mutex_unlock(&virt_epc_lock);
+
+	kfree(epc);
+
+	return 0;
+}
+
+static int sgx_virt_epc_open(struct inode *inode, struct file *file)
+{
+	struct sgx_virt_epc *epc;
+
+	epc = kzalloc(sizeof(struct sgx_virt_epc), GFP_KERNEL);
+	if (!epc)
+		return -ENOMEM;
+	/*
+	 * Keep the current->mm to virtual EPC. It will be checked in
+	 * sgx_virt_epc_mmap() to prevent, in case of fork, child being
+	 * able to mmap() to the same virtual EPC pages.
+	 */
+	mmgrab(current->mm);
+	epc->mm = current->mm;
+	mutex_init(&epc->lock);
+	xa_init(&epc->page_array);
+
+	file->private_data = epc;
+
+	return 0;
+}
+
+static const struct file_operations sgx_virt_epc_fops = {
+	.owner			= THIS_MODULE,
+	.open			= sgx_virt_epc_open,
+	.release		= sgx_virt_epc_release,
+	.mmap			= sgx_virt_epc_mmap,
+};
+
+static struct miscdevice sgx_virt_epc_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "sgx_virt_epc",
+	.nodename = "sgx_virt_epc",
+	.fops = &sgx_virt_epc_fops,
+};
+
+int __init sgx_virt_epc_init(void)
+{
+	INIT_LIST_HEAD(&virt_epc_zombie_pages);
+	mutex_init(&virt_epc_lock);
+
+	return misc_register(&sgx_virt_epc_dev);
+}
diff --git a/arch/x86/kernel/cpu/sgx/virt.h b/arch/x86/kernel/cpu/sgx/virt.h
new file mode 100644
index 000000000000..e5434541a122
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.h
@@ -0,0 +1,14 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+#ifndef _ASM_X86_SGX_VIRT_H
+#define _ASM_X86_SGX_VIRT_H
+
+#ifdef CONFIG_X86_SGX_VIRTUALIZATION
+int __init sgx_virt_epc_init(void);
+#else
+static inline int __init sgx_virt_epc_init(void)
+{
+	return -ENODEV;
+}
+#endif
+
+#endif /* _ASM_X86_SGX_VIRT_H */