diff mbox series

[v38,13/24] x86/sgx: Add SGX_IOC_ENCLAVE_ADD_PAGES

Message ID 20200915110522.893152-14-jarkko.sakkinen@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Intel SGX foundations | expand

Commit Message

Jarkko Sakkinen Sept. 15, 2020, 11:05 a.m. UTC
Add an ioctl, which performs ENCLS[EADD] that adds new visible page to an
enclave, and optionally ENCLS[EEXTEND] operations that hash the page to the
enclave measurement. By visible we mean a page that can be mapped to the
address range of an enclave.

Acked-by: Jethro Beekman <jethro@fortanix.com>
Tested-by: Jethro Beekman <jethro@fortanix.com>
Tested-by: Haitao Huang <haitao.huang@linux.intel.com>
Tested-by: Chunyang Hui <sanqian.hcy@antfin.com>
Tested-by: Jordan Hand <jorhand@linux.microsoft.com>
Tested-by: Nathaniel McCallum <npmccallum@redhat.com>
Tested-by: Seth Moore <sethmo@google.com>
Tested-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/x86/include/uapi/asm/sgx.h |  28 +++
 arch/x86/kernel/cpu/sgx/ioctl.c | 294 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/sgx.h   |   1 +
 3 files changed, 323 insertions(+)

Comments

Jarkko Sakkinen Sept. 17, 2020, 4:02 p.m. UTC | #1
On Thu, Sep 17, 2020 at 12:34:18AM -0500, Haitao Huang wrote:
> On Tue, 15 Sep 2020 06:05:11 -0500, Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> ...
> 
> > +static int __sgx_encl_add_page(struct sgx_encl *encl,
> > +			       struct sgx_encl_page *encl_page,
> > +			       struct sgx_epc_page *epc_page,
> > +			       struct sgx_secinfo *secinfo, unsigned long src)
> > +{
> > +	struct sgx_pageinfo pginfo;
> > +	struct vm_area_struct *vma;
> > +	struct page *src_page;
> > +	int ret;
> > +
> > +	/* Query vma's VM_MAYEXEC as an indirect path_noexec() check. */
> > +	if (encl_page->vm_max_prot_bits & VM_EXEC) {
> > +		vma = find_vma(current->mm, src);
> > +		if (!vma)
> > +			return -EFAULT;
> > +
> > +		if (!(vma->vm_flags & VM_MAYEXEC))
> > +			return -EACCES;
> > +	}
> > +
> > +	ret = get_user_pages(src, 1, 0, &src_page, NULL);
> > +	if (ret < 1)
> > +		return ret;
> > +
> > +	pginfo.secs = (unsigned long)sgx_get_epc_addr(encl->secs.epc_page);
> > +	pginfo.addr = SGX_ENCL_PAGE_ADDR(encl_page);
> > +	pginfo.metadata = (unsigned long)secinfo;
> > +	pginfo.contents = (unsigned long)kmap_atomic(src_page);
> > +
> > +	ret = __eadd(&pginfo, sgx_get_epc_addr(epc_page));
> > +
> > +	kunmap_atomic((void *)pginfo.contents);
> > +	put_page(src_page);
> > +
> > +	return ret ? -EIO : 0;
> > +}
> > +
> > +/*
> > + * If the caller requires measurement of the page as a proof for the
> > content,
> > + * use EEXTEND to add a measurement for 256 bytes of the page. Repeat
> > this
> > + * operation until the entire page is measured."
> > + */
> > +static int __sgx_encl_extend(struct sgx_encl *encl,
> > +			     struct sgx_epc_page *epc_page)
> > +{
> > +	int ret;
> > +	int i;
> > +
> > +	for (i = 0; i < 16; i++) {
> > +		ret = __eextend(sgx_get_epc_addr(encl->secs.epc_page),
> > +				sgx_get_epc_addr(epc_page) + (i * 0x100));
> > +		if (ret) {
> > +			if (encls_failed(ret))
> > +				ENCLS_WARN(ret, "EEXTEND");
> > +			return -EIO;
> > +		}
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
> > +			     unsigned long offset, struct sgx_secinfo *secinfo,
> > +			     unsigned long flags)
> > +{
> > +	struct sgx_encl_page *encl_page;
> > +	struct sgx_epc_page *epc_page;
> > +	int ret;
> > +
> > +	encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
> > +	if (IS_ERR(encl_page))
> > +		return PTR_ERR(encl_page);
> > +
> > +	epc_page = __sgx_alloc_epc_page();
> > +	if (IS_ERR(epc_page)) {
> > +		kfree(encl_page);
> > +		return PTR_ERR(epc_page);
> > +	}
> > +
> > +	mmap_read_lock(current->mm);
> > +	mutex_lock(&encl->lock);
> > +
> > +	/*
> > +	 * Insert prior to EADD in case of OOM.  EADD modifies MRENCLAVE, i.e.
> > +	 * can't be gracefully unwound, while failure on EADD/EXTEND is limited
> > +	 * to userspace errors (or kernel/hardware bugs).
> > +	 */
> > +	ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
> > +			encl_page, GFP_KERNEL);
> > +	if (ret)
> > +		goto err_out_unlock;
> > +
> > +	ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
> > +				  src);
> > +	if (ret)
> > +		goto err_out;
> > +
> > +	/*
> > +	 * Complete the "add" before doing the "extend" so that the "add"
> > +	 * isn't in a half-baked state in the extremely unlikely scenario
> > +	 * the enclave will be destroyed in response to EEXTEND failure.
> > +	 */
> > +	encl_page->encl = encl;
> > +	encl_page->epc_page = epc_page;
> > +	encl->secs_child_cnt++;
> > +
> > +	if (flags & SGX_PAGE_MEASURE) {
> > +		ret = __sgx_encl_extend(encl, epc_page);
> > +		if (ret)
> > +			goto err_out;
> > +	}
> > +
> > +	mutex_unlock(&encl->lock);
> > +	mmap_read_unlock(current->mm);
> > +	return ret;
> > +
> > +err_out:
> > +	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
> > +
> > +err_out_unlock:
> > +	mutex_unlock(&encl->lock);
> > +	mmap_read_unlock(current->mm);
> > +
> > +	sgx_free_epc_page(epc_page);
> > +	kfree(encl_page);
> > +
> > +	/*
> > +	 * Destroy enclave on ENCLS failure as this means that EPC has been
> > +	 * invalidated.
> > +	 */
> > +	if (ret == -EIO) {
> > +		mutex_lock(&encl->lock);
> > +		sgx_encl_destroy(encl);
> > +		mutex_unlock(&encl->lock);
> > +	}
> > +
> 
> I think originally we return both count of succeeded EADDs and the errors.
> So we only destroy enclaves in cases of fatal ENCLS failures.
> 
> Now we only return errors in all failures other than interrupted operations
> or SGX_MAX_ADD_PAGES_LENGTH is reached.
> 
> So, for the new design we should destroy enclaves in all cases here, not
> just for -EIO.
> 
> On the other hand, I do like the old way returning both the count and error
> better. It helps greatly for debugging any issues in enclave image or user
> space code, and also keeps flexibility for user space to recover in certain
> errors, such as out of EPC.

Right, I do get the OOM case but wouldn't in that case the reasonable
thing to do destroy the enclave that is not even running? I mean that
means that we are globally out of EPC.

/Jarkko
Jarkko Sakkinen Sept. 17, 2020, 4:03 p.m. UTC | #2
> > +	for (c = 0 ; c < addp.length; c += PAGE_SIZE) {
> > +		if (signal_pending(current)) {
> > +			if (!c)
> > +				ret = -EINTR;
> > +
> > +			break;
> > +		}
> > +
> 
> Return -ERESTARTSYS so that kernel can restart this IOC when SA_RESTART is
> set?

Yes, this is quite obvious. I will change it, thank you.

/Jarkko
Haitao Huang Sept. 17, 2020, 6:35 p.m. UTC | #3
On Thu, 17 Sep 2020 11:02:06 -0500, Jarkko Sakkinen  
<jarkko.sakkinen@linux.intel.com> wrote:

> On Thu, Sep 17, 2020 at 12:34:18AM -0500, Haitao Huang wrote:
>> On Tue, 15 Sep 2020 06:05:11 -0500, Jarkko Sakkinen
>> <jarkko.sakkinen@linux.intel.com> wrote:
>> ...
>>
>> > +static int __sgx_encl_add_page(struct sgx_encl *encl,
>> > +			       struct sgx_encl_page *encl_page,
>> > +			       struct sgx_epc_page *epc_page,
>> > +			       struct sgx_secinfo *secinfo, unsigned long src)
>> > +{
>> > +	struct sgx_pageinfo pginfo;
>> > +	struct vm_area_struct *vma;
>> > +	struct page *src_page;
>> > +	int ret;
>> > +
>> > +	/* Query vma's VM_MAYEXEC as an indirect path_noexec() check. */
>> > +	if (encl_page->vm_max_prot_bits & VM_EXEC) {
>> > +		vma = find_vma(current->mm, src);
>> > +		if (!vma)
>> > +			return -EFAULT;
>> > +
>> > +		if (!(vma->vm_flags & VM_MAYEXEC))
>> > +			return -EACCES;
>> > +	}
>> > +
>> > +	ret = get_user_pages(src, 1, 0, &src_page, NULL);
>> > +	if (ret < 1)
>> > +		return ret;
>> > +
>> > +	pginfo.secs = (unsigned long)sgx_get_epc_addr(encl->secs.epc_page);
>> > +	pginfo.addr = SGX_ENCL_PAGE_ADDR(encl_page);
>> > +	pginfo.metadata = (unsigned long)secinfo;
>> > +	pginfo.contents = (unsigned long)kmap_atomic(src_page);
>> > +
>> > +	ret = __eadd(&pginfo, sgx_get_epc_addr(epc_page));
>> > +
>> > +	kunmap_atomic((void *)pginfo.contents);
>> > +	put_page(src_page);
>> > +
>> > +	return ret ? -EIO : 0;
>> > +}
>> > +
>> > +/*
>> > + * If the caller requires measurement of the page as a proof for the
>> > content,
>> > + * use EEXTEND to add a measurement for 256 bytes of the page. Repeat
>> > this
>> > + * operation until the entire page is measured."
>> > + */
>> > +static int __sgx_encl_extend(struct sgx_encl *encl,
>> > +			     struct sgx_epc_page *epc_page)
>> > +{
>> > +	int ret;
>> > +	int i;
>> > +
>> > +	for (i = 0; i < 16; i++) {
>> > +		ret = __eextend(sgx_get_epc_addr(encl->secs.epc_page),
>> > +				sgx_get_epc_addr(epc_page) + (i * 0x100));
>> > +		if (ret) {
>> > +			if (encls_failed(ret))
>> > +				ENCLS_WARN(ret, "EEXTEND");
>> > +			return -EIO;
>> > +		}
>> > +	}
>> > +
>> > +	return 0;
>> > +}
>> > +
>> > +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long  
>> src,
>> > +			     unsigned long offset, struct sgx_secinfo *secinfo,
>> > +			     unsigned long flags)
>> > +{
>> > +	struct sgx_encl_page *encl_page;
>> > +	struct sgx_epc_page *epc_page;
>> > +	int ret;
>> > +
>> > +	encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
>> > +	if (IS_ERR(encl_page))
>> > +		return PTR_ERR(encl_page);
>> > +
>> > +	epc_page = __sgx_alloc_epc_page();
>> > +	if (IS_ERR(epc_page)) {
>> > +		kfree(encl_page);
>> > +		return PTR_ERR(epc_page);
>> > +	}
>> > +
>> > +	mmap_read_lock(current->mm);
>> > +	mutex_lock(&encl->lock);
>> > +
>> > +	/*
>> > +	 * Insert prior to EADD in case of OOM.  EADD modifies MRENCLAVE,  
>> i.e.
>> > +	 * can't be gracefully unwound, while failure on EADD/EXTEND is  
>> limited
>> > +	 * to userspace errors (or kernel/hardware bugs).
>> > +	 */
>> > +	ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
>> > +			encl_page, GFP_KERNEL);
>> > +	if (ret)
>> > +		goto err_out_unlock;
>> > +
>> > +	ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
>> > +				  src);
>> > +	if (ret)
>> > +		goto err_out;
>> > +
>> > +	/*
>> > +	 * Complete the "add" before doing the "extend" so that the "add"
>> > +	 * isn't in a half-baked state in the extremely unlikely scenario
>> > +	 * the enclave will be destroyed in response to EEXTEND failure.
>> > +	 */
>> > +	encl_page->encl = encl;
>> > +	encl_page->epc_page = epc_page;
>> > +	encl->secs_child_cnt++;
>> > +
>> > +	if (flags & SGX_PAGE_MEASURE) {
>> > +		ret = __sgx_encl_extend(encl, epc_page);
>> > +		if (ret)
>> > +			goto err_out;
>> > +	}
>> > +
>> > +	mutex_unlock(&encl->lock);
>> > +	mmap_read_unlock(current->mm);
>> > +	return ret;
>> > +
>> > +err_out:
>> > +	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
>> > +
>> > +err_out_unlock:
>> > +	mutex_unlock(&encl->lock);
>> > +	mmap_read_unlock(current->mm);
>> > +
>> > +	sgx_free_epc_page(epc_page);
>> > +	kfree(encl_page);
>> > +
>> > +	/*
>> > +	 * Destroy enclave on ENCLS failure as this means that EPC has been
>> > +	 * invalidated.
>> > +	 */
>> > +	if (ret == -EIO) {
>> > +		mutex_lock(&encl->lock);
>> > +		sgx_encl_destroy(encl);
>> > +		mutex_unlock(&encl->lock);
>> > +	}
>> > +
>>
>> I think originally we return both count of succeeded EADDs and the  
>> errors.
>> So we only destroy enclaves in cases of fatal ENCLS failures.
>>
>> Now we only return errors in all failures other than interrupted  
>> operations
>> or SGX_MAX_ADD_PAGES_LENGTH is reached.
>>
>> So, for the new design we should destroy enclaves in all cases here, not
>> just for -EIO.
>>
>> On the other hand, I do like the old way returning both the count and  
>> error
>> better. It helps greatly for debugging any issues in enclave image or  
>> user
>> space code, and also keeps flexibility for user space to recover in  
>> certain
>> errors, such as out of EPC.
>
> Right, I do get the OOM case but wouldn't in that case the reasonable
> thing to do destroy the enclave that is not even running? I mean that
> means that we are globally out of EPC.
>

I would say it could be a policy, but not the only one. If it does not  
make much difference to kernel, IMHO we should  not set it in stone now.   
Debugging is also huge benefit to me.
Sean Christopherson Sept. 18, 2020, 2:09 a.m. UTC | #4
On Thu, Sep 17, 2020 at 01:35:10PM -0500, Haitao Huang wrote:
> On Thu, 17 Sep 2020 11:02:06 -0500, Jarkko Sakkinen
> <jarkko.sakkinen@linux.intel.com> wrote:
> > 
> > Right, I do get the OOM case but wouldn't in that case the reasonable
> > thing to do destroy the enclave that is not even running? I mean that
> > means that we are globally out of EPC.
> > 
> 
> I would say it could be a policy, but not the only one. If it does not make
> much difference to kernel, IMHO we should  not set it in stone now.
> Debugging is also huge benefit to me.

Agreed, an EPC cgroup is the proper way to define/enforce what happens when
there is EPC pressure.  E.g. if process A is consuming 99% of the EPC, then
it doesn't make sense to unconditionally kill enclaves from process B.  If
the admin wants to give process A priority, so be it, but such a decision
shouldn't be baked into the kernel.

This series obviously doesn't provide an EPC cgroup, but that doesn't mean
we can't make decisions that will play nice with a cgroup in the future.
Jarkko Sakkinen Sept. 18, 2020, 12:20 p.m. UTC | #5
On Thu, Sep 17, 2020 at 07:09:40PM -0700, Sean Christopherson wrote:
> On Thu, Sep 17, 2020 at 01:35:10PM -0500, Haitao Huang wrote:
> > On Thu, 17 Sep 2020 11:02:06 -0500, Jarkko Sakkinen
> > <jarkko.sakkinen@linux.intel.com> wrote:
> > > 
> > > Right, I do get the OOM case but wouldn't in that case the reasonable
> > > thing to do destroy the enclave that is not even running? I mean that
> > > means that we are globally out of EPC.
> > > 
> > 
> > I would say it could be a policy, but not the only one. If it does not make
> > much difference to kernel, IMHO we should  not set it in stone now.
> > Debugging is also huge benefit to me.
> 
> Agreed, an EPC cgroup is the proper way to define/enforce what happens when
> there is EPC pressure.  E.g. if process A is consuming 99% of the EPC, then
> it doesn't make sense to unconditionally kill enclaves from process B.  If
> the admin wants to give process A priority, so be it, but such a decision
> shouldn't be baked into the kernel.
> 
> This series obviously doesn't provide an EPC cgroup, but that doesn't mean
> we can't make decisions that will play nice with a cgroup in the future.

Here's the core issue why the API "as is used to be" does not work:

	if (ret == -EIO) {
		mutex_lock(&encl->lock);
		sgx_encl_destroy(encl);
		mutex_unlock(&encl->lock);
	}

It would be better to instead whitelist *when* the enclave is preserved.

	if (ret != -ENOMEM) {
		mutex_lock(&encl->lock);
		sgx_encl_destroy(encl);
		mutex_unlock(&encl->lock);
	}

That is the information we *deterministically* want to know. Otherwise,
we will live in ultimate chaos.

Only this way can caller know when there are means to continue, and when
to quit. I.e. the code is whitelisting wrong way around currently.

/Jarkko
Jarkko Sakkinen Sept. 18, 2020, 12:39 p.m. UTC | #6
On Fri, Sep 18, 2020 at 03:20:39PM +0300, Jarkko Sakkinen wrote:
> On Thu, Sep 17, 2020 at 07:09:40PM -0700, Sean Christopherson wrote:
> > On Thu, Sep 17, 2020 at 01:35:10PM -0500, Haitao Huang wrote:
> > > On Thu, 17 Sep 2020 11:02:06 -0500, Jarkko Sakkinen
> > > <jarkko.sakkinen@linux.intel.com> wrote:
> > > > 
> > > > Right, I do get the OOM case but wouldn't in that case the reasonable
> > > > thing to do destroy the enclave that is not even running? I mean that
> > > > means that we are globally out of EPC.
> > > > 
> > > 
> > > I would say it could be a policy, but not the only one. If it does not make
> > > much difference to kernel, IMHO we should  not set it in stone now.
> > > Debugging is also huge benefit to me.
> > 
> > Agreed, an EPC cgroup is the proper way to define/enforce what happens when
> > there is EPC pressure.  E.g. if process A is consuming 99% of the EPC, then
> > it doesn't make sense to unconditionally kill enclaves from process B.  If
> > the admin wants to give process A priority, so be it, but such a decision
> > shouldn't be baked into the kernel.
> > 
> > This series obviously doesn't provide an EPC cgroup, but that doesn't mean
> > we can't make decisions that will play nice with a cgroup in the future.
> 
> Here's the core issue why the API "as is used to be" does not work:
> 
> 	if (ret == -EIO) {
> 		mutex_lock(&encl->lock);
> 		sgx_encl_destroy(encl);
> 		mutex_unlock(&encl->lock);
> 	}
> 
> It would be better to instead whitelist *when* the enclave is preserved.
> 
> 	if (ret != -ENOMEM) {
> 		mutex_lock(&encl->lock);
> 		sgx_encl_destroy(encl);
> 		mutex_unlock(&encl->lock);
> 	}
> 
> That is the information we *deterministically* want to know. Otherwise,
> we will live in ultimate chaos.
> 
> Only this way can caller know when there are means to continue, and when
> to quit. I.e. the code is whitelisting wrong way around currently.

Actually since the state cannot corrupt unless EADD or EEXTEND fails it
is fine to have the enclave alive on any other error condition. I think
what I do is that I move the check out sgx_encl_add_page() to the main
ioctl and update the kdoc. It is not too clear on persistence. I'll fix
this.

/Jarkko
Jarkko Sakkinen Sept. 21, 2020, 11:41 a.m. UTC | #7
On Fri, Sep 18, 2020 at 05:09:19PM -0700, Sean Christopherson wrote:
> On Fri, Sep 18, 2020 at 03:39:32PM +0300, Jarkko Sakkinen wrote:
> > On Fri, Sep 18, 2020 at 03:20:39PM +0300, Jarkko Sakkinen wrote:
> > > On Thu, Sep 17, 2020 at 07:09:40PM -0700, Sean Christopherson wrote:
> > > > On Thu, Sep 17, 2020 at 01:35:10PM -0500, Haitao Huang wrote:
> > > > > On Thu, 17 Sep 2020 11:02:06 -0500, Jarkko Sakkinen
> > > > > <jarkko.sakkinen@linux.intel.com> wrote:
> > > > > > 
> > > > > > Right, I do get the OOM case but wouldn't in that case the reasonable
> > > > > > thing to do destroy the enclave that is not even running? I mean that
> > > > > > means that we are globally out of EPC.
> > > > > > 
> > > > > 
> > > > > I would say it could be a policy, but not the only one. If it does not make
> > > > > much difference to kernel, IMHO we should  not set it in stone now.
> > > > > Debugging is also huge benefit to me.
> > > > 
> > > > Agreed, an EPC cgroup is the proper way to define/enforce what happens when
> > > > there is EPC pressure.  E.g. if process A is consuming 99% of the EPC, then
> > > > it doesn't make sense to unconditionally kill enclaves from process B.  If
> > > > the admin wants to give process A priority, so be it, but such a decision
> > > > shouldn't be baked into the kernel.
> > > > 
> > > > This series obviously doesn't provide an EPC cgroup, but that doesn't mean
> > > > we can't make decisions that will play nice with a cgroup in the future.
> > > 
> > > Here's the core issue why the API "as is used to be" does not work:
> > > 
> > > 	if (ret == -EIO) {
> > > 		mutex_lock(&encl->lock);
> > > 		sgx_encl_destroy(encl);
> > > 		mutex_unlock(&encl->lock);
> > > 	}
> > > 
> > > It would be better to instead whitelist *when* the enclave is preserved.
> > > 
> > > 	if (ret != -ENOMEM) {
> > > 		mutex_lock(&encl->lock);
> > > 		sgx_encl_destroy(encl);
> > > 		mutex_unlock(&encl->lock);
> > > 	}
> > > 
> > > That is the information we *deterministically* want to know. Otherwise,
> > > we will live in ultimate chaos.
> > > 
> > > Only this way can caller know when there are means to continue, and when
> > > to quit. I.e. the code is whitelisting wrong way around currently.
> > 
> > Actually since the state cannot corrupt unless EADD or EEXTEND fails it
> > is fine to have the enclave alive on any other error condition. I think
> 
> EADD and EEXTEND failure don't corrupt state.  Killing the enclave if EEXTEND
> fails makes sense because failure at that point is either due to a kernel bug
> or loss of EPC, both of which are fatal to the enclave.

This is also true. I meant by corrupt state e.g. a kernel bug, which
causes uninitalizes pages go the free queue.

I'd rephrase this in kdoc as: "The function deinitializes enclave and
returns -EIO when EPC is lost, while entering to a new power cycle".

Documentation describes only legit behaviour, let's ignore the corrupt
part.

> EADD is a little different, e.g. it could fault due to a bad source address,
> in which case the failure is not technically fatal.  But, Jarkko wanted to
> have consistent behavior for EADD and EEXTEND failures, and practically
> speaking the enclave is probably hosed anyways if EADD fails, i.e. killing the
> enclave on EADD failure isn't a sticking point (for me).

We need to figure out own return value for EADD, but I agree with this.

I would go with -EFAULT as we do when source VMA is no available. Does
this make sense to you?

/Jarkko
Sean Christopherson Sept. 21, 2020, 4:46 p.m. UTC | #8
On Mon, Sep 21, 2020 at 02:41:04PM +0300, Jarkko Sakkinen wrote:
> On Fri, Sep 18, 2020 at 05:09:19PM -0700, Sean Christopherson wrote:
> > On Fri, Sep 18, 2020 at 03:39:32PM +0300, Jarkko Sakkinen wrote:
> > > On Fri, Sep 18, 2020 at 03:20:39PM +0300, Jarkko Sakkinen wrote:
> > > > On Thu, Sep 17, 2020 at 07:09:40PM -0700, Sean Christopherson wrote:
> > > > > On Thu, Sep 17, 2020 at 01:35:10PM -0500, Haitao Huang wrote:
> > > > > > On Thu, 17 Sep 2020 11:02:06 -0500, Jarkko Sakkinen
> > > > > > <jarkko.sakkinen@linux.intel.com> wrote:
> > > > > > > 
> > > > > > > Right, I do get the OOM case but wouldn't in that case the reasonable
> > > > > > > thing to do destroy the enclave that is not even running? I mean that
> > > > > > > means that we are globally out of EPC.
> > > > > > > 
> > > > > > 
> > > > > > I would say it could be a policy, but not the only one. If it does not make
> > > > > > much difference to kernel, IMHO we should  not set it in stone now.
> > > > > > Debugging is also huge benefit to me.
> > > > > 
> > > > > Agreed, an EPC cgroup is the proper way to define/enforce what happens when
> > > > > there is EPC pressure.  E.g. if process A is consuming 99% of the EPC, then
> > > > > it doesn't make sense to unconditionally kill enclaves from process B.  If
> > > > > the admin wants to give process A priority, so be it, but such a decision
> > > > > shouldn't be baked into the kernel.
> > > > > 
> > > > > This series obviously doesn't provide an EPC cgroup, but that doesn't mean
> > > > > we can't make decisions that will play nice with a cgroup in the future.
> > > > 
> > > > Here's the core issue why the API "as is used to be" does not work:
> > > > 
> > > > 	if (ret == -EIO) {
> > > > 		mutex_lock(&encl->lock);
> > > > 		sgx_encl_destroy(encl);
> > > > 		mutex_unlock(&encl->lock);
> > > > 	}
> > > > 
> > > > It would be better to instead whitelist *when* the enclave is preserved.
> > > > 
> > > > 	if (ret != -ENOMEM) {
> > > > 		mutex_lock(&encl->lock);
> > > > 		sgx_encl_destroy(encl);
> > > > 		mutex_unlock(&encl->lock);
> > > > 	}
> > > > 
> > > > That is the information we *deterministically* want to know. Otherwise,
> > > > we will live in ultimate chaos.
> > > > 
> > > > Only this way can caller know when there are means to continue, and when
> > > > to quit. I.e. the code is whitelisting wrong way around currently.
> > > 
> > > Actually since the state cannot corrupt unless EADD or EEXTEND fails it
> > > is fine to have the enclave alive on any other error condition. I think
> > 
> > EADD and EEXTEND failure don't corrupt state.  Killing the enclave if EEXTEND
> > fails makes sense because failure at that point is either due to a kernel bug
> > or loss of EPC, both of which are fatal to the enclave.
> 
> This is also true. I meant by corrupt state e.g. a kernel bug, which
> causes uninitalizes pages go the free queue.
> 
> I'd rephrase this in kdoc as: "The function deinitializes enclave and
> returns -EIO when EPC is lost, while entering to a new power cycle".

The kdocs shouldn't speculate on why EEXTEND might fail.  E.g. in some (and
possibility most) environments, the most common scenario of EEXTEND failure
will be EPC invalidation due to virtual machine migration.

This is why I'd prefer that the kernel kill the enclave if and only if the
error is guaranteed to be fatal, e.g. the docs can have a blanket statement
along the lines of:

  An enclave will be killed and its EPC resources will be freed if an error that
  is guaranteed to be fatal is encountered at any time, e.g. if EEXTEND fails as
  EEXTEND can only fail due to loss of EPC, a kernel bug, or silicon bug, all of
  which are unrecoverable.
 
> Documentation describes only legit behaviour, let's ignore the corrupt
> part.
> 
> > EADD is a little different, e.g. it could fault due to a bad source address,
> > in which case the failure is not technically fatal.  But, Jarkko wanted to
> > have consistent behavior for EADD and EEXTEND failures, and practically
> > speaking the enclave is probably hosed anyways if EADD fails, i.e. killing the
> > enclave on EADD failure isn't a sticking point (for me).
> 
> We need to figure out own return value for EADD, but I agree with this.
> 
> I would go with -EFAULT as we do when source VMA is no available. Does
> this make sense to you?

If only EEXTEND will be treated as fatal, then I see no need to worry about
the return code for EADD.  In that case, simply kill the enclave on EEXTEND
failure instead of on a specific return code.
Jarkko Sakkinen Sept. 21, 2020, 6:49 p.m. UTC | #9
On Mon, Sep 21, 2020 at 09:46:48AM -0700, Sean Christopherson wrote:
> > This is also true. I meant by corrupt state e.g. a kernel bug, which
> > causes uninitalizes pages go the free queue.
> > 
> > I'd rephrase this in kdoc as: "The function deinitializes enclave and
> > returns -EIO when EPC is lost, while entering to a new power cycle".
> 
> The kdocs shouldn't speculate on why EEXTEND might fail.  E.g. in some (and
> possibility most) environments, the most common scenario of EEXTEND failure
> will be EPC invalidation due to virtual machine migration.
> 
> This is why I'd prefer that the kernel kill the enclave if and only if the
> error is guaranteed to be fatal, e.g. the docs can have a blanket statement
> along the lines of:
> 
>   An enclave will be killed and its EPC resources will be freed if an error that
>   is guaranteed to be fatal is encountered at any time, e.g. if EEXTEND fails as
>   EEXTEND can only fail due to loss of EPC, a kernel bug, or silicon bug, all of
>   which are unrecoverable.

Kernel bug is not a legit condition. Neither is a silicon failure. We do
not document speculated kernel bugs. If we used that kind of pattern for
documentation, we would have to put similar statements about every
single line of code.

Describing legit failure conditions with the best knowledge available is
the whole point why people read documentation in the first place.
Otherwise, the documentation has absolutely no value.

Documentation is also always, without exception, inaccurate. Lacking
something is not an issue, if it is not done on purpose.

I'd refine what I did as

"The function deinitializes enclave and returns -EIO when EPC was lost,
 while entering to a new power cycle, or any other condition where EPC
 gets invalidated."

It is not perfect, nothing ever is, but it is heck a lot more useful
than being too generic to fail.

> > > EADD is a little different, e.g. it could fault due to a bad source address,
> > > in which case the failure is not technically fatal.  But, Jarkko wanted to
> > > have consistent behavior for EADD and EEXTEND failures, and practically
> > > speaking the enclave is probably hosed anyways if EADD fails, i.e. killing the
> > > enclave on EADD failure isn't a sticking point (for me).
> > 
> > We need to figure out own return value for EADD, but I agree with this.
> > 
> > I would go with -EFAULT as we do when source VMA is no available. Does
> > this make sense to you?
> 
> If only EEXTEND will be treated as fatal, then I see no need to worry about
> the return code for EADD.  In that case, simply kill the enclave on EEXTEND
> failure instead of on a specific return code.

To have understandable semantics you have to map error codes to
conditions rather than opcodes. -EIO means loss of enclave in the event
of EPC gone invalid. Enclave is already lost, that is the reason why we
deinitialize the kernel data structures.

EADD must have a different error code because nothing is actually lost
but the failure conditions are triggered outside. -EFAULT would be
probably the most reasonable choice for that.

/Jarkko
Jarkko Sakkinen Sept. 21, 2020, 7:44 p.m. UTC | #10
On Mon, Sep 21, 2020 at 09:49:48PM +0300, Jarkko Sakkinen wrote:
> To have understandable semantics you have to map error codes to
> conditions rather than opcodes. -EIO means loss of enclave in the event
> of EPC gone invalid. Enclave is already lost, that is the reason why we
> deinitialize the kernel data structures.
> 
> EADD must have a different error code because nothing is actually lost
> but the failure conditions are triggered outside. -EFAULT would be
> probably the most reasonable choice for that.

Now that I did all the changes discussed and then I remember why EADD
and EEXTEND had a common error code, and common behaviour. Obviously EADD
can also fail because of EPC reset because it depends on a valid SECS
page.

If we cannot distinct from EADD caused by EPC loss and EADD caused by
problems with the source, it should have the same error code, and also
the enclave should be deinitialized, whenver this happens.

So I would just revert to v38 behaviour, keeping of course the whole
check more visible in sgx_ioc_enclave_add_pages(), and just refine
the documentation better describe the whole situation.

/Jarkko
Sean Christopherson Sept. 21, 2020, 7:57 p.m. UTC | #11
On Mon, Sep 21, 2020 at 10:44:19PM +0300, Jarkko Sakkinen wrote:
> On Mon, Sep 21, 2020 at 09:49:48PM +0300, Jarkko Sakkinen wrote:
> > To have understandable semantics you have to map error codes to
> > conditions rather than opcodes. -EIO means loss of enclave in the event
> > of EPC gone invalid. Enclave is already lost, that is the reason why we
> > deinitialize the kernel data structures.
> > 
> > EADD must have a different error code because nothing is actually lost
> > but the failure conditions are triggered outside. -EFAULT would be
> > probably the most reasonable choice for that.
> 
> Now that I did all the changes discussed and then I remember why EADD
> and EEXTEND had a common error code, and common behaviour. Obviously EADD
> can also fail because of EPC reset because it depends on a valid SECS
> page.
> 
> If we cannot distinct from EADD caused by EPC loss and EADD caused by
> problems with the source, it should have the same error code, and also
> the enclave should be deinitialized, whenver this happens.

Hmm, on SGX2 hardware the kernel can precisely and accurately identify loss
of EPC, or at least "problem with the EPCM", as such a condition will be a
page fault with PFEC.SGX=1.

But getting that info back to the ENCLS invocation would require adding a
new exception fixup handler in order to "return" the error code.  Given that
this is the only case where that level of precision makes a difference, I
think it's ok to just kill the enclave on any EADD failure.  Practically
speaking I highly doubt the overzealous killing will impact userspace, I
would imagine any SGX runtime would treat -EFAULT as fatal anyways.

Side topic, this does invalidate my argument for not killing the enclave on
EADD failure.  If EADD fails due to loss of EPC, it's theoretically possible
userspace could get stuck in an infinite loop if it does a naive retry on
-EIO or whatever.

> So I would just revert to v38 behaviour, keeping of course the whole
> check more visible in sgx_ioc_enclave_add_pages(), and just refine
> the documentation better describe the whole situation.
> 
> /Jarkko
Jarkko Sakkinen Sept. 21, 2020, 7:58 p.m. UTC | #12
On Mon, Sep 21, 2020 at 10:44:19PM +0300, Jarkko Sakkinen wrote:
> On Mon, Sep 21, 2020 at 09:49:48PM +0300, Jarkko Sakkinen wrote:
> > To have understandable semantics you have to map error codes to
> > conditions rather than opcodes. -EIO means loss of enclave in the event
> > of EPC gone invalid. Enclave is already lost, that is the reason why we
> > deinitialize the kernel data structures.
> > 
> > EADD must have a different error code because nothing is actually lost
> > but the failure conditions are triggered outside. -EFAULT would be
> > probably the most reasonable choice for that.
> 
> Now that I did all the changes discussed and then I remember why EADD
> and EEXTEND had a common error code, and common behaviour. Obviously EADD
> can also fail because of EPC reset because it depends on a valid SECS
> page.
> 
> If we cannot distinct from EADD caused by EPC loss and EADD caused by
> problems with the source, it should have the same error code, and also
> the enclave should be deinitialized, whenver this happens.
> 
> So I would just revert to v38 behaviour, keeping of course the whole
> check more visible in sgx_ioc_enclave_add_pages(), and just refine
> the documentation better describe the whole situation.

So now the behaviour is reverted back to same as it was before [*] and
I refined the documenation as:

"
 * The function deinitializes kernel data structures for enclave and returns
 * -EIO in any of the following conditions:
 *
 * - Enclave Page Cache (EPC), the physical memory holding enclaves, has
 *   been invalidated. This will cause EADD and EEXTEND to fail.
 * - If the source address is corrupted somehow when executing EADD.
"

[*] https://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-sgx.git/tree/arch/x86/kernel/cpu/sgx/ioctl.c

/Jarkko
Jarkko Sakkinen Sept. 21, 2020, 9:24 p.m. UTC | #13
On Mon, Sep 21, 2020 at 12:57:39PM -0700, Sean Christopherson wrote:
> On Mon, Sep 21, 2020 at 10:44:19PM +0300, Jarkko Sakkinen wrote:
> > On Mon, Sep 21, 2020 at 09:49:48PM +0300, Jarkko Sakkinen wrote:
> > > To have understandable semantics you have to map error codes to
> > > conditions rather than opcodes. -EIO means loss of enclave in the event
> > > of EPC gone invalid. Enclave is already lost, that is the reason why we
> > > deinitialize the kernel data structures.
> > > 
> > > EADD must have a different error code because nothing is actually lost
> > > but the failure conditions are triggered outside. -EFAULT would be
> > > probably the most reasonable choice for that.
> > 
> > Now that I did all the changes discussed and then I remember why EADD
> > and EEXTEND had a common error code, and common behaviour. Obviously EADD
> > can also fail because of EPC reset because it depends on a valid SECS
> > page.
> > 
> > If we cannot distinct from EADD caused by EPC loss and EADD caused by
> > problems with the source, it should have the same error code, and also
> > the enclave should be deinitialized, whenver this happens.
> 
> Hmm, on SGX2 hardware the kernel can precisely and accurately identify loss
> of EPC, or at least "problem with the EPCM", as such a condition will be a
> page fault with PFEC.SGX=1.

True.

> But getting that info back to the ENCLS invocation would require adding a
> new exception fixup handler in order to "return" the error code.  Given that
> this is the only case where that level of precision makes a difference, I
> think it's ok to just kill the enclave on any EADD failure.  Practically
> speaking I highly doubt the overzealous killing will impact userspace, I
> would imagine any SGX runtime would treat -EFAULT as fatal anyways.

This is true. We could do this if wanted. Most of the time bad source
address would require either badly behaving run-time or doing it on
purpose. For the former case, since it is badly behaving by definition,
this granularity would not improve situation. For the latter case, we do
not want to do any active support.

I guess I'll still have to correct the documentation just a bit.

> Side topic, this does invalidate my argument for not killing the enclave on
> EADD failure.  If EADD fails due to loss of EPC, it's theoretically possible
> userspace could get stuck in an infinite loop if it does a naive retry on
> -EIO or whatever.

I don't think we care about that unless it renders out any legit
correctly working feature for a run-time.

/Jarkko
diff mbox series

Patch

diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index c75b375f3770..c42a2ad3ca0b 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -8,10 +8,21 @@ 
 #include <linux/types.h>
 #include <linux/ioctl.h>
 
+/**
+ * enum sgx_epage_flags - page control flags
+ * %SGX_PAGE_MEASURE:	Measure the page contents with a sequence of
+ *			ENCLS[EEXTEND] operations.
+ */
+enum sgx_page_flags {
+	SGX_PAGE_MEASURE	= 0x01,
+};
+
 #define SGX_MAGIC 0xA4
 
 #define SGX_IOC_ENCLAVE_CREATE \
 	_IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create)
+#define SGX_IOC_ENCLAVE_ADD_PAGES \
+	_IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages)
 
 /**
  * struct sgx_enclave_create - parameter structure for the
@@ -22,4 +33,21 @@  struct sgx_enclave_create  {
 	__u64	src;
 };
 
+/**
+ * struct sgx_enclave_add_pages - parameter structure for the
+ *                                %SGX_IOC_ENCLAVE_ADD_PAGE ioctl
+ * @src:	start address for the page data
+ * @offset:	starting page offset
+ * @length:	length of the data (multiple of the page size)
+ * @secinfo:	address for the SECINFO data
+ * @flags:	page control flags
+ */
+struct sgx_enclave_add_pages {
+	__u64	src;
+	__u64	offset;
+	__u64	length;
+	__u64	secinfo;
+	__u64	flags;
+};
+
 #endif /* _UAPI_ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 352a3c461812..202680a06c17 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -191,6 +191,297 @@  static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
 	return ret;
 }
 
+static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+						 unsigned long offset,
+						 u64 secinfo_flags)
+{
+	struct sgx_encl_page *encl_page;
+	unsigned long prot;
+
+	encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+	if (!encl_page)
+		return ERR_PTR(-ENOMEM);
+
+	encl_page->desc = encl->base + offset;
+	encl_page->encl = encl;
+
+	prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ)  |
+	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+	       _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+	/*
+	 * TCS pages must always RW set for CPU access while the SECINFO
+	 * permissions are *always* zero - the CPU ignores the user provided
+	 * values and silently overwrites them with zero permissions.
+	 */
+	if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+		prot |= PROT_READ | PROT_WRITE;
+
+	/* Calculate maximum of the VM flags for the page. */
+	encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+	return encl_page;
+}
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+	u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+	u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+
+	if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
+		return -EINVAL;
+
+	if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
+		return -EINVAL;
+
+	/*
+	 * CPU will silently overwrite the permissions as zero, which means
+	 * that we need to validate it ourselves.
+	 */
+	if (pt == SGX_SECINFO_TCS && perm)
+		return -EINVAL;
+
+	if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
+		return -EINVAL;
+
+	if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+			       struct sgx_encl_page *encl_page,
+			       struct sgx_epc_page *epc_page,
+			       struct sgx_secinfo *secinfo, unsigned long src)
+{
+	struct sgx_pageinfo pginfo;
+	struct vm_area_struct *vma;
+	struct page *src_page;
+	int ret;
+
+	/* Query vma's VM_MAYEXEC as an indirect path_noexec() check. */
+	if (encl_page->vm_max_prot_bits & VM_EXEC) {
+		vma = find_vma(current->mm, src);
+		if (!vma)
+			return -EFAULT;
+
+		if (!(vma->vm_flags & VM_MAYEXEC))
+			return -EACCES;
+	}
+
+	ret = get_user_pages(src, 1, 0, &src_page, NULL);
+	if (ret < 1)
+		return ret;
+
+	pginfo.secs = (unsigned long)sgx_get_epc_addr(encl->secs.epc_page);
+	pginfo.addr = SGX_ENCL_PAGE_ADDR(encl_page);
+	pginfo.metadata = (unsigned long)secinfo;
+	pginfo.contents = (unsigned long)kmap_atomic(src_page);
+
+	ret = __eadd(&pginfo, sgx_get_epc_addr(epc_page));
+
+	kunmap_atomic((void *)pginfo.contents);
+	put_page(src_page);
+
+	return ret ? -EIO : 0;
+}
+
+/*
+ * If the caller requires measurement of the page as a proof for the content,
+ * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
+ * operation until the entire page is measured."
+ */
+static int __sgx_encl_extend(struct sgx_encl *encl,
+			     struct sgx_epc_page *epc_page)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		ret = __eextend(sgx_get_epc_addr(encl->secs.epc_page),
+				sgx_get_epc_addr(epc_page) + (i * 0x100));
+		if (ret) {
+			if (encls_failed(ret))
+				ENCLS_WARN(ret, "EEXTEND");
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
+			     unsigned long offset, struct sgx_secinfo *secinfo,
+			     unsigned long flags)
+{
+	struct sgx_encl_page *encl_page;
+	struct sgx_epc_page *epc_page;
+	int ret;
+
+	encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
+	if (IS_ERR(encl_page))
+		return PTR_ERR(encl_page);
+
+	epc_page = __sgx_alloc_epc_page();
+	if (IS_ERR(epc_page)) {
+		kfree(encl_page);
+		return PTR_ERR(epc_page);
+	}
+
+	mmap_read_lock(current->mm);
+	mutex_lock(&encl->lock);
+
+	/*
+	 * Insert prior to EADD in case of OOM.  EADD modifies MRENCLAVE, i.e.
+	 * can't be gracefully unwound, while failure on EADD/EXTEND is limited
+	 * to userspace errors (or kernel/hardware bugs).
+	 */
+	ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+			encl_page, GFP_KERNEL);
+	if (ret)
+		goto err_out_unlock;
+
+	ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
+				  src);
+	if (ret)
+		goto err_out;
+
+	/*
+	 * Complete the "add" before doing the "extend" so that the "add"
+	 * isn't in a half-baked state in the extremely unlikely scenario
+	 * the enclave will be destroyed in response to EEXTEND failure.
+	 */
+	encl_page->encl = encl;
+	encl_page->epc_page = epc_page;
+	encl->secs_child_cnt++;
+
+	if (flags & SGX_PAGE_MEASURE) {
+		ret = __sgx_encl_extend(encl, epc_page);
+		if (ret)
+			goto err_out;
+	}
+
+	mutex_unlock(&encl->lock);
+	mmap_read_unlock(current->mm);
+	return ret;
+
+err_out:
+	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_unlock:
+	mutex_unlock(&encl->lock);
+	mmap_read_unlock(current->mm);
+
+	sgx_free_epc_page(epc_page);
+	kfree(encl_page);
+
+	/*
+	 * Destroy enclave on ENCLS failure as this means that EPC has been
+	 * invalidated.
+	 */
+	if (ret == -EIO) {
+		mutex_lock(&encl->lock);
+		sgx_encl_destroy(encl);
+		mutex_unlock(&encl->lock);
+	}
+
+	return ret;
+}
+
+/**
+ * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
+ * @encl:       an enclave pointer
+ * @arg:	a user pointer to a struct sgx_enclave_add_pages instance
+ *
+ * Add one or more pages to an uninitialized enclave, and optionally extend the
+ * measurement with the contents of the page. The SECINFO and measurement mask
+ * are applied to all pages.
+ *
+ * A SECINFO for a TCS is required to always contain zero permissions because
+ * CPU silently zeros them. Allowing anything else would cause a mismatch in
+ * the measurement.
+ *
+ * mmap()'s protection bits are capped by the page permissions. For each page
+ * address, the maximum protection bits are computed with the following
+ * heuristics:
+ *
+ * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
+ * 2. A TCS page: PROT_R | PROT_W.
+ *
+ * mmap() is not allowed to surpass the minimum of the maximum protection bits
+ * within the given address range.
+ *
+ * If ENCLS opcode fails, that effectively means that EPC has been invalidated.
+ * When this happens the enclave is destroyed and -EIO is returned to the
+ * caller.
+ *
+ * Return:
+ *   length of the data processed on success,
+ *   -EACCES if an executable source page is located in a noexec partition,
+ *   -EIO if either ENCLS[EADD] or ENCLS[EEXTEND] fails
+ *   -errno otherwise
+ */
+static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
+{
+	struct sgx_enclave_add_pages addp;
+	struct sgx_secinfo secinfo;
+	unsigned long c;
+	int ret;
+
+	if ((atomic_read(&encl->flags) & SGX_ENCL_INITIALIZED) ||
+	    !(atomic_read(&encl->flags) & SGX_ENCL_CREATED))
+		return -EINVAL;
+
+	if (copy_from_user(&addp, arg, sizeof(addp)))
+		return -EFAULT;
+
+	if (!IS_ALIGNED(addp.offset, PAGE_SIZE) ||
+	    !IS_ALIGNED(addp.src, PAGE_SIZE))
+		return -EINVAL;
+
+	if (!(access_ok(addp.src, PAGE_SIZE)))
+		return -EFAULT;
+
+	if (addp.length & (PAGE_SIZE - 1))
+		return -EINVAL;
+
+	if (addp.offset + addp.length - PAGE_SIZE >= encl->size)
+		return -EINVAL;
+
+	if (copy_from_user(&secinfo, (void __user *)addp.secinfo,
+			   sizeof(secinfo)))
+		return -EFAULT;
+
+	if (sgx_validate_secinfo(&secinfo))
+		return -EINVAL;
+
+	for (c = 0 ; c < addp.length; c += PAGE_SIZE) {
+		if (signal_pending(current)) {
+			if (!c)
+				ret = -EINTR;
+
+			break;
+		}
+
+		if (c == SGX_MAX_ADD_PAGES_LENGTH)
+			break;
+
+		if (need_resched())
+			cond_resched();
+
+		ret = sgx_encl_add_page(encl, addp.src + c, addp.offset + c,
+					&secinfo, addp.flags);
+		if (ret)
+			break;
+	}
+
+	if (ret)
+		return ret;
+
+	return c;
+}
+
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 {
 	struct sgx_encl *encl = filep->private_data;
@@ -209,6 +500,9 @@  long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	case SGX_IOC_ENCLAVE_CREATE:
 		ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
 		break;
+	case SGX_IOC_ENCLAVE_ADD_PAGES:
+		ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
+		break;
 	default:
 		ret = -ENOIOCTLCMD;
 		break;
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index fce756c3434b..8d126070db1e 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -34,6 +34,7 @@  struct sgx_epc_section {
 
 #define SGX_EPC_SECTION_MASK		GENMASK(7, 0)
 #define SGX_MAX_EPC_SECTIONS		(SGX_EPC_SECTION_MASK + 1)
+#define SGX_MAX_ADD_PAGES_LENGTH	0x100000
 
 extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];