diff mbox series

[RFC,v4,2/4] x86/sgx: Implement support for MADV_WILLNEED

Message ID 20230128045529.15749-3-haitao.huang@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series x86/sgx: implement support for MADV_WILLNEED | expand

Commit Message

Haitao Huang Jan. 28, 2023, 4:55 a.m. UTC
Support madvise(..., MADV_WILLNEED) by adding EPC pages with EAUG in
the newly added fops->fadvise() callback implementation, sgx_fadvise().

Change the return type and values of the sgx_encl_eaug_page function
so that more specific error codes are returned for different treatment
by the page fault handler and the fadvise callback.
On any error, sgx_fadvise() will discontinue further operations
and return as normal. The page fault handler allows a PF retried
by returning VM_FAULT_NOPAGE in handling -EBUSY returned from
sgx_encl_eaug_page.

Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
---
 arch/x86/kernel/cpu/sgx/driver.c | 74 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/encl.c   | 59 ++++++++++++++-----------
 arch/x86/kernel/cpu/sgx/encl.h   |  4 +-
 3 files changed, 111 insertions(+), 26 deletions(-)

Comments

Jarkko Sakkinen Feb. 7, 2023, 11:28 p.m. UTC | #1
On Fri, Jan 27, 2023 at 08:55:27PM -0800, Haitao Huang wrote:
> Support madvise(..., MADV_WILLNEED) by adding EPC pages with EAUG in
> the newly added fops->fadvise() callback implementation, sgx_fadvise().
> 
> Change the return type and values of the sgx_encl_eaug_page function
> so that more specific error codes are returned for different treatment
> by the page fault handler and the fadvise callback.
> On any error, sgx_fadvise() will discontinue further operations
> and return as normal. The page fault handler allows a PF retried
> by returning VM_FAULT_NOPAGE in handling -EBUSY returned from
> sgx_encl_eaug_page.
> 
> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> ---
>  arch/x86/kernel/cpu/sgx/driver.c | 74 ++++++++++++++++++++++++++++++++
>  arch/x86/kernel/cpu/sgx/encl.c   | 59 ++++++++++++++-----------
>  arch/x86/kernel/cpu/sgx/encl.h   |  4 +-
>  3 files changed, 111 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> index aa9b8b868867..3a88daddc1a1 100644
> --- a/arch/x86/kernel/cpu/sgx/driver.c
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -2,6 +2,7 @@
>  /*  Copyright(c) 2016-20 Intel Corporation. */
>  
>  #include <linux/acpi.h>
> +#include <linux/fadvise.h>
>  #include <linux/miscdevice.h>
>  #include <linux/mman.h>
>  #include <linux/security.h>
> @@ -9,6 +10,7 @@
>  #include <asm/traps.h>
>  #include "driver.h"
>  #include "encl.h"
> +#include "encls.h"
>  
>  u64 sgx_attributes_reserved_mask;
>  u64 sgx_xfrm_reserved_mask = ~0x3;
> @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
>  	vma->vm_ops = &sgx_vm_ops;
>  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
>  	vma->vm_private_data = encl;
> +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
>  
>  	return 0;
>  }
>  
> +/*
> + * Add new pages to the enclave sequentially with ENCLS[EAUG] for the WILLNEED advice.
> + * Only do this to existing VMAs in the same enclave and reject the request otherwise.
> + */
> +static int sgx_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
> +{
> +	struct sgx_encl *encl = file->private_data;
> +	unsigned long start = offset + encl->base;
> +	struct vm_area_struct *vma = NULL;
> +	unsigned long end = start + len;
> +	unsigned long pos;
> +	int ret = -EINVAL;
> +
> +	if (!cpu_feature_enabled(X86_FEATURE_SGX2))
> +		return -EINVAL;
> +	/* Only support WILLNEED */
> +	if (advice != POSIX_FADV_WILLNEED)
> +		return -EINVAL;
> +
> +	if (offset + len < offset)
> +		return -EINVAL;
> +	if (start < encl->base)
> +		return -EINVAL;
> +	if (end < start)
> +		return -EINVAL;
> +	if (end > encl->base + encl->size)
> +		return -EINVAL;
> +
> +	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
> +		return -EINVAL;
> +
> +	mmap_read_lock(current->mm);
> +
> +	vma = find_vma(current->mm, start);
> +	if (!vma)
> +		goto unlock;
> +	if (vma->vm_private_data != encl)
> +		goto unlock;
> +
> +	pos = start;
> +	if (pos < vma->vm_start || end > vma->vm_end) {
> +		/* Don't allow any gaps */
> +		goto unlock;
> +	}
> +
> +	/* Here: vm_start <= pos < end <= vm_end */
> +	while (pos < end) {
> +		if (xa_load(&encl->page_array, PFN_DOWN(pos)))
> +			continue;
> +		if (signal_pending(current)) {
> +			if (pos == start)
> +				ret = -ERESTARTSYS;
> +			else
> +				ret = -EINTR;
> +			goto unlock;
> +		}
> +		ret = sgx_encl_eaug_page(vma, encl, pos);
> +		/* It's OK to not finish */
> +		if (ret)
> +			break;
> +		pos = pos + PAGE_SIZE;
> +		cond_resched();
> +	}
> +	ret = 0;
> +
> +unlock:
> +	mmap_read_unlock(current->mm);
> +	return ret;
> +}
> +
>  static unsigned long sgx_get_unmapped_area(struct file *file,
>  					   unsigned long addr,
>  					   unsigned long len,
> @@ -133,6 +206,7 @@ static const struct file_operations sgx_encl_fops = {
>  	.compat_ioctl		= sgx_compat_ioctl,
>  #endif
>  	.mmap			= sgx_mmap,
> +	.fadvise		= sgx_fadvise,
>  	.get_unmapped_area	= sgx_get_unmapped_area,
>  };
>  
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index 0185c5ab48dd..592cfea4c9e4 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -299,20 +299,17 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
>  }
>  
>  /**
> - * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
> - * @vma:	VMA obtained from fault info from where page is accessed
> - * @encl:	enclave accessing the page
> - * @addr:	address that triggered the page fault
> + * sgx_encl_eaug_page() - Dynamically add an EPC page to initialized enclave
> + * @vma:	the VMA into which the page is to be added
> + * @encl:	the enclave for which the page is to be added
> + * @addr:	the start address of the page to be added
>   *
> - * When an initialized enclave accesses a page with no backing EPC page
> - * on a SGX2 system then the EPC can be added dynamically via the SGX2
> - * ENCLS[EAUG] instruction.
> - *
> - * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
> - * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
> + * Returns: 0 on EAUG success and PTE was installed successfully, -EBUSY for
> + * waiting on reclaimer to free EPC, -ENOMEM for out of RAM, -EFAULT for
> + * all other failures.
>   */
> -vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
> -			      struct sgx_encl *encl, unsigned long addr)
> +int sgx_encl_eaug_page(struct vm_area_struct *vma,
> +		       struct sgx_encl *encl, unsigned long addr)
>  {
>  	vm_fault_t vmret = VM_FAULT_SIGBUS;
>  	struct sgx_pageinfo pginfo = {0};
> @@ -321,10 +318,10 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  	struct sgx_va_page *va_page;
>  	unsigned long phys_addr;
>  	u64 secinfo_flags;
> -	int ret;
> +	int ret = -EFAULT;
>  
>  	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
> -		return VM_FAULT_SIGBUS;
> +		return -EFAULT;
>  
>  	/*
>  	 * Ignore internal permission checking for dynamically added pages.
> @@ -335,21 +332,21 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  	secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
>  	encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
>  	if (IS_ERR(encl_page))
> -		return VM_FAULT_OOM;
> +		return -ENOMEM;
>  
>  	mutex_lock(&encl->lock);
>  
>  	epc_page = sgx_alloc_epc_page(encl_page, false);
>  	if (IS_ERR(epc_page)) {
>  		if (PTR_ERR(epc_page) == -EBUSY)
> -			vmret =  VM_FAULT_NOPAGE;
> +			ret =  -EBUSY;
>  		goto err_out_unlock;
>  	}
>  
>  	va_page = sgx_encl_grow(encl, false);
>  	if (IS_ERR(va_page)) {
>  		if (PTR_ERR(va_page) == -EBUSY)
> -			vmret = VM_FAULT_NOPAGE;
> +			ret = -EBUSY;
>  		goto err_out_epc;
>  	}
>  
> @@ -362,16 +359,20 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  	 * If ret == -EBUSY then page was created in another flow while
>  	 * running without encl->lock
>  	 */
> -	if (ret)
> +	if (ret) {
> +		ret = -EFAULT;
>  		goto err_out_shrink;
> +	}
>  
>  	pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
>  	pginfo.addr = encl_page->desc & PAGE_MASK;
>  	pginfo.metadata = 0;
>  
>  	ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
> -	if (ret)
> +	if (ret) {
> +		ret = -EFAULT;
>  		goto err_out;
> +	}
>  
>  	encl_page->encl = encl;
>  	encl_page->epc_page = epc_page;
> @@ -388,10 +389,10 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  	vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
>  	if (vmret != VM_FAULT_NOPAGE) {
>  		mutex_unlock(&encl->lock);
> -		return VM_FAULT_SIGBUS;
> +		return -EFAULT;
>  	}
>  	mutex_unlock(&encl->lock);
> -	return VM_FAULT_NOPAGE;
> +	return 0;
>  
>  err_out:
>  	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
> @@ -404,7 +405,7 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
>  	mutex_unlock(&encl->lock);
>  	kfree(encl_page);
>  
> -	return vmret;
> +	return ret;
>  }
>  
>  static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
> @@ -434,8 +435,18 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
>  	 * enclave that will be checked for right away.
>  	 */
>  	if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
> -	    (!xa_load(&encl->page_array, PFN_DOWN(addr))))
> -		return sgx_encl_eaug_page(vma, encl, addr);
> +	    (!xa_load(&encl->page_array, PFN_DOWN(addr)))) {
> +		switch (sgx_encl_eaug_page(vma, encl, addr)) {
> +		case 0:
> +		case -EBUSY:
> +			return VM_FAULT_NOPAGE;
> +		case -ENOMEM:
> +			return VM_FAULT_OOM;
> +		case -EFAULT:
> +		default:
> +			return VM_FAULT_SIGBUS;
> +		}
> +	}
>  
>  	mutex_lock(&encl->lock);
>  
> diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
> index 9f19b06c3ae3..e5a507871fa3 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.h
> +++ b/arch/x86/kernel/cpu/sgx/encl.h
> @@ -125,7 +125,7 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
>  					 unsigned long addr);
>  struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
>  void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
> -vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
> -			      struct sgx_encl *encl, unsigned long addr);
> +int sgx_encl_eaug_page(struct vm_area_struct *vma,
> +		       struct sgx_encl *encl, unsigned long addr);
>  
>  #endif /* _X86_ENCL_H */
> -- 
> 2.25.1
> 

Looks good to me!

Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>

BR, Jarkko
Huang, Kai Feb. 14, 2023, 9:47 a.m. UTC | #2
On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
> @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
>  	vma->vm_ops = &sgx_vm_ops;
>  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
>  	vma->vm_private_data = encl;
> +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
>  
>  	return 0;
>  }

Perhaps I am missing something, but above change looks weird.  

Conceptually, it doesn't/shouldn't belong to this series, which essentially
preallocates and does EAUG EPC pages for a (or part of) given enclave.  The EAUG
logic should already be working for the normal fault path, which means the code
change above either: 1) has been done at other place; 2) isn't needed.

I have kinda forgotten the userspace sequence to create an enclave.  If I recall
correctly, you do below to create an enclave:

	1) encl_fd = open("/dev/sgx_enclave");
	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
	3) IOCTL(ECREATE, encl_addr, encl_size);

Would the above code change break the "mmap()" in above step 2?
Haitao Huang Feb. 14, 2023, 7:18 p.m. UTC | #3
Hi Kai

On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> wrote:

> On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
>> @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct  
>> vm_area_struct *vma)
>>  	vma->vm_ops = &sgx_vm_ops;
>>  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
>>  	vma->vm_private_data = encl;
>> +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
>>   	return 0;
>>  }
>
> Perhaps I am missing something, but above change looks weird.  
> Conceptually, it doesn't/shouldn't belong to this series, which  
> essentially
> preallocates and does EAUG EPC pages for a (or part of) given enclave.   
> The EAUG
> logic should already be working for the normal fault path, which means  
> the code
> change above either: 1) has been done at other place; 2) isn't needed.
>
> I have kinda forgotten the userspace sequence to create an enclave.  If  
> I recall
> correctly, you do below to create an enclave:
>
> 	1) encl_fd = open("/dev/sgx_enclave");
> 	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
> 	3) IOCTL(ECREATE, encl_addr, encl_size);
>
> Would the above code change break the "mmap()" in above step 2?
> 	

No, vm_pgoff was not used previously for enclave VMAs. I had to add this  
because the offset passed to sgx_fadvise is relative to file base and  
calculated in mm/madvise.c like this:

         offset = (loff_t)(start - vma->vm_start)
                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

I had a comment in first version but removed it based on Jarkko's  
suggestion here: https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/

The original comments probably seemed redundant to the definitions of the  
vm_pgoff field and the fadvise interface. But let me know if we need add a  
more helpful version of comments or any suggestion on the comments.

Thanks
Haitao
Huang, Kai Feb. 14, 2023, 8:54 p.m. UTC | #4
On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote:
> Hi Kai
> 
> On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> 
> > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
> > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct  
> > > vm_area_struct *vma)
> > >  	vma->vm_ops = &sgx_vm_ops;
> > >  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> > >  	vma->vm_private_data = encl;
> > > +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
> > >   	return 0;
> > >  }
> > 
> > Perhaps I am missing something, but above change looks weird.  
> > Conceptually, it doesn't/shouldn't belong to this series, which  
> > essentially
> > preallocates and does EAUG EPC pages for a (or part of) given enclave.   
> > The EAUG
> > logic should already be working for the normal fault path, which means  
> > the code
> > change above either: 1) has been done at other place; 2) isn't needed.
> > 
> > I have kinda forgotten the userspace sequence to create an enclave.  If  
> > I recall
> > correctly, you do below to create an enclave:
> > 
> > 	1) encl_fd = open("/dev/sgx_enclave");
> > 	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
> > 	3) IOCTL(ECREATE, encl_addr, encl_size);
> > 
> > Would the above code change break the "mmap()" in above step 2?
> > 	
> 
> No, vm_pgoff was not used previously for enclave VMAs. I had to add this  
> because the offset passed to sgx_fadvise is relative to file base and  
> calculated in mm/madvise.c like this:
> 
>          offset = (loff_t)(start - vma->vm_start)
>                          + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

But shouldn't 'offset is relative to the file base' be conceptually correct from
the fadvice()'s point of view?

I think you should do:

	encl_offset = offset + encl->base;

inside sgx_fadvice()?

> 
> I had a comment in first version but removed it based on Jarkko's  
> suggestion here: https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/
> 
> The original comments probably seemed redundant to the definitions of the  
> vm_pgoff field and the fadvise interface. But let me know if we need add a  
> more helpful version of comments or any suggestion on the comments.

I still think this code change is wrong.

For instance, IIUC, it at least breaks the case where enclave hasn't been
created/initialized, where encl->base == 0 (although normal code path doesn't
use vm_pgoff, conceptually it's still wrong IIUC).

Maybe I am missing something?
Haitao Huang Feb. 14, 2023, 9:42 p.m. UTC | #5
On Tue, 14 Feb 2023 14:54:53 -0600, Huang, Kai <kai.huang@intel.com> wrote:

> On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote:
>> Hi Kai
>>
>> On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com>  
>> wrote:
>>
>> > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
>> > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct
>> > > vm_area_struct *vma)
>> > >  	vma->vm_ops = &sgx_vm_ops;
>> > >  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
>> > >  	vma->vm_private_data = encl;
>> > > +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
>> > >   	return 0;
>> > >  }
>> >
>> > Perhaps I am missing something, but above change looks weird.
>> > Conceptually, it doesn't/shouldn't belong to this series, which
>> > essentially
>> > preallocates and does EAUG EPC pages for a (or part of) given enclave.
>> > The EAUG
>> > logic should already be working for the normal fault path, which means
>> > the code
>> > change above either: 1) has been done at other place; 2) isn't needed.
>> >
>> > I have kinda forgotten the userspace sequence to create an enclave.   
>> If
>> > I recall
>> > correctly, you do below to create an enclave:
>> >
>> > 	1) encl_fd = open("/dev/sgx_enclave");
>> > 	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
>> > 	3) IOCTL(ECREATE, encl_addr, encl_size);
>> >
>> > Would the above code change break the "mmap()" in above step 2?
>> > 	
>>
>> No, vm_pgoff was not used previously for enclave VMAs. I had to add this
>> because the offset passed to sgx_fadvise is relative to file base and
>> calculated in mm/madvise.c like this:
>>
>>          offset = (loff_t)(start - vma->vm_start)
>>                          + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
>
> But shouldn't 'offset is relative to the file base' be conceptually  
> correct from
> the fadvice()'s point of view?
>
> I think you should do:
>
> 	encl_offset = offset + encl->base;
>
> inside sgx_fadvice()?
>
>>
If we don't set vma->vm_pgoff (default to zero), then offset will be  
calculated as (start - vma->vm_start). Then the above calculation is wrong  
if we have multiple VMAs for the same enclave, which is usually the case.

>> I had a comment in first version but removed it based on Jarkko's
>> suggestion here:  
>> https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/
>>
>> The original comments probably seemed redundant to the definitions of  
>> the
>> vm_pgoff field and the fadvise interface. But let me know if we need  
>> add a
>> more helpful version of comments or any suggestion on the comments.
>
> I still think this code change is wrong.
>
> For instance, IIUC, it at least breaks the case where enclave hasn't been
> created/initialized, where encl->base == 0 (although normal code path  
> doesn't
> use vm_pgoff, conceptually it's still wrong IIUC).
>
> Maybe I am missing something?

The fadvise interface is only usable for an initialized enclave,  
sgx_fadvise will return error otherwise. Conceptually I view enclave base  
as "file base", it's just that we don't ever need handle the zero case  
caused by uninitialized enclave (kind of like a file never mapped). If an  
initialized enclave happens to have zero base, it would also work.

Thanks
Haitao
Huang, Kai Feb. 14, 2023, 10:36 p.m. UTC | #6
On Tue, 2023-02-14 at 15:42 -0600, Haitao Huang wrote:
> On Tue, 14 Feb 2023 14:54:53 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> 
> > On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote:
> > > Hi Kai
> > > 
> > > On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com>  
> > > wrote:
> > > 
> > > > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
> > > > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct
> > > > > vm_area_struct *vma)
> > > > >  	vma->vm_ops = &sgx_vm_ops;
> > > > >  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> > > > >  	vma->vm_private_data = encl;
> > > > > +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
> > > > >   	return 0;
> > > > >  }
> > > > 
> > > > Perhaps I am missing something, but above change looks weird.
> > > > Conceptually, it doesn't/shouldn't belong to this series, which
> > > > essentially
> > > > preallocates and does EAUG EPC pages for a (or part of) given enclave.
> > > > The EAUG
> > > > logic should already be working for the normal fault path, which means
> > > > the code
> > > > change above either: 1) has been done at other place; 2) isn't needed.
> > > > 
> > > > I have kinda forgotten the userspace sequence to create an enclave.   
> > > If
> > > > I recall
> > > > correctly, you do below to create an enclave:
> > > > 
> > > > 	1) encl_fd = open("/dev/sgx_enclave");
> > > > 	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
> > > > 	3) IOCTL(ECREATE, encl_addr, encl_size);
> > > > 
> > > > Would the above code change break the "mmap()" in above step 2?
> > > > 	
> > > 
> > > No, vm_pgoff was not used previously for enclave VMAs. I had to add this
> > > because the offset passed to sgx_fadvise is relative to file base and
> > > calculated in mm/madvise.c like this:
> > > 
> > >          offset = (loff_t)(start - vma->vm_start)
> > >                          + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
> > 
> > But shouldn't 'offset is relative to the file base' be conceptually  
> > correct from
> > the fadvice()'s point of view?
> > 
> > I think you should do:
> > 
> > 	encl_offset = offset + encl->base;
> > 
> > inside sgx_fadvice()?
> > 
> > > 
> If we don't set vma->vm_pgoff (default to zero), then offset will be  
> calculated as (start - vma->vm_start). Then the above calculation is wrong  
> if we have multiple VMAs for the same enclave, which is usually the case.

do_mmap() -> mmap_region() itself sets vma->vm_pgoff:

	vma = vm_area_alloc();
	...
	vma->vm_pgoff = pgoff;

	if (file)
		call_mmap(file, vma);	<- sgx_mmap()

I think you will always call mmap() against enclave's fd with 'pgoff' being set
to the offset relative to the file?

> 
> > > I had a comment in first version but removed it based on Jarkko's
> > > suggestion here:  
> > > https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/
> > > 
> > > The original comments probably seemed redundant to the definitions of  
> > > the
> > > vm_pgoff field and the fadvise interface. But let me know if we need  
> > > add a
> > > more helpful version of comments or any suggestion on the comments.
> > 
> > I still think this code change is wrong.
> > 
> > For instance, IIUC, it at least breaks the case where enclave hasn't been
> > created/initialized, where encl->base == 0 (although normal code path  
> > doesn't
> > use vm_pgoff, conceptually it's still wrong IIUC).
> > 
> > Maybe I am missing something?
> 
> The fadvise interface is only usable for an initialized enclave,  
> sgx_fadvise will return error otherwise. 
> 

True.  But that code change is unconditionally called for all mmap(), even when
enclave hasn't been created.

> Conceptually I view enclave base  
> as "file base", it's just that we don't ever need handle the zero case  
> caused by uninitialized enclave (kind of like a file never mapped). If an  
> initialized enclave happens to have zero base, it would also work.

A little bit confused about what does "enclave base" here.

To me, A file is an enclave, meaning the "file offset" equals to "enclave
offset".  "enclave base" is the base linear address of the enclave, it doesn't
matter whether it is 0 or not.  You get an "enclave address" from "enclave base"
plus "enclave offset" (or "file offset"):

	enclave_addr = enclave_base + enclave_offset/file_offset;

And such calculation is only valid after enclave has been created (enclave_base
is valid -- can be 0 or whatever).

Since sgx_mmap() can happen before enclave is created, calculating the vm_pgoff
from enclave_base is conceptually wrong.  Even if you really want to do it, it
should be:

	if (enclave_has_initialized())
		vma->vm_pgoff = ...;

But again I am not convinced why you cannot get the enclave_addr inside
sgx_fadvice().
Haitao Huang Feb. 15, 2023, 3:59 a.m. UTC | #7
On Tue, 14 Feb 2023 16:36:40 -0600, Huang, Kai <kai.huang@intel.com> wrote:

> On Tue, 2023-02-14 at 15:42 -0600, Haitao Huang wrote:
>> On Tue, 14 Feb 2023 14:54:53 -0600, Huang, Kai <kai.huang@intel.com>  
>> wrote:
>>
>> > On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote:
>> > > Hi Kai
>> > >
>> > > On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com>
>> > > wrote:
>> > >
>> > > > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
>> > > > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file,  
>> struct
>> > > > > vm_area_struct *vma)
>> > > > >  	vma->vm_ops = &sgx_vm_ops;
>> > > > >  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP |  
>> VM_IO;
>> > > > >  	vma->vm_private_data = encl;
>> > > > > +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
>> > > > >   	return 0;
>> > > > >  }
>> > > >
>> > > > Perhaps I am missing something, but above change looks weird.
>> > > > Conceptually, it doesn't/shouldn't belong to this series, which
>> > > > essentially
>> > > > preallocates and does EAUG EPC pages for a (or part of) given  
>> enclave.
>> > > > The EAUG
>> > > > logic should already be working for the normal fault path, which  
>> means
>> > > > the code
>> > > > change above either: 1) has been done at other place; 2) isn't  
>> needed.
>> > > >
>> > > > I have kinda forgotten the userspace sequence to create an  
>> enclave.
>> > > If
>> > > > I recall
>> > > > correctly, you do below to create an enclave:
>> > > >
>> > > > 	1) encl_fd = open("/dev/sgx_enclave");
>> > > > 	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
>> > > > 	3) IOCTL(ECREATE, encl_addr, encl_size);
>> > > >
>> > > > Would the above code change break the "mmap()" in above step 2?
>> > > > 	
>> > >
>> > > No, vm_pgoff was not used previously for enclave VMAs. I had to add  
>> this
>> > > because the offset passed to sgx_fadvise is relative to file base  
>> and
>> > > calculated in mm/madvise.c like this:
>> > >
>> > >          offset = (loff_t)(start - vma->vm_start)
>> > >                          + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
>> >
>> > But shouldn't 'offset is relative to the file base' be conceptually
>> > correct from
>> > the fadvice()'s point of view?
>> >
>> > I think you should do:
>> >
>> > 	encl_offset = offset + encl->base;
>> >
>> > inside sgx_fadvice()?
>> >
>> > >
>> If we don't set vma->vm_pgoff (default to zero), then offset will be
>> calculated as (start - vma->vm_start). Then the above calculation is  
>> wrong
>> if we have multiple VMAs for the same enclave, which is usually the  
>> case.
>
> do_mmap() -> mmap_region() itself sets vma->vm_pgoff:
>
> 	vma = vm_area_alloc();
> 	...
> 	vma->vm_pgoff = pgoff;
>
> 	if (file)
> 		call_mmap(file, vma);	<- sgx_mmap()
>
> I think you will always call mmap() against enclave's fd with 'pgoff'  
> being set
> to the offset relative to the file?
>

right. (pgoff above is most likely zero in enclave cases but that's not  
important)

>>
>> > > I had a comment in first version but removed it based on Jarkko's
>> > > suggestion here:
>> > > https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/
>> > >
>> > > The original comments probably seemed redundant to the definitions  
>> of
>> > > the
>> > > vm_pgoff field and the fadvise interface. But let me know if we need
>> > > add a
>> > > more helpful version of comments or any suggestion on the comments.
>> >
>> > I still think this code change is wrong.
>> >
>> > For instance, IIUC, it at least breaks the case where enclave hasn't  
>> been
>> > created/initialized, where encl->base == 0 (although normal code path
>> > doesn't
>> > use vm_pgoff, conceptually it's still wrong IIUC).
>> >
>> > Maybe I am missing something?
>>
>> The fadvise interface is only usable for an initialized enclave,
>> sgx_fadvise will return error otherwise.
>
> True.  But that code change is unconditionally called for all mmap(),  
> even when
> enclave hasn't been created.
>
Theoretically yes. However, the user space sequences I am aware are  
following:

1. enclave_fd = fopen("/dev/sgx_enclave")
2. mmap(..., 2*enclave_size, MAP_ANONYMOUS, ...) to reserve a larger  
range, then trim the reserved address range to get enclave_base aligned  
with enclave_size
3. ioctl(ECREATE, enclave_base, enclave_size, enclave_fd)

Only after ECREATE, we do mmap(..., enclave_fd) calls.

>> Conceptually I view enclave base
>> as "file base", it's just that we don't ever need handle the zero case
>> caused by uninitialized enclave (kind of like a file never mapped). If  
>> an
>> initialized enclave happens to have zero base, it would also work.
>
> A little bit confused about what does "enclave base" here.
>
> To me, A file is an enclave, meaning the "file offset" equals to "enclave
> offset".  "enclave base" is the base linear address of the enclave, it  
> doesn't
> matter whether it is 0 or not.  You get an "enclave address" from  
> "enclave base"
> plus "enclave offset" (or "file offset"):
>
> 	enclave_addr = enclave_base + enclave_offset/file_offset;
>
Yes this is in sgx_fadvise:
	start = offset + encl->base

The issue is that the file_offset is calculated in madvise.c before  
calling sgx_fadvise like this:
         offset = (loff_t)(start - vma->vm_start)
                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
         ...
         vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);

So vma->vm_pgoff is expected being set correctly relative to file.

> And such calculation is only valid after enclave has been created  
> (enclave_base
> is valid -- can be 0 or whatever).
>
> Since sgx_mmap() can happen before enclave is created, calculating the  
> vm_pgoff
> from enclave_base is conceptually wrong.  Even if you really want to do  
> it, it
> should be:
>
> 	if (enclave_has_initialized())
> 		vma->vm_pgoff = ...;

I got your point now. I can add a condition to test the SGX_ENCL_CREATED  
bit. However, we still have a hole if we must handle the sequence  
mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave  
vm_pgoff not set for those cases.

Since no one does that so far, can we explicitly return an error from  
sgx_mmap when that happens?
Other suggestions?

>But again I am not convinced why you cannot get the enclave_addr inside
> sgx_fadvice().
>

I hope the above comments addressed this. Basically we need set vm_pgoff  
relative to file for sgx_fadvise callback to receive meaningful `offset`  
relative to file.

Thanks
Haitao
Huang, Kai Feb. 15, 2023, 8:51 a.m. UTC | #8
> > > > 
> > 
> > Since sgx_mmap() can happen before enclave is created, calculating the  
> > vm_pgoff
> > from enclave_base is conceptually wrong.  Even if you really want to do  
> > it, it
> > should be:
> > 
> > 	if (enclave_has_initialized())
> > 		vma->vm_pgoff = ...;
> 
> I got your point now. I can add a condition to test the SGX_ENCL_CREATED  
> bit. However, we still have a hole if we must handle the sequence  
> mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave  
> vm_pgoff not set for those cases.
> 
> Since no one does that so far, can we explicitly return an error from  
> sgx_mmap when that happens?
> Other suggestions?

As I replied to patch 4/4, I believe userspace should pass the correct pgoff in
mmap().  It's wrong to always pass 0 or any random value.

If userspace follow the mmap() rule, you won't need to manually set vm_pgoff
here (which is hacky IMHO).  Everything works fine.
Haitao Huang Feb. 15, 2023, 3:42 p.m. UTC | #9
On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> wrote:

>> > > >
>> >
>> > Since sgx_mmap() can happen before enclave is created, calculating the
>> > vm_pgoff
>> > from enclave_base is conceptually wrong.  Even if you really want to  
>> do
>> > it, it
>> > should be:
>> >
>> > 	if (enclave_has_initialized())
>> > 		vma->vm_pgoff = ...;
>>
>> I got your point now. I can add a condition to test the SGX_ENCL_CREATED
>> bit. However, we still have a hole if we must handle the sequence
>> mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave
>> vm_pgoff not set for those cases.
>>
>> Since no one does that so far, can we explicitly return an error from
>> sgx_mmap when that happens?
>> Other suggestions?
>
> As I replied to patch 4/4, I believe userspace should pass the correct  
> pgoff in
> mmap().  It's wrong to always pass 0 or any random value.
>
> If userspace follow the mmap() rule, you won't need to manually set  
> vm_pgoff
> here (which is hacky IMHO).  Everything works fine.
>

SGX driver was following MAP_ANONYMOUS semantics. If we change that, it'd  
break current usage/ABI.

I still think returning error for cases mmap(..., enclave_fd) if enclave  
is not created would be less intrusive change.

Haitao
Huang, Kai Feb. 16, 2023, 7:53 a.m. UTC | #10
On Wed, 2023-02-15 at 09:42 -0600, Haitao Huang wrote:
> On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> 
> > > > > > 
> > > > 
> > > > Since sgx_mmap() can happen before enclave is created, calculating the
> > > > vm_pgoff
> > > > from enclave_base is conceptually wrong.  Even if you really want to  
> > > do
> > > > it, it
> > > > should be:
> > > > 
> > > > 	if (enclave_has_initialized())
> > > > 		vma->vm_pgoff = ...;
> > > 
> > > I got your point now. I can add a condition to test the SGX_ENCL_CREATED
> > > bit. However, we still have a hole if we must handle the sequence
> > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave
> > > vm_pgoff not set for those cases.
> > > 
> > > Since no one does that so far, can we explicitly return an error from
> > > sgx_mmap when that happens?
> > > Other suggestions?
> > 
> > As I replied to patch 4/4, I believe userspace should pass the correct  
> > pgoff in
> > mmap().  It's wrong to always pass 0 or any random value.
> > 
> > If userspace follow the mmap() rule, you won't need to manually set  
> > vm_pgoff
> > here (which is hacky IMHO).  Everything works fine.
> > 
> 
> SGX driver was following MAP_ANONYMOUS semantics. If we change that, it'd  
> break current usage/ABI.

Is there any doc/reference mentioning this? 

> 
> I still think returning error for cases mmap(..., enclave_fd) if enclave  
> is not created would be less intrusive change.

But you need the first mmap() to work before IOCTL(ecreate) to get enclave's
base address, correct?

> 
> Haitao
Haitao Huang Feb. 16, 2023, 5:12 p.m. UTC | #11
Hi Kai

On Thu, 16 Feb 2023 01:53:47 -0600, Huang, Kai <kai.huang@intel.com> wrote:

> On Wed, 2023-02-15 at 09:42 -0600, Haitao Huang wrote:
>> On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com>  
>> wrote:
>>
>> > > > > >
>> > > >
>> > > > Since sgx_mmap() can happen before enclave is created,  
>> calculating the
>> > > > vm_pgoff
>> > > > from enclave_base is conceptually wrong.  Even if you really want  
>> to
>> > > do
>> > > > it, it
>> > > > should be:
>> > > >
>> > > > 	if (enclave_has_initialized())
>> > > > 		vma->vm_pgoff = ...;
>> > >
>> > > I got your point now. I can add a condition to test the  
>> SGX_ENCL_CREATED
>> > > bit. However, we still have a hole if we must handle the sequence
>> > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't  
>> leave
>> > > vm_pgoff not set for those cases.
>> > >
>> > > Since no one does that so far, can we explicitly return an error  
>> from
>> > > sgx_mmap when that happens?
>> > > Other suggestions?
>> >
>> > As I replied to patch 4/4, I believe userspace should pass the correct
>> > pgoff in
>> > mmap().  It's wrong to always pass 0 or any random value.
>> >
>> > If userspace follow the mmap() rule, you won't need to manually set
>> > vm_pgoff
>> > here (which is hacky IMHO).  Everything works fine.
>> >
>>
>> SGX driver was following MAP_ANONYMOUS semantics. If we change that,  
>> it'd
>> break current usage/ABI.
>
> Is there any doc/reference mentioning this?
>
No, just from how we use it :-). Jarkko/Sean/Dave may have  
history/insights on this?
But file offset in mmap spec implies there is a "real" file independent of  
the memory it is mapped into, and you can mmap arbitrary offset of that  
file to any memory address. That does not fit well with SGX enclave  
memory: there is no separate physical file other than enclave residing in  
memory and its base is fixed.
Again, my main concern is impact to existing usage.

>>
>> I still think returning error for cases mmap(..., enclave_fd) if enclave
>> is not created would be less intrusive change.
>
> But you need the first mmap() to work before IOCTL(ecreate) to get  
> enclave's
> base address, correct?
>

No, mmap(..., MAP_ANONYMOUS, fd=-1) must be used to reserve address range  
for enclave before calling IOCTL(ecreate). In fact, I realize mmap(...,  
fd=enclave_fd) before ioctl(ecreate) should fail in sgx_encl_may_map  
according to its definition but IIUC it would skip the loop xas_for_each  
when no entries yet available in encl->page_array. This might be a hole to  
enforcement of vma_prot_bits not exceeding the vm_max_prot_bits which is  
recorded first during EADD.
So if my understanding is correct, we just need fix sgx_encl_may_map to  
return -EACCES when mmap is called before ecreate is done.

Thanks
Haitao
Jarkko Sakkinen Feb. 17, 2023, 9:50 p.m. UTC | #12
On Tue, Feb 14, 2023 at 09:47:24AM +0000, Huang, Kai wrote:
> On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
> > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> >  	vma->vm_ops = &sgx_vm_ops;
> >  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> >  	vma->vm_private_data = encl;
> > +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
> >  
> >  	return 0;
> >  }
> 
> Perhaps I am missing something, but above change looks weird.  
> 
> Conceptually, it doesn't/shouldn't belong to this series, which essentially
> preallocates and does EAUG EPC pages for a (or part of) given enclave.  The EAUG
> logic should already be working for the normal fault path, which means the code
> change above either: 1) has been done at other place; 2) isn't needed.
> 
> I have kinda forgotten the userspace sequence to create an enclave.  If I recall
> correctly, you do below to create an enclave:
> 
> 	1) encl_fd = open("/dev/sgx_enclave");
> 	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
> 	3) IOCTL(ECREATE, encl_addr, encl_size);
> 
> Would the above code change break the "mmap()" in above step 2?

Good catch! Actually shouldn't be validate it to be always zero? We
are essentially MAP_ANONYMOUS mapping semantics with a device file.

BR, Jarkko
Jarkko Sakkinen Feb. 17, 2023, 10:07 p.m. UTC | #13
On Tue, Feb 14, 2023 at 08:54:53PM +0000, Huang, Kai wrote:
> On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote:
> > Hi Kai
> > 
> > On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > 
> > > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote:
> > > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct  
> > > > vm_area_struct *vma)
> > > >  	vma->vm_ops = &sgx_vm_ops;
> > > >  	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> > > >  	vma->vm_private_data = encl;
> > > > +	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
> > > >   	return 0;
> > > >  }
> > > 
> > > Perhaps I am missing something, but above change looks weird.  
> > > Conceptually, it doesn't/shouldn't belong to this series, which  
> > > essentially
> > > preallocates and does EAUG EPC pages for a (or part of) given enclave.   
> > > The EAUG
> > > logic should already be working for the normal fault path, which means  
> > > the code
> > > change above either: 1) has been done at other place; 2) isn't needed.
> > > 
> > > I have kinda forgotten the userspace sequence to create an enclave.  If  
> > > I recall
> > > correctly, you do below to create an enclave:
> > > 
> > > 	1) encl_fd = open("/dev/sgx_enclave");
> > > 	2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */);
> > > 	3) IOCTL(ECREATE, encl_addr, encl_size);
> > > 
> > > Would the above code change break the "mmap()" in above step 2?
> > > 	
> > 
> > No, vm_pgoff was not used previously for enclave VMAs. I had to add this  
> > because the offset passed to sgx_fadvise is relative to file base and  
> > calculated in mm/madvise.c like this:
> > 
> >          offset = (loff_t)(start - vma->vm_start)
> >                          + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
> 
> But shouldn't 'offset is relative to the file base' be conceptually correct from
> the fadvice()'s point of view?
> 
> I think you should do:
> 
> 	encl_offset = offset + encl->base;
> 
> inside sgx_fadvice()?

I agree.

BR, Jarkko
Jarkko Sakkinen Feb. 17, 2023, 10:32 p.m. UTC | #14
On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote:
> On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> 
> > > > > >
> > > >
> > > > Since sgx_mmap() can happen before enclave is created, calculating the
> > > > vm_pgoff
> > > > from enclave_base is conceptually wrong.  Even if you really want
> > > to do
> > > > it, it
> > > > should be:
> > > >
> > > > 	if (enclave_has_initialized())
> > > > 		vma->vm_pgoff = ...;
> > > 
> > > I got your point now. I can add a condition to test the SGX_ENCL_CREATED
> > > bit. However, we still have a hole if we must handle the sequence
> > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave
> > > vm_pgoff not set for those cases.
> > > 
> > > Since no one does that so far, can we explicitly return an error from
> > > sgx_mmap when that happens?
> > > Other suggestions?
> > 
> > As I replied to patch 4/4, I believe userspace should pass the correct
> > pgoff in
> > mmap().  It's wrong to always pass 0 or any random value.
> > 
> > If userspace follow the mmap() rule, you won't need to manually set
> > vm_pgoff
> > here (which is hacky IMHO).  Everything works fine.
> > 
> 
> SGX driver was following MAP_ANONYMOUS semantics. If we change that, it'd
> break current usage/ABI.
> 
> I still think returning error for cases mmap(..., enclave_fd) if enclave is
> not created would be less intrusive change.

Is this something you care in SGX SDK?

It is not categorically forbidden so that's why I'm asking.

BR, Jarkko
Haitao Huang Feb. 17, 2023, 11:03 p.m. UTC | #15
Hi Jarkko

On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org <jarkko@kernel.org>  
wrote:

> On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote:
>> On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com>  
>> wrote:
>>
>> > > > > >
>> > > >
>> > > > Since sgx_mmap() can happen before enclave is created,  
>> calculating the
>> > > > vm_pgoff
>> > > > from enclave_base is conceptually wrong.  Even if you really want
>> > > to do
>> > > > it, it
>> > > > should be:
>> > > >
>> > > > 	if (enclave_has_initialized())
>> > > > 		vma->vm_pgoff = ...;
>> > >
>> > > I got your point now. I can add a condition to test the  
>> SGX_ENCL_CREATED
>> > > bit. However, we still have a hole if we must handle the sequence
>> > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't  
>> leave
>> > > vm_pgoff not set for those cases.
>> > >
>> > > Since no one does that so far, can we explicitly return an error  
>> from
>> > > sgx_mmap when that happens?
>> > > Other suggestions?
>> >
>> > As I replied to patch 4/4, I believe userspace should pass the correct
>> > pgoff in
>> > mmap().  It's wrong to always pass 0 or any random value.
>> >
>> > If userspace follow the mmap() rule, you won't need to manually set
>> > vm_pgoff
>> > here (which is hacky IMHO).  Everything works fine.
>> >
>>
>> SGX driver was following MAP_ANONYMOUS semantics. If we change that,  
>> it'd
>> break current usage/ABI.
>>
>> I still think returning error for cases mmap(..., enclave_fd) if  
>> enclave is
>> not created would be less intrusive change.
>
> Is this something you care in SGX SDK?
>
> It is not categorically forbidden so that's why I'm asking.

SDK does not care as we would never do this and don't think anyone is  
doing that either. Suggesting returning error to cover all cases so user  
space would not accidentally cause incorrect vm_pgoff set.
Thanks
Haitao
Jarkko Sakkinen Feb. 21, 2023, 10:10 p.m. UTC | #16
On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote:
> Hi Jarkko
> 
> On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org <jarkko@kernel.org>
> wrote:
> 
> > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote:
> > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com>
> > > wrote:
> > > 
> > > > > > > >
> > > > > >
> > > > > > Since sgx_mmap() can happen before enclave is created,
> > > calculating the
> > > > > > vm_pgoff
> > > > > > from enclave_base is conceptually wrong.  Even if you really want
> > > > > to do
> > > > > > it, it
> > > > > > should be:
> > > > > >
> > > > > > 	if (enclave_has_initialized())
> > > > > > 		vma->vm_pgoff = ...;
> > > > >
> > > > > I got your point now. I can add a condition to test the
> > > SGX_ENCL_CREATED
> > > > > bit. However, we still have a hole if we must handle the sequence
> > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We
> > > can't leave
> > > > > vm_pgoff not set for those cases.
> > > > >
> > > > > Since no one does that so far, can we explicitly return an error
> > > from
> > > > > sgx_mmap when that happens?
> > > > > Other suggestions?
> > > >
> > > > As I replied to patch 4/4, I believe userspace should pass the correct
> > > > pgoff in
> > > > mmap().  It's wrong to always pass 0 or any random value.
> > > >
> > > > If userspace follow the mmap() rule, you won't need to manually set
> > > > vm_pgoff
> > > > here (which is hacky IMHO).  Everything works fine.
> > > >
> > > 
> > > SGX driver was following MAP_ANONYMOUS semantics. If we change that,
> > > it'd
> > > break current usage/ABI.
> > > 
> > > I still think returning error for cases mmap(..., enclave_fd) if
> > > enclave is
> > > not created would be less intrusive change.
> > 
> > Is this something you care in SGX SDK?
> > 
> > It is not categorically forbidden so that's why I'm asking.
> 
> SDK does not care as we would never do this and don't think anyone is doing
> that either. Suggesting returning error to cover all cases so user space
> would not accidentally cause incorrect vm_pgoff set.

vm_pgoff != 0 should result -EINVAL.

BR, Jarkko
Haitao Huang Feb. 22, 2023, 1:37 a.m. UTC | #17
On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org <jarkko@kernel.org>  
wrote:

> On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote:
>> Hi Jarkko
>>
>> On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org  
>> <jarkko@kernel.org>
>> wrote:
>>
>> > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote:
>> > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com>
>> > > wrote:
>> > >
>> > > > > > > >
>> > > > > >
>> > > > > > Since sgx_mmap() can happen before enclave is created,
>> > > calculating the
>> > > > > > vm_pgoff
>> > > > > > from enclave_base is conceptually wrong.  Even if you really  
>> want
>> > > > > to do
>> > > > > > it, it
>> > > > > > should be:
>> > > > > >
>> > > > > > 	if (enclave_has_initialized())
>> > > > > > 		vma->vm_pgoff = ...;
>> > > > >
>> > > > > I got your point now. I can add a condition to test the
>> > > SGX_ENCL_CREATED
>> > > > > bit. However, we still have a hole if we must handle the  
>> sequence
>> > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We
>> > > can't leave
>> > > > > vm_pgoff not set for those cases.
>> > > > >
>> > > > > Since no one does that so far, can we explicitly return an error
>> > > from
>> > > > > sgx_mmap when that happens?
>> > > > > Other suggestions?
>> > > >
>> > > > As I replied to patch 4/4, I believe userspace should pass the  
>> correct
>> > > > pgoff in
>> > > > mmap().  It's wrong to always pass 0 or any random value.
>> > > >
>> > > > If userspace follow the mmap() rule, you won't need to manually  
>> set
>> > > > vm_pgoff
>> > > > here (which is hacky IMHO).  Everything works fine.
>> > > >
>> > >
>> > > SGX driver was following MAP_ANONYMOUS semantics. If we change that,
>> > > it'd
>> > > break current usage/ABI.
>> > >
>> > > I still think returning error for cases mmap(..., enclave_fd) if
>> > > enclave is
>> > > not created would be less intrusive change.
>> >
>> > Is this something you care in SGX SDK?
>> >
>> > It is not categorically forbidden so that's why I'm asking.
>>
>> SDK does not care as we would never do this and don't think anyone is  
>> doing
>> that either. Suggesting returning error to cover all cases so user space
>> would not accidentally cause incorrect vm_pgoff set.
>
> vm_pgoff != 0 should result -EINVAL.
>

Do you mean 'offset' passed in from mmap syscall and converted to pgoff in  
kernel? I can see it only passed to driver in get_unmapped_area. I can add  
this enforcement there so it is consistent to the MAP_ANONYMOUS spec if  
that's what you suggest.

  vma->vm_pgoff is the offset in pages of the vma->vm_start relative to  
'file'.
  It can not be zero for all VMAs of the enclave.

Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon  
VMAs, and ignore it (zero) for shared anon mapping.

For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base) upon  
mmap.
The concern Kai raised about encl->base not available in the window  
between mmap and ECREATE can be addressed by disallowing mmap before  
ECREATE is done. It does not make much sense anyway to mmap(enclave_fd)  
without a valid enclave range associated with enclave_fd.

Does that sound reasonable or I still miss some point here?

Thanks
Haitao
Huang, Kai March 7, 2023, 11:32 p.m. UTC | #18
On Tue, 2023-02-21 at 19:37 -0600, Haitao Huang wrote:
> On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org <jarkko@kernel.org>  
> wrote:
> 
> > On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote:
> > > Hi Jarkko
> > > 
> > > On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org  
> > > <jarkko@kernel.org>
> > > wrote:
> > > 
> > > > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote:
> > > > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com>
> > > > > wrote:
> > > > > 
> > > > > > > > > > 
> > > > > > > > 
> > > > > > > > Since sgx_mmap() can happen before enclave is created,
> > > > > calculating the
> > > > > > > > vm_pgoff
> > > > > > > > from enclave_base is conceptually wrong.  Even if you really  
> > > want
> > > > > > > to do
> > > > > > > > it, it
> > > > > > > > should be:
> > > > > > > > 
> > > > > > > > 	if (enclave_has_initialized())
> > > > > > > > 		vma->vm_pgoff = ...;
> > > > > > > 
> > > > > > > I got your point now. I can add a condition to test the
> > > > > SGX_ENCL_CREATED
> > > > > > > bit. However, we still have a hole if we must handle the  
> > > sequence
> > > > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We
> > > > > can't leave
> > > > > > > vm_pgoff not set for those cases.
> > > > > > > 
> > > > > > > Since no one does that so far, can we explicitly return an error
> > > > > from
> > > > > > > sgx_mmap when that happens?
> > > > > > > Other suggestions?
> > > > > > 
> > > > > > As I replied to patch 4/4, I believe userspace should pass the  
> > > correct
> > > > > > pgoff in
> > > > > > mmap().  It's wrong to always pass 0 or any random value.
> > > > > > 
> > > > > > If userspace follow the mmap() rule, you won't need to manually  
> > > set
> > > > > > vm_pgoff
> > > > > > here (which is hacky IMHO).  Everything works fine.
> > > > > > 
> > > > > 
> > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change that,
> > > > > it'd
> > > > > break current usage/ABI.
> > > > > 
> > > > > I still think returning error for cases mmap(..., enclave_fd) if
> > > > > enclave is
> > > > > not created would be less intrusive change.
> > > > 
> > > > Is this something you care in SGX SDK?
> > > > 
> > > > It is not categorically forbidden so that's why I'm asking.
> > > 
> > > SDK does not care as we would never do this and don't think anyone is  
> > > doing
> > > that either. Suggesting returning error to cover all cases so user space
> > > would not accidentally cause incorrect vm_pgoff set.
> > 
> > vm_pgoff != 0 should result -EINVAL.
> > 
> 
> Do you mean 'offset' passed in from mmap syscall and converted to pgoff in  
> kernel? I can see it only passed to driver in get_unmapped_area. I can add  
> this enforcement there so it is consistent to the MAP_ANONYMOUS spec if  
> that's what you suggest.
> 
>   vma->vm_pgoff is the offset in pages of the vma->vm_start relative to  
> 'file'.
>   It can not be zero for all VMAs of the enclave.
> 
> Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon  
> VMAs, and ignore it (zero) for shared anon mapping.
> 
> For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base) upon  
> mmap.

Sorry for late reply.  Basically I was sick and having limited working time in
the passed two weeks.

For what I understand now, SGX driver wants to use MAP_ANONYMOUS semantic for
mmap()s against /dev/sgx_enclave (for whatever reason that I don't understand). 

And because of that, we even want to explicitly enforce userspace to always pass
0 as pgoff in mmap()s (hasn't been done yet).

And then here, we want to rewrite vma->pgoff so that the VMA can act _like_ a
normal file-based mmap() semantics (despite it is indeed a file-based VMA),
because the VFS fadvice() implementation assumes file-based VMAs are always
following file-based mmap() semantics.

Hmm..

Doesn't this sound weird?

I must have been missing something, but why cannot SGX driver just always follow
file-based mmap() semantics at the beginning?

For instance, what's wrong with:

	1) encl_fd = open("/dev/sgx_enclave")
	2) encl_base = mmap(..., encl_size, MAP_SHARED, encl_fd, 0 /* pgoff */)
	3) ioctl(ECREATE, encl_base, encl_size)

(In ioctl(ECREATE), we might want to verify the VMA we found has the same
encl_base as starting address, and 0 pgoff).

And in following mmap()s in which we want to map a small range of enclave:

	encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd,
			(encl_addr - encl_base) >> PAGE_SHIFT);

?

Anything wrong above?

In fact, if the first mmap() in step 2) before IOCTL(ecreate) used
MAP_ANONYMOUS, IIUC that VMA will stay as anonymous VMA but will never be
converted to SGX file-based one, because by looking at the code, I see neither
ioctl(ECREATE) nor ioctl(EINIT) converts that VMA to file-based one.

That being said, all page faults caused by userspace access to that VMA will
still go to anonymous VMA's fault path, but not SGX driver's.

This is fine for SGX1 as all enclave pages are fully populated.  For SGX2,
_unless_ you explicitly mmap("/dev/sgx_enclave") those dynamical ranges, the
fault will never be handled by SGX driver.

Architecturally, for SGX2, if you mmap() the entire enclave range (as in step
2), you don't need explicitly mmap() all dynamic ranges.  The userspace should
just be able to access those dynamic ranges (i.e. EACCEPT) and the kernel driver
should gracefully handle the fault.

Shouldn't this be a problem?

(Btw, there might be other corner cases that could cause VMA splitting/merging,
etc, but I haven't thought about those)


> The concern Kai raised about encl->base not available in the window  
> between mmap and ECREATE can be addressed by disallowing mmap before  
> ECREATE is done. It does not make much sense anyway to mmap(enclave_fd)  
> without a valid enclave range associated with enclave_fd.
> 

We can, but I don't understand why the first mmap() before ECREATE must/should
be done via MMAP_ANONYMOUS.  In fact, I think it might be wrong for SGX2.
Haitao Huang March 9, 2023, 12:50 a.m. UTC | #19
Hi Kai

Thanks for your detailed review.

On Tue, 07 Mar 2023 17:32:15 -0600, Huang, Kai <kai.huang@intel.com> wrote:

> On Tue, 2023-02-21 at 19:37 -0600, Haitao Huang wrote:
>> On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org  
>> <jarkko@kernel.org>
>> wrote:
>>
>> > On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote:
>> > > Hi Jarkko
>> > >
>> > > On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org
>> > > <jarkko@kernel.org>
>> > > wrote:
>> > >
>> > > > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote:
>> > > > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai  
>> <kai.huang@intel.com>
>> > > > > wrote:
>> > > > >
>> > > > > > > > > >
>> > > > > > > >
>> > > > > > > > Since sgx_mmap() can happen before enclave is created,
>> > > > > calculating the
>> > > > > > > > vm_pgoff
>> > > > > > > > from enclave_base is conceptually wrong.  Even if you  
>> really
>> > > want
>> > > > > > > to do
>> > > > > > > > it, it
>> > > > > > > > should be:
>> > > > > > > >
>> > > > > > > > 	if (enclave_has_initialized())
>> > > > > > > > 		vma->vm_pgoff = ...;
>> > > > > > >
>> > > > > > > I got your point now. I can add a condition to test the
>> > > > > SGX_ENCL_CREATED
>> > > > > > > bit. However, we still have a hole if we must handle the
>> > > sequence
>> > > > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We
>> > > > > can't leave
>> > > > > > > vm_pgoff not set for those cases.
>> > > > > > >
>> > > > > > > Since no one does that so far, can we explicitly return an  
>> error
>> > > > > from
>> > > > > > > sgx_mmap when that happens?
>> > > > > > > Other suggestions?
>> > > > > >
>> > > > > > As I replied to patch 4/4, I believe userspace should pass the
>> > > correct
>> > > > > > pgoff in
>> > > > > > mmap().  It's wrong to always pass 0 or any random value.
>> > > > > >
>> > > > > > If userspace follow the mmap() rule, you won't need to  
>> manually
>> > > set
>> > > > > > vm_pgoff
>> > > > > > here (which is hacky IMHO).  Everything works fine.
>> > > > > >
>> > > > >
>> > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change  
>> that,
>> > > > > it'd
>> > > > > break current usage/ABI.
>> > > > >
>> > > > > I still think returning error for cases mmap(..., enclave_fd) if
>> > > > > enclave is
>> > > > > not created would be less intrusive change.
>> > > >
>> > > > Is this something you care in SGX SDK?
>> > > >
>> > > > It is not categorically forbidden so that's why I'm asking.
>> > >
>> > > SDK does not care as we would never do this and don't think anyone  
>> is
>> > > doing
>> > > that either. Suggesting returning error to cover all cases so user  
>> space
>> > > would not accidentally cause incorrect vm_pgoff set.
>> >
>> > vm_pgoff != 0 should result -EINVAL.
>> >
>>
>> Do you mean 'offset' passed in from mmap syscall and converted to pgoff  
>> in
>> kernel? I can see it only passed to driver in get_unmapped_area. I can  
>> add
>> this enforcement there so it is consistent to the MAP_ANONYMOUS spec if
>> that's what you suggest.
>>
>>   vma->vm_pgoff is the offset in pages of the vma->vm_start relative to
>> 'file'.
>>   It can not be zero for all VMAs of the enclave.
>>
>> Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon
>> VMAs, and ignore it (zero) for shared anon mapping.
>>
>> For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base)  
>> upon
>> mmap.
>
> Sorry for late reply.  Basically I was sick and having limited working  
> time in
> the passed two weeks.
>
> For what I understand now, SGX driver wants to use MAP_ANONYMOUS  
> semantic for
> mmap()s against /dev/sgx_enclave (for whatever reason that I don't  
> understand).
> And because of that, we even want to explicitly enforce userspace to  
> always pass
> 0 as pgoff in mmap()s (hasn't been done yet).
>
> And then here, we want to rewrite vma->pgoff so that the VMA can act  
> _like_ a
> normal file-based mmap() semantics (despite it is indeed a file-based  
> VMA),
> because the VFS fadvice() implementation assumes file-based VMAs are  
> always
> following file-based mmap() semantics.
>
> Hmm..
>
> Doesn't this sound weird?
>

Sometime weird means good or creative as expressed in a popular slogan in  
the city where I live. Just joking:-)

> I must have been missing something, but why cannot SGX driver just  
> always follow
> file-based mmap() semantics at the beginning?
>
> For instance, what's wrong with:
>
> 	1) encl_fd = open("/dev/sgx_enclave")
> 	2) encl_base = mmap(..., encl_size, MAP_SHARED, encl_fd, 0 /* pgoff */)
> 	3) ioctl(ECREATE, encl_base, encl_size)
>
> (In ioctl(ECREATE), we might want to verify the VMA we found has the same
> encl_base as starting address, and 0 pgoff).
>
> And in following mmap()s in which we want to map a small range of  
> enclave:
>
> 	encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd,
> 			(encl_addr - encl_base) >> PAGE_SHIFT);
>
> ?
>
> Anything wrong above?
>

I believe this can be done except for that you are breaking existing code  
by requiring pg_off for all mmaps now.

Doing mmap before ECREATE also effectively bypasses sgx_encl_may_map. And  
we have a very long thread of discussion about this[1] and that's why  
sgx_encl_may_map come into play to limit VMA permissions to those given in  
SecInfo by EADD ioctl. However, current code does not explicitly block it  
and I think it'd be better do it.  More comment below on this point.

> In fact, if the first mmap() in step 2) before IOCTL(ecreate) used
> MAP_ANONYMOUS, IIUC that VMA will stay as anonymous VMA but will never be
> converted to SGX file-based one, because by looking at the code, I see  
> neither
> ioctl(ECREATE) nor ioctl(EINIT) converts that VMA to file-based one.
>
> That being said, all page faults caused by userspace access to that VMA  
> will
> still go to anonymous VMA's fault path, but not SGX driver's.
>

It is required to mmap anonymously with PROT_NONE initially and
user space must invoke mmap with enclave fd after EADDing each  
segments/areas. PF handler will check VMA prot bits against  
vm_max_prot_bits in sgx_encl_load_page_in_vma, and report failure on any  
mismatch.
If user space does not mmap(...,encl_fd) after EADD, pages will have  
PROT_NONE and fail on any access.

> This is fine for SGX1 as all enclave pages are fully populated.  For  
> SGX2,
> _unless_ you explicitly mmap("/dev/sgx_enclave") those dynamical ranges,  
> the
> fault will never be handled by SGX driver.
>

Indeed, it is required user space invoke mmap to config the regions for  
EAUG.

> Architecturally, for SGX2, if you mmap() the entire enclave range (as in  
> step
> 2), you don't need explicitly mmap() all dynamic ranges.  The userspace  
> should
> just be able to access those dynamic ranges (i.e. EACCEPT) and the  
> kernel driver
> should gracefully handle the fault.
>
> Shouldn't this be a problem?
>
> (Btw, there might be other corner cases that could cause VMA  
> splitting/merging,
> etc, but I haven't thought about those)
>
>> The concern Kai raised about encl->base not available in the window
>> between mmap and ECREATE can be addressed by disallowing mmap before
>> ECREATE is done. It does not make much sense anyway to mmap(enclave_fd)
>> without a valid enclave range associated with enclave_fd.
>>
>
> We can, but I don't understand why the first mmap() before ECREATE  
> must/should
> be done via MMAP_ANONYMOUS.  In fact, I think it might be wrong for SGX2.
>

There is a long history behind it. The use of anonymous mapping can be  
traced back to v29.
See this thread[2].

Before v29, user space can do mmap(PROT_NONE, ..., enclave_fd) before  
ECREATE to reserve a range for the enclave. But it must be PROT_NONE for  
the reservation, otherwise sgx_encl_may_map will block it by the "!page"  
check below [3]:

int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
		     unsigned long end, unsigned long vm_prot_bits)
{
	unsigned long idx, idx_start, idx_end;
	struct sgx_encl_page *page;

	/* PROT_NONE always succeeds. */
	if (!vm_prot_bits)
		return 0;

	idx_start = PFN_DOWN(start);
	idx_end = PFN_DOWN(end - 1);
	for (idx = idx_start; idx <= idx_end; ++idx) {
		mutex_lock(&encl->lock);
		page = radix_tree_lookup(&encl->page_tree, idx);
		mutex_unlock(&encl->lock);

		if (!page || (~page->vm_prot_bits & vm_prot_bits))
			return -EACCES;
...

Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages  
EADDed so user space has to mmap anonymously to reserve the range.The  
intent was still not to allow mmap before pages EADDed (the !page check  
was still there up to v38)

Later version (around v39?) we switched to enforce the permissions in PF  
handler and mmap with any permissions before EADD is allowed, but may  
cause failure later on PF. I'm not sure it was intentional but pretty sure  
no meaningful usage for doing mmap before ECREATE due to PF handler  
enforcement.

I think that's the history behind it. Others more knowledgeable can  
correct me as needed.

Again, even if we redesign the whole thing to be less "weird", then  
requiring pgoff for each mmap would break existing user space code. And I  
think explicitly block mmap before ECREATE make the API more consistent  
with the PF handler enforcement and original intent of sgx_encl_may_map.

Thanks again for detailed review and those thoughtful questions.

Haitao

[1]https://lore.kernel.org/linux-sgx/CALCETrXf8mSK45h7sTK5Wf+pXLVn=Bjsc_RLpgO-h-qdzBRo5Q@mail.gmail.com/
[2]https://lore.kernel.org/linux-sgx/CAOASepPFe_ucuwe7JW_-+VBQ4=+sHqyGXOdA9kUbcYA_9=v0sA@mail.gmail.com/
[3]https://lore.kernel.org/linux-sgx/20190713170804.2340-17-jarkko.sakkinen@linux.intel.com/
Huang, Kai March 9, 2023, 11:31 a.m. UTC | #20
On Wed, 2023-03-08 at 18:50 -0600, Haitao Huang wrote:
> Hi Kai
> 
> Thanks for your detailed review.
> 
> On Tue, 07 Mar 2023 17:32:15 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> 
> > On Tue, 2023-02-21 at 19:37 -0600, Haitao Huang wrote:
> > > On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org  
> > > <jarkko@kernel.org>
> > > wrote:
> > > 
> > > > On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote:
> > > > > Hi Jarkko
> > > > > 
> > > > > On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org
> > > > > <jarkko@kernel.org>
> > > > > wrote:
> > > > > 
> > > > > > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote:
> > > > > > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai  
> > > <kai.huang@intel.com>
> > > > > > > wrote:
> > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > Since sgx_mmap() can happen before enclave is created,
> > > > > > > calculating the
> > > > > > > > > > vm_pgoff
> > > > > > > > > > from enclave_base is conceptually wrong.  Even if you  
> > > really
> > > > > want
> > > > > > > > > to do
> > > > > > > > > > it, it
> > > > > > > > > > should be:
> > > > > > > > > > 
> > > > > > > > > > 	if (enclave_has_initialized())
> > > > > > > > > > 		vma->vm_pgoff = ...;
> > > > > > > > > 
> > > > > > > > > I got your point now. I can add a condition to test the
> > > > > > > SGX_ENCL_CREATED
> > > > > > > > > bit. However, we still have a hole if we must handle the
> > > > > sequence
> > > > > > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We
> > > > > > > can't leave
> > > > > > > > > vm_pgoff not set for those cases.
> > > > > > > > > 
> > > > > > > > > Since no one does that so far, can we explicitly return an  
> > > error
> > > > > > > from
> > > > > > > > > sgx_mmap when that happens?
> > > > > > > > > Other suggestions?
> > > > > > > > 
> > > > > > > > As I replied to patch 4/4, I believe userspace should pass the
> > > > > correct
> > > > > > > > pgoff in
> > > > > > > > mmap().  It's wrong to always pass 0 or any random value.
> > > > > > > > 
> > > > > > > > If userspace follow the mmap() rule, you won't need to  
> > > manually
> > > > > set
> > > > > > > > vm_pgoff
> > > > > > > > here (which is hacky IMHO).  Everything works fine.
> > > > > > > > 
> > > > > > > 
> > > > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change  
> > > that,
> > > > > > > it'd
> > > > > > > break current usage/ABI.
> > > > > > > 
> > > > > > > I still think returning error for cases mmap(..., enclave_fd) if
> > > > > > > enclave is
> > > > > > > not created would be less intrusive change.
> > > > > > 
> > > > > > Is this something you care in SGX SDK?
> > > > > > 
> > > > > > It is not categorically forbidden so that's why I'm asking.
> > > > > 
> > > > > SDK does not care as we would never do this and don't think anyone  
> > > is
> > > > > doing
> > > > > that either. Suggesting returning error to cover all cases so user  
> > > space
> > > > > would not accidentally cause incorrect vm_pgoff set.
> > > > 
> > > > vm_pgoff != 0 should result -EINVAL.
> > > > 
> > > 
> > > Do you mean 'offset' passed in from mmap syscall and converted to pgoff  
> > > in
> > > kernel? I can see it only passed to driver in get_unmapped_area. I can  
> > > add
> > > this enforcement there so it is consistent to the MAP_ANONYMOUS spec if
> > > that's what you suggest.
> > > 
> > >   vma->vm_pgoff is the offset in pages of the vma->vm_start relative to
> > > 'file'.
> > >   It can not be zero for all VMAs of the enclave.
> > > 
> > > Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon
> > > VMAs, and ignore it (zero) for shared anon mapping.
> > > 
> > > For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base)  
> > > upon
> > > mmap.
> > 
> > Sorry for late reply.  Basically I was sick and having limited working  
> > time in
> > the passed two weeks.
> > 
> > For what I understand now, SGX driver wants to use MAP_ANONYMOUS  
> > semantic for
> > mmap()s against /dev/sgx_enclave (for whatever reason that I don't  
> > understand).
> > And because of that, we even want to explicitly enforce userspace to  
> > always pass
> > 0 as pgoff in mmap()s (hasn't been done yet).
> > 
> > And then here, we want to rewrite vma->pgoff so that the VMA can act  
> > _like_ a
> > normal file-based mmap() semantics (despite it is indeed a file-based  
> > VMA),
> > because the VFS fadvice() implementation assumes file-based VMAs are  
> > always
> > following file-based mmap() semantics.
> > 
> > Hmm..
> > 
> > Doesn't this sound weird?
> > 
> 
> Sometime weird means good or creative as expressed in a popular slogan in  
> the city where I live. Just joking:-)
> 
> > I must have been missing something, but why cannot SGX driver just  
> > always follow
> > file-based mmap() semantics at the beginning?
> > 
> > For instance, what's wrong with:
> > 
> > 	1) encl_fd = open("/dev/sgx_enclave")
> > 	2) encl_base = mmap(..., encl_size, MAP_SHARED, encl_fd, 0 /* pgoff */)
> > 	3) ioctl(ECREATE, encl_base, encl_size)
> > 
> > (In ioctl(ECREATE), we might want to verify the VMA we found has the same
> > encl_base as starting address, and 0 pgoff).
> > 
> > And in following mmap()s in which we want to map a small range of  
> > enclave:
> > 
> > 	encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd,
> > 			(encl_addr - encl_base) >> PAGE_SHIFT);
> > 
> > ?
> > 
> > Anything wrong above?
> > 
> 
> I believe this can be done except for that you are breaking existing code  
> by requiring pg_off for all mmaps now.
> 
> Doing mmap before ECREATE also effectively bypasses sgx_encl_may_map. And  
> we have a very long thread of discussion about this[1] and that's why  
> sgx_encl_may_map come into play to limit VMA permissions to those given in  
> SecInfo by EADD ioctl. However, current code does not explicitly block it  
> and I think it'd be better do it.  More comment below on this point.

Thanks for the info!
> 
> > In fact, if the first mmap() in step 2) before IOCTL(ecreate) used
> > MAP_ANONYMOUS, IIUC that VMA will stay as anonymous VMA but will never be
> > converted to SGX file-based one, because by looking at the code, I see  
> > neither
> > ioctl(ECREATE) nor ioctl(EINIT) converts that VMA to file-based one.
> > 
> > That being said, all page faults caused by userspace access to that VMA  
> > will
> > still go to anonymous VMA's fault path, but not SGX driver's.
> > 
> 
> It is required to mmap anonymously with PROT_NONE initially and
> user space must invoke mmap with enclave fd after EADDing each  
> segments/areas. PF handler will check VMA prot bits against  
> vm_max_prot_bits in sgx_encl_load_page_in_vma, and report failure on any  
> mismatch.
> If user space does not mmap(...,encl_fd) after EADD, pages will have  
> PROT_NONE and fail on any access.
> 
> > This is fine for SGX1 as all enclave pages are fully populated.  For  
> > SGX2,
> > _unless_ you explicitly mmap("/dev/sgx_enclave") those dynamical ranges,  
> > the
> > fault will never be handled by SGX driver.
> > 
> 
> Indeed, it is required user space invoke mmap to config the regions for  
> EAUG.
> 
> > Architecturally, for SGX2, if you mmap() the entire enclave range (as in  
> > step
> > 2), you don't need explicitly mmap() all dynamic ranges.  The userspace  
> > should
> > just be able to access those dynamic ranges (i.e. EACCEPT) and the  
> > kernel driver
> > should gracefully handle the fault.
> > 
> > Shouldn't this be a problem?
> > 
> > (Btw, there might be other corner cases that could cause VMA  
> > splitting/merging,
> > etc, but I haven't thought about those)
> > 
> > > The concern Kai raised about encl->base not available in the window
> > > between mmap and ECREATE can be addressed by disallowing mmap before
> > > ECREATE is done. It does not make much sense anyway to mmap(enclave_fd)
> > > without a valid enclave range associated with enclave_fd.
> > > 
> > 
> > We can, but I don't understand why the first mmap() before ECREATE  
> > must/should
> > be done via MMAP_ANONYMOUS.  In fact, I think it might be wrong for SGX2.
> > 
> 
> There is a long history behind it. The use of anonymous mapping can be  
> traced back to v29.
> See this thread[2].
> 
> Before v29, user space can do mmap(PROT_NONE, ..., enclave_fd) before  
> ECREATE to reserve a range for the enclave. But it must be PROT_NONE for  
> the reservation, otherwise sgx_encl_may_map will block it by the "!page"  
> check below [3]:
> 
> int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
> 		     unsigned long end, unsigned long vm_prot_bits)
> {
> 	unsigned long idx, idx_start, idx_end;
> 	struct sgx_encl_page *page;
> 
> 	/* PROT_NONE always succeeds. */
> 	if (!vm_prot_bits)
> 		return 0;
> 
> 	idx_start = PFN_DOWN(start);
> 	idx_end = PFN_DOWN(end - 1);
> 	for (idx = idx_start; idx <= idx_end; ++idx) {
> 		mutex_lock(&encl->lock);
> 		page = radix_tree_lookup(&encl->page_tree, idx);
> 		mutex_unlock(&encl->lock);
> 
> 		if (!page || (~page->vm_prot_bits & vm_prot_bits))
> 			return -EACCES;
> ...
> 
> Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages  
> EADDed so user space has to mmap anonymously to reserve the range.The  
> intent was still not to allow mmap before pages EADDed (the !page check  
> was still there up to v38)

Do you know the reason of disallowing PROT_NONE mapping against encl_fd?

I dig v28 roughly but didn't find any clue.

Basically no comments related to this were made to:

	[PATCH v28 11/22] x86/sgx: Linux Enclave Driver

> 
> Later version (around v39?) we switched to enforce the permissions in PF  
> handler and mmap with any permissions before EADD is allowed, but may  
> cause failure later on PF. I'm not sure it was intentional but pretty sure  
> no meaningful usage for doing mmap before ECREATE due to PF handler  
> enforcement.

IIUC this change around v39 re-allows mmap() against encl_fd before ECREATE?

IIUC the only enforcement is VMA's permission must be more restricted than
enclave page's permission.

So, theoretically, we can mmap() encl_fd with PROT_READ|PROT_WRITE|PROT_EXEC,
and then mprotect() the address range based on the actual enclave pages (in
EADD) before actually doing EADD those pages? 

> 
> I think that's the history behind it. Others more knowledgeable can  
> correct me as needed.

Thanks for the information.

> 
> Again, even if we redesign the whole thing to be less "weird", then  
> requiring pgoff for each mmap would break existing user space code. And I  
> think explicitly block mmap before ECREATE make the API more consistent  
> with the PF handler enforcement and original intent of sgx_encl_may_map.

I think permission check in sgx_encl_may_map() isn't restricted related to the
vma->pgoff issue here.  They are basically two different topics, IIUC.

So I am still a little bit confused about where does "SGX driver uses
MAP_ANONYMOUS semantics for fd-based mmap()" come from.

Anyway, we certainly don't want to break userspace.  However, IIUC, even from
now on we change the driver to depend on userspace to pass the correct pgoff in
mmap(), this won't break userspace, because old userspace which doesn't use
fadvice() and pgoff actually doesn't matter.  For new userspace which uses
fadvice(), it needs to pass the correct pgoff.

I am not saying we should do this, but it doesn't seem we can break userspace?

> 
> Thanks again for detailed review and those thoughtful questions.
> 
> Haitao
> 
> [1]https://lore.kernel.org/linux-sgx/CALCETrXf8mSK45h7sTK5Wf+pXLVn=Bjsc_RLpgO-h-qdzBRo5Q@mail.gmail.com/
> [2]https://lore.kernel.org/linux-sgx/CAOASepPFe_ucuwe7JW_-+VBQ4=+sHqyGXOdA9kUbcYA_9=v0sA@mail.gmail.com/
> [3]https://lore.kernel.org/linux-sgx/20190713170804.2340-17-jarkko.sakkinen@linux.intel.com/

Thanks again for digging out those links.  I haven't read the entire discussion,
though, as they are pretty long.
Jarkko Sakkinen March 12, 2023, 1:25 a.m. UTC | #21
On Tue, Mar 07, 2023 at 11:32:15PM +0000, Huang, Kai wrote:
> And in following mmap()s in which we want to map a small range of enclave:
> 
> 	encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd,
> 			(encl_addr - encl_base) >> PAGE_SHIFT);
> 
> ?
> 
> Anything wrong above?

I'm not sure I fully comprehended your response because it was
honestly a bit scattered so please correct me if I'm missing
something but: why a process would want to map a small range
of an enclave?

You anyway most likely want to enter to the enclave. Is this
for ptrace()?

BR, Jarkko
Huang, Kai March 12, 2023, 10:25 p.m. UTC | #22
On Sun, 2023-03-12 at 03:25 +0200, jarkko@kernel.org wrote:
> On Tue, Mar 07, 2023 at 11:32:15PM +0000, Huang, Kai wrote:
> > And in following mmap()s in which we want to map a small range of enclave:
> > 
> > 	encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd,
> > 			(encl_addr - encl_base) >> PAGE_SHIFT);
> > 
> > ?
> > 
> > Anything wrong above?
> 
> I'm not sure I fully comprehended your response because it was
> honestly a bit scattered so please correct me if I'm missing
> something but: why a process would want to map a small range
> of an enclave?

For SGX2, if mmap(MAP_ANONYMOUS) was used to get the enclave base address before
ECREATE, you will need to mmap(encl_fd) for any regions that are not pre-
populated (via EADD), no matter whether the first mmap(MAP_ANONYMOUS) has
covered all enclave range or not.  Otherwise, the fault to SGX2 regions won't be
handled by SGX driver.

You can find such mmap() in patch 4 in this series too.

Also, architecturally, for SGX2 the first mmap() (before ECREATE) doesn't have
to map the entire enclave range.  For SGX2 it's fine to pass a larger enclave
range in ioctl(ECREATE) than the range got from the first mmap().  Userspace can
later on choose to only mmap() the dynamic ranges that it wants to use.
Haitao Huang March 14, 2023, 2:54 p.m. UTC | #23
On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
>>
>> Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages
>> EADDed so user space has to mmap anonymously to reserve the range.The
>> intent was still not to allow mmap before pages EADDed (the !page check
>> was still there up to v38)
>
> Do you know the reason of disallowing PROT_NONE mapping against encl_fd?
>

I think it was to allow user space to do anonymous mapping to reserve  
address space for enclave.
Before this point, you have to use PROT_NONE to reserve with encl_fd.  
There might be an issue with how #PF and EPC swapping was handled or the  
elegance of those flows that motivated the move but I can't remember. ABI  
was not fixed at that time so it was OK to change.

> I dig v28 roughly but didn't find any clue.
>
> Basically no comments related to this were made to:
>
> 	[PATCH v28 11/22] x86/sgx: Linux Enclave Driver
>
>>
>> Later version (around v39?) we switched to enforce the permissions in PF
>> handler and mmap with any permissions before EADD is allowed, but may
>> cause failure later on PF. I'm not sure it was intentional but pretty  
>> sure
>> no meaningful usage for doing mmap before ECREATE due to PF handler
>> enforcement.
>
> IIUC this change around v39 re-allows mmap() against encl_fd before  
> ECREATE?
>

I don't think so. There is no meaningful usage for that as I said.

> IIUC the only enforcement is VMA's permission must be more restricted  
> than
> enclave page's permission.
>
> So, theoretically, we can mmap() encl_fd with  
> PROT_READ|PROT_WRITE|PROT_EXEC,
> and then mprotect() the address range based on the actual enclave pages  
> (in
> EADD) before actually doing EADD those pages?
>
>>
>> I think that's the history behind it. Others more knowledgeable can
>> correct me as needed.
>
> Thanks for the information.
>
>>
>> Again, even if we redesign the whole thing to be less "weird", then
>> requiring pgoff for each mmap would break existing user space code. And  
>> I
>> think explicitly block mmap before ECREATE make the API more consistent
>> with the PF handler enforcement and original intent of sgx_encl_may_map.
>
> I think permission check in sgx_encl_may_map() isn't restricted related  
> to the
> vma->pgoff issue here.  They are basically two different topics, IIUC.
>

They are related because the intention was not allowing user space to do  
arbitrary mmaping before EADD.
So in the context of the gap you identified for this patch series, i.e,  
mmap before ECREATE, it is relevant. My view is that we never intended and  
there is no use case for allowing that to happen. So it naturally follows  
that we fix that gap by not allowing those scenarios explicitly.

> So I am still a little bit confused about where does "SGX driver uses
> MAP_ANONYMOUS semantics for fd-based mmap()" come from.
>
> Anyway, we certainly don't want to break userspace.  However, IIUC, even  
> from
> now on we change the driver to depend on userspace to pass the correct  
> pgoff in
> mmap(), this won't break userspace, because old userspace which doesn't  
> use
> fadvice() and pgoff actually doesn't matter.  For new userspace which  
> uses
> fadvice(), it needs to pass the correct pgoff.
>
> I am not saying we should do this, but it doesn't seem we can break  
> userspace?
>

I'll leave it for others to chime in on this.
With my user space developer hat on, I would not like this because in one  
place (mmap before madvise call) I have to pass pgoff and not in another  
place (mmap after EADD). If you see really big issues on the current  
MAP_ANONYMOUS + enclave_fd semantics, then we should fix the design. But  
it should be in separate patches.

Thanks
Haitao
Jarkko Sakkinen March 19, 2023, 1:26 p.m. UTC | #24
On Tue, Mar 14, 2023 at 09:54:30AM -0500, Haitao Huang wrote:
> On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > > 
> > > Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages
> > > EADDed so user space has to mmap anonymously to reserve the range.The
> > > intent was still not to allow mmap before pages EADDed (the !page check
> > > was still there up to v38)
> > 
> > Do you know the reason of disallowing PROT_NONE mapping against encl_fd?
> > 
> 
> I think it was to allow user space to do anonymous mapping to reserve
> address space for enclave.
> Before this point, you have to use PROT_NONE to reserve with encl_fd. There
> might be an issue with how #PF and EPC swapping was handled or the elegance
> of those flows that motivated the move but I can't remember. ABI was not
> fixed at that time so it was OK to change.

Yes, this was done because enclave naturally aligned base address.

BR, Jarkko
Huang, Kai March 20, 2023, 9:36 a.m. UTC | #25
On Sun, 2023-03-19 at 15:26 +0200, jarkko@kernel.org wrote:
> On Tue, Mar 14, 2023 at 09:54:30AM -0500, Haitao Huang wrote:
> > On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > > > 
> > > > Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages
> > > > EADDed so user space has to mmap anonymously to reserve the range.The
> > > > intent was still not to allow mmap before pages EADDed (the !page check
> > > > was still there up to v38)
> > > 
> > > Do you know the reason of disallowing PROT_NONE mapping against encl_fd?
> > > 
> > 
> > I think it was to allow user space to do anonymous mapping to reserve
> > address space for enclave.
> > Before this point, you have to use PROT_NONE to reserve with encl_fd. There
> > might be an issue with how #PF and EPC swapping was handled or the elegance
> > of those flows that motivated the move but I can't remember. ABI was not
> > fixed at that time so it was OK to change.
> 
> Yes, this was done because enclave naturally aligned base address.
> 
> 

Sorry for being ignorant, but you can also get the aligned base address via
mmap() against encl_fd, correct?  What is the fundamental difference between
mmap(MAP_ANONYMOUS) vs mmap(encl_fd)?
Jarkko Sakkinen March 20, 2023, 2:04 p.m. UTC | #26
On Mon, Mar 20, 2023 at 09:36:14AM +0000, Huang, Kai wrote:
> On Sun, 2023-03-19 at 15:26 +0200, jarkko@kernel.org wrote:
> > On Tue, Mar 14, 2023 at 09:54:30AM -0500, Haitao Huang wrote:
> > > On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > > > > 
> > > > > Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages
> > > > > EADDed so user space has to mmap anonymously to reserve the range.The
> > > > > intent was still not to allow mmap before pages EADDed (the !page check
> > > > > was still there up to v38)
> > > > 
> > > > Do you know the reason of disallowing PROT_NONE mapping against encl_fd?
> > > > 
> > > 
> > > I think it was to allow user space to do anonymous mapping to reserve
> > > address space for enclave.
> > > Before this point, you have to use PROT_NONE to reserve with encl_fd. There
> > > might be an issue with how #PF and EPC swapping was handled or the elegance
> > > of those flows that motivated the move but I can't remember. ABI was not
> > > fixed at that time so it was OK to change.
> > 
> > Yes, this was done because enclave naturally aligned base address.
> > 
> > 
> 
> Sorry for being ignorant, but you can also get the aligned base address via
> mmap() against encl_fd, correct?  What is the fundamental difference between
> mmap(MAP_ANONYMOUS) vs mmap(encl_fd)?

This is what we do atm:

static unsigned long sgx_get_unmapped_area(struct file *file,
                                           unsigned long addr,
                                           unsigned long len,
                                           unsigned long pgoff,
                                           unsigned long flags)
{
        if ((flags & MAP_TYPE) == MAP_PRIVATE)
                return -EINVAL;

        if (flags & MAP_FIXED)
                return addr;

        return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}

BR, Jarkko
Haitao Huang May 27, 2023, 12:32 a.m. UTC | #27
Hi Kai, Jarkko and Dave

On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
>
> So I am still a little bit confused about where does "SGX driver uses
> MAP_ANONYMOUS semantics for fd-based mmap()" come from.
>
> Anyway, we certainly don't want to break userspace.  However, IIUC, even  
> from
> now on we change the driver to depend on userspace to pass the correct  
> pgoff in
> mmap(), this won't break userspace, because old userspace which doesn't  
> use
> fadvice() and pgoff actually doesn't matter.  For new userspace which  
> uses
> fadvice(), it needs to pass the correct pgoff.
>
> I am not saying we should do this, but it doesn't seem we can break  
> userspace?
>

Sorry for delayed update but I thought about this more and likely to  
propose a new EAUG ioctl for this and for enabling SGX-CET shadow stack  
pages. But regardless, I'd like to wrap up this discussion to just clarify  
this anonymous semantics design in documentation so people won't get  
confused in future.

I think we all agree to keep this semantics so no user space would need  
specify 'offset' for mmap with enclave fd. And here is my proposed  
documentation changes.

--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -100,6 +100,23 @@ pages and establish enclave page permissions.
                 sgx_ioc_enclave_init
                 sgx_ioc_enclave_provision

+Enclave memory mapping
+----------------------
+
+A file descriptor created from opening **/dev/sgx_enclave** represents an
+enclave object. The mmap() syscall with enclave file descriptors does not
+support non-zero value for the 'offset' parameter.
+
+Rational:
+
+Enclave mapping is very similar to anonymous mapping in that it maps  
physical
+EPC pages to virtual addresses and the physical pages need not be  
contiguous. And
+the content of each enclave page must be loaded at an expected offset  
relative
+to SECS.BASEADDR as is reflected in measurements in its SIGSTRUCT.  
Otherwise
+EINIT would fail to verify the measurements and initialize the enclave.  
This is
+unlike regular file mapping in that no content offset can be defined that  
is
+independent from the virtual address it is loaded to.
+
  Enclave runtime management
  --------------------------

Let me know your thoughts and I can submit the patch if you think this is  
the direction to go.

Thanks
Haitao
Huang, Kai June 6, 2023, 4:11 a.m. UTC | #28
On Fri, 2023-05-26 at 19:32 -0500, Haitao Huang wrote:
> Hi Kai, Jarkko and Dave
> 
> On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > 
> > So I am still a little bit confused about where does "SGX driver uses
> > MAP_ANONYMOUS semantics for fd-based mmap()" come from.
> > 
> > Anyway, we certainly don't want to break userspace.  However, IIUC, even  
> > from
> > now on we change the driver to depend on userspace to pass the correct  
> > pgoff in
> > mmap(), this won't break userspace, because old userspace which doesn't  
> > use
> > fadvice() and pgoff actually doesn't matter.  For new userspace which  
> > uses
> > fadvice(), it needs to pass the correct pgoff.
> > 
> > I am not saying we should do this, but it doesn't seem we can break  
> > userspace?
> > 
> 
> Sorry for delayed update but I thought about this more and likely to  
> propose a new EAUG ioctl for this and for enabling SGX-CET shadow stack  
> pages. But regardless, I'd like to wrap up this discussion to just clarify  
> this anonymous semantics design in documentation so people won't get  
> confused in future.
> 
> I think we all agree to keep this semantics so no user space would need  
> specify 'offset' for mmap with enclave fd. And here is my proposed  
> documentation changes.
> 
> --- a/Documentation/x86/sgx.rst
> +++ b/Documentation/x86/sgx.rst
> @@ -100,6 +100,23 @@ pages and establish enclave page permissions.
>                  sgx_ioc_enclave_init
>                  sgx_ioc_enclave_provision
> 
> +Enclave memory mapping
> +----------------------
> +
> +A file descriptor created from opening **/dev/sgx_enclave** represents an
> +enclave object. The mmap() syscall with enclave file descriptors does not
> +support non-zero value for the 'offset' parameter.

I think we all need to understand better why SGX driver requires anonymous
semantics mmap() against /dev/sgx_enclave, and as a result of that, requires
mmap() to pass  0 as pgoff (which looks wasn't even discussed when upstreaming
the driver).

I'll do some investigation and try to summerize and report back.  Thanks.

[...]

> This is
> +unlike regular file mapping in that no content offset can be defined that  
> is
> +independent from the virtual address it is loaded to.
> +
> 

Don't quite understand this.  The virtual address of a regular file mapping can
be linked to file's offest from VMA's pgoff.
Haitao Huang June 7, 2023, 4:59 p.m. UTC | #29
On Mon, 05 Jun 2023 23:11:59 -0500, Huang, Kai <kai.huang@intel.com> wrote:

> On Fri, 2023-05-26 at 19:32 -0500, Haitao Huang wrote:
>> Hi Kai, Jarkko and Dave
>>
>> On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com>  
>> wrote:
>> >
>> > So I am still a little bit confused about where does "SGX driver uses
>> > MAP_ANONYMOUS semantics for fd-based mmap()" come from.
>> >
>> > Anyway, we certainly don't want to break userspace.  However, IIUC,  
>> even
>> > from
>> > now on we change the driver to depend on userspace to pass the correct
>> > pgoff in
>> > mmap(), this won't break userspace, because old userspace which  
>> doesn't
>> > use
>> > fadvice() and pgoff actually doesn't matter.  For new userspace which
>> > uses
>> > fadvice(), it needs to pass the correct pgoff.
>> >
>> > I am not saying we should do this, but it doesn't seem we can break
>> > userspace?
>> >
>>
>> Sorry for delayed update but I thought about this more and likely to
>> propose a new EAUG ioctl for this and for enabling SGX-CET shadow stack
>> pages. But regardless, I'd like to wrap up this discussion to just  
>> clarify
>> this anonymous semantics design in documentation so people won't get
>> confused in future.
>>
>> I think we all agree to keep this semantics so no user space would need
>> specify 'offset' for mmap with enclave fd. And here is my proposed
>> documentation changes.
>>
>> --- a/Documentation/x86/sgx.rst
>> +++ b/Documentation/x86/sgx.rst
>> @@ -100,6 +100,23 @@ pages and establish enclave page permissions.
>>                  sgx_ioc_enclave_init
>>                  sgx_ioc_enclave_provision
>>
>> +Enclave memory mapping
>> +----------------------
>> +
>> +A file descriptor created from opening **/dev/sgx_enclave** represents  
>> an
>> +enclave object. The mmap() syscall with enclave file descriptors does  
>> not
>> +support non-zero value for the 'offset' parameter.
>
> I think we all need to understand better why SGX driver requires  
> anonymous
> semantics mmap() against /dev/sgx_enclave, and as a result of that,  
> requires
> mmap() to pass  0 as pgoff (which looks wasn't even discussed when  
> upstreaming
> the driver).
>
> I'll do some investigation and try to summerize and report back.  Thanks.
>
> [...]
>
>> This is
>> +unlike regular file mapping in that no content offset can be defined  
>> that
>> is
>> +independent from the virtual address it is loaded to.
>> +
>>
>
> Don't quite understand this.  The virtual address of a regular file  
> mapping can
> be linked to file's offest from VMA's pgoff.
>

For file mapping, the offset is the 'content offset' relative to the  
beginning of the file content. The file 'content' is independent from the  
memory it is mapped to.

mmap(..., encl_fd, ...) just creates VMAs as windows/views into the  
enclave memory whose range is already defined by [encl->base,  
encl->base+encl->size) when ECREATE is done. The 'content' of enclave and  
the memory to which the 'content' is mapped are the same. Hence, no  
independent  'content offset' can be defined from user point of view.

 From implementation point of view:

In regular file mapping, vma->vm_pgoff has nothing to do with  
vma->vm_start (or 'addr' passed by mmap). It is used to load bytes at  
pg_offset in the 'content' referenced by vma->vm_file, which is backed up  
by a real file or object that contains the bytes.

In enclave mapping, vma->vm_file is the '/dev/sgx_enclave' device node,  
and it does not refer to any content. It does not make sense to have  
'offset' into '/dev/sgx_enclave'. vma->pg_offset is meaningless for  
enclave mapping.

Thanks
Haitao
Huang, Kai June 16, 2023, 3:49 a.m. UTC | #30
On Tue, 2023-06-06 at 04:11 +0000, Huang, Kai wrote:
> On Fri, 2023-05-26 at 19:32 -0500, Haitao Huang wrote:
> > Hi Kai, Jarkko and Dave
> > 
> > On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > > 
> > > So I am still a little bit confused about where does "SGX driver uses
> > > MAP_ANONYMOUS semantics for fd-based mmap()" come from.
> > > 
> > > Anyway, we certainly don't want to break userspace.  However, IIUC, even  
> > > from
> > > now on we change the driver to depend on userspace to pass the correct  
> > > pgoff in
> > > mmap(), this won't break userspace, because old userspace which doesn't  
> > > use
> > > fadvice() and pgoff actually doesn't matter.  For new userspace which  
> > > uses
> > > fadvice(), it needs to pass the correct pgoff.
> > > 
> > > I am not saying we should do this, but it doesn't seem we can break  
> > > userspace?
> > > 
> > 
> > Sorry for delayed update but I thought about this more and likely to  
> > propose a new EAUG ioctl for this and for enabling SGX-CET shadow stack  
> > pages. But regardless, I'd like to wrap up this discussion to just clarify  
> > this anonymous semantics design in documentation so people won't get  
> > confused in future.
> > 
> > I think we all agree to keep this semantics so no user space would need  
> > specify 'offset' for mmap with enclave fd. And here is my proposed  
> > documentation changes.
> > 
> > --- a/Documentation/x86/sgx.rst
> > +++ b/Documentation/x86/sgx.rst
> > @@ -100,6 +100,23 @@ pages and establish enclave page permissions.
> >                  sgx_ioc_enclave_init
> >                  sgx_ioc_enclave_provision
> > 
> > +Enclave memory mapping
> > +----------------------
> > +
> > +A file descriptor created from opening **/dev/sgx_enclave** represents an
> > +enclave object. The mmap() syscall with enclave file descriptors does not
> > +support non-zero value for the 'offset' parameter.
> 
> I think we all need to understand better why SGX driver requires anonymous
> semantics mmap() against /dev/sgx_enclave, and as a result of that, requires
> mmap() to pass  0 as pgoff (which looks wasn't even discussed when upstreaming
> the driver).
> 
> I'll do some investigation and try to summerize and report back.  Thanks.
> 

+ Sean.

Hi Sean,

If you see this and have time, please help to comment.  Thanks.

I've spent plenty of time to look into the discussions around v20/v28/v29 and
roughly v38/v39 to find out why SGX driver requires MAP_ANONYMOUS semantics, and
AFAICT it turns out it was never explicitly discussed.  Or perhaps the
"MAP_ANONYMOUS semantics" actually just means "MAP_SHARED | MAP_FIXED + pgoff is
ignored", and everyone believed there was no need to explain what does "SGX
driver uses MAP_ANONYMOUS semantics for mmap()" mean.

Details:

The v20 story (that I spent most of my time on) mentioned by Haitao was actually
about how to make SGX and LSM work together but not related to SGX driver mmap()
semantic. 

Also Haitao mentioned "the use of anonymous mapping can be traced back to v29"
but this actually was just about how to use the first mmap() to "reserve the
ELRANGE before ECREATE".  It wasn't about to changing mmap(/dev/sgx_enclave)
semantics at all.

Sean actually suggested to explicitly document "how does SGX driver recommend
the user to reserve ELRANGE", but Jarkko didn't think we should do:

https://lore.kernel.org/linux-sgx/20200528111910.GB1666298@linux.intel.com/

which is a pity IMHO, because I believe for anyone, naturally, the first
instinct to reserve ELRANGE is to use mmap(/dev/sgx_enclave) but not
mmap(MAP_ANONYMOUS).  If we suggest user to use the latter then there must be
some reason and IMHO such suggestion and reason should be documented.

Also, if I am not missing something, the current driver doesn't prevent using
mmap(/dev/sgx_enclave, PROT_NONE) to reserve ELANGE.  So having clear
documentation is helpful for SGX users to choose how to write their apps.

Go back to the "SGX driver uses MAP_ANONYMOUS semantics for mmap()", I believe
this just is "SGX driver requires mmap() after ECREATE/EINIT to use MAP_SHARED |
MAP_FIXED and pgoff is ignored".  Or more precisely, pgoff is "not _used_ by SGX
driver".

In fact, I think "pgoff is ignored/not used" is technically wrong for enclave.

Pgoff is ignored in case of MAP_SHARED | MAP_ANONYMOUS makes sense, because you
get a new shmem file everytime you do so.  But this isn't the case for enclave.
For all mmap()s against the same enclave, pgoff has a valid meaning.  SGX driver
doesn't use vma->pgoff thus it's OK to not have valid vma->pgoff but this
confuses the core-MM, because now we can easily end up having multiple VMAs
mapping to different part of enclave, but core-MM believes they all map to the
start of the enclave.

For instance, have we tested all corner cases around VMA splitting/merging, etc?

To conculde:

IMHO we should stop saying SGX driver uses MAP_ANONYMOUS semantics, because the
truth is it just takes advantage of MAP_FIXED and carelessly ignores the pgoff
due to the nature of SGX w/o considering from core-MM's perspective.
  
And IMHO there are two ways to fix:

1) From now on, we ask SGX apps to use the correct pgoff in their
mmap(/dev/sgx_enclave).  This shouldn't impact the existing SGX apps because SGX
driver doesn't use vma->pgoff anyway.

2) For the sake of avoiding having to ask existing SGX apps to change their
mmap()s, we _officially_ say that userspace isn't required to pass a correct
pgoff to mmap() (i.e. passing 0 as did in existing apps), but the kernel should
fix the vma->pgoff internally.

I do prefer option 2) because it has no harm to anyone: 1) No changes to
existing SGX apps; 2) It aligns with the core-MM to so all existing mmap()
operations should work as expected, meaning no surprise; 3) And this patchset
from Haitao to use fadvice() to accelerate EAUG flow just works.

And I believe we should document all those staffs so everyone can understand.
Sean Christopherson June 16, 2023, 10:05 p.m. UTC | #31
On Fri, Jun 16, 2023, Kai Huang wrote:
> On Tue, 2023-06-06 at 04:11 +0000, Huang, Kai wrote:
> > On Fri, 2023-05-26 at 19:32 -0500, Haitao Huang wrote:
> > > Hi Kai, Jarkko and Dave
> > > 
> > > On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > > > 
> > > > So I am still a little bit confused about where does "SGX driver uses
> > > > MAP_ANONYMOUS semantics for fd-based mmap()" come from.
> > > > 
> > > > Anyway, we certainly don't want to break userspace.  However, IIUC,
> > > > even  from now on we change the driver to depend on userspace to pass
> > > > the correct  pgoff in mmap(), this won't break userspace, because old
> > > > userspace which doesn't  use fadvice() and pgoff actually doesn't
> > > > matter.  For new userspace which  uses fadvice(), it needs to pass the
> > > > correct pgoff.
> > > > 
> > > > I am not saying we should do this, but it doesn't seem we can break  
> > > > userspace?
> > > > 
> > > 
> > > Sorry for delayed update but I thought about this more and likely to  
> > > propose a new EAUG ioctl for this and for enabling SGX-CET shadow stack  
> > > pages. But regardless, I'd like to wrap up this discussion to just clarify  
> > > this anonymous semantics design in documentation so people won't get  
> > > confused in future.
> > > 
> > > I think we all agree to keep this semantics so no user space would need  
> > > specify 'offset' for mmap with enclave fd. And here is my proposed  
> > > documentation changes.
> > > 
> > > --- a/Documentation/x86/sgx.rst
> > > +++ b/Documentation/x86/sgx.rst
> > > @@ -100,6 +100,23 @@ pages and establish enclave page permissions.
> > >                  sgx_ioc_enclave_init
> > >                  sgx_ioc_enclave_provision
> > > 
> > > +Enclave memory mapping
> > > +----------------------
> > > +
> > > +A file descriptor created from opening **/dev/sgx_enclave** represents an
> > > +enclave object. The mmap() syscall with enclave file descriptors does not
> > > +support non-zero value for the 'offset' parameter.
> > 
> > I think we all need to understand better why SGX driver requires anonymous
> > semantics mmap() against /dev/sgx_enclave, and as a result of that, requires
> > mmap() to pass  0 as pgoff (which looks wasn't even discussed when upstreaming
> > the driver).
> > 
> > I'll do some investigation and try to summerize and report back.  Thanks.
> > 
> 
> + Sean.
> 
> Hi Sean,
> 
> If you see this and have time, please help to comment.  Thanks.
> 
> I've spent plenty of time to look into the discussions around v20/v28/v29 and
> roughly v38/v39 to find out why SGX driver requires MAP_ANONYMOUS semantics,
> AFAICT it turns out it was never explicitly discussed.  Or perhaps the
> "MAP_ANONYMOUS semantics" actually just means "MAP_SHARED | MAP_FIXED + pgoff is
> ignored", and everyone believed there was no need to explain what does "SGX
> driver uses MAP_ANONYMOUS semantics for mmap()" mean.
> 
> Details:
> 
> The v20 story (that I spent most of my time on) mentioned by Haitao was actually
> about how to make SGX and LSM work together but not related to SGX driver mmap()
> semantic. 
> 
> Also Haitao mentioned "the use of anonymous mapping can be traced back to v29"
> but this actually was just about how to use the first mmap() to "reserve the
> ELRANGE before ECREATE".  It wasn't about to changing mmap(/dev/sgx_enclave)
> semantics at all.
> 
> Sean actually suggested to explicitly document "how does SGX driver recommend
> the user to reserve ELRANGE", but Jarkko didn't think we should do:
> 
> https://lore.kernel.org/linux-sgx/20200528111910.GB1666298@linux.intel.com/
> 
> which is a pity IMHO, because I believe for anyone, naturally, the first
> instinct to reserve ELRANGE is to use mmap(/dev/sgx_enclave) but not
> mmap(MAP_ANONYMOUS).  If we suggest user to use the latter then there must be
> some reason and IMHO such suggestion and reason should be documented.

Ya, the use of mmap() on fd=-1 is done in order to find an available, naturally
aligned chunk of virtual memory[*].  IIRC, there was a (very brief) discussion
about enhancing .mmap() so that userspace wouldn't be responsible for doing the
alignment, but I think we didn't pursue that idea very because we had bigger fish
to fry.

But I think this is unrelated to what you really care about, e.g. a userspace that
tightly controls its virtual memory could hardcode enclave placement (IIRC graphene
did/does do that).  I.e. the alignment issue is a completely different discussion.

[*] https://lore.kernel.org/all/20190522153836.GA24833@linux.intel.com

> Also, if I am not missing something, the current driver doesn't prevent using
> mmap(/dev/sgx_enclave, PROT_NONE) to reserve ELANGE.  So having clear
> documentation is helpful for SGX users to choose how to write their apps.
> 
> Go back to the "SGX driver uses MAP_ANONYMOUS semantics for mmap()", I believe
> this just is "SGX driver requires mmap() after ECREATE/EINIT to use MAP_SHARED |
> MAP_FIXED and pgoff is ignored".  Or more precisely, pgoff is "not _used_ by SGX
> driver".
> 
> In fact, I think "pgoff is ignored/not used" is technically wrong for enclave.

Yeah, it's wrong.  It works because, until now apparently, there was never a reason
a need to care about the file offset since ELRANGE base always provided the necessary
information.  It wasn't a deliberate design choice, we simply overlooked that detail
(unless Jarkko was holding back on me all those years ago :-) ).
 
> IMHO we should stop saying SGX driver uses MAP_ANONYMOUS semantics, because the
> truth is it just takes advantage of MAP_FIXED and carelessly ignores the pgoff
> due to the nature of SGX w/o considering from core-MM's perspective.
>   
> And IMHO there are two ways to fix:
> 
> 1) From now on, we ask SGX apps to use the correct pgoff in their
> mmap(/dev/sgx_enclave).  This shouldn't impact the existing SGX apps because SGX
> driver doesn't use vma->pgoff anyway.

Heh, just "asking" won't help.  And breaking userspace, i.e. requiring all apps
to update, will just get you yelled at :-)

> 2) For the sake of avoiding having to ask existing SGX apps to change their
> mmap()s, we _officially_ say that userspace isn't required to pass a correct
> pgoff to mmap() (i.e. passing 0 as did in existing apps), but the kernel should
> fix the vma->pgoff internally.

I recommend you don't do this.  Overwriting vm_pgoff would probably work, but it's
going to make a flawed API even messier.  E.g. you'll have painted SGX into a
corner if you ever want to decouple vma->start/end from encl->base.  I highly
doubt that will ever happen given how ELRANGE works, but I don't think a hack-a-fix
buys you enough to justify any more ugliness.

> I do prefer option 2) because it has no harm to anyone: 1) No changes to
> existing SGX apps; 2) It aligns with the core-MM to so all existing mmap()
> operations should work as expected, meaning no surprise; 3) And this patchset
> from Haitao to use fadvice() to accelerate EAUG flow just works.

I think you can have your cake and eat it too.  IIUC, the goal is to get fadvise()
working, and to do that without creating a truly heinous uAPI, you need an accurate
vm_pgoff.  So, use a carrot and stick approach.

If userspace properly defines vm_pgoff during mmap() and doesn't specify MAP_ANONYMOUS,
then they get to use fadvise() (give 'em a carrot).  But if *any* mmap() on the
enclave doesn't followo those rules, mark the enclave as tainted or whatever and
disallow fadvise() (hit 'em with a stick).

That way there is no ABI breakage and no chance of causing weirdness for existing
userspace applications, while at the same time enabling the fadvise() for userspace
that has been updated to play nice.  And as a bonus, the actual (and sane) semantics
will be shown in userspace apps that are updated to do the right thing.
Huang, Kai June 19, 2023, 11:17 a.m. UTC | #32
On Fri, 2023-06-16 at 15:05 -0700, Sean Christopherson wrote:
> On Fri, Jun 16, 2023, Kai Huang wrote:
> > On Tue, 2023-06-06 at 04:11 +0000, Huang, Kai wrote:
> > > On Fri, 2023-05-26 at 19:32 -0500, Haitao Huang wrote:
> > > > Hi Kai, Jarkko and Dave
> > > > 
> > > > On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> > > > > 
> > > > > So I am still a little bit confused about where does "SGX driver uses
> > > > > MAP_ANONYMOUS semantics for fd-based mmap()" come from.
> > > > > 
> > > > > Anyway, we certainly don't want to break userspace.  However, IIUC,
> > > > > even  from now on we change the driver to depend on userspace to pass
> > > > > the correct  pgoff in mmap(), this won't break userspace, because old
> > > > > userspace which doesn't  use fadvice() and pgoff actually doesn't
> > > > > matter.  For new userspace which  uses fadvice(), it needs to pass the
> > > > > correct pgoff.
> > > > > 
> > > > > I am not saying we should do this, but it doesn't seem we can break  
> > > > > userspace?
> > > > > 
> > > > 
> > > > Sorry for delayed update but I thought about this more and likely to  
> > > > propose a new EAUG ioctl for this and for enabling SGX-CET shadow stack  
> > > > pages. But regardless, I'd like to wrap up this discussion to just clarify  
> > > > this anonymous semantics design in documentation so people won't get  
> > > > confused in future.
> > > > 
> > > > I think we all agree to keep this semantics so no user space would need  
> > > > specify 'offset' for mmap with enclave fd. And here is my proposed  
> > > > documentation changes.
> > > > 
> > > > --- a/Documentation/x86/sgx.rst
> > > > +++ b/Documentation/x86/sgx.rst
> > > > @@ -100,6 +100,23 @@ pages and establish enclave page permissions.
> > > >                  sgx_ioc_enclave_init
> > > >                  sgx_ioc_enclave_provision
> > > > 
> > > > +Enclave memory mapping
> > > > +----------------------
> > > > +
> > > > +A file descriptor created from opening **/dev/sgx_enclave** represents an
> > > > +enclave object. The mmap() syscall with enclave file descriptors does not
> > > > +support non-zero value for the 'offset' parameter.
> > > 
> > > I think we all need to understand better why SGX driver requires anonymous
> > > semantics mmap() against /dev/sgx_enclave, and as a result of that, requires
> > > mmap() to pass  0 as pgoff (which looks wasn't even discussed when upstreaming
> > > the driver).
> > > 
> > > I'll do some investigation and try to summerize and report back.  Thanks.
> > > 
> > 
> > + Sean.
> > 
> > Hi Sean,
> > 
> > If you see this and have time, please help to comment.  Thanks.
> > 
> > I've spent plenty of time to look into the discussions around v20/v28/v29 and
> > roughly v38/v39 to find out why SGX driver requires MAP_ANONYMOUS semantics,
> > AFAICT it turns out it was never explicitly discussed.  Or perhaps the
> > "MAP_ANONYMOUS semantics" actually just means "MAP_SHARED | MAP_FIXED + pgoff is
> > ignored", and everyone believed there was no need to explain what does "SGX
> > driver uses MAP_ANONYMOUS semantics for mmap()" mean.
> > 
> > Details:
> > 
> > The v20 story (that I spent most of my time on) mentioned by Haitao was actually
> > about how to make SGX and LSM work together but not related to SGX driver mmap()
> > semantic. 
> > 
> > Also Haitao mentioned "the use of anonymous mapping can be traced back to v29"
> > but this actually was just about how to use the first mmap() to "reserve the
> > ELRANGE before ECREATE".  It wasn't about to changing mmap(/dev/sgx_enclave)
> > semantics at all.
> > 
> > Sean actually suggested to explicitly document "how does SGX driver recommend
> > the user to reserve ELRANGE", but Jarkko didn't think we should do:
> > 
> > https://lore.kernel.org/linux-sgx/20200528111910.GB1666298@linux.intel.com/
> > 
> > which is a pity IMHO, because I believe for anyone, naturally, the first
> > instinct to reserve ELRANGE is to use mmap(/dev/sgx_enclave) but not
> > mmap(MAP_ANONYMOUS).  If we suggest user to use the latter then there must be
> > some reason and IMHO such suggestion and reason should be documented.
> 
> Ya, the use of mmap() on fd=-1 is done in order to find an available, naturally
> aligned chunk of virtual memory[*].  IIRC, there was a (very brief) discussion
> about enhancing .mmap() so that userspace wouldn't be responsible for doing the
> alignment, but I think we didn't pursue that idea very because we had bigger fish
> to fry.

Thanks for your time :)

Yeah I noticed in v20, the sgx_get_unmapped_area() internally would allocate
2*len and then manually return the aligned address.  This wouldn't work for
mmap()ing the small chunks with an offset smaller than the size 

> 
> But I think this is unrelated to what you really care about, e.g. a userspace that
> tightly controls its virtual memory could hardcode enclave placement (IIRC graphene
> did/does do that).  I.e. the alignment issue is a completely different discussion.
> 
> [*] https://lore.kernel.org/all/20190522153836.GA24833@linux.intel.com

Right.  For the sake of making progress of this Haitao's series, we want to
understand where did "SGX driver requires mmap(/dev/sgx_enclave) to use
MAP_ANONYMOUS semantics" come from.

But for this topic (how to reserve ELRANGE).  I am not sure the reason that we
prefer mmap(MAP_ANONYMOUS) still stands.  For instance, the discussion in your
above link was old implementation/assumption -- e.g., MAP_FIXED wasn't even
used/supported for mmap()ing enclave chunks.

So I am wondering now whether we suggest user to use mmap(MAP_ANONYMOUS) to get
a size-aligned address still stands?  The current SGX driver's
get_unmapped_area:

static unsigned long sgx_get_unmapped_area(struct file *file,                  
                                           unsigned long addr,
                                           unsigned long len,                  
                                           unsigned long pgoff,                
                                           unsigned long flags)                
{       
        if ((flags & MAP_TYPE) == MAP_PRIVATE)                                 
                return -EINVAL;
        
        if (flags & MAP_FIXED)
                return addr;
                        
        return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);  
}               
 

As you can see if we don't pass MAP_FIXED, which is the case for mmap() to
reserve ELRANGE, looks there's no difference between mmap(MAP_ANONYMOUS) and
mmap(/dev/sgx_enclave)?

> 
> > Also, if I am not missing something, the current driver doesn't prevent using
> > mmap(/dev/sgx_enclave, PROT_NONE) to reserve ELANGE.  So having clear
> > documentation is helpful for SGX users to choose how to write their apps.
> > 
> > Go back to the "SGX driver uses MAP_ANONYMOUS semantics for mmap()", I believe
> > this just is "SGX driver requires mmap() after ECREATE/EINIT to use MAP_SHARED |
> > MAP_FIXED and pgoff is ignored".  Or more precisely, pgoff is "not _used_ by SGX
> > driver".
> > 
> > In fact, I think "pgoff is ignored/not used" is technically wrong for enclave.
> 
> Yeah, it's wrong.  It works because, until now apparently, there was never a reason
> a need to care about the file offset since ELRANGE base always provided the necessary
> information.  It wasn't a deliberate design choice, we simply overlooked that detail
> (unless Jarkko was holding back on me all those years ago :-) ).

Yeah.  But I am not sure whether there are corner cases that we can have
potential bug around here, since those VMA's are not aligned between the core-mm
and the driver.

I haven't thought carefully though.  I guess from core-mm's perspective there
are multi-VMAs mapping to the same/overlapping part of the enclave, while the
SGX driver treats them as mapping to different non-overlapping parts.  Perhaps
it's OK as long as SGX driver doesn't use vma->pgoff to do something???

>  
> > IMHO we should stop saying SGX driver uses MAP_ANONYMOUS semantics, because the
> > truth is it just takes advantage of MAP_FIXED and carelessly ignores the pgoff
> > due to the nature of SGX w/o considering from core-MM's perspective.
> >   
> > And IMHO there are two ways to fix:
> > 
> > 1) From now on, we ask SGX apps to use the correct pgoff in their
> > mmap(/dev/sgx_enclave).  This shouldn't impact the existing SGX apps because SGX
> > driver doesn't use vma->pgoff anyway.
> 
> Heh, just "asking" won't help.  And breaking userspace, i.e. requiring all apps
> to update, will just get you yelled at :-)

We can document properly and assume the new apps will follow.  As I mentioned
above, the old apps, which doesn't/shouldn't have been using fadvice() anyway,
doesn't need to be changed, i.e.,  they should continue to work. :)

No?

> 
> > 2) For the sake of avoiding having to ask existing SGX apps to change their
> > mmap()s, we _officially_ say that userspace isn't required to pass a correct
> > pgoff to mmap() (i.e. passing 0 as did in existing apps), but the kernel should
> > fix the vma->pgoff internally.
> 
> I recommend you don't do this.  Overwriting vm_pgoff would probably work, but it's
> going to make a flawed API even messier.  E.g. you'll have painted SGX into a
> corner if you ever want to decouple vma->start/end from encl->base.  I highly
> doubt that will ever happen given how ELRANGE works, but I don't think a hack-a-fix
> buys you enough to justify any more ugliness.

I also found it's not feasible to cleanly fix the pgoff from userspace (I
thought the pgoff could be fixed at very early stage of do_mmap() so everything
later just worked, but it's not the case).  In do_mmap() the pgoff from
userspace is already used to VMA merging/splitting etc before creating the
target VMA.

Hmm.. Now I think we shouldn't silently fix pgoff in SGX driver as it may
surprise the core-mm later because the core-mm has already done some job around
VMAs before vma->pgoff is changed.

> 
> > I do prefer option 2) because it has no harm to anyone: 1) No changes to
> > existing SGX apps; 2) It aligns with the core-MM to so all existing mmap()
> > operations should work as expected, meaning no surprise; 3) And this patchset
> > from Haitao to use fadvice() to accelerate EAUG flow just works.
> 
> I think you can have your cake and eat it too.  IIUC, the goal is to get fadvise()
> working, and to do that without creating a truly heinous uAPI, you need an accurate
> vm_pgoff.  So, use a carrot and stick approach.
> 
> If userspace properly defines vm_pgoff during mmap() and doesn't specify MAP_ANONYMOUS,
> then they get to use fadvise() (give 'em a carrot).  But if *any* mmap() on the
> enclave doesn't followo those rules, mark the enclave as tainted or whatever and
> disallow fadvise() (hit 'em with a stick).

If we are supposed to use mmap(MAP_ANONYMOUS) to reserve ELRANGE, then I think
userspace will just use MAP_FIXED for all mmap()s to /dev/sgx_enclave.  To
detect whether a VMA has the correct pgoff, we need to somehow mark it in
sgx_mmap(), but currently we don't have facility to do that.  Even we can do,
the core madvice() -> vfs_fadvice() cannot be aware of such VMA either, and in
sgx_fadvice() we may have problem locating the correct VMA to do EAUG (because
the vfs_fadvice()  uses  0 pgoff to calculate the file offset).

So, now I think we should just let userspace itself to pass the correct pgoff
when it wants to use fadvice(), which is the option 1) I mentioned above.  The
kernel doesn't/shouldn't need to fix vma->pgoff.

In another words, I think:

 - For the old apps, doesn't matter, continue to work;
 - For new apps which want to use fadvice() to accelerate EAUG, it should pass 
   the correct pgoff in mmap(/dev/sgx_enclave) even with  MAP_FIXED.

And we don't say "SGX driver uses MAP_ANONYMOUS semantics for
mmap(/dev/sgx_enclave) any more".

Does this make sense?
Sean Christopherson June 22, 2023, 10:01 p.m. UTC | #33
On Mon, Jun 19, 2023, Kai Huang wrote:
> On Fri, 2023-06-16 at 15:05 -0700, Sean Christopherson wrote:
> > But I think this is unrelated to what you really care about, e.g. a userspace that
> > tightly controls its virtual memory could hardcode enclave placement (IIRC graphene
> > did/does do that).  I.e. the alignment issue is a completely different discussion.
> > 
> > [*] https://lore.kernel.org/all/20190522153836.GA24833@linux.intel.com
> 
> Right.  For the sake of making progress of this Haitao's series, we want to
> understand where did "SGX driver requires mmap(/dev/sgx_enclave) to use
> MAP_ANONYMOUS semantics" come from.
> 
> But for this topic (how to reserve ELRANGE).  I am not sure the reason that we
> prefer mmap(MAP_ANONYMOUS) still stands.  For instance, the discussion in your
> above link was old implementation/assumption -- e.g., MAP_FIXED wasn't even
> used/supported for mmap()ing enclave chunks.
> 
> So I am wondering now whether we suggest user to use mmap(MAP_ANONYMOUS) to get
> a size-aligned address still stands?  The current SGX driver's
> get_unmapped_area:
> 
> static unsigned long sgx_get_unmapped_area(struct file *file,                  
>                                            unsigned long addr,
>                                            unsigned long len,                  
>                                            unsigned long pgoff,                
>                                            unsigned long flags)                
> {       
>         if ((flags & MAP_TYPE) == MAP_PRIVATE)                                 
>                 return -EINVAL;
>         
>         if (flags & MAP_FIXED)
>                 return addr;
>                         
>         return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);  
> }               
>  
> 
> As you can see if we don't pass MAP_FIXED, which is the case for mmap() to
> reserve ELRANGE, looks there's no difference between mmap(MAP_ANONYMOUS) and
> mmap(/dev/sgx_enclave)?

In practice, there's *probably* no significant difference.  Using an anonymous
mapping is advantageous because there's no additional overhead, e.g. for locking
the file, it can be done in advance of actually opening /dev/sgx_enclave, helps
document (in userspace) that the code isn't actually creating an enclave, just
finding a naturally aligned area in virtual memory (which isn't SGX specific), etc.

There are definitely differences, e.g. an LSM could restrict access to
/dev/sgx_enclave.  That particular one is obviously a "don't care", but it's quite
difficult to say that mmap(/dev/sgx_enclave) and mmap(NULL) are equivalent due to
the amount of code that's involved in handling mmap().

> > > Also, if I am not missing something, the current driver doesn't prevent using
> > > mmap(/dev/sgx_enclave, PROT_NONE) to reserve ELANGE.  So having clear
> > > documentation is helpful for SGX users to choose how to write their apps.
> > > 
> > > Go back to the "SGX driver uses MAP_ANONYMOUS semantics for mmap()", I believe
> > > this just is "SGX driver requires mmap() after ECREATE/EINIT to use MAP_SHARED |
> > > MAP_FIXED and pgoff is ignored".  Or more precisely, pgoff is "not _used_ by SGX
> > > driver".
> > > 
> > > In fact, I think "pgoff is ignored/not used" is technically wrong for enclave.
> > 
> > Yeah, it's wrong.  It works because, until now apparently, there was never a reason
> > a need to care about the file offset since ELRANGE base always provided the necessary
> > information.  It wasn't a deliberate design choice, we simply overlooked that detail
> > (unless Jarkko was holding back on me all those years ago :-) ).
> 
> Yeah.  But I am not sure whether there are corner cases that we can have
> potential bug around here, since those VMA's are not aligned between the core-mm
> and the driver.
> 
> I haven't thought carefully though.  I guess from core-mm's perspective there
> are multi-VMAs mapping to the same/overlapping part of the enclave, while the
> SGX driver treats them as mapping to different non-overlapping parts.  Perhaps
> it's OK as long as SGX driver doesn't use vma->pgoff to do something???

Ya, exactly.

> > > IMHO we should stop saying SGX driver uses MAP_ANONYMOUS semantics, because the
> > > truth is it just takes advantage of MAP_FIXED and carelessly ignores the pgoff
> > > due to the nature of SGX w/o considering from core-MM's perspective.
> > >   
> > > And IMHO there are two ways to fix:
> > > 
> > > 1) From now on, we ask SGX apps to use the correct pgoff in their
> > > mmap(/dev/sgx_enclave).  This shouldn't impact the existing SGX apps because SGX
> > > driver doesn't use vma->pgoff anyway.
> > 
> > Heh, just "asking" won't help.  And breaking userspace, i.e. requiring all apps
> > to update, will just get you yelled at :-)
> 
> We can document properly and assume the new apps will follow.  As I mentioned
> above, the old apps, which doesn't/shouldn't have been using fadvice() anyway,
> doesn't need to be changed, i.e.,  they should continue to work. :)
> 
> No?

You can't build an ABI on assumptions.  E.g. even if userspace *intends* to behave,
it wouldn't take much to compute the wrong offset (math is hard).

> > > 2) For the sake of avoiding having to ask existing SGX apps to change their
> > > mmap()s, we _officially_ say that userspace isn't required to pass a correct
> > > pgoff to mmap() (i.e. passing 0 as did in existing apps), but the kernel should
> > > fix the vma->pgoff internally.
> > 
> > I recommend you don't do this.  Overwriting vm_pgoff would probably work, but it's
> > going to make a flawed API even messier.  E.g. you'll have painted SGX into a
> > corner if you ever want to decouple vma->start/end from encl->base.  I highly
> > doubt that will ever happen given how ELRANGE works, but I don't think a hack-a-fix
> > buys you enough to justify any more ugliness.
> 
> I also found it's not feasible to cleanly fix the pgoff from userspace (I
> thought the pgoff could be fixed at very early stage of do_mmap() so everything
> later just worked, but it's not the case).  In do_mmap() the pgoff from
> userspace is already used to VMA merging/splitting etc before creating the
> target VMA.
> 
> Hmm.. Now I think we shouldn't silently fix pgoff in SGX driver as it may
> surprise the core-mm later because the core-mm has already done some job around
> VMAs before vma->pgoff is changed.
> 
> > 
> > > I do prefer option 2) because it has no harm to anyone: 1) No changes to
> > > existing SGX apps; 2) It aligns with the core-MM to so all existing mmap()
> > > operations should work as expected, meaning no surprise; 3) And this patchset
> > > from Haitao to use fadvice() to accelerate EAUG flow just works.
> > 
> > I think you can have your cake and eat it too.  IIUC, the goal is to get fadvise()
> > working, and to do that without creating a truly heinous uAPI, you need an accurate
> > vm_pgoff.  So, use a carrot and stick approach.
> > 
> > If userspace properly defines vm_pgoff during mmap() and doesn't specify MAP_ANONYMOUS,
> > then they get to use fadvise() (give 'em a carrot).  But if *any* mmap() on the
> > enclave doesn't followo those rules, mark the enclave as tainted or whatever and
> > disallow fadvise() (hit 'em with a stick).
> 
> If we are supposed to use mmap(MAP_ANONYMOUS) to reserve ELRANGE, then I think
> userspace will just use MAP_FIXED for all mmap()s to /dev/sgx_enclave.

There is no "supposed" to.  Using mmap(NULL) is purely a suggestion to avoid
running into overheads and checks that are ultimately unnecessary.  The only
requirement is that userspace provide a compatible chunk for virtual memory,
*how* userspace finds that chunk can be done in a number of ways.  mmap(NULL)
just happens to be the most convenient one (IMO).

> To detect whether a VMA has the correct pgoff, we need to somehow mark it in
> sgx_mmap(), but currently we don't have facility to do that.  Even we can do,
> the core madvice() -> vfs_fadvice() cannot be aware of such VMA either, and
> in sgx_fadvice() we may have problem locating the correct VMA to do EAUG
> (because the vfs_fadvice()  uses  0 pgoff to calculate the file offset).
> 
> So, now I think we should just let userspace itself to pass the correct pgoff
> when it wants to use fadvice(), which is the option 1) I mentioned above.  The
> kernel doesn't/shouldn't need to fix vma->pgoff.
> 
> In another words, I think:
> 
>  - For the old apps, doesn't matter, continue to work;
>  - For new apps which want to use fadvice() to accelerate EAUG, it should pass�
>    the correct pgoff in mmap(/dev/sgx_enclave) even with  MAP_FIXED.
> 
> And we don't say "SGX driver uses MAP_ANONYMOUS semantics for
> mmap(/dev/sgx_enclave) any more".
> 
> Does this make sense?

Yes, but as above, IMO the kernel needs to enforce that userspace has properly
set the offset, otherwise userspace will end up with completely nonsensical
behavior if it fails to set the correct offset.  The proposed sgx_fadvise() already
takes mmap_lock for read, i.e. is mutually exclusive with sgx_mmap(), so encforcing
the requirement should be quite straightforward, e.g.

        mmap_read_lock(current->mm);

        vma = find_vma(current->mm, start);
        if (!vma)
                goto unlock;
        if (vma->vm_private_data != encl)
                goto unlock;
	if (encl->has_mismatched_offsets)  <======
		goto unlock;

        pos = start;
        if (pos < vma->vm_start || end > vma->vm_end) {
                /* Don't allow any gaps */
                goto unlock;
        }
Huang, Kai June 22, 2023, 11:21 p.m. UTC | #34
On Thu, 2023-06-22 at 15:01 -0700, Sean Christopherson wrote:
> On Mon, Jun 19, 2023, Kai Huang wrote:
> > On Fri, 2023-06-16 at 15:05 -0700, Sean Christopherson wrote:
> > > But I think this is unrelated to what you really care about, e.g. a userspace that
> > > tightly controls its virtual memory could hardcode enclave placement (IIRC graphene
> > > did/does do that).  I.e. the alignment issue is a completely different discussion.
> > > 
> > > [*] https://lore.kernel.org/all/20190522153836.GA24833@linux.intel.com
> > 
> > Right.  For the sake of making progress of this Haitao's series, we want to
> > understand where did "SGX driver requires mmap(/dev/sgx_enclave) to use
> > MAP_ANONYMOUS semantics" come from.
> > 
> > But for this topic (how to reserve ELRANGE).  I am not sure the reason that we
> > prefer mmap(MAP_ANONYMOUS) still stands.  For instance, the discussion in your
> > above link was old implementation/assumption -- e.g., MAP_FIXED wasn't even
> > used/supported for mmap()ing enclave chunks.
> > 
> > So I am wondering now whether we suggest user to use mmap(MAP_ANONYMOUS) to get
> > a size-aligned address still stands?  The current SGX driver's
> > get_unmapped_area:
> > 
> > static unsigned long sgx_get_unmapped_area(struct file *file,                  
> >                                            unsigned long addr,
> >                                            unsigned long len,                  
> >                                            unsigned long pgoff,                
> >                                            unsigned long flags)                
> > {       
> >         if ((flags & MAP_TYPE) == MAP_PRIVATE)                                 
> >                 return -EINVAL;
> >         
> >         if (flags & MAP_FIXED)
> >                 return addr;
> >                         
> >         return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);  
> > }               
> >  
> > 
> > As you can see if we don't pass MAP_FIXED, which is the case for mmap() to
> > reserve ELRANGE, looks there's no difference between mmap(MAP_ANONYMOUS) and
> > mmap(/dev/sgx_enclave)?
> 
> In practice, there's *probably* no significant difference.  Using an anonymous
> mapping is advantageous because there's no additional overhead, e.g. for locking
> the file, it can be done in advance of actually opening /dev/sgx_enclave, helps
> document (in userspace) that the code isn't actually creating an enclave, just
> finding a naturally aligned area in virtual memory (which isn't SGX specific), etc.

Sure, and I agree this perhaps is cleaner and simpler for SGX.  But for this
purpose I think we should just enforce this policy in SGX driver, i.e., we
should disable mmap(/dev/sgx_enclave) before ECREATE.

To me at least the SGX driver should be clear whether it supports userspace to
use mmap(/dev/sgx_enclave) to reserve ELRANGE.  There should be no ambiguity
here.

The current driver only disallows mmap(/dev/sgx_enclave) outside of enclave
range after EINIT:

int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
                     unsigned long end, unsigned long vm_flags)
{
	...

        /* Disallow mapping outside enclave's address range. */
        if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
            (start < encl->base || end > encl->base + encl->size))
                return -EACCES;
}

which has what value btw?

> 
> There are definitely differences, e.g. an LSM could restrict access to
> /dev/sgx_enclave.  That particular one is obviously a "don't care", 
> 

I think you meant "RW->RX" requires EXECMOD, etc?

Yeah I am not sure whether this matters to reserving ELRANGE.  That part should
apply to the mmap(/dev/sgx_enclave) after ECREATE.

> but it's quite
> difficult to say that mmap(/dev/sgx_enclave) and mmap(NULL) are equivalent due to
> the amount of code that's involved in handling mmap().

Sure.  I was talking about only from "allocating size-aligned" part.  Arguably,
in terms of reserving ELRANGE, userspace only cares about:

 1) The ELRANGE is big enough to hold the enclave
 2) The ELRANGE meets alignment requirement
 3) Usperspace can mmap(/dev/sgx_enclave) and/or mprotect() small areas to get
    the correct enclave addr within ELRANGE with desired permission

> 
> > > > Also, if I am not missing something, the current driver doesn't prevent using
> > > > mmap(/dev/sgx_enclave, PROT_NONE) to reserve ELANGE.  So having clear
> > > > documentation is helpful for SGX users to choose how to write their apps.
> > > > 
> > > > Go back to the "SGX driver uses MAP_ANONYMOUS semantics for mmap()", I believe
> > > > this just is "SGX driver requires mmap() after ECREATE/EINIT to use MAP_SHARED |
> > > > MAP_FIXED and pgoff is ignored".  Or more precisely, pgoff is "not _used_ by SGX
> > > > driver".
> > > > 
> > > > In fact, I think "pgoff is ignored/not used" is technically wrong for enclave.
> > > 
> > > Yeah, it's wrong.  It works because, until now apparently, there was never a reason
> > > a need to care about the file offset since ELRANGE base always provided the necessary
> > > information.  It wasn't a deliberate design choice, we simply overlooked that detail
> > > (unless Jarkko was holding back on me all those years ago :-) ).
> > 
> > Yeah.  But I am not sure whether there are corner cases that we can have
> > potential bug around here, since those VMA's are not aligned between the core-mm
> > and the driver.
> > 
> > I haven't thought carefully though.  I guess from core-mm's perspective there
> > are multi-VMAs mapping to the same/overlapping part of the enclave, while the
> > SGX driver treats them as mapping to different non-overlapping parts.  Perhaps
> > it's OK as long as SGX driver doesn't use vma->pgoff to do something???
> 
> Ya, exactly.
> 
> > > > IMHO we should stop saying SGX driver uses MAP_ANONYMOUS semantics, because the
> > > > truth is it just takes advantage of MAP_FIXED and carelessly ignores the pgoff
> > > > due to the nature of SGX w/o considering from core-MM's perspective.
> > > >   
> > > > And IMHO there are two ways to fix:
> > > > 
> > > > 1) From now on, we ask SGX apps to use the correct pgoff in their
> > > > mmap(/dev/sgx_enclave).  This shouldn't impact the existing SGX apps because SGX
> > > > driver doesn't use vma->pgoff anyway.
> > > 
> > > Heh, just "asking" won't help.  And breaking userspace, i.e. requiring all apps
> > > to update, will just get you yelled at :-)
> > 
> > We can document properly and assume the new apps will follow.  As I mentioned
> > above, the old apps, which doesn't/shouldn't have been using fadvice() anyway,
> > doesn't need to be changed, i.e.,  they should continue to work. :)
> > 
> > No?
> 
> You can't build an ABI on assumptions.  E.g. even if userspace *intends* to behave,
> it wouldn't take much to compute the wrong offset (math is hard).

But I don't think we have an well established ABI now?  Nothing is documented.  

No?

> 
> > > > 2) For the sake of avoiding having to ask existing SGX apps to change their
> > > > mmap()s, we _officially_ say that userspace isn't required to pass a correct
> > > > pgoff to mmap() (i.e. passing 0 as did in existing apps), but the kernel should
> > > > fix the vma->pgoff internally.
> > > 
> > > I recommend you don't do this.  Overwriting vm_pgoff would probably work, but it's
> > > going to make a flawed API even messier.  E.g. you'll have painted SGX into a
> > > corner if you ever want to decouple vma->start/end from encl->base.  I highly
> > > doubt that will ever happen given how ELRANGE works, but I don't think a hack-a-fix
> > > buys you enough to justify any more ugliness.
> > 
> > I also found it's not feasible to cleanly fix the pgoff from userspace (I
> > thought the pgoff could be fixed at very early stage of do_mmap() so everything
> > later just worked, but it's not the case).  In do_mmap() the pgoff from
> > userspace is already used to VMA merging/splitting etc before creating the
> > target VMA.
> > 
> > Hmm.. Now I think we shouldn't silently fix pgoff in SGX driver as it may
> > surprise the core-mm later because the core-mm has already done some job around
> > VMAs before vma->pgoff is changed.
> > 
> > > 
> > > > I do prefer option 2) because it has no harm to anyone: 1) No changes to
> > > > existing SGX apps; 2) It aligns with the core-MM to so all existing mmap()
> > > > operations should work as expected, meaning no surprise; 3) And this patchset
> > > > from Haitao to use fadvice() to accelerate EAUG flow just works.
> > > 
> > > I think you can have your cake and eat it too.  IIUC, the goal is to get fadvise()
> > > working, and to do that without creating a truly heinous uAPI, you need an accurate
> > > vm_pgoff.  So, use a carrot and stick approach.
> > > 
> > > If userspace properly defines vm_pgoff during mmap() and doesn't specify MAP_ANONYMOUS,
> > > then they get to use fadvise() (give 'em a carrot).  But if *any* mmap() on the
> > > enclave doesn't followo those rules, mark the enclave as tainted or whatever and
> > > disallow fadvise() (hit 'em with a stick).
> > 
> > If we are supposed to use mmap(MAP_ANONYMOUS) to reserve ELRANGE, then I think
> > userspace will just use MAP_FIXED for all mmap()s to /dev/sgx_enclave.
> 
> There is no "supposed" to.  Using mmap(NULL) is purely a suggestion to avoid
> running into overheads and checks that are ultimately unnecessary.  The only
> requirement is that userspace provide a compatible chunk for virtual memory,
> *how* userspace finds that chunk can be done in a number of ways.  mmap(NULL)
> just happens to be the most convenient one (IMO).

Even this, I think we should document it IMO, at least whether
mmap(/dev/sgx_enclave) can be used to reserve ELRANGE.

> 
> > To detect whether a VMA has the correct pgoff, we need to somehow mark it in
> > sgx_mmap(), but currently we don't have facility to do that.  Even we can do,
> > the core madvice() -> vfs_fadvice() cannot be aware of such VMA either, and
> > in sgx_fadvice() we may have problem locating the correct VMA to do EAUG
> > (because the vfs_fadvice()  uses  0 pgoff to calculate the file offset).
> > 
> > So, now I think we should just let userspace itself to pass the correct pgoff
> > when it wants to use fadvice(), which is the option 1) I mentioned above.  The
> > kernel doesn't/shouldn't need to fix vma->pgoff.
> > 
> > In another words, I think:
> > 
> >  - For the old apps, doesn't matter, continue to work;
> >  - For new apps which want to use fadvice() to accelerate EAUG, it should pass�
> >    the correct pgoff in mmap(/dev/sgx_enclave) even with  MAP_FIXED.
> > 
> > And we don't say "SGX driver uses MAP_ANONYMOUS semantics for
> > mmap(/dev/sgx_enclave) any more".
> > 
> > Does this make sense?
> 
> Yes, but as above, IMO the kernel needs to enforce that userspace has properly
> set the offset, otherwise userspace will end up with completely nonsensical
> behavior if it fails to set the correct offset.  The proposed sgx_fadvise() already
> takes mmap_lock for read, i.e. is mutually exclusive with sgx_mmap(), so encforcing
> the requirement should be quite straightforward, e.g.
> 
>         mmap_read_lock(current->mm);
> 
>         vma = find_vma(current->mm, start);

The "start" here may already be wrong because before vfs_fadvice() uses VMA's pgoff
to calculate the start before passing it to sgx_fadvice().

>         if (!vma)
>                 goto unlock;
>         if (vma->vm_private_data != encl)
>                 goto unlock;
> 	if (encl->has_mismatched_offsets)  <======
> 		goto unlock;

Sorry I am a little bit slow, how do you set "has_mismatched_offsests" to true?

> 
>         pos = start;
>         if (pos < vma->vm_start || end > vma->vm_end) {
>                 /* Don't allow any gaps */
>                 goto unlock;
>         }
Sean Christopherson June 26, 2023, 10:28 p.m. UTC | #35
On Thu, Jun 22, 2023, Kai Huang wrote:
> On Thu, 2023-06-22 at 15:01 -0700, Sean Christopherson wrote:
> > On Mon, Jun 19, 2023, Kai Huang wrote:
> > > On Fri, 2023-06-16 at 15:05 -0700, Sean Christopherson wrote:
> > > As you can see if we don't pass MAP_FIXED, which is the case for mmap() to
> > > reserve ELRANGE, looks there's no difference between mmap(MAP_ANONYMOUS) and
> > > mmap(/dev/sgx_enclave)?
> > 
> > In practice, there's *probably* no significant difference.  Using an anonymous
> > mapping is advantageous because there's no additional overhead, e.g. for locking
> > the file, it can be done in advance of actually opening /dev/sgx_enclave, helps
> > document (in userspace) that the code isn't actually creating an enclave, just
> > finding a naturally aligned area in virtual memory (which isn't SGX specific), etc.
> 
> Sure, and I agree this perhaps is cleaner and simpler for SGX.  But for this
> purpose I think we should just enforce this policy in SGX driver, i.e., we
> should disable mmap(/dev/sgx_enclave) before ECREATE.
> 
> To me at least the SGX driver should be clear whether it supports userspace to
> use mmap(/dev/sgx_enclave) to reserve ELRANGE.  There should be no ambiguity
> here.
> 
> The current driver only disallows mmap(/dev/sgx_enclave) outside of enclave
> range after EINIT:
> 
> int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
>                      unsigned long end, unsigned long vm_flags)
> {
> 	...
> 
>         /* Disallow mapping outside enclave's address range. */
>         if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
>             (start < encl->base || end > encl->base + encl->size))
>                 return -EACCES;
> }
> 
> which has what value btw?

That code came in after my involvement, see commit 7b013e723a1f ("x86/sgx: Tighten
accessible memory range after enclave initialization").

> > There are definitely differences, e.g. an LSM could restrict access to
> > /dev/sgx_enclave. �That particular one is obviously a "don't care",�
> > 
> 
> I think you meant "RW->RX" requires EXECMOD, etc?

I meant access to /dev/sgx_enclave itself.  What I was pointing out is that,
in theory, an LSM could deny mmap() on /dev/sgx_enclave, but it's a pointless
distinction since in that case userspace can't create SGX enclaves anyways, i.e.
how userspace finds ELRANGE for an enclave it can't create is irrelevant :-)

> Yeah I am not sure whether this matters to reserving ELRANGE.  That part should
> apply to the mmap(/dev/sgx_enclave) after ECREATE.
> 
> > but it's quite
> > difficult to say that mmap(/dev/sgx_enclave) and mmap(NULL) are equivalent due to
> > the amount of code that's involved in handling mmap().
> 
> Sure.  I was talking about only from "allocating size-aligned" part.  Arguably,
> in terms of reserving ELRANGE, userspace only cares about:
> 
>  1) The ELRANGE is big enough to hold the enclave
>  2) The ELRANGE meets alignment requirement
>  3) Usperspace can mmap(/dev/sgx_enclave) and/or mprotect() small areas to get
>     the correct enclave addr within ELRANGE with desired permission
> 
> > 
> > > > > Also, if I am not missing something, the current driver doesn't prevent using
> > > > > mmap(/dev/sgx_enclave, PROT_NONE) to reserve ELANGE.  So having clear
> > > > > documentation is helpful for SGX users to choose how to write their apps.
> > > > > 
> > > > > Go back to the "SGX driver uses MAP_ANONYMOUS semantics for mmap()", I believe
> > > > > this just is "SGX driver requires mmap() after ECREATE/EINIT to use MAP_SHARED |
> > > > > MAP_FIXED and pgoff is ignored".  Or more precisely, pgoff is "not _used_ by SGX
> > > > > driver".
> > > > > 
> > > > > In fact, I think "pgoff is ignored/not used" is technically wrong for enclave.
> > > > 
> > > > Yeah, it's wrong.  It works because, until now apparently, there was never a reason
> > > > a need to care about the file offset since ELRANGE base always provided the necessary
> > > > information.  It wasn't a deliberate design choice, we simply overlooked that detail
> > > > (unless Jarkko was holding back on me all those years ago :-) ).
> > > 
> > > Yeah.  But I am not sure whether there are corner cases that we can have
> > > potential bug around here, since those VMA's are not aligned between the core-mm
> > > and the driver.
> > > 
> > > I haven't thought carefully though.  I guess from core-mm's perspective there
> > > are multi-VMAs mapping to the same/overlapping part of the enclave, while the
> > > SGX driver treats them as mapping to different non-overlapping parts.  Perhaps
> > > it's OK as long as SGX driver doesn't use vma->pgoff to do something???
> > 
> > Ya, exactly.
> > 
> > > > > IMHO we should stop saying SGX driver uses MAP_ANONYMOUS semantics, because the
> > > > > truth is it just takes advantage of MAP_FIXED and carelessly ignores the pgoff
> > > > > due to the nature of SGX w/o considering from core-MM's perspective.
> > > > >   
> > > > > And IMHO there are two ways to fix:
> > > > > 
> > > > > 1) From now on, we ask SGX apps to use the correct pgoff in their
> > > > > mmap(/dev/sgx_enclave).  This shouldn't impact the existing SGX apps because SGX
> > > > > driver doesn't use vma->pgoff anyway.
> > > > 
> > > > Heh, just "asking" won't help.  And breaking userspace, i.e. requiring all apps
> > > > to update, will just get you yelled at :-)
> > > 
> > > We can document properly and assume the new apps will follow.  As I mentioned
> > > above, the old apps, which doesn't/shouldn't have been using fadvice() anyway,
> > > doesn't need to be changed, i.e.,  they should continue to work. :)
> > > 
> > > No?
> > 
> > You can't build an ABI on assumptions.  E.g. even if userspace *intends* to behave,
> > it wouldn't take much to compute the wrong offset (math is hard).
> 
> But I don't think we have an well established ABI now?  Nothing is documented.

Heh, just because the ABI isn't formally documented doesn't mean it doesn't exist.
In fact, the most problematic ABIs are the ones that aren't documented.

> > There is no "supposed" to.  Using mmap(NULL) is purely a suggestion to avoid
> > running into overheads and checks that are ultimately unnecessary.  The only
> > requirement is that userspace provide a compatible chunk for virtual memory,
> > *how* userspace finds that chunk can be done in a number of ways.  mmap(NULL)
> > just happens to be the most convenient one (IMO).
> 
> Even this, I think we should document it IMO, at least whether
> mmap(/dev/sgx_enclave) can be used to reserve ELRANGE.

See below.

> > > To detect whether a VMA has the correct pgoff, we need to somehow mark it in
> > > sgx_mmap(), but currently we don't have facility to do that.  Even we can do,
> > > the core madvice() -> vfs_fadvice() cannot be aware of such VMA either, and
> > > in sgx_fadvice() we may have problem locating the correct VMA to do EAUG
> > > (because the vfs_fadvice()  uses  0 pgoff to calculate the file offset).
> > > 
> > > So, now I think we should just let userspace itself to pass the correct pgoff
> > > when it wants to use fadvice(), which is the option 1) I mentioned above.  The
> > > kernel doesn't/shouldn't need to fix vma->pgoff.
> > > 
> > > In another words, I think:
> > > 
> > >  - For the old apps, doesn't matter, continue to work;
> > >  - For new apps which want to use fadvice() to accelerate EAUG, it should pass�
> > >    the correct pgoff in mmap(/dev/sgx_enclave) even with  MAP_FIXED.
> > > 
> > > And we don't say "SGX driver uses MAP_ANONYMOUS semantics for
> > > mmap(/dev/sgx_enclave) any more".
> > > 
> > > Does this make sense?
> > 
> > Yes, but as above, IMO the kernel needs to enforce that userspace has properly
> > set the offset, otherwise userspace will end up with completely nonsensical
> > behavior if it fails to set the correct offset.  The proposed sgx_fadvise() already
> > takes mmap_lock for read, i.e. is mutually exclusive with sgx_mmap(), so encforcing
> > the requirement should be quite straightforward, e.g.
> > 
> >         mmap_read_lock(current->mm);
> > 
> >         vma = find_vma(current->mm, start);
> 
> The "start" here may already be wrong because before vfs_fadvice() uses VMA's pgoff
> to calculate the start before passing it to sgx_fadvice().

Not my code :-)

> >         if (!vma)
> >                 goto unlock;
> >         if (vma->vm_private_data != encl)
> >                 goto unlock;
> > 	if (encl->has_mismatched_offsets)  <======
> > 		goto unlock;
> 
> Sorry I am a little bit slow, how do you set "has_mismatched_offsests" to true?

During sgx_mmap().  Though there's a wrinkle I initially missed: if the enclave
hasn't gone through ECREATE, encl->base is garbage.  So either sgx_mmap() needs
to assume the !CREATED is creating a mismatched offset, or sgx_encl_create() needs
to iterate over and check all VMAs.

Since there are advantages to usuing mmap(NULL) to find ELRANGE, IMO your best
option is to do the below.  And then that mostly answers the question about
using mmap(/dev/sgx_enclave) to reserve ELRANGE, i.e. if userspace wants to use
fallocate(), then it effectively *must not* use mmap(/dev/sgx_enclave).

diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 262f5fb18d74..63fb41da35aa 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -94,6 +94,11 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
        if (ret)
                return ret;
 
+       if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+           vma->vm_pgoff != PFN_DOWN(vma->vm_start - encl->base))
+               encl->has_mismatched_offsests = true;
+
+
        vma->vm_ops = &sgx_vm_ops;
        vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
        vma->vm_private_data = encl;
Huang, Kai June 27, 2023, 11:43 a.m. UTC | #36
> > > 
> > > You can't build an ABI on assumptions.  E.g. even if userspace *intends* to behave,
> > > it wouldn't take much to compute the wrong offset (math is hard).
> > 
> > But I don't think we have an well established ABI now?  Nothing is documented.
> 
> Heh, just because the ABI isn't formally documented doesn't mean it doesn't exist.
> In fact, the most problematic ABIs are the ones that aren't documented.
> 
> > 

Sure.  But just want to make sure, what is the current SGX driver ABI (around
mmap()) from your perspective?

Is it "SGX driver _requires_ pgoff to be 0 for non-ELRANGE-reserve mmap()", or
"SGX driver _ignores_ pgoff"?

See below ...

[...]

> > > 	if (encl->has_mismatched_offsets)  <======
> > > 		goto unlock;
> > 
> > Sorry I am a little bit slow, how do you set "has_mismatched_offsests" to true?
> 
> During sgx_mmap().  Though there's a wrinkle I initially missed: if the enclave
> hasn't gone through ECREATE, encl->base is garbage.  So either sgx_mmap() needs
> to assume the !CREATED is creating a mismatched offset, or sgx_encl_create() needs
> to iterate over and check all VMAs.
> 
> Since there are advantages to usuing mmap(NULL) to find ELRANGE, IMO your best
> option is to do the below.  And then that mostly answers the question about
> using mmap(/dev/sgx_enclave) to reserve ELRANGE, i.e. if userspace wants to use
> fallocate(), then it effectively *must not* use mmap(/dev/sgx_enclave).
> 
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> index 262f5fb18d74..63fb41da35aa 100644
> --- a/arch/x86/kernel/cpu/sgx/driver.c
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -94,6 +94,11 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
>         if (ret)
>                 return ret;
>  
> +       if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
> +           vma->vm_pgoff != PFN_DOWN(vma->vm_start - encl->base))
> +               encl->has_mismatched_offsests = true;
> +
> +
>         vma->vm_ops = &sgx_vm_ops;
>         vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
>         vma->vm_private_data = encl;
> 

It appears we don't need to mark "has_mismatched_offsets" as true if userspace
uses mmap(/dev/sgx_enclave) to reserve ELRANGE?  Enclave's base isn't
established yet, so theoretically, from enclave's perspective, we cannot say it
has a mismatched offset.

However we may want to return -EINVAL if userspace passes a non-zero pgoff in
the mmap(/dev/sgx_enclave) to reserve ELRANGE.

Anyway reserving ELRANGE isn't the big point.  The SGX driver ABI is.  Let's
forget about reserving ELRANGE in below context, i.e., all mmap() below means
non-ELRANGE-reserve mmap(/dev/sgx_enclave) :)

I think yes marking enclave as "has_mismatched_offsets" works if userspace wants
to use file-based ABIs (fadvice(), etc) for enclave, but this effectively means
Haitao needs to modify _ALL_ existing mmap()s to pass the correct pgoff in order
to use fadvice() for EAUG.

Therefore, it seems more like we are changing ABI (or having a new ABI) to:

Userspace must pass the correct pgoff to all mmaps() in order to use file-based
ABIs for SGX.

( Because I think other drivers/subsystems doesn't have such limitation.  For
instance, for a normal file, userspace can have two mmap()s mapping to the same
offset of the same file but with different address, and fadvice() should work
for both mappings.

A VMA-based "has_mismatched_offset" seems more reasonable for SGX but as I
mentioned previously it may not easy to do from SGX driver. )

So I am not sure whether this belongs to "breaking existing ABI"?

If the current ABI is we _require_ pgoff being zero, then the "encl-
>has_mismatched_offsets" seems is breaking this ABI.

If the current ABI is we _ignore_ pgoff, then "encl->has_mismatched_offsets"
isn't breaking existing ABI.  But in this case why do we need this at all?  SGX
driver just follows the existing file-based ABIs:

If userspace wants to use file-based ABIs for some VMA, it needs to pass the
correct pgoff.  If not, file-based ABIs won't work for that VMA, but your
enclave may still work if you don't use file-based ABIs.

Sorry for being noisy, but does above make sense?
Sean Christopherson June 27, 2023, 2:50 p.m. UTC | #37
On Tue, Jun 27, 2023, Kai Huang wrote:
> > > > 
> > > > You can't build an ABI on assumptions.  E.g. even if userspace *intends* to behave,
> > > > it wouldn't take much to compute the wrong offset (math is hard).
> > > 
> > > But I don't think we have an well established ABI now?  Nothing is documented.
> > 
> > Heh, just because the ABI isn't formally documented doesn't mean it doesn't exist.
> > In fact, the most problematic ABIs are the ones that aren't documented.
> > 
> > > 
> 
> Sure.  But just want to make sure, what is the current SGX driver ABI (around
> mmap()) from your perspective?

To be clear, it's not my perspective, it's simply what the kernel actually does.

> Is it "SGX driver _requires_ pgoff to be 0 for non-ELRANGE-reserve mmap()", or
> "SGX driver _ignores_ pgoff"?

Unless there are checks hiding somewhere, it's the latter.  You might be able to
get away with changing the kernel to require pgoff to be '0', i.e. if literally
all users pass in '0', but proving that all users pass '0' is extremely difficult.
And I don't see any value in requiring pgoff to be '0' for "legacy" users.

> See below ...
> 
> [...]
> 
> > > > 	if (encl->has_mismatched_offsets)  <======
> > > > 		goto unlock;
> > > 
> > > Sorry I am a little bit slow, how do you set "has_mismatched_offsests" to true?
> > 
> > During sgx_mmap().  Though there's a wrinkle I initially missed: if the enclave
> > hasn't gone through ECREATE, encl->base is garbage.  So either sgx_mmap() needs
> > to assume the !CREATED is creating a mismatched offset, or sgx_encl_create() needs
> > to iterate over and check all VMAs.
> > 
> > Since there are advantages to usuing mmap(NULL) to find ELRANGE, IMO your best
> > option is to do the below.  And then that mostly answers the question about
> > using mmap(/dev/sgx_enclave) to reserve ELRANGE, i.e. if userspace wants to use
> > fallocate(), then it effectively *must not* use mmap(/dev/sgx_enclave).
> > 
> > diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> > index 262f5fb18d74..63fb41da35aa 100644
> > --- a/arch/x86/kernel/cpu/sgx/driver.c
> > +++ b/arch/x86/kernel/cpu/sgx/driver.c
> > @@ -94,6 +94,11 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> >         if (ret)
> >                 return ret;
> >  
> > +       if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
> > +           vma->vm_pgoff != PFN_DOWN(vma->vm_start - encl->base))
> > +               encl->has_mismatched_offsests = true;
> > +
> > +
> >         vma->vm_ops = &sgx_vm_ops;
> >         vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
> >         vma->vm_private_data = encl;
> > 
> 
> It appears we don't need to mark "has_mismatched_offsets" as true if userspace
> uses mmap(/dev/sgx_enclave) to reserve ELRANGE?  Enclave's base isn't
> established yet, so theoretically, from enclave's perspective, we cannot say it
> has a mismatched offset.

Yeah, it's a misnomer.  Just pick a different name and that problem goes away.

	if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
	    vma->vm_pgoff != PFN_DOWN(vma->vm_start - encl->base))
		set_bit(SGX_ENCL_DISABLE_FILE_APIS, &encl->flags);

> However we may want to return -EINVAL if userspace passes a non-zero pgoff in
> the mmap(/dev/sgx_enclave) to reserve ELRANGE.

That would potentially break userspace.  I don't personally care if you want to
try that route, but just be aware that pushing through such a change may break
someone's application.  And if that happens, be prepared to get yelled at :-)

> Anyway reserving ELRANGE isn't the big point.  The SGX driver ABI is.  Let's
> forget about reserving ELRANGE in below context, i.e., all mmap() below means
> non-ELRANGE-reserve mmap(/dev/sgx_enclave) :)
> 
> I think yes marking enclave as "has_mismatched_offsets" works if userspace wants
> to use file-based ABIs (fadvice(), etc) for enclave, but this effectively means
> Haitao needs to modify _ALL_ existing mmap()s to pass the correct pgoff in order
> to use fadvice() for EAUG.
> 
> Therefore, it seems more like we are changing ABI (or having a new ABI) to:

It's effectively new ABI.  The key point to all of this is Linus' mantra that
"we don't break userspace".   Since there can't possibly applications using fadvise()
on /dev/sgx_enclave, the kernel can define new requirements for using fadvise()
without breaking userspace.

> Userspace must pass the correct pgoff to all mmaps() in order to use file-based
> ABIs for SGX.
> 
> ( Because I think other drivers/subsystems doesn't have such limitation.  For
> instance, for a normal file, userspace can have two mmap()s mapping to the same
> offset of the same file but with different address, and fadvice() should work
> for both mappings.
>
> A VMA-based "has_mismatched_offset" seems more reasonable for SGX but as I
> mentioned previously it may not easy to do from SGX driver. )
> 
> So I am not sure whether this belongs to "breaking existing ABI"?

I think you're fixated too much on precisely defining the concept of ABI.  The
question that you really want to ask is "could change XYZ break userspace?"

> If the current ABI is we _require_ pgoff being zero, then the "encl-
> >has_mismatched_offsets" seems is breaking this ABI.
> 
> If the current ABI is we _ignore_ pgoff, then "encl->has_mismatched_offsets"
> isn't breaking existing ABI.  But in this case why do we need this at all?

Because SGX has users in the wild that don't set pgoff correctly.  Changing the
kernel to require an accurate pgoff would break those users.

> SGX driver just follows the existing file-based ABIs:
> 
> If userspace wants to use file-based ABIs for some VMA, it needs to pass the
> correct pgoff.  If not, file-based ABIs won't work for that VMA, but your
> enclave may still work if you don't use file-based ABIs.
> 
> Sorry for being noisy, but does above make sense?
Huang, Kai June 28, 2023, 9:37 a.m. UTC | #38
On Tue, 2023-06-27 at 07:50 -0700, Sean Christopherson wrote:
> On Tue, Jun 27, 2023, Kai Huang wrote:
> > > > > 
> > > > > You can't build an ABI on assumptions.  E.g. even if userspace *intends* to behave,
> > > > > it wouldn't take much to compute the wrong offset (math is hard).
> > > > 
> > > > But I don't think we have an well established ABI now?  Nothing is documented.
> > > 
> > > Heh, just because the ABI isn't formally documented doesn't mean it doesn't exist.
> > > In fact, the most problematic ABIs are the ones that aren't documented.
> > > 
> > > > 
> > 
> > Sure.  But just want to make sure, what is the current SGX driver ABI (around
> > mmap()) from your perspective?
> 
> To be clear, it's not my perspective, it's simply what the kernel actually does.

Sure.  I was just trying to hear your thoughts :)

> 
> > Is it "SGX driver _requires_ pgoff to be 0 for non-ELRANGE-reserve mmap()", or
> > "SGX driver _ignores_ pgoff"?
> 
> Unless there are checks hiding somewhere, it's the latter.  You might be able to
> get away with changing the kernel to require pgoff to be '0', i.e. if literally
> all users pass in '0', but proving that all users pass '0' is extremely difficult.
> And I don't see any value in requiring pgoff to be '0' for "legacy" users.

I certainly hate to enforce kernel to "require" 0 pgoff from userspace.  I want
to get rid of it.

I believe we both agree "SGX driver _ignores_ pgoff" is the current ABI.

[...]

> 
> I think you're fixated too much on precisely defining the concept of ABI.  
> 

Probably :)

> The
> question that you really want to ask is "could change XYZ break userspace?"

Agreed.

But since "encl->has_mismatched_offsets" is sort of new ABI, I think we need to
be careful otherwise in the future we may hit this kinda nasty issue again.

Here's my thoughts:

(Again, let's forget about mmap(/dev/sgx_enclave) to reserve ELRANGE for now.)

1) Current ABI is SGX driver _ignores_ pgoff for mmap(/dev/sgx_enclave)s (but
requires MAP_FIXED).

2) Therefore, "passing the correct pgoff" in new userspace app doesn't break the
current ABI.  If the new app chooses to pass the correct pgoff, it will work.

3) With additional support of sgx_fadvice(WILLNEED) within the driver, the new
app can use madvice(WILLNEED) if it passes the correct pgoff when mmap().  If
wrong pgoff is passed, then madvice(WILLNEED) will work unexpectedly and could
result in enclave being killed.  It's userspace app's responsibility to make
sure the correctness, not the driver's.

4) Old SGX apps which don't use file-based ABIs and still pass 0 pgoff should
continue to work.  No break of old apps either.

5) We encourage new apps to always pass the correct pgoff instead of passing 0.

6) If needed, we can modify sgx_mmap() to relax the needing to use MAP_FIXED,
but return the enclave's address "based on pgoff from userspace".  This
effectively provides additional mmap() ABI for userspace while not breaking the
existing MAP_FIXED ABI.

In this way, we don't need additional manipulation/fixing up of VMA's pgoff in
the driver.  There's no change to existing ABI either.

[...]


> Because SGX has users in the wild that don't set pgoff correctly.  Changing the
> kernel to require an accurate pgoff would break those users.

Only require an accurate pgoff if userspace wants to use file-based ABIs for the
relevant VMAs.  The old apps which pass 0 pgoff should just continue to work.
Sean Christopherson June 28, 2023, 2:57 p.m. UTC | #39
On Wed, Jun 28, 2023, Kai Huang wrote:
> On Tue, 2023-06-27 at 07:50 -0700, Sean Christopherson wrote:
> > The question that you really want to ask is "could change XYZ break
> > userspace?"
> 
> Agreed.
> 
> But since "encl->has_mismatched_offsets" is sort of new ABI, I think we need to
> be careful otherwise in the future we may hit this kinda nasty issue again.
> 
> Here's my thoughts:
> 
> (Again, let's forget about mmap(/dev/sgx_enclave) to reserve ELRANGE for now.)
> 
> 1) Current ABI is SGX driver _ignores_ pgoff for mmap(/dev/sgx_enclave)s

Yes.

> (but requires MAP_FIXED).

No, SGX doesn't require MAP_FIXED.  The requirements of ELRANGE make it extremely
unlikely that mmap() will succeed, but it's not a strict requirement. 

> 2) Therefore,�"passing the correct pgoff" in new userspace app doesn't break the
> current ABI.  If the new app chooses to pass the correct pgoff, it will work.

Yep.

> 3) With additional support of sgx_fadvice(WILLNEED) within the driver, the new
> app can use madvice(WILLNEED) if it passes the correct pgoff when mmap().  If
> wrong pgoff is passed, then madvice(WILLNEED) will work unexpectedly and could
> result in enclave being killed.  It's userspace app's responsibility to make
> sure the correctness, not the driver's.

Hmm, yeah, it's probably ok to lean on documentation if userspace screws up.  I
generally prefer explicit errors over "undefined behavior", but I don't have a
super strong opinion, especially since this isn't my area these days :-) 

> 4) Old SGX apps which don't use file-based ABIs and still pass 0 pgoff should
> continue to work.  No break of old apps either.

Yep.
 
> 5) We encourage new apps to always pass the correct pgoff instead of passing 0.
> 
> 6) If needed, we can modify sgx_mmap() to relax the needing to use MAP_FIXED,
> but return the enclave's address "based on pgoff from userspace".

As above, this isn't a relaxation of anything since MAP_FIXED isn't strictly
required, it's simply additional functionality.

> This effectively provides additional mmap() ABI for userspace while not
> breaking the existing MAP_FIXED ABI.

I don't think you want to do this.  The MAP_FIXED case won't be affected, but the
address provided to mmap() is also used as a hint in the !MAP_FIXED case,, i.e.
MAP_FIXED just means use *exactly* this address and don't try anything else.  It's
unlikely, but not _that_ unlikely, that there are userspace applications that
specify the exact address without MAP_FIXED and without the correct pgoff.

If there were more to gain by steering mmap() based solely on pgoff, then it might
be worth trying, but this doesn't seem to provide much value to userspace since
the virtual address of any given enclave mapping is mission critical, e.g. it seems
unlikely that userspace wouldn't already know the virtual address it needs.
Huang, Kai June 29, 2023, 3:10 a.m. UTC | #40
On Wed, 2023-06-28 at 07:57 -0700, Sean Christopherson wrote:
> On Wed, Jun 28, 2023, Kai Huang wrote:
> > On Tue, 2023-06-27 at 07:50 -0700, Sean Christopherson wrote:
> > > The question that you really want to ask is "could change XYZ break
> > > userspace?"
> > 
> > Agreed.
> > 
> > But since "encl->has_mismatched_offsets" is sort of new ABI, I think we need to
> > be careful otherwise in the future we may hit this kinda nasty issue again.
> > 
> > Here's my thoughts:
> > 
> > (Again, let's forget about mmap(/dev/sgx_enclave) to reserve ELRANGE for now.)
> > 
> > 1) Current ABI is SGX driver _ignores_ pgoff for mmap(/dev/sgx_enclave)s
> 
> Yes.
> 
> > (but requires MAP_FIXED).
> 
> No, SGX doesn't require MAP_FIXED.  The requirements of ELRANGE make it extremely
> unlikely that mmap() will succeed, but it's not a strict requirement. 

Looks w/o MAP_FIXED, the driver just uses the generic mm->get_unmapped_area() to
return the address, which doesn't guarantee the right address will be returned
at all.  Especially when ELRANGE is reserved via mmap(NULL), the
mmap(/dev/sgx_enclave) will not return the correct address no matter what pgoff
is used IIUC.

static unsigned long sgx_get_unmapped_area(struct file *file,
                                           unsigned long addr,
                                           unsigned long len,
                                           unsigned long pgoff,
                                           unsigned long flags)
{
        if ((flags & MAP_TYPE) == MAP_PRIVATE)
                return -EINVAL;

        if (flags & MAP_FIXED)
                return addr;

        return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
}

So to me userspace has to use MAP_FIXED to get the correct address.

> 
> > 2) Therefore,�"passing the correct pgoff" in new userspace app doesn't break the
> > current ABI.  If the new app chooses to pass the correct pgoff, it will work.
> 
> Yep.
> 
> > 3) With additional support of sgx_fadvice(WILLNEED) within the driver, the new
> > app can use madvice(WILLNEED) if it passes the correct pgoff when mmap().  If
> > wrong pgoff is passed, then madvice(WILLNEED) will work unexpectedly and could
> > result in enclave being killed.  It's userspace app's responsibility to make
> > sure the correctness, not the driver's.
> 
> Hmm, yeah, it's probably ok to lean on documentation if userspace screws up.  I
> generally prefer explicit errors over "undefined behavior", but I don't have a
> super strong opinion, especially since this isn't my area these days :-) 

My argument is for normal file operations, if you use madvice(WILLNEED) but
didn't mmap() with the correct pgoff, you will end up with kernel acting on the
wrong portion of the file (which may not result in fatal error, but certainly
not the correct thing from userspace's perspective).

So kernel doesn't guarantee anything here, but depends on userspace to do things
correctly.

> 
> > 4) Old SGX apps which don't use file-based ABIs and still pass 0 pgoff should
> > continue to work.  No break of old apps either.
> 
> Yep.
>  
> > 5) We encourage new apps to always pass the correct pgoff instead of passing 0.
> > 
> > 6) If needed, we can modify sgx_mmap() to relax the needing to use MAP_FIXED,
> > but return the enclave's address "based on pgoff from userspace".
> 
> As above, this isn't a relaxation of anything since MAP_FIXED isn't strictly
> required, it's simply additional functionality.
> 
> > This effectively provides additional mmap() ABI for userspace while not
> > breaking the existing MAP_FIXED ABI.
> 
> I don't think you want to do this.  The MAP_FIXED case won't be affected, but the
> address provided to mmap() is also used as a hint in the !MAP_FIXED case,, i.e.
> MAP_FIXED just means use *exactly* this address and don't try anything else.  It's
> unlikely, but not _that_ unlikely, that there are userspace applications that
> specify the exact address without MAP_FIXED and without the correct pgoff.
> 
> If there were more to gain by steering mmap() based solely on pgoff, then it might
> be worth trying, but this doesn't seem to provide much value to userspace since
> the virtual address of any given enclave mapping is mission critical, e.g. it seems
> unlikely that userspace wouldn't already know the virtual address it needs.

I think we can forget about this discussion right now, because it isn't strictly
related to the problem that we want to clarify at this moment.  My bad to bring
it up :)
Sean Christopherson June 29, 2023, 2:23 p.m. UTC | #41
On Thu, Jun 29, 2023, Kai Huang wrote:
> On Wed, 2023-06-28 at 07:57 -0700, Sean Christopherson wrote:
> > On Wed, Jun 28, 2023, Kai Huang wrote:
> > > (but requires MAP_FIXED).
> > 
> > No, SGX doesn't require MAP_FIXED.  The requirements of ELRANGE make it extremely
> > unlikely that mmap() will succeed, but it's not a strict requirement. 
> 
> Looks w/o MAP_FIXED, the driver just uses the generic mm->get_unmapped_area() to
> return the address, which doesn't guarantee the right address will be returned
> at all.  Especially when ELRANGE is reserved via mmap(NULL), the
> mmap(/dev/sgx_enclave) will not return the correct address no matter what pgoff
> is used IIUC.
> 
> static unsigned long sgx_get_unmapped_area(struct file *file,
>                                            unsigned long addr,
>                                            unsigned long len,
>                                            unsigned long pgoff,
>                                            unsigned long flags)
> {
>         if ((flags & MAP_TYPE) == MAP_PRIVATE)
>                 return -EINVAL;
> 
>         if (flags & MAP_FIXED)
>                 return addr;
> 
>         return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> }
> 
> So to me userspace has to use MAP_FIXED to get the correct address.

No.  As called out below, @addr is still used as a fairly strong hint:

unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			  const unsigned long len, const unsigned long pgoff,
			  const unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	unsigned long addr = addr0;
	struct vm_unmapped_area_info info;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* No address checking. See comment at mmap_address_hint_valid() */
	if (flags & MAP_FIXED)
		return addr;

	/* for MAP_32BIT mappings we force the legacy mmap base */
	if (!in_32bit_syscall() && (flags & MAP_32BIT))
		goto bottomup;

	/* requesting a specific address */  <==================================
	if (addr) {
		addr &= PAGE_MASK;
		if (!mmap_address_hint_valid(addr, len))
			goto get_unmapped_area;

		vma = find_vma(mm, addr);
		if (!vma || addr + len <= vm_start_gap(vma))
			return addr;
	}

	...
}


In practice, I expect most/all userspace implementations do use MAP_FIXED, but
it's not strictly required.

> > > 2) Therefore,�"passing the correct pgoff" in new userspace app doesn't break the
> > > current ABI.  If the new app chooses to pass the correct pgoff, it will work.
> > 
> > Yep.
> > 
> > > 3) With additional support of sgx_fadvice(WILLNEED) within the driver, the new
> > > app can use madvice(WILLNEED) if it passes the correct pgoff when mmap().  If
> > > wrong pgoff is passed, then madvice(WILLNEED) will work unexpectedly and could
> > > result in enclave being killed.  It's userspace app's responsibility to make
> > > sure the correctness, not the driver's.
> > 
> > Hmm, yeah, it's probably ok to lean on documentation if userspace screws up.  I
> > generally prefer explicit errors over "undefined behavior", but I don't have a
> > super strong opinion, especially since this isn't my area these days :-) 
> 
> My argument is for normal file operations, if you use madvice(WILLNEED) but
> didn't mmap() with the correct pgoff, you will end up with kernel acting on the
> wrong portion of the file (which may not result in fatal error, but certainly
> not the correct thing from userspace's perspective).
> 
> So kernel doesn't guarantee anything here, but depends on userspace to do things
> correctly.

Fair enough.

> > > 4) Old SGX apps which don't use file-based ABIs and still pass 0 pgoff should
> > > continue to work.  No break of old apps either.
> > 
> > Yep.
> >  
> > > 5) We encourage new apps to always pass the correct pgoff instead of passing 0.
> > > 
> > > 6) If needed, we can modify sgx_mmap() to relax the needing to use MAP_FIXED,
> > > but return the enclave's address "based on pgoff from userspace".
> > 
> > As above, this isn't a relaxation of anything since MAP_FIXED isn't strictly
> > required, it's simply additional functionality.
> > 
> > > This effectively provides additional mmap() ABI for userspace while not
> > > breaking the existing MAP_FIXED ABI.
> > 
> > I don't think you want to do this.  The MAP_FIXED case won't be affected, but the
> > address provided to mmap() is also used as a hint in the !MAP_FIXED case,, i.e.
> > MAP_FIXED just means use *exactly* this address and don't try anything else.  It's
> > unlikely, but not _that_ unlikely, that there are userspace applications that
> > specify the exact address without MAP_FIXED and without the correct pgoff.
Huang, Kai June 29, 2023, 11:29 p.m. UTC | #42
On Thu, 2023-06-29 at 07:23 -0700, Sean Christopherson wrote:
> On Thu, Jun 29, 2023, Kai Huang wrote:
> > On Wed, 2023-06-28 at 07:57 -0700, Sean Christopherson wrote:
> > > On Wed, Jun 28, 2023, Kai Huang wrote:
> > > > (but requires MAP_FIXED).
> > > 
> > > No, SGX doesn't require MAP_FIXED.  The requirements of ELRANGE make it extremely
> > > unlikely that mmap() will succeed, but it's not a strict requirement. 
> > 
> > Looks w/o MAP_FIXED, the driver just uses the generic mm->get_unmapped_area() to
> > return the address, which doesn't guarantee the right address will be returned
> > at all.  Especially when ELRANGE is reserved via mmap(NULL), the
> > mmap(/dev/sgx_enclave) will not return the correct address no matter what pgoff
> > is used IIUC.
> > 
> > static unsigned long sgx_get_unmapped_area(struct file *file,
> >                                            unsigned long addr,
> >                                            unsigned long len,
> >                                            unsigned long pgoff,
> >                                            unsigned long flags)
> > {
> >         if ((flags & MAP_TYPE) == MAP_PRIVATE)
> >                 return -EINVAL;
> > 
> >         if (flags & MAP_FIXED)
> >                 return addr;
> > 
> >         return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> > }
> > 
> > So to me userspace has to use MAP_FIXED to get the correct address.
> 
> No.  As called out below, @addr is still used as a fairly strong hint:
> 
> unsigned long
> arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> 			  const unsigned long len, const unsigned long pgoff,
> 			  const unsigned long flags)
> {
> 	struct vm_area_struct *vma;
> 	struct mm_struct *mm = current->mm;
> 	unsigned long addr = addr0;
> 	struct vm_unmapped_area_info info;
> 
> 	/* requested length too big for entire address space */
> 	if (len > TASK_SIZE)
> 		return -ENOMEM;
> 
> 	/* No address checking. See comment at mmap_address_hint_valid() */
> 	if (flags & MAP_FIXED)
> 		return addr;
> 
> 	/* for MAP_32BIT mappings we force the legacy mmap base */
> 	if (!in_32bit_syscall() && (flags & MAP_32BIT))
> 		goto bottomup;
> 
> 	/* requesting a specific address */  <==================================
> 	if (addr) {
> 		addr &= PAGE_MASK;
> 		if (!mmap_address_hint_valid(addr, len))
> 			goto get_unmapped_area;
> 
> 		vma = find_vma(mm, addr);
> 		if (!vma || addr + len <= vm_start_gap(vma))
> 			return addr;
> 	}
> 
> 	...
> }
> 
> 
> In practice, I expect most/all userspace implementations do use MAP_FIXED, but
> it's not strictly required.
> 

Yeah I agree it's a strong hint, but from ABI's perspective, I think even a
strong hint isn't good enough.  If there's no guarantee userspace can 100% get
the correct enclave address, then userspace will always need to verify the
return address and if not do mmap() again.

Btw, my reading of above function is if @addr hint doesn't work if the ELRANGE
is reserved using mmap(NULL), because below code will always NOT return addr:

		vma = find_vma(mm, addr);	<--- A valid VMA will be found
		if (!vma || addr + len <= vm_start_gap(vma))	<--  This check
								 will be false
			return addr;

This is kinda reasonable because ELRANGE via mmap(NULL) doesn't have a file
backed, so the mmap(/dev/sgx_enclave) should never return an overlapping address
even we pass a addr within ELRANGE.

But my true argument is from ABI's perspective, although @addr is a hint, but
it's not guaranteed the *exact* addr will be returned  (see man page below):

"
If addr is not NULL, then the kernel takes it as a hint about where to place the
mapping; ...... If another mapping already exists there, the kernel picks a new
address that may or may not depend on the hint.
"

But SGX usrespace needs a *exact* address.  MAP_FIXED is the only ABI can
guarantee this.
> > >
Sean Christopherson June 30, 2023, 12:14 a.m. UTC | #43
On Thu, Jun 29, 2023, Kai Huang wrote:
> But SGX usrespace needs a *exact* address.  MAP_FIXED is the only ABI can
> guarantee this.

Yes, but you're assuming that userspace is sane and robust.  It's *very* possible
that there exists userspace that doesn't specify MAP_FIXED, but works anyways
because their setup doesn't cause conflicts and so the hint is sufficient.
Huang, Kai June 30, 2023, 12:56 a.m. UTC | #44
On Thu, 2023-06-29 at 17:14 -0700, Sean Christopherson wrote:
> On Thu, Jun 29, 2023, Kai Huang wrote:
> > But SGX usrespace needs a *exact* address.  MAP_FIXED is the only ABI can
> > guarantee this.
> 
> Yes, but you're assuming that userspace is sane and robust.  It's *very* possible
> that there exists userspace that doesn't specify MAP_FIXED, but works anyways
> because their setup doesn't cause conflicts and so the hint is sufficient.

I must have missed something.  ELRANGE is the first VMA that userspace needs to
do, and given enclave address must be within ELRANGE, the subsequent mmap()s
will always conflict with the ELRANGE VMA?  No?
Jarkko Sakkinen June 30, 2023, 1:54 a.m. UTC | #45
On Fri Jun 30, 2023 at 2:29 AM EEST, Huang, Kai wrote:
> On Thu, 2023-06-29 at 07:23 -0700, Sean Christopherson wrote:
> > On Thu, Jun 29, 2023, Kai Huang wrote:
> > > On Wed, 2023-06-28 at 07:57 -0700, Sean Christopherson wrote:
> > > > On Wed, Jun 28, 2023, Kai Huang wrote:
> > > > > (but requires MAP_FIXED).
> > > > 
> > > > No, SGX doesn't require MAP_FIXED.  The requirements of ELRANGE make it extremely
> > > > unlikely that mmap() will succeed, but it's not a strict requirement. 
> > > 
> > > Looks w/o MAP_FIXED, the driver just uses the generic mm->get_unmapped_area() to
> > > return the address, which doesn't guarantee the right address will be returned
> > > at all.  Especially when ELRANGE is reserved via mmap(NULL), the
> > > mmap(/dev/sgx_enclave) will not return the correct address no matter what pgoff
> > > is used IIUC.
> > > 
> > > static unsigned long sgx_get_unmapped_area(struct file *file,
> > >                                            unsigned long addr,
> > >                                            unsigned long len,
> > >                                            unsigned long pgoff,
> > >                                            unsigned long flags)
> > > {
> > >         if ((flags & MAP_TYPE) == MAP_PRIVATE)
> > >                 return -EINVAL;
> > > 
> > >         if (flags & MAP_FIXED)
> > >                 return addr;
> > > 
> > >         return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> > > }
> > > 
> > > So to me userspace has to use MAP_FIXED to get the correct address.
> > 
> > No.  As called out below, @addr is still used as a fairly strong hint:
> > 
> > unsigned long
> > arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> > 			  const unsigned long len, const unsigned long pgoff,
> > 			  const unsigned long flags)
> > {
> > 	struct vm_area_struct *vma;
> > 	struct mm_struct *mm = current->mm;
> > 	unsigned long addr = addr0;
> > 	struct vm_unmapped_area_info info;
> > 
> > 	/* requested length too big for entire address space */
> > 	if (len > TASK_SIZE)
> > 		return -ENOMEM;
> > 
> > 	/* No address checking. See comment at mmap_address_hint_valid() */
> > 	if (flags & MAP_FIXED)
> > 		return addr;
> > 
> > 	/* for MAP_32BIT mappings we force the legacy mmap base */
> > 	if (!in_32bit_syscall() && (flags & MAP_32BIT))
> > 		goto bottomup;
> > 
> > 	/* requesting a specific address */  <==================================
> > 	if (addr) {
> > 		addr &= PAGE_MASK;
> > 		if (!mmap_address_hint_valid(addr, len))
> > 			goto get_unmapped_area;
> > 
> > 		vma = find_vma(mm, addr);
> > 		if (!vma || addr + len <= vm_start_gap(vma))
> > 			return addr;
> > 	}
> > 
> > 	...
> > }
> > 
> > 
> > In practice, I expect most/all userspace implementations do use MAP_FIXED, but
> > it's not strictly required.
> > 
>
> Yeah I agree it's a strong hint, but from ABI's perspective, I think even a
> strong hint isn't good enough.  If there's no guarantee userspace can 100% get
> the correct enclave address, then userspace will always need to verify the
> return address and if not do mmap() again.
>
> Btw, my reading of above function is if @addr hint doesn't work if the ELRANGE
> is reserved using mmap(NULL), because below code will always NOT return addr:
>
> 		vma = find_vma(mm, addr);	<--- A valid VMA will be found
> 		if (!vma || addr + len <= vm_start_gap(vma))	<--  This check
> 								 will be false
> 			return addr;
>
> This is kinda reasonable because ELRANGE via mmap(NULL) doesn't have a file
> backed, so the mmap(/dev/sgx_enclave) should never return an overlapping address
> even we pass a addr within ELRANGE.
>
> But my true argument is from ABI's perspective, although @addr is a hint, but
> it's not guaranteed the *exact* addr will be returned  (see man page below):
>
> "
> If addr is not NULL, then the kernel takes it as a hint about where to place the
> mapping; ...... If another mapping already exists there, the kernel picks a new
> address that may or may not depend on the hint.
> "
>
> But SGX usrespace needs a *exact* address.  MAP_FIXED is the only ABI can
> guarantee this.

A practical point of view: I don't see much any other point with Intel
SDK than provide environment to compile and run attestation shenanigans.

Is there something people *actually* use it in the cloud?

I'm starting to miss the perspective on why waste so much energy on this?
Feels fuzzy.

BR, Jarkko
Jarkko Sakkinen June 30, 2023, 1:57 a.m. UTC | #46
On Fri Jun 30, 2023 at 4:54 AM EEST, Jarkko Sakkinen wrote:
> On Fri Jun 30, 2023 at 2:29 AM EEST, Huang, Kai wrote:
> > On Thu, 2023-06-29 at 07:23 -0700, Sean Christopherson wrote:
> > > On Thu, Jun 29, 2023, Kai Huang wrote:
> > > > On Wed, 2023-06-28 at 07:57 -0700, Sean Christopherson wrote:
> > > > > On Wed, Jun 28, 2023, Kai Huang wrote:
> > > > > > (but requires MAP_FIXED).
> > > > > 
> > > > > No, SGX doesn't require MAP_FIXED.  The requirements of ELRANGE make it extremely
> > > > > unlikely that mmap() will succeed, but it's not a strict requirement. 
> > > > 
> > > > Looks w/o MAP_FIXED, the driver just uses the generic mm->get_unmapped_area() to
> > > > return the address, which doesn't guarantee the right address will be returned
> > > > at all.  Especially when ELRANGE is reserved via mmap(NULL), the
> > > > mmap(/dev/sgx_enclave) will not return the correct address no matter what pgoff
> > > > is used IIUC.
> > > > 
> > > > static unsigned long sgx_get_unmapped_area(struct file *file,
> > > >                                            unsigned long addr,
> > > >                                            unsigned long len,
> > > >                                            unsigned long pgoff,
> > > >                                            unsigned long flags)
> > > > {
> > > >         if ((flags & MAP_TYPE) == MAP_PRIVATE)
> > > >                 return -EINVAL;
> > > > 
> > > >         if (flags & MAP_FIXED)
> > > >                 return addr;
> > > > 
> > > >         return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> > > > }
> > > > 
> > > > So to me userspace has to use MAP_FIXED to get the correct address.
> > > 
> > > No.  As called out below, @addr is still used as a fairly strong hint:
> > > 
> > > unsigned long
> > > arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
> > > 			  const unsigned long len, const unsigned long pgoff,
> > > 			  const unsigned long flags)
> > > {
> > > 	struct vm_area_struct *vma;
> > > 	struct mm_struct *mm = current->mm;
> > > 	unsigned long addr = addr0;
> > > 	struct vm_unmapped_area_info info;
> > > 
> > > 	/* requested length too big for entire address space */
> > > 	if (len > TASK_SIZE)
> > > 		return -ENOMEM;
> > > 
> > > 	/* No address checking. See comment at mmap_address_hint_valid() */
> > > 	if (flags & MAP_FIXED)
> > > 		return addr;
> > > 
> > > 	/* for MAP_32BIT mappings we force the legacy mmap base */
> > > 	if (!in_32bit_syscall() && (flags & MAP_32BIT))
> > > 		goto bottomup;
> > > 
> > > 	/* requesting a specific address */  <==================================
> > > 	if (addr) {
> > > 		addr &= PAGE_MASK;
> > > 		if (!mmap_address_hint_valid(addr, len))
> > > 			goto get_unmapped_area;
> > > 
> > > 		vma = find_vma(mm, addr);
> > > 		if (!vma || addr + len <= vm_start_gap(vma))
> > > 			return addr;
> > > 	}
> > > 
> > > 	...
> > > }
> > > 
> > > 
> > > In practice, I expect most/all userspace implementations do use MAP_FIXED, but
> > > it's not strictly required.
> > > 
> >
> > Yeah I agree it's a strong hint, but from ABI's perspective, I think even a
> > strong hint isn't good enough.  If there's no guarantee userspace can 100% get
> > the correct enclave address, then userspace will always need to verify the
> > return address and if not do mmap() again.
> >
> > Btw, my reading of above function is if @addr hint doesn't work if the ELRANGE
> > is reserved using mmap(NULL), because below code will always NOT return addr:
> >
> > 		vma = find_vma(mm, addr);	<--- A valid VMA will be found
> > 		if (!vma || addr + len <= vm_start_gap(vma))	<--  This check
> > 								 will be false
> > 			return addr;
> >
> > This is kinda reasonable because ELRANGE via mmap(NULL) doesn't have a file
> > backed, so the mmap(/dev/sgx_enclave) should never return an overlapping address
> > even we pass a addr within ELRANGE.
> >
> > But my true argument is from ABI's perspective, although @addr is a hint, but
> > it's not guaranteed the *exact* addr will be returned  (see man page below):
> >
> > "
> > If addr is not NULL, then the kernel takes it as a hint about where to place the
> > mapping; ...... If another mapping already exists there, the kernel picks a new
> > address that may or may not depend on the hint.
> > "
> >
> > But SGX usrespace needs a *exact* address.  MAP_FIXED is the only ABI can
> > guarantee this.
>
> A practical point of view: I don't see much any other point with Intel
> SDK than provide environment to compile and run attestation shenanigans.
>
> Is there something people *actually* use it in the cloud?
>
> I'm starting to miss the perspective on why waste so much energy on this?
> Feels fuzzy.

Does this map to any of the runtimes that people *actually* use for
their workloads?

BR, Jarkko
Huang, Kai June 30, 2023, 4:26 a.m. UTC | #47
> 
> Does this map to any of the runtimes that people *actually* use for
> their workloads?
> 

Haitao wants to use madvice(WILLNEED) to speed up EUG flow.  For the performance
data please look at the cover letter of this series.

The whole discussion was about to get mmap(/dev/sgx_enclave) semantics clarified
so this series can move on and also future life can be easier.
Jarkko Sakkinen June 30, 2023, 9:35 a.m. UTC | #48
On Fri Jun 30, 2023 at 7:26 AM EEST, Huang, Kai wrote:
>
> > 
> > Does this map to any of the runtimes that people *actually* use for
> > their workloads?
> > 
>
> Haitao wants to use madvice(WILLNEED) to speed up EUG flow.  For the performance
> data please look at the cover letter of this series.

Right thanks for reminding! Now in the same page.

> The whole discussion was about to get mmap(/dev/sgx_enclave) semantics clarified
> so this series can move on and also future life can be easier.

+1

BR, Jarkko
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index aa9b8b868867..3a88daddc1a1 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -2,6 +2,7 @@ 
 /*  Copyright(c) 2016-20 Intel Corporation. */
 
 #include <linux/acpi.h>
+#include <linux/fadvise.h>
 #include <linux/miscdevice.h>
 #include <linux/mman.h>
 #include <linux/security.h>
@@ -9,6 +10,7 @@ 
 #include <asm/traps.h>
 #include "driver.h"
 #include "encl.h"
+#include "encls.h"
 
 u64 sgx_attributes_reserved_mask;
 u64 sgx_xfrm_reserved_mask = ~0x3;
@@ -97,10 +99,81 @@  static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_ops = &sgx_vm_ops;
 	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
 	vma->vm_private_data = encl;
+	vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base);
 
 	return 0;
 }
 
+/*
+ * Add new pages to the enclave sequentially with ENCLS[EAUG] for the WILLNEED advice.
+ * Only do this to existing VMAs in the same enclave and reject the request otherwise.
+ */
+static int sgx_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+	struct sgx_encl *encl = file->private_data;
+	unsigned long start = offset + encl->base;
+	struct vm_area_struct *vma = NULL;
+	unsigned long end = start + len;
+	unsigned long pos;
+	int ret = -EINVAL;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SGX2))
+		return -EINVAL;
+	/* Only support WILLNEED */
+	if (advice != POSIX_FADV_WILLNEED)
+		return -EINVAL;
+
+	if (offset + len < offset)
+		return -EINVAL;
+	if (start < encl->base)
+		return -EINVAL;
+	if (end < start)
+		return -EINVAL;
+	if (end > encl->base + encl->size)
+		return -EINVAL;
+
+	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+		return -EINVAL;
+
+	mmap_read_lock(current->mm);
+
+	vma = find_vma(current->mm, start);
+	if (!vma)
+		goto unlock;
+	if (vma->vm_private_data != encl)
+		goto unlock;
+
+	pos = start;
+	if (pos < vma->vm_start || end > vma->vm_end) {
+		/* Don't allow any gaps */
+		goto unlock;
+	}
+
+	/* Here: vm_start <= pos < end <= vm_end */
+	while (pos < end) {
+		if (xa_load(&encl->page_array, PFN_DOWN(pos)))
+			continue;
+		if (signal_pending(current)) {
+			if (pos == start)
+				ret = -ERESTARTSYS;
+			else
+				ret = -EINTR;
+			goto unlock;
+		}
+		ret = sgx_encl_eaug_page(vma, encl, pos);
+		/* It's OK to not finish */
+		if (ret)
+			break;
+		pos = pos + PAGE_SIZE;
+		cond_resched();
+	}
+	ret = 0;
+
+unlock:
+	mmap_read_unlock(current->mm);
+	return ret;
+}
+
 static unsigned long sgx_get_unmapped_area(struct file *file,
 					   unsigned long addr,
 					   unsigned long len,
@@ -133,6 +206,7 @@  static const struct file_operations sgx_encl_fops = {
 	.compat_ioctl		= sgx_compat_ioctl,
 #endif
 	.mmap			= sgx_mmap,
+	.fadvise		= sgx_fadvise,
 	.get_unmapped_area	= sgx_get_unmapped_area,
 };
 
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 0185c5ab48dd..592cfea4c9e4 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -299,20 +299,17 @@  struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
 }
 
 /**
- * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
- * @vma:	VMA obtained from fault info from where page is accessed
- * @encl:	enclave accessing the page
- * @addr:	address that triggered the page fault
+ * sgx_encl_eaug_page() - Dynamically add an EPC page to initialized enclave
+ * @vma:	the VMA into which the page is to be added
+ * @encl:	the enclave for which the page is to be added
+ * @addr:	the start address of the page to be added
  *
- * When an initialized enclave accesses a page with no backing EPC page
- * on a SGX2 system then the EPC can be added dynamically via the SGX2
- * ENCLS[EAUG] instruction.
- *
- * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
- * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
+ * Returns: 0 on EAUG success and PTE was installed successfully, -EBUSY for
+ * waiting on reclaimer to free EPC, -ENOMEM for out of RAM, -EFAULT for
+ * all other failures.
  */
-vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
-			      struct sgx_encl *encl, unsigned long addr)
+int sgx_encl_eaug_page(struct vm_area_struct *vma,
+		       struct sgx_encl *encl, unsigned long addr)
 {
 	vm_fault_t vmret = VM_FAULT_SIGBUS;
 	struct sgx_pageinfo pginfo = {0};
@@ -321,10 +318,10 @@  vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
 	struct sgx_va_page *va_page;
 	unsigned long phys_addr;
 	u64 secinfo_flags;
-	int ret;
+	int ret = -EFAULT;
 
 	if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
-		return VM_FAULT_SIGBUS;
+		return -EFAULT;
 
 	/*
 	 * Ignore internal permission checking for dynamically added pages.
@@ -335,21 +332,21 @@  vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
 	secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
 	encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
 	if (IS_ERR(encl_page))
-		return VM_FAULT_OOM;
+		return -ENOMEM;
 
 	mutex_lock(&encl->lock);
 
 	epc_page = sgx_alloc_epc_page(encl_page, false);
 	if (IS_ERR(epc_page)) {
 		if (PTR_ERR(epc_page) == -EBUSY)
-			vmret =  VM_FAULT_NOPAGE;
+			ret =  -EBUSY;
 		goto err_out_unlock;
 	}
 
 	va_page = sgx_encl_grow(encl, false);
 	if (IS_ERR(va_page)) {
 		if (PTR_ERR(va_page) == -EBUSY)
-			vmret = VM_FAULT_NOPAGE;
+			ret = -EBUSY;
 		goto err_out_epc;
 	}
 
@@ -362,16 +359,20 @@  vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
 	 * If ret == -EBUSY then page was created in another flow while
 	 * running without encl->lock
 	 */
-	if (ret)
+	if (ret) {
+		ret = -EFAULT;
 		goto err_out_shrink;
+	}
 
 	pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
 	pginfo.addr = encl_page->desc & PAGE_MASK;
 	pginfo.metadata = 0;
 
 	ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
-	if (ret)
+	if (ret) {
+		ret = -EFAULT;
 		goto err_out;
+	}
 
 	encl_page->encl = encl;
 	encl_page->epc_page = epc_page;
@@ -388,10 +389,10 @@  vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
 	vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
 	if (vmret != VM_FAULT_NOPAGE) {
 		mutex_unlock(&encl->lock);
-		return VM_FAULT_SIGBUS;
+		return -EFAULT;
 	}
 	mutex_unlock(&encl->lock);
-	return VM_FAULT_NOPAGE;
+	return 0;
 
 err_out:
 	xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
@@ -404,7 +405,7 @@  vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
 	mutex_unlock(&encl->lock);
 	kfree(encl_page);
 
-	return vmret;
+	return ret;
 }
 
 static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
@@ -434,8 +435,18 @@  static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
 	 * enclave that will be checked for right away.
 	 */
 	if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
-	    (!xa_load(&encl->page_array, PFN_DOWN(addr))))
-		return sgx_encl_eaug_page(vma, encl, addr);
+	    (!xa_load(&encl->page_array, PFN_DOWN(addr)))) {
+		switch (sgx_encl_eaug_page(vma, encl, addr)) {
+		case 0:
+		case -EBUSY:
+			return VM_FAULT_NOPAGE;
+		case -ENOMEM:
+			return VM_FAULT_OOM;
+		case -EFAULT:
+		default:
+			return VM_FAULT_SIGBUS;
+		}
+	}
 
 	mutex_lock(&encl->lock);
 
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index 9f19b06c3ae3..e5a507871fa3 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -125,7 +125,7 @@  struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
 					 unsigned long addr);
 struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
 void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
-vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
-			      struct sgx_encl *encl, unsigned long addr);
+int sgx_encl_eaug_page(struct vm_area_struct *vma,
+		       struct sgx_encl *encl, unsigned long addr);
 
 #endif /* _X86_ENCL_H */