Message ID | 20230128045529.15749-3-haitao.huang@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | x86/sgx: implement support for MADV_WILLNEED | expand |
On Fri, Jan 27, 2023 at 08:55:27PM -0800, Haitao Huang wrote: > Support madvise(..., MADV_WILLNEED) by adding EPC pages with EAUG in > the newly added fops->fadvise() callback implementation, sgx_fadvise(). > > Change the return type and values of the sgx_encl_eaug_page function > so that more specific error codes are returned for different treatment > by the page fault handler and the fadvise callback. > On any error, sgx_fadvise() will discontinue further operations > and return as normal. The page fault handler allows a PF retried > by returning VM_FAULT_NOPAGE in handling -EBUSY returned from > sgx_encl_eaug_page. > > Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com> > --- > arch/x86/kernel/cpu/sgx/driver.c | 74 ++++++++++++++++++++++++++++++++ > arch/x86/kernel/cpu/sgx/encl.c | 59 ++++++++++++++----------- > arch/x86/kernel/cpu/sgx/encl.h | 4 +- > 3 files changed, 111 insertions(+), 26 deletions(-) > > diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c > index aa9b8b868867..3a88daddc1a1 100644 > --- a/arch/x86/kernel/cpu/sgx/driver.c > +++ b/arch/x86/kernel/cpu/sgx/driver.c > @@ -2,6 +2,7 @@ > /* Copyright(c) 2016-20 Intel Corporation. */ > > #include <linux/acpi.h> > +#include <linux/fadvise.h> > #include <linux/miscdevice.h> > #include <linux/mman.h> > #include <linux/security.h> > @@ -9,6 +10,7 @@ > #include <asm/traps.h> > #include "driver.h" > #include "encl.h" > +#include "encls.h" > > u64 sgx_attributes_reserved_mask; > u64 sgx_xfrm_reserved_mask = ~0x3; > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) > vma->vm_ops = &sgx_vm_ops; > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; > vma->vm_private_data = encl; > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); > > return 0; > } > > +/* > + * Add new pages to the enclave sequentially with ENCLS[EAUG] for the WILLNEED advice. > + * Only do this to existing VMAs in the same enclave and reject the request otherwise. > + */ > +static int sgx_fadvise(struct file *file, loff_t offset, loff_t len, int advice) > +{ > + struct sgx_encl *encl = file->private_data; > + unsigned long start = offset + encl->base; > + struct vm_area_struct *vma = NULL; > + unsigned long end = start + len; > + unsigned long pos; > + int ret = -EINVAL; > + > + if (!cpu_feature_enabled(X86_FEATURE_SGX2)) > + return -EINVAL; > + /* Only support WILLNEED */ > + if (advice != POSIX_FADV_WILLNEED) > + return -EINVAL; > + > + if (offset + len < offset) > + return -EINVAL; > + if (start < encl->base) > + return -EINVAL; > + if (end < start) > + return -EINVAL; > + if (end > encl->base + encl->size) > + return -EINVAL; > + > + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) > + return -EINVAL; > + > + mmap_read_lock(current->mm); > + > + vma = find_vma(current->mm, start); > + if (!vma) > + goto unlock; > + if (vma->vm_private_data != encl) > + goto unlock; > + > + pos = start; > + if (pos < vma->vm_start || end > vma->vm_end) { > + /* Don't allow any gaps */ > + goto unlock; > + } > + > + /* Here: vm_start <= pos < end <= vm_end */ > + while (pos < end) { > + if (xa_load(&encl->page_array, PFN_DOWN(pos))) > + continue; > + if (signal_pending(current)) { > + if (pos == start) > + ret = -ERESTARTSYS; > + else > + ret = -EINTR; > + goto unlock; > + } > + ret = sgx_encl_eaug_page(vma, encl, pos); > + /* It's OK to not finish */ > + if (ret) > + break; > + pos = pos + PAGE_SIZE; > + cond_resched(); > + } > + ret = 0; > + > +unlock: > + mmap_read_unlock(current->mm); > + return ret; > +} > + > static unsigned long sgx_get_unmapped_area(struct file *file, > unsigned long addr, > unsigned long len, > @@ -133,6 +206,7 @@ static const struct file_operations sgx_encl_fops = { > .compat_ioctl = sgx_compat_ioctl, > #endif > .mmap = sgx_mmap, > + .fadvise = sgx_fadvise, > .get_unmapped_area = sgx_get_unmapped_area, > }; > > diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c > index 0185c5ab48dd..592cfea4c9e4 100644 > --- a/arch/x86/kernel/cpu/sgx/encl.c > +++ b/arch/x86/kernel/cpu/sgx/encl.c > @@ -299,20 +299,17 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, > } > > /** > - * sgx_encl_eaug_page() - Dynamically add page to initialized enclave > - * @vma: VMA obtained from fault info from where page is accessed > - * @encl: enclave accessing the page > - * @addr: address that triggered the page fault > + * sgx_encl_eaug_page() - Dynamically add an EPC page to initialized enclave > + * @vma: the VMA into which the page is to be added > + * @encl: the enclave for which the page is to be added > + * @addr: the start address of the page to be added > * > - * When an initialized enclave accesses a page with no backing EPC page > - * on a SGX2 system then the EPC can be added dynamically via the SGX2 > - * ENCLS[EAUG] instruction. > - * > - * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed > - * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. > + * Returns: 0 on EAUG success and PTE was installed successfully, -EBUSY for > + * waiting on reclaimer to free EPC, -ENOMEM for out of RAM, -EFAULT for > + * all other failures. > */ > -vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, > - struct sgx_encl *encl, unsigned long addr) > +int sgx_encl_eaug_page(struct vm_area_struct *vma, > + struct sgx_encl *encl, unsigned long addr) > { > vm_fault_t vmret = VM_FAULT_SIGBUS; > struct sgx_pageinfo pginfo = {0}; > @@ -321,10 +318,10 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, > struct sgx_va_page *va_page; > unsigned long phys_addr; > u64 secinfo_flags; > - int ret; > + int ret = -EFAULT; > > if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) > - return VM_FAULT_SIGBUS; > + return -EFAULT; > > /* > * Ignore internal permission checking for dynamically added pages. > @@ -335,21 +332,21 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, > secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; > encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); > if (IS_ERR(encl_page)) > - return VM_FAULT_OOM; > + return -ENOMEM; > > mutex_lock(&encl->lock); > > epc_page = sgx_alloc_epc_page(encl_page, false); > if (IS_ERR(epc_page)) { > if (PTR_ERR(epc_page) == -EBUSY) > - vmret = VM_FAULT_NOPAGE; > + ret = -EBUSY; > goto err_out_unlock; > } > > va_page = sgx_encl_grow(encl, false); > if (IS_ERR(va_page)) { > if (PTR_ERR(va_page) == -EBUSY) > - vmret = VM_FAULT_NOPAGE; > + ret = -EBUSY; > goto err_out_epc; > } > > @@ -362,16 +359,20 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, > * If ret == -EBUSY then page was created in another flow while > * running without encl->lock > */ > - if (ret) > + if (ret) { > + ret = -EFAULT; > goto err_out_shrink; > + } > > pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); > pginfo.addr = encl_page->desc & PAGE_MASK; > pginfo.metadata = 0; > > ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); > - if (ret) > + if (ret) { > + ret = -EFAULT; > goto err_out; > + } > > encl_page->encl = encl; > encl_page->epc_page = epc_page; > @@ -388,10 +389,10 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, > vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); > if (vmret != VM_FAULT_NOPAGE) { > mutex_unlock(&encl->lock); > - return VM_FAULT_SIGBUS; > + return -EFAULT; > } > mutex_unlock(&encl->lock); > - return VM_FAULT_NOPAGE; > + return 0; > > err_out: > xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); > @@ -404,7 +405,7 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, > mutex_unlock(&encl->lock); > kfree(encl_page); > > - return vmret; > + return ret; > } > > static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) > @@ -434,8 +435,18 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) > * enclave that will be checked for right away. > */ > if (cpu_feature_enabled(X86_FEATURE_SGX2) && > - (!xa_load(&encl->page_array, PFN_DOWN(addr)))) > - return sgx_encl_eaug_page(vma, encl, addr); > + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) { > + switch (sgx_encl_eaug_page(vma, encl, addr)) { > + case 0: > + case -EBUSY: > + return VM_FAULT_NOPAGE; > + case -ENOMEM: > + return VM_FAULT_OOM; > + case -EFAULT: > + default: > + return VM_FAULT_SIGBUS; > + } > + } > > mutex_lock(&encl->lock); > > diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h > index 9f19b06c3ae3..e5a507871fa3 100644 > --- a/arch/x86/kernel/cpu/sgx/encl.h > +++ b/arch/x86/kernel/cpu/sgx/encl.h > @@ -125,7 +125,7 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, > unsigned long addr); > struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); > void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); > -vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, > - struct sgx_encl *encl, unsigned long addr); > +int sgx_encl_eaug_page(struct vm_area_struct *vma, > + struct sgx_encl *encl, unsigned long addr); > > #endif /* _X86_ENCL_H */ > -- > 2.25.1 > Looks good to me! Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org> BR, Jarkko
On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) > vma->vm_ops = &sgx_vm_ops; > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; > vma->vm_private_data = encl; > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); > > return 0; > } Perhaps I am missing something, but above change looks weird. Conceptually, it doesn't/shouldn't belong to this series, which essentially preallocates and does EAUG EPC pages for a (or part of) given enclave. The EAUG logic should already be working for the normal fault path, which means the code change above either: 1) has been done at other place; 2) isn't needed. I have kinda forgotten the userspace sequence to create an enclave. If I recall correctly, you do below to create an enclave: 1) encl_fd = open("/dev/sgx_enclave"); 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); 3) IOCTL(ECREATE, encl_addr, encl_size); Would the above code change break the "mmap()" in above step 2?
Hi Kai On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> wrote: > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: >> @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct >> vm_area_struct *vma) >> vma->vm_ops = &sgx_vm_ops; >> vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; >> vma->vm_private_data = encl; >> + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); >> return 0; >> } > > Perhaps I am missing something, but above change looks weird. > Conceptually, it doesn't/shouldn't belong to this series, which > essentially > preallocates and does EAUG EPC pages for a (or part of) given enclave. > The EAUG > logic should already be working for the normal fault path, which means > the code > change above either: 1) has been done at other place; 2) isn't needed. > > I have kinda forgotten the userspace sequence to create an enclave. If > I recall > correctly, you do below to create an enclave: > > 1) encl_fd = open("/dev/sgx_enclave"); > 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); > 3) IOCTL(ECREATE, encl_addr, encl_size); > > Would the above code change break the "mmap()" in above step 2? > No, vm_pgoff was not used previously for enclave VMAs. I had to add this because the offset passed to sgx_fadvise is relative to file base and calculated in mm/madvise.c like this: offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); I had a comment in first version but removed it based on Jarkko's suggestion here: https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/ The original comments probably seemed redundant to the definitions of the vm_pgoff field and the fadvise interface. But let me know if we need add a more helpful version of comments or any suggestion on the comments. Thanks Haitao
On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote: > Hi Kai > > On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: > > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct > > > vm_area_struct *vma) > > > vma->vm_ops = &sgx_vm_ops; > > > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; > > > vma->vm_private_data = encl; > > > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); > > > return 0; > > > } > > > > Perhaps I am missing something, but above change looks weird. > > Conceptually, it doesn't/shouldn't belong to this series, which > > essentially > > preallocates and does EAUG EPC pages for a (or part of) given enclave. > > The EAUG > > logic should already be working for the normal fault path, which means > > the code > > change above either: 1) has been done at other place; 2) isn't needed. > > > > I have kinda forgotten the userspace sequence to create an enclave. If > > I recall > > correctly, you do below to create an enclave: > > > > 1) encl_fd = open("/dev/sgx_enclave"); > > 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); > > 3) IOCTL(ECREATE, encl_addr, encl_size); > > > > Would the above code change break the "mmap()" in above step 2? > > > > No, vm_pgoff was not used previously for enclave VMAs. I had to add this > because the offset passed to sgx_fadvise is relative to file base and > calculated in mm/madvise.c like this: > > offset = (loff_t)(start - vma->vm_start) > + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); But shouldn't 'offset is relative to the file base' be conceptually correct from the fadvice()'s point of view? I think you should do: encl_offset = offset + encl->base; inside sgx_fadvice()? > > I had a comment in first version but removed it based on Jarkko's > suggestion here: https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/ > > The original comments probably seemed redundant to the definitions of the > vm_pgoff field and the fadvise interface. But let me know if we need add a > more helpful version of comments or any suggestion on the comments. I still think this code change is wrong. For instance, IIUC, it at least breaks the case where enclave hasn't been created/initialized, where encl->base == 0 (although normal code path doesn't use vm_pgoff, conceptually it's still wrong IIUC). Maybe I am missing something?
On Tue, 14 Feb 2023 14:54:53 -0600, Huang, Kai <kai.huang@intel.com> wrote: > On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote: >> Hi Kai >> >> On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> >> wrote: >> >> > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: >> > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct >> > > vm_area_struct *vma) >> > > vma->vm_ops = &sgx_vm_ops; >> > > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; >> > > vma->vm_private_data = encl; >> > > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); >> > > return 0; >> > > } >> > >> > Perhaps I am missing something, but above change looks weird. >> > Conceptually, it doesn't/shouldn't belong to this series, which >> > essentially >> > preallocates and does EAUG EPC pages for a (or part of) given enclave. >> > The EAUG >> > logic should already be working for the normal fault path, which means >> > the code >> > change above either: 1) has been done at other place; 2) isn't needed. >> > >> > I have kinda forgotten the userspace sequence to create an enclave. >> If >> > I recall >> > correctly, you do below to create an enclave: >> > >> > 1) encl_fd = open("/dev/sgx_enclave"); >> > 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); >> > 3) IOCTL(ECREATE, encl_addr, encl_size); >> > >> > Would the above code change break the "mmap()" in above step 2? >> > >> >> No, vm_pgoff was not used previously for enclave VMAs. I had to add this >> because the offset passed to sgx_fadvise is relative to file base and >> calculated in mm/madvise.c like this: >> >> offset = (loff_t)(start - vma->vm_start) >> + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); > > But shouldn't 'offset is relative to the file base' be conceptually > correct from > the fadvice()'s point of view? > > I think you should do: > > encl_offset = offset + encl->base; > > inside sgx_fadvice()? > >> If we don't set vma->vm_pgoff (default to zero), then offset will be calculated as (start - vma->vm_start). Then the above calculation is wrong if we have multiple VMAs for the same enclave, which is usually the case. >> I had a comment in first version but removed it based on Jarkko's >> suggestion here: >> https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/ >> >> The original comments probably seemed redundant to the definitions of >> the >> vm_pgoff field and the fadvise interface. But let me know if we need >> add a >> more helpful version of comments or any suggestion on the comments. > > I still think this code change is wrong. > > For instance, IIUC, it at least breaks the case where enclave hasn't been > created/initialized, where encl->base == 0 (although normal code path > doesn't > use vm_pgoff, conceptually it's still wrong IIUC). > > Maybe I am missing something? The fadvise interface is only usable for an initialized enclave, sgx_fadvise will return error otherwise. Conceptually I view enclave base as "file base", it's just that we don't ever need handle the zero case caused by uninitialized enclave (kind of like a file never mapped). If an initialized enclave happens to have zero base, it would also work. Thanks Haitao
On Tue, 2023-02-14 at 15:42 -0600, Haitao Huang wrote: > On Tue, 14 Feb 2023 14:54:53 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote: > > > Hi Kai > > > > > > On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> > > > wrote: > > > > > > > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: > > > > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct > > > > > vm_area_struct *vma) > > > > > vma->vm_ops = &sgx_vm_ops; > > > > > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; > > > > > vma->vm_private_data = encl; > > > > > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); > > > > > return 0; > > > > > } > > > > > > > > Perhaps I am missing something, but above change looks weird. > > > > Conceptually, it doesn't/shouldn't belong to this series, which > > > > essentially > > > > preallocates and does EAUG EPC pages for a (or part of) given enclave. > > > > The EAUG > > > > logic should already be working for the normal fault path, which means > > > > the code > > > > change above either: 1) has been done at other place; 2) isn't needed. > > > > > > > > I have kinda forgotten the userspace sequence to create an enclave. > > > If > > > > I recall > > > > correctly, you do below to create an enclave: > > > > > > > > 1) encl_fd = open("/dev/sgx_enclave"); > > > > 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); > > > > 3) IOCTL(ECREATE, encl_addr, encl_size); > > > > > > > > Would the above code change break the "mmap()" in above step 2? > > > > > > > > > > No, vm_pgoff was not used previously for enclave VMAs. I had to add this > > > because the offset passed to sgx_fadvise is relative to file base and > > > calculated in mm/madvise.c like this: > > > > > > offset = (loff_t)(start - vma->vm_start) > > > + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); > > > > But shouldn't 'offset is relative to the file base' be conceptually > > correct from > > the fadvice()'s point of view? > > > > I think you should do: > > > > encl_offset = offset + encl->base; > > > > inside sgx_fadvice()? > > > > > > If we don't set vma->vm_pgoff (default to zero), then offset will be > calculated as (start - vma->vm_start). Then the above calculation is wrong > if we have multiple VMAs for the same enclave, which is usually the case. do_mmap() -> mmap_region() itself sets vma->vm_pgoff: vma = vm_area_alloc(); ... vma->vm_pgoff = pgoff; if (file) call_mmap(file, vma); <- sgx_mmap() I think you will always call mmap() against enclave's fd with 'pgoff' being set to the offset relative to the file? > > > > I had a comment in first version but removed it based on Jarkko's > > > suggestion here: > > > https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/ > > > > > > The original comments probably seemed redundant to the definitions of > > > the > > > vm_pgoff field and the fadvise interface. But let me know if we need > > > add a > > > more helpful version of comments or any suggestion on the comments. > > > > I still think this code change is wrong. > > > > For instance, IIUC, it at least breaks the case where enclave hasn't been > > created/initialized, where encl->base == 0 (although normal code path > > doesn't > > use vm_pgoff, conceptually it's still wrong IIUC). > > > > Maybe I am missing something? > > The fadvise interface is only usable for an initialized enclave, > sgx_fadvise will return error otherwise. > True. But that code change is unconditionally called for all mmap(), even when enclave hasn't been created. > Conceptually I view enclave base > as "file base", it's just that we don't ever need handle the zero case > caused by uninitialized enclave (kind of like a file never mapped). If an > initialized enclave happens to have zero base, it would also work. A little bit confused about what does "enclave base" here. To me, A file is an enclave, meaning the "file offset" equals to "enclave offset". "enclave base" is the base linear address of the enclave, it doesn't matter whether it is 0 or not. You get an "enclave address" from "enclave base" plus "enclave offset" (or "file offset"): enclave_addr = enclave_base + enclave_offset/file_offset; And such calculation is only valid after enclave has been created (enclave_base is valid -- can be 0 or whatever). Since sgx_mmap() can happen before enclave is created, calculating the vm_pgoff from enclave_base is conceptually wrong. Even if you really want to do it, it should be: if (enclave_has_initialized()) vma->vm_pgoff = ...; But again I am not convinced why you cannot get the enclave_addr inside sgx_fadvice().
On Tue, 14 Feb 2023 16:36:40 -0600, Huang, Kai <kai.huang@intel.com> wrote: > On Tue, 2023-02-14 at 15:42 -0600, Haitao Huang wrote: >> On Tue, 14 Feb 2023 14:54:53 -0600, Huang, Kai <kai.huang@intel.com> >> wrote: >> >> > On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote: >> > > Hi Kai >> > > >> > > On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> >> > > wrote: >> > > >> > > > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: >> > > > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, >> struct >> > > > > vm_area_struct *vma) >> > > > > vma->vm_ops = &sgx_vm_ops; >> > > > > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | >> VM_IO; >> > > > > vma->vm_private_data = encl; >> > > > > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); >> > > > > return 0; >> > > > > } >> > > > >> > > > Perhaps I am missing something, but above change looks weird. >> > > > Conceptually, it doesn't/shouldn't belong to this series, which >> > > > essentially >> > > > preallocates and does EAUG EPC pages for a (or part of) given >> enclave. >> > > > The EAUG >> > > > logic should already be working for the normal fault path, which >> means >> > > > the code >> > > > change above either: 1) has been done at other place; 2) isn't >> needed. >> > > > >> > > > I have kinda forgotten the userspace sequence to create an >> enclave. >> > > If >> > > > I recall >> > > > correctly, you do below to create an enclave: >> > > > >> > > > 1) encl_fd = open("/dev/sgx_enclave"); >> > > > 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); >> > > > 3) IOCTL(ECREATE, encl_addr, encl_size); >> > > > >> > > > Would the above code change break the "mmap()" in above step 2? >> > > > >> > > >> > > No, vm_pgoff was not used previously for enclave VMAs. I had to add >> this >> > > because the offset passed to sgx_fadvise is relative to file base >> and >> > > calculated in mm/madvise.c like this: >> > > >> > > offset = (loff_t)(start - vma->vm_start) >> > > + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); >> > >> > But shouldn't 'offset is relative to the file base' be conceptually >> > correct from >> > the fadvice()'s point of view? >> > >> > I think you should do: >> > >> > encl_offset = offset + encl->base; >> > >> > inside sgx_fadvice()? >> > >> > > >> If we don't set vma->vm_pgoff (default to zero), then offset will be >> calculated as (start - vma->vm_start). Then the above calculation is >> wrong >> if we have multiple VMAs for the same enclave, which is usually the >> case. > > do_mmap() -> mmap_region() itself sets vma->vm_pgoff: > > vma = vm_area_alloc(); > ... > vma->vm_pgoff = pgoff; > > if (file) > call_mmap(file, vma); <- sgx_mmap() > > I think you will always call mmap() against enclave's fd with 'pgoff' > being set > to the offset relative to the file? > right. (pgoff above is most likely zero in enclave cases but that's not important) >> >> > > I had a comment in first version but removed it based on Jarkko's >> > > suggestion here: >> > > https://lore.kernel.org/all/Y2B0jBsG6HE4KVk7@kernel.org/ >> > > >> > > The original comments probably seemed redundant to the definitions >> of >> > > the >> > > vm_pgoff field and the fadvise interface. But let me know if we need >> > > add a >> > > more helpful version of comments or any suggestion on the comments. >> > >> > I still think this code change is wrong. >> > >> > For instance, IIUC, it at least breaks the case where enclave hasn't >> been >> > created/initialized, where encl->base == 0 (although normal code path >> > doesn't >> > use vm_pgoff, conceptually it's still wrong IIUC). >> > >> > Maybe I am missing something? >> >> The fadvise interface is only usable for an initialized enclave, >> sgx_fadvise will return error otherwise. > > True. But that code change is unconditionally called for all mmap(), > even when > enclave hasn't been created. > Theoretically yes. However, the user space sequences I am aware are following: 1. enclave_fd = fopen("/dev/sgx_enclave") 2. mmap(..., 2*enclave_size, MAP_ANONYMOUS, ...) to reserve a larger range, then trim the reserved address range to get enclave_base aligned with enclave_size 3. ioctl(ECREATE, enclave_base, enclave_size, enclave_fd) Only after ECREATE, we do mmap(..., enclave_fd) calls. >> Conceptually I view enclave base >> as "file base", it's just that we don't ever need handle the zero case >> caused by uninitialized enclave (kind of like a file never mapped). If >> an >> initialized enclave happens to have zero base, it would also work. > > A little bit confused about what does "enclave base" here. > > To me, A file is an enclave, meaning the "file offset" equals to "enclave > offset". "enclave base" is the base linear address of the enclave, it > doesn't > matter whether it is 0 or not. You get an "enclave address" from > "enclave base" > plus "enclave offset" (or "file offset"): > > enclave_addr = enclave_base + enclave_offset/file_offset; > Yes this is in sgx_fadvise: start = offset + encl->base The issue is that the file_offset is calculated in madvise.c before calling sgx_fadvise like this: offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); ... vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); So vma->vm_pgoff is expected being set correctly relative to file. > And such calculation is only valid after enclave has been created > (enclave_base > is valid -- can be 0 or whatever). > > Since sgx_mmap() can happen before enclave is created, calculating the > vm_pgoff > from enclave_base is conceptually wrong. Even if you really want to do > it, it > should be: > > if (enclave_has_initialized()) > vma->vm_pgoff = ...; I got your point now. I can add a condition to test the SGX_ENCL_CREATED bit. However, we still have a hole if we must handle the sequence mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave vm_pgoff not set for those cases. Since no one does that so far, can we explicitly return an error from sgx_mmap when that happens? Other suggestions? >But again I am not convinced why you cannot get the enclave_addr inside > sgx_fadvice(). > I hope the above comments addressed this. Basically we need set vm_pgoff relative to file for sgx_fadvise callback to receive meaningful `offset` relative to file. Thanks Haitao
> > > > > > > > Since sgx_mmap() can happen before enclave is created, calculating the > > vm_pgoff > > from enclave_base is conceptually wrong. Even if you really want to do > > it, it > > should be: > > > > if (enclave_has_initialized()) > > vma->vm_pgoff = ...; > > I got your point now. I can add a condition to test the SGX_ENCL_CREATED > bit. However, we still have a hole if we must handle the sequence > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave > vm_pgoff not set for those cases. > > Since no one does that so far, can we explicitly return an error from > sgx_mmap when that happens? > Other suggestions? As I replied to patch 4/4, I believe userspace should pass the correct pgoff in mmap(). It's wrong to always pass 0 or any random value. If userspace follow the mmap() rule, you won't need to manually set vm_pgoff here (which is hacky IMHO). Everything works fine.
On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> wrote: >> > > > >> > >> > Since sgx_mmap() can happen before enclave is created, calculating the >> > vm_pgoff >> > from enclave_base is conceptually wrong. Even if you really want to >> do >> > it, it >> > should be: >> > >> > if (enclave_has_initialized()) >> > vma->vm_pgoff = ...; >> >> I got your point now. I can add a condition to test the SGX_ENCL_CREATED >> bit. However, we still have a hole if we must handle the sequence >> mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave >> vm_pgoff not set for those cases. >> >> Since no one does that so far, can we explicitly return an error from >> sgx_mmap when that happens? >> Other suggestions? > > As I replied to patch 4/4, I believe userspace should pass the correct > pgoff in > mmap(). It's wrong to always pass 0 or any random value. > > If userspace follow the mmap() rule, you won't need to manually set > vm_pgoff > here (which is hacky IMHO). Everything works fine. > SGX driver was following MAP_ANONYMOUS semantics. If we change that, it'd break current usage/ABI. I still think returning error for cases mmap(..., enclave_fd) if enclave is not created would be less intrusive change. Haitao
On Wed, 2023-02-15 at 09:42 -0600, Haitao Huang wrote: > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > > > > > > > > > > > > > Since sgx_mmap() can happen before enclave is created, calculating the > > > > vm_pgoff > > > > from enclave_base is conceptually wrong. Even if you really want to > > > do > > > > it, it > > > > should be: > > > > > > > > if (enclave_has_initialized()) > > > > vma->vm_pgoff = ...; > > > > > > I got your point now. I can add a condition to test the SGX_ENCL_CREATED > > > bit. However, we still have a hole if we must handle the sequence > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave > > > vm_pgoff not set for those cases. > > > > > > Since no one does that so far, can we explicitly return an error from > > > sgx_mmap when that happens? > > > Other suggestions? > > > > As I replied to patch 4/4, I believe userspace should pass the correct > > pgoff in > > mmap(). It's wrong to always pass 0 or any random value. > > > > If userspace follow the mmap() rule, you won't need to manually set > > vm_pgoff > > here (which is hacky IMHO). Everything works fine. > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change that, it'd > break current usage/ABI. Is there any doc/reference mentioning this? > > I still think returning error for cases mmap(..., enclave_fd) if enclave > is not created would be less intrusive change. But you need the first mmap() to work before IOCTL(ecreate) to get enclave's base address, correct? > > Haitao
Hi Kai On Thu, 16 Feb 2023 01:53:47 -0600, Huang, Kai <kai.huang@intel.com> wrote: > On Wed, 2023-02-15 at 09:42 -0600, Haitao Huang wrote: >> On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> >> wrote: >> >> > > > > > >> > > > >> > > > Since sgx_mmap() can happen before enclave is created, >> calculating the >> > > > vm_pgoff >> > > > from enclave_base is conceptually wrong. Even if you really want >> to >> > > do >> > > > it, it >> > > > should be: >> > > > >> > > > if (enclave_has_initialized()) >> > > > vma->vm_pgoff = ...; >> > > >> > > I got your point now. I can add a condition to test the >> SGX_ENCL_CREATED >> > > bit. However, we still have a hole if we must handle the sequence >> > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't >> leave >> > > vm_pgoff not set for those cases. >> > > >> > > Since no one does that so far, can we explicitly return an error >> from >> > > sgx_mmap when that happens? >> > > Other suggestions? >> > >> > As I replied to patch 4/4, I believe userspace should pass the correct >> > pgoff in >> > mmap(). It's wrong to always pass 0 or any random value. >> > >> > If userspace follow the mmap() rule, you won't need to manually set >> > vm_pgoff >> > here (which is hacky IMHO). Everything works fine. >> > >> >> SGX driver was following MAP_ANONYMOUS semantics. If we change that, >> it'd >> break current usage/ABI. > > Is there any doc/reference mentioning this? > No, just from how we use it :-). Jarkko/Sean/Dave may have history/insights on this? But file offset in mmap spec implies there is a "real" file independent of the memory it is mapped into, and you can mmap arbitrary offset of that file to any memory address. That does not fit well with SGX enclave memory: there is no separate physical file other than enclave residing in memory and its base is fixed. Again, my main concern is impact to existing usage. >> >> I still think returning error for cases mmap(..., enclave_fd) if enclave >> is not created would be less intrusive change. > > But you need the first mmap() to work before IOCTL(ecreate) to get > enclave's > base address, correct? > No, mmap(..., MAP_ANONYMOUS, fd=-1) must be used to reserve address range for enclave before calling IOCTL(ecreate). In fact, I realize mmap(..., fd=enclave_fd) before ioctl(ecreate) should fail in sgx_encl_may_map according to its definition but IIUC it would skip the loop xas_for_each when no entries yet available in encl->page_array. This might be a hole to enforcement of vma_prot_bits not exceeding the vm_max_prot_bits which is recorded first during EADD. So if my understanding is correct, we just need fix sgx_encl_may_map to return -EACCES when mmap is called before ecreate is done. Thanks Haitao
On Tue, Feb 14, 2023 at 09:47:24AM +0000, Huang, Kai wrote: > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) > > vma->vm_ops = &sgx_vm_ops; > > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; > > vma->vm_private_data = encl; > > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); > > > > return 0; > > } > > Perhaps I am missing something, but above change looks weird. > > Conceptually, it doesn't/shouldn't belong to this series, which essentially > preallocates and does EAUG EPC pages for a (or part of) given enclave. The EAUG > logic should already be working for the normal fault path, which means the code > change above either: 1) has been done at other place; 2) isn't needed. > > I have kinda forgotten the userspace sequence to create an enclave. If I recall > correctly, you do below to create an enclave: > > 1) encl_fd = open("/dev/sgx_enclave"); > 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); > 3) IOCTL(ECREATE, encl_addr, encl_size); > > Would the above code change break the "mmap()" in above step 2? Good catch! Actually shouldn't be validate it to be always zero? We are essentially MAP_ANONYMOUS mapping semantics with a device file. BR, Jarkko
On Tue, Feb 14, 2023 at 08:54:53PM +0000, Huang, Kai wrote: > On Tue, 2023-02-14 at 13:18 -0600, Haitao Huang wrote: > > Hi Kai > > > > On Tue, 14 Feb 2023 03:47:24 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > > > On Fri, 2023-01-27 at 20:55 -0800, Haitao Huang wrote: > > > > @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct > > > > vm_area_struct *vma) > > > > vma->vm_ops = &sgx_vm_ops; > > > > vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; > > > > vma->vm_private_data = encl; > > > > + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); > > > > return 0; > > > > } > > > > > > Perhaps I am missing something, but above change looks weird. > > > Conceptually, it doesn't/shouldn't belong to this series, which > > > essentially > > > preallocates and does EAUG EPC pages for a (or part of) given enclave. > > > The EAUG > > > logic should already be working for the normal fault path, which means > > > the code > > > change above either: 1) has been done at other place; 2) isn't needed. > > > > > > I have kinda forgotten the userspace sequence to create an enclave. If > > > I recall > > > correctly, you do below to create an enclave: > > > > > > 1) encl_fd = open("/dev/sgx_enclave"); > > > 2) encl_addr = mmap(encl_fd, encl_size, 0 /* pgoff */); > > > 3) IOCTL(ECREATE, encl_addr, encl_size); > > > > > > Would the above code change break the "mmap()" in above step 2? > > > > > > > No, vm_pgoff was not used previously for enclave VMAs. I had to add this > > because the offset passed to sgx_fadvise is relative to file base and > > calculated in mm/madvise.c like this: > > > > offset = (loff_t)(start - vma->vm_start) > > + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); > > But shouldn't 'offset is relative to the file base' be conceptually correct from > the fadvice()'s point of view? > > I think you should do: > > encl_offset = offset + encl->base; > > inside sgx_fadvice()? I agree. BR, Jarkko
On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote: > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > > > > > > > > > > > > > Since sgx_mmap() can happen before enclave is created, calculating the > > > > vm_pgoff > > > > from enclave_base is conceptually wrong. Even if you really want > > > to do > > > > it, it > > > > should be: > > > > > > > > if (enclave_has_initialized()) > > > > vma->vm_pgoff = ...; > > > > > > I got your point now. I can add a condition to test the SGX_ENCL_CREATED > > > bit. However, we still have a hole if we must handle the sequence > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't leave > > > vm_pgoff not set for those cases. > > > > > > Since no one does that so far, can we explicitly return an error from > > > sgx_mmap when that happens? > > > Other suggestions? > > > > As I replied to patch 4/4, I believe userspace should pass the correct > > pgoff in > > mmap(). It's wrong to always pass 0 or any random value. > > > > If userspace follow the mmap() rule, you won't need to manually set > > vm_pgoff > > here (which is hacky IMHO). Everything works fine. > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change that, it'd > break current usage/ABI. > > I still think returning error for cases mmap(..., enclave_fd) if enclave is > not created would be less intrusive change. Is this something you care in SGX SDK? It is not categorically forbidden so that's why I'm asking. BR, Jarkko
Hi Jarkko On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org <jarkko@kernel.org> wrote: > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote: >> On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> >> wrote: >> >> > > > > > >> > > > >> > > > Since sgx_mmap() can happen before enclave is created, >> calculating the >> > > > vm_pgoff >> > > > from enclave_base is conceptually wrong. Even if you really want >> > > to do >> > > > it, it >> > > > should be: >> > > > >> > > > if (enclave_has_initialized()) >> > > > vma->vm_pgoff = ...; >> > > >> > > I got your point now. I can add a condition to test the >> SGX_ENCL_CREATED >> > > bit. However, we still have a hole if we must handle the sequence >> > > mmap(..., enclave_fd) being called before ECREATE ioctl. We can't >> leave >> > > vm_pgoff not set for those cases. >> > > >> > > Since no one does that so far, can we explicitly return an error >> from >> > > sgx_mmap when that happens? >> > > Other suggestions? >> > >> > As I replied to patch 4/4, I believe userspace should pass the correct >> > pgoff in >> > mmap(). It's wrong to always pass 0 or any random value. >> > >> > If userspace follow the mmap() rule, you won't need to manually set >> > vm_pgoff >> > here (which is hacky IMHO). Everything works fine. >> > >> >> SGX driver was following MAP_ANONYMOUS semantics. If we change that, >> it'd >> break current usage/ABI. >> >> I still think returning error for cases mmap(..., enclave_fd) if >> enclave is >> not created would be less intrusive change. > > Is this something you care in SGX SDK? > > It is not categorically forbidden so that's why I'm asking. SDK does not care as we would never do this and don't think anyone is doing that either. Suggesting returning error to cover all cases so user space would not accidentally cause incorrect vm_pgoff set. Thanks Haitao
On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote: > Hi Jarkko > > On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org <jarkko@kernel.org> > wrote: > > > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote: > > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> > > > wrote: > > > > > > > > > > > > > > > > > > > > > > > Since sgx_mmap() can happen before enclave is created, > > > calculating the > > > > > > vm_pgoff > > > > > > from enclave_base is conceptually wrong. Even if you really want > > > > > to do > > > > > > it, it > > > > > > should be: > > > > > > > > > > > > if (enclave_has_initialized()) > > > > > > vma->vm_pgoff = ...; > > > > > > > > > > I got your point now. I can add a condition to test the > > > SGX_ENCL_CREATED > > > > > bit. However, we still have a hole if we must handle the sequence > > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We > > > can't leave > > > > > vm_pgoff not set for those cases. > > > > > > > > > > Since no one does that so far, can we explicitly return an error > > > from > > > > > sgx_mmap when that happens? > > > > > Other suggestions? > > > > > > > > As I replied to patch 4/4, I believe userspace should pass the correct > > > > pgoff in > > > > mmap(). It's wrong to always pass 0 or any random value. > > > > > > > > If userspace follow the mmap() rule, you won't need to manually set > > > > vm_pgoff > > > > here (which is hacky IMHO). Everything works fine. > > > > > > > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change that, > > > it'd > > > break current usage/ABI. > > > > > > I still think returning error for cases mmap(..., enclave_fd) if > > > enclave is > > > not created would be less intrusive change. > > > > Is this something you care in SGX SDK? > > > > It is not categorically forbidden so that's why I'm asking. > > SDK does not care as we would never do this and don't think anyone is doing > that either. Suggesting returning error to cover all cases so user space > would not accidentally cause incorrect vm_pgoff set. vm_pgoff != 0 should result -EINVAL. BR, Jarkko
On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org <jarkko@kernel.org> wrote: > On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote: >> Hi Jarkko >> >> On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org >> <jarkko@kernel.org> >> wrote: >> >> > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote: >> > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> >> > > wrote: >> > > >> > > > > > > > >> > > > > > >> > > > > > Since sgx_mmap() can happen before enclave is created, >> > > calculating the >> > > > > > vm_pgoff >> > > > > > from enclave_base is conceptually wrong. Even if you really >> want >> > > > > to do >> > > > > > it, it >> > > > > > should be: >> > > > > > >> > > > > > if (enclave_has_initialized()) >> > > > > > vma->vm_pgoff = ...; >> > > > > >> > > > > I got your point now. I can add a condition to test the >> > > SGX_ENCL_CREATED >> > > > > bit. However, we still have a hole if we must handle the >> sequence >> > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We >> > > can't leave >> > > > > vm_pgoff not set for those cases. >> > > > > >> > > > > Since no one does that so far, can we explicitly return an error >> > > from >> > > > > sgx_mmap when that happens? >> > > > > Other suggestions? >> > > > >> > > > As I replied to patch 4/4, I believe userspace should pass the >> correct >> > > > pgoff in >> > > > mmap(). It's wrong to always pass 0 or any random value. >> > > > >> > > > If userspace follow the mmap() rule, you won't need to manually >> set >> > > > vm_pgoff >> > > > here (which is hacky IMHO). Everything works fine. >> > > > >> > > >> > > SGX driver was following MAP_ANONYMOUS semantics. If we change that, >> > > it'd >> > > break current usage/ABI. >> > > >> > > I still think returning error for cases mmap(..., enclave_fd) if >> > > enclave is >> > > not created would be less intrusive change. >> > >> > Is this something you care in SGX SDK? >> > >> > It is not categorically forbidden so that's why I'm asking. >> >> SDK does not care as we would never do this and don't think anyone is >> doing >> that either. Suggesting returning error to cover all cases so user space >> would not accidentally cause incorrect vm_pgoff set. > > vm_pgoff != 0 should result -EINVAL. > Do you mean 'offset' passed in from mmap syscall and converted to pgoff in kernel? I can see it only passed to driver in get_unmapped_area. I can add this enforcement there so it is consistent to the MAP_ANONYMOUS spec if that's what you suggest. vma->vm_pgoff is the offset in pages of the vma->vm_start relative to 'file'. It can not be zero for all VMAs of the enclave. Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon VMAs, and ignore it (zero) for shared anon mapping. For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base) upon mmap. The concern Kai raised about encl->base not available in the window between mmap and ECREATE can be addressed by disallowing mmap before ECREATE is done. It does not make much sense anyway to mmap(enclave_fd) without a valid enclave range associated with enclave_fd. Does that sound reasonable or I still miss some point here? Thanks Haitao
On Tue, 2023-02-21 at 19:37 -0600, Haitao Huang wrote: > On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org <jarkko@kernel.org> > wrote: > > > On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote: > > > Hi Jarkko > > > > > > On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org > > > <jarkko@kernel.org> > > > wrote: > > > > > > > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote: > > > > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai <kai.huang@intel.com> > > > > > wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Since sgx_mmap() can happen before enclave is created, > > > > > calculating the > > > > > > > > vm_pgoff > > > > > > > > from enclave_base is conceptually wrong. Even if you really > > > want > > > > > > > to do > > > > > > > > it, it > > > > > > > > should be: > > > > > > > > > > > > > > > > if (enclave_has_initialized()) > > > > > > > > vma->vm_pgoff = ...; > > > > > > > > > > > > > > I got your point now. I can add a condition to test the > > > > > SGX_ENCL_CREATED > > > > > > > bit. However, we still have a hole if we must handle the > > > sequence > > > > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We > > > > > can't leave > > > > > > > vm_pgoff not set for those cases. > > > > > > > > > > > > > > Since no one does that so far, can we explicitly return an error > > > > > from > > > > > > > sgx_mmap when that happens? > > > > > > > Other suggestions? > > > > > > > > > > > > As I replied to patch 4/4, I believe userspace should pass the > > > correct > > > > > > pgoff in > > > > > > mmap(). It's wrong to always pass 0 or any random value. > > > > > > > > > > > > If userspace follow the mmap() rule, you won't need to manually > > > set > > > > > > vm_pgoff > > > > > > here (which is hacky IMHO). Everything works fine. > > > > > > > > > > > > > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change that, > > > > > it'd > > > > > break current usage/ABI. > > > > > > > > > > I still think returning error for cases mmap(..., enclave_fd) if > > > > > enclave is > > > > > not created would be less intrusive change. > > > > > > > > Is this something you care in SGX SDK? > > > > > > > > It is not categorically forbidden so that's why I'm asking. > > > > > > SDK does not care as we would never do this and don't think anyone is > > > doing > > > that either. Suggesting returning error to cover all cases so user space > > > would not accidentally cause incorrect vm_pgoff set. > > > > vm_pgoff != 0 should result -EINVAL. > > > > Do you mean 'offset' passed in from mmap syscall and converted to pgoff in > kernel? I can see it only passed to driver in get_unmapped_area. I can add > this enforcement there so it is consistent to the MAP_ANONYMOUS spec if > that's what you suggest. > > vma->vm_pgoff is the offset in pages of the vma->vm_start relative to > 'file'. > It can not be zero for all VMAs of the enclave. > > Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon > VMAs, and ignore it (zero) for shared anon mapping. > > For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base) upon > mmap. Sorry for late reply. Basically I was sick and having limited working time in the passed two weeks. For what I understand now, SGX driver wants to use MAP_ANONYMOUS semantic for mmap()s against /dev/sgx_enclave (for whatever reason that I don't understand). And because of that, we even want to explicitly enforce userspace to always pass 0 as pgoff in mmap()s (hasn't been done yet). And then here, we want to rewrite vma->pgoff so that the VMA can act _like_ a normal file-based mmap() semantics (despite it is indeed a file-based VMA), because the VFS fadvice() implementation assumes file-based VMAs are always following file-based mmap() semantics. Hmm.. Doesn't this sound weird? I must have been missing something, but why cannot SGX driver just always follow file-based mmap() semantics at the beginning? For instance, what's wrong with: 1) encl_fd = open("/dev/sgx_enclave") 2) encl_base = mmap(..., encl_size, MAP_SHARED, encl_fd, 0 /* pgoff */) 3) ioctl(ECREATE, encl_base, encl_size) (In ioctl(ECREATE), we might want to verify the VMA we found has the same encl_base as starting address, and 0 pgoff). And in following mmap()s in which we want to map a small range of enclave: encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd, (encl_addr - encl_base) >> PAGE_SHIFT); ? Anything wrong above? In fact, if the first mmap() in step 2) before IOCTL(ecreate) used MAP_ANONYMOUS, IIUC that VMA will stay as anonymous VMA but will never be converted to SGX file-based one, because by looking at the code, I see neither ioctl(ECREATE) nor ioctl(EINIT) converts that VMA to file-based one. That being said, all page faults caused by userspace access to that VMA will still go to anonymous VMA's fault path, but not SGX driver's. This is fine for SGX1 as all enclave pages are fully populated. For SGX2, _unless_ you explicitly mmap("/dev/sgx_enclave") those dynamical ranges, the fault will never be handled by SGX driver. Architecturally, for SGX2, if you mmap() the entire enclave range (as in step 2), you don't need explicitly mmap() all dynamic ranges. The userspace should just be able to access those dynamic ranges (i.e. EACCEPT) and the kernel driver should gracefully handle the fault. Shouldn't this be a problem? (Btw, there might be other corner cases that could cause VMA splitting/merging, etc, but I haven't thought about those) > The concern Kai raised about encl->base not available in the window > between mmap and ECREATE can be addressed by disallowing mmap before > ECREATE is done. It does not make much sense anyway to mmap(enclave_fd) > without a valid enclave range associated with enclave_fd. > We can, but I don't understand why the first mmap() before ECREATE must/should be done via MMAP_ANONYMOUS. In fact, I think it might be wrong for SGX2.
Hi Kai Thanks for your detailed review. On Tue, 07 Mar 2023 17:32:15 -0600, Huang, Kai <kai.huang@intel.com> wrote: > On Tue, 2023-02-21 at 19:37 -0600, Haitao Huang wrote: >> On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org >> <jarkko@kernel.org> >> wrote: >> >> > On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote: >> > > Hi Jarkko >> > > >> > > On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org >> > > <jarkko@kernel.org> >> > > wrote: >> > > >> > > > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote: >> > > > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai >> <kai.huang@intel.com> >> > > > > wrote: >> > > > > >> > > > > > > > > > >> > > > > > > > >> > > > > > > > Since sgx_mmap() can happen before enclave is created, >> > > > > calculating the >> > > > > > > > vm_pgoff >> > > > > > > > from enclave_base is conceptually wrong. Even if you >> really >> > > want >> > > > > > > to do >> > > > > > > > it, it >> > > > > > > > should be: >> > > > > > > > >> > > > > > > > if (enclave_has_initialized()) >> > > > > > > > vma->vm_pgoff = ...; >> > > > > > > >> > > > > > > I got your point now. I can add a condition to test the >> > > > > SGX_ENCL_CREATED >> > > > > > > bit. However, we still have a hole if we must handle the >> > > sequence >> > > > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We >> > > > > can't leave >> > > > > > > vm_pgoff not set for those cases. >> > > > > > > >> > > > > > > Since no one does that so far, can we explicitly return an >> error >> > > > > from >> > > > > > > sgx_mmap when that happens? >> > > > > > > Other suggestions? >> > > > > > >> > > > > > As I replied to patch 4/4, I believe userspace should pass the >> > > correct >> > > > > > pgoff in >> > > > > > mmap(). It's wrong to always pass 0 or any random value. >> > > > > > >> > > > > > If userspace follow the mmap() rule, you won't need to >> manually >> > > set >> > > > > > vm_pgoff >> > > > > > here (which is hacky IMHO). Everything works fine. >> > > > > > >> > > > > >> > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change >> that, >> > > > > it'd >> > > > > break current usage/ABI. >> > > > > >> > > > > I still think returning error for cases mmap(..., enclave_fd) if >> > > > > enclave is >> > > > > not created would be less intrusive change. >> > > > >> > > > Is this something you care in SGX SDK? >> > > > >> > > > It is not categorically forbidden so that's why I'm asking. >> > > >> > > SDK does not care as we would never do this and don't think anyone >> is >> > > doing >> > > that either. Suggesting returning error to cover all cases so user >> space >> > > would not accidentally cause incorrect vm_pgoff set. >> > >> > vm_pgoff != 0 should result -EINVAL. >> > >> >> Do you mean 'offset' passed in from mmap syscall and converted to pgoff >> in >> kernel? I can see it only passed to driver in get_unmapped_area. I can >> add >> this enforcement there so it is consistent to the MAP_ANONYMOUS spec if >> that's what you suggest. >> >> vma->vm_pgoff is the offset in pages of the vma->vm_start relative to >> 'file'. >> It can not be zero for all VMAs of the enclave. >> >> Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon >> VMAs, and ignore it (zero) for shared anon mapping. >> >> For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base) >> upon >> mmap. > > Sorry for late reply. Basically I was sick and having limited working > time in > the passed two weeks. > > For what I understand now, SGX driver wants to use MAP_ANONYMOUS > semantic for > mmap()s against /dev/sgx_enclave (for whatever reason that I don't > understand). > And because of that, we even want to explicitly enforce userspace to > always pass > 0 as pgoff in mmap()s (hasn't been done yet). > > And then here, we want to rewrite vma->pgoff so that the VMA can act > _like_ a > normal file-based mmap() semantics (despite it is indeed a file-based > VMA), > because the VFS fadvice() implementation assumes file-based VMAs are > always > following file-based mmap() semantics. > > Hmm.. > > Doesn't this sound weird? > Sometime weird means good or creative as expressed in a popular slogan in the city where I live. Just joking:-) > I must have been missing something, but why cannot SGX driver just > always follow > file-based mmap() semantics at the beginning? > > For instance, what's wrong with: > > 1) encl_fd = open("/dev/sgx_enclave") > 2) encl_base = mmap(..., encl_size, MAP_SHARED, encl_fd, 0 /* pgoff */) > 3) ioctl(ECREATE, encl_base, encl_size) > > (In ioctl(ECREATE), we might want to verify the VMA we found has the same > encl_base as starting address, and 0 pgoff). > > And in following mmap()s in which we want to map a small range of > enclave: > > encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd, > (encl_addr - encl_base) >> PAGE_SHIFT); > > ? > > Anything wrong above? > I believe this can be done except for that you are breaking existing code by requiring pg_off for all mmaps now. Doing mmap before ECREATE also effectively bypasses sgx_encl_may_map. And we have a very long thread of discussion about this[1] and that's why sgx_encl_may_map come into play to limit VMA permissions to those given in SecInfo by EADD ioctl. However, current code does not explicitly block it and I think it'd be better do it. More comment below on this point. > In fact, if the first mmap() in step 2) before IOCTL(ecreate) used > MAP_ANONYMOUS, IIUC that VMA will stay as anonymous VMA but will never be > converted to SGX file-based one, because by looking at the code, I see > neither > ioctl(ECREATE) nor ioctl(EINIT) converts that VMA to file-based one. > > That being said, all page faults caused by userspace access to that VMA > will > still go to anonymous VMA's fault path, but not SGX driver's. > It is required to mmap anonymously with PROT_NONE initially and user space must invoke mmap with enclave fd after EADDing each segments/areas. PF handler will check VMA prot bits against vm_max_prot_bits in sgx_encl_load_page_in_vma, and report failure on any mismatch. If user space does not mmap(...,encl_fd) after EADD, pages will have PROT_NONE and fail on any access. > This is fine for SGX1 as all enclave pages are fully populated. For > SGX2, > _unless_ you explicitly mmap("/dev/sgx_enclave") those dynamical ranges, > the > fault will never be handled by SGX driver. > Indeed, it is required user space invoke mmap to config the regions for EAUG. > Architecturally, for SGX2, if you mmap() the entire enclave range (as in > step > 2), you don't need explicitly mmap() all dynamic ranges. The userspace > should > just be able to access those dynamic ranges (i.e. EACCEPT) and the > kernel driver > should gracefully handle the fault. > > Shouldn't this be a problem? > > (Btw, there might be other corner cases that could cause VMA > splitting/merging, > etc, but I haven't thought about those) > >> The concern Kai raised about encl->base not available in the window >> between mmap and ECREATE can be addressed by disallowing mmap before >> ECREATE is done. It does not make much sense anyway to mmap(enclave_fd) >> without a valid enclave range associated with enclave_fd. >> > > We can, but I don't understand why the first mmap() before ECREATE > must/should > be done via MMAP_ANONYMOUS. In fact, I think it might be wrong for SGX2. > There is a long history behind it. The use of anonymous mapping can be traced back to v29. See this thread[2]. Before v29, user space can do mmap(PROT_NONE, ..., enclave_fd) before ECREATE to reserve a range for the enclave. But it must be PROT_NONE for the reservation, otherwise sgx_encl_may_map will block it by the "!page" check below [3]: int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_prot_bits) { unsigned long idx, idx_start, idx_end; struct sgx_encl_page *page; /* PROT_NONE always succeeds. */ if (!vm_prot_bits) return 0; idx_start = PFN_DOWN(start); idx_end = PFN_DOWN(end - 1); for (idx = idx_start; idx <= idx_end; ++idx) { mutex_lock(&encl->lock); page = radix_tree_lookup(&encl->page_tree, idx); mutex_unlock(&encl->lock); if (!page || (~page->vm_prot_bits & vm_prot_bits)) return -EACCES; ... Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages EADDed so user space has to mmap anonymously to reserve the range.The intent was still not to allow mmap before pages EADDed (the !page check was still there up to v38) Later version (around v39?) we switched to enforce the permissions in PF handler and mmap with any permissions before EADD is allowed, but may cause failure later on PF. I'm not sure it was intentional but pretty sure no meaningful usage for doing mmap before ECREATE due to PF handler enforcement. I think that's the history behind it. Others more knowledgeable can correct me as needed. Again, even if we redesign the whole thing to be less "weird", then requiring pgoff for each mmap would break existing user space code. And I think explicitly block mmap before ECREATE make the API more consistent with the PF handler enforcement and original intent of sgx_encl_may_map. Thanks again for detailed review and those thoughtful questions. Haitao [1]https://lore.kernel.org/linux-sgx/CALCETrXf8mSK45h7sTK5Wf+pXLVn=Bjsc_RLpgO-h-qdzBRo5Q@mail.gmail.com/ [2]https://lore.kernel.org/linux-sgx/CAOASepPFe_ucuwe7JW_-+VBQ4=+sHqyGXOdA9kUbcYA_9=v0sA@mail.gmail.com/ [3]https://lore.kernel.org/linux-sgx/20190713170804.2340-17-jarkko.sakkinen@linux.intel.com/
On Wed, 2023-03-08 at 18:50 -0600, Haitao Huang wrote: > Hi Kai > > Thanks for your detailed review. > > On Tue, 07 Mar 2023 17:32:15 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > On Tue, 2023-02-21 at 19:37 -0600, Haitao Huang wrote: > > > On Tue, 21 Feb 2023 16:10:02 -0600, jarkko@kernel.org > > > <jarkko@kernel.org> > > > wrote: > > > > > > > On Fri, Feb 17, 2023 at 05:03:05PM -0600, Haitao Huang wrote: > > > > > Hi Jarkko > > > > > > > > > > On Fri, 17 Feb 2023 16:32:12 -0600, jarkko@kernel.org > > > > > <jarkko@kernel.org> > > > > > wrote: > > > > > > > > > > > On Wed, Feb 15, 2023 at 09:42:30AM -0600, Haitao Huang wrote: > > > > > > > On Wed, 15 Feb 2023 02:51:23 -0600, Huang, Kai > > > <kai.huang@intel.com> > > > > > > > wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Since sgx_mmap() can happen before enclave is created, > > > > > > > calculating the > > > > > > > > > > vm_pgoff > > > > > > > > > > from enclave_base is conceptually wrong. Even if you > > > really > > > > > want > > > > > > > > > to do > > > > > > > > > > it, it > > > > > > > > > > should be: > > > > > > > > > > > > > > > > > > > > if (enclave_has_initialized()) > > > > > > > > > > vma->vm_pgoff = ...; > > > > > > > > > > > > > > > > > > I got your point now. I can add a condition to test the > > > > > > > SGX_ENCL_CREATED > > > > > > > > > bit. However, we still have a hole if we must handle the > > > > > sequence > > > > > > > > > mmap(..., enclave_fd) being called before ECREATE ioctl. We > > > > > > > can't leave > > > > > > > > > vm_pgoff not set for those cases. > > > > > > > > > > > > > > > > > > Since no one does that so far, can we explicitly return an > > > error > > > > > > > from > > > > > > > > > sgx_mmap when that happens? > > > > > > > > > Other suggestions? > > > > > > > > > > > > > > > > As I replied to patch 4/4, I believe userspace should pass the > > > > > correct > > > > > > > > pgoff in > > > > > > > > mmap(). It's wrong to always pass 0 or any random value. > > > > > > > > > > > > > > > > If userspace follow the mmap() rule, you won't need to > > > manually > > > > > set > > > > > > > > vm_pgoff > > > > > > > > here (which is hacky IMHO). Everything works fine. > > > > > > > > > > > > > > > > > > > > > > SGX driver was following MAP_ANONYMOUS semantics. If we change > > > that, > > > > > > > it'd > > > > > > > break current usage/ABI. > > > > > > > > > > > > > > I still think returning error for cases mmap(..., enclave_fd) if > > > > > > > enclave is > > > > > > > not created would be less intrusive change. > > > > > > > > > > > > Is this something you care in SGX SDK? > > > > > > > > > > > > It is not categorically forbidden so that's why I'm asking. > > > > > > > > > > SDK does not care as we would never do this and don't think anyone > > > is > > > > > doing > > > > > that either. Suggesting returning error to cover all cases so user > > > space > > > > > would not accidentally cause incorrect vm_pgoff set. > > > > > > > > vm_pgoff != 0 should result -EINVAL. > > > > > > > > > > Do you mean 'offset' passed in from mmap syscall and converted to pgoff > > > in > > > kernel? I can see it only passed to driver in get_unmapped_area. I can > > > add > > > this enforcement there so it is consistent to the MAP_ANONYMOUS spec if > > > that's what you suggest. > > > > > > vma->vm_pgoff is the offset in pages of the vma->vm_start relative to > > > 'file'. > > > It can not be zero for all VMAs of the enclave. > > > > > > Kernel sets vma->vm_pgoff == vma->vm_start>>PAGE_SHIFT for private anon > > > VMAs, and ignore it (zero) for shared anon mapping. > > > > > > For us, we propose to set it to PFN_DOWN(vma->vm_start - encl->base) > > > upon > > > mmap. > > > > Sorry for late reply. Basically I was sick and having limited working > > time in > > the passed two weeks. > > > > For what I understand now, SGX driver wants to use MAP_ANONYMOUS > > semantic for > > mmap()s against /dev/sgx_enclave (for whatever reason that I don't > > understand). > > And because of that, we even want to explicitly enforce userspace to > > always pass > > 0 as pgoff in mmap()s (hasn't been done yet). > > > > And then here, we want to rewrite vma->pgoff so that the VMA can act > > _like_ a > > normal file-based mmap() semantics (despite it is indeed a file-based > > VMA), > > because the VFS fadvice() implementation assumes file-based VMAs are > > always > > following file-based mmap() semantics. > > > > Hmm.. > > > > Doesn't this sound weird? > > > > Sometime weird means good or creative as expressed in a popular slogan in > the city where I live. Just joking:-) > > > I must have been missing something, but why cannot SGX driver just > > always follow > > file-based mmap() semantics at the beginning? > > > > For instance, what's wrong with: > > > > 1) encl_fd = open("/dev/sgx_enclave") > > 2) encl_base = mmap(..., encl_size, MAP_SHARED, encl_fd, 0 /* pgoff */) > > 3) ioctl(ECREATE, encl_base, encl_size) > > > > (In ioctl(ECREATE), we might want to verify the VMA we found has the same > > encl_base as starting address, and 0 pgoff). > > > > And in following mmap()s in which we want to map a small range of > > enclave: > > > > encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd, > > (encl_addr - encl_base) >> PAGE_SHIFT); > > > > ? > > > > Anything wrong above? > > > > I believe this can be done except for that you are breaking existing code > by requiring pg_off for all mmaps now. > > Doing mmap before ECREATE also effectively bypasses sgx_encl_may_map. And > we have a very long thread of discussion about this[1] and that's why > sgx_encl_may_map come into play to limit VMA permissions to those given in > SecInfo by EADD ioctl. However, current code does not explicitly block it > and I think it'd be better do it. More comment below on this point. Thanks for the info! > > > In fact, if the first mmap() in step 2) before IOCTL(ecreate) used > > MAP_ANONYMOUS, IIUC that VMA will stay as anonymous VMA but will never be > > converted to SGX file-based one, because by looking at the code, I see > > neither > > ioctl(ECREATE) nor ioctl(EINIT) converts that VMA to file-based one. > > > > That being said, all page faults caused by userspace access to that VMA > > will > > still go to anonymous VMA's fault path, but not SGX driver's. > > > > It is required to mmap anonymously with PROT_NONE initially and > user space must invoke mmap with enclave fd after EADDing each > segments/areas. PF handler will check VMA prot bits against > vm_max_prot_bits in sgx_encl_load_page_in_vma, and report failure on any > mismatch. > If user space does not mmap(...,encl_fd) after EADD, pages will have > PROT_NONE and fail on any access. > > > This is fine for SGX1 as all enclave pages are fully populated. For > > SGX2, > > _unless_ you explicitly mmap("/dev/sgx_enclave") those dynamical ranges, > > the > > fault will never be handled by SGX driver. > > > > Indeed, it is required user space invoke mmap to config the regions for > EAUG. > > > Architecturally, for SGX2, if you mmap() the entire enclave range (as in > > step > > 2), you don't need explicitly mmap() all dynamic ranges. The userspace > > should > > just be able to access those dynamic ranges (i.e. EACCEPT) and the > > kernel driver > > should gracefully handle the fault. > > > > Shouldn't this be a problem? > > > > (Btw, there might be other corner cases that could cause VMA > > splitting/merging, > > etc, but I haven't thought about those) > > > > > The concern Kai raised about encl->base not available in the window > > > between mmap and ECREATE can be addressed by disallowing mmap before > > > ECREATE is done. It does not make much sense anyway to mmap(enclave_fd) > > > without a valid enclave range associated with enclave_fd. > > > > > > > We can, but I don't understand why the first mmap() before ECREATE > > must/should > > be done via MMAP_ANONYMOUS. In fact, I think it might be wrong for SGX2. > > > > There is a long history behind it. The use of anonymous mapping can be > traced back to v29. > See this thread[2]. > > Before v29, user space can do mmap(PROT_NONE, ..., enclave_fd) before > ECREATE to reserve a range for the enclave. But it must be PROT_NONE for > the reservation, otherwise sgx_encl_may_map will block it by the "!page" > check below [3]: > > int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, > unsigned long end, unsigned long vm_prot_bits) > { > unsigned long idx, idx_start, idx_end; > struct sgx_encl_page *page; > > /* PROT_NONE always succeeds. */ > if (!vm_prot_bits) > return 0; > > idx_start = PFN_DOWN(start); > idx_end = PFN_DOWN(end - 1); > for (idx = idx_start; idx <= idx_end; ++idx) { > mutex_lock(&encl->lock); > page = radix_tree_lookup(&encl->page_tree, idx); > mutex_unlock(&encl->lock); > > if (!page || (~page->vm_prot_bits & vm_prot_bits)) > return -EACCES; > ... > > Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages > EADDed so user space has to mmap anonymously to reserve the range.The > intent was still not to allow mmap before pages EADDed (the !page check > was still there up to v38) Do you know the reason of disallowing PROT_NONE mapping against encl_fd? I dig v28 roughly but didn't find any clue. Basically no comments related to this were made to: [PATCH v28 11/22] x86/sgx: Linux Enclave Driver > > Later version (around v39?) we switched to enforce the permissions in PF > handler and mmap with any permissions before EADD is allowed, but may > cause failure later on PF. I'm not sure it was intentional but pretty sure > no meaningful usage for doing mmap before ECREATE due to PF handler > enforcement. IIUC this change around v39 re-allows mmap() against encl_fd before ECREATE? IIUC the only enforcement is VMA's permission must be more restricted than enclave page's permission. So, theoretically, we can mmap() encl_fd with PROT_READ|PROT_WRITE|PROT_EXEC, and then mprotect() the address range based on the actual enclave pages (in EADD) before actually doing EADD those pages? > > I think that's the history behind it. Others more knowledgeable can > correct me as needed. Thanks for the information. > > Again, even if we redesign the whole thing to be less "weird", then > requiring pgoff for each mmap would break existing user space code. And I > think explicitly block mmap before ECREATE make the API more consistent > with the PF handler enforcement and original intent of sgx_encl_may_map. I think permission check in sgx_encl_may_map() isn't restricted related to the vma->pgoff issue here. They are basically two different topics, IIUC. So I am still a little bit confused about where does "SGX driver uses MAP_ANONYMOUS semantics for fd-based mmap()" come from. Anyway, we certainly don't want to break userspace. However, IIUC, even from now on we change the driver to depend on userspace to pass the correct pgoff in mmap(), this won't break userspace, because old userspace which doesn't use fadvice() and pgoff actually doesn't matter. For new userspace which uses fadvice(), it needs to pass the correct pgoff. I am not saying we should do this, but it doesn't seem we can break userspace? > > Thanks again for detailed review and those thoughtful questions. > > Haitao > > [1]https://lore.kernel.org/linux-sgx/CALCETrXf8mSK45h7sTK5Wf+pXLVn=Bjsc_RLpgO-h-qdzBRo5Q@mail.gmail.com/ > [2]https://lore.kernel.org/linux-sgx/CAOASepPFe_ucuwe7JW_-+VBQ4=+sHqyGXOdA9kUbcYA_9=v0sA@mail.gmail.com/ > [3]https://lore.kernel.org/linux-sgx/20190713170804.2340-17-jarkko.sakkinen@linux.intel.com/ Thanks again for digging out those links. I haven't read the entire discussion, though, as they are pretty long.
On Tue, Mar 07, 2023 at 11:32:15PM +0000, Huang, Kai wrote: > And in following mmap()s in which we want to map a small range of enclave: > > encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd, > (encl_addr - encl_base) >> PAGE_SHIFT); > > ? > > Anything wrong above? I'm not sure I fully comprehended your response because it was honestly a bit scattered so please correct me if I'm missing something but: why a process would want to map a small range of an enclave? You anyway most likely want to enter to the enclave. Is this for ptrace()? BR, Jarkko
On Sun, 2023-03-12 at 03:25 +0200, jarkko@kernel.org wrote: > On Tue, Mar 07, 2023 at 11:32:15PM +0000, Huang, Kai wrote: > > And in following mmap()s in which we want to map a small range of enclave: > > > > encl_addr = mmap(encl_addr, MAP_SHARED|MAP_FIXED, encl_fd, > > (encl_addr - encl_base) >> PAGE_SHIFT); > > > > ? > > > > Anything wrong above? > > I'm not sure I fully comprehended your response because it was > honestly a bit scattered so please correct me if I'm missing > something but: why a process would want to map a small range > of an enclave? For SGX2, if mmap(MAP_ANONYMOUS) was used to get the enclave base address before ECREATE, you will need to mmap(encl_fd) for any regions that are not pre- populated (via EADD), no matter whether the first mmap(MAP_ANONYMOUS) has covered all enclave range or not. Otherwise, the fault to SGX2 regions won't be handled by SGX driver. You can find such mmap() in patch 4 in this series too. Also, architecturally, for SGX2 the first mmap() (before ECREATE) doesn't have to map the entire enclave range. For SGX2 it's fine to pass a larger enclave range in ioctl(ECREATE) than the range got from the first mmap(). Userspace can later on choose to only mmap() the dynamic ranges that it wants to use.
On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote: >> >> Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages >> EADDed so user space has to mmap anonymously to reserve the range.The >> intent was still not to allow mmap before pages EADDed (the !page check >> was still there up to v38) > > Do you know the reason of disallowing PROT_NONE mapping against encl_fd? > I think it was to allow user space to do anonymous mapping to reserve address space for enclave. Before this point, you have to use PROT_NONE to reserve with encl_fd. There might be an issue with how #PF and EPC swapping was handled or the elegance of those flows that motivated the move but I can't remember. ABI was not fixed at that time so it was OK to change. > I dig v28 roughly but didn't find any clue. > > Basically no comments related to this were made to: > > [PATCH v28 11/22] x86/sgx: Linux Enclave Driver > >> >> Later version (around v39?) we switched to enforce the permissions in PF >> handler and mmap with any permissions before EADD is allowed, but may >> cause failure later on PF. I'm not sure it was intentional but pretty >> sure >> no meaningful usage for doing mmap before ECREATE due to PF handler >> enforcement. > > IIUC this change around v39 re-allows mmap() against encl_fd before > ECREATE? > I don't think so. There is no meaningful usage for that as I said. > IIUC the only enforcement is VMA's permission must be more restricted > than > enclave page's permission. > > So, theoretically, we can mmap() encl_fd with > PROT_READ|PROT_WRITE|PROT_EXEC, > and then mprotect() the address range based on the actual enclave pages > (in > EADD) before actually doing EADD those pages? > >> >> I think that's the history behind it. Others more knowledgeable can >> correct me as needed. > > Thanks for the information. > >> >> Again, even if we redesign the whole thing to be less "weird", then >> requiring pgoff for each mmap would break existing user space code. And >> I >> think explicitly block mmap before ECREATE make the API more consistent >> with the PF handler enforcement and original intent of sgx_encl_may_map. > > I think permission check in sgx_encl_may_map() isn't restricted related > to the > vma->pgoff issue here. They are basically two different topics, IIUC. > They are related because the intention was not allowing user space to do arbitrary mmaping before EADD. So in the context of the gap you identified for this patch series, i.e, mmap before ECREATE, it is relevant. My view is that we never intended and there is no use case for allowing that to happen. So it naturally follows that we fix that gap by not allowing those scenarios explicitly. > So I am still a little bit confused about where does "SGX driver uses > MAP_ANONYMOUS semantics for fd-based mmap()" come from. > > Anyway, we certainly don't want to break userspace. However, IIUC, even > from > now on we change the driver to depend on userspace to pass the correct > pgoff in > mmap(), this won't break userspace, because old userspace which doesn't > use > fadvice() and pgoff actually doesn't matter. For new userspace which > uses > fadvice(), it needs to pass the correct pgoff. > > I am not saying we should do this, but it doesn't seem we can break > userspace? > I'll leave it for others to chime in on this. With my user space developer hat on, I would not like this because in one place (mmap before madvise call) I have to pass pgoff and not in another place (mmap after EADD). If you see really big issues on the current MAP_ANONYMOUS + enclave_fd semantics, then we should fix the design. But it should be in separate patches. Thanks Haitao
On Tue, Mar 14, 2023 at 09:54:30AM -0500, Haitao Huang wrote: > On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > > > > Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages > > > EADDed so user space has to mmap anonymously to reserve the range.The > > > intent was still not to allow mmap before pages EADDed (the !page check > > > was still there up to v38) > > > > Do you know the reason of disallowing PROT_NONE mapping against encl_fd? > > > > I think it was to allow user space to do anonymous mapping to reserve > address space for enclave. > Before this point, you have to use PROT_NONE to reserve with encl_fd. There > might be an issue with how #PF and EPC swapping was handled or the elegance > of those flows that motivated the move but I can't remember. ABI was not > fixed at that time so it was OK to change. Yes, this was done because enclave naturally aligned base address. BR, Jarkko
On Sun, 2023-03-19 at 15:26 +0200, jarkko@kernel.org wrote: > On Tue, Mar 14, 2023 at 09:54:30AM -0500, Haitao Huang wrote: > > On Thu, 09 Mar 2023 05:31:29 -0600, Huang, Kai <kai.huang@intel.com> wrote: > > > > > > > > Then in v29, PROT_NONE mapping was disallowed for encl_fd before pages > > > > EADDed so user space has to mmap anonymously to reserve the range.The > > > > intent was still not to allow mmap before pages EADDed (the !page check > > > > was still there up to v38) > > > > > > Do you know the reason of disallowing PROT_NONE mapping against encl_fd? > > > > > > > I think it was to allow user space to do anonymous mapping to reserve > > address space for enclave. > > Before this point, you have to use PROT_NONE to reserve with encl_fd. There > > might be an issue with how #PF and EPC swapping was handled or the elegance > > of those flows that motivated the move but I can't remember. ABI was not > > fixed at that time so it was OK to change. > > Yes, this was done because enclave naturally aligned base address. > > Sorry for being ignorant, but you can also get the aligned base address via mmap() against encl_fd, correct? What is the fundamental difference between mmap(MAP_ANONYMOUS) vs mmap(encl_fd)?
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index aa9b8b868867..3a88daddc1a1 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. */ #include <linux/acpi.h> +#include <linux/fadvise.h> #include <linux/miscdevice.h> #include <linux/mman.h> #include <linux/security.h> @@ -9,6 +10,7 @@ #include <asm/traps.h> #include "driver.h" #include "encl.h" +#include "encls.h" u64 sgx_attributes_reserved_mask; u64 sgx_xfrm_reserved_mask = ~0x3; @@ -97,10 +99,81 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &sgx_vm_ops; vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; vma->vm_private_data = encl; + vma->vm_pgoff = PFN_DOWN(vma->vm_start - encl->base); return 0; } +/* + * Add new pages to the enclave sequentially with ENCLS[EAUG] for the WILLNEED advice. + * Only do this to existing VMAs in the same enclave and reject the request otherwise. + */ +static int sgx_fadvise(struct file *file, loff_t offset, loff_t len, int advice) +{ + struct sgx_encl *encl = file->private_data; + unsigned long start = offset + encl->base; + struct vm_area_struct *vma = NULL; + unsigned long end = start + len; + unsigned long pos; + int ret = -EINVAL; + + if (!cpu_feature_enabled(X86_FEATURE_SGX2)) + return -EINVAL; + /* Only support WILLNEED */ + if (advice != POSIX_FADV_WILLNEED) + return -EINVAL; + + if (offset + len < offset) + return -EINVAL; + if (start < encl->base) + return -EINVAL; + if (end < start) + return -EINVAL; + if (end > encl->base + encl->size) + return -EINVAL; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + mmap_read_lock(current->mm); + + vma = find_vma(current->mm, start); + if (!vma) + goto unlock; + if (vma->vm_private_data != encl) + goto unlock; + + pos = start; + if (pos < vma->vm_start || end > vma->vm_end) { + /* Don't allow any gaps */ + goto unlock; + } + + /* Here: vm_start <= pos < end <= vm_end */ + while (pos < end) { + if (xa_load(&encl->page_array, PFN_DOWN(pos))) + continue; + if (signal_pending(current)) { + if (pos == start) + ret = -ERESTARTSYS; + else + ret = -EINTR; + goto unlock; + } + ret = sgx_encl_eaug_page(vma, encl, pos); + /* It's OK to not finish */ + if (ret) + break; + pos = pos + PAGE_SIZE; + cond_resched(); + } + ret = 0; + +unlock: + mmap_read_unlock(current->mm); + return ret; +} + static unsigned long sgx_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, @@ -133,6 +206,7 @@ static const struct file_operations sgx_encl_fops = { .compat_ioctl = sgx_compat_ioctl, #endif .mmap = sgx_mmap, + .fadvise = sgx_fadvise, .get_unmapped_area = sgx_get_unmapped_area, }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 0185c5ab48dd..592cfea4c9e4 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -299,20 +299,17 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, } /** - * sgx_encl_eaug_page() - Dynamically add page to initialized enclave - * @vma: VMA obtained from fault info from where page is accessed - * @encl: enclave accessing the page - * @addr: address that triggered the page fault + * sgx_encl_eaug_page() - Dynamically add an EPC page to initialized enclave + * @vma: the VMA into which the page is to be added + * @encl: the enclave for which the page is to be added + * @addr: the start address of the page to be added * - * When an initialized enclave accesses a page with no backing EPC page - * on a SGX2 system then the EPC can be added dynamically via the SGX2 - * ENCLS[EAUG] instruction. - * - * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed - * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. + * Returns: 0 on EAUG success and PTE was installed successfully, -EBUSY for + * waiting on reclaimer to free EPC, -ENOMEM for out of RAM, -EFAULT for + * all other failures. */ -vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, - struct sgx_encl *encl, unsigned long addr) +int sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) { vm_fault_t vmret = VM_FAULT_SIGBUS; struct sgx_pageinfo pginfo = {0}; @@ -321,10 +318,10 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, struct sgx_va_page *va_page; unsigned long phys_addr; u64 secinfo_flags; - int ret; + int ret = -EFAULT; if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) - return VM_FAULT_SIGBUS; + return -EFAULT; /* * Ignore internal permission checking for dynamically added pages. @@ -335,21 +332,21 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); if (IS_ERR(encl_page)) - return VM_FAULT_OOM; + return -ENOMEM; mutex_lock(&encl->lock); epc_page = sgx_alloc_epc_page(encl_page, false); if (IS_ERR(epc_page)) { if (PTR_ERR(epc_page) == -EBUSY) - vmret = VM_FAULT_NOPAGE; + ret = -EBUSY; goto err_out_unlock; } va_page = sgx_encl_grow(encl, false); if (IS_ERR(va_page)) { if (PTR_ERR(va_page) == -EBUSY) - vmret = VM_FAULT_NOPAGE; + ret = -EBUSY; goto err_out_epc; } @@ -362,16 +359,20 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, * If ret == -EBUSY then page was created in another flow while * running without encl->lock */ - if (ret) + if (ret) { + ret = -EFAULT; goto err_out_shrink; + } pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.metadata = 0; ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); - if (ret) + if (ret) { + ret = -EFAULT; goto err_out; + } encl_page->encl = encl; encl_page->epc_page = epc_page; @@ -388,10 +389,10 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); if (vmret != VM_FAULT_NOPAGE) { mutex_unlock(&encl->lock); - return VM_FAULT_SIGBUS; + return -EFAULT; } mutex_unlock(&encl->lock); - return VM_FAULT_NOPAGE; + return 0; err_out: xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); @@ -404,7 +405,7 @@ vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, mutex_unlock(&encl->lock); kfree(encl_page); - return vmret; + return ret; } static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) @@ -434,8 +435,18 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) * enclave that will be checked for right away. */ if (cpu_feature_enabled(X86_FEATURE_SGX2) && - (!xa_load(&encl->page_array, PFN_DOWN(addr)))) - return sgx_encl_eaug_page(vma, encl, addr); + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) { + switch (sgx_encl_eaug_page(vma, encl, addr)) { + case 0: + case -EBUSY: + return VM_FAULT_NOPAGE; + case -ENOMEM: + return VM_FAULT_OOM; + case -EFAULT: + default: + return VM_FAULT_SIGBUS; + } + } mutex_lock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 9f19b06c3ae3..e5a507871fa3 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -125,7 +125,7 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, unsigned long addr); struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); -vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, - struct sgx_encl *encl, unsigned long addr); +int sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr); #endif /* _X86_ENCL_H */
Support madvise(..., MADV_WILLNEED) by adding EPC pages with EAUG in the newly added fops->fadvise() callback implementation, sgx_fadvise(). Change the return type and values of the sgx_encl_eaug_page function so that more specific error codes are returned for different treatment by the page fault handler and the fadvise callback. On any error, sgx_fadvise() will discontinue further operations and return as normal. The page fault handler allows a PF retried by returning VM_FAULT_NOPAGE in handling -EBUSY returned from sgx_encl_eaug_page. Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com> --- arch/x86/kernel/cpu/sgx/driver.c | 74 ++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/sgx/encl.c | 59 ++++++++++++++----------- arch/x86/kernel/cpu/sgx/encl.h | 4 +- 3 files changed, 111 insertions(+), 26 deletions(-)