Message ID | 20240911143421.85612-2-faresx@amazon.de (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | support for mm-local memory allocations and use it | expand |
* Fares Mehanna <faresx@amazon.de> [240911 10:36]: > To make sure the kernel mm-local mapping is untouched by the user, we will seal > the VMA before changing the protection to be used by the kernel. > > This will guarantee that userspace can't unmap or alter this VMA while it is > being used by the kernel. > > After the kernel is done with the secret memory, it will unseal the VMA to be > able to unmap and free it. > > Unseal operation is not exposed to userspace. We can't use the mseal feature for this; it is supposed to be a one way transition. Willy describes the feature best here [1]. It is not clear from the change log above or the cover letter as to why you need to go this route instead of using the mmap lock. [1] https://lore.kernel.org/lkml/ZS%2F3GCKvNn5qzhC4@casper.infradead.org/ > > Signed-off-by: Fares Mehanna <faresx@amazon.de> > Signed-off-by: Roman Kagan <rkagan@amazon.de> > --- > mm/internal.h | 7 +++++ > mm/mseal.c | 81 ++++++++++++++++++++++++++++++++------------------- > 2 files changed, 58 insertions(+), 30 deletions(-) > > diff --git a/mm/internal.h b/mm/internal.h > index b4d86436565b..cf7280d101e9 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -1501,6 +1501,8 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, > unsigned long end); > bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, > unsigned long end, int behavior); > +/* mm's mmap write lock must be taken before seal/unseal operation */ > +int do_mseal(unsigned long start, unsigned long end, bool seal); > #else > static inline int can_do_mseal(unsigned long flags) > { > @@ -1518,6 +1520,11 @@ static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, > { > return true; > } > + > +static inline int do_mseal(unsigned long start, unsigned long end, bool seal) > +{ > + return -EINVAL; > +} > #endif > > #ifdef CONFIG_SHRINKER_DEBUG > diff --git a/mm/mseal.c b/mm/mseal.c > index 15bba28acc00..aac9399ffd5d 100644 > --- a/mm/mseal.c > +++ b/mm/mseal.c > @@ -26,6 +26,11 @@ static inline void set_vma_sealed(struct vm_area_struct *vma) > vm_flags_set(vma, VM_SEALED); > } > > +static inline void clear_vma_sealed(struct vm_area_struct *vma) > +{ > + vm_flags_clear(vma, VM_SEALED); > +} > + > /* > * check if a vma is sealed for modification. > * return true, if modification is allowed. > @@ -117,7 +122,7 @@ bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long > > static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, > struct vm_area_struct **prev, unsigned long start, > - unsigned long end, vm_flags_t newflags) > + unsigned long end, vm_flags_t newflags, bool seal) > { > int ret = 0; > vm_flags_t oldflags = vma->vm_flags; > @@ -131,7 +136,10 @@ static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, > goto out; > } > > - set_vma_sealed(vma); > + if (seal) > + set_vma_sealed(vma); > + else > + clear_vma_sealed(vma); > out: > *prev = vma; > return ret; > @@ -167,9 +175,9 @@ static int check_mm_seal(unsigned long start, unsigned long end) > } > > /* > - * Apply sealing. > + * Apply sealing / unsealing. > */ > -static int apply_mm_seal(unsigned long start, unsigned long end) > +static int apply_mm_seal(unsigned long start, unsigned long end, bool seal) > { > unsigned long nstart; > struct vm_area_struct *vma, *prev; > @@ -191,11 +199,14 @@ static int apply_mm_seal(unsigned long start, unsigned long end) > unsigned long tmp; > vm_flags_t newflags; > > - newflags = vma->vm_flags | VM_SEALED; > + if (seal) > + newflags = vma->vm_flags | VM_SEALED; > + else > + newflags = vma->vm_flags & ~(VM_SEALED); > tmp = vma->vm_end; > if (tmp > end) > tmp = end; > - error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); > + error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags, seal); > if (error) > return error; > nstart = vma_iter_end(&vmi); > @@ -204,6 +215,37 @@ static int apply_mm_seal(unsigned long start, unsigned long end) > return 0; > } > > +int do_mseal(unsigned long start, unsigned long end, bool seal) > +{ > + int ret; > + > + if (end < start) > + return -EINVAL; > + > + if (end == start) > + return 0; > + > + /* > + * First pass, this helps to avoid > + * partial sealing in case of error in input address range, > + * e.g. ENOMEM error. > + */ > + ret = check_mm_seal(start, end); > + if (ret) > + goto out; > + > + /* > + * Second pass, this should success, unless there are errors > + * from vma_modify_flags, e.g. merge/split error, or process > + * reaching the max supported VMAs, however, those cases shall > + * be rare. > + */ > + ret = apply_mm_seal(start, end, seal); > + > +out: > + return ret; > +} > + > /* > * mseal(2) seals the VM's meta data from > * selected syscalls. > @@ -256,7 +298,7 @@ static int apply_mm_seal(unsigned long start, unsigned long end) > * > * unseal() is not supported. > */ > -static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) > +static int __do_mseal(unsigned long start, size_t len_in, unsigned long flags) > { > size_t len; > int ret = 0; > @@ -277,33 +319,12 @@ static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) > return -EINVAL; > > end = start + len; > - if (end < start) > - return -EINVAL; > - > - if (end == start) > - return 0; > > if (mmap_write_lock_killable(mm)) > return -EINTR; > > - /* > - * First pass, this helps to avoid > - * partial sealing in case of error in input address range, > - * e.g. ENOMEM error. > - */ > - ret = check_mm_seal(start, end); > - if (ret) > - goto out; > - > - /* > - * Second pass, this should success, unless there are errors > - * from vma_modify_flags, e.g. merge/split error, or process > - * reaching the max supported VMAs, however, those cases shall > - * be rare. > - */ > - ret = apply_mm_seal(start, end); > + ret = do_mseal(start, end, true); > > -out: > mmap_write_unlock(current->mm); > return ret; > } > @@ -311,5 +332,5 @@ static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) > SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, > flags) > { > - return do_mseal(start, len, flags); > + return __do_mseal(start, len, flags); > } > -- > 2.40.1 > > > > > Amazon Web Services Development Center Germany GmbH > Krausenstr. 38 > 10117 Berlin > Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss > Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B > Sitz: Berlin > Ust-ID: DE 365 538 597 > >
Hi, Thanks for taking a look and apologies for my delayed response. > It is not clear from the change log above or the cover letter as to why > you need to go this route instead of using the mmap lock. In the current form of the patches I use memfd_secret() to allocate the pages and remove them from kernel linear address. [1] This allocate pages, map them in user virtual addresses and track them in a VMA. Before flipping the permissions on those pages to be used by the kernel, I need to make sure that those virtual addresses and this VMA is off-limits to the owning process. memfd_secret() pages are locked by default, so they won't swap out. I need to seal the VMA to make sure the owner process can't unmap/remap/... or change the protection of this VMA. So before changing the permissions on the secret pages, I make sure the pages are faulted in, locked and sealed. So userspace can't influence this mapping. > We can't use the mseal feature for this; it is supposed to be a one way > transition. For this approach, I need the unseal operation when releasing the memory range. The kernel can be done with the secret pages in one of two scenarios: 1. During lifecycle of the process. 2. When the process terminates. For the first case, I need to unmap the VMA so it can be reused by the owning process later, so I need the unseal operation. For the second case however we don't need that since the process mm is already destructed or just about to be destructed anyway, regardless of sealed/unsealed VMAs. [1] I didn't expose the unseal operation to userspace. [1] https://lore.kernel.org/linux-arm-kernel/20240911143421.85612-3-faresx@amazon.de/ Thanks! Fares. Amazon Web Services Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B Sitz: Berlin Ust-ID: DE 365 538 597
diff --git a/mm/internal.h b/mm/internal.h index b4d86436565b..cf7280d101e9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1501,6 +1501,8 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end); bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, int behavior); +/* mm's mmap write lock must be taken before seal/unseal operation */ +int do_mseal(unsigned long start, unsigned long end, bool seal); #else static inline int can_do_mseal(unsigned long flags) { @@ -1518,6 +1520,11 @@ static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, { return true; } + +static inline int do_mseal(unsigned long start, unsigned long end, bool seal) +{ + return -EINVAL; +} #endif #ifdef CONFIG_SHRINKER_DEBUG diff --git a/mm/mseal.c b/mm/mseal.c index 15bba28acc00..aac9399ffd5d 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -26,6 +26,11 @@ static inline void set_vma_sealed(struct vm_area_struct *vma) vm_flags_set(vma, VM_SEALED); } +static inline void clear_vma_sealed(struct vm_area_struct *vma) +{ + vm_flags_clear(vma, VM_SEALED); +} + /* * check if a vma is sealed for modification. * return true, if modification is allowed. @@ -117,7 +122,7 @@ bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, vm_flags_t newflags) + unsigned long end, vm_flags_t newflags, bool seal) { int ret = 0; vm_flags_t oldflags = vma->vm_flags; @@ -131,7 +136,10 @@ static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; } - set_vma_sealed(vma); + if (seal) + set_vma_sealed(vma); + else + clear_vma_sealed(vma); out: *prev = vma; return ret; @@ -167,9 +175,9 @@ static int check_mm_seal(unsigned long start, unsigned long end) } /* - * Apply sealing. + * Apply sealing / unsealing. */ -static int apply_mm_seal(unsigned long start, unsigned long end) +static int apply_mm_seal(unsigned long start, unsigned long end, bool seal) { unsigned long nstart; struct vm_area_struct *vma, *prev; @@ -191,11 +199,14 @@ static int apply_mm_seal(unsigned long start, unsigned long end) unsigned long tmp; vm_flags_t newflags; - newflags = vma->vm_flags | VM_SEALED; + if (seal) + newflags = vma->vm_flags | VM_SEALED; + else + newflags = vma->vm_flags & ~(VM_SEALED); tmp = vma->vm_end; if (tmp > end) tmp = end; - error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); + error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags, seal); if (error) return error; nstart = vma_iter_end(&vmi); @@ -204,6 +215,37 @@ static int apply_mm_seal(unsigned long start, unsigned long end) return 0; } +int do_mseal(unsigned long start, unsigned long end, bool seal) +{ + int ret; + + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + /* + * First pass, this helps to avoid + * partial sealing in case of error in input address range, + * e.g. ENOMEM error. + */ + ret = check_mm_seal(start, end); + if (ret) + goto out; + + /* + * Second pass, this should success, unless there are errors + * from vma_modify_flags, e.g. merge/split error, or process + * reaching the max supported VMAs, however, those cases shall + * be rare. + */ + ret = apply_mm_seal(start, end, seal); + +out: + return ret; +} + /* * mseal(2) seals the VM's meta data from * selected syscalls. @@ -256,7 +298,7 @@ static int apply_mm_seal(unsigned long start, unsigned long end) * * unseal() is not supported. */ -static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +static int __do_mseal(unsigned long start, size_t len_in, unsigned long flags) { size_t len; int ret = 0; @@ -277,33 +319,12 @@ static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) return -EINVAL; end = start + len; - if (end < start) - return -EINVAL; - - if (end == start) - return 0; if (mmap_write_lock_killable(mm)) return -EINTR; - /* - * First pass, this helps to avoid - * partial sealing in case of error in input address range, - * e.g. ENOMEM error. - */ - ret = check_mm_seal(start, end); - if (ret) - goto out; - - /* - * Second pass, this should success, unless there are errors - * from vma_modify_flags, e.g. merge/split error, or process - * reaching the max supported VMAs, however, those cases shall - * be rare. - */ - ret = apply_mm_seal(start, end); + ret = do_mseal(start, end, true); -out: mmap_write_unlock(current->mm); return ret; } @@ -311,5 +332,5 @@ static int do_mseal(unsigned long start, size_t len_in, unsigned long flags) SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, flags) { - return do_mseal(start, len, flags); + return __do_mseal(start, len, flags); }