Message ID | 20240817-mseal-depessimize-v3-5-d8d2e037df30@gmail.com (mailing list archive) |
---|---|
State | Accepted |
Commit | 23c57d1fa2b9530e38f7964b4e457fed5a7a0ae8 |
Headers | show |
Series | mm: Optimize mseal checks | expand |
* Pedro Falcato <pedro.falcato@gmail.com> [240816 20:18]: > Replace can_modify_mm_madv() with a single vma variant, and associated > checks in madvise. > > While we're at it, also invert the order of checks in: > if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)) > > Checking if we can modify the vma itself (through vm_flags) is > certainly cheaper than is_ro_anon() due to arch_vma_access_permitted() > looking at e.g pkeys registers (with extra branches) in some > architectures. > > This patch allows for partial madvise success when finding a sealed VMA, > which historically has been allowed in Linux. > > Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com> > --- > mm/internal.h | 2 -- > mm/madvise.c | 13 +++---------- > mm/mseal.c | 17 ++++------------- > mm/vma.h | 7 +++++++ > 4 files changed, 14 insertions(+), 25 deletions(-) > > diff --git a/mm/internal.h b/mm/internal.h > index ca422aede342..1db320650539 100644 > --- a/mm/internal.h > +++ b/mm/internal.h
> @@ -1363,8 +1363,6 @@ static inline int can_do_mseal(unsigned long flags) > > bool can_modify_mm(struct mm_struct *mm, unsigned long start, > unsigned long end); > -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, > - unsigned long end, int behavior); > #else > static inline int can_do_mseal(unsigned long flags) > { > diff --git a/mm/madvise.c b/mm/madvise.c > index 89089d84f8df..4e64770be16c 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, > struct anon_vma_name *anon_name; > unsigned long new_flags = vma->vm_flags; > > + if (unlikely(!can_modify_vma_madv(vma, behavior))) > + return -EPERM; > + > switch (behavior) { > case MADV_REMOVE: > return madvise_remove(vma, prev, start, end); > @@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh > start = untagged_addr_remote(mm, start); > end = start + len; > > - /* > - * Check if the address range is sealed for do_madvise(). > - * can_modify_mm_madv assumes we have acquired the lock on MM. > - */ > - if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { > - error = -EPERM; > - goto out; > - } > - > blk_start_plug(&plug); > switch (behavior) { > case MADV_POPULATE_READ: > @@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh > } > blk_finish_plug(&plug); > > -out: > if (write) > mmap_write_unlock(mm); > else > diff --git a/mm/mseal.c b/mm/mseal.c > index 2170e2139ca0..fdd1666344fa 100644 > --- a/mm/mseal.c > +++ b/mm/mseal.c > @@ -75,24 +75,15 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) > } > > /* > - * Check if the vmas of a memory range are allowed to be modified by madvise. > - * the memory ranger can have a gap (unallocated memory). > - * return true, if it is allowed. > + * Check if a vma is allowed to be modified by madvise. 
> */ > -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, > - int behavior) > +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) > { > - struct vm_area_struct *vma; > - > - VMA_ITERATOR(vmi, mm, start); > - > if (!is_madv_discard(behavior)) > return true; > > - /* going through each vma to check. */ > - for_each_vma_range(vmi, vma, end) > - if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) > - return false; > + if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) > + return false; > > /* Allow by default. */ > return true; > diff --git a/mm/vma.h b/mm/vma.h > index e979015cc7fc..da31d0f62157 100644 > --- a/mm/vma.h > +++ b/mm/vma.h > @@ -380,6 +380,8 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) > return true; > } > > +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); > + > #else > > static inline bool can_modify_vma(struct vm_area_struct *vma) > @@ -387,6 +389,11 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) > return true; > } > > +static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) > +{ > + return true; > +} > + > #endif > > #endif /* __MM_VMA_H */ > > -- > 2.46.0 >
On Sat, Aug 17, 2024 at 01:18:32AM GMT, Pedro Falcato wrote: > Replace can_modify_mm_madv() with a single vma variant, and associated > checks in madvise. > > While we're at it, also invert the order of checks in: > if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)) > > Checking if we can modify the vma itself (through vm_flags) is > certainly cheaper than is_ro_anon() due to arch_vma_access_permitted() > looking at e.g pkeys registers (with extra branches) in some > architectures. > > This patch allows for partial madvise success when finding a sealed VMA, > which historically has been allowed in Linux. > > Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com> > --- > mm/internal.h | 2 -- > mm/madvise.c | 13 +++---------- > mm/mseal.c | 17 ++++------------- > mm/vma.h | 7 +++++++ > 4 files changed, 14 insertions(+), 25 deletions(-) > > diff --git a/mm/internal.h b/mm/internal.h > index ca422aede342..1db320650539 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -1363,8 +1363,6 @@ static inline int can_do_mseal(unsigned long flags) > > bool can_modify_mm(struct mm_struct *mm, unsigned long start, > unsigned long end); > -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, > - unsigned long end, int behavior); > #else > static inline int can_do_mseal(unsigned long flags) > { > diff --git a/mm/madvise.c b/mm/madvise.c > index 89089d84f8df..4e64770be16c 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, > struct anon_vma_name *anon_name; > unsigned long new_flags = vma->vm_flags; > > + if (unlikely(!can_modify_vma_madv(vma, behavior))) > + return -EPERM; > + > switch (behavior) { > case MADV_REMOVE: > return madvise_remove(vma, prev, start, end); > @@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh > start = untagged_addr_remote(mm, start); > end = start + len; > > - /* > - * Check if the address range is 
sealed for do_madvise(). > - * can_modify_mm_madv assumes we have acquired the lock on MM. > - */ > - if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { > - error = -EPERM; > - goto out; > - } > - > blk_start_plug(&plug); > switch (behavior) { > case MADV_POPULATE_READ: > @@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh > } > blk_finish_plug(&plug); > > -out: > if (write) > mmap_write_unlock(mm); > else > diff --git a/mm/mseal.c b/mm/mseal.c > index 2170e2139ca0..fdd1666344fa 100644 > --- a/mm/mseal.c > +++ b/mm/mseal.c > @@ -75,24 +75,15 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) > } > > /* > - * Check if the vmas of a memory range are allowed to be modified by madvise. > - * the memory ranger can have a gap (unallocated memory). > - * return true, if it is allowed. > + * Check if a vma is allowed to be modified by madvise. > */ > -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, > - int behavior) > +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) > { > - struct vm_area_struct *vma; > - > - VMA_ITERATOR(vmi, mm, start); > - > if (!is_madv_discard(behavior)) > return true; > > - /* going through each vma to check. */ > - for_each_vma_range(vmi, vma, end) > - if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) > - return false; > + if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) > + return false; Not your fault, but I find it extremely irritating that something this subtle has literally zero comments. mseal()'d + user does not have permission to modify pages = potentially discards, as per the original message: 6> Some destructive madvice() behaviors (e.g. MADV_DONTNEED) for anonymous memory, when users don't have write permission to the memory. Those behaviors can alter region contents by discarding pages, effectively a memset(0) for anonymous memory. 
For something so invasive to just leave this as implied + needing to look up the commit message to understand is just... yeah. But again, not your fault... > > /* Allow by default. */ > return true; > diff --git a/mm/vma.h b/mm/vma.h > index e979015cc7fc..da31d0f62157 100644 > --- a/mm/vma.h > +++ b/mm/vma.h > @@ -380,6 +380,8 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) > return true; > } > > +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); > + > #else > > static inline bool can_modify_vma(struct vm_area_struct *vma) > @@ -387,6 +389,11 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) > return true; > } > > +static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) > +{ > + return true; > +} > + > #endif > > #endif /* __MM_VMA_H */ > > -- > 2.46.0 > I remain baffled that the original implementation tried to do these things at an mm-granularity. Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
diff --git a/mm/internal.h b/mm/internal.h index ca422aede342..1db320650539 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1363,8 +1363,6 @@ static inline int can_do_mseal(unsigned long flags) bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end); -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, - unsigned long end, int behavior); #else static inline int can_do_mseal(unsigned long flags) { diff --git a/mm/madvise.c b/mm/madvise.c index 89089d84f8df..4e64770be16c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; + if (unlikely(!can_modify_vma_madv(vma, behavior))) + return -EPERM; + switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); @@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr_remote(mm, start); end = start + len; - /* - * Check if the address range is sealed for do_madvise(). - * can_modify_mm_madv assumes we have acquired the lock on MM. - */ - if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) { - error = -EPERM; - goto out; - } - blk_start_plug(&plug); switch (behavior) { case MADV_POPULATE_READ: @@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh } blk_finish_plug(&plug); -out: if (write) mmap_write_unlock(mm); else diff --git a/mm/mseal.c b/mm/mseal.c index 2170e2139ca0..fdd1666344fa 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -75,24 +75,15 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) } /* - * Check if the vmas of a memory range are allowed to be modified by madvise. - * the memory ranger can have a gap (unallocated memory). - * return true, if it is allowed. + * Check if a vma is allowed to be modified by madvise. 
*/ -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end, - int behavior) +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { - struct vm_area_struct *vma; - - VMA_ITERATOR(vmi, mm, start); - if (!is_madv_discard(behavior)) return true; - /* going through each vma to check. */ - for_each_vma_range(vmi, vma, end) - if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) - return false; + if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) + return false; /* Allow by default. */ return true; diff --git a/mm/vma.h b/mm/vma.h index e979015cc7fc..da31d0f62157 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -380,6 +380,8 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) return true; } +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); + #else static inline bool can_modify_vma(struct vm_area_struct *vma) @@ -387,6 +389,11 @@ static inline bool can_modify_vma(struct vm_area_struct *vma) return true; } +static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) +{ + return true; +} + #endif #endif /* __MM_VMA_H */
Replace can_modify_mm_madv() with a single vma variant, and associated checks in madvise. While we're at it, also invert the order of checks in: if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))) Checking if we can modify the vma itself (through vm_flags) is certainly cheaper than is_ro_anon() due to arch_vma_access_permitted() looking at e.g. pkeys registers (with extra branches) in some architectures. This patch allows for partial madvise success when finding a sealed VMA, which historically has been allowed in Linux. Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com> --- mm/internal.h | 2 -- mm/madvise.c | 13 +++---------- mm/mseal.c | 17 ++++------------- mm/vma.h | 7 +++++++ 4 files changed, 14 insertions(+), 25 deletions(-)