diff mbox series

[v3,5/7] mseal: Replace can_modify_mm_madv with a vma variant

Message ID 20240817-mseal-depessimize-v3-5-d8d2e037df30@gmail.com (mailing list archive)
State Accepted
Commit 23c57d1fa2b9530e38f7964b4e457fed5a7a0ae8
Headers show
Series mm: Optimize mseal checks | expand

Commit Message

Pedro Falcato Aug. 17, 2024, 12:18 a.m. UTC
Replace can_modify_mm_madv() with a single vma variant, and associated
checks in madvise.

While we're at it, also invert the order of checks in:
 if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))

Checking if we can modify the vma itself (through vm_flags) is
certainly cheaper than is_ro_anon() due to arch_vma_access_permitted()
looking at e.g. pkeys registers (with extra branches) in some
architectures.

This patch allows for partial madvise success when finding a sealed VMA,
which historically has been allowed in Linux.

Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
---
 mm/internal.h |  2 --
 mm/madvise.c  | 13 +++----------
 mm/mseal.c    | 17 ++++-------------
 mm/vma.h      |  7 +++++++
 4 files changed, 14 insertions(+), 25 deletions(-)

Comments

Liam R. Howlett Aug. 19, 2024, 8:32 p.m. UTC | #1
* Pedro Falcato <pedro.falcato@gmail.com> [240816 20:18]:
> Replace can_modify_mm_madv() with a single vma variant, and associated
> checks in madvise.
> 
> While we're at it, also invert the order of checks in:
>  if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))
> 
> Checking if we can modify the vma itself (through vm_flags) is
> certainly cheaper than is_ro_anon() due to arch_vma_access_permitted()
> looking at e.g pkeys registers (with extra branches) in some
> architectures.
> 
> This patch allows for partial madvise success when finding a sealed VMA,
> which historically has been allowed in Linux.
> 
> Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com>

> ---
>  mm/internal.h |  2 --
>  mm/madvise.c  | 13 +++----------
>  mm/mseal.c    | 17 ++++-------------
>  mm/vma.h      |  7 +++++++
>  4 files changed, 14 insertions(+), 25 deletions(-)
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index ca422aede342..1db320650539 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1363,8 +1363,6 @@ static inline int can_do_mseal(unsigned long flags)
>  
>  bool can_modify_mm(struct mm_struct *mm, unsigned long start,
>  		unsigned long end);
> -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> -		unsigned long end, int behavior);
>  #else
>  static inline int can_do_mseal(unsigned long flags)
>  {
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 89089d84f8df..4e64770be16c 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
>  	struct anon_vma_name *anon_name;
>  	unsigned long new_flags = vma->vm_flags;
>  
> +	if (unlikely(!can_modify_vma_madv(vma, behavior)))
> +		return -EPERM;
> +
>  	switch (behavior) {
>  	case MADV_REMOVE:
>  		return madvise_remove(vma, prev, start, end);
> @@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
>  	start = untagged_addr_remote(mm, start);
>  	end = start + len;
>  
> -	/*
> -	 * Check if the address range is sealed for do_madvise().
> -	 * can_modify_mm_madv assumes we have acquired the lock on MM.
> -	 */
> -	if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
> -		error = -EPERM;
> -		goto out;
> -	}
> -
>  	blk_start_plug(&plug);
>  	switch (behavior) {
>  	case MADV_POPULATE_READ:
> @@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
>  	}
>  	blk_finish_plug(&plug);
>  
> -out:
>  	if (write)
>  		mmap_write_unlock(mm);
>  	else
> diff --git a/mm/mseal.c b/mm/mseal.c
> index 2170e2139ca0..fdd1666344fa 100644
> --- a/mm/mseal.c
> +++ b/mm/mseal.c
> @@ -75,24 +75,15 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
>  }
>  
>  /*
> - * Check if the vmas of a memory range are allowed to be modified by madvise.
> - * the memory ranger can have a gap (unallocated memory).
> - * return true, if it is allowed.
> + * Check if a vma is allowed to be modified by madvise.
>   */
> -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
> -		int behavior)
> +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
>  {
> -	struct vm_area_struct *vma;
> -
> -	VMA_ITERATOR(vmi, mm, start);
> -
>  	if (!is_madv_discard(behavior))
>  		return true;
>  
> -	/* going through each vma to check. */
> -	for_each_vma_range(vmi, vma, end)
> -		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
> -			return false;
> +	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
> +		return false;
>  
>  	/* Allow by default. */
>  	return true;
> diff --git a/mm/vma.h b/mm/vma.h
> index e979015cc7fc..da31d0f62157 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -380,6 +380,8 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
>  	return true;
>  }
>  
> +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
> +
>  #else
>  
>  static inline bool can_modify_vma(struct vm_area_struct *vma)
> @@ -387,6 +389,11 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
>  	return true;
>  }
>  
> +static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
> +{
> +	return true;
> +}
> +
>  #endif
>  
>  #endif	/* __MM_VMA_H */
> 
> -- 
> 2.46.0
>
Lorenzo Stoakes Aug. 21, 2024, 8:41 a.m. UTC | #2
On Sat, Aug 17, 2024 at 01:18:32AM GMT, Pedro Falcato wrote:
> Replace can_modify_mm_madv() with a single vma variant, and associated
> checks in madvise.
>
> While we're at it, also invert the order of checks in:
>  if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma))
>
> Checking if we can modify the vma itself (through vm_flags) is
> certainly cheaper than is_ro_anon() due to arch_vma_access_permitted()
> looking at e.g pkeys registers (with extra branches) in some
> architectures.
>
> This patch allows for partial madvise success when finding a sealed VMA,
> which historically has been allowed in Linux.
>
> Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
> ---
>  mm/internal.h |  2 --
>  mm/madvise.c  | 13 +++----------
>  mm/mseal.c    | 17 ++++-------------
>  mm/vma.h      |  7 +++++++
>  4 files changed, 14 insertions(+), 25 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index ca422aede342..1db320650539 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1363,8 +1363,6 @@ static inline int can_do_mseal(unsigned long flags)
>
>  bool can_modify_mm(struct mm_struct *mm, unsigned long start,
>  		unsigned long end);
> -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
> -		unsigned long end, int behavior);
>  #else
>  static inline int can_do_mseal(unsigned long flags)
>  {
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 89089d84f8df..4e64770be16c 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
>  	struct anon_vma_name *anon_name;
>  	unsigned long new_flags = vma->vm_flags;
>
> +	if (unlikely(!can_modify_vma_madv(vma, behavior)))
> +		return -EPERM;
> +
>  	switch (behavior) {
>  	case MADV_REMOVE:
>  		return madvise_remove(vma, prev, start, end);
> @@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
>  	start = untagged_addr_remote(mm, start);
>  	end = start + len;
>
> -	/*
> -	 * Check if the address range is sealed for do_madvise().
> -	 * can_modify_mm_madv assumes we have acquired the lock on MM.
> -	 */
> -	if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
> -		error = -EPERM;
> -		goto out;
> -	}
> -
>  	blk_start_plug(&plug);
>  	switch (behavior) {
>  	case MADV_POPULATE_READ:
> @@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
>  	}
>  	blk_finish_plug(&plug);
>
> -out:
>  	if (write)
>  		mmap_write_unlock(mm);
>  	else
> diff --git a/mm/mseal.c b/mm/mseal.c
> index 2170e2139ca0..fdd1666344fa 100644
> --- a/mm/mseal.c
> +++ b/mm/mseal.c
> @@ -75,24 +75,15 @@ bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
>  }
>
>  /*
> - * Check if the vmas of a memory range are allowed to be modified by madvise.
> - * the memory ranger can have a gap (unallocated memory).
> - * return true, if it is allowed.
> + * Check if a vma is allowed to be modified by madvise.
>   */
> -bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
> -		int behavior)
> +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
>  {
> -	struct vm_area_struct *vma;
> -
> -	VMA_ITERATOR(vmi, mm, start);
> -
>  	if (!is_madv_discard(behavior))
>  		return true;
>
> -	/* going through each vma to check. */
> -	for_each_vma_range(vmi, vma, end)
> -		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
> -			return false;
> +	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
> +		return false;

Not your fault, but I find it extremely irritating that something this subtle
has literally zero comments.

mseal()'d + user does not have permission to modify pages = potentially
discards, as per the original message:

   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for anonymous
      memory, when users don't have write permission to the memory. Those
      behaviors can alter region contents by discarding pages, effectively a
      memset(0) for anonymous memory.

For something so invasive to just leave this as implied + needing to look
up the commit message to understand is just... yeah. But again, not your
fault...

>
>  	/* Allow by default. */
>  	return true;
> diff --git a/mm/vma.h b/mm/vma.h
> index e979015cc7fc..da31d0f62157 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -380,6 +380,8 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
>  	return true;
>  }
>
> +bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
> +
>  #else
>
>  static inline bool can_modify_vma(struct vm_area_struct *vma)
> @@ -387,6 +389,11 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
>  	return true;
>  }
>
> +static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
> +{
> +	return true;
> +}
> +
>  #endif
>
>  #endif	/* __MM_VMA_H */
>
> --
> 2.46.0
>

I remain baffled that the original implementation tried to do these things
at mm granularity.

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
diff mbox series

Patch

diff --git a/mm/internal.h b/mm/internal.h
index ca422aede342..1db320650539 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1363,8 +1363,6 @@  static inline int can_do_mseal(unsigned long flags)
 
 bool can_modify_mm(struct mm_struct *mm, unsigned long start,
 		unsigned long end);
-bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
-		unsigned long end, int behavior);
 #else
 static inline int can_do_mseal(unsigned long flags)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index 89089d84f8df..4e64770be16c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1031,6 +1031,9 @@  static int madvise_vma_behavior(struct vm_area_struct *vma,
 	struct anon_vma_name *anon_name;
 	unsigned long new_flags = vma->vm_flags;
 
+	if (unlikely(!can_modify_vma_madv(vma, behavior)))
+		return -EPERM;
+
 	switch (behavior) {
 	case MADV_REMOVE:
 		return madvise_remove(vma, prev, start, end);
@@ -1448,15 +1451,6 @@  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	start = untagged_addr_remote(mm, start);
 	end = start + len;
 
-	/*
-	 * Check if the address range is sealed for do_madvise().
-	 * can_modify_mm_madv assumes we have acquired the lock on MM.
-	 */
-	if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
-		error = -EPERM;
-		goto out;
-	}
-
 	blk_start_plug(&plug);
 	switch (behavior) {
 	case MADV_POPULATE_READ:
@@ -1470,7 +1464,6 @@  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	}
 	blk_finish_plug(&plug);
 
-out:
 	if (write)
 		mmap_write_unlock(mm);
 	else
diff --git a/mm/mseal.c b/mm/mseal.c
index 2170e2139ca0..fdd1666344fa 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -75,24 +75,15 @@  bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
 }
 
 /*
- * Check if the vmas of a memory range are allowed to be modified by madvise.
- * the memory ranger can have a gap (unallocated memory).
- * return true, if it is allowed.
+ * Check if a vma is allowed to be modified by madvise.
  */
-bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
-		int behavior)
+bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
 {
-	struct vm_area_struct *vma;
-
-	VMA_ITERATOR(vmi, mm, start);
-
 	if (!is_madv_discard(behavior))
 		return true;
 
-	/* going through each vma to check. */
-	for_each_vma_range(vmi, vma, end)
-		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
-			return false;
+	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
+		return false;
 
 	/* Allow by default. */
 	return true;
diff --git a/mm/vma.h b/mm/vma.h
index e979015cc7fc..da31d0f62157 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -380,6 +380,8 @@  static inline bool can_modify_vma(struct vm_area_struct *vma)
 	return true;
 }
 
+bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
+
 #else
 
 static inline bool can_modify_vma(struct vm_area_struct *vma)
@@ -387,6 +389,11 @@  static inline bool can_modify_vma(struct vm_area_struct *vma)
 	return true;
 }
 
+static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
+{
+	return true;
+}
+
 #endif
 
 #endif	/* __MM_VMA_H */