[v6,15/20] mm: Change failure of MAP_FIXED to restoring the gap on failure

Message ID: 20240820235730.2852400-16-Liam.Howlett@oracle.com
State: New
Series: Avoid MAP_FIXED gap exposure

Commit Message

Liam R. Howlett Aug. 20, 2024, 11:57 p.m. UTC
From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>

Prior to call_mmap(), the vmas that will be replaced need to clear the
way for what may happen during call_mmap().  This cleanup work includes
clearing the ptes and calling the close() vm_ops.  Some users do more
setup than can be restored by calling the vm_ops open() function.  It is
safer to store the gap in the vma tree in these cases.

That is to say that the failure scenario that existed before the
MAP_FIXED gap exposure is restored as it is safer than trying to undo a
partial mapping.

There is also a secondary failure that may occur if there is not enough
memory to store the gap.  In this case, the vmas are reattached and
resources freed.  If the system cannot complete the call_mmap() and
fails to allocate with GFP_KERNEL, then the system will print a warning
about the failure.

Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
---
 mm/mmap.c |  3 +--
 mm/vma.h  | 62 +++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 43 insertions(+), 22 deletions(-)
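
For context, the "gap" relied on here comes from maple tree semantics:
storing a NULL entry over a range makes lookups in that range return
NULL, so the region reads as unmapped.  A minimal, self-contained sketch
of that behaviour (hypothetical demo code, not part of this patch):

#include <linux/maple_tree.h>
#include <linux/gfp.h>

static DEFINE_MTREE(demo_mt);

/* Store an entry over a range, then overwrite the range with NULL. */
static int gap_demo(void *vma_like)
{
	int ret;

	ret = mtree_store_range(&demo_mt, 0x1000, 0x1fff, vma_like, GFP_KERNEL);
	if (ret)
		return ret;

	/* The equivalent of the abort path: write NULL over the same range. */
	ret = mtree_store_range(&demo_mt, 0x1000, 0x1fff, NULL, GFP_KERNEL);
	if (ret)
		return ret;

	/* The range now reads as a gap. */
	WARN_ON(mtree_load(&demo_mt, 0x1500) != NULL);
	return 0;
}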

Comments

Lorenzo Stoakes Aug. 21, 2024, 11:56 a.m. UTC | #1
On Tue, Aug 20, 2024 at 07:57:24PM GMT, Liam R. Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
>
> Prior to call_mmap(), the vmas that will be replaced need to clear the
> way for what may happen in the call_mmap().  This clean up work includes
> clearing the ptes and calling the close() vm_ops.  Some users do more
> setup than can be restored by calling the vm_ops open() function.  It is
> safer to store the gap in the vma tree in these cases.
>
> That is to say that the failure scenario that existed before the
> MAP_FIXED gap exposure is restored as it is safer than trying to undo a
> partial mapping.
>
> There is also a secondary failure that may occur if there is not enough
> memory to store the gap.  In this case, the vmas are reattached and
> resources freed.  If the system cannot complete the call_mmap() and
> fails to allocate with GFP_KERNEL, then the system will print a warning
> about the failure.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>

Just some nits etc. below, otherwise:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

> ---
>  mm/mmap.c |  3 +--
>  mm/vma.h  | 62 +++++++++++++++++++++++++++++++++++++------------------
>  2 files changed, 43 insertions(+), 22 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 6550d9470d3a..c1b3d17f97be 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1622,8 +1622,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		vm_unacct_memory(charged);
>
>  abort_munmap:
> -	if (vms.nr_pages)
> -		abort_munmap_vmas(&mas_detach, vms.closed_vm_ops);
> +	vms_abort_munmap_vmas(&vms, &mas_detach, vms.closed_vm_ops);
>  	validate_mm(mm);
>  	return error;
>  }
> diff --git a/mm/vma.h b/mm/vma.h
> index 756dd42a6ec4..7618ddbfd2b2 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -82,6 +82,22 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	       unsigned long start, unsigned long end, pgoff_t pgoff);
>
> +static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
> +			struct vm_area_struct *vma, gfp_t gfp)
> +
> +{
> +	if (vmi->mas.status != ma_start &&
> +	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> +		vma_iter_invalidate(vmi);
> +
> +	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> +	mas_store_gfp(&vmi->mas, vma, gfp);
> +	if (unlikely(mas_is_err(&vmi->mas)))
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
>  /*
>   * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
>   * @vms: The vma munmap struct
> @@ -136,15 +152,37 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
>  	struct vm_area_struct *vma;
>
>  	mas_set(mas_detach, 0);
> -	mas_for_each(mas_detach, vma, ULONG_MAX) {
> +	mas_for_each(mas_detach, vma, ULONG_MAX)
>  		vma_mark_detached(vma, false);
> -		if (closed && vma->vm_ops && vma->vm_ops->open)
> -			vma->vm_ops->open(vma);
> -	}

Yes.

>
>  	__mt_destroy(mas_detach->tree);
>  }
>
> +static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
> +		struct ma_state *mas_detach, bool closed)

Nitty, but seems strange to comment abort_munmap_vmas() and say that it
undoes any munmap work + frees resources, but not to comment this function.

Also I wonder if, with these changes, abort_munmap_vmas() needs a rename?
Something like reattach_vmas() or something?

> +{
> +	if (!vms->nr_pages)
> +		return;
> +
> +	if (vms->clear_ptes)
> +		return abort_munmap_vmas(mas_detach, vms->closed_vm_ops);
> +
> +	/*
> +	 * Aborting cannot just call the vm_ops open() because they are often
> +	 * not symmetrical and state data has been lost.  Resort to the old
> +	 * failure method of leaving a gap where the MAP_FIXED mapping failed.
> +	 */
> +	if (unlikely(vma_iter_store_gfp(vms->vmi, NULL, GFP_KERNEL))) {
> +		pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
> +			     current->comm, current->pid);
> +		/* Leaving vmas detached and in-tree may hamper recovery */
> +		abort_munmap_vmas(mas_detach, vms->closed_vm_ops);

OK so I guess rather than trying to do some clever preallocation, we just
warn + get out?

> +	} else {
> +		/* Clean up the insertion of unfortunate the gap */

I'm not sure what this means :P 'unfortunate, the gap, isn't it?'

> +		vms_complete_munmap_vmas(vms, mas_detach);
> +	}
> +}
> +
>  int
>  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		    struct mm_struct *mm, unsigned long start,
> @@ -297,22 +335,6 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
>  	return mas_prev(&vmi->mas, min);
>  }
>
> -static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
> -			struct vm_area_struct *vma, gfp_t gfp)
> -{
> -	if (vmi->mas.status != ma_start &&
> -	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> -		vma_iter_invalidate(vmi);
> -
> -	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> -	mas_store_gfp(&vmi->mas, vma, gfp);
> -	if (unlikely(mas_is_err(&vmi->mas)))
> -		return -ENOMEM;
> -
> -	return 0;
> -}
> -
> -
>  /*
>   * These three helpers classifies VMAs for virtual memory accounting.
>   */
> --
> 2.43.0
>
Liam R. Howlett Aug. 21, 2024, 1:31 p.m. UTC | #2
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240821 07:56]:
> On Tue, Aug 20, 2024 at 07:57:24PM GMT, Liam R. Howlett wrote:
> > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> >
> > Prior to call_mmap(), the vmas that will be replaced need to clear the
> > way for what may happen in the call_mmap().  This clean up work includes
> > clearing the ptes and calling the close() vm_ops.  Some users do more
> > setup than can be restored by calling the vm_ops open() function.  It is
> > safer to store the gap in the vma tree in these cases.
> >
> > That is to say that the failure scenario that existed before the
> > MAP_FIXED gap exposure is restored as it is safer than trying to undo a
> > partial mapping.
> >
> > There is also a secondary failure that may occur if there is not enough
> > memory to store the gap.  In this case, the vmas are reattached and
> > resources freed.  If the system cannot complete the call_mmap() and
> > fails to allocate with GFP_KERNEL, then the system will print a warning
> > about the failure.
> >
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> 
> Just some nits etc. below, otherwise:
> 
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> 
> > ---
> >  mm/mmap.c |  3 +--
> >  mm/vma.h  | 62 +++++++++++++++++++++++++++++++++++++------------------
> >  2 files changed, 43 insertions(+), 22 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 6550d9470d3a..c1b3d17f97be 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1622,8 +1622,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		vm_unacct_memory(charged);
> >
> >  abort_munmap:
> > -	if (vms.nr_pages)
> > -		abort_munmap_vmas(&mas_detach, vms.closed_vm_ops);
> > +	vms_abort_munmap_vmas(&vms, &mas_detach, vms.closed_vm_ops);
> >  	validate_mm(mm);
> >  	return error;
> >  }
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 756dd42a6ec4..7618ddbfd2b2 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -82,6 +82,22 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  	       unsigned long start, unsigned long end, pgoff_t pgoff);
> >
> > +static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
> > +			struct vm_area_struct *vma, gfp_t gfp)
> > +
> > +{
> > +	if (vmi->mas.status != ma_start &&
> > +	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> > +		vma_iter_invalidate(vmi);
> > +
> > +	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> > +	mas_store_gfp(&vmi->mas, vma, gfp);
> > +	if (unlikely(mas_is_err(&vmi->mas)))
> > +		return -ENOMEM;
> > +
> > +	return 0;
> > +}
> > +
> >  /*
> >   * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
> >   * @vms: The vma munmap struct
> > @@ -136,15 +152,37 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
> >  	struct vm_area_struct *vma;
> >
> >  	mas_set(mas_detach, 0);
> > -	mas_for_each(mas_detach, vma, ULONG_MAX) {
> > +	mas_for_each(mas_detach, vma, ULONG_MAX)
> >  		vma_mark_detached(vma, false);
> > -		if (closed && vma->vm_ops && vma->vm_ops->open)
> > -			vma->vm_ops->open(vma);
> > -	}
> 
> Yes.

Woops, closed isn't used anymore here.

> 
> >
> >  	__mt_destroy(mas_detach->tree);
> >  }
> >
> > +static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
> > +		struct ma_state *mas_detach, bool closed)
> 
> Nitty, but seems strange to comment abort_munmap_vmas() and say that it
> undoes any munmap work + frees resources, but not to comment this function.

Sure, I can add a comment, it's probably more useful than the one above.
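
One possible shape for such a comment (hypothetical wording, derived
from what the function in this patch actually does):

/*
 * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
 * @vms: The vma unmap structure
 * @mas_detach: The maple state with the detached maple tree
 *
 * Reattach any detached vmas and free the maple tree used to track them.
 * If that is no longer safe because the ptes have been cleared (and the
 * vm_ops close() may have been called), write a NULL over the vmas' range
 * and complete the munmap so the area at least reads as unmapped.
 */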

> 
> Also I wonder if, with these changes, abort_munmap_vmas() needs a rename?
> Something like reattach_vmas() or something?

I can rename it.  This is exclusively used in vma.c now besides the
below use.
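
For illustration, the renamed helper could look like this (hypothetical;
it is the same body as abort_munmap_vmas() in this patch, with the
now-unused 'closed' parameter dropped):

static inline void reattach_vmas(struct ma_state *mas_detach)
{
	struct vm_area_struct *vma;

	mas_set(mas_detach, 0);
	mas_for_each(mas_detach, vma, ULONG_MAX)
		vma_mark_detached(vma, false);

	__mt_destroy(mas_detach->tree);
}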

> 
> > +{
> > +	if (!vms->nr_pages)
> > +		return;
> > +
> > +	if (vms->clear_ptes)
> > +		return abort_munmap_vmas(mas_detach, vms->closed_vm_ops);
> > +
> > +	/*
> > +	 * Aborting cannot just call the vm_ops open() because they are often
> > +	 * not symmetrical and state data has been lost.  Resort to the old
> > +	 * failure method of leaving a gap where the MAP_FIXED mapping failed.
> > +	 */
> > +	if (unlikely(vma_iter_store_gfp(vms->vmi, NULL, GFP_KERNEL))) {
> > +		pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
> > +			     current->comm, current->pid);
> > +		/* Leaving vmas detached and in-tree may hamper recovery */
> > +		abort_munmap_vmas(mas_detach, vms->closed_vm_ops);
> 
> OK so I guess rather than trying to do some clever preallocation, we just
> warn + get out?

Hmm, clever preallocations... I could maybe figure out something here.
No double failure left unfixed!
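
A sketch of what that could look like (assumptions: vma_munmap_struct
keeps its start/end fields, and the maple tree preallocation pair
mas_preallocate()/mas_store_prealloc() is used) -- reserve nodes for the
NULL store before any destructive work so the abort path cannot hit
-ENOMEM:

/* Hypothetical: called before detaching anything. */
static inline int vms_prealloc_gap(struct vma_munmap_struct *vms)
{
	struct ma_state *mas = &vms->vmi->mas;

	mas_set_range(mas, vms->start, vms->end - 1);
	return mas_preallocate(mas, NULL, GFP_KERNEL);
}

The abort path could then use mas_store_prealloc(), which consumes the
reserved nodes and cannot fail with -ENOMEM.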

> 
> > +	} else {
> > +		/* Clean up the insertion of unfortunate the gap */
> 
> I'm not sure what this means :P 'unfortunate, the gap, isn't it?'

Right.  Perhaps I should update this to 'mind the gap'.

> 
> > +		vms_complete_munmap_vmas(vms, mas_detach);
> > +	}
> > +}
> > +
> >  int
> >  do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  		    struct mm_struct *mm, unsigned long start,
> > @@ -297,22 +335,6 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
> >  	return mas_prev(&vmi->mas, min);
> >  }
> >
> > -static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
> > -			struct vm_area_struct *vma, gfp_t gfp)
> > -{
> > -	if (vmi->mas.status != ma_start &&
> > -	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> > -		vma_iter_invalidate(vmi);
> > -
> > -	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> > -	mas_store_gfp(&vmi->mas, vma, gfp);
> > -	if (unlikely(mas_is_err(&vmi->mas)))
> > -		return -ENOMEM;
> > -
> > -	return 0;
> > -}
> > -
> > -
> >  /*
> >   * These three helpers classifies VMAs for virtual memory accounting.
> >   */
> > --
> > 2.43.0
> >

Patch

diff --git a/mm/mmap.c b/mm/mmap.c
index 6550d9470d3a..c1b3d17f97be 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1622,8 +1622,7 @@  unsigned long mmap_region(struct file *file, unsigned long addr,
 		vm_unacct_memory(charged);
 
 abort_munmap:
-	if (vms.nr_pages)
-		abort_munmap_vmas(&mas_detach, vms.closed_vm_ops);
+	vms_abort_munmap_vmas(&vms, &mas_detach, vms.closed_vm_ops);
 	validate_mm(mm);
 	return error;
 }
diff --git a/mm/vma.h b/mm/vma.h
index 756dd42a6ec4..7618ddbfd2b2 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -82,6 +82,22 @@  int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	       unsigned long start, unsigned long end, pgoff_t pgoff);
 
+static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
+			struct vm_area_struct *vma, gfp_t gfp)
+
+{
+	if (vmi->mas.status != ma_start &&
+	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
+		vma_iter_invalidate(vmi);
+
+	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
+	mas_store_gfp(&vmi->mas, vma, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
 /*
  * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
  * @vms: The vma munmap struct
@@ -136,15 +152,37 @@  static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
 	struct vm_area_struct *vma;
 
 	mas_set(mas_detach, 0);
-	mas_for_each(mas_detach, vma, ULONG_MAX) {
+	mas_for_each(mas_detach, vma, ULONG_MAX)
 		vma_mark_detached(vma, false);
-		if (closed && vma->vm_ops && vma->vm_ops->open)
-			vma->vm_ops->open(vma);
-	}
 
 	__mt_destroy(mas_detach->tree);
 }
 
+static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
+		struct ma_state *mas_detach, bool closed)
+{
+	if (!vms->nr_pages)
+		return;
+
+	if (vms->clear_ptes)
+		return abort_munmap_vmas(mas_detach, vms->closed_vm_ops);
+
+	/*
+	 * Aborting cannot just call the vm_ops open() because they are often
+	 * not symmetrical and state data has been lost.  Resort to the old
+	 * failure method of leaving a gap where the MAP_FIXED mapping failed.
+	 */
+	if (unlikely(vma_iter_store_gfp(vms->vmi, NULL, GFP_KERNEL))) {
+		pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
+			     current->comm, current->pid);
+		/* Leaving vmas detached and in-tree may hamper recovery */
+		abort_munmap_vmas(mas_detach, vms->closed_vm_ops);
+	} else {
+		/* Clean up the insertion of unfortunate the gap */
+		vms_complete_munmap_vmas(vms, mas_detach);
+	}
+}
+
 int
 do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		    struct mm_struct *mm, unsigned long start,
@@ -297,22 +335,6 @@  static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
 	return mas_prev(&vmi->mas, min);
 }
 
-static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
-			struct vm_area_struct *vma, gfp_t gfp)
-{
-	if (vmi->mas.status != ma_start &&
-	    ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
-		vma_iter_invalidate(vmi);
-
-	__mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
-	mas_store_gfp(&vmi->mas, vma, gfp);
-	if (unlikely(mas_is_err(&vmi->mas)))
-		return -ENOMEM;
-
-	return 0;
-}
-
-
 /*
  * These three helpers classifies VMAs for virtual memory accounting.
  */