
[hmm,2/8] mm/hmm: don't free the cached pgmap while scanning

Message ID 20200311183506.3997-3-jgg@ziepe.ca (mailing list archive)
State New, archived
Series: Various error case bug fixes for hmm_range_fault()

Commit Message

Jason Gunthorpe March 11, 2020, 6:35 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

The pgmap is held in the hmm_vma_walk variable in hope of speeding up
future get_dev_pagemap() calls by hitting the same pointer. The algorithm
doesn't actually care about how long the pgmap is held for.

Move the put of the cached pgmap to after the walk is completed and delete
all the other now redundant puts.

This solves a possible leak of the reference in hmm_vma_walk_pmd() if a
hmm_vma_handle_pte() fails while looping.

Fixes: 992de9a8b751 ("mm/hmm: allow to mirror vma of a file on a DAX backed filesystem")
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

We talked about just deleting this stuff, but I think it makes a lot of sense for
hmm_range_fault() to trigger a fault on devmap pages that are not compatible
with the caller - so let's just fix the leak on the error path for now.
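
A minimal sketch of the caching contract this relies on (my reading of the
~v5.6 get_dev_pagemap() behaviour; the surrounding context and error-path
details such as pte_unmap() are elided, so treat it as illustration only):

	/*
	 * Illustrative sketch, not the kernel code verbatim.
	 * get_dev_pagemap(pfn, cached) reuses 'cached' when the pfn still
	 * falls inside it; otherwise it drops 'cached', does the slower
	 * lookup, and returns a fresh reference (or NULL if the pgmap is
	 * being torn down).  The walker therefore holds at most one
	 * reference at a time, and this patch drops that single reference
	 * in one place, after the walk completes, instead of in every
	 * handler - which is what missed the error returns and leaked.
	 */
	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
						      hmm_vma_walk->pgmap);
		if (!hmm_vma_walk->pgmap)
			return -EBUSY;
	}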

Comments

Ralph Campbell March 12, 2020, 1:29 a.m. UTC | #1
On 3/11/20 11:35 AM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
> 
> The pgmap is held in the hmm_vma_walk variable in hope of speeding up
> future get_dev_pagemap() calls by hitting the same pointer. The algorithm
> doesn't actually care about how long the pgmap is held for.
> 
> Move the put of the cached pgmap to after the walk is completed and delete
> all the other now redundant puts.
> 
> This solves a possible leak of the reference in hmm_vma_walk_pmd() if a
> hmm_vma_handle_pte() fails while looping.
> 
> Fixes: 992de9a8b751 ("mm/hmm: allow to mirror vma of a file on a DAX backed filesystem")
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>

> ---
>   mm/hmm.c | 31 +++++++++----------------------
>   1 file changed, 9 insertions(+), 22 deletions(-)
> 
> We talked about just deleting this stuff, but I think it makes a lot of sense for
> hmm_range_fault() to trigger a fault on devmap pages that are not compatible
> with the caller - so let's just fix the leak on the error path for now.
> 
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 35f85424176d14..9e8f68eb83287a 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -239,10 +239,6 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
>   		}
>   		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
>   	}
> -	if (hmm_vma_walk->pgmap) {
> -		put_dev_pagemap(hmm_vma_walk->pgmap);
> -		hmm_vma_walk->pgmap = NULL;
> -	}
>   	hmm_vma_walk->last = end;
>   	return 0;
>   }
> @@ -360,10 +356,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>   	return 0;
>   
>   fault:
> -	if (hmm_vma_walk->pgmap) {
> -		put_dev_pagemap(hmm_vma_walk->pgmap);
> -		hmm_vma_walk->pgmap = NULL;
> -	}
>   	pte_unmap(ptep);
>   	/* Fault any virtual address we were asked to fault */
>   	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
> @@ -446,16 +438,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>   			return r;
>   		}
>   	}
> -	if (hmm_vma_walk->pgmap) {
> -		/*
> -		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
> -		 * so that we can leverage get_dev_pagemap() optimization which
> -		 * will not re-take a reference on a pgmap if we already have
> -		 * one.
> -		 */
> -		put_dev_pagemap(hmm_vma_walk->pgmap);
> -		hmm_vma_walk->pgmap = NULL;
> -	}
>   	pte_unmap(ptep - 1);
>   
>   	hmm_vma_walk->last = addr;
> @@ -529,10 +511,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
>   			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
>   				  cpu_flags;
>   		}
> -		if (hmm_vma_walk->pgmap) {
> -			put_dev_pagemap(hmm_vma_walk->pgmap);
> -			hmm_vma_walk->pgmap = NULL;
> -		}
>   		hmm_vma_walk->last = end;
>   		goto out_unlock;
>   	}
> @@ -694,6 +672,15 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
>   			return -EBUSY;
>   		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
>   				      &hmm_walk_ops, &hmm_vma_walk);
> +		/*
> +		 * A pgmap is kept cached in the hmm_vma_walk to avoid expensive
> +		 * searching in the probably common case that the pgmap is the
> +		 * same for the entire requested range.
> +		 */
> +		if (hmm_vma_walk.pgmap) {
> +			put_dev_pagemap(hmm_vma_walk.pgmap);
> +			hmm_vma_walk.pgmap = NULL;
> +		}
>   	} while (ret == -EBUSY);
>   
>   	if (ret)
>
Christoph Hellwig March 16, 2020, 9:02 a.m. UTC | #2
On Wed, Mar 11, 2020 at 03:35:00PM -0300, Jason Gunthorpe wrote:
> @@ -694,6 +672,15 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
>  			return -EBUSY;
>  		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
>  				      &hmm_walk_ops, &hmm_vma_walk);
> +		/*
> +		 * A pgmap is kept cached in the hmm_vma_walk to avoid expensive
> +		 * searching in the probably common case that the pgmap is the
> +		 * same for the entire requested range.
> +		 */
> +		if (hmm_vma_walk.pgmap) {
> +			put_dev_pagemap(hmm_vma_walk.pgmap);
> +			hmm_vma_walk.pgmap = NULL;
> +		}
>  	} while (ret == -EBUSY);

In which case it should only be put on return, and not for every loop.

I still think the right fix is to just delete all the unused and broken
pgmap handling code.  If we ever need to add it back, it can be added
in a properly understood and tested way.
Jason Gunthorpe March 16, 2020, 6:07 p.m. UTC | #3
On Mon, Mar 16, 2020 at 10:02:50AM +0100, Christoph Hellwig wrote:
> On Wed, Mar 11, 2020 at 03:35:00PM -0300, Jason Gunthorpe wrote:
> > @@ -694,6 +672,15 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
> >  			return -EBUSY;
> >  		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
> >  				      &hmm_walk_ops, &hmm_vma_walk);
> > +		/*
> > +		 * A pgmap is kept cached in the hmm_vma_walk to avoid expensive
> > +		 * searching in the probably common case that the pgmap is the
> > +		 * same for the entire requested range.
> > +		 */
> > +		if (hmm_vma_walk.pgmap) {
> > +			put_dev_pagemap(hmm_vma_walk.pgmap);
> > +			hmm_vma_walk.pgmap = NULL;
> > +		}
> >  	} while (ret == -EBUSY);
> 
> In which case it should only be put on return, and not for every loop.

I chose this to be simple, without having to restructure it with a goto unwind.

So, instead like this:

@@ -683,21 +661,33 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 		.flags = flags,
 	};
 	struct mm_struct *mm = range->notifier->mm;
-	int ret;
+	long ret;
 
 	lockdep_assert_held(&mm->mmap_sem);
 
 	do {
 		/* If range is no longer valid force retry. */
 		if (mmu_interval_check_retry(range->notifier,
-					     range->notifier_seq))
-			return -EBUSY;
+					     range->notifier_seq)) {
+			ret = -EBUSY;
+			goto out;
+		}
 		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
 				      &hmm_walk_ops, &hmm_vma_walk);
 	} while (ret == -EBUSY);
 
 	if (ret)
-		return ret;
-	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+		goto out;
+	ret = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+
+out:
+	/*
+	 * A pgmap is kept cached in the hmm_vma_walk to avoid expensive
+	 * searching in the probably common case that the pgmap is the
+	 * same for the entire requested range.
+	 */
+	if (hmm_vma_walk.pgmap)
+		put_dev_pagemap(hmm_vma_walk.pgmap);
+	return ret;
 }
 EXPORT_SYMBOL(hmm_range_fault);

?

> I still think the right fix is to just delete all the unused and broken
> pgmap handling code.  If we ever need to add it back, it can be added
> in a properly understood and tested way.

What I want to add is something like

 if (pgmap != walk->required_pgmap)
     cpu_flags = 0
 hmm_range_need_fault(..., cpu_flags, ...)

Which will fix a bug in nouveau where it blindly assumes any device
pages are its own, IIRC.

I think Ralph observed it needs to be here, because if the pgmap
doesn't match then it should trigger migration, in a single call,
rather than iterating.
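
A hedged expansion of that pseudo-code, just to make the intent concrete.
The required_pgmap field does not exist, the placement inside
hmm_vma_handle_pte() is an assumption, and the helper names are my
recollection of hmm.c from that era:

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	/*
	 * Sketch only: a devmap page whose pgmap the caller cannot use is
	 * treated as not-present, so the need_fault logic requests a fault
	 * (giving the caller a chance to migrate) instead of returning a
	 * pfn that would then be blindly dma mapped.
	 */
	if (pte_devmap(pte) &&
	    hmm_vma_walk->pgmap != hmm_vma_walk->required_pgmap)
		cpu_flags = 0;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);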

I'm mostly expecting to replace all the other pgmap code, but keep the
pgmap caching. The existing pgmap stuff seems approximately right, but
useless.

Jason
Christoph Hellwig March 16, 2020, 6:13 p.m. UTC | #4
On Mon, Mar 16, 2020 at 03:07:13PM -0300, Jason Gunthorpe wrote:
> I chose this to be simple without having to goto unwind it.
> 
> So, instead like this:

As said, and per the previous discussion:  I think just removing the
pgmap lookup is the right thing to do here.  Something like this patch:

diff --git a/mm/hmm.c b/mm/hmm.c
index 3d10485bf323..9f1049815d44 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -28,7 +28,6 @@
 
 struct hmm_vma_walk {
 	struct hmm_range	*range;
-	struct dev_pagemap	*pgmap;
 	unsigned long		last;
 	unsigned int		flags;
 };
@@ -198,15 +197,8 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 		return hmm_vma_fault(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
-		if (pmd_devmap(pmd)) {
-			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
-					      hmm_vma_walk->pgmap);
-			if (unlikely(!hmm_vma_walk->pgmap))
-				return -EBUSY;
-		}
+	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
 		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
-	}
 	hmm_vma_walk->last = end;
 	return 0;
 }
@@ -277,15 +269,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	if (fault || write_fault)
 		goto fault;
 
-	if (pte_devmap(pte)) {
-		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
-					      hmm_vma_walk->pgmap);
-		if (unlikely(!hmm_vma_walk->pgmap)) {
-			pte_unmap(ptep);
-			return -EBUSY;
-		}
-	}
-
 	/*
 	 * Since each architecture defines a struct page for the zero page, just
 	 * fall through and treat it like a normal page.
@@ -455,12 +438,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 
 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 		for (i = 0; i < npages; ++i, ++pfn) {
-			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
-					      hmm_vma_walk->pgmap);
-			if (unlikely(!hmm_vma_walk->pgmap)) {
-				ret = -EBUSY;
-				goto out_unlock;
-			}
 			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
 				  cpu_flags;
 		}
@@ -614,15 +591,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 			return -EBUSY;
 		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
 				      &hmm_walk_ops, &hmm_vma_walk);
-		/*
-		 * A pgmap is kept cached in the hmm_vma_walk to avoid expensive
-		 * searching in the probably common case that the pgmap is the
-		 * same for the entire requested range.
-		 */
-		if (hmm_vma_walk.pgmap) {
-			put_dev_pagemap(hmm_vma_walk.pgmap);
-			hmm_vma_walk.pgmap = NULL;
-		}
 	} while (ret == -EBUSY);
 
 	if (ret)
Jason Gunthorpe March 16, 2020, 7:23 p.m. UTC | #5
On Mon, Mar 16, 2020 at 07:13:24PM +0100, Christoph Hellwig wrote:
> On Mon, Mar 16, 2020 at 03:07:13PM -0300, Jason Gunthorpe wrote:
> > I chose this to be simple without having to goto unwind it.
> > 
> > So, instead like this:
> 
> As said, and per the previous discussion:  I think just removing the
> pgmap lookup is the right thing to do here.  Something like this patch:

OK. I think I get it now.

We don't even signal to the caller that the pfn comes from a pgmap, so
the caller must assume the pfn is CPU memory and can be dma mapped. At
that point it doesn't matter what kind of pgmap it is.

Races here are resolved by the mmu notifiers: we can't destroy the pgmap
without triggering an invalidation of the pte.
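
A rough sketch of the caller-side pattern that makes the race argument
concrete (the driver lock, structure fields and the elided flag tables are
placeholders, not code from any real driver):

	struct hmm_range range = {
		.notifier = &drv->notifier,	/* hypothetical driver field */
		.start = start,
		.end = end,
		.pfns = pfns,
		/* flags/values tables and pfn_shift elided for brevity */
	};
	long ret;

again:
	range.notifier_seq = mmu_interval_read_begin(range.notifier);
	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(&range, 0);
	up_read(&mm->mmap_sem);
	if (ret < 0)
		return ret;

	mutex_lock(&drv->pt_lock);		/* hypothetical driver lock */
	if (mmu_interval_read_retry(range.notifier, range.notifier_seq)) {
		/*
		 * An invalidation ran since the snapshot was taken (for
		 * example because a pgmap backing some of these pfns is
		 * being destroyed), so the pfns are stale; retry rather
		 * than dma mapping possibly dead pages.
		 */
		mutex_unlock(&drv->pt_lock);
		goto again;
	}
	/* ... mirror/dma map range.pfns into the device page table ... */
	mutex_unlock(&drv->pt_lock);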

So removing it is the right thing to do, and the fix for the
device_private case is closer to the hunk I just sent.

Jason

Patch

diff --git a/mm/hmm.c b/mm/hmm.c
index 35f85424176d14..9e8f68eb83287a 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -239,10 +239,6 @@  static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 		}
 		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
 	}
-	if (hmm_vma_walk->pgmap) {
-		put_dev_pagemap(hmm_vma_walk->pgmap);
-		hmm_vma_walk->pgmap = NULL;
-	}
 	hmm_vma_walk->last = end;
 	return 0;
 }
@@ -360,10 +356,6 @@  static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	return 0;
 
 fault:
-	if (hmm_vma_walk->pgmap) {
-		put_dev_pagemap(hmm_vma_walk->pgmap);
-		hmm_vma_walk->pgmap = NULL;
-	}
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
 	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -446,16 +438,6 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return r;
 		}
 	}
-	if (hmm_vma_walk->pgmap) {
-		/*
-		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
-		 * so that we can leverage get_dev_pagemap() optimization which
-		 * will not re-take a reference on a pgmap if we already have
-		 * one.
-		 */
-		put_dev_pagemap(hmm_vma_walk->pgmap);
-		hmm_vma_walk->pgmap = NULL;
-	}
 	pte_unmap(ptep - 1);
 
 	hmm_vma_walk->last = addr;
@@ -529,10 +511,6 @@  static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
 				  cpu_flags;
 		}
-		if (hmm_vma_walk->pgmap) {
-			put_dev_pagemap(hmm_vma_walk->pgmap);
-			hmm_vma_walk->pgmap = NULL;
-		}
 		hmm_vma_walk->last = end;
 		goto out_unlock;
 	}
@@ -694,6 +672,15 @@  long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 			return -EBUSY;
 		ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
 				      &hmm_walk_ops, &hmm_vma_walk);
+		/*
+		 * A pgmap is kept cached in the hmm_vma_walk to avoid expensive
+		 * searching in the probably common case that the pgmap is the
+		 * same for the entire requested range.
+		 */
+		if (hmm_vma_walk.pgmap) {
+			put_dev_pagemap(hmm_vma_walk.pgmap);
+			hmm_vma_walk.pgmap = NULL;
+		}
 	} while (ret == -EBUSY);
 
 	if (ret)