diff mbox series

[hmm,7/8] mm/hmm: return -EFAULT when setting HMM_PFN_ERROR on requested valid pages

Message ID 20200311183506.3997-8-jgg@ziepe.ca (mailing list archive)
State New, archived
Headers show
Series Various error case bug fixes for hmm_range_fault() | expand

Commit Message

Jason Gunthorpe March 11, 2020, 6:35 p.m. UTC
From: Jason Gunthorpe <jgg@mellanox.com>

hmm_range_fault() should never return 0 if the caller requested a valid
page, but the pfns output for that page would be HMM_PFN_ERROR.

hmm_pte_need_fault() must always be called before setting HMM_PFN_ERROR to
detect if the page is in faulting mode or not.

Fix two cases in hmm_vma_walk_pmd() and reorganize some of the duplicated
code.

Fixes: d08faca018c4 ("mm/hmm: properly handle migration pmd")
Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

Comments

Ralph Campbell March 12, 2020, 1:36 a.m. UTC | #1
On 3/11/20 11:35 AM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
> 
> hmm_range_fault() should never return 0 if the caller requested a valid
> page, but the pfns output for that page would be HMM_PFN_ERROR.
> 
> hmm_pte_need_fault() must always be called before setting HMM_PFN_ERROR to
> detect if the page is in faulting mode or not.
> 
> Fix two cases in hmm_vma_walk_pmd() and reorganize some of the duplicated
> code.
> 
> Fixes: d08faca018c4 ("mm/hmm: properly handle migration pmd")
> Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>

> ---
>   mm/hmm.c | 38 +++++++++++++++++++++-----------------
>   1 file changed, 21 insertions(+), 17 deletions(-)
> 
> diff --git a/mm/hmm.c b/mm/hmm.c
> index bf676cfef3e8ee..f61fddf2ef6505 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -363,8 +363,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>   {
>   	struct hmm_vma_walk *hmm_vma_walk = walk->private;
>   	struct hmm_range *range = hmm_vma_walk->range;
> -	uint64_t *pfns = range->pfns;
> -	unsigned long addr = start, i;
> +	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
> +	unsigned long npages = (end - start) >> PAGE_SHIFT;
> +	unsigned long addr = start;
> +	bool fault, write_fault;
>   	pte_t *ptep;
>   	pmd_t pmd;
>   
> @@ -374,14 +376,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>   		return hmm_vma_walk_hole(start, end, -1, walk);
>   
>   	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
> -		bool fault, write_fault;
> -		unsigned long npages;
> -		uint64_t *pfns;
> -
> -		i = (addr - range->start) >> PAGE_SHIFT;
> -		npages = (end - addr) >> PAGE_SHIFT;
> -		pfns = &range->pfns[i];
> -
>   		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
>   				     0, &fault, &write_fault);
>   		if (fault || write_fault) {
> @@ -390,8 +384,15 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>   			return -EBUSY;
>   		}
>   		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
> -	} else if (!pmd_present(pmd))
> +	}
> +
> +	if (!pmd_present(pmd)) {
> +		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
> +				     &write_fault);
> +		if (fault || write_fault)
> +			return -EFAULT;
>   		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

Shouldn't this fill with HMM_PFN_NONE instead of HMM_PFN_ERROR?
Otherwise, when a THP is swapped out, you will get a different
value than if a PTE is swapped out and you are prefetching/snapshotting.

> +	}
>   
>   	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
>   		/*
> @@ -408,8 +409,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>   		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
>   			goto again;
>   
> -		i = (addr - range->start) >> PAGE_SHIFT;
> -		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
> +		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
>   	}
>   
>   	/*
> @@ -418,15 +418,19 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>   	 * entry pointing to pte directory or it is a bad pmd that will not
>   	 * recover.
>   	 */
> -	if (pmd_bad(pmd))
> +	if (pmd_bad(pmd)) {
> +		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
> +				     &write_fault);
> +		if (fault || write_fault)
> +			return -EFAULT;
>   		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> +	}
>   
>   	ptep = pte_offset_map(pmdp, addr);
> -	i = (addr - range->start) >> PAGE_SHIFT;
> -	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
> +	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
>   		int r;
>   
> -		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
> +		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
>   		if (r) {
>   			/* hmm_vma_handle_pte() did pte_unmap() */
>   			hmm_vma_walk->last = addr;
>
Jason Gunthorpe March 12, 2020, 2:35 p.m. UTC | #2
On Wed, Mar 11, 2020 at 06:36:47PM -0700, Ralph Campbell wrote:
> > @@ -390,8 +384,15 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >   			return -EBUSY;
> >   		}
> >   		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
> > -	} else if (!pmd_present(pmd))
> > +	}
> > +
> > +	if (!pmd_present(pmd)) {
> > +		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
> > +				     &write_fault);
> > +		if (fault || write_fault)
> > +			return -EFAULT;
> >   		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> 
> Shouldn't this fill with HMM_PFN_NONE instead of HMM_PFN_ERROR?
> Otherwise, when a THP is swapped out, you will get a different
> value than if a PTE is swapped out and you are prefetching/snapshotting.

If this is the case then the problem is that the return -EFAULT path
needs to do something else, i.e. since the above code can't trigger
swap in, it is correct to return HMM_PFN_ERROR.

I'm completely guessing, but do we need to call pmd_to_swp_entry() and
handle things similarly to the pte? What swp_entries are valid for a
pmd?

Do you understand this better, or know how to trigger the !pmd_present
case for testing?

I suppose another option would be this:

	if (!pmd_present(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
				     &write_fault);
                /* We can't handle this. Cause the PMD to be split and
		 * handle it in the pte handler. */
		if (fault || write_fault)
 		        return 0;
  		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
        }

Which, I think, must be correct, but inefficient?

Jason
Christoph Hellwig March 16, 2020, 9:12 a.m. UTC | #3
On Wed, Mar 11, 2020 at 03:35:05PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
> 
> hmm_range_fault() should never return 0 if the caller requested a valid
> page, but the pfns output for that page would be HMM_PFN_ERROR.
> 
> hmm_pte_need_fault() must always be called before setting HMM_PFN_ERROR to
> detect if the page is in faulting mode or not.
> 
> Fix two cases in hmm_vma_walk_pmd() and reorganize some of the duplicated
> code.

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>
diff mbox series

Patch

diff --git a/mm/hmm.c b/mm/hmm.c
index bf676cfef3e8ee..f61fddf2ef6505 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -363,8 +363,10 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
-	uint64_t *pfns = range->pfns;
-	unsigned long addr = start, i;
+	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+	unsigned long npages = (end - start) >> PAGE_SHIFT;
+	unsigned long addr = start;
+	bool fault, write_fault;
 	pte_t *ptep;
 	pmd_t pmd;
 
@@ -374,14 +376,6 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		return hmm_vma_walk_hole(start, end, -1, walk);
 
 	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
-		bool fault, write_fault;
-		unsigned long npages;
-		uint64_t *pfns;
-
-		i = (addr - range->start) >> PAGE_SHIFT;
-		npages = (end - addr) >> PAGE_SHIFT;
-		pfns = &range->pfns[i];
-
 		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
 				     0, &fault, &write_fault);
 		if (fault || write_fault) {
@@ -390,8 +384,15 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return -EBUSY;
 		}
 		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
-	} else if (!pmd_present(pmd))
+	}
+
+	if (!pmd_present(pmd)) {
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
+				     &write_fault);
+		if (fault || write_fault)
+			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+	}
 
 	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
 		/*
@@ -408,8 +409,7 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
 			goto again;
 
-		i = (addr - range->start) >> PAGE_SHIFT;
-		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
+		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
 	}
 
 	/*
@@ -418,15 +418,19 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	 * entry pointing to pte directory or it is a bad pmd that will not
 	 * recover.
 	 */
-	if (pmd_bad(pmd))
+	if (pmd_bad(pmd)) {
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
+				     &write_fault);
+		if (fault || write_fault)
+			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+	}
 
 	ptep = pte_offset_map(pmdp, addr);
-	i = (addr - range->start) >> PAGE_SHIFT;
-	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
+	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
 		int r;
 
-		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
+		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
 		if (r) {
 			/* hmm_vma_handle_pte() did pte_unmap() */
 			hmm_vma_walk->last = addr;