Message ID: 20200311183506.3997-8-jgg@ziepe.ca (mailing list archive)
State: New, archived
Series: Various error case bug fixes for hmm_range_fault()
On 3/11/20 11:35 AM, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
>
> hmm_range_fault() should never return 0 if the caller requested a valid
> page, but the pfns output for that page would be HMM_PFN_ERROR.
>
> hmm_pte_need_fault() must always be called before setting HMM_PFN_ERROR to
> detect if the page is in faulting mode or not.
>
> Fix two cases in hmm_vma_walk_pmd() and reorganize some of the duplicated
> code.
>
> Fixes: d08faca018c4 ("mm/hmm: properly handle migration pmd")
> Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>

> ---
>  mm/hmm.c | 38 +++++++++++++++++++++-----------------
>  1 file changed, 21 insertions(+), 17 deletions(-)
>
> diff --git a/mm/hmm.c b/mm/hmm.c
> index bf676cfef3e8ee..f61fddf2ef6505 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -363,8 +363,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  {
>  	struct hmm_vma_walk *hmm_vma_walk = walk->private;
>  	struct hmm_range *range = hmm_vma_walk->range;
> -	uint64_t *pfns = range->pfns;
> -	unsigned long addr = start, i;
> +	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
> +	unsigned long npages = (end - start) >> PAGE_SHIFT;
> +	unsigned long addr = start;
> +	bool fault, write_fault;
>  	pte_t *ptep;
>  	pmd_t pmd;
>
> @@ -374,14 +376,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  		return hmm_vma_walk_hole(start, end, -1, walk);
>
>  	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
> -		bool fault, write_fault;
> -		unsigned long npages;
> -		uint64_t *pfns;
> -
> -		i = (addr - range->start) >> PAGE_SHIFT;
> -		npages = (end - addr) >> PAGE_SHIFT;
> -		pfns = &range->pfns[i];
> -
>  		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
>  				     0, &fault, &write_fault);
>  		if (fault || write_fault) {
> @@ -390,8 +384,15 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  			return -EBUSY;
>  		}
>  		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
> -	} else if (!pmd_present(pmd))
> +	}
> +
> +	if (!pmd_present(pmd)) {
> +		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
> +				     &write_fault);
> +		if (fault || write_fault)
> +			return -EFAULT;
>  		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

Shouldn't this fill with HMM_PFN_NONE instead of HMM_PFN_ERROR?
Otherwise, when a THP is swapped out, you will get a different
value than if a PTE is swapped out and you are prefetching/snapshotting.

> +	}
>
>  	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
>  		/*
> @@ -408,8 +409,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
>  			goto again;
>
> -		i = (addr - range->start) >> PAGE_SHIFT;
> -		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
> +		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
>  	}
>
>  	/*
> @@ -418,15 +418,19 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  	 * entry pointing to pte directory or it is a bad pmd that will not
>  	 * recover.
>  	 */
> -	if (pmd_bad(pmd))
> +	if (pmd_bad(pmd)) {
> +		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
> +				     &write_fault);
> +		if (fault || write_fault)
> +			return -EFAULT;
>  		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
> +	}
>
>  	ptep = pte_offset_map(pmdp, addr);
> -	i = (addr - range->start) >> PAGE_SHIFT;
> -	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
> +	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
>  		int r;
>
> -		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
> +		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
>  		if (r) {
>  			/* hmm_vma_handle_pte() did pte_unmap() */
>  			hmm_vma_walk->last = addr;
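The rule the patch enforces at both fixed sites can be distilled into a few lines. This is a paraphrase of the mm/hmm.c code quoted above, not a standalone program: hmm_range_need_fault(), hmm_pfns_fill(), and HMM_PFN_ERROR are the kernel internals shown in the diff.

	/*
	 * Before publishing HMM_PFN_ERROR for a range the walker cannot
	 * handle, ask whether the caller requested that any of these
	 * pages be faulted in. If so, fail the walk with an error code;
	 * returning 0 while leaving HMM_PFN_ERROR in the output array
	 * would break the hmm_range_fault() contract described in the
	 * commit message.
	 */
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0,
			     &fault, &write_fault);
	if (fault || write_fault)
		return -EFAULT;	/* caller wanted the page: report failure */
	/* snapshot-only callers just see the error value in the array */
	return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);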
On Wed, Mar 11, 2020 at 06:36:47PM -0700, Ralph Campbell wrote:
> > @@ -390,8 +384,15 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >  			return -EBUSY;
> >  		}
> >  		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
> > -	} else if (!pmd_present(pmd))
> > +	}
> > +
> > +	if (!pmd_present(pmd)) {
> > +		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
> > +				     &write_fault);
> > +		if (fault || write_fault)
> > +			return -EFAULT;
> >  		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
>
> Shouldn't this fill with HMM_PFN_NONE instead of HMM_PFN_ERROR?
> Otherwise, when a THP is swapped out, you will get a different
> value than if a PTE is swapped out and you are prefetching/snapshotting.

If this is the case then the problem is that the return -EFAULT path
needs to do something else, i.e. since the above code can't trigger
swap-in, it is correct to return PFN_ERROR.

I'm completely guessing, but do we need to call pmd_to_swp_entry() and
handle things similarly to the pte? What swp_entries are valid for a
pmd?

Do you understand this better, or know how to trigger a !pmd_present
for test?

I suppose another option would be this:

	if (!pmd_present(pmd)) {
		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
				     &write_fault);
		/*
		 * We can't handle this. Cause the PMD to be split and
		 * handle it in the pte handler.
		 */
		if (fault || write_fault)
			return 0;
		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	}

Which, I think, must be correct, but inefficient?

Jason
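To make the guess above concrete: the pte side of this walker, hmm_vma_handle_pte(), decodes the non-present entry with pte_to_swp_entry() and dispatches on its type, so a pmd analogue would presumably do the same. The sketch below is purely illustrative and untested, in the same speculative spirit as the email. pmd_to_swp_entry() and pmd_migration_entry_wait() are real helpers from <linux/swapops.h>, but they are only defined when CONFIG_ARCH_ENABLE_THP_MIGRATION is enabled, and whether any other swp_entry type can appear in a pmd is exactly the open question posed above.

	if (!pmd_present(pmd)) {
		/* Illustrative guess only -- mirrors the pte path. */
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0,
				     &fault, &write_fault);
		if (!fault && !write_fault)
			/* pure snapshot: report the pages as not present */
			return hmm_pfns_fill(start, end, range,
					     HMM_PFN_NONE);
		if (is_migration_entry(entry)) {
			/* wait for the migration to finish, then retry */
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		/* any other non-present pmd: a genuine error */
		return -EFAULT;
	}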
On Wed, Mar 11, 2020 at 03:35:05PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe <jgg@mellanox.com>
>
> hmm_range_fault() should never return 0 if the caller requested a valid
> page, but the pfns output for that page would be HMM_PFN_ERROR.
>
> hmm_pte_need_fault() must always be called before setting HMM_PFN_ERROR to
> detect if the page is in faulting mode or not.
>
> Fix two cases in hmm_vma_walk_pmd() and reorganize some of the duplicated
> code.

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>
diff --git a/mm/hmm.c b/mm/hmm.c
index bf676cfef3e8ee..f61fddf2ef6505 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -363,8 +363,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
-	uint64_t *pfns = range->pfns;
-	unsigned long addr = start, i;
+	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+	unsigned long npages = (end - start) >> PAGE_SHIFT;
+	unsigned long addr = start;
+	bool fault, write_fault;
 	pte_t *ptep;
 	pmd_t pmd;
 
@@ -374,14 +376,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		return hmm_vma_walk_hole(start, end, -1, walk);
 
 	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
-		bool fault, write_fault;
-		unsigned long npages;
-		uint64_t *pfns;
-
-		i = (addr - range->start) >> PAGE_SHIFT;
-		npages = (end - addr) >> PAGE_SHIFT;
-		pfns = &range->pfns[i];
-
 		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
 				     0, &fault, &write_fault);
 		if (fault || write_fault) {
@@ -390,8 +384,15 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return -EBUSY;
 		}
 		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
-	} else if (!pmd_present(pmd))
+	}
+
+	if (!pmd_present(pmd)) {
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
+				     &write_fault);
+		if (fault || write_fault)
+			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+	}
 
 	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
 		/*
@@ -408,8 +409,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
 			goto again;
 
-		i = (addr - range->start) >> PAGE_SHIFT;
-		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
+		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
 	}
 
 	/*
@@ -418,15 +418,19 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	 * entry pointing to pte directory or it is a bad pmd that will not
 	 * recover.
 	 */
-	if (pmd_bad(pmd))
+	if (pmd_bad(pmd)) {
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
+				     &write_fault);
+		if (fault || write_fault)
+			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+	}
 
 	ptep = pte_offset_map(pmdp, addr);
-	i = (addr - range->start) >> PAGE_SHIFT;
-	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
+	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
 		int r;
 
-		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
+		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
 		if (r) {
 			/* hmm_vma_handle_pte() did pte_unmap() */
 			hmm_vma_walk->last = addr;
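For driver authors, the invariant this series fixes is visible at the call site. The sketch below assumes the hmm_range_fault() interface of this era, with a flags argument and the caller holding mmap_sem for read; the signature and locking rules changed several times in nearby releases, so treat the names and arguments as illustrative rather than authoritative.

	/*
	 * Hypothetical driver-side loop, assuming the contemporaneous
	 * two-argument hmm_range_fault() and mmap_sem naming.
	 */
	do {
		down_read(&mm->mmap_sem);
		ret = hmm_range_fault(&range, 0);
		up_read(&mm->mmap_sem);
		/* -EBUSY conventionally means: revalidate and retry */
	} while (ret == -EBUSY);
	if (ret < 0)
		return ret;
	/*
	 * What the fixed invariant guarantees: on success, no page the
	 * caller asked to have faulted in is left as HMM_PFN_ERROR, so
	 * the driver need not re-check every output pfn for the error
	 * value before programming its device page tables.
	 */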