
[v2,09/11] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem v2

Message ID 20190325144011.10560-10-jglisse@redhat.com (mailing list archive)
State New, archived
Series Improve HMM driver API v2

Commit Message

Jerome Glisse March 25, 2019, 2:40 p.m. UTC
From: Jérôme Glisse <jglisse@redhat.com>

HMM mirror is a device driver helper to mirror a range of virtual addresses.
It means that the jobs a process runs on the device can access the same
virtual addresses as the CPU threads of that process. This patch adds support
for mirroring mappings of files that live on a DAX block device (ie a range
of virtual addresses that is an mmap of a file in a filesystem on a DAX block
device). There is no reason not to support such a case when mirroring virtual
addresses on a device.

Note that unlike the GUP code we do not take a page reference, hence when we
back off we have nothing to undo.

Changes since v1:
    - improved commit message
    - squashed: Arnd Bergmann: fix unused variable warning in hmm_vma_walk_pud

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Arnd Bergmann <arnd@arndb.de>
---
 mm/hmm.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 111 insertions(+), 21 deletions(-)
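For illustration, here is a minimal userspace sketch (the mount point, file
name and mapping size are made up) of the kind of DAX file mapping this patch
lets an HMM mirror cover; before this change hmm_range_snapshot() and
hmm_range_fault() gave up on such a VMA and returned -EINVAL:

/*
 * Hypothetical example: a shared mapping of a file on a filesystem
 * mounted with -o dax (path and size are made up). The resulting VMA
 * is backed by device memory (pte_devmap/pmd_devmap entries), which
 * is what the walk code in this patch now handles.
 */
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20;	/* one 2MB huge page worth of the file */
	void *addr;
	int fd;

	fd = open("/mnt/pmem0/data", O_RDWR);
	if (fd < 0)
		exit(1);

	addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		exit(1);

	/*
	 * Device jobs mirroring this process can now access
	 * [addr, addr + len) through HMM instead of the mirror
	 * failing on the DAX VMA.
	 */

	munmap(addr, len);
	close(fd);
	return 0;
}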

Comments

Ira Weiny March 28, 2019, 6:04 p.m. UTC | #1
On Mon, Mar 25, 2019 at 10:40:09AM -0400, Jerome Glisse wrote:
> From: Jérôme Glisse <jglisse@redhat.com>
> 
> HMM mirror is a device driver helper to mirror a range of virtual addresses.
> It means that the jobs a process runs on the device can access the same
> virtual addresses as the CPU threads of that process. This patch adds support
> for mirroring mappings of files that live on a DAX block device (ie a range
> of virtual addresses that is an mmap of a file in a filesystem on a DAX block
> device). There is no reason not to support such a case when mirroring virtual
> addresses on a device.
> 
> Note that unlike the GUP code we do not take a page reference, hence when we
> back off we have nothing to undo.
> 
> Changes since v1:
>     - improved commit message
>     - squashed: Arnd Bergmann: fix unused variable warning in hmm_vma_walk_pud
> 
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: John Hubbard <jhubbard@nvidia.com>
> Cc: Arnd Bergmann <arnd@arndb.de>
> ---
>  mm/hmm.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 111 insertions(+), 21 deletions(-)
> 
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 64a33770813b..ce33151c6832 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -325,6 +325,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
>  
>  struct hmm_vma_walk {
>  	struct hmm_range	*range;
> +	struct dev_pagemap	*pgmap;
>  	unsigned long		last;
>  	bool			fault;
>  	bool			block;
> @@ -499,6 +500,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
>  				range->flags[HMM_PFN_VALID];
>  }
>  
> +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
> +{
> +	if (!pud_present(pud))
> +		return 0;
> +	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
> +				range->flags[HMM_PFN_WRITE] :
> +				range->flags[HMM_PFN_VALID];
> +}
> +
>  static int hmm_vma_handle_pmd(struct mm_walk *walk,
>  			      unsigned long addr,
>  			      unsigned long end,
> @@ -520,8 +530,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
>  		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
>  
>  	pfn = pmd_pfn(pmd) + pte_index(addr);
> -	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
> +	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
> +		if (pmd_devmap(pmd)) {
> +			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
> +					      hmm_vma_walk->pgmap);
> +			if (unlikely(!hmm_vma_walk->pgmap))
> +				return -EBUSY;
> +		}
>  		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
> +	}
> +	if (hmm_vma_walk->pgmap) {
> +		put_dev_pagemap(hmm_vma_walk->pgmap);
> +		hmm_vma_walk->pgmap = NULL;
> +	}
>  	hmm_vma_walk->last = end;
>  	return 0;
>  }
> @@ -608,10 +629,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>  	if (fault || write_fault)
>  		goto fault;
>  
> +	if (pte_devmap(pte)) {
> +		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
> +					      hmm_vma_walk->pgmap);
> +		if (unlikely(!hmm_vma_walk->pgmap))
> +			return -EBUSY;
> +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
> +		*pfn = range->values[HMM_PFN_SPECIAL];
> +		return -EFAULT;
> +	}
> +
>  	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;

	<tag>

>  	return 0;
>  
>  fault:
> +	if (hmm_vma_walk->pgmap) {
> +		put_dev_pagemap(hmm_vma_walk->pgmap);
> +		hmm_vma_walk->pgmap = NULL;
> +	}
>  	pte_unmap(ptep);
>  	/* Fault any virtual address we were asked to fault */
>  	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
> @@ -699,12 +734,83 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>  			return r;
>  		}
>  	}
> +	if (hmm_vma_walk->pgmap) {
> +		put_dev_pagemap(hmm_vma_walk->pgmap);
> +		hmm_vma_walk->pgmap = NULL;
> +	}


Why is this here and not in hmm_vma_handle_pte()?  Unless I'm just getting
tired, this is the corresponding put for when hmm_vma_handle_pte() returns 0
at <tag> above.

Ira
Jerome Glisse March 29, 2019, 2:17 a.m. UTC | #2
On Thu, Mar 28, 2019 at 11:04:26AM -0700, Ira Weiny wrote:
> On Mon, Mar 25, 2019 at 10:40:09AM -0400, Jerome Glisse wrote:
> > From: Jérôme Glisse <jglisse@redhat.com>
> > 
> > HMM mirror is a device driver helper to mirror a range of virtual addresses.
> > It means that the jobs a process runs on the device can access the same
> > virtual addresses as the CPU threads of that process. This patch adds support
> > for mirroring mappings of files that live on a DAX block device (ie a range
> > of virtual addresses that is an mmap of a file in a filesystem on a DAX block
> > device). There is no reason not to support such a case when mirroring virtual
> > addresses on a device.
> > 
> > Note that unlike the GUP code we do not take a page reference, hence when we
> > back off we have nothing to undo.
> > 
> > Changes since v1:
> >     - improved commit message
> >     - squashed: Arnd Bergmann: fix unused variable warning in hmm_vma_walk_pud
> > 
> > Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> > Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Dan Williams <dan.j.williams@intel.com>
> > Cc: John Hubbard <jhubbard@nvidia.com>
> > Cc: Arnd Bergmann <arnd@arndb.de>
> > ---
> >  mm/hmm.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++---------
> >  1 file changed, 111 insertions(+), 21 deletions(-)
> > 
> > diff --git a/mm/hmm.c b/mm/hmm.c
> > index 64a33770813b..ce33151c6832 100644
> > --- a/mm/hmm.c
> > +++ b/mm/hmm.c
> > @@ -325,6 +325,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
> >  
> >  struct hmm_vma_walk {
> >  	struct hmm_range	*range;
> > +	struct dev_pagemap	*pgmap;
> >  	unsigned long		last;
> >  	bool			fault;
> >  	bool			block;
> > @@ -499,6 +500,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
> >  				range->flags[HMM_PFN_VALID];
> >  }
> >  
> > +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
> > +{
> > +	if (!pud_present(pud))
> > +		return 0;
> > +	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
> > +				range->flags[HMM_PFN_WRITE] :
> > +				range->flags[HMM_PFN_VALID];
> > +}
> > +
> >  static int hmm_vma_handle_pmd(struct mm_walk *walk,
> >  			      unsigned long addr,
> >  			      unsigned long end,
> > @@ -520,8 +530,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
> >  		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
> >  
> >  	pfn = pmd_pfn(pmd) + pte_index(addr);
> > -	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
> > +	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
> > +		if (pmd_devmap(pmd)) {
> > +			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
> > +					      hmm_vma_walk->pgmap);
> > +			if (unlikely(!hmm_vma_walk->pgmap))
> > +				return -EBUSY;
> > +		}
> >  		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
> > +	}
> > +	if (hmm_vma_walk->pgmap) {
> > +		put_dev_pagemap(hmm_vma_walk->pgmap);
> > +		hmm_vma_walk->pgmap = NULL;
> > +	}
> >  	hmm_vma_walk->last = end;
> >  	return 0;
> >  }
> > @@ -608,10 +629,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
> >  	if (fault || write_fault)
> >  		goto fault;
> >  
> > +	if (pte_devmap(pte)) {
> > +		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
> > +					      hmm_vma_walk->pgmap);
> > +		if (unlikely(!hmm_vma_walk->pgmap))
> > +			return -EBUSY;
> > +	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
> > +		*pfn = range->values[HMM_PFN_SPECIAL];
> > +		return -EFAULT;
> > +	}
> > +
> >  	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
> 
> 	<tag>
> 
> >  	return 0;
> >  
> >  fault:
> > +	if (hmm_vma_walk->pgmap) {
> > +		put_dev_pagemap(hmm_vma_walk->pgmap);
> > +		hmm_vma_walk->pgmap = NULL;
> > +	}
> >  	pte_unmap(ptep);
> >  	/* Fault any virtual address we were asked to fault */
> >  	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
> > @@ -699,12 +734,83 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
> >  			return r;
> >  		}
> >  	}
> > +	if (hmm_vma_walk->pgmap) {
> > +		put_dev_pagemap(hmm_vma_walk->pgmap);
> > +		hmm_vma_walk->pgmap = NULL;
> > +	}
> 
> 
> Why is this here and not in hmm_vma_handle_pte()?  Unless I'm just getting
> tired, this is the corresponding put for when hmm_vma_handle_pte() returns 0
> at <tag> above.

This is because get_dev_pagemap() optimizes away taking the reference
if we already hold a reference on the correct dev_pagemap. So if we
were releasing the reference within hmm_vma_handle_pte() we would
lose the get_dev_pagemap() optimization.

Cheers,
Jérôme
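To make the explanation above concrete, here is a self-contained sketch of
the reference-caching pattern (the sketch_* types and helpers are made-up
stand-ins, not the kernel API): the walker keeps at most one dev_pagemap
reference alive across consecutive entries, a new reference is only taken
when the pfn falls outside the cached pgmap, and a single put at the end of
the walk, as in hmm_vma_walk_pmd(), drops whatever reference is still held.

#include <errno.h>

struct sketch_pagemap {
	unsigned long pfn_first, pfn_last;
	int refcount;
};

/* Toy backing store: a single pgmap covering pfns [0x1000, 0x1fff]. */
static struct sketch_pagemap toy_pgmap = { 0x1000, 0x1fff, 0 };

/* Stand-in for the slow path of get_dev_pagemap(): find the pgmap
 * covering @pfn and take a reference on it. */
static struct sketch_pagemap *sketch_lookup_and_get(unsigned long pfn)
{
	if (pfn < toy_pgmap.pfn_first || pfn > toy_pgmap.pfn_last)
		return NULL;
	toy_pgmap.refcount++;
	return &toy_pgmap;
}

/* Mimics get_dev_pagemap(pfn, cached): reuse the cached reference when
 * @pfn still falls inside it, otherwise drop it and take a new one. */
static struct sketch_pagemap *
sketch_get_dev_pagemap(unsigned long pfn, struct sketch_pagemap *cached)
{
	if (cached) {
		if (pfn >= cached->pfn_first && pfn <= cached->pfn_last)
			return cached;	/* fast path: no new reference */
		cached->refcount--;	/* put_dev_pagemap() equivalent */
	}
	return sketch_lookup_and_get(pfn);
}

/* Walk @npages pfns, caching the pgmap reference across entries and
 * dropping it once at the end, as hmm_vma_walk_pmd() does. */
static int sketch_walk(unsigned long pfn, unsigned long npages,
		       struct sketch_pagemap **pgmap)
{
	unsigned long i;

	for (i = 0; i < npages; i++, pfn++) {
		*pgmap = sketch_get_dev_pagemap(pfn, *pgmap);
		if (*pgmap == NULL)
			return -EBUSY;
		/* pfns[i] would be filled here, as hmm_vma_handle_pte() does */
	}
	/* Releasing inside the loop (or in a per-entry helper) would defeat
	 * the fast path above; one put at the end of the walk is enough. */
	if (*pgmap) {
		(*pgmap)->refcount--;
		*pgmap = NULL;
	}
	return 0;
}

int main(void)
{
	struct sketch_pagemap *pgmap = NULL;

	/* 16 pages inside the toy pgmap: one lookup, then the fast path,
	 * then one final put. */
	return sketch_walk(0x1000, 16, &pgmap) ? 1 : 0;
}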

Patch

diff --git a/mm/hmm.c b/mm/hmm.c
index 64a33770813b..ce33151c6832 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -325,6 +325,7 @@  EXPORT_SYMBOL(hmm_mirror_unregister);
 
 struct hmm_vma_walk {
 	struct hmm_range	*range;
+	struct dev_pagemap	*pgmap;
 	unsigned long		last;
 	bool			fault;
 	bool			block;
@@ -499,6 +500,15 @@  static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
 				range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+	if (!pud_present(pud))
+		return 0;
+	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+				range->flags[HMM_PFN_WRITE] :
+				range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
 			      unsigned long addr,
 			      unsigned long end,
@@ -520,8 +530,19 @@  static int hmm_vma_handle_pmd(struct mm_walk *walk,
 		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + pte_index(addr);
-	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+		if (pmd_devmap(pmd)) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+		}
 		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+	}
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	hmm_vma_walk->last = end;
 	return 0;
 }
@@ -608,10 +629,24 @@  static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	if (fault || write_fault)
 		goto fault;
 
+	if (pte_devmap(pte)) {
+		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+					      hmm_vma_walk->pgmap);
+		if (unlikely(!hmm_vma_walk->pgmap))
+			return -EBUSY;
+	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+		*pfn = range->values[HMM_PFN_SPECIAL];
+		return -EFAULT;
+	}
+
 	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
 	return 0;
 
 fault:
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
 	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -699,12 +734,83 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return r;
 		}
 	}
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep - 1);
 
 	hmm_vma_walk->last = addr;
 	return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+			    unsigned long start,
+			    unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
+	unsigned long addr = start, next;
+	pmd_t *pmdp;
+	pud_t pud;
+	int ret;
+
+again:
+	pud = READ_ONCE(*pudp);
+	if (pud_none(pud))
+		return hmm_vma_walk_hole(start, end, walk);
+
+	if (pud_huge(pud) && pud_devmap(pud)) {
+		unsigned long i, npages, pfn;
+		uint64_t *pfns, cpu_flags;
+		bool fault, write_fault;
+
+		if (!pud_present(pud))
+			return hmm_vma_walk_hole(start, end, walk);
+
+		i = (addr - range->start) >> PAGE_SHIFT;
+		npages = (end - addr) >> PAGE_SHIFT;
+		pfns = &range->pfns[i];
+
+		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+				     cpu_flags, &fault, &write_fault);
+		if (fault || write_fault)
+			return hmm_vma_walk_hole_(addr, end, fault,
+						write_fault, walk);
+
+		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+		for (i = 0; i < npages; ++i, ++pfn) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+			pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+		}
+		if (hmm_vma_walk->pgmap) {
+			put_dev_pagemap(hmm_vma_walk->pgmap);
+			hmm_vma_walk->pgmap = NULL;
+		}
+		hmm_vma_walk->last = end;
+		return 0;
+	}
+
+	split_huge_pud(walk->vma, pudp, addr);
+	if (pud_none(*pudp))
+		goto again;
+
+	pmdp = pmd_offset(pudp, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 0;
+}
+
 static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 				      unsigned long start, unsigned long end,
 				      struct mm_walk *walk)
@@ -777,14 +883,6 @@  static void hmm_pfns_clear(struct hmm_range *range,
 		*pfns = range->values[HMM_PFN_NONE];
 }
 
-static void hmm_pfns_special(struct hmm_range *range)
-{
-	unsigned long addr = range->start, i = 0;
-
-	for (; addr < range->end; addr += PAGE_SIZE, i++)
-		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
 /*
  * hmm_range_register() - start tracking change to CPU page table over a range
  * @range: range
@@ -902,12 +1000,6 @@  long hmm_range_snapshot(struct hmm_range *range)
 		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
 
-		/* FIXME support dax */
-		if (vma_is_dax(vma)) {
-			hmm_pfns_special(range);
-			return -EINVAL;
-		}
-
 		if (is_vm_hugetlb_page(vma)) {
 			struct hstate *h = hstate_vma(vma);
 
@@ -931,6 +1023,7 @@  long hmm_range_snapshot(struct hmm_range *range)
 		}
 
 		range->vma = vma;
+		hmm_vma_walk.pgmap = NULL;
 		hmm_vma_walk.last = start;
 		hmm_vma_walk.fault = false;
 		hmm_vma_walk.range = range;
@@ -942,6 +1035,7 @@  long hmm_range_snapshot(struct hmm_range *range)
 		mm_walk.pte_entry = NULL;
 		mm_walk.test_walk = NULL;
 		mm_walk.hugetlb_entry = NULL;
+		mm_walk.pud_entry = hmm_vma_walk_pud;
 		mm_walk.pmd_entry = hmm_vma_walk_pmd;
 		mm_walk.pte_hole = hmm_vma_walk_hole;
 		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
@@ -1007,12 +1101,6 @@  long hmm_range_fault(struct hmm_range *range, bool block)
 		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
 
-		/* FIXME support dax */
-		if (vma_is_dax(vma)) {
-			hmm_pfns_special(range);
-			return -EINVAL;
-		}
-
 		if (is_vm_hugetlb_page(vma)) {
 			if (huge_page_shift(hstate_vma(vma)) !=
 			    range->page_shift &&
@@ -1035,6 +1123,7 @@  long hmm_range_fault(struct hmm_range *range, bool block)
 		}
 
 		range->vma = vma;
+		hmm_vma_walk.pgmap = NULL;
 		hmm_vma_walk.last = start;
 		hmm_vma_walk.fault = true;
 		hmm_vma_walk.block = block;
@@ -1047,6 +1136,7 @@  long hmm_range_fault(struct hmm_range *range, bool block)
 		mm_walk.pte_entry = NULL;
 		mm_walk.test_walk = NULL;
 		mm_walk.hugetlb_entry = NULL;
+		mm_walk.pud_entry = hmm_vma_walk_pud;
 		mm_walk.pmd_entry = hmm_vma_walk_pmd;
 		mm_walk.pte_hole = hmm_vma_walk_hole;
 		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;