
[09/10] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem

Message ID 20190129165428.3931-10-jglisse@redhat.com (mailing list archive)
State New, archived
Series HMM updates for 5.1

Commit Message

Jerome Glisse Jan. 29, 2019, 4:54 p.m. UTC
From: Jérôme Glisse <jglisse@redhat.com>

This adds support for mirroring a vma that is an mmap of a file on a
filesystem backed by a DAX block device. There is no reason not to support
that case.

Note that unlike the GUP code we do not take a page reference, hence when
we back off we have nothing to undo.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
---
 mm/hmm.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 112 insertions(+), 21 deletions(-)

Comments

Dan Williams Jan. 29, 2019, 6:41 p.m. UTC | #1
On Tue, Jan 29, 2019 at 8:54 AM <jglisse@redhat.com> wrote:
>
> From: Jérôme Glisse <jglisse@redhat.com>
>
> This add support to mirror vma which is an mmap of a file which is on
> a filesystem that using a DAX block device. There is no reason not to
> support that case.
>

The reason not to support it would be if it gets in the way of future
DAX development. How does this interact with MAP_SYNC? I'm also
concerned if this complicates DAX reflink support. In general I'd
rather prioritize fixing the places where DAX is broken today before
adding more cross-subsystem entanglements. The unit tests for
filesystems (xfstests) are readily accessible. How would I go about
regression testing DAX + HMM interactions?

> Note that unlike GUP code we do not take page reference hence when we
> back-off we have nothing to undo.
>
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Ralph Campbell <rcampbell@nvidia.com>
> Cc: John Hubbard <jhubbard@nvidia.com>
> ---
>  mm/hmm.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 112 insertions(+), 21 deletions(-)
>
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 8b87e1813313..1a444885404e 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -334,6 +334,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister);
>
>  struct hmm_vma_walk {
>         struct hmm_range        *range;
> +       struct dev_pagemap      *pgmap;
>         unsigned long           last;
>         bool                    fault;
>         bool                    block;
> @@ -508,6 +509,15 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
>                                 range->flags[HMM_PFN_VALID];
>  }
>
> +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
> +{
> +       if (!pud_present(pud))
> +               return 0;
> +       return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
> +                               range->flags[HMM_PFN_WRITE] :
> +                               range->flags[HMM_PFN_VALID];
> +}
> +
>  static int hmm_vma_handle_pmd(struct mm_walk *walk,
>                               unsigned long addr,
>                               unsigned long end,
> @@ -529,8 +539,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
>                 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
>
>         pfn = pmd_pfn(pmd) + pte_index(addr);
> -       for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
> +       for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
> +               if (pmd_devmap(pmd)) {
> +                       hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
> +                                             hmm_vma_walk->pgmap);
> +                       if (unlikely(!hmm_vma_walk->pgmap))
> +                               return -EBUSY;
> +               }
>                 pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
> +       }
> +       if (hmm_vma_walk->pgmap) {
> +               put_dev_pagemap(hmm_vma_walk->pgmap);
> +               hmm_vma_walk->pgmap = NULL;
> +       }
>         hmm_vma_walk->last = end;
>         return 0;
>  }
> @@ -617,10 +638,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
>         if (fault || write_fault)
>                 goto fault;
>
> +       if (pte_devmap(pte)) {
> +               hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
> +                                             hmm_vma_walk->pgmap);
> +               if (unlikely(!hmm_vma_walk->pgmap))
> +                       return -EBUSY;
> +       } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
> +               *pfn = range->values[HMM_PFN_SPECIAL];
> +               return -EFAULT;
> +       }
> +
>         *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
>         return 0;
>
>  fault:
> +       if (hmm_vma_walk->pgmap) {
> +               put_dev_pagemap(hmm_vma_walk->pgmap);
> +               hmm_vma_walk->pgmap = NULL;
> +       }
>         pte_unmap(ptep);
>         /* Fault any virtual address we were asked to fault */
>         return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
> @@ -708,12 +743,84 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
>                         return r;
>                 }
>         }
> +       if (hmm_vma_walk->pgmap) {
> +               put_dev_pagemap(hmm_vma_walk->pgmap);
> +               hmm_vma_walk->pgmap = NULL;
> +       }
>         pte_unmap(ptep - 1);
>
>         hmm_vma_walk->last = addr;
>         return 0;
>  }
>
> +static int hmm_vma_walk_pud(pud_t *pudp,
> +                           unsigned long start,
> +                           unsigned long end,
> +                           struct mm_walk *walk)
> +{
> +       struct hmm_vma_walk *hmm_vma_walk = walk->private;
> +       struct hmm_range *range = hmm_vma_walk->range;
> +       struct vm_area_struct *vma = walk->vma;
> +       unsigned long addr = start, next;
> +       pmd_t *pmdp;
> +       pud_t pud;
> +       int ret;
> +
> +again:
> +       pud = READ_ONCE(*pudp);
> +       if (pud_none(pud))
> +               return hmm_vma_walk_hole(start, end, walk);
> +
> +       if (pud_huge(pud) && pud_devmap(pud)) {
> +               unsigned long i, npages, pfn;
> +               uint64_t *pfns, cpu_flags;
> +               bool fault, write_fault;
> +
> +               if (!pud_present(pud))
> +                       return hmm_vma_walk_hole(start, end, walk);
> +
> +               i = (addr - range->start) >> PAGE_SHIFT;
> +               npages = (end - addr) >> PAGE_SHIFT;
> +               pfns = &range->pfns[i];
> +
> +               cpu_flags = pud_to_hmm_pfn_flags(range, pud);
> +               hmm_range_need_fault(hmm_vma_walk, pfns, npages,
> +                                    cpu_flags, &fault, &write_fault);
> +               if (fault || write_fault)
> +                       return hmm_vma_walk_hole_(addr, end, fault,
> +                                               write_fault, walk);
> +
> +               pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +               for (i = 0; i < npages; ++i, ++pfn) {
> +                       hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
> +                                             hmm_vma_walk->pgmap);
> +                       if (unlikely(!hmm_vma_walk->pgmap))
> +                               return -EBUSY;
> +                       pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
> +               }
> +               if (hmm_vma_walk->pgmap) {
> +                       put_dev_pagemap(hmm_vma_walk->pgmap);
> +                       hmm_vma_walk->pgmap = NULL;
> +               }
> +               hmm_vma_walk->last = end;
> +               return 0;
> +       }
> +
> +       split_huge_pud(vma, pudp, addr);
> +       if (pud_none(*pudp))
> +               goto again;
> +
> +       pmdp = pmd_offset(pudp, addr);
> +       do {
> +               next = pmd_addr_end(addr, end);
> +               ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
> +               if (ret)
> +                       return ret;
> +       } while (pmdp++, addr = next, addr != end);
> +
> +       return 0;
> +}
> +
>  static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
>                                       unsigned long start, unsigned long end,
>                                       struct mm_walk *walk)
> @@ -786,14 +893,6 @@ static void hmm_pfns_clear(struct hmm_range *range,
>                 *pfns = range->values[HMM_PFN_NONE];
>  }
>
> -static void hmm_pfns_special(struct hmm_range *range)
> -{
> -       unsigned long addr = range->start, i = 0;
> -
> -       for (; addr < range->end; addr += PAGE_SIZE, i++)
> -               range->pfns[i] = range->values[HMM_PFN_SPECIAL];
> -}
> -
>  /*
>   * hmm_range_register() - start tracking change to CPU page table over a range
>   * @range: range
> @@ -911,12 +1010,6 @@ long hmm_range_snapshot(struct hmm_range *range)
>                 if (vma == NULL || (vma->vm_flags & device_vma))
>                         return -EFAULT;
>
> -               /* FIXME support dax */
> -               if (vma_is_dax(vma)) {
> -                       hmm_pfns_special(range);
> -                       return -EINVAL;
> -               }
> -
>                 if (is_vm_hugetlb_page(vma)) {
>                         struct hstate *h = hstate_vma(vma);
>
> @@ -940,6 +1033,7 @@ long hmm_range_snapshot(struct hmm_range *range)
>                 }
>
>                 range->vma = vma;
> +               hmm_vma_walk.pgmap = NULL;
>                 hmm_vma_walk.last = start;
>                 hmm_vma_walk.fault = false;
>                 hmm_vma_walk.range = range;
> @@ -951,6 +1045,7 @@ long hmm_range_snapshot(struct hmm_range *range)
>                 mm_walk.pte_entry = NULL;
>                 mm_walk.test_walk = NULL;
>                 mm_walk.hugetlb_entry = NULL;
> +               mm_walk.pud_entry = hmm_vma_walk_pud;
>                 mm_walk.pmd_entry = hmm_vma_walk_pmd;
>                 mm_walk.pte_hole = hmm_vma_walk_hole;
>                 mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
> @@ -1018,12 +1113,6 @@ long hmm_range_fault(struct hmm_range *range, bool block)
>                 if (vma == NULL || (vma->vm_flags & device_vma))
>                         return -EFAULT;
>
> -               /* FIXME support dax */
> -               if (vma_is_dax(vma)) {
> -                       hmm_pfns_special(range);
> -                       return -EINVAL;
> -               }
> -
>                 if (is_vm_hugetlb_page(vma)) {
>                         struct hstate *h = hstate_vma(vma);
>
> @@ -1047,6 +1136,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
>                 }
>
>                 range->vma = vma;
> +               hmm_vma_walk.pgmap = NULL;
>                 hmm_vma_walk.last = start;
>                 hmm_vma_walk.fault = true;
>                 hmm_vma_walk.block = block;
> @@ -1059,6 +1149,7 @@ long hmm_range_fault(struct hmm_range *range, bool block)
>                 mm_walk.pte_entry = NULL;
>                 mm_walk.test_walk = NULL;
>                 mm_walk.hugetlb_entry = NULL;
> +               mm_walk.pud_entry = hmm_vma_walk_pud;
>                 mm_walk.pmd_entry = hmm_vma_walk_pmd;
>                 mm_walk.pte_hole = hmm_vma_walk_hole;
>                 mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
> --
> 2.17.2
>
Jerome Glisse Jan. 29, 2019, 7:31 p.m. UTC | #2
On Tue, Jan 29, 2019 at 10:41:23AM -0800, Dan Williams wrote:
> On Tue, Jan 29, 2019 at 8:54 AM <jglisse@redhat.com> wrote:
> >
> > From: Jérôme Glisse <jglisse@redhat.com>
> >
> > This add support to mirror vma which is an mmap of a file which is on
> > a filesystem that using a DAX block device. There is no reason not to
> > support that case.
> >
> 
> The reason not to support it would be if it gets in the way of future
> DAX development. How does this interact with MAP_SYNC? I'm also
> concerned if this complicates DAX reflink support. In general I'd
> rather prioritize fixing the places where DAX is broken today before
> adding more cross-subsystem entanglements. The unit tests for
> filesystems (xfstests) are readily accessible. How would I go about
> regression testing DAX + HMM interactions?

HMM mirrors the CPU page table, so anything you do to the CPU page table
is reflected to all HMM mirror users. MAP_SYNC therefore has no bearing
here whatsoever: every HMM mirror user must do cache-coherent access to
the range it mirrors, so from the DAX point of view this is _exactly_ the
same as CPU access.

Note that you cannot migrate DAX memory to GPU memory, so for an mmap of a
file on a filesystem that uses a DAX block device there is no migration to
device memory. Also, at this time migration of file-backed pages is only
supported for cache-coherent device memory, for instance on OpenCAPI
platforms.

Bottom line: you just have to worry about the CPU page table. Whatever you
do there will be reflected properly, and it does not add any burden to
people working on DAX. The only exception would be modifying the CPU page
table without calling the mmu notifiers, but in that case you would break
not only HMM mirror users but other things like KVM ...
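
Concretely, a mirror user only plugs into those notifications. A simplified
sketch of the driver side (the my_* names are made up, and the callback
signature follows the hmm.h of this series, so treat it as illustrative
rather than as the reference):

#include <linux/hmm.h>

struct my_mirror {
	struct hmm_mirror mirror;	/* embeds the HMM mirror state */
	/* ... device page table state would live here ... */
};

/*
 * Called through the mmu notifiers whenever the CPU page table changes
 * (munmap, truncate, write-protect, migration, ...). DAX or not, the
 * driver only ever reacts to CPU page table events, which is why MAP_SYNC
 * needs no special handling on this side.
 */
static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					 const struct hmm_update *update)
{
	struct my_mirror *m = container_of(mirror, struct my_mirror, mirror);

	/* Invalidate the device mapping for [update->start, update->end). */
	(void)m;
	return 0;
}

static void my_mirror_release(struct hmm_mirror *mirror)
{
	/* The mm is going away, tear down all device mappings. */
}

static const struct hmm_mirror_ops my_mirror_ops = {
	.sync_cpu_device_pagetables	= my_sync_cpu_device_pagetables,
	.release			= my_mirror_release,
};

/* Register against a process address space (caller holds a reference on mm). */
static int my_mirror_init(struct my_mirror *m, struct mm_struct *mm)
{
	m->mirror.ops = &my_mirror_ops;
	return hmm_mirror_register(&m->mirror, mm);
}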


For testing, the question is what you want to test. Do you want to test
that a device properly mirrors an mmap of a file backed by DAX, i.e. that
device drivers which use HMM mirror keep working after changes are made to
DAX?

Or do you want to run the filesystem test suite using the GPU to access
the mmap of the file (read or write) instead of the CPU? In that case any
such test suite would need to be updated to be able to use something like
OpenCL. At this time I do not see much need for that, but maybe it is
something people would like to see.
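
To give an idea of what that would look like, here is a minimal host-side
sketch (no error checking; it assumes an OpenCL 2.0 stack with fine-grained
system SVM so the mmap'ed pointer can be handed to the GPU directly, and
the file passed on the command line is assumed to be at least one page):

#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static const char *src =
	"__kernel void bump(__global uchar *p) { p[get_global_id(0)] += 1; }";

int main(int argc, char **argv)
{
	size_t size = 4096;
	cl_platform_id plat;
	cl_device_id dev;
	cl_int err;

	if (argc < 2)
		return 1;

	/* mmap a (possibly DAX-backed) file exactly like a CPU test would. */
	int fd = open(argv[1], O_RDWR);
	void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	clGetPlatformIDs(1, &plat, NULL);
	clGetDeviceIDs(plat, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
	cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
	cl_command_queue q = clCreateCommandQueueWithProperties(ctx, dev, NULL, &err);
	cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
	clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
	cl_kernel k = clCreateKernel(prog, "bump", &err);

	/* The GPU writes through the file mapping instead of the CPU. */
	clSetKernelArgSVMPointer(k, 0, map);
	clEnqueueNDRangeKernel(q, k, 1, NULL, &size, NULL, 0, NULL, NULL);
	clFinish(q);

	munmap(map, size);
	close(fd);
	return 0;
}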

Cheers,
Jérôme


Dan Williams Jan. 29, 2019, 8:51 p.m. UTC | #3
On Tue, Jan 29, 2019 at 11:32 AM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Tue, Jan 29, 2019 at 10:41:23AM -0800, Dan Williams wrote:
> > On Tue, Jan 29, 2019 at 8:54 AM <jglisse@redhat.com> wrote:
> > >
> > > From: Jérôme Glisse <jglisse@redhat.com>
> > >
> > > This add support to mirror vma which is an mmap of a file which is on
> > > a filesystem that using a DAX block device. There is no reason not to
> > > support that case.
> > >
> >
> > The reason not to support it would be if it gets in the way of future
> > DAX development. How does this interact with MAP_SYNC? I'm also
> > concerned if this complicates DAX reflink support. In general I'd
> > rather prioritize fixing the places where DAX is broken today before
> > adding more cross-subsystem entanglements. The unit tests for
> > filesystems (xfstests) are readily accessible. How would I go about
> > regression testing DAX + HMM interactions?
>
> HMM mirror CPU page table so anything you do to CPU page table will
> be reflected to all HMM mirror user. So MAP_SYNC has no bearing here
> whatsoever as all HMM mirror user must do cache coherent access to
> range they mirror so from DAX point of view this is just _exactly_
> the same as CPU access.
>
> Note that you can not migrate DAX memory to GPU memory and thus for a
> mmap of a file on a filesystem that use a DAX block device then you can
> not do migration to device memory. Also at this time migration of file
> back page is only supported for cache coherent device memory so for
> instance on OpenCAPI platform.

Ok, this addresses the primary concern about maintenance burden. Thanks.

However the changelog still amounts to a justification of "change
this, because we can". At least, that's how it reads to me. Is there
any positive benefit to merging this patch? Can you spell that out in
the changelog?

> Bottom line is you just have to worry about the CPU page table. What
> ever you do there will be reflected properly. It does not add any
> burden to people working on DAX. Unless you want to modify CPU page
> table without calling mmu notifier but in that case you would not
> only break HMM mirror user but other thing like KVM ...
>
>
> For testing the issue is what do you want to test ? Do you want to test
> that a device properly mirror some mmap of a file back by DAX ? ie
> device driver which use HMM mirror keep working after changes made to
> DAX.
>
> Or do you want to run filesystem test suite using the GPU to access
> mmap of the file (read or write) instead of the CPU ? In that case any
> such test suite would need to be updated to be able to use something
> like OpenCL for. At this time i do not see much need for that but maybe
> this is something people would like to see.

In general, as HMM grows intercept points throughout the mm it would
be helpful to be able to sanity check the implementation.
Jerome Glisse Jan. 29, 2019, 9:21 p.m. UTC | #4
On Tue, Jan 29, 2019 at 12:51:25PM -0800, Dan Williams wrote:
> On Tue, Jan 29, 2019 at 11:32 AM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Tue, Jan 29, 2019 at 10:41:23AM -0800, Dan Williams wrote:
> > > On Tue, Jan 29, 2019 at 8:54 AM <jglisse@redhat.com> wrote:
> > > >
> > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > >
> > > > This add support to mirror vma which is an mmap of a file which is on
> > > > a filesystem that using a DAX block device. There is no reason not to
> > > > support that case.
> > > >
> > >
> > > The reason not to support it would be if it gets in the way of future
> > > DAX development. How does this interact with MAP_SYNC? I'm also
> > > concerned if this complicates DAX reflink support. In general I'd
> > > rather prioritize fixing the places where DAX is broken today before
> > > adding more cross-subsystem entanglements. The unit tests for
> > > filesystems (xfstests) are readily accessible. How would I go about
> > > regression testing DAX + HMM interactions?
> >
> > HMM mirror CPU page table so anything you do to CPU page table will
> > be reflected to all HMM mirror user. So MAP_SYNC has no bearing here
> > whatsoever as all HMM mirror user must do cache coherent access to
> > range they mirror so from DAX point of view this is just _exactly_
> > the same as CPU access.
> >
> > Note that you can not migrate DAX memory to GPU memory and thus for a
> > mmap of a file on a filesystem that use a DAX block device then you can
> > not do migration to device memory. Also at this time migration of file
> > back page is only supported for cache coherent device memory so for
> > instance on OpenCAPI platform.
> 
> Ok, this addresses the primary concern about maintenance burden. Thanks.
> 
> However the changelog still amounts to a justification of "change
> this, because we can". At least, that's how it reads to me. Is there
> any positive benefit to merging this patch? Can you spell that out in
> the changelog?

There are 3 reasons for this:
    1) Convert ODP to use HMM underneath so that we share code between
    infiniband ODP and GPU drivers. ODP supports DAX today, so I cannot
    convert ODP to HMM without also supporting DAX in HMM, otherwise I
    would regress the ODP features.

    2) I expect people will be running GPGPU on computers with files that
    use DAX, and they will want to use HMM there too. In fact, from the
    user-space point of view, whether the file is DAX or not should only
    change one thing: for a DAX file you will never be able to use GPU
    memory.

    3) I want to convert as many users of GUP to HMM as possible (I have
    already posted several patchsets to the GPU mailing lists for that and
    intend to post a v2 of those later on). Using HMM avoids GUP, and it
    avoids the GUP pin because we abide by the mmu notifiers and thus do
    not inhibit any regular filesystem operation (see the sketch below).
    Some of those GPU drivers do allow GUP on DAX files, so again I cannot
    regress them.
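
To make the contrast with GUP concrete, here is roughly what the no-pin
pattern looks like with the API in this series (my_* names are made up, the
flag/value encoding is only an example, and the exact hmm_range_register()/
hmm_range_unregister() signatures are an assumption on my side, so read it
as a sketch, not as the final API):

#include <linux/hmm.h>
#include <linux/mm.h>

/* Example driver encoding for the HMM pfn flags/values arrays. */
static const uint64_t my_flags[HMM_PFN_FLAG_MAX] = {
	[HMM_PFN_VALID]	= 1UL << 0,
	[HMM_PFN_WRITE]	= 1UL << 1,
};
static const uint64_t my_values[HMM_PFN_VALUE_MAX] = {
	[HMM_PFN_ERROR]		= ~0UL,
	[HMM_PFN_NONE]		= 0,
	[HMM_PFN_SPECIAL]	= ~0UL & ~1UL,
};

/*
 * Fault and snapshot one range without taking any page reference. If the
 * CPU page table changes afterward, the mirror callback tells the driver
 * to drop its mapping; there is nothing to "undo" the way a GUP pin would
 * have to be released. (A real driver also configures how the pfn itself
 * is encoded in each entry; that part is left out here.)
 */
static long my_mirror_range(struct mm_struct *mm, unsigned long start,
			    unsigned long end, uint64_t *pfns)
{
	struct hmm_range range = {
		.start	= start,
		.end	= end,
		.pfns	= pfns,
		.flags	= my_flags,
		.values	= my_values,
	};
	long ret;

	/* Assumed signature: tie the range to the mirrored address space. */
	ret = hmm_range_register(&range, mm, start, end);
	if (ret)
		return ret;

	down_read(&mm->mmap_sem);
	/* true == this context is allowed to block while faulting pages in. */
	ret = hmm_range_fault(&range, true);
	up_read(&mm->mmap_sem);

	hmm_range_unregister(&range);
	return ret;
}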


> > Bottom line is you just have to worry about the CPU page table. What
> > ever you do there will be reflected properly. It does not add any
> > burden to people working on DAX. Unless you want to modify CPU page
> > table without calling mmu notifier but in that case you would not
> > only break HMM mirror user but other thing like KVM ...
> >
> >
> > For testing the issue is what do you want to test ? Do you want to test
> > that a device properly mirror some mmap of a file back by DAX ? ie
> > device driver which use HMM mirror keep working after changes made to
> > DAX.
> >
> > Or do you want to run filesystem test suite using the GPU to access
> > mmap of the file (read or write) instead of the CPU ? In that case any
> > such test suite would need to be updated to be able to use something
> > like OpenCL for. At this time i do not see much need for that but maybe
> > this is something people would like to see.
> 
> In general, as HMM grows intercept points throughout the mm it would
> be helpful to be able to sanity check the implementation.

I usually use a combination of simple OpenCL programs and hand-tailored
direct ioctl hacks to force specific code paths to happen. I should
probably create a repository with a set of OpenCL tests so that others can
also use them. I need to clean those up into something not too ugly so I
am not ashamed of them.

Also, at this time the OpenCL bits are not in any distro; most of the bits
are in Mesa, and Karol and others are doing a great job of polishing
things and getting all the bits in. I expect that in a couple of months
the mainline of all the projects (LLVM, Mesa, libdrm, ...) will have all
the bits, and then it will trickle down to your favorite distribution
(assuming they build Mesa with OpenCL enabled).

Cheers,
Jérôme
Dan Williams Jan. 30, 2019, 2:32 a.m. UTC | #5
On Tue, Jan 29, 2019 at 1:21 PM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Tue, Jan 29, 2019 at 12:51:25PM -0800, Dan Williams wrote:
> > On Tue, Jan 29, 2019 at 11:32 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > >
> > > On Tue, Jan 29, 2019 at 10:41:23AM -0800, Dan Williams wrote:
> > > > On Tue, Jan 29, 2019 at 8:54 AM <jglisse@redhat.com> wrote:
> > > > >
> > > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > > >
> > > > > This add support to mirror vma which is an mmap of a file which is on
> > > > > a filesystem that using a DAX block device. There is no reason not to
> > > > > support that case.
> > > > >
> > > >
> > > > The reason not to support it would be if it gets in the way of future
> > > > DAX development. How does this interact with MAP_SYNC? I'm also
> > > > concerned if this complicates DAX reflink support. In general I'd
> > > > rather prioritize fixing the places where DAX is broken today before
> > > > adding more cross-subsystem entanglements. The unit tests for
> > > > filesystems (xfstests) are readily accessible. How would I go about
> > > > regression testing DAX + HMM interactions?
> > >
> > > HMM mirror CPU page table so anything you do to CPU page table will
> > > be reflected to all HMM mirror user. So MAP_SYNC has no bearing here
> > > whatsoever as all HMM mirror user must do cache coherent access to
> > > range they mirror so from DAX point of view this is just _exactly_
> > > the same as CPU access.
> > >
> > > Note that you can not migrate DAX memory to GPU memory and thus for a
> > > mmap of a file on a filesystem that use a DAX block device then you can
> > > not do migration to device memory. Also at this time migration of file
> > > back page is only supported for cache coherent device memory so for
> > > instance on OpenCAPI platform.
> >
> > Ok, this addresses the primary concern about maintenance burden. Thanks.
> >
> > However the changelog still amounts to a justification of "change
> > this, because we can". At least, that's how it reads to me. Is there
> > any positive benefit to merging this patch? Can you spell that out in
> > the changelog?
>
> There is 3 reasons for this:

Thanks for this.

>     1) Convert ODP to use HMM underneath so that we share code between
>     infiniband ODP and GPU drivers. ODP do support DAX today so i can
>     not convert ODP to HMM without also supporting DAX in HMM otherwise
>     i would regress the ODP features.
>
>     2) I expect people will be running GPGPU on computer with file that
>     use DAX and they will want to use HMM there too, in fact from user-
>     space point of view wether the file is DAX or not should only change
>     one thing ie for DAX file you will never be able to use GPU memory.
>
>     3) I want to convert as many user of GUP to HMM (already posted
>     several patchset to GPU mailing list for that and i intend to post
>     a v2 of those latter on). Using HMM avoids GUP and it will avoid
>     the GUP pin as here we abide by mmu notifier hence we do not want to
>     inhibit any of the filesystem regular operation. Some of those GPU
>     driver do allow GUP on DAX file. So again i can not regress them.

Is this really a GUP to HMM conversion, or a GUP to mmu_notifier
solution? It would be good to boil this conversion down to the base
building blocks. It seems "HMM" can mean several distinct pieces of
infrastructure. Is it possible to replace some GUP usage with an
mmu_notifier based solution without pulling in all of HMM?

> > > Bottom line is you just have to worry about the CPU page table. What
> > > ever you do there will be reflected properly. It does not add any
> > > burden to people working on DAX. Unless you want to modify CPU page
> > > table without calling mmu notifier but in that case you would not
> > > only break HMM mirror user but other thing like KVM ...
> > >
> > >
> > > For testing the issue is what do you want to test ? Do you want to test
> > > that a device properly mirror some mmap of a file back by DAX ? ie
> > > device driver which use HMM mirror keep working after changes made to
> > > DAX.
> > >
> > > Or do you want to run filesystem test suite using the GPU to access
> > > mmap of the file (read or write) instead of the CPU ? In that case any
> > > such test suite would need to be updated to be able to use something
> > > like OpenCL for. At this time i do not see much need for that but maybe
> > > this is something people would like to see.
> >
> > In general, as HMM grows intercept points throughout the mm it would
> > be helpful to be able to sanity check the implementation.
>
> I usualy use a combination of simple OpenCL programs and hand tailor direct
> ioctl hack to force specific code path to happen. I should probably create
> a repository with a set of OpenCL tests so that other can also use them.
> I need to clean those up into something not too ugly so i am not ashame
> of them.

That would be great, even if it is messy.

> Also at this time the OpenCL bits are not in any distro, most of the bits
> are in mesa and Karol and others are doing a great jobs at polishing things
> and getting all the bits in. I do expect that in couple months the mainline
> of all projects (LLVM, Mesa, libdrm, ...) will have all the bits and then it
> will trickle down to your favorite distribution (assuming they build mesa
> with OpenCL enabled).

Ok.
Jerome Glisse Jan. 30, 2019, 3:03 a.m. UTC | #6
On Tue, Jan 29, 2019 at 06:32:56PM -0800, Dan Williams wrote:
> On Tue, Jan 29, 2019 at 1:21 PM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Tue, Jan 29, 2019 at 12:51:25PM -0800, Dan Williams wrote:
> > > On Tue, Jan 29, 2019 at 11:32 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > > >
> > > > On Tue, Jan 29, 2019 at 10:41:23AM -0800, Dan Williams wrote:
> > > > > On Tue, Jan 29, 2019 at 8:54 AM <jglisse@redhat.com> wrote:
> > > > > >
> > > > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > > > >
> > > > > > This add support to mirror vma which is an mmap of a file which is on
> > > > > > a filesystem that using a DAX block device. There is no reason not to
> > > > > > support that case.
> > > > > >
> > > > >
> > > > > The reason not to support it would be if it gets in the way of future
> > > > > DAX development. How does this interact with MAP_SYNC? I'm also
> > > > > concerned if this complicates DAX reflink support. In general I'd
> > > > > rather prioritize fixing the places where DAX is broken today before
> > > > > adding more cross-subsystem entanglements. The unit tests for
> > > > > filesystems (xfstests) are readily accessible. How would I go about
> > > > > regression testing DAX + HMM interactions?
> > > >
> > > > HMM mirror CPU page table so anything you do to CPU page table will
> > > > be reflected to all HMM mirror user. So MAP_SYNC has no bearing here
> > > > whatsoever as all HMM mirror user must do cache coherent access to
> > > > range they mirror so from DAX point of view this is just _exactly_
> > > > the same as CPU access.
> > > >
> > > > Note that you can not migrate DAX memory to GPU memory and thus for a
> > > > mmap of a file on a filesystem that use a DAX block device then you can
> > > > not do migration to device memory. Also at this time migration of file
> > > > back page is only supported for cache coherent device memory so for
> > > > instance on OpenCAPI platform.
> > >
> > > Ok, this addresses the primary concern about maintenance burden. Thanks.
> > >
> > > However the changelog still amounts to a justification of "change
> > > this, because we can". At least, that's how it reads to me. Is there
> > > any positive benefit to merging this patch? Can you spell that out in
> > > the changelog?
> >
> > There is 3 reasons for this:
> 
> Thanks for this.
> 
> >     1) Convert ODP to use HMM underneath so that we share code between
> >     infiniband ODP and GPU drivers. ODP do support DAX today so i can
> >     not convert ODP to HMM without also supporting DAX in HMM otherwise
> >     i would regress the ODP features.
> >
> >     2) I expect people will be running GPGPU on computer with file that
> >     use DAX and they will want to use HMM there too, in fact from user-
> >     space point of view wether the file is DAX or not should only change
> >     one thing ie for DAX file you will never be able to use GPU memory.
> >
> >     3) I want to convert as many user of GUP to HMM (already posted
> >     several patchset to GPU mailing list for that and i intend to post
> >     a v2 of those latter on). Using HMM avoids GUP and it will avoid
> >     the GUP pin as here we abide by mmu notifier hence we do not want to
> >     inhibit any of the filesystem regular operation. Some of those GPU
> >     driver do allow GUP on DAX file. So again i can not regress them.
> 
> Is this really a GUP to HMM conversion, or a GUP to mmu_notifier
> solution? It would be good to boil this conversion down to the base
> building blocks. It seems "HMM" can mean several distinct pieces of
> infrastructure. Is it possible to replace some GUP usage with an
> mmu_notifier based solution without pulling in all of HMM?

Kind of both. Some of the GPU drivers I am converting will use HMM for
more than just this GUP reason, but when and for what hardware they will
use HMM is not something I can share (it is up to each vendor to announce
their hardware and features on their own timeline).

So yes, you could do an mmu notifier solution without pulling in HMM
mirror (note that you do not need to pull in all of HMM; HMM has many
kernel config options and for this you only need HMM mirror). But if you
are not using HMM then you will just be duplicating the same code as HMM
mirror. So I believe it is better to share this code; if we want to change
core mm then we only have to update HMM while keeping the API/contract
with device drivers intact. This is one of the motivations behind HMM,
i.e. to have it as an impedance layer between mm and device drivers, so
that mm folks do not have to understand every single device driver but
only the contract HMM has with all the device drivers that use it.
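
To illustrate what "duplicating the same code" means, a driver going the
bare mmu notifier route ends up open-coding roughly the following in every
driver, plus all the range tracking that HMM mirror centralizes (sketch
only, my_* names are made up; the callback signatures are the 5.0-era
ones):

#include <linux/mmu_notifier.h>

struct my_notifier {
	struct mmu_notifier mn;
	/* per-driver range tracking, sequence counts, locks, ... */
};

/*
 * Each driver doing this on its own has to get the start/end pairing, the
 * blockable handling and the interaction with its own page table walker
 * right; this is exactly the logic HMM mirror factors out.
 */
static int my_invalidate_range_start(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *range)
{
	struct my_notifier *n = container_of(mn, struct my_notifier, mn);

	/* Mark [range->start, range->end) invalid on the device side. */
	(void)n;
	return 0;
}

static void my_invalidate_range_end(struct mmu_notifier *mn,
				    const struct mmu_notifier_range *range)
{
	/* Allow device faults/snapshots on the range again. */
}

static const struct mmu_notifier_ops my_mn_ops = {
	.invalidate_range_start	= my_invalidate_range_start,
	.invalidate_range_end	= my_invalidate_range_end,
};

static int my_notifier_init(struct my_notifier *n, struct mm_struct *mm)
{
	n->mn.ops = &my_mn_ops;
	return mmu_notifier_register(&n->mn, mm);
}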

Also, having each driver duplicate this code increases the risk of one of
them getting a little detail wrong. The hope is that by sharing the same
HMM code across all the drivers, everyone benefits from debugging the same
code (I am hoping I do not have many bugs left :))


> > > > Bottom line is you just have to worry about the CPU page table. What
> > > > ever you do there will be reflected properly. It does not add any
> > > > burden to people working on DAX. Unless you want to modify CPU page
> > > > table without calling mmu notifier but in that case you would not
> > > > only break HMM mirror user but other thing like KVM ...
> > > >
> > > >
> > > > For testing the issue is what do you want to test ? Do you want to test
> > > > that a device properly mirror some mmap of a file back by DAX ? ie
> > > > device driver which use HMM mirror keep working after changes made to
> > > > DAX.
> > > >
> > > > Or do you want to run filesystem test suite using the GPU to access
> > > > mmap of the file (read or write) instead of the CPU ? In that case any
> > > > such test suite would need to be updated to be able to use something
> > > > like OpenCL for. At this time i do not see much need for that but maybe
> > > > this is something people would like to see.
> > >
> > > In general, as HMM grows intercept points throughout the mm it would
> > > be helpful to be able to sanity check the implementation.
> >
> > I usualy use a combination of simple OpenCL programs and hand tailor direct
> > ioctl hack to force specific code path to happen. I should probably create
> > a repository with a set of OpenCL tests so that other can also use them.
> > I need to clean those up into something not too ugly so i am not ashame
> > of them.
> 
> That would be great, even it is messy.

I will clean them up and put something together that I am not too ashamed
to push :) I am on PTO for the next couple of weeks so it will probably
not happen before I am back. I should still have email access.

Cheers,
Jérôme
Dan Williams Jan. 30, 2019, 5:25 p.m. UTC | #7
On Tue, Jan 29, 2019 at 7:03 PM Jerome Glisse <jglisse@redhat.com> wrote:
[..]
> > >     1) Convert ODP to use HMM underneath so that we share code between
> > >     infiniband ODP and GPU drivers. ODP do support DAX today so i can
> > >     not convert ODP to HMM without also supporting DAX in HMM otherwise
> > >     i would regress the ODP features.
> > >
> > >     2) I expect people will be running GPGPU on computer with file that
> > >     use DAX and they will want to use HMM there too, in fact from user-
> > >     space point of view wether the file is DAX or not should only change
> > >     one thing ie for DAX file you will never be able to use GPU memory.
> > >
> > >     3) I want to convert as many user of GUP to HMM (already posted
> > >     several patchset to GPU mailing list for that and i intend to post
> > >     a v2 of those latter on). Using HMM avoids GUP and it will avoid
> > >     the GUP pin as here we abide by mmu notifier hence we do not want to
> > >     inhibit any of the filesystem regular operation. Some of those GPU
> > >     driver do allow GUP on DAX file. So again i can not regress them.
> >
> > Is this really a GUP to HMM conversion, or a GUP to mmu_notifier
> > solution? It would be good to boil this conversion down to the base
> > building blocks. It seems "HMM" can mean several distinct pieces of
> > infrastructure. Is it possible to replace some GUP usage with an
> > mmu_notifier based solution without pulling in all of HMM?
>
> Kind of both, some of the GPU driver i am converting will use HMM for
> more than just this GUP reason. But when and for what hardware they
> will use HMM for is not something i can share (it is up to each vendor
> to announce their hardware and feature on their own timeline).

Typically a spec document precedes a specific hardware announcement, and
Linux enabling is gated on public spec availability.

> So yes you could do the mmu notifier solution without pulling HMM
> mirror (note that you do not need to pull all of HMM, HMM as many
> kernel config option and for this you only need to use HMM mirror).
> But if you are not using HMM then you will just be duplicating the
> same code as HMM mirror. So i believe it is better to share this
> code and if we want to change core mm then we only have to update
> HMM while keeping the API/contract with device driver intact.

No. Linux should not end up with the HMM-mm as distinct from the
Core-mm. For long term maintainability of Linux, the target should be
one mm.

> This
> is one of the motivation behind HMM ie have it as an impedence layer
> between mm and device drivers so that mm folks do not have to under-
> stand every single device driver but only have to understand the
> contract HMM has with all device driver that uses it.

This gets to heart of my critique of the approach taken with HMM. The
above statement is antithetical to
Documentation/process/stable-api-nonsense.rst. If HMM is trying to set
expectations that device-driver projects can write to a "stable" HMM
api then HMM is setting those device-drivers up for failure.

The possibility of refactoring driver code *across* vendors is a core
tenet of Linux maintainability. If the refactoring requires the API
exported to drivers to change then so be it. The expectation that all
out-of-tree device-drivers should have is that the API they are using
in kernel version X may not be there in version X+1. Having the driver
upstream is the only surefire insurance against that thrash.

HMM seems a bold experiment in trying to violate Linux development norms.

> Also having each driver duplicating this code increase the risk of
> one getting a little detail wrong. The hope is that sharing same
> HMM code with all the driver then everyone benefit from debugging
> the same code (i am hopping i do not have many bugs left :))

"each driver duplicating code" begs for refactoring driver code to
common code and this refactoring is hindered if it must adhere to an
"HMM" api.
Jerome Glisse Jan. 30, 2019, 6:36 p.m. UTC | #8
On Wed, Jan 30, 2019 at 09:25:21AM -0800, Dan Williams wrote:
> On Tue, Jan 29, 2019 at 7:03 PM Jerome Glisse <jglisse@redhat.com> wrote:
> [..]
> > > >     1) Convert ODP to use HMM underneath so that we share code between
> > > >     infiniband ODP and GPU drivers. ODP do support DAX today so i can
> > > >     not convert ODP to HMM without also supporting DAX in HMM otherwise
> > > >     i would regress the ODP features.
> > > >
> > > >     2) I expect people will be running GPGPU on computer with file that
> > > >     use DAX and they will want to use HMM there too, in fact from user-
> > > >     space point of view wether the file is DAX or not should only change
> > > >     one thing ie for DAX file you will never be able to use GPU memory.
> > > >
> > > >     3) I want to convert as many user of GUP to HMM (already posted
> > > >     several patchset to GPU mailing list for that and i intend to post
> > > >     a v2 of those latter on). Using HMM avoids GUP and it will avoid
> > > >     the GUP pin as here we abide by mmu notifier hence we do not want to
> > > >     inhibit any of the filesystem regular operation. Some of those GPU
> > > >     driver do allow GUP on DAX file. So again i can not regress them.
> > >
> > > Is this really a GUP to HMM conversion, or a GUP to mmu_notifier
> > > solution? It would be good to boil this conversion down to the base
> > > building blocks. It seems "HMM" can mean several distinct pieces of
> > > infrastructure. Is it possible to replace some GUP usage with an
> > > mmu_notifier based solution without pulling in all of HMM?
> >
> > Kind of both, some of the GPU driver i am converting will use HMM for
> > more than just this GUP reason. But when and for what hardware they
> > will use HMM for is not something i can share (it is up to each vendor
> > to announce their hardware and feature on their own timeline).
> 
> Typically a spec document precedes specific hardware announcement and
> Linux enabling is gated on public spec availability.
> 
> > So yes you could do the mmu notifier solution without pulling HMM
> > mirror (note that you do not need to pull all of HMM, HMM as many
> > kernel config option and for this you only need to use HMM mirror).
> > But if you are not using HMM then you will just be duplicating the
> > same code as HMM mirror. So i believe it is better to share this
> > code and if we want to change core mm then we only have to update
> > HMM while keeping the API/contract with device driver intact.
> 
> No. Linux should not end up with the HMM-mm as distinct from the
> Core-mm. For long term maintainability of Linux, the target should be
> one mm.

Huh? I do not follow here. Maybe I am being unclear and we are talking
past each other.

> 
> > This
> > is one of the motivation behind HMM ie have it as an impedence layer
> > between mm and device drivers so that mm folks do not have to under-
> > stand every single device driver but only have to understand the
> > contract HMM has with all device driver that uses it.
> 
> This gets to heart of my critique of the approach taken with HMM. The
> above statement is antithetical to
> Documentation/process/stable-api-nonsense.rst. If HMM is trying to set
> expectations that device-driver projects can write to a "stable" HMM
> api then HMM is setting those device-drivers up for failure.

So I am not expressing myself correctly. If someone wants to change mm in
any way that would affect HMM users, they can, and it is welcome too
(assuming those changes are wanted by the community and motivated by good
reasons). By "understanding the HMM contract and preserving it" what I
mean is that all you have to do is update the HMM API in any way that
delivers the same result to the device driver, instead of having to
understand each device driver. For instance, HMM provides X so that a
driver can do Y; the question then is what Z can replace X while still
allowing the driver to do Y. The point here is that HMM defines what Y is
and provides X for the current kernel mm code. If X ever needs to change
so that core mm can evolve, then you can, and are more than welcome to, do
it. With HMM, Y is defined and you only need to figure out how to achieve
the same end result for the device driver.

The point is that you do not have to go read each device driver to figure
out Y.driver_foo, Y.driver_bar, ...; you only have HMM, which defines what
Y means, i.e. what the device drivers are trying to do.

Obviously here I assume that we do not want to regress features, i.e. we
want to keep device driver features intact when we modify anything.

> 
> The possibility of refactoring driver code *across* vendors is a core
> tenet of Linux maintainability. If the refactoring requires the API
> exported to drivers to change then so be it. The expectation that all
> out-of-tree device-drivers should have is that the API they are using
> in kernel version X may not be there in version X+1. Having the driver
> upstream is the only surefire insurance against that thrash.
> 
> HMM seems a bold experiment in trying to violate Linux development norms.

We are definitely talking past each other. HMM is _not_ a stable API or
any kind of contract with anybody outside upstream. If you want to change
the API HMM exposes to device drivers you are free to do so, provided the
change still allows the device drivers to achieve their objective.

HMM is not here to hinder that; quite the opposite, it is here to help,
by letting people who want to evolve mm avoid having to understand every
single device driver.


> 
> > Also having each driver duplicating this code increase the risk of
> > one getting a little detail wrong. The hope is that sharing same
> > HMM code with all the driver then everyone benefit from debugging
> > the same code (i am hopping i do not have many bugs left :))
> 
> "each driver duplicating code" begs for refactoring driver code to
> common code and this refactoring is hindered if it must adhere to an
> "HMM" api.

Again, the HMM API can evolve, and I am happy to help with any such
change, provided it benefits either mm or the device drivers (changing the
HMM API just for the sake of changing it would not make much sense to me).

So if, after converting drivers A, B and C, we see that it would be nicer
to change HMM in some way, I will definitely do that, and this patchset is
a testimony to that: converting ODP to use HMM is easier after this
patchset, and this patchset changes the HMM API. I will be updating the
nouveau driver to the new API and using the new API for the other driver
patchsets I am working on.

If I again bump into something that would be better done differently, I
will definitely change the HMM API and update all upstream drivers
accordingly.

I am a strong believer in full freedom for internal kernel API changes,
and my intention has always been to help and facilitate that process. I am
sorry this was unclear to anybody :( and I hope this email makes my
intention clear.

Cheers,
Jérôme
Dan Williams Jan. 31, 2019, 3:28 a.m. UTC | #9
On Wed, Jan 30, 2019 at 10:36 AM Jerome Glisse <jglisse@redhat.com> wrote:
[..]
> > > This
> > > is one of the motivation behind HMM ie have it as an impedence layer
> > > between mm and device drivers so that mm folks do not have to under-
> > > stand every single device driver but only have to understand the
> > > contract HMM has with all device driver that uses it.
> >
> > This gets to heart of my critique of the approach taken with HMM. The
> > above statement is antithetical to
> > Documentation/process/stable-api-nonsense.rst. If HMM is trying to set
> > expectations that device-driver projects can write to a "stable" HMM
> > api then HMM is setting those device-drivers up for failure.
>
> So i am not expressing myself correctly. If someone want to change mm
> in anyway that would affect HMM user, it can and it is welcome too
> (assuming that those change are wanted by the community and motivated
> for good reasons). Here by understanding HMM contract and preserving it
> what i mean is that all you have to do is update the HMM API in anyway
> that deliver the same result to the device driver. So what i means is
> that instead of having to understand each device driver. For instance
> you have HMM provide X so that driver can do Y; then what can be Z a
> replacement for X that allow driver to do Y. The point here is that
> HMM define what Y is and provide X for current kernel mm code. If X
> ever need to change so that core mm can evolve than you can and are
> more than welcome to do it. With HMM Y is defined and you only need to
> figure out how to achieve the same end result for the device driver.
>
> The point is that you do not have to go read each device driver to
> figure out Y.driver_foo, Y.driver_bar, ... you only have HMM that
> define what Y means and is ie this what device driver are trying to
> do.
>
> Obviously here i assume that we do not want to regress features ie
> we want to keep device driver features intact when we modify anything.

The specific concern is HMM attempting to expand the regression
boundary beyond drivers that exist in the kernel. The regression
contract that has priority is the one established for in-tree users.
If an in-tree change to mm semantics is fine for in-tree mm users, but
breaks out-of-tree users, the question to those out-of-tree users is
"why isn't your use case upstream?". HMM is not that use case in and
of itself.

[..]
> Again HMM API can evolve, i am happy to help with any such change, given
> it provides benefit to either mm or device driver (ie changing the HMM
> just for the sake of changing the HMM API would not make much sense to
> me).
>
> So if after converting driver A, B and C we see that it would be nicer
> to change HMM in someway then i will definitly do that and this patchset
> is a testimony of that. Converting ODP to use HMM is easier after this
> patchset and this patchset changes the HMM API. I will be updating the
> nouveau driver to the new API and use the new API for the other driver
> patchset i am working on.
>
> If i bump again into something that would be better done any differently
> i will definitly change the HMM API and update all upstream driver
> accordingly.
>
> I am a strong believer in full freedom for internal kernel API changes
> and my intention have always been to help and facilitate such process.
> I am sorry this was unclear to any body :( and i am hopping that this
> email make my intention clear.''

A simple way to ensure that out-of-tree consumers don't come beat us
up over a backwards-incompatible HMM change is to mark all the exports
with _GPL. I'm not requiring that (the devm_memremap_pages() fight was
hard enough), but the pace of new exports versus the arrival of consumers
for those exports has me worried that this arrangement will fall over at
some point.

Another way to help allay these worries is to commit to no new exports
without in-tree users. In general, that should go without saying for
any core changes for new or future hardware.
Jerome Glisse Jan. 31, 2019, 4:16 a.m. UTC | #10
On Wed, Jan 30, 2019 at 07:28:12PM -0800, Dan Williams wrote:
> On Wed, Jan 30, 2019 at 10:36 AM Jerome Glisse <jglisse@redhat.com> wrote:
> [..]
> > > > This
> > > > is one of the motivation behind HMM ie have it as an impedence layer
> > > > between mm and device drivers so that mm folks do not have to under-
> > > > stand every single device driver but only have to understand the
> > > > contract HMM has with all device driver that uses it.
> > >
> > > This gets to heart of my critique of the approach taken with HMM. The
> > > above statement is antithetical to
> > > Documentation/process/stable-api-nonsense.rst. If HMM is trying to set
> > > expectations that device-driver projects can write to a "stable" HMM
> > > api then HMM is setting those device-drivers up for failure.
> >
> > So i am not expressing myself correctly. If someone want to change mm
> > in anyway that would affect HMM user, it can and it is welcome too
> > (assuming that those change are wanted by the community and motivated
> > for good reasons). Here by understanding HMM contract and preserving it
> > what i mean is that all you have to do is update the HMM API in anyway
> > that deliver the same result to the device driver. So what i means is
> > that instead of having to understand each device driver. For instance
> > you have HMM provide X so that driver can do Y; then what can be Z a
> > replacement for X that allow driver to do Y. The point here is that
> > HMM define what Y is and provide X for current kernel mm code. If X
> > ever need to change so that core mm can evolve than you can and are
> > more than welcome to do it. With HMM Y is defined and you only need to
> > figure out how to achieve the same end result for the device driver.
> >
> > The point is that you do not have to go read each device driver to
> > figure out Y.driver_foo, Y.driver_bar, ... you only have HMM that
> > define what Y means and is ie this what device driver are trying to
> > do.
> >
> > Obviously here i assume that we do not want to regress features ie
> > we want to keep device driver features intact when we modify anything.
> 
> The specific concern is HMM attempting to expand the regression
> boundary beyond drivers that exist in the kernel. The regression
> contract that has priority is the one established for in-tree users.
> If an in-tree change to mm semantics is fine for in-tree mm users, but
> breaks out of tree users the question to those out of tree users is
> "why isn't your use case upstream?". HMM is not that use case in and
> of itself.

I do not worry about out-of-tree users and we should not worry about
them. I care only about upstream drivers (AMD, Intel, NVidia) and I
will not do an HMM feature if I do not intend to use it in at least
one of those upstream drivers. Yes, I have worked with NVidia on the
design, simply because they are the market leader in GPU compute and
have talented engineers who know a little about what would work
well. Not working with them to get their input on the design just
because their driver is closed source seems radical to me. I believe I
benefited from their valuable input. But in the end my aim is, and
has always been, to make the upstream kernel drivers as good as
possible. I will talk with anyone who can help in achieving that
objective.

So do not worry about non-upstream drivers.


> [..]
> > Again HMM API can evolve, i am happy to help with any such change, given
> > it provides benefit to either mm or device driver (ie changing the HMM
> > just for the sake of changing the HMM API would not make much sense to
> > me).
> >
> > So if after converting driver A, B and C we see that it would be nicer
> > to change HMM in someway then i will definitly do that and this patchset
> > is a testimony of that. Converting ODP to use HMM is easier after this
> > patchset and this patchset changes the HMM API. I will be updating the
> > nouveau driver to the new API and use the new API for the other driver
> > patchset i am working on.
> >
> > If i bump again into something that would be better done any differently
> > i will definitly change the HMM API and update all upstream driver
> > accordingly.
> >
> > I am a strong believer in full freedom for internal kernel API changes
> > and my intention have always been to help and facilitate such process.
> > I am sorry this was unclear to any body :( and i am hopping that this
> > email make my intention clear.''
> 
> A simple way to ensure that out-of-tree consumers don't come beat us
> up over a backwards incompatible HMM change is to mark all the exports
> with _GPL. I'm not requiring that, the devm_memremap_pages() fight was
> hard enough, but the pace of new exports vs arrival of consumers for
> those exports has me worried that this arrangement will fall over at
> some point.

I was reluctant about the devm_memremap_pages() GPL changes because I
think we should not change a symbol's export type after an initial choice
has been made.

I don't think a GPL or non-GPL export changes one bit with respect to
out-of-tree users. They know they cannot make any legitimate regression
claim, nor should we care. So I fail to see how a GPL export would make
it any different.

> Another way to help allay these worries is commit to no new exports
> without in-tree users. In general, that should go without saying for
> any core changes for new or future hardware.

I always intend to have an upstream user; the issue is that the device
driver trees and the mm tree move at a different pace and there is always
a chicken-and-egg problem. I do not think Andrew wants to have to
merge driver patches through his tree, nor does Linus want to have to merge
the drivers and mm trees in a specific order. So it is easier to introduce
the mm change in one release and the driver change in the next. This is what
I am doing with ODP: adding the necessary bits in 5.1 and working with
Mellanox to have the ODP HMM patch fully tested and ready to go in
5.2 (the patch is available today and Mellanox has begun testing it
AFAIK). So this is the guideline I will be following: post mm bits
with driver patches, push to merge the mm bits in one release and have the
driver bits in the next. I do hope this sounds fine to everyone.

It is also easier for the driver folks, as they do not need to
have a special tree just to test my changes. They can integrate it
into their regular workflow, i.e. merge the new kernel release into their
tree and then start piling up changes to their driver for the next
kernel release.

Cheers,
Jérôme
Dan Williams Jan. 31, 2019, 5:44 a.m. UTC | #11
On Wed, Jan 30, 2019 at 8:17 PM Jerome Glisse <jglisse@redhat.com> wrote:
> On Wed, Jan 30, 2019 at 07:28:12PM -0800, Dan Williams wrote:
[..]
> > > Again HMM API can evolve, i am happy to help with any such change, given
> > > it provides benefit to either mm or device driver (ie changing the HMM
> > > just for the sake of changing the HMM API would not make much sense to
> > > me).
> > >
> > > So if after converting driver A, B and C we see that it would be nicer
> > > to change HMM in someway then i will definitly do that and this patchset
> > > is a testimony of that. Converting ODP to use HMM is easier after this
> > > patchset and this patchset changes the HMM API. I will be updating the
> > > nouveau driver to the new API and use the new API for the other driver
> > > patchset i am working on.
> > >
> > > If i bump again into something that would be better done any differently
> > > i will definitly change the HMM API and update all upstream driver
> > > accordingly.
> > >
> > > I am a strong believer in full freedom for internal kernel API changes
> > > and my intention have always been to help and facilitate such process.
> > > I am sorry this was unclear to any body :( and i am hopping that this
> > > email make my intention clear.''
> >
> > A simple way to ensure that out-of-tree consumers don't come beat us
> > up over a backwards incompatible HMM change is to mark all the exports
> > with _GPL. I'm not requiring that, the devm_memremap_pages() fight was
> > hard enough, but the pace of new exports vs arrival of consumers for
> > those exports has me worried that this arrangement will fall over at
> > some point.
>
> I was reluctant with the devm_memremap_pages() GPL changes because i
> think we should not change symbol export after an initial choice have
> been made on those.
>
> I don't think GPL or non GPL export change one bit in respect to out
> of tree user. They know they can not make any legitimate regression
> claim, nor should we care. So i fail to see how GPL export would make
> it any different.

It does matter. It's a perennial fight. For a recent example see the
discussion around "x86/fpu: Don't export __kernel_fpu_{begin,end}()".
If you're not sure you can keep an API trivially stable it should have
a GPL export to minimize the exposure surface of out-of-tree users
that might grow attached to it.

>
> > Another way to help allay these worries is commit to no new exports
> > without in-tree users. In general, that should go without saying for
> > any core changes for new or future hardware.
>
> I always intend to have an upstream user the issue is that the device
> driver tree and the mm tree move a different pace and there is always
> a chicken and egg problem. I do not think Andrew wants to have to
> merge driver patches through its tree, nor Linus want to have to merge
> drivers and mm trees in specific order. So it is easier to introduce
> mm change in one release and driver change in the next. This is what
> i am doing with ODP. Adding things necessary in 5.1 and working with
> Mellanox to have the ODP HMM patch fully tested and ready to go in
> 5.2 (the patch is available today and Mellanox have begin testing it
> AFAIK). So this is the guideline i will be following. Post mm bits
> with driver patches, push to merge mm bits one release and have the
> driver bits in the next. I do hope this sound fine to everyone.

The track record to date has not been "merge the HMM patch in one release
and merge the driver updates the next". If that is the plan going
forward, that's great; I do appreciate that this set came with
driver changes, and I maintain hope that the existing exports don't go
user-less for too much longer.

> It is also easier for the driver folks as then they do not need to
> have a special tree just to test my changes. They can integrate it
> in their regular workflow ie merge the new kernel release in their
> tree and then start pilling up changes to their driver for the next
> kernel release.

Everyone agrees that coordinating cross-tree updates is hard, but it's
manageable. HMM, as far as I can see, is taking an unprecedented approach
of merging core infrastructure early.
Andrew Morton March 5, 2019, 10:16 p.m. UTC | #12
On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:

> >
> > > Another way to help allay these worries is commit to no new exports
> > > without in-tree users. In general, that should go without saying for
> > > any core changes for new or future hardware.
> >
> > I always intend to have an upstream user the issue is that the device
> > driver tree and the mm tree move a different pace and there is always
> > a chicken and egg problem. I do not think Andrew wants to have to
> > merge driver patches through its tree, nor Linus want to have to merge
> > drivers and mm trees in specific order. So it is easier to introduce
> > mm change in one release and driver change in the next. This is what
> > i am doing with ODP. Adding things necessary in 5.1 and working with
> > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > 5.2 (the patch is available today and Mellanox have begin testing it
> > AFAIK). So this is the guideline i will be following. Post mm bits
> > with driver patches, push to merge mm bits one release and have the
> > driver bits in the next. I do hope this sound fine to everyone.
> 
> The track record to date has not been "merge HMM patch in one release
> and merge the driver updates the next". If that is the plan going
> forward that's great, and I do appreciate that this set came with
> driver changes, and maintain hope the existing exports don't go
> user-less for too much longer.

Decision time.  Jerome, how are things looking for getting these driver
changes merged in the next cycle?

Dan, what's your overall take on this series for a 5.1-rc1 merge?

Jerome, what would be the risks in skipping just this [09/10] patch?
Dan Williams March 6, 2019, 4:20 a.m. UTC | #13
On Tue, Mar 5, 2019 at 2:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
>
> > >
> > > > Another way to help allay these worries is commit to no new exports
> > > > without in-tree users. In general, that should go without saying for
> > > > any core changes for new or future hardware.
> > >
> > > I always intend to have an upstream user the issue is that the device
> > > driver tree and the mm tree move a different pace and there is always
> > > a chicken and egg problem. I do not think Andrew wants to have to
> > > merge driver patches through its tree, nor Linus want to have to merge
> > > drivers and mm trees in specific order. So it is easier to introduce
> > > mm change in one release and driver change in the next. This is what
> > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > with driver patches, push to merge mm bits one release and have the
> > > driver bits in the next. I do hope this sound fine to everyone.
> >
> > The track record to date has not been "merge HMM patch in one release
> > and merge the driver updates the next". If that is the plan going
> > forward that's great, and I do appreciate that this set came with
> > driver changes, and maintain hope the existing exports don't go
> > user-less for too much longer.
>
> Decision time.  Jerome, how are things looking for getting these driver
> changes merged in the next cycle?
>
> Dan, what's your overall take on this series for a 5.1-rc1 merge?

My hesitation would be drastically reduced if there was a plan to
avoid dangling unconsumed symbols and functionality. Specifically one
or more of the following suggestions:

* EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
surface for out-of-tree consumers to come grumble at us when we
continue to refactor the kernel as we are wont to do.

* A commitment to consume newly exported symbols in the same merge
window, or the following merge window. When that goal is missed revert
the functionality until such time that it can be consumed, or
otherwise abandoned.

* No new symbol exports and functionality while existing symbols go unconsumed.

These are the minimum requirements I would expect my work, or any
core-mm work for that matter, to be held to; I see no reason why HMM
could not meet the same.

On this specific patch I would ask that the changelog incorporate the
motivation that was teased out of our follow-on discussion, rather than
"There is no reason not to support that case.", which isn't a justification.
Jerome Glisse March 6, 2019, 3:49 p.m. UTC | #14
On Tue, Mar 05, 2019 at 02:16:35PM -0800, Andrew Morton wrote:
> On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > >
> > > > Another way to help allay these worries is commit to no new exports
> > > > without in-tree users. In general, that should go without saying for
> > > > any core changes for new or future hardware.
> > >
> > > I always intend to have an upstream user the issue is that the device
> > > driver tree and the mm tree move a different pace and there is always
> > > a chicken and egg problem. I do not think Andrew wants to have to
> > > merge driver patches through its tree, nor Linus want to have to merge
> > > drivers and mm trees in specific order. So it is easier to introduce
> > > mm change in one release and driver change in the next. This is what
> > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > with driver patches, push to merge mm bits one release and have the
> > > driver bits in the next. I do hope this sound fine to everyone.
> > 
> > The track record to date has not been "merge HMM patch in one release
> > and merge the driver updates the next". If that is the plan going
> > forward that's great, and I do appreciate that this set came with
> > driver changes, and maintain hope the existing exports don't go
> > user-less for too much longer.
> 
> Decision time.  Jerome, how are things looking for getting these driver
> changes merged in the next cycle?

nouveau is merged already.

> 
> Dan, what's your overall take on this series for a 5.1-rc1 merge?
> 
> Jerome, what would be the risks in skipping just this [09/10] patch?

As nouveau is a new user it does not regress anything, but for the RDMA
mlx5 driver (which I expect to merge next window) it would be a
regression.

Cheers,
Jérôme
Jerome Glisse March 6, 2019, 3:51 p.m. UTC | #15
On Tue, Mar 05, 2019 at 08:20:10PM -0800, Dan Williams wrote:
> On Tue, Mar 5, 2019 at 2:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > > >
> > > > > Another way to help allay these worries is commit to no new exports
> > > > > without in-tree users. In general, that should go without saying for
> > > > > any core changes for new or future hardware.
> > > >
> > > > I always intend to have an upstream user the issue is that the device
> > > > driver tree and the mm tree move a different pace and there is always
> > > > a chicken and egg problem. I do not think Andrew wants to have to
> > > > merge driver patches through its tree, nor Linus want to have to merge
> > > > drivers and mm trees in specific order. So it is easier to introduce
> > > > mm change in one release and driver change in the next. This is what
> > > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > > with driver patches, push to merge mm bits one release and have the
> > > > driver bits in the next. I do hope this sound fine to everyone.
> > >
> > > The track record to date has not been "merge HMM patch in one release
> > > and merge the driver updates the next". If that is the plan going
> > > forward that's great, and I do appreciate that this set came with
> > > driver changes, and maintain hope the existing exports don't go
> > > user-less for too much longer.
> >
> > Decision time.  Jerome, how are things looking for getting these driver
> > changes merged in the next cycle?
> >
> > Dan, what's your overall take on this series for a 5.1-rc1 merge?
> 
> My hesitation would be drastically reduced if there was a plan to
> avoid dangling unconsumed symbols and functionality. Specifically one
> or more of the following suggestions:
> 
> * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> surface for out-of-tree consumers to come grumble at us when we
> continue to refactor the kernel as we are wont to do.
> 
> * A commitment to consume newly exported symbols in the same merge
> window, or the following merge window. When that goal is missed revert
> the functionality until such time that it can be consumed, or
> otherwise abandoned.
> 
> * No new symbol exports and functionality while existing symbols go unconsumed.
> 
> These are the minimum requirements I would expect my work, or any
> core-mm work for that matter, to be held to, I see no reason why HMM
> could not meet the same.

nouveau uses all of this, and other driver patchsets have been posted
that also use this API.

> On this specific patch I would ask that the changelog incorporate the
> motivation that was teased out of our follow-on discussion, not "There
> is no reason not to support that case." which isn't a justification.

mlx5 wants to use HMM; without DAX support it would regress mlx5. Other
drivers like nouveau also want to access DAX filesystems. So yes, there is
no reason not to support DAX filesystems. Why do you not want DAX with
mirroring? Do you want to cripple HMM? Why?

Cheers,
Jérôme
Dan Williams March 6, 2019, 3:57 p.m. UTC | #16
On Wed, Mar 6, 2019 at 7:51 AM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Tue, Mar 05, 2019 at 08:20:10PM -0800, Dan Williams wrote:
> > On Tue, Mar 5, 2019 at 2:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > >
> > > On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > >
> > > > > > Another way to help allay these worries is commit to no new exports
> > > > > > without in-tree users. In general, that should go without saying for
> > > > > > any core changes for new or future hardware.
> > > > >
> > > > > I always intend to have an upstream user the issue is that the device
> > > > > driver tree and the mm tree move a different pace and there is always
> > > > > a chicken and egg problem. I do not think Andrew wants to have to
> > > > > merge driver patches through its tree, nor Linus want to have to merge
> > > > > drivers and mm trees in specific order. So it is easier to introduce
> > > > > mm change in one release and driver change in the next. This is what
> > > > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > > > with driver patches, push to merge mm bits one release and have the
> > > > > driver bits in the next. I do hope this sound fine to everyone.
> > > >
> > > > The track record to date has not been "merge HMM patch in one release
> > > > and merge the driver updates the next". If that is the plan going
> > > > forward that's great, and I do appreciate that this set came with
> > > > driver changes, and maintain hope the existing exports don't go
> > > > user-less for too much longer.
> > >
> > > Decision time.  Jerome, how are things looking for getting these driver
> > > changes merged in the next cycle?
> > >
> > > Dan, what's your overall take on this series for a 5.1-rc1 merge?
> >
> > My hesitation would be drastically reduced if there was a plan to
> > avoid dangling unconsumed symbols and functionality. Specifically one
> > or more of the following suggestions:
> >
> > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > surface for out-of-tree consumers to come grumble at us when we
> > continue to refactor the kernel as we are wont to do.
> >
> > * A commitment to consume newly exported symbols in the same merge
> > window, or the following merge window. When that goal is missed revert
> > the functionality until such time that it can be consumed, or
> > otherwise abandoned.
> >
> > * No new symbol exports and functionality while existing symbols go unconsumed.
> >
> > These are the minimum requirements I would expect my work, or any
> > core-mm work for that matter, to be held to, I see no reason why HMM
> > could not meet the same.
>
> nouveau use all of this and other driver patchset have been posted to
> also use this API.
>
> > On this specific patch I would ask that the changelog incorporate the
> > motivation that was teased out of our follow-on discussion, not "There
> > is no reason not to support that case." which isn't a justification.
>
> mlx5 wants to use HMM without DAX support it would regress mlx5. Other
> driver like nouveau also want to access DAX filesystem. So yes there is
> no reason not to support DAX filesystem. Why do you not want DAX with
> mirroring ? You want to cripple HMM ? Why ?

There is a misunderstanding... my request for this patch was to update
the changelog to describe the merits of DAX mirroring, replacing "There
is no reason not to support that case." Otherwise someone reading this
changelog in a year will wonder what the motivation was.
Jerome Glisse March 6, 2019, 4:03 p.m. UTC | #17
On Wed, Mar 06, 2019 at 07:57:30AM -0800, Dan Williams wrote:
> On Wed, Mar 6, 2019 at 7:51 AM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Tue, Mar 05, 2019 at 08:20:10PM -0800, Dan Williams wrote:
> > > On Tue, Mar 5, 2019 at 2:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > > >
> > > > On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > > >
> > > > > >
> > > > > > > Another way to help allay these worries is commit to no new exports
> > > > > > > without in-tree users. In general, that should go without saying for
> > > > > > > any core changes for new or future hardware.
> > > > > >
> > > > > > I always intend to have an upstream user the issue is that the device
> > > > > > driver tree and the mm tree move a different pace and there is always
> > > > > > a chicken and egg problem. I do not think Andrew wants to have to
> > > > > > merge driver patches through its tree, nor Linus want to have to merge
> > > > > > drivers and mm trees in specific order. So it is easier to introduce
> > > > > > mm change in one release and driver change in the next. This is what
> > > > > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > > > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > > > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > > > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > > > > with driver patches, push to merge mm bits one release and have the
> > > > > > driver bits in the next. I do hope this sound fine to everyone.
> > > > >
> > > > > The track record to date has not been "merge HMM patch in one release
> > > > > and merge the driver updates the next". If that is the plan going
> > > > > forward that's great, and I do appreciate that this set came with
> > > > > driver changes, and maintain hope the existing exports don't go
> > > > > user-less for too much longer.
> > > >
> > > > Decision time.  Jerome, how are things looking for getting these driver
> > > > changes merged in the next cycle?
> > > >
> > > > Dan, what's your overall take on this series for a 5.1-rc1 merge?
> > >
> > > My hesitation would be drastically reduced if there was a plan to
> > > avoid dangling unconsumed symbols and functionality. Specifically one
> > > or more of the following suggestions:
> > >
> > > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > > surface for out-of-tree consumers to come grumble at us when we
> > > continue to refactor the kernel as we are wont to do.
> > >
> > > * A commitment to consume newly exported symbols in the same merge
> > > window, or the following merge window. When that goal is missed revert
> > > the functionality until such time that it can be consumed, or
> > > otherwise abandoned.
> > >
> > > * No new symbol exports and functionality while existing symbols go unconsumed.
> > >
> > > These are the minimum requirements I would expect my work, or any
> > > core-mm work for that matter, to be held to, I see no reason why HMM
> > > could not meet the same.
> >
> > nouveau use all of this and other driver patchset have been posted to
> > also use this API.
> >
> > > On this specific patch I would ask that the changelog incorporate the
> > > motivation that was teased out of our follow-on discussion, not "There
> > > is no reason not to support that case." which isn't a justification.
> >
> > mlx5 wants to use HMM without DAX support it would regress mlx5. Other
> > driver like nouveau also want to access DAX filesystem. So yes there is
> > no reason not to support DAX filesystem. Why do you not want DAX with
> > mirroring ? You want to cripple HMM ? Why ?
> 
> There is a misunderstanding... my request for this patch was to update
> the changelog to describe the merits of DAX mirroring to replace the
> "There is no reason not to support that case." Otherwise someone
> reading this changelog in a year will wonder what the motivation was.

So what about:

HMM mirroring allows a device to mirror a process address space onto the
device; there is no reason for that mirroring not to work if the virtual
addresses are the result of an mmap of a file on a DAX-enabled
filesystem.
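
As a concrete illustration of the case that changelog describes, here is a
minimal userspace sketch (the path, file name and sizes are hypothetical;
the only assumption is a file on an ext4 or xfs filesystem mounted with
-o dax) that produces the kind of virtual address range an HMM-mirroring
device driver would then be asked to mirror:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        /* Hypothetical file living on a filesystem mounted with -o dax. */
        int fd = open("/mnt/pmem/data.bin", O_RDWR);
        if (fd < 0) {
            perror("open");
            return EXIT_FAILURE;
        }

        /* Map 2MiB of the file. With DAX these virtual addresses are
         * backed by device (ZONE_DEVICE) pages rather than page-cache
         * pages, which is the case this patch teaches the HMM mirror
         * page-table walk to handle.
         */
        size_t len = 2UL << 20;
        void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) {
            perror("mmap");
            return EXIT_FAILURE;
        }

        /* A device driver using HMM mirroring would now mirror
         * [addr, addr + len) like any other part of the process
         * address space.
         */
        printf("DAX-backed mapping at %p\n", addr);

        munmap(addr, len);
        close(fd);
        return EXIT_SUCCESS;
    }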
Dan Williams March 6, 2019, 4:06 p.m. UTC | #18
On Wed, Mar 6, 2019 at 8:03 AM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Wed, Mar 06, 2019 at 07:57:30AM -0800, Dan Williams wrote:
> > On Wed, Mar 6, 2019 at 7:51 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > >
> > > On Tue, Mar 05, 2019 at 08:20:10PM -0800, Dan Williams wrote:
> > > > On Tue, Mar 5, 2019 at 2:16 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> > > > >
> > > > > On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > > > >
> > > > > > >
> > > > > > > > Another way to help allay these worries is commit to no new exports
> > > > > > > > without in-tree users. In general, that should go without saying for
> > > > > > > > any core changes for new or future hardware.
> > > > > > >
> > > > > > > I always intend to have an upstream user the issue is that the device
> > > > > > > driver tree and the mm tree move a different pace and there is always
> > > > > > > a chicken and egg problem. I do not think Andrew wants to have to
> > > > > > > merge driver patches through its tree, nor Linus want to have to merge
> > > > > > > drivers and mm trees in specific order. So it is easier to introduce
> > > > > > > mm change in one release and driver change in the next. This is what
> > > > > > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > > > > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > > > > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > > > > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > > > > > with driver patches, push to merge mm bits one release and have the
> > > > > > > driver bits in the next. I do hope this sound fine to everyone.
> > > > > >
> > > > > > The track record to date has not been "merge HMM patch in one release
> > > > > > and merge the driver updates the next". If that is the plan going
> > > > > > forward that's great, and I do appreciate that this set came with
> > > > > > driver changes, and maintain hope the existing exports don't go
> > > > > > user-less for too much longer.
> > > > >
> > > > > Decision time.  Jerome, how are things looking for getting these driver
> > > > > changes merged in the next cycle?
> > > > >
> > > > > Dan, what's your overall take on this series for a 5.1-rc1 merge?
> > > >
> > > > My hesitation would be drastically reduced if there was a plan to
> > > > avoid dangling unconsumed symbols and functionality. Specifically one
> > > > or more of the following suggestions:
> > > >
> > > > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > > > surface for out-of-tree consumers to come grumble at us when we
> > > > continue to refactor the kernel as we are wont to do.
> > > >
> > > > * A commitment to consume newly exported symbols in the same merge
> > > > window, or the following merge window. When that goal is missed revert
> > > > the functionality until such time that it can be consumed, or
> > > > otherwise abandoned.
> > > >
> > > > * No new symbol exports and functionality while existing symbols go unconsumed.
> > > >
> > > > These are the minimum requirements I would expect my work, or any
> > > > core-mm work for that matter, to be held to, I see no reason why HMM
> > > > could not meet the same.
> > >
> > > nouveau use all of this and other driver patchset have been posted to
> > > also use this API.
> > >
> > > > On this specific patch I would ask that the changelog incorporate the
> > > > motivation that was teased out of our follow-on discussion, not "There
> > > > is no reason not to support that case." which isn't a justification.
> > >
> > > mlx5 wants to use HMM without DAX support it would regress mlx5. Other
> > > driver like nouveau also want to access DAX filesystem. So yes there is
> > > no reason not to support DAX filesystem. Why do you not want DAX with
> > > mirroring ? You want to cripple HMM ? Why ?
> >
> > There is a misunderstanding... my request for this patch was to update
> > the changelog to describe the merits of DAX mirroring to replace the
> > "There is no reason not to support that case." Otherwise someone
> > reading this changelog in a year will wonder what the motivation was.
>
> So what about:
>
> HMM mirroring allow device to mirror process address onto device
> there is no reason for that mirroring to not work if the virtual
> address are the result of an mmap of a file on DAX enabled file-
> system.

Looks like an improvement to me.
Andrew Morton March 6, 2019, 10:18 p.m. UTC | #19
On Wed, 6 Mar 2019 10:49:04 -0500 Jerome Glisse <jglisse@redhat.com> wrote:

> On Tue, Mar 05, 2019 at 02:16:35PM -0800, Andrew Morton wrote:
> > On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > 
> > > >
> > > > > Another way to help allay these worries is commit to no new exports
> > > > > without in-tree users. In general, that should go without saying for
> > > > > any core changes for new or future hardware.
> > > >
> > > > I always intend to have an upstream user the issue is that the device
> > > > driver tree and the mm tree move a different pace and there is always
> > > > a chicken and egg problem. I do not think Andrew wants to have to
> > > > merge driver patches through its tree, nor Linus want to have to merge
> > > > drivers and mm trees in specific order. So it is easier to introduce
> > > > mm change in one release and driver change in the next. This is what
> > > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > > with driver patches, push to merge mm bits one release and have the
> > > > driver bits in the next. I do hope this sound fine to everyone.
> > > 
> > > The track record to date has not been "merge HMM patch in one release
> > > and merge the driver updates the next". If that is the plan going
> > > forward that's great, and I do appreciate that this set came with
> > > driver changes, and maintain hope the existing exports don't go
> > > user-less for too much longer.
> > 
> > Decision time.  Jerome, how are things looking for getting these driver
> > changes merged in the next cycle?
> 
> nouveau is merge already.

Confused.  Nouveau in mainline is dependent upon "mm/hmm: allow to
mirror vma of a file on a DAX backed filesystem"?  That can't be the
case?

> > 
> > Dan, what's your overall take on this series for a 5.1-rc1 merge?
> > 
> > Jerome, what would be the risks in skipping just this [09/10] patch?
> 
> As nouveau is a new user it does not regress anything but for RDMA
> mlx5 (which i expect to merge new window) it would regress that
> driver.

Also confused.  How can omitting "mm/hmm: allow to mirror vma of a file
on a DAX backed filesystem" from 5.1-rc1 cause an mlx5 regression?
Jerome Glisse March 7, 2019, 12:36 a.m. UTC | #20
On Wed, Mar 06, 2019 at 02:18:20PM -0800, Andrew Morton wrote:
> On Wed, 6 Mar 2019 10:49:04 -0500 Jerome Glisse <jglisse@redhat.com> wrote:
> 
> > On Tue, Mar 05, 2019 at 02:16:35PM -0800, Andrew Morton wrote:
> > > On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > > 
> > > > >
> > > > > > Another way to help allay these worries is commit to no new exports
> > > > > > without in-tree users. In general, that should go without saying for
> > > > > > any core changes for new or future hardware.
> > > > >
> > > > > I always intend to have an upstream user the issue is that the device
> > > > > driver tree and the mm tree move a different pace and there is always
> > > > > a chicken and egg problem. I do not think Andrew wants to have to
> > > > > merge driver patches through its tree, nor Linus want to have to merge
> > > > > drivers and mm trees in specific order. So it is easier to introduce
> > > > > mm change in one release and driver change in the next. This is what
> > > > > i am doing with ODP. Adding things necessary in 5.1 and working with
> > > > > Mellanox to have the ODP HMM patch fully tested and ready to go in
> > > > > 5.2 (the patch is available today and Mellanox have begin testing it
> > > > > AFAIK). So this is the guideline i will be following. Post mm bits
> > > > > with driver patches, push to merge mm bits one release and have the
> > > > > driver bits in the next. I do hope this sound fine to everyone.
> > > > 
> > > > The track record to date has not been "merge HMM patch in one release
> > > > and merge the driver updates the next". If that is the plan going
> > > > forward that's great, and I do appreciate that this set came with
> > > > driver changes, and maintain hope the existing exports don't go
> > > > user-less for too much longer.
> > > 
> > > Decision time.  Jerome, how are things looking for getting these driver
> > > changes merged in the next cycle?
> > 
> > nouveau is merge already.
> 
> Confused.  Nouveau in mainline is dependent upon "mm/hmm: allow to
> mirror vma of a file on a DAX backed filesystem"?  That can't be the
> case?

Not really. HMM mirror is about mirroring an address space onto the device,
so if mirroring does not work for files that are on a filesystem that uses
DAX it fails in an unexpected way from the user's point of view. But as
nouveau is only just getting upstream you can argue that no one previously
depended on that working for file-backed pages on a DAX filesystem.

Now the ODP RDMA case is different: what is upstream today works on DAX,
so if that patch is not upstream in 5.1 then I cannot merge HMM ODP in
5.2, as it would regress, and the ODP people would not take the risk of
a regression, i.e. ODP folks want the DAX support to be upstream first.

> 
> > > 
> > > Dan, what's your overall take on this series for a 5.1-rc1 merge?
> > > 
> > > Jerome, what would be the risks in skipping just this [09/10] patch?
> > 
> > As nouveau is a new user it does not regress anything but for RDMA
> > mlx5 (which i expect to merge new window) it would regress that
> > driver.
> 
> Also confused.  How can omitting "mm/hmm: allow to mirror vma of a file
> on a DAX backed filesystem" from 5.1-rc1 cause an mlx5 regression?

Not in 5.1, but I cannot merge HMM ODP in 5.2 if that is not in 5.1.
I know this circular dependency between subsystems is painful but I
do not see any simpler way.

Cheers,
Jérôme
Andrew Morton March 7, 2019, 5:46 p.m. UTC | #21
On Tue, 5 Mar 2019 20:20:10 -0800 Dan Williams <dan.j.williams@intel.com> wrote:

> My hesitation would be drastically reduced if there was a plan to
> avoid dangling unconsumed symbols and functionality. Specifically one
> or more of the following suggestions:
> 
> * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> surface for out-of-tree consumers to come grumble at us when we
> continue to refactor the kernel as we are wont to do.

The existing patches use EXPORT_SYMBOL() so that's a sticking point.
Jerome, what would happen if we made these EXPORT_SYMBOL_GPL()?

> * A commitment to consume newly exported symbols in the same merge
> window, or the following merge window. When that goal is missed revert
> the functionality until such time that it can be consumed, or
> otherwise abandoned.

It sounds like we can tick this box.

> * No new symbol exports and functionality while existing symbols go unconsumed.

Unsure about this one?
Jerome Glisse March 7, 2019, 6:56 p.m. UTC | #22
On Thu, Mar 07, 2019 at 09:46:54AM -0800, Andrew Morton wrote:
> On Tue, 5 Mar 2019 20:20:10 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > My hesitation would be drastically reduced if there was a plan to
> > avoid dangling unconsumed symbols and functionality. Specifically one
> > or more of the following suggestions:
> > 
> > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > surface for out-of-tree consumers to come grumble at us when we
> > continue to refactor the kernel as we are wont to do.
> 
> The existing patches use EXPORT_SYMBOL() so that's a sticking point. 
> Jerome, what would happen is we made these EXPORT_SYMBOL_GPL()?

So Dan argues that a GPL export solves the problem of out-of-tree users,
and my personal experience is that it does not. The GPU subsystem has tons
of GPL drivers that are not upstream and we never felt that we were bound
to support them in any way. We were always very clear that if you are not
upstream you do not have any voice on the changes we make.

So my experience is that GPL does not help here. It is just about being
clear and ignoring anyone who does not have an upstream driver, i.e. we
have free hands to update HMM in any way as long as we keep supporting the
upstream users.

That being said, if the GPL aspect is that important to some then
fine, let's switch all the HMM symbols to GPL.
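
Mechanically such a switch is a one-line change per export. Taking
hmm_mirror_register as a representative example (the same pattern would
apply to every EXPORT_SYMBOL() in mm/hmm.c), the hunk would look roughly
like this sketch:

    --- a/mm/hmm.c
    +++ b/mm/hmm.c
    @@ ... @@
    -EXPORT_SYMBOL(hmm_mirror_register);
    +EXPORT_SYMBOL_GPL(hmm_mirror_register);

The only functional effect is at module load time: with
EXPORT_SYMBOL_GPL() a module whose MODULE_LICENSE() is not GPL-compatible
can no longer resolve the symbol, which is the reduced "exposure surface"
Dan is referring to.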

> 
> > * A commitment to consume newly exported symbols in the same merge
> > window, or the following merge window. When that goal is missed revert
> > the functionality until such time that it can be consumed, or
> > otherwise abandoned.
> 
> It sounds like we can tick this box.

I wouldn't be too strict either: when adding something in release N,
the driver change can miss N+1 because of a bug or regression
and be pushed to N+2.

I think a better stance here is that if we do not get any sign-off
on the feature from the driver maintainer for which the feature is intended
then we just do not merge. If after a few releases we still cannot get
the driver to use it then we revert.

It just feels dumb to revert at N+1 just to get it back in N+2 as
the driver bits get fixed.

> 
> > * No new symbol exports and functionality while existing symbols go unconsumed.
> 
> Unsure about this one?

With nouveau upstream now everything is in use. ODP will use some of the
symbols too. PPC has patchsets posted that use a lot of HMM too. I have
been working with other vendors that have patchsets being worked on to
use HMM too.

I have not done all those functions just for the fun of it :) They do
have real uses and users. It took a long time to get nouveau because of
userspace: we had a lot of catching up to do in mesa and llvm and we are
still very rough there.

Cheers,
Jérôme
Dan Williams March 12, 2019, 3:13 a.m. UTC | #23
On Thu, Mar 7, 2019 at 10:56 AM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Thu, Mar 07, 2019 at 09:46:54AM -0800, Andrew Morton wrote:
> > On Tue, 5 Mar 2019 20:20:10 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > > My hesitation would be drastically reduced if there was a plan to
> > > avoid dangling unconsumed symbols and functionality. Specifically one
> > > or more of the following suggestions:
> > >
> > > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > > surface for out-of-tree consumers to come grumble at us when we
> > > continue to refactor the kernel as we are wont to do.
> >
> > The existing patches use EXPORT_SYMBOL() so that's a sticking point.
> > Jerome, what would happen is we made these EXPORT_SYMBOL_GPL()?
>
> So Dan argue that GPL export solve the problem of out of tree user and
> my personnal experience is that it does not. The GPU sub-system has tons
> of GPL drivers that are not upstream and we never felt that we were bound
> to support them in anyway. We always were very clear that if you are not
> upstream that you do not have any voice on changes we do.
>
> So my exeperience is that GPL does not help here. It is just about being
> clear and ignoring anyone who does not have an upstream driver ie we have
> free hands to update HMM in anyway as long as we keep supporting the
> upstream user.
>
> That being said if the GPL aspect is that much important to some then
> fine let switch all HMM symbol to GPL.

I should add that I would not be opposed to moving symbols to
non-GPL-only over time, but that should be based on our experience
with the stability and utility of the implementation. For brand new
symbols there's just no data to argue that we can / should keep the
interface stable, or that the interface exposes something fragile that
we'd rather not export at all. That experience gathering and thrash is
best constrained to upstream GPL-only drivers that are signing up to
participate in that maturation process.

So I think it is important from a practical perspective and is a lower
risk way to run this HMM experiment of "merge infrastructure way in
advance of an upstream user".

> > > * A commitment to consume newly exported symbols in the same merge
> > > window, or the following merge window. When that goal is missed revert
> > > the functionality until such time that it can be consumed, or
> > > otherwise abandoned.
> >
> > It sounds like we can tick this box.
>
> I wouldn't be too strick either, when adding something in release N
> the driver change in N+1 can miss N+1 because of bug or regression
> and be push to N+2.
>
> I think a better stance here is that if we do not get any sign-off
> on the feature from driver maintainer for which the feature is intended
> then we just do not merge.

Agree, no driver maintainer sign-off then no merge.

> If after few release we still can not get
> the driver to use it then we revert.

As long as it is made clear to the driver maintainer that they have
one cycle to consume it then we can have a conversation if it is too
early to merge the infrastructure. If no one has time to consume the
feature, why rush dead code into the kernel? Also, waiting 2 cycles
means the infrastructure that was hard to review without a user is now
even harder to review because any review momentum has been lost by the
time the user shows up, so we're better off keeping them close together
in time.


> It just feels dumb to revert at N+1 just to get it back in N+2 as
> the driver bit get fix.

No, I think it just means the infrastructure went in too early if a
driver can't consume it in a development cycle. Let's revisit if it
becomes a problem in practice.

> > > * No new symbol exports and functionality while existing symbols go unconsumed.
> >
> > Unsure about this one?
>
> With nouveau upstream now everything is use. ODP will use some of the
> symbol too. PPC has patchset posted to use lot of HMM too. I have been
> working with other vendor that have patchset being work on to use HMM
> too.
>
> I have not done all those function just for the fun of it :) They do
> have real use and user. It took a longtime to get nouveau because of
> userspace we had a lot of catchup to do in mesa and llvm and we are
> still very rough there.

Sure, this one is less of a concern if we can stick to tighter
timelines between infrastructure and driver consumer merge.
Jerome Glisse March 12, 2019, 3:25 p.m. UTC | #24
On Mon, Mar 11, 2019 at 08:13:53PM -0700, Dan Williams wrote:
> On Thu, Mar 7, 2019 at 10:56 AM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Thu, Mar 07, 2019 at 09:46:54AM -0800, Andrew Morton wrote:
> > > On Tue, 5 Mar 2019 20:20:10 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > My hesitation would be drastically reduced if there was a plan to
> > > > avoid dangling unconsumed symbols and functionality. Specifically one
> > > > or more of the following suggestions:
> > > >
> > > > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > > > surface for out-of-tree consumers to come grumble at us when we
> > > > continue to refactor the kernel as we are wont to do.
> > >
> > > The existing patches use EXPORT_SYMBOL() so that's a sticking point.
> > > Jerome, what would happen is we made these EXPORT_SYMBOL_GPL()?
> >
> > So Dan argue that GPL export solve the problem of out of tree user and
> > my personnal experience is that it does not. The GPU sub-system has tons
> > of GPL drivers that are not upstream and we never felt that we were bound
> > to support them in anyway. We always were very clear that if you are not
> > upstream that you do not have any voice on changes we do.
> >
> > So my exeperience is that GPL does not help here. It is just about being
> > clear and ignoring anyone who does not have an upstream driver ie we have
> > free hands to update HMM in anyway as long as we keep supporting the
> > upstream user.
> >
> > That being said if the GPL aspect is that much important to some then
> > fine let switch all HMM symbol to GPL.
> 
> I should add that I would not be opposed to moving symbols to
> non-GPL-only over time, but that should be based on our experience
> with the stability and utility of the implementation. For brand new
> symbols there's just no data to argue that we can / should keep the
> interface stable, or that the interface exposes something fragile that
> we'd rather not export at all. That experience gathering and thrash is
> best constrained to upstream GPL-only drivers that are signing up to
> participate in that maturation process.
> 
> So I think it is important from a practical perspective and is a lower
> risk way to run this HMM experiment of "merge infrastructure way in
> advance of an upstream user".
> 
> > > > * A commitment to consume newly exported symbols in the same merge
> > > > window, or the following merge window. When that goal is missed revert
> > > > the functionality until such time that it can be consumed, or
> > > > otherwise abandoned.
> > >
> > > It sounds like we can tick this box.
> >
> > I wouldn't be too strick either, when adding something in release N
> > the driver change in N+1 can miss N+1 because of bug or regression
> > and be push to N+2.
> >
> > I think a better stance here is that if we do not get any sign-off
> > on the feature from driver maintainer for which the feature is intended
> > then we just do not merge.
> 
> Agree, no driver maintainer sign-off then no merge.
> 
> > If after few release we still can not get
> > the driver to use it then we revert.
> 
> As long as it is made clear to the driver maintainer that they have
> one cycle to consume it then we can have a conversation if it is too
> early to merge the infrastructure. If no one has time to consume the
> feature, why rush dead code into the kernel? Also, waiting 2 cycles
> means the infrastructure that was hard to review without a user is now
> even harder to review because any review momentum has been lost by the
> time the user show up, so we're better off keeping them close together
> in time.

There is a misunderstanding here: in the first post the infrastructure and
the driver bits get posted together, just like I have been doing lately.
That way you know that you have a working user of the feature, and what is
left is pushing the driver bits through the appropriate tree. So driver
maintainer support is about knowing that they want the feature and have
some confidence that it looks ready.

It also means you can review the infrastructure alongside a user of it.

> 
> 
> > It just feels dumb to revert at N+1 just to get it back in N+2 as
> > the driver bit get fix.
> 
> No, I think it just means the infrastructure went in too early if a
> driver can't consume it in a development cycle. Lets revisit if it
> becomes a problem in practice.

Well, it is just dumb to have a hard guideline like that. Many things
can lead to missing a deadline. For instance the bug I am referring to
might have nothing to do with the feature; it can be something related to
integrating the feature, an unforeseen side effect. So I believe a better
guideline is the driver maintainer rejecting the feature, rather than
just a failure to meet one deadline.


> > > > * No new symbol exports and functionality while existing symbols go unconsumed.
> > >
> > > Unsure about this one?
> >
> > With nouveau upstream now everything is use. ODP will use some of the
> > symbol too. PPC has patchset posted to use lot of HMM too. I have been
> > working with other vendor that have patchset being work on to use HMM
> > too.
> >
> > I have not done all those function just for the fun of it :) They do
> > have real use and user. It took a longtime to get nouveau because of
> > userspace we had a lot of catchup to do in mesa and llvm and we are
> > still very rough there.
> 
> Sure, this one is less of a concern if we can stick to tighter
> timelines between infrastructure and driver consumer merge.

The issue is that the consumer's timeline can be hard to know; sometimes
the consumer goes through a few revisions (like ppc for instance), and
not because of the infrastructure but for other reasons. So reverting
the infrastructure just because the user's timeline changed is not
productive. The user missing one cycle means they would get delayed by
two cycles, i.e. re-upstreaming the infrastructure in the next cycle and
re-pushing the user the cycle after. This sounds like a total waste of
everyone's time, while keeping the infrastructure would allow the
timeline to slip by just one cycle.

The spirit of the rule is better than blind application of the rule.

Cheers,
Jérôme
Dan Williams March 12, 2019, 4:06 p.m. UTC | #25
On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Mon, Mar 11, 2019 at 08:13:53PM -0700, Dan Williams wrote:
> > On Thu, Mar 7, 2019 at 10:56 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > >
> > > On Thu, Mar 07, 2019 at 09:46:54AM -0800, Andrew Morton wrote:
> > > > On Tue, 5 Mar 2019 20:20:10 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > > >
> > > > > My hesitation would be drastically reduced if there was a plan to
> > > > > avoid dangling unconsumed symbols and functionality. Specifically one
> > > > > or more of the following suggestions:
> > > > >
> > > > > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > > > > surface for out-of-tree consumers to come grumble at us when we
> > > > > continue to refactor the kernel as we are wont to do.
> > > >
> > > > The existing patches use EXPORT_SYMBOL() so that's a sticking point.
> > > > Jerome, what would happen is we made these EXPORT_SYMBOL_GPL()?
> > >
> > > So Dan argue that GPL export solve the problem of out of tree user and
> > > my personnal experience is that it does not. The GPU sub-system has tons
> > > of GPL drivers that are not upstream and we never felt that we were bound
> > > to support them in anyway. We always were very clear that if you are not
> > > upstream that you do not have any voice on changes we do.
> > >
> > > So my exeperience is that GPL does not help here. It is just about being
> > > clear and ignoring anyone who does not have an upstream driver ie we have
> > > free hands to update HMM in anyway as long as we keep supporting the
> > > upstream user.
> > >
> > > That being said if the GPL aspect is that much important to some then
> > > fine let switch all HMM symbol to GPL.
> >
> > I should add that I would not be opposed to moving symbols to
> > non-GPL-only over time, but that should be based on our experience
> > with the stability and utility of the implementation. For brand new
> > symbols there's just no data to argue that we can / should keep the
> > interface stable, or that the interface exposes something fragile that
> > we'd rather not export at all. That experience gathering and thrash is
> > best constrained to upstream GPL-only drivers that are signing up to
> > participate in that maturation process.
> >
> > So I think it is important from a practical perspective and is a lower
> > risk way to run this HMM experiment of "merge infrastructure way in
> > advance of an upstream user".
> >
> > > > > * A commitment to consume newly exported symbols in the same merge
> > > > > window, or the following merge window. When that goal is missed revert
> > > > > the functionality until such time that it can be consumed, or
> > > > > otherwise abandoned.
> > > >
> > > > It sounds like we can tick this box.
> > >
> > > I wouldn't be too strick either, when adding something in release N
> > > the driver change in N+1 can miss N+1 because of bug or regression
> > > and be push to N+2.
> > >
> > > I think a better stance here is that if we do not get any sign-off
> > > on the feature from driver maintainer for which the feature is intended
> > > then we just do not merge.
> >
> > Agree, no driver maintainer sign-off then no merge.
> >
> > > If after few release we still can not get
> > > the driver to use it then we revert.
> >
> > As long as it is made clear to the driver maintainer that they have
> > one cycle to consume it then we can have a conversation if it is too
> > early to merge the infrastructure. If no one has time to consume the
> > feature, why rush dead code into the kernel? Also, waiting 2 cycles
> > means the infrastructure that was hard to review without a user is now
> > even harder to review because any review momentum has been lost by the
> > time the user show up, so we're better off keeping them close together
> > in time.
>
> Miss-understanding here, in first post the infrastructure and the driver
> bit get posted just like have been doing lately. So that you know that
> you have working user with the feature and what is left is pushing the
> driver bits throught the appropriate tree. So driver maintainer support
> is about knowing that they want the feature and have some confidence
> that it looks ready.
>
> It also means you can review the infrastructure along side user of it.

Sounds good.

> > > It just feels dumb to revert at N+1 just to get it back in N+2 as
> > > the driver bit get fix.
> >
> > No, I think it just means the infrastructure went in too early if a
> > driver can't consume it in a development cycle. Lets revisit if it
> > becomes a problem in practice.
>
> Well that's just dumb to have hard guideline like that. Many things
> can lead to missing deadline. For instance bug i am refering too might
> have nothing to do with the feature, it can be something related to
> integrating the feature an unforseen side effect. So i believe a better
> guideline is that driver maintainer rejecting the feature rather than
> just failure to meet one deadline.

The history of the Linux kernel disagrees with this statement. It's
only HMM that has recently ignored precedent and pushed to land
infrastructure in advance of consumers; a one-cycle constraint is
already generous in that light.

> > > > > * No new symbol exports and functionality while existing symbols go unconsumed.
> > > >
> > > > Unsure about this one?
> > >
> > > With nouveau upstream now everything is use. ODP will use some of the
> > > symbol too. PPC has patchset posted to use lot of HMM too. I have been
> > > working with other vendor that have patchset being work on to use HMM
> > > too.
> > >
> > > I have not done all those function just for the fun of it :) They do
> > > have real use and user. It took a longtime to get nouveau because of
> > > userspace we had a lot of catchup to do in mesa and llvm and we are
> > > still very rough there.
> >
> > Sure, this one is less of a concern if we can stick to tighter
> > timelines between infrastructure and driver consumer merge.
>
> Issue is that consumer timeline can be hard to know, sometimes
> the consumer go over few revision (like ppc for instance) and
> not because of the infrastructure but for other reasons. So
> reverting the infrastructure just because user had its timeline
> change is not productive. User missing one cycle means they would
> get delayed for 2 cycles ie reupstreaming the infrastructure in
> next cycle and repushing the user the cycle after. This sounds
> like a total wastage of everyone times. While keeping the infra-
> structure would allow the timeline to slip by just one cycle.
>
> Spirit of the rule is better than blind application of rule.

Again, I fail to see why HMM is suddenly unable to make forward
progress when the infrastructure that came before it was merged with
consumers in the same development cycle.

A gate to upstream merge is about the only lever a reviewer has to
push for change, and these requests to uncouple the consumer only
serve to weaken that review tool in my mind.
Jerome Glisse March 12, 2019, 7:06 p.m. UTC | #26
On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Mon, Mar 11, 2019 at 08:13:53PM -0700, Dan Williams wrote:
> > > On Thu, Mar 7, 2019 at 10:56 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > > >
> > > > On Thu, Mar 07, 2019 at 09:46:54AM -0800, Andrew Morton wrote:
> > > > > On Tue, 5 Mar 2019 20:20:10 -0800 Dan Williams <dan.j.williams@intel.com> wrote:
> > > > >
> > > > > > My hesitation would be drastically reduced if there was a plan to
> > > > > > avoid dangling unconsumed symbols and functionality. Specifically one
> > > > > > or more of the following suggestions:
> > > > > >
> > > > > > * EXPORT_SYMBOL_GPL on all exports to avoid a growing liability
> > > > > > surface for out-of-tree consumers to come grumble at us when we
> > > > > > continue to refactor the kernel as we are wont to do.
> > > > >
> > > > > The existing patches use EXPORT_SYMBOL() so that's a sticking point.
> > > > > Jerome, what would happen is we made these EXPORT_SYMBOL_GPL()?
> > > >
> > > > So Dan argue that GPL export solve the problem of out of tree user and
> > > > my personnal experience is that it does not. The GPU sub-system has tons
> > > > of GPL drivers that are not upstream and we never felt that we were bound
> > > > to support them in anyway. We always were very clear that if you are not
> > > > upstream that you do not have any voice on changes we do.
> > > >
> > > > So my exeperience is that GPL does not help here. It is just about being
> > > > clear and ignoring anyone who does not have an upstream driver ie we have
> > > > free hands to update HMM in anyway as long as we keep supporting the
> > > > upstream user.
> > > >
> > > > That being said if the GPL aspect is that much important to some then
> > > > fine let switch all HMM symbol to GPL.
> > >
> > > I should add that I would not be opposed to moving symbols to
> > > non-GPL-only over time, but that should be based on our experience
> > > with the stability and utility of the implementation. For brand new
> > > symbols there's just no data to argue that we can / should keep the
> > > interface stable, or that the interface exposes something fragile that
> > > we'd rather not export at all. That experience gathering and thrash is
> > > best constrained to upstream GPL-only drivers that are signing up to
> > > participate in that maturation process.
> > >
> > > So I think it is important from a practical perspective and is a lower
> > > risk way to run this HMM experiment of "merge infrastructure way in
> > > advance of an upstream user".
> > >
> > > > > > * A commitment to consume newly exported symbols in the same merge
> > > > > > window, or the following merge window. When that goal is missed revert
> > > > > > the functionality until such time that it can be consumed, or
> > > > > > otherwise abandoned.
> > > > >
> > > > > It sounds like we can tick this box.
> > > >
> > > > I wouldn't be too strick either, when adding something in release N
> > > > the driver change in N+1 can miss N+1 because of bug or regression
> > > > and be push to N+2.
> > > >
> > > > I think a better stance here is that if we do not get any sign-off
> > > > on the feature from driver maintainer for which the feature is intended
> > > > then we just do not merge.
> > >
> > > Agree, no driver maintainer sign-off then no merge.
> > >
> > > > If after few release we still can not get
> > > > the driver to use it then we revert.
> > >
> > > As long as it is made clear to the driver maintainer that they have
> > > one cycle to consume it then we can have a conversation if it is too
> > > early to merge the infrastructure. If no one has time to consume the
> > > feature, why rush dead code into the kernel? Also, waiting 2 cycles
> > > means the infrastructure that was hard to review without a user is now
> > > even harder to review because any review momentum has been lost by the
> > > time the user show up, so we're better off keeping them close together
> > > in time.
> >
> > Miss-understanding here, in first post the infrastructure and the driver
> > bit get posted just like have been doing lately. So that you know that
> > you have working user with the feature and what is left is pushing the
> > driver bits throught the appropriate tree. So driver maintainer support
> > is about knowing that they want the feature and have some confidence
> > that it looks ready.
> >
> > It also means you can review the infrastructure along side user of it.
> 
> Sounds good.
> 
> > > > It just feels dumb to revert at N+1 just to get it back in N+2 as
> > > > the driver bit get fix.
> > >
> > > No, I think it just means the infrastructure went in too early if a
> > > driver can't consume it in a development cycle. Lets revisit if it
> > > becomes a problem in practice.
> >
> > Well that's just dumb to have hard guideline like that. Many things
> > can lead to missing deadline. For instance bug i am refering too might
> > have nothing to do with the feature, it can be something related to
> > integrating the feature an unforseen side effect. So i believe a better
> > guideline is that driver maintainer rejecting the feature rather than
> > just failure to meet one deadline.
> 
> The history of the Linux kernel disagrees with this statement. It's
> only HMM that has recently ignored precedent and pushed to land
> infrastructure in advance of consumers, a one cycle constraint is
> already generous in that light.
> 
> > > > > > * No new symbol exports and functionality while existing symbols go unconsumed.
> > > > >
> > > > > Unsure about this one?
> > > >
> > > > With nouveau upstream now everything is use. ODP will use some of the
> > > > symbol too. PPC has patchset posted to use lot of HMM too. I have been
> > > > working with other vendor that have patchset being work on to use HMM
> > > > too.
> > > >
> > > > I have not done all those function just for the fun of it :) They do
> > > > have real use and user. It took a longtime to get nouveau because of
> > > > userspace we had a lot of catchup to do in mesa and llvm and we are
> > > > still very rough there.
> > >
> > > Sure, this one is less of a concern if we can stick to tighter
> > > timelines between infrastructure and driver consumer merge.
> >
> > Issue is that consumer timeline can be hard to know, sometimes
> > the consumer go over few revision (like ppc for instance) and
> > not because of the infrastructure but for other reasons. So
> > reverting the infrastructure just because user had its timeline
> > change is not productive. User missing one cycle means they would
> > get delayed for 2 cycles ie reupstreaming the infrastructure in
> > next cycle and repushing the user the cycle after. This sounds
> > like a total wastage of everyone times. While keeping the infra-
> > structure would allow the timeline to slip by just one cycle.
> >
> > Spirit of the rule is better than blind application of rule.
> 
> Again, I fail to see why HMM is suddenly unable to make forward
> progress when the infrastructure that came before it was merged with
> consumers in the same development cycle.
> 
> A gate to upstream merge is about the only lever a reviewer has to
> push for change, and these requests to uncouple the consumer only
> serve to weaken that review tool in my mind.

Well, let's just agree to disagree, leave it at that, and stop
wasting each other's time.

Jérôme
Dan Williams March 12, 2019, 7:30 p.m. UTC | #27
On Tue, Mar 12, 2019 at 12:06 PM Jerome Glisse <jglisse@redhat.com> wrote:
> On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> > On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
[..]
> > > Spirit of the rule is better than blind application of rule.
> >
> > Again, I fail to see why HMM is suddenly unable to make forward
> > progress when the infrastructure that came before it was merged with
> > consumers in the same development cycle.
> >
> > A gate to upstream merge is about the only lever a reviewer has to
> > push for change, and these requests to uncouple the consumer only
> > serve to weaken that review tool in my mind.
>
> Well let just agree to disagree and leave it at that and stop
> wasting each other time

I'm fine to continue this discussion if you are. Please be specific
about where we disagree and which aspects of the proposed rules about
merge staging are acceptable, painful-but-doable, or show-stoppers. Do
you agree that HMM is doing something novel with merge staging, or am I
off base there? I expect I can find folks that would balk at even a
one-cycle deferment of consumers, but can we start with that concession
and see how it goes? I'm missing where I've proposed something that is
untenable for the future of HMM, which is addressing some real gaps in
the kernel's support for new hardware.
Dave Chinner March 12, 2019, 8:34 p.m. UTC | #28
On Tue, Mar 12, 2019 at 12:30:52PM -0700, Dan Williams wrote:
> On Tue, Mar 12, 2019 at 12:06 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> > > On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
> [..]
> > > > Spirit of the rule is better than blind application of rule.
> > >
> > > Again, I fail to see why HMM is suddenly unable to make forward
> > > progress when the infrastructure that came before it was merged with
> > > consumers in the same development cycle.
> > >
> > > A gate to upstream merge is about the only lever a reviewer has to
> > > push for change, and these requests to uncouple the consumer only
> > > serve to weaken that review tool in my mind.
> >
> > Well let just agree to disagree and leave it at that and stop
> > wasting each other time
> 
> I'm fine to continue this discussion if you are. Please be specific
> about where we disagree and what aspect of the proposed rules about
> merge staging are either acceptable, painful-but-doable, or
> show-stoppers. Do you agree that HMM is doing something novel with
> merge staging, am I off base there? I expect I can find folks that
> would balk with even a one cycle deferment of consumers, but can we
> start with that concession and see how it goes? I'm missing where I've
> proposed something that is untenable for the future of HMM which is
> addressing some real needs in gaps in the kernel's support for new
> hardware.

/me quietly wonders why the hmm infrastructure can't be staged in a
maintainer tree development branch on a kernel.org and then
all merged in one go when that branch has both infrastructure and
drivers merged into it...

i.e. everyone doing hmm driver work gets the infrastructure from the
dev tree, not mainline. That's a pretty standard procedure for
developing complex features, and it avoids all the issues being
argued over right now...

Cheers,

Dave.
Andrew Morton March 12, 2019, 9:52 p.m. UTC | #29
On Tue, 12 Mar 2019 12:30:52 -0700 Dan Williams <dan.j.williams@intel.com> wrote:

> On Tue, Mar 12, 2019 at 12:06 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> > > On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
> [..]
> > > > Spirit of the rule is better than blind application of rule.
> > >
> > > Again, I fail to see why HMM is suddenly unable to make forward
> > > progress when the infrastructure that came before it was merged with
> > > consumers in the same development cycle.
> > >
> > > A gate to upstream merge is about the only lever a reviewer has to
> > > push for change, and these requests to uncouple the consumer only
> > > serve to weaken that review tool in my mind.
> >
> > Well let just agree to disagree and leave it at that and stop
> > wasting each other time
> 
> I'm fine to continue this discussion if you are. Please be specific
> about where we disagree and what aspect of the proposed rules about
> merge staging are either acceptable, painful-but-doable, or
> show-stoppers. Do you agree that HMM is doing something novel with
> merge staging, am I off base there?

You're correct.  We chose to go this way because the HMM code is so
large and all-over-the-place that developing it in a standalone tree
seemed impractical - better to feed it into mainline piecewise.

This decision very much assumed that HMM users would definitely be
merged, and that it would happen soon.  I was skeptical for a long time
and was eventually persuaded by quite a few conversations with various
architecture and driver maintainers indicating that these HMM users
would be forthcoming.

In retrospect, the arrival of HMM clients took quite a lot longer than
was anticipated and I'm not sure that all of the anticipated usage
sites will actually be using it.  I wish I'd kept records of
who-said-what, but I didn't and the info is now all rather dissipated.

So the plan didn't really work out as hoped.  Lesson learned, I would
now very much prefer that new HMM feature work's changelogs include
links to the driver patchsets which will be using those features and
acks and review input from the developers of those driver patchsets.
Jerome Glisse March 13, 2019, 12:10 a.m. UTC | #30
On Tue, Mar 12, 2019 at 02:52:14PM -0700, Andrew Morton wrote:
> On Tue, 12 Mar 2019 12:30:52 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > On Tue, Mar 12, 2019 at 12:06 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> > > > On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > [..]
> > > > > Spirit of the rule is better than blind application of rule.
> > > >
> > > > Again, I fail to see why HMM is suddenly unable to make forward
> > > > progress when the infrastructure that came before it was merged with
> > > > consumers in the same development cycle.
> > > >
> > > > A gate to upstream merge is about the only lever a reviewer has to
> > > > push for change, and these requests to uncouple the consumer only
> > > > serve to weaken that review tool in my mind.
> > >
> > > Well let just agree to disagree and leave it at that and stop
> > > wasting each other time
> > 
> > I'm fine to continue this discussion if you are. Please be specific
> > about where we disagree and what aspect of the proposed rules about
> > merge staging are either acceptable, painful-but-doable, or
> > show-stoppers. Do you agree that HMM is doing something novel with
> > merge staging, am I off base there?
> 
> You're correct.  We chose to go this way because the HMM code is so
> large and all-over-the-place that developing it in a standalone tree
> seemed impractical - better to feed it into mainline piecewise.
> 
> This decision very much assumed that HMM users would definitely be
> merged, and that it would happen soon.  I was skeptical for a long time
> and was eventually persuaded by quite a few conversations with various
> architecture and driver maintainers indicating that these HMM users
> would be forthcoming.
> 
> In retrospect, the arrival of HMM clients took quite a lot longer than
> was anticipated and I'm not sure that all of the anticipated usage
> sites will actually be using it.  I wish I'd kept records of
> who-said-what, but I didn't and the info is now all rather dissipated.
> 
> So the plan didn't really work out as hoped.  Lesson learned, I would
> now very much prefer that new HMM feature work's changelogs include
> links to the driver patchsets which will be using those features and
> acks and review input from the developers of those driver patchsets.

This is what I am doing now, and this patchset falls into that. I did
post the ODP and nouveau bits that use the two new functions (dma map
and unmap). I expect both the ODP and nouveau bits for that to be
merged during the next merge window.

Also, with 5.1 everything that is upstream is used by nouveau at least.
There are posted patches to use HMM for AMD, Intel, Radeon, ODP and
PPC. Some are going through several revisions, so I do not know exactly
when each will make it upstream, but I keep working on all of this.

So the guidelines we agree on:
    - no new infrastructure without a user
    - the device driver maintainer for whom the new infrastructure is
      done must either sign off, review it, or explicitly say that they
      want the feature. I do not expect every driver maintainer to have
      the bandwidth to do a proper review of the mm part of the
      infrastructure, and it would not be fair to ask that of them;
      they can still provide feedback on the API exposed to the device
      driver.
    - driver bits must be posted at the same time as the new
      infrastructure, even if they target the next release cycle, to
      avoid inter-tree dependency
    - driver bits must be merged as soon as possible

Thing we do not agree on:
    - If the driver bits miss the +1 target for any reason, directly
      revert the new infrastructure. I think it should not be black
      and white, and the reasons why the driver bits missed the merge
      window should be taken into account. If the feature is still
      wanted and the driver bits missed the window for simple reasons,
      then it means we push everything back by 2 releases, i.e. the
      revert is done in +1, we re-upload the infrastructure in +2 and
      finally re-push the driver bits in +3, so we lose 1 cycle.
      Hence why I would rather the revert only happen if it is clear
      that the infrastructure is not ready or cannot be used in a
      timely fashion (over a couple of kernel releases) by any driver.

Cheers,
Jérôme
Dan Williams March 13, 2019, 12:46 a.m. UTC | #31
On Tue, Mar 12, 2019 at 5:10 PM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Tue, Mar 12, 2019 at 02:52:14PM -0700, Andrew Morton wrote:
> > On Tue, 12 Mar 2019 12:30:52 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > > On Tue, Mar 12, 2019 at 12:06 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > > On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> > > > > On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > > [..]
> > > > > > Spirit of the rule is better than blind application of rule.
> > > > >
> > > > > Again, I fail to see why HMM is suddenly unable to make forward
> > > > > progress when the infrastructure that came before it was merged with
> > > > > consumers in the same development cycle.
> > > > >
> > > > > A gate to upstream merge is about the only lever a reviewer has to
> > > > > push for change, and these requests to uncouple the consumer only
> > > > > serve to weaken that review tool in my mind.
> > > >
> > > > Well let just agree to disagree and leave it at that and stop
> > > > wasting each other time
> > >
> > > I'm fine to continue this discussion if you are. Please be specific
> > > about where we disagree and what aspect of the proposed rules about
> > > merge staging are either acceptable, painful-but-doable, or
> > > show-stoppers. Do you agree that HMM is doing something novel with
> > > merge staging, am I off base there?
> >
> > You're correct.  We chose to go this way because the HMM code is so
> > large and all-over-the-place that developing it in a standalone tree
> > seemed impractical - better to feed it into mainline piecewise.
> >
> > This decision very much assumed that HMM users would definitely be
> > merged, and that it would happen soon.  I was skeptical for a long time
> > and was eventually persuaded by quite a few conversations with various
> > architecture and driver maintainers indicating that these HMM users
> > would be forthcoming.
> >
> > In retrospect, the arrival of HMM clients took quite a lot longer than
> > was anticipated and I'm not sure that all of the anticipated usage
> > sites will actually be using it.  I wish I'd kept records of
> > who-said-what, but I didn't and the info is now all rather dissipated.
> >
> > So the plan didn't really work out as hoped.  Lesson learned, I would
> > now very much prefer that new HMM feature work's changelogs include
> > links to the driver patchsets which will be using those features and
> > acks and review input from the developers of those driver patchsets.
>
> This is what i am doing now and this patchset falls into that. I did
> post the ODP and nouveau bits to use the 2 new functions (dma map and
> unmap). I expect to merge both ODP and nouveau bits for that during
> the next merge window.
>
> Also with 5.1 everything that is upstream is use by nouveau at least.
> They are posted patches to use HMM for AMD, Intel, Radeon, ODP, PPC.
> Some are going through several revisions so i do not know exactly when
> each will make it upstream but i keep working on all this.
>
> So the guideline we agree on:
>     - no new infrastructure without user
>     - device driver maintainer for which new infrastructure is done
>       must either sign off or review of explicitly say that they want
>       the feature I do not expect all driver maintainer will have
>       the bandwidth to do proper review of the mm part of the infra-
>       structure and it would not be fair to ask that from them. They
>       can still provide feedback on the API expose to the device
>       driver.
>     - driver bits must be posted at the same time as the new infra-
>       structure even if they target the next release cycle to avoid
>       inter-tree dependency
>     - driver bits must be merge as soon as possible

What about EXPORT_SYMBOL_GPL?

>
> Thing we do not agree on:
>     - If driver bits miss for any reason the +1 target directly
>       revert the new infra-structure. I think it should not be black
>       and white and the reasons why the driver bit missed the merge
>       window should be taken into account. If the feature is still
>       wanted and the driver bits missed the window for simple reasons
>       then it means that we push everything by 2 release ie the
>       revert is done in +1 then we reupload the infra-structure in
>       +2 and finaly repush the driver bit in +3 so we loose 1 cycle.

I think that pain is reasonable.

>       Hence why i would rather that the revert would only happen if
>       it is clear that the infrastructure is not ready or can not
>       be use in timely (over couple kernel release) fashion by any
>       drivers.

This seems too generous to me, but in the interest of moving this
discussion forward let's cross that bridge if/when it happens.
Hopefully the threat of this debate recurring means consumers put in
the due diligence to get things merged at infrastructure + 1 time.
Jerome Glisse March 13, 2019, 1 a.m. UTC | #32
On Tue, Mar 12, 2019 at 05:46:51PM -0700, Dan Williams wrote:
> On Tue, Mar 12, 2019 at 5:10 PM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Tue, Mar 12, 2019 at 02:52:14PM -0700, Andrew Morton wrote:
> > > On Tue, 12 Mar 2019 12:30:52 -0700 Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > On Tue, Mar 12, 2019 at 12:06 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > > > On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> > > > > > On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > > > [..]
> > > > > > > Spirit of the rule is better than blind application of rule.
> > > > > >
> > > > > > Again, I fail to see why HMM is suddenly unable to make forward
> > > > > > progress when the infrastructure that came before it was merged with
> > > > > > consumers in the same development cycle.
> > > > > >
> > > > > > A gate to upstream merge is about the only lever a reviewer has to
> > > > > > push for change, and these requests to uncouple the consumer only
> > > > > > serve to weaken that review tool in my mind.
> > > > >
> > > > > Well let just agree to disagree and leave it at that and stop
> > > > > wasting each other time
> > > >
> > > > I'm fine to continue this discussion if you are. Please be specific
> > > > about where we disagree and what aspect of the proposed rules about
> > > > merge staging are either acceptable, painful-but-doable, or
> > > > show-stoppers. Do you agree that HMM is doing something novel with
> > > > merge staging, am I off base there?
> > >
> > > You're correct.  We chose to go this way because the HMM code is so
> > > large and all-over-the-place that developing it in a standalone tree
> > > seemed impractical - better to feed it into mainline piecewise.
> > >
> > > This decision very much assumed that HMM users would definitely be
> > > merged, and that it would happen soon.  I was skeptical for a long time
> > > and was eventually persuaded by quite a few conversations with various
> > > architecture and driver maintainers indicating that these HMM users
> > > would be forthcoming.
> > >
> > > In retrospect, the arrival of HMM clients took quite a lot longer than
> > > was anticipated and I'm not sure that all of the anticipated usage
> > > sites will actually be using it.  I wish I'd kept records of
> > > who-said-what, but I didn't and the info is now all rather dissipated.
> > >
> > > So the plan didn't really work out as hoped.  Lesson learned, I would
> > > now very much prefer that new HMM feature work's changelogs include
> > > links to the driver patchsets which will be using those features and
> > > acks and review input from the developers of those driver patchsets.
> >
> > This is what i am doing now and this patchset falls into that. I did
> > post the ODP and nouveau bits to use the 2 new functions (dma map and
> > unmap). I expect to merge both ODP and nouveau bits for that during
> > the next merge window.
> >
> > Also with 5.1 everything that is upstream is use by nouveau at least.
> > They are posted patches to use HMM for AMD, Intel, Radeon, ODP, PPC.
> > Some are going through several revisions so i do not know exactly when
> > each will make it upstream but i keep working on all this.
> >
> > So the guideline we agree on:
> >     - no new infrastructure without user
> >     - device driver maintainer for which new infrastructure is done
> >       must either sign off or review of explicitly say that they want
> >       the feature I do not expect all driver maintainer will have
> >       the bandwidth to do proper review of the mm part of the infra-
> >       structure and it would not be fair to ask that from them. They
> >       can still provide feedback on the API expose to the device
> >       driver.
> >     - driver bits must be posted at the same time as the new infra-
> >       structure even if they target the next release cycle to avoid
> >       inter-tree dependency
> >     - driver bits must be merge as soon as possible
> 
> What about EXPORT_SYMBOL_GPL?

I explained why I do not see value in changing the exports, but I will
not oppose that change either.
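
For reference, the EXPORT_SYMBOL_GPL question is only about which
modules may link against the symbols, not about the functions
themselves. Taking hmm_mirror_unregister() from the file below as an
example, the change amounts to:

/* today: the symbol can be resolved by any module */
EXPORT_SYMBOL(hmm_mirror_unregister);

/* what is being asked for: restricted to modules that declare a
 * GPL-compatible MODULE_LICENSE() */
EXPORT_SYMBOL_GPL(hmm_mirror_unregister);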


> > Thing we do not agree on:
> >     - If driver bits miss for any reason the +1 target directly
> >       revert the new infra-structure. I think it should not be black
> >       and white and the reasons why the driver bit missed the merge
> >       window should be taken into account. If the feature is still
> >       wanted and the driver bits missed the window for simple reasons
> >       then it means that we push everything by 2 release ie the
> >       revert is done in +1 then we reupload the infra-structure in
> >       +2 and finaly repush the driver bit in +3 so we loose 1 cycle.
> 
> I think that pain is reasonable.
> 
> >       Hence why i would rather that the revert would only happen if
> >       it is clear that the infrastructure is not ready or can not
> >       be use in timely (over couple kernel release) fashion by any
> >       drivers.
> 
> This seems too generous to me, but in the interest of moving this
> discussion forward let's cross that bridge if/when it happens.
> Hopefully the threat of this debate recurring means consumers put in
> the due diligence to get things merged at infrastructure + 1 time.
Dan Williams March 13, 2019, 1:06 a.m. UTC | #33
On Tue, Mar 12, 2019 at 1:34 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Mar 12, 2019 at 12:30:52PM -0700, Dan Williams wrote:
> > On Tue, Mar 12, 2019 at 12:06 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > On Tue, Mar 12, 2019 at 09:06:12AM -0700, Dan Williams wrote:
> > > > On Tue, Mar 12, 2019 at 8:26 AM Jerome Glisse <jglisse@redhat.com> wrote:
> > [..]
> > > > > Spirit of the rule is better than blind application of rule.
> > > >
> > > > Again, I fail to see why HMM is suddenly unable to make forward
> > > > progress when the infrastructure that came before it was merged with
> > > > consumers in the same development cycle.
> > > >
> > > > A gate to upstream merge is about the only lever a reviewer has to
> > > > push for change, and these requests to uncouple the consumer only
> > > > serve to weaken that review tool in my mind.
> > >
> > > Well let just agree to disagree and leave it at that and stop
> > > wasting each other time
> >
> > I'm fine to continue this discussion if you are. Please be specific
> > about where we disagree and what aspect of the proposed rules about
> > merge staging are either acceptable, painful-but-doable, or
> > show-stoppers. Do you agree that HMM is doing something novel with
> > merge staging, am I off base there? I expect I can find folks that
> > would balk with even a one cycle deferment of consumers, but can we
> > start with that concession and see how it goes? I'm missing where I've
> > proposed something that is untenable for the future of HMM which is
> > addressing some real needs in gaps in the kernel's support for new
> > hardware.
>
> /me quietly wonders why the hmm infrastructure can't be staged in a
> maintainer tree development branch on a kernel.org and then
> all merged in one go when that branch has both infrastructure and
> drivers merged into it...
>
> i.e. everyone doing hmm driver work gets the infrastructure from the
> dev tree, not mainline. That's a pretty standard procedure for
> developing complex features, and it avoids all the issues being
> argued over right now...

True, but I wasn't considering that because the mm tree does not do
stable topic branches. This kind of staging seems not amenable to a
quilt workflow and it needs to keep pace with the rest of mm.
Andrew Morton March 13, 2019, 4:06 p.m. UTC | #34
On Tue, 12 Mar 2019 20:10:19 -0400 Jerome Glisse <jglisse@redhat.com> wrote:

> > You're correct.  We chose to go this way because the HMM code is so
> > large and all-over-the-place that developing it in a standalone tree
> > seemed impractical - better to feed it into mainline piecewise.
> > 
> > This decision very much assumed that HMM users would definitely be
> > merged, and that it would happen soon.  I was skeptical for a long time
> > and was eventually persuaded by quite a few conversations with various
> > architecture and driver maintainers indicating that these HMM users
> > would be forthcoming.
> > 
> > In retrospect, the arrival of HMM clients took quite a lot longer than
> > was anticipated and I'm not sure that all of the anticipated usage
> > sites will actually be using it.  I wish I'd kept records of
> > who-said-what, but I didn't and the info is now all rather dissipated.
> > 
> > So the plan didn't really work out as hoped.  Lesson learned, I would
> > now very much prefer that new HMM feature work's changelogs include
> > links to the driver patchsets which will be using those features and
> > acks and review input from the developers of those driver patchsets.
> 
> This is what i am doing now and this patchset falls into that. I did
> post the ODP and nouveau bits to use the 2 new functions (dma map and
> unmap). I expect to merge both ODP and nouveau bits for that during
> the next merge window.
> 
> Also with 5.1 everything that is upstream is use by nouveau at least.
> They are posted patches to use HMM for AMD, Intel, Radeon, ODP, PPC.
> Some are going through several revisions so i do not know exactly when
> each will make it upstream but i keep working on all this.
> 
> So the guideline we agree on:
>     - no new infrastructure without user
>     - device driver maintainer for which new infrastructure is done
>       must either sign off or review of explicitly say that they want
>       the feature I do not expect all driver maintainer will have
>       the bandwidth to do proper review of the mm part of the infra-
>       structure and it would not be fair to ask that from them. They
>       can still provide feedback on the API expose to the device
>       driver.

The patchset in -mm ("HMM updates for 5.1") has review from Ralph
Campbell @ nvidia.  Are there any other maintainers who we should have
feedback from?

>     - driver bits must be posted at the same time as the new infra-
>       structure even if they target the next release cycle to avoid
>       inter-tree dependency
>     - driver bits must be merge as soon as possible

Are there links to driver patchsets which we can add to the changelogs?

> Thing we do not agree on:
>     - If driver bits miss for any reason the +1 target directly
>       revert the new infra-structure. I think it should not be black
>       and white and the reasons why the driver bit missed the merge
>       window should be taken into account. If the feature is still
>       wanted and the driver bits missed the window for simple reasons
>       then it means that we push everything by 2 release ie the
>       revert is done in +1 then we reupload the infra-structure in
>       +2 and finaly repush the driver bit in +3 so we loose 1 cycle.
>       Hence why i would rather that the revert would only happen if
>       it is clear that the infrastructure is not ready or can not
>       be use in timely (over couple kernel release) fashion by any
>       drivers.

I agree that this should be more a philosophy than a set of hard rules.
Jerome Glisse March 13, 2019, 6:39 p.m. UTC | #35
On Wed, Mar 13, 2019 at 09:06:04AM -0700, Andrew Morton wrote:
> On Tue, 12 Mar 2019 20:10:19 -0400 Jerome Glisse <jglisse@redhat.com> wrote:
> 
> > > You're correct.  We chose to go this way because the HMM code is so
> > > large and all-over-the-place that developing it in a standalone tree
> > > seemed impractical - better to feed it into mainline piecewise.
> > > 
> > > This decision very much assumed that HMM users would definitely be
> > > merged, and that it would happen soon.  I was skeptical for a long time
> > > and was eventually persuaded by quite a few conversations with various
> > > architecture and driver maintainers indicating that these HMM users
> > > would be forthcoming.
> > > 
> > > In retrospect, the arrival of HMM clients took quite a lot longer than
> > > was anticipated and I'm not sure that all of the anticipated usage
> > > sites will actually be using it.  I wish I'd kept records of
> > > who-said-what, but I didn't and the info is now all rather dissipated.
> > > 
> > > So the plan didn't really work out as hoped.  Lesson learned, I would
> > > now very much prefer that new HMM feature work's changelogs include
> > > links to the driver patchsets which will be using those features and
> > > acks and review input from the developers of those driver patchsets.
> > 
> > This is what i am doing now and this patchset falls into that. I did
> > post the ODP and nouveau bits to use the 2 new functions (dma map and
> > unmap). I expect to merge both ODP and nouveau bits for that during
> > the next merge window.
> > 
> > Also with 5.1 everything that is upstream is use by nouveau at least.
> > They are posted patches to use HMM for AMD, Intel, Radeon, ODP, PPC.
> > Some are going through several revisions so i do not know exactly when
> > each will make it upstream but i keep working on all this.
> > 
> > So the guideline we agree on:
> >     - no new infrastructure without user
> >     - device driver maintainer for which new infrastructure is done
> >       must either sign off or review of explicitly say that they want
> >       the feature I do not expect all driver maintainer will have
> >       the bandwidth to do proper review of the mm part of the infra-
> >       structure and it would not be fair to ask that from them. They
> >       can still provide feedback on the API expose to the device
> >       driver.
> 
> The patchset in -mm ("HMM updates for 5.1") has review from Ralph
> Campbell @ nvidia.  Are there any other maintainers who we should have
> feedback from?

John Hubbard also gave his review on a couple of them, iirc.

> 
> >     - driver bits must be posted at the same time as the new infra-
> >       structure even if they target the next release cycle to avoid
> >       inter-tree dependency
> >     - driver bits must be merge as soon as possible
> 
> Are there links to driver patchsets which we can add to the changelogs?
> 

The issue with that is that I often post the infrastructure bits first
and then the driver bits, so I have an email circular dependency :) I
can always post the driver bits first and then add links to them. Or I
can reply after posting so that I can cross-link both.

Or I can post the driver bits on the mm list the first time around and
mark them as "not for Andrew" or with any tag that makes it clear that
those patches will be merged through the appropriate driver tree.

In any case for this patchset there is:

https://patchwork.kernel.org/patch/10786625/

Also, this patchset refactors some of the HMM internals for a better
API, so it is getting used by nouveau too, which is already upstream.
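
To make the "working user" point concrete, here is a rough sketch of
what the driver side of the snapshot path looks like. The function and
field names are the ones visible in the patch below; the range
registration, locking, retry handling and the driver flag/value tables
are elided, so treat it as an illustration rather than the actual
nouveau or ODP code:

static int sketch_mirror_range(struct hmm_range *range)
{
	unsigned long i, npages;
	long ret;

	/*
	 * range is assumed to be already set up and registered by the
	 * driver (hmm_range_register()), covering [range->start,
	 * range->end) with pfns[] sized accordingly.
	 */
	ret = hmm_range_snapshot(range);
	if (ret < 0)
		return ret;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < npages; i++) {
		/* Skip holes; a real driver would use hmm_range_fault()
		 * when it actually needs the pages populated. */
		if (!(range->pfns[i] & range->flags[HMM_PFN_VALID]))
			continue;
		/*
		 * ... program one device page table entry, honoring
		 * range->flags[HMM_PFN_WRITE] ...
		 */
	}
	return 0;
}

With the hunks below that drop the vma_is_dax() check, the same call
also covers a vma backed by a DAX filesystem instead of returning
-EINVAL.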


> > Thing we do not agree on:
> >     - If driver bits miss for any reason the +1 target directly
> >       revert the new infra-structure. I think it should not be black
> >       and white and the reasons why the driver bit missed the merge
> >       window should be taken into account. If the feature is still
> >       wanted and the driver bits missed the window for simple reasons
> >       then it means that we push everything by 2 release ie the
> >       revert is done in +1 then we reupload the infra-structure in
> >       +2 and finaly repush the driver bit in +3 so we loose 1 cycle.
> >       Hence why i would rather that the revert would only happen if
> >       it is clear that the infrastructure is not ready or can not
> >       be use in timely (over couple kernel release) fashion by any
> >       drivers.
> 
> I agree that this should be more a philosophy than a set of hard rules.
>
diff mbox series

Patch

diff --git a/mm/hmm.c b/mm/hmm.c
index 8b87e1813313..1a444885404e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -334,6 +334,7 @@  EXPORT_SYMBOL(hmm_mirror_unregister);
 
 struct hmm_vma_walk {
 	struct hmm_range	*range;
+	struct dev_pagemap	*pgmap;
 	unsigned long		last;
 	bool			fault;
 	bool			block;
@@ -508,6 +509,15 @@  static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
 				range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+	if (!pud_present(pud))
+		return 0;
+	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+				range->flags[HMM_PFN_WRITE] :
+				range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
 			      unsigned long addr,
 			      unsigned long end,
@@ -529,8 +539,19 @@  static int hmm_vma_handle_pmd(struct mm_walk *walk,
 		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + pte_index(addr);
-	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
+	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+		if (pmd_devmap(pmd)) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+		}
 		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+	}
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	hmm_vma_walk->last = end;
 	return 0;
 }
@@ -617,10 +638,24 @@  static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	if (fault || write_fault)
 		goto fault;
 
+	if (pte_devmap(pte)) {
+		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+					      hmm_vma_walk->pgmap);
+		if (unlikely(!hmm_vma_walk->pgmap))
+			return -EBUSY;
+	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+		*pfn = range->values[HMM_PFN_SPECIAL];
+		return -EFAULT;
+	}
+
 	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
 	return 0;
 
 fault:
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
 	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -708,12 +743,84 @@  static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return r;
 		}
 	}
+	if (hmm_vma_walk->pgmap) {
+		put_dev_pagemap(hmm_vma_walk->pgmap);
+		hmm_vma_walk->pgmap = NULL;
+	}
 	pte_unmap(ptep - 1);
 
 	hmm_vma_walk->last = addr;
 	return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+			    unsigned long start,
+			    unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
+	struct vm_area_struct *vma = walk->vma;
+	unsigned long addr = start, next;
+	pmd_t *pmdp;
+	pud_t pud;
+	int ret;
+
+again:
+	pud = READ_ONCE(*pudp);
+	if (pud_none(pud))
+		return hmm_vma_walk_hole(start, end, walk);
+
+	if (pud_huge(pud) && pud_devmap(pud)) {
+		unsigned long i, npages, pfn;
+		uint64_t *pfns, cpu_flags;
+		bool fault, write_fault;
+
+		if (!pud_present(pud))
+			return hmm_vma_walk_hole(start, end, walk);
+
+		i = (addr - range->start) >> PAGE_SHIFT;
+		npages = (end - addr) >> PAGE_SHIFT;
+		pfns = &range->pfns[i];
+
+		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+				     cpu_flags, &fault, &write_fault);
+		if (fault || write_fault)
+			return hmm_vma_walk_hole_(addr, end, fault,
+						write_fault, walk);
+
+		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+		for (i = 0; i < npages; ++i, ++pfn) {
+			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+					      hmm_vma_walk->pgmap);
+			if (unlikely(!hmm_vma_walk->pgmap))
+				return -EBUSY;
+			pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+		}
+		if (hmm_vma_walk->pgmap) {
+			put_dev_pagemap(hmm_vma_walk->pgmap);
+			hmm_vma_walk->pgmap = NULL;
+		}
+		hmm_vma_walk->last = end;
+		return 0;
+	}
+
+	split_huge_pud(vma, pudp, addr);
+	if (pud_none(*pudp))
+		goto again;
+
+	pmdp = pmd_offset(pudp, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+		if (ret)
+			return ret;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 0;
+}
+
 static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 				      unsigned long start, unsigned long end,
 				      struct mm_walk *walk)
@@ -786,14 +893,6 @@  static void hmm_pfns_clear(struct hmm_range *range,
 		*pfns = range->values[HMM_PFN_NONE];
 }
 
-static void hmm_pfns_special(struct hmm_range *range)
-{
-	unsigned long addr = range->start, i = 0;
-
-	for (; addr < range->end; addr += PAGE_SIZE, i++)
-		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
 /*
  * hmm_range_register() - start tracking change to CPU page table over a range
  * @range: range
@@ -911,12 +1010,6 @@  long hmm_range_snapshot(struct hmm_range *range)
 		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
 
-		/* FIXME support dax */
-		if (vma_is_dax(vma)) {
-			hmm_pfns_special(range);
-			return -EINVAL;
-		}
-
 		if (is_vm_hugetlb_page(vma)) {
 			struct hstate *h = hstate_vma(vma);
 
@@ -940,6 +1033,7 @@  long hmm_range_snapshot(struct hmm_range *range)
 		}
 
 		range->vma = vma;
+		hmm_vma_walk.pgmap = NULL;
 		hmm_vma_walk.last = start;
 		hmm_vma_walk.fault = false;
 		hmm_vma_walk.range = range;
@@ -951,6 +1045,7 @@  long hmm_range_snapshot(struct hmm_range *range)
 		mm_walk.pte_entry = NULL;
 		mm_walk.test_walk = NULL;
 		mm_walk.hugetlb_entry = NULL;
+		mm_walk.pud_entry = hmm_vma_walk_pud;
 		mm_walk.pmd_entry = hmm_vma_walk_pmd;
 		mm_walk.pte_hole = hmm_vma_walk_hole;
 		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
@@ -1018,12 +1113,6 @@  long hmm_range_fault(struct hmm_range *range, bool block)
 		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
 
-		/* FIXME support dax */
-		if (vma_is_dax(vma)) {
-			hmm_pfns_special(range);
-			return -EINVAL;
-		}
-
 		if (is_vm_hugetlb_page(vma)) {
 			struct hstate *h = hstate_vma(vma);
 
@@ -1047,6 +1136,7 @@  long hmm_range_fault(struct hmm_range *range, bool block)
 		}
 
 		range->vma = vma;
+		hmm_vma_walk.pgmap = NULL;
 		hmm_vma_walk.last = start;
 		hmm_vma_walk.fault = true;
 		hmm_vma_walk.block = block;
@@ -1059,6 +1149,7 @@  long hmm_range_fault(struct hmm_range *range, bool block)
 		mm_walk.pte_entry = NULL;
 		mm_walk.test_walk = NULL;
 		mm_walk.hugetlb_entry = NULL;
+		mm_walk.pud_entry = hmm_vma_walk_pud;
 		mm_walk.pmd_entry = hmm_vma_walk_pmd;
 		mm_walk.pte_hole = hmm_vma_walk_hole;
 		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
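
The recurring pattern in the hunks above is the dev_pagemap reference
cached in hmm_vma_walk while device pfns are being filled in. Condensed
into one place (illustration only, with a made-up helper name, not the
exact hmm.c code), it amounts to:

static int sketch_fill_device_pfns(struct hmm_vma_walk *hmm_vma_walk,
				   struct hmm_range *range,
				   unsigned long pfn, unsigned long npages,
				   uint64_t *pfns, uint64_t cpu_flags)
{
	unsigned long i;

	for (i = 0; i < npages; ++i, ++pfn) {
		/* Reuse the cached pgmap while pfn stays inside it,
		 * otherwise drop it and look up the one covering pfn. */
		hmm_vma_walk->pgmap = get_dev_pagemap(pfn, hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap))
			return -EBUSY;	/* no pgmap reference available, back off */
		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
	}

	/* The pgmap reference is the only one held across the loop,
	 * so releasing it is all the cleanup that is needed. */
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	return 0;
}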