
[07/10] mm/hmm: add a helper function that faults pages and maps them to a device

Message ID 20190129165428.3931-8-jglisse@redhat.com (mailing list archive)
State New, archived
Series HMM updates for 5.1

Commit Message

Jerome Glisse Jan. 29, 2019, 4:54 p.m. UTC
From: Jérôme Glisse <jglisse@redhat.com>

This is an all-in-one helper that faults in the pages of a range and
maps them to a device, so that device drivers do not each have to
re-implement this common pattern.

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
---
 include/linux/hmm.h |   9 +++
 mm/hmm.c            | 152 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+)
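
For orientation, a driver-side caller of the new helpers might look
roughly like the sketch below. This is hypothetical: only
hmm_range_dma_map() and hmm_range_dma_unmap() come from this patch; the
surrounding function, the hmm_range setup and the device pointer are
assumed driver scaffolding.

    static long mirror_user_range(struct device *device,
                                  struct hmm_range *range)
    {
            unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
            dma_addr_t *daddrs;
            long mapped;

            daddrs = kvmalloc_array(npages, sizeof(*daddrs), GFP_KERNEL);
            if (!daddrs)
                    return -ENOMEM;

            /* Fault the pages and DMA map them in one call. -EAGAIN means
             * mmap_sem was dropped and the caller is expected to retry. */
            mapped = hmm_range_dma_map(range, device, daddrs, true);
            if (mapped <= 0)
                    goto out;

            /* ... program the device with daddrs[] ... */

            /* Tear down, dirtying the pages that were mapped writable. */
            hmm_range_dma_unmap(range, NULL, device, daddrs, true);
    out:
            kvfree(daddrs);
            return mapped;
    }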

Comments

Dan Williams March 18, 2019, 8:21 p.m. UTC | #1
On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
>
> From: Jérôme Glisse <jglisse@redhat.com>
>
> This is a all in one helper that fault pages in a range and map them to
> a device so that every single device driver do not have to re-implement
> this common pattern.

Ok, correct me if I am wrong, but these seem to effectively be the
typical "get_user_pages() + dma_map_page()" pattern that non-HMM
drivers would follow. Could we just teach get_user_pages() to take an
HMM shortcut based on the range?
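
For reference, that non-HMM pattern is roughly the following (an
illustrative sketch, not code from this series; dma_mapping_error()
checks and error unwinding are omitted):

    ret = get_user_pages(start, npages, FOLL_WRITE, pages, NULL);
    if (ret <= 0)
            return ret ? ret : -EFAULT;

    for (i = 0; i < ret; i++)
            daddrs[i] = dma_map_page(dev, pages[i], 0, PAGE_SIZE,
                                     DMA_BIDIRECTIONAL);

    /* ... device uses daddrs[] ... */

    for (i = 0; i < ret; i++) {
            dma_unmap_page(dev, daddrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
            set_page_dirty_lock(pages[i]);
            put_page(pages[i]);
    }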

I'm interested in being able to share code across drivers and not have
to worry about the HMM special case at the api level.

And to be clear, this isn't an anti-HMM critique; this is a "yes, let's
do this, but how about a more fundamental change".

>
> Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Ralph Campbell <rcampbell@nvidia.com>
> Cc: John Hubbard <jhubbard@nvidia.com>
> ---
>  include/linux/hmm.h |   9 +++
>  mm/hmm.c            | 152 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 161 insertions(+)
>
> diff --git a/include/linux/hmm.h b/include/linux/hmm.h
> index 4263f8fb32e5..fc3630d0bbfd 100644
> --- a/include/linux/hmm.h
> +++ b/include/linux/hmm.h
> @@ -502,6 +502,15 @@ int hmm_range_register(struct hmm_range *range,
>  void hmm_range_unregister(struct hmm_range *range);
>  long hmm_range_snapshot(struct hmm_range *range);
>  long hmm_range_fault(struct hmm_range *range, bool block);
> +long hmm_range_dma_map(struct hmm_range *range,
> +                      struct device *device,
> +                      dma_addr_t *daddrs,
> +                      bool block);
> +long hmm_range_dma_unmap(struct hmm_range *range,
> +                        struct vm_area_struct *vma,
> +                        struct device *device,
> +                        dma_addr_t *daddrs,
> +                        bool dirty);
>
>  /*
>   * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 0a4ff31e9d7a..9cd68334a759 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -30,6 +30,7 @@
>  #include <linux/hugetlb.h>
>  #include <linux/memremap.h>
>  #include <linux/jump_label.h>
> +#include <linux/dma-mapping.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/memory_hotplug.h>
>
> @@ -985,6 +986,157 @@ long hmm_range_fault(struct hmm_range *range, bool block)
>         return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
>  }
>  EXPORT_SYMBOL(hmm_range_fault);
> +
> +/*
> + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
> + * @range: range being faulted
> + * @device: device against to dma map page to
> + * @daddrs: dma address of mapped pages
> + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
> + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
> + *          drop and you need to try again, some other error value otherwise
> + *
> + * Note same usage pattern as hmm_range_fault().
> + */
> +long hmm_range_dma_map(struct hmm_range *range,
> +                      struct device *device,
> +                      dma_addr_t *daddrs,
> +                      bool block)
> +{
> +       unsigned long i, npages, mapped;
> +       long ret;
> +
> +       ret = hmm_range_fault(range, block);
> +       if (ret <= 0)
> +               return ret ? ret : -EBUSY;
> +
> +       npages = (range->end - range->start) >> PAGE_SHIFT;
> +       for (i = 0, mapped = 0; i < npages; ++i) {
> +               enum dma_data_direction dir = DMA_FROM_DEVICE;
> +               struct page *page;
> +
> +               /*
> +                * FIXME need to update DMA API to provide invalid DMA address
> +                * value instead of a function to test dma address value. This
> +                * would remove lot of dumb code duplicated accross many arch.
> +                *
> +                * For now setting it to 0 here is good enough as the pfns[]
> +                * value is what is use to check what is valid and what isn't.
> +                */
> +               daddrs[i] = 0;
> +
> +               page = hmm_pfn_to_page(range, range->pfns[i]);
> +               if (page == NULL)
> +                       continue;
> +
> +               /* Check if range is being invalidated */
> +               if (!range->valid) {
> +                       ret = -EBUSY;
> +                       goto unmap;
> +               }
> +
> +               /* If it is read and write than map bi-directional. */
> +               if (range->pfns[i] & range->values[HMM_PFN_WRITE])
> +                       dir = DMA_BIDIRECTIONAL;
> +
> +               daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
> +               if (dma_mapping_error(device, daddrs[i])) {
> +                       ret = -EFAULT;
> +                       goto unmap;
> +               }
> +
> +               mapped++;
> +       }
> +
> +       return mapped;
> +
> +unmap:
> +       for (npages = i, i = 0; (i < npages) && mapped; ++i) {
> +               enum dma_data_direction dir = DMA_FROM_DEVICE;
> +               struct page *page;
> +
> +               page = hmm_pfn_to_page(range, range->pfns[i]);
> +               if (page == NULL)
> +                       continue;
> +
> +               if (dma_mapping_error(device, daddrs[i]))
> +                       continue;
> +
> +               /* If it is read and write than map bi-directional. */
> +               if (range->pfns[i] & range->values[HMM_PFN_WRITE])
> +                       dir = DMA_BIDIRECTIONAL;
> +
> +               dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
> +               mapped--;
> +       }
> +
> +       return ret;
> +}
> +EXPORT_SYMBOL(hmm_range_dma_map);
> +
> +/*
> + * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map()
> + * @range: range being unmapped
> + * @vma: the vma against which the range (optional)
> + * @device: device against which dma map was done
> + * @daddrs: dma address of mapped pages
> + * @dirty: dirty page if it had the write flag set
> + * Returns: number of page unmapped on success, -EINVAL otherwise
> + *
> + * Note that caller MUST abide by mmu notifier or use HMM mirror and abide
> + * to the sync_cpu_device_pagetables() callback so that it is safe here to
> + * call set_page_dirty(). Caller must also take appropriate locks to avoid
> + * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
> + */
> +long hmm_range_dma_unmap(struct hmm_range *range,
> +                        struct vm_area_struct *vma,
> +                        struct device *device,
> +                        dma_addr_t *daddrs,
> +                        bool dirty)
> +{
> +       unsigned long i, npages;
> +       long cpages = 0;
> +
> +       /* Sanity check. */
> +       if (range->end <= range->start)
> +               return -EINVAL;
> +       if (!daddrs)
> +               return -EINVAL;
> +       if (!range->pfns)
> +               return -EINVAL;
> +
> +       npages = (range->end - range->start) >> PAGE_SHIFT;
> +       for (i = 0; i < npages; ++i) {
> +               enum dma_data_direction dir = DMA_FROM_DEVICE;
> +               struct page *page;
> +
> +               page = hmm_pfn_to_page(range, range->pfns[i]);
> +               if (page == NULL)
> +                       continue;
> +
> +               /* If it is read and write than map bi-directional. */
> +               if (range->pfns[i] & range->values[HMM_PFN_WRITE]) {
> +                       dir = DMA_BIDIRECTIONAL;
> +
> +                       /*
> +                        * See comments in function description on why it is
> +                        * safe here to call set_page_dirty()
> +                        */
> +                       if (dirty)
> +                               set_page_dirty(page);
> +               }
> +
> +               /* Unmap and clear pfns/dma address */
> +               dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
> +               range->pfns[i] = range->values[HMM_PFN_NONE];
> +               /* FIXME see comments in hmm_vma_dma_map() */
> +               daddrs[i] = 0;
> +               cpages++;
> +       }
> +
> +       return cpages;
> +}
> +EXPORT_SYMBOL(hmm_range_dma_unmap);
>  #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
>
>
> --
> 2.17.2
>
Jerome Glisse March 18, 2019, 8:41 p.m. UTC | #2
On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> >
> > From: Jérôme Glisse <jglisse@redhat.com>
> >
> > This is a all in one helper that fault pages in a range and map them to
> > a device so that every single device driver do not have to re-implement
> > this common pattern.
> 
> Ok, correct me if I am wrong but these seem effectively be the typical
> "get_user_pages() + dma_map_page()" pattern that non-HMM drivers would
> follow. Could we just teach get_user_pages() to take an HMM shortcut
> based on the range?
> 
> I'm interested in being able to share code across drivers and not have
> to worry about the HMM special case at the api level.
> 
> And to be clear this isn't an anti-HMM critique this is a "yes, let's
> do this, but how about a more fundamental change".

It is a yes and no. HMM has the synchronization with the mmu notifier,
which is not common to all device drivers, i.e. there are device
drivers that do not synchronize with the mmu notifier and use GUP
instead. For instance, see the range->valid test in the patch; that is
HMM specific and would not apply to a GUP user.
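
Roughly, the driver-side pattern around that test is (a sketch only;
the driver lock name and the retry policy are made up):

    again:
            ret = hmm_range_dma_map(range, device, daddrs, true);
            if (ret <= 0)
                    return ret;     /* caller retries on -EAGAIN/-EBUSY */

            /* Lock also taken by the driver's mmu notifier callback. */
            mutex_lock(&drv->update_lock);
            if (!range->valid) {
                    /* A concurrent invalidation raced with us: undo, retry. */
                    mutex_unlock(&drv->update_lock);
                    hmm_range_dma_unmap(range, NULL, device, daddrs, false);
                    goto again;
            }
            /* ... commit daddrs[] to the device page table ... */
            mutex_unlock(&drv->update_lock);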

Nonetheless I want to remove more HMM code and grow GUP to do some of
this too, so that HMM and non-HMM drivers can share the common part
(under GUP). But right now updating GUP is too big an endeavor. I need
to make progress on more drivers with HMM before thinking of messing
with GUP code. Making that code HMM-only for now will make the GUP
factorization easier and smaller down the road (it should only require
updating the HMM helpers and not each individual driver that uses HMM).

FYI here is my todo list:
    - this patchset
    - HMM ODP
    - mmu notifier changes for optimization and device range binding
    - device range binding (amdgpu/nouveau/...)
    - factor out some nouveau deep inner-layer code to outer-layer for
      more code sharing
    - page->mapping endeavor for generic page protection, for instance
      KSM with file-backed pages
    - grow GUP to remove HMM code and consolidate with GUP code
    ...

Cheers,
Jérôme

Dan Williams March 18, 2019, 9:30 p.m. UTC | #3
On Mon, Mar 18, 2019 at 1:41 PM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> > On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> > >
> > > From: Jérôme Glisse <jglisse@redhat.com>
> > >
> > > This is a all in one helper that fault pages in a range and map them to
> > > a device so that every single device driver do not have to re-implement
> > > this common pattern.
> >
> > Ok, correct me if I am wrong but these seem effectively be the typical
> > "get_user_pages() + dma_map_page()" pattern that non-HMM drivers would
> > follow. Could we just teach get_user_pages() to take an HMM shortcut
> > based on the range?
> >
> > I'm interested in being able to share code across drivers and not have
> > to worry about the HMM special case at the api level.
> >
> > And to be clear this isn't an anti-HMM critique this is a "yes, let's
> > do this, but how about a more fundamental change".
>
> It is a yes and no, HMM have the synchronization with mmu notifier
> which is not common to all device driver ie you have device driver
> that do not synchronize with mmu notifier and use GUP. For instance
> see the range->valid test in below code this is HMM specific and it
> would not apply to GUP user.
>
> Nonetheless i want to remove more HMM code and grow GUP to do some
> of this too so that HMM and non HMM driver can share the common part
> (under GUP). But right now updating GUP is a too big endeavor.

I'm open to that argument, but that statement then seems to indicate
that these APIs are indeed temporary. If the end game is a common API
between HMM and non-HMM drivers then I think these should at least
come with /* TODO: */ comments about what might change in the future,
and they should be EXPORT_SYMBOL_GPL since they're already planned to
be deprecated. They are a point-in-time export for a work-in-progress
interface.

> I need
> to make progress on more driver with HMM before thinking of messing
> with GUP code. Making that code HMM only for now will make the GUP
> factorization easier and smaller down the road (should only need to
> update HMM helper and not each individual driver which use HMM).
>
> FYI here is my todo list:
>     - this patchset
>     - HMM ODP
>     - mmu notifier changes for optimization and device range binding
>     - device range binding (amdgpu/nouveau/...)
>     - factor out some nouveau deep inner-layer code to outer-layer for
>       more code sharing
>     - page->mapping endeavor for generic page protection for instance
>       KSM with file back page
>     - grow GUP to remove HMM code and consolidate with GUP code

Sounds workable as a plan.
Jerome Glisse March 18, 2019, 10:15 p.m. UTC | #4
On Mon, Mar 18, 2019 at 02:30:15PM -0700, Dan Williams wrote:
> On Mon, Mar 18, 2019 at 1:41 PM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> > > On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> > > >
> > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > >
> > > > This is a all in one helper that fault pages in a range and map them to
> > > > a device so that every single device driver do not have to re-implement
> > > > this common pattern.
> > >
> > > Ok, correct me if I am wrong but these seem effectively be the typical
> > > "get_user_pages() + dma_map_page()" pattern that non-HMM drivers would
> > > follow. Could we just teach get_user_pages() to take an HMM shortcut
> > > based on the range?
> > >
> > > I'm interested in being able to share code across drivers and not have
> > > to worry about the HMM special case at the api level.
> > >
> > > And to be clear this isn't an anti-HMM critique this is a "yes, let's
> > > do this, but how about a more fundamental change".
> >
> > It is a yes and no, HMM have the synchronization with mmu notifier
> > which is not common to all device driver ie you have device driver
> > that do not synchronize with mmu notifier and use GUP. For instance
> > see the range->valid test in below code this is HMM specific and it
> > would not apply to GUP user.
> >
> > Nonetheless i want to remove more HMM code and grow GUP to do some
> > of this too so that HMM and non HMM driver can share the common part
> > (under GUP). But right now updating GUP is a too big endeavor.
> 
> I'm open to that argument, but that statement then seems to indicate
> that these apis are indeed temporary. If the end game is common api
> between HMM and non-HMM drivers then I think these should at least
> come with /* TODO: */ comments about what might change in the future,
> and then should be EXPORT_SYMBOL_GPL since they're already planning to
> be deprecated. They are a point in time export for a work-in-progress
> interface.

The API is not temporary; it will stay the same, i.e. a device driver
using HMM would not need further modification. Only the inner workings
of HMM would be ported over to use improved common GUP. But GUP has a
few shortcomings today that would be a regression for HMM (the first
is illustrated below):
    - huge page handling (ie dma mapping the huge page itself, not 4k
      chunks of the huge page)
    - not incrementing the page refcount for HMM (other users like
      userfaultfd also want a GUP without FOLL_GET because they abide
      by mmu notifiers)
    - support for device memory without leaking it, ie restricting such
      memory to callers that can handle it properly and are fully
      aware of the gotchas that come with it
    ...
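
To illustrate the huge page point (sketch only): a GUP user today gets
an array of 4k pages back and maps each one, where a single mapping of
the whole huge page would do:

    /* what a GUP based driver ends up doing today: one mapping per 4k page */
    for (i = 0; i < HPAGE_PMD_NR; i++)
            daddrs[i] = dma_map_page(device, pages[i], 0, PAGE_SIZE,
                                     DMA_BIDIRECTIONAL);

    /* versus one mapping covering the whole huge page */
    daddr = dma_map_page(device, compound_head(pages[0]), 0,
                         HPAGE_PMD_SIZE, DMA_BIDIRECTIONAL);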

So before converting HMM to use common GUP code underneath, those GUP
shortcomings (from the HMM POV) need to be addressed, and at the same
time the common dma map pattern can be added as an extra GUP helper.

The issue is that some of the above changes need to be done carefully
so as not to impact existing GUP users. So I would rather clear some
of my plate before starting to chew on this carefully.

Also, doing this patch first and then the GUP thing solves the
first-user problem you have been asking for. With this code in first,
the first users of the GUP conversion will be all the devices that use
those two HMM functions. In turn the first user of that code is the
ODP RDMA patch I already posted. Second will be nouveau once I sort
out some nouveau changes. I expect amdgpu to come in a close third as
a user, and other device drivers that are working on HMM integration
to come shortly after.



> > I need
> > to make progress on more driver with HMM before thinking of messing
> > with GUP code. Making that code HMM only for now will make the GUP
> > factorization easier and smaller down the road (should only need to
> > update HMM helper and not each individual driver which use HMM).
> >
> > FYI here is my todo list:
> >     - this patchset
> >     - HMM ODP
> >     - mmu notifier changes for optimization and device range binding
> >     - device range binding (amdgpu/nouveau/...)
> >     - factor out some nouveau deep inner-layer code to outer-layer for
> >       more code sharing
> >     - page->mapping endeavor for generic page protection for instance
> >       KSM with file back page
> >     - grow GUP to remove HMM code and consolidate with GUP code
> 
> Sounds workable as a plan.
Dan Williams March 19, 2019, 3:29 a.m. UTC | #5
On Mon, Mar 18, 2019 at 3:15 PM Jerome Glisse <jglisse@redhat.com> wrote:
>
> On Mon, Mar 18, 2019 at 02:30:15PM -0700, Dan Williams wrote:
> > On Mon, Mar 18, 2019 at 1:41 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > >
> > > On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> > > > On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> > > > >
> > > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > > >
> > > > > This is a all in one helper that fault pages in a range and map them to
> > > > > a device so that every single device driver do not have to re-implement
> > > > > this common pattern.
> > > >
> > > > Ok, correct me if I am wrong but these seem effectively be the typical
> > > > "get_user_pages() + dma_map_page()" pattern that non-HMM drivers would
> > > > follow. Could we just teach get_user_pages() to take an HMM shortcut
> > > > based on the range?
> > > >
> > > > I'm interested in being able to share code across drivers and not have
> > > > to worry about the HMM special case at the api level.
> > > >
> > > > And to be clear this isn't an anti-HMM critique this is a "yes, let's
> > > > do this, but how about a more fundamental change".
> > >
> > > It is a yes and no, HMM have the synchronization with mmu notifier
> > > which is not common to all device driver ie you have device driver
> > > that do not synchronize with mmu notifier and use GUP. For instance
> > > see the range->valid test in below code this is HMM specific and it
> > > would not apply to GUP user.
> > >
> > > Nonetheless i want to remove more HMM code and grow GUP to do some
> > > of this too so that HMM and non HMM driver can share the common part
> > > (under GUP). But right now updating GUP is a too big endeavor.
> >
> > I'm open to that argument, but that statement then seems to indicate
> > that these apis are indeed temporary. If the end game is common api
> > between HMM and non-HMM drivers then I think these should at least
> > come with /* TODO: */ comments about what might change in the future,
> > and then should be EXPORT_SYMBOL_GPL since they're already planning to
> > be deprecated. They are a point in time export for a work-in-progress
> > interface.
>
> The API is not temporary it will stay the same ie the device driver
> using HMM would not need further modification. Only the inner working
> of HMM would be ported over to use improved common GUP. But GUP has
> few shortcoming today that would be a regression for HMM:
>     - huge page handling (ie dma mapping huge page not 4k chunk of
>       huge page)
>     - not incrementing page refcount for HMM (other user like user-
>       faultd also want a GUP without FOLL_GET because they abide by
>       mmu notifier)
>     - support for device memory without leaking it ie restrict such
>       memory to caller that can handle it properly and are fully
>       aware of the gotcha that comes with it
>     ...

...but this is backwards, because the end state is two driver
interfaces for dealing with page mappings instead of one. My primary
critique of HMM is that it creates a parallel universe of HMM APIs
rather than evolving the existing core APIs.

> So before converting HMM to use common GUP code under-neath those GUP
> shortcoming (from HMM POV) need to be addressed and at the same time
> the common dma map pattern can be added as an extra GUP helper.

If the HMM special cases are not being absorbed into the core-mm over
time then I think this is going in the wrong direction. Specifically a
direction that increases the long term maintenance burden over time as
HMM drivers stay needlessly separated.

> The issue is that some of the above changes need to be done carefully
> to not impact existing GUP users. So i rather clear some of my plate
> before starting chewing on this carefully.

I urge you to put this kind of consideration first and not "merge
first, ask hard questions later".

> Also doing this patch first and then the GUP thing solve the first user
> problem you have been asking for. With that code in first the first user
> of the GUP convertion will be all the devices that use those two HMM
> functions. In turn the first user of that code is the ODP RDMA patch
> i already posted. Second will be nouveau once i tackle out some nouveau
> changes. I expect amdgpu to come close third as a user and other device
> driver who are working on HMM integration to come shortly after.

I appreciate that it has users, but the point of having users is so
that the code review can actually be fruitful to see if the
infrastructure makes sense, and in this case it seems to be
duplicating an existing common pattern in the kernel.
Ira Weiny March 19, 2019, 8:44 a.m. UTC | #6
On Tue, Mar 19, 2019 at 09:30:05AM -0400, Jerome Glisse wrote:
> On Mon, Mar 18, 2019 at 08:29:45PM -0700, Dan Williams wrote:
> > On Mon, Mar 18, 2019 at 3:15 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > >
> > > On Mon, Mar 18, 2019 at 02:30:15PM -0700, Dan Williams wrote:
> > > > On Mon, Mar 18, 2019 at 1:41 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > > >
> > > > > On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> > > > > > On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> > > > > > >
> > > > > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > > > > >
> > > > > > > This is a all in one helper that fault pages in a range and map them to
> > > > > > > a device so that every single device driver do not have to re-implement
> > > > > > > this common pattern.
> > > > > >
> > > > > > Ok, correct me if I am wrong but these seem effectively be the typical
> > > > > > "get_user_pages() + dma_map_page()" pattern that non-HMM drivers would
> > > > > > follow. Could we just teach get_user_pages() to take an HMM shortcut
> > > > > > based on the range?
> > > > > >
> > > > > > I'm interested in being able to share code across drivers and not have
> > > > > > to worry about the HMM special case at the api level.
> > > > > >
> > > > > > And to be clear this isn't an anti-HMM critique this is a "yes, let's
> > > > > > do this, but how about a more fundamental change".
> > > > >
> > > > > It is a yes and no, HMM have the synchronization with mmu notifier
> > > > > which is not common to all device driver ie you have device driver
> > > > > that do not synchronize with mmu notifier and use GUP. For instance
> > > > > see the range->valid test in below code this is HMM specific and it
> > > > > would not apply to GUP user.
> > > > >
> > > > > Nonetheless i want to remove more HMM code and grow GUP to do some
> > > > > of this too so that HMM and non HMM driver can share the common part
> > > > > (under GUP). But right now updating GUP is a too big endeavor.
> > > >
> > > > I'm open to that argument, but that statement then seems to indicate
> > > > that these apis are indeed temporary. If the end game is common api
> > > > between HMM and non-HMM drivers then I think these should at least
> > > > come with /* TODO: */ comments about what might change in the future,
> > > > and then should be EXPORT_SYMBOL_GPL since they're already planning to
> > > > be deprecated. They are a point in time export for a work-in-progress
> > > > interface.
> > >
> > > The API is not temporary it will stay the same ie the device driver
> > > using HMM would not need further modification. Only the inner working
> > > of HMM would be ported over to use improved common GUP. But GUP has
> > > few shortcoming today that would be a regression for HMM:
> > >     - huge page handling (ie dma mapping huge page not 4k chunk of
> > >       huge page)

Agreed.

> > >     - not incrementing page refcount for HMM (other user like user-
> > >       faultd also want a GUP without FOLL_GET because they abide by
> > >       mmu notifier)
> > >     - support for device memory without leaking it ie restrict such
> > >       memory to caller that can handle it properly and are fully
> > >       aware of the gotcha that comes with it
> > >     ...
> > 
> > ...but this is backwards because the end state is 2 driver interfaces
> > for dealing with page mappings instead of one. My primary critique of
> > HMM is that it creates a parallel universe of HMM apis rather than
> > evolving the existing core apis.
> 
> Just to make it clear here is pseudo code:
>     gup_range_dma_map() {...}
> 
>     hmm_range_dma_map() {
>         hmm_specific_prep_step();
>         gup_range_dma_map();

Does this GUP use FOLL_GET and then a put after the mmu_notifier is setup?

>         hmm_specific_post_step();
>     }
> 
> Like i said HMM do have the synchronization with mmu notifier to take
> care of and other user of GUP and dma map pattern do not care about
> that. Hence why not everything can be share between device driver that
> can not do mmu notifier and other.
> 
> Is that not acceptable to you ? Should every driver duplicate the code
> HMM factorize ?
> 

In the final API you envision, will drivers be able to call
gup_range_dma_map() _or_ hmm_range_dma_map()?

If so, at that time how will drivers know which to call, and what
parameters control those calls?

> 
> > > So before converting HMM to use common GUP code under-neath those GUP
> > > shortcoming (from HMM POV) need to be addressed and at the same time
> > > the common dma map pattern can be added as an extra GUP helper.
> > 
> > If the HMM special cases are not being absorbed into the core-mm over
> > time then I think this is going in the wrong direction. Specifically a
> > direction that increases the long term maintenance burden over time as
> > HMM drivers stay needlessly separated.
> 
> HMM is core mm and other thing like GUP do not need to absord all of HMM
> as it would be forcing down on them mmu notifier and those other user can
> not leverage mmu notifier. So forcing down something that is useless on
> other is pointless, don't you agree ?
> 
> > 
> > > The issue is that some of the above changes need to be done carefully
> > > to not impact existing GUP users. So i rather clear some of my plate
> > > before starting chewing on this carefully.
> > 
> > I urge you to put this kind of consideration first and not "merge
> > first, ask hard questions later".
> 
> There is no hard question here. GUP does not handle THP optimization and
> other thing HMM and ODP has. Adding this to GUP need to be done carefully
> to not break existing GUP user. So i taking a small step approach since
> when that is a bad thing. First merge HMM and ODP together then push down
> common thing into GUP. It is a lot safer than a huge jump.

FWIW I think having a new interface which allows new features during a
transition is a good thing.  But if that comes at the price of leaving
the old "deficient" interface sitting around, that presents confusion
for driver writers, and we get users calling GUP when perhaps they
should be calling HMM.

I think having GPL exports helps to ensure we can later merge these to make it
clear to driver writers what the right thing to do is.

> 
> > 
> > > Also doing this patch first and then the GUP thing solve the first user
> > > problem you have been asking for. With that code in first the first user
> > > of the GUP convertion will be all the devices that use those two HMM
> > > functions. In turn the first user of that code is the ODP RDMA patch i
> > > already posted. Second will be nouveau once i tackle out some nouveau
> > > changes. I expect amdgpu to come close third as a user and other device
> > > driver who are working on HMM integration to come shortly after.
> > 
> > I appreciate that it has users, but the point of having users is so that
> > the code review can actually be fruitful to see if the infrastructure makes
> > sense, and in this case it seems to be duplicating an existing common
> > pattern in the kernel.
> 
> It is not duplicating anything i am removing code at the end if you include
           ^^^^^^^^^^^^^

The duplication is in how drivers indicate to the core that a set of pages is
being used by the hardware the driver is controlling, what the rules for those
pages are and how the use by that hardware is going to be coordinated with the
other hardware vying for those pages.  There are differences, true, but
fundamentally it would be nice for drivers to not have to care about the
details.

Maybe that is a dream we will never realize but if there are going to be
different ways for drivers to "get pages" then we need to make it clear what it
means when those pages come to the driver and how they can be used safely.

Ira

> the odp convertion patch and i will remove more code once i am done with
> nouveau changes, and again more code once other driver catchup.
> 
> Cheers, Jérôme
Jerome Glisse March 19, 2019, 1:30 p.m. UTC | #7
On Mon, Mar 18, 2019 at 08:29:45PM -0700, Dan Williams wrote:
> On Mon, Mar 18, 2019 at 3:15 PM Jerome Glisse <jglisse@redhat.com> wrote:
> >
> > On Mon, Mar 18, 2019 at 02:30:15PM -0700, Dan Williams wrote:
> > > On Mon, Mar 18, 2019 at 1:41 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > >
> > > > On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> > > > > On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> > > > > >
> > > > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > > > >
> > > > > > This is a all in one helper that fault pages in a range and map them to
> > > > > > a device so that every single device driver do not have to re-implement
> > > > > > this common pattern.
> > > > >
> > > > > Ok, correct me if I am wrong but these seem effectively be the typical
> > > > > "get_user_pages() + dma_map_page()" pattern that non-HMM drivers would
> > > > > follow. Could we just teach get_user_pages() to take an HMM shortcut
> > > > > based on the range?
> > > > >
> > > > > I'm interested in being able to share code across drivers and not have
> > > > > to worry about the HMM special case at the api level.
> > > > >
> > > > > And to be clear this isn't an anti-HMM critique this is a "yes, let's
> > > > > do this, but how about a more fundamental change".
> > > >
> > > > It is a yes and no, HMM have the synchronization with mmu notifier
> > > > which is not common to all device driver ie you have device driver
> > > > that do not synchronize with mmu notifier and use GUP. For instance
> > > > see the range->valid test in below code this is HMM specific and it
> > > > would not apply to GUP user.
> > > >
> > > > Nonetheless i want to remove more HMM code and grow GUP to do some
> > > > of this too so that HMM and non HMM driver can share the common part
> > > > (under GUP). But right now updating GUP is a too big endeavor.
> > >
> > > I'm open to that argument, but that statement then seems to indicate
> > > that these apis are indeed temporary. If the end game is common api
> > > between HMM and non-HMM drivers then I think these should at least
> > > come with /* TODO: */ comments about what might change in the future,
> > > and then should be EXPORT_SYMBOL_GPL since they're already planning to
> > > be deprecated. They are a point in time export for a work-in-progress
> > > interface.
> >
> > The API is not temporary it will stay the same ie the device driver
> > using HMM would not need further modification. Only the inner working
> > of HMM would be ported over to use improved common GUP. But GUP has
> > few shortcoming today that would be a regression for HMM:
> >     - huge page handling (ie dma mapping huge page not 4k chunk of
> >       huge page)
> >     - not incrementing page refcount for HMM (other user like user-
> >       faultd also want a GUP without FOLL_GET because they abide by
> >       mmu notifier)
> >     - support for device memory without leaking it ie restrict such
> >       memory to caller that can handle it properly and are fully
> >       aware of the gotcha that comes with it
> >     ...
> 
> ...but this is backwards because the end state is 2 driver interfaces
> for dealing with page mappings instead of one. My primary critique of
> HMM is that it creates a parallel universe of HMM apis rather than
> evolving the existing core apis.

Just to make it clear, here is pseudo code:
    gup_range_dma_map() {...}

    hmm_range_dma_map() {
        hmm_specific_prep_step();
        gup_range_dma_map();
        hmm_specific_post_step();
    }

Like I said, HMM does have the synchronization with the mmu notifier
to take care of, and other users of the GUP and dma map pattern do not
care about that. Hence not everything can be shared between device
drivers that cannot do mmu notifiers and the others.

Is that not acceptable to you? Should every driver duplicate the code
HMM factorizes?


> > So before converting HMM to use common GUP code under-neath those GUP
> > shortcoming (from HMM POV) need to be addressed and at the same time
> > the common dma map pattern can be added as an extra GUP helper.
> 
> If the HMM special cases are not being absorbed into the core-mm over
> time then I think this is going in the wrong direction. Specifically a
> direction that increases the long term maintenance burden over time as
> HMM drivers stay needlessly separated.

HMM is core mm, and other things like GUP do not need to absorb all of
HMM, as that would force mmu notifiers down on them, and those other
users cannot leverage mmu notifiers. So forcing down something that is
useless for others is pointless, don't you agree?

> 
> > The issue is that some of the above changes need to be done carefully
> > to not impact existing GUP users. So i rather clear some of my plate
> > before starting chewing on this carefully.
> 
> I urge you to put this kind of consideration first and not "merge
> first, ask hard questions later".

There is no hard question here. GUP does not handle the THP
optimization and other things HMM and ODP have. Adding this to GUP
needs to be done carefully so as not to break existing GUP users. So I
am taking a small-step approach; since when is that a bad thing? First
merge HMM and ODP together, then push the common things down into GUP.
It is a lot safer than a huge jump.

> 
> > Also doing this patch first and then the GUP thing solve the first user
> > problem you have been asking for. With that code in first the first user
> > of the GUP convertion will be all the devices that use those two HMM
> > functions. In turn the first user of that code is the ODP RDMA patch
> > i already posted. Second will be nouveau once i tackle out some nouveau
> > changes. I expect amdgpu to come close third as a user and other device
> > driver who are working on HMM integration to come shortly after.
> 
> I appreciate that it has users, but the point of having users is so
> that the code review can actually be fruitful to see if the
> infrastructure makes sense, and in this case it seems to be
> duplicating an existing common pattern in the kernel.

It is not duplicating anything; I am removing code in the end if you
include the ODP conversion patch, and I will remove more code once I
am done with the nouveau changes, and again more code once other
drivers catch up.

Cheers,
Jérôme
Ira Weiny March 19, 2019, 2:10 p.m. UTC | #8
On Tue, Mar 19, 2019 at 01:10:43PM -0400, Jerome Glisse wrote:
> On Tue, Mar 19, 2019 at 01:44:57AM -0700, Ira Weiny wrote:
> > On Tue, Mar 19, 2019 at 09:30:05AM -0400, Jerome Glisse wrote:
> > > On Mon, Mar 18, 2019 at 08:29:45PM -0700, Dan Williams wrote:
> > > > On Mon, Mar 18, 2019 at 3:15 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > > >
> > > > > On Mon, Mar 18, 2019 at 02:30:15PM -0700, Dan Williams wrote:
> > > > > > On Mon, Mar 18, 2019 at 1:41 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> > > > > > > > On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > > > > > > >

[snip]

> > > > >
> > > > > The API is not temporary it will stay the same ie the device driver
> > > > > using HMM would not need further modification. Only the inner working
> > > > > of HMM would be ported over to use improved common GUP. But GUP has
> > > > > few shortcoming today that would be a regression for HMM:
> > > > >     - huge page handling (ie dma mapping huge page not 4k chunk of
> > > > >       huge page)
> > 
> > Agreed.
> > 
> > > > >     - not incrementing page refcount for HMM (other user like user-
> > > > >       faultd also want a GUP without FOLL_GET because they abide by
> > > > >       mmu notifier)
> > > > >     - support for device memory without leaking it ie restrict such
> > > > >       memory to caller that can handle it properly and are fully
> > > > >       aware of the gotcha that comes with it
> > > > >     ...
> > > > 
> > > > ...but this is backwards because the end state is 2 driver interfaces
> > > > for dealing with page mappings instead of one. My primary critique of
> > > > HMM is that it creates a parallel universe of HMM apis rather than
> > > > evolving the existing core apis.
> > > 
> > > Just to make it clear here is pseudo code:
> > >     gup_range_dma_map() {...}
> > > 
> > >     hmm_range_dma_map() {
> > >         hmm_specific_prep_step();
> > >         gup_range_dma_map();
> > 
> > Does this GUP use FOLL_GET and then a put after the mmu_notifier is setup?
> 
> No it avoids incrementing page refcount all together and use mmu notifier
> synchronization to garantee that it is fine to do so. Hence we need a way
> to do GUP without incrementing the page refcount (ie no FOLL_GET but still
> returning page).

Isn't this follow_page()?  I'll admit it may be broken, and I'll
further admit that fixing it may have unintended consequences on
drivers using GUP, but some of the code in this series looks a lot
like the code there.

> 
> > 
> > >         hmm_specific_post_step();
> > >     }
> > > 
> > > Like i said HMM do have the synchronization with mmu notifier to take
> > > care of and other user of GUP and dma map pattern do not care about
> > > that. Hence why not everything can be share between device driver that
> > > can not do mmu notifier and other.
> > > 
> > > Is that not acceptable to you ? Should every driver duplicate the code
> > > HMM factorize ?
> > > 
> > 
> > In the final API you envision will drivers be able to call gup_range_dma_map()
> > _or_ hmm_range_dma_map()?
> > 
> > If so, at that time how will drivers know which to call and parameters control
> > those calls?
> 
> Device that can do invalidation at anytime and thus that can support
> mmu notifier will use HMM and thus the HMM version of it and they will
> always stick with the HMM version.
> 
> Device that can not do invalidation at anytime and thus require pin
> will use the GUP version and always the GUP version.
> 
> What the HMM version does is extra synchronization with mmu notifier
> to ensure that not incrementing page refcount is fine. You can think
> of HMM mirror as an helper than handle mmu notifier common device
> driver pattern.

ok sounds fair.

> 
> > 
> > > 
> > > > > So before converting HMM to use common GUP code under-neath those GUP
> > > > > shortcoming (from HMM POV) need to be addressed and at the same time
> > > > > the common dma map pattern can be added as an extra GUP helper.
> > > > 
> > > > If the HMM special cases are not being absorbed into the core-mm over
> > > > time then I think this is going in the wrong direction. Specifically a
> > > > direction that increases the long term maintenance burden over time as
> > > > HMM drivers stay needlessly separated.
> > > 
> > > HMM is core mm and other thing like GUP do not need to absord all of HMM
> > > as it would be forcing down on them mmu notifier and those other user can
> > > not leverage mmu notifier. So forcing down something that is useless on
> > > other is pointless, don't you agree ?
> > > 
> > > > 
> > > > > The issue is that some of the above changes need to be done carefully
> > > > > to not impact existing GUP users. So i rather clear some of my plate
> > > > > before starting chewing on this carefully.
> > > > 
> > > > I urge you to put this kind of consideration first and not "merge
> > > > first, ask hard questions later".
> > > 
> > > There is no hard question here. GUP does not handle THP optimization and
> > > other thing HMM and ODP has. Adding this to GUP need to be done carefully
> > > to not break existing GUP user. So i taking a small step approach since
> > > when that is a bad thing. First merge HMM and ODP together then push down
> > > common thing into GUP. It is a lot safer than a huge jump.
> > 
> > FWIW I think it is fine to have a new interface which allows new features
> > during a transition is a good thing.  But if that comes at the price of leaving
> > the old "deficient" interface sitting around that presents confusion for driver
> > writers and we get users calling GUP when perhaps they should be calling HMM.
> 
> This is not the intention here, i am converting device driver that can use
> HMM to HMM. Those device driver do not need GUP in the sense that they do
> not need the page refcount increment and this is the short path the HMM does
> provide today. Now i want to convert all device that can follow that to use
> HMM (i posted patchset for amdgpu, radeon, nouveau, i915 and odp rdma for
> that already).
> 
> Device driver that can not do mmu notifier will never use HMM and stick to
> the GUP/dma map pattern. But i want to share the same underlying code for
> both API latter on.

Great!  We agree on something!  :-D

> 
> So i do not see how it would confuse anyone. I am probably bad at expressing
> intent but HMM is not for all device driver it is only for device driver that
> would be able to do mmu notifier but instead of doing mmu notifier directly
> and duplicating common code they can use HMM which has all the common code
> they would need.

I guess I see HMM being bigger than that _eventually_.  I see it being a "one
stop shop" for devices to get pages from the system...  But I think what you
have limited it to is good for now.

Basic pseudocode:

hmm_get_pages()
	if (!mmu_capability)
		do_gup_stuff
	else
		do_hmm_stuff

	return pages;

> 
> > 
> > I think having GPL exports helps to ensure we can later merge these to make it
> > clear to driver writers what the right thing to do is.
> 
> I am fine with GPL export but i stress agains this does not help in the GPU
> world we had tons of GPL driver that are not upstream. GPL was not the issue.
> So i fail to see how GPL helps device driver writer in anyway.

GPL is to ensure we can change the interfaces of HMM at will and have
a good chance of getting all the in-tree drivers fixed.  There are a
couple of patches in this series which change the interface of
exported symbols.  I think this is fine, but it shows we are not ready
to export this interface to out-of-tree users.

> 
> > > 
> > > > 
> > > > > Also doing this patch first and then the GUP thing solve the first user
> > > > > problem you have been asking for. With that code in first the first user
> > > > > of the GUP convertion will be all the devices that use those two HMM
> > > > > functions. In turn the first user of that code is the ODP RDMA patch i
> > > > > already posted. Second will be nouveau once i tackle out some nouveau
> > > > > changes. I expect amdgpu to come close third as a user and other device
> > > > > driver who are working on HMM integration to come shortly after.
> > > > 
> > > > I appreciate that it has users, but the point of having users is so that
> > > > the code review can actually be fruitful to see if the infrastructure makes
> > > > sense, and in this case it seems to be duplicating an existing common
> > > > pattern in the kernel.
> > > 
> > > It is not duplicating anything i am removing code at the end if you include
> >            ^^^^^^^^^^^^^
> > 
> > The duplication is in how drivers indicate to the core that a set of pages is
> > being used by the hardware the driver is controlling, what the rules for those
> > pages are and how the use by that hardware is going to be coordinated with the
> > other hardware vying for those pages.  There are differences, true, but
> > fundamentally it would be nice for drivers to not have to care about the
> > details.
> > 
> > Maybe that is a dream we will never realize but if there are going to be
> > different ways for drivers to "get pages" then we need to make it clear what it
> > means when those pages come to the driver and how they can be used safely.
> 
> This is exactly what HMM mirror is. Device driver do not have to care about
> mm gory details or about mmu notifier subtilities, HMM provide an abstracted
> API easy to understand for device driver and takes care of the sublte details.

If the device supports MMU notification.  ;-)

> 
> Please read the HMM documentation and provide feedback if that is not clear.

FWIW I also want to be clear that having some common code to handle MMU
notification would be great.  I've had to fix mmu_notification code in the past
because mmu notification code can be tricky.  So I'm not against HMM helping
out there.  But I also don't want to leave drivers which don't do MMU
notification with a broken GUP interface.

Ira

> 
> Cheers,
> Jérôme
Jerome Glisse March 19, 2019, 5:10 p.m. UTC | #9
On Tue, Mar 19, 2019 at 01:44:57AM -0700, Ira Weiny wrote:
> On Tue, Mar 19, 2019 at 09:30:05AM -0400, Jerome Glisse wrote:
> > On Mon, Mar 18, 2019 at 08:29:45PM -0700, Dan Williams wrote:
> > > On Mon, Mar 18, 2019 at 3:15 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > >
> > > > On Mon, Mar 18, 2019 at 02:30:15PM -0700, Dan Williams wrote:
> > > > > On Mon, Mar 18, 2019 at 1:41 PM Jerome Glisse <jglisse@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Mar 18, 2019 at 01:21:00PM -0700, Dan Williams wrote:
> > > > > > > On Tue, Jan 29, 2019 at 8:55 AM <jglisse@redhat.com> wrote:
> > > > > > > >
> > > > > > > > From: Jérôme Glisse <jglisse@redhat.com>
> > > > > > > >
> > > > > > > > This is a all in one helper that fault pages in a range and map them to
> > > > > > > > a device so that every single device driver do not have to re-implement
> > > > > > > > this common pattern.
> > > > > > >
> > > > > > > Ok, correct me if I am wrong but these seem effectively be the typical
> > > > > > > "get_user_pages() + dma_map_page()" pattern that non-HMM drivers would
> > > > > > > follow. Could we just teach get_user_pages() to take an HMM shortcut
> > > > > > > based on the range?
> > > > > > >
> > > > > > > I'm interested in being able to share code across drivers and not have
> > > > > > > to worry about the HMM special case at the api level.
> > > > > > >
> > > > > > > And to be clear this isn't an anti-HMM critique this is a "yes, let's
> > > > > > > do this, but how about a more fundamental change".
> > > > > >
> > > > > > It is a yes and no, HMM have the synchronization with mmu notifier
> > > > > > which is not common to all device driver ie you have device driver
> > > > > > that do not synchronize with mmu notifier and use GUP. For instance
> > > > > > see the range->valid test in below code this is HMM specific and it
> > > > > > would not apply to GUP user.
> > > > > >
> > > > > > Nonetheless i want to remove more HMM code and grow GUP to do some
> > > > > > of this too so that HMM and non HMM driver can share the common part
> > > > > > (under GUP). But right now updating GUP is a too big endeavor.
> > > > >
> > > > > I'm open to that argument, but that statement then seems to indicate
> > > > > that these apis are indeed temporary. If the end game is common api
> > > > > between HMM and non-HMM drivers then I think these should at least
> > > > > come with /* TODO: */ comments about what might change in the future,
> > > > > and then should be EXPORT_SYMBOL_GPL since they're already planning to
> > > > > be deprecated. They are a point in time export for a work-in-progress
> > > > > interface.
> > > >
> > > > The API is not temporary it will stay the same ie the device driver
> > > > using HMM would not need further modification. Only the inner working
> > > > of HMM would be ported over to use improved common GUP. But GUP has
> > > > few shortcoming today that would be a regression for HMM:
> > > >     - huge page handling (ie dma mapping huge page not 4k chunk of
> > > >       huge page)
> 
> Agreed.
> 
> > > >     - not incrementing page refcount for HMM (other user like user-
> > > >       faultd also want a GUP without FOLL_GET because they abide by
> > > >       mmu notifier)
> > > >     - support for device memory without leaking it ie restrict such
> > > >       memory to caller that can handle it properly and are fully
> > > >       aware of the gotcha that comes with it
> > > >     ...
> > > 
> > > ...but this is backwards because the end state is 2 driver interfaces
> > > for dealing with page mappings instead of one. My primary critique of
> > > HMM is that it creates a parallel universe of HMM apis rather than
> > > evolving the existing core apis.
> > 
> > Just to make it clear here is pseudo code:
> >     gup_range_dma_map() {...}
> > 
> >     hmm_range_dma_map() {
> >         hmm_specific_prep_step();
> >         gup_range_dma_map();
> 
> Does this GUP use FOLL_GET and then a put after the mmu_notifier is setup?

No, it avoids incrementing the page refcount altogether and uses mmu notifier
synchronization to guarantee that this is fine. Hence we need a way to do GUP
without incrementing the page refcount (ie no FOLL_GET but still returning
the pages).

> 
> >         hmm_specific_post_step();
> >     }
> > 
> > Like i said HMM do have the synchronization with mmu notifier to take
> > care of and other user of GUP and dma map pattern do not care about
> > that. Hence why not everything can be share between device driver that
> > can not do mmu notifier and other.
> > 
> > Is that not acceptable to you ? Should every driver duplicate the code
> > HMM factorize ?
> > 
> 
> In the final API you envision will drivers be able to call gup_range_dma_map()
> _or_ hmm_range_dma_map()?
> 
> If so, at that time how will drivers know which to call and parameters control
> those calls?

Devices that can handle invalidation at any time, and thus can support
mmu notifiers, will use HMM and thus the HMM version, and they will
always stick with the HMM version.

Devices that can not handle invalidation at any time, and thus require
a pin, will use the GUP version and always the GUP version.

What the HMM version adds is the extra synchronization with mmu
notifiers needed to ensure that not incrementing the page refcount is
fine. You can think of HMM mirror as a helper that handles the common
mmu notifier device driver pattern.
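
To make that split concrete, here is a rough side-by-side sketch
(illustration only, not code from this series; the pinning side is the
existing GUP + dma_map_page() pattern with error unwinding elided, and
my_pin_and_map()/my_mirror_and_map() are made-up names):

    /* assumes linux/mm.h, linux/dma-mapping.h, linux/hmm.h and that the
     * caller holds mmap_sem for read in both cases */

    /* Pinning driver (no mmu notifier support): classic GUP + dma map. */
    static long my_pin_and_map(struct device *dev, unsigned long start,
                               unsigned long npages, struct page **pages,
                               dma_addr_t *daddrs)
    {
        long i, ret;

        ret = get_user_pages(start, npages, FOLL_WRITE, pages, NULL);
        if (ret <= 0)
            return ret ? ret : -EFAULT;

        for (i = 0; i < ret; ++i) {
            daddrs[i] = dma_map_page(dev, pages[i], 0, PAGE_SIZE,
                                     DMA_BIDIRECTIONAL);
            if (dma_mapping_error(dev, daddrs[i]))
                return -EFAULT; /* real code must unwind and put_page() */
        }
        /* pages stay refcounted until the driver calls put_page() */
        return ret;
    }

    /* Mirror driver (mmu notifier capable): this patch, no refcount is
     * taken, correctness comes from mirror invalidation instead. */
    static long my_mirror_and_map(struct device *dev,
                                  struct hmm_range *range,
                                  dma_addr_t *daddrs)
    {
        return hmm_range_dma_map(range, dev, daddrs, true);
    }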

> 
> > 
> > > > So before converting HMM to use common GUP code under-neath those GUP
> > > > shortcoming (from HMM POV) need to be addressed and at the same time
> > > > the common dma map pattern can be added as an extra GUP helper.
> > > 
> > > If the HMM special cases are not being absorbed into the core-mm over
> > > time then I think this is going in the wrong direction. Specifically a
> > > direction that increases the long term maintenance burden over time as
> > > HMM drivers stay needlessly separated.
> > 
> > HMM is core mm and other thing like GUP do not need to absord all of HMM
> > as it would be forcing down on them mmu notifier and those other user can
> > not leverage mmu notifier. So forcing down something that is useless on
> > other is pointless, don't you agree ?
> > 
> > > 
> > > > The issue is that some of the above changes need to be done carefully
> > > > to not impact existing GUP users. So i rather clear some of my plate
> > > > before starting chewing on this carefully.
> > > 
> > > I urge you to put this kind of consideration first and not "merge
> > > first, ask hard questions later".
> > 
> > There is no hard question here. GUP does not handle THP optimization and
> > other thing HMM and ODP has. Adding this to GUP need to be done carefully
> > to not break existing GUP user. So i taking a small step approach since
> > when that is a bad thing. First merge HMM and ODP together then push down
> > common thing into GUP. It is a lot safer than a huge jump.
> 
> FWIW I think it is fine to have a new interface which allows new features
> during a transition is a good thing.  But if that comes at the price of leaving
> the old "deficient" interface sitting around that presents confusion for driver
> writers and we get users calling GUP when perhaps they should be calling HMM.

That is not the intention here, i am converting device drivers that can
use HMM to HMM. Those drivers do not need GUP in the sense that they do
not need the page refcount increment, and that is the short path HMM
provides today. Now i want to convert all devices that can follow that
model to use HMM (i already posted patchsets for amdgpu, radeon,
nouveau, i915 and ODP RDMA).

Device drivers that can not do mmu notifiers will never use HMM and
will stick to the GUP/dma map pattern. But i want to share the same
underlying code for both APIs later on.

So i do not see how it would confuse anyone. I am probably bad at
expressing intent, but HMM is not for all device drivers; it is only
for drivers that would be able to do mmu notifiers, and instead of
doing mmu notifiers directly and duplicating common code they can use
HMM, which has all the common code they would need.
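
For what it is worth, the driver side of that common pattern ends up
looking roughly like the sketch below (illustration only; drv->lock and
my_program_pagetables() are made-up driver names, and drv->lock is
assumed to be the same lock the driver's sync_cpu_device_pagetables()
callback takes, which is what makes the range->valid test meaningful):

    long ret;

again:
    ret = hmm_range_dma_map(range, dev, daddrs, true);
    if (ret == -EBUSY)
        goto again;            /* raced with an invalidation, retry */
    if (ret < 0)
        return ret;

    mutex_lock(&drv->lock);
    if (!range->valid) {
        /* invalidated between the map and taking the driver lock */
        mutex_unlock(&drv->lock);
        hmm_range_dma_unmap(range, NULL, dev, daddrs, false);
        goto again;
    }
    my_program_pagetables(drv, range, daddrs);
    mutex_unlock(&drv->lock);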

> 
> I think having GPL exports helps to ensure we can later merge these to make it
> clear to driver writers what the right thing to do is.

I am fine with GPL exports but i stress again that this does not help:
in the GPU world we had tons of GPL drivers that were not upstream. GPL
was not the issue. So i fail to see how GPL helps device driver writers
in any way.

> > 
> > > 
> > > > Also doing this patch first and then the GUP thing solve the first user
> > > > problem you have been asking for. With that code in first the first user
> > > > of the GUP convertion will be all the devices that use those two HMM
> > > > functions. In turn the first user of that code is the ODP RDMA patch i
> > > > already posted. Second will be nouveau once i tackle out some nouveau
> > > > changes. I expect amdgpu to come close third as a user and other device
> > > > driver who are working on HMM integration to come shortly after.
> > > 
> > > I appreciate that it has users, but the point of having users is so that
> > > the code review can actually be fruitful to see if the infrastructure makes
> > > sense, and in this case it seems to be duplicating an existing common
> > > pattern in the kernel.
> > 
> > It is not duplicating anything i am removing code at the end if you include
>            ^^^^^^^^^^^^^
> 
> The duplication is in how drivers indicate to the core that a set of pages is
> being used by the hardware the driver is controlling, what the rules for those
> pages are and how the use by that hardware is going to be coordinated with the
> other hardware vying for those pages.  There are differences, true, but
> fundamentally it would be nice for drivers to not have to care about the
> details.
> 
> Maybe that is a dream we will never realize but if there are going to be
> different ways for drivers to "get pages" then we need to make it clear what it
> means when those pages come to the driver and how they can be used safely.

This is exactly what HMM mirror is. Device drivers do not have to care about
mm gory details or about mmu notifier subtleties; HMM provides an abstracted
API that is easy to understand for device drivers and takes care of the subtle details.

Please read the HMM documentation and provide feedback if that is not clear.

Cheers,
Jérôme

Patch

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4263f8fb32e5..fc3630d0bbfd 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -502,6 +502,15 @@  int hmm_range_register(struct hmm_range *range,
 void hmm_range_unregister(struct hmm_range *range);
 long hmm_range_snapshot(struct hmm_range *range);
 long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+		       struct device *device,
+		       dma_addr_t *daddrs,
+		       bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+			 struct vm_area_struct *vma,
+			 struct device *device,
+			 dma_addr_t *daddrs,
+			 bool dirty);
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 0a4ff31e9d7a..9cd68334a759 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@ 
 #include <linux/hugetlb.h>
 #include <linux/memremap.h>
 #include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
@@ -985,6 +986,157 @@  long hmm_range_fault(struct hmm_range *range, bool block)
 	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
 }
 EXPORT_SYMBOL(hmm_range_fault);
+
+/*
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device against which to dma map pages
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem has been
+ *          dropped and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+		       struct device *device,
+		       dma_addr_t *daddrs,
+		       bool block)
+{
+	unsigned long i, npages, mapped;
+	long ret;
+
+	ret = hmm_range_fault(range, block);
+	if (ret <= 0)
+		return ret ? ret : -EBUSY;
+
+	npages = (range->end - range->start) >> PAGE_SHIFT;
+	for (i = 0, mapped = 0; i < npages; ++i) {
+		enum dma_data_direction dir = DMA_FROM_DEVICE;
+		struct page *page;
+
+		/*
+		 * FIXME: the DMA API should provide an invalid DMA address
+		 * value instead of a function to test the dma address. This
+		 * would remove a lot of dumb code duplicated across many archs.
+		 *
+		 * For now setting it to 0 here is good enough as the pfns[]
+		 * value is what is used to check what is valid and what isn't.
+		 */
+		daddrs[i] = 0;
+
+		page = hmm_pfn_to_page(range, range->pfns[i]);
+		if (page == NULL)
+			continue;
+
+		/* Check if range is being invalidated */
+		if (!range->valid) {
+			ret = -EBUSY;
+			goto unmap;
+		}
+
+		/* If it is read and write then map bi-directional. */
+		if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+			dir = DMA_BIDIRECTIONAL;
+
+		daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+		if (dma_mapping_error(device, daddrs[i])) {
+			ret = -EFAULT;
+			goto unmap;
+		}
+
+		mapped++;
+	}
+
+	return mapped;
+
+unmap:
+	for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+		enum dma_data_direction dir = DMA_FROM_DEVICE;
+		struct page *page;
+
+		page = hmm_pfn_to_page(range, range->pfns[i]);
+		if (page == NULL)
+			continue;
+
+		if (dma_mapping_error(device, daddrs[i]))
+			continue;
+
+		/* If it is read and write then map bi-directional. */
+		if (range->pfns[i] & range->values[HMM_PFN_WRITE])
+			dir = DMA_BIDIRECTIONAL;
+
+		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+		mapped--;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(hmm_range_dma_map);
+
+/*
+ * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma against which the range was faulted (optional)
+ * @device: device against which dma map was done
+ * @daddrs: dma address of mapped pages
+ * @dirty: dirty pages if they had the write flag set
+ * Returns: number of pages unmapped on success, -EINVAL otherwise
+ *
+ * Note that the caller MUST abide by mmu notifiers or use HMM mirror and abide
+ * by the sync_cpu_device_pagetables() callback so that it is safe here to
+ * call set_page_dirty(). The caller must also take appropriate locks to prevent
+ * concurrent mmu notifier or sync_cpu_device_pagetables() callbacks from making progress.
+ */
+long hmm_range_dma_unmap(struct hmm_range *range,
+			 struct vm_area_struct *vma,
+			 struct device *device,
+			 dma_addr_t *daddrs,
+			 bool dirty)
+{
+	unsigned long i, npages;
+	long cpages = 0;
+
+	/* Sanity check. */
+	if (range->end <= range->start)
+		return -EINVAL;
+	if (!daddrs)
+		return -EINVAL;
+	if (!range->pfns)
+		return -EINVAL;
+
+	npages = (range->end - range->start) >> PAGE_SHIFT;
+	for (i = 0; i < npages; ++i) {
+		enum dma_data_direction dir = DMA_FROM_DEVICE;
+		struct page *page;
+
+		page = hmm_pfn_to_page(range, range->pfns[i]);
+		if (page == NULL)
+			continue;
+
+		/* If it is read and write then map bi-directional. */
+		if (range->pfns[i] & range->values[HMM_PFN_WRITE]) {
+			dir = DMA_BIDIRECTIONAL;
+
+			/*
+			 * See comments in function description on why it is
+			 * safe here to call set_page_dirty()
+			 */
+			if (dirty)
+				set_page_dirty(page);
+		}
+
+		/* Unmap and clear pfns/dma address */
+		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+		range->pfns[i] = range->values[HMM_PFN_NONE];
+		/* FIXME see comments in hmm_range_dma_map() */
+		daddrs[i] = 0;
+		cpages++;
+	}
+
+	return cpages;
+}
+EXPORT_SYMBOL(hmm_range_dma_unmap);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
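
For reference, the teardown side as the kernel-doc above describes it
would look roughly like this (not part of the patch; drv->lock is
assumed to be the driver lock that serializes against the driver's
sync_cpu_device_pagetables() callback, which is what makes passing
dirty = true safe here):

    /* the device wrote through this mapping, so dirty pages on unmap */
    mutex_lock(&drv->lock);
    ret = hmm_range_dma_unmap(range, NULL, dev, daddrs, true);
    mutex_unlock(&drv->lock);
    hmm_range_unregister(range);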