[RESEND,1/7] mm/gup: Replace get_user_pages_longterm() with FOLL_LONGTERM
diff mbox series

Message ID 20190317183438.2057-2-ira.weiny@intel.com
State New
Headers show
Series
  • Add FOLL_LONGTERM to GUP fast and use it
Related show

Commit Message

Ira Weiny March 17, 2019, 6:34 p.m. UTC
From: Ira Weiny <ira.weiny@intel.com>

Rather than have a separate get_user_pages_longterm() call,
introduce FOLL_LONGTERM and change the longterm callers to use
it.

This patch does not change any functionality.

FOLL_LONGTERM can only be supported with get_user_pages() as it
requires vmas to determine if DAX is in use.

CC: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

---
Changes from V1:
	Rebased on 5.1 merge
	Adjusted for changes introduced by CONFIG_CMA
	Convert new users of GUP longterm
		io_uring.c
		xdp_umem.c

 arch/powerpc/mm/mmu_context_iommu.c        |   3 +-
 drivers/infiniband/core/umem.c             |   5 +-
 drivers/infiniband/hw/qib/qib_user_pages.c |   8 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c   |   9 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c  |   6 +-
 drivers/vfio/vfio_iommu_type1.c            |   3 +-
 fs/io_uring.c                              |   5 +-
 include/linux/mm.h                         |  14 +-
 mm/gup.c                                   | 171 ++++++++++++---------
 mm/gup_benchmark.c                         |   5 +-
 net/xdp/xdp_umem.c                         |   4 +-
 11 files changed, 129 insertions(+), 104 deletions(-)

Comments

Dan Williams March 22, 2019, 9:24 p.m. UTC | #1
On Sun, Mar 17, 2019 at 7:36 PM <ira.weiny@intel.com> wrote:
>
> From: Ira Weiny <ira.weiny@intel.com>
>
> Rather than have a separate get_user_pages_longterm() call,
> introduce FOLL_LONGTERM and change the longterm callers to use
> it.
>
> This patch does not change any functionality.
>
> FOLL_LONGTERM can only be supported with get_user_pages() as it
> requires vmas to determine if DAX is in use.
>
> CC: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: Michal Hocko <mhocko@kernel.org>
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
[..]
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2d483dbdffc0..6831077d126c 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
[..]
> @@ -2609,6 +2596,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
>  #define FOLL_REMOTE    0x2000  /* we are working on non-current tsk/mm */
>  #define FOLL_COW       0x4000  /* internal GUP flag */
>  #define FOLL_ANON      0x8000  /* don't do file mappings */
> +#define FOLL_LONGTERM  0x10000 /* mapping is intended for a long term pin */

Let's change this comment to say something like /* mapping lifetime is
indefinite / at the discretion of userspace */, since "longterm is not
well defined.

I think it should also include a /* FIXME: */ to say something about
the havoc a long term pin might wreak on fs and mm code paths.

>  static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
>  {
> diff --git a/mm/gup.c b/mm/gup.c
> index f84e22685aaa..8cb4cff067bc 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1112,26 +1112,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL(get_user_pages_remote);
>
> -/*
> - * This is the same as get_user_pages_remote(), just with a
> - * less-flexible calling convention where we assume that the task
> - * and mm being operated on are the current task's and don't allow
> - * passing of a locked parameter.  We also obviously don't pass
> - * FOLL_REMOTE in here.
> - */
> -long get_user_pages(unsigned long start, unsigned long nr_pages,
> -               unsigned int gup_flags, struct page **pages,
> -               struct vm_area_struct **vmas)
> -{
> -       return __get_user_pages_locked(current, current->mm, start, nr_pages,
> -                                      pages, vmas, NULL,
> -                                      gup_flags | FOLL_TOUCH);
> -}
> -EXPORT_SYMBOL(get_user_pages);
> -
>  #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
> -
> -#ifdef CONFIG_FS_DAX
>  static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
>  {
>         long i;
> @@ -1150,12 +1131,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
>         }
>         return false;
>  }
> -#else
> -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
> -{
> -       return false;
> -}
> -#endif
>
>  #ifdef CONFIG_CMA
>  static struct page *new_non_cma_page(struct page *page, unsigned long private)
> @@ -1209,10 +1184,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private)
>         return __alloc_pages_node(nid, gfp_mask, 0);
>  }
>
> -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> -                                       unsigned int gup_flags,
> +static long check_and_migrate_cma_pages(struct task_struct *tsk,
> +                                       struct mm_struct *mm,
> +                                       unsigned long start,
> +                                       unsigned long nr_pages,
>                                         struct page **pages,
> -                                       struct vm_area_struct **vmas)
> +                                       struct vm_area_struct **vmas,
> +                                       unsigned int gup_flags)
>  {
>         long i;
>         bool drain_allow = true;
> @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
>                                 putback_movable_pages(&cma_page_list);
>                 }
>                 /*
> -                * We did migrate all the pages, Try to get the page references again
> -                * migrating any new CMA pages which we failed to isolate earlier.
> +                * We did migrate all the pages, Try to get the page references
> +                * again migrating any new CMA pages which we failed to isolate
> +                * earlier.
>                  */
> -               nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
> +               nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
> +                                                  pages, vmas, NULL,
> +                                                  gup_flags);
> +

Why did this need to change to __get_user_pages_locked?

>                 if ((nr_pages > 0) && migrate_allow) {
>                         drain_allow = true;
>                         goto check_again;
> @@ -1281,66 +1263,115 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
>         return nr_pages;
>  }
>  #else
> -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> -                                              unsigned int gup_flags,
> -                                              struct page **pages,
> -                                              struct vm_area_struct **vmas)
> +static long check_and_migrate_cma_pages(struct task_struct *tsk,
> +                                       struct mm_struct *mm,
> +                                       unsigned long start,
> +                                       unsigned long nr_pages,
> +                                       struct page **pages,
> +                                       struct vm_area_struct **vmas,
> +                                       unsigned int gup_flags)
>  {
>         return nr_pages;
>  }
>  #endif
>
>  /*
> - * This is the same as get_user_pages() in that it assumes we are
> - * operating on the current task's mm, but it goes further to validate
> - * that the vmas associated with the address range are suitable for
> - * longterm elevated page reference counts. For example, filesystem-dax
> - * mappings are subject to the lifetime enforced by the filesystem and
> - * we need guarantees that longterm users like RDMA and V4L2 only
> - * establish mappings that have a kernel enforced revocation mechanism.
> + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which

s/uer/user/

> + * allows us to process the FOLL_LONGTERM flag if present.
> + *
> + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails
> + * the pin or attempts to migrate the page as appropriate.
> + *
> + * In the filesystem-dax case mappings are subject to the lifetime enforced by
> + * the filesystem and we need guarantees that longterm users like RDMA and V4L2
> + * only establish mappings that have a kernel enforced revocation mechanism.
> + *
> + * In the CMA case pages can't be pinned in a CMA region as this would
> + * unnecessarily fragment that region.  So CMA attempts to migrate the page
> + * before pinning.
>   *
>   * "longterm" == userspace controlled elevated page count lifetime.
>   * Contrast this to iov_iter_get_pages() usages which are transient.

Ah, here's the longterm documentation, but if I was a developer
considering whether to use FOLL_LONGTERM or not I would expect to find
the documentation at the flag definition site.

I think it has become more clear since get_user_pages_longterm() was
initially merged that we need to warn people not to use it, or at
least seriously reconsider whether they want an interface to support
indefinite pins.

>   */
> -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
> -                            unsigned int gup_flags, struct page **pages,
> -                            struct vm_area_struct **vmas_arg)
> +static __always_inline long __gup_longterm_locked(struct task_struct *tsk,

...why the __always_inline?
Ira Weiny March 25, 2019, 6:19 a.m. UTC | #2
On Fri, Mar 22, 2019 at 02:24:40PM -0700, Dan Williams wrote:
> On Sun, Mar 17, 2019 at 7:36 PM <ira.weiny@intel.com> wrote:
> >
> > From: Ira Weiny <ira.weiny@intel.com>
> >
> > Rather than have a separate get_user_pages_longterm() call,
> > introduce FOLL_LONGTERM and change the longterm callers to use
> > it.
> >
> > This patch does not change any functionality.
> >
> > FOLL_LONGTERM can only be supported with get_user_pages() as it
> > requires vmas to determine if DAX is in use.
> >
> > CC: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > CC: Andrew Morton <akpm@linux-foundation.org>
> > CC: Michal Hocko <mhocko@kernel.org>
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> [..]
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 2d483dbdffc0..6831077d126c 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> [..]
> > @@ -2609,6 +2596,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
> >  #define FOLL_REMOTE    0x2000  /* we are working on non-current tsk/mm */
> >  #define FOLL_COW       0x4000  /* internal GUP flag */
> >  #define FOLL_ANON      0x8000  /* don't do file mappings */
> > +#define FOLL_LONGTERM  0x10000 /* mapping is intended for a long term pin */
> 
> Let's change this comment to say something like /* mapping lifetime is
> indefinite / at the discretion of userspace */, since "longterm is not
> well defined.
> 
> I think it should also include a /* FIXME: */ to say something about
> the havoc a long term pin might wreak on fs and mm code paths.

Will do.

> 
> >  static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
> >  {
> > diff --git a/mm/gup.c b/mm/gup.c
> > index f84e22685aaa..8cb4cff067bc 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -1112,26 +1112,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
> >  }
> >  EXPORT_SYMBOL(get_user_pages_remote);
> >
> > -/*
> > - * This is the same as get_user_pages_remote(), just with a
> > - * less-flexible calling convention where we assume that the task
> > - * and mm being operated on are the current task's and don't allow
> > - * passing of a locked parameter.  We also obviously don't pass
> > - * FOLL_REMOTE in here.
> > - */
> > -long get_user_pages(unsigned long start, unsigned long nr_pages,
> > -               unsigned int gup_flags, struct page **pages,
> > -               struct vm_area_struct **vmas)
> > -{
> > -       return __get_user_pages_locked(current, current->mm, start, nr_pages,
> > -                                      pages, vmas, NULL,
> > -                                      gup_flags | FOLL_TOUCH);
> > -}
> > -EXPORT_SYMBOL(get_user_pages);
> > -
> >  #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
> > -
> > -#ifdef CONFIG_FS_DAX
> >  static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
> >  {
> >         long i;
> > @@ -1150,12 +1131,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
> >         }
> >         return false;
> >  }
> > -#else
> > -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
> > -{
> > -       return false;
> > -}
> > -#endif
> >
> >  #ifdef CONFIG_CMA
> >  static struct page *new_non_cma_page(struct page *page, unsigned long private)
> > @@ -1209,10 +1184,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private)
> >         return __alloc_pages_node(nid, gfp_mask, 0);
> >  }
> >
> > -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> > -                                       unsigned int gup_flags,
> > +static long check_and_migrate_cma_pages(struct task_struct *tsk,
> > +                                       struct mm_struct *mm,
> > +                                       unsigned long start,
> > +                                       unsigned long nr_pages,
> >                                         struct page **pages,
> > -                                       struct vm_area_struct **vmas)
> > +                                       struct vm_area_struct **vmas,
> > +                                       unsigned int gup_flags)
> >  {
> >         long i;
> >         bool drain_allow = true;
> > @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> >                                 putback_movable_pages(&cma_page_list);
> >                 }
> >                 /*
> > -                * We did migrate all the pages, Try to get the page references again
> > -                * migrating any new CMA pages which we failed to isolate earlier.
> > +                * We did migrate all the pages, Try to get the page references
> > +                * again migrating any new CMA pages which we failed to isolate
> > +                * earlier.
> >                  */
> > -               nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
> > +               nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
> > +                                                  pages, vmas, NULL,
> > +                                                  gup_flags);
> > +
> 
> Why did this need to change to __get_user_pages_locked?

__get_uer_pages_locked() is now the "internal call" for get_user_pages.

Technically it did not _have_ to change but there is no need to call
get_user_pages() again because the FOLL_TOUCH flags is already set.  Also this
call then matches the __get_user_pages_locked() which was called on the pages
we migrated from.  Mostly this keeps the code "symmetrical" in that we called
__get_user_pages_locked() on the pages we are migrating from and the same call
on the pages we migrated to.

While the change here looks funny I think the final code is better.

> 
> >                 if ((nr_pages > 0) && migrate_allow) {
> >                         drain_allow = true;
> >                         goto check_again;
> > @@ -1281,66 +1263,115 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> >         return nr_pages;
> >  }
> >  #else
> > -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> > -                                              unsigned int gup_flags,
> > -                                              struct page **pages,
> > -                                              struct vm_area_struct **vmas)
> > +static long check_and_migrate_cma_pages(struct task_struct *tsk,
> > +                                       struct mm_struct *mm,
> > +                                       unsigned long start,
> > +                                       unsigned long nr_pages,
> > +                                       struct page **pages,
> > +                                       struct vm_area_struct **vmas,
> > +                                       unsigned int gup_flags)
> >  {
> >         return nr_pages;
> >  }
> >  #endif
> >
> >  /*
> > - * This is the same as get_user_pages() in that it assumes we are
> > - * operating on the current task's mm, but it goes further to validate
> > - * that the vmas associated with the address range are suitable for
> > - * longterm elevated page reference counts. For example, filesystem-dax
> > - * mappings are subject to the lifetime enforced by the filesystem and
> > - * we need guarantees that longterm users like RDMA and V4L2 only
> > - * establish mappings that have a kernel enforced revocation mechanism.
> > + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which
> 
> s/uer/user/

Check

> 
> > + * allows us to process the FOLL_LONGTERM flag if present.
> > + *
> > + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails
> > + * the pin or attempts to migrate the page as appropriate.
> > + *
> > + * In the filesystem-dax case mappings are subject to the lifetime enforced by
> > + * the filesystem and we need guarantees that longterm users like RDMA and V4L2
> > + * only establish mappings that have a kernel enforced revocation mechanism.
> > + *
> > + * In the CMA case pages can't be pinned in a CMA region as this would
> > + * unnecessarily fragment that region.  So CMA attempts to migrate the page
> > + * before pinning.
> >   *
> >   * "longterm" == userspace controlled elevated page count lifetime.
> >   * Contrast this to iov_iter_get_pages() usages which are transient.
> 
> Ah, here's the longterm documentation, but if I was a developer
> considering whether to use FOLL_LONGTERM or not I would expect to find
> the documentation at the flag definition site.
> 
> I think it has become more clear since get_user_pages_longterm() was
> initially merged that we need to warn people not to use it, or at
> least seriously reconsider whether they want an interface to support
> indefinite pins.

Good point will move

> 
> >   */
> > -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
> > -                            unsigned int gup_flags, struct page **pages,
> > -                            struct vm_area_struct **vmas_arg)
> > +static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
> 
> ...why the __always_inline?
 
This was because it was only called from get_user_pages() in this patch.  But
later on I use it elsewhere so __always_inline is wrong.

Ira
Ira Weiny March 25, 2019, 8:46 a.m. UTC | #3
On Mon, Mar 25, 2019 at 09:45:12AM -0700, Dan Williams wrote:
> On Mon, Mar 25, 2019 at 7:21 AM Ira Weiny <ira.weiny@intel.com> wrote:
> [..]
> > > > @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> > > >                                 putback_movable_pages(&cma_page_list);
> > > >                 }
> > > >                 /*
> > > > -                * We did migrate all the pages, Try to get the page references again
> > > > -                * migrating any new CMA pages which we failed to isolate earlier.
> > > > +                * We did migrate all the pages, Try to get the page references
> > > > +                * again migrating any new CMA pages which we failed to isolate
> > > > +                * earlier.
> > > >                  */
> > > > -               nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
> > > > +               nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
> > > > +                                                  pages, vmas, NULL,
> > > > +                                                  gup_flags);
> > > > +
> > >
> > > Why did this need to change to __get_user_pages_locked?
> >
> > __get_uer_pages_locked() is now the "internal call" for get_user_pages.
> >
> > Technically it did not _have_ to change but there is no need to call
> > get_user_pages() again because the FOLL_TOUCH flags is already set.  Also this
> > call then matches the __get_user_pages_locked() which was called on the pages
> > we migrated from.  Mostly this keeps the code "symmetrical" in that we called
> > __get_user_pages_locked() on the pages we are migrating from and the same call
> > on the pages we migrated to.
> >
> > While the change here looks funny I think the final code is better.
> 
> Agree, but I think that either needs to be noted in the changelog so
> it's not a surprise, or moved to a follow-on cleanup patch.
> 

Can do...

Thanks!
Ira
Ira Weiny March 25, 2019, 10:27 a.m. UTC | #4
On Fri, Mar 22, 2019 at 02:24:40PM -0700, Dan Williams wrote:
> On Sun, Mar 17, 2019 at 7:36 PM <ira.weiny@intel.com> wrote:

[snip]

> > + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which
> 
> s/uer/user/
> 
> > + * allows us to process the FOLL_LONGTERM flag if present.
> > + *
> > + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails
> > + * the pin or attempts to migrate the page as appropriate.
> > + *
> > + * In the filesystem-dax case mappings are subject to the lifetime enforced by
> > + * the filesystem and we need guarantees that longterm users like RDMA and V4L2
> > + * only establish mappings that have a kernel enforced revocation mechanism.
> > + *
> > + * In the CMA case pages can't be pinned in a CMA region as this would
> > + * unnecessarily fragment that region.  So CMA attempts to migrate the page
> > + * before pinning.
> >   *
> >   * "longterm" == userspace controlled elevated page count lifetime.
> >   * Contrast this to iov_iter_get_pages() usages which are transient.
> 
> Ah, here's the longterm documentation, but if I was a developer
> considering whether to use FOLL_LONGTERM or not I would expect to find
> the documentation at the flag definition site.
> 
> I think it has become more clear since get_user_pages_longterm() was
> initially merged that we need to warn people not to use it, or at
> least seriously reconsider whether they want an interface to support
> indefinite pins.

I will move the comment to the flag definition but...

In reviewing this comment it occurs to me that the addition of special casing
CMA regions via FOLL_LONGTERM has made it less experimental/temporary and now
simply implies intent to the GUP code as to the use of the pages.

As I'm not super familiar with the CMA use case I can't say for certain but it
seems that it is not a temporary solution.

So I'm not going to refrain from a FIXME WRT removing the flag.

New suggested text below.

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6831077d126c..5db9d8e894aa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2596,7 +2596,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_REMOTE    0x2000  /* we are working on non-current tsk/mm */
 #define FOLL_COW       0x4000  /* internal GUP flag */
 #define FOLL_ANON      0x8000  /* don't do file mappings */
-#define FOLL_LONGTERM  0x10000 /* mapping is intended for a long term pin */
+#define FOLL_LONGTERM  0x10000 /* mapping lifetime is indefinite: see below */
+
+/*
+ * NOTE on FOLL_LONGTERM:
+ *
+ * FOLL_LONGTERM indicates that the page will be held for an indefinite time
+ * period _often_ under userspace control.  This is contrasted with
+ * iov_iter_get_pages() where usages which are transient.
+ *
+ * FIXME: For pages which are part of a filesystem, mappings are subject to the
+ * lifetime enforced by the filesystem and we need guarantees that longterm
+ * users like RDMA and V4L2 only establish mappings which coordinate usage with
+ * the filesystem.  Ideas for this coordination include revoking the longterm
+ * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was
+ * added after the problem with filesystems was found FS DAX VMAs are
+ * specifically failed.  Filesystem pages are still subject to bugs and use of
+ * FOLL_LONGTERM should be avoided on those pages.
+ *
+ * In the CMA case: longterm pins in a CMA region would unnecessarily fragment
+ * that region.  And so CMA attempts to migrate the page before pinning when
+ * FOLL_LONGTERM is specified.
+ */
 
 static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 {
Dan Williams March 25, 2019, 4:45 p.m. UTC | #5
On Mon, Mar 25, 2019 at 7:21 AM Ira Weiny <ira.weiny@intel.com> wrote:
[..]
> > > @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
> > >                                 putback_movable_pages(&cma_page_list);
> > >                 }
> > >                 /*
> > > -                * We did migrate all the pages, Try to get the page references again
> > > -                * migrating any new CMA pages which we failed to isolate earlier.
> > > +                * We did migrate all the pages, Try to get the page references
> > > +                * again migrating any new CMA pages which we failed to isolate
> > > +                * earlier.
> > >                  */
> > > -               nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
> > > +               nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
> > > +                                                  pages, vmas, NULL,
> > > +                                                  gup_flags);
> > > +
> >
> > Why did this need to change to __get_user_pages_locked?
>
> __get_uer_pages_locked() is now the "internal call" for get_user_pages.
>
> Technically it did not _have_ to change but there is no need to call
> get_user_pages() again because the FOLL_TOUCH flags is already set.  Also this
> call then matches the __get_user_pages_locked() which was called on the pages
> we migrated from.  Mostly this keeps the code "symmetrical" in that we called
> __get_user_pages_locked() on the pages we are migrating from and the same call
> on the pages we migrated to.
>
> While the change here looks funny I think the final code is better.

Agree, but I think that either needs to be noted in the changelog so
it's not a surprise, or moved to a follow-on cleanup patch.

Patch
diff mbox series

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index e7a9c4f6bfca..2bd48998765e 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -148,7 +148,8 @@  static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 	}
 
 	down_read(&mm->mmap_sem);
-	ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
+	ret = get_user_pages(ua, entries, FOLL_WRITE | FOLL_LONGTERM,
+			     mem->hpages, NULL);
 	up_read(&mm->mmap_sem);
 	if (ret != entries) {
 		/* free the reference taken */
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index fe5551562dbc..31191f098e73 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -189,10 +189,11 @@  struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 
 	while (npages) {
 		down_read(&mm->mmap_sem);
-		ret = get_user_pages_longterm(cur_base,
+		ret = get_user_pages(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
-				     gup_flags, page_list, vma_list);
+				     gup_flags | FOLL_LONGTERM,
+				     page_list, vma_list);
 		if (ret < 0) {
 			up_read(&mm->mmap_sem);
 			goto umem_release;
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 123ca8f64f75..f712fb7fa82f 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -114,10 +114,10 @@  int qib_get_user_pages(unsigned long start_page, size_t num_pages,
 
 	down_read(&current->mm->mmap_sem);
 	for (got = 0; got < num_pages; got += ret) {
-		ret = get_user_pages_longterm(start_page + got * PAGE_SIZE,
-					      num_pages - got,
-					      FOLL_WRITE | FOLL_FORCE,
-					      p + got, NULL);
+		ret = get_user_pages(start_page + got * PAGE_SIZE,
+				     num_pages - got,
+				     FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
+				     p + got, NULL);
 		if (ret < 0) {
 			up_read(&current->mm->mmap_sem);
 			goto bail_release;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 06862a6af185..1d9a182ac163 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -143,10 +143,11 @@  static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
 	ret = 0;
 
 	while (npages) {
-		ret = get_user_pages_longterm(cur_base,
-					min_t(unsigned long, npages,
-					PAGE_SIZE / sizeof(struct page *)),
-					gup_flags, page_list, NULL);
+		ret = get_user_pages(cur_base,
+				     min_t(unsigned long, npages,
+				     PAGE_SIZE / sizeof(struct page *)),
+				     gup_flags | FOLL_LONGTERM,
+				     page_list, NULL);
 
 		if (ret < 0)
 			goto out;
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 08929c087e27..870a2a526e0b 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -186,12 +186,12 @@  static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
 	dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
 		data, size, dma->nr_pages);
 
-	err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
-			     flags, dma->pages, NULL);
+	err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+			     flags | FOLL_LONGTERM, dma->pages, NULL);
 
 	if (err != dma->nr_pages) {
 		dma->nr_pages = (err >= 0) ? err : 0;
-		dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+		dprintk(1, "get_user_pages: err=%d [%d]\n", err,
 			dma->nr_pages);
 		return err < 0 ? err : -EINVAL;
 	}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 73652e21efec..1500bd0bb6da 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -351,7 +351,8 @@  static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 
 	down_read(&mm->mmap_sem);
 	if (mm == current->mm) {
-		ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas);
+		ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
+				     vmas);
 	} else {
 		ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
 					    vmas, NULL);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e2bd77da5e21..8d61b81373bd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2450,8 +2450,9 @@  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 
 		ret = 0;
 		down_read(&current->mm->mmap_sem);
-		pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
-						pages, vmas);
+		pret = get_user_pages(ubuf, nr_pages,
+				      FOLL_WRITE | FOLL_LONGTERM,
+				      pages, vmas);
 		if (pret == nr_pages) {
 			/* don't support file backed memory */
 			for (j = 0; j < nr_pages; j++) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2d483dbdffc0..6831077d126c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1531,19 +1531,6 @@  long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 		    struct page **pages, unsigned int gup_flags);
 
-#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
-			    unsigned int gup_flags, struct page **pages,
-			    struct vm_area_struct **vmas);
-#else
-static inline long get_user_pages_longterm(unsigned long start,
-		unsigned long nr_pages, unsigned int gup_flags,
-		struct page **pages, struct vm_area_struct **vmas)
-{
-	return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-}
-#endif /* CONFIG_FS_DAX */
-
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 
@@ -2609,6 +2596,7 @@  struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_REMOTE	0x2000	/* we are working on non-current tsk/mm */
 #define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
+#define FOLL_LONGTERM	0x10000	/* mapping is intended for a long term pin */
 
 static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 {
diff --git a/mm/gup.c b/mm/gup.c
index f84e22685aaa..8cb4cff067bc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1112,26 +1112,7 @@  long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter.  We also obviously don't pass
- * FOLL_REMOTE in here.
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
-		unsigned int gup_flags, struct page **pages,
-		struct vm_area_struct **vmas)
-{
-	return __get_user_pages_locked(current, current->mm, start, nr_pages,
-				       pages, vmas, NULL,
-				       gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
-
 #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-
-#ifdef CONFIG_FS_DAX
 static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
 {
 	long i;
@@ -1150,12 +1131,6 @@  static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
 	}
 	return false;
 }
-#else
-static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
-{
-	return false;
-}
-#endif
 
 #ifdef CONFIG_CMA
 static struct page *new_non_cma_page(struct page *page, unsigned long private)
@@ -1209,10 +1184,13 @@  static struct page *new_non_cma_page(struct page *page, unsigned long private)
 	return __alloc_pages_node(nid, gfp_mask, 0);
 }
 
-static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
-					unsigned int gup_flags,
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+					struct mm_struct *mm,
+					unsigned long start,
+					unsigned long nr_pages,
 					struct page **pages,
-					struct vm_area_struct **vmas)
+					struct vm_area_struct **vmas,
+					unsigned int gup_flags)
 {
 	long i;
 	bool drain_allow = true;
@@ -1268,10 +1246,14 @@  static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
 				putback_movable_pages(&cma_page_list);
 		}
 		/*
-		 * We did migrate all the pages, Try to get the page references again
-		 * migrating any new CMA pages which we failed to isolate earlier.
+		 * We did migrate all the pages, Try to get the page references
+		 * again migrating any new CMA pages which we failed to isolate
+		 * earlier.
 		 */
-		nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+		nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+						   pages, vmas, NULL,
+						   gup_flags);
+
 		if ((nr_pages > 0) && migrate_allow) {
 			drain_allow = true;
 			goto check_again;
@@ -1281,66 +1263,115 @@  static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
 	return nr_pages;
 }
 #else
-static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
-					       unsigned int gup_flags,
-					       struct page **pages,
-					       struct vm_area_struct **vmas)
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+					struct mm_struct *mm,
+					unsigned long start,
+					unsigned long nr_pages,
+					struct page **pages,
+					struct vm_area_struct **vmas,
+					unsigned int gup_flags)
 {
 	return nr_pages;
 }
 #endif
 
 /*
- * This is the same as get_user_pages() in that it assumes we are
- * operating on the current task's mm, but it goes further to validate
- * that the vmas associated with the address range are suitable for
- * longterm elevated page reference counts. For example, filesystem-dax
- * mappings are subject to the lifetime enforced by the filesystem and
- * we need guarantees that longterm users like RDMA and V4L2 only
- * establish mappings that have a kernel enforced revocation mechanism.
+ * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag if present.
+ *
+ * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails
+ * the pin or attempts to migrate the page as appropriate.
+ *
+ * In the filesystem-dax case mappings are subject to the lifetime enforced by
+ * the filesystem and we need guarantees that longterm users like RDMA and V4L2
+ * only establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * In the CMA case pages can't be pinned in a CMA region as this would
+ * unnecessarily fragment that region.  So CMA attempts to migrate the page
+ * before pinning.
  *
  * "longterm" == userspace controlled elevated page count lifetime.
  * Contrast this to iov_iter_get_pages() usages which are transient.
  */
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
-			     unsigned int gup_flags, struct page **pages,
-			     struct vm_area_struct **vmas_arg)
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+						  struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long nr_pages,
+						  struct page **pages,
+						  struct vm_area_struct **vmas,
+						  unsigned int gup_flags)
 {
-	struct vm_area_struct **vmas = vmas_arg;
-	unsigned long flags;
+	struct vm_area_struct **vmas_tmp = vmas;
+	unsigned long flags = 0;
 	long rc, i;
 
-	if (!pages)
-		return -EINVAL;
-
-	if (!vmas) {
-		vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
-			       GFP_KERNEL);
-		if (!vmas)
-			return -ENOMEM;
+	if (gup_flags & FOLL_LONGTERM) {
+		if (!pages)
+			return -EINVAL;
+
+		if (!vmas_tmp) {
+			vmas_tmp = kcalloc(nr_pages,
+					   sizeof(struct vm_area_struct *),
+					   GFP_KERNEL);
+			if (!vmas_tmp)
+				return -ENOMEM;
+		}
+		flags = memalloc_nocma_save();
 	}
 
-	flags = memalloc_nocma_save();
-	rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-	memalloc_nocma_restore(flags);
-	if (rc < 0)
-		goto out;
+	rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+				     vmas_tmp, NULL, gup_flags);
 
-	if (check_dax_vmas(vmas, rc)) {
-		for (i = 0; i < rc; i++)
-			put_page(pages[i]);
-		rc = -EOPNOTSUPP;
-		goto out;
+	if (gup_flags & FOLL_LONGTERM) {
+		memalloc_nocma_restore(flags);
+		if (rc < 0)
+			goto out;
+
+		if (check_dax_vmas(vmas_tmp, rc)) {
+			for (i = 0; i < rc; i++)
+				put_page(pages[i]);
+			rc = -EOPNOTSUPP;
+			goto out;
+		}
+
+		rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+						 vmas_tmp, gup_flags);
 	}
 
-	rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
 out:
-	if (vmas != vmas_arg)
-		kfree(vmas);
+	if (vmas_tmp != vmas)
+		kfree(vmas_tmp);
 	return rc;
 }
-EXPORT_SYMBOL(get_user_pages_longterm);
-#endif /* CONFIG_FS_DAX */
+#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+						  struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long nr_pages,
+						  struct page **pages,
+						  struct vm_area_struct **vmas,
+						  unsigned int flags)
+{
+	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+				       NULL, flags);
+}
+#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter.  We also obviously don't pass
+ * FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+		unsigned int gup_flags, struct page **pages,
+		struct vm_area_struct **vmas)
+{
+	return __gup_longterm_locked(current, current->mm, start, nr_pages,
+				     pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
 
 /**
  * populate_vma_page_range() -  populate a range of pages in the vma.
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 6c0279e70cc4..7dd602d7f8db 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -54,8 +54,9 @@  static int __gup_benchmark_ioctl(unsigned int cmd,
 						 pages + i);
 			break;
 		case GUP_LONGTERM_BENCHMARK:
-			nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
-						     pages + i, NULL);
+			nr = get_user_pages(addr, nr,
+					    (gup->flags & 1) | FOLL_LONGTERM,
+					    pages + i, NULL);
 			break;
 		case GUP_BENCHMARK:
 			nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 77520eacee8f..ab489454a63e 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -267,8 +267,8 @@  static int xdp_umem_pin_pages(struct xdp_umem *umem)
 		return -ENOMEM;
 
 	down_read(&current->mm->mmap_sem);
-	npgs = get_user_pages_longterm(umem->address, umem->npgs,
-				       gup_flags, &umem->pgs[0], NULL);
+	npgs = get_user_pages(umem->address, umem->npgs,
+			      gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
 	up_read(&current->mm->mmap_sem);
 
 	if (npgs != umem->npgs) {