Message ID: 20190317183438.2057-2-ira.weiny@intel.com (mailing list archive)
State: New, archived
Series: Add FOLL_LONGTERM to GUP fast and use it
On Sun, Mar 17, 2019 at 7:36 PM <ira.weiny@intel.com> wrote: > > From: Ira Weiny <ira.weiny@intel.com> > > Rather than have a separate get_user_pages_longterm() call, > introduce FOLL_LONGTERM and change the longterm callers to use > it. > > This patch does not change any functionality. > > FOLL_LONGTERM can only be supported with get_user_pages() as it > requires vmas to determine if DAX is in use. > > CC: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> > CC: Andrew Morton <akpm@linux-foundation.org> > CC: Michal Hocko <mhocko@kernel.org> > Signed-off-by: Ira Weiny <ira.weiny@intel.com> [..] > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 2d483dbdffc0..6831077d126c 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h [..] > @@ -2609,6 +2596,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, > #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ > #define FOLL_COW 0x4000 /* internal GUP flag */ > #define FOLL_ANON 0x8000 /* don't do file mappings */ > +#define FOLL_LONGTERM 0x10000 /* mapping is intended for a long term pin */ Let's change this comment to say something like /* mapping lifetime is indefinite / at the discretion of userspace */, since "longterm is not well defined. I think it should also include a /* FIXME: */ to say something about the havoc a long term pin might wreak on fs and mm code paths. > static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) > { > diff --git a/mm/gup.c b/mm/gup.c > index f84e22685aaa..8cb4cff067bc 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -1112,26 +1112,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, > } > EXPORT_SYMBOL(get_user_pages_remote); > > -/* > - * This is the same as get_user_pages_remote(), just with a > - * less-flexible calling convention where we assume that the task > - * and mm being operated on are the current task's and don't allow > - * passing of a locked parameter. We also obviously don't pass > - * FOLL_REMOTE in here. 
> - */ > -long get_user_pages(unsigned long start, unsigned long nr_pages, > - unsigned int gup_flags, struct page **pages, > - struct vm_area_struct **vmas) > -{ > - return __get_user_pages_locked(current, current->mm, start, nr_pages, > - pages, vmas, NULL, > - gup_flags | FOLL_TOUCH); > -} > -EXPORT_SYMBOL(get_user_pages); > - > #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) > - > -#ifdef CONFIG_FS_DAX > static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) > { > long i; > @@ -1150,12 +1131,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) > } > return false; > } > -#else > -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) > -{ > - return false; > -} > -#endif > > #ifdef CONFIG_CMA > static struct page *new_non_cma_page(struct page *page, unsigned long private) > @@ -1209,10 +1184,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) > return __alloc_pages_node(nid, gfp_mask, 0); > } > > -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > - unsigned int gup_flags, > +static long check_and_migrate_cma_pages(struct task_struct *tsk, > + struct mm_struct *mm, > + unsigned long start, > + unsigned long nr_pages, > struct page **pages, > - struct vm_area_struct **vmas) > + struct vm_area_struct **vmas, > + unsigned int gup_flags) > { > long i; > bool drain_allow = true; > @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > putback_movable_pages(&cma_page_list); > } > /* > - * We did migrate all the pages, Try to get the page references again > - * migrating any new CMA pages which we failed to isolate earlier. > + * We did migrate all the pages, Try to get the page references > + * again migrating any new CMA pages which we failed to isolate > + * earlier. > */ > - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); > + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, > + pages, vmas, NULL, > + gup_flags); > + Why did this need to change to __get_user_pages_locked? > if ((nr_pages > 0) && migrate_allow) { > drain_allow = true; > goto check_again; > @@ -1281,66 +1263,115 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > return nr_pages; > } > #else > -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > - unsigned int gup_flags, > - struct page **pages, > - struct vm_area_struct **vmas) > +static long check_and_migrate_cma_pages(struct task_struct *tsk, > + struct mm_struct *mm, > + unsigned long start, > + unsigned long nr_pages, > + struct page **pages, > + struct vm_area_struct **vmas, > + unsigned int gup_flags) > { > return nr_pages; > } > #endif > > /* > - * This is the same as get_user_pages() in that it assumes we are > - * operating on the current task's mm, but it goes further to validate > - * that the vmas associated with the address range are suitable for > - * longterm elevated page reference counts. For example, filesystem-dax > - * mappings are subject to the lifetime enforced by the filesystem and > - * we need guarantees that longterm users like RDMA and V4L2 only > - * establish mappings that have a kernel enforced revocation mechanism. > + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which s/uer/user/ > + * allows us to process the FOLL_LONGTERM flag if present. 
> + * > + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails > + * the pin or attempts to migrate the page as appropriate. > + * > + * In the filesystem-dax case mappings are subject to the lifetime enforced by > + * the filesystem and we need guarantees that longterm users like RDMA and V4L2 > + * only establish mappings that have a kernel enforced revocation mechanism. > + * > + * In the CMA case pages can't be pinned in a CMA region as this would > + * unnecessarily fragment that region. So CMA attempts to migrate the page > + * before pinning. > * > * "longterm" == userspace controlled elevated page count lifetime. > * Contrast this to iov_iter_get_pages() usages which are transient. Ah, here's the longterm documentation, but if I was a developer considering whether to use FOLL_LONGTERM or not I would expect to find the documentation at the flag definition site. I think it has become more clear since get_user_pages_longterm() was initially merged that we need to warn people not to use it, or at least seriously reconsider whether they want an interface to support indefinite pins. > */ > -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, > - unsigned int gup_flags, struct page **pages, > - struct vm_area_struct **vmas_arg) > +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, ...why the __always_inline?
On Fri, Mar 22, 2019 at 02:24:40PM -0700, Dan Williams wrote: > On Sun, Mar 17, 2019 at 7:36 PM <ira.weiny@intel.com> wrote: > > > > From: Ira Weiny <ira.weiny@intel.com> > > > > Rather than have a separate get_user_pages_longterm() call, > > introduce FOLL_LONGTERM and change the longterm callers to use > > it. > > > > This patch does not change any functionality. > > > > FOLL_LONGTERM can only be supported with get_user_pages() as it > > requires vmas to determine if DAX is in use. > > > > CC: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> > > CC: Andrew Morton <akpm@linux-foundation.org> > > CC: Michal Hocko <mhocko@kernel.org> > > Signed-off-by: Ira Weiny <ira.weiny@intel.com> > [..] > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 2d483dbdffc0..6831077d126c 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > [..] > > @@ -2609,6 +2596,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, > > #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ > > #define FOLL_COW 0x4000 /* internal GUP flag */ > > #define FOLL_ANON 0x8000 /* don't do file mappings */ > > +#define FOLL_LONGTERM 0x10000 /* mapping is intended for a long term pin */ > > Let's change this comment to say something like /* mapping lifetime is > indefinite / at the discretion of userspace */, since "longterm is not > well defined. > > I think it should also include a /* FIXME: */ to say something about > the havoc a long term pin might wreak on fs and mm code paths. Will do. > > > static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) > > { > > diff --git a/mm/gup.c b/mm/gup.c > > index f84e22685aaa..8cb4cff067bc 100644 > > --- a/mm/gup.c > > +++ b/mm/gup.c > > @@ -1112,26 +1112,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, > > } > > EXPORT_SYMBOL(get_user_pages_remote); > > > > -/* > > - * This is the same as get_user_pages_remote(), just with a > > - * less-flexible calling convention where we assume that the task > > - * and mm being operated on are the current task's and don't allow > > - * passing of a locked parameter. We also obviously don't pass > > - * FOLL_REMOTE in here. 
> > - */ > > -long get_user_pages(unsigned long start, unsigned long nr_pages, > > - unsigned int gup_flags, struct page **pages, > > - struct vm_area_struct **vmas) > > -{ > > - return __get_user_pages_locked(current, current->mm, start, nr_pages, > > - pages, vmas, NULL, > > - gup_flags | FOLL_TOUCH); > > -} > > -EXPORT_SYMBOL(get_user_pages); > > - > > #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) > > - > > -#ifdef CONFIG_FS_DAX > > static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) > > { > > long i; > > @@ -1150,12 +1131,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) > > } > > return false; > > } > > -#else > > -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) > > -{ > > - return false; > > -} > > -#endif > > > > #ifdef CONFIG_CMA > > static struct page *new_non_cma_page(struct page *page, unsigned long private) > > @@ -1209,10 +1184,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) > > return __alloc_pages_node(nid, gfp_mask, 0); > > } > > > > -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > > - unsigned int gup_flags, > > +static long check_and_migrate_cma_pages(struct task_struct *tsk, > > + struct mm_struct *mm, > > + unsigned long start, > > + unsigned long nr_pages, > > struct page **pages, > > - struct vm_area_struct **vmas) > > + struct vm_area_struct **vmas, > > + unsigned int gup_flags) > > { > > long i; > > bool drain_allow = true; > > @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > > putback_movable_pages(&cma_page_list); > > } > > /* > > - * We did migrate all the pages, Try to get the page references again > > - * migrating any new CMA pages which we failed to isolate earlier. > > + * We did migrate all the pages, Try to get the page references > > + * again migrating any new CMA pages which we failed to isolate > > + * earlier. > > */ > > - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); > > + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, > > + pages, vmas, NULL, > > + gup_flags); > > + > > Why did this need to change to __get_user_pages_locked? __get_uer_pages_locked() is now the "internal call" for get_user_pages. Technically it did not _have_ to change but there is no need to call get_user_pages() again because the FOLL_TOUCH flags is already set. Also this call then matches the __get_user_pages_locked() which was called on the pages we migrated from. Mostly this keeps the code "symmetrical" in that we called __get_user_pages_locked() on the pages we are migrating from and the same call on the pages we migrated to. While the change here looks funny I think the final code is better. 
> > > if ((nr_pages > 0) && migrate_allow) { > > drain_allow = true; > > goto check_again; > > @@ -1281,66 +1263,115 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > > return nr_pages; > > } > > #else > > -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > > - unsigned int gup_flags, > > - struct page **pages, > > - struct vm_area_struct **vmas) > > +static long check_and_migrate_cma_pages(struct task_struct *tsk, > > + struct mm_struct *mm, > > + unsigned long start, > > + unsigned long nr_pages, > > + struct page **pages, > > + struct vm_area_struct **vmas, > > + unsigned int gup_flags) > > { > > return nr_pages; > > } > > #endif > > > > /* > > - * This is the same as get_user_pages() in that it assumes we are > > - * operating on the current task's mm, but it goes further to validate > > - * that the vmas associated with the address range are suitable for > > - * longterm elevated page reference counts. For example, filesystem-dax > > - * mappings are subject to the lifetime enforced by the filesystem and > > - * we need guarantees that longterm users like RDMA and V4L2 only > > - * establish mappings that have a kernel enforced revocation mechanism. > > + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which > > s/uer/user/ Check > > > + * allows us to process the FOLL_LONGTERM flag if present. > > + * > > + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails > > + * the pin or attempts to migrate the page as appropriate. > > + * > > + * In the filesystem-dax case mappings are subject to the lifetime enforced by > > + * the filesystem and we need guarantees that longterm users like RDMA and V4L2 > > + * only establish mappings that have a kernel enforced revocation mechanism. > > + * > > + * In the CMA case pages can't be pinned in a CMA region as this would > > + * unnecessarily fragment that region. So CMA attempts to migrate the page > > + * before pinning. > > * > > * "longterm" == userspace controlled elevated page count lifetime. > > * Contrast this to iov_iter_get_pages() usages which are transient. > > Ah, here's the longterm documentation, but if I was a developer > considering whether to use FOLL_LONGTERM or not I would expect to find > the documentation at the flag definition site. > > I think it has become more clear since get_user_pages_longterm() was > initially merged that we need to warn people not to use it, or at > least seriously reconsider whether they want an interface to support > indefinite pins. Good point will move > > > */ > > -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, > > - unsigned int gup_flags, struct page **pages, > > - struct vm_area_struct **vmas_arg) > > +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, > > ...why the __always_inline? This was because it was only called from get_user_pages() in this patch. But later on I use it elsewhere so __always_inline is wrong. Ira
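To make the layering Ira describes above easier to follow, here is a minimal userspace model of the call chain after this patch. The function names mirror the patch; the flag values, stub bodies, and printf tracing are placeholders for illustration only, not the real mm/gup.c code.

#include <stdio.h>

#define FOLL_TOUCH	0x02		/* illustrative value */
#define FOLL_LONGTERM	0x10000		/* value added by this patch */

/* Stub standing in for the real GUP engine. */
static long __get_user_pages_locked(unsigned long start, long nr_pages,
				    unsigned int gup_flags)
{
	printf("__get_user_pages_locked: start=%#lx nr=%ld flags=%#x\n",
	       start, nr_pages, gup_flags);
	return nr_pages;
}

static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
					unsigned int gup_flags)
{
	/*
	 * Re-pin after (modelled) CMA migration: gup_flags already carries
	 * FOLL_TOUCH because get_user_pages() added it once at the entry
	 * point, so the internal helper is called directly instead of going
	 * back through get_user_pages().
	 */
	return __get_user_pages_locked(start, nr_pages, gup_flags);
}

static long __gup_longterm_locked(unsigned long start, long nr_pages,
				  unsigned int gup_flags)
{
	long rc = __get_user_pages_locked(start, nr_pages, gup_flags);

	if (rc > 0 && (gup_flags & FOLL_LONGTERM))
		rc = check_and_migrate_cma_pages(start, rc, gup_flags);
	return rc;
}

long get_user_pages(unsigned long start, long nr_pages, unsigned int gup_flags)
{
	/* FOLL_TOUCH is ORed in exactly once, at the public entry point. */
	return __gup_longterm_locked(start, nr_pages, gup_flags | FOLL_TOUCH);
}

int main(void)
{
	get_user_pages(0x1000, 4, FOLL_LONGTERM);
	return 0;
}

Running the model prints the flags each layer sees, which is the point made above: by the time check_and_migrate_cma_pages() re-pins, FOLL_TOUCH is already present in gup_flags, so routing back through get_user_pages() would add nothing.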
On Mon, Mar 25, 2019 at 09:45:12AM -0700, Dan Williams wrote: > On Mon, Mar 25, 2019 at 7:21 AM Ira Weiny <ira.weiny@intel.com> wrote: > [..] > > > > @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > > > > putback_movable_pages(&cma_page_list); > > > > } > > > > /* > > > > - * We did migrate all the pages, Try to get the page references again > > > > - * migrating any new CMA pages which we failed to isolate earlier. > > > > + * We did migrate all the pages, Try to get the page references > > > > + * again migrating any new CMA pages which we failed to isolate > > > > + * earlier. > > > > */ > > > > - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); > > > > + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, > > > > + pages, vmas, NULL, > > > > + gup_flags); > > > > + > > > > > > Why did this need to change to __get_user_pages_locked? > > > > __get_uer_pages_locked() is now the "internal call" for get_user_pages. > > > > Technically it did not _have_ to change but there is no need to call > > get_user_pages() again because the FOLL_TOUCH flags is already set. Also this > > call then matches the __get_user_pages_locked() which was called on the pages > > we migrated from. Mostly this keeps the code "symmetrical" in that we called > > __get_user_pages_locked() on the pages we are migrating from and the same call > > on the pages we migrated to. > > > > While the change here looks funny I think the final code is better. > > Agree, but I think that either needs to be noted in the changelog so > it's not a surprise, or moved to a follow-on cleanup patch. > Can do... Thanks! Ira
On Fri, Mar 22, 2019 at 02:24:40PM -0700, Dan Williams wrote: > On Sun, Mar 17, 2019 at 7:36 PM <ira.weiny@intel.com> wrote: [snip] > > + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which > > s/uer/user/ > > > + * allows us to process the FOLL_LONGTERM flag if present. > > + * > > + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails > > + * the pin or attempts to migrate the page as appropriate. > > + * > > + * In the filesystem-dax case mappings are subject to the lifetime enforced by > > + * the filesystem and we need guarantees that longterm users like RDMA and V4L2 > > + * only establish mappings that have a kernel enforced revocation mechanism. > > + * > > + * In the CMA case pages can't be pinned in a CMA region as this would > > + * unnecessarily fragment that region. So CMA attempts to migrate the page > > + * before pinning. > > * > > * "longterm" == userspace controlled elevated page count lifetime. > > * Contrast this to iov_iter_get_pages() usages which are transient. > > Ah, here's the longterm documentation, but if I was a developer > considering whether to use FOLL_LONGTERM or not I would expect to find > the documentation at the flag definition site. > > I think it has become more clear since get_user_pages_longterm() was > initially merged that we need to warn people not to use it, or at > least seriously reconsider whether they want an interface to support > indefinite pins. I will move the comment to the flag definition but... In reviewing this comment it occurs to me that the addition of special casing CMA regions via FOLL_LONGTERM has made it less experimental/temporary and now simply implies intent to the GUP code as to the use of the pages. As I'm not super familiar with the CMA use case I can't say for certain but it seems that it is not a temporary solution. So I'm not going to refrain from a FIXME WRT removing the flag. New suggested text below. diff --git a/include/linux/mm.h b/include/linux/mm.h index 6831077d126c..5db9d8e894aa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2596,7 +2596,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping is intended for a long term pin */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ + +/* + * NOTE on FOLL_LONGTERM: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is contrasted with + * iov_iter_get_pages() where usages which are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * In the CMA case: longterm pins in a CMA region would unnecessarily fragment + * that region. 
And so CMA attempts to migrate the page before pinning when + * FOLL_LONGTERM is specified. + */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) {
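For a driver author reading the comment proposed above, the caller-side pattern after this series looks roughly like the callers converted in the patch (qib, vfio, io_uring). The sketch below is a simplified, hypothetical kernel-context example, not code from the patch: the function names and buffer identifiers are made up, and error handling is reduced to the minimum.

/*
 * Hypothetical sketch: pin a user buffer for indefinite use, then release
 * it much later.  Assumes kernel context with <linux/mm.h> available and no
 * locks held.
 */
static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
{
	int pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(uaddr, nr_pages,
				FOLL_WRITE | FOLL_LONGTERM,
				pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (pinned < 0)
		return pinned;		/* e.g. -EOPNOTSUPP for a DAX VMA */
	if (pinned != nr_pages) {
		while (pinned--)
			put_page(pages[pinned]);
		return -EFAULT;
	}
	return 0;
}

static void example_unpin_user_buffer(struct page **pages, int nr_pages)
{
	while (nr_pages--)
		put_page(pages[nr_pages]);
}

If the VMA turns out to be filesystem-DAX, __gup_longterm_locked() fails the pin with -EOPNOTSUPP rather than allowing an indefinite pin the filesystem cannot revoke, which is the policy the proposed comment describes.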
On Mon, Mar 25, 2019 at 7:21 AM Ira Weiny <ira.weiny@intel.com> wrote: [..] > > > @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, > > > putback_movable_pages(&cma_page_list); > > > } > > > /* > > > - * We did migrate all the pages, Try to get the page references again > > > - * migrating any new CMA pages which we failed to isolate earlier. > > > + * We did migrate all the pages, Try to get the page references > > > + * again migrating any new CMA pages which we failed to isolate > > > + * earlier. > > > */ > > > - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); > > > + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, > > > + pages, vmas, NULL, > > > + gup_flags); > > > + > > > > Why did this need to change to __get_user_pages_locked? > > __get_uer_pages_locked() is now the "internal call" for get_user_pages. > > Technically it did not _have_ to change but there is no need to call > get_user_pages() again because the FOLL_TOUCH flags is already set. Also this > call then matches the __get_user_pages_locked() which was called on the pages > we migrated from. Mostly this keeps the code "symmetrical" in that we called > __get_user_pages_locked() on the pages we are migrating from and the same call > on the pages we migrated to. > > While the change here looks funny I think the final code is better. Agree, but I think that either needs to be noted in the changelog so it's not a surprise, or moved to a follow-on cleanup patch.
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e7a9c4f6bfca..2bd48998765e 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -148,7 +148,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + ret = get_user_pages(ua, entries, FOLL_WRITE | FOLL_LONGTERM, + mem->hpages, NULL); up_read(&mm->mmap_sem); if (ret != entries) { /* free the reference taken */ diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index fe5551562dbc..31191f098e73 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -189,10 +189,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, while (npages) { down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, vma_list); + gup_flags | FOLL_LONGTERM, + page_list, vma_list); if (ret < 0) { up_read(&mm->mmap_sem); goto umem_release; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 123ca8f64f75..f712fb7fa82f 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, down_read(¤t->mm->mmap_sem); for (got = 0; got < num_pages; got += ret) { - ret = get_user_pages_longterm(start_page + got * PAGE_SIZE, - num_pages - got, - FOLL_WRITE | FOLL_FORCE, - p + got, NULL); + ret = get_user_pages(start_page + got * PAGE_SIZE, + num_pages - got, + FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, + p + got, NULL); if (ret < 0) { up_read(¤t->mm->mmap_sem); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 06862a6af185..1d9a182ac163 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = 0; while (npages) { - ret = get_user_pages_longterm(cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 08929c087e27..870a2a526e0b 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, - flags, dma->pages, NULL); + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, + flags | FOLL_LONGTERM, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; - dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, + dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages); return err < 0 ? 
err : -EINVAL; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 73652e21efec..1500bd0bb6da 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -351,7 +351,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); if (mm == current->mm) { - ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas); + ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, + vmas); } else { ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, vmas, NULL); diff --git a/fs/io_uring.c b/fs/io_uring.c index e2bd77da5e21..8d61b81373bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2450,8 +2450,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d483dbdffc0..6831077d126c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1531,19 +1531,6 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -#else -static inline long get_user_pages_longterm(unsigned long start, - unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); -} -#endif /* CONFIG_FS_DAX */ - int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); @@ -2609,6 +2596,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping is intended for a long term pin */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { diff --git a/mm/gup.c b/mm/gup.c index f84e22685aaa..8cb4cff067bc 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1112,26 +1112,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. 
- */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1150,12 +1131,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1209,10 +1184,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1268,10 +1246,14 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1281,66 +1263,115 @@ static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. + * __gup_longterm_locked() is a wrapper for __get_uer_pages_locked which + * allows us to process the FOLL_LONGTERM flag if present. + * + * FOLL_LONGTERM Checks for either DAX VMAs or PPC CMA regions and either fails + * the pin or attempts to migrate the page as appropriate. 
+ * + * In the filesystem-dax case mappings are subject to the lifetime enforced by + * the filesystem and we need guarantees that longterm users like RDMA and V4L2 + * only establish mappings that have a kernel enforced revocation mechanism. + * + * In the CMA case pages can't be pinned in a CMA region as this would + * unnecessarily fragment that region. So CMA attempts to migrate the page + * before pinning. * * "longterm" == userspace controlled elevated page count lifetime. * Contrast this to iov_iter_get_pages() usages which are transient. */ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; - - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); } - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. 
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6c0279e70cc4..7dd602d7f8db 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i); break; case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, - pages + i, NULL); + nr = get_user_pages(addr, nr, + (gup->flags & 1) | FOLL_LONGTERM, + pages + i, NULL); break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 77520eacee8f..ab489454a63e 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -267,8 +267,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = get_user_pages_longterm(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); if (npgs != umem->npgs) {
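Taken together, the hunks above apply one mechanical conversion to every longterm caller. As a summary for any caller not touched by this series (the identifiers below are hypothetical), the change is:

	/* before: dedicated entry point */
	ret = get_user_pages_longterm(uaddr, nr_pages, FOLL_WRITE,
				      pages, NULL);

	/* after: the same request expressed as a gup_flags bit */
	ret = get_user_pages(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			     pages, NULL);

Note that the include/linux/mm.h hunk removes both the get_user_pages_longterm() declaration and its !CONFIG_FS_DAX fallback, so any remaining caller fails to build until converted.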