
[3/3] mm/devmap: Remove pgmap accounting in the get_user_pages_fast() path

Message ID 161604050866.1463742.7759521510383551055.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Series mm, pmem: Force unmap pmem on surprise remove

Commit Message

Dan Williams March 18, 2021, 4:08 a.m. UTC
Now that device-dax and filesystem-dax are guaranteed to unmap all user
mappings of devmap / DAX pages before tearing down the 'struct page'
array, get_user_pages_fast() can rely on its traditional synchronization
method "validate_pte(); get_page(); revalidate_pte()" to catch races with
device shutdown. Specifically the unmap guarantee ensures that gup-fast
either succeeds in taking a page reference (lock-less), or it detects a
need to fall back to the slow path where the device presence can be
revalidated with locks held.

Reported-by: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Shiyang Ruan <ruansy.fnst@fujitsu.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 mm/gup.c |   38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)
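
As a point of reference, the "validate_pte(); get_page(); revalidate_pte()" convention
referred to above boils down to the following shape. This is a simplified sketch, not the
literal mm/gup.c code: the gup_fast_one() helper is hypothetical, and the bare READ_ONCE()
stands in for the real arch-safe lockless pte read and the surrounding loop.

static int gup_fast_one(pte_t *ptep, unsigned int flags, struct page **pagep)
{
	pte_t pte = READ_ONCE(*ptep);		/* 1) validate_pte() */
	struct page *page;

	if (!pte_present(pte) || pte_special(pte))
		return 0;			/* fall back to the slow path */

	page = pte_page(pte);
	if (!try_grab_page(page, flags))	/* 2) get_page() */
		return 0;

	if (pte_val(READ_ONCE(*ptep)) != pte_val(pte)) {
		/*
		 * 3) revalidate_pte(): a racing unmap (e.g. device
		 * shutdown) changed the pte under us; drop the reference
		 * and let the slow path recheck device presence with
		 * locks held.  (put_page() assumes a plain FOLL_GET-style
		 * grab; a FOLL_PIN grab needs the matching unpin.)
		 */
		put_page(page);
		return 0;
	}

	*pagep = page;
	return 1;
}

With the unmap-before-teardown guarantee, step 3 is what catches a device shutdown that
races with gup-fast: either the reference is taken before the pte is cleared, or the pte
comparison fails and the slow path is taken.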

Comments

Joao Martins March 18, 2021, 10 a.m. UTC | #1
On 3/18/21 4:08 AM, Dan Williams wrote:
> Now that device-dax and filesystem-dax are guaranteed to unmap all user
> mappings of devmap / DAX pages before tearing down the 'struct page'
> array, get_user_pages_fast() can rely on its traditional synchronization
> method "validate_pte(); get_page(); revalidate_pte()" to catch races with
> device shutdown. Specifically the unmap guarantee ensures that gup-fast
> either succeeds in taking a page reference (lock-less), or it detects a
> need to fall back to the slow path where the device presence can be
> revalidated with locks held.

[...]

> @@ -2087,21 +2078,26 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
>  #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
>  
>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> +
>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>  			     unsigned long end, unsigned int flags,
>  			     struct page **pages, int *nr)
>  {
>  	int nr_start = *nr;
> -	struct dev_pagemap *pgmap = NULL;
>  
>  	do {
> -		struct page *page = pfn_to_page(pfn);
> +		struct page *page;
> +
> +		/*
> +		 * Typically pfn_to_page() on a devmap pfn is not safe
> +		 * without holding a live reference on the hosting
> +		 * pgmap. In the gup-fast path it is safe because any
> +		 * races will be resolved by either gup-fast taking a
> +		 * reference or the shutdown path unmapping the pte to
> +		 * trigger gup-fast to fall back to the slow path.
> +		 */
> +		page = pfn_to_page(pfn);
>  
> -		pgmap = get_dev_pagemap(pfn, pgmap);
> -		if (unlikely(!pgmap)) {
> -			undo_dev_pagemap(nr, nr_start, flags, pages);
> -			return 0;
> -		}
>  		SetPageReferenced(page);
>  		pages[*nr] = page;
>  		if (unlikely(!try_grab_page(page, flags))) {

So for allowing FOLL_LONGTERM[0] would it be OK if we used page->pgmap after
try_grab_page() for checking pgmap type to see if we are in a device-dax
longterm pin?

	Joao

[0] https://lore.kernel.org/linux-mm/6a18179e-65f7-367d-89a9-d5162f10fef0@oracle.com/
Dan Williams March 18, 2021, 5:03 p.m. UTC | #2
On Thu, Mar 18, 2021 at 3:02 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 3/18/21 4:08 AM, Dan Williams wrote:
> > Now that device-dax and filesystem-dax are guaranteed to unmap all user
> > mappings of devmap / DAX pages before tearing down the 'struct page'
> > array, get_user_pages_fast() can rely on its traditional synchronization
> > method "validate_pte(); get_page(); revalidate_pte()" to catch races with
> > device shutdown. Specifically the unmap guarantee ensures that gup-fast
> > either succeeds in taking a page reference (lock-less), or it detects a
> > need to fall back to the slow path where the device presence can be
> > revalidated with locks held.
>
> [...]
>
> > @@ -2087,21 +2078,26 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
> >  #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
> >
> >  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> > +
> >  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> >                            unsigned long end, unsigned int flags,
> >                            struct page **pages, int *nr)
> >  {
> >       int nr_start = *nr;
> > -     struct dev_pagemap *pgmap = NULL;
> >
> >       do {
> > -             struct page *page = pfn_to_page(pfn);
> > +             struct page *page;
> > +
> > +             /*
> > +              * Typically pfn_to_page() on a devmap pfn is not safe
> > +              * without holding a live reference on the hosting
> > +              * pgmap. In the gup-fast path it is safe because any
> > +              * races will be resolved by either gup-fast taking a
> > +              * reference or the shutdown path unmapping the pte to
> > +              * trigger gup-fast to fall back to the slow path.
> > +              */
> > +             page = pfn_to_page(pfn);
> >
> > -             pgmap = get_dev_pagemap(pfn, pgmap);
> > -             if (unlikely(!pgmap)) {
> > -                     undo_dev_pagemap(nr, nr_start, flags, pages);
> > -                     return 0;
> > -             }
> >               SetPageReferenced(page);
> >               pages[*nr] = page;
> >               if (unlikely(!try_grab_page(page, flags))) {
>
> So for allowing FOLL_LONGTERM[0] would it be OK if we used page->pgmap after
> try_grab_page() for checking pgmap type to see if we are in a device-dax
> longterm pin?
>

Yes. I still need to answer the question of whether mapping
invalidation triggers longterm pin holders to relinquish their hold,
but that's a problem regardless of whether gup-fast is supported or
not.
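
A minimal sketch of the check being proposed — consulting page->pgmap->type only after
try_grab_page() has succeeded — might look like this inside __gup_device_huge().
Placement and policy are hypothetical; only MEMORY_DEVICE_FS_DAX, try_grab_page() and
undo_dev_pagemap() are existing names:

		pages[*nr] = page;
		if (unlikely(!try_grab_page(page, flags))) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			return 0;
		}
		(*nr)++;

		/*
		 * Hypothetical: with a reference held the hosting pgmap
		 * cannot be torn down, so its type is stable and can be
		 * used to refuse longterm pins on fs-dax pages while
		 * still permitting them for device-dax.
		 */
		if ((flags & FOLL_LONGTERM) &&
		    page->pgmap->type == MEMORY_DEVICE_FS_DAX) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			return 0;
		}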
Dan Williams March 24, 2021, 5:45 p.m. UTC | #3
On Thu, Mar 18, 2021 at 3:02 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
> On 3/18/21 4:08 AM, Dan Williams wrote:
> > Now that device-dax and filesystem-dax are guaranteed to unmap all user
> > mappings of devmap / DAX pages before tearing down the 'struct page'
> > array, get_user_pages_fast() can rely on its traditional synchronization
> > method "validate_pte(); get_page(); revalidate_pte()" to catch races with
> > device shutdown. Specifically the unmap guarantee ensures that gup-fast
> > either succeeds in taking a page reference (lock-less), or it detects a
> > need to fall back to the slow path where the device presence can be
> > revalidated with locks held.
>
> [...]
>
> > @@ -2087,21 +2078,26 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
> >  #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
> >
> >  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> > +
> >  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> >                            unsigned long end, unsigned int flags,
> >                            struct page **pages, int *nr)
> >  {
> >       int nr_start = *nr;
> > -     struct dev_pagemap *pgmap = NULL;
> >
> >       do {
> > -             struct page *page = pfn_to_page(pfn);
> > +             struct page *page;
> > +
> > +             /*
> > +              * Typically pfn_to_page() on a devmap pfn is not safe
> > +              * without holding a live reference on the hosting
> > +              * pgmap. In the gup-fast path it is safe because any
> > +              * races will be resolved by either gup-fast taking a
> > +              * reference or the shutdown path unmapping the pte to
> > +              * trigger gup-fast to fall back to the slow path.
> > +              */
> > +             page = pfn_to_page(pfn);
> >
> > -             pgmap = get_dev_pagemap(pfn, pgmap);
> > -             if (unlikely(!pgmap)) {
> > -                     undo_dev_pagemap(nr, nr_start, flags, pages);
> > -                     return 0;
> > -             }
> >               SetPageReferenced(page);
> >               pages[*nr] = page;
> >               if (unlikely(!try_grab_page(page, flags))) {
>
> So for allowing FOLL_LONGTERM[0] would it be OK if we used page->pgmap after
> try_grab_page() for checking pgmap type to see if we are in a device-dax
> longterm pin?

So, there is an effort to add a new pte bit p{m,u}d_special to disable
gup-fast for huge pages [1]. I'd like to investigate whether we could
use devmap + special as an encoding for "no longterm" and never
consult the pgmap in the gup-fast path.

[1]: https://lore.kernel.org/linux-mm/a1fa7fa2-914b-366d-9902-e5b784e8428c@shipmail.org/
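
The gup-fast side of that encoding would then reduce to a check like the following — a
sketch of the idea only; Joao's rough patch further down the thread adds the actual
pte/pmd/pud _special() plumbing:

		if (pte_devmap(pte)) {
			/*
			 * Sketch: devmap + special encodes "no longterm".
			 * A FOLL_LONGTERM request falls back to the slow
			 * path, and the pgmap is never consulted in
			 * gup-fast.
			 */
			if (unlikely((flags & FOLL_LONGTERM) && pte_special(pte)))
				goto pte_unmap;
		} else if (pte_special(pte))
			goto pte_unmap;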
Joao Martins March 24, 2021, 7 p.m. UTC | #4
On 3/24/21 5:45 PM, Dan Williams wrote:
> On Thu, Mar 18, 2021 at 3:02 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 3/18/21 4:08 AM, Dan Williams wrote:
>>> Now that device-dax and filesystem-dax are guaranteed to unmap all user
>>> mappings of devmap / DAX pages before tearing down the 'struct page'
>>> array, get_user_pages_fast() can rely on its traditional synchronization
>>> method "validate_pte(); get_page(); revalidate_pte()" to catch races with
>>> device shutdown. Specifically the unmap guarantee ensures that gup-fast
>>> either succeeds in taking a page reference (lock-less), or it detects a
>>> need to fall back to the slow path where the device presence can be
>>> revalidated with locks held.
>>
>> [...]
>>
>>> @@ -2087,21 +2078,26 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
>>>  #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
>>>
>>>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
>>> +
>>>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>>>                            unsigned long end, unsigned int flags,
>>>                            struct page **pages, int *nr)
>>>  {
>>>       int nr_start = *nr;
>>> -     struct dev_pagemap *pgmap = NULL;
>>>
>>>       do {
>>> -             struct page *page = pfn_to_page(pfn);
>>> +             struct page *page;
>>> +
>>> +             /*
>>> +              * Typically pfn_to_page() on a devmap pfn is not safe
>>> +              * without holding a live reference on the hosting
>>> +              * pgmap. In the gup-fast path it is safe because any
>>> +              * races will be resolved by either gup-fast taking a
>>> +              * reference or the shutdown path unmapping the pte to
>>> +              * trigger gup-fast to fall back to the slow path.
>>> +              */
>>> +             page = pfn_to_page(pfn);
>>>
>>> -             pgmap = get_dev_pagemap(pfn, pgmap);
>>> -             if (unlikely(!pgmap)) {
>>> -                     undo_dev_pagemap(nr, nr_start, flags, pages);
>>> -                     return 0;
>>> -             }
>>>               SetPageReferenced(page);
>>>               pages[*nr] = page;
>>>               if (unlikely(!try_grab_page(page, flags))) {
>>
>> So for allowing FOLL_LONGTERM[0] would it be OK if we used page->pgmap after
>> try_grab_page() for checking pgmap type to see if we are in a device-dax
>> longterm pin?
> 
> So, there is an effort to add a new pte bit p{m,u}d_special to disable
> gup-fast for huge pages [1]. I'd like to investigate whether we could
> use devmap + special as an encoding for "no longterm" and never
> consult the pgmap in the gup-fast path.
> 
> [1]: https://lore.kernel.org/linux-mm/a1fa7fa2-914b-366d-9902-e5b784e8428c@shipmail.org/
>

Oh, nice! That would be ideal indeed, as we would skip the pgmap lookup entirely.

I suppose device-dax would use pfn_t PFN_MAP while fs-dax memory device would set PFN_MAP
| PFN_DEV (provided vmf_insert_pfn_{pmd,pud} calls mkspecial on PFN_DEV).

I haven't been following that thread, but for PMD/PUD special in vmf_* these might be useful:

https://lore.kernel.org/linux-mm/20200110190313.17144-2-joao.m.martins@oracle.com/
https://lore.kernel.org/linux-mm/20200110190313.17144-4-joao.m.martins@oracle.com/
Jason Gunthorpe March 25, 2021, 2:34 p.m. UTC | #5
On Thu, Mar 18, 2021 at 10:03:06AM -0700, Dan Williams wrote:
> Yes. I still need to answer the question of whether mapping
> invalidation triggers longterm pin holders to relinquish their hold,
> but that's a problem regardless of whether gup-fast is supported or
> not.

It does not; GUP users do not interact with address_space or mmu
notifiers.

Jason
Jason Gunthorpe March 25, 2021, 2:37 p.m. UTC | #6
On Wed, Mar 17, 2021 at 09:08:28PM -0700, Dan Williams wrote:
> Now that device-dax and filesystem-dax are guaranteed to unmap all user
> mappings of devmap / DAX pages before tearing down the 'struct page'
> array, get_user_pages_fast() can rely on its traditional synchronization
> method "validate_pte(); get_page(); revalidate_pte()" to catch races with
> device shutdown. Specifically the unmap guarantee ensures that gup-fast
> either succeeds in taking a page reference (lock-less), or it detects a
> need to fall back to the slow path where the device presence can be
> revalidated with locks held.
> 
> Reported-by: Jason Gunthorpe <jgg@ziepe.ca>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Shiyang Ruan <ruansy.fnst@fujitsu.com>
> Cc: Vishal Verma <vishal.l.verma@intel.com>
> Cc: Dave Jiang <dave.jiang@intel.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Jan Kara <jack@suse.cz>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  mm/gup.c |   38 ++++++++++++++++----------------------
>  1 file changed, 16 insertions(+), 22 deletions(-)

I'm happy to see this, and it is really the right thing that PTEs are
properly removed before anything happens to the pages under them.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Dan Williams March 29, 2021, 11:24 p.m. UTC | #7
On Thu, Mar 25, 2021 at 7:34 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Thu, Mar 18, 2021 at 10:03:06AM -0700, Dan Williams wrote:
> > Yes. I still need to answer the question of whether mapping
> > invalidation triggers longterm pin holders to relinquish their hold,
> > but that's a problem regardless of whether gup-fast is supported or
> > not.
>
> It does not; GUP users do not interact with address_space or mmu
> notifiers.
>

Ok, but the SIGKILL from the memory_failure() will drop the pin?

Can memory_failure() find the right processes to kill if the memory
registration has been passed by SCM_RIGHTS?
Jason Gunthorpe March 30, 2021, 1:49 p.m. UTC | #8
On Mon, Mar 29, 2021 at 04:24:19PM -0700, Dan Williams wrote:
> On Thu, Mar 25, 2021 at 7:34 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Thu, Mar 18, 2021 at 10:03:06AM -0700, Dan Williams wrote:
> > > Yes. I still need to answer the question of whether mapping
> > > invalidation triggers longterm pin holders to relinquish their hold,
> > > but that's a problem regardless of whether gup-fast is supported or
> > > not.
> >
> > It does not, GUP users do not interact with addres_space or mmu
> > notifiers
> 
> Ok, but the SIGKILL from the memory_failure() will drop the pin?

In the general case I doubt it...

I think this is fine; we shouldn't expect unplugging a driver not to
block if there are open users.

Demanding that DMA users be able to shoot down access to system memory
is a really big kernel change.

Jason
Joao Martins April 1, 2021, 7:54 p.m. UTC | #9
On 3/24/21 7:00 PM, Joao Martins wrote:
> On 3/24/21 5:45 PM, Dan Williams wrote:
>> On Thu, Mar 18, 2021 at 3:02 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 3/18/21 4:08 AM, Dan Williams wrote:
>>>> Now that device-dax and filesystem-dax are guaranteed to unmap all user
>>>> mappings of devmap / DAX pages before tearing down the 'struct page'
>>>> array, get_user_pages_fast() can rely on its traditional synchronization
>>>> method "validate_pte(); get_page(); revalidate_pte()" to catch races with
>>>> device shutdown. Specifically the unmap guarantee ensures that gup-fast
>>>> either succeeds in taking a page reference (lock-less), or it detects a
>>>> need to fall back to the slow path where the device presence can be
>>>> revalidated with locks held.
>>>
>>> [...]
>>>
>>> So for allowing FOLL_LONGTERM[0] would it be OK if we used page->pgmap after
>>> try_grab_page() for checking pgmap type to see if we are in a device-dax
>>> longterm pin?
>>
>> So, there is an effort to add a new pte bit p{m,u}d_special to disable
>> gup-fast for huge pages [1]. I'd like to investigate whether we could
>> use devmap + special as an encoding for "no longterm" and never
>> consult the pgmap in the gup-fast path.
>>
>> [1]: https://lore.kernel.org/linux-mm/a1fa7fa2-914b-366d-9902-e5b784e8428c@shipmail.org/
>>
> 
> Oh, nice! That would be ideal indeed, as we would skip the pgmap lookup entirely.
> 
> I suppose device-dax would use pfn_t PFN_MAP while fs-dax memory device would set PFN_MAP
> | PFN_DEV (provided vmf_insert_pfn_{pmd,pud} calls mkspecial on PFN_DEV).
> 
> I haven't been following that thread, but for PMD/PUD special in vmf_* these might be useful:
> 
> https://lore.kernel.org/linux-mm/20200110190313.17144-2-joao.m.martins@oracle.com/
> https://lore.kernel.org/linux-mm/20200110190313.17144-4-joao.m.martins@oracle.com/
> 

On second thought, maybe it doesn't need to be that complicated for {fs,dev}dax if the
added special bit is just a subcase of devmap pte/pmd/puds. See below the scissors mark
for a rough estimation of the changes (nothing formal/proper as it isn't properly split).
Running gup_test against devdax/fsdax with and without FOLL_LONGTERM behaves as intended
(gup_test -m 16384 -r 10 -a -S -n 512 -w -f <file> [-F 0x10000]).

Unless this is about using special PMD/PUD bits without page structs (thus without devmap
bits) as in the previous two links.

-- >8 --

Subject: mm/gup, nvdimm: allow FOLL_LONGTERM for device-dax gup-fast

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 47027796c2f9..8b5d68d89cde 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -439,11 +439,16 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd)

 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_devmap(pmd)		pte_devmap(pmd_pte(pmd))
+#define pmd_special(pmd)	pte_special(pmd_pte(pmd))
 #endif
 static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 {
 	return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
 }
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_SPECIAL)));
+}

 #define __pmd_to_phys(pmd)	__pte_to_phys(pmd_pte(pmd))
 #define __phys_to_pmd_val(phys)	__phys_to_pte_val(phys)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b1099f2d9800..45449ee86d4f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -259,13 +259,13 @@ static inline int pmd_large(pmd_t pte)
 /* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */
 static inline int pmd_trans_huge(pmd_t pmd)
 {
-	return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+	return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP|_PAGE_SPECIAL)) == _PAGE_PSE;
 }

 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static inline int pud_trans_huge(pud_t pud)
 {
-	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP|_PAGE_SPECIAL)) == _PAGE_PSE;
 }
 #endif

@@ -297,6 +297,19 @@ static inline int pgd_devmap(pgd_t pgd)
 {
 	return 0;
 }
+
+static inline bool pmd_special(pmd_t pmd)
+{
+	return !!(pmd_flags(pmd) & _PAGE_SPECIAL);
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline bool pud_special(pud_t pud)
+{
+	return !!(pud_flags(pud) & _PAGE_SPECIAL);
+}
+#endif
+
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

@@ -452,6 +465,11 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_DEVMAP);
 }

+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_SPECIAL);
+}
+
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
 {
 	return pmd_set_flags(pmd, _PAGE_PSE);
@@ -511,6 +529,11 @@ static inline pud_t pud_mkhuge(pud_t pud)
 	return pud_set_flags(pud, _PAGE_PSE);
 }

+static inline pud_t pud_mkspecial(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_SPECIAL);
+}
+
 static inline pud_t pud_mkyoung(pud_t pud)
 {
 	return pud_set_flags(pud, _PAGE_ACCESSED);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 16760b237229..156ceae33164 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
 		pmem->pfn_pad = resource_size(res) -
 			range_len(&pmem->pgmap.range);
-		pmem->pfn_flags |= PFN_MAP;
+		pmem->pfn_flags |= PFN_MAP | PFN_SPECIAL;
 		bb_range = pmem->pgmap.range;
 		bb_range.start += pmem->data_offset;
 	} else if (pmem_should_map_pages(dev)) {
@@ -445,7 +445,7 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
-		pmem->pfn_flags |= PFN_MAP;
+		pmem->pfn_flags |= PFN_MAP | PFN_SPECIAL;
 		bb_range = pmem->pgmap.range;
 	} else {
 		if (devm_add_action_or_reset(dev, pmem_release_queue,
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 4ee6f734ba83..873c8e53c85d 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -748,7 +748,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev,
pgoff_t pgoff,
 		*kaddr = fs->window_kaddr + offset;
 	if (pfn)
 		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
-					PFN_DEV | PFN_MAP);
+					PFN_DEV | PFN_MAP | PFN_SPECIAL);
 	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
 }

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 7364e5a70228..ad7078e38ef2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1189,6 +1189,18 @@ static inline int pgd_devmap(pgd_t pgd)
 }
 #endif

+#if !defined(CONFIG_ARCH_HAS_PTE_SPECIAL) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static inline bool pmd_special(pmd_t pmd)
+{
+	return false;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pmd;
+}
+#endif
+
 #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
 	(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
 	 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
@@ -1196,6 +1208,14 @@ static inline int pud_trans_huge(pud_t pud)
 {
 	return 0;
 }
+static inline bool pud_special(pud_t pud)
+{
+	return false;
+}
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+	return pud;
+}
 #endif

 /* See pmd_none_or_trans_huge_or_clear_bad for discussion. */
diff --git a/mm/gup.c b/mm/gup.c
index b3e647c8b7ee..87aa229a9347 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2086,7 +2086,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned
long end,
 			goto pte_unmap;

 		if (pte_devmap(pte)) {
-			if (unlikely(flags & FOLL_LONGTERM))
+			if (unlikely(flags & FOLL_LONGTERM) && pte_special(pte))
 				goto pte_unmap;

 			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
@@ -2338,7 +2338,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		return 0;

 	if (pmd_devmap(orig)) {
-		if (unlikely(flags & FOLL_LONGTERM))
+		if (unlikely(flags & FOLL_LONGTERM) && pmd_special(orig))
 			return 0;
 		return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
 					     pages, nr);
@@ -2372,7 +2372,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		return 0;

 	if (pud_devmap(orig)) {
-		if (unlikely(flags & FOLL_LONGTERM))
+		if (unlikely(flags & FOLL_LONGTERM) && pud_special(orig))
 			return 0;
 		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
 					     pages, nr);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f6f70632fc29..9d5117711919 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -796,8 +796,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long
addr,
 	}

 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
-	if (pfn_t_devmap(pfn))
+	if (pfn_t_devmap(pfn)) {
 		entry = pmd_mkdevmap(entry);
+		if (pfn_t_special(pfn))
+			entry = pmd_mkspecial(entry);
+	}
 	if (write) {
 		entry = pmd_mkyoung(pmd_mkdirty(entry));
 		entry = maybe_pmd_mkwrite(entry, vma);
@@ -896,8 +899,11 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long
addr,
 	}

 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
-	if (pfn_t_devmap(pfn))
+	if (pfn_t_devmap(pfn)) {
 		entry = pud_mkdevmap(entry);
+		if (pfn_t_special(pfn))
+			entry = pud_mkspecial(entry);
+	}
 	if (write) {
 		entry = pud_mkyoung(pud_mkdirty(entry));
 		entry = maybe_pud_mkwrite(entry, vma);

Patch

diff --git a/mm/gup.c b/mm/gup.c
index e40579624f10..dfeb47e4e8d4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1996,9 +1996,8 @@  static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			 unsigned int flags, struct page **pages, int *nr)
 {
-	struct dev_pagemap *pgmap = NULL;
-	int nr_start = *nr, ret = 0;
 	pte_t *ptep, *ptem;
+	int ret = 0;
 
 	ptem = ptep = pte_offset_map(&pmd, addr);
 	do {
@@ -2015,16 +2014,10 @@  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
 			goto pte_unmap;
 
-		if (pte_devmap(pte)) {
-			if (unlikely(flags & FOLL_LONGTERM))
-				goto pte_unmap;
+		if (pte_devmap(pte) && (flags & FOLL_LONGTERM))
+			goto pte_unmap;
 
-			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
-			if (unlikely(!pgmap)) {
-				undo_dev_pagemap(nr, nr_start, flags, pages);
-				goto pte_unmap;
-			}
-		} else if (pte_special(pte))
+		if (pte_special(pte))
 			goto pte_unmap;
 
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -2063,8 +2056,6 @@  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 	ret = 1;
 
 pte_unmap:
-	if (pgmap)
-		put_dev_pagemap(pgmap);
 	pte_unmap(ptem);
 	return ret;
 }
@@ -2087,21 +2078,26 @@  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 			     unsigned long end, unsigned int flags,
 			     struct page **pages, int *nr)
 {
 	int nr_start = *nr;
-	struct dev_pagemap *pgmap = NULL;
 
 	do {
-		struct page *page = pfn_to_page(pfn);
+		struct page *page;
+
+		/*
+		 * Typically pfn_to_page() on a devmap pfn is not safe
+		 * without holding a live reference on the hosting
+		 * pgmap. In the gup-fast path it is safe because any
+		 * races will be resolved by either gup-fast taking a
+		 * reference or the shutdown path unmapping the pte to
+		 * trigger gup-fast to fall back to the slow path.
+		 */
+		page = pfn_to_page(pfn);
 
-		pgmap = get_dev_pagemap(pfn, pgmap);
-		if (unlikely(!pgmap)) {
-			undo_dev_pagemap(nr, nr_start, flags, pages);
-			return 0;
-		}
 		SetPageReferenced(page);
 		pages[*nr] = page;
 		if (unlikely(!try_grab_page(page, flags))) {
@@ -2112,8 +2108,6 @@  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 		pfn++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	if (pgmap)
-		put_dev_pagemap(pgmap);
 	return 1;
 }