
[RFC,04/10] fs/dax: Don't track page mapping/index

Message ID 322065d373bb6571b700dba4450f1759b304644a.1712796818.git-series.apopple@nvidia.com (mailing list archive)
State New
Series fs/dax: Fix FS DAX page reference counts

Commit Message

Alistair Popple April 11, 2024, 12:57 a.m. UTC
The page->mapping and page->index fields are normally used by the
pagecache and rmap for looking up virtual mappings of pages. FS DAX
implements its own kind of page cache and rmap lookups, so these
fields are unnecessary. They are currently only used to detect
error/warning conditions which should never occur.

A future change will switch to detecting shared mappings with normal
page reference counting instead, so remove these unnecessary checks.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
 fs/dax.c                   | 84 +---------------------------------------
 include/linux/page-flags.h |  6 +---
 2 files changed, 90 deletions(-)

Comments

Jan Kara April 12, 2024, 3:22 p.m. UTC | #1
On Thu 11-04-24 10:57:25, Alistair Popple wrote:
> The page->mapping and page->index fields are normally used by the
> pagecache and rmap for looking up virtual mappings of pages. FS DAX
> implements it's own kind of page cache and rmap look ups so these
> fields are unnecessary. They are currently only used to detect
> error/warning conditions which should never occur.
> 
> A future change will change the way shared mappings are detected by
> doing normal page reference counting instead, so remove the
> unnecessary checks.
> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
...
> -/*
> - * When it is called in dax_insert_entry(), the shared flag will indicate that
> - * whether this entry is shared by multiple files.  If so, set the page->mapping
> - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
> - */
> -static void dax_associate_entry(void *entry, struct address_space *mapping,
> -		struct vm_area_struct *vma, unsigned long address, bool shared)
> -{
> -	unsigned long size = dax_entry_size(entry), pfn, index;
> -	int i = 0;
> -
> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
> -		return;
> -
> -	index = linear_page_index(vma, address & ~(size - 1));
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		if (shared) {
> -			dax_page_share_get(page);
> -		} else {
> -			WARN_ON_ONCE(page->mapping);
> -			page->mapping = mapping;
> -			page->index = index + i++;
> -		}
> -	}
> -}

Hum, but what about existing uses of folio->mapping and folio->index in
fs/dax.c? AFAICT this patch breaks them. What am I missing? How can this
ever work?

								Honza
Dan Williams April 12, 2024, 5:21 p.m. UTC | #2
Alistair Popple wrote:
> The page->mapping and page->index fields are normally used by the
> pagecache and rmap for looking up virtual mappings of pages. FS DAX
> implements it's own kind of page cache and rmap look ups so these
> fields are unnecessary. They are currently only used to detect
> error/warning conditions which should never occur.
> 
> A future change will change the way shared mappings are detected by
> doing normal page reference counting instead, so remove the
> unnecessary checks.

Ignore my comment on patch 3, I fumble-fingered the reply; it was meant
for this patch. I.e. that "future change" should just be present
before removing the old logic.
Dan Williams April 12, 2024, 5:31 p.m. UTC | #3
Jan Kara wrote:
> On Thu 11-04-24 10:57:25, Alistair Popple wrote:
> > The page->mapping and page->index fields are normally used by the
> > pagecache and rmap for looking up virtual mappings of pages. FS DAX
> > implements it's own kind of page cache and rmap look ups so these
> > fields are unnecessary. They are currently only used to detect
> > error/warning conditions which should never occur.
> > 
> > A future change will change the way shared mappings are detected by
> > doing normal page reference counting instead, so remove the
> > unnecessary checks.
> > 
> > Signed-off-by: Alistair Popple <apopple@nvidia.com>
> ...
> > -/*
> > - * When it is called in dax_insert_entry(), the shared flag will indicate that
> > - * whether this entry is shared by multiple files.  If so, set the page->mapping
> > - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
> > - */
> > -static void dax_associate_entry(void *entry, struct address_space *mapping,
> > -		struct vm_area_struct *vma, unsigned long address, bool shared)
> > -{
> > -	unsigned long size = dax_entry_size(entry), pfn, index;
> > -	int i = 0;
> > -
> > -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
> > -		return;
> > -
> > -	index = linear_page_index(vma, address & ~(size - 1));
> > -	for_each_mapped_pfn(entry, pfn) {
> > -		struct page *page = pfn_to_page(pfn);
> > -
> > -		if (shared) {
> > -			dax_page_share_get(page);
> > -		} else {
> > -			WARN_ON_ONCE(page->mapping);
> > -			page->mapping = mapping;
> > -			page->index = index + i++;
> > -		}
> > -	}
> > -}
> 
> Hum, but what about existing uses of folio->mapping and folio->index in
> fs/dax.c? AFAICT this patch breaks them. What am I missing? How can this
> ever work?

Right, as far as I can see every fsdax filesystem would need to be
converted to use dax_holder_operations() so that the fs can backfill
->mapping and ->index.
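
For anyone following along, a minimal sketch of the holder-ops shape (the
struct matches include/linux/dax.h around this series; the myfs_* names and
the registration call site are illustrative only, not a real conversion):

	/* include/linux/dax.h */
	struct dax_holder_operations {
		int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
				      u64 len, int mf_flags);
	};

	/*
	 * Hypothetical fs side: resolve the failed dax range back to file
	 * offsets with fs metadata instead of page->mapping/page->index.
	 */
	static int myfs_dax_notify_failure(struct dax_device *dax_dev,
					   u64 offset, u64 len, int mf_flags)
	{
		/* walk the fs reverse mapping and kill affected mappings */
		return -EOPNOTSUPP;
	}

	static const struct dax_holder_operations myfs_dax_holder_ops = {
		.notify_failure	= myfs_dax_notify_failure,
	};

	/*
	 * At mount time the fs registers itself as the dax_device holder,
	 * roughly what xfs does via fs_dax_get_by_bdev():
	 *
	 *   dax_dev = fs_dax_get_by_bdev(bdev, &part_off, mount_ctx,
	 *                                &myfs_dax_holder_ops);
	 */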
Alistair Popple April 15, 2024, 7:03 a.m. UTC | #4
Dan Williams <dan.j.williams@intel.com> writes:

> Jan Kara wrote:
>> On Thu 11-04-24 10:57:25, Alistair Popple wrote:
>> > The page->mapping and page->index fields are normally used by the
>> > pagecache and rmap for looking up virtual mappings of pages. FS DAX
>> > implements it's own kind of page cache and rmap look ups so these
>> > fields are unnecessary. They are currently only used to detect
>> > error/warning conditions which should never occur.
>> > 
>> > A future change will change the way shared mappings are detected by
>> > doing normal page reference counting instead, so remove the
>> > unnecessary checks.
>> > 
>> > Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> ...
>> > -/*
>> > - * When it is called in dax_insert_entry(), the shared flag will indicate that
>> > - * whether this entry is shared by multiple files.  If so, set the page->mapping
>> > - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
>> > - */
>> > -static void dax_associate_entry(void *entry, struct address_space *mapping,
>> > -		struct vm_area_struct *vma, unsigned long address, bool shared)
>> > -{
>> > -	unsigned long size = dax_entry_size(entry), pfn, index;
>> > -	int i = 0;
>> > -
>> > -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>> > -		return;
>> > -
>> > -	index = linear_page_index(vma, address & ~(size - 1));
>> > -	for_each_mapped_pfn(entry, pfn) {
>> > -		struct page *page = pfn_to_page(pfn);
>> > -
>> > -		if (shared) {
>> > -			dax_page_share_get(page);
>> > -		} else {
>> > -			WARN_ON_ONCE(page->mapping);
>> > -			page->mapping = mapping;
>> > -			page->index = index + i++;
>> > -		}
>> > -	}
>> > -}
>> 
>> Hum, but what about existing uses of folio->mapping and folio->index in
>> fs/dax.c? AFAICT this patch breaks them. What am I missing? How can this
>> ever work?

I did feel I was missing something here as well, but nothing obviously
breaks with this change from a test perspective (i.e. ndctl tests, manual
tests). Somehow I missed how this was used in the code, but Dan provided
enough of a hint below, so now I see the error of my ways :-)

> Right, as far as I can see every fsdax filesystem would need to be
> converted to use dax_holder_operations() so that the fs can backfill
> ->mapping and ->index.

Oh, that was the hint I needed. Thanks. So basically it's just used for
memory failure like so:

memory_failure()
 -> memory_failure_dev_pagemap()
  -> mf_generic_kill_procs()
   -> dax_lock_page()
    -> mapping = READ_ONCE(page->mapping);
 
Somehow I had missed that blatantly obvious usage of page->mapping. I
also couldn't understand how it could be important if it was safe to
just randomly overwrite it in the shared case.
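
To spell out why that READ_ONCE(page->mapping) matters, this is roughly what
dax_lock_page() does with it (a paraphrase, not the verbatim fs/dax.c code;
dax_lock_mapping_entry() below is a made-up stand-in for the real
entry-locking dance, and newer kernels spell the function dax_lock_folio()):

	dax_entry_t dax_lock_page(struct page *page)
	{
		struct address_space *mapping = READ_ONCE(page->mapping);

		if (!mapping || !dax_mapping(mapping))
			return 0;	/* not a live fsdax page */

		/*
		 * Lock the Xarray entry at page->index in that mapping and
		 * return it, so mf_generic_kill_procs() can walk the file
		 * rmap for this offset and unmap/kill the users.
		 */
		return dax_lock_mapping_entry(mapping, page->index);
	}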

But I think I understand now: shared fs dax pages are only supported on
xfs, and the mapping/index fields aren't used there because xfs provides
its own lookup for memory failure using dax_holder_operations.

I was initially concerned about these cases because I was wondering if
folio subpages could ever get different mappings, and the shared case
implied they could. But it seems that's xfs-specific, and there is a
separate mechanism to deal with looking up ->mapping/index for that. So
I guess we should still be able to safely store this on the folio
head. I will double check and update this change.
Dan Williams April 15, 2024, 8:51 p.m. UTC | #5
Alistair Popple wrote:
> I was initially concerned about these cases because I was wondering if
> folio subpages could ever get different mappings and the shared case
> implied they could. But it seems that's xfs specific and there is a
> separate mechanism to deal with looking up ->mapping/index for that. So
> I guess we should still be able to safely store this on the folio
> head. I will double check and update this change.
> 

I think there is a path to store this information only on the folio head.
However, ugh, I think this is potentially another "head" of the
pmd_devmap() hydra.

pmd_devmap() taught the core-mm to treat dax_pmds identically to
thp_pmds *except* for the __split_huge_pmd() case:

   5c7fb56e5e3f mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd

Later on pmd migration entries joined pmd_devmap() in skipping splits:

   84c3fc4e9c56 mm: thp: check pmd migration entry in common path

Unfortunately, pmd_devmap() stopped being considered for skipping
splits here:

   7f7609175ff2 mm/huge_memory: remove stale locking logic from __split_huge_pmd()

Likely __split_huge_pmd_locked() grew support for pmd migration handling
and forgot about the pmd_devmap() case.

So now Linux has been allowing FSDAX pmd splits since v5.18... but with
no reports of any issues. Likely this is benefiting from the fact that
the preconditions for splitting are rarely, if ever, satisfied, because
FSDAX mappings are never anon, and establishing the mapping in the first
place requires a 2MB-aligned file extent that is likely never fractured.

Same for device-dax, where the fracturing *should* not be allowed, but I
will feel better with focused tests that go after mremap() cases that would
attempt to split the page.
Alistair Popple April 16, 2024, 12:07 a.m. UTC | #6
Dan Williams <dan.j.williams@intel.com> writes:

> Alistair Popple wrote:
>> I was initially concerned about these cases because I was wondering if
>> folio subpages could ever get different mappings and the shared case
>> implied they could. But it seems that's xfs specific and there is a
>> separate mechanism to deal with looking up ->mapping/index for that. So
>> I guess we should still be able to safely store this on the folio
>> head. I will double check and update this change.
>> 
>
> I think there is path to store this information only on the folio head.
> However, ugh, I think this is potentially another "head" of the
> pmd_devmap() hydra.
>
> pmd_devmap() taught the core-mm to treat dax_pmds indentically to
> thp_pmds *except* for the __split_huge_pmd() case:
>
>    5c7fb56e5e3f mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd
>
> Later on pmd migration entries joined pmd_devmap() in skipping splits:
>
>    84c3fc4e9c56 mm: thp: check pmd migration entry in common path
>
> Unfortunately, pmd_devmap() stopped being considered for skipping
> splits here:
>
>    7f7609175ff2 mm/huge_memory: remove stale locking logic from __split_huge_pmd()
>
> Likely __split_huge_pmd_locked() grew support for pmd migration handling
> and forgot about the pmd_devmap() case.
>
> So now Linux has been allowing FSDAX pmd splits since v5.18...

From what I see we currently (in v6.6) have this in
__split_huge_pmd_locked():

        if (!vma_is_anonymous(vma)) {
                old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                /*
                 * We are going to unmap this huge page. So
                 * just go ahead and zap it
                 */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(mm, pmd);
                if (vma_is_special_huge(vma))
                        return;

Where vma_is_special_huge(vma) returns true for DAX VMAs (vma_is_dax()).
So AFAICT we're still skipping the split, right? In all versions we just
zap the PMD and continue. What am I missing?
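
(For reference, vma_is_special_huge() in include/linux/huge_mm.h is roughly
the following, quoting from memory, so check the exact tree; the point is
that any DAX VMA takes the early return above:)

	static inline bool vma_is_special_huge(struct vm_area_struct *vma)
	{
		return vma_is_dax(vma) || (vma->vm_file &&
				(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
	}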

> but with
> no reports of any issues. Likely this is benefiting from the fact that
> the preconditions for splitting are rarely if ever satisfied because
> FSDAX mappings are never anon, and establishing the mapping in the first
> place requires a 2MB aligned file extent and that is likely never
> fractured.
>
> Same for device-dax where the fracturing *should* not be allowed, but I
> will feel better with focus tests to go after mremap() cases that would
> attempt to split the page.
Dan Williams April 16, 2024, 12:36 a.m. UTC | #7
Alistair Popple wrote:
> 
> Dan Williams <dan.j.williams@intel.com> writes:
> 
> > Alistair Popple wrote:
> >> I was initially concerned about these cases because I was wondering if
> >> folio subpages could ever get different mappings and the shared case
> >> implied they could. But it seems that's xfs specific and there is a
> >> separate mechanism to deal with looking up ->mapping/index for that. So
> >> I guess we should still be able to safely store this on the folio
> >> head. I will double check and update this change.
> >> 
> >
> > I think there is path to store this information only on the folio head.
> > However, ugh, I think this is potentially another "head" of the
> > pmd_devmap() hydra.
> >
> > pmd_devmap() taught the core-mm to treat dax_pmds indentically to
> > thp_pmds *except* for the __split_huge_pmd() case:
> >
> >    5c7fb56e5e3f mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd
> >
> > Later on pmd migration entries joined pmd_devmap() in skipping splits:
> >
> >    84c3fc4e9c56 mm: thp: check pmd migration entry in common path
> >
> > Unfortunately, pmd_devmap() stopped being considered for skipping
> > splits here:
> >
> >    7f7609175ff2 mm/huge_memory: remove stale locking logic from __split_huge_pmd()
> >
> > Likely __split_huge_pmd_locked() grew support for pmd migration handling
> > and forgot about the pmd_devmap() case.
> >
> > So now Linux has been allowing FSDAX pmd splits since v5.18...
> 
> From what I see we currently (in v6.6) have this in
> __split_huge_pmd_locked():
> 
>         if (!vma_is_anonymous(vma)) {
>                 old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
>                 /*
>                  * We are going to unmap this huge page. So
>                  * just go ahead and zap it
>                  */
>                 if (arch_needs_pgtable_deposit())
>                         zap_deposited_table(mm, pmd);
>                 if (vma_is_special_huge(vma))
>                         return;
> 
> Where vma_is_special_huge(vma) returns true for vma_is_dax(). So AFAICT
> we're still skipping the split right? In all versions we just zap the
> PMD and continue. What am I missing?

Ah, good point, I missed that. One more dragon vanquished.

Patch

diff --git a/fs/dax.c b/fs/dax.c
index 8fafecb..a7bd423 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -320,85 +320,6 @@  static unsigned long dax_end_pfn(void *entry)
 	for (pfn = dax_to_pfn(entry); \
 			pfn < dax_end_pfn(entry); pfn++)
 
-static inline bool dax_page_is_shared(struct page *page)
-{
-	return page->mapping == PAGE_MAPPING_DAX_SHARED;
-}
-
-/*
- * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
- * refcount.
- */
-static inline void dax_page_share_get(struct page *page)
-{
-	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
-		/*
-		 * Reset the index if the page was already mapped
-		 * regularly before.
-		 */
-		if (page->mapping)
-			page->share = 1;
-		page->mapping = PAGE_MAPPING_DAX_SHARED;
-	}
-	page->share++;
-}
-
-static inline unsigned long dax_page_share_put(struct page *page)
-{
-	return --page->share;
-}
-
-/*
- * When it is called in dax_insert_entry(), the shared flag will indicate that
- * whether this entry is shared by multiple files.  If so, set the page->mapping
- * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
- */
-static void dax_associate_entry(void *entry, struct address_space *mapping,
-		struct vm_area_struct *vma, unsigned long address, bool shared)
-{
-	unsigned long size = dax_entry_size(entry), pfn, index;
-	int i = 0;
-
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
-
-	index = linear_page_index(vma, address & ~(size - 1));
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		if (shared) {
-			dax_page_share_get(page);
-		} else {
-			WARN_ON_ONCE(page->mapping);
-			page->mapping = mapping;
-			page->index = index + i++;
-		}
-	}
-}
-
-static void dax_disassociate_entry(void *entry, struct address_space *mapping,
-		bool trunc)
-{
-	unsigned long pfn;
-
-	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-		return;
-
-	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
-
-		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-		if (dax_page_is_shared(page)) {
-			/* keep the shared flag if this page is still shared */
-			if (dax_page_share_put(page) > 0)
-				continue;
-		} else
-			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
-		page->mapping = NULL;
-		page->index = 0;
-	}
-}
-
 static struct page *dax_busy_page(void *entry)
 {
 	unsigned long pfn;
@@ -620,7 +541,6 @@  static void *grab_mapping_entry(struct xa_state *xas,
 			xas_lock_irq(xas);
 		}
 
-		dax_disassociate_entry(entry, mapping, false);
 		xas_store(xas, NULL);	/* undo the PMD join */
 		dax_wake_entry(xas, entry, WAKE_ALL);
 		mapping->nrpages -= PG_PMD_NR;
@@ -757,7 +677,6 @@  static int __dax_invalidate_entry(struct address_space *mapping,
 	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
 	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
 		goto out;
-	dax_disassociate_entry(entry, mapping, trunc);
 	xas_store(&xas, NULL);
 	mapping->nrpages -= 1UL << dax_entry_order(entry);
 	ret = 1;
@@ -894,9 +813,6 @@  static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
 	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		void *old;
 
-		dax_disassociate_entry(entry, mapping, false);
-		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
-				shared);
 		/*
 		 * Only swap our new entry into the page cache if the current
 		 * entry is a zero page or an empty entry.  If a normal PTE or
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5c02720..85d5427 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -631,12 +631,6 @@  PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
 #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
 
-/*
- * Different with flags above, this flag is used only for fsdax mode.  It
- * indicates that this page->mapping is now under reflink case.
- */
-#define PAGE_MAPPING_DAX_SHARED	((void *)0x1)
-
 static __always_inline bool folio_mapping_flags(struct folio *folio)
 {
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;