
[RFC,V3] arm64: mm: swap: save and restore mte tags for large folios

Message ID 20231114014313.67232-1-v-songbaohua@oppo.com (mailing list archive)
State New
Series [RFC,V3] arm64: mm: swap: save and restore mte tags for large folios

Commit Message

Barry Song Nov. 14, 2023, 1:43 a.m. UTC
This patch makes MTE tag saving and restoring support large folios, so
we no longer need to split them into base pages for swapping out on
ARM64 SoCs with MTE.

arch_prepare_to_swap() should take a folio rather than a page as its
parameter because we support THP swap-out as a whole.

Meanwhile, arch_swap_restore() should use a page parameter rather than
a folio, as swap-in always works at the granularity of base pages right
now.

arch_thp_swp_supported() is dropped since ARM64 MTE was the only user
that needed it.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 rfc v3:
 * move arch_swap_restore() to take page rather than folio according to
 the discussion with Ryan and Steven;
 * fix some other issues commented by Ryan

 rfc v2:
 https://lore.kernel.org/lkml/20231104093423.170054-1-v-songbaohua@oppo.com/
 rfc v1:
 https://lore.kernel.org/lkml/20231102223643.7733-1-v-songbaohua@oppo.com/

 arch/arm64/include/asm/pgtable.h | 21 ++++----------------
 arch/arm64/mm/mteswap.c          | 34 ++++++++++++++++++++++++++++++++
 include/linux/huge_mm.h          | 12 -----------
 include/linux/pgtable.h          |  4 ++--
 mm/memory.c                      |  2 +-
 mm/page_io.c                     |  2 +-
 mm/shmem.c                       |  2 +-
 mm/swap_slots.c                  |  2 +-
 mm/swapfile.c                    |  2 +-
 9 files changed, 45 insertions(+), 36 deletions(-)

Comments

Steven Price Nov. 15, 2023, 2:40 p.m. UTC | #1
On 14/11/2023 01:43, Barry Song wrote:
> This patch makes MTE tags saving and restoring support large folios,
> then we don't need to split them into base pages for swapping out
> on ARM64 SoCs with MTE.
> 
> arch_prepare_to_swap() should take folio rather than page as parameter
> because we support THP swap-out as a whole.
> 
> Meanwhile, arch_swap_restore() should use page parameter rather than
> folio as swap-in always works at the granularity of base pages right
> now.
> 
> arch_thp_swp_supported() is dropped since ARM64 MTE was the only one
> who needed it.
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>

LGTM!

Reviewed-by: Steven Price <steven.price@arm.com>

Although there is one nit below that you might want to fix.

> ---
>  rfc v3:
>  * move arch_swap_restore() to take page rather than folio according to
>  the discussion with Ryan and Steven;
>  * fix some other issues commented by Ryan
> 
>  rfc v2:
>  https://lore.kernel.org/lkml/20231104093423.170054-1-v-songbaohua@oppo.com/
>  rfc v1:
>  https://lore.kernel.org/lkml/20231102223643.7733-1-v-songbaohua@oppo.com/
> 
>  arch/arm64/include/asm/pgtable.h | 21 ++++----------------
>  arch/arm64/mm/mteswap.c          | 34 ++++++++++++++++++++++++++++++++
>  include/linux/huge_mm.h          | 12 -----------
>  include/linux/pgtable.h          |  4 ++--
>  mm/memory.c                      |  2 +-
>  mm/page_io.c                     |  2 +-
>  mm/shmem.c                       |  2 +-
>  mm/swap_slots.c                  |  2 +-
>  mm/swapfile.c                    |  2 +-
>  9 files changed, 45 insertions(+), 36 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index b19a8aee684c..c3eef11c1a9e 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -45,12 +45,6 @@
>  	__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
> -static inline bool arch_thp_swp_supported(void)
> -{
> -	return !system_supports_mte();
> -}
> -#define arch_thp_swp_supported arch_thp_swp_supported
> -
>  /*
>   * Outside of a few very special situations (e.g. hibernation), we always
>   * use broadcast TLB invalidation instructions, therefore a spurious page
> @@ -1036,12 +1030,8 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
>  #ifdef CONFIG_ARM64_MTE
>  
>  #define __HAVE_ARCH_PREPARE_TO_SWAP
> -static inline int arch_prepare_to_swap(struct page *page)
> -{
> -	if (system_supports_mte())
> -		return mte_save_tags(page);
> -	return 0;
> -}
> +#define arch_prepare_to_swap arch_prepare_to_swap
> +extern int arch_prepare_to_swap(struct folio *folio);
>  
>  #define __HAVE_ARCH_SWAP_INVALIDATE
>  static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
> @@ -1057,11 +1047,8 @@ static inline void arch_swap_invalidate_area(int type)
>  }
>  
>  #define __HAVE_ARCH_SWAP_RESTORE
> -static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> -{
> -	if (system_supports_mte())
> -		mte_restore_tags(entry, &folio->page);
> -}
> +#define arch_swap_restore arch_swap_restore
> +extern void arch_swap_restore(swp_entry_t entry, struct page *page);
>  
>  #endif /* CONFIG_ARM64_MTE */
>  
> diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
> index a31833e3ddc5..75c2836e8240 100644
> --- a/arch/arm64/mm/mteswap.c
> +++ b/arch/arm64/mm/mteswap.c
> @@ -68,6 +68,12 @@ void mte_invalidate_tags(int type, pgoff_t offset)
>  	mte_free_tag_storage(tags);
>  }
>  
> +static inline void __mte_invalidate_tags(struct page *page)
> +{
> +	swp_entry_t entry = page_swap_entry(page);
> +	mte_invalidate_tags(swp_type(entry), swp_offset(entry));

NIT: checkpatch will complain that there should be a blank line between these
two lines (declarations are followed by a blank line).

Steve

> +}
> +
>  void mte_invalidate_tags_area(int type)
>  {
>  	swp_entry_t entry = swp_entry(type, 0);
> @@ -83,3 +89,31 @@ void mte_invalidate_tags_area(int type)
>  	}
>  	xa_unlock(&mte_pages);
>  }
> +
> +int arch_prepare_to_swap(struct folio *folio)
> +{
> +	int err;
> +	long i;
> +
> +	if (system_supports_mte()) {
> +		long nr = folio_nr_pages(folio);
> +
> +		for (i = 0; i < nr; i++) {
> +			err = mte_save_tags(folio_page(folio, i));
> +			if (err)
> +				goto out;
> +		}
> +	}
> +	return 0;
> +
> +out:
> +	while (i--)
> +		__mte_invalidate_tags(folio_page(folio, i));
> +	return err;
> +}
> +
> +void arch_swap_restore(swp_entry_t entry, struct page *page)
> +{
> +	if (system_supports_mte())
> +		mte_restore_tags(entry, page);
> +}
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index fa0350b0812a..f83fb8d5241e 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -400,16 +400,4 @@ static inline int split_folio(struct folio *folio)
>  	return split_folio_to_list(folio, NULL);
>  }
>  
> -/*
> - * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
> - * limitations in the implementation like arm64 MTE can override this to
> - * false
> - */
> -#ifndef arch_thp_swp_supported
> -static inline bool arch_thp_swp_supported(void)
> -{
> -	return true;
> -}
> -#endif
> -
>  #endif /* _LINUX_HUGE_MM_H */
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index af7639c3b0a3..87e3140a55ca 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -897,7 +897,7 @@ static inline int arch_unmap_one(struct mm_struct *mm,
>   * prototypes must be defined in the arch-specific asm/pgtable.h file.
>   */
>  #ifndef __HAVE_ARCH_PREPARE_TO_SWAP
> -static inline int arch_prepare_to_swap(struct page *page)
> +static inline int arch_prepare_to_swap(struct folio *folio)
>  {
>  	return 0;
>  }
> @@ -914,7 +914,7 @@ static inline void arch_swap_invalidate_area(int type)
>  #endif
>  
>  #ifndef __HAVE_ARCH_SWAP_RESTORE
> -static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> +static inline void arch_swap_restore(swp_entry_t entry, struct page *page)
>  {
>  }
>  #endif
> diff --git a/mm/memory.c b/mm/memory.c
> index 1f18ed4a5497..fad238dd38e7 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4022,7 +4022,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	 * when reading from swap. This metadata may be indexed by swap entry
>  	 * so this must be called before swap_free().
>  	 */
> -	arch_swap_restore(entry, folio);
> +	arch_swap_restore(entry, page);
>  
>  	/*
>  	 * Remove the swap entry and conditionally try to free up the swapcache.
> diff --git a/mm/page_io.c b/mm/page_io.c
> index cb559ae324c6..0fd832474c1d 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -189,7 +189,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
>  	 * Arch code may have to preserve more data than just the page
>  	 * contents, e.g. memory tags.
>  	 */
> -	ret = arch_prepare_to_swap(&folio->page);
> +	ret = arch_prepare_to_swap(folio);
>  	if (ret) {
>  		folio_mark_dirty(folio);
>  		folio_unlock(folio);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 91e2620148b2..7d32e50da121 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1892,7 +1892,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	 * Some architectures may have to restore extra metadata to the
>  	 * folio after reading from swap.
>  	 */
> -	arch_swap_restore(swap, folio);
> +	arch_swap_restore(swap, &folio->page);
>  
>  	if (shmem_should_replace_folio(folio, gfp)) {
>  		error = shmem_replace_folio(&folio, gfp, info, index);
> diff --git a/mm/swap_slots.c b/mm/swap_slots.c
> index 0bec1f705f8e..2325adbb1f19 100644
> --- a/mm/swap_slots.c
> +++ b/mm/swap_slots.c
> @@ -307,7 +307,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
>  	entry.val = 0;
>  
>  	if (folio_test_large(folio)) {
> -		if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
> +		if (IS_ENABLED(CONFIG_THP_SWAP))
>  			get_swap_pages(1, &entry, folio_nr_pages(folio));
>  		goto out;
>  	}
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 4bc70f459164..6450e0279e35 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1784,7 +1784,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
>  	 * when reading from swap. This metadata may be indexed by swap entry
>  	 * so this must be called before swap_free().
>  	 */
> -	arch_swap_restore(entry, page_folio(page));
> +	arch_swap_restore(entry, page);
>  
>  	/* See do_swap_page() */
>  	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
David Hildenbrand Nov. 15, 2023, 3:16 p.m. UTC | #2
On 14.11.23 02:43, Barry Song wrote:
> This patch makes MTE tags saving and restoring support large folios,
> then we don't need to split them into base pages for swapping out
> on ARM64 SoCs with MTE.
> 
> arch_prepare_to_swap() should take folio rather than page as parameter
> because we support THP swap-out as a whole.
> 
> Meanwhile, arch_swap_restore() should use page parameter rather than
> folio as swap-in always works at the granularity of base pages right
> now.

... but then we always have order-0 folios and can pass a folio, or what 
am I missing?

> 
> arch_thp_swp_supported() is dropped since ARM64 MTE was the only one
> who needed it.

Can we do that separately?
Barry Song Nov. 15, 2023, 8:49 p.m. UTC | #3
On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 14.11.23 02:43, Barry Song wrote:
> > This patch makes MTE tags saving and restoring support large folios,
> > then we don't need to split them into base pages for swapping out
> > on ARM64 SoCs with MTE.
> >
> > arch_prepare_to_swap() should take folio rather than page as parameter
> > because we support THP swap-out as a whole.
> >
> > Meanwhile, arch_swap_restore() should use page parameter rather than
> > folio as swap-in always works at the granularity of base pages right
> > now.
>
> ... but then we always have order-0 folios and can pass a folio, or what
> am I missing?

Hi David,
you missed the discussion here:

https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/

>
> >
> > arch_thp_swp_supported() is dropped since ARM64 MTE was the only one
> > who needed it.
>
> Can we do that separately?

I think it is ok.

>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Yang Shi Nov. 15, 2023, 10:45 p.m. UTC | #4
On Wed, Nov 15, 2023 at 12:49 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 14.11.23 02:43, Barry Song wrote:
> > > This patch makes MTE tags saving and restoring support large folios,
> > > then we don't need to split them into base pages for swapping out
> > > on ARM64 SoCs with MTE.
> > >
> > > arch_prepare_to_swap() should take folio rather than page as parameter
> > > because we support THP swap-out as a whole.
> > >
> > > Meanwhile, arch_swap_restore() should use page parameter rather than
> > > folio as swap-in always works at the granularity of base pages right
> > > now.
> >
> > ... but then we always have order-0 folios and can pass a folio, or what
> > am I missing?
>
> Hi David,
> you missed the discussion here:
>
> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>
> >
> > >
> > > arch_thp_swp_supported() is dropped since ARM64 MTE was the only one
> > > who needed it.
> >
> > Can we do that separately?
>
> i think it is ok.

IMHO keeping it in this patch makes more sense. IIRC, arch_thp_swp_supported()
is being removed precisely because this patch makes swapping large folios
with MTE efficient.

>
> >
> > --
> > Cheers,
> >
> > David / dhildenb
> >
>
> Thanks
> Barry
David Hildenbrand Nov. 16, 2023, 9:36 a.m. UTC | #5
On 15.11.23 21:49, Barry Song wrote:
> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 14.11.23 02:43, Barry Song wrote:
>>> This patch makes MTE tags saving and restoring support large folios,
>>> then we don't need to split them into base pages for swapping out
>>> on ARM64 SoCs with MTE.
>>>
>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>> because we support THP swap-out as a whole.
>>>
>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>> folio as swap-in always works at the granularity of base pages right
>>> now.
>>
>> ... but then we always have order-0 folios and can pass a folio, or what
>> am I missing?
> 
> Hi David,
> you missed the discussion here:
> 
> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/

Okay, so you want to handle the refault-from-swapcache case where you get a
large folio.

I was misled by your "folio as swap-in always works at the granularity of
base pages right now" comment.

What you actually wanted to say is "While we always swap in small folios, we
might refault large folios from the swapcache, and we only want to restore
the tags for the page of the large folio we are faulting on."

But I wonder if we can't simply restore the tags for the whole thing at once
and make the interface page-free?

Let me elaborate:

IIRC, if we have a large folio in the swapcache, the swap entries/offsets are
contiguous. If you know you are faulting on page[1] of the folio with a
given swap offset, you can calculate the swap offset for page[0] simply by
subtracting from the offset.

See page_swap_entry() on how we perform this calculation.
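
For reference, that calculation looks roughly like this (a sketch of the
page_swap_entry() helper: folio->swap holds the swap entry of the head page,
and the offsets of a large swapcache folio are contiguous):

static inline swp_entry_t page_swap_entry(struct page *page)
{
	struct folio *folio = page_folio(page);
	swp_entry_t entry = folio->swap;	/* entry of the head page */

	/* contiguous offsets: add the page's index within the folio */
	entry.val += folio_page_idx(folio, page);
	return entry;
}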


So you can simply pass the large folio and the swap entry corresponding
to the first page of the large folio, and restore all tags at once.

So the interface would be

arch_prepare_to_swap(struct folio *folio);
void arch_swap_restore(struct folio *folio, swp_entry_t start_entry);

I'm sorry if that was also already discussed.
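
A minimal sketch of what the arm64 side of that proposed interface could look
like, assuming the swap entries of the folio are contiguous and start at
start_entry (illustrative only, not part of this patch):

void arch_swap_restore(struct folio *folio, swp_entry_t start_entry)
{
	long i, nr = folio_nr_pages(folio);

	if (!system_supports_mte())
		return;

	for (i = 0; i < nr; i++) {
		/* entry for page i of the folio: contiguous with start_entry */
		swp_entry_t entry = { .val = start_entry.val + i };

		mte_restore_tags(entry, folio_page(folio, i));
	}
}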

BUT, IIRC in the context of

commit cfeed8ffe55b37fa10286aaaa1369da00cb88440
Author: David Hildenbrand <david@redhat.com>
Date:   Mon Aug 21 18:08:46 2023 +0200

     mm/swap: stop using page->private on tail pages for THP_SWAP
     
     Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP
     + cleanups".
     
     This series stops using page->private on tail pages for THP_SWAP, replaces
     folio->private by folio->swap for swapcache folios, and starts using
     "new_folio" for tail pages that we are splitting to remove the usage of
     page->private for swapcache handling completely.

As long as the folio is in the swapcache, we even do have the proper
swp_entry_t start_entry available as folio_swap_entry(folio).

But now I am confused about when we would actually have to pass
"swp_entry_t start_entry". We shouldn't need to if the folio is in the swapcache ...
Barry Song Nov. 16, 2023, 11:47 p.m. UTC | #6
On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 15.11.23 21:49, Barry Song wrote:
> > On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 14.11.23 02:43, Barry Song wrote:
> >>> This patch makes MTE tags saving and restoring support large folios,
> >>> then we don't need to split them into base pages for swapping out
> >>> on ARM64 SoCs with MTE.
> >>>
> >>> arch_prepare_to_swap() should take folio rather than page as parameter
> >>> because we support THP swap-out as a whole.
> >>>
> >>> Meanwhile, arch_swap_restore() should use page parameter rather than
> >>> folio as swap-in always works at the granularity of base pages right
> >>> now.
> >>
> >> ... but then we always have order-0 folios and can pass a folio, or what
> >> am I missing?
> >
> > Hi David,
> > you missed the discussion here:
> >
> > https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> > https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>
> Okay, so you want to handle the refault-from-swapcache case where you get a
> large folio.
>
> I was mislead by your "folio as swap-in always works at the granularity of
> base pages right now" comment.
>
> What you actually wanted to say is "While we always swap in small folios, we
> might refault large folios from the swapcache, and we only want to restore
> the tags for the page of the large folio we are faulting on."
>
> But, I do if we can't simply restore the tags for the whole thing at once
> at make the interface page-free?
>
> Let me elaborate:
>
> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
> contiguous. If you know you are faulting on page[1] of the folio with a
> given swap offset, you can calculate the swap offset for page[0] simply by
> subtracting from the offset.
>
> See page_swap_entry() on how we perform this calculation.
>
>
> So you can simply pass the large folio and the swap entry corresponding
> to the first page of the large folio, and restore all tags at once.
>
> So the interface would be
>
> arch_prepare_to_swap(struct folio *folio);
> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>
> I'm sorry if that was also already discussed.

This has been discussed. Steven, Ryan and I all don't think this is a good
option. In the case of a large folio with 16 base pages, since do_swap_page()
can only map one base page per page fault, that means we have to restore
16 (tags we restore in each page fault) * 16 (the number of page faults)
for this large folio.

And the worst thing is that the page fault on the Nth PTE of the large folio
might free the swap entry once that entry has been swapped in:
do_swap_page()
{
   /*
    * Remove the swap entry and conditionally try to free up the swapcache.
    * We're already holding a reference on the page but haven't mapped it
    * yet.
    */
    swap_free(entry);
}

So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might access
a freed tag.

>
> BUT, IIRC in the context of
>
> commit cfeed8ffe55b37fa10286aaaa1369da00cb88440
> Author: David Hildenbrand <david@redhat.com>
> Date:   Mon Aug 21 18:08:46 2023 +0200
>
>      mm/swap: stop using page->private on tail pages for THP_SWAP
>
>      Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP
>      + cleanups".
>
>      This series stops using page->private on tail pages for THP_SWAP, replaces
>      folio->private by folio->swap for swapcache folios, and starts using
>      "new_folio" for tail pages that we are splitting to remove the usage of
>      page->private for swapcache handling completely.
>
> As long as the folio is in the swapcache, we even do have the proper
> swp_entry_t start_entry available as folio_swap_entry(folio).
>
> But now I am confused when we actually would have to pass
> "swp_entry_t start_entry". We shouldn't if the folio is in the swapcache ...
>

Nope, hitting the swapcache doesn't necessarily mean tags have been restored.
When A forks B, C, D, E, F, then A, B, C, D, E, F share the swap slot, and we
have two chances to hit the swapcache:
1. swap out: unmap has been done but the folios haven't been dropped yet
2. swap in: shared processes allocate folios and add them to the swapcache

For 2, if A gets the fault earlier than B, A will allocate the folio and add
it to the swapcache, and then B will hit the swapcache. But if B's CPU is
faster than A, B might still be the one mapping the PTE earlier than A, even
though A is the one which added the page to the swapcache. We have to make
sure the MTE tags are there when the mapping is done.

> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Barry Song Nov. 17, 2023, 12:15 a.m. UTC | #7
On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 15.11.23 21:49, Barry Song wrote:
> > > On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
> > >>
> > >> On 14.11.23 02:43, Barry Song wrote:
> > >>> This patch makes MTE tags saving and restoring support large folios,
> > >>> then we don't need to split them into base pages for swapping out
> > >>> on ARM64 SoCs with MTE.
> > >>>
> > >>> arch_prepare_to_swap() should take folio rather than page as parameter
> > >>> because we support THP swap-out as a whole.
> > >>>
> > >>> Meanwhile, arch_swap_restore() should use page parameter rather than
> > >>> folio as swap-in always works at the granularity of base pages right
> > >>> now.
> > >>
> > >> ... but then we always have order-0 folios and can pass a folio, or what
> > >> am I missing?
> > >
> > > Hi David,
> > > you missed the discussion here:
> > >
> > > https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> > > https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
> >
> > Okay, so you want to handle the refault-from-swapcache case where you get a
> > large folio.
> >
> > I was mislead by your "folio as swap-in always works at the granularity of
> > base pages right now" comment.
> >
> > What you actually wanted to say is "While we always swap in small folios, we
> > might refault large folios from the swapcache, and we only want to restore
> > the tags for the page of the large folio we are faulting on."
> >
> > But, I do if we can't simply restore the tags for the whole thing at once
> > at make the interface page-free?
> >
> > Let me elaborate:
> >
> > IIRC, if we have a large folio in the swapcache, the swap entries/offset are
> > contiguous. If you know you are faulting on page[1] of the folio with a
> > given swap offset, you can calculate the swap offset for page[0] simply by
> > subtracting from the offset.
> >
> > See page_swap_entry() on how we perform this calculation.
> >
> >
> > So you can simply pass the large folio and the swap entry corresponding
> > to the first page of the large folio, and restore all tags at once.
> >
> > So the interface would be
> >
> > arch_prepare_to_swap(struct folio *folio);
> > void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
> >
> > I'm sorry if that was also already discussed.
>
> This has been discussed. Steven, Ryan and I all don't think this is a good
> option. in case we have a large folio with 16 basepages, as do_swap_page
> can only map one base page for each page fault, that means we have
> to restore 16(tags we restore in each page fault) * 16(the times of page faults)
> for this large folio.
>
> and still the worst thing is the page fault in the Nth PTE of large folio
> might free swap entry as that swap has been in.
> do_swap_page()
> {
>    /*
>     * Remove the swap entry and conditionally try to free up the swapcache.
>     * We're already holding a reference on the page but haven't mapped it
>     * yet.
>     */
>     swap_free(entry);
> }
>
> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might access
> a freed tag.

And David, one more piece of information: to keep the folio parameter of
arch_swap_restore() unchanged, I actually tried an ugly approach in rfc v2:

+void arch_swap_restore(swp_entry_t entry, struct folio *folio)
+{
+	if (system_supports_mte()) {
+		/*
+		 * We don't support large folios swap in as whole yet, but
+		 * we can hit a large folio which is still in swapcache
+		 * after those related processes' PTEs have been unmapped
+		 * but before the swapcache folio is dropped, in this case,
+		 * we need to find the exact page which "entry" is mapping
+		 * to. If we are not hitting swapcache, this folio won't be
+		 * large
+		 */
+		struct page *page = folio_file_page(folio, swp_offset(entry));
+
+		mte_restore_tags(entry, page);
+	}
+}

And obviously everybody in the discussion hated it :-)

I feel the only way to keep the API unchanged using a folio is that we
support restoring PTEs all together for the whole large folio and we support
the swap-in of large folios. This is on my list to do; I will send a patchset
based on Ryan's large anon folios series after a while. Till that is really
done, it seems using a page rather than a folio is the better choice.

>
> >
> > BUT, IIRC in the context of
> >
> > commit cfeed8ffe55b37fa10286aaaa1369da00cb88440
> > Author: David Hildenbrand <david@redhat.com>
> > Date:   Mon Aug 21 18:08:46 2023 +0200
> >
> >      mm/swap: stop using page->private on tail pages for THP_SWAP
> >
> >      Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP
> >      + cleanups".
> >
> >      This series stops using page->private on tail pages for THP_SWAP, replaces
> >      folio->private by folio->swap for swapcache folios, and starts using
> >      "new_folio" for tail pages that we are splitting to remove the usage of
> >      page->private for swapcache handling completely.
> >
> > As long as the folio is in the swapcache, we even do have the proper
> > swp_entry_t start_entry available as folio_swap_entry(folio).
> >
> > But now I am confused when we actually would have to pass
> > "swp_entry_t start_entry". We shouldn't if the folio is in the swapcache ...
> >
>
> Nop, hitting swapcache doesn't necessarily mean tags have been restored.
> when A forks B,C,D,E,F. and A, B, C, D, E ,F share the swapslot.
> as we have two chances to hit swapcache:
> 1. swap out, unmap has been done but folios haven't been dropped
> 2. swap in, shared processes allocate folios and add to swapcache
>
> for 2, If A gets fault earlier than B, A will allocate folio and add
> it to swapcache.
> Then B will hit the swapcache. But If B's CPU is faster than A, B still might
> be the one mapping PTE earlier than A though A is the one which has
> added the page to swapcache. we have to make sure MTE is there when
> mapping is done.
>
> > --
> > Cheers,
> >
> > David / dhildenb

Thanks
Barry
David Hildenbrand Nov. 17, 2023, 11:25 a.m. UTC | #8
On 17.11.23 00:47, Barry Song wrote:
> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 15.11.23 21:49, Barry Song wrote:
>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>> then we don't need to split them into base pages for swapping out
>>>>> on ARM64 SoCs with MTE.
>>>>>
>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>> because we support THP swap-out as a whole.
>>>>>
>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>> folio as swap-in always works at the granularity of base pages right
>>>>> now.
>>>>
>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>> am I missing?
>>>
>>> Hi David,
>>> you missed the discussion here:
>>>
>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>
>> Okay, so you want to handle the refault-from-swapcache case where you get a
>> large folio.
>>
>> I was mislead by your "folio as swap-in always works at the granularity of
>> base pages right now" comment.
>>
>> What you actually wanted to say is "While we always swap in small folios, we
>> might refault large folios from the swapcache, and we only want to restore
>> the tags for the page of the large folio we are faulting on."
>>
>> But, I do if we can't simply restore the tags for the whole thing at once
>> at make the interface page-free?
>>
>> Let me elaborate:
>>
>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
>> contiguous. If you know you are faulting on page[1] of the folio with a
>> given swap offset, you can calculate the swap offset for page[0] simply by
>> subtracting from the offset.
>>
>> See page_swap_entry() on how we perform this calculation.
>>
>>
>> So you can simply pass the large folio and the swap entry corresponding
>> to the first page of the large folio, and restore all tags at once.
>>
>> So the interface would be
>>
>> arch_prepare_to_swap(struct folio *folio);
>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>
>> I'm sorry if that was also already discussed.
> 
> This has been discussed. Steven, Ryan and I all don't think this is a good
> option. in case we have a large folio with 16 basepages, as do_swap_page
> can only map one base page for each page fault, that means we have
> to restore 16(tags we restore in each page fault) * 16(the times of page faults)
> for this large folio.

Can't you remember that it's already been restored? That seems like a 
reasonable thing to have.

For large folios we have plenty of page flags in tail pages available?
David Hildenbrand Nov. 17, 2023, 11:28 a.m. UTC | #9
On 17.11.23 01:15, Barry Song wrote:
> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>
>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 15.11.23 21:49, Barry Song wrote:
>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>
>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>> then we don't need to split them into base pages for swapping out
>>>>>> on ARM64 SoCs with MTE.
>>>>>>
>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>> because we support THP swap-out as a whole.
>>>>>>
>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>> now.
>>>>>
>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>> am I missing?
>>>>
>>>> Hi David,
>>>> you missed the discussion here:
>>>>
>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>
>>> Okay, so you want to handle the refault-from-swapcache case where you get a
>>> large folio.
>>>
>>> I was mislead by your "folio as swap-in always works at the granularity of
>>> base pages right now" comment.
>>>
>>> What you actually wanted to say is "While we always swap in small folios, we
>>> might refault large folios from the swapcache, and we only want to restore
>>> the tags for the page of the large folio we are faulting on."
>>>
>>> But, I do if we can't simply restore the tags for the whole thing at once
>>> at make the interface page-free?
>>>
>>> Let me elaborate:
>>>
>>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>> subtracting from the offset.
>>>
>>> See page_swap_entry() on how we perform this calculation.
>>>
>>>
>>> So you can simply pass the large folio and the swap entry corresponding
>>> to the first page of the large folio, and restore all tags at once.
>>>
>>> So the interface would be
>>>
>>> arch_prepare_to_swap(struct folio *folio);
>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>
>>> I'm sorry if that was also already discussed.
>>
>> This has been discussed. Steven, Ryan and I all don't think this is a good
>> option. in case we have a large folio with 16 basepages, as do_swap_page
>> can only map one base page for each page fault, that means we have
>> to restore 16(tags we restore in each page fault) * 16(the times of page faults)
>> for this large folio.
>>
>> and still the worst thing is the page fault in the Nth PTE of large folio
>> might free swap entry as that swap has been in.
>> do_swap_page()
>> {
>>     /*
>>      * Remove the swap entry and conditionally try to free up the swapcache.
>>      * We're already holding a reference on the page but haven't mapped it
>>      * yet.
>>      */
>>      swap_free(entry);
>> }
>>
>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might access
>> a freed tag.
> 
> And David, one more information is that to keep the parameter of
> arch_swap_restore() unchanged as folio,
> i actually tried an ugly approach in rfc v2:
> 
> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> +{
> + if (system_supports_mte()) {
> +      /*
> +       * We don't support large folios swap in as whole yet, but
> +       * we can hit a large folio which is still in swapcache
> +       * after those related processes' PTEs have been unmapped
> +       * but before the swapcache folio  is dropped, in this case,
> +       * we need to find the exact page which "entry" is mapping
> +       * to. If we are not hitting swapcache, this folio won't be
> +       * large
> +     */
> + struct page *page = folio_file_page(folio, swp_offset(entry));
> + mte_restore_tags(entry, page);
> + }
> +}
> 
> And obviously everybody in the discussion hated it :-)
> 

I can relate :D

> i feel the only way to keep API unchanged using folio is that we
> support restoring PTEs
> all together for the whole large folio and we support the swap-in of
> large folios. This is
> in my list to do, I will send a patchset based on Ryan's large anon
> folios series after a
> while. till that is really done, it seems using page rather than folio
> is a better choice.

I think just restoring all tags and remembering for a large folio that 
they have been restored might be the low hanging fruit. But as always, 
devil is in the detail :)
Barry Song Nov. 17, 2023, 6:41 p.m. UTC | #10
On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 17.11.23 01:15, Barry Song wrote:
> > On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
> >>
> >> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
> >>>
> >>> On 15.11.23 21:49, Barry Song wrote:
> >>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
> >>>>>
> >>>>> On 14.11.23 02:43, Barry Song wrote:
> >>>>>> This patch makes MTE tags saving and restoring support large folios,
> >>>>>> then we don't need to split them into base pages for swapping out
> >>>>>> on ARM64 SoCs with MTE.
> >>>>>>
> >>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
> >>>>>> because we support THP swap-out as a whole.
> >>>>>>
> >>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
> >>>>>> folio as swap-in always works at the granularity of base pages right
> >>>>>> now.
> >>>>>
> >>>>> ... but then we always have order-0 folios and can pass a folio, or what
> >>>>> am I missing?
> >>>>
> >>>> Hi David,
> >>>> you missed the discussion here:
> >>>>
> >>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> >>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
> >>>
> >>> Okay, so you want to handle the refault-from-swapcache case where you get a
> >>> large folio.
> >>>
> >>> I was mislead by your "folio as swap-in always works at the granularity of
> >>> base pages right now" comment.
> >>>
> >>> What you actually wanted to say is "While we always swap in small folios, we
> >>> might refault large folios from the swapcache, and we only want to restore
> >>> the tags for the page of the large folio we are faulting on."
> >>>
> >>> But, I do if we can't simply restore the tags for the whole thing at once
> >>> at make the interface page-free?
> >>>
> >>> Let me elaborate:
> >>>
> >>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
> >>> contiguous. If you know you are faulting on page[1] of the folio with a
> >>> given swap offset, you can calculate the swap offset for page[0] simply by
> >>> subtracting from the offset.
> >>>
> >>> See page_swap_entry() on how we perform this calculation.
> >>>
> >>>
> >>> So you can simply pass the large folio and the swap entry corresponding
> >>> to the first page of the large folio, and restore all tags at once.
> >>>
> >>> So the interface would be
> >>>
> >>> arch_prepare_to_swap(struct folio *folio);
> >>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
> >>>
> >>> I'm sorry if that was also already discussed.
> >>
> >> This has been discussed. Steven, Ryan and I all don't think this is a good
> >> option. in case we have a large folio with 16 basepages, as do_swap_page
> >> can only map one base page for each page fault, that means we have
> >> to restore 16(tags we restore in each page fault) * 16(the times of page faults)
> >> for this large folio.
> >>
> >> and still the worst thing is the page fault in the Nth PTE of large folio
> >> might free swap entry as that swap has been in.
> >> do_swap_page()
> >> {
> >>     /*
> >>      * Remove the swap entry and conditionally try to free up the swapcache.
> >>      * We're already holding a reference on the page but haven't mapped it
> >>      * yet.
> >>      */
> >>      swap_free(entry);
> >> }
> >>
> >> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might access
> >> a freed tag.
> >
> > And David, one more information is that to keep the parameter of
> > arch_swap_restore() unchanged as folio,
> > i actually tried an ugly approach in rfc v2:
> >
> > +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> > +{
> > + if (system_supports_mte()) {
> > +      /*
> > +       * We don't support large folios swap in as whole yet, but
> > +       * we can hit a large folio which is still in swapcache
> > +       * after those related processes' PTEs have been unmapped
> > +       * but before the swapcache folio  is dropped, in this case,
> > +       * we need to find the exact page which "entry" is mapping
> > +       * to. If we are not hitting swapcache, this folio won't be
> > +       * large
> > +     */
> > + struct page *page = folio_file_page(folio, swp_offset(entry));
> > + mte_restore_tags(entry, page);
> > + }
> > +}
> >
> > And obviously everybody in the discussion hated it :-)
> >
>
> I can relate :D
>
> > i feel the only way to keep API unchanged using folio is that we
> > support restoring PTEs
> > all together for the whole large folio and we support the swap-in of
> > large folios. This is
> > in my list to do, I will send a patchset based on Ryan's large anon
> > folios series after a
> > while. till that is really done, it seems using page rather than folio
> > is a better choice.
>
> I think just restoring all tags and remembering for a large folio that
> they have been restored might be the low hanging fruit. But as always,
> devil is in the detail :)

Hi David,
thanks for all your suggestions, though my feeling is this is too complex and
not worth it, for at least three reasons.

1. In multi-threaded and particularly multi-process cases, we need some locks
to protect things and to know whether one process is the first one to restore
tags, and whether someone else is restoring tags while another process wants
to restore them. There is no such fine-grained lock at all.

2. Folios are not always large; in many cases they have just one base page
and there is no tail page to remember the state in. And it seems pretty ugly
if we end up having to use different ways to remember the restore state for
small folios and large folios.

3. There is nothing wrong with using a page to restore tags since right now
swap-in is per page. Restoring tags and swapping in become harmonious with
each other after this patch. I would argue what is really wrong is the
current mainline.

If eventually we are able to do_swap_page() the whole large folio, we can
move to folios for MTE tags as well. These two behaviours will make a
harmonious picture again.

>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
Matthew Wilcox Nov. 17, 2023, 7:36 p.m. UTC | #11
On Fri, Nov 17, 2023 at 07:47:00AM +0800, Barry Song wrote:
> This has been discussed. Steven, Ryan and I all don't think this is a good
> option. in case we have a large folio with 16 basepages, as do_swap_page
> can only map one base page for each page fault, that means we have
> to restore 16(tags we restore in each page fault) * 16(the times of page faults)
> for this large folio.

That doesn't seem all that hard to fix?  Call set_ptes() instead of
set_pte_at().  The biggest thing, I guess, is making sure that all
the PTEs you're going to set up are still pte_none().
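
A rough sketch of that direction, assuming ptep/addr point at the first PTE
covering the folio, "first_entry" is the swap entry of the folio's first page,
and ignoring pte bits such as soft-dirty as well as the corner cases Barry
lists below (the helper name is made up):

static bool swap_pte_range_unchanged(pte_t *ptep, swp_entry_t first_entry,
				     unsigned int nr)
{
	unsigned int i;

	for (i = 0; i < nr; i++) {
		swp_entry_t e = { .val = first_entry.val + i };

		/* bail out if any PTE no longer holds the expected entry */
		if (!pte_same(ptep_get(ptep + i), swp_entry_to_pte(e)))
			return false;
	}
	return true;
}

If the check passes, one set_ptes(vma->vm_mm, addr, ptep, pte, nr) call could
then map the whole folio instead of one set_pte_at() per fault.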
Barry Song Nov. 17, 2023, 8:36 p.m. UTC | #12
On Sat, Nov 18, 2023 at 3:37 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, Nov 17, 2023 at 07:47:00AM +0800, Barry Song wrote:
> > This has been discussed. Steven, Ryan and I all don't think this is a good
> > option. in case we have a large folio with 16 basepages, as do_swap_page
> > can only map one base page for each page fault, that means we have
> > to restore 16(tags we restore in each page fault) * 16(the times of page faults)
> > for this large folio.
>
> That doesn't seem all that hard to fix?  Call set_ptes() instead of
> set_pte_at().  The biggest thing, I guess, is making sure that all
> the PTEs you're going to set up are still pte_none().

I guess you mean all of them are still swap entries in the PTEs. Some risks
I can see:

1. The VMA might be split after folios are added to the swapcache, for
example if userspace unmaps or mprotects a part of a large folio.

2. The VMA is not split, but some base pages within the folio are
MADV_DONTNEED'ed.

3. Base pages in the large folio might end up having different R/W/X
permissions.

For example, if a large folio has 16 base pages, as userspace still works at
4KB granularity, userspace can mprotect a part of them read-only; in this
case the 16 PTEs will still be swap entries, but the write-fault reuse can't
work at folio granularity.

I need to consider all the above DoubleMap/split risks rather than simply
checking the PTEs, as userspace still operates at 4KB.

>

Thanks
Barry
David Hildenbrand Nov. 20, 2023, 9:11 a.m. UTC | #13
On 17.11.23 19:41, Barry Song wrote:
> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 17.11.23 01:15, Barry Song wrote:
>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>
>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>
>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>
>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>
>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>
>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>> now.
>>>>>>>
>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>> am I missing?
>>>>>>
>>>>>> Hi David,
>>>>>> you missed the discussion here:
>>>>>>
>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>
>>>>> Okay, so you want to handle the refault-from-swapcache case where you get a
>>>>> large folio.
>>>>>
>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>> base pages right now" comment.
>>>>>
>>>>> What you actually wanted to say is "While we always swap in small folios, we
>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>> the tags for the page of the large folio we are faulting on."
>>>>>
>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>> at make the interface page-free?
>>>>>
>>>>> Let me elaborate:
>>>>>
>>>>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>> subtracting from the offset.
>>>>>
>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>
>>>>>
>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>
>>>>> So the interface would be
>>>>>
>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>
>>>>> I'm sorry if that was also already discussed.
>>>>
>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>> can only map one base page for each page fault, that means we have
>>>> to restore 16(tags we restore in each page fault) * 16(the times of page faults)
>>>> for this large folio.
>>>>
>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>> might free swap entry as that swap has been in.
>>>> do_swap_page()
>>>> {
>>>>      /*
>>>>       * Remove the swap entry and conditionally try to free up the swapcache.
>>>>       * We're already holding a reference on the page but haven't mapped it
>>>>       * yet.
>>>>       */
>>>>       swap_free(entry);
>>>> }
>>>>
>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might access
>>>> a freed tag.
>>>
>>> And David, one more information is that to keep the parameter of
>>> arch_swap_restore() unchanged as folio,
>>> i actually tried an ugly approach in rfc v2:
>>>
>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>> +{
>>> + if (system_supports_mte()) {
>>> +      /*
>>> +       * We don't support large folios swap in as whole yet, but
>>> +       * we can hit a large folio which is still in swapcache
>>> +       * after those related processes' PTEs have been unmapped
>>> +       * but before the swapcache folio  is dropped, in this case,
>>> +       * we need to find the exact page which "entry" is mapping
>>> +       * to. If we are not hitting swapcache, this folio won't be
>>> +       * large
>>> +     */
>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>> + mte_restore_tags(entry, page);
>>> + }
>>> +}
>>>
>>> And obviously everybody in the discussion hated it :-)
>>>
>>
>> I can relate :D
>>
>>> i feel the only way to keep API unchanged using folio is that we
>>> support restoring PTEs
>>> all together for the whole large folio and we support the swap-in of
>>> large folios. This is
>>> in my list to do, I will send a patchset based on Ryan's large anon
>>> folios series after a
>>> while. till that is really done, it seems using page rather than folio
>>> is a better choice.
>>
>> I think just restoring all tags and remembering for a large folio that
>> they have been restored might be the low hanging fruit. But as always,
>> devil is in the detail :)
> 
> Hi David,
> thanks for all your suggestions though my feeling is this is too complex and
> is not worth it for at least  three reasons.

Fair enough.

> 
> 1. In multi-thread and particularly multi-processes, we need some locks to
> protect and help know if one process is the first one to restore tags and if
> someone else is restoring tags when one process wants to restore. there
> is not this kind of fine-grained lock at all.

We surely always hold the folio lock on swapin/swapout, no? So when 
these functions are called.

So that might just work already -- unless I am missing something important.

> 
> 2. folios are not always large, in many cases, they have just one base page
> and there is no tail to remember. and it seems pretty ugly if we turn out have
> to use different ways to remember restoring state for small folios and
> large folios.

if (folio_test_large(folio)) {

} else {

}

Easy ;)

Seriously, it's not that complicated and/or ugly.

> 
> 3. there is nothing wrong to use page to restore tags since right now swap-in
> is page. restoring tags and swapping-in become harmonious with each other
> after this patch. I would argue what is really wrong is the current mainline.
> 
> If eventually we are able to do_swap_page() for the whole large folio, we
> move to folios for MTE tags as well. These two behaviours make a new
> harmonious picture again.
> 

Just making both functions consume folios is much cleaner and also more 
future proof.

Consuming now a page where we used to consume a folio is a step backwards.
Ryan Roberts Nov. 20, 2023, 10:57 a.m. UTC | #14
On 20/11/2023 09:11, David Hildenbrand wrote:
> On 17.11.23 19:41, Barry Song wrote:
>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>>
>>> On 17.11.23 01:15, Barry Song wrote:
>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>
>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>
>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>
>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>
>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>
>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>>> now.
>>>>>>>>
>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>>> am I missing?
>>>>>>>
>>>>>>> Hi David,
>>>>>>> you missed the discussion here:
>>>>>>>
>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>
>>>>>> Okay, so you want to handle the refault-from-swapcache case where you get a
>>>>>> large folio.
>>>>>>
>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>>> base pages right now" comment.
>>>>>>
>>>>>> What you actually wanted to say is "While we always swap in small folios, we
>>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>
>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>>> at make the interface page-free?
>>>>>>
>>>>>> Let me elaborate:
>>>>>>
>>>>>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>>> subtracting from the offset.
>>>>>>
>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>
>>>>>>
>>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>
>>>>>> So the interface would be
>>>>>>
>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>
>>>>>> I'm sorry if that was also already discussed.
>>>>>
>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>>> can only map one base page for each page fault, that means we have
>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
>>>>> faults)
>>>>> for this large folio.
>>>>>
>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>>> might free swap entry as that swap has been in.
>>>>> do_swap_page()
>>>>> {
>>>>>      /*
>>>>>       * Remove the swap entry and conditionally try to free up the swapcache.
>>>>>       * We're already holding a reference on the page but haven't mapped it
>>>>>       * yet.
>>>>>       */
>>>>>       swap_free(entry);
>>>>> }
>>>>>
>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
>>>>> access
>>>>> a freed tag.
>>>>
>>>> And David, one more information is that to keep the parameter of
>>>> arch_swap_restore() unchanged as folio,
>>>> i actually tried an ugly approach in rfc v2:
>>>>
>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>> +{
>>>> + if (system_supports_mte()) {
>>>> +      /*
>>>> +       * We don't support large folios swap in as whole yet, but
>>>> +       * we can hit a large folio which is still in swapcache
>>>> +       * after those related processes' PTEs have been unmapped
>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>> +       * we need to find the exact page which "entry" is mapping
>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>> +       * large
>>>> +     */
>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>> + mte_restore_tags(entry, page);
>>>> + }
>>>> +}
>>>>
>>>> And obviously everybody in the discussion hated it :-)
>>>>
>>>
>>> I can relate :D
>>>
>>>> i feel the only way to keep API unchanged using folio is that we
>>>> support restoring PTEs
>>>> all together for the whole large folio and we support the swap-in of
>>>> large folios. This is
>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>> folios series after a
>>>> while. till that is really done, it seems using page rather than folio
>>>> is a better choice.
>>>
>>> I think just restoring all tags and remembering for a large folio that
>>> they have been restored might be the low hanging fruit. But as always,
>>> devil is in the detail :)
>>
>> Hi David,
>> thanks for all your suggestions though my feeling is this is too complex and
>> is not worth it for at least  three reasons.
> 
> Fair enough.
> 
>>
>> 1. In multi-thread and particularly multi-processes, we need some locks to
>> protect and help know if one process is the first one to restore tags and if
>> someone else is restoring tags when one process wants to restore. there
>> is not this kind of fine-grained lock at all.
> 
> We surely always hold the folio lock on swapin/swapout, no? So when these
> functions are called.
> 
> So that might just work already -- unless I am missing something important.

We already have a page flag that we use to mark the page as having had its mte
state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
has been working to remove as many per-page flags as possible). Couldn't we just
make arch_swap_restore() take a folio, restore the tags for *all* the pages and
repurpose that flag to be per-folio (so head page only)? It looks like the
mte code already manages all the serialization requirements too. Then
arch_swap_restore() can just exit early if it sees the flag is already set on
the folio.
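
Very roughly, I'm imagining something like the sketch below;
folio_test_set_mte_tagged() is made up here and stands for PG_mte_tagged
repurposed as a per-folio (head-page-only) flag, and it assumes the caller
passes the swap entry of the folio's first page:

void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
	long i, nr;

	if (!system_supports_mte())
		return;

	/* Tags were already restored by an earlier swapin of this folio. */
	if (folio_test_set_mte_tagged(folio))	/* hypothetical helper */
		return;

	nr = folio_nr_pages(folio);
	for (i = 0; i < nr; i++) {
		/* swap entries of a swapcache folio are contiguous */
		swp_entry_t e = swp_entry(swp_type(entry),
					  swp_offset(entry) + i);

		mte_restore_tags(e, folio_page(folio, i));
	}
}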

One (probably nonsense) concern that just sprung to mind about having MTE work
with large folios in general: is it possible that user space could cause a large
anon folio to be allocated (THP), then later mark *part* of it to be tagged with
MTE? In this case you would need to apply tags to part of the folio only.
Although I have a vague recollection that any MTE areas have to be marked at
mmap time and therefore this type of thing is impossible?

Thanks,
Ryan



> 
>>
>> 2. folios are not always large, in many cases, they have just one base page
>> and there is no tail to remember. and it seems pretty ugly if we turn out have
>> to use different ways to remember restoring state for small folios and
>> large folios.
> 
> if (folio_test_large(folio)) {
> 
> } else {
> 
> }
> 
> Easy ;)
> 
> Seriously, it's not that complicated and/or ugly.
> 
>>
>> 3. there is nothing wrong to use page to restore tags since right now swap-in
>> is page. restoring tags and swapping-in become harmonious with each other
>> after this patch. I would argue what is really wrong is the current mainline.
>>
>> If eventually we are able to do_swap_page() for the whole large folio, we
>> move to folios for MTE tags as well. These two behaviours make a new
>> harmonious picture again.
>>
> 
> Just making both functions consume folios is much cleaner and also more future
> proof.
> 
> Consuming now a page where we used to consume a folio is a step backwards.
>
Steven Price Nov. 20, 2023, 2:50 p.m. UTC | #15
On 20/11/2023 10:57, Ryan Roberts wrote:
> On 20/11/2023 09:11, David Hildenbrand wrote:
>> On 17.11.23 19:41, Barry Song wrote:
>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>
>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>
>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>
>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>
>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>
>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>>>> now.
>>>>>>>>>
>>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>>>> am I missing?
>>>>>>>>
>>>>>>>> Hi David,
>>>>>>>> you missed the discussion here:
>>>>>>>>
>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>
>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you get a
>>>>>>> large folio.
>>>>>>>
>>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>>>> base pages right now" comment.
>>>>>>>
>>>>>>> What you actually wanted to say is "While we always swap in small folios, we
>>>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>
>>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>>>> at make the interface page-free?
>>>>>>>
>>>>>>> Let me elaborate:
>>>>>>>
>>>>>>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
>>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>>>> subtracting from the offset.
>>>>>>>
>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>
>>>>>>>
>>>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>
>>>>>>> So the interface would be
>>>>>>>
>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>
>>>>>>> I'm sorry if that was also already discussed.
>>>>>>
>>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>>>> can only map one base page for each page fault, that means we have
>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
>>>>>> faults)
>>>>>> for this large folio.
>>>>>>
>>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>>>> might free swap entry as that swap has been in.
>>>>>> do_swap_page()
>>>>>> {
>>>>>>      /*
>>>>>>       * Remove the swap entry and conditionally try to free up the swapcache.
>>>>>>       * We're already holding a reference on the page but haven't mapped it
>>>>>>       * yet.
>>>>>>       */
>>>>>>       swap_free(entry);
>>>>>> }
>>>>>>
>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
>>>>>> access
>>>>>> a freed tag.
>>>>>
>>>>> And David, one more information is that to keep the parameter of
>>>>> arch_swap_restore() unchanged as folio,
>>>>> i actually tried an ugly approach in rfc v2:
>>>>>
>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>> +{
>>>>> + if (system_supports_mte()) {
>>>>> +      /*
>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>> +       * we can hit a large folio which is still in swapcache
>>>>> +       * after those related processes' PTEs have been unmapped
>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>> +       * large
>>>>> +     */
>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>> + mte_restore_tags(entry, page);
>>>>> + }
>>>>> +}
>>>>>
>>>>> And obviously everybody in the discussion hated it :-)
>>>>>
>>>>
>>>> I can relate :D
>>>>
>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>> support restoring PTEs
>>>>> all together for the whole large folio and we support the swap-in of
>>>>> large folios. This is
>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>> folios series after a
>>>>> while. till that is really done, it seems using page rather than folio
>>>>> is a better choice.
>>>>
>>>> I think just restoring all tags and remembering for a large folio that
>>>> they have been restored might be the low hanging fruit. But as always,
>>>> devil is in the detail :)
>>>
>>> Hi David,
>>> thanks for all your suggestions though my feeling is this is too complex and
>>> is not worth it for at least  three reasons.
>>
>> Fair enough.
>>
>>>
>>> 1. In multi-thread and particularly multi-processes, we need some locks to
>>> protect and help know if one process is the first one to restore tags and if
>>> someone else is restoring tags when one process wants to restore. there
>>> is not this kind of fine-grained lock at all.
>>
>> We surely always hold the folio lock on swapin/swapout, no? So when these
>> functions are called.
>>
>> So that might just work already -- unless I am missing something important.
> 
> We already have a page flag that we use to mark the page as having had its mte
> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
> has been working to remove as many per-page flags as possible). Couldn't we just
> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
> repurpose that flag to be per-folio (so head page only)? It looks like the the
> mte code already manages all the serialization requirements too. Then
> arch_swap_restore() can just exit early if it sees the flag is already set on
> the folio.
> 
> One (probably nonsense) concern that just sprung to mind about having MTE work
> with large folios in general; is it possible that user space could cause a large
> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
> MTE? In this case you would need to apply tags to part of the folio only.
> Although I have a vague recollection that any MTE areas have to be marked at
> mmap time and therefore this type of thing is impossible?

It is possible to use mprotect() to 'upgrade' memory to be MTE. So in
this case you'd need to be able to either split the folio or deal with
only part of it being tagged.
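
For reference, the userspace side of that upgrade is just something like the
minimal sketch below (PROT_MTE is the arm64-specific protection bit from the
uapi <asm/mman.h>; error handling omitted):

#include <sys/mman.h>

#ifndef PROT_MTE
#define PROT_MTE	0x20	/* arm64 uapi value */
#endif

/* "Upgrade" an existing anonymous mapping to allow MTE-tagged accesses. */
static int enable_mte(void *addr, size_t len)
{
	return mprotect(addr, len, PROT_READ | PROT_WRITE | PROT_MTE);
}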

Steve

> Thanks,
> Ryan
> 
> 
> 
>>
>>>
>>> 2. folios are not always large, in many cases, they have just one base page
>>> and there is no tail to remember. and it seems pretty ugly if we turn out have
>>> to use different ways to remember restoring state for small folios and
>>> large folios.
>>
>> if (folio_test_large(folio)) {
>>
>> } else {
>>
>> }
>>
>> Easy ;)
>>
>> Seriously, it's not that complicated and/or ugly.
>>
>>>
>>> 3. there is nothing wrong to use page to restore tags since right now swap-in
>>> is page. restoring tags and swapping-in become harmonious with each other
>>> after this patch. I would argue what is really wrong is the current mainline.
>>>
>>> If eventually we are able to do_swap_page() for the whole large folio, we
>>> move to folios for MTE tags as well. These two behaviours make a new
>>> harmonious picture again.
>>>
>>
>> Just making both functions consume folios is much cleaner and also more future
>> proof.
>>
>> Consuming now a page where we used to consume a folio is a step backwards.
>>
>
Barry Song Nov. 24, 2023, 1:35 a.m. UTC | #16
On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 20/11/2023 09:11, David Hildenbrand wrote:
> > On 17.11.23 19:41, Barry Song wrote:
> >> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
> >>>
> >>> On 17.11.23 01:15, Barry Song wrote:
> >>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
> >>>>>
> >>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
> >>>>>>
> >>>>>> On 15.11.23 21:49, Barry Song wrote:
> >>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
> >>>>>>>>
> >>>>>>>> On 14.11.23 02:43, Barry Song wrote:
> >>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
> >>>>>>>>> then we don't need to split them into base pages for swapping out
> >>>>>>>>> on ARM64 SoCs with MTE.
> >>>>>>>>>
> >>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
> >>>>>>>>> because we support THP swap-out as a whole.
> >>>>>>>>>
> >>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
> >>>>>>>>> folio as swap-in always works at the granularity of base pages right
> >>>>>>>>> now.
> >>>>>>>>
> >>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
> >>>>>>>> am I missing?
> >>>>>>>
> >>>>>>> Hi David,
> >>>>>>> you missed the discussion here:
> >>>>>>>
> >>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> >>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
> >>>>>>
> >>>>>> Okay, so you want to handle the refault-from-swapcache case where you get a
> >>>>>> large folio.
> >>>>>>
> >>>>>> I was mislead by your "folio as swap-in always works at the granularity of
> >>>>>> base pages right now" comment.
> >>>>>>
> >>>>>> What you actually wanted to say is "While we always swap in small folios, we
> >>>>>> might refault large folios from the swapcache, and we only want to restore
> >>>>>> the tags for the page of the large folio we are faulting on."
> >>>>>>
> >>>>>> But, I do if we can't simply restore the tags for the whole thing at once
> >>>>>> at make the interface page-free?
> >>>>>>
> >>>>>> Let me elaborate:
> >>>>>>
> >>>>>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
> >>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
> >>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
> >>>>>> subtracting from the offset.
> >>>>>>
> >>>>>> See page_swap_entry() on how we perform this calculation.
> >>>>>>
> >>>>>>
> >>>>>> So you can simply pass the large folio and the swap entry corresponding
> >>>>>> to the first page of the large folio, and restore all tags at once.
> >>>>>>
> >>>>>> So the interface would be
> >>>>>>
> >>>>>> arch_prepare_to_swap(struct folio *folio);
> >>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
> >>>>>>
> >>>>>> I'm sorry if that was also already discussed.
> >>>>>
> >>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
> >>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
> >>>>> can only map one base page for each page fault, that means we have
> >>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
> >>>>> faults)
> >>>>> for this large folio.
> >>>>>
> >>>>> and still the worst thing is the page fault in the Nth PTE of large folio
> >>>>> might free swap entry as that swap has been in.
> >>>>> do_swap_page()
> >>>>> {
> >>>>>      /*
> >>>>>       * Remove the swap entry and conditionally try to free up the swapcache.
> >>>>>       * We're already holding a reference on the page but haven't mapped it
> >>>>>       * yet.
> >>>>>       */
> >>>>>       swap_free(entry);
> >>>>> }
> >>>>>
> >>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
> >>>>> access
> >>>>> a freed tag.
> >>>>
> >>>> And David, one more information is that to keep the parameter of
> >>>> arch_swap_restore() unchanged as folio,
> >>>> i actually tried an ugly approach in rfc v2:
> >>>>
> >>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> >>>> +{
> >>>> + if (system_supports_mte()) {
> >>>> +      /*
> >>>> +       * We don't support large folios swap in as whole yet, but
> >>>> +       * we can hit a large folio which is still in swapcache
> >>>> +       * after those related processes' PTEs have been unmapped
> >>>> +       * but before the swapcache folio  is dropped, in this case,
> >>>> +       * we need to find the exact page which "entry" is mapping
> >>>> +       * to. If we are not hitting swapcache, this folio won't be
> >>>> +       * large
> >>>> +     */
> >>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
> >>>> + mte_restore_tags(entry, page);
> >>>> + }
> >>>> +}
> >>>>
> >>>> And obviously everybody in the discussion hated it :-)
> >>>>
> >>>
> >>> I can relate :D
> >>>
> >>>> i feel the only way to keep API unchanged using folio is that we
> >>>> support restoring PTEs
> >>>> all together for the whole large folio and we support the swap-in of
> >>>> large folios. This is
> >>>> in my list to do, I will send a patchset based on Ryan's large anon
> >>>> folios series after a
> >>>> while. till that is really done, it seems using page rather than folio
> >>>> is a better choice.
> >>>
> >>> I think just restoring all tags and remembering for a large folio that
> >>> they have been restored might be the low hanging fruit. But as always,
> >>> devil is in the detail :)
> >>
> >> Hi David,
> >> thanks for all your suggestions though my feeling is this is too complex and
> >> is not worth it for at least  three reasons.
> >
> > Fair enough.
> >
> >>
> >> 1. In multi-thread and particularly multi-processes, we need some locks to
> >> protect and help know if one process is the first one to restore tags and if
> >> someone else is restoring tags when one process wants to restore. there
> >> is not this kind of fine-grained lock at all.
> >
> > We surely always hold the folio lock on swapin/swapout, no? So when these
> > functions are called.
> >
> > So that might just work already -- unless I am missing something important.
>
> We already have a page flag that we use to mark the page as having had its mte
> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
> has been working to remove as many per-page flags as possible). Couldn't we just
> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
> repurpose that flag to be per-folio (so head page only)? It looks like the the
> mte code already manages all the serialization requirements too. Then
> arch_swap_restore() can just exit early if it sees the flag is already set on
> the folio.
>
> One (probably nonsense) concern that just sprung to mind about having MTE work
> with large folios in general; is it possible that user space could cause a large
> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
> MTE? In this case you would need to apply tags to part of the folio only.
> Although I have a vague recollection that any MTE areas have to be marked at
> mmap time and therefore this type of thing is impossible?

Right, we might need to consider that only a part of the folio needs
to be mapped and have its MTE tags restored.
do_swap_page() can hit a large folio, but it only needs to fault in
one base page.

A case can be quite simple as below (a userspace sketch of steps 4
and 5 follows the list),

1. an anon folio is shared by process A and B;
2. add_to_swap() swaps it out as a large folio;
3. the folio is unmapped from A and B;
4. after A is unmapped (its PTEs become swap entries), we do a
MADV_DONTNEED on a part of the folio. This can happen very easily as
userspace still works at the 4KB level; a userspace heap manager can
free a basepage-sized area with madvise(address, MADV_DONTNEED, 4KB);
5. A refaults on address + 8KB; we hit the large folio in
do_swap_page() but only need to map one basepage, and we will never
need the DONTNEEDed part in process A.
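
Just to make steps 4 and 5 concrete, the userspace side is nothing
exotic - a minimal sketch, with addresses and sizes made up and no
error handling:

#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 * 1024;	/* may be backed by one large anon folio */
	char *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(addr, 1, len);		/* fault the folio in */

	/* ... reclaim swaps the folio out as a whole ... */

	/* heap manager frees one 4KB basepage of it */
	madvise(addr, 4096, MADV_DONTNEED);

	/* later refault on another basepage of the same (large) folio */
	return addr[2 * 4096];
}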

Another, more complicated case is mprotect() or munmap() on a part of
a large folio. Since userspace has no idea of large folios, it can do
all kinds of strange things. Are we sure that in all such cases the
large folios have been split into small folios?

>
> Thanks,
> Ryan
>
>
>
> >
> >>
> >> 2. folios are not always large, in many cases, they have just one base page
> >> and there is no tail to remember. and it seems pretty ugly if we turn out have
> >> to use different ways to remember restoring state for small folios and
> >> large folios.
> >
> > if (folio_test_large(folio)) {
> >
> > } else {
> >
> > }
> >
> > Easy ;)
> >
> > Seriously, it's not that complicated and/or ugly.
> >
> >>
> >> 3. there is nothing wrong to use page to restore tags since right now swap-in
> >> is page. restoring tags and swapping-in become harmonious with each other
> >> after this patch. I would argue what is really wrong is the current mainline.
> >>
> >> If eventually we are able to do_swap_page() for the whole large folio, we
> >> move to folios for MTE tags as well. These two behaviours make a new
> >> harmonious picture again.
> >>
> >
> > Just making both functions consume folios is much cleaner and also more future
> > proof.
> >
> > Consuming now a page where we used to consume a folio is a step backwards.
> >
>

Thanks
Barry
David Hildenbrand Nov. 24, 2023, 8:55 a.m. UTC | #17
On 24.11.23 02:35, Barry Song wrote:
> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>> On 17.11.23 19:41, Barry Song wrote:
>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>
>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>
>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>
>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>
>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>
>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>>>>> now.
>>>>>>>>>>
>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>>>>> am I missing?
>>>>>>>>>
>>>>>>>>> Hi David,
>>>>>>>>> you missed the discussion here:
>>>>>>>>>
>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>
>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you get a
>>>>>>>> large folio.
>>>>>>>>
>>>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>>>>> base pages right now" comment.
>>>>>>>>
>>>>>>>> What you actually wanted to say is "While we always swap in small folios, we
>>>>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>
>>>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>>>>> at make the interface page-free?
>>>>>>>>
>>>>>>>> Let me elaborate:
>>>>>>>>
>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap entries/offset are
>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>>>>> subtracting from the offset.
>>>>>>>>
>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>
>>>>>>>>
>>>>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>
>>>>>>>> So the interface would be
>>>>>>>>
>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>>
>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>
>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
>>>>>>> faults)
>>>>>>> for this large folio.
>>>>>>>
>>>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>>>>> might free swap entry as that swap has been in.
>>>>>>> do_swap_page()
>>>>>>> {
>>>>>>>       /*
>>>>>>>        * Remove the swap entry and conditionally try to free up the swapcache.
>>>>>>>        * We're already holding a reference on the page but haven't mapped it
>>>>>>>        * yet.
>>>>>>>        */
>>>>>>>        swap_free(entry);
>>>>>>> }
>>>>>>>
>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
>>>>>>> access
>>>>>>> a freed tag.
>>>>>>
>>>>>> And David, one more information is that to keep the parameter of
>>>>>> arch_swap_restore() unchanged as folio,
>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>
>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>> +{
>>>>>> + if (system_supports_mte()) {
>>>>>> +      /*
>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>> +       * large
>>>>>> +     */
>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>> + mte_restore_tags(entry, page);
>>>>>> + }
>>>>>> +}
>>>>>>
>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>
>>>>>
>>>>> I can relate :D
>>>>>
>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>> support restoring PTEs
>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>> large folios. This is
>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>> folios series after a
>>>>>> while. till that is really done, it seems using page rather than folio
>>>>>> is a better choice.
>>>>>
>>>>> I think just restoring all tags and remembering for a large folio that
>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>> devil is in the detail :)
>>>>
>>>> Hi David,
>>>> thanks for all your suggestions though my feeling is this is too complex and
>>>> is not worth it for at least  three reasons.
>>>
>>> Fair enough.
>>>
>>>>
>>>> 1. In multi-thread and particularly multi-processes, we need some locks to
>>>> protect and help know if one process is the first one to restore tags and if
>>>> someone else is restoring tags when one process wants to restore. there
>>>> is not this kind of fine-grained lock at all.
>>>
>>> We surely always hold the folio lock on swapin/swapout, no? So when these
>>> functions are called.
>>>
>>> So that might just work already -- unless I am missing something important.
>>
>> We already have a page flag that we use to mark the page as having had its mte
>> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
>> has been working to remove as many per-page flags as possible). Couldn't we just
>> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
>> repurpose that flag to be per-folio (so head page only)? It looks like the the
>> mte code already manages all the serialization requirements too. Then
>> arch_swap_restore() can just exit early if it sees the flag is already set on
>> the folio.
>>
>> One (probably nonsense) concern that just sprung to mind about having MTE work
>> with large folios in general; is it possible that user space could cause a large
>> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
>> MTE? In this case you would need to apply tags to part of the folio only.
>> Although I have a vague recollection that any MTE areas have to be marked at
>> mmap time and therefore this type of thing is impossible?
> 
> right, we might need to consider only a part of folio needs to be
> mapped and restored MTE tags.
> do_swap_page() can have a chance to hit a large folio but it only
> needs to fault-in a page.
> 
> A case can be quite simple as below,
> 
> 1. anon folio shared by process A and B
> 2. add_to_swap() as a large folio;
> 3. try to unmap A and B;
> 4. after A is unmapped(ptes become swap entries), we do a
> MADV_DONTNEED on a part of the folio. this can
> happen very easily as userspace is still working in 4KB level;
> userspace heap management can free an
> basepage area by MADV_DONTNEED;
> madvise(address, MADV_DONTNEED, 4KB);
> 5. A refault on address + 8KB, we will hit large folio in
> do_swap_page() but we will only need to map
> one basepage, we will never need this DONTNEEDed in process A.
> 
> another more complicated case can be mprotect and munmap a part of
> large folios. since userspace
> has no idea of large folios in their mind, they can do all strange
> things. are we sure in all cases,
> large folios have been splitted into small folios?

To handle that, we'd have to identify

a) if a subpage has an mte tag to save during swapout
b) if a subpage has an mte tag to restore during swapin

I suspect b) can be had from whatever data structure we're using to
actually save the tags?

For a), is there some way to have that information from the HW?
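
For b), at least on arm64 the saved tags IIUC already sit in an xarray
keyed by the swap entry (mte_pages in arch/arm64/mm/mteswap.c), so the
check would be a simple lookup -- sketch only:

/* Sketch: does this swap entry have saved tags to restore? */
static bool entry_has_saved_tags(swp_entry_t entry)
{
	return xa_load(&mte_pages, entry.val) != NULL;
}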
Ryan Roberts Nov. 24, 2023, 9:01 a.m. UTC | #18
On 24/11/2023 08:55, David Hildenbrand wrote:
> On 24.11.23 02:35, Barry Song wrote:
>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>
>>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>>> On 17.11.23 19:41, Barry Song wrote:
>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>
>>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>>
>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>
>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com>
>>>>>>>>>> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>>
>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>>
>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>>>>>> now.
>>>>>>>>>>>
>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>>>>>> am I missing?
>>>>>>>>>>
>>>>>>>>>> Hi David,
>>>>>>>>>> you missed the discussion here:
>>>>>>>>>>
>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>>
>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
>>>>>>>>> get a
>>>>>>>>> large folio.
>>>>>>>>>
>>>>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>>>>>> base pages right now" comment.
>>>>>>>>>
>>>>>>>>> What you actually wanted to say is "While we always swap in small
>>>>>>>>> folios, we
>>>>>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>>
>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>>>>>> at make the interface page-free?
>>>>>>>>>
>>>>>>>>> Let me elaborate:
>>>>>>>>>
>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
>>>>>>>>> entries/offset are
>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>>>>>> subtracting from the offset.
>>>>>>>>>
>>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>>
>>>>>>>>> So the interface would be
>>>>>>>>>
>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>>>
>>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>>
>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
>>>>>>>> faults)
>>>>>>>> for this large folio.
>>>>>>>>
>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>>>>>> might free swap entry as that swap has been in.
>>>>>>>> do_swap_page()
>>>>>>>> {
>>>>>>>>       /*
>>>>>>>>        * Remove the swap entry and conditionally try to free up the
>>>>>>>> swapcache.
>>>>>>>>        * We're already holding a reference on the page but haven't
>>>>>>>> mapped it
>>>>>>>>        * yet.
>>>>>>>>        */
>>>>>>>>        swap_free(entry);
>>>>>>>> }
>>>>>>>>
>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
>>>>>>>> access
>>>>>>>> a freed tag.
>>>>>>>
>>>>>>> And David, one more information is that to keep the parameter of
>>>>>>> arch_swap_restore() unchanged as folio,
>>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>>
>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>>> +{
>>>>>>> + if (system_supports_mte()) {
>>>>>>> +      /*
>>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>>> +       * large
>>>>>>> +     */
>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>>> + mte_restore_tags(entry, page);
>>>>>>> + }
>>>>>>> +}
>>>>>>>
>>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>>
>>>>>>
>>>>>> I can relate :D
>>>>>>
>>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>>> support restoring PTEs
>>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>>> large folios. This is
>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>>> folios series after a
>>>>>>> while. till that is really done, it seems using page rather than folio
>>>>>>> is a better choice.
>>>>>>
>>>>>> I think just restoring all tags and remembering for a large folio that
>>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>>> devil is in the detail :)
>>>>>
>>>>> Hi David,
>>>>> thanks for all your suggestions though my feeling is this is too complex and
>>>>> is not worth it for at least  three reasons.
>>>>
>>>> Fair enough.
>>>>
>>>>>
>>>>> 1. In multi-thread and particularly multi-processes, we need some locks to
>>>>> protect and help know if one process is the first one to restore tags and if
>>>>> someone else is restoring tags when one process wants to restore. there
>>>>> is not this kind of fine-grained lock at all.
>>>>
>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
>>>> functions are called.
>>>>
>>>> So that might just work already -- unless I am missing something important.
>>>
>>> We already have a page flag that we use to mark the page as having had its mte
>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
>>> has been working to remove as many per-page flags as possible). Couldn't we just
>>> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
>>> repurpose that flag to be per-folio (so head page only)? It looks like the the
>>> mte code already manages all the serialization requirements too. Then
>>> arch_swap_restore() can just exit early if it sees the flag is already set on
>>> the folio.
>>>
>>> One (probably nonsense) concern that just sprung to mind about having MTE work
>>> with large folios in general; is it possible that user space could cause a large
>>> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
>>> MTE? In this case you would need to apply tags to part of the folio only.
>>> Although I have a vague recollection that any MTE areas have to be marked at
>>> mmap time and therefore this type of thing is impossible?
>>
>> right, we might need to consider only a part of folio needs to be
>> mapped and restored MTE tags.
>> do_swap_page() can have a chance to hit a large folio but it only
>> needs to fault-in a page.
>>
>> A case can be quite simple as below,
>>
>> 1. anon folio shared by process A and B
>> 2. add_to_swap() as a large folio;
>> 3. try to unmap A and B;
>> 4. after A is unmapped(ptes become swap entries), we do a
>> MADV_DONTNEED on a part of the folio. this can
>> happen very easily as userspace is still working in 4KB level;
>> userspace heap management can free an
>> basepage area by MADV_DONTNEED;
>> madvise(address, MADV_DONTNEED, 4KB);
>> 5. A refault on address + 8KB, we will hit large folio in
>> do_swap_page() but we will only need to map
>> one basepage, we will never need this DONTNEEDed in process A.
>>
>> another more complicated case can be mprotect and munmap a part of
>> large folios. since userspace
>> has no idea of large folios in their mind, they can do all strange
>> things. are we sure in all cases,
>> large folios have been splitted into small folios?

I don't think these examples you cite are problematic. Although user space
thinks about things in 4K pages, the kernel does things in units of folios. So a
folio is either fully swapped out or not swapped out at all. MTE tags can be
saved/restored per folio, even if only part of that folio ends up being mapped
back into user space.

The problem is that MTE tagging could be active only for a selection of pages
within the folio; that's where it gets tricky.

> 
> To handle that, we'd have to identify
> 
> a) if a subpage has an mte tag to save during swapout
> b) if a subpage has an mte tag to restore during swapin
> 
> I suspect b) can be had from whatever datastructure we're using to actually save
> the tags?
> 
> For a), is there some way to have that information from the HW?

Yes I agree with this approach. I don't know the answer to that question though;
I'd assume it must be possible. Steven?

>
Steven Price Nov. 24, 2023, 9:55 a.m. UTC | #19
On 24/11/2023 09:01, Ryan Roberts wrote:
> On 24/11/2023 08:55, David Hildenbrand wrote:
>> On 24.11.23 02:35, Barry Song wrote:
>>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>>>> On 17.11.23 19:41, Barry Song wrote:
>>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>
>>>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com>
>>>>>>>>>>> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>>>
>>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>>>>>>> now.
>>>>>>>>>>>>
>>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>>>>>>> am I missing?
>>>>>>>>>>>
>>>>>>>>>>> Hi David,
>>>>>>>>>>> you missed the discussion here:
>>>>>>>>>>>
>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>>>
>>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
>>>>>>>>>> get a
>>>>>>>>>> large folio.
>>>>>>>>>>
>>>>>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>>>>>>> base pages right now" comment.
>>>>>>>>>>
>>>>>>>>>> What you actually wanted to say is "While we always swap in small
>>>>>>>>>> folios, we
>>>>>>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>>>
>>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>>>>>>> at make the interface page-free?
>>>>>>>>>>
>>>>>>>>>> Let me elaborate:
>>>>>>>>>>
>>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
>>>>>>>>>> entries/offset are
>>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>>>>>>> subtracting from the offset.
>>>>>>>>>>
>>>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>>>
>>>>>>>>>> So the interface would be
>>>>>>>>>>
>>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>>>>
>>>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>>>
>>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>>>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
>>>>>>>>> faults)
>>>>>>>>> for this large folio.
>>>>>>>>>
>>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>>>>>>> might free swap entry as that swap has been in.
>>>>>>>>> do_swap_page()
>>>>>>>>> {
>>>>>>>>>       /*
>>>>>>>>>        * Remove the swap entry and conditionally try to free up the
>>>>>>>>> swapcache.
>>>>>>>>>        * We're already holding a reference on the page but haven't
>>>>>>>>> mapped it
>>>>>>>>>        * yet.
>>>>>>>>>        */
>>>>>>>>>        swap_free(entry);
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
>>>>>>>>> access
>>>>>>>>> a freed tag.
>>>>>>>>
>>>>>>>> And David, one more information is that to keep the parameter of
>>>>>>>> arch_swap_restore() unchanged as folio,
>>>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>>>
>>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>>>> +{
>>>>>>>> + if (system_supports_mte()) {
>>>>>>>> +      /*
>>>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>>>> +       * large
>>>>>>>> +     */
>>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>>>> + mte_restore_tags(entry, page);
>>>>>>>> + }
>>>>>>>> +}
>>>>>>>>
>>>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>>>
>>>>>>>
>>>>>>> I can relate :D
>>>>>>>
>>>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>>>> support restoring PTEs
>>>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>>>> large folios. This is
>>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>>>> folios series after a
>>>>>>>> while. till that is really done, it seems using page rather than folio
>>>>>>>> is a better choice.
>>>>>>>
>>>>>>> I think just restoring all tags and remembering for a large folio that
>>>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>>>> devil is in the detail :)
>>>>>>
>>>>>> Hi David,
>>>>>> thanks for all your suggestions though my feeling is this is too complex and
>>>>>> is not worth it for at least  three reasons.
>>>>>
>>>>> Fair enough.
>>>>>
>>>>>>
>>>>>> 1. In multi-thread and particularly multi-processes, we need some locks to
>>>>>> protect and help know if one process is the first one to restore tags and if
>>>>>> someone else is restoring tags when one process wants to restore. there
>>>>>> is not this kind of fine-grained lock at all.
>>>>>
>>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
>>>>> functions are called.
>>>>>
>>>>> So that might just work already -- unless I am missing something important.
>>>>
>>>> We already have a page flag that we use to mark the page as having had its mte
>>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
>>>> has been working to remove as many per-page flags as possible). Couldn't we just
>>>> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
>>>> repurpose that flag to be per-folio (so head page only)? It looks like the the
>>>> mte code already manages all the serialization requirements too. Then
>>>> arch_swap_restore() can just exit early if it sees the flag is already set on
>>>> the folio.
>>>>
>>>> One (probably nonsense) concern that just sprung to mind about having MTE work
>>>> with large folios in general; is it possible that user space could cause a large
>>>> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
>>>> MTE? In this case you would need to apply tags to part of the folio only.
>>>> Although I have a vague recollection that any MTE areas have to be marked at
>>>> mmap time and therefore this type of thing is impossible?
>>>
>>> right, we might need to consider only a part of folio needs to be
>>> mapped and restored MTE tags.
>>> do_swap_page() can have a chance to hit a large folio but it only
>>> needs to fault-in a page.
>>>
>>> A case can be quite simple as below,
>>>
>>> 1. anon folio shared by process A and B
>>> 2. add_to_swap() as a large folio;
>>> 3. try to unmap A and B;
>>> 4. after A is unmapped(ptes become swap entries), we do a
>>> MADV_DONTNEED on a part of the folio. this can
>>> happen very easily as userspace is still working in 4KB level;
>>> userspace heap management can free an
>>> basepage area by MADV_DONTNEED;
>>> madvise(address, MADV_DONTNEED, 4KB);
>>> 5. A refault on address + 8KB, we will hit large folio in
>>> do_swap_page() but we will only need to map
>>> one basepage, we will never need this DONTNEEDed in process A.
>>>
>>> another more complicated case can be mprotect and munmap a part of
>>> large folios. since userspace
>>> has no idea of large folios in their mind, they can do all strange
>>> things. are we sure in all cases,
>>> large folios have been splitted into small folios?
> 
> I don;'t think these examples you cite are problematic. Although user space
> thinks about things in 4K pages, the kernel does things in units of folios. So a
> folio is either fully swapped out or not swapped out at all. MTE tags can be
> saved/restored per folio, even if only part of that folio ends up being mapped
> back into user space.
> 
> The problem is that MTE tagging could be active only for a selection of pages
> within the folio; that's where it gets tricky.
> 
>>
>> To handle that, we'd have to identify
>>
>> a) if a subpage has an mte tag to save during swapout
>> b) if a subpage has an mte tag to restore during swapin
>>
>> I suspect b) can be had from whatever datastructure we're using to actually save
>> the tags?
>>
>> For a), is there some way to have that information from the HW?
> 
> Yes I agree with this approach. I don't know the answer to that question though;
> I'd assume it must be possible. Steven?

Architecturally 'all' pages have MTE metadata (although see Alex's
series[1] which would change this).

The question is: could user space have put any data in the tag storage?
We currently use the PG_mte_tagged page flag to keep track of pages
which have been mapped (to user space) with MTE enabled. If the page has
never been exposed to user space with MTE enabled (since being cleared)
then there's nothing of interest to store.

It would be possible to reverse this scheme - we could drop the page
flag and just look at the actual tag storage. If it's all zeros then
obviously there's no point in storing it. Note that originally we had a
lazy clear of tag storage - i.e. if user space only had mappings without
MTE enabled then the tag storage could contain junk. I believe that's
now changed and the tag storage is always cleared at the same time as
the data storage.
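
A rough sketch of the swapout side of that reversed scheme - where
mte_allocate_tag_storage()/mte_save_page_tags() are the existing helpers
and the all-zero check plus early free are the new bit (function name
made up):

static void *mte_save_tags_if_any(struct page *page)
{
	void *tags = mte_allocate_tag_storage();

	if (!tags)
		return NULL;

	mte_save_page_tags(page_address(page), tags);

	/* All-zero tags: user space never stored anything, don't keep it. */
	if (!memchr_inv(tags, 0, MTE_PAGE_TAG_STORAGE)) {
		mte_free_tag_storage(tags);
		return NULL;
	}

	return tags;
}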

The VMAs (obviously) also carry the information about whether a range is
MTE-enabled (the VM_MTE flag controlled by PROT_MTE in user space), but
there could be many VMAs and they may have different permissions, so
fetching the state from there would be expensive.

Not really relevant here, but the kernel can also use MTE (HW_TAGS
KASAN) - AFAIK there's no way of identifying if the MTE tag storage is
used or not for kernel pages. But this only presents an issue for
hibernation which uses a different mechanism to normal swap.

Steve

[1]
https://lore.kernel.org/r/20231119165721.9849-1-alexandru.elisei%40arm.com
Barry Song Nov. 24, 2023, 6:14 p.m. UTC | #20
On Fri, Nov 24, 2023 at 10:55 PM Steven Price <steven.price@arm.com> wrote:
>
> On 24/11/2023 09:01, Ryan Roberts wrote:
> > On 24/11/2023 08:55, David Hildenbrand wrote:
> >> On 24.11.23 02:35, Barry Song wrote:
> >>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>
> >>>> On 20/11/2023 09:11, David Hildenbrand wrote:
> >>>>> On 17.11.23 19:41, Barry Song wrote:
> >>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
> >>>>>>>
> >>>>>>> On 17.11.23 01:15, Barry Song wrote:
> >>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
> >>>>>>>>>
> >>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
> >>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com>
> >>>>>>>>>>> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
> >>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
> >>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
> >>>>>>>>>>>>> on ARM64 SoCs with MTE.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
> >>>>>>>>>>>>> because we support THP swap-out as a whole.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
> >>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages right
> >>>>>>>>>>>>> now.
> >>>>>>>>>>>>
> >>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
> >>>>>>>>>>>> am I missing?
> >>>>>>>>>>>
> >>>>>>>>>>> Hi David,
> >>>>>>>>>>> you missed the discussion here:
> >>>>>>>>>>>
> >>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> >>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
> >>>>>>>>>>
> >>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
> >>>>>>>>>> get a
> >>>>>>>>>> large folio.
> >>>>>>>>>>
> >>>>>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
> >>>>>>>>>> base pages right now" comment.
> >>>>>>>>>>
> >>>>>>>>>> What you actually wanted to say is "While we always swap in small
> >>>>>>>>>> folios, we
> >>>>>>>>>> might refault large folios from the swapcache, and we only want to restore
> >>>>>>>>>> the tags for the page of the large folio we are faulting on."
> >>>>>>>>>>
> >>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
> >>>>>>>>>> at make the interface page-free?
> >>>>>>>>>>
> >>>>>>>>>> Let me elaborate:
> >>>>>>>>>>
> >>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
> >>>>>>>>>> entries/offset are
> >>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
> >>>>>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
> >>>>>>>>>> subtracting from the offset.
> >>>>>>>>>>
> >>>>>>>>>> See page_swap_entry() on how we perform this calculation.
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> So you can simply pass the large folio and the swap entry corresponding
> >>>>>>>>>> to the first page of the large folio, and restore all tags at once.
> >>>>>>>>>>
> >>>>>>>>>> So the interface would be
> >>>>>>>>>>
> >>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
> >>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
> >>>>>>>>>>
> >>>>>>>>>> I'm sorry if that was also already discussed.
> >>>>>>>>>
> >>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
> >>>>>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
> >>>>>>>>> can only map one base page for each page fault, that means we have
> >>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
> >>>>>>>>> faults)
> >>>>>>>>> for this large folio.
> >>>>>>>>>
> >>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
> >>>>>>>>> might free swap entry as that swap has been in.
> >>>>>>>>> do_swap_page()
> >>>>>>>>> {
> >>>>>>>>>       /*
> >>>>>>>>>        * Remove the swap entry and conditionally try to free up the
> >>>>>>>>> swapcache.
> >>>>>>>>>        * We're already holding a reference on the page but haven't
> >>>>>>>>> mapped it
> >>>>>>>>>        * yet.
> >>>>>>>>>        */
> >>>>>>>>>        swap_free(entry);
> >>>>>>>>> }
> >>>>>>>>>
> >>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
> >>>>>>>>> access
> >>>>>>>>> a freed tag.
> >>>>>>>>
> >>>>>>>> And David, one more information is that to keep the parameter of
> >>>>>>>> arch_swap_restore() unchanged as folio,
> >>>>>>>> i actually tried an ugly approach in rfc v2:
> >>>>>>>>
> >>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> >>>>>>>> +{
> >>>>>>>> + if (system_supports_mte()) {
> >>>>>>>> +      /*
> >>>>>>>> +       * We don't support large folios swap in as whole yet, but
> >>>>>>>> +       * we can hit a large folio which is still in swapcache
> >>>>>>>> +       * after those related processes' PTEs have been unmapped
> >>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
> >>>>>>>> +       * we need to find the exact page which "entry" is mapping
> >>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
> >>>>>>>> +       * large
> >>>>>>>> +     */
> >>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
> >>>>>>>> + mte_restore_tags(entry, page);
> >>>>>>>> + }
> >>>>>>>> +}
> >>>>>>>>
> >>>>>>>> And obviously everybody in the discussion hated it :-)
> >>>>>>>>
> >>>>>>>
> >>>>>>> I can relate :D
> >>>>>>>
> >>>>>>>> i feel the only way to keep API unchanged using folio is that we
> >>>>>>>> support restoring PTEs
> >>>>>>>> all together for the whole large folio and we support the swap-in of
> >>>>>>>> large folios. This is
> >>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
> >>>>>>>> folios series after a
> >>>>>>>> while. till that is really done, it seems using page rather than folio
> >>>>>>>> is a better choice.
> >>>>>>>
> >>>>>>> I think just restoring all tags and remembering for a large folio that
> >>>>>>> they have been restored might be the low hanging fruit. But as always,
> >>>>>>> devil is in the detail :)
> >>>>>>
> >>>>>> Hi David,
> >>>>>> thanks for all your suggestions though my feeling is this is too complex and
> >>>>>> is not worth it for at least  three reasons.
> >>>>>
> >>>>> Fair enough.
> >>>>>
> >>>>>>
> >>>>>> 1. In multi-thread and particularly multi-processes, we need some locks to
> >>>>>> protect and help know if one process is the first one to restore tags and if
> >>>>>> someone else is restoring tags when one process wants to restore. there
> >>>>>> is not this kind of fine-grained lock at all.
> >>>>>
> >>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
> >>>>> functions are called.
> >>>>>
> >>>>> So that might just work already -- unless I am missing something important.
> >>>>
> >>>> We already have a page flag that we use to mark the page as having had its mte
> >>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
> >>>> has been working to remove as many per-page flags as possible). Couldn't we just
> >>>> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
> >>>> repurpose that flag to be per-folio (so head page only)? It looks like the the
> >>>> mte code already manages all the serialization requirements too. Then
> >>>> arch_swap_restore() can just exit early if it sees the flag is already set on
> >>>> the folio.
> >>>>
> >>>> One (probably nonsense) concern that just sprung to mind about having MTE work
> >>>> with large folios in general; is it possible that user space could cause a large
> >>>> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
> >>>> MTE? In this case you would need to apply tags to part of the folio only.
> >>>> Although I have a vague recollection that any MTE areas have to be marked at
> >>>> mmap time and therefore this type of thing is impossible?
> >>>
> >>> right, we might need to consider only a part of folio needs to be
> >>> mapped and restored MTE tags.
> >>> do_swap_page() can have a chance to hit a large folio but it only
> >>> needs to fault-in a page.
> >>>
> >>> A case can be quite simple as below,
> >>>
> >>> 1. anon folio shared by process A and B
> >>> 2. add_to_swap() as a large folio;
> >>> 3. try to unmap A and B;
> >>> 4. after A is unmapped(ptes become swap entries), we do a
> >>> MADV_DONTNEED on a part of the folio. this can
> >>> happen very easily as userspace is still working in 4KB level;
> >>> userspace heap management can free an
> >>> basepage area by MADV_DONTNEED;
> >>> madvise(address, MADV_DONTNEED, 4KB);
> >>> 5. A refault on address + 8KB, we will hit large folio in
> >>> do_swap_page() but we will only need to map
> >>> one basepage, we will never need this DONTNEEDed in process A.
> >>>
> >>> another more complicated case can be mprotect and munmap a part of
> >>> large folios. since userspace
> >>> has no idea of large folios in their mind, they can do all strange
> >>> things. are we sure in all cases,
> >>> large folios have been splitted into small folios?
> >
> > I don;'t think these examples you cite are problematic. Although user space
> > thinks about things in 4K pages, the kernel does things in units of folios. So a
> > folio is either fully swapped out or not swapped out at all. MTE tags can be
> > saved/restored per folio, even if only part of that folio ends up being mapped
> > back into user space.

I am not so optimistic :-)

But doesn't zap_pte_range(), due to a MADV_DONTNEED on part of a swapped-out
folio, free part of the swap entries, and thus free part of the MTE tags of
that folio? After a process's large folio is swapped out, all PTEs in the
large folio become swap entries; a MADV_DONTNEED on only part of that area
will set just that part of the swap entries to PTE_NONE, decreasing the swap
count for that part:

zap_pte_range
    ->
          entry = pte_to_swp_entry
                  -> free_swap_and_cache(entry)
                      -> mte tags invalidate
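
(Roughly, the arm64 hook on that path - sketching from memory of
mm/swapfile.c and arch/arm64/include/asm/pgtable.h, so take the exact
placement as an approximation rather than a quote of the code:

/* mm/swapfile.c: dropping the last reference to a range of swap entries */
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long end = offset + nr_entries - 1;
	...
	while (offset <= end) {
		/* arm64: frees the tags saved for this swap offset */
		arch_swap_invalidate_page(si->type, offset);
		offset++;
	}
}

/* arch/arm64/include/asm/pgtable.h */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
	if (system_supports_mte())
		mte_invalidate_tags(type, offset);
}

so a partial MADV_DONTNEED really can drop the stored tags for just a few
subpages of a still-swapped-out large folio.)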

> >
> > The problem is that MTE tagging could be active only for a selection of pages
> > within the folio; that's where it gets tricky.
> >
> >>
> >> To handle that, we'd have to identify
> >>
> >> a) if a subpage has an mte tag to save during swapout
> >> b) if a subpage has an mte tag to restore during swapin
> >>
> >> I suspect b) can be had from whatever datastructure we're using to actually save
> >> the tags?
> >>
> >> For a), is there some way to have that information from the HW?
> >
> > Yes I agree with this approach. I don't know the answer to that question though;
> > I'd assume it must be possible. Steven?
>
> Architecturally 'all' pages have MTE metadata (although see Alex's
> series[1] which would change this).
>
> The question is: could user space have put any data in the tag storage?
> We currently use the PG_mte_tagged page flag to keep track of pages
> which have been mapped (to user space) with MTE enabled. If the page has
> never been exposed to user space with MTE enabled (since being cleared)
> then there's nothing of interest to store.
>
> It would be possible to reverse this scheme - we could drop the page
> flag and just look at the actual tag storage. If it's all zeros then
> obviously there's no point in storing it. Note that originally we had a
> lazy clear of tag storage - i.e. if user space only had mapping without
> MTE enabled then the tag storage could contain junk. I believe that's
> now changed and the tag storage is always cleared at the same time as
> the data storage.
>
> The VMAs (obviously) also carry the information about whether a range is
> MTE-enabled (the VM_MTE flag controlled by PROT_MTE in user space), but
> there could be many VMAs and they may have different permissions, so
> fetching the state from there would be expensive.
>
> Not really relevant here, but the kernel can also use MTE (HW_TAGS
> KASAN) - AFAIK there's no way of identifying if the MTE tag storage is
> used or not for kernel pages. But this only presents an issue for
> hibernation which uses a different mechanism to normal swap.
>
> Steve
>
> [1]
> https://lore.kernel.org/r/20231119165721.9849-1-alexandru.elisei%40arm.com

Thanks
Barry
Ryan Roberts Nov. 27, 2023, 11:56 a.m. UTC | #21
On 24/11/2023 18:14, Barry Song wrote:
> On Fri, Nov 24, 2023 at 10:55 PM Steven Price <steven.price@arm.com> wrote:
>>
>> On 24/11/2023 09:01, Ryan Roberts wrote:
>>> On 24/11/2023 08:55, David Hildenbrand wrote:
>>>> On 24.11.23 02:35, Barry Song wrote:
>>>>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>
>>>>>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>>>>>> On 17.11.23 19:41, Barry Song wrote:
>>>>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>
>>>>>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com>
>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>>>>>>>>> now.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>>>>>>>>> am I missing?
>>>>>>>>>>>>>
>>>>>>>>>>>>> Hi David,
>>>>>>>>>>>>> you missed the discussion here:
>>>>>>>>>>>>>
>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>>>>>
>>>>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
>>>>>>>>>>>> get a
>>>>>>>>>>>> large folio.
>>>>>>>>>>>>
>>>>>>>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>>>>>>>>> base pages right now" comment.
>>>>>>>>>>>>
>>>>>>>>>>>> What you actually wanted to say is "While we always swap in small
>>>>>>>>>>>> folios, we
>>>>>>>>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>>>>>
>>>>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>>>>>>>>> at make the interface page-free?
>>>>>>>>>>>>
>>>>>>>>>>>> Let me elaborate:
>>>>>>>>>>>>
>>>>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
>>>>>>>>>>>> entries/offset are
>>>>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>>>>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>>>>>>>>> subtracting from the offset.
>>>>>>>>>>>>
>>>>>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>>>>>
>>>>>>>>>>>> So the interface would be
>>>>>>>>>>>>
>>>>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>>>>>>
>>>>>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>>>>>
>>>>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>>>>>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>>>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
>>>>>>>>>>> faults)
>>>>>>>>>>> for this large folio.
>>>>>>>>>>>
>>>>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>>>>>>>>> might free swap entry as that swap has been in.
>>>>>>>>>>> do_swap_page()
>>>>>>>>>>> {
>>>>>>>>>>>       /*
>>>>>>>>>>>        * Remove the swap entry and conditionally try to free up the
>>>>>>>>>>> swapcache.
>>>>>>>>>>>        * We're already holding a reference on the page but haven't
>>>>>>>>>>> mapped it
>>>>>>>>>>>        * yet.
>>>>>>>>>>>        */
>>>>>>>>>>>        swap_free(entry);
>>>>>>>>>>> }
>>>>>>>>>>>
>>>>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
>>>>>>>>>>> access
>>>>>>>>>>> a freed tag.
>>>>>>>>>>
>>>>>>>>>> And David, one more information is that to keep the parameter of
>>>>>>>>>> arch_swap_restore() unchanged as folio,
>>>>>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>>>>>
>>>>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>>>>>> +{
>>>>>>>>>> + if (system_supports_mte()) {
>>>>>>>>>> +      /*
>>>>>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>>>>>> +       * large
>>>>>>>>>> +     */
>>>>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>>>>>> + mte_restore_tags(entry, page);
>>>>>>>>>> + }
>>>>>>>>>> +}
>>>>>>>>>>
>>>>>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I can relate :D
>>>>>>>>>
>>>>>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>>>>>> support restoring PTEs
>>>>>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>>>>>> large folios. This is
>>>>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>>>>>> folios series after a
>>>>>>>>>> while. till that is really done, it seems using page rather than folio
>>>>>>>>>> is a better choice.
>>>>>>>>>
>>>>>>>>> I think just restoring all tags and remembering for a large folio that
>>>>>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>>>>>> devil is in the detail :)
>>>>>>>>
>>>>>>>> Hi David,
>>>>>>>> thanks for all your suggestions though my feeling is this is too complex and
>>>>>>>> is not worth it for at least  three reasons.
>>>>>>>
>>>>>>> Fair enough.
>>>>>>>
>>>>>>>>
>>>>>>>> 1. In multi-thread and particularly multi-processes, we need some locks to
>>>>>>>> protect and help know if one process is the first one to restore tags and if
>>>>>>>> someone else is restoring tags when one process wants to restore. there
>>>>>>>> is not this kind of fine-grained lock at all.
>>>>>>>
>>>>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
>>>>>>> functions are called.
>>>>>>>
>>>>>>> So that might just work already -- unless I am missing something important.
>>>>>>
>>>>>> We already have a page flag that we use to mark the page as having had its mte
>>>>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
>>>>>> has been working to remove as many per-page flags as possible). Couldn't we just
>>>>>> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
>>>>>> repurpose that flag to be per-folio (so head page only)? It looks like the the
>>>>>> mte code already manages all the serialization requirements too. Then
>>>>>> arch_swap_restore() can just exit early if it sees the flag is already set on
>>>>>> the folio.
>>>>>>
>>>>>> One (probably nonsense) concern that just sprung to mind about having MTE work
>>>>>> with large folios in general; is it possible that user space could cause a large
>>>>>> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
>>>>>> MTE? In this case you would need to apply tags to part of the folio only.
>>>>>> Although I have a vague recollection that any MTE areas have to be marked at
>>>>>> mmap time and therefore this type of thing is impossible?
>>>>>
>>>>> right, we might need to consider only a part of folio needs to be
>>>>> mapped and restored MTE tags.
>>>>> do_swap_page() can have a chance to hit a large folio but it only
>>>>> needs to fault-in a page.
>>>>>
>>>>> A case can be quite simple as below,
>>>>>
>>>>> 1. anon folio shared by process A and B
>>>>> 2. add_to_swap() as a large folio;
>>>>> 3. try to unmap A and B;
>>>>> 4. after A is unmapped(ptes become swap entries), we do a
>>>>> MADV_DONTNEED on a part of the folio. this can
>>>>> happen very easily as userspace is still working in 4KB level;
>>>>> userspace heap management can free an
>>>>> basepage area by MADV_DONTNEED;
>>>>> madvise(address, MADV_DONTNEED, 4KB);
>>>>> 5. A refault on address + 8KB, we will hit large folio in
>>>>> do_swap_page() but we will only need to map
>>>>> one basepage, we will never need this DONTNEEDed in process A.
>>>>>
>>>>> another more complicated case can be mprotect and munmap a part of
>>>>> large folios. since userspace
>>>>> has no idea of large folios in their mind, they can do all strange
>>>>> things. are we sure in all cases,
>>>>> large folios have been splitted into small folios?
>>>
>>> I don;'t think these examples you cite are problematic. Although user space
>>> thinks about things in 4K pages, the kernel does things in units of folios. So a
>>> folio is either fully swapped out or not swapped out at all. MTE tags can be
>>> saved/restored per folio, even if only part of that folio ends up being mapped
>>> back into user space.
> 
> I am not so optimistic :-)
> 
> but zap_pte_range() due to DONTNEED on a part of swapped-out folio can
> free a part of swap
> entries? thus, free a part of MTE tags in a folio?
> after process's large folios are swapped out, all PTEs in a large
> folio become swap
> entries, but DONTNEED on a part of this area will only set a part of
> swap entries to
> PTE_NONE, thus decrease the swapcount of this part?
> 
> zap_pte_range
>     ->
>           entry = pte_to_swp_entry
>                   -> free_swap_and_cache(entry)
>                       -> mte tags invalidate

OK I see what you mean.

Just trying to summarize this, I think there are 2 questions behind all this:

1) Can we save/restore MTE tags at the granularity of a folio?

I think the answer is no; we can enable MTE on individual pages within a folio
with mprotect, and we can throw away tags on individual pages as you describe
above. So we have to continue to handle tags per-page.

2) Should we use `struct folio` or `struct page` in the MTE interface? And if
using folio, what is the semantic?

If using `struct folio` I don't like the original semantic you had where the
implementation had to work out which sub-page was the target of the operation
from the swap entry. So if we opt for folio, then I guess that implies we would
want to save/restore tags for "all contained pages which have MTE enabled". On
the restore path, the function will get called per-page, so we will end up doing
a lot of unnecessary looping and early-exiting, I think?

So for me, it feels cleaner if we stick with the `struct page` approach on the
restore path, as you have it in your v3. It makes the semantic clear and avoids
all the extra looping.
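
(For concreteness, the page-based restore path being argued for here is,
as far as I can tell from the v3 diffstat, essentially:

void arch_swap_restore(swp_entry_t entry, struct page *page)
{
	if (system_supports_mte())
		mte_restore_tags(entry, page);
}

i.e. each do_swap_page() fault restores tags only for the single base page
it is about to map - no folio-wide loop and no early-exit checks. Treat the
body above as a sketch rather than a quote of the patch.)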


> 
>>>
>>> The problem is that MTE tagging could be active only for a selection of pages
>>> within the folio; that's where it gets tricky.
>>>
>>>>
>>>> To handle that, we'd have to identify
>>>>
>>>> a) if a subpage has an mte tag to save during swapout
>>>> b) if a subpage has an mte tag to restore during swapin
>>>>
>>>> I suspect b) can be had from whatever datastructure we're using to actually save
>>>> the tags?
>>>>
>>>> For a), is there some way to have that information from the HW?
>>>
>>> Yes I agree with this approach. I don't know the answer to that question though;
>>> I'd assume it must be possible. Steven?
>>
>> Architecturally 'all' pages have MTE metadata (although see Alex's
>> series[1] which would change this).
>>
>> The question is: could user space have put any data in the tag storage?
>> We currently use the PG_mte_tagged page flag to keep track of pages
>> which have been mapped (to user space) with MTE enabled. If the page has
>> never been exposed to user space with MTE enabled (since being cleared)
>> then there's nothing of interest to store.
>>
>> It would be possible to reverse this scheme - we could drop the page
>> flag and just look at the actual tag storage. If it's all zeros then
>> obviously there's no point in storing it. Note that originally we had a
>> lazy clear of tag storage - i.e. if user space only had mapping without
>> MTE enabled then the tag storage could contain junk. I believe that's
>> now changed and the tag storage is always cleared at the same time as
>> the data storage.
>>
>> The VMAs (obviously) also carry the information about whether a range is
>> MTE-enabled (the VM_MTE flag controlled by PROT_MTE in user space), but
>> there could be many VMAs and they may have different permissions, so
>> fetching the state from there would be expensive.
>>
>> Not really relevant here, but the kernel can also use MTE (HW_TAGS
>> KASAN) - AFAIK there's no way of identifying if the MTE tag storage is
>> used or not for kernel pages. But this only presents an issue for
>> hibernation which uses a different mechanism to normal swap.
>>
>> Steve
>>
>> [1]
>> https://lore.kernel.org/r/20231119165721.9849-1-alexandru.elisei%40arm.com
> 
> Thanks
> Barry
David Hildenbrand Nov. 27, 2023, 12:01 p.m. UTC | #22
On 27.11.23 12:56, Ryan Roberts wrote:
> On 24/11/2023 18:14, Barry Song wrote:
>> On Fri, Nov 24, 2023 at 10:55 PM Steven Price <steven.price@arm.com> wrote:
>>>
>>> On 24/11/2023 09:01, Ryan Roberts wrote:
>>>> On 24/11/2023 08:55, David Hildenbrand wrote:
>>>>> On 24.11.23 02:35, Barry Song wrote:
>>>>>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>>
>>>>>>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>>>>>>> On 17.11.23 19:41, Barry Song wrote:
>>>>>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand <david@redhat.com> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand <david@redhat.com>
>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large folios,
>>>>>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as parameter
>>>>>>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather than
>>>>>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages right
>>>>>>>>>>>>>>>> now.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio, or what
>>>>>>>>>>>>>>> am I missing?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Hi David,
>>>>>>>>>>>>>> you missed the discussion here:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>>>>>>
>>>>>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
>>>>>>>>>>>>> get a
>>>>>>>>>>>>> large folio.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I was mislead by your "folio as swap-in always works at the granularity of
>>>>>>>>>>>>> base pages right now" comment.
>>>>>>>>>>>>>
>>>>>>>>>>>>> What you actually wanted to say is "While we always swap in small
>>>>>>>>>>>>> folios, we
>>>>>>>>>>>>> might refault large folios from the swapcache, and we only want to restore
>>>>>>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>>>>>>
>>>>>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing at once
>>>>>>>>>>>>> at make the interface page-free?
>>>>>>>>>>>>>
>>>>>>>>>>>>> Let me elaborate:
>>>>>>>>>>>>>
>>>>>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
>>>>>>>>>>>>> entries/offset are
>>>>>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio with a
>>>>>>>>>>>>> given swap offset, you can calculate the swap offset for page[0] simply by
>>>>>>>>>>>>> subtracting from the offset.
>>>>>>>>>>>>>
>>>>>>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> So you can simply pass the large folio and the swap entry corresponding
>>>>>>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>>>>>>
>>>>>>>>>>>>> So the interface would be
>>>>>>>>>>>>>
>>>>>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>>>>>>>
>>>>>>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>>>>>>
>>>>>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is a good
>>>>>>>>>>>> option. in case we have a large folio with 16 basepages, as do_swap_page
>>>>>>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of page
>>>>>>>>>>>> faults)
>>>>>>>>>>>> for this large folio.
>>>>>>>>>>>>
>>>>>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large folio
>>>>>>>>>>>> might free swap entry as that swap has been in.
>>>>>>>>>>>> do_swap_page()
>>>>>>>>>>>> {
>>>>>>>>>>>>        /*
>>>>>>>>>>>>         * Remove the swap entry and conditionally try to free up the
>>>>>>>>>>>> swapcache.
>>>>>>>>>>>>         * We're already holding a reference on the page but haven't
>>>>>>>>>>>> mapped it
>>>>>>>>>>>>         * yet.
>>>>>>>>>>>>         */
>>>>>>>>>>>>         swap_free(entry);
>>>>>>>>>>>> }
>>>>>>>>>>>>
>>>>>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you might
>>>>>>>>>>>> access
>>>>>>>>>>>> a freed tag.
>>>>>>>>>>>
>>>>>>>>>>> And David, one more information is that to keep the parameter of
>>>>>>>>>>> arch_swap_restore() unchanged as folio,
>>>>>>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>>>>>>
>>>>>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>>>>>>> +{
>>>>>>>>>>> + if (system_supports_mte()) {
>>>>>>>>>>> +      /*
>>>>>>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>>>>>>> +       * large
>>>>>>>>>>> +     */
>>>>>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>>>>>>> + mte_restore_tags(entry, page);
>>>>>>>>>>> + }
>>>>>>>>>>> +}
>>>>>>>>>>>
>>>>>>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I can relate :D
>>>>>>>>>>
>>>>>>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>>>>>>> support restoring PTEs
>>>>>>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>>>>>>> large folios. This is
>>>>>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>>>>>>> folios series after a
>>>>>>>>>>> while. till that is really done, it seems using page rather than folio
>>>>>>>>>>> is a better choice.
>>>>>>>>>>
>>>>>>>>>> I think just restoring all tags and remembering for a large folio that
>>>>>>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>>>>>>> devil is in the detail :)
>>>>>>>>>
>>>>>>>>> Hi David,
>>>>>>>>> thanks for all your suggestions though my feeling is this is too complex and
>>>>>>>>> is not worth it for at least  three reasons.
>>>>>>>>
>>>>>>>> Fair enough.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> 1. In multi-thread and particularly multi-processes, we need some locks to
>>>>>>>>> protect and help know if one process is the first one to restore tags and if
>>>>>>>>> someone else is restoring tags when one process wants to restore. there
>>>>>>>>> is not this kind of fine-grained lock at all.
>>>>>>>>
>>>>>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
>>>>>>>> functions are called.
>>>>>>>>
>>>>>>>> So that might just work already -- unless I am missing something important.
>>>>>>>
>>>>>>> We already have a page flag that we use to mark the page as having had its mte
>>>>>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC, Matthew
>>>>>>> has been working to remove as many per-page flags as possible). Couldn't we just
>>>>>>> make arch_swap_restore() take a folio, restore the tags for *all* the pages and
>>>>>>> repurpose that flag to be per-folio (so head page only)? It looks like the the
>>>>>>> mte code already manages all the serialization requirements too. Then
>>>>>>> arch_swap_restore() can just exit early if it sees the flag is already set on
>>>>>>> the folio.
>>>>>>>
>>>>>>> One (probably nonsense) concern that just sprung to mind about having MTE work
>>>>>>> with large folios in general; is it possible that user space could cause a large
>>>>>>> anon folio to be allocated (THP), then later mark *part* of it to be tagged with
>>>>>>> MTE? In this case you would need to apply tags to part of the folio only.
>>>>>>> Although I have a vague recollection that any MTE areas have to be marked at
>>>>>>> mmap time and therefore this type of thing is impossible?
>>>>>>
>>>>>> right, we might need to consider only a part of folio needs to be
>>>>>> mapped and restored MTE tags.
>>>>>> do_swap_page() can have a chance to hit a large folio but it only
>>>>>> needs to fault-in a page.
>>>>>>
>>>>>> A case can be quite simple as below,
>>>>>>
>>>>>> 1. anon folio shared by process A and B
>>>>>> 2. add_to_swap() as a large folio;
>>>>>> 3. try to unmap A and B;
>>>>>> 4. after A is unmapped(ptes become swap entries), we do a
>>>>>> MADV_DONTNEED on a part of the folio. this can
>>>>>> happen very easily as userspace is still working in 4KB level;
>>>>>> userspace heap management can free an
>>>>>> basepage area by MADV_DONTNEED;
>>>>>> madvise(address, MADV_DONTNEED, 4KB);
>>>>>> 5. A refault on address + 8KB, we will hit large folio in
>>>>>> do_swap_page() but we will only need to map
>>>>>> one basepage, we will never need this DONTNEEDed in process A.
>>>>>>
>>>>>> another more complicated case can be mprotect and munmap a part of
>>>>>> large folios. since userspace
>>>>>> has no idea of large folios in their mind, they can do all strange
>>>>>> things. are we sure in all cases,
>>>>>> large folios have been splitted into small folios?
>>>>
>>>> I don;'t think these examples you cite are problematic. Although user space
>>>> thinks about things in 4K pages, the kernel does things in units of folios. So a
>>>> folio is either fully swapped out or not swapped out at all. MTE tags can be
>>>> saved/restored per folio, even if only part of that folio ends up being mapped
>>>> back into user space.
>>
>> I am not so optimistic :-)
>>
>> but zap_pte_range() due to DONTNEED on a part of swapped-out folio can
>> free a part of swap
>> entries? thus, free a part of MTE tags in a folio?
>> after process's large folios are swapped out, all PTEs in a large
>> folio become swap
>> entries, but DONTNEED on a part of this area will only set a part of
>> swap entries to
>> PTE_NONE, thus decrease the swapcount of this part?
>>
>> zap_pte_range
>>      ->
>>            entry = pte_to_swp_entry
>>                    -> free_swap_and_cache(entry)
>>                        -> mte tags invalidate
> 
> OK I see what you mean.
> 
> Just trying to summarize this, I think there are 2 questions behind all this:
> 
> 1) Can we save/restore MTE tags on at the granularity of a folio?
> 
> I think the answer is no; we can enable MTE on a individual pages within a folio
> with mprotect, and we can throw away tags on individual pages as you describe
> above. So we have to continue to handle tags per-page.

Can you enlighten me why the scheme proposed by Steven doesn't work?

I mean, having a mixture of tagged vs. untagged is assumed to be the 
corner case, right?
Ryan Roberts Nov. 27, 2023, 12:14 p.m. UTC | #23
On 27/11/2023 12:01, David Hildenbrand wrote:
> On 27.11.23 12:56, Ryan Roberts wrote:
>> On 24/11/2023 18:14, Barry Song wrote:
>>> On Fri, Nov 24, 2023 at 10:55 PM Steven Price <steven.price@arm.com> wrote:
>>>>
>>>> On 24/11/2023 09:01, Ryan Roberts wrote:
>>>>> On 24/11/2023 08:55, David Hildenbrand wrote:
>>>>>> On 24.11.23 02:35, Barry Song wrote:
>>>>>>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>>>
>>>>>>>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>>>>>>>> On 17.11.23 19:41, Barry Song wrote:
>>>>>>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com>
>>>>>>>>>> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand
>>>>>>>>>>>>> <david@redhat.com> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand
>>>>>>>>>>>>>>> <david@redhat.com>
>>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large
>>>>>>>>>>>>>>>>> folios,
>>>>>>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as
>>>>>>>>>>>>>>>>> parameter
>>>>>>>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather
>>>>>>>>>>>>>>>>> than
>>>>>>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages
>>>>>>>>>>>>>>>>> right
>>>>>>>>>>>>>>>>> now.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio,
>>>>>>>>>>>>>>>> or what
>>>>>>>>>>>>>>>> am I missing?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Hi David,
>>>>>>>>>>>>>>> you missed the discussion here:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
>>>>>>>>>>>>>> get a
>>>>>>>>>>>>>> large folio.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I was mislead by your "folio as swap-in always works at the
>>>>>>>>>>>>>> granularity of
>>>>>>>>>>>>>> base pages right now" comment.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> What you actually wanted to say is "While we always swap in small
>>>>>>>>>>>>>> folios, we
>>>>>>>>>>>>>> might refault large folios from the swapcache, and we only want to
>>>>>>>>>>>>>> restore
>>>>>>>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing
>>>>>>>>>>>>>> at once
>>>>>>>>>>>>>> at make the interface page-free?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Let me elaborate:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
>>>>>>>>>>>>>> entries/offset are
>>>>>>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio
>>>>>>>>>>>>>> with a
>>>>>>>>>>>>>> given swap offset, you can calculate the swap offset for page[0]
>>>>>>>>>>>>>> simply by
>>>>>>>>>>>>>> subtracting from the offset.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> So you can simply pass the large folio and the swap entry
>>>>>>>>>>>>>> corresponding
>>>>>>>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> So the interface would be
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>>>>>>>
>>>>>>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is
>>>>>>>>>>>>> a good
>>>>>>>>>>>>> option. in case we have a large folio with 16 basepages, as
>>>>>>>>>>>>> do_swap_page
>>>>>>>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of
>>>>>>>>>>>>> page
>>>>>>>>>>>>> faults)
>>>>>>>>>>>>> for this large folio.
>>>>>>>>>>>>>
>>>>>>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large
>>>>>>>>>>>>> folio
>>>>>>>>>>>>> might free swap entry as that swap has been in.
>>>>>>>>>>>>> do_swap_page()
>>>>>>>>>>>>> {
>>>>>>>>>>>>>        /*
>>>>>>>>>>>>>         * Remove the swap entry and conditionally try to free up the
>>>>>>>>>>>>> swapcache.
>>>>>>>>>>>>>         * We're already holding a reference on the page but haven't
>>>>>>>>>>>>> mapped it
>>>>>>>>>>>>>         * yet.
>>>>>>>>>>>>>         */
>>>>>>>>>>>>>         swap_free(entry);
>>>>>>>>>>>>> }
>>>>>>>>>>>>>
>>>>>>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you
>>>>>>>>>>>>> might
>>>>>>>>>>>>> access
>>>>>>>>>>>>> a freed tag.
>>>>>>>>>>>>
>>>>>>>>>>>> And David, one more information is that to keep the parameter of
>>>>>>>>>>>> arch_swap_restore() unchanged as folio,
>>>>>>>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>>>>>>>
>>>>>>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>>>>>>>> +{
>>>>>>>>>>>> + if (system_supports_mte()) {
>>>>>>>>>>>> +      /*
>>>>>>>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>>>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>>>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>>>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>>>>>>>> +       * large
>>>>>>>>>>>> +     */
>>>>>>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>>>>>>>> + mte_restore_tags(entry, page);
>>>>>>>>>>>> + }
>>>>>>>>>>>> +}
>>>>>>>>>>>>
>>>>>>>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I can relate :D
>>>>>>>>>>>
>>>>>>>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>>>>>>>> support restoring PTEs
>>>>>>>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>>>>>>>> large folios. This is
>>>>>>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>>>>>>>> folios series after a
>>>>>>>>>>>> while. till that is really done, it seems using page rather than folio
>>>>>>>>>>>> is a better choice.
>>>>>>>>>>>
>>>>>>>>>>> I think just restoring all tags and remembering for a large folio that
>>>>>>>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>>>>>>>> devil is in the detail :)
>>>>>>>>>>
>>>>>>>>>> Hi David,
>>>>>>>>>> thanks for all your suggestions though my feeling is this is too
>>>>>>>>>> complex and
>>>>>>>>>> is not worth it for at least  three reasons.
>>>>>>>>>
>>>>>>>>> Fair enough.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> 1. In multi-thread and particularly multi-processes, we need some
>>>>>>>>>> locks to
>>>>>>>>>> protect and help know if one process is the first one to restore tags
>>>>>>>>>> and if
>>>>>>>>>> someone else is restoring tags when one process wants to restore. there
>>>>>>>>>> is not this kind of fine-grained lock at all.
>>>>>>>>>
>>>>>>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
>>>>>>>>> functions are called.
>>>>>>>>>
>>>>>>>>> So that might just work already -- unless I am missing something
>>>>>>>>> important.
>>>>>>>>
>>>>>>>> We already have a page flag that we use to mark the page as having had
>>>>>>>> its mte
>>>>>>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC,
>>>>>>>> Matthew
>>>>>>>> has been working to remove as many per-page flags as possible). Couldn't
>>>>>>>> we just
>>>>>>>> make arch_swap_restore() take a folio, restore the tags for *all* the
>>>>>>>> pages and
>>>>>>>> repurpose that flag to be per-folio (so head page only)? It looks like
>>>>>>>> the the
>>>>>>>> mte code already manages all the serialization requirements too. Then
>>>>>>>> arch_swap_restore() can just exit early if it sees the flag is already
>>>>>>>> set on
>>>>>>>> the folio.
>>>>>>>>
>>>>>>>> One (probably nonsense) concern that just sprung to mind about having
>>>>>>>> MTE work
>>>>>>>> with large folios in general; is it possible that user space could cause
>>>>>>>> a large
>>>>>>>> anon folio to be allocated (THP), then later mark *part* of it to be
>>>>>>>> tagged with
>>>>>>>> MTE? In this case you would need to apply tags to part of the folio only.
>>>>>>>> Although I have a vague recollection that any MTE areas have to be
>>>>>>>> marked at
>>>>>>>> mmap time and therefore this type of thing is impossible?
>>>>>>>
>>>>>>> right, we might need to consider only a part of folio needs to be
>>>>>>> mapped and restored MTE tags.
>>>>>>> do_swap_page() can have a chance to hit a large folio but it only
>>>>>>> needs to fault-in a page.
>>>>>>>
>>>>>>> A case can be quite simple as below,
>>>>>>>
>>>>>>> 1. anon folio shared by process A and B
>>>>>>> 2. add_to_swap() as a large folio;
>>>>>>> 3. try to unmap A and B;
>>>>>>> 4. after A is unmapped(ptes become swap entries), we do a
>>>>>>> MADV_DONTNEED on a part of the folio. this can
>>>>>>> happen very easily as userspace is still working in 4KB level;
>>>>>>> userspace heap management can free an
>>>>>>> basepage area by MADV_DONTNEED;
>>>>>>> madvise(address, MADV_DONTNEED, 4KB);
>>>>>>> 5. A refault on address + 8KB, we will hit large folio in
>>>>>>> do_swap_page() but we will only need to map
>>>>>>> one basepage, we will never need this DONTNEEDed in process A.
>>>>>>>
>>>>>>> another more complicated case can be mprotect and munmap a part of
>>>>>>> large folios. since userspace
>>>>>>> has no idea of large folios in their mind, they can do all strange
>>>>>>> things. are we sure in all cases,
>>>>>>> large folios have been splitted into small folios?
>>>>>
>>>>> I don;'t think these examples you cite are problematic. Although user space
>>>>> thinks about things in 4K pages, the kernel does things in units of folios.
>>>>> So a
>>>>> folio is either fully swapped out or not swapped out at all. MTE tags can be
>>>>> saved/restored per folio, even if only part of that folio ends up being mapped
>>>>> back into user space.
>>>
>>> I am not so optimistic :-)
>>>
>>> but zap_pte_range() due to DONTNEED on a part of swapped-out folio can
>>> free a part of swap
>>> entries? thus, free a part of MTE tags in a folio?
>>> after process's large folios are swapped out, all PTEs in a large
>>> folio become swap
>>> entries, but DONTNEED on a part of this area will only set a part of
>>> swap entries to
>>> PTE_NONE, thus decrease the swapcount of this part?
>>>
>>> zap_pte_range
>>>      ->
>>>            entry = pte_to_swp_entry
>>>                    -> free_swap_and_cache(entry)
>>>                        -> mte tags invalidate
>>
>> OK I see what you mean.
>>
>> Just trying to summarize this, I think there are 2 questions behind all this:
>>
>> 1) Can we save/restore MTE tags on at the granularity of a folio?
>>
>> I think the answer is no; we can enable MTE on a individual pages within a folio
>> with mprotect, and we can throw away tags on individual pages as you describe
>> above. So we have to continue to handle tags per-page.
> 
> Can you enlighten me why the scheme proposed by Steven doesn't work?

Are you referring to Steven's suggestion of reading the tag to see if it's
zeros? I think that demonstrates my point that this has to be done per-page and
not per-folio? I'm also not sure what it buys us - instead of reading a per-page
flag we now have to read 128 bytes of tag for each page and check it's all zero.
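(For reference: MTE keeps one 4-bit tag per 16-byte granule, so a 4KiB page
carries 4096/16 = 256 tags, i.e. 128 bytes of tag data per page.)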

> 
> I mean, having a mixture of tagged vs. untagged is assumed to be the corner
> case, right?

Yes. But I'm not sure how we exploit that; I guess we could have a per-folio
flag; when set it means the whole folio is tagged and when clear it means fall
back to checking the per-page flag?
David Hildenbrand Nov. 27, 2023, 2:16 p.m. UTC | #24
On 27.11.23 13:14, Ryan Roberts wrote:
> On 27/11/2023 12:01, David Hildenbrand wrote:
>> On 27.11.23 12:56, Ryan Roberts wrote:
>>> On 24/11/2023 18:14, Barry Song wrote:
>>>> On Fri, Nov 24, 2023 at 10:55 PM Steven Price <steven.price@arm.com> wrote:
>>>>>
>>>>> On 24/11/2023 09:01, Ryan Roberts wrote:
>>>>>> On 24/11/2023 08:55, David Hildenbrand wrote:
>>>>>>> On 24.11.23 02:35, Barry Song wrote:
>>>>>>>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>>>>>>
>>>>>>>>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>>>>>>>>> On 17.11.23 19:41, Barry Song wrote:
>>>>>>>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com>
>>>>>>>>>>> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>>>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand
>>>>>>>>>>>>>> <david@redhat.com> wrote:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand
>>>>>>>>>>>>>>>> <david@redhat.com>
>>>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large
>>>>>>>>>>>>>>>>>> folios,
>>>>>>>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
>>>>>>>>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as
>>>>>>>>>>>>>>>>>> parameter
>>>>>>>>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather
>>>>>>>>>>>>>>>>>> than
>>>>>>>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages
>>>>>>>>>>>>>>>>>> right
>>>>>>>>>>>>>>>>>> now.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio,
>>>>>>>>>>>>>>>>> or what
>>>>>>>>>>>>>>>>> am I missing?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Hi David,
>>>>>>>>>>>>>>>> you missed the discussion here:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
>>>>>>>>>>>>>>> get a
>>>>>>>>>>>>>>> large folio.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I was mislead by your "folio as swap-in always works at the
>>>>>>>>>>>>>>> granularity of
>>>>>>>>>>>>>>> base pages right now" comment.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> What you actually wanted to say is "While we always swap in small
>>>>>>>>>>>>>>> folios, we
>>>>>>>>>>>>>>> might refault large folios from the swapcache, and we only want to
>>>>>>>>>>>>>>> restore
>>>>>>>>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing
>>>>>>>>>>>>>>> at once
>>>>>>>>>>>>>>> at make the interface page-free?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Let me elaborate:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
>>>>>>>>>>>>>>> entries/offset are
>>>>>>>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio
>>>>>>>>>>>>>>> with a
>>>>>>>>>>>>>>> given swap offset, you can calculate the swap offset for page[0]
>>>>>>>>>>>>>>> simply by
>>>>>>>>>>>>>>> subtracting from the offset.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> So you can simply pass the large folio and the swap entry
>>>>>>>>>>>>>>> corresponding
>>>>>>>>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> So the interface would be
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is
>>>>>>>>>>>>>> a good
>>>>>>>>>>>>>> option. in case we have a large folio with 16 basepages, as
>>>>>>>>>>>>>> do_swap_page
>>>>>>>>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of
>>>>>>>>>>>>>> page
>>>>>>>>>>>>>> faults)
>>>>>>>>>>>>>> for this large folio.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large
>>>>>>>>>>>>>> folio
>>>>>>>>>>>>>> might free swap entry as that swap has been in.
>>>>>>>>>>>>>> do_swap_page()
>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>         /*
>>>>>>>>>>>>>>          * Remove the swap entry and conditionally try to free up the
>>>>>>>>>>>>>> swapcache.
>>>>>>>>>>>>>>          * We're already holding a reference on the page but haven't
>>>>>>>>>>>>>> mapped it
>>>>>>>>>>>>>>          * yet.
>>>>>>>>>>>>>>          */
>>>>>>>>>>>>>>          swap_free(entry);
>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you
>>>>>>>>>>>>>> might
>>>>>>>>>>>>>> access
>>>>>>>>>>>>>> a freed tag.
>>>>>>>>>>>>>
>>>>>>>>>>>>> And David, one more information is that to keep the parameter of
>>>>>>>>>>>>> arch_swap_restore() unchanged as folio,
>>>>>>>>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>>>>>>>>
>>>>>>>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> + if (system_supports_mte()) {
>>>>>>>>>>>>> +      /*
>>>>>>>>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>>>>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>>>>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>>>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>>>>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>>>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>>>>>>>>> +       * large
>>>>>>>>>>>>> +     */
>>>>>>>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>>>>>>>>> + mte_restore_tags(entry, page);
>>>>>>>>>>>>> + }
>>>>>>>>>>>>> +}
>>>>>>>>>>>>>
>>>>>>>>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I can relate :D
>>>>>>>>>>>>
>>>>>>>>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>>>>>>>>> support restoring PTEs
>>>>>>>>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>>>>>>>>> large folios. This is
>>>>>>>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>>>>>>>>> folios series after a
>>>>>>>>>>>>> while. till that is really done, it seems using page rather than folio
>>>>>>>>>>>>> is a better choice.
>>>>>>>>>>>>
>>>>>>>>>>>> I think just restoring all tags and remembering for a large folio that
>>>>>>>>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>>>>>>>>> devil is in the detail :)
>>>>>>>>>>>
>>>>>>>>>>> Hi David,
>>>>>>>>>>> thanks for all your suggestions though my feeling is this is too
>>>>>>>>>>> complex and
>>>>>>>>>>> is not worth it for at least  three reasons.
>>>>>>>>>>
>>>>>>>>>> Fair enough.
>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> 1. In multi-thread and particularly multi-processes, we need some
>>>>>>>>>>> locks to
>>>>>>>>>>> protect and help know if one process is the first one to restore tags
>>>>>>>>>>> and if
>>>>>>>>>>> someone else is restoring tags when one process wants to restore. there
>>>>>>>>>>> is not this kind of fine-grained lock at all.
>>>>>>>>>>
>>>>>>>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
>>>>>>>>>> functions are called.
>>>>>>>>>>
>>>>>>>>>> So that might just work already -- unless I am missing something
>>>>>>>>>> important.
>>>>>>>>>
>>>>>>>>> We already have a page flag that we use to mark the page as having had
>>>>>>>>> its mte
>>>>>>>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC,
>>>>>>>>> Matthew
>>>>>>>>> has been working to remove as many per-page flags as possible). Couldn't
>>>>>>>>> we just
>>>>>>>>> make arch_swap_restore() take a folio, restore the tags for *all* the
>>>>>>>>> pages and
>>>>>>>>> repurpose that flag to be per-folio (so head page only)? It looks like
>>>>>>>>> the the
>>>>>>>>> mte code already manages all the serialization requirements too. Then
>>>>>>>>> arch_swap_restore() can just exit early if it sees the flag is already
>>>>>>>>> set on
>>>>>>>>> the folio.
>>>>>>>>>
>>>>>>>>> One (probably nonsense) concern that just sprung to mind about having
>>>>>>>>> MTE work
>>>>>>>>> with large folios in general; is it possible that user space could cause
>>>>>>>>> a large
>>>>>>>>> anon folio to be allocated (THP), then later mark *part* of it to be
>>>>>>>>> tagged with
>>>>>>>>> MTE? In this case you would need to apply tags to part of the folio only.
>>>>>>>>> Although I have a vague recollection that any MTE areas have to be
>>>>>>>>> marked at
>>>>>>>>> mmap time and therefore this type of thing is impossible?
>>>>>>>>
>>>>>>>> right, we might need to consider only a part of folio needs to be
>>>>>>>> mapped and restored MTE tags.
>>>>>>>> do_swap_page() can have a chance to hit a large folio but it only
>>>>>>>> needs to fault-in a page.
>>>>>>>>
>>>>>>>> A case can be quite simple as below,
>>>>>>>>
>>>>>>>> 1. anon folio shared by process A and B
>>>>>>>> 2. add_to_swap() as a large folio;
>>>>>>>> 3. try to unmap A and B;
>>>>>>>> 4. after A is unmapped(ptes become swap entries), we do a
>>>>>>>> MADV_DONTNEED on a part of the folio. this can
>>>>>>>> happen very easily as userspace is still working in 4KB level;
>>>>>>>> userspace heap management can free an
>>>>>>>> basepage area by MADV_DONTNEED;
>>>>>>>> madvise(address, MADV_DONTNEED, 4KB);
>>>>>>>> 5. A refault on address + 8KB, we will hit large folio in
>>>>>>>> do_swap_page() but we will only need to map
>>>>>>>> one basepage, we will never need this DONTNEEDed in process A.
>>>>>>>>
>>>>>>>> another more complicated case can be mprotect and munmap a part of
>>>>>>>> large folios. since userspace
>>>>>>>> has no idea of large folios in their mind, they can do all strange
>>>>>>>> things. are we sure in all cases,
>>>>>>>> large folios have been splitted into small folios?
>>>>>>
>>>>>> I don;'t think these examples you cite are problematic. Although user space
>>>>>> thinks about things in 4K pages, the kernel does things in units of folios.
>>>>>> So a
>>>>>> folio is either fully swapped out or not swapped out at all. MTE tags can be
>>>>>> saved/restored per folio, even if only part of that folio ends up being mapped
>>>>>> back into user space.
>>>>
>>>> I am not so optimistic :-)
>>>>
>>>> but zap_pte_range() due to DONTNEED on a part of swapped-out folio can
>>>> free a part of swap
>>>> entries? thus, free a part of MTE tags in a folio?
>>>> after process's large folios are swapped out, all PTEs in a large
>>>> folio become swap
>>>> entries, but DONTNEED on a part of this area will only set a part of
>>>> swap entries to
>>>> PTE_NONE, thus decrease the swapcount of this part?
>>>>
>>>> zap_pte_range
>>>>       ->
>>>>             entry = pte_to_swp_entry
>>>>                     -> free_swap_and_cache(entry)
>>>>                         -> mte tags invalidate
>>>
>>> OK I see what you mean.
>>>
>>> Just trying to summarize this, I think there are 2 questions behind all this:
>>>
>>> 1) Can we save/restore MTE tags on at the granularity of a folio?
>>>
>>> I think the answer is no; we can enable MTE on a individual pages within a folio
>>> with mprotect, and we can throw away tags on individual pages as you describe
>>> above. So we have to continue to handle tags per-page.
>>
>> Can you enlighten me why the scheme proposed by Steven doesn't work?
> 
> Are you referring to Steven's suggestion of reading the tag to see if it's
> zeros? I think that demonstrates my point that this has to be done per-page and

Yes.

> not per-folio? I'm also not sure what it buys us - instead of reading a per-page
> flag we now have to read 128 bytes of tag for each page and check its zero.

My point is, if that is the corner case, we might not care about that.

> 
>>
>> I mean, having a mixture of tagged vs. untagged is assumed to be the corner
>> case, right?
> 
> Yes. But I'm not sure how we exploit that; I guess we could have a per-folio
> flag; when set it means the whole folio is tagged and when clear it means fall
> back to checking the per-page flag?

Let me think this through. We have the following states:

1) Any subpage is possibly logically tagged and all relevant tags are
    applied/restored.
2) Any subpage is possibly logically tagged, and all relevant tags are
    not applied but stored in the datastructure.
3) No subpage is logically tagged.

We can identify the subpages in 1) by reading the tags from HW, and in 2)
by checking the datastructure. For 3), there is nothing to check.

On swapout of a large folio:

* For 3) we don't do anything
* For 2) we don't do anything
* For 1) we store all tags that are non-zero (reading all tags) and
   transition to 2).

On swapin of a large folio

A) Old folio (swapcache)

If already in 1) or 3), there is nothing to do.

If in 2), restore all tags that are in the datastructure and move to 1).

B) Fresh folio (we lost any MTE marker)

Currently always order-0, so nothing to do. We'd have to check the
datastructure for any tags belonging to the folio and set the state accordingly.


Of course, that means that on swapout, you read all tags. But if the 
common case is that all subpages have tags, we don't really care.

I'm sure I made a mistake somewhere, but where? :)
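
For what it's worth, here is a purely illustrative sketch of the swapout half
of that scheme. mte_folio_state()/mte_folio_set_state() are hypothetical
per-folio state helpers (no such API exists today); mte_save_tags() is the
existing arm64 helper, and error unwinding is omitted for brevity:

/* hypothetical states, mirroring 1) / 2) / 3) above */
enum mte_folio_state {
        MTE_FOLIO_TAGS_APPLIED,         /* 1) */
        MTE_FOLIO_TAGS_SAVED,           /* 2) */
        MTE_FOLIO_UNTAGGED,             /* 3) */
};

static int mte_folio_prepare_to_swap(struct folio *folio)
{
        long i, nr = folio_nr_pages(folio);

        /* 2) and 3): nothing to do on swapout */
        if (mte_folio_state(folio) != MTE_FOLIO_TAGS_APPLIED)
                return 0;

        /* 1): save the tags of every subpage, then transition to 2) */
        for (i = 0; i < nr; i++) {
                int err = mte_save_tags(folio_page(folio, i));

                if (err)
                        return err;
        }
        mte_folio_set_state(folio, MTE_FOLIO_TAGS_SAVED);
        return 0;
}

The swapin-from-swapcache side would be the mirror image: if the folio is in
2), restore every subpage's tags and move back to 1); for 1) and 3) it is a
no-op.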
Ryan Roberts Nov. 27, 2023, 2:52 p.m. UTC | #25
On 27/11/2023 14:16, David Hildenbrand wrote:
> On 27.11.23 13:14, Ryan Roberts wrote:
>> On 27/11/2023 12:01, David Hildenbrand wrote:
>>> On 27.11.23 12:56, Ryan Roberts wrote:
>>>> On 24/11/2023 18:14, Barry Song wrote:
>>>>> On Fri, Nov 24, 2023 at 10:55 PM Steven Price <steven.price@arm.com> wrote:
>>>>>>
>>>>>> On 24/11/2023 09:01, Ryan Roberts wrote:
>>>>>>> On 24/11/2023 08:55, David Hildenbrand wrote:
>>>>>>>> On 24.11.23 02:35, Barry Song wrote:
>>>>>>>>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com>
>>>>>>>>> wrote:
>>>>>>>>>>
>>>>>>>>>> On 20/11/2023 09:11, David Hildenbrand wrote:
>>>>>>>>>>> On 17.11.23 19:41, Barry Song wrote:
>>>>>>>>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com>
>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> On 17.11.23 01:15, Barry Song wrote:
>>>>>>>>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand
>>>>>>>>>>>>>>> <david@redhat.com> wrote:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
>>>>>>>>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand
>>>>>>>>>>>>>>>>> <david@redhat.com>
>>>>>>>>>>>>>>>>> wrote:
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
>>>>>>>>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large
>>>>>>>>>>>>>>>>>>> folios,
>>>>>>>>>>>>>>>>>>> then we don't need to split them into base pages for swapping
>>>>>>>>>>>>>>>>>>> out
>>>>>>>>>>>>>>>>>>> on ARM64 SoCs with MTE.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as
>>>>>>>>>>>>>>>>>>> parameter
>>>>>>>>>>>>>>>>>>> because we support THP swap-out as a whole.
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather
>>>>>>>>>>>>>>>>>>> than
>>>>>>>>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages
>>>>>>>>>>>>>>>>>>> right
>>>>>>>>>>>>>>>>>>> now.
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio,
>>>>>>>>>>>>>>>>>> or what
>>>>>>>>>>>>>>>>>> am I missing?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Hi David,
>>>>>>>>>>>>>>>>> you missed the discussion here:
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
>>>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case
>>>>>>>>>>>>>>>> where you
>>>>>>>>>>>>>>>> get a
>>>>>>>>>>>>>>>> large folio.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I was mislead by your "folio as swap-in always works at the
>>>>>>>>>>>>>>>> granularity of
>>>>>>>>>>>>>>>> base pages right now" comment.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> What you actually wanted to say is "While we always swap in small
>>>>>>>>>>>>>>>> folios, we
>>>>>>>>>>>>>>>> might refault large folios from the swapcache, and we only want to
>>>>>>>>>>>>>>>> restore
>>>>>>>>>>>>>>>> the tags for the page of the large folio we are faulting on."
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing
>>>>>>>>>>>>>>>> at once
>>>>>>>>>>>>>>>> at make the interface page-free?
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Let me elaborate:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
>>>>>>>>>>>>>>>> entries/offset are
>>>>>>>>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio
>>>>>>>>>>>>>>>> with a
>>>>>>>>>>>>>>>> given swap offset, you can calculate the swap offset for page[0]
>>>>>>>>>>>>>>>> simply by
>>>>>>>>>>>>>>>> subtracting from the offset.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> See page_swap_entry() on how we perform this calculation.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> So you can simply pass the large folio and the swap entry
>>>>>>>>>>>>>>>> corresponding
>>>>>>>>>>>>>>>> to the first page of the large folio, and restore all tags at once.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> So the interface would be
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
>>>>>>>>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t
>>>>>>>>>>>>>>>> start_entry);
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> I'm sorry if that was also already discussed.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is
>>>>>>>>>>>>>>> a good
>>>>>>>>>>>>>>> option. in case we have a large folio with 16 basepages, as
>>>>>>>>>>>>>>> do_swap_page
>>>>>>>>>>>>>>> can only map one base page for each page fault, that means we have
>>>>>>>>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of
>>>>>>>>>>>>>>> page
>>>>>>>>>>>>>>> faults)
>>>>>>>>>>>>>>> for this large folio.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large
>>>>>>>>>>>>>>> folio
>>>>>>>>>>>>>>> might free swap entry as that swap has been in.
>>>>>>>>>>>>>>> do_swap_page()
>>>>>>>>>>>>>>> {
>>>>>>>>>>>>>>>         /*
>>>>>>>>>>>>>>>          * Remove the swap entry and conditionally try to free up
>>>>>>>>>>>>>>> the
>>>>>>>>>>>>>>> swapcache.
>>>>>>>>>>>>>>>          * We're already holding a reference on the page but haven't
>>>>>>>>>>>>>>> mapped it
>>>>>>>>>>>>>>>          * yet.
>>>>>>>>>>>>>>>          */
>>>>>>>>>>>>>>>          swap_free(entry);
>>>>>>>>>>>>>>> }
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you
>>>>>>>>>>>>>>> might
>>>>>>>>>>>>>>> access
>>>>>>>>>>>>>>> a freed tag.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> And David, one more information is that to keep the parameter of
>>>>>>>>>>>>>> arch_swap_restore() unchanged as folio,
>>>>>>>>>>>>>> i actually tried an ugly approach in rfc v2:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> + if (system_supports_mte()) {
>>>>>>>>>>>>>> +      /*
>>>>>>>>>>>>>> +       * We don't support large folios swap in as whole yet, but
>>>>>>>>>>>>>> +       * we can hit a large folio which is still in swapcache
>>>>>>>>>>>>>> +       * after those related processes' PTEs have been unmapped
>>>>>>>>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
>>>>>>>>>>>>>> +       * we need to find the exact page which "entry" is mapping
>>>>>>>>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
>>>>>>>>>>>>>> +       * large
>>>>>>>>>>>>>> +     */
>>>>>>>>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
>>>>>>>>>>>>>> + mte_restore_tags(entry, page);
>>>>>>>>>>>>>> + }
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> And obviously everybody in the discussion hated it :-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> I can relate :D
>>>>>>>>>>>>>
>>>>>>>>>>>>>> i feel the only way to keep API unchanged using folio is that we
>>>>>>>>>>>>>> support restoring PTEs
>>>>>>>>>>>>>> all together for the whole large folio and we support the swap-in of
>>>>>>>>>>>>>> large folios. This is
>>>>>>>>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
>>>>>>>>>>>>>> folios series after a
>>>>>>>>>>>>>> while. till that is really done, it seems using page rather than
>>>>>>>>>>>>>> folio
>>>>>>>>>>>>>> is a better choice.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I think just restoring all tags and remembering for a large folio that
>>>>>>>>>>>>> they have been restored might be the low hanging fruit. But as always,
>>>>>>>>>>>>> devil is in the detail :)
>>>>>>>>>>>>
>>>>>>>>>>>> Hi David,
>>>>>>>>>>>> thanks for all your suggestions though my feeling is this is too
>>>>>>>>>>>> complex and
>>>>>>>>>>>> is not worth it for at least  three reasons.
>>>>>>>>>>>
>>>>>>>>>>> Fair enough.
>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> 1. In multi-thread and particularly multi-processes, we need some
>>>>>>>>>>>> locks to
>>>>>>>>>>>> protect and help know if one process is the first one to restore tags
>>>>>>>>>>>> and if
>>>>>>>>>>>> someone else is restoring tags when one process wants to restore. there
>>>>>>>>>>>> is not this kind of fine-grained lock at all.
>>>>>>>>>>>
>>>>>>>>>>> We surely always hold the folio lock on swapin/swapout, no? So when
>>>>>>>>>>> these
>>>>>>>>>>> functions are called.
>>>>>>>>>>>
>>>>>>>>>>> So that might just work already -- unless I am missing something
>>>>>>>>>>> important.
>>>>>>>>>>
>>>>>>>>>> We already have a page flag that we use to mark the page as having had
>>>>>>>>>> its mte
>>>>>>>>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC,
>>>>>>>>>> Matthew
>>>>>>>>>> has been working to remove as many per-page flags as possible). Couldn't
>>>>>>>>>> we just
>>>>>>>>>> make arch_swap_restore() take a folio, restore the tags for *all* the
>>>>>>>>>> pages and
>>>>>>>>>> repurpose that flag to be per-folio (so head page only)? It looks like
>>>>>>>>>> the the
>>>>>>>>>> mte code already manages all the serialization requirements too. Then
>>>>>>>>>> arch_swap_restore() can just exit early if it sees the flag is already
>>>>>>>>>> set on
>>>>>>>>>> the folio.
>>>>>>>>>>
>>>>>>>>>> One (probably nonsense) concern that just sprung to mind about having
>>>>>>>>>> MTE work
>>>>>>>>>> with large folios in general; is it possible that user space could cause
>>>>>>>>>> a large
>>>>>>>>>> anon folio to be allocated (THP), then later mark *part* of it to be
>>>>>>>>>> tagged with
>>>>>>>>>> MTE? In this case you would need to apply tags to part of the folio only.
>>>>>>>>>> Although I have a vague recollection that any MTE areas have to be
>>>>>>>>>> marked at
>>>>>>>>>> mmap time and therefore this type of thing is impossible?
>>>>>>>>>
>>>>>>>>> right, we might need to consider only a part of folio needs to be
>>>>>>>>> mapped and restored MTE tags.
>>>>>>>>> do_swap_page() can have a chance to hit a large folio but it only
>>>>>>>>> needs to fault-in a page.
>>>>>>>>>
>>>>>>>>> A case can be quite simple as below,
>>>>>>>>>
>>>>>>>>> 1. anon folio shared by process A and B
>>>>>>>>> 2. add_to_swap() as a large folio;
>>>>>>>>> 3. try to unmap A and B;
>>>>>>>>> 4. after A is unmapped(ptes become swap entries), we do a
>>>>>>>>> MADV_DONTNEED on a part of the folio. this can
>>>>>>>>> happen very easily as userspace is still working in 4KB level;
>>>>>>>>> userspace heap management can free an
>>>>>>>>> basepage area by MADV_DONTNEED;
>>>>>>>>> madvise(address, MADV_DONTNEED, 4KB);
>>>>>>>>> 5. A refault on address + 8KB, we will hit large folio in
>>>>>>>>> do_swap_page() but we will only need to map
>>>>>>>>> one basepage, we will never need this DONTNEEDed in process A.
>>>>>>>>>
>>>>>>>>> another more complicated case can be mprotect and munmap a part of
>>>>>>>>> large folios. since userspace
>>>>>>>>> has no idea of large folios in their mind, they can do all strange
>>>>>>>>> things. are we sure in all cases,
>>>>>>>>> large folios have been splitted into small folios?
>>>>>>>
>>>>>>> I don;'t think these examples you cite are problematic. Although user space
>>>>>>> thinks about things in 4K pages, the kernel does things in units of folios.
>>>>>>> So a
>>>>>>> folio is either fully swapped out or not swapped out at all. MTE tags can be
>>>>>>> saved/restored per folio, even if only part of that folio ends up being
>>>>>>> mapped
>>>>>>> back into user space.
>>>>>
>>>>> I am not so optimistic :-)
>>>>>
>>>>> but zap_pte_range() due to DONTNEED on a part of swapped-out folio can
>>>>> free a part of swap
>>>>> entries? thus, free a part of MTE tags in a folio?
>>>>> after process's large folios are swapped out, all PTEs in a large
>>>>> folio become swap
>>>>> entries, but DONTNEED on a part of this area will only set a part of
>>>>> swap entries to
>>>>> PTE_NONE, thus decrease the swapcount of this part?
>>>>>
>>>>> zap_pte_range
>>>>>       ->
>>>>>             entry = pte_to_swp_entry
>>>>>                     -> free_swap_and_cache(entry)
>>>>>                         -> mte tags invalidate
>>>>
>>>> OK I see what you mean.
>>>>
>>>> Just trying to summarize this, I think there are 2 questions behind all this:
>>>>
>>>> 1) Can we save/restore MTE tags on at the granularity of a folio?
>>>>
>>>> I think the answer is no; we can enable MTE on a individual pages within a
>>>> folio
>>>> with mprotect, and we can throw away tags on individual pages as you describe
>>>> above. So we have to continue to handle tags per-page.
>>>
>>> Can you enlighten me why the scheme proposed by Steven doesn't work?
>>
>> Are you referring to Steven's suggestion of reading the tag to see if it's
>> zeros? I think that demonstrates my point that this has to be done per-page and
> 
> Yes.

OK, I'm obviously being thick, because checking a page's tag to see if it's zero
seems logically equivalent to checking the existing per-page flag, just more
expensive. Yes, we could make that change, but I don't see how it helps solve the
real problem at hand. Unless you are also deliberately trying to remove the
per-page flag at the same time, as per Matthew's master plan?

> 
>> not per-folio? I'm also not sure what it buys us - instead of reading a per-page
>> flag we now have to read 128 bytes of tag for each page and check its zero.
> 
> My point is, if that is the corner case, we might not care about that.
> 
>>
>>>
>>> I mean, having a mixture of tagged vs. untagged is assumed to be the corner
>>> case, right?
>>
>> Yes. But I'm not sure how we exploit that; I guess we could have a per-folio
>> flag; when set it means the whole folio is tagged and when clear it means fall
>> back to checking the per-page flag?
> 
> Let me think this through. We have the following states:
> 
> 1) Any subpage is possibly logically tagged and all relevant tags are
>    applied/restored.
> 2) Any subpage is possibly logically tagged, and all relevant tags are
>    not applied but stored in the datastructure.
> 3) No subpage is logically tagged.
> 
> We can identify in 1) the subpages by reading the tag from HW, 

I don't think this actually works; I'm pretty sure the optimization to clear the
tag at the same time as the page clearing only happens for small pages. I don't
think this will be done when allocating a THP today. Obviously that could change.

> and on 2) by
> checking the datastructure. For 3), there is nothing to check.
> 
> On swapout of a large folio:
> 
> * For 3) we don't do anything
> * For 2) we don't do anything
> * For 1) we store all tags that are non-zero (reading all tags) and
>   transition to 2).

Given a tag architecturally exists for every page even when unused, and we think
a folio being partially mte-tagged is the corner case, could you simplify this
further and just write out all the tags for the folio and not care if some are
not in use?

> 
> On swapin of a large folio
> 
> A) Old folio (swapcache)
> 
> If in 1) or 3) already, nothing to do.
> 
> If in 2), restore all tags that are in the datastructure and move to 1). Nothing
> to do for 1
> 
> b) Fresh folio (we lost any MTE marker)
> 
> Currently always order-0, so nothing to do. We'd have to check the datastructure
> for any tag part of the folio and set the state accordingly.
> 
> 
> Of course, that means that on swapout, you read all tags. But if the common case
> is that all subpages have tags, we don't really care.
> 
> I'm sure I made a mistake somewhere, but where? :)
>
David Hildenbrand Nov. 27, 2023, 3:42 p.m. UTC | #26
>>> Are you referring to Steven's suggestion of reading the tag to see if it's
>>> zeros? I think that demonstrates my point that this has to be done per-page and
>>
>> Yes.
> 
> OK I'm obviously being thick, because checking a page's tag to see if its zero
> seems logically equivalent to checking the existing per-page flag, just more
> expensive. Yes we could make that change but I don't see how it helps solve the
> real problem at hand. Unless you are also deliberately trying to remove the
> per-page flag at the same time, as per Matthew's master plan?

I think a per-folio interface is cleaner and more future-proof, and 
removing the per-page flag might be a nice side product of that.

Anyhow, these were just some thoughts from my side on whether it could be done easily/cleanly.

At least the "easy" part does not seem to be the case, so I'm fine with 
deferring anything like that for now.

[...]

>>
>> We can identify in 1) the subpages by reading the tag from HW,
> 
> I don't think this actually works; I'm pretty sure the optimization to clear the
> tag at the same time as the page clearing only happens for small pages. I don't
> think this will be done when allocating a THP today. Obviously that could change.
> 

Could be, I'm absolutely no expert. I was primarily going off the "It would be
possible to reverse this scheme - we could drop the page
flag and just look at the actual tag storage. If it's all zeros then
obviously there's no point in storing it." comment from Steven.

>> and on 2) by
>> checking the datastructure. For 3), there is nothing to check.
>>
>> On swapout of a large folio:
>>
>> * For 3) we don't do anything
>> * For 2) we don't do anything
>> * For 1) we store all tags that are non-zero (reading all tags) and
>>    transition to 2).
> 
> Given a tag architecturally exists for every page even when unused, and we think
> a folio being partially mte-tagged is the corner case, could you simplify this
> further and just write out all the tags for the folio and not care if some are
> not in use?

Likely this could be simplified, yes.
Barry Song Dec. 7, 2023, 3:22 a.m. UTC | #27
On Tue, Nov 28, 2023 at 3:16 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 27.11.23 13:14, Ryan Roberts wrote:
> > On 27/11/2023 12:01, David Hildenbrand wrote:
> >> On 27.11.23 12:56, Ryan Roberts wrote:
> >>> On 24/11/2023 18:14, Barry Song wrote:
> >>>> On Fri, Nov 24, 2023 at 10:55 PM Steven Price <steven.price@arm.com> wrote:
> >>>>>
> >>>>> On 24/11/2023 09:01, Ryan Roberts wrote:
> >>>>>> On 24/11/2023 08:55, David Hildenbrand wrote:
> >>>>>>> On 24.11.23 02:35, Barry Song wrote:
> >>>>>>>> On Mon, Nov 20, 2023 at 11:57 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>>>>>>
> >>>>>>>>> On 20/11/2023 09:11, David Hildenbrand wrote:
> >>>>>>>>>> On 17.11.23 19:41, Barry Song wrote:
> >>>>>>>>>>> On Fri, Nov 17, 2023 at 7:28 PM David Hildenbrand <david@redhat.com>
> >>>>>>>>>>> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> On 17.11.23 01:15, Barry Song wrote:
> >>>>>>>>>>>>> On Fri, Nov 17, 2023 at 7:47 AM Barry Song <21cnbao@gmail.com> wrote:
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> On Thu, Nov 16, 2023 at 5:36 PM David Hildenbrand
> >>>>>>>>>>>>>> <david@redhat.com> wrote:
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> On 15.11.23 21:49, Barry Song wrote:
> >>>>>>>>>>>>>>>> On Wed, Nov 15, 2023 at 11:16 PM David Hildenbrand
> >>>>>>>>>>>>>>>> <david@redhat.com>
> >>>>>>>>>>>>>>>> wrote:
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> On 14.11.23 02:43, Barry Song wrote:
> >>>>>>>>>>>>>>>>>> This patch makes MTE tags saving and restoring support large
> >>>>>>>>>>>>>>>>>> folios,
> >>>>>>>>>>>>>>>>>> then we don't need to split them into base pages for swapping out
> >>>>>>>>>>>>>>>>>> on ARM64 SoCs with MTE.
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> arch_prepare_to_swap() should take folio rather than page as
> >>>>>>>>>>>>>>>>>> parameter
> >>>>>>>>>>>>>>>>>> because we support THP swap-out as a whole.
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> Meanwhile, arch_swap_restore() should use page parameter rather
> >>>>>>>>>>>>>>>>>> than
> >>>>>>>>>>>>>>>>>> folio as swap-in always works at the granularity of base pages
> >>>>>>>>>>>>>>>>>> right
> >>>>>>>>>>>>>>>>>> now.
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> ... but then we always have order-0 folios and can pass a folio,
> >>>>>>>>>>>>>>>>> or what
> >>>>>>>>>>>>>>>>> am I missing?
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> Hi David,
> >>>>>>>>>>>>>>>> you missed the discussion here:
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4yXjex8txgEGt7+WMKp4uDQTn-fR06ijv4Ac68MkhjMDw@mail.gmail.com/
> >>>>>>>>>>>>>>>> https://lore.kernel.org/lkml/CAGsJ_4xmBAcApyK8NgVQeX_Znp5e8D4fbbhGguOkNzmh1Veocg@mail.gmail.com/
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> Okay, so you want to handle the refault-from-swapcache case where you
> >>>>>>>>>>>>>>> get a
> >>>>>>>>>>>>>>> large folio.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> I was mislead by your "folio as swap-in always works at the
> >>>>>>>>>>>>>>> granularity of
> >>>>>>>>>>>>>>> base pages right now" comment.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> What you actually wanted to say is "While we always swap in small
> >>>>>>>>>>>>>>> folios, we
> >>>>>>>>>>>>>>> might refault large folios from the swapcache, and we only want to
> >>>>>>>>>>>>>>> restore
> >>>>>>>>>>>>>>> the tags for the page of the large folio we are faulting on."
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> But, I do if we can't simply restore the tags for the whole thing
> >>>>>>>>>>>>>>> at once
> >>>>>>>>>>>>>>> at make the interface page-free?
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> Let me elaborate:
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> IIRC, if we have a large folio in the swapcache, the swap
> >>>>>>>>>>>>>>> entries/offset are
> >>>>>>>>>>>>>>> contiguous. If you know you are faulting on page[1] of the folio
> >>>>>>>>>>>>>>> with a
> >>>>>>>>>>>>>>> given swap offset, you can calculate the swap offset for page[0]
> >>>>>>>>>>>>>>> simply by
> >>>>>>>>>>>>>>> subtracting from the offset.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> See page_swap_entry() on how we perform this calculation.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> So you can simply pass the large folio and the swap entry
> >>>>>>>>>>>>>>> corresponding
> >>>>>>>>>>>>>>> to the first page of the large folio, and restore all tags at once.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> So the interface would be
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> arch_prepare_to_swap(struct folio *folio);
> >>>>>>>>>>>>>>> void arch_swap_restore(struct page *folio, swp_entry_t start_entry);
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> I'm sorry if that was also already discussed.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> This has been discussed. Steven, Ryan and I all don't think this is
> >>>>>>>>>>>>>> a good
> >>>>>>>>>>>>>> option. in case we have a large folio with 16 basepages, as
> >>>>>>>>>>>>>> do_swap_page
> >>>>>>>>>>>>>> can only map one base page for each page fault, that means we have
> >>>>>>>>>>>>>> to restore 16(tags we restore in each page fault) * 16(the times of
> >>>>>>>>>>>>>> page
> >>>>>>>>>>>>>> faults)
> >>>>>>>>>>>>>> for this large folio.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> and still the worst thing is the page fault in the Nth PTE of large
> >>>>>>>>>>>>>> folio
> >>>>>>>>>>>>>> might free swap entry as that swap has been in.
> >>>>>>>>>>>>>> do_swap_page()
> >>>>>>>>>>>>>> {
> >>>>>>>>>>>>>>         /*
> >>>>>>>>>>>>>>          * Remove the swap entry and conditionally try to free up the
> >>>>>>>>>>>>>> swapcache.
> >>>>>>>>>>>>>>          * We're already holding a reference on the page but haven't
> >>>>>>>>>>>>>> mapped it
> >>>>>>>>>>>>>>          * yet.
> >>>>>>>>>>>>>>          */
> >>>>>>>>>>>>>>          swap_free(entry);
> >>>>>>>>>>>>>> }
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> So in the page faults other than N, I mean 0~N-1 and N+1 to 15, you
> >>>>>>>>>>>>>> might
> >>>>>>>>>>>>>> access
> >>>>>>>>>>>>>> a freed tag.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> And David, one more information is that to keep the parameter of
> >>>>>>>>>>>>> arch_swap_restore() unchanged as folio,
> >>>>>>>>>>>>> i actually tried an ugly approach in rfc v2:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> +void arch_swap_restore(swp_entry_t entry, struct folio *folio)
> >>>>>>>>>>>>> +{
> >>>>>>>>>>>>> + if (system_supports_mte()) {
> >>>>>>>>>>>>> +      /*
> >>>>>>>>>>>>> +       * We don't support large folios swap in as whole yet, but
> >>>>>>>>>>>>> +       * we can hit a large folio which is still in swapcache
> >>>>>>>>>>>>> +       * after those related processes' PTEs have been unmapped
> >>>>>>>>>>>>> +       * but before the swapcache folio  is dropped, in this case,
> >>>>>>>>>>>>> +       * we need to find the exact page which "entry" is mapping
> >>>>>>>>>>>>> +       * to. If we are not hitting swapcache, this folio won't be
> >>>>>>>>>>>>> +       * large
> >>>>>>>>>>>>> +     */
> >>>>>>>>>>>>> + struct page *page = folio_file_page(folio, swp_offset(entry));
> >>>>>>>>>>>>> + mte_restore_tags(entry, page);
> >>>>>>>>>>>>> + }
> >>>>>>>>>>>>> +}
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> And obviously everybody in the discussion hated it :-)
> >>>>>>>>>>>>>
> >>>>>>>>>>>>
> >>>>>>>>>>>> I can relate :D
> >>>>>>>>>>>>
> >>>>>>>>>>>>> i feel the only way to keep API unchanged using folio is that we
> >>>>>>>>>>>>> support restoring PTEs
> >>>>>>>>>>>>> all together for the whole large folio and we support the swap-in of
> >>>>>>>>>>>>> large folios. This is
> >>>>>>>>>>>>> in my list to do, I will send a patchset based on Ryan's large anon
> >>>>>>>>>>>>> folios series after a
> >>>>>>>>>>>>> while. till that is really done, it seems using page rather than folio
> >>>>>>>>>>>>> is a better choice.
> >>>>>>>>>>>>
> >>>>>>>>>>>> I think just restoring all tags and remembering for a large folio that
> >>>>>>>>>>>> they have been restored might be the low hanging fruit. But as always,
> >>>>>>>>>>>> devil is in the detail :)
> >>>>>>>>>>>
> >>>>>>>>>>> Hi David,
> >>>>>>>>>>> thanks for all your suggestions though my feeling is this is too
> >>>>>>>>>>> complex and
> >>>>>>>>>>> is not worth it for at least  three reasons.
> >>>>>>>>>>
> >>>>>>>>>> Fair enough.
> >>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> 1. In multi-thread and particularly multi-processes, we need some
> >>>>>>>>>>> locks to
> >>>>>>>>>>> protect and help know if one process is the first one to restore tags
> >>>>>>>>>>> and if
> >>>>>>>>>>> someone else is restoring tags when one process wants to restore. there
> >>>>>>>>>>> is not this kind of fine-grained lock at all.
> >>>>>>>>>>
> >>>>>>>>>> We surely always hold the folio lock on swapin/swapout, no? So when these
> >>>>>>>>>> functions are called.
> >>>>>>>>>>
> >>>>>>>>>> So that might just work already -- unless I am missing something
> >>>>>>>>>> important.
> >>>>>>>>>
> >>>>>>>>> We already have a page flag that we use to mark the page as having had
> >>>>>>>>> its mte
> >>>>>>>>> state associated; PG_mte_tagged. This is currently per-page (and IIUC,
> >>>>>>>>> Matthew
> >>>>>>>>> has been working to remove as many per-page flags as possible). Couldn't
> >>>>>>>>> we just
> >>>>>>>>> make arch_swap_restore() take a folio, restore the tags for *all* the
> >>>>>>>>> pages and
> >>>>>>>>> repurpose that flag to be per-folio (so head page only)? It looks like
> >>>>>>>>> the the
> >>>>>>>>> mte code already manages all the serialization requirements too. Then
> >>>>>>>>> arch_swap_restore() can just exit early if it sees the flag is already
> >>>>>>>>> set on
> >>>>>>>>> the folio.
> >>>>>>>>>
> >>>>>>>>> One (probably nonsense) concern that just sprung to mind about having
> >>>>>>>>> MTE work
> >>>>>>>>> with large folios in general; is it possible that user space could cause
> >>>>>>>>> a large
> >>>>>>>>> anon folio to be allocated (THP), then later mark *part* of it to be
> >>>>>>>>> tagged with
> >>>>>>>>> MTE? In this case you would need to apply tags to part of the folio only.
> >>>>>>>>> Although I have a vague recollection that any MTE areas have to be
> >>>>>>>>> marked at
> >>>>>>>>> mmap time and therefore this type of thing is impossible?
> >>>>>>>>
> >>>>>>>> right, we might need to consider only a part of folio needs to be
> >>>>>>>> mapped and restored MTE tags.
> >>>>>>>> do_swap_page() can have a chance to hit a large folio but it only
> >>>>>>>> needs to fault-in a page.
> >>>>>>>>
> >>>>>>>> A case can be quite simple as below,
> >>>>>>>>
> >>>>>>>> 1. anon folio shared by process A and B
> >>>>>>>> 2. add_to_swap() as a large folio;
> >>>>>>>> 3. try to unmap A and B;
> >>>>>>>> 4. after A is unmapped(ptes become swap entries), we do a
> >>>>>>>> MADV_DONTNEED on a part of the folio. this can
> >>>>>>>> happen very easily as userspace is still working in 4KB level;
> >>>>>>>> userspace heap management can free an
> >>>>>>>> basepage area by MADV_DONTNEED;
> >>>>>>>> madvise(address, MADV_DONTNEED, 4KB);
> >>>>>>>> 5. A refault on address + 8KB, we will hit large folio in
> >>>>>>>> do_swap_page() but we will only need to map
> >>>>>>>> one basepage, we will never need this DONTNEEDed in process A.
> >>>>>>>>
> >>>>>>>> another more complicated case can be mprotect and munmap a part of
> >>>>>>>> large folios. since userspace
> >>>>>>>> has no idea of large folios in their mind, they can do all strange
> >>>>>>>> things. are we sure in all cases,
> >>>>>>>> large folios have been splitted into small folios?
> >>>>>>
> >>>>>> I don;'t think these examples you cite are problematic. Although user space
> >>>>>> thinks about things in 4K pages, the kernel does things in units of folios.
> >>>>>> So a
> >>>>>> folio is either fully swapped out or not swapped out at all. MTE tags can be
> >>>>>> saved/restored per folio, even if only part of that folio ends up being mapped
> >>>>>> back into user space.
> >>>>
> >>>> I am not so optimistic :-)
> >>>>
> >>>> but zap_pte_range() due to DONTNEED on a part of swapped-out folio can
> >>>> free a part of swap
> >>>> entries? thus, free a part of MTE tags in a folio?
> >>>> after process's large folios are swapped out, all PTEs in a large
> >>>> folio become swap
> >>>> entries, but DONTNEED on a part of this area will only set a part of
> >>>> swap entries to
> >>>> PTE_NONE, thus decrease the swapcount of this part?
> >>>>
> >>>> zap_pte_range
> >>>>       ->
> >>>>             entry = pte_to_swp_entry
> >>>>                     -> free_swap_and_cache(entry)
> >>>>                         -> mte tags invalidate
> >>>
> >>> OK I see what you mean.
> >>>
> >>> Just trying to summarize this, I think there are 2 questions behind all this:
> >>>
> >>> 1) Can we save/restore MTE tags on at the granularity of a folio?
> >>>
> >>> I think the answer is no; we can enable MTE on a individual pages within a folio
> >>> with mprotect, and we can throw away tags on individual pages as you describe
> >>> above. So we have to continue to handle tags per-page.
> >>
> >> Can you enlighten me why the scheme proposed by Steven doesn't work?
> >
> > Are you referring to Steven's suggestion of reading the tag to see if it's
> > zeros? I think that demonstrates my point that this has to be done per-page and
>
> Yes.
>
> > not per-folio? I'm also not sure what it buys us - instead of reading a per-page
> > flag we now have to read 128 bytes of tag for each page and check its zero.
>
> My point is, if that is the corner case, we might not care about that.

Hi David,
my understanding is that this is NOT a corner case; on the contrary, it is
really a common case.

1. A large folio can be partially unmapped while it is in the swapcache and
after it is swapped out; in both cases its tags can be partially invalidated.
I don't think this is a corner case: as long as userspace still works at the
granularity of basepages, this is always going to happen. For example, a
userspace libc such as jemalloc knows PAGESIZE and uses madvise(MADV_DONTNEED)
to return memory to the kernel, so heap management still works at the
granularity of the basepage (a small userspace sketch follows after this list).

2. mprotect on a part of a large folio, as Steven pointed out.

3. Long term, we are working to swap in large folios as a whole[1], just like
we swap them out as a whole. For those PTEs which are still contiguous swap
entries, i.e. which were not unmapped by userspace after the large folio was
swapped out to the swap device, we have a chance to swap in a whole large
folio and to restore its tags without the early exit. But we still have a
good chance of falling back to a base page if we fail to allocate a large
folio; in that case do_swap_page() still works at the granularity of the
basepage, and since do_swap_page() calls swap_free(entry), the tags of this
particular page can be invalidated as a result.

4. Too many early exits might be negative for performance.
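
For reference, the userspace pattern from item 1 above is just something like
the following (illustrative only; the offsets are arbitrary):

#include <sys/mman.h>

/*
 * Free one 4KB basepage in the middle of a region the kernel may have
 * backed with a large folio; userspace neither knows nor cares about folios.
 */
static void drop_one_basepage(char *addr)
{
        madvise(addr + 8 * 1024, 4 * 1024, MADV_DONTNEED);
}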


So I am thinking that in the future we need two helpers:

1. void __arch_swap_restore(swp_entry_t entry, struct page *page);
This is always needed to support page-level tag restore.

2. void arch_swap_restore(swp_entry_t entry, struct folio *folio);
This can be a helper once we are able to swap in a whole folio. Two
conditions must be met:
(a) the PTEs are still contiguous swap entries, just as when the large
folio was swapped out;
(b) we succeed in allocating a large folio in do_swap_page().

For this moment, we only need 1; we will add 2 in the swap-in large folio
series. A rough sketch of how the two could relate follows below.
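
This is only a sketch of the proposal, not part of this patch; the helper names
follow the proposal above, and it assumes condition (a), i.e. the folio's swap
entries are contiguous starting at the entry of its first page:

void __arch_swap_restore(swp_entry_t entry, struct page *page)
{
        if (system_supports_mte())
                mte_restore_tags(entry, page);
}

void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
        long i, nr = folio_nr_pages(folio);

        for (i = 0; i < nr; i++) {
                swp_entry_t e = swp_entry(swp_type(entry),
                                          swp_offset(entry) + i);

                __arch_swap_restore(e, folio_page(folio, i));
        }
}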

What do you think?

[1] https://lore.kernel.org/linux-mm/CAGsJ_4zyhqwUS26bbx0a95rSO0jVANfC_RWdyTFbJQQG5KJ_cg@mail.gmail.com/

>
> >
> >>
> >> I mean, having a mixture of tagged vs. untagged is assumed to be the corner
> >> case, right?
> >
> > Yes. But I'm not sure how we exploit that; I guess we could have a per-folio
> > flag; when set it means the whole folio is tagged and when clear it means fall
> > back to checking the per-page flag?
>
> Let me think this through. We have the following states:
>
> 1) Any subpage is possibly logically tagged and all relevant tags are
>     applied/restored.
> 2) Any subpage is possibly logically tagged, and all relevant tags are
>     not applied but stored in the datastructure.
> 3) No subpage is logically tagged.
>
> We can identify in 1) the subpages by reading the tag from HW, and on 2)
> by checking the datastructure. For 3), there is nothing to check.
>
> On swapout of a large folio:
>
> * For 3) we don't do anything
> * For 2) we don't do anything
> * For 1) we store all tags that are non-zero (reading all tags) and
>    transition to 2).
>
> On swapin of a large folio
>
> A) Old folio (swapcache)
>
> If in 1) or 3) already, nothing to do.
>
> If in 2), restore all tags that are in the datastructure and move to 1).
> Nothing to do for 1
>
> b) Fresh folio (we lost any MTE marker)
>
> Currently always order-0, so nothing to do. We'd have to check the
> datastructure for any tag part of the folio and set the state accordingly.
>
>
> Of course, that means that on swapout, you read all tags. But if the
> common case is that all subpages have tags, we don't really care.
>
> I'm sure I made a mistake somewhere, but where? :)
>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
David Hildenbrand Dec. 7, 2023, 10:03 a.m. UTC | #28
>>
>>> not per-folio? I'm also not sure what it buys us - instead of reading a per-page
>>> flag we now have to read 128 bytes of tag for each page and check its zero.
>>
>> My point is, if that is the corner case, we might not care about that.
> 
> Hi David,

Hi!

> my understanding is that this is NOT a corner. Alternatively, it is
> really a common case.

If it happens with < 1% of all large folios on swapout/swapin, it's not
the common case, even if some of the scenarios you point out below can and
will happen.

> 
> 1. a large folio can be partially unmapped when it is in swapche and
> after it is swapped out
> in all cases, its tags can be partially invalidated. I don't think
> this is a corner case, as long
> as userspaces are still working at the granularity of basepages, this
> is always going to
> happen. For example, userspace libc such as jemalloc can identify
> PAGESIZE, and use
> madvise(DONTNEED) to return memory to the kernel. Heap management is
> still working
> at the granularity of the basepage.
> 
> 2. mprotect on a part of a large folio as Steven pointed out.
> 
> 3.long term, we are working to swap-in large folios as a whole[1] just
> like swapping out large
> folios as a whole. for those ptes which are still contiguous swap
> entries, i mean, which
> are not unmapped by userspace after the large folios are swapped out
> to swap devices,
> we have a chance to swap in a whole large folio, we do have a chance
> to restore tags
> for the large folio without early-exit. but we still have a good
> chance to fall back to base
> page if we fail to allocate large folio, in this case, do_swap_page()
> still works at the
> granularity of basepage. and do_swap_page() will call swap_free(entry),  tags of
> 
> this particular page can be invalidated as a result.

I don't immediately see how that relates. You get a fresh small folio 
and simply load that tag from the internal datastructure. No messing 
with large folios required, because you don't have a large folio. So no 
considerations about large folio batch MTE tag restore apply.

> 
> 4. too many early-exit might be negative to performance.
> 
> 
> So I am thinking that in the future, we need two helpers,
> 1. void __arch_swap_restore(swp_entry_t entry, struct page *page);
> this is always needed to support page-level tag restore.
> 
> 2.  void arch_swap_restore(swp_entry_t entry, struct folio *folio);
> this can be a helper when we are able to swap in a whole folio. two
> conditions must be met
> (a). PTEs entries are still contiguous swap entries just as when large
> folios were swapped
> out.
> (b). we succeed in the allocation of a large folio in do_swap_page.
> 
> For this moment, we only need 1; we will add 2 in swap-in large folio series.
> 
> What do you think?

I agree that it's better to keep it simple for now.
Barry Song Dec. 8, 2023, midnight UTC | #29
On Thu, Dec 7, 2023 at 11:04 PM David Hildenbrand <david@redhat.com> wrote:
>
> >>
> >>> not per-folio? I'm also not sure what it buys us - instead of reading a per-page
> >>> flag we now have to read 128 bytes of tag for each page and check its zero.
> >>
> >> My point is, if that is the corner case, we might not care about that.
> >
> > Hi David,
>
> Hi!
>
> > my understanding is that this is NOT a corner. Alternatively, it is
> > really a common case.
>
> If it happens with < 1% of all large folios on swapout/swapin, it's not
> the common case. Even if some scenarios you point out below can and will
> happen.
>

Fair enough. If we define "corner case" based on the percentage of folios
which can get partial MTE tags set or partially invalidated, I agree this is
a corner case. I had thought that a corner case was a case which could only
rarely happen.

> >
> > 1. a large folio can be partially unmapped when it is in swapche and
> > after it is swapped out
> > in all cases, its tags can be partially invalidated. I don't think
> > this is a corner case, as long
> > as userspaces are still working at the granularity of basepages, this
> > is always going to
> > happen. For example, userspace libc such as jemalloc can identify
> > PAGESIZE, and use
> > madvise(DONTNEED) to return memory to the kernel. Heap management is
> > still working
> > at the granularity of the basepage.
> >
> > 2. mprotect on a part of a large folio as Steven pointed out.
> >
> > 3.long term, we are working to swap-in large folios as a whole[1] just
> > like swapping out large
> > folios as a whole. for those ptes which are still contiguous swap
> > entries, i mean, which
> > are not unmapped by userspace after the large folios are swapped out
> > to swap devices,
> > we have a chance to swap in a whole large folio, we do have a chance
> > to restore tags
> > for the large folio without early-exit. but we still have a good
> > chance to fall back to base
> > page if we fail to allocate large folio, in this case, do_swap_page()
> > still works at the
> > granularity of basepage. and do_swap_page() will call swap_free(entry),  tags of
> >
> > this particular page can be invalidated as a result.
>
> I don't immediately see how that relates. You get a fresh small folio
> and simply load that tag from the internal datastructure. No messing
> with large folios required, because you don't have a large folio. So no
> considerations about large folio batch MTE tag restore apply.

Right. I was thinking of the original large folio being partially swapped in,
and forgot that the newly allocated page is actually a folio with only one
page :-)

Indeed, in that case, it is still restoring the MTE tags for the whole folio,
which has just one page.

>
> >
> > 4. too many early-exit might be negative to performance.
> >
> >
> > So I am thinking that in the future, we need two helpers,
> > 1. void __arch_swap_restore(swp_entry_t entry, struct page *page);
> > this is always needed to support page-level tag restore.
> >
> > 2.  void arch_swap_restore(swp_entry_t entry, struct folio *folio);
> > this can be a helper when we are able to swap in a whole folio. two
> > conditions must be met
> > (a). PTEs entries are still contiguous swap entries just as when large
> > folios were swapped
> > out.
> > (b). we succeed in the allocation of a large folio in do_swap_page.
> >
> > For this moment, we only need 1; we will add 2 in swap-in large folio series.
> >
> > What do you think?
>
> I agree that it's better to keep it simple for now.
>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b19a8aee684c..c3eef11c1a9e 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -45,12 +45,6 @@ 
 	__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static inline bool arch_thp_swp_supported(void)
-{
-	return !system_supports_mte();
-}
-#define arch_thp_swp_supported arch_thp_swp_supported
-
 /*
  * Outside of a few very special situations (e.g. hibernation), we always
  * use broadcast TLB invalidation instructions, therefore a spurious page
@@ -1036,12 +1030,8 @@  static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 #ifdef CONFIG_ARM64_MTE
 
 #define __HAVE_ARCH_PREPARE_TO_SWAP
-static inline int arch_prepare_to_swap(struct page *page)
-{
-	if (system_supports_mte())
-		return mte_save_tags(page);
-	return 0;
-}
+#define arch_prepare_to_swap arch_prepare_to_swap
+extern int arch_prepare_to_swap(struct folio *folio);
 
 #define __HAVE_ARCH_SWAP_INVALIDATE
 static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
@@ -1057,11 +1047,8 @@  static inline void arch_swap_invalidate_area(int type)
 }
 
 #define __HAVE_ARCH_SWAP_RESTORE
-static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
-{
-	if (system_supports_mte())
-		mte_restore_tags(entry, &folio->page);
-}
+#define arch_swap_restore arch_swap_restore
+extern void arch_swap_restore(swp_entry_t entry, struct page *page);
 
 #endif /* CONFIG_ARM64_MTE */
 
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index a31833e3ddc5..75c2836e8240 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -68,6 +68,12 @@  void mte_invalidate_tags(int type, pgoff_t offset)
 	mte_free_tag_storage(tags);
 }
 
+static inline void __mte_invalidate_tags(struct page *page)
+{
+	swp_entry_t entry = page_swap_entry(page);
+	mte_invalidate_tags(swp_type(entry), swp_offset(entry));
+}
+
 void mte_invalidate_tags_area(int type)
 {
 	swp_entry_t entry = swp_entry(type, 0);
@@ -83,3 +89,31 @@  void mte_invalidate_tags_area(int type)
 	}
 	xa_unlock(&mte_pages);
 }
+
+int arch_prepare_to_swap(struct folio *folio)
+{
+	int err;
+	long i;
+
+	if (system_supports_mte()) {
+		long nr = folio_nr_pages(folio);
+
+		for (i = 0; i < nr; i++) {
+			err = mte_save_tags(folio_page(folio, i));
+			if (err)
+				goto out;
+		}
+	}
+	return 0;
+
+out:
+	while (i--)
+		__mte_invalidate_tags(folio_page(folio, i));
+	return err;
+}
+
+void arch_swap_restore(swp_entry_t entry, struct page *page)
+{
+	if (system_supports_mte())
+		mte_restore_tags(entry, page);
+}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fa0350b0812a..f83fb8d5241e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -400,16 +400,4 @@  static inline int split_folio(struct folio *folio)
 	return split_folio_to_list(folio, NULL);
 }
 
-/*
- * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
- * limitations in the implementation like arm64 MTE can override this to
- * false
- */
-#ifndef arch_thp_swp_supported
-static inline bool arch_thp_swp_supported(void)
-{
-	return true;
-}
-#endif
-
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..87e3140a55ca 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -897,7 +897,7 @@  static inline int arch_unmap_one(struct mm_struct *mm,
  * prototypes must be defined in the arch-specific asm/pgtable.h file.
  */
 #ifndef __HAVE_ARCH_PREPARE_TO_SWAP
-static inline int arch_prepare_to_swap(struct page *page)
+static inline int arch_prepare_to_swap(struct folio *folio)
 {
 	return 0;
 }
@@ -914,7 +914,7 @@  static inline void arch_swap_invalidate_area(int type)
 #endif
 
 #ifndef __HAVE_ARCH_SWAP_RESTORE
-static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
+static inline void arch_swap_restore(swp_entry_t entry, struct page *page)
 {
 }
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 1f18ed4a5497..fad238dd38e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4022,7 +4022,7 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 * when reading from swap. This metadata may be indexed by swap entry
 	 * so this must be called before swap_free().
 	 */
-	arch_swap_restore(entry, folio);
+	arch_swap_restore(entry, page);
 
 	/*
 	 * Remove the swap entry and conditionally try to free up the swapcache.
diff --git a/mm/page_io.c b/mm/page_io.c
index cb559ae324c6..0fd832474c1d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -189,7 +189,7 @@  int swap_writepage(struct page *page, struct writeback_control *wbc)
 	 * Arch code may have to preserve more data than just the page
 	 * contents, e.g. memory tags.
 	 */
-	ret = arch_prepare_to_swap(&folio->page);
+	ret = arch_prepare_to_swap(folio);
 	if (ret) {
 		folio_mark_dirty(folio);
 		folio_unlock(folio);
diff --git a/mm/shmem.c b/mm/shmem.c
index 91e2620148b2..7d32e50da121 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1892,7 +1892,7 @@  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	 * Some architectures may have to restore extra metadata to the
 	 * folio after reading from swap.
 	 */
-	arch_swap_restore(swap, folio);
+	arch_swap_restore(swap, &folio->page);
 
 	if (shmem_should_replace_folio(folio, gfp)) {
 		error = shmem_replace_folio(&folio, gfp, info, index);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0bec1f705f8e..2325adbb1f19 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -307,7 +307,7 @@  swp_entry_t folio_alloc_swap(struct folio *folio)
 	entry.val = 0;
 
 	if (folio_test_large(folio)) {
-		if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
+		if (IS_ENABLED(CONFIG_THP_SWAP))
 			get_swap_pages(1, &entry, folio_nr_pages(folio));
 		goto out;
 	}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4bc70f459164..6450e0279e35 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1784,7 +1784,7 @@  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	 * when reading from swap. This metadata may be indexed by swap entry
 	 * so this must be called before swap_free().
 	 */
-	arch_swap_restore(entry, page_folio(page));
+	arch_swap_restore(entry, page);
 
 	/* See do_swap_page() */
 	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));