diff mbox series

[v8,01/23] mm: Introduce PTE_MARKER swap entry

Message ID 20220405014646.13522-2-peterx@redhat.com (mailing list archive)
State New
Headers show
Series userfaultfd-wp: Support shmem and hugetlbfs | expand

Commit Message

Peter Xu April 5, 2022, 1:46 a.m. UTC
This patch introduces a new swap entry type called PTE_MARKER.  It can be
installed for any pte that maps a file-backed memory when the pte is
temporarily zapped, so as to maintain per-pte information.

The information that kept in the pte is called a "marker".  Here we define the
marker as "unsigned long" just to match pgoff_t, however it will only work if
it still fits in swp_offset(), which is e.g. currently 58 bits on x86_64.

A new config CONFIG_PTE_MARKER is introduced too; it's by default off.  A bunch
of helpers are defined altogether to service the rest of the pte marker code.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/asm-generic/hugetlb.h |  9 ++++
 include/linux/swap.h          | 15 ++++++-
 include/linux/swapops.h       | 78 +++++++++++++++++++++++++++++++++++
 mm/Kconfig                    |  6 +++
 4 files changed, 107 insertions(+), 1 deletion(-)

Comments

Alistair Popple April 12, 2022, 1:07 a.m. UTC | #1
Hi Peter,

I noticed this while reviewing the next patch in the series. I think you need to
add CONFIG_PTE_MARKER to the below as well:

#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
    defined(CONFIG_DEVICE_PRIVATE)
static inline int non_swap_entry(swp_entry_t entry)
{
	return swp_type(entry) >= MAX_SWAPFILES;
}
#else
static inline int non_swap_entry(swp_entry_t entry)
{
	return 0;
}
#endif

Otherwise marker entries will be treated as swap entries, which is wrong for
example in swapin_walk_pmd_entry() as marker entries are no longer considered
pte_none().

- Alistair

Peter Xu <peterx@redhat.com> writes:

> This patch introduces a new swap entry type called PTE_MARKER.  It can be
> installed for any pte that maps a file-backed memory when the pte is
> temporarily zapped, so as to maintain per-pte information.
>
> The information that kept in the pte is called a "marker".  Here we define the
> marker as "unsigned long" just to match pgoff_t, however it will only work if
> it still fits in swp_offset(), which is e.g. currently 58 bits on x86_64.
>
> A new config CONFIG_PTE_MARKER is introduced too; it's by default off.  A bunch
> of helpers are defined altogether to service the rest of the pte marker code.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/asm-generic/hugetlb.h |  9 ++++
>  include/linux/swap.h          | 15 ++++++-
>  include/linux/swapops.h       | 78 +++++++++++++++++++++++++++++++++++
>  mm/Kconfig                    |  6 +++
>  4 files changed, 107 insertions(+), 1 deletion(-)
>
> diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
> index 8e1e6244a89d..f39cad20ffc6 100644
> --- a/include/asm-generic/hugetlb.h
> +++ b/include/asm-generic/hugetlb.h
> @@ -2,6 +2,9 @@
>  #ifndef _ASM_GENERIC_HUGETLB_H
>  #define _ASM_GENERIC_HUGETLB_H
>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +
>  static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
>  {
>  	return mk_pte(page, pgprot);
> @@ -80,6 +83,12 @@ static inline int huge_pte_none(pte_t pte)
>  }
>  #endif
>
> +/* Please refer to comments above pte_none_mostly() for the usage */
> +static inline int huge_pte_none_mostly(pte_t pte)
> +{
> +	return huge_pte_none(pte) || is_pte_marker(pte);
> +}
> +
>  #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
>  static inline pte_t huge_pte_wrprotect(pte_t pte)
>  {
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7daae5a4b3e1..5553189d0215 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -55,6 +55,19 @@ static inline int current_is_kswapd(void)
>   * actions on faults.
>   */
>
> +/*
> + * PTE markers are used to persist information onto PTEs that are mapped with
> + * file-backed memories.  As its name "PTE" hints, it should only be applied to
> + * the leaves of pgtables.
> + */
> +#ifdef CONFIG_PTE_MARKER
> +#define SWP_PTE_MARKER_NUM 1
> +#define SWP_PTE_MARKER     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
> +			    SWP_MIGRATION_NUM + SWP_DEVICE_NUM)
> +#else
> +#define SWP_PTE_MARKER_NUM 0
> +#endif
> +
>  /*
>   * Unaddressable device memory support. See include/linux/hmm.h and
>   * Documentation/vm/hmm.rst. Short description is we need struct pages for
> @@ -107,7 +120,7 @@ static inline int current_is_kswapd(void)
>
>  #define MAX_SWAPFILES \
>  	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
> -	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
> +	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
>
>  /*
>   * Magic header for a swap area. The first part of the union is
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 32d517a28969..7a00627845f0 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -274,6 +274,84 @@ static inline int is_readable_migration_entry(swp_entry_t entry)
>
>  #endif
>
> +typedef unsigned long pte_marker;
> +
> +#define  PTE_MARKER_MASK     (0)
> +
> +#ifdef CONFIG_PTE_MARKER
> +
> +static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
> +{
> +	return swp_entry(SWP_PTE_MARKER, marker);
> +}
> +
> +static inline bool is_pte_marker_entry(swp_entry_t entry)
> +{
> +	return swp_type(entry) == SWP_PTE_MARKER;
> +}
> +
> +static inline pte_marker pte_marker_get(swp_entry_t entry)
> +{
> +	return swp_offset(entry) & PTE_MARKER_MASK;
> +}
> +
> +static inline bool is_pte_marker(pte_t pte)
> +{
> +	return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte));
> +}
> +
> +#else /* CONFIG_PTE_MARKER */
> +
> +static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
> +{
> +	/* This should never be called if !CONFIG_PTE_MARKER */
> +	WARN_ON_ONCE(1);
> +	return swp_entry(0, 0);
> +}
> +
> +static inline bool is_pte_marker_entry(swp_entry_t entry)
> +{
> +	return false;
> +}
> +
> +static inline pte_marker pte_marker_get(swp_entry_t entry)
> +{
> +	return 0;
> +}
> +
> +static inline bool is_pte_marker(pte_t pte)
> +{
> +	return false;
> +}
> +
> +#endif /* CONFIG_PTE_MARKER */
> +
> +static inline pte_t make_pte_marker(pte_marker marker)
> +{
> +	return swp_entry_to_pte(make_pte_marker_entry(marker));
> +}
> +
> +/*
> + * This is a special version to check pte_none() just to cover the case when
> + * the pte is a pte marker.  It existed because in many cases the pte marker
> + * should be seen as a none pte; it's just that we have stored some information
> + * onto the none pte so it becomes not-none any more.
> + *
> + * It should be used when the pte is file-backed, ram-based and backing
> + * userspace pages, like shmem.  It is not needed upon pgtables that do not
> + * support pte markers at all.  For example, it's not needed on anonymous
> + * memory, kernel-only memory (including when the system is during-boot),
> + * non-ram based generic file-system.  It's fine to be used even there, but the
> + * extra pte marker check will be pure overhead.
> + *
> + * For systems configured with !CONFIG_PTE_MARKER this will be automatically
> + * optimized to pte_none().
> + */
> +static inline int pte_none_mostly(pte_t pte)
> +{
> +	return pte_none(pte) || is_pte_marker(pte);
> +}
> +
>  static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
>  {
>  	struct page *p = pfn_to_page(swp_offset(entry));
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 034d87953600..a1688b9314b2 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -909,6 +909,12 @@ config ANON_VMA_NAME
>  	  area from being merged with adjacent virtual memory areas due to the
>  	  difference in their name.
>
> +config PTE_MARKER
> +	bool "Marker PTEs support"
> +
> +	help
> +	  Allows to create marker PTEs for file-backed memory.
> +
>  source "mm/damon/Kconfig"
>
>  endmenu
Peter Xu April 12, 2022, 7:45 p.m. UTC | #2
On Tue, Apr 12, 2022 at 11:07:56AM +1000, Alistair Popple wrote:
> Hi Peter,

Hi, Alistair,

> 
> I noticed this while reviewing the next patch in the series. I think you need to
> add CONFIG_PTE_MARKER to the below as well:
> 
> #if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
>     defined(CONFIG_DEVICE_PRIVATE)
> static inline int non_swap_entry(swp_entry_t entry)
> {
> 	return swp_type(entry) >= MAX_SWAPFILES;
> }
> #else
> static inline int non_swap_entry(swp_entry_t entry)
> {
> 	return 0;
> }
> #endif
> 
> Otherwise marker entries will be treated as swap entries, which is wrong for
> example in swapin_walk_pmd_entry() as marker entries are no longer considered
> pte_none().

Thanks for the comment, that makes sense.

Instead of adding PTE_MARKER into this equation, I'm going backward and
wondering purely on why we need to bother with non_swap_entry() at all if
MAX_SWAPFILES is already defined with proper knowledges of all these bits.

#define MAX_SWAPFILES \
	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)

So, I agree with your analysis, but instead of adding PTE_MARKER, what do
you think about we dropping that complexity as a whole (possibly with a
standalone patch)?

---8<---
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index d356ab4047f7..5af852b68805 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -387,18 +387,10 @@ static inline void num_poisoned_pages_inc(void)
 }
 #endif
 
-#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
-    defined(CONFIG_DEVICE_PRIVATE)
 static inline int non_swap_entry(swp_entry_t entry)
 {
        return swp_type(entry) >= MAX_SWAPFILES;
 }
-#else
-static inline int non_swap_entry(swp_entry_t entry)
-{
-       return 0;
-}
-#endif
 
 #endif /* CONFIG_MMU */
 #endif /* _LINUX_SWAPOPS_H */
---8<---

Thanks,
Alistair Popple April 13, 2022, 12:30 a.m. UTC | #3
Peter Xu <peterx@redhat.com> writes:

> On Tue, Apr 12, 2022 at 11:07:56AM +1000, Alistair Popple wrote:
>> Hi Peter,
>
> Hi, Alistair,
>
>>
>> I noticed this while reviewing the next patch in the series. I think you need to
>> add CONFIG_PTE_MARKER to the below as well:
>>
>> #if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
>>     defined(CONFIG_DEVICE_PRIVATE)
>> static inline int non_swap_entry(swp_entry_t entry)
>> {
>> 	return swp_type(entry) >= MAX_SWAPFILES;
>> }
>> #else
>> static inline int non_swap_entry(swp_entry_t entry)
>> {
>> 	return 0;
>> }
>> #endif
>>
>> Otherwise marker entries will be treated as swap entries, which is wrong for
>> example in swapin_walk_pmd_entry() as marker entries are no longer considered
>> pte_none().
>
> Thanks for the comment, that makes sense.
>
> Instead of adding PTE_MARKER into this equation, I'm going backward and
> wondering purely on why we need to bother with non_swap_entry() at all if
> MAX_SWAPFILES is already defined with proper knowledges of all these bits.

I was going to suggest it was to help the compiler optimise the non-swap entry
code away. But I just tested and it makes no difference in .text section size
either way so I think your suggestion is good unless that isn't true for other
architecture/compiler combinations (I only tried gcc-10.2.1 and x86_64).

That's a possibility because the optimisation isn't obvious to me at least.

non_swap_entry() is equivalent to:

(entry.val >> SWP_TYPE_SHIFT) >= MAX_SWAPFILES;
(entry.val >> (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)) >= (1<<5);
(entry.val >> (BITS_PER_LONG - 1 - 5)) >= (1<<5);
(entry.val >> 58) >= (1<<5);

Where entry.val is a long. So from that alone it's not obvious this could be
optimised away, because nothing there implies entry.val != (1<<63) which would
make the conditional true. But there's a lot of inlining going on in the
creation of swap entries which I didn't trace, so something must end up implying
entry.val < (1<<63).

>
> #define MAX_SWAPFILES \
> 	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
> 	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
>
> So, I agree with your analysis, but instead of adding PTE_MARKER, what do
> you think about we dropping that complexity as a whole (possibly with a
> standalone patch)?
>
> ---8<---
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index d356ab4047f7..5af852b68805 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -387,18 +387,10 @@ static inline void num_poisoned_pages_inc(void)
>  }
>  #endif
>
> -#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
> -    defined(CONFIG_DEVICE_PRIVATE)
>  static inline int non_swap_entry(swp_entry_t entry)
>  {
>         return swp_type(entry) >= MAX_SWAPFILES;
>  }
> -#else
> -static inline int non_swap_entry(swp_entry_t entry)
> -{
> -       return 0;
> -}
> -#endif
>
>  #endif /* CONFIG_MMU */
>  #endif /* _LINUX_SWAPOPS_H */
> ---8<---
>
> Thanks,
Peter Xu April 13, 2022, 1:44 p.m. UTC | #4
On Wed, Apr 13, 2022 at 10:30:33AM +1000, Alistair Popple wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Tue, Apr 12, 2022 at 11:07:56AM +1000, Alistair Popple wrote:
> >> Hi Peter,
> >
> > Hi, Alistair,
> >
> >>
> >> I noticed this while reviewing the next patch in the series. I think you need to
> >> add CONFIG_PTE_MARKER to the below as well:
> >>
> >> #if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION) || \
> >>     defined(CONFIG_DEVICE_PRIVATE)
> >> static inline int non_swap_entry(swp_entry_t entry)
> >> {
> >> 	return swp_type(entry) >= MAX_SWAPFILES;
> >> }
> >> #else
> >> static inline int non_swap_entry(swp_entry_t entry)
> >> {
> >> 	return 0;
> >> }
> >> #endif
> >>
> >> Otherwise marker entries will be treated as swap entries, which is wrong for
> >> example in swapin_walk_pmd_entry() as marker entries are no longer considered
> >> pte_none().
> >
> > Thanks for the comment, that makes sense.
> >
> > Instead of adding PTE_MARKER into this equation, I'm going backward and
> > wondering purely on why we need to bother with non_swap_entry() at all if
> > MAX_SWAPFILES is already defined with proper knowledges of all these bits.
> 
> I was going to suggest it was to help the compiler optimise the non-swap entry
> code away. But I just tested and it makes no difference in .text section size
> either way so I think your suggestion is good unless that isn't true for other
> architecture/compiler combinations (I only tried gcc-10.2.1 and x86_64).
> 
> That's a possibility because the optimisation isn't obvious to me at least.
> 
> non_swap_entry() is equivalent to:
> 
> (entry.val >> SWP_TYPE_SHIFT) >= MAX_SWAPFILES;
> (entry.val >> (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)) >= (1<<5);
> (entry.val >> (BITS_PER_LONG - 1 - 5)) >= (1<<5);
> (entry.val >> 58) >= (1<<5);
> 
> Where entry.val is a long. So from that alone it's not obvious this could be
> optimised away, because nothing there implies entry.val != (1<<63) which would
> make the conditional true. But there's a lot of inlining going on in the
> creation of swap entries which I didn't trace, so something must end up implying
> entry.val < (1<<63).

I think my point was that we check non_swap_entry() with a pre-assumption
that it's a swap entry, then it means it's the slow path already after
we've parsed the pte entry and be aware it's not present.

So I'm doubting how much the optimization (even if at last, applicable)
could help in reality, not to mention that it'll only have an effect when
all of the configs are not set, so it's a micro optimization on slow path
in a rare config setup.

For any sane modern hosts I'd expect CONFIG_MIGRATION should at least be
set.. then it invalidates any potential optimization we're discussing here.

Let me post a patch for this and move the discussion there.  Thanks a lot
for looking into it.
Alistair Popple April 19, 2022, 8:25 a.m. UTC | #5
Peter Xu <peterx@redhat.com> writes:

> This patch introduces a new swap entry type called PTE_MARKER.  It can be
> installed for any pte that maps a file-backed memory when the pte is
> temporarily zapped, so as to maintain per-pte information.

Hi Peter,

Is there something I have missed that means PTE markers can only be used with
file-backed memory? Obviously that's all you care about for this patch series,
but if we needed to mark some anonymous PTE for special processing is there
anything that would prevent us using a PTE marker? Specifically I was thinking
about it in relation to this series:
<https://lore.kernel.org/linux-mm/87pmldjxiq.fsf@nvdebian.thelocal/>

> The information that kept in the pte is called a "marker".  Here we define the
> marker as "unsigned long" just to match pgoff_t, however it will only work if
> it still fits in swp_offset(), which is e.g. currently 58 bits on x86_64.
>
> A new config CONFIG_PTE_MARKER is introduced too; it's by default off.  A bunch
> of helpers are defined altogether to service the rest of the pte marker code.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/asm-generic/hugetlb.h |  9 ++++
>  include/linux/swap.h          | 15 ++++++-
>  include/linux/swapops.h       | 78 +++++++++++++++++++++++++++++++++++
>  mm/Kconfig                    |  6 +++
>  4 files changed, 107 insertions(+), 1 deletion(-)
>
> diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
> index 8e1e6244a89d..f39cad20ffc6 100644
> --- a/include/asm-generic/hugetlb.h
> +++ b/include/asm-generic/hugetlb.h
> @@ -2,6 +2,9 @@
>  #ifndef _ASM_GENERIC_HUGETLB_H
>  #define _ASM_GENERIC_HUGETLB_H
>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +
>  static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
>  {
>  	return mk_pte(page, pgprot);
> @@ -80,6 +83,12 @@ static inline int huge_pte_none(pte_t pte)
>  }
>  #endif
>
> +/* Please refer to comments above pte_none_mostly() for the usage */
> +static inline int huge_pte_none_mostly(pte_t pte)
> +{
> +	return huge_pte_none(pte) || is_pte_marker(pte);
> +}
> +
>  #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
>  static inline pte_t huge_pte_wrprotect(pte_t pte)
>  {
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7daae5a4b3e1..5553189d0215 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -55,6 +55,19 @@ static inline int current_is_kswapd(void)
>   * actions on faults.
>   */
>
> +/*
> + * PTE markers are used to persist information onto PTEs that are mapped with
> + * file-backed memories.  As its name "PTE" hints, it should only be applied to
> + * the leaves of pgtables.
> + */
> +#ifdef CONFIG_PTE_MARKER
> +#define SWP_PTE_MARKER_NUM 1
> +#define SWP_PTE_MARKER     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
> +			    SWP_MIGRATION_NUM + SWP_DEVICE_NUM)
> +#else
> +#define SWP_PTE_MARKER_NUM 0
> +#endif
> +
>  /*
>   * Unaddressable device memory support. See include/linux/hmm.h and
>   * Documentation/vm/hmm.rst. Short description is we need struct pages for
> @@ -107,7 +120,7 @@ static inline int current_is_kswapd(void)
>
>  #define MAX_SWAPFILES \
>  	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
> -	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
> +	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
>
>  /*
>   * Magic header for a swap area. The first part of the union is
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 32d517a28969..7a00627845f0 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -274,6 +274,84 @@ static inline int is_readable_migration_entry(swp_entry_t entry)
>
>  #endif
>
> +typedef unsigned long pte_marker;
> +
> +#define  PTE_MARKER_MASK     (0)
> +
> +#ifdef CONFIG_PTE_MARKER
> +
> +static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
> +{
> +	return swp_entry(SWP_PTE_MARKER, marker);
> +}
> +
> +static inline bool is_pte_marker_entry(swp_entry_t entry)
> +{
> +	return swp_type(entry) == SWP_PTE_MARKER;
> +}
> +
> +static inline pte_marker pte_marker_get(swp_entry_t entry)
> +{
> +	return swp_offset(entry) & PTE_MARKER_MASK;
> +}
> +
> +static inline bool is_pte_marker(pte_t pte)
> +{
> +	return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte));
> +}
> +
> +#else /* CONFIG_PTE_MARKER */
> +
> +static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
> +{
> +	/* This should never be called if !CONFIG_PTE_MARKER */
> +	WARN_ON_ONCE(1);
> +	return swp_entry(0, 0);
> +}
> +
> +static inline bool is_pte_marker_entry(swp_entry_t entry)
> +{
> +	return false;
> +}
> +
> +static inline pte_marker pte_marker_get(swp_entry_t entry)
> +{
> +	return 0;
> +}
> +
> +static inline bool is_pte_marker(pte_t pte)
> +{
> +	return false;
> +}
> +
> +#endif /* CONFIG_PTE_MARKER */
> +
> +static inline pte_t make_pte_marker(pte_marker marker)
> +{
> +	return swp_entry_to_pte(make_pte_marker_entry(marker));
> +}
> +
> +/*
> + * This is a special version to check pte_none() just to cover the case when
> + * the pte is a pte marker.  It existed because in many cases the pte marker
> + * should be seen as a none pte; it's just that we have stored some information
> + * onto the none pte so it becomes not-none any more.
> + *
> + * It should be used when the pte is file-backed, ram-based and backing
> + * userspace pages, like shmem.  It is not needed upon pgtables that do not
> + * support pte markers at all.  For example, it's not needed on anonymous
> + * memory, kernel-only memory (including when the system is during-boot),
> + * non-ram based generic file-system.  It's fine to be used even there, but the
> + * extra pte marker check will be pure overhead.
> + *
> + * For systems configured with !CONFIG_PTE_MARKER this will be automatically
> + * optimized to pte_none().
> + */
> +static inline int pte_none_mostly(pte_t pte)
> +{
> +	return pte_none(pte) || is_pte_marker(pte);
> +}
> +
>  static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
>  {
>  	struct page *p = pfn_to_page(swp_offset(entry));
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 034d87953600..a1688b9314b2 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -909,6 +909,12 @@ config ANON_VMA_NAME
>  	  area from being merged with adjacent virtual memory areas due to the
>  	  difference in their name.
>
> +config PTE_MARKER
> +	bool "Marker PTEs support"
> +
> +	help
> +	  Allows to create marker PTEs for file-backed memory.
> +
>  source "mm/damon/Kconfig"
>
>  endmenu
Peter Xu April 19, 2022, 7:44 p.m. UTC | #6
On Tue, Apr 19, 2022 at 06:25:31PM +1000, Alistair Popple wrote:
> Hi Peter,

Hi, Alistair,

> 
> Is there something I have missed that means PTE markers can only be used with
> file-backed memory? Obviously that's all you care about for this patch series,
> but if we needed to mark some anonymous PTE for special processing is there
> anything that would prevent us using a PTE marker? Specifically I was thinking
> about it in relation to this series:
> <https://lore.kernel.org/linux-mm/87pmldjxiq.fsf@nvdebian.thelocal/>

It's not necessarily to be restricted to file-backed.  All the file-backed
check here in this series was just for safety purpose and nothing else.

I think it's a very good example of that swap-read-error case that pte
marker does sound like a great fit, but let's see whether people would
still like to stick with hwpoison which makes some sense too.  Then let's
keep the discussion in that thread.

Thanks for the pointer!
diff mbox series

Patch

diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 8e1e6244a89d..f39cad20ffc6 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -2,6 +2,9 @@ 
 #ifndef _ASM_GENERIC_HUGETLB_H
 #define _ASM_GENERIC_HUGETLB_H
 
+#include <linux/swap.h>
+#include <linux/swapops.h>
+
 static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
 {
 	return mk_pte(page, pgprot);
@@ -80,6 +83,12 @@  static inline int huge_pte_none(pte_t pte)
 }
 #endif
 
+/* Please refer to comments above pte_none_mostly() for the usage */
+static inline int huge_pte_none_mostly(pte_t pte)
+{
+	return huge_pte_none(pte) || is_pte_marker(pte);
+}
+
 #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
 static inline pte_t huge_pte_wrprotect(pte_t pte)
 {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7daae5a4b3e1..5553189d0215 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -55,6 +55,19 @@  static inline int current_is_kswapd(void)
  * actions on faults.
  */
 
+/*
+ * PTE markers are used to persist information onto PTEs that are mapped with
+ * file-backed memories.  As its name "PTE" hints, it should only be applied to
+ * the leaves of pgtables.
+ */
+#ifdef CONFIG_PTE_MARKER
+#define SWP_PTE_MARKER_NUM 1
+#define SWP_PTE_MARKER     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
+			    SWP_MIGRATION_NUM + SWP_DEVICE_NUM)
+#else
+#define SWP_PTE_MARKER_NUM 0
+#endif
+
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
  * Documentation/vm/hmm.rst. Short description is we need struct pages for
@@ -107,7 +120,7 @@  static inline int current_is_kswapd(void)
 
 #define MAX_SWAPFILES \
 	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
-	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 32d517a28969..7a00627845f0 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -274,6 +274,84 @@  static inline int is_readable_migration_entry(swp_entry_t entry)
 
 #endif
 
+typedef unsigned long pte_marker;
+
+#define  PTE_MARKER_MASK     (0)
+
+#ifdef CONFIG_PTE_MARKER
+
+static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
+{
+	return swp_entry(SWP_PTE_MARKER, marker);
+}
+
+static inline bool is_pte_marker_entry(swp_entry_t entry)
+{
+	return swp_type(entry) == SWP_PTE_MARKER;
+}
+
+static inline pte_marker pte_marker_get(swp_entry_t entry)
+{
+	return swp_offset(entry) & PTE_MARKER_MASK;
+}
+
+static inline bool is_pte_marker(pte_t pte)
+{
+	return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte));
+}
+
+#else /* CONFIG_PTE_MARKER */
+
+static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
+{
+	/* This should never be called if !CONFIG_PTE_MARKER */
+	WARN_ON_ONCE(1);
+	return swp_entry(0, 0);
+}
+
+static inline bool is_pte_marker_entry(swp_entry_t entry)
+{
+	return false;
+}
+
+static inline pte_marker pte_marker_get(swp_entry_t entry)
+{
+	return 0;
+}
+
+static inline bool is_pte_marker(pte_t pte)
+{
+	return false;
+}
+
+#endif /* CONFIG_PTE_MARKER */
+
+static inline pte_t make_pte_marker(pte_marker marker)
+{
+	return swp_entry_to_pte(make_pte_marker_entry(marker));
+}
+
+/*
+ * This is a special version to check pte_none() just to cover the case when
+ * the pte is a pte marker.  It existed because in many cases the pte marker
+ * should be seen as a none pte; it's just that we have stored some information
+ * onto the none pte so it becomes not-none any more.
+ *
+ * It should be used when the pte is file-backed, ram-based and backing
+ * userspace pages, like shmem.  It is not needed upon pgtables that do not
+ * support pte markers at all.  For example, it's not needed on anonymous
+ * memory, kernel-only memory (including when the system is during-boot),
+ * non-ram based generic file-system.  It's fine to be used even there, but the
+ * extra pte marker check will be pure overhead.
+ *
+ * For systems configured with !CONFIG_PTE_MARKER this will be automatically
+ * optimized to pte_none().
+ */
+static inline int pte_none_mostly(pte_t pte)
+{
+	return pte_none(pte) || is_pte_marker(pte);
+}
+
 static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
 {
 	struct page *p = pfn_to_page(swp_offset(entry));
diff --git a/mm/Kconfig b/mm/Kconfig
index 034d87953600..a1688b9314b2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -909,6 +909,12 @@  config ANON_VMA_NAME
 	  area from being merged with adjacent virtual memory areas due to the
 	  difference in their name.
 
+config PTE_MARKER
+	bool "Marker PTEs support"
+
+	help
+	  Allows to create marker PTEs for file-backed memory.
+
 source "mm/damon/Kconfig"
 
 endmenu