diff mbox series

[v2,08/26] userfaultfd: wp: add WP pagetable tracking to x86

Message ID 20190212025632.28946-9-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: write protection support | expand

Commit Message

Peter Xu Feb. 12, 2019, 2:56 a.m. UTC
From: Andrea Arcangeli <aarcange@redhat.com>

Accurate userfaultfd WP tracking is possible by tracking exactly which
virtual memory ranges were writeprotected by userland. We can't rely
only on the RW bit of the mapped pagetable because that information is
destroyed by fork() or KSM or swap. If we were to rely on that, we'd
need to stay on the safe side and generate false positive wp faults
for every swapped out page.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 arch/x86/Kconfig                     |  1 +
 arch/x86/include/asm/pgtable.h       | 52 ++++++++++++++++++++++++++++
 arch/x86/include/asm/pgtable_64.h    |  8 ++++-
 arch/x86/include/asm/pgtable_types.h |  9 +++++
 include/asm-generic/pgtable.h        |  1 +
 include/asm-generic/pgtable_uffd.h   | 51 +++++++++++++++++++++++++++
 init/Kconfig                         |  5 +++
 7 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 include/asm-generic/pgtable_uffd.h

Comments

Jerome Glisse Feb. 21, 2019, 5:20 p.m. UTC | #1
On Tue, Feb 12, 2019 at 10:56:14AM +0800, Peter Xu wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> Accurate userfaultfd WP tracking is possible by tracking exactly which
> virtual memory ranges were writeprotected by userland. We can't rely
> only on the RW bit of the mapped pagetable because that information is
> destroyed by fork() or KSM or swap. If we were to rely on that, we'd
> need to stay on the safe side and generate false positive wp faults
> for every swapped out page.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>

So I thought about this some more, and the only alternative I see is
defining a new swap type to preserve the pte write bit when swapping,
and storing the original pte write bit within the ksm stable_node. This
would solve the false positives for swap and ksm.

But I do not see this as a better alternative to storing the wp status
as a bit in the pte. So:

Reviewed-by: Jérôme Glisse <jglisse@redhat.com>

> ---
>  arch/x86/Kconfig                     |  1 +
>  arch/x86/include/asm/pgtable.h       | 52 ++++++++++++++++++++++++++++
>  arch/x86/include/asm/pgtable_64.h    |  8 ++++-
>  arch/x86/include/asm/pgtable_types.h |  9 +++++
>  include/asm-generic/pgtable.h        |  1 +
>  include/asm-generic/pgtable_uffd.h   | 51 +++++++++++++++++++++++++++
>  init/Kconfig                         |  5 +++
>  7 files changed, 126 insertions(+), 1 deletion(-)
>  create mode 100644 include/asm-generic/pgtable_uffd.h
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 68261430fe6e..cb43bc008675 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -209,6 +209,7 @@ config X86
>  	select USER_STACKTRACE_SUPPORT
>  	select VIRT_TO_BUS
>  	select X86_FEATURE_NAMES		if PROC_FS
> +	select HAVE_ARCH_USERFAULTFD_WP		if USERFAULTFD
>  
>  config INSTRUCTION_DECODER
>  	def_bool y
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 2779ace16d23..6863236e8484 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -23,6 +23,7 @@
>  
>  #ifndef __ASSEMBLY__
>  #include <asm/x86_init.h>
> +#include <asm-generic/pgtable_uffd.h>
>  
>  extern pgd_t early_top_pgt[PTRS_PER_PGD];
>  int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
> @@ -293,6 +294,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
>  	return native_make_pte(v & ~clear);
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pte_uffd_wp(pte_t pte)
> +{
> +	return pte_flags(pte) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> +	return pte_set_flags(pte, _PAGE_UFFD_WP);
> +}
> +
> +static inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> +	return pte_clear_flags(pte, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  static inline pte_t pte_mkclean(pte_t pte)
>  {
>  	return pte_clear_flags(pte, _PAGE_DIRTY);
> @@ -372,6 +390,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
>  	return native_make_pmd(v & ~clear);
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pmd_uffd_wp(pmd_t pmd)
> +{
> +	return pmd_flags(pmd) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> +	return pmd_set_flags(pmd, _PAGE_UFFD_WP);
> +}
> +
> +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> +	return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  static inline pmd_t pmd_mkold(pmd_t pmd)
>  {
>  	return pmd_clear_flags(pmd, _PAGE_ACCESSED);
> @@ -1351,6 +1386,23 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
>  #endif
>  #endif
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> +	return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +
> +static inline int pte_swp_uffd_wp(pte_t pte)
> +{
> +	return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
> +}
> +
> +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> +	return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  #define PKRU_AD_BIT 0x1
>  #define PKRU_WD_BIT 0x2
>  #define PKRU_BITS_PER_PKEY 2
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index 9c85b54bf03c..e0c5d29b8685 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
>   *
>   * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
>   * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
> - * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|X|SD|0| <- swp entry
> + * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
>   *
>   * G (8) is aliased and used as a PROT_NONE indicator for
>   * !present ptes.  We need to start storing swap entries above
> @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
>   * erratum where they can be incorrectly set by hardware on
>   * non-present PTEs.
>   *
> + * SD Bits 1-4 are not used in non-present format and available for
> + * special use described below:
> + *
>   * SD (1) in swp entry is used to store soft dirty bit, which helps us
>   * remember soft dirty over page migration
>   *
> + * F (2) in swp entry is used to record when a pagetable is
> + * writeprotected by userfaultfd WP support.
> + *
>   * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
>   * but also L and G.
>   *
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index d6ff0bbdb394..8cebcff91e57 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -32,6 +32,7 @@
>  
>  #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
>  #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
> +#define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
>  #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
>  #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
>  
> @@ -100,6 +101,14 @@
>  #define _PAGE_SWP_SOFT_DIRTY	(_AT(pteval_t, 0))
>  #endif
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +#define _PAGE_UFFD_WP		(_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
> +#define _PAGE_SWP_UFFD_WP	_PAGE_USER
> +#else
> +#define _PAGE_UFFD_WP		(_AT(pteval_t, 0))
> +#define _PAGE_SWP_UFFD_WP	(_AT(pteval_t, 0))
> +#endif
> +
>  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
>  #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
>  #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index 05e61e6c843f..f49afe951711 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -10,6 +10,7 @@
>  #include <linux/mm_types.h>
>  #include <linux/bug.h>
>  #include <linux/errno.h>
> +#include <asm-generic/pgtable_uffd.h>
>  
>  #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
>  	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
> diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
> new file mode 100644
> index 000000000000..643d1bf559c2
> --- /dev/null
> +++ b/include/asm-generic/pgtable_uffd.h
> @@ -0,0 +1,51 @@
> +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
> +#define _ASM_GENERIC_PGTABLE_UFFD_H
> +
> +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static __always_inline int pte_uffd_wp(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +static __always_inline int pmd_uffd_wp(pmd_t pmd)
> +{
> +	return 0;
> +}
> +
> +static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> +	return pmd;
> +}
> +
> +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> +	return pmd;
> +}
> +
> +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline int pte_swp_uffd_wp(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
> +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index c9386a365eea..892d61ddf2eb 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1424,6 +1424,11 @@ config ADVISE_SYSCALLS
>  	  applications use these syscalls, you can disable this option to save
>  	  space.
>  
> +config HAVE_ARCH_USERFAULTFD_WP
> +	bool
> +	help
> +	  Arch has userfaultfd write protection support
> +
>  config MEMBARRIER
>  	bool "Enable membarrier() system call" if EXPERT
>  	default y
> -- 
> 2.17.1
>
Mike Rapoport Feb. 25, 2019, 3:48 p.m. UTC | #2
On Tue, Feb 12, 2019 at 10:56:14AM +0800, Peter Xu wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> Accurate userfaultfd WP tracking is possible by tracking exactly which
> virtual memory ranges were writeprotected by userland. We can't rely
> only on the RW bit of the mapped pagetable because that information is
> destroyed by fork() or KSM or swap. If we were to rely on that, we'd
> need to stay on the safe side and generate false positive wp faults
> for every swapped out page.
> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>

> ---
>  arch/x86/Kconfig                     |  1 +
>  arch/x86/include/asm/pgtable.h       | 52 ++++++++++++++++++++++++++++
>  arch/x86/include/asm/pgtable_64.h    |  8 ++++-
>  arch/x86/include/asm/pgtable_types.h |  9 +++++
>  include/asm-generic/pgtable.h        |  1 +
>  include/asm-generic/pgtable_uffd.h   | 51 +++++++++++++++++++++++++++
>  init/Kconfig                         |  5 +++
>  7 files changed, 126 insertions(+), 1 deletion(-)
>  create mode 100644 include/asm-generic/pgtable_uffd.h
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 68261430fe6e..cb43bc008675 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -209,6 +209,7 @@ config X86
>  	select USER_STACKTRACE_SUPPORT
>  	select VIRT_TO_BUS
>  	select X86_FEATURE_NAMES		if PROC_FS
> +	select HAVE_ARCH_USERFAULTFD_WP		if USERFAULTFD
> 
>  config INSTRUCTION_DECODER
>  	def_bool y
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 2779ace16d23..6863236e8484 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -23,6 +23,7 @@
> 
>  #ifndef __ASSEMBLY__
>  #include <asm/x86_init.h>
> +#include <asm-generic/pgtable_uffd.h>
> 
>  extern pgd_t early_top_pgt[PTRS_PER_PGD];
>  int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
> @@ -293,6 +294,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
>  	return native_make_pte(v & ~clear);
>  }
> 
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pte_uffd_wp(pte_t pte)
> +{
> +	return pte_flags(pte) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> +	return pte_set_flags(pte, _PAGE_UFFD_WP);
> +}
> +
> +static inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> +	return pte_clear_flags(pte, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  static inline pte_t pte_mkclean(pte_t pte)
>  {
>  	return pte_clear_flags(pte, _PAGE_DIRTY);
> @@ -372,6 +390,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
>  	return native_make_pmd(v & ~clear);
>  }
> 
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pmd_uffd_wp(pmd_t pmd)
> +{
> +	return pmd_flags(pmd) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> +	return pmd_set_flags(pmd, _PAGE_UFFD_WP);
> +}
> +
> +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> +	return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  static inline pmd_t pmd_mkold(pmd_t pmd)
>  {
>  	return pmd_clear_flags(pmd, _PAGE_ACCESSED);
> @@ -1351,6 +1386,23 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
>  #endif
>  #endif
> 
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> +	return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +
> +static inline int pte_swp_uffd_wp(pte_t pte)
> +{
> +	return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
> +}
> +
> +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> +	return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  #define PKRU_AD_BIT 0x1
>  #define PKRU_WD_BIT 0x2
>  #define PKRU_BITS_PER_PKEY 2
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index 9c85b54bf03c..e0c5d29b8685 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
>   *
>   * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
>   * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
> - * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|X|SD|0| <- swp entry
> + * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
>   *
>   * G (8) is aliased and used as a PROT_NONE indicator for
>   * !present ptes.  We need to start storing swap entries above
> @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
>   * erratum where they can be incorrectly set by hardware on
>   * non-present PTEs.
>   *
> + * SD Bits 1-4 are not used in non-present format and available for
> + * special use described below:
> + *
>   * SD (1) in swp entry is used to store soft dirty bit, which helps us
>   * remember soft dirty over page migration
>   *
> + * F (2) in swp entry is used to record when a pagetable is
> + * writeprotected by userfaultfd WP support.
> + *
>   * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
>   * but also L and G.
>   *
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index d6ff0bbdb394..8cebcff91e57 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -32,6 +32,7 @@
> 
>  #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
>  #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
> +#define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
>  #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
>  #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
> 
> @@ -100,6 +101,14 @@
>  #define _PAGE_SWP_SOFT_DIRTY	(_AT(pteval_t, 0))
>  #endif
> 
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +#define _PAGE_UFFD_WP		(_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
> +#define _PAGE_SWP_UFFD_WP	_PAGE_USER
> +#else
> +#define _PAGE_UFFD_WP		(_AT(pteval_t, 0))
> +#define _PAGE_SWP_UFFD_WP	(_AT(pteval_t, 0))
> +#endif
> +
>  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
>  #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
>  #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index 05e61e6c843f..f49afe951711 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -10,6 +10,7 @@
>  #include <linux/mm_types.h>
>  #include <linux/bug.h>
>  #include <linux/errno.h>
> +#include <asm-generic/pgtable_uffd.h>
> 
>  #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
>  	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
> diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
> new file mode 100644
> index 000000000000..643d1bf559c2
> --- /dev/null
> +++ b/include/asm-generic/pgtable_uffd.h
> @@ -0,0 +1,51 @@
> +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
> +#define _ASM_GENERIC_PGTABLE_UFFD_H
> +
> +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static __always_inline int pte_uffd_wp(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +static __always_inline int pmd_uffd_wp(pmd_t pmd)
> +{
> +	return 0;
> +}
> +
> +static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> +	return pmd;
> +}
> +
> +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> +	return pmd;
> +}
> +
> +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline int pte_swp_uffd_wp(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
> +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index c9386a365eea..892d61ddf2eb 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1424,6 +1424,11 @@ config ADVISE_SYSCALLS
>  	  applications use these syscalls, you can disable this option to save
>  	  space.
> 
> +config HAVE_ARCH_USERFAULTFD_WP
> +	bool
> +	help
> +	  Arch has userfaultfd write protection support
> +
>  config MEMBARRIER
>  	bool "Enable membarrier() system call" if EXPERT
>  	default y
> -- 
> 2.17.1
>
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68261430fe6e..cb43bc008675 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -209,6 +209,7 @@  config X86
 	select USER_STACKTRACE_SUPPORT
 	select VIRT_TO_BUS
 	select X86_FEATURE_NAMES		if PROC_FS
+	select HAVE_ARCH_USERFAULTFD_WP		if USERFAULTFD
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2779ace16d23..6863236e8484 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -23,6 +23,7 @@ 
 
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
+#include <asm-generic/pgtable_uffd.h>
 
 extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@@ -293,6 +294,23 @@  static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
 	return native_make_pte(v & ~clear);
 }
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pte_uffd_wp(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_UFFD_WP;
+}
+
+static inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+	return pte_set_flags(pte, _PAGE_UFFD_WP);
+}
+
+static inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+	return pte_clear_flags(pte, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 static inline pte_t pte_mkclean(pte_t pte)
 {
 	return pte_clear_flags(pte, _PAGE_DIRTY);
@@ -372,6 +390,23 @@  static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
 	return native_make_pmd(v & ~clear);
 }
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pmd_uffd_wp(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_UFFD_WP;
+}
+
+static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_UFFD_WP);
+}
+
+static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+	return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 static inline pmd_t pmd_mkold(pmd_t pmd)
 {
 	return pmd_clear_flags(pmd, _PAGE_ACCESSED);
@@ -1351,6 +1386,23 @@  static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
 #endif
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+	return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pte_swp_uffd_wp(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+	return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 #define PKRU_AD_BIT 0x1
 #define PKRU_WD_BIT 0x2
 #define PKRU_BITS_PER_PKEY 2
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9c85b54bf03c..e0c5d29b8685 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -189,7 +189,7 @@  extern void sync_global_pgds(unsigned long start, unsigned long end);
  *
  * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
  * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
  *
  * G (8) is aliased and used as a PROT_NONE indicator for
  * !present ptes.  We need to start storing swap entries above
@@ -197,9 +197,15 @@  extern void sync_global_pgds(unsigned long start, unsigned long end);
  * erratum where they can be incorrectly set by hardware on
  * non-present PTEs.
  *
+ * Bits 1-4 are not used in non-present format and are available for
+ * the special uses described below:
+ *
  * SD (1) in swp entry is used to store soft dirty bit, which helps us
  * remember soft dirty over page migration
  *
+ * F (2) in swp entry is used to record when a pagetable is
+ * writeprotected by userfaultfd WP support.
+ *
  * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
  * but also L and G.
  *
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d6ff0bbdb394..8cebcff91e57 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -32,6 +32,7 @@ 
 
 #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
+#define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
 
@@ -100,6 +101,14 @@ 
 #define _PAGE_SWP_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+#define _PAGE_UFFD_WP		(_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
+#define _PAGE_SWP_UFFD_WP	_PAGE_USER
+#else
+#define _PAGE_UFFD_WP		(_AT(pteval_t, 0))
+#define _PAGE_SWP_UFFD_WP	(_AT(pteval_t, 0))
+#endif
+
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 05e61e6c843f..f49afe951711 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -10,6 +10,7 @@ 
 #include <linux/mm_types.h>
 #include <linux/bug.h>
 #include <linux/errno.h>
+#include <asm-generic/pgtable_uffd.h>
 
 #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
 	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
new file mode 100644
index 000000000000..643d1bf559c2
--- /dev/null
+++ b/include/asm-generic/pgtable_uffd.h
@@ -0,0 +1,51 @@ 
+#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
+#define _ASM_GENERIC_PGTABLE_UFFD_H
+
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static __always_inline int pte_uffd_wp(pte_t pte)
+{
+	return 0;
+}
+
+static __always_inline int pmd_uffd_wp(pmd_t pmd)
+{
+	return 0;
+}
+
+static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+	return pte;
+}
+
+static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+	return pmd;
+}
+
+static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+	return pte;
+}
+
+static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+	return pmd;
+}
+
+static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+	return pte;
+}
+
+static __always_inline int pte_swp_uffd_wp(pte_t pte)
+{
+	return 0;
+}
+
+static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+	return pte;
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
+#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
diff --git a/init/Kconfig b/init/Kconfig
index c9386a365eea..892d61ddf2eb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1424,6 +1424,11 @@  config ADVISE_SYSCALLS
 	  applications use these syscalls, you can disable this option to save
 	  space.
 
+config HAVE_ARCH_USERFAULTFD_WP
+	bool
+	help
+	  Arch has userfaultfd write protection support
+
 config MEMBARRIER
 	bool "Enable membarrier() system call" if EXPERT
 	default y