diff mbox series

[RFC,10/24] userfaultfd: wp: add WP pagetable tracking to x86

Message ID 20190121075722.7945-11-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: write protection support | expand

Commit Message

Peter Xu Jan. 21, 2019, 7:57 a.m. UTC
From: Andrea Arcangeli <aarcange@redhat.com>

Accurate userfaultfd WP tracking is possible by tracking exactly which
virtual memory ranges were writeprotected by userland. We can't rely
only on the RW bit of the mapped pagetable because that information is
destroyed by fork() or KSM or swap. If we were to rely on that, we'd
need to stay on the safe side and generate false positive wp faults
for every swapped out page.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 arch/x86/Kconfig                     |  1 +
 arch/x86/include/asm/pgtable.h       | 52 ++++++++++++++++++++++++++++
 arch/x86/include/asm/pgtable_64.h    |  8 ++++-
 arch/x86/include/asm/pgtable_types.h |  9 +++++
 include/asm-generic/pgtable.h        |  1 +
 include/asm-generic/pgtable_uffd.h   | 51 +++++++++++++++++++++++++++
 init/Kconfig                         |  5 +++
 7 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 include/asm-generic/pgtable_uffd.h

Comments

Jerome Glisse Jan. 21, 2019, 3:09 p.m. UTC | #1
On Mon, Jan 21, 2019 at 03:57:08PM +0800, Peter Xu wrote:
> From: Andrea Arcangeli <aarcange@redhat.com>
> 
> Accurate userfaultfd WP tracking is possible by tracking exactly which
> virtual memory ranges were writeprotected by userland. We can't relay
> only on the RW bit of the mapped pagetable because that information is
> destroyed by fork() or KSM or swap. If we were to relay on that, we'd
> need to stay on the safe side and generate false positive wp faults
> for every swapped out page.

So you want to forward write fault (of a protected range) to user space
only if page is not write protected because of fork(), KSM or swap.

This write protection feature is only for anonymous page right ? Other-
wise how would you protect a share page (ie anyone can look it up and
call page_mkwrite on it and start writing to it) ?

So for anonymous page for fork() the mapcount will tell you if page is
write protected for COW. For KSM it is easy to check the page flag.

For swap you can use the page lock to synchronize. A page that is
write protected because of swap is write protected because it is being
written to disk, thus either under page lock, or with PageWriteback()
returning true while write is on going.

So to me it seems you could properly identify if a page is write
protected for fork, swap or KSM without a new flag.

Cheers,
Jérôme

> 
> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  arch/x86/Kconfig                     |  1 +
>  arch/x86/include/asm/pgtable.h       | 52 ++++++++++++++++++++++++++++
>  arch/x86/include/asm/pgtable_64.h    |  8 ++++-
>  arch/x86/include/asm/pgtable_types.h |  9 +++++
>  include/asm-generic/pgtable.h        |  1 +
>  include/asm-generic/pgtable_uffd.h   | 51 +++++++++++++++++++++++++++
>  init/Kconfig                         |  5 +++
>  7 files changed, 126 insertions(+), 1 deletion(-)
>  create mode 100644 include/asm-generic/pgtable_uffd.h
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 8689e794a43c..096c773452d0 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -207,6 +207,7 @@ config X86
>  	select USER_STACKTRACE_SUPPORT
>  	select VIRT_TO_BUS
>  	select X86_FEATURE_NAMES		if PROC_FS
> +	select HAVE_ARCH_USERFAULTFD_WP		if USERFAULTFD
>  
>  config INSTRUCTION_DECODER
>  	def_bool y
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 40616e805292..7a71158982f4 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -23,6 +23,7 @@
>  
>  #ifndef __ASSEMBLY__
>  #include <asm/x86_init.h>
> +#include <asm-generic/pgtable_uffd.h>
>  
>  extern pgd_t early_top_pgt[PTRS_PER_PGD];
>  int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
> @@ -293,6 +294,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
>  	return native_make_pte(v & ~clear);
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pte_uffd_wp(pte_t pte)
> +{
> +	return pte_flags(pte) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> +	return pte_set_flags(pte, _PAGE_UFFD_WP);
> +}
> +
> +static inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> +	return pte_clear_flags(pte, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  static inline pte_t pte_mkclean(pte_t pte)
>  {
>  	return pte_clear_flags(pte, _PAGE_DIRTY);
> @@ -372,6 +390,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
>  	return native_make_pmd(v & ~clear);
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline int pmd_uffd_wp(pmd_t pmd)
> +{
> +	return pmd_flags(pmd) & _PAGE_UFFD_WP;
> +}
> +
> +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> +	return pmd_set_flags(pmd, _PAGE_UFFD_WP);
> +}
> +
> +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> +	return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  static inline pmd_t pmd_mkold(pmd_t pmd)
>  {
>  	return pmd_clear_flags(pmd, _PAGE_ACCESSED);
> @@ -1351,6 +1386,23 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
>  #endif
>  #endif
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> +	return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +
> +static inline int pte_swp_uffd_wp(pte_t pte)
> +{
> +	return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
> +}
> +
> +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> +	return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
>  #define PKRU_AD_BIT 0x1
>  #define PKRU_WD_BIT 0x2
>  #define PKRU_BITS_PER_PKEY 2
> diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
> index 9c85b54bf03c..e0c5d29b8685 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
>   *
>   * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
>   * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
> - * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|X|SD|0| <- swp entry
> + * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
>   *
>   * G (8) is aliased and used as a PROT_NONE indicator for
>   * !present ptes.  We need to start storing swap entries above
> @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end);
>   * erratum where they can be incorrectly set by hardware on
>   * non-present PTEs.
>   *
> + * SD Bits 1-4 are not used in non-present format and available for
> + * special use described below:
> + *
>   * SD (1) in swp entry is used to store soft dirty bit, which helps us
>   * remember soft dirty over page migration
>   *
> + * F (2) in swp entry is used to record when a pagetable is
> + * writeprotected by userfaultfd WP support.
> + *
>   * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
>   * but also L and G.
>   *
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 106b7d0e2dae..163043ab142d 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -32,6 +32,7 @@
>  
>  #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
>  #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
> +#define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
>  #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
>  #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
>  
> @@ -100,6 +101,14 @@
>  #define _PAGE_SWP_SOFT_DIRTY	(_AT(pteval_t, 0))
>  #endif
>  
> +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +#define _PAGE_UFFD_WP		(_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
> +#define _PAGE_SWP_UFFD_WP	_PAGE_USER
> +#else
> +#define _PAGE_UFFD_WP		(_AT(pteval_t, 0))
> +#define _PAGE_SWP_UFFD_WP	(_AT(pteval_t, 0))
> +#endif
> +
>  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
>  #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
>  #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index 359fb935ded6..0e1470ecf7b5 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -10,6 +10,7 @@
>  #include <linux/mm_types.h>
>  #include <linux/bug.h>
>  #include <linux/errno.h>
> +#include <asm-generic/pgtable_uffd.h>
>  
>  #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
>  	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
> diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
> new file mode 100644
> index 000000000000..643d1bf559c2
> --- /dev/null
> +++ b/include/asm-generic/pgtable_uffd.h
> @@ -0,0 +1,51 @@
> +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
> +#define _ASM_GENERIC_PGTABLE_UFFD_H
> +
> +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> +static __always_inline int pte_uffd_wp(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +static __always_inline int pmd_uffd_wp(pmd_t pmd)
> +{
> +	return 0;
> +}
> +
> +static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
> +{
> +	return pmd;
> +}
> +
> +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
> +{
> +	return pmd;
> +}
> +
> +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +
> +static __always_inline int pte_swp_uffd_wp(pte_t pte)
> +{
> +	return 0;
> +}
> +
> +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
> +{
> +	return pte;
> +}
> +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
> +
> +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index cf5b5a0dcbc2..2a02e004874e 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1418,6 +1418,11 @@ config ADVISE_SYSCALLS
>  	  applications use these syscalls, you can disable this option to save
>  	  space.
>  
> +config HAVE_ARCH_USERFAULTFD_WP
> +	bool
> +	help
> +	  Arch has userfaultfd write protection support
> +
>  config MEMBARRIER
>  	bool "Enable membarrier() system call" if EXPERT
>  	default y
> -- 
> 2.17.1
>
Peter Xu Jan. 24, 2019, 5:16 a.m. UTC | #2
On Mon, Jan 21, 2019 at 10:09:38AM -0500, Jerome Glisse wrote:
> On Mon, Jan 21, 2019 at 03:57:08PM +0800, Peter Xu wrote:
> > From: Andrea Arcangeli <aarcange@redhat.com>
> > 
> > Accurate userfaultfd WP tracking is possible by tracking exactly which
> > virtual memory ranges were writeprotected by userland. We can't relay
> > only on the RW bit of the mapped pagetable because that information is
> > destroyed by fork() or KSM or swap. If we were to relay on that, we'd
> > need to stay on the safe side and generate false positive wp faults
> > for every swapped out page.

(I'm trying to leave comments with my own understanding here; they
 might not be the original purposes when Andrea proposed the idea.
 Andrea, please feel free to chime in anytime especially if I am
 wrong... :-)

> 
> So you want to forward write fault (of a protected range) to user space
> only if page is not write protected because of fork(), KSM or swap.
> 
> This write protection feature is only for anonymous page right ? Other-
> wise how would you protect a share page (ie anyone can look it up and
> call page_mkwrite on it and start writting to it) ?

AFAIU we want to support shared memory too in the future.  One example
I can think of is current QEMU usage with DPDK: we have two processes
sharing the guest memory range.  So indeed this might not work if
there are unknown/malicious users of the shared memory, however in
many use cases the users are all known and AFAIU we should just write
protect all these users then we'll still get notified when any of them
write to a page.

> 
> So for anonymous page for fork() the mapcount will tell you if page is
> write protected for COW. For KSM it is easy check the page flag.

Yes I agree that KSM should be easy.  But for COW, please consider
when we write protect a page that was shared and RW removed due to
COW.  Then when we page fault on this page should we report to the
monitor?  IMHO we can't know if without a specific bit in the PTE.

> 
> For swap you can use the page lock to synchronize. A page that is
> write protected because of swap is write protected because it is being
> write to disk thus either under page lock, or with PageWriteback()
> returning true while write is on going.

For swap I think the major problem is when the page was swapped out of
main memory and then we write to the page (which was already a swap
entry now).  Then we'll first swap in the page into main memory again,
but then IMHO we will face the similar issue like COW above - we can't
judge whether this page was write protected by uffd-wp at all.  Of
course here we can detect the VMA flags and assuming it's write
protected if the UFFD_WP flag was set on the VMA flag, however we'll
also mark those pages which were not write protected at all hence
it'll generate false positives of write protection messages.  This
idea can apply too to above COW use case.  As a conclusion, in these
use cases we should not be able to identify explicitly on page
granularity write protection if without a specific _PAGE_UFFD_WP bit
in the PTE entries.

Thanks,
Jerome Glisse Jan. 24, 2019, 3:40 p.m. UTC | #3
On Thu, Jan 24, 2019 at 01:16:16PM +0800, Peter Xu wrote:
> On Mon, Jan 21, 2019 at 10:09:38AM -0500, Jerome Glisse wrote:
> > On Mon, Jan 21, 2019 at 03:57:08PM +0800, Peter Xu wrote:
> > > From: Andrea Arcangeli <aarcange@redhat.com>
> > > 
> > > Accurate userfaultfd WP tracking is possible by tracking exactly which
> > > virtual memory ranges were writeprotected by userland. We can't relay
> > > only on the RW bit of the mapped pagetable because that information is
> > > destroyed by fork() or KSM or swap. If we were to relay on that, we'd
> > > need to stay on the safe side and generate false positive wp faults
> > > for every swapped out page.
> 
> (I'm trying to leave comments with my own understanding here; they
>  might not be the original purposes when Andrea proposed the idea.
>  Andrea, please feel free to chim in anytime especially if I am
>  wrong... :-)
> 
> > 
> > So you want to forward write fault (of a protected range) to user space
> > only if page is not write protected because of fork(), KSM or swap.
> > 
> > This write protection feature is only for anonymous page right ? Other-
> > wise how would you protect a share page (ie anyone can look it up and
> > call page_mkwrite on it and start writting to it) ?
> 
> AFAIU we want to support shared memory too in the future.  One example
> I can think of is current QEMU usage with DPDK: we have two processes
> sharing the guest memory range.  So indeed this might not work if
> there are unknown/malicious users of the shared memory, however in
> many use cases the users are all known and AFAIU we should just write
> protect all these users then we'll still get notified when any of them
> write to a page.
> 
> > 
> > So for anonymous page for fork() the mapcount will tell you if page is
> > write protected for COW. For KSM it is easy check the page flag.
> 
> Yes I agree that KSM should be easy.  But for COW, please consider
> when we write protect a page that was shared and RW removed due to
> COW.  Then when we page fault on this page should we report to the
> monitor?  IMHO we can't know if without a specific bit in the PTE.
> 
> > 
> > For swap you can use the page lock to synchronize. A page that is
> > write protected because of swap is write protected because it is being
> > write to disk thus either under page lock, or with PageWriteback()
> > returning true while write is on going.
> 
> For swap I think the major problem is when the page was swapped out of
> main memory and then we write to the page (which was already a swap
> entry now).  Then we'll first swap in the page into main memory again,
> but then IMHO we will face the similar issue like COW above - we can't
> judge whether this page was write protected by uffd-wp at all.  Of
> course here we can detect the VMA flags and assuming it's write
> protected if the UFFD_WP flag was set on the VMA flag, however we'll
> also mark those pages which were not write protected at all hence
> it'll generate false positives of write protection messages.  This
> idea can apply too to above COW use case.  As a conclusion, in these
> use cases we should not be able to identify explicitly on page
> granularity write protection if without a specific _PAGE_UFFD_WP bit
> in the PTE entries.

So i need to think a bit more on this, probably not right now
but just so i get the chain of event properly:
  1 - user space ioctl UFD to write protect a range
  2 - UFD set a flag on the vma and update CPU page table
  3 - page can be individually write faulted and it sends a
      signal to UFD listener and they handle the fault
  4 - UFD kernel updates the page table once userspace has
      handled the fault and sent the result to UFD. At this point
      the vma still has the UFD write protect flag set.

So at any point in time in a range you might have writeable
pte that correspond to already handled UFD write fault. Now
if COW,KSM or swap happens on those then on the next write
fault you do not want to send a signal to userspace but handle
the fault just as usual ?

I believe this is the event flow, so i will ponder on this some
more :)

Cheers,
Jérôme
Peter Xu Jan. 25, 2019, 3:30 a.m. UTC | #4
On Thu, Jan 24, 2019 at 10:40:50AM -0500, Jerome Glisse wrote:
> On Thu, Jan 24, 2019 at 01:16:16PM +0800, Peter Xu wrote:
> > On Mon, Jan 21, 2019 at 10:09:38AM -0500, Jerome Glisse wrote:
> > > On Mon, Jan 21, 2019 at 03:57:08PM +0800, Peter Xu wrote:
> > > > From: Andrea Arcangeli <aarcange@redhat.com>
> > > > 
> > > > Accurate userfaultfd WP tracking is possible by tracking exactly which
> > > > virtual memory ranges were writeprotected by userland. We can't relay
> > > > only on the RW bit of the mapped pagetable because that information is
> > > > destroyed by fork() or KSM or swap. If we were to relay on that, we'd
> > > > need to stay on the safe side and generate false positive wp faults
> > > > for every swapped out page.
> > 
> > (I'm trying to leave comments with my own understanding here; they
> >  might not be the original purposes when Andrea proposed the idea.
> >  Andrea, please feel free to chim in anytime especially if I am
> >  wrong... :-)
> > 
> > > 
> > > So you want to forward write fault (of a protected range) to user space
> > > only if page is not write protected because of fork(), KSM or swap.
> > > 
> > > This write protection feature is only for anonymous page right ? Other-
> > > wise how would you protect a share page (ie anyone can look it up and
> > > call page_mkwrite on it and start writting to it) ?
> > 
> > AFAIU we want to support shared memory too in the future.  One example
> > I can think of is current QEMU usage with DPDK: we have two processes
> > sharing the guest memory range.  So indeed this might not work if
> > there are unknown/malicious users of the shared memory, however in
> > many use cases the users are all known and AFAIU we should just write
> > protect all these users then we'll still get notified when any of them
> > write to a page.
> > 
> > > 
> > > So for anonymous page for fork() the mapcount will tell you if page is
> > > write protected for COW. For KSM it is easy check the page flag.
> > 
> > Yes I agree that KSM should be easy.  But for COW, please consider
> > when we write protect a page that was shared and RW removed due to
> > COW.  Then when we page fault on this page should we report to the
> > monitor?  IMHO we can't know if without a specific bit in the PTE.
> > 
> > > 
> > > For swap you can use the page lock to synchronize. A page that is
> > > write protected because of swap is write protected because it is being
> > > write to disk thus either under page lock, or with PageWriteback()
> > > returning true while write is on going.
> > 
> > For swap I think the major problem is when the page was swapped out of
> > main memory and then we write to the page (which was already a swap
> > entry now).  Then we'll first swap in the page into main memory again,
> > but then IMHO we will face the similar issue like COW above - we can't
> > judge whether this page was write protected by uffd-wp at all.  Of
> > course here we can detect the VMA flags and assuming it's write
> > protected if the UFFD_WP flag was set on the VMA flag, however we'll
> > also mark those pages which were not write protected at all hence
> > it'll generate false positives of write protection messages.  This
> > idea can apply too to above COW use case.  As a conclusion, in these
> > use cases we should not be able to identify explicitly on page
> > granularity write protection if without a specific _PAGE_UFFD_WP bit
> > in the PTE entries.
> 
> So i need to think a bit more on this, probably not right now
> but just so i get the chain of event properly:
>   1 - user space ioctl UFD to write protect a range
>   2 - UFD set a flag on the vma and update CPU page table

A trivial supplement to these two steps to be clear: the change to VMA
flags and PTE permissions are different steps.  Say, to write protect
a newly mmap()ed region, we need to do:

  (a) ioctl UFFDIO_REGISTER upon the range: this will properly attach
      the VM_UFFD_WP flag upon the VMA object, and...

  (b) ioctl UFFDIO_WRITEPROTECT upon the range again: this will
      properly apply the new uffd-wp bit and write protect the
      PTEs/PMDs.

Note that the range specified in step (b) could also be part of the
buffer, so it does not need to cover the whole VMA, and it's in page
granularity.

>   3 - page can be individualy write faulted and it sends a
>       signal to UFD listener and they handle the fault
>   4 - UFD kernel update the page table once userspace have
>       handled the fault and sent result to UFD. At this point
>       the vma still has the UFD write protect flag set.

Yes. As explained above, the VMA can have the VM_UFFD_WP flag even if
none of the PTEs underneath was write protected.

> 
> So at any point in time in a range you might have writeable
> pte that correspond to already handled UFD write fault. Now
> if COW,KSM or swap happens on those then on the next write
> fault you do not want to send a signal to userspace but handle
> the fault just as usual ?

Yes, if the PTE has already resolved the uffd write protection and
then it will be just like a normal PTE, because when resolving the
uffd-wp page fault we'll also remove the special uffd-wp bit on the
PTE/PMD.

And IMHO actually what's more special here is when we write protect a
shared private page that is for COW (I'll skip KSM since it looks very
like this case IIUC): here due to COW the PTE already lost the RW bit,
and here when we do uffd-wp upon this page we'll simply apply the
uffd-wp bit only to mark that this PTE was especially write protected
by userfaults.  And when we want to resolve the uffd-wp for such a PTE
we'll first try to do COW if it is shared by others by checking
against page_mapcount().

> 
> I believe this is the event flow, so i will ponder on this some
> more :)

Yes please. :) The workflow of the new ioctl()s was also mentioned in
the cover letter.  Please feel free to have a look too.

Thanks,
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8689e794a43c..096c773452d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -207,6 +207,7 @@  config X86
 	select USER_STACKTRACE_SUPPORT
 	select VIRT_TO_BUS
 	select X86_FEATURE_NAMES		if PROC_FS
+	select HAVE_ARCH_USERFAULTFD_WP		if USERFAULTFD
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e805292..7a71158982f4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -23,6 +23,7 @@ 
 
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
+#include <asm-generic/pgtable_uffd.h>
 
 extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@@ -293,6 +294,23 @@  static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
 	return native_make_pte(v & ~clear);
 }
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pte_uffd_wp(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_UFFD_WP;
+}
+
+static inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+	return pte_set_flags(pte, _PAGE_UFFD_WP);
+}
+
+static inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+	return pte_clear_flags(pte, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 static inline pte_t pte_mkclean(pte_t pte)
 {
 	return pte_clear_flags(pte, _PAGE_DIRTY);
@@ -372,6 +390,23 @@  static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
 	return native_make_pmd(v & ~clear);
 }
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pmd_uffd_wp(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_UFFD_WP;
+}
+
+static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_UFFD_WP);
+}
+
+static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+	return pmd_clear_flags(pmd, _PAGE_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 static inline pmd_t pmd_mkold(pmd_t pmd)
 {
 	return pmd_clear_flags(pmd, _PAGE_ACCESSED);
@@ -1351,6 +1386,23 @@  static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
 #endif
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+	return pte_set_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pte_swp_uffd_wp(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+	return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
 #define PKRU_AD_BIT 0x1
 #define PKRU_WD_BIT 0x2
 #define PKRU_BITS_PER_PKEY 2
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9c85b54bf03c..e0c5d29b8685 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -189,7 +189,7 @@  extern void sync_global_pgds(unsigned long start, unsigned long end);
  *
  * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
  * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| X|F|SD|0| <- swp entry
  *
  * G (8) is aliased and used as a PROT_NONE indicator for
  * !present ptes.  We need to start storing swap entries above
@@ -197,9 +197,15 @@  extern void sync_global_pgds(unsigned long start, unsigned long end);
  * erratum where they can be incorrectly set by hardware on
  * non-present PTEs.
  *
+ * SD Bits 1-4 are not used in non-present format and available for
+ * special use described below:
+ *
  * SD (1) in swp entry is used to store soft dirty bit, which helps us
  * remember soft dirty over page migration
  *
+ * F (2) in swp entry is used to record when a pagetable is
+ * writeprotected by userfaultfd WP support.
+ *
  * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
  * but also L and G.
  *
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 106b7d0e2dae..163043ab142d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -32,6 +32,7 @@ 
 
 #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
+#define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
 
@@ -100,6 +101,14 @@ 
 #define _PAGE_SWP_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+#define _PAGE_UFFD_WP		(_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
+#define _PAGE_SWP_UFFD_WP	_PAGE_USER
+#else
+#define _PAGE_UFFD_WP		(_AT(pteval_t, 0))
+#define _PAGE_SWP_UFFD_WP	(_AT(pteval_t, 0))
+#endif
+
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 359fb935ded6..0e1470ecf7b5 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -10,6 +10,7 @@ 
 #include <linux/mm_types.h>
 #include <linux/bug.h>
 #include <linux/errno.h>
+#include <asm-generic/pgtable_uffd.h>
 
 #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
 	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
new file mode 100644
index 000000000000..643d1bf559c2
--- /dev/null
+++ b/include/asm-generic/pgtable_uffd.h
@@ -0,0 +1,51 @@ 
+#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
+#define _ASM_GENERIC_PGTABLE_UFFD_H
+
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static __always_inline int pte_uffd_wp(pte_t pte)
+{
+	return 0;
+}
+
+static __always_inline int pmd_uffd_wp(pmd_t pmd)
+{
+	return 0;
+}
+
+static __always_inline pte_t pte_mkuffd_wp(pte_t pte)
+{
+	return pte;
+}
+
+static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
+{
+	return pmd;
+}
+
+static __always_inline pte_t pte_clear_uffd_wp(pte_t pte)
+{
+	return pte;
+}
+
+static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
+{
+	return pmd;
+}
+
+static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte)
+{
+	return pte;
+}
+
+static __always_inline int pte_swp_uffd_wp(pte_t pte)
+{
+	return 0;
+}
+
+static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
+{
+	return pte;
+}
+#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
+
+#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
diff --git a/init/Kconfig b/init/Kconfig
index cf5b5a0dcbc2..2a02e004874e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1418,6 +1418,11 @@  config ADVISE_SYSCALLS
 	  applications use these syscalls, you can disable this option to save
 	  space.
 
+config HAVE_ARCH_USERFAULTFD_WP
+	bool
+	help
+	  Arch has userfaultfd write protection support
+
 config MEMBARRIER
 	bool "Enable membarrier() system call" if EXPERT
 	default y