diff mbox series

[v2,3/5] userfaultfd: introduce write-likely mode for uffd operations

Message ID 20220718114748.2623-4-namit@vmware.com (mailing list archive)
State New
Headers show
Series userfaultfd: support access/write hints | expand

Commit Message

Nadav Amit July 18, 2022, 11:47 a.m. UTC
From: Nadav Amit <namit@vmware.com>

Introduce write-likely hints for uffd. These hints would be used in a
future patch to decide whether to attempt to map pages in the page-table
or whether to only mark them logically as writable. This allows
userspace to determine whether a page would be accessed faster or
whether removal of the page would be possible, potentially, without
writeback and TLB flush.

Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 fs/userfaultfd.c                 | 32 ++++++++++++++++++++++++--------
 include/linux/userfaultfd_k.h    |  1 +
 include/uapi/linux/userfaultfd.h | 13 ++++++++++++-
 3 files changed, 37 insertions(+), 9 deletions(-)

Comments

Peter Xu July 18, 2022, 8:12 p.m. UTC | #1
On Mon, Jul 18, 2022 at 04:47:46AM -0700, Nadav Amit wrote:
> From: Nadav Amit <namit@vmware.com>
> 
> Introduce write-likely hints for uffd. These hints would be used in a
> future patch to decide whether to attempt to map pages in the page-table
> or whether to only mark them logically as writable. This allows
> userspace to determine whether a page would be accessed faster or
> whether removal of the page would be possible, potentially, without
> writeback and TLB flush.
> 
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Axel Rasmussen <axelrasmussen@google.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Mike Rapoport <rppt@linux.ibm.com>
> Signed-off-by: Nadav Amit <namit@vmware.com>
> ---
>  fs/userfaultfd.c                 | 32 ++++++++++++++++++++++++--------
>  include/linux/userfaultfd_k.h    |  1 +
>  include/uapi/linux/userfaultfd.h | 13 ++++++++++++-
>  3 files changed, 37 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 8d8792b27c53..3027d228550a 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1709,7 +1709,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
>  		goto out;
>  	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
> -				 UFFDIO_COPY_MODE_ACCESS_LIKELY))
> +				 UFFDIO_COPY_MODE_ACCESS_LIKELY|
> +				 UFFDIO_COPY_MODE_WRITE_LIKELY))
>  		goto out;
>  
>  	mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
> @@ -1719,8 +1720,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>  		if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY)
>  			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		if (uffdio_copy.mode & UFFDIO_COPY_MODE_WRITE_LIKELY)
> +			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>  	} else {
> -		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
> +			      UFFD_FLAGS_WRITE_LIKELY;
>  	}
>  
>  	if (mmget_not_zero(ctx->mm)) {
> @@ -1774,14 +1778,18 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>  		goto out;
>  	ret = -EINVAL;
>  	if (uffdio_zeropage.mode & ~(UFFDIO_ZEROPAGE_MODE_DONTWAKE|
> -				     UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY))
> +				     UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY|
> +				     UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY))
>  		goto out;
>  
>  	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>  		if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY)
>  			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY)
> +			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>  	} else {
> -		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
> +			      UFFD_FLAGS_WRITE_LIKELY;
>  	}
>  
>  	if (mmget_not_zero(ctx->mm)) {
> @@ -1834,7 +1842,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  
>  	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
>  			       UFFDIO_WRITEPROTECT_MODE_WP |
> -			       UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY))
> +			       UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY |
> +			       UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY))
>  		return -EINVAL;
>  
>  	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
> @@ -1847,8 +1856,11 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>  		if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY)
>  			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY)
> +			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>  	} else {
> -		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
> +			      UFFD_FLAGS_WRITE_LIKELY;
>  	}
>  
>  	if (mmget_not_zero(ctx->mm)) {
> @@ -1903,14 +1915,18 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  		goto out;
>  	}
>  	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE|
> -				     UFFDIO_CONTINUE_MODE_ACCESS_LIKELY))
> +				     UFFDIO_CONTINUE_MODE_ACCESS_LIKELY|
> +				     UFFDIO_CONTINUE_MODE_WRITE_LIKELY))
>  		goto out;
>  
>  	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>  		if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_ACCESS_LIKELY)
>  			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WRITE_LIKELY)
> +			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>  	} else {
> -		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
> +			      UFFD_FLAGS_WRITE_LIKELY;
>  	}
>  
>  	if (mmget_not_zero(ctx->mm)) {
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index b326798b5677..4968c86938b2 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -60,6 +60,7 @@ typedef unsigned int __bitwise uffd_flags_t;
>  #define UFFD_FLAGS_NONE			((__force uffd_flags_t)0)
>  #define UFFD_FLAGS_WP			((__force uffd_flags_t)BIT(0))
>  #define UFFD_FLAGS_ACCESS_LIKELY	((__force uffd_flags_t)BIT(1))
> +#define UFFD_FLAGS_WRITE_LIKELY		((__force uffd_flags_t)BIT(2))
>  
>  extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>  				    struct vm_area_struct *dst_vma,
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 02e0c1f56939..f52cbe4c9c44 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -202,7 +202,7 @@ struct uffdio_api {
>  	 * write-protection mode is supported on both shmem and hugetlbfs.
>  	 *
>  	 * UFFD_FEATURE_ACCESS_HINTS indicates that the ioctl operations
> -	 * support the UFFDIO_*_MODE_ACCESS_LIKELY hints.
> +	 * support the UFFDIO_*_MODE_[ACCESS|WRITE]_LIKELY hints.
>  	 */
>  #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
>  #define UFFD_FEATURE_EVENT_FORK			(1<<1)
> @@ -257,9 +257,13 @@ struct uffdio_copy {
>  	 * page is likely to be access in the near future. Providing the hint
>  	 * properly can improve performance.
>  	 *
> +	 * UFFDIO_COPY_MODE_WRITE_LIKELY provides a hint to the kernel that the
> +	 * page is likely to be written in the near future. Providing the hint
> +	 * properly can improve performance.
>  	 */
>  #define UFFDIO_COPY_MODE_WP			((__u64)1<<1)
>  #define UFFDIO_COPY_MODE_ACCESS_LIKELY		((__u64)1<<2)
> +#define UFFDIO_COPY_MODE_WRITE_LIKELY		((__u64)1<<3)
>  	__u64 mode;
>  
>  	/*
> @@ -273,6 +277,7 @@ struct uffdio_zeropage {
>  	struct uffdio_range range;
>  #define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
>  #define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY	((__u64)1<<1)
> +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY	((__u64)1<<2)
>  	__u64 mode;
>  
>  	/*
> @@ -296,6 +301,10 @@ struct uffdio_writeprotect {
>   * that the page is likely to be access in the near future. Providing
>   * the hint properly can improve performance.
>   *
> + * UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY: provides a hint to the kernel
> + * that the page is likely to be written in the near future. Providing
> + * the hint properly can improve performance.
> + *
>   * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
>   * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
>   * protection (WP=0) in response to a page fault wakes the faulting
> @@ -304,6 +313,7 @@ struct uffdio_writeprotect {
>  #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
>  #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
>  #define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY	((__u64)1<<2)
> +#define UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY	((__u64)1<<3)
>  	__u64 mode;
>  };
>  
> @@ -311,6 +321,7 @@ struct uffdio_continue {
>  	struct uffdio_range range;
>  #define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
>  #define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY	((__u64)1<<1)
> +#define UFFDIO_CONTINUE_MODE_WRITE_LIKELY	((__u64)1<<2)
>  	__u64 mode;

I thought you would have some reasoning on having the flag for unprotect
(since our last discussion you mentioned it) but it seems not there..

Then, could we only keep the zeropage write hint but drop the rest?
They're never used in this whole series besides the zeropage one, meanwhile
I think we're still not reaching consensus on whether they'll be helpful?

Thanks,
Nadav Amit July 18, 2022, 8:25 p.m. UTC | #2
On Jul 18, 2022, at 1:12 PM, Peter Xu <peterx@redhat.com> wrote:

> ⚠ External Email
> 
> On Mon, Jul 18, 2022 at 04:47:46AM -0700, Nadav Amit wrote:
>> From: Nadav Amit <namit@vmware.com>
>> 
>> Introduce write-likely hints for uffd. These hints would be used in a
>> future patch to decide whether to attempt to map pages in the page-table
>> or whether to only mark them logically as writable. This allows
>> userspace to determine whether a page would be accessed faster or
>> whether removal of the page would be possible, potentially, without
>> writeback and TLB flush.
>> 
>> Cc: Mike Kravetz <mike.kravetz@oracle.com>
>> Cc: Hugh Dickins <hughd@google.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Axel Rasmussen <axelrasmussen@google.com>
>> Cc: Peter Xu <peterx@redhat.com>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Mike Rapoport <rppt@linux.ibm.com>
>> Signed-off-by: Nadav Amit <namit@vmware.com>
>> ---
>> fs/userfaultfd.c | 32 ++++++++++++++++++++++++--------
>> include/linux/userfaultfd_k.h | 1 +
>> include/uapi/linux/userfaultfd.h | 13 ++++++++++++-
>> 3 files changed, 37 insertions(+), 9 deletions(-)
>> 
>> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
>> index 8d8792b27c53..3027d228550a 100644
>> --- a/fs/userfaultfd.c
>> +++ b/fs/userfaultfd.c
>> @@ -1709,7 +1709,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>> if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
>> goto out;
>> if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
>> - UFFDIO_COPY_MODE_ACCESS_LIKELY))
>> + UFFDIO_COPY_MODE_ACCESS_LIKELY|
>> + UFFDIO_COPY_MODE_WRITE_LIKELY))
>> goto out;
>> 
>> mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
>> @@ -1719,8 +1720,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>> if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>> if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY)
>> uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WRITE_LIKELY)
>> + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>> } else {
>> - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
>> + UFFD_FLAGS_WRITE_LIKELY;
>> }
>> 
>> if (mmget_not_zero(ctx->mm)) {
>> @@ -1774,14 +1778,18 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>> goto out;
>> ret = -EINVAL;
>> if (uffdio_zeropage.mode & ~(UFFDIO_ZEROPAGE_MODE_DONTWAKE|
>> - UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY))
>> + UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY|
>> + UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY))
>> goto out;
>> 
>> if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>> if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY)
>> uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY)
>> + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>> } else {
>> - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
>> + UFFD_FLAGS_WRITE_LIKELY;
>> }
>> 
>> if (mmget_not_zero(ctx->mm)) {
>> @@ -1834,7 +1842,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>> 
>> if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
>> UFFDIO_WRITEPROTECT_MODE_WP |
>> - UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY))
>> + UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY |
>> + UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY))
>> return -EINVAL;
>> 
>> mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
>> @@ -1847,8 +1856,11 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>> if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>> if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY)
>> uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY)
>> + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>> } else {
>> - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
>> + UFFD_FLAGS_WRITE_LIKELY;
>> }
>> 
>> if (mmget_not_zero(ctx->mm)) {
>> @@ -1903,14 +1915,18 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>> goto out;
>> }
>> if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE|
>> - UFFDIO_CONTINUE_MODE_ACCESS_LIKELY))
>> + UFFDIO_CONTINUE_MODE_ACCESS_LIKELY|
>> + UFFDIO_CONTINUE_MODE_WRITE_LIKELY))
>> goto out;
>> 
>> if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
>> if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_ACCESS_LIKELY)
>> uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WRITE_LIKELY)
>> + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
>> } else {
>> - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
>> + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
>> + UFFD_FLAGS_WRITE_LIKELY;
>> }
>> 
>> if (mmget_not_zero(ctx->mm)) {
>> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
>> index b326798b5677..4968c86938b2 100644
>> --- a/include/linux/userfaultfd_k.h
>> +++ b/include/linux/userfaultfd_k.h
>> @@ -60,6 +60,7 @@ typedef unsigned int __bitwise uffd_flags_t;
>> #define UFFD_FLAGS_NONE ((__force uffd_flags_t)0)
>> #define UFFD_FLAGS_WP ((__force uffd_flags_t)BIT(0))
>> #define UFFD_FLAGS_ACCESS_LIKELY ((__force uffd_flags_t)BIT(1))
>> +#define UFFD_FLAGS_WRITE_LIKELY ((__force uffd_flags_t)BIT(2))
>> 
>> extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>> struct vm_area_struct *dst_vma,
>> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
>> index 02e0c1f56939..f52cbe4c9c44 100644
>> --- a/include/uapi/linux/userfaultfd.h
>> +++ b/include/uapi/linux/userfaultfd.h
>> @@ -202,7 +202,7 @@ struct uffdio_api {
>> * write-protection mode is supported on both shmem and hugetlbfs.
>> *
>> * UFFD_FEATURE_ACCESS_HINTS indicates that the ioctl operations
>> - * support the UFFDIO_*_MODE_ACCESS_LIKELY hints.
>> + * support the UFFDIO_*_MODE_[ACCESS|WRITE]_LIKELY hints.
>> */
>> #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
>> #define UFFD_FEATURE_EVENT_FORK (1<<1)
>> @@ -257,9 +257,13 @@ struct uffdio_copy {
>> * page is likely to be access in the near future. Providing the hint
>> * properly can improve performance.
>> *
>> + * UFFDIO_COPY_MODE_WRITE_LIKELY provides a hint to the kernel that the
>> + * page is likely to be written in the near future. Providing the hint
>> + * properly can improve performance.
>> */
>> #define UFFDIO_COPY_MODE_WP ((__u64)1<<1)
>> #define UFFDIO_COPY_MODE_ACCESS_LIKELY ((__u64)1<<2)
>> +#define UFFDIO_COPY_MODE_WRITE_LIKELY ((__u64)1<<3)
>> __u64 mode;
>> 
>> /*
>> @@ -273,6 +277,7 @@ struct uffdio_zeropage {
>> struct uffdio_range range;
>> #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
>> #define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<1)
>> +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<2)
>> __u64 mode;
>> 
>> /*
>> @@ -296,6 +301,10 @@ struct uffdio_writeprotect {
>> * that the page is likely to be access in the near future. Providing
>> * the hint properly can improve performance.
>> *
>> + * UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY: provides a hint to the kernel
>> + * that the page is likely to be written in the near future. Providing
>> + * the hint properly can improve performance.
>> + *
>> * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
>> * therefore DONTWAKE flag is meaningless with WP=1. Removing write
>> * protection (WP=0) in response to a page fault wakes the faulting
>> @@ -304,6 +313,7 @@ struct uffdio_writeprotect {
>> #define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0)
>> #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1)
>> #define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY ((__u64)1<<2)
>> +#define UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY ((__u64)1<<3)
>> __u64 mode;
>> };
>> 
>> @@ -311,6 +321,7 @@ struct uffdio_continue {
>> struct uffdio_range range;
>> #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
>> #define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY ((__u64)1<<1)
>> +#define UFFDIO_CONTINUE_MODE_WRITE_LIKELY ((__u64)1<<2)
>> __u64 mode;
> 
> I thought you would have some reasoning on having the flag for unprotect
> (since our last discussion you mentioned it) but it seems not there..
> 
> Then, could we only keep the zeropage write hint but drop the rest?
> They're never used in this whole series besides the zeropage one, meanwhile
> I think we're still not reaching consensus on whether they'll be helpful?

I think there are two things that I didn’t communicate clearly enough. First,
the access flags are used here.

Now, you are correct that although the unprotect flag is defined here, it is
not used in this patch-set. There is a reason for that.

It turns out that using David’s work to map a writable page can cause
undesired behaviors - the clean PTE, which we discussed, and additional TLB
shootdowns. Since it required a lot of changes to get rid of these
additional shootdowns, I put the unprotect changes in a different patch-set.

https://lore.kernel.org/all/20220718120212.3180-1-namit@vmware.com/

Let me know if that answers your question.
Peter Xu July 18, 2022, 9:27 p.m. UTC | #3
On Mon, Jul 18, 2022 at 08:25:46PM +0000, Nadav Amit wrote:
> >> @@ -311,6 +321,7 @@ struct uffdio_continue {
> >> struct uffdio_range range;
> >> #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
> >> #define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY ((__u64)1<<1)
> >> +#define UFFDIO_CONTINUE_MODE_WRITE_LIKELY ((__u64)1<<2)
> >> __u64 mode;
> > 
> > I thought you would have some reasoning on having the flag for unprotect
> > (since our last discussion you mentioned it) but it seems not there..
> > 
> > Then, could we only keep the zeropage write hint but drop the rest?
> > They're never used in this whole series besides the zeropage one, meanwhile
> > I think we're still not reaching consensus on whether they'll be helpful?
> 
> I think that I didn’t communicate clearly enough two things. First, the
> access flags are used here.
> 
> Now, you are correct that although the unprotect flag is defined here, it is
> not used in this patch-set. There is a reason for that.
> 
> It turns out that using David’s work to map a writable page can cause
> undesired behaviors - the clean PTE, which we discussed, and additional TLB
> shootdowns. Since it required a lot of changes to get rid of these
> additional shootdowns, I put the unprotect changes in a different patch-set.
> 
> https://lore.kernel.org/all/20220718120212.3180-1-namit@vmware.com/
> 
> Let me know if that answers your question.

Okay, I'll read it tomorrow, thanks.  Though note that IMHO we should have
the fix without depending on WRITE_HINT at all.  I hope that's what'll
happen in the other patchset, or I can also comment there.

Btw, if there's direct dependency on flags I'd rather squash the two
patchsets.  The thing is that by solely reading this patch the reader will have
no idea why you wanted to have WRITE_HINT outside ZEROPAGE, at least to me.
We could have introduced WRITE_HINT for ZEROPAGE in this patch (then IMO
you can squash that part with patch 4) then leave the rest for the other
patchset.
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d8792b27c53..3027d228550a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1709,7 +1709,8 @@  static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
 		goto out;
 	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
-				 UFFDIO_COPY_MODE_ACCESS_LIKELY))
+				 UFFDIO_COPY_MODE_ACCESS_LIKELY|
+				 UFFDIO_COPY_MODE_WRITE_LIKELY))
 		goto out;
 
 	mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
@@ -1719,8 +1720,11 @@  static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
 		if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY)
 			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		if (uffdio_copy.mode & UFFDIO_COPY_MODE_WRITE_LIKELY)
+			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 	} else {
-		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
+			      UFFD_FLAGS_WRITE_LIKELY;
 	}
 
 	if (mmget_not_zero(ctx->mm)) {
@@ -1774,14 +1778,18 @@  static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 	ret = -EINVAL;
 	if (uffdio_zeropage.mode & ~(UFFDIO_ZEROPAGE_MODE_DONTWAKE|
-				     UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY))
+				     UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY|
+				     UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY))
 		goto out;
 
 	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
 		if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY)
 			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY)
+			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 	} else {
-		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
+			      UFFD_FLAGS_WRITE_LIKELY;
 	}
 
 	if (mmget_not_zero(ctx->mm)) {
@@ -1834,7 +1842,8 @@  static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 
 	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
 			       UFFDIO_WRITEPROTECT_MODE_WP |
-			       UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY))
+			       UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY |
+			       UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY))
 		return -EINVAL;
 
 	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
@@ -1847,8 +1856,11 @@  static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
 		if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY)
 			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY)
+			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 	} else {
-		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
+			      UFFD_FLAGS_WRITE_LIKELY;
 	}
 
 	if (mmget_not_zero(ctx->mm)) {
@@ -1903,14 +1915,18 @@  static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 		goto out;
 	}
 	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE|
-				     UFFDIO_CONTINUE_MODE_ACCESS_LIKELY))
+				     UFFDIO_CONTINUE_MODE_ACCESS_LIKELY|
+				     UFFDIO_CONTINUE_MODE_WRITE_LIKELY))
 		goto out;
 
 	if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) {
 		if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_ACCESS_LIKELY)
 			uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WRITE_LIKELY)
+			uffd_flags |= UFFD_FLAGS_WRITE_LIKELY;
 	} else {
-		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY |
+			      UFFD_FLAGS_WRITE_LIKELY;
 	}
 
 	if (mmget_not_zero(ctx->mm)) {
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index b326798b5677..4968c86938b2 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -60,6 +60,7 @@  typedef unsigned int __bitwise uffd_flags_t;
 #define UFFD_FLAGS_NONE			((__force uffd_flags_t)0)
 #define UFFD_FLAGS_WP			((__force uffd_flags_t)BIT(0))
 #define UFFD_FLAGS_ACCESS_LIKELY	((__force uffd_flags_t)BIT(1))
+#define UFFD_FLAGS_WRITE_LIKELY		((__force uffd_flags_t)BIT(2))
 
 extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				    struct vm_area_struct *dst_vma,
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 02e0c1f56939..f52cbe4c9c44 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -202,7 +202,7 @@  struct uffdio_api {
 	 * write-protection mode is supported on both shmem and hugetlbfs.
 	 *
 	 * UFFD_FEATURE_ACCESS_HINTS indicates that the ioctl operations
-	 * support the UFFDIO_*_MODE_ACCESS_LIKELY hints.
+	 * support the UFFDIO_*_MODE_[ACCESS|WRITE]_LIKELY hints.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -257,9 +257,13 @@  struct uffdio_copy {
 	 * page is likely to be access in the near future. Providing the hint
 	 * properly can improve performance.
 	 *
+	 * UFFDIO_COPY_MODE_WRITE_LIKELY provides a hint to the kernel that the
+	 * page is likely to be written in the near future. Providing the hint
+	 * properly can improve performance.
 	 */
 #define UFFDIO_COPY_MODE_WP			((__u64)1<<1)
 #define UFFDIO_COPY_MODE_ACCESS_LIKELY		((__u64)1<<2)
+#define UFFDIO_COPY_MODE_WRITE_LIKELY		((__u64)1<<3)
 	__u64 mode;
 
 	/*
@@ -273,6 +277,7 @@  struct uffdio_zeropage {
 	struct uffdio_range range;
 #define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
 #define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY	((__u64)1<<1)
+#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY	((__u64)1<<2)
 	__u64 mode;
 
 	/*
@@ -296,6 +301,10 @@  struct uffdio_writeprotect {
  * that the page is likely to be access in the near future. Providing
  * the hint properly can improve performance.
  *
+ * UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY: provides a hint to the kernel
+ * that the page is likely to be written in the near future. Providing
+ * the hint properly can improve performance.
+ *
  * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
  * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
  * protection (WP=0) in response to a page fault wakes the faulting
@@ -304,6 +313,7 @@  struct uffdio_writeprotect {
 #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
 #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
 #define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY	((__u64)1<<2)
+#define UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY	((__u64)1<<3)
 	__u64 mode;
 };
 
@@ -311,6 +321,7 @@  struct uffdio_continue {
 	struct uffdio_range range;
 #define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
 #define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY	((__u64)1<<1)
+#define UFFDIO_CONTINUE_MODE_WRITE_LIKELY	((__u64)1<<2)
 	__u64 mode;
 
 	/*