diff mbox series

[v2,3/9] userfaultfd/shmem: support minor fault registration for shmem

Message ID 20210413051721.2896915-4-axelrasmussen@google.com (mailing list archive)
State New, archived
Headers show
Series userfaultfd: add minor fault handling for shmem | expand

Commit Message

Axel Rasmussen April 13, 2021, 5:17 a.m. UTC
This patch allows shmem-backed VMAs to be registered for minor faults.
Minor faults are appropriately relayed to userspace in the fault path,
for VMAs with the relevant flag.

This commit doesn't hook up the UFFDIO_CONTINUE ioctl for shmem-backed
minor faults, though, so userspace doesn't yet have a way to resolve
such faults.

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
 fs/userfaultfd.c                 |  6 +++---
 include/uapi/linux/userfaultfd.h |  7 ++++++-
 mm/memory.c                      |  8 +++++---
 mm/shmem.c                       | 10 +++++++++-
 4 files changed, 23 insertions(+), 8 deletions(-)

Comments

Peter Xu April 13, 2021, 8:43 p.m. UTC | #1
On Mon, Apr 12, 2021 at 10:17:15PM -0700, Axel Rasmussen wrote:
> This patch allows shmem-backed VMAs to be registered for minor faults.
> Minor faults are appropriately relayed to userspace in the fault path,
> for VMAs with the relevant flag.
> 
> This commit doesn't hook up the UFFDIO_CONTINUE ioctl for shmem-backed
> minor faults, though, so userspace doesn't yet have a way to resolve
> such faults.
> 
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>

Everything looks right to me, but it would be great if Andrea or Hugh could
have a look too.

Acked-by: Peter Xu <peterx@redhat.com>
Hugh Dickins April 14, 2021, 7:36 a.m. UTC | #2
On Mon, 12 Apr 2021, Axel Rasmussen wrote:

> This patch allows shmem-backed VMAs to be registered for minor faults.
> Minor faults are appropriately relayed to userspace in the fault path,
> for VMAs with the relevant flag.
> 
> This commit doesn't hook up the UFFDIO_CONTINUE ioctl for shmem-backed
> minor faults, though, so userspace doesn't yet have a way to resolve
> such faults.

This is a very odd way to divide up the series: an "Intermission"
half way through the implementation of MINOR/CONTINUE: this 3/9
makes little sense without the 4/9 to mm/userfaultfd.c which follows.

But, having said that, I won't object and Peter did not object, and
I don't know of anyone else looking here: it will only give each of
us more trouble to insist on repartitioning the series, and it's the
end state that's far more important to me and to all of us.

And I'll even seize on it, to give myself an intermission after
this one, until tomorrow (when I'll look at 4/9 and 9/9 - but
shall not look at the selftests ones at all).

Most of this is okay, except the mm/shmem.c part; and I've just now
realized that somewhere (whether in this patch or separately) there
needs to be an update to Documentation/admin-guide/mm/userfaultfd.rst
(admin-guide? how weird, but not this series' business to correct).

> 
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> ---
>  fs/userfaultfd.c                 |  6 +++---
>  include/uapi/linux/userfaultfd.h |  7 ++++++-
>  mm/memory.c                      |  8 +++++---
>  mm/shmem.c                       | 10 +++++++++-
>  4 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 14f92285d04f..9f3b8684cf3c 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
>  	}
>  
>  	if (vm_flags & VM_UFFD_MINOR) {
> -		/* FIXME: Add minor fault interception for shmem. */
> -		if (!is_vm_hugetlb_page(vma))
> +		if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
>  			return false;
>  	}
>  
> @@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
>  	/* report all available features and ioctls to userland */
>  	uffdio_api.features = UFFD_API_FEATURES;
>  #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
> -	uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
> +	uffdio_api.features &=
> +		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
>  #endif
>  	uffdio_api.ioctls = UFFD_API_IOCTLS;
>  	ret = -EFAULT;
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index bafbeb1a2624..159a74e9564f 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -31,7 +31,8 @@
>  			   UFFD_FEATURE_MISSING_SHMEM |		\
>  			   UFFD_FEATURE_SIGBUS |		\
>  			   UFFD_FEATURE_THREAD_ID |		\
> -			   UFFD_FEATURE_MINOR_HUGETLBFS)
> +			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
> +			   UFFD_FEATURE_MINOR_SHMEM)
>  #define UFFD_API_IOCTLS				\
>  	((__u64)1 << _UFFDIO_REGISTER |		\
>  	 (__u64)1 << _UFFDIO_UNREGISTER |	\
> @@ -185,6 +186,9 @@ struct uffdio_api {
>  	 * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
>  	 * can be intercepted (via REGISTER_MODE_MINOR) for
>  	 * hugetlbfs-backed pages.
> +	 *
> +	 * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
> +	 * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
>  	 */
>  #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
>  #define UFFD_FEATURE_EVENT_FORK			(1<<1)
> @@ -196,6 +200,7 @@ struct uffdio_api {
>  #define UFFD_FEATURE_SIGBUS			(1<<7)
>  #define UFFD_FEATURE_THREAD_ID			(1<<8)
>  #define UFFD_FEATURE_MINOR_HUGETLBFS		(1<<9)
> +#define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
>  	__u64 features;
>  
>  	__u64 ioctls;
> diff --git a/mm/memory.c b/mm/memory.c
> index 4e358601c5d6..cc71a445c76c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3972,9 +3972,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
>  	 * something).
>  	 */
>  	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
> -		ret = do_fault_around(vmf);
> -		if (ret)
> -			return ret;
> +		if (likely(!userfaultfd_minor(vmf->vma))) {
> +			ret = do_fault_around(vmf);
> +			if (ret)
> +				return ret;
> +		}
>  	}
>  
>  	ret = __do_fault(vmf);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index b72c55aa07fc..3f48cb5e8404 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1785,7 +1785,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
>   * vm. If we swap it in we mark it dirty since we also free the swap
>   * entry since a page cannot live in both the swap and page cache.
>   *
> - * vmf and fault_type are only supplied by shmem_fault:
> + * vma, vmf, and fault_type are only supplied by shmem_fault:
>   * otherwise they are NULL.
>   */
>  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> @@ -1820,6 +1820,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>  
>  	page = pagecache_get_page(mapping, index,
>  					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
> +
> +	if (page && vma && userfaultfd_minor(vma)) {
> +		unlock_page(page);
> +		put_page(page);
> +		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
> +		return 0;
> +	}
> +

Okay, Peter persuaded you to move that up here: where indeed it
does look better than the earlier "swapped" version.

But will crash on swap as it's currently written: it needs to say
		if (!xa_is_value(page)) {
			unlock_page(page);
			put_page(page);
		}

I did say before that it's more robust to return from the swap
case after doing the shmem_swapin_page(). But I might be slowly
realizing that the ioctl to add the pte (in 4/9) will do its
shmem_getpage_gfp(), and that will bring in the swap if user
did not already do so: so I was wrong to claim more robustness
the other way, this placement should be fine. I think.

>  	if (xa_is_value(page)) {
>  		error = shmem_swapin_page(inode, index, &page,
>  					  sgp, gfp, vma, fault_type);
> -- 
> 2.31.1.295.g9ea45b61b8-goog
Peter Xu April 14, 2021, 6:51 p.m. UTC | #3
On Wed, Apr 14, 2021 at 12:36:13AM -0700, Hugh Dickins wrote:
> On Mon, 12 Apr 2021, Axel Rasmussen wrote:
> 
> > This patch allows shmem-backed VMAs to be registered for minor faults.
> > Minor faults are appropriately relayed to userspace in the fault path,
> > for VMAs with the relevant flag.
> > 
> > This commit doesn't hook up the UFFDIO_CONTINUE ioctl for shmem-backed
> > minor faults, though, so userspace doesn't yet have a way to resolve
> > such faults.
> 
> This is a very odd way to divide up the series: an "Intermission"
> half way through the implementation of MINOR/CONTINUE: this 3/9
> makes little sense without the 4/9 to mm/userfaultfd.c which follows.
> 
> But, having said that, I won't object and Peter did not object, and
> I don't know of anyone else looking here: it will only give each of
> us more trouble to insist on repartitioning the series, and it's the
> end state that's far more important to me and to all of us.

Agreed, ideally it should be after patch 4 since this patch enables the
feature already.

> 
> And I'll even seize on it, to give myself an intermission after
> this one, until tomorrow (when I'll look at 4/9 and 9/9 - but
> shall not look at the selftests ones at all).
> 
> Most of this is okay, except the mm/shmem.c part; and I've just now
> realized that somewhere (whether in this patch or separately) there
> needs to be an update to Documentation/admin-guide/mm/userfaultfd.rst
> (admin-guide? how weird, but not this series' business to correct).

(maybe some dir "devel" would suit better?  But I do also see soft-dirty.rst,
 idle_page_tracking.rst, ...)

[...]

> >  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> > @@ -1820,6 +1820,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> >  
> >  	page = pagecache_get_page(mapping, index,
> >  					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
> > +
> > +	if (page && vma && userfaultfd_minor(vma)) {
> > +		unlock_page(page);
> > +		put_page(page);
> > +		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
> > +		return 0;
> > +	}
> > +
> 
> Okay, Peter persuaded you to move that up here: where indeed it
> does look better than the earlier "swapped" version.
> 
> But will crash on swap as it's currently written: it needs to say
> 		if (!xa_is_value(page)) {
> 			unlock_page(page);
> 			put_page(page);
> 		}

And this is definitely true...  Thanks,

> 
> I did say before that it's more robust to return from the swap
> case after doing the shmem_swapin_page(). But I might be slowly
> realizing that the ioctl to add the pte (in 4/9) will do its
> shmem_getpage_gfp(), and that will bring in the swap if user
> did not already do so: so I was wrong to claim more robustness
> the other way, this placement should be fine. I think.
> 
> >  	if (xa_is_value(page)) {
> >  		error = shmem_swapin_page(inode, index, &page,
> >  					  sgp, gfp, vma, fault_type);
> > -- 
> > 2.31.1.295.g9ea45b61b8-goog
>
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 14f92285d04f..9f3b8684cf3c 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1267,8 +1267,7 @@  static inline bool vma_can_userfault(struct vm_area_struct *vma,
 	}
 
 	if (vm_flags & VM_UFFD_MINOR) {
-		/* FIXME: Add minor fault interception for shmem. */
-		if (!is_vm_hugetlb_page(vma))
+		if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
 			return false;
 	}
 
@@ -1941,7 +1940,8 @@  static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-	uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+	uffdio_api.features &=
+		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
 #endif
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index bafbeb1a2624..159a74e9564f 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -31,7 +31,8 @@ 
 			   UFFD_FEATURE_MISSING_SHMEM |		\
 			   UFFD_FEATURE_SIGBUS |		\
 			   UFFD_FEATURE_THREAD_ID |		\
-			   UFFD_FEATURE_MINOR_HUGETLBFS)
+			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
+			   UFFD_FEATURE_MINOR_SHMEM)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -185,6 +186,9 @@  struct uffdio_api {
 	 * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults
 	 * can be intercepted (via REGISTER_MODE_MINOR) for
 	 * hugetlbfs-backed pages.
+	 *
+	 * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
+	 * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -196,6 +200,7 @@  struct uffdio_api {
 #define UFFD_FEATURE_SIGBUS			(1<<7)
 #define UFFD_FEATURE_THREAD_ID			(1<<8)
 #define UFFD_FEATURE_MINOR_HUGETLBFS		(1<<9)
+#define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/mm/memory.c b/mm/memory.c
index 4e358601c5d6..cc71a445c76c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3972,9 +3972,11 @@  static vm_fault_t do_read_fault(struct vm_fault *vmf)
 	 * something).
 	 */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-		ret = do_fault_around(vmf);
-		if (ret)
-			return ret;
+		if (likely(!userfaultfd_minor(vmf->vma))) {
+			ret = do_fault_around(vmf);
+			if (ret)
+				return ret;
+		}
 	}
 
 	ret = __do_fault(vmf);
diff --git a/mm/shmem.c b/mm/shmem.c
index b72c55aa07fc..3f48cb5e8404 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1785,7 +1785,7 @@  static int shmem_swapin_page(struct inode *inode, pgoff_t index,
  * vm. If we swap it in we mark it dirty since we also free the swap
  * entry since a page cannot live in both the swap and page cache.
  *
- * vmf and fault_type are only supplied by shmem_fault:
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
  * otherwise they are NULL.
  */
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1820,6 +1820,14 @@  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 
 	page = pagecache_get_page(mapping, index,
 					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
+
+	if (page && vma && userfaultfd_minor(vma)) {
+		unlock_page(page);
+		put_page(page);
+		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+		return 0;
+	}
+
 	if (xa_is_value(page)) {
 		error = shmem_swapin_page(inode, index, &page,
 					  sgp, gfp, vma, fault_type);