diff mbox series

[RFC,14/14] mm/madvise: add process_madvise(MADV_COLLAPSE)

Message ID 20220308213417.1407042-15-zokeefe@google.com (mailing list archive)
State New
Headers show
Series mm: userspace hugepage collapse | expand

Commit Message

Zach O'Keefe March 8, 2022, 9:34 p.m. UTC
MADV_COLLAPSE is the first advice that makes use of process_madvise() flags.

Add the necessary plumbing to make the flags available from do_madvise()
handlers.

For MADV_COLLAPSE, the added flags are:

* MADV_F_COLLAPSE_LIMITS - controls if we should respect
			   khugepaged/max_ptes_* limits
			   (omitting this flag, i.e. ignoring the
			    limits, requires CAP_SYS_ADMIN if not
			    acting on self)
* MADV_F_COLLAPSE_DEFRAG - force enable defrag, despite vma or system
			   settings.

These two flags together provide userspace flexibility in defining
separate policies for synchronous userspace-directed collapse, and
asynchronous kernel (khugepaged) collapse.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 fs/io_uring.c                          |  3 +-
 include/linux/huge_mm.h                |  3 +-
 include/linux/mm.h                     |  3 +-
 include/uapi/asm-generic/mman-common.h |  8 +++++
 mm/khugepaged.c                        |  7 +++--
 mm/madvise.c                           | 42 ++++++++++++++------------
 6 files changed, 41 insertions(+), 25 deletions(-)
diff mbox series

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 23e7f93d3956..8558b7549431 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4720,7 +4720,8 @@  static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
-	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
+	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice,
+			 MADV_F_NONE);
 	if (ret < 0)
 		req_set_fail(req);
 	io_req_complete(req, ret);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 407b63ab4185..31f514ff36be 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -228,7 +228,8 @@  int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
 		     int advice);
 int madvise_collapse(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end);
+		     unsigned long start, unsigned long end,
+		     unsigned int flags);
 void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
 			   unsigned long end, long adjust_next);
 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc69d2a69912..f4776f4cda48 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2690,7 +2690,8 @@  extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
 		       struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
 		     struct list_head *uf);
-extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);
+extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+		      int behavior, unsigned int flags);
 
 #ifdef CONFIG_MMU
 extern int __mm_populate(unsigned long addr, unsigned long len,
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..b81f4b1b18ba 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -79,6 +79,14 @@ 
 
 #define MADV_COLLAPSE	25		/* Synchronous hugepage collapse */
 
+/* process_madvise() flags */
+#define MADV_F_NONE		0x0
+
+/* process_madvise(MADV_COLLAPSE) flags */
+#define MADV_F_COLLAPSE_LIMITS	0x1	/* respect system khugepaged/max_ptes_* sysfs limits */
+#define MADV_F_COLLAPSE_DEFRAG	0x2	/* force enable sync collapse + reclaim */
+#define MADV_F_COLLAPSE_MASK	(MADV_F_COLLAPSE_LIMITS | MADV_F_COLLAPSE_DEFRAG)
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e8156f15a3da..993de0c6eaa9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2942,7 +2942,7 @@  static int _madvise_collapse(struct mm_struct *mm,
 
 int madvise_collapse(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev, unsigned long start,
-		     unsigned long end)
+		     unsigned long end, unsigned int flags)
 {
 	struct collapse_control cc;
 	gfp_t gfp;
@@ -2953,8 +2953,9 @@  int madvise_collapse(struct vm_area_struct *vma,
 	mmap_assert_locked(mm);
 
 	mmgrab(mm);
-	collapse_control_init(&cc, /* enforce_pte_scan_limits= */ false);
-	gfp = vma_thp_gfp_mask(vma);
+	collapse_control_init(&cc, flags & MADV_F_COLLAPSE_LIMITS);
+	gfp = vma_thp_gfp_mask(vma) | (flags & MADV_F_COLLAPSE_DEFRAG
+			? __GFP_DIRECT_RECLAIM : 0);
 	lru_add_drain(); /* lru_add_drain_all() too heavy here */
 	error = _madvise_collapse(mm, vma, prev, start, end, gfp, &cc);
 	mmap_assert_locked(mm);
diff --git a/mm/madvise.c b/mm/madvise.c
index 292aa017c150..7d094d86d2f1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -979,7 +979,7 @@  static long madvise_remove(struct vm_area_struct *vma,
 static int madvise_vma_behavior(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
 				unsigned long start, unsigned long end,
-				unsigned long behavior)
+				unsigned long behavior, unsigned int flags)
 {
 	int error;
 	struct anon_vma_name *anon_name;
@@ -1048,7 +1048,7 @@  static int madvise_vma_behavior(struct vm_area_struct *vma,
 			goto out;
 		break;
 	case MADV_COLLAPSE:
-		return madvise_collapse(vma, prev, start, end);
+		return madvise_collapse(vma, prev, start, end, flags);
 	}
 
 	anon_name = anon_vma_name(vma);
@@ -1160,13 +1160,19 @@  madvise_behavior_valid(int behavior)
 }
 
 static bool
-process_madvise_behavior_valid(int behavior)
+process_madvise_behavior_valid(int behavior, struct task_struct *task,
+			       unsigned int flags)
 {
 	switch (behavior) {
 	case MADV_COLD:
 	case MADV_PAGEOUT:
 	case MADV_WILLNEED:
-		return true;
+		return flags == 0;
+	case MADV_COLLAPSE:
+		return (flags & ~MADV_F_COLLAPSE_MASK) == 0 &&
+				(capable(CAP_SYS_ADMIN) ||
+				 (task == current) ||
+				 (flags & MADV_F_COLLAPSE_LIMITS));
 	default:
 		return false;
 	}
@@ -1182,10 +1188,11 @@  process_madvise_behavior_valid(int behavior)
  */
 static
 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		      unsigned long end, unsigned long arg,
+		      unsigned long end, unsigned long arg, unsigned int flags,
 		      int (*visit)(struct vm_area_struct *vma,
 				   struct vm_area_struct **prev, unsigned long start,
-				   unsigned long end, unsigned long arg))
+				   unsigned long end, unsigned long arg,
+				   unsigned int flags))
 {
 	struct vm_area_struct *vma;
 	struct vm_area_struct *prev;
@@ -1222,7 +1229,7 @@  int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 			tmp = end;
 
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = visit(vma, &prev, start, tmp, arg);
+		error = visit(vma, &prev, start, tmp, arg, flags);
 		if (error)
 			return error;
 		start = tmp;
@@ -1285,7 +1292,7 @@  int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 		return 0;
 
 	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
-				 madvise_vma_anon_name);
+				 madvise_vma_anon_name, MADV_F_NONE);
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 /*
@@ -1359,7 +1366,8 @@  int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+	       int behavior, unsigned int flags)
 {
 	unsigned long end;
 	int error;
@@ -1401,8 +1409,8 @@  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	}
 
 	blk_start_plug(&plug);
-	error = madvise_walk_vmas(mm, start, end, behavior,
-			madvise_vma_behavior);
+	error = madvise_walk_vmas(mm, start, end, behavior, flags,
+				  madvise_vma_behavior);
 	blk_finish_plug(&plug);
 	if (write)
 		mmap_write_unlock(mm);
@@ -1414,7 +1422,8 @@  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-	return do_madvise(current->mm, start, len_in, behavior);
+	return do_madvise(current->mm, start, len_in, behavior,
+			  MADV_F_NONE);
 }
 
 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
@@ -1429,11 +1438,6 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	size_t total_len;
 	unsigned int f_flags;
 
-	if (flags != 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
 	if (ret < 0)
 		goto out;
@@ -1444,7 +1448,7 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 		goto free_iov;
 	}
 
-	if (!process_madvise_behavior_valid(behavior)) {
+	if (!process_madvise_behavior_valid(behavior, task, flags)) {
 		ret = -EINVAL;
 		goto release_task;
 	}
@@ -1470,7 +1474,7 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	while (iov_iter_count(&iter)) {
 		iovec = iov_iter_iovec(&iter);
 		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
-					iovec.iov_len, behavior);
+					iovec.iov_len, behavior, flags);
 		if (ret < 0)
 			break;
 		iov_iter_advance(&iter, iovec.iov_len);