
[RFC,5/8] mm/madvise: perform certain operations once on process_madvise()

Message ID 20210926161259.238054-6-namit@vmware.com (mailing list archive)
State New
Series: mm/madvise: support process_madvise(MADV_DONTNEED)

Commit Message

Nadav Amit Sept. 26, 2021, 4:12 p.m. UTC
From: Nadav Amit <namit@vmware.com>

There are certain operations that need to be performed only once per
process_madvise() call, rather than once for each IO vector. Acquiring
the mmap lock and initializing the blk_plug are two such operations.

Collect these operations into madvise_start() and madvise_finish().
Subsequent patches will add further operations to these functions.
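
In outline, the resulting call pattern in process_madvise() is the
following (a simplified sketch of the code added below; iov_iter setup
and error paths elided):

	behavior_info = madvise_behavior_info(behavior);

	/* Take the mmap lock and start the blk_plug once per call. */
	ret = madvise_start(mm, behavior_info, &plug);
	if (ret)
		return ret;

	/* Apply the advice to each IO vector under the same lock/plug. */
	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = madvise_one_range(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	/* Drop the mmap lock and finish the blk_plug once. */
	madvise_finish(mm, behavior_info, &plug);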

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Colin Cross <ccross@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 mm/madvise.c | 139 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 86 insertions(+), 53 deletions(-)

Patch

diff --git a/mm/madvise.c b/mm/madvise.c
index 127507c71ba9..84b86ae85671 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -43,6 +43,13 @@  struct madvise_walk_private {
 struct madvise_info {
 	u8 behavior_valid: 1;
 	u8 process_behavior_valid: 1;
+	u8 no_mmap_lock: 1;
+
+	/*
+	 * Any behaviour which results in changes to the vma->vm_flags needs to
+	 * take mmap_lock for writing. Others, which simply traverse vmas, need
+	 * to only take it for reading.
+	 */
 	u8 need_mmap_read_only: 1;
 };
 
@@ -120,9 +127,11 @@  static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
 #ifdef CONFIG_MEMORY_FAILURE
 	[MADV_HWPOISON] = {
 		.behavior_valid = 1,
+		.no_mmap_lock = 1,
 	},
 	[MADV_SOFT_OFFLINE] = {
 		.behavior_valid = 1,
+		.no_mmap_lock = 1,
 	},
 #endif
 	[MADV_POPULATE_READ] = {
@@ -135,16 +144,6 @@  static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
 	},
 };
 
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
-	return !madvise_info[behavior].need_mmap_read_only;
-}
-
 /*
  * We can potentially split a vm area into separate
  * areas, each area with its own behavior.
@@ -1081,26 +1080,6 @@  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	}
 }
 
-static bool
-madvise_behavior_valid(int *behavior)
-{
-	if (*behavior >= ARRAY_SIZE(madvise_info))
-		return false;
-
-	*behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
-	return madvise_info[*behavior].behavior_valid;
-}
-
-static bool
-process_madvise_behavior_valid(int *behavior)
-{
-	if (*behavior >= ARRAY_SIZE(madvise_info))
-		return false;
-
-	*behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
-	return madvise_info[*behavior].process_behavior_valid;
-}
-
 /*
  * The madvise(2) system call.
  *
@@ -1171,21 +1150,17 @@  process_madvise_behavior_valid(int *behavior)
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
+int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in,
+		      int behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
 	int unmapped_error = 0;
 	int error = -EINVAL;
-	int write;
 	size_t len;
-	struct blk_plug plug;
 
 	start = untagged_addr(start);
 
-	if (!madvise_behavior_valid(&behavior))
-		return error;
-
 	if (!PAGE_ALIGNED(start))
 		return error;
 	len = PAGE_ALIGN(len_in);
@@ -1207,14 +1182,6 @@  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 		return madvise_inject_error(behavior, start, start + len_in);
 #endif
 
-	write = madvise_need_mmap_write(behavior);
-	if (write) {
-		if (mmap_write_lock_killable(mm))
-			return -EINTR;
-	} else {
-		mmap_read_lock(mm);
-	}
-
 	/*
 	 * If the interval [start,end) covers some unmapped address
 	 * ranges, just ignore them, but return -ENOMEM at the end.
@@ -1224,7 +1191,6 @@  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	if (vma && start > vma->vm_start)
 		prev = vma;
 
-	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
 		error = -ENOMEM;
@@ -1260,15 +1226,72 @@  int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 			vma = find_vma(mm, start);
 	}
 out:
-	blk_finish_plug(&plug);
-	if (write)
-		mmap_write_unlock(mm);
-	else
-		mmap_read_unlock(mm);
 
 	return error;
 }
 
+static int
+madvise_start(struct mm_struct *mm, struct madvise_info behavior_info,
+	      struct blk_plug *plug)
+{
+	if (!behavior_info.no_mmap_lock) {
+		if (behavior_info.need_mmap_read_only)
+			mmap_read_lock(mm);
+		else if (mmap_write_lock_killable(mm))
+			return -EINTR;
+	}
+
+	blk_start_plug(plug);
+	return 0;
+}
+
+static void
+madvise_finish(struct mm_struct *mm, struct madvise_info behavior_info,
+	       struct blk_plug *plug)
+{
+	blk_finish_plug(plug);
+
+	if (!behavior_info.no_mmap_lock) {
+		if (behavior_info.need_mmap_read_only)
+			mmap_read_unlock(mm);
+		else
+			mmap_write_unlock(mm);
+	}
+}
+
+static struct madvise_info madvise_behavior_info(int behavior)
+{
+	if (behavior >= ARRAY_SIZE(madvise_info) || behavior < 0) {
+		const struct madvise_info invalid = {0};
+		return invalid;
+	}
+
+	behavior = array_index_nospec(behavior, ARRAY_SIZE(madvise_info));
+	return madvise_info[behavior];
+}
+
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+	       int behavior)
+{
+	struct madvise_info behavior_info;
+	struct blk_plug plug;
+	int ret = -EINVAL;
+
+	behavior_info = madvise_behavior_info(behavior);
+
+	if (!behavior_info.behavior_valid)
+		return ret;
+
+	ret = madvise_start(mm, behavior_info, &plug);
+	if (ret != 0)
+		return ret;
+
+	ret = madvise_one_range(mm, start, len_in, behavior);
+
+	madvise_finish(mm, behavior_info, &plug);
+	return ret;
+}
+
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
 	return do_madvise(current->mm, start, len_in, behavior);
@@ -1286,6 +1309,8 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	struct mm_struct *mm;
 	size_t total_len;
 	unsigned int f_flags;
+	struct madvise_info behavior_info;
+	struct blk_plug plug;
 
 	if (flags != 0) {
 		ret = -EINVAL;
@@ -1308,7 +1333,9 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 		goto put_pid;
 	}
 
-	if (!process_madvise_behavior_valid(&behavior)) {
+	behavior_info = madvise_behavior_info(behavior);
+
+	if (!behavior_info.process_behavior_valid) {
 		ret = -EINVAL;
 		goto release_task;
 	}
@@ -1331,15 +1358,21 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 
 	total_len = iov_iter_count(&iter);
 
+	ret = madvise_start(mm, behavior_info, &plug);
+	if (ret != 0)
+		goto release_mm;
+
 	while (iov_iter_count(&iter)) {
 		iovec = iov_iter_iovec(&iter);
-		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
-					iovec.iov_len, behavior);
+		ret = madvise_one_range(mm, (unsigned long)iovec.iov_base,
+				iovec.iov_len, behavior);
 		if (ret < 0)
 			break;
 		iov_iter_advance(&iter, iovec.iov_len);
 	}
 
+	madvise_finish(mm, behavior_info, &plug);
+
 	if (ret == 0)
 		ret = total_len - iov_iter_count(&iter);