@@ -43,6 +43,13 @@ struct madvise_walk_private {
struct madvise_info {
u8 behavior_valid: 1;
u8 process_behavior_valid: 1;
+ u8 no_mmap_lock: 1;
+
+ /*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
u8 need_mmap_read_only: 1;
};
@@ -120,9 +127,11 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
#ifdef CONFIG_MEMORY_FAILURE
[MADV_HWPOISON] = {
.behavior_valid = 1,
+ .no_mmap_lock = 1,
},
[MADV_SOFT_OFFLINE] = {
.behavior_valid = 1,
+ .no_mmap_lock = 1,
},
#endif
[MADV_POPULATE_READ] = {
@@ -135,16 +144,6 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
},
};
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
- return !madvise_info[behavior].need_mmap_read_only;
-}
-
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
@@ -1081,26 +1080,6 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
}
}
-static bool
-madvise_behavior_valid(int *behavior)
-{
- if (*behavior >= ARRAY_SIZE(madvise_info))
- return false;
-
- *behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
- return madvise_info[*behavior].behavior_valid;
-}
-
-static bool
-process_madvise_behavior_valid(int *behavior)
-{
- if (*behavior >= ARRAY_SIZE(madvise_info))
- return false;
-
- *behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
- return madvise_info[*behavior].process_behavior_valid;
-}
-
/*
* The madvise(2) system call.
*
@@ -1171,21 +1150,17 @@ process_madvise_behavior_valid(int *behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
+int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in,
+ int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
- int write;
size_t len;
- struct blk_plug plug;
start = untagged_addr(start);
- if (!madvise_behavior_valid(&behavior))
- return error;
-
if (!PAGE_ALIGNED(start))
return error;
len = PAGE_ALIGN(len_in);
@@ -1207,14 +1182,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
return madvise_inject_error(behavior, start, start + len_in);
#endif
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
-
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
@@ -1224,7 +1191,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
if (vma && start > vma->vm_start)
prev = vma;
- blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
error = -ENOMEM;
@@ -1260,15 +1226,72 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
vma = find_vma(mm, start);
}
out:
- blk_finish_plug(&plug);
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
return error;
}
+static int
+madvise_start(struct mm_struct *mm, struct madvise_info behavior_info,
+ struct blk_plug *plug)
+{
+ if (!behavior_info.no_mmap_lock) {
+ if (behavior_info.need_mmap_read_only)
+ mmap_read_lock(mm);
+ else if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ }
+
+ blk_start_plug(plug);
+ return 0;
+}
+
+static void
+madvise_finish(struct mm_struct *mm, struct madvise_info behavior_info,
+ struct blk_plug *plug)
+{
+ blk_finish_plug(plug);
+
+ if (!behavior_info.no_mmap_lock) {
+ if (behavior_info.need_mmap_read_only)
+ mmap_read_unlock(mm);
+ else
+ mmap_write_unlock(mm);
+ }
+}
+
+static struct madvise_info madvise_behavior_info(int behavior)
+{
+ if (behavior >= ARRAY_SIZE(madvise_info) || behavior < 0) {
+ const struct madvise_info invalid = {0};
+ return invalid;
+ }
+
+ behavior = array_index_nospec(behavior, ARRAY_SIZE(madvise_info));
+ return madvise_info[behavior];
+}
+
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+ int behavior)
+{
+ struct madvise_info behavior_info;
+ struct blk_plug plug;
+ int ret = -EINVAL;
+
+ behavior_info = madvise_behavior_info(behavior);
+
+ if (!behavior_info.behavior_valid)
+ return ret;
+
+ ret = madvise_start(mm, behavior_info, &plug);
+ if (ret != 0)
+ return ret;
+
+ ret = madvise_one_range(mm, start, len_in, behavior);
+
+ madvise_finish(mm, behavior_info, &plug);
+ return ret;
+}
+
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(current->mm, start, len_in, behavior);
@@ -1286,6 +1309,8 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct mm_struct *mm;
size_t total_len;
unsigned int f_flags;
+ struct madvise_info behavior_info;
+ struct blk_plug plug;
if (flags != 0) {
ret = -EINVAL;
@@ -1308,7 +1333,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto put_pid;
}
- if (!process_madvise_behavior_valid(&behavior)) {
+ behavior_info = madvise_behavior_info(behavior);
+
+ if (!behavior_info.process_behavior_valid) {
ret = -EINVAL;
goto release_task;
}
@@ -1331,15 +1358,21 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
total_len = iov_iter_count(&iter);
+ ret = madvise_start(mm, behavior_info, &plug);
+ if (ret != 0)
+ goto release_mm;
+
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
- ret = do_madvise(mm, (unsigned long)iovec.iov_base,
- iovec.iov_len, behavior);
+ ret = madvise_one_range(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
+ madvise_finish(mm, behavior_info, &plug);
+
if (ret == 0)
ret = total_len - iov_iter_count(&iter);