diff mbox series

[1/2] mm/madvise: introduce PR_MADV_SELF flag to process_madvise()

Message ID 077be0d59cb1047870a84c87c62e7b027af1c75d.1727106751.git.lorenzo.stoakes@oracle.com (mailing list archive)
State New
Headers show
Series unrestrict process_madvise() for current process | expand

Commit Message

Lorenzo Stoakes Sept. 23, 2024, 4:03 p.m. UTC
process_madvise() was conceived as a useful means for performing a vector
of madvise() operations on a remote process's address space.

However it's useful to be able to do so on the current process also. It is
currently rather clunky to do this (requiring a pidfd to be opened for the
current process) and introduces unnecessary overhead in incrementing
reference counts for the task and mm.

Avoid all of this by providing a PR_MADV_SELF flag, which causes
process_madvise() to simply ignore the pidfd parameter and instead apply
the operation to the current process.

Since we are operating on our own process, no restrictions need be applied
on behaviors we can perform, so do not limit these in that case.

Also extend the case of a user specifying the current process via pidfd to
not be restricted on behaviors which can be performed.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 include/uapi/asm-generic/mman-common.h |  2 +
 mm/madvise.c                           | 58 +++++++++++++++++++-------
 2 files changed, 44 insertions(+), 16 deletions(-)

Comments

Shakeel Butt Sept. 23, 2024, 6:56 p.m. UTC | #1
On Mon, Sep 23, 2024 at 05:03:56PM GMT, Lorenzo Stoakes wrote:
[...]
>  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
>  		size_t, vlen, int, behavior, unsigned int, flags)
>  {
> @@ -1486,10 +1509,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
>  	struct iov_iter iter;
>  	struct task_struct *task;
>  	struct mm_struct *mm;
> -	size_t total_len;
>  	unsigned int f_flags;
>  
> -	if (flags != 0) {
> +	if (flags & ~PR_MADV_SELF) {
>  		ret = -EINVAL;
>  		goto out;
>  	}
> @@ -1498,13 +1520,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
>  	if (ret < 0)
>  		goto out;
>  
> +	/*
> +	 * Perform an madvise operation on the current process. No restrictions
> +	 * need be applied, nor do we need to pin the task or mm_struct.
> +	 */
> +	if (flags & PR_MADV_SELF) {
> +		ret = vector_madvise(current->mm, &iter, behavior);
> +		goto free_iov;
> +	}
> +
>  	task = pidfd_get_task(pidfd, &f_flags);
>  	if (IS_ERR(task)) {
>  		ret = PTR_ERR(task);
>  		goto free_iov;
>  	}
>  
> -	if (!process_madvise_behavior_valid(behavior)) {
> +	/*
> +	 * We need only perform this check if we are attempting to manipulate a
> +	 * remote process's address space.
> +	 */
> +	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {

Move the above check after mm is initialized i.e. mm = mm_access().

Shakeel
Lorenzo Stoakes Sept. 23, 2024, 7:34 p.m. UTC | #2
On Mon, Sep 23, 2024 at 11:56:06AM GMT, Shakeel Butt wrote:
> On Mon, Sep 23, 2024 at 05:03:56PM GMT, Lorenzo Stoakes wrote:
> [...]
> >  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
> >  		size_t, vlen, int, behavior, unsigned int, flags)
> >  {
> > @@ -1486,10 +1509,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
> >  	struct iov_iter iter;
> >  	struct task_struct *task;
> >  	struct mm_struct *mm;
> > -	size_t total_len;
> >  	unsigned int f_flags;
> >
> > -	if (flags != 0) {
> > +	if (flags & ~PR_MADV_SELF) {
> >  		ret = -EINVAL;
> >  		goto out;
> >  	}
> > @@ -1498,13 +1520,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
> >  	if (ret < 0)
> >  		goto out;
> >
> > +	/*
> > +	 * Perform an madvise operation on the current process. No restrictions
> > +	 * need be applied, nor do we need to pin the task or mm_struct.
> > +	 */
> > +	if (flags & PR_MADV_SELF) {
> > +		ret = vector_madvise(current->mm, &iter, behavior);
> > +		goto free_iov;
> > +	}
> > +
> >  	task = pidfd_get_task(pidfd, &f_flags);
> >  	if (IS_ERR(task)) {
> >  		ret = PTR_ERR(task);
> >  		goto free_iov;
> >  	}
> >
> > -	if (!process_madvise_behavior_valid(behavior)) {
> > +	/*
> > +	 * We need only perform this check if we are attempting to manipulate a
> > +	 * remote process's address space.
> > +	 */
> > +	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
>
> Move the above check after mm is initialized i.e. mm = mm_access().
>
> Shakeel

Ugh, sorry silly one there! Reflexively put that check in the original position.

Enclose a quick fix-patch for it, will fix on any respin also.

----8<----
From dc09e0edf1cf71a89cc4cfc3ec73fdae3c2ab86c Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 23 Sep 2024 20:33:07 +0100
Subject: [PATCH] mm/madvise: retrieve mm before checking

---
 mm/madvise.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 549b36d1463c..49d12f98b677 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1535,20 +1535,20 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 		goto free_iov;
 	}

+	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+	if (IS_ERR_OR_NULL(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto release_task;
+	}
+
 	/*
 	 * We need only perform this check if we are attempting to manipulate a
 	 * remote process's address space.
 	 */
 	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
 		ret = -EINVAL;
-		goto release_task;
-	}
-
-	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
-	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
-	if (IS_ERR_OR_NULL(mm)) {
-		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
-		goto release_task;
+		goto release_mm;
 	}

 	/*
--
2.46.0
kernel test robot Sept. 23, 2024, 9:20 p.m. UTC | #3
Hi Lorenzo,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on arnd-asm-generic/master soc/for-next linus/master v6.11 next-20240923]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lorenzo-Stoakes/mm-madvise-introduce-PR_MADV_SELF-flag-to-process_madvise/20240924-000845
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/077be0d59cb1047870a84c87c62e7b027af1c75d.1727106751.git.lorenzo.stoakes%40oracle.com
patch subject: [PATCH 1/2] mm/madvise: introduce PR_MADV_SELF flag to process_madvise()
config: arm-aspeed_g4_defconfig (https://download.01.org/0day-ci/archive/20240924/202409240527.pAgR35QJ-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 8663a75fa2f31299ab8d1d90288d9df92aadee88)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240924/202409240527.pAgR35QJ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409240527.pAgR35QJ-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from mm/madvise.c:9:
   In file included from include/linux/mman.h:5:
   In file included from include/linux/mm.h:2198:
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from mm/madvise.c:21:
   include/linux/mm_inline.h:47:41: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
      47 |         __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
         |                                    ~~~~~~~~~~~ ^ ~~~
   include/linux/mm_inline.h:49:22: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
      49 |                                 NR_ZONE_LRU_BASE + lru, nr_pages);
         |                                 ~~~~~~~~~~~~~~~~ ^ ~~~
>> mm/madvise.c:1542:6: warning: variable 'mm' is uninitialized when used here [-Wuninitialized]
    1542 |         if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
         |             ^~
   mm/madvise.c:1511:22: note: initialize the variable 'mm' to silence this warning
    1511 |         struct mm_struct *mm;
         |                             ^
         |                              = NULL
   4 warnings generated.


vim +/mm +1542 mm/madvise.c

  1502	
  1503	SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
  1504			size_t, vlen, int, behavior, unsigned int, flags)
  1505	{
  1506		ssize_t ret;
  1507		struct iovec iovstack[UIO_FASTIOV];
  1508		struct iovec *iov = iovstack;
  1509		struct iov_iter iter;
  1510		struct task_struct *task;
  1511		struct mm_struct *mm;
  1512		unsigned int f_flags;
  1513	
  1514		if (flags & ~PR_MADV_SELF) {
  1515			ret = -EINVAL;
  1516			goto out;
  1517		}
  1518	
  1519		ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
  1520		if (ret < 0)
  1521			goto out;
  1522	
  1523		/*
  1524		 * Perform an madvise operation on the current process. No restrictions
  1525		 * need be applied, nor do we need to pin the task or mm_struct.
  1526		 */
  1527		if (flags & PR_MADV_SELF) {
  1528			ret = vector_madvise(current->mm, &iter, behavior);
  1529			goto free_iov;
  1530		}
  1531	
  1532		task = pidfd_get_task(pidfd, &f_flags);
  1533		if (IS_ERR(task)) {
  1534			ret = PTR_ERR(task);
  1535			goto free_iov;
  1536		}
  1537	
  1538		/*
  1539		 * We need only perform this check if we are attempting to manipulate a
  1540		 * remote process's address space.
  1541		 */
> 1542		if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
kernel test robot Sept. 23, 2024, 9:30 p.m. UTC | #4
Hi Lorenzo,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on arnd-asm-generic/master soc/for-next linus/master v6.11 next-20240923]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lorenzo-Stoakes/mm-madvise-introduce-PR_MADV_SELF-flag-to-process_madvise/20240924-000845
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/077be0d59cb1047870a84c87c62e7b027af1c75d.1727106751.git.lorenzo.stoakes%40oracle.com
patch subject: [PATCH 1/2] mm/madvise: introduce PR_MADV_SELF flag to process_madvise()
config: parisc-allnoconfig (https://download.01.org/0day-ci/archive/20240924/202409240556.LgM8vOIF-lkp@intel.com/config)
compiler: hppa-linux-gcc (GCC) 14.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240924/202409240556.LgM8vOIF-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409240556.LgM8vOIF-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/madvise.c: In function '__do_sys_process_madvise':
>> mm/madvise.c:1514:22: error: 'PR_MADV_SELF' undeclared (first use in this function)
    1514 |         if (flags & ~PR_MADV_SELF) {
         |                      ^~~~~~~~~~~~
   mm/madvise.c:1514:22: note: each undeclared identifier is reported only once for each function it appears in


vim +/PR_MADV_SELF +1514 mm/madvise.c

  1502	
  1503	SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
  1504			size_t, vlen, int, behavior, unsigned int, flags)
  1505	{
  1506		ssize_t ret;
  1507		struct iovec iovstack[UIO_FASTIOV];
  1508		struct iovec *iov = iovstack;
  1509		struct iov_iter iter;
  1510		struct task_struct *task;
  1511		struct mm_struct *mm;
  1512		unsigned int f_flags;
  1513	
> 1514		if (flags & ~PR_MADV_SELF) {
Arnd Bergmann Sept. 23, 2024, 9:49 p.m. UTC | #5
On Mon, Sep 23, 2024, at 19:34, Lorenzo Stoakes wrote:
> On Mon, Sep 23, 2024 at 11:56:06AM GMT, Shakeel Butt wrote:
>
> +	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
> +	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
> +	if (IS_ERR_OR_NULL(mm)) {
> +		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> +		goto release_task;
> +	}

Any chance we can fix mm_access() to not be able to return
a NULL pointer and an error pointer?  IS_ERR_OR_NULL() is
usually an indication of a confusing API, and this is
clearly one of them, given that only one of the
callers actually wants the NULL value instead of -ESRCH.

    Arnd
kernel test robot Sept. 24, 2024, 3:15 a.m. UTC | #6
Hi Lorenzo,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on soc/for-next linus/master v6.11 next-20240923]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Lorenzo-Stoakes/mm-madvise-introduce-PR_MADV_SELF-flag-to-process_madvise/20240924-000845
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/077be0d59cb1047870a84c87c62e7b027af1c75d.1727106751.git.lorenzo.stoakes%40oracle.com
patch subject: [PATCH 1/2] mm/madvise: introduce PR_MADV_SELF flag to process_madvise()
config: mips-ip32_defconfig (https://download.01.org/0day-ci/archive/20240924/202409241034.6ilzMh4w-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 8663a75fa2f31299ab8d1d90288d9df92aadee88)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240924/202409241034.6ilzMh4w-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409241034.6ilzMh4w-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from mm/madvise.c:9:
   In file included from include/linux/mman.h:5:
   In file included from include/linux/mm.h:2198:
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from mm/madvise.c:21:
   include/linux/mm_inline.h:47:41: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
      47 |         __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
         |                                    ~~~~~~~~~~~ ^ ~~~
   include/linux/mm_inline.h:49:22: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
      49 |                                 NR_ZONE_LRU_BASE + lru, nr_pages);
         |                                 ~~~~~~~~~~~~~~~~ ^ ~~~
>> mm/madvise.c:1514:15: error: use of undeclared identifier 'PR_MADV_SELF'
    1514 |         if (flags & ~PR_MADV_SELF) {
         |                      ^
   mm/madvise.c:1527:14: error: use of undeclared identifier 'PR_MADV_SELF'
    1527 |         if (flags & PR_MADV_SELF) {
         |                     ^
   3 warnings and 2 errors generated.


vim +/PR_MADV_SELF +1514 mm/madvise.c

  1502	
  1503	SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
  1504			size_t, vlen, int, behavior, unsigned int, flags)
  1505	{
  1506		ssize_t ret;
  1507		struct iovec iovstack[UIO_FASTIOV];
  1508		struct iovec *iov = iovstack;
  1509		struct iov_iter iter;
  1510		struct task_struct *task;
  1511		struct mm_struct *mm;
  1512		unsigned int f_flags;
  1513	
> 1514		if (flags & ~PR_MADV_SELF) {
diff mbox series

Patch

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..8f59f23dee09 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -87,4 +87,6 @@ 
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
 				 PKEY_DISABLE_WRITE)
 
+#define PR_MADV_SELF	(1<<0)		/* process_madvise() flag - apply to self */
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index ff139e57cca2..549b36d1463c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1208,7 +1208,8 @@  madvise_behavior_valid(int behavior)
 	}
 }
 
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
 {
 	switch (behavior) {
 	case MADV_COLD:
@@ -1477,6 +1478,28 @@  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	return do_madvise(current->mm, start, len_in, behavior);
 }
 
+/* Perform an madvise operation over a vector of addresses and lengths. */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+			      int behavior)
+{
+	ssize_t ret = 0;
+	size_t total_len;
+
+	total_len = iov_iter_count(iter);
+
+	while (iov_iter_count(iter)) {
+		ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
+				 iter_iov_len(iter), behavior);
+		if (ret < 0)
+			break;
+		iov_iter_advance(iter, iter_iov_len(iter));
+	}
+
+	ret = (total_len - iov_iter_count(iter)) ? : ret;
+
+	return ret;
+}
+
 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 		size_t, vlen, int, behavior, unsigned int, flags)
 {
@@ -1486,10 +1509,9 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	struct iov_iter iter;
 	struct task_struct *task;
 	struct mm_struct *mm;
-	size_t total_len;
 	unsigned int f_flags;
 
-	if (flags != 0) {
+	if (flags & ~PR_MADV_SELF) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1498,13 +1520,26 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * Perform an madvise operation on the current process. No restrictions
+	 * need be applied, nor do we need to pin the task or mm_struct.
+	 */
+	if (flags & PR_MADV_SELF) {
+		ret = vector_madvise(current->mm, &iter, behavior);
+		goto free_iov;
+	}
+
 	task = pidfd_get_task(pidfd, &f_flags);
 	if (IS_ERR(task)) {
 		ret = PTR_ERR(task);
 		goto free_iov;
 	}
 
-	if (!process_madvise_behavior_valid(behavior)) {
+	/*
+	 * We need only perform this check if we are attempting to manipulate a
+	 * remote process's address space.
+	 */
+	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
 		ret = -EINVAL;
 		goto release_task;
 	}
@@ -1518,24 +1553,15 @@  SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 
 	/*
 	 * Require CAP_SYS_NICE for influencing process performance. Note that
-	 * only non-destructive hints are currently supported.
+	 * only non-destructive hints are currently supported for remote
+	 * processes.
 	 */
 	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
 		ret = -EPERM;
 		goto release_mm;
 	}
 
-	total_len = iov_iter_count(&iter);
-
-	while (iov_iter_count(&iter)) {
-		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
-					iter_iov_len(&iter), behavior);
-		if (ret < 0)
-			break;
-		iov_iter_advance(&iter, iter_iov_len(&iter));
-	}
-
-	ret = (total_len - iov_iter_count(&iter)) ? : ret;
+	ret = vector_madvise(mm, &iter, behavior);
 
 release_mm:
 	mmput(mm);