diff mbox series

[v1,1/2] mseal: Two fixes for madvise(MADV_DONTNEED) when sealed

Message ID 20241017005105.3047458-2-jeffxu@chromium.org (mailing list archive)
State New, archived
Headers show
Series mseal: fixing madvise for file-backed mapping and PROT_NONE | expand

Commit Message

Jeff Xu Oct. 17, 2024, 12:51 a.m. UTC
From: Jeff Xu <jeffxu@chromium.org>

Two fixes for madvise(MADV_DONTNEED) when sealed.

For PROT_NONE mappings, the previous blocking of
madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
order to free the page.

For file-backed, private, read-only memory mappings, we previously did
not block the madvise(MADV_DONTNEED). This was based on
the assumption that the memory's content, being file-backed, could be
retrieved from the file if accessed again. However, this assumption
failed to consider scenarios where a mapping is initially created as
read-write, modified, and subsequently changed to read-only. The newly
introduced VM_WASWRITE flag addresses this oversight.

Reported-by: Pedro Falcato <pedro.falcato@gmail.com>
Link:https://lore.kernel.org/all/CABi2SkW2XzuZ2-TunWOVzTEX1qc29LhjfNQ3hD4Nym8U-_f+ug@mail.gmail.com/
Fixes: 8be7258aad44 ("mseal: add mseal syscall")
Cc: <stable@vger.kernel.org> # 6.11.y: 4d1b3416659b: mm: move can_modify_vma to mm/vma.h
Cc: <stable@vger.kernel.org> # 6.11.y: 4a2dd02b0916: mm/mprotect: replace can_modify_mm with can_modify_vma
Cc: <stable@vger.kernel.org> # 6.11.y: 23c57d1fa2b9: mseal: replace can_modify_mm_madv with a vma variant
Cc: <stable@vger.kernel.org> # 6.11.y
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
---
 include/linux/mm.h |  2 ++
 mm/mprotect.c      |  3 +++
 mm/mseal.c         | 42 ++++++++++++++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 6 deletions(-)

Comments

Lorenzo Stoakes Oct. 17, 2024, 8:32 a.m. UTC | #1
On Thu, Oct 17, 2024 at 12:51:04AM +0000, jeffxu@chromium.org wrote:
> From: Jeff Xu <jeffxu@chromium.org>
>
> Two fixes for madvise(MADV_DONTNEED) when sealed.
>
> For PROT_NONE mappings, the previous blocking of
> madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> order to free the page.
>
> For file-backed, private, read-only memory mappings, we previously did
> not block the madvise(MADV_DONTNEED). This was based on
> the assumption that the memory's content, being file-backed, could be
> retrieved from the file if accessed again. However, this assumption
> failed to consider scenarios where a mapping is initially created as
> read-write, modified, and subsequently changed to read-only. The newly
> introduced VM_WASWRITE flag addresses this oversight.
>
> Reported-by: Pedro Falcato <pedro.falcato@gmail.com>
> Link:https://lore.kernel.org/all/CABi2SkW2XzuZ2-TunWOVzTEX1qc29LhjfNQ3hD4Nym8U-_f+ug@mail.gmail.com/
> Fixes: 8be7258aad44 ("mseal: add mseal syscall")
> Cc: <stable@vger.kernel.org> # 6.11.y: 4d1b3416659b: mm: move can_modify_vma to mm/vma.h
> Cc: <stable@vger.kernel.org> # 6.11.y: 4a2dd02b0916: mm/mprotect: replace can_modify_mm with can_modify_vma
> Cc: <stable@vger.kernel.org> # 6.11.y: 23c57d1fa2b9: mseal: replace can_modify_mm_madv with a vma variant
> Cc: <stable@vger.kernel.org> # 6.11.y
> Signed-off-by: Jeff Xu <jeffxu@chromium.org>
> ---
>  include/linux/mm.h |  2 ++
>  mm/mprotect.c      |  3 +++
>  mm/mseal.c         | 42 ++++++++++++++++++++++++++++++++++++------
>  3 files changed, 41 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 4c32003c8404..b402eca2565a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -430,6 +430,8 @@ extern unsigned int kobjsize(const void *objp);
>  #ifdef CONFIG_64BIT
>  /* VM is sealed, in vm_flags */
>  #define VM_SEALED	_BITUL(63)
> +/* VM was writable */

Woefully poor and misleading comment.

> +#define VM_WASWRITE	_BITUL(62)

The bar for an additional VMA flag is _really high_. As far as I'm
concerned you absolutely do not hit that bar here.

>  #endif
>
>  /* Bits set in the VMA until the stack is in its final location */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 0c5d6d06107d..6397135ca526 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -821,6 +821,9 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
>  			break;
>  		}
>
> +		if ((vma->vm_flags & VM_WRITE) && !(newflags & VM_WRITE))
> +			newflags |= VM_WASWRITE;
> +

You're making this unmergeable now!!! No! Lord this is horrid.

You can't fundamentally change how mprotect() functions to suit edge cases
for mseal, sorry.

>  		error = security_file_mprotect(vma, reqprot, prot);
>  		if (error)
>  			break;
> diff --git a/mm/mseal.c b/mm/mseal.c
> index ece977bd21e1..28f28487be17 100644
> --- a/mm/mseal.c
> +++ b/mm/mseal.c
> @@ -36,12 +36,8 @@ static bool is_madv_discard(int behavior)
>  	return false;
>  }
>
> -static bool is_ro_anon(struct vm_area_struct *vma)
> +static bool anon_is_ro(struct vm_area_struct *vma)
>  {
> -	/* check anonymous mapping. */
> -	if (vma->vm_file || vma->vm_flags & VM_SHARED)
> -		return false;
> -
>  	/*
>  	 * check for non-writable:
>  	 * PROT=RO or PKRU is not writeable.
> @@ -53,6 +49,22 @@ static bool is_ro_anon(struct vm_area_struct *vma)
>  	return false;
>  }
>
> +static bool vma_is_prot_none(struct vm_area_struct *vma)
> +{
> +	if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_NONE)
> +		return true;
> +
> +	return false;
> +}

You don't need this, there is already vma_is_accessible() in mm.h.

> +
> +static bool vma_was_writable_turn_readonly(struct vm_area_struct *vma)
> +{
> +	if (!(vma->vm_flags & VM_WRITE) && vma->vm_flags & VM_WASWRITE)
> +		return true;
> +
> +	return false;
> +}

The naming of this is horrid and confusing.

> +
>  /*
>   * Check if a vma is allowed to be modified by madvise.
>   */
> @@ -61,7 +73,25 @@ bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
>  	if (!is_madv_discard(behavior))
>  		return true;
>
> -	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
> +	/* not sealed */
> +	if (likely(can_modify_vma(vma)))

Please don't just use likely() / unlikely() because _you_ think they're
likely/unlikely. Only use them based on profiling data. if you don't have it,
remove them.

> +		return true;
> +
> +	/* PROT_NONE mapping */

Useless comment.

> +	if (vma_is_prot_none(vma))
> +		return true;
> +
> +	/* file-backed private mapping */

Err... how do you know it's a private mapping?

> +	if (vma->vm_file) {
> +		/* read-only but was writeable */
> +		if (vma_was_writable_turn_readonly(vma))
> +			return false;

This whole thing seems broken, and we already have a mechanism for this,
see mapping_writably_mapped() which _also_ handles write seals for memfd's
which you are not accounting for here.

> +
> +		return true;
> +	}
> +
> +	/* anonymous mapping is read-only */
> +	if (anon_is_ro(vma))

You're implementing subtle details here with 1 line comments (that are
pretty well useless), that's just not good enough.

Please make sure to add _meaningful_ comments that will help another
developer understand what's going on.

>  		return false;
>
>  	/* Allow by default. */
> --
> 2.47.0.rc1.288.g06298d1525-goog
>
Pedro Falcato Oct. 17, 2024, 7:37 p.m. UTC | #2
On Thu, Oct 17, 2024 at 12:51:04AM +0000, jeffxu@chromium.org wrote:
> From: Jeff Xu <jeffxu@chromium.org>
> 
> Two fixes for madvise(MADV_DONTNEED) when sealed.
>

Please separate these fixes into two separate patches.

> For PROT_NONE mappings, the previous blocking of
> madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> order to free the page.

I don't get it. Is there an actual use case for this?

> For file-backed, private, read-only memory mappings, we previously did
> not block the madvise(MADV_DONTNEED). This was based on
> the assumption that the memory's content, being file-backed, could be
> retrieved from the file if accessed again. However, this assumption
> failed to consider scenarios where a mapping is initially created as
> read-write, modified, and subsequently changed to read-only. The newly
> introduced VM_WASWRITE flag addresses this oversight.

We *do not* need this. It's sufficient to just block discard operations on read-only
private mappings. Sending a possible (fully untested) fix. If you like this approach
I can resend properly, or Andrew can pick it up, whatever floats people's boats.

----8<----

From dc5ec662dcb79156f4bdc1cba2a2575dce905ffa Mon Sep 17 00:00:00 2001
From: Pedro Falcato <pedro.falcato@gmail.com>
Date: Thu, 17 Oct 2024 20:21:10 +0100
Subject: [PATCH] mm/mseal: Disallow madvise discard on file-private sealed
 mappings

Doing an operation such as MADV_DONTNEED on a file-private mapping may
forcibly alter data by discarding CoW'd, anon pages and replacing them
with page cache pages fresh from the filesystem.

As such, this somewhat bypasses the mseal of a read-only mapping, and
should be disallowed.

Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
Fixes: 8be7258aad44 ("mseal: add mseal syscall")
Cc: <stable@vger.kernel.org> # 6.11.y
---
 mm/mseal.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mm/mseal.c b/mm/mseal.c
index 28cd17d7aaf2..d053303c5542 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -36,10 +36,15 @@ static bool is_madv_discard(int behavior)
        return false;
 }
 
-static bool is_ro_anon(struct vm_area_struct *vma)
+static bool is_ro_private(struct vm_area_struct *vma)
 {
-       /* check anonymous mapping. */
-       if (vma->vm_file || vma->vm_flags & VM_SHARED)
+       /*
+        * If shared, allow discard operations - it shouldn't
+        * affect the underlying data. Discard on private VMAs may
+        * forcibly alter data by replacing CoW'd anonymous pages
+        * with ones fresh from the page cache.
+        */
+       if (vma->vm_flags & VM_SHARED)
                return false;
 
        /*
@@ -61,7 +66,7 @@ bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
        if (!is_madv_discard(behavior))
                return true;
 
-       if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
+       if (unlikely(!can_modify_vma(vma) && is_ro_private(vma)))
                return false;
 
        /* Allow by default. */
Jeff Xu Oct. 17, 2024, 8:34 p.m. UTC | #3
Hi Pedro

On Thu, Oct 17, 2024 at 12:37 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
>
> > For PROT_NONE mappings, the previous blocking of
> > madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> > memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> > order to free the page.
>
> I don't get it. Is there an actual use case for this?
>
Sealing should not over-blocking API that it can allow to pass without
security concern, this is a case in that principle.

There is a user case for this as well: to seal NX stack on android,
Android uses PROT_NONE/madvise to set up a guide page to prevent stack
run over boundary. So we need to let madvise to pass.

> > For file-backed, private, read-only memory mappings, we previously did
> > not block the madvise(MADV_DONTNEED). This was based on
> > the assumption that the memory's content, being file-backed, could be
> > retrieved from the file if accessed again. However, this assumption
> > failed to consider scenarios where a mapping is initially created as
> > read-write, modified, and subsequently changed to read-only. The newly
> > introduced VM_WASWRITE flag addresses this oversight.
>
> We *do not* need this. It's sufficient to just block discard operations on read-only
> private mappings.
I think you meant blocking madvise(MADV_DONTNEED) on all read-only
private file-backed mappings.

I considered that option, but there is a use case for madvise on those
mappings that never get modified.

Apps can use that to free up RAM. e.g. Considering read-only .text
section, which never gets modified, madvise( MADV_DONTNEED) can free
up RAM when memory is in-stress, memory will be reclaimed from a
backed-file on next read access. Therefore we can't just block all
read-only private file-backed mapping, only those that really need to,
such as mapping changed from rw=>r (what you described)
Pedro Falcato Oct. 17, 2024, 8:49 p.m. UTC | #4
On Thu, Oct 17, 2024 at 01:34:53PM -0700, Jeff Xu wrote:
> Hi Pedro
> 
> On Thu, Oct 17, 2024 at 12:37 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> >
> > > For PROT_NONE mappings, the previous blocking of
> > > madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> > > memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> > > order to free the page.
> >
> > I don't get it. Is there an actual use case for this?
> >
> Sealing should not over-blocking API that it can allow to pass without
> security concern, this is a case in that principle.

Well, making the interface simple is also important. OpenBSD's mimmutable()
doesn't do any of this and it Just Works(tm)...

> 
> There is a user case for this as well: to seal NX stack on android,
> Android uses PROT_NONE/madvise to set up a guide page to prevent stack
> run over boundary. So we need to let madvise to pass.

And you need to MADV_DONTNEED this guard page?

> 
> > > For file-backed, private, read-only memory mappings, we previously did
> > > not block the madvise(MADV_DONTNEED). This was based on
> > > the assumption that the memory's content, being file-backed, could be
> > > retrieved from the file if accessed again. However, this assumption
> > > failed to consider scenarios where a mapping is initially created as
> > > read-write, modified, and subsequently changed to read-only. The newly
> > > introduced VM_WASWRITE flag addresses this oversight.
> >
> > We *do not* need this. It's sufficient to just block discard operations on read-only
> > private mappings.
> I think you meant blocking madvise(MADV_DONTNEED) on all read-only
> private file-backed mappings.
> 
> I considered that option, but there is a use case for madvise on those
> mappings that never get modified.
> 
> Apps can use that to free up RAM. e.g. Considering read-only .text
> section, which never gets modified, madvise( MADV_DONTNEED) can free
> up RAM when memory is in-stress, memory will be reclaimed from a
> backed-file on next read access. Therefore we can't just block all
> read-only private file-backed mapping, only those that really need to,
> such as mapping changed from rw=>r (what you described)

Does anyone actually do this? If so, why? WHYYYY?

The kernel's page reclaim logic should be perfectly cromulent. Please don't do this.
MADV_DONTNEED will also not free any pages if those are shared (rather they'll just be unmapped).

If we really need to do this, I'd maybe suggest walking through page tables, looking for
anon ptes or swap ptes (maybe inside the actual zap code?). But I would really prefer if we
didn't need to do this.
Jeff Xu Oct. 17, 2024, 8:57 p.m. UTC | #5
On Thu, Oct 17, 2024 at 1:49 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
>
> On Thu, Oct 17, 2024 at 01:34:53PM -0700, Jeff Xu wrote:
> > Hi Pedro
> >
> > On Thu, Oct 17, 2024 at 12:37 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> > >
> > > > For PROT_NONE mappings, the previous blocking of
> > > > madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> > > > memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> > > > order to free the page.
> > >
> > > I don't get it. Is there an actual use case for this?
> > >
> > Sealing should not over-blocking API that it can allow to pass without
> > security concern, this is a case in that principle.
>
> Well, making the interface simple is also important. OpenBSD's mimmutable()
> doesn't do any of this and it Just Works(tm)...
>
> >
> > There is a user case for this as well: to seal NX stack on android,
> > Android uses PROT_NONE/madvise to set up a guide page to prevent stack
> > run over boundary. So we need to let madvise to pass.
>
> And you need to MADV_DONTNEED this guard page?
>
Yes.

> >
> > > > For file-backed, private, read-only memory mappings, we previously did
> > > > not block the madvise(MADV_DONTNEED). This was based on
> > > > the assumption that the memory's content, being file-backed, could be
> > > > retrieved from the file if accessed again. However, this assumption
> > > > failed to consider scenarios where a mapping is initially created as
> > > > read-write, modified, and subsequently changed to read-only. The newly
> > > > introduced VM_WASWRITE flag addresses this oversight.
> > >
> > > We *do not* need this. It's sufficient to just block discard operations on read-only
> > > private mappings.
> > I think you meant blocking madvise(MADV_DONTNEED) on all read-only
> > private file-backed mappings.
> >
> > I considered that option, but there is a use case for madvise on those
> > mappings that never get modified.
> >
> > Apps can use that to free up RAM. e.g. Considering read-only .text
> > section, which never gets modified, madvise( MADV_DONTNEED) can free
> > up RAM when memory is in-stress, memory will be reclaimed from a
> > backed-file on next read access. Therefore we can't just block all
> > read-only private file-backed mapping, only those that really need to,
> > such as mapping changed from rw=>r (what you described)
>
> Does anyone actually do this? If so, why? WHYYYY?
>
This is a legit use case, I can't argue that it isn't.

> The kernel's page reclaim logic should be perfectly cromulent. Please don't do this.
> MADV_DONTNEED will also not free any pages if those are shared (rather they'll just be unmapped).
>
> If we really need to do this, I'd maybe suggest walking through page tables, looking for
> anon ptes or swap ptes (maybe inside the actual zap code?). But I would really prefer if we
> didn't need to do this.
>
I also considered this route, but it is too complicated. The
copy-on-write pages can be put into a swap file, also there is a huge
page to consider, etc, The complication makes it really difficult to
code it right, also scanning those pages on per VMA level will require
lock and also impact performance.


> --
> Pedro
kernel test robot Oct. 20, 2024, 9:20 a.m. UTC | #6
Hi,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.12-rc3 next-20241018]
[cannot apply to kees/for-next/pstore kees/for-next/kspp linux/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/jeffxu-chromium-org/mseal-Two-fixes-for-madvise-MADV_DONTNEED-when-sealed/20241017-085203
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20241017005105.3047458-2-jeffxu%40chromium.org
patch subject: [PATCH v1 1/2] mseal: Two fixes for madvise(MADV_DONTNEED) when sealed
config: i386-allnoconfig (https://download.01.org/0day-ci/archive/20241020/202410201611.Xd6J8QCm-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241020/202410201611.Xd6J8QCm-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410201611.Xd6J8QCm-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/mprotect.c: In function 'do_mprotect_pkey':
>> mm/mprotect.c:825:37: error: 'VM_WASWRITE' undeclared (first use in this function); did you mean 'VM_MAYWRITE'?
     825 |                         newflags |= VM_WASWRITE;
         |                                     ^~~~~~~~~~~
         |                                     VM_MAYWRITE
   mm/mprotect.c:825:37: note: each undeclared identifier is reported only once for each function it appears in


vim +825 mm/mprotect.c

   705	
   706	/*
   707	 * pkey==-1 when doing a legacy mprotect()
   708	 */
   709	static int do_mprotect_pkey(unsigned long start, size_t len,
   710			unsigned long prot, int pkey)
   711	{
   712		unsigned long nstart, end, tmp, reqprot;
   713		struct vm_area_struct *vma, *prev;
   714		int error;
   715		const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
   716		const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
   717					(prot & PROT_READ);
   718		struct mmu_gather tlb;
   719		struct vma_iterator vmi;
   720	
   721		start = untagged_addr(start);
   722	
   723		prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
   724		if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
   725			return -EINVAL;
   726	
   727		if (start & ~PAGE_MASK)
   728			return -EINVAL;
   729		if (!len)
   730			return 0;
   731		len = PAGE_ALIGN(len);
   732		end = start + len;
   733		if (end <= start)
   734			return -ENOMEM;
   735		if (!arch_validate_prot(prot, start))
   736			return -EINVAL;
   737	
   738		reqprot = prot;
   739	
   740		if (mmap_write_lock_killable(current->mm))
   741			return -EINTR;
   742	
   743		/*
   744		 * If userspace did not allocate the pkey, do not let
   745		 * them use it here.
   746		 */
   747		error = -EINVAL;
   748		if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
   749			goto out;
   750	
   751		vma_iter_init(&vmi, current->mm, start);
   752		vma = vma_find(&vmi, end);
   753		error = -ENOMEM;
   754		if (!vma)
   755			goto out;
   756	
   757		if (unlikely(grows & PROT_GROWSDOWN)) {
   758			if (vma->vm_start >= end)
   759				goto out;
   760			start = vma->vm_start;
   761			error = -EINVAL;
   762			if (!(vma->vm_flags & VM_GROWSDOWN))
   763				goto out;
   764		} else {
   765			if (vma->vm_start > start)
   766				goto out;
   767			if (unlikely(grows & PROT_GROWSUP)) {
   768				end = vma->vm_end;
   769				error = -EINVAL;
   770				if (!(vma->vm_flags & VM_GROWSUP))
   771					goto out;
   772			}
   773		}
   774	
   775		prev = vma_prev(&vmi);
   776		if (start > vma->vm_start)
   777			prev = vma;
   778	
   779		tlb_gather_mmu(&tlb, current->mm);
   780		nstart = start;
   781		tmp = vma->vm_start;
   782		for_each_vma_range(vmi, vma, end) {
   783			unsigned long mask_off_old_flags;
   784			unsigned long newflags;
   785			int new_vma_pkey;
   786	
   787			if (vma->vm_start != tmp) {
   788				error = -ENOMEM;
   789				break;
   790			}
   791	
   792			/* Does the application expect PROT_READ to imply PROT_EXEC */
   793			if (rier && (vma->vm_flags & VM_MAYEXEC))
   794				prot |= PROT_EXEC;
   795	
   796			/*
   797			 * Each mprotect() call explicitly passes r/w/x permissions.
   798			 * If a permission is not passed to mprotect(), it must be
   799			 * cleared from the VMA.
   800			 */
   801			mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
   802	
   803			new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
   804			newflags = calc_vm_prot_bits(prot, new_vma_pkey);
   805			newflags |= (vma->vm_flags & ~mask_off_old_flags);
   806	
   807			/* newflags >> 4 shift VM_MAY% in place of VM_% */
   808			if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
   809				error = -EACCES;
   810				break;
   811			}
   812	
   813			if (map_deny_write_exec(vma, newflags)) {
   814				error = -EACCES;
   815				break;
   816			}
   817	
   818			/* Allow architectures to sanity-check the new flags */
   819			if (!arch_validate_flags(newflags)) {
   820				error = -EINVAL;
   821				break;
   822			}
   823	
   824			if ((vma->vm_flags & VM_WRITE) && !(newflags & VM_WRITE))
 > 825				newflags |= VM_WASWRITE;
   826	
   827			error = security_file_mprotect(vma, reqprot, prot);
   828			if (error)
   829				break;
   830	
   831			tmp = vma->vm_end;
   832			if (tmp > end)
   833				tmp = end;
   834	
   835			if (vma->vm_ops && vma->vm_ops->mprotect) {
   836				error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
   837				if (error)
   838					break;
   839			}
   840	
   841			error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
   842			if (error)
   843				break;
   844	
   845			tmp = vma_iter_end(&vmi);
   846			nstart = tmp;
   847			prot = reqprot;
   848		}
   849		tlb_finish_mmu(&tlb);
   850	
   851		if (!error && tmp < end)
   852			error = -ENOMEM;
   853	
   854	out:
   855		mmap_write_unlock(current->mm);
   856		return error;
   857	}
   858
kernel test robot Oct. 20, 2024, 9:20 a.m. UTC | #7
Hi,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.12-rc3 next-20241018]
[cannot apply to kees/for-next/pstore kees/for-next/kspp linux/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/jeffxu-chromium-org/mseal-Two-fixes-for-madvise-MADV_DONTNEED-when-sealed/20241017-085203
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20241017005105.3047458-2-jeffxu%40chromium.org
patch subject: [PATCH v1 1/2] mseal: Two fixes for madvise(MADV_DONTNEED) when sealed
config: i386-defconfig (https://download.01.org/0day-ci/archive/20241020/202410201724.kKCsANsw-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241020/202410201724.kKCsANsw-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410201724.kKCsANsw-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/mprotect.c:825:16: error: use of undeclared identifier 'VM_WASWRITE'
     825 |                         newflags |= VM_WASWRITE;
         |                                     ^
   1 error generated.


vim +/VM_WASWRITE +825 mm/mprotect.c

   705	
   706	/*
   707	 * pkey==-1 when doing a legacy mprotect()
   708	 */
   709	static int do_mprotect_pkey(unsigned long start, size_t len,
   710			unsigned long prot, int pkey)
   711	{
   712		unsigned long nstart, end, tmp, reqprot;
   713		struct vm_area_struct *vma, *prev;
   714		int error;
   715		const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
   716		const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
   717					(prot & PROT_READ);
   718		struct mmu_gather tlb;
   719		struct vma_iterator vmi;
   720	
   721		start = untagged_addr(start);
   722	
   723		prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
   724		if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
   725			return -EINVAL;
   726	
   727		if (start & ~PAGE_MASK)
   728			return -EINVAL;
   729		if (!len)
   730			return 0;
   731		len = PAGE_ALIGN(len);
   732		end = start + len;
   733		if (end <= start)
   734			return -ENOMEM;
   735		if (!arch_validate_prot(prot, start))
   736			return -EINVAL;
   737	
   738		reqprot = prot;
   739	
   740		if (mmap_write_lock_killable(current->mm))
   741			return -EINTR;
   742	
   743		/*
   744		 * If userspace did not allocate the pkey, do not let
   745		 * them use it here.
   746		 */
   747		error = -EINVAL;
   748		if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
   749			goto out;
   750	
   751		vma_iter_init(&vmi, current->mm, start);
   752		vma = vma_find(&vmi, end);
   753		error = -ENOMEM;
   754		if (!vma)
   755			goto out;
   756	
   757		if (unlikely(grows & PROT_GROWSDOWN)) {
   758			if (vma->vm_start >= end)
   759				goto out;
   760			start = vma->vm_start;
   761			error = -EINVAL;
   762			if (!(vma->vm_flags & VM_GROWSDOWN))
   763				goto out;
   764		} else {
   765			if (vma->vm_start > start)
   766				goto out;
   767			if (unlikely(grows & PROT_GROWSUP)) {
   768				end = vma->vm_end;
   769				error = -EINVAL;
   770				if (!(vma->vm_flags & VM_GROWSUP))
   771					goto out;
   772			}
   773		}
   774	
   775		prev = vma_prev(&vmi);
   776		if (start > vma->vm_start)
   777			prev = vma;
   778	
   779		tlb_gather_mmu(&tlb, current->mm);
   780		nstart = start;
   781		tmp = vma->vm_start;
   782		for_each_vma_range(vmi, vma, end) {
   783			unsigned long mask_off_old_flags;
   784			unsigned long newflags;
   785			int new_vma_pkey;
   786	
   787			if (vma->vm_start != tmp) {
   788				error = -ENOMEM;
   789				break;
   790			}
   791	
   792			/* Does the application expect PROT_READ to imply PROT_EXEC */
   793			if (rier && (vma->vm_flags & VM_MAYEXEC))
   794				prot |= PROT_EXEC;
   795	
   796			/*
   797			 * Each mprotect() call explicitly passes r/w/x permissions.
   798			 * If a permission is not passed to mprotect(), it must be
   799			 * cleared from the VMA.
   800			 */
   801			mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
   802	
   803			new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
   804			newflags = calc_vm_prot_bits(prot, new_vma_pkey);
   805			newflags |= (vma->vm_flags & ~mask_off_old_flags);
   806	
   807			/* newflags >> 4 shift VM_MAY% in place of VM_% */
   808			if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
   809				error = -EACCES;
   810				break;
   811			}
   812	
   813			if (map_deny_write_exec(vma, newflags)) {
   814				error = -EACCES;
   815				break;
   816			}
   817	
   818			/* Allow architectures to sanity-check the new flags */
   819			if (!arch_validate_flags(newflags)) {
   820				error = -EINVAL;
   821				break;
   822			}
   823	
   824			if ((vma->vm_flags & VM_WRITE) && !(newflags & VM_WRITE))
 > 825				newflags |= VM_WASWRITE;
   826	
   827			error = security_file_mprotect(vma, reqprot, prot);
   828			if (error)
   829				break;
   830	
   831			tmp = vma->vm_end;
   832			if (tmp > end)
   833				tmp = end;
   834	
   835			if (vma->vm_ops && vma->vm_ops->mprotect) {
   836				error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
   837				if (error)
   838					break;
   839			}
   840	
   841			error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags);
   842			if (error)
   843				break;
   844	
   845			tmp = vma_iter_end(&vmi);
   846			nstart = tmp;
   847			prot = reqprot;
   848		}
   849		tlb_finish_mmu(&tlb);
   850	
   851		if (!error && tmp < end)
   852			error = -ENOMEM;
   853	
   854	out:
   855		mmap_write_unlock(current->mm);
   856		return error;
   857	}
   858
Vlastimil Babka Oct. 22, 2024, 3:55 p.m. UTC | #8
On 10/17/24 22:57, Jeff Xu wrote:
> On Thu, Oct 17, 2024 at 1:49 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
>> >
>> > > > For file-backed, private, read-only memory mappings, we previously did
>> > > > not block the madvise(MADV_DONTNEED). This was based on
>> > > > the assumption that the memory's content, being file-backed, could be
>> > > > retrieved from the file if accessed again. However, this assumption
>> > > > failed to consider scenarios where a mapping is initially created as
>> > > > read-write, modified, and subsequently changed to read-only. The newly
>> > > > introduced VM_WASWRITE flag addresses this oversight.
>> > >
>> > > We *do not* need this. It's sufficient to just block discard operations on read-only
>> > > private mappings.
>> > I think you meant blocking madvise(MADV_DONTNEED) on all read-only
>> > private file-backed mappings.
>> >
>> > I considered that option, but there is a use case for madvise on those
>> > mappings that never get modified.
>> >
>> > Apps can use that to free up RAM. e.g. Considering read-only .text
>> > section, which never gets modified, madvise( MADV_DONTNEED) can free
>> > up RAM when memory is in-stress, memory will be reclaimed from a
>> > backed-file on next read access. Therefore we can't just block all
>> > read-only private file-backed mapping, only those that really need to,
>> > such as mapping changed from rw=>r (what you described)
>>
>> Does anyone actually do this? If so, why? WHYYYY?
>>
> This is a legit use case, I can't argue that it isn't.

Could the same effect be simply achieved with MADV_COLD/MADV_PAGEOUT? That
should be able to reclaim the pages as well if they are indeed not used, but
it's non-destructive and you don't want to allow destructive madvise anyway
(i.e. no throwing away data that would be replaced by zeroes or original
file content on the next touch) so it seems overall a better fit for sealed
areas?
Theo de Raadt Oct. 22, 2024, 10:54 p.m. UTC | #9
Vlastimil Babka <vbabka@suse.cz> wrote:

> On 10/17/24 22:57, Jeff Xu wrote:
> > On Thu, Oct 17, 2024 at 1:49 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> >> >
> >> > > > For file-backed, private, read-only memory mappings, we previously did
> >> > > > not block the madvise(MADV_DONTNEED). This was based on
> >> > > > the assumption that the memory's content, being file-backed, could be
> >> > > > retrieved from the file if accessed again. However, this assumption
> >> > > > failed to consider scenarios where a mapping is initially created as
> >> > > > read-write, modified, and subsequently changed to read-only. The newly
> >> > > > introduced VM_WASWRITE flag addresses this oversight.
> >> > >
> >> > > We *do not* need this. It's sufficient to just block discard operations on read-only
> >> > > private mappings.
> >> > I think you meant blocking madvise(MADV_DONTNEED) on all read-only
> >> > private file-backed mappings.
> >> >
> >> > I considered that option, but there is a use case for madvise on those
> >> > mappings that never get modified.
> >> >
> >> > Apps can use that to free up RAM. e.g. Considering read-only .text
> >> > section, which never gets modified, madvise( MADV_DONTNEED) can free
> >> > up RAM when memory is in-stress, memory will be reclaimed from a
> >> > backed-file on next read access. Therefore we can't just block all
> >> > read-only private file-backed mapping, only those that really need to,
> >> > such as mapping changed from rw=>r (what you described)
> >>
> >> Does anyone actually do this? If so, why? WHYYYY?
> >>
> > This is a legit use case, I can't argue that it isn't.
> 
> Could the same effect be simply achieved with MADV_COLD/MADV_PAGEOUT? That
> should be able to reclaim the pages as well if they are indeed not used, but
> it's non-destructive and you don't want to allow destructive madvise anyway
> (i.e. no throwing away data that would be replaced by zeroes or original
> file content on the next touch) so it seems overall a better fit for sealed
> areas?

Comment from the sidelines: That seems clever enough.
Jeff Xu Oct. 23, 2024, 6:33 p.m. UTC | #10
Hi Vlastimil

On Tue, Oct 22, 2024 at 8:55 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 10/17/24 22:57, Jeff Xu wrote:
> > On Thu, Oct 17, 2024 at 1:49 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> >> >
> >> > > > For file-backed, private, read-only memory mappings, we previously did
> >> > > > not block the madvise(MADV_DONTNEED). This was based on
> >> > > > the assumption that the memory's content, being file-backed, could be
> >> > > > retrieved from the file if accessed again. However, this assumption
> >> > > > failed to consider scenarios where a mapping is initially created as
> >> > > > read-write, modified, and subsequently changed to read-only. The newly
> >> > > > introduced VM_WASWRITE flag addresses this oversight.
> >> > >
> >> > > We *do not* need this. It's sufficient to just block discard operations on read-only
> >> > > private mappings.
> >> > I think you meant blocking madvise(MADV_DONTNEED) on all read-only
> >> > private file-backed mappings.
> >> >
> >> > I considered that option, but there is a use case for madvise on those
> >> > mappings that never get modified.
> >> >
> >> > Apps can use that to free up RAM. e.g. Considering read-only .text
> >> > section, which never gets modified, madvise( MADV_DONTNEED) can free
> >> > up RAM when memory is in-stress, memory will be reclaimed from a
> >> > backed-file on next read access. Therefore we can't just block all
> >> > read-only private file-backed mapping, only those that really need to,
> >> > such as mapping changed from rw=>r (what you described)
> >>
> >> Does anyone actually do this? If so, why? WHYYYY?
> >>
> > This is a legit use case, I can't argue that it isn't.
>
> Could the same effect be simply achieved with MADV_COLD/MADV_PAGEOUT? That
> should be able to reclaim the pages as well if they are indeed not used, but
> it's non-destructive and you don't want to allow destructive madvise anyway
> (i.e. no throwing away data that would be replaced by zeroes or original
> file content on the next touch) so it seems overall a better fit for sealed
> areas?
>
Thanks for the suggestion. This opens a new way to solve this, I need
to do some research and testing  to verify the solutions work for us.
I will respond after I'm done with those.

Best regards,
-Jeff
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4c32003c8404..b402eca2565a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -430,6 +430,8 @@  extern unsigned int kobjsize(const void *objp);
 #ifdef CONFIG_64BIT
 /* VM is sealed, in vm_flags */
 #define VM_SEALED	_BITUL(63)
+/* VM was writable */
+#define VM_WASWRITE	_BITUL(62)
 #endif
 
 /* Bits set in the VMA until the stack is in its final location */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5d6d06107d..6397135ca526 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -821,6 +821,9 @@  static int do_mprotect_pkey(unsigned long start, size_t len,
 			break;
 		}
 
+		if ((vma->vm_flags & VM_WRITE) && !(newflags & VM_WRITE))
+			newflags |= VM_WASWRITE;
+
 		error = security_file_mprotect(vma, reqprot, prot);
 		if (error)
 			break;
diff --git a/mm/mseal.c b/mm/mseal.c
index ece977bd21e1..28f28487be17 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -36,12 +36,8 @@  static bool is_madv_discard(int behavior)
 	return false;
 }
 
-static bool is_ro_anon(struct vm_area_struct *vma)
+static bool anon_is_ro(struct vm_area_struct *vma)
 {
-	/* check anonymous mapping. */
-	if (vma->vm_file || vma->vm_flags & VM_SHARED)
-		return false;
-
 	/*
 	 * check for non-writable:
 	 * PROT=RO or PKRU is not writeable.
@@ -53,6 +49,22 @@  static bool is_ro_anon(struct vm_area_struct *vma)
 	return false;
 }
 
+static bool vma_is_prot_none(struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_NONE)
+		return true;
+
+	return false;
+}
+
+static bool vma_was_writable_turn_readonly(struct vm_area_struct *vma)
+{
+	if (!(vma->vm_flags & VM_WRITE) && vma->vm_flags & VM_WASWRITE)
+		return true;
+
+	return false;
+}
+
 /*
  * Check if a vma is allowed to be modified by madvise.
  */
@@ -61,7 +73,25 @@  bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
 	if (!is_madv_discard(behavior))
 		return true;
 
-	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
+	/* not sealed */
+	if (likely(can_modify_vma(vma)))
+		return true;
+
+	/* PROT_NONE mapping */
+	if (vma_is_prot_none(vma))
+		return true;
+
+	/* file-backed private mapping */
+	if (vma->vm_file) {
+		/* read-only but was writeable */
+		if (vma_was_writable_turn_readonly(vma))
+			return false;
+
+		return true;
+	}
+
+	/* anonymous mapping is read-only */
+	if (anon_is_ro(vma))
 		return false;
 
 	/* Allow by default. */