diff mbox series

[v1,1/2] mseal: Two fixes for madvise(MADV_DONTNEED) when sealed

Message ID 20241017005105.3047458-2-jeffxu@chromium.org (mailing list archive)
State New
Headers show
Series mseal: fixing madvise for file-backed mapping and PROT_NONE | expand

Commit Message

Jeff Xu Oct. 17, 2024, 12:51 a.m. UTC
From: Jeff Xu <jeffxu@chromium.org>

Two fixes for madvise(MADV_DONTNEED) when sealed.

For PROT_NONE mappings, the previous blocking of
madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
order to free the page.

For file-backed, private, read-only memory mappings, we previously did
not block the madvise(MADV_DONTNEED). This was based on
the assumption that the memory's content, being file-backed, could be
retrieved from the file if accessed again. However, this assumption
failed to consider scenarios where a mapping is initially created as
read-write, modified, and subsequently changed to read-only. The newly
introduced VM_WASWRITE flag addresses this oversight.

Reported-by: Pedro Falcato <pedro.falcato@gmail.com>
Link:https://lore.kernel.org/all/CABi2SkW2XzuZ2-TunWOVzTEX1qc29LhjfNQ3hD4Nym8U-_f+ug@mail.gmail.com/
Fixes: 8be7258aad44 ("mseal: add mseal syscall")
Cc: <stable@vger.kernel.org> # 6.11.y: 4d1b3416659b: mm: move can_modify_vma to mm/vma.h
Cc: <stable@vger.kernel.org> # 6.11.y: 4a2dd02b0916: mm/mprotect: replace can_modify_mm with can_modify_vma
Cc: <stable@vger.kernel.org> # 6.11.y: 23c57d1fa2b9: mseal: replace can_modify_mm_madv with a vma variant
Cc: <stable@vger.kernel.org> # 6.11.y
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
---
 include/linux/mm.h |  2 ++
 mm/mprotect.c      |  3 +++
 mm/mseal.c         | 42 ++++++++++++++++++++++++++++++++++++------
 3 files changed, 41 insertions(+), 6 deletions(-)

Comments

Lorenzo Stoakes Oct. 17, 2024, 8:32 a.m. UTC | #1
On Thu, Oct 17, 2024 at 12:51:04AM +0000, jeffxu@chromium.org wrote:
> From: Jeff Xu <jeffxu@chromium.org>
>
> Two fixes for madvise(MADV_DONTNEED) when sealed.
>
> For PROT_NONE mappings, the previous blocking of
> madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> order to free the page.
>
> For file-backed, private, read-only memory mappings, we previously did
> not block the madvise(MADV_DONTNEED). This was based on
> the assumption that the memory's content, being file-backed, could be
> retrieved from the file if accessed again. However, this assumption
> failed to consider scenarios where a mapping is initially created as
> read-write, modified, and subsequently changed to read-only. The newly
> introduced VM_WASWRITE flag addresses this oversight.
>
> Reported-by: Pedro Falcato <pedro.falcato@gmail.com>
> Link:https://lore.kernel.org/all/CABi2SkW2XzuZ2-TunWOVzTEX1qc29LhjfNQ3hD4Nym8U-_f+ug@mail.gmail.com/
> Fixes: 8be7258aad44 ("mseal: add mseal syscall")
> Cc: <stable@vger.kernel.org> # 6.11.y: 4d1b3416659b: mm: move can_modify_vma to mm/vma.h
> Cc: <stable@vger.kernel.org> # 6.11.y: 4a2dd02b0916: mm/mprotect: replace can_modify_mm with can_modify_vma
> Cc: <stable@vger.kernel.org> # 6.11.y: 23c57d1fa2b9: mseal: replace can_modify_mm_madv with a vma variant
> Cc: <stable@vger.kernel.org> # 6.11.y
> Signed-off-by: Jeff Xu <jeffxu@chromium.org>
> ---
>  include/linux/mm.h |  2 ++
>  mm/mprotect.c      |  3 +++
>  mm/mseal.c         | 42 ++++++++++++++++++++++++++++++++++++------
>  3 files changed, 41 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 4c32003c8404..b402eca2565a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -430,6 +430,8 @@ extern unsigned int kobjsize(const void *objp);
>  #ifdef CONFIG_64BIT
>  /* VM is sealed, in vm_flags */
>  #define VM_SEALED	_BITUL(63)
> +/* VM was writable */

Woefully poor and misleading comment.

> +#define VM_WASWRITE	_BITUL(62)

The bar for an additional VMA flag is _really high_. As far as I'm
concerned you absolutely do not hit that bar here.

>  #endif
>
>  /* Bits set in the VMA until the stack is in its final location */
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 0c5d6d06107d..6397135ca526 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -821,6 +821,9 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
>  			break;
>  		}
>
> +		if ((vma->vm_flags & VM_WRITE) && !(newflags & VM_WRITE))
> +			newflags |= VM_WASWRITE;
> +

You're making this unmergeable now!!! No! Lord this is horrid.

You can't fundamentally change how mprotect() functions to suit edge cases
for mseal, sorry.

>  		error = security_file_mprotect(vma, reqprot, prot);
>  		if (error)
>  			break;
> diff --git a/mm/mseal.c b/mm/mseal.c
> index ece977bd21e1..28f28487be17 100644
> --- a/mm/mseal.c
> +++ b/mm/mseal.c
> @@ -36,12 +36,8 @@ static bool is_madv_discard(int behavior)
>  	return false;
>  }
>
> -static bool is_ro_anon(struct vm_area_struct *vma)
> +static bool anon_is_ro(struct vm_area_struct *vma)
>  {
> -	/* check anonymous mapping. */
> -	if (vma->vm_file || vma->vm_flags & VM_SHARED)
> -		return false;
> -
>  	/*
>  	 * check for non-writable:
>  	 * PROT=RO or PKRU is not writeable.
> @@ -53,6 +49,22 @@ static bool is_ro_anon(struct vm_area_struct *vma)
>  	return false;
>  }
>
> +static bool vma_is_prot_none(struct vm_area_struct *vma)
> +{
> +	if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_NONE)
> +		return true;
> +
> +	return false;
> +}

You don't need this, there is already vma_is_accessible() in mm.h.

> +
> +static bool vma_was_writable_turn_readonly(struct vm_area_struct *vma)
> +{
> +	if (!(vma->vm_flags & VM_WRITE) && vma->vm_flags & VM_WASWRITE)
> +		return true;
> +
> +	return false;
> +}

The naming of this is horrid and confusing.

> +
>  /*
>   * Check if a vma is allowed to be modified by madvise.
>   */
> @@ -61,7 +73,25 @@ bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
>  	if (!is_madv_discard(behavior))
>  		return true;
>
> -	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
> +	/* not sealed */
> +	if (likely(can_modify_vma(vma)))

Please don't just use likely() / unlikely() because _you_ think they're
likely/unlikely. Only use them based on profiling data. if you don't have it,
remove them.

> +		return true;
> +
> +	/* PROT_NONE mapping */

Useless comment.

> +	if (vma_is_prot_none(vma))
> +		return true;
> +
> +	/* file-backed private mapping */

Err... how do you know it's a private mapping?

> +	if (vma->vm_file) {
> +		/* read-only but was writeable */
> +		if (vma_was_writable_turn_readonly(vma))
> +			return false;

This whole thing seems broken, and we already have a mechanism for this,
see mapping_writably_mapped() which _also_ handles write seals for memfd's
which you are not accounting for here.

> +
> +		return true;
> +	}
> +
> +	/* anonymous mapping is read-only */
> +	if (anon_is_ro(vma))

You're implementing subtle details here with 1 line comments (that are
pretty well useless), that's just not good enough.

Please make sure to add _meaningful_ comments that will help another
developer understand what's going on.

>  		return false;
>
>  	/* Allow by default. */
> --
> 2.47.0.rc1.288.g06298d1525-goog
>
Pedro Falcato Oct. 17, 2024, 7:37 p.m. UTC | #2
On Thu, Oct 17, 2024 at 12:51:04AM +0000, jeffxu@chromium.org wrote:
> From: Jeff Xu <jeffxu@chromium.org>
> 
> Two fixes for madvise(MADV_DONTNEED) when sealed.
>

Please separate these fixes into two separate patches.

> For PROT_NONE mappings, the previous blocking of
> madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> order to free the page.

I don't get it. Is there an actual use case for this?

> For file-backed, private, read-only memory mappings, we previously did
> not block the madvise(MADV_DONTNEED). This was based on
> the assumption that the memory's content, being file-backed, could be
> retrieved from the file if accessed again. However, this assumption
> failed to consider scenarios where a mapping is initially created as
> read-write, modified, and subsequently changed to read-only. The newly
> introduced VM_WASWRITE flag addresses this oversight.

We *do not* need this. It's sufficient to just block discard operations on read-only
private mappings. Sending a possible (fully untested) fix. If you like this approach
I can resend properly, or Andrew can pick it up, whatever floats people's boats.

----8<----

From dc5ec662dcb79156f4bdc1cba2a2575dce905ffa Mon Sep 17 00:00:00 2001
From: Pedro Falcato <pedro.falcato@gmail.com>
Date: Thu, 17 Oct 2024 20:21:10 +0100
Subject: [PATCH] mm/mseal: Disallow madvise discard on file-private sealed
 mappings

Doing an operation such as MADV_DONTNEED on a file-private mapping may
forcibly alter data by discarding CoW'd, anon pages and replacing them
with page cache pages fresh from the filesystem.

As such, this somewhat bypasses the mseal of a read-only mapping, and
should be disallowed.

Signed-off-by: Pedro Falcato <pedro.falcato@gmail.com>
Fixes: 8be7258aad44 ("mseal: add mseal syscall")
Cc: <stable@vger.kernel.org> # 6.11.y
---
 mm/mseal.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mm/mseal.c b/mm/mseal.c
index 28cd17d7aaf2..d053303c5542 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -36,10 +36,15 @@ static bool is_madv_discard(int behavior)
        return false;
 }
 
-static bool is_ro_anon(struct vm_area_struct *vma)
+static bool is_ro_private(struct vm_area_struct *vma)
 {
-       /* check anonymous mapping. */
-       if (vma->vm_file || vma->vm_flags & VM_SHARED)
+       /*
+        * If shared, allow discard operations - it shouldn't
+        * affect the underlying data. Discard on private VMAs may
+        * forcibly alter data by replacing CoW'd anonymous pages
+        * with ones fresh from the page cache.
+        */
+       if (vma->vm_flags & VM_SHARED)
                return false;
 
        /*
@@ -61,7 +66,7 @@ bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
        if (!is_madv_discard(behavior))
                return true;
 
-       if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
+       if (unlikely(!can_modify_vma(vma) && is_ro_private(vma)))
                return false;
 
        /* Allow by default. */
Jeff Xu Oct. 17, 2024, 8:34 p.m. UTC | #3
Hi Pedro

On Thu, Oct 17, 2024 at 12:37 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
>
> > For PROT_NONE mappings, the previous blocking of
> > madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> > memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> > order to free the page.
>
> I don't get it. Is there an actual use case for this?
>
Sealing should not over-blocking API that it can allow to pass without
security concern, this is a case in that principle.

There is a user case for this as well: to seal NX stack on android,
Android uses PROT_NONE/madvise to set up a guide page to prevent stack
run over boundary. So we need to let madvise to pass.

> > For file-backed, private, read-only memory mappings, we previously did
> > not block the madvise(MADV_DONTNEED). This was based on
> > the assumption that the memory's content, being file-backed, could be
> > retrieved from the file if accessed again. However, this assumption
> > failed to consider scenarios where a mapping is initially created as
> > read-write, modified, and subsequently changed to read-only. The newly
> > introduced VM_WASWRITE flag addresses this oversight.
>
> We *do not* need this. It's sufficient to just block discard operations on read-only
> private mappings.
I think you meant blocking madvise(MADV_DONTNEED) on all read-only
private file-backed mappings.

I considered that option, but there is a use case for madvise on those
mappings that never get modified.

Apps can use that to free up RAM. e.g. Considering read-only .text
section, which never gets modified, madvise( MADV_DONTNEED) can free
up RAM when memory is in-stress, memory will be reclaimed from a
backed-file on next read access. Therefore we can't just block all
read-only private file-backed mapping, only those that really need to,
such as mapping changed from rw=>r (what you described)
Pedro Falcato Oct. 17, 2024, 8:49 p.m. UTC | #4
On Thu, Oct 17, 2024 at 01:34:53PM -0700, Jeff Xu wrote:
> Hi Pedro
> 
> On Thu, Oct 17, 2024 at 12:37 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> >
> > > For PROT_NONE mappings, the previous blocking of
> > > madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> > > memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> > > order to free the page.
> >
> > I don't get it. Is there an actual use case for this?
> >
> Sealing should not over-blocking API that it can allow to pass without
> security concern, this is a case in that principle.

Well, making the interface simple is also important. OpenBSD's mimmutable()
doesn't do any of this and it Just Works(tm)...

> 
> There is a user case for this as well: to seal NX stack on android,
> Android uses PROT_NONE/madvise to set up a guide page to prevent stack
> run over boundary. So we need to let madvise to pass.

And you need to MADV_DONTNEED this guard page?

> 
> > > For file-backed, private, read-only memory mappings, we previously did
> > > not block the madvise(MADV_DONTNEED). This was based on
> > > the assumption that the memory's content, being file-backed, could be
> > > retrieved from the file if accessed again. However, this assumption
> > > failed to consider scenarios where a mapping is initially created as
> > > read-write, modified, and subsequently changed to read-only. The newly
> > > introduced VM_WASWRITE flag addresses this oversight.
> >
> > We *do not* need this. It's sufficient to just block discard operations on read-only
> > private mappings.
> I think you meant blocking madvise(MADV_DONTNEED) on all read-only
> private file-backed mappings.
> 
> I considered that option, but there is a use case for madvise on those
> mappings that never get modified.
> 
> Apps can use that to free up RAM. e.g. Considering read-only .text
> section, which never gets modified, madvise( MADV_DONTNEED) can free
> up RAM when memory is in-stress, memory will be reclaimed from a
> backed-file on next read access. Therefore we can't just block all
> read-only private file-backed mapping, only those that really need to,
> such as mapping changed from rw=>r (what you described)

Does anyone actually do this? If so, why? WHYYYY?

The kernel's page reclaim logic should be perfectly cromulent. Please don't do this.
MADV_DONTNEED will also not free any pages if those are shared (rather they'll just be unmapped).

If we really need to do this, I'd maybe suggest walking through page tables, looking for
anon ptes or swap ptes (maybe inside the actual zap code?). But I would really prefer if we
didn't need to do this.
Jeff Xu Oct. 17, 2024, 8:57 p.m. UTC | #5
On Thu, Oct 17, 2024 at 1:49 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
>
> On Thu, Oct 17, 2024 at 01:34:53PM -0700, Jeff Xu wrote:
> > Hi Pedro
> >
> > On Thu, Oct 17, 2024 at 12:37 PM Pedro Falcato <pedro.falcato@gmail.com> wrote:
> > >
> > > > For PROT_NONE mappings, the previous blocking of
> > > > madvise(MADV_DONTNEED) is unnecessary. As PROT_NONE already prohibits
> > > > memory access, madvise(MADV_DONTNEED) should be allowed to proceed in
> > > > order to free the page.
> > >
> > > I don't get it. Is there an actual use case for this?
> > >
> > Sealing should not over-blocking API that it can allow to pass without
> > security concern, this is a case in that principle.
>
> Well, making the interface simple is also important. OpenBSD's mimmutable()
> doesn't do any of this and it Just Works(tm)...
>
> >
> > There is a user case for this as well: to seal NX stack on android,
> > Android uses PROT_NONE/madvise to set up a guide page to prevent stack
> > run over boundary. So we need to let madvise to pass.
>
> And you need to MADV_DONTNEED this guard page?
>
Yes.

> >
> > > > For file-backed, private, read-only memory mappings, we previously did
> > > > not block the madvise(MADV_DONTNEED). This was based on
> > > > the assumption that the memory's content, being file-backed, could be
> > > > retrieved from the file if accessed again. However, this assumption
> > > > failed to consider scenarios where a mapping is initially created as
> > > > read-write, modified, and subsequently changed to read-only. The newly
> > > > introduced VM_WASWRITE flag addresses this oversight.
> > >
> > > We *do not* need this. It's sufficient to just block discard operations on read-only
> > > private mappings.
> > I think you meant blocking madvise(MADV_DONTNEED) on all read-only
> > private file-backed mappings.
> >
> > I considered that option, but there is a use case for madvise on those
> > mappings that never get modified.
> >
> > Apps can use that to free up RAM. e.g. Considering read-only .text
> > section, which never gets modified, madvise( MADV_DONTNEED) can free
> > up RAM when memory is in-stress, memory will be reclaimed from a
> > backed-file on next read access. Therefore we can't just block all
> > read-only private file-backed mapping, only those that really need to,
> > such as mapping changed from rw=>r (what you described)
>
> Does anyone actually do this? If so, why? WHYYYY?
>
This is a legit use case, I can't argue that it isn't.

> The kernel's page reclaim logic should be perfectly cromulent. Please don't do this.
> MADV_DONTNEED will also not free any pages if those are shared (rather they'll just be unmapped).
>
> If we really need to do this, I'd maybe suggest walking through page tables, looking for
> anon ptes or swap ptes (maybe inside the actual zap code?). But I would really prefer if we
> didn't need to do this.
>
I also considered this route, but it is too complicated. The
copy-on-write pages can be put into a swap file, also there is a huge
page to consider, etc, The complication makes it really difficult to
code it right, also scanning those pages on per VMA level will require
lock and also impact performance.


> --
> Pedro
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4c32003c8404..b402eca2565a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -430,6 +430,8 @@  extern unsigned int kobjsize(const void *objp);
 #ifdef CONFIG_64BIT
 /* VM is sealed, in vm_flags */
 #define VM_SEALED	_BITUL(63)
+/* VM was writable */
+#define VM_WASWRITE	_BITUL(62)
 #endif
 
 /* Bits set in the VMA until the stack is in its final location */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5d6d06107d..6397135ca526 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -821,6 +821,9 @@  static int do_mprotect_pkey(unsigned long start, size_t len,
 			break;
 		}
 
+		if ((vma->vm_flags & VM_WRITE) && !(newflags & VM_WRITE))
+			newflags |= VM_WASWRITE;
+
 		error = security_file_mprotect(vma, reqprot, prot);
 		if (error)
 			break;
diff --git a/mm/mseal.c b/mm/mseal.c
index ece977bd21e1..28f28487be17 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -36,12 +36,8 @@  static bool is_madv_discard(int behavior)
 	return false;
 }
 
-static bool is_ro_anon(struct vm_area_struct *vma)
+static bool anon_is_ro(struct vm_area_struct *vma)
 {
-	/* check anonymous mapping. */
-	if (vma->vm_file || vma->vm_flags & VM_SHARED)
-		return false;
-
 	/*
 	 * check for non-writable:
 	 * PROT=RO or PKRU is not writeable.
@@ -53,6 +49,22 @@  static bool is_ro_anon(struct vm_area_struct *vma)
 	return false;
 }
 
+static bool vma_is_prot_none(struct vm_area_struct *vma)
+{
+	if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_NONE)
+		return true;
+
+	return false;
+}
+
+static bool vma_was_writable_turn_readonly(struct vm_area_struct *vma)
+{
+	if (!(vma->vm_flags & VM_WRITE) && vma->vm_flags & VM_WASWRITE)
+		return true;
+
+	return false;
+}
+
 /*
  * Check if a vma is allowed to be modified by madvise.
  */
@@ -61,7 +73,25 @@  bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
 	if (!is_madv_discard(behavior))
 		return true;
 
-	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
+	/* not sealed */
+	if (likely(can_modify_vma(vma)))
+		return true;
+
+	/* PROT_NONE mapping */
+	if (vma_is_prot_none(vma))
+		return true;
+
+	/* file-backed private mapping */
+	if (vma->vm_file) {
+		/* read-only but was writeable */
+		if (vma_was_writable_turn_readonly(vma))
+			return false;
+
+		return true;
+	}
+
+	/* anonymous mapping is read-only */
+	if (anon_is_ro(vma))
 		return false;
 
 	/* Allow by default. */