Message ID | 20190424211038.204001-1-matthewgarrett@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [V2] mm: Allow userland to request that the kernel clear memory on release | expand |
Please cc linux-api for user visible API proposals (now done). Keep the rest of the email intact for reference. On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > From: Matthew Garrett <mjg59@google.com> > > Applications that hold secrets and wish to avoid them leaking can use > mlock() to prevent the page from being pushed out to swap and > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > can also use atexit() handlers to overwrite secrets on application exit. > However, if an attacker can reboot the system into another OS, they can > dump the contents of RAM and extract secrets. We can avoid this by setting > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > firmware wipe the contents of RAM before booting another OS, but this means > rebooting takes a *long* time - the expected behaviour is for a clean > shutdown to remove the request after scrubbing secrets from RAM in order to > avoid this. > > Unfortunately, if an application exits uncleanly, its secrets may still be > present in RAM. This can't be easily fixed in userland (eg, if the OOM > killer decides to kill a process holding secrets, we're not going to be able > to avoid that), so this patch adds a new flag to madvise() to allow userland > to request that the kernel clear the covered pages whenever the page > reference count hits zero. Since vm_flags is already full on 32-bit, it > will only work on 64-bit systems. 
> > Signed-off-by: Matthew Garrett <mjg59@google.com> > --- > > Modified to wipe when the VMA is released rather than on page freeing > > include/linux/mm.h | 6 ++++++ > include/uapi/asm-generic/mman-common.h | 2 ++ > mm/madvise.c | 21 +++++++++++++++++++++ > mm/memory.c | 3 +++ > 4 files changed, 32 insertions(+) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 6b10c21630f5..64bdab679275 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -257,6 +257,8 @@ extern unsigned int kobjsize(const void *objp); > #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) > #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) > #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) > + > +#define VM_WIPEONRELEASE BIT(37) /* Clear pages when releasing them */ > #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ > > #ifdef CONFIG_ARCH_HAS_PKEYS > @@ -298,6 +300,10 @@ extern unsigned int kobjsize(const void *objp); > # define VM_GROWSUP VM_NONE > #endif > > +#ifndef VM_WIPEONRELEASE > +# define VM_WIPEONRELEASE VM_NONE > +#endif > + > /* Bits set in the VMA until the stack is in its final location */ > #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) > > diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h > index abd238d0f7a4..82dfff4a8e3d 100644 > --- a/include/uapi/asm-generic/mman-common.h > +++ b/include/uapi/asm-generic/mman-common.h > @@ -64,6 +64,8 @@ > #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ > #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ > > +#define MADV_WIPEONRELEASE 20 > +#define MADV_DONTWIPEONRELEASE 21 > /* compatibility flags */ > #define MAP_FILE 0 > > diff --git a/mm/madvise.c b/mm/madvise.c > index 21a7881a2db4..989c2fde15cf 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, > case MADV_KEEPONFORK: > new_flags &= ~VM_WIPEONFORK; > break; > + case MADV_WIPEONRELEASE: > + /* MADV_WIPEONRELEASE is only 
supported on anonymous memory. */ > + if (VM_WIPEONRELEASE == 0 || vma->vm_file || > + vma->vm_flags & VM_SHARED) { > + error = -EINVAL; > + goto out; > + } > + new_flags |= VM_WIPEONRELEASE; > + break; > + case MADV_DONTWIPEONRELEASE: > + if (VM_WIPEONRELEASE == 0) { > + error = -EINVAL; > + goto out; > + } > + new_flags &= ~VM_WIPEONRELEASE; > + break; > case MADV_DONTDUMP: > new_flags |= VM_DONTDUMP; > break; > @@ -727,6 +743,8 @@ madvise_behavior_valid(int behavior) > case MADV_DODUMP: > case MADV_WIPEONFORK: > case MADV_KEEPONFORK: > + case MADV_WIPEONRELEASE: > + case MADV_DONTWIPEONRELEASE: > #ifdef CONFIG_MEMORY_FAILURE > case MADV_SOFT_OFFLINE: > case MADV_HWPOISON: > @@ -785,6 +803,9 @@ madvise_behavior_valid(int behavior) > * MADV_DONTDUMP - the application wants to prevent pages in the given range > * from being included in its core dump. > * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. > + * MADV_WIPEONRELEASE - clear the contents of the memory after the last > + * reference to it has been released > + * MADV_DONTWIPEONRELEASE - cancel MADV_WIPEONRELEASE > * > * return values: > * zero - success > diff --git a/mm/memory.c b/mm/memory.c > index ab650c21bccd..ff78b527660e 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -1091,6 +1091,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, > page_remove_rmap(page, false); > if (unlikely(page_mapcount(page) < 0)) > print_bad_pte(vma, addr, ptent, page); > + if (unlikely(vma->vm_flags & VM_WIPEONRELEASE) && > + page_mapcount(page) == 0) > + clear_highpage(page); > if (unlikely(__tlb_remove_page(tlb, page))) { > force_flush = 1; > addr += PAGE_SIZE; > -- > 2.21.0.593.g511ec345e18-goog
On Thu 25-04-19 14:14:10, Michal Hocko wrote: > Please cc linux-api for user visible API proposals (now done). Keep the > rest of the email intact for reference. > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > > From: Matthew Garrett <mjg59@google.com> > > > > Applications that hold secrets and wish to avoid them leaking can use > > mlock() to prevent the page from being pushed out to swap and > > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > can also use atexit() handlers to overwrite secrets on application exit. > > However, if an attacker can reboot the system into another OS, they can > > dump the contents of RAM and extract secrets. We can avoid this by setting > > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > > firmware wipe the contents of RAM before booting another OS, but this means > > rebooting takes a *long* time - the expected behaviour is for a clean > > shutdown to remove the request after scrubbing secrets from RAM in order to > > avoid this. > > > > Unfortunately, if an application exits uncleanly, its secrets may still be > > present in RAM. This can't be easily fixed in userland (eg, if the OOM > > killer decides to kill a process holding secrets, we're not going to be able > > to avoid that), so this patch adds a new flag to madvise() to allow userland > > to request that the kernel clear the covered pages whenever the page > > reference count hits zero. Since vm_flags is already full on 32-bit, it > > will only work on 64-bit systems. The changelog seems stale. You are hooking into unmap path where the reference count might be still > 0 and the page still held by somebody. A previous email from Willy said " It could be the target/source of direct I/O, or userspace could have registered it with an RDMA device, or ... It depends on the semantics you want. There's no legacy code to worry about here. 
I was seeing this as the equivalent of an atexit() handler; userspace is saying "When this page is unmapped, zero it". So it doesn't matter that somebody else might be able to reference it -- userspace could have zeroed it themselves. " I am not sure this is really a bulletproof argument but it should definitely be part of the changelog. Besides that you inherently assume that the user would do mlock because you do not try to wipe the swap content. Is this intentional? Another question would be regarding the targeted user API. There are some attempts to make all freed memory zeroed/poisoned. Would users who would like to use this feature also be interested in using a system-wide setting as well?
On 4/25/19 2:14 PM, Michal Hocko wrote: > Please cc linux-api for user visible API proposals (now done). Keep the > rest of the email intact for reference. > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: >> From: Matthew Garrett <mjg59@google.com> >> >> Applications that hold secrets and wish to avoid them leaking can use >> mlock() to prevent the page from being pushed out to swap and >> MADV_DONTDUMP to prevent it from being included in core dumps. Applications So, do we really need a new madvise() flag and VMA flag, or can we just infer this page clearing from mlock+MADV_DONTDUMP being both applied?
On Thu, Apr 25, 2019 at 2:14 PM Michal Hocko <mhocko@kernel.org> wrote: [...] > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > > From: Matthew Garrett <mjg59@google.com> > > > > Applications that hold secrets and wish to avoid them leaking can use > > mlock() to prevent the page from being pushed out to swap and > > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > can also use atexit() handlers to overwrite secrets on application exit. > > However, if an attacker can reboot the system into another OS, they can > > dump the contents of RAM and extract secrets. We can avoid this by setting > > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > > firmware wipe the contents of RAM before booting another OS, but this means > > rebooting takes a *long* time - the expected behaviour is for a clean > > shutdown to remove the request after scrubbing secrets from RAM in order to > > avoid this. > > > > Unfortunately, if an application exits uncleanly, its secrets may still be > > present in RAM. This can't be easily fixed in userland (eg, if the OOM > > killer decides to kill a process holding secrets, we're not going to be able > > to avoid that), so this patch adds a new flag to madvise() to allow userland > > to request that the kernel clear the covered pages whenever the page > > reference count hits zero. Since vm_flags is already full on 32-bit, it > > will only work on 64-bit systems. [...] > > diff --git a/mm/madvise.c b/mm/madvise.c > > index 21a7881a2db4..989c2fde15cf 100644 > > --- a/mm/madvise.c > > +++ b/mm/madvise.c > > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, > > case MADV_KEEPONFORK: > > new_flags &= ~VM_WIPEONFORK; > > break; > > + case MADV_WIPEONRELEASE: > > + /* MADV_WIPEONRELEASE is only supported on anonymous memory. 
*/ > > + if (VM_WIPEONRELEASE == 0 || vma->vm_file || > > + vma->vm_flags & VM_SHARED) { > > + error = -EINVAL; > > + goto out; > > + } > > + new_flags |= VM_WIPEONRELEASE; > > + break; An interesting effect of this is that it will be possible to set this on a CoW anon VMA in a fork() child, and then the semantics in the parent will be subtly different - e.g. if the parent vmsplice()d a CoWed page into a pipe, then forked an unprivileged child, the child set MADV_WIPEONRELEASE on its VMA, the parent died somehow, and then the child died, the page in the pipe would be zeroed out. A child should not be able to affect its parent like this, I think. If this was an mmap() flag instead of a madvise() command, that issue could be avoided. Alternatively, if adding more mmap() flags doesn't work, perhaps you could scan the VMA and ensure that it contains no pages yet, or something like that? > > diff --git a/mm/memory.c b/mm/memory.c > > index ab650c21bccd..ff78b527660e 100644 > > --- a/mm/memory.c > > +++ b/mm/memory.c > > @@ -1091,6 +1091,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, > > page_remove_rmap(page, false); > > if (unlikely(page_mapcount(page) < 0)) > > print_bad_pte(vma, addr, ptent, page); > > + if (unlikely(vma->vm_flags & VM_WIPEONRELEASE) && > > + page_mapcount(page) == 0) > > + clear_highpage(page); > > if (unlikely(__tlb_remove_page(tlb, page))) { > > force_flush = 1; > > addr += PAGE_SIZE; Should something like this perhaps be added in page_remove_rmap() instead? That's where the mapcount is decremented; and looking at other callers of page_remove_rmap(), in particular the following ones look interesting: - do_huge_pmd_wp_page()/do_huge_pmd_wp_page_fallback() might be relevant in the case where a forking process contains transparent hugepages? - zap_huge_pmd() is relevant when transparent hugepages are used, I think (otherwise transparent hugepages might not be wiped?) 
- there are various callers related to migration; I think this is relevant on a NUMA system where memory is moved between nodes to improve locality (moving memory to a new page and freeing the old one, in which case you'd want to wipe the old page) I think all the callers have a reference to the VMA, so perhaps you could add a VMA parameter to page_remove_rmap() and then look at the VMA in there?
On Thu, Apr 25, 2019 at 5:37 AM Michal Hocko <mhocko@kernel.org> wrote: > Besides that you inherently assume that the user would do mlock because > you do not try to wipe the swap content. Is this intentional? Yes, given MADV_DONTDUMP doesn't imply mlock I thought it'd be more consistent to keep those independent. > Another question would be regarding the targeted user API. There are > some attempts to make all the freed memory to be zeroed/poisoned. Are > users who would like to use this feature also be interested in using > system wide setting as well? I think that depends on the performance overhead of a global setting, but it's also influenced by the semantics around when the freeing occurs, which is something I haven't nailed down yet. If the expectation is that the page is freed whenever the process exits, even if the page is in use somewhere else, then we'd still want this to be separate to poisoning on final page free.
On Thu, Apr 25, 2019 at 5:43 AM Jann Horn <jannh@google.com> wrote: > An interesting effect of this is that it will be possible to set this > on a CoW anon VMA in a fork() child, and then the semantics in the > parent will be subtly different - e.g. if the parent vmsplice()d a > CoWed page into a pipe, then forked an unprivileged child, the child > set MADV_WIPEONRELEASE on its VMA, the parent died somehow, and then > the child died, the page in the pipe would be zeroed out. A child > should not be able to affect its parent like this, I think. If this > was an mmap() flag instead of a madvise() command, that issue could be > avoided. Alternatively, if adding more mmap() flags doesn't work, > perhaps you could scan the VMA and ensure that it contains no pages > yet, or something like that? I /think/ my argument here would be not to do that? I agree that it's unexpected, but I guess the other alternative would be to force a copy on any existing COW pages in the VMA at madvise() time, and maybe also at fork() time (sort of like the behaviour of MADV_WIPEONFORK, but copying the page rather than providing a new zero page) > I think all the callers have a reference to the VMA, so perhaps you > could add a VMA parameter to page_remove_rmap() and then look at the > VMA in there? I'll dig into that, thanks!
On Thu, Apr 25, 2019 at 5:44 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 4/25/19 2:14 PM, Michal Hocko wrote: > > Please cc linux-api for user visible API proposals (now done). Keep the > > rest of the email intact for reference. > > > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > >> From: Matthew Garrett <mjg59@google.com> > >> > >> Applications that hold secrets and wish to avoid them leaking can use > >> mlock() to prevent the page from being pushed out to swap and > >> MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > So, do we really need a new madvise() flag and VMA flag, or can we just > infer this page clearing from mlock+MADV_DONTDUMP being both applied? I think the combination would probably imply that this is the behaviour you want, but I'm a little concerned about changing the semantics given the corner cases described earlier in the thread. If we can figure those out in a way that won't break any existing code, I could buy this.
On Thu 25-04-19 13:39:01, Matthew Garrett wrote: > On Thu, Apr 25, 2019 at 5:37 AM Michal Hocko <mhocko@kernel.org> wrote: > > Besides that you inherently assume that the user would do mlock because > > you do not try to wipe the swap content. Is this intentional? > > Yes, given MADV_DONTDUMP doesn't imply mlock I thought it'd be more > consistent to keep those independent. Do we want to fail the madvise call on VMAs that are not mlocked then? What if the munlock happens later, after the madvise is called?
On Thu 25-04-19 14:42:52, Jann Horn wrote: > On Thu, Apr 25, 2019 at 2:14 PM Michal Hocko <mhocko@kernel.org> wrote: > [...] > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > > > From: Matthew Garrett <mjg59@google.com> > > > > > > Applications that hold secrets and wish to avoid them leaking can use > > > mlock() to prevent the page from being pushed out to swap and > > > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > > can also use atexit() handlers to overwrite secrets on application exit. > > > However, if an attacker can reboot the system into another OS, they can > > > dump the contents of RAM and extract secrets. We can avoid this by setting > > > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > > > firmware wipe the contents of RAM before booting another OS, but this means > > > rebooting takes a *long* time - the expected behaviour is for a clean > > > shutdown to remove the request after scrubbing secrets from RAM in order to > > > avoid this. > > > > > > Unfortunately, if an application exits uncleanly, its secrets may still be > > > present in RAM. This can't be easily fixed in userland (eg, if the OOM > > > killer decides to kill a process holding secrets, we're not going to be able > > > to avoid that), so this patch adds a new flag to madvise() to allow userland > > > to request that the kernel clear the covered pages whenever the page > > > reference count hits zero. Since vm_flags is already full on 32-bit, it > > > will only work on 64-bit systems. > [...] > > > diff --git a/mm/madvise.c b/mm/madvise.c > > > index 21a7881a2db4..989c2fde15cf 100644 > > > --- a/mm/madvise.c > > > +++ b/mm/madvise.c > > > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, > > > case MADV_KEEPONFORK: > > > new_flags &= ~VM_WIPEONFORK; > > > break; > > > + case MADV_WIPEONRELEASE: > > > + /* MADV_WIPEONRELEASE is only supported on anonymous memory. 
*/ > > + if (VM_WIPEONRELEASE == 0 || vma->vm_file || > > + vma->vm_flags & VM_SHARED) { > > + error = -EINVAL; > > + goto out; > > + } > > + new_flags |= VM_WIPEONRELEASE; > > + break; > > An interesting effect of this is that it will be possible to set this > on a CoW anon VMA in a fork() child, and then the semantics in the > parent will be subtly different - e.g. if the parent vmsplice()d a > CoWed page into a pipe, then forked an unprivileged child, the child Maybe a stupid question. How do you fork an unprivileged child (without exec)? Child would have to drop privileges on its own, no? > set MADV_WIPEONRELEASE on its VMA, the parent died somehow, and then > the child died, the page in the pipe would be zeroed out. A child > should not be able to affect its parent like this, I think. If this > was an mmap() flag instead of a madvise() command, that issue could be > avoided. With a VMA flag underneath, I think you can do an early CoW during fork to prevent that. > Alternatively, if adding more mmap() flags doesn't work, > perhaps you could scan the VMA and ensure that it contains no pages > yet, or something like that? > > > > diff --git a/mm/memory.c b/mm/memory.c > > > index ab650c21bccd..ff78b527660e 100644 > > > --- a/mm/memory.c > > > +++ b/mm/memory.c > > > @@ -1091,6 +1091,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, > > > page_remove_rmap(page, false); > > > if (unlikely(page_mapcount(page) < 0)) > > > print_bad_pte(vma, addr, ptent, page); > > > + if (unlikely(vma->vm_flags & VM_WIPEONRELEASE) && > > > + page_mapcount(page) == 0) > > > + clear_highpage(page); > > > if (unlikely(__tlb_remove_page(tlb, page))) { > > > force_flush = 1; > > > addr += PAGE_SIZE; > > Should something like this perhaps be added in page_remove_rmap() > instead? That's where the mapcount is decremented; and looking at > other callers of page_remove_rmap(), in particular the following ones > look interesting: Well spotted!
On Fri, Apr 26, 2019 at 7:31 AM Michal Hocko <mhocko@kernel.org> wrote: > On Thu 25-04-19 14:42:52, Jann Horn wrote: > > On Thu, Apr 25, 2019 at 2:14 PM Michal Hocko <mhocko@kernel.org> wrote: > > [...] > > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > > > > From: Matthew Garrett <mjg59@google.com> > > > > > > > > Applications that hold secrets and wish to avoid them leaking can use > > > > mlock() to prevent the page from being pushed out to swap and > > > > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > > > can also use atexit() handlers to overwrite secrets on application exit. > > > > However, if an attacker can reboot the system into another OS, they can > > > > dump the contents of RAM and extract secrets. We can avoid this by setting > > > > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > > > > firmware wipe the contents of RAM before booting another OS, but this means > > > > rebooting takes a *long* time - the expected behaviour is for a clean > > > > shutdown to remove the request after scrubbing secrets from RAM in order to > > > > avoid this. > > > > > > > > Unfortunately, if an application exits uncleanly, its secrets may still be > > > > present in RAM. This can't be easily fixed in userland (eg, if the OOM > > > > killer decides to kill a process holding secrets, we're not going to be able > > > > to avoid that), so this patch adds a new flag to madvise() to allow userland > > > > to request that the kernel clear the covered pages whenever the page > > > > reference count hits zero. Since vm_flags is already full on 32-bit, it > > > > will only work on 64-bit systems. > > [...] 
> > > > diff --git a/mm/madvise.c b/mm/madvise.c > > > > index 21a7881a2db4..989c2fde15cf 100644 > > > > --- a/mm/madvise.c > > > > +++ b/mm/madvise.c > > > > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, > > > > case MADV_KEEPONFORK: > > > > new_flags &= ~VM_WIPEONFORK; > > > > break; > > > > + case MADV_WIPEONRELEASE: > > > > + /* MADV_WIPEONRELEASE is only supported on anonymous memory. */ > > > > + if (VM_WIPEONRELEASE == 0 || vma->vm_file || > > > > + vma->vm_flags & VM_SHARED) { > > > > + error = -EINVAL; > > > > + goto out; > > > > + } > > > > + new_flags |= VM_WIPEONRELEASE; > > > > + break; > > > > An interesting effect of this is that it will be possible to set this > > on a CoW anon VMA in a fork() child, and then the semantics in the > > parent will be subtly different - e.g. if the parent vmsplice()d a > > CoWed page into a pipe, then forked an unprivileged child, the child > > Maybe a stupid question. How do you fork an unprivileged child (without > exec)? Child would have to drop priviledges on its own, no? Sorry, yes, that's what I meant.
On Fri 26-04-19 15:33:25, Jann Horn wrote: > On Fri, Apr 26, 2019 at 7:31 AM Michal Hocko <mhocko@kernel.org> wrote: > > On Thu 25-04-19 14:42:52, Jann Horn wrote: > > > On Thu, Apr 25, 2019 at 2:14 PM Michal Hocko <mhocko@kernel.org> wrote: > > > [...] > > > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > > > > > From: Matthew Garrett <mjg59@google.com> > > > > > > > > > > Applications that hold secrets and wish to avoid them leaking can use > > > > > mlock() to prevent the page from being pushed out to swap and > > > > > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > > > > can also use atexit() handlers to overwrite secrets on application exit. > > > > > However, if an attacker can reboot the system into another OS, they can > > > > > dump the contents of RAM and extract secrets. We can avoid this by setting > > > > > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > > > > > firmware wipe the contents of RAM before booting another OS, but this means > > > > > rebooting takes a *long* time - the expected behaviour is for a clean > > > > > shutdown to remove the request after scrubbing secrets from RAM in order to > > > > > avoid this. > > > > > > > > > > Unfortunately, if an application exits uncleanly, its secrets may still be > > > > > present in RAM. This can't be easily fixed in userland (eg, if the OOM > > > > > killer decides to kill a process holding secrets, we're not going to be able > > > > > to avoid that), so this patch adds a new flag to madvise() to allow userland > > > > > to request that the kernel clear the covered pages whenever the page > > > > > reference count hits zero. Since vm_flags is already full on 32-bit, it > > > > > will only work on 64-bit systems. > > > [...] 
> > > > > diff --git a/mm/madvise.c b/mm/madvise.c > > > > > index 21a7881a2db4..989c2fde15cf 100644 > > > > > --- a/mm/madvise.c > > > > > +++ b/mm/madvise.c > > > > > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, > > > > > case MADV_KEEPONFORK: > > > > > new_flags &= ~VM_WIPEONFORK; > > > > > break; > > > > > + case MADV_WIPEONRELEASE: > > > > > + /* MADV_WIPEONRELEASE is only supported on anonymous memory. */ > > > > > + if (VM_WIPEONRELEASE == 0 || vma->vm_file || > > > > > + vma->vm_flags & VM_SHARED) { > > > > > + error = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + new_flags |= VM_WIPEONRELEASE; > > > > > + break; > > > > > > An interesting effect of this is that it will be possible to set this > > > on a CoW anon VMA in a fork() child, and then the semantics in the > > > parent will be subtly different - e.g. if the parent vmsplice()d a > > > CoWed page into a pipe, then forked an unprivileged child, the child > > > > Maybe a stupid question. How do you fork an unprivileged child (without > > exec)? Child would have to drop priviledges on its own, no? > > Sorry, yes, that's what I meant. But then the VMA is gone along with the flag so why does it matter?
On Fri, Apr 26, 2019 at 3:47 PM Michal Hocko <mhocko@kernel.org> wrote: > On Fri 26-04-19 15:33:25, Jann Horn wrote: > > On Fri, Apr 26, 2019 at 7:31 AM Michal Hocko <mhocko@kernel.org> wrote: > > > On Thu 25-04-19 14:42:52, Jann Horn wrote: > > > > On Thu, Apr 25, 2019 at 2:14 PM Michal Hocko <mhocko@kernel.org> wrote: > > > > [...] > > > > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > > > > > > From: Matthew Garrett <mjg59@google.com> > > > > > > > > > > > > Applications that hold secrets and wish to avoid them leaking can use > > > > > > mlock() to prevent the page from being pushed out to swap and > > > > > > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > > > > > can also use atexit() handlers to overwrite secrets on application exit. > > > > > > However, if an attacker can reboot the system into another OS, they can > > > > > > dump the contents of RAM and extract secrets. We can avoid this by setting > > > > > > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > > > > > > firmware wipe the contents of RAM before booting another OS, but this means > > > > > > rebooting takes a *long* time - the expected behaviour is for a clean > > > > > > shutdown to remove the request after scrubbing secrets from RAM in order to > > > > > > avoid this. > > > > > > > > > > > > Unfortunately, if an application exits uncleanly, its secrets may still be > > > > > > present in RAM. This can't be easily fixed in userland (eg, if the OOM > > > > > > killer decides to kill a process holding secrets, we're not going to be able > > > > > > to avoid that), so this patch adds a new flag to madvise() to allow userland > > > > > > to request that the kernel clear the covered pages whenever the page > > > > > > reference count hits zero. Since vm_flags is already full on 32-bit, it > > > > > > will only work on 64-bit systems. > > > > [...] 
> > > > > > diff --git a/mm/madvise.c b/mm/madvise.c > > > > > > index 21a7881a2db4..989c2fde15cf 100644 > > > > > > --- a/mm/madvise.c > > > > > > +++ b/mm/madvise.c > > > > > > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, > > > > > > case MADV_KEEPONFORK: > > > > > > new_flags &= ~VM_WIPEONFORK; > > > > > > break; > > > > > > + case MADV_WIPEONRELEASE: > > > > > > + /* MADV_WIPEONRELEASE is only supported on anonymous memory. */ > > > > > > + if (VM_WIPEONRELEASE == 0 || vma->vm_file || > > > > > > + vma->vm_flags & VM_SHARED) { > > > > > > + error = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + new_flags |= VM_WIPEONRELEASE; > > > > > > + break; > > > > > > > > An interesting effect of this is that it will be possible to set this > > > > on a CoW anon VMA in a fork() child, and then the semantics in the > > > > parent will be subtly different - e.g. if the parent vmsplice()d a > > > > CoWed page into a pipe, then forked an unprivileged child, the child > > > > > > Maybe a stupid question. How do you fork an unprivileged child (without > > > exec)? Child would have to drop priviledges on its own, no? > > > > Sorry, yes, that's what I meant. > > But then the VMA is gone along with the flag so why does it matter? But in theory, the page might still be used somewhere, e.g. as data in a pipe (into which the parent wrote it) or whatever. Parent vmsplice()s a page into a pipe, parent exits, child marks the VMA as WIPEONRELEASE and exits, page gets wiped, someone else reads the page from the pipe. Yes, this is very theoretical, and you'd have to write some pretty weird software for this to matter. But it doesn't seem clean to me to allow a child to affect the data in e.g. a pipe that it isn't supposed to have access to like this. Then again, this could probably already happen, since do_wp_page() reuses pages depending on only the mapcount, without looking at the refcount.
On Fri 26-04-19 16:03:26, Jann Horn wrote: > On Fri, Apr 26, 2019 at 3:47 PM Michal Hocko <mhocko@kernel.org> wrote: > > On Fri 26-04-19 15:33:25, Jann Horn wrote: > > > On Fri, Apr 26, 2019 at 7:31 AM Michal Hocko <mhocko@kernel.org> wrote: > > > > On Thu 25-04-19 14:42:52, Jann Horn wrote: > > > > > On Thu, Apr 25, 2019 at 2:14 PM Michal Hocko <mhocko@kernel.org> wrote: > > > > > [...] > > > > > > On Wed 24-04-19 14:10:39, Matthew Garrett wrote: > > > > > > > From: Matthew Garrett <mjg59@google.com> > > > > > > > > > > > > > > Applications that hold secrets and wish to avoid them leaking can use > > > > > > > mlock() to prevent the page from being pushed out to swap and > > > > > > > MADV_DONTDUMP to prevent it from being included in core dumps. Applications > > > > > > > can also use atexit() handlers to overwrite secrets on application exit. > > > > > > > However, if an attacker can reboot the system into another OS, they can > > > > > > > dump the contents of RAM and extract secrets. We can avoid this by setting > > > > > > > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the > > > > > > > firmware wipe the contents of RAM before booting another OS, but this means > > > > > > > rebooting takes a *long* time - the expected behaviour is for a clean > > > > > > > shutdown to remove the request after scrubbing secrets from RAM in order to > > > > > > > avoid this. > > > > > > > > > > > > > > Unfortunately, if an application exits uncleanly, its secrets may still be > > > > > > > present in RAM. This can't be easily fixed in userland (eg, if the OOM > > > > > > > killer decides to kill a process holding secrets, we're not going to be able > > > > > > > to avoid that), so this patch adds a new flag to madvise() to allow userland > > > > > > > to request that the kernel clear the covered pages whenever the page > > > > > > > reference count hits zero. Since vm_flags is already full on 32-bit, it > > > > > > > will only work on 64-bit systems. 
> > > > > [...] > > > > > > > diff --git a/mm/madvise.c b/mm/madvise.c > > > > > > > index 21a7881a2db4..989c2fde15cf 100644 > > > > > > > --- a/mm/madvise.c > > > > > > > +++ b/mm/madvise.c > > > > > > > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, > > > > > > > case MADV_KEEPONFORK: > > > > > > > new_flags &= ~VM_WIPEONFORK; > > > > > > > break; > > > > > > > + case MADV_WIPEONRELEASE: > > > > > > > + /* MADV_WIPEONRELEASE is only supported on anonymous memory. */ > > > > > > > + if (VM_WIPEONRELEASE == 0 || vma->vm_file || > > > > > > > + vma->vm_flags & VM_SHARED) { > > > > > > > + error = -EINVAL; > > > > > > > + goto out; > > > > > > > + } > > > > > > > + new_flags |= VM_WIPEONRELEASE; > > > > > > > + break; > > > > > > > > > > An interesting effect of this is that it will be possible to set this > > > > > on a CoW anon VMA in a fork() child, and then the semantics in the > > > > > parent will be subtly different - e.g. if the parent vmsplice()d a > > > > > CoWed page into a pipe, then forked an unprivileged child, the child > > > > > > > > Maybe a stupid question. How do you fork an unprivileged child (without > > > > exec)? Child would have to drop priviledges on its own, no? > > > > > > Sorry, yes, that's what I meant. > > > > But then the VMA is gone along with the flag so why does it matter? > > But in theory, the page might still be used somewhere, e.g. as data in > a pipe (into which the parent wrote it) or whatever. Parent > vmsplice()s a page into a pipe, parent exits, child marks the VMA as > WIPEONRELEASE and exits, page gets wiped, someone else reads the page > from the pipe. > > Yes, this is very theoretical, and you'd have to write some pretty > weird software for this to matter. But it doesn't seem clean to me to > allow a child to affect the data in e.g. a pipe that it isn't supposed > to have access to like this. 
> > Then again, this could probably already happen, since do_wp_page() > reuses pages depending on only the mapcount, without looking at the > refcount. OK, now I see your point. I was confused about the unprivileged child. You are right that this looks weird, but we have traditionally trusted child processes not to do harm. I guess this falls into the same bucket. An early CoW on these mappings should solve the problem AFAICS.
On Thu, Apr 25, 2019 at 10:25 PM Michal Hocko <mhocko@kernel.org> wrote: > > On Thu 25-04-19 13:39:01, Matthew Garrett wrote: > > Yes, given MADV_DONTDUMP doesn't imply mlock I thought it'd be more > > consistent to keep those independent. > > Do we want to fail madvise call on VMAs that are not mlocked then? What > if the munlock happens later after the madvise is called? I'm not sure if it's strictly necessary. We already have various combinations of features that only make sense when used together and which can be undermined by later actions. I can see the appeal of designing this in a way that makes it harder to misuse, but is that worth additional implementation complexity?
On Fri 26-04-19 11:08:44, Matthew Garrett wrote: > On Thu, Apr 25, 2019 at 10:25 PM Michal Hocko <mhocko@kernel.org> wrote: > > > > On Thu 25-04-19 13:39:01, Matthew Garrett wrote: > > > Yes, given MADV_DONTDUMP doesn't imply mlock I thought it'd be more > > > consistent to keep those independent. > > > > Do we want to fail madvise call on VMAs that are not mlocked then? What > > if the munlock happens later after the madvise is called? > > I'm not sure if it's strictly necessary. We already have various > combinations of features that only make sense when used together and > which can be undermined by later actions. I can see the appeal of > designing this in a way that makes it harder to misuse, but is that > worth additional implementation complexity? If the complexity is not worth the usual use cases, then this should really be documented, noting that without an mlock you are not getting the full semantics and you can leave memory behind on the swap partition. I cannot judge how much that matters, but it certainly looks like half a feature to me; on the other hand, if nobody is going to use the madvise without mlock, then it is certainly much easier to implement.
diff --git a/include/linux/mm.h b/include/linux/mm.h index 6b10c21630f5..64bdab679275 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -257,6 +257,8 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) + +#define VM_WIPEONRELEASE BIT(37) /* Clear pages when releasing them */ #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS @@ -298,6 +300,10 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#ifndef VM_WIPEONRELEASE +# define VM_WIPEONRELEASE VM_NONE +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index abd238d0f7a4..82dfff4a8e3d 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -64,6 +64,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_WIPEONRELEASE 20 +#define MADV_DONTWIPEONRELEASE 21 /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index 21a7881a2db4..989c2fde15cf 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_KEEPONFORK: new_flags &= ~VM_WIPEONFORK; break; + case MADV_WIPEONRELEASE: + /* MADV_WIPEONRELEASE is only supported on anonymous memory. 
*/ + if (VM_WIPEONRELEASE == 0 || vma->vm_file || + vma->vm_flags & VM_SHARED) { + error = -EINVAL; + goto out; + } + new_flags |= VM_WIPEONRELEASE; + break; + case MADV_DONTWIPEONRELEASE: + if (VM_WIPEONRELEASE == 0) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_WIPEONRELEASE; + break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; @@ -727,6 +743,8 @@ madvise_behavior_valid(int behavior) case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: + case MADV_WIPEONRELEASE: + case MADV_DONTWIPEONRELEASE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: @@ -785,6 +803,9 @@ madvise_behavior_valid(int behavior) * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. + * MADV_WIPEONRELEASE - clear the contents of the memory after the last + * reference to it has been released + * MADV_DONTWIPEONRELEASE - cancel MADV_WIPEONRELEASE * * return values: * zero - success diff --git a/mm/memory.c b/mm/memory.c index ab650c21bccd..ff78b527660e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1091,6 +1091,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); + if (unlikely(vma->vm_flags & VM_WIPEONRELEASE) && + page_mapcount(page) == 0) + clear_highpage(page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; addr += PAGE_SIZE;