diff mbox series

[v1,4/7] mm/ksm: fix KSM COW breaking with userfaultfd-wp via FAULT_FLAG_UNSHARE

Message ID 20220930141931.174362-5-david@redhat.com (mailing list archive)
State New
Headers show
Series mm/ksm: break_ksm() cleanups and fixes | expand

Commit Message

David Hildenbrand Sept. 30, 2022, 2:19 p.m. UTC
Let's stop breaking COW via a fake write fault and let's use
FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake
write fault, such as mapping the PTE writable and marking the pte
dirty/softdirty.

Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM
page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault()
will fail with VM_FAULT_SIGBUS and will simpy return in break_ksm() with 0.
The warning in dmesg indicates this wrong handling:

[  230.096368] FAULT_FLAG_ALLOW_RETRY missing 881
[  230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...]
[  230.110124] Hardware name: [...]
[  230.117775] Call Trace:
[  230.120227]  <TASK>
[  230.122334]  dump_stack_lvl+0x44/0x5c
[  230.126010]  handle_userfault.cold+0x14/0x19
[  230.130281]  ? tlb_finish_mmu+0x65/0x170
[  230.134207]  ? uffd_wp_range+0x65/0xa0
[  230.137959]  ? _raw_spin_unlock+0x15/0x30
[  230.141972]  ? do_wp_page+0x50/0x590
[  230.145551]  __handle_mm_fault+0x9f5/0xf50
[  230.149652]  ? mmput+0x1f/0x40
[  230.152712]  handle_mm_fault+0xb9/0x2a0
[  230.156550]  break_ksm+0x141/0x180
[  230.159964]  unmerge_ksm_pages+0x60/0x90
[  230.163890]  ksm_madvise+0x3c/0xb0
[  230.167295]  do_madvise.part.0+0x10c/0xeb0
[  230.171396]  ? do_syscall_64+0x67/0x80
[  230.175157]  __x64_sys_madvise+0x5a/0x70
[  230.179082]  do_syscall_64+0x58/0x80
[  230.182661]  ? do_syscall_64+0x67/0x80
[  230.186413]  entry_SYSCALL_64_after_hwframe+0x63/0xcd

--------------------------------------------------------------------------

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <linux/userfaultfd.h>

 #define MMAP_SIZE (2 * 1024 * 1024u)

 static char *map;
 int uffd;

 static int setup_uffd(void)
 {
 	struct uffdio_api uffdio_api;
 	struct uffdio_register uffdio_register;
 	struct uffdio_writeprotect uffd_writeprotect;
 	struct uffdio_range uffd_range;

 	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 	if (uffd < 0) {
 		fprintf(stderr, "syscall() failed: %d\n", errno);
 		return -errno;
 	}

 	uffdio_api.api = UFFD_API;
 	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
 	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
 		fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
 		return -errno;
 	}

 	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
 		fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
 		return -ENOSYS;
 	}

 	/* Register UFFD-WP */
 	uffdio_register.range.start = (unsigned long) map;
 	uffdio_register.range.len = MMAP_SIZE;
 	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
 		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
 		return -errno;
 	}

 	/* Writeprotect the range. */
 	uffd_writeprotect.range.start = (unsigned long) map;
 	uffd_writeprotect.range.len = MMAP_SIZE;
 	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
 	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
 		fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
 		return -errno;
 	}

 	return 0;
 }

 int main(int argc, char **argv)
 {
 	int ksm_fd, ret;

 	ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
 	if (ksm_fd < 0) {
 		fprintf(stderr, "KSM not available\n");
 		return -errno;
 	}

 	map = mmap(NULL, MMAP_SIZE, PROT_READ|PROT_WRITE,
 		   MAP_PRIVATE|MAP_ANON, -1, 0);
 	if (map == MAP_FAILED) {
 		fprintf(stderr, "mmap() failed\n");
 		return -errno;
 	}
 	ret = madvise(map, MMAP_SIZE, MADV_NOHUGEPAGE);
 	if (ret) {
 		fprintf(stderr, "MADV_NOHUGEPAGE failed\n");
 		return -errno;
 	}

 	/* Fill with same value and trigger merging. */
 	memset(map, 0xff, MMAP_SIZE);
 	ret = madvise(map, MMAP_SIZE, MADV_MERGEABLE);
 	if (ret) {
 		fprintf(stderr, "MADV_MERGEABLE failed\n");
 		return -errno;
 	}

 	/*
 	 * Run KSM to trigger merging and wait a bit until merging should be
 	 * done.
 	 */
 	if (write(ksm_fd, "1", 1) != 1) {
 		fprintf(stderr, "Running KSM failed\n");
 	}
 	sleep(10);

 	/* Write-protect the range with UFFD. */
 	if (setup_uffd())
 		return 1;

 	/* Trigger unsharing. */
 	ret = madvise(map, MMAP_SIZE, MADV_UNMERGEABLE);
 	if (ret) {
 		fprintf(stderr, "MADV_UNMERGEABLE failed\n");
 		return -errno;
 	}

 	return 0;
 }

--------------------------------------------------------------------------

Consequently, we will no longer trigger a fake write fault and break COW
without any such side-effects.

This is primarily a fix for KSM+userfaultfd-wp, however, the fake write
fault was always questionable. As this fix is not easy to backport and it's
not very critical, let's not cc stable.

Fixes: 529b930b87d9 ("userfaultfd: wp: hook userfault handler to write protection fault")
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/ksm.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

Comments

Andrew Morton Sept. 30, 2022, 10:27 p.m. UTC | #1
On Fri, 30 Sep 2022 16:19:28 +0200 David Hildenbrand <david@redhat.com> wrote:

> Let's stop breaking COW via a fake write fault and let's use
> FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake
> write fault, such as mapping the PTE writable and marking the pte
> dirty/softdirty.
> 
> Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM
> page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault()
> will fail with VM_FAULT_SIGBUS and will simpy return in break_ksm() with 0.
> The warning in dmesg indicates this wrong handling:

We're at -rc7.  I'd prefer to avoid merging larger patchsets at this
time.

Is there some minimal fix for 6.0 and -stable?  Or is the problem
non-serious enough to only fix it in 6.1 and later?
David Hildenbrand Oct. 1, 2022, 8:13 a.m. UTC | #2
On 01.10.22 00:27, Andrew Morton wrote:
> On Fri, 30 Sep 2022 16:19:28 +0200 David Hildenbrand <david@redhat.com> wrote:
> 
>> Let's stop breaking COW via a fake write fault and let's use
>> FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake
>> write fault, such as mapping the PTE writable and marking the pte
>> dirty/softdirty.
>>
>> Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM
>> page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault()
>> will fail with VM_FAULT_SIGBUS and will simpy return in break_ksm() with 0.
>> The warning in dmesg indicates this wrong handling:
> 
> We're at -rc7.  I'd prefer to avoid merging larger patchsets at this
> time.

Yes, this is 6.1 material.

> 
> Is there some minimal fix for 6.0 and -stable?  Or is the problem
> non-serious enough to only fix it in 6.1 and later?
> 

See the end of this lengthy patch description:

"This is primarily a fix for KSM+userfaultfd-wp, however, the fake write
fault was always questionable. As this fix is not easy to backport and 
it's not very critical, let's not cc stable."

This can wait, thanks!
Peter Xu Oct. 5, 2022, 8:35 p.m. UTC | #3
On Fri, Sep 30, 2022 at 04:19:28PM +0200, David Hildenbrand wrote:
> Let's stop breaking COW via a fake write fault and let's use
> FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake
> write fault, such as mapping the PTE writable and marking the pte
> dirty/softdirty.
> 
> Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM
> page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault()
> will fail with VM_FAULT_SIGBUS and will simpy return in break_ksm() with 0.
> The warning in dmesg indicates this wrong handling:
> 
> [  230.096368] FAULT_FLAG_ALLOW_RETRY missing 881
> [  230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...]
> [  230.110124] Hardware name: [...]
> [  230.117775] Call Trace:
> [  230.120227]  <TASK>
> [  230.122334]  dump_stack_lvl+0x44/0x5c
> [  230.126010]  handle_userfault.cold+0x14/0x19
> [  230.130281]  ? tlb_finish_mmu+0x65/0x170
> [  230.134207]  ? uffd_wp_range+0x65/0xa0
> [  230.137959]  ? _raw_spin_unlock+0x15/0x30
> [  230.141972]  ? do_wp_page+0x50/0x590
> [  230.145551]  __handle_mm_fault+0x9f5/0xf50
> [  230.149652]  ? mmput+0x1f/0x40
> [  230.152712]  handle_mm_fault+0xb9/0x2a0
> [  230.156550]  break_ksm+0x141/0x180
> [  230.159964]  unmerge_ksm_pages+0x60/0x90
> [  230.163890]  ksm_madvise+0x3c/0xb0
> [  230.167295]  do_madvise.part.0+0x10c/0xeb0
> [  230.171396]  ? do_syscall_64+0x67/0x80
> [  230.175157]  __x64_sys_madvise+0x5a/0x70
> [  230.179082]  do_syscall_64+0x58/0x80
> [  230.182661]  ? do_syscall_64+0x67/0x80
> [  230.186413]  entry_SYSCALL_64_after_hwframe+0x63/0xcd

Since it's already there, worth adding the test into ksm_test.c?

> 
> Consequently, we will no longer trigger a fake write fault and break COW
> without any such side-effects.
> 
> This is primarily a fix for KSM+userfaultfd-wp, however, the fake write
> fault was always questionable. As this fix is not easy to backport and it's
> not very critical, let's not cc stable.

A patch to cc most of the stable would probably need to still go with the
old write approach but attaching ALLOW_RETRY.  But I agree maybe that may
not need to bother, or a report should have arrived earlier..  The unshare
approach looks much cleaner indeed.

> 
> Fixes: 529b930b87d9 ("userfaultfd: wp: hook userfault handler to write protection fault")
> Signed-off-by: David Hildenbrand <david@redhat.com>

Acked-by: Peter Xu <peterx@redhat.com>
David Hildenbrand Oct. 6, 2022, 9:29 a.m. UTC | #4
On 05.10.22 22:35, Peter Xu wrote:
> On Fri, Sep 30, 2022 at 04:19:28PM +0200, David Hildenbrand wrote:
>> Let's stop breaking COW via a fake write fault and let's use
>> FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake
>> write fault, such as mapping the PTE writable and marking the pte
>> dirty/softdirty.
>>
>> Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM
>> page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault()
>> will fail with VM_FAULT_SIGBUS and will simpy return in break_ksm() with 0.
>> The warning in dmesg indicates this wrong handling:
>>
>> [  230.096368] FAULT_FLAG_ALLOW_RETRY missing 881
>> [  230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...]
>> [  230.110124] Hardware name: [...]
>> [  230.117775] Call Trace:
>> [  230.120227]  <TASK>
>> [  230.122334]  dump_stack_lvl+0x44/0x5c
>> [  230.126010]  handle_userfault.cold+0x14/0x19
>> [  230.130281]  ? tlb_finish_mmu+0x65/0x170
>> [  230.134207]  ? uffd_wp_range+0x65/0xa0
>> [  230.137959]  ? _raw_spin_unlock+0x15/0x30
>> [  230.141972]  ? do_wp_page+0x50/0x590
>> [  230.145551]  __handle_mm_fault+0x9f5/0xf50
>> [  230.149652]  ? mmput+0x1f/0x40
>> [  230.152712]  handle_mm_fault+0xb9/0x2a0
>> [  230.156550]  break_ksm+0x141/0x180
>> [  230.159964]  unmerge_ksm_pages+0x60/0x90
>> [  230.163890]  ksm_madvise+0x3c/0xb0
>> [  230.167295]  do_madvise.part.0+0x10c/0xeb0
>> [  230.171396]  ? do_syscall_64+0x67/0x80
>> [  230.175157]  __x64_sys_madvise+0x5a/0x70
>> [  230.179082]  do_syscall_64+0x58/0x80
>> [  230.182661]  ? do_syscall_64+0x67/0x80
>> [  230.186413]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> 
> Since it's already there, worth adding the test into ksm_test.c?

Yes, I can give it a try. What I dislike about ksm_test is that it's a 
mixture of benchmarks and test cases that have to explicitly triggered 
by parameters. It's not a simple "run all available test cases" tests as 
we know it. So maybe something separate (or having it as part of the 
uffd tests) makes more sense.

> 
>>
>> Consequently, we will no longer trigger a fake write fault and break COW
>> without any such side-effects.
>>
>> This is primarily a fix for KSM+userfaultfd-wp, however, the fake write
>> fault was always questionable. As this fix is not easy to backport and it's
>> not very critical, let's not cc stable.
> 
> A patch to cc most of the stable would probably need to still go with the
> old write approach but attaching ALLOW_RETRY.  But I agree maybe that may
> not need to bother, or a report should have arrived earlier..  The unshare
> approach looks much cleaner indeed.

A fix without FAULT_FLAG_UNSHARE is not straight forward. We really 
don't want to notify user space about write events here (because there 
is none). And there is no way around the uffd handling in WP code 
without that.

FAULT_FLAG_ALLOW_RETRY would rely on userfaultfd triggering and having 
to resolve the WP event.

> 
>>
>> Fixes: 529b930b87d9 ("userfaultfd: wp: hook userfault handler to write protection fault")
>> Signed-off-by: David Hildenbrand <david@redhat.com>
> 
> Acked-by: Peter Xu <peterx@redhat.com>
> 

Thanks!
Peter Xu Oct. 6, 2022, 7:04 p.m. UTC | #5
On Thu, Oct 06, 2022 at 11:29:29AM +0200, David Hildenbrand wrote:
> On 05.10.22 22:35, Peter Xu wrote:
> > On Fri, Sep 30, 2022 at 04:19:28PM +0200, David Hildenbrand wrote:
> > > Let's stop breaking COW via a fake write fault and let's use
> > > FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake
> > > write fault, such as mapping the PTE writable and marking the pte
> > > dirty/softdirty.
> > > 
> > > Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM
> > > page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault()
> > > will fail with VM_FAULT_SIGBUS and will simpy return in break_ksm() with 0.
> > > The warning in dmesg indicates this wrong handling:
> > > 
> > > [  230.096368] FAULT_FLAG_ALLOW_RETRY missing 881
> > > [  230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...]
> > > [  230.110124] Hardware name: [...]
> > > [  230.117775] Call Trace:
> > > [  230.120227]  <TASK>
> > > [  230.122334]  dump_stack_lvl+0x44/0x5c
> > > [  230.126010]  handle_userfault.cold+0x14/0x19
> > > [  230.130281]  ? tlb_finish_mmu+0x65/0x170
> > > [  230.134207]  ? uffd_wp_range+0x65/0xa0
> > > [  230.137959]  ? _raw_spin_unlock+0x15/0x30
> > > [  230.141972]  ? do_wp_page+0x50/0x590
> > > [  230.145551]  __handle_mm_fault+0x9f5/0xf50
> > > [  230.149652]  ? mmput+0x1f/0x40
> > > [  230.152712]  handle_mm_fault+0xb9/0x2a0
> > > [  230.156550]  break_ksm+0x141/0x180
> > > [  230.159964]  unmerge_ksm_pages+0x60/0x90
> > > [  230.163890]  ksm_madvise+0x3c/0xb0
> > > [  230.167295]  do_madvise.part.0+0x10c/0xeb0
> > > [  230.171396]  ? do_syscall_64+0x67/0x80
> > > [  230.175157]  __x64_sys_madvise+0x5a/0x70
> > > [  230.179082]  do_syscall_64+0x58/0x80
> > > [  230.182661]  ? do_syscall_64+0x67/0x80
> > > [  230.186413]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> > 
> > Since it's already there, worth adding the test into ksm_test.c?
> 
> Yes, I can give it a try. What I dislike about ksm_test is that it's a
> mixture of benchmarks and test cases that have to explicitly triggered by
> parameters. It's not a simple "run all available test cases" tests as we
> know it. So maybe something separate (or having it as part of the uffd
> tests) makes more sense.

We can add an entry into run_vmtests.sh.  That's also what current ksm_test
does.

Yes adding into uffd test would work too, but I do have a plan that we
should move functional tests out of userfaultfd.c, leaving that with the
stress test only.  Not really a big deal, though.

> 
> > 
> > > 
> > > Consequently, we will no longer trigger a fake write fault and break COW
> > > without any such side-effects.
> > > 
> > > This is primarily a fix for KSM+userfaultfd-wp, however, the fake write
> > > fault was always questionable. As this fix is not easy to backport and it's
> > > not very critical, let's not cc stable.
> > 
> > A patch to cc most of the stable would probably need to still go with the
> > old write approach but attaching ALLOW_RETRY.  But I agree maybe that may
> > not need to bother, or a report should have arrived earlier..  The unshare
> > approach looks much cleaner indeed.
> 
> A fix without FAULT_FLAG_UNSHARE is not straight forward. We really don't
> want to notify user space about write events here (because there is none).
> And there is no way around the uffd handling in WP code without that.
> 
> FAULT_FLAG_ALLOW_RETRY would rely on userfaultfd triggering and having to
> resolve the WP event.

Right it'll be very much a false positive, but the userspace should be fine
with it e.g. for live snapshot we need to copy page earlier; it still won't
stop the process from running along the way.  But I agree that's not ideal.
David Hildenbrand Oct. 20, 2022, 10:05 a.m. UTC | #6
Hi Peter,

sorry for replying so late, I only managed now to get back to this patch 
set.

>> Yes, I can give it a try. What I dislike about ksm_test is that it's a
>> mixture of benchmarks and test cases that have to explicitly triggered by
>> parameters. It's not a simple "run all available test cases" tests as we
>> know it. So maybe something separate (or having it as part of the uffd
>> tests) makes more sense.
> 
> We can add an entry into run_vmtests.sh.  That's also what current ksm_test
> does.

Right, but I kind-of don't like that way at all and would much rather do 
it cleaner...

> 
> Yes adding into uffd test would work too, but I do have a plan that we
> should move functional tests out of userfaultfd.c, leaving that with the
> stress test only.  Not really a big deal, though.

... similar to what you want to do with userfaultfd.c

So maybe I'll just add a new test for ksm functional tests.

> 
>>
>>>
>>>>
>>>> Consequently, we will no longer trigger a fake write fault and break COW
>>>> without any such side-effects.
>>>>
>>>> This is primarily a fix for KSM+userfaultfd-wp, however, the fake write
>>>> fault was always questionable. As this fix is not easy to backport and it's
>>>> not very critical, let's not cc stable.
>>>
>>> A patch to cc most of the stable would probably need to still go with the
>>> old write approach but attaching ALLOW_RETRY.  But I agree maybe that may
>>> not need to bother, or a report should have arrived earlier..  The unshare
>>> approach looks much cleaner indeed.
>>
>> A fix without FAULT_FLAG_UNSHARE is not straight forward. We really don't
>> want to notify user space about write events here (because there is none).
>> And there is no way around the uffd handling in WP code without that.
>>
>> FAULT_FLAG_ALLOW_RETRY would rely on userfaultfd triggering and having to
>> resolve the WP event.
> 
> Right it'll be very much a false positive, but the userspace should be fine
> with it e.g. for live snapshot we need to copy page earlier; it still won't
> stop the process from running along the way.  But I agree that's not ideal.

At least the test case at hand will wait until infinitely, because there 
is no handler that would allow break_cow to make progress (well, we 
don't expect write events in the test :) ).

Anyhow, I don't think messing with that for stable kernels is worth the 
pain / complexity / possible issues.
diff mbox series

Patch

diff --git a/mm/ksm.c b/mm/ksm.c
index e8d987fb379e..4d7bcf7da7c3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -453,17 +453,15 @@  static inline bool ksm_test_exit(struct mm_struct *mm)
 }
 
 /*
- * We use break_ksm to break COW on a ksm page: it's a stripped down
+ * We use break_ksm to break COW on a ksm page by triggering unsharing,
+ * such that the ksm page will get replaced by an exclusive anonymous page.
  *
- *	if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
- *		put_page(page);
- *
- * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
  * in case the application has unmapped and remapped mm,addr meanwhile.
  * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
  * mmap of /dev/mem, where we would not want to touch it.
  *
- * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
  * of the process that owns 'vma'.  We also do not want to enforce
  * protection keys here anyway.
  */
@@ -487,7 +485,7 @@  static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 		if (!ksm_page)
 			return 0;
 		ret = handle_mm_fault(vma, addr,
-				      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+				      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
 				      NULL);
 	} while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
 	/*