diff mbox series

[v2,2/2] mm/hugetlb: support write-faults in shared mappings

Message ID 20220811103435.188481-3-david@redhat.com (mailing list archive)
State New
Headers show
Series mm/hugetlb: fix write-fault handling for shared mappings | expand

Commit Message

David Hildenbrand Aug. 11, 2022, 10:34 a.m. UTC
If we ever get a write-fault on a write-protected page in a shared mapping,
we'd be in trouble (again). Instead, we can simply map the page writable.

And in fact, there is even a way right now to trigger that code via
uffd-wp ever since we stared to support it for shmem in 5.19:

--------------------------------------------------------------------------
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 #include <sys/mman.h>
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <linux/userfaultfd.h>

 #define HUGETLB_SIZE (2 * 1024 * 1024u)

 static char *map;
 int uffd;

 static int temp_setup_uffd(void)
 {
 	struct uffdio_api uffdio_api;
 	struct uffdio_register uffdio_register;
 	struct uffdio_writeprotect uffd_writeprotect;
 	struct uffdio_range uffd_range;

 	uffd = syscall(__NR_userfaultfd,
 		       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 	if (uffd < 0) {
 		fprintf(stderr, "syscall() failed: %d\n", errno);
 		return -errno;
 	}

 	uffdio_api.api = UFFD_API;
 	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
 	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
 		fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
 		return -errno;
 	}

 	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
 		fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
 		return -ENOSYS;
 	}

 	/* Register UFFD-WP */
 	uffdio_register.range.start = (unsigned long) map;
 	uffdio_register.range.len = HUGETLB_SIZE;
 	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
 		fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
 		return -errno;
 	}

 	/* Writeprotect a single page. */
 	uffd_writeprotect.range.start = (unsigned long) map;
 	uffd_writeprotect.range.len = HUGETLB_SIZE;
 	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
 	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
 		fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
 		return -errno;
 	}

 	/* Unregister UFFD-WP without prior writeunprotection. */
 	uffd_range.start = (unsigned long) map;
 	uffd_range.len = HUGETLB_SIZE;
 	if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
 		fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno);
 		return -errno;
 	}

 	return 0;
 }

 int main(int argc, char **argv)
 {
 	int fd;

 	fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
 	if (!fd) {
 		fprintf(stderr, "open() failed\n");
 		return -errno;
 	}
 	if (ftruncate(fd, HUGETLB_SIZE)) {
 		fprintf(stderr, "ftruncate() failed\n");
 		return -errno;
 	}

 	map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
 	if (map == MAP_FAILED) {
 		fprintf(stderr, "mmap() failed\n");
 		return -errno;
 	}

 	*map = 0;

 	if (temp_setup_uffd())
 		return 1;

 	*map = 0;

 	return 0;
 }
--------------------------------------------------------------------------

Above test fails with SIGBUS when there is only a single free hugetlb page.
 # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 # ./test
 Bus error (core dumped)

And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
 # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 # ./test
 # cat /proc/meminfo | grep HugePages_
 HugePages_Total:       2
 HugePages_Free:        1
 HugePages_Rsvd:    18446744073709551615
 HugePages_Surp:        0

Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
unregistering and consequently keeps the PTE writeprotected. Reason for
this is to avoid the additional overhead when unregistering. Note
that this is the case also for !hugetlb and that we will end up with
writable PTEs that still have the uffd-wp PTE bit set once we return
from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
seems to be a generic thing -- wp_page_reuse() also doesn't clear it.

VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE
indicates that MAP_SHARED handling was at least envisioned, but could never
have worked as expected.

While at it, make sure that we never end up in hugetlb_wp() on write
faults without VM_WRITE, because we don't support maybe_mkwrite()
semantics as commonly used in the !hugetlb case -- for example, in
wp_page_reuse().

Note that there is no need to do any kind of reservation in hugetlb_fault()
in this case ... because we already have a hugetlb page mapped R/O
that we will simply map writable and we are not dealing with COW/unsharing.

Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Cc: <stable@vger.kernel.org> # v5.19
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/hugetlb.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

Comments

Peter Xu Aug. 11, 2022, 1:59 p.m. UTC | #1
On Thu, Aug 11, 2022 at 12:34:35PM +0200, David Hildenbrand wrote:
> Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
> unregistering and consequently keeps the PTE writeprotected. Reason for
> this is to avoid the additional overhead when unregistering. Note
> that this is the case also for !hugetlb and that we will end up with
> writable PTEs that still have the uffd-wp PTE bit set once we return
> from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
> seems to be a generic thing -- wp_page_reuse() also doesn't clear it.

This may justify that lazy reset of ptes may not really be a good idea,
including anonymous.  I'm indeed not aware of any app that do frequent
reg/unreg at least.

I'll prepare a patch to change it from uffd side too.

Thanks again for finding this problem.
David Hildenbrand Aug. 11, 2022, 4:24 p.m. UTC | #2
On 11.08.22 15:59, Peter Xu wrote:
> On Thu, Aug 11, 2022 at 12:34:35PM +0200, David Hildenbrand wrote:
>> Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
>> unregistering and consequently keeps the PTE writeprotected. Reason for
>> this is to avoid the additional overhead when unregistering. Note
>> that this is the case also for !hugetlb and that we will end up with
>> writable PTEs that still have the uffd-wp PTE bit set once we return
>> from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
>> seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
> 
> This may justify that lazy reset of ptes may not really be a good idea,
> including anonymous.  I'm indeed not aware of any app that do frequent
> reg/unreg at least.

Yeah. QEMU snapshots come to mind, but I guess the reg/unreg overhead is
the smallest issue.


> 
> I'll prepare a patch to change it from uffd side too.
> 
> Thanks again for finding this problem.

YW!
Mike Kravetz Aug. 11, 2022, 6:59 p.m. UTC | #3
On 08/11/22 12:34, David Hildenbrand wrote:
> If we ever get a write-fault on a write-protected page in a shared mapping,
> we'd be in trouble (again). Instead, we can simply map the page writable.
> 
<snip>
> 
> Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
> unregistering and consequently keeps the PTE writeprotected. Reason for
> this is to avoid the additional overhead when unregistering. Note
> that this is the case also for !hugetlb and that we will end up with
> writable PTEs that still have the uffd-wp PTE bit set once we return
> from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
> seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
> 
> VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE
> indicates that MAP_SHARED handling was at least envisioned, but could never
> have worked as expected.
> 
> While at it, make sure that we never end up in hugetlb_wp() on write
> faults without VM_WRITE, because we don't support maybe_mkwrite()
> semantics as commonly used in the !hugetlb case -- for example, in
> wp_page_reuse().

Nit,
to me 'make sure that we never end up in hugetlb_wp()' implies that
we would check for condition in callers as opposed to first thing in
hugetlb_wp().  However, I am OK with description as it.

> Note that there is no need to do any kind of reservation in hugetlb_fault()
> in this case ... because we already have a hugetlb page mapped R/O
> that we will simply map writable and we are not dealing with COW/unsharing.

Note that we are not really doing any reservation adjustment in
hugetlb_fault().  That code does pre-allocation of reservation data in
case we might need it in hugetlb_wp.  Since hugetlb_wp will certainly
not do an allocation in this case, we do not even need to do the
preallocation here.  This change is more of an optimization.  I am still
happy with it.

> 
> Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
> Cc: <stable@vger.kernel.org> # v5.19
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  mm/hugetlb.c | 26 +++++++++++++++++++-------
>  1 file changed, 19 insertions(+), 7 deletions(-)

Thanks,

Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Gerald Schaefer Aug. 15, 2022, 1:35 p.m. UTC | #4
On Thu, 11 Aug 2022 11:59:09 -0700
Mike Kravetz <mike.kravetz@oracle.com> wrote:

> On 08/11/22 12:34, David Hildenbrand wrote:
> > If we ever get a write-fault on a write-protected page in a shared mapping,
> > we'd be in trouble (again). Instead, we can simply map the page writable.
> > 
> <snip>
> > 
> > Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
> > unregistering and consequently keeps the PTE writeprotected. Reason for
> > this is to avoid the additional overhead when unregistering. Note
> > that this is the case also for !hugetlb and that we will end up with
> > writable PTEs that still have the uffd-wp PTE bit set once we return
> > from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
> > seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
> > 
> > VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE
> > indicates that MAP_SHARED handling was at least envisioned, but could never
> > have worked as expected.
> > 
> > While at it, make sure that we never end up in hugetlb_wp() on write
> > faults without VM_WRITE, because we don't support maybe_mkwrite()
> > semantics as commonly used in the !hugetlb case -- for example, in
> > wp_page_reuse().
> 
> Nit,
> to me 'make sure that we never end up in hugetlb_wp()' implies that
> we would check for condition in callers as opposed to first thing in
> hugetlb_wp().  However, I am OK with description as it.

Is that new WARN_ON_ONCE() in hugetlb_wp() meant to indicate a real bug?
It is triggered by libhugetlbfs testcase "HUGETLB_ELFMAP=R linkhuge_rw"
(at least on s390), and crashes our CI, because it runs with panic_on_warn
enabled.

Not sure if this means that we have bug elsewhere, allowing us to
get to the WARN in hugetlb_wp().
David Hildenbrand Aug. 15, 2022, 3:07 p.m. UTC | #5
On Mon, Aug 15, 2022 at 3:36 PM Gerald Schaefer
<gerald.schaefer@linux.ibm.com> wrote:
>
> On Thu, 11 Aug 2022 11:59:09 -0700
> Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> > On 08/11/22 12:34, David Hildenbrand wrote:
> > > If we ever get a write-fault on a write-protected page in a shared mapping,
> > > we'd be in trouble (again). Instead, we can simply map the page writable.
> > >
> > <snip>
> > >
> > > Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
> > > unregistering and consequently keeps the PTE writeprotected. Reason for
> > > this is to avoid the additional overhead when unregistering. Note
> > > that this is the case also for !hugetlb and that we will end up with
> > > writable PTEs that still have the uffd-wp PTE bit set once we return
> > > from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
> > > seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
> > >
> > > VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE
> > > indicates that MAP_SHARED handling was at least envisioned, but could never
> > > have worked as expected.
> > >
> > > While at it, make sure that we never end up in hugetlb_wp() on write
> > > faults without VM_WRITE, because we don't support maybe_mkwrite()
> > > semantics as commonly used in the !hugetlb case -- for example, in
> > > wp_page_reuse().
> >
> > Nit,
> > to me 'make sure that we never end up in hugetlb_wp()' implies that
> > we would check for condition in callers as opposed to first thing in
> > hugetlb_wp().  However, I am OK with description as it.
>

Hi Gerald,

> Is that new WARN_ON_ONCE() in hugetlb_wp() meant to indicate a real bug?

Most probably, unless I am missing something important.

Something triggers FAULT_FLAG_WRITE on a VMA without VM_WRITE and
hugetlb_wp() would map the pte writable.
Consequently, we'd have a writable pte inside a VMA that does not have
write permissions, which is dubious. My check prevents that and bails
out.

Ordinary (!hugetlb) faults have maybe_mkwrite() (e.g., for FOLL_FORCE
or breaking COW) semantics such that we won't be mapping PTEs writable
if the VMA does not have write permissions.

I suspect that either

a) Some write fault misses a protection check and ends up triggering a
FAULT_FLAG_WRITE where we should actually fail early.

b) The write fault is valid and some VMA misses proper flags (VM_WRITE).

c) The write fault is valid (e.g., for breaking COW or FOLL_FORCE) and
we'd actually want maybe_mkwrite semantics.

> It is triggered by libhugetlbfs testcase "HUGETLB_ELFMAP=R linkhuge_rw"
> (at least on s390), and crashes our CI, because it runs with panic_on_warn
> enabled.
>
> Not sure if this means that we have bug elsewhere, allowing us to
> get to the WARN in hugetlb_wp().

That's what I suspect. Do you have a backtrace?

Note that I'm on vacation this week and might not reply as fast as usual.
Gerald Schaefer Aug. 15, 2022, 3:59 p.m. UTC | #6
On Mon, 15 Aug 2022 17:07:32 +0200
David Hildenbrand <david@redhat.com> wrote:

> On Mon, Aug 15, 2022 at 3:36 PM Gerald Schaefer
> <gerald.schaefer@linux.ibm.com> wrote:
> >
> > On Thu, 11 Aug 2022 11:59:09 -0700
> > Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >
> > > On 08/11/22 12:34, David Hildenbrand wrote:
> > > > If we ever get a write-fault on a write-protected page in a shared mapping,
> > > > we'd be in trouble (again). Instead, we can simply map the page writable.
> > > >
> > > <snip>
> > > >
> > > > Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
> > > > unregistering and consequently keeps the PTE writeprotected. Reason for
> > > > this is to avoid the additional overhead when unregistering. Note
> > > > that this is the case also for !hugetlb and that we will end up with
> > > > writable PTEs that still have the uffd-wp PTE bit set once we return
> > > > from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
> > > > seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
> > > >
> > > > VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE
> > > > indicates that MAP_SHARED handling was at least envisioned, but could never
> > > > have worked as expected.
> > > >
> > > > While at it, make sure that we never end up in hugetlb_wp() on write
> > > > faults without VM_WRITE, because we don't support maybe_mkwrite()
> > > > semantics as commonly used in the !hugetlb case -- for example, in
> > > > wp_page_reuse().
> > >
> > > Nit,
> > > to me 'make sure that we never end up in hugetlb_wp()' implies that
> > > we would check for condition in callers as opposed to first thing in
> > > hugetlb_wp().  However, I am OK with description as it.
> >
> 
> Hi Gerald,
> 
> > Is that new WARN_ON_ONCE() in hugetlb_wp() meant to indicate a real bug?
> 
> Most probably, unless I am missing something important.
> 
> Something triggers FAULT_FLAG_WRITE on a VMA without VM_WRITE and
> hugetlb_wp() would map the pte writable.
> Consequently, we'd have a writable pte inside a VMA that does not have
> write permissions, which is dubious. My check prevents that and bails
> out.
> 
> Ordinary (!hugetlb) faults have maybe_mkwrite() (e.g., for FOLL_FORCE
> or breaking COW) semantics such that we won't be mapping PTEs writable
> if the VMA does not have write permissions.
> 
> I suspect that either
> 
> a) Some write fault misses a protection check and ends up triggering a
> FAULT_FLAG_WRITE where we should actually fail early.
> 
> b) The write fault is valid and some VMA misses proper flags (VM_WRITE).
> 
> c) The write fault is valid (e.g., for breaking COW or FOLL_FORCE) and
> we'd actually want maybe_mkwrite semantics.
> 
> > It is triggered by libhugetlbfs testcase "HUGETLB_ELFMAP=R linkhuge_rw"
> > (at least on s390), and crashes our CI, because it runs with panic_on_warn
> > enabled.
> >
> > Not sure if this means that we have bug elsewhere, allowing us to
> > get to the WARN in hugetlb_wp().
> 
> That's what I suspect. Do you have a backtrace?

Sure, forgot to send it with initial reply...

[   82.574749] ------------[ cut here ]------------
[   82.574751] WARNING: CPU: 9 PID: 1674 at mm/hugetlb.c:5264 hugetlb_wp+0x3be/0x818
[   82.574759] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink sunrpc uvdevice s390_trng vfio_ccw mdev vfio_iommu_type1 eadm_sch vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common pkey zcrypt rng_core autofs4
[   82.574785] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
[   82.574787] Hardware name: IBM 3931 A01 704 (LPAR)
[   82.574788] Krnl PSW : 0704c00180000000 00000006c9d4bc6a (hugetlb_wp+0x3c2/0x818)
[   82.574791]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[   82.574794] Krnl GPRS: 000000000227c000 0000000008640071 0000000000000000 0000000001200000
[   82.574796]            0000000001200000 00000000b5a98090 0000000000000255 00000000adb2c898
[   82.574797]            0000000000000000 00000000adb2c898 0000000001200000 00000000b5a98090
[   82.574799]            000000008c408000 0000000092fd7300 000003800339bc10 000003800339baf8
[   82.574803] Krnl Code: 00000006c9d4bc5c: f160000407fe        mvo     4(7,%r0),2046(1,%r0)
           00000006c9d4bc62: 47000700           bc      0,1792
          #00000006c9d4bc66: af000000           mc      0,0
          >00000006c9d4bc6a: a7a80040           lhi     %r10,64
           00000006c9d4bc6e: b916002a           llgfr   %r2,%r10
           00000006c9d4bc72: eb6ff1600004       lmg     %r6,%r15,352(%r15)
           00000006c9d4bc78: 07fe               bcr     15,%r14
           00000006c9d4bc7a: 47000700           bc      0,1792
[   82.574814] Call Trace:
[   82.574842]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818 
[   82.574846]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8 
[   82.574848]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590 
[   82.574850]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220 
[   82.574855]  [<00000006c9a9d70e>] do_exception+0x19e/0x470 
[   82.574858]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50 
[   82.574861]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0 
[   82.574866]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170 
[   82.574870] Last Breaking-Event-Address:
[   82.574871]  [<00000006c9d4b926>] hugetlb_wp+0x7e/0x818
[   82.574873] Kernel panic - not syncing: panic_on_warn set ...
[   82.574875] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
[   82.574877] Hardware name: IBM 3931 A01 704 (LPAR)
[   82.574878] Call Trace:
[   82.574879]  [<00000006ca664f22>] dump_stack_lvl+0x62/0x80 
[   82.574881]  [<00000006ca657af8>] panic+0x118/0x300 
[   82.574884]  [<00000006c9ac3da6>] __warn+0xb6/0x160 
[   82.574887]  [<00000006ca29b1ea>] report_bug+0xba/0x140 
[   82.574890]  [<00000006c9a75194>] monitor_event_exception+0x44/0x80 
[   82.574892]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0 
[   82.574894]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170 
[   82.574897]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818 
[   82.574899]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8 
[   82.574901]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590 
[   82.574903]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220 
[   82.574906]  [<00000006c9a9d70e>] do_exception+0x19e/0x470 
[   82.574907]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50 
[   82.574909]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0 
[   82.574912]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170 

There are also some "User process fault" messages before that, but those
seem to be expected by the nature of that linhuge_rw tests, i.e. they are
not new and also showed before. This is why I assumed it could be just
some weird user space logic, triggering that new WARN_ON_ONCE.
David Hildenbrand Aug. 15, 2022, 6:03 p.m. UTC | #7
On Mon, Aug 15, 2022 at 5:59 PM Gerald Schaefer
<gerald.schaefer@linux.ibm.com> wrote:
>
> On Mon, 15 Aug 2022 17:07:32 +0200
> David Hildenbrand <david@redhat.com> wrote:
>
> > On Mon, Aug 15, 2022 at 3:36 PM Gerald Schaefer
> > <gerald.schaefer@linux.ibm.com> wrote:
> > >
> > > On Thu, 11 Aug 2022 11:59:09 -0700
> > > Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > >
> > > > On 08/11/22 12:34, David Hildenbrand wrote:
> > > > > If we ever get a write-fault on a write-protected page in a shared mapping,
> > > > > we'd be in trouble (again). Instead, we can simply map the page writable.
> > > > >
> > > > <snip>
> > > > >
> > > > > Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
> > > > > unregistering and consequently keeps the PTE writeprotected. Reason for
> > > > > this is to avoid the additional overhead when unregistering. Note
> > > > > that this is the case also for !hugetlb and that we will end up with
> > > > > writable PTEs that still have the uffd-wp PTE bit set once we return
> > > > > from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
> > > > > seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
> > > > >
> > > > > VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE
> > > > > indicates that MAP_SHARED handling was at least envisioned, but could never
> > > > > have worked as expected.
> > > > >
> > > > > While at it, make sure that we never end up in hugetlb_wp() on write
> > > > > faults without VM_WRITE, because we don't support maybe_mkwrite()
> > > > > semantics as commonly used in the !hugetlb case -- for example, in
> > > > > wp_page_reuse().
> > > >
> > > > Nit,
> > > > to me 'make sure that we never end up in hugetlb_wp()' implies that
> > > > we would check for condition in callers as opposed to first thing in
> > > > hugetlb_wp().  However, I am OK with description as it.
> > >
> >
> > Hi Gerald,
> >
> > > Is that new WARN_ON_ONCE() in hugetlb_wp() meant to indicate a real bug?
> >
> > Most probably, unless I am missing something important.
> >
> > Something triggers FAULT_FLAG_WRITE on a VMA without VM_WRITE and
> > hugetlb_wp() would map the pte writable.
> > Consequently, we'd have a writable pte inside a VMA that does not have
> > write permissions, which is dubious. My check prevents that and bails
> > out.
> >
> > Ordinary (!hugetlb) faults have maybe_mkwrite() (e.g., for FOLL_FORCE
> > or breaking COW) semantics such that we won't be mapping PTEs writable
> > if the VMA does not have write permissions.
> >
> > I suspect that either
> >
> > a) Some write fault misses a protection check and ends up triggering a
> > FAULT_FLAG_WRITE where we should actually fail early.
> >
> > b) The write fault is valid and some VMA misses proper flags (VM_WRITE).
> >
> > c) The write fault is valid (e.g., for breaking COW or FOLL_FORCE) and
> > we'd actually want maybe_mkwrite semantics.
> >
> > > It is triggered by libhugetlbfs testcase "HUGETLB_ELFMAP=R linkhuge_rw"
> > > (at least on s390), and crashes our CI, because it runs with panic_on_warn
> > > enabled.
> > >
> > > Not sure if this means that we have bug elsewhere, allowing us to
> > > get to the WARN in hugetlb_wp().
> >
> > That's what I suspect. Do you have a backtrace?
>
> Sure, forgot to send it with initial reply...
>
> [   82.574749] ------------[ cut here ]------------
> [   82.574751] WARNING: CPU: 9 PID: 1674 at mm/hugetlb.c:5264 hugetlb_wp+0x3be/0x818
> [   82.574759] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink sunrpc uvdevice s390_trng vfio_ccw mdev vfio_iommu_type1 eadm_sch vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common pkey zcrypt rng_core autofs4
> [   82.574785] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> [   82.574787] Hardware name: IBM 3931 A01 704 (LPAR)
> [   82.574788] Krnl PSW : 0704c00180000000 00000006c9d4bc6a (hugetlb_wp+0x3c2/0x818)
> [   82.574791]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [   82.574794] Krnl GPRS: 000000000227c000 0000000008640071 0000000000000000 0000000001200000
> [   82.574796]            0000000001200000 00000000b5a98090 0000000000000255 00000000adb2c898
> [   82.574797]            0000000000000000 00000000adb2c898 0000000001200000 00000000b5a98090
> [   82.574799]            000000008c408000 0000000092fd7300 000003800339bc10 000003800339baf8
> [   82.574803] Krnl Code: 00000006c9d4bc5c: f160000407fe        mvo     4(7,%r0),2046(1,%r0)
>            00000006c9d4bc62: 47000700           bc      0,1792
>           #00000006c9d4bc66: af000000           mc      0,0
>           >00000006c9d4bc6a: a7a80040           lhi     %r10,64
>            00000006c9d4bc6e: b916002a           llgfr   %r2,%r10
>            00000006c9d4bc72: eb6ff1600004       lmg     %r6,%r15,352(%r15)
>            00000006c9d4bc78: 07fe               bcr     15,%r14
>            00000006c9d4bc7a: 47000700           bc      0,1792
> [   82.574814] Call Trace:
> [   82.574842]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> [   82.574846]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> [   82.574848]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> [   82.574850]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> [   82.574855]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> [   82.574858]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> [   82.574861]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> [   82.574866]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> [   82.574870] Last Breaking-Event-Address:
> [   82.574871]  [<00000006c9d4b926>] hugetlb_wp+0x7e/0x818
> [   82.574873] Kernel panic - not syncing: panic_on_warn set ...
> [   82.574875] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> [   82.574877] Hardware name: IBM 3931 A01 704 (LPAR)
> [   82.574878] Call Trace:
> [   82.574879]  [<00000006ca664f22>] dump_stack_lvl+0x62/0x80
> [   82.574881]  [<00000006ca657af8>] panic+0x118/0x300
> [   82.574884]  [<00000006c9ac3da6>] __warn+0xb6/0x160
> [   82.574887]  [<00000006ca29b1ea>] report_bug+0xba/0x140
> [   82.574890]  [<00000006c9a75194>] monitor_event_exception+0x44/0x80
> [   82.574892]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> [   82.574894]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> [   82.574897]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> [   82.574899]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> [   82.574901]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> [   82.574903]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> [   82.574906]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> [   82.574907]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> [   82.574909]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> [   82.574912]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170


do_dat_exception() sets
  access = VM_ACCESS_FLAGS;

do_exception() sets
  is_write = (trans_exc_code & store_indication) == 0x400;

and FAULT_FLAG_WRITE
   if (access == VM_WRITE || is_write)
          flags |= FAULT_FLAG_WRITE;

however, for VMA permission checks it only checks
  if (unlikely(!(vma->vm_flags & access)))
          goto out_up;

as VM_ACCESS_FLAGS includes VM_WRITE | VM_READ ...

We end up triggering a write fault (FAULT_FLAG_WRITE), even though the
VMA does not allow for writes.

I assume that's what happens and that it's a bug in s390x code.
Gerald Schaefer Aug. 15, 2022, 6:38 p.m. UTC | #8
On Mon, 15 Aug 2022 20:03:20 +0200
David Hildenbrand <david@redhat.com> wrote:

> On Mon, Aug 15, 2022 at 5:59 PM Gerald Schaefer
> <gerald.schaefer@linux.ibm.com> wrote:
> >
> > On Mon, 15 Aug 2022 17:07:32 +0200
> > David Hildenbrand <david@redhat.com> wrote:
> >
> > > On Mon, Aug 15, 2022 at 3:36 PM Gerald Schaefer
> > > <gerald.schaefer@linux.ibm.com> wrote:
> > > >
> > > > On Thu, 11 Aug 2022 11:59:09 -0700
> > > > Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > >
> > > > > On 08/11/22 12:34, David Hildenbrand wrote:
> > > > > > If we ever get a write-fault on a write-protected page in a shared mapping,
> > > > > > we'd be in trouble (again). Instead, we can simply map the page writable.
> > > > > >
> > > > > <snip>
> > > > > >
> > > > > > Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when
> > > > > > unregistering and consequently keeps the PTE writeprotected. Reason for
> > > > > > this is to avoid the additional overhead when unregistering. Note
> > > > > > that this is the case also for !hugetlb and that we will end up with
> > > > > > writable PTEs that still have the uffd-wp PTE bit set once we return
> > > > > > from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it
> > > > > > seems to be a generic thing -- wp_page_reuse() also doesn't clear it.
> > > > > >
> > > > > > VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE
> > > > > > indicates that MAP_SHARED handling was at least envisioned, but could never
> > > > > > have worked as expected.
> > > > > >
> > > > > > While at it, make sure that we never end up in hugetlb_wp() on write
> > > > > > faults without VM_WRITE, because we don't support maybe_mkwrite()
> > > > > > semantics as commonly used in the !hugetlb case -- for example, in
> > > > > > wp_page_reuse().
> > > > >
> > > > > Nit,
> > > > > to me 'make sure that we never end up in hugetlb_wp()' implies that
> > > > > we would check for condition in callers as opposed to first thing in
> > > > > hugetlb_wp().  However, I am OK with description as it.
> > > >
> > >
> > > Hi Gerald,
> > >
> > > > Is that new WARN_ON_ONCE() in hugetlb_wp() meant to indicate a real bug?
> > >
> > > Most probably, unless I am missing something important.
> > >
> > > Something triggers FAULT_FLAG_WRITE on a VMA without VM_WRITE and
> > > hugetlb_wp() would map the pte writable.
> > > Consequently, we'd have a writable pte inside a VMA that does not have
> > > write permissions, which is dubious. My check prevents that and bails
> > > out.
> > >
> > > Ordinary (!hugetlb) faults have maybe_mkwrite() (e.g., for FOLL_FORCE
> > > or breaking COW) semantics such that we won't be mapping PTEs writable
> > > if the VMA does not have write permissions.
> > >
> > > I suspect that either
> > >
> > > a) Some write fault misses a protection check and ends up triggering a
> > > FAULT_FLAG_WRITE where we should actually fail early.
> > >
> > > b) The write fault is valid and some VMA misses proper flags (VM_WRITE).
> > >
> > > c) The write fault is valid (e.g., for breaking COW or FOLL_FORCE) and
> > > we'd actually want maybe_mkwrite semantics.
> > >
> > > > It is triggered by libhugetlbfs testcase "HUGETLB_ELFMAP=R linkhuge_rw"
> > > > (at least on s390), and crashes our CI, because it runs with panic_on_warn
> > > > enabled.
> > > >
> > > > Not sure if this means that we have bug elsewhere, allowing us to
> > > > get to the WARN in hugetlb_wp().
> > >
> > > That's what I suspect. Do you have a backtrace?
> >
> > Sure, forgot to send it with initial reply...
> >
> > [   82.574749] ------------[ cut here ]------------
> > [   82.574751] WARNING: CPU: 9 PID: 1674 at mm/hugetlb.c:5264 hugetlb_wp+0x3be/0x818
> > [   82.574759] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink sunrpc uvdevice s390_trng vfio_ccw mdev vfio_iommu_type1 eadm_sch vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common pkey zcrypt rng_core autofs4
> > [   82.574785] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> > [   82.574787] Hardware name: IBM 3931 A01 704 (LPAR)
> > [   82.574788] Krnl PSW : 0704c00180000000 00000006c9d4bc6a (hugetlb_wp+0x3c2/0x818)
> > [   82.574791]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> > [   82.574794] Krnl GPRS: 000000000227c000 0000000008640071 0000000000000000 0000000001200000
> > [   82.574796]            0000000001200000 00000000b5a98090 0000000000000255 00000000adb2c898
> > [   82.574797]            0000000000000000 00000000adb2c898 0000000001200000 00000000b5a98090
> > [   82.574799]            000000008c408000 0000000092fd7300 000003800339bc10 000003800339baf8
> > [   82.574803] Krnl Code: 00000006c9d4bc5c: f160000407fe        mvo     4(7,%r0),2046(1,%r0)
> >            00000006c9d4bc62: 47000700           bc      0,1792
> >           #00000006c9d4bc66: af000000           mc      0,0
> >           >00000006c9d4bc6a: a7a80040           lhi     %r10,64
> >            00000006c9d4bc6e: b916002a           llgfr   %r2,%r10
> >            00000006c9d4bc72: eb6ff1600004       lmg     %r6,%r15,352(%r15)
> >            00000006c9d4bc78: 07fe               bcr     15,%r14
> >            00000006c9d4bc7a: 47000700           bc      0,1792
> > [   82.574814] Call Trace:
> > [   82.574842]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> > [   82.574846]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> > [   82.574848]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> > [   82.574850]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> > [   82.574855]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> > [   82.574858]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> > [   82.574861]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > [   82.574866]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > [   82.574870] Last Breaking-Event-Address:
> > [   82.574871]  [<00000006c9d4b926>] hugetlb_wp+0x7e/0x818
> > [   82.574873] Kernel panic - not syncing: panic_on_warn set ...
> > [   82.574875] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> > [   82.574877] Hardware name: IBM 3931 A01 704 (LPAR)
> > [   82.574878] Call Trace:
> > [   82.574879]  [<00000006ca664f22>] dump_stack_lvl+0x62/0x80
> > [   82.574881]  [<00000006ca657af8>] panic+0x118/0x300
> > [   82.574884]  [<00000006c9ac3da6>] __warn+0xb6/0x160
> > [   82.574887]  [<00000006ca29b1ea>] report_bug+0xba/0x140
> > [   82.574890]  [<00000006c9a75194>] monitor_event_exception+0x44/0x80
> > [   82.574892]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > [   82.574894]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > [   82.574897]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> > [   82.574899]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> > [   82.574901]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> > [   82.574903]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> > [   82.574906]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> > [   82.574907]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> > [   82.574909]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > [   82.574912]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> 
> 
> do_dat_exception() sets
>   access = VM_ACCESS_FLAGS;
> 
> do_exception() sets
>   is_write = (trans_exc_code & store_indication) == 0x400;
> 
> and FAULT_FLAG_WRITE
>    if (access == VM_WRITE || is_write)
>           flags |= FAULT_FLAG_WRITE;
> 
> however, for VMA permission checks it only checks
>   if (unlikely(!(vma->vm_flags & access)))
>           goto out_up;
> 
> as VM_ACCESS_FLAGS includes VM_WRITE | VM_READ ...
> 
> We end up triggering a write fault (FAULT_FLAG_WRITE), even though the
> VMA does not allow for writes.
> 
> I assume that's what happens and that it's a bug in s390x code.
> 

Hmm, that looks weird, but that doesn't mean it has to be broken.
We are talking about a pte_none() fault, not a protection exception
(do_dat_exception vs. do_protection_exception). Not sure if we get
any proper store indication in that case, but yes, this looks weird,
will have a closer look. Thanks for pointing out!

FWIW, meanwhile, I added a check to hugetlb_wp() in v5.19, for
(!unshare && !(vma->vm_flags & VM_WRITE)). This did not trigger,
however, it did trigger already before your commit. So something
already changed before your commit, and after v5.19.

Further bisecting showed that the check started to trigger
after commit bcd51a3c679d ("hugetlb: lazy page table copies in fork()"),
and after that the "HUGETLB_ELFMAP=R linkhuge_rw" testcase also
started segfaulting (not sure why we did not notice earlier...).

Anyway, I guess this means that your commit only made that change
in behavior more obvious, by adding the WARN_ON_ONCE, but it really
was introduced by that other commit.

Not sure if this gives any more insight to anyone, still confused
by your comments on do_exception(), which also sound like a possible
root cause for ending up in hugetlb_wp() w/o VM_WRITE (but why only
after commit bcd51a3c679d?).
Mike Kravetz Aug. 15, 2022, 9:43 p.m. UTC | #9
On 08/15/22 20:38, Gerald Schaefer wrote:
> On Mon, 15 Aug 2022 20:03:20 +0200
> David Hildenbrand <david@redhat.com> wrote:
> > On Mon, Aug 15, 2022 at 5:59 PM Gerald Schaefer
> > <gerald.schaefer@linux.ibm.com> wrote:
> > > On Mon, 15 Aug 2022 17:07:32 +0200
> > > David Hildenbrand <david@redhat.com> wrote:
> > > > On Mon, Aug 15, 2022 at 3:36 PM Gerald Schaefer
> > > > <gerald.schaefer@linux.ibm.com> wrote:
> > > > > On Thu, 11 Aug 2022 11:59:09 -0700
> > > > > Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > > >
> > > Sure, forgot to send it with initial reply...
> > >
> > > [   82.574749] ------------[ cut here ]------------
> > > [   82.574751] WARNING: CPU: 9 PID: 1674 at mm/hugetlb.c:5264 hugetlb_wp+0x3be/0x818
> > > [   82.574759] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink sunrpc uvdevice s390_trng vfio_ccw mdev vfio_iommu_type1 eadm_sch vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common pkey zcrypt rng_core autofs4
> > > [   82.574785] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> > > [   82.574787] Hardware name: IBM 3931 A01 704 (LPAR)
> > > [   82.574788] Krnl PSW : 0704c00180000000 00000006c9d4bc6a (hugetlb_wp+0x3c2/0x818)
> > > [   82.574791]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> > > [   82.574794] Krnl GPRS: 000000000227c000 0000000008640071 0000000000000000 0000000001200000
> > > [   82.574796]            0000000001200000 00000000b5a98090 0000000000000255 00000000adb2c898
> > > [   82.574797]            0000000000000000 00000000adb2c898 0000000001200000 00000000b5a98090
> > > [   82.574799]            000000008c408000 0000000092fd7300 000003800339bc10 000003800339baf8
> > > [   82.574803] Krnl Code: 00000006c9d4bc5c: f160000407fe        mvo     4(7,%r0),2046(1,%r0)
> > >            00000006c9d4bc62: 47000700           bc      0,1792
> > >           #00000006c9d4bc66: af000000           mc      0,0
> > >           >00000006c9d4bc6a: a7a80040           lhi     %r10,64
> > >            00000006c9d4bc6e: b916002a           llgfr   %r2,%r10
> > >            00000006c9d4bc72: eb6ff1600004       lmg     %r6,%r15,352(%r15)
> > >            00000006c9d4bc78: 07fe               bcr     15,%r14
> > >            00000006c9d4bc7a: 47000700           bc      0,1792
> > > [   82.574814] Call Trace:
> > > [   82.574842]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> > > [   82.574846]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> > > [   82.574848]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> > > [   82.574850]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> > > [   82.574855]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> > > [   82.574858]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> > > [   82.574861]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > > [   82.574866]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > > [   82.574870] Last Breaking-Event-Address:
> > > [   82.574871]  [<00000006c9d4b926>] hugetlb_wp+0x7e/0x818
> > > [   82.574873] Kernel panic - not syncing: panic_on_warn set ...
> > > [   82.574875] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> > > [   82.574877] Hardware name: IBM 3931 A01 704 (LPAR)
> > > [   82.574878] Call Trace:
> > > [   82.574879]  [<00000006ca664f22>] dump_stack_lvl+0x62/0x80
> > > [   82.574881]  [<00000006ca657af8>] panic+0x118/0x300
> > > [   82.574884]  [<00000006c9ac3da6>] __warn+0xb6/0x160
> > > [   82.574887]  [<00000006ca29b1ea>] report_bug+0xba/0x140
> > > [   82.574890]  [<00000006c9a75194>] monitor_event_exception+0x44/0x80
> > > [   82.574892]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > > [   82.574894]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > > [   82.574897]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> > > [   82.574899]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> > > [   82.574901]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> > > [   82.574903]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> > > [   82.574906]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> > > [   82.574907]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> > > [   82.574909]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > > [   82.574912]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > 
> > 
> > do_dat_exception() sets
> >   access = VM_ACCESS_FLAGS;
> > 
> > do_exception() sets
> >   is_write = (trans_exc_code & store_indication) == 0x400;
> > 
> > and FAULT_FLAG_WRITE
> >    if (access == VM_WRITE || is_write)
> >           flags |= FAULT_FLAG_WRITE;
> > 
> > however, for VMA permission checks it only checks
> >   if (unlikely(!(vma->vm_flags & access)))
> >           goto out_up;
> > 
> > as VM_ACCESS_FLAGS includes VM_WRITE | VM_READ ...
> > 
> > We end up triggering a write fault (FAULT_FLAG_WRITE), even though the
> > VMA does not allow for writes.
> > 
> > I assume that's what happens and that it's a bug in s390x code.
> > 
> 
> Hmm, that looks weird, but that doesn't mean it has to be broken.
> We are talking about a pte_none() fault, not a protection exception
> (do_dat_exception vs. do_protection_exception). Not sure if we get
> any proper store indication in that case, but yes, this looks weird,
> will have a closer look. Thanks for pointing out!
> 
> FWIW, meanwhile, I added a check to hugetlb_wp() in v5.19, for
> (!unshare && !(vma->vm_flags & VM_WRITE)). This did not trigger,
> however, it did trigger already before your commit. So something
> already changed before your commit, and after v5.19.
> 
> Further bisecting showed that the check started to trigger
> after commit bcd51a3c679d ("hugetlb: lazy page table copies in fork()"),
> and after that the "HUGETLB_ELFMAP=R linkhuge_rw" testcase also
> started segfaulting (not sure why we did not notice earlier...).
> 
> Anyway, I guess this means that your commit only made that change
> in behavior more obvious, by adding the WARN_ON_ONCE, but it really
> was introduced by that other commit.
> 
> Not sure if this gives any more insight to anyone, still confused
> by your comments on do_exception(), which also sound like a possible
> root cause for ending up in hugetlb_wp() w/o VM_WRITE (but why only
> after commit bcd51a3c679d?).

I know it doesn't mean much, but I did not/do not see these issues on x86.

bcd51a3c679d eliminates the copying of page tables at fork for non-anon
hugetlb vmas.  So, in these tests you would likely see more pte_none()
faults.

I'm trying to figure out exactly what this test is doing.  From a quick
look it is doing a fork/write fault to determine if addresses are writable.
Guessing that such faults are triggering the warning.  Will look at this
some more to try and gain more insight.
Gerald Schaefer Aug. 16, 2022, 9:33 a.m. UTC | #10
On Mon, 15 Aug 2022 14:43:16 -0700
Mike Kravetz <mike.kravetz@oracle.com> wrote:

> On 08/15/22 20:38, Gerald Schaefer wrote:
> > On Mon, 15 Aug 2022 20:03:20 +0200
> > David Hildenbrand <david@redhat.com> wrote:
> > > On Mon, Aug 15, 2022 at 5:59 PM Gerald Schaefer
> > > <gerald.schaefer@linux.ibm.com> wrote:
> > > > On Mon, 15 Aug 2022 17:07:32 +0200
> > > > David Hildenbrand <david@redhat.com> wrote:
> > > > > On Mon, Aug 15, 2022 at 3:36 PM Gerald Schaefer
> > > > > <gerald.schaefer@linux.ibm.com> wrote:
> > > > > > On Thu, 11 Aug 2022 11:59:09 -0700
> > > > > > Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > > > >
> > > > Sure, forgot to send it with initial reply...
> > > >
> > > > [   82.574749] ------------[ cut here ]------------
> > > > [   82.574751] WARNING: CPU: 9 PID: 1674 at mm/hugetlb.c:5264 hugetlb_wp+0x3be/0x818
> > > > [   82.574759] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink sunrpc uvdevice s390_trng vfio_ccw mdev vfio_iommu_type1 eadm_sch vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common pkey zcrypt rng_core autofs4
> > > > [   82.574785] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> > > > [   82.574787] Hardware name: IBM 3931 A01 704 (LPAR)
> > > > [   82.574788] Krnl PSW : 0704c00180000000 00000006c9d4bc6a (hugetlb_wp+0x3c2/0x818)
> > > > [   82.574791]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> > > > [   82.574794] Krnl GPRS: 000000000227c000 0000000008640071 0000000000000000 0000000001200000
> > > > [   82.574796]            0000000001200000 00000000b5a98090 0000000000000255 00000000adb2c898
> > > > [   82.574797]            0000000000000000 00000000adb2c898 0000000001200000 00000000b5a98090
> > > > [   82.574799]            000000008c408000 0000000092fd7300 000003800339bc10 000003800339baf8
> > > > [   82.574803] Krnl Code: 00000006c9d4bc5c: f160000407fe        mvo     4(7,%r0),2046(1,%r0)
> > > >            00000006c9d4bc62: 47000700           bc      0,1792
> > > >           #00000006c9d4bc66: af000000           mc      0,0
> > > >           >00000006c9d4bc6a: a7a80040           lhi     %r10,64
> > > >            00000006c9d4bc6e: b916002a           llgfr   %r2,%r10
> > > >            00000006c9d4bc72: eb6ff1600004       lmg     %r6,%r15,352(%r15)
> > > >            00000006c9d4bc78: 07fe               bcr     15,%r14
> > > >            00000006c9d4bc7a: 47000700           bc      0,1792
> > > > [   82.574814] Call Trace:
> > > > [   82.574842]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> > > > [   82.574846]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> > > > [   82.574848]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> > > > [   82.574850]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> > > > [   82.574855]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> > > > [   82.574858]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> > > > [   82.574861]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > > > [   82.574866]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > > > [   82.574870] Last Breaking-Event-Address:
> > > > [   82.574871]  [<00000006c9d4b926>] hugetlb_wp+0x7e/0x818
> > > > [   82.574873] Kernel panic - not syncing: panic_on_warn set ...
> > > > [   82.574875] CPU: 9 PID: 1674 Comm: linkhuge_rw Kdump: loaded Not tainted 5.19.0-next-20220815 #36
> > > > [   82.574877] Hardware name: IBM 3931 A01 704 (LPAR)
> > > > [   82.574878] Call Trace:
> > > > [   82.574879]  [<00000006ca664f22>] dump_stack_lvl+0x62/0x80
> > > > [   82.574881]  [<00000006ca657af8>] panic+0x118/0x300
> > > > [   82.574884]  [<00000006c9ac3da6>] __warn+0xb6/0x160
> > > > [   82.574887]  [<00000006ca29b1ea>] report_bug+0xba/0x140
> > > > [   82.574890]  [<00000006c9a75194>] monitor_event_exception+0x44/0x80
> > > > [   82.574892]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > > > [   82.574894]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > > > [   82.574897]  [<00000006c9d4bc6a>] hugetlb_wp+0x3c2/0x818
> > > > [   82.574899]  [<00000006c9d4c62e>] hugetlb_no_page+0x56e/0x5a8
> > > > [   82.574901]  [<00000006c9d4cac2>] hugetlb_fault+0x45a/0x590
> > > > [   82.574903]  [<00000006c9d06d4a>] handle_mm_fault+0x182/0x220
> > > > [   82.574906]  [<00000006c9a9d70e>] do_exception+0x19e/0x470
> > > > [   82.574907]  [<00000006c9a9dff2>] do_dat_exception+0x2a/0x50
> > > > [   82.574909]  [<00000006ca668a18>] __do_pgm_check+0xf0/0x1b0
> > > > [   82.574912]  [<00000006ca677b3c>] pgm_check_handler+0x11c/0x170
> > > 
> > > 
> > > do_dat_exception() sets
> > >   access = VM_ACCESS_FLAGS;
> > > 
> > > do_exception() sets
> > >   is_write = (trans_exc_code & store_indication) == 0x400;
> > > 
> > > and FAULT_FLAG_WRITE
> > >    if (access == VM_WRITE || is_write)
> > >           flags |= FAULT_FLAG_WRITE;
> > > 
> > > however, for VMA permission checks it only checks
> > >   if (unlikely(!(vma->vm_flags & access)))
> > >           goto out_up;
> > > 
> > > as VM_ACCESS_FLAGS includes VM_WRITE | VM_READ ...
> > > 
> > > We end up triggering a write fault (FAULT_FLAG_WRITE), even though the
> > > VMA does not allow for writes.
> > > 
> > > I assume that's what happens and that it's a bug in s390x code.
> > > 
> > 
> > Hmm, that looks weird, but that doesn't mean it has to be broken.
> > We are talking about a pte_none() fault, not a protection exception
> > (do_dat_exception vs. do_protection_exception). Not sure if we get
> > any proper store indication in that case, but yes, this looks weird,
> > will have a closer look. Thanks for pointing out!
> > 
> > FWIW, meanwhile, I added a check to hugetlb_wp() in v5.19, for
> > (!unshare && !(vma->vm_flags & VM_WRITE)). This did not trigger,
> > however, it did trigger already before your commit. So something
> > already changed before your commit, and after v5.19.
> > 
> > Further bisecting showed that the check started to trigger
> > after commit bcd51a3c679d ("hugetlb: lazy page table copies in fork()"),
> > and after that the "HUGETLB_ELFMAP=R linkhuge_rw" testcase also
> > started segfaulting (not sure why we did not notice earlier...).
> > 
> > Anyway, I guess this means that your commit only made that change
> > in behavior more obvious, by adding the WARN_ON_ONCE, but it really
> > was introduced by that other commit.
> > 
> > Not sure if this gives any more insight to anyone, still confused
> > by your comments on do_exception(), which also sound like a possible
> > root cause for ending up in hugetlb_wp() w/o VM_WRITE (but why only
> > after commit bcd51a3c679d?).
> 
> I know it doesn't mean much, but I did not/do not see these issues on x86.

Thanks, we were also trying to reproduce on x86, w/o success so far. But
I guess that matches David latest observations wrt to our exception handling
code on s390.

Good news is that the problem goes away when I add this simple patch, which
should result in proper VM_WRITE check for vma flags, before triggering a
FAULT_FLAG_WRITE fault:

--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -379,7 +379,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
        flags = FAULT_FLAG_DEFAULT;
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;
-       if (access == VM_WRITE || is_write)
+       if (is_write)
+               access = VM_WRITE;
+       if (access == VM_WRITE)
                flags |= FAULT_FLAG_WRITE;
        mmap_read_lock(mm);
 
Still find it a bit hard to believe that this > 10 years old logic really
is/was broken all the time. I guess it simply did not matter for normal
PTE faults, probably because the common fault handling code later would
check itself via maybe_mkwrite(). And for hugetlb PTEs, it might not have
mattered before commit bcd51a3c679d.

> 
> bcd51a3c679d eliminates the copying of page tables at fork for non-anon
> hugetlb vmas.  So, in these tests you would likely see more pte_none()
> faults.

Yes, makes sense, assuming now that it actually is related to s390
exception handling code, not checking for VM_WRITE before triggering a
write fault for pte_none().

Thanks for checking! And Thanks a lot to David for finding that issue
in s390 exception handling code!
David Hildenbrand Aug. 16, 2022, 8:43 p.m. UTC | #11
Hi Gerald,

>
> Thanks, we were also trying to reproduce on x86, w/o success so far. But
> I guess that matches David latest observations wrt to our exception handling
> code on s390.
>
> Good news is that the problem goes away when I add this simple patch, which
> should result in proper VM_WRITE check for vma flags, before triggering a
> FAULT_FLAG_WRITE fault:
>
> --- a/arch/s390/mm/fault.c
> +++ b/arch/s390/mm/fault.c
> @@ -379,7 +379,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
>         flags = FAULT_FLAG_DEFAULT;
>         if (user_mode(regs))
>                 flags |= FAULT_FLAG_USER;
> -       if (access == VM_WRITE || is_write)
> +       if (is_write)
> +               access = VM_WRITE;
> +       if (access == VM_WRITE)
>                 flags |= FAULT_FLAG_WRITE;
>         mmap_read_lock(mm);

That's what I had in mind, good.

>
> Still find it a bit hard to believe that this > 10 years old logic really
> is/was broken all the time. I guess it simply did not matter for normal
> PTE faults, probably because the common fault handling code later would
> check itself via maybe_mkwrite(). And for hugetlb PTEs, it might not have
> mattered before commit bcd51a3c679d.

It is akward, but maybe we never really noticed for hugetlb (not sure
how common read-only mappings are after all).

>
> >
> > bcd51a3c679d eliminates the copying of page tables at fork for non-anon
> > hugetlb vmas.  So, in these tests you would likely see more pte_none()
> > faults.
>
> Yes, makes sense, assuming now that it actually is related to s390
> exception handling code, not checking for VM_WRITE before triggering a
> write fault for pte_none().
>
> Thanks for checking! And Thanks a lot to David for finding that issue
> in s390 exception handling code!

Thanks! Looks like adding the WARN_ON_ONCE was the right decision.
diff mbox series

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0aee2f3ae15c..2480ba627aa5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5241,6 +5241,21 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 	VM_BUG_ON(unshare && (flags & FOLL_WRITE));
 	VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
 
+	/*
+	 * hugetlb does not support FOLL_FORCE-style write faults that keep the
+	 * PTE mapped R/O such as maybe_mkwrite() would do.
+	 */
+	if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
+		return VM_FAULT_SIGSEGV;
+
+	/* Let's take out MAP_SHARED mappings first. */
+	if (vma->vm_flags & VM_MAYSHARE) {
+		if (unlikely(unshare))
+			return 0;
+		set_huge_ptep_writable(vma, haddr, ptep);
+		return 0;
+	}
+
 	pte = huge_ptep_get(ptep);
 	old_page = pte_page(pte);
 
@@ -5781,12 +5796,11 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * If we are going to COW/unshare the mapping later, we examine the
 	 * pending reservations for this page now. This will ensure that any
 	 * allocations necessary to record that reservation occur outside the
-	 * spinlock. For private mappings, we also lookup the pagecache
-	 * page now as it is used to determine if a reservation has been
-	 * consumed.
+	 * spinlock. Also lookup the pagecache page now as it is used to
+	 * determine if a reservation has been consumed.
 	 */
 	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
-	    !huge_pte_write(entry)) {
+	    !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, haddr) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
@@ -5794,9 +5808,7 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Just decrements count, does not deallocate */
 		vma_end_reservation(h, vma, haddr);
 
-		if (!(vma->vm_flags & VM_MAYSHARE))
-			pagecache_page = hugetlbfs_pagecache_page(h,
-								vma, haddr);
+		pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);