diff mbox series

[v2,1/2] mm/hugetlb: fix hugetlb not supporting softdirty tracking

Message ID 20220811103435.188481-2-david@redhat.com (mailing list archive)
State New
Headers show
Series mm/hugetlb: fix write-fault handling for shared mappings | expand

Commit Message

David Hildenbrand Aug. 11, 2022, 10:34 a.m. UTC
Staring at hugetlb_wp(), one might wonder where all the logic for shared
mappings is when stumbling over a write-protected page in a shared
mapping. In fact, there is none, and so far we thought we could get
away with that because e.g., mprotect() should always do the right thing
and map all pages directly writable.

Looks like we were wrong:

--------------------------------------------------------------------------
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 #include <sys/mman.h>

 #define HUGETLB_SIZE (2 * 1024 * 1024u)

 static void clear_softdirty(void)
 {
         int fd = open("/proc/self/clear_refs", O_WRONLY);
         const char *ctrl = "4";
         int ret;

         if (fd < 0) {
                 fprintf(stderr, "open(clear_refs) failed\n");
                 exit(1);
         }
         ret = write(fd, ctrl, strlen(ctrl));
         if (ret != strlen(ctrl)) {
                 fprintf(stderr, "write(clear_refs) failed\n");
                 exit(1);
         }
         close(fd);
 }

 int main(int argc, char **argv)
 {
         char *map;
         int fd;

         fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
         if (!fd) {
                 fprintf(stderr, "open() failed\n");
                 return -errno;
         }
         if (ftruncate(fd, HUGETLB_SIZE)) {
                 fprintf(stderr, "ftruncate() failed\n");
                 return -errno;
         }

         map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
         if (map == MAP_FAILED) {
                 fprintf(stderr, "mmap() failed\n");
                 return -errno;
         }

         *map = 0;

         if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
                 fprintf(stderr, "mmprotect() failed\n");
                 return -errno;
         }

         clear_softdirty();

         if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
                 fprintf(stderr, "mmprotect() failed\n");
                 return -errno;
         }

         *map = 0;

         return 0;
 }
--------------------------------------------------------------------------

Above test fails with SIGBUS when there is only a single free hugetlb page.
 # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 # ./test
 Bus error (core dumped)

And worse, with sufficient free hugetlb pages it will map an anonymous page
into a shared mapping, for example, messing up accounting during unmap
and breaking MAP_SHARED semantics:
 # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 # ./test
 # cat /proc/meminfo | grep HugePages_
 HugePages_Total:       2
 HugePages_Free:        1
 HugePages_Rsvd:    18446744073709551615
 HugePages_Surp:        0

Reason in this particular case is that vma_wants_writenotify() will
return "true", removing VM_SHARED in vma_set_page_prot() to map pages
write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
support softdirty tracking.

Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
Cc: <stable@vger.kernel.org> # v3.18+
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/mmap.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

Comments

Mike Kravetz Aug. 11, 2022, 6:27 p.m. UTC | #1
On 08/11/22 12:34, David Hildenbrand wrote:
> Staring at hugetlb_wp(), one might wonder where all the logic for shared
> mappings is when stumbling over a write-protected page in a shared
> mapping. In fact, there is none, and so far we thought we could get
> away with that because e.g., mprotect() should always do the right thing
> and map all pages directly writable.
> 
> Looks like we were wrong:
> 
> --------------------------------------------------------------------------
>  #include <stdio.h>
>  #include <stdlib.h>
>  #include <string.h>
>  #include <fcntl.h>
>  #include <unistd.h>
>  #include <errno.h>
>  #include <sys/mman.h>
> 
>  #define HUGETLB_SIZE (2 * 1024 * 1024u)
> 
>  static void clear_softdirty(void)
>  {
>          int fd = open("/proc/self/clear_refs", O_WRONLY);
>          const char *ctrl = "4";
>          int ret;
> 
>          if (fd < 0) {
>                  fprintf(stderr, "open(clear_refs) failed\n");
>                  exit(1);
>          }
>          ret = write(fd, ctrl, strlen(ctrl));
>          if (ret != strlen(ctrl)) {
>                  fprintf(stderr, "write(clear_refs) failed\n");
>                  exit(1);
>          }
>          close(fd);
>  }
> 
>  int main(int argc, char **argv)
>  {
>          char *map;
>          int fd;
> 
>          fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT);
>          if (!fd) {
>                  fprintf(stderr, "open() failed\n");
>                  return -errno;
>          }
>          if (ftruncate(fd, HUGETLB_SIZE)) {
>                  fprintf(stderr, "ftruncate() failed\n");
>                  return -errno;
>          }
> 
>          map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
>          if (map == MAP_FAILED) {
>                  fprintf(stderr, "mmap() failed\n");
>                  return -errno;
>          }
> 
>          *map = 0;
> 
>          if (mprotect(map, HUGETLB_SIZE, PROT_READ)) {
>                  fprintf(stderr, "mmprotect() failed\n");
>                  return -errno;
>          }
> 
>          clear_softdirty();
> 
>          if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) {
>                  fprintf(stderr, "mmprotect() failed\n");
>                  return -errno;
>          }
> 
>          *map = 0;
> 
>          return 0;
>  }
> --------------------------------------------------------------------------
> 
> Above test fails with SIGBUS when there is only a single free hugetlb page.
>  # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
>  # ./test
>  Bus error (core dumped)
> 
> And worse, with sufficient free hugetlb pages it will map an anonymous page
> into a shared mapping, for example, messing up accounting during unmap
> and breaking MAP_SHARED semantics:
>  # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
>  # ./test
>  # cat /proc/meminfo | grep HugePages_
>  HugePages_Total:       2
>  HugePages_Free:        1
>  HugePages_Rsvd:    18446744073709551615
>  HugePages_Surp:        0
> 
> Reason in this particular case is that vma_wants_writenotify() will
> return "true", removing VM_SHARED in vma_set_page_prot() to map pages
> write-protected. Let's teach vma_wants_writenotify() that hugetlb does not
> support softdirty tracking.
> 
> Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared")
> Cc: <stable@vger.kernel.org> # v3.18+
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  mm/mmap.c | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)

Thanks,

Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
diff mbox series

Patch

diff --git a/mm/mmap.c b/mm/mmap.c
index c035020d0c89..9d780f415be3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1646,8 +1646,11 @@  int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
 	    pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
 		return 0;
 
-	/* Do we need to track softdirty? */
-	if (vma_soft_dirty_enabled(vma))
+	/*
+	 * Do we need to track softdirty? hugetlb does not support softdirty
+	 * tracking yet.
+	 */
+	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
 		return 1;
 
 	/* Specialty mapping? */