diff mbox series

maple tree change made it possible for VMA iteration to see same VMA twice due to late vma_merge() failure

Message ID CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejtiHpgJAbdQ@mail.gmail.com (mailing list archive)
State New
Headers show
Series maple tree change made it possible for VMA iteration to see same VMA twice due to late vma_merge() failure | expand

Commit Message

Jann Horn Aug. 15, 2023, 7:36 p.m. UTC
commit 18b098af2890 ("vma_merge: set vma iterator to correct
position.") added a vma_prev(vmi) call to vma_merge() at a point where
it's still possible to bail out. My understanding is that this moves
the VMA iterator back by one VMA.

If you patch some extra logging into the kernel and inject a fake
out-of-memory error at the vma_iter_prealloc() call in vma_split() (a
real out-of-memory error there is very unlikely to happen in practice,
I think - my understanding is that the kernel will basically kill
every process on the system except for init before it starts failing
GFP_KERNEL allocations that fit within a single slab, unless the
allocation uses GFP_ACCOUNT or stuff like that, which the maple tree
doesn't):

```
        init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
```

and then you run this reproducer:
```
#define _GNU_SOURCE
#include <err.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

#ifndef UFFD_USER_MODE_ONLY
#define UFFD_USER_MODE_ONLY 1
#endif

#define SYSCHK(x) ({          \
  typeof(x) __res = (x);      \
  if (__res == (typeof(x))-1) \
    err(1, "SYSCHK(" #x ")"); \
  __res;                      \
})

int main(void) {
  int uffd = SYSCHK(syscall(__NR_userfaultfd, UFFD_USER_MODE_ONLY));

  struct uffdio_api api = { .api = UFFD_API, .features = 0 };
  SYSCHK(ioctl(uffd, UFFDIO_API, &api));

  /* create vma1 */
  SYSCHK(mmap((void*)0x100000UL, 0x1000, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0));

  /* set uffd on vma1 */
  struct uffdio_register reg1 = {
    .range = { .start = 0x100000, .len = 0x1000 },
    .mode = UFFDIO_REGISTER_MODE_MISSING
  };
  SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &reg1));

  /* create vma2 */
  SYSCHK(mmap((void*)0x101000UL, 0x1000, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0));

  /* tries to merge vma2 into vma1, with injected allocation failure
causing merge failure */
  SYSCHK(prctl(PR_SET_NAME, "FAILME"));
  struct uffdio_register reg2 = {
    .range = { .start = 0x101000, .len = 0x1000 },
    .mode = UFFDIO_REGISTER_MODE_MISSING
  };
  SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &reg2));
  SYSCHK(prctl(PR_SET_NAME, "normal"));
}
```

then you'll get this fun log output, showing that the same VMA
(ffff88810c0b5e00) was visited by two iterations of the VMA iteration
loop, and on the second iteration, prev==vma:

[  326.765586] userfaultfd_register: begin vma iteration
[  326.766985] userfaultfd_register: prev=ffff88810c0b5ef0,
vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
[  326.768786] userfaultfd_register: vma_merge returned 0000000000000000
[  326.769898] userfaultfd_register: prev=ffff88810c0b5e00,
vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)

I don't know if this can lead to anything bad but it seems pretty
clearly unintended?

Comments

Jann Horn Aug. 15, 2023, 7:44 p.m. UTC | #1
On Tue, Aug 15, 2023 at 9:36 PM Jann Horn <jannh@google.com> wrote:
> I think - my understanding is that the kernel will basically kill
> every process on the system except for init before it starts failing
> GFP_KERNEL allocations that fit within a single slab, unless the

s/slab/page/
Liam R. Howlett Aug. 16, 2023, 4:17 p.m. UTC | #2
* Jann Horn <jannh@google.com> [230815 15:37]:
> commit 18b098af2890 ("vma_merge: set vma iterator to correct
> position.") added a vma_prev(vmi) call to vma_merge() at a point where
> it's still possible to bail out. My understanding is that this moves
> the VMA iterator back by one VMA.
> 
> If you patch some extra logging into the kernel and inject a fake
> out-of-memory error at the vma_iter_prealloc() call in vma_split() (a
> real out-of-memory error there is very unlikely to happen in practice,
> I think - my understanding is that the kernel will basically kill
> every process on the system except for init before it starts failing
> GFP_KERNEL allocations that fit within a single slab, unless the
> allocation uses GFP_ACCOUNT or stuff like that, which the maple tree
> doesn't):
> 
> ```
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 7cecd49e078b..a7be4d6a5db6 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1454,9 +1454,16 @@ static int userfaultfd_register(struct
> userfaultfd_ctx *ctx,
>                 prev = vma;
> 
>         ret = 0;
> +       if (strcmp(current->comm, "FAILME") == 0)
> +               pr_warn("%s: begin vma iteration\n", __func__);
>         for_each_vma_range(vmi, vma, end) {
>                 cond_resched();
> 
> +               if (strcmp(current->comm, "FAILME") == 0) {
> +                       pr_warn("%s: prev=%px, vma=%px (%016lx-%016lx)\n",
> +                               __func__, prev, vma, vma->vm_start,
> vma->vm_end);
> +               }
> +
>                 BUG_ON(!vma_can_userfault(vma, vm_flags));
>                 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
>                        vma->vm_userfaultfd_ctx.ctx != ctx);
> @@ -1481,6 +1488,8 @@ static int userfaultfd_register(struct
> userfaultfd_ctx *ctx,
>                                  vma_policy(vma),
>                                  ((struct vm_userfaultfd_ctx){ ctx }),
>                                  anon_vma_name(vma));
> +               if (strcmp(current->comm, "FAILME") == 0)
> +                       pr_warn("%s: vma_merge returned %px\n", __func__, prev);
>                 if (prev) {
>                         /* vma_merge() invalidated the mas */
>                         vma = prev;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 3937479d0e07..fd435c40c43d 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -990,7 +990,7 @@ struct vm_area_struct *vma_merge(struct
> vma_iterator *vmi, struct mm_struct *mm,
>         if (err)
>                 return NULL;
> 
> -       if (vma_iter_prealloc(vmi))
> +       if (strcmp(current->comm, "FAILME")==0 || vma_iter_prealloc(vmi))
>                 return NULL;
> 
>         init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> ```
> 
> and then you run this reproducer:
> ```
> #define _GNU_SOURCE
> #include <err.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <sys/syscall.h>
> #include <sys/prctl.h>
> #include <sys/mman.h>
> #include <sys/ioctl.h>
> #include <linux/userfaultfd.h>
> 
> #ifndef UFFD_USER_MODE_ONLY
> #define UFFD_USER_MODE_ONLY 1
> #endif
> 
> #define SYSCHK(x) ({          \
>   typeof(x) __res = (x);      \
>   if (__res == (typeof(x))-1) \
>     err(1, "SYSCHK(" #x ")"); \
>   __res;                      \
> })
> 
> int main(void) {
>   int uffd = SYSCHK(syscall(__NR_userfaultfd, UFFD_USER_MODE_ONLY));
> 
>   struct uffdio_api api = { .api = UFFD_API, .features = 0 };
>   SYSCHK(ioctl(uffd, UFFDIO_API, &api));
> 
>   /* create vma1 */
>   SYSCHK(mmap((void*)0x100000UL, 0x1000, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0));
> 
>   /* set uffd on vma1 */
>   struct uffdio_register reg1 = {
>     .range = { .start = 0x100000, .len = 0x1000 },
>     .mode = UFFDIO_REGISTER_MODE_MISSING
>   };
>   SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &reg1));
> 
>   /* create vma2 */
>   SYSCHK(mmap((void*)0x101000UL, 0x1000, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0));
> 
>   /* tries to merge vma2 into vma1, with injected allocation failure
> causing merge failure */
>   SYSCHK(prctl(PR_SET_NAME, "FAILME"));
>   struct uffdio_register reg2 = {
>     .range = { .start = 0x101000, .len = 0x1000 },
>     .mode = UFFDIO_REGISTER_MODE_MISSING
>   };
>   SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &reg2));
>   SYSCHK(prctl(PR_SET_NAME, "normal"));
> }
> ```
> 
> then you'll get this fun log output, showing that the same VMA
> (ffff88810c0b5e00) was visited by two iterations of the VMA iteration
> loop, and on the second iteration, prev==vma:
> 
> [  326.765586] userfaultfd_register: begin vma iteration
> [  326.766985] userfaultfd_register: prev=ffff88810c0b5ef0,
> vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> [  326.768786] userfaultfd_register: vma_merge returned 0000000000000000
> [  326.769898] userfaultfd_register: prev=ffff88810c0b5e00,
> vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> 
> I don't know if this can lead to anything bad but it seems pretty
> clearly unintended?

Yes, unintended.

So we are running out of memory, but since vma_merge() doesn't
differentiate between failure and 'nothing to merge', we end up in a
situation that we will revisit the same VMA.

I've been thinking about a way to work this into the interface and I
don't see a clean way because we (could) do different things before the
call depending on the situation.

I think we need to undo any vma iterator changes in the failure
scenarios if there is a chance of the iterator continuing to be used,
which is probably not limited to just this case.

I will audit these areas and CC you on the result.

Thanks,
Liam
Jann Horn Aug. 16, 2023, 5:12 p.m. UTC | #3
On Wed, Aug 16, 2023 at 6:18 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> * Jann Horn <jannh@google.com> [230815 15:37]:
> > commit 18b098af2890 ("vma_merge: set vma iterator to correct
> > position.") added a vma_prev(vmi) call to vma_merge() at a point where
> > it's still possible to bail out. My understanding is that this moves
> > the VMA iterator back by one VMA.
> >
> > If you patch some extra logging into the kernel and inject a fake
> > out-of-memory error at the vma_iter_prealloc() call in vma_split() (a
> > real out-of-memory error there is very unlikely to happen in practice,
> > I think - my understanding is that the kernel will basically kill
> > every process on the system except for init before it starts failing
> > GFP_KERNEL allocations that fit within a single slab, unless the
> > allocation uses GFP_ACCOUNT or stuff like that, which the maple tree
> > doesn't):
[...]
> > then you'll get this fun log output, showing that the same VMA
> > (ffff88810c0b5e00) was visited by two iterations of the VMA iteration
> > loop, and on the second iteration, prev==vma:
> >
> > [  326.765586] userfaultfd_register: begin vma iteration
> > [  326.766985] userfaultfd_register: prev=ffff88810c0b5ef0,
> > vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> > [  326.768786] userfaultfd_register: vma_merge returned 0000000000000000
> > [  326.769898] userfaultfd_register: prev=ffff88810c0b5e00,
> > vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> >
> > I don't know if this can lead to anything bad but it seems pretty
> > clearly unintended?
>
> Yes, unintended.
>
> So we are running out of memory, but since vma_merge() doesn't
> differentiate between failure and 'nothing to merge', we end up in a
> situation that we will revisit the same VMA.
>
> I've been thinking about a way to work this into the interface and I
> don't see a clean way because we (could) do different things before the
> call depending on the situation.
>
> I think we need to undo any vma iterator changes in the failure
> scenarios if there is a chance of the iterator continuing to be used,
> which is probably not limited to just this case.

I don't fully understand the maple tree interface - in the specific
case of vma_merge(), could you move the vma_prev() call down below the
point of no return, after vma_iter_prealloc()? Or does
vma_iter_prealloc() require that the iterator is already in the insert
position?

> I will audit these areas and CC you on the result.

Thanks!
Liam R. Howlett Aug. 16, 2023, 7:18 p.m. UTC | #4
* Jann Horn <jannh@google.com> [230816 13:13]:
> On Wed, Aug 16, 2023 at 6:18 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > * Jann Horn <jannh@google.com> [230815 15:37]:
> > > commit 18b098af2890 ("vma_merge: set vma iterator to correct
> > > position.") added a vma_prev(vmi) call to vma_merge() at a point where
> > > it's still possible to bail out. My understanding is that this moves
> > > the VMA iterator back by one VMA.
> > >
> > > If you patch some extra logging into the kernel and inject a fake
> > > out-of-memory error at the vma_iter_prealloc() call in vma_split() (a
> > > real out-of-memory error there is very unlikely to happen in practice,
> > > I think - my understanding is that the kernel will basically kill
> > > every process on the system except for init before it starts failing
> > > GFP_KERNEL allocations that fit within a single slab, unless the
> > > allocation uses GFP_ACCOUNT or stuff like that, which the maple tree
> > > doesn't):
> [...]
> > > then you'll get this fun log output, showing that the same VMA
> > > (ffff88810c0b5e00) was visited by two iterations of the VMA iteration
> > > loop, and on the second iteration, prev==vma:
> > >
> > > [  326.765586] userfaultfd_register: begin vma iteration
> > > [  326.766985] userfaultfd_register: prev=ffff88810c0b5ef0,
> > > vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> > > [  326.768786] userfaultfd_register: vma_merge returned 0000000000000000
> > > [  326.769898] userfaultfd_register: prev=ffff88810c0b5e00,
> > > vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> > >
> > > I don't know if this can lead to anything bad but it seems pretty
> > > clearly unintended?
> >
> > Yes, unintended.
> >
> > So we are running out of memory, but since vma_merge() doesn't
> > differentiate between failure and 'nothing to merge', we end up in a
> > situation that we will revisit the same VMA.
> >
> > I've been thinking about a way to work this into the interface and I
> > don't see a clean way because we (could) do different things before the
> > call depending on the situation.
> >
> > I think we need to undo any vma iterator changes in the failure
> > scenarios if there is a chance of the iterator continuing to be used,
> > which is probably not limited to just this case.
> 
> I don't fully understand the maple tree interface - in the specific
> case of vma_merge(), could you move the vma_prev() call down below the
> point of no return, after vma_iter_prealloc()? Or does
> vma_iter_prealloc() require that the iterator is already in the insert
> position?

Yes, but maybe it shouldn't.  I detect a write going beyond the end of a
node and take corrective action, but not to the front of a node.

If I change the internal code to figure out the preallocations without
being pointed at the insert location, I still cannot take corrective
action on failure since I don't know where I should have been within the
tree structure, that is, I have lost the original range.

I'm still looking at this, but I'm wondering if I should change my
interface for preallocations so I can handle this internally.  That
would be a bigger change.

> 
> > I will audit these areas and CC you on the result.
> 
> Thanks!
Liam R. Howlett Sept. 22, 2023, 4:19 p.m. UTC | #5
* Liam R. Howlett <Liam.Howlett@Oracle.com> [230816 15:18]:
> * Jann Horn <jannh@google.com> [230816 13:13]:
> > On Wed, Aug 16, 2023 at 6:18 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > > * Jann Horn <jannh@google.com> [230815 15:37]:
> > > > commit 18b098af2890 ("vma_merge: set vma iterator to correct
> > > > position.") added a vma_prev(vmi) call to vma_merge() at a point where
> > > > it's still possible to bail out. My understanding is that this moves
> > > > the VMA iterator back by one VMA.
> > > >
> > > > If you patch some extra logging into the kernel and inject a fake
> > > > out-of-memory error at the vma_iter_prealloc() call in vma_split() (a
> > > > real out-of-memory error there is very unlikely to happen in practice,
> > > > I think - my understanding is that the kernel will basically kill
> > > > every process on the system except for init before it starts failing
> > > > GFP_KERNEL allocations that fit within a single slab, unless the
> > > > allocation uses GFP_ACCOUNT or stuff like that, which the maple tree
> > > > doesn't):
> > [...]
> > > > then you'll get this fun log output, showing that the same VMA
> > > > (ffff88810c0b5e00) was visited by two iterations of the VMA iteration
> > > > loop, and on the second iteration, prev==vma:
> > > >
> > > > [  326.765586] userfaultfd_register: begin vma iteration
> > > > [  326.766985] userfaultfd_register: prev=ffff88810c0b5ef0,
> > > > vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> > > > [  326.768786] userfaultfd_register: vma_merge returned 0000000000000000
> > > > [  326.769898] userfaultfd_register: prev=ffff88810c0b5e00,
> > > > vma=ffff88810c0b5e00 (0000000000101000-0000000000102000)
> > > >
> > > > I don't know if this can lead to anything bad but it seems pretty
> > > > clearly unintended?
> > >
> > > Yes, unintended.
> > >
> > > So we are running out of memory, but since vma_merge() doesn't
> > > differentiate between failure and 'nothing to merge', we end up in a
> > > situation that we will revisit the same VMA.
> > >
> > > I've been thinking about a way to work this into the interface and I
> > > don't see a clean way because we (could) do different things before the
> > > call depending on the situation.
> > >
> > > I think we need to undo any vma iterator changes in the failure
> > > scenarios if there is a chance of the iterator continuing to be used,
> > > which is probably not limited to just this case.
> > 
> > I don't fully understand the maple tree interface - in the specific
> > case of vma_merge(), could you move the vma_prev() call down below the
> > point of no return, after vma_iter_prealloc()? Or does
> > vma_iter_prealloc() require that the iterator is already in the insert
> > position?
> 
> Yes, but maybe it shouldn't.  I detect a write going beyond the end of a
> node and take corrective action, but not to the front of a node.
> 
> If I change the internal code to figure out the preallocations without
> being pointed at the insert location, I still cannot take corrective
> action on failure since I don't know where I should have been within the
> tree structure, that is, I have lost the original range.
> 
> I'm still looking at this, but I'm wondering if I should change my
> interface for preallocations so I can handle this internally.  That
> would be a bigger change.
> 
> > 
> > > I will audit these areas and CC you on the result.

Looking at this, I think it's best to make a label and undo the
vma_prev() with a vma_next() - at least for now.

I'm also reading this for the error path on dup_anon_vma() failure, and
it appears to also have an issue which I'd like to point out here before
I send the fix for the first issue.

-----------
                vma_start_write(next);                                                                                  
                remove = next;                          /* case 1 */                                                    
                vma_end = next->vm_end;                                                                                 
                err = dup_anon_vma(prev, next);                                                                         
                if (curr) {                             /* case 6 */                                                    
                        vma_start_write(curr);                                                                          
                        remove = curr;                                                                                  
                        remove2 = next;                                                                                 
                        if (!next->anon_vma)                                                                            
                                err = dup_anon_vma(prev, curr);  
-----------

Since dup_anon_vma() can fail, I think here in case 6 we could overwrite
the failure.

That is, we will fail to clone the anon vma and mask the failure if we
are running case 6 with an anon in next.  Once the first dup_anon_vma()
returns error, the next call to clone curr vma may return 0 if there is
no anon vma (this, I think _must_ be the case). Then we are in a
situation where we will be removing next and expanding prev over curr
and next, but have not dup'ed the anon vma from next.


Thanks,
Liam
Liam R. Howlett Sept. 22, 2023, 5:52 p.m. UTC | #6
...
> 
> Looking at this, I think it's best to make a label and undo the
> vma_prev() with a vma_next() - at least for now.
> 
> I'm also reading this for the error path on dup_anon_vma() failure, and
> it appears to also have an issue which I'd like to point out here before
> I send the fix for the first issue.
> 
> -----------
>                 vma_start_write(next);                                                                                  
>                 remove = next;                          /* case 1 */                                                    
>                 vma_end = next->vm_end;                                                                                 
>                 err = dup_anon_vma(prev, next);                                                                         
>                 if (curr) {                             /* case 6 */                                                    
>                         vma_start_write(curr);                                                                          
>                         remove = curr;                                                                                  
>                         remove2 = next;                                                                                 
>                         if (!next->anon_vma)                                                                            
>                                 err = dup_anon_vma(prev, curr);  
> -----------
> 
> Since dup_anon_vma() can fail, I think here in case 6 we could overwrite
> the failure.
> 
> That is, we will fail to clone the anon vma and mask the failure if we
> are running case 6 with an anon in next.  Once the first dup_anon_vma()
> returns error, the next call to clone curr vma may return 0 if there is
> no anon vma (this, I think _must_ be the case). Then we are in a
> situation where we will be removing next and expanding prev over curr
> and next, but have not dup'ed the anon vma from next.
> 

I think I am incorrect in the error being overwritten because we won't
call dup_anon_vma(prev, curr) if the source of the previous call (next)
has an anon vma.

Thanks,
Liam
Jann Horn Sept. 22, 2023, 6:02 p.m. UTC | #7
On Fri, Sep 22, 2023 at 7:52 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> ...
> >
> > Looking at this, I think it's best to make a label and undo the
> > vma_prev() with a vma_next() - at least for now.
> >
> > I'm also reading this for the error path on dup_anon_vma() failure, and
> > it appears to also have an issue which I'd like to point out here before
> > I send the fix for the first issue.
> >
> > -----------
> >                 vma_start_write(next);
> >                 remove = next;                          /* case 1 */
> >                 vma_end = next->vm_end;
> >                 err = dup_anon_vma(prev, next);
> >                 if (curr) {                             /* case 6 */
> >                         vma_start_write(curr);
> >                         remove = curr;
> >                         remove2 = next;
> >                         if (!next->anon_vma)
> >                                 err = dup_anon_vma(prev, curr);
> > -----------
> >
> > Since dup_anon_vma() can fail, I think here in case 6 we could overwrite
> > the failure.
> >
> > That is, we will fail to clone the anon vma and mask the failure if we
> > are running case 6 with an anon in next.  Once the first dup_anon_vma()
> > returns error, the next call to clone curr vma may return 0 if there is
> > no anon vma (this, I think _must_ be the case). Then we are in a
> > situation where we will be removing next and expanding prev over curr
> > and next, but have not dup'ed the anon vma from next.
> >
>
> I think I am incorrect in the error being overwritten because we won't
> call dup_anon_vma(prev, curr) if the source of the previous call (next)
> has an anon vma.

Hm, yeah. It looks pretty dodgy and I guess it could use a comment,
but as you said, it seems to actually not be a problem...

We could do "err |= dup_anon_vma(...)" there for clarity instead, as
long as the only thing we care about is whether we have a nonzero
error...
diff mbox series

Patch

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..a7be4d6a5db6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1454,9 +1454,16 @@  static int userfaultfd_register(struct
userfaultfd_ctx *ctx,
                prev = vma;

        ret = 0;
+       if (strcmp(current->comm, "FAILME") == 0)
+               pr_warn("%s: begin vma iteration\n", __func__);
        for_each_vma_range(vmi, vma, end) {
                cond_resched();

+               if (strcmp(current->comm, "FAILME") == 0) {
+                       pr_warn("%s: prev=%px, vma=%px (%016lx-%016lx)\n",
+                               __func__, prev, vma, vma->vm_start,
vma->vm_end);
+               }
+
                BUG_ON(!vma_can_userfault(vma, vm_flags));
                BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                       vma->vm_userfaultfd_ctx.ctx != ctx);
@@ -1481,6 +1488,8 @@  static int userfaultfd_register(struct
userfaultfd_ctx *ctx,
                                 vma_policy(vma),
                                 ((struct vm_userfaultfd_ctx){ ctx }),
                                 anon_vma_name(vma));
+               if (strcmp(current->comm, "FAILME") == 0)
+                       pr_warn("%s: vma_merge returned %px\n", __func__, prev);
                if (prev) {
                        /* vma_merge() invalidated the mas */
                        vma = prev;
diff --git a/mm/mmap.c b/mm/mmap.c
index 3937479d0e07..fd435c40c43d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -990,7 +990,7 @@  struct vm_area_struct *vma_merge(struct
vma_iterator *vmi, struct mm_struct *mm,
        if (err)
                return NULL;

-       if (vma_iter_prealloc(vmi))
+       if (strcmp(current->comm, "FAILME")==0 || vma_iter_prealloc(vmi))
                return NULL;