diff mbox series

mm, gpu: fix error when FOLL_MLOCK an unpresent page

Message ID 20210829011953.9051-1-laoar.shao@gmail.com (mailing list archive)
State New
Headers show
Series mm, gpu: fix error when FOLL_MLOCK an unpresent page | expand

Commit Message

Yafang Shao Aug. 29, 2021, 1:19 a.m. UTC
There was an error occurred in my server as follows,
[137830.506483] general protection fault: 0000 [#2] SMP NOPTI
[137830.506488] CPU: 5 PID: 8500 Comm: user Tainted: P      D    OE     4.18.0-22-generic #23~18.04.1-Ubuntu
[137830.506491] Hardware name: Dell Inc. Precision 5820 Tower/06JWJY, BIOS 2.8.0 01/15/2021
[137830.506498] RIP: 0010:tlp_write+0x16a/0x1d0 [tlp]
[137830.506500] Code: 25 00 00 00 4c 89 e2 4c 89 e8 48 2b 05 5f ad f1 cb 48 c1 f8 06 48 c1 e0 0c 48 03 05 60 ad f1 cb 48 89 05 81 22 00 00 48 89 c7 <f3> a4 48 c7 c7 50 81 8a c0 48 8b 35 6e 22 00 00 e8 e4 02 e5 ca 48
[137830.506545] RSP: 0018:ffffa79708cfbe10 EFLAGS: 00010207
[137830.506549] RAX: 0004098280000000 RBX: 000000000000000e RCX: 0000000000000025
[137830.506552] RDX: 00007f3fdcf77000 RSI: ffffffffc08a8128 RDI: 0004098280000000
[137830.506554] RBP: ffffa79708cfbe88 R08: 000000000000046b R09: 000000000000000f
[137830.506557] R10: ffff9c4280000738 R11: ffffffff8cf8a80e R12: 00007f3fdcf77000
[137830.506559] R13: 0000000000000000 R14: ffff9c52a2e13b80 R15: ffff9c5299e29800
[137830.506562] FS:  00007f3fdcf414c0(0000) GS:ffff9c52df740000(0000) knlGS:0000000000000000
[137830.506564] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[137830.506567] CR2: 00007f3fdcf78000 CR3: 0000001020b66002 CR4: 00000000003606e0
[137830.506570] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[137830.506572] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[137830.506574] Call Trace:
[137830.506584]  __vfs_write+0x1b/0x40
[137830.506587]  vfs_write+0xb1/0x1a0
[137830.506590]  ksys_write+0x55/0xc0
[137830.506594]  __x64_sys_write+0x1a/0x20
[137830.506600]  do_syscall_64+0x5a/0x120
[137830.506608]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[137830.506611] RIP: 0033:0x7f3fdca71154
[137830.506612] Code: 89 02 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 8d 05 b1 07 2e 00 8b 00 85 c0 75 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 f3 c3 66 90 41 54 55 49 89 d4 53 48 89 f5
[137830.506656] RSP: 002b:00007ffe7efbb5f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[137830.506659] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f3fdca71154
[137830.506661] RDX: 000000000000000e RSI: 00007f3fdcf78000 RDI: 0000000000000003
[137830.506663] RBP: 00007ffe7efbb640 R08: 0000000000000000 R09: 0000000000000000
[137830.506665] R10: 0000000000000000 R11: 0000000000000246 R12: 0000562b09bbd810
[137830.506667] R13: 00007ffe7efbb720 R14: 0000000000000000 R15: 0000000000000000
[137830.506670] Modules linked in: tlp(OE) snd_hda_codec_hdmi binfmt_misc nls_iso8859_1 nvidia_uvm(OE) nvidia_drm(POE) nvidia_modeset(POE) intel_rapl skx_edac nfit nvidia(POE) x86_pkg_temp_thermal intel_powerclamp coretemp snd_hda_codec_realtek snd_hda_codec_generic kvm_intel kvm joydev irqbypass crct10dif_pclmul snd_hda_intel crc32_pclmul ghash_clmulni_intel pcbc snd_hda_codec snd_hda_core snd_hwdep snd_pcm dell_wmi sparse_keymap aesni_intel wmi_bmof dell_smbios video aes_x86_64 crypto_simd snd_seq_midi cryptd snd_seq_midi_event intel_wmi_thunderbolt glue_helper dell_wmi_descriptor snd_rawmidi dcdbas intel_cstate drm_kms_helper snd_seq input_leds dell_smm_hwmon intel_rapl_perf drm serio_raw snd_seq_device fb_sys_fops snd_timer syscopyarea sysfillrect sysimgblt snd mei_me soundcore mei ioatdma wmi dca
[137830.506738]  acpi_tad mac_hid sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 hid_generic usbhid hid uas usb_storage nvme nvme_core vmd ahci e1000e libahci
[137830.506761] ---[ end trace a751f7202b6306cc ]---
[137830.507839] RIP: 0010:tlp_write+0x16a/0x1d0 [tlp]
[137830.507842] Code: 25 00 00 00 4c 89 e2 4c 89 e8 48 2b 05 5f ad f1 cb 48 c1 f8 06 48 c1 e0 0c 48 03 05 60 ad f1 cb 48 89 05 81 22 00 00 48 89 c7 <f3> a4 48 c7 c7 50 81 8a c0 48 8b 35 6e 22 00 00 e8 e4 02 e5 ca 48
[137830.507886] RSP: 0018:ffffa79709cf7e10 EFLAGS: 00010207
[137830.507889] RAX: 0004098280000000 RBX: 000000000000000e RCX: 0000000000000025
[137830.507891] RDX: 00007f1b78242000 RSI: ffffffffc08a8128 RDI: 0004098280000000
[137830.507893] RBP: ffffa79709cf7e88 R08: 0000000000000437 R09: 0000000000000004
[137830.507896] R10: ffff9c4280000e08 R11: ffffffff8cf8a80d R12: 00007f1b78242000
[137830.507898] R13: 0000000000000000 R14: ffff9c5289665940 R15: ffff9c52d52f4400
[137830.507900] FS:  00007f3fdcf414c0(0000) GS:ffff9c52df740000(0000) knlGS:0000000000000000
[137830.507903] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[137830.507905] CR2: 00007f3fdcf78000 CR3: 0000001020b66002 CR4: 00000000003606e0
[137830.507907] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[137830.507909] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

As we can see, the error was caused by writing an unpresent page in my
kernel module 'tlp'.

After some analyzation, I found it was caused by a bug in GUP.
When the kernel module calls get_user_pages() with FOLL_MLOCK being set but
FOLL_POPULATE being unset, if the page of the user addr isn't present, the
kernel won't fault in the user page. But get_user_pages() still count
this unpresent page as a present page and then returns a wrong number of
pinned pages. Then the kernel module will think it has already pinned the
page successfully and do the DMA in the unpresent page. At the last the
above error will occur.

After this change, the above error disappears.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/gup.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

Comments

Christoph Hellwig Aug. 30, 2021, 7 a.m. UTC | #1
On Sun, Aug 29, 2021 at 01:19:53AM +0000, Yafang Shao wrote:
> After some analyzation, I found it was caused by a bug in GUP.
> When the kernel module calls get_user_pages() with FOLL_MLOCK being set but
> FOLL_POPULATE being unset, if the page of the user addr isn't present, the

Which is not a valid way to call get_user_pages.  What we need to do is
to reject that case.  No-tree user does this so that bug is what ever
crap out of tree code you're using.
Yafang Shao Aug. 30, 2021, 9:12 a.m. UTC | #2
On Mon, Aug 30, 2021 at 3:00 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Sun, Aug 29, 2021 at 01:19:53AM +0000, Yafang Shao wrote:
> > After some analyzation, I found it was caused by a bug in GUP.
> > When the kernel module calls get_user_pages() with FOLL_MLOCK being set but
> > FOLL_POPULATE being unset, if the page of the user addr isn't present, the
>
> Which is not a valid way to call get_user_pages.  What we need to do is
> to reject that case.

Do you mean below change ?

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..10f7d6f2ad7b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2855,7 +2855,7 @@ struct page *follow_page(struct vm_area_struct
*vma, unsigned long address,
 #define FOLL_NUMA      0x200   /* force NUMA hinting page fault */
 #define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
 #define FOLL_TRIED     0x800   /* a retry, previous pass started an IO */
-#define FOLL_MLOCK     0x1000  /* lock present pages */
+#define FOLL_MLOCK     0x1000  /* lock present pages, must be set
with FOLL_POPULATE */
 #define FOLL_REMOTE    0x2000  /* we are working on non-current tsk/mm */
 #define FOLL_COW       0x4000  /* internal GUP flag */
 #define FOLL_ANON      0x8000  /* don't do file mappings */
diff --git a/mm/gup.c b/mm/gup.c
index b94717977d17..dfdc0654f7a5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -929,9 +929,9 @@ static int faultin_page(struct vm_area_struct *vma,
        unsigned int fault_flags = 0;
        vm_fault_t ret;

-       /* mlock all present pages, but do not fault in new pages */
-       if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
-               return -ENOENT;
+       /* FOLL_MLOCK must be set with FOLL_POPULATE */
+       BUG_ON((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK);
+
        if (*flags & FOLL_WRITE)
                fault_flags |= FAULT_FLAG_WRITE;
        if (*flags & FOLL_REMOTE)
@@ -1181,8 +1181,6 @@ static long __get_user_pages(struct mm_struct *mm,
                        case -ENOMEM:
                        case -EHWPOISON:
                                goto out;
-                       case -ENOENT:
-                               goto next_page;
                        }
                        BUG();
                } else if (PTR_ERR(page) == -EEXIST) {
@@ -1823,6 +1821,10 @@ static long __gup_longterm_locked(struct mm_struct *mm,

 static bool is_valid_gup_flags(unsigned int gup_flags)
 {
+       /* FOLL_MLOCK must be set with FOLL_POPULATE */
+       if (WARN_ON_ONCE((gup_flags & (FOLL_POPULATE | FOLL_MLOCK)) ==
FOLL_MLOCK))
+               return false;
+
        /*
         * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
         * never directly by the caller, so enforce that with an assertion:


>  No-tree user does this so that bug is what ever
> crap out of tree code you're using.

populate_vma_page_range() may trigger this bug, but I haven't verified it yet.

populate_vma_page_range
    gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
    if (vma->vm_flags & VM_LOCKONFAULT)
        gup_flags &= ~FOLL_POPULATE;  // FOLL_MLOCK without FOLL_POPULATE then.

    __get_user_pages(..., gup_flags, ...);
Christoph Hellwig Aug. 30, 2021, 10:08 a.m. UTC | #3
On Mon, Aug 30, 2021 at 05:12:32PM +0800, Yafang Shao wrote:
> > Which is not a valid way to call get_user_pages.  What we need to do is
> > to reject that case.
> 
> Do you mean below change ?

Sory of.  I think once touching this we should do a few more cleanups
including making many of the flags private to gup.c.  I'll try to find
some time to post a more complete series.
Yafang Shao Aug. 30, 2021, 10:55 a.m. UTC | #4
On Mon, Aug 30, 2021 at 6:08 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Mon, Aug 30, 2021 at 05:12:32PM +0800, Yafang Shao wrote:
> > > Which is not a valid way to call get_user_pages.  What we need to do is
> > > to reject that case.
> >
> > Do you mean below change ?
>
> Sory of.  I think once touching this we should do a few more cleanups
> including making many of the flags private to gup.c.  I'll try to find
> some time to post a more complete series.

JFYI, below test case can also hit the bug I reported above.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define LEN 4096

int main()
{
        char *addr;
        int ret;

        addr = mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_PRIVATE |
MAP_ANON , -1, 0);
        if (addr == MAP_FAILED) {
                perror("mmap");
                return ret;
        }

        /*
         * MLOCK_ONFAULT  will hit below if condition.
         *  if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
         *       return -ENOENT;
         */
        ret = mlock2(addr, LEN, MLOCK_ONFAULT);
//      ret = mlock2(addr, LEN, 0);
        if (ret < 0) {
                perror("mlock2");
                return ret;
        }

        return 0;
}
Kirill A . Shutemov Aug. 31, 2021, 1:36 a.m. UTC | #5
On Mon, Aug 30, 2021 at 06:55:02PM +0800, Yafang Shao wrote:
> On Mon, Aug 30, 2021 at 6:08 PM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > On Mon, Aug 30, 2021 at 05:12:32PM +0800, Yafang Shao wrote:
> > > > Which is not a valid way to call get_user_pages.  What we need to do is
> > > > to reject that case.
> > >
> > > Do you mean below change ?
> >
> > Sory of.  I think once touching this we should do a few more cleanups
> > including making many of the flags private to gup.c.  I'll try to find
> > some time to post a more complete series.
> 
> JFYI, below test case can also hit the bug I reported above.

How does the bug manifests with the test case? I don't see any crash with
it in my setup.

Or do you mean you can hit __get_user_pages() with FOLL_MLOCK, but without
FOLL_POPULATE?

My guess is that you have wrong expectation from GUP: it will return a
number of pages it advanced in the mapping, not number of present pages
there. For your case it means that the array of pages can have gaps and
it's okay.

Fill the array with zeros before calling GUP and check if the entry is
non-NULL before dereferencing it.

> 
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <sys/mman.h>
> 
> #define LEN 4096
> 
> int main()
> {
>         char *addr;
>         int ret;
> 
>         addr = mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_PRIVATE |
> MAP_ANON , -1, 0);
>         if (addr == MAP_FAILED) {
>                 perror("mmap");
>                 return ret;
>         }
> 
>         /*
>          * MLOCK_ONFAULT  will hit below if condition.
>          *  if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
>          *       return -ENOENT;
>          */
>         ret = mlock2(addr, LEN, MLOCK_ONFAULT);
> //      ret = mlock2(addr, LEN, 0);
>         if (ret < 0) {
>                 perror("mlock2");
>                 return ret;
>         }
> 
>         return 0;
> }
> 
> -- 
> Thanks
> Yafang
>
Yafang Shao Aug. 31, 2021, 12:32 p.m. UTC | #6
On Tue, Aug 31, 2021 at 9:36 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Mon, Aug 30, 2021 at 06:55:02PM +0800, Yafang Shao wrote:
> > On Mon, Aug 30, 2021 at 6:08 PM Christoph Hellwig <hch@infradead.org> wrote:
> > >
> > > On Mon, Aug 30, 2021 at 05:12:32PM +0800, Yafang Shao wrote:
> > > > > Which is not a valid way to call get_user_pages.  What we need to do is
> > > > > to reject that case.
> > > >
> > > > Do you mean below change ?
> > >
> > > Sory of.  I think once touching this we should do a few more cleanups
> > > including making many of the flags private to gup.c.  I'll try to find
> > > some time to post a more complete series.
> >
> > JFYI, below test case can also hit the bug I reported above.
>
> How does the bug manifests with the test case? I don't see any crash with
> it in my setup.
>
> Or do you mean you can hit __get_user_pages() with FOLL_MLOCK, but without
> FOLL_POPULATE?
>

Right, that is what I meant.

> My guess is that you have wrong expectation from GUP: it will return a
> number of pages it advanced in the mapping, not number of present pages
> there. For your case it means that the array of pages can have gaps and
> it's okay.
>

Right, I misunderstood __get_user_pages().  Thanks for the explanation.

> Fill the array with zeros before calling GUP and check if the entry is
> non-NULL before dereferencing it.
>

Sure.

> >
> > #define _GNU_SOURCE
> > #include <stdio.h>
> > #include <sys/mman.h>
> >
> > #define LEN 4096
> >
> > int main()
> > {
> >         char *addr;
> >         int ret;
> >
> >         addr = mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_PRIVATE |
> > MAP_ANON , -1, 0);
> >         if (addr == MAP_FAILED) {
> >                 perror("mmap");
> >                 return ret;
> >         }
> >
> >         /*
> >          * MLOCK_ONFAULT  will hit below if condition.
> >          *  if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
> >          *       return -ENOENT;
> >          */
> >         ret = mlock2(addr, LEN, MLOCK_ONFAULT);
> > //      ret = mlock2(addr, LEN, 0);
> >         if (ret < 0) {
> >                 perror("mlock2");
> >                 return ret;
> >         }
> >
> >         return 0;
> > }
> >
> > --
> > Thanks
> > Yafang
> >
>
> --
>  Kirill A. Shutemov
diff mbox series

Patch

diff --git a/mm/gup.c b/mm/gup.c
index b94717977d17..bb0edfd77ca4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1098,6 +1098,7 @@  static long __get_user_pages(struct mm_struct *mm,
 	long ret = 0, i = 0;
 	struct vm_area_struct *vma = NULL;
 	struct follow_page_context ctx = { NULL };
+	unsigned long unpresent = 0;
 
 	if (!nr_pages)
 		return 0;
@@ -1182,6 +1183,8 @@  static long __get_user_pages(struct mm_struct *mm,
 			case -EHWPOISON:
 				goto out;
 			case -ENOENT:
+				/* ctx.page_mask is 0 */
+				unpresent += 1;
 				goto next_page;
 			}
 			BUG();
@@ -1216,7 +1219,7 @@  static long __get_user_pages(struct mm_struct *mm,
 out:
 	if (ctx.pgmap)
 		put_dev_pagemap(ctx.pgmap);
-	return i ? i : ret;
+	return (i - unpresent) ? (i - unpresent) : ret;
 }
 
 static bool vma_permits_fault(struct vm_area_struct *vma,