
post-copy is broken?

Message ID 20160413114103.GB2270@work-vm (mailing list archive)
State New, archived

Commit Message

Dr. David Alan Gilbert April 13, 2016, 11:41 a.m. UTC
* Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
> * Li, Liang Z (liang.z.li@intel.com) wrote:
> > > > > I used the latest qemu code (commit id: 4e71220387e88a22) and kernel
> > > > > (v4.5) to test post-copy, and found that the guest crashes after
> > > > > live migration, whether I do a local live migration or a live
> > > > > migration between two hosts. I just ran stress as the workload in the
> > > > > guest. It seems post-copy is broken?
> > > > >
> > > > > Stress parameters:  stress --vm 2 --vm-hang 1 --vm-bytes 2048M --vm-keep
> > > > > QEMU parameters: ./qemu-system-x86_64 --enable-kvm -smp 4 -m 8192
> > > > >   -monitor stdio -drive file=/share/centos6u6.qcow
> > > >
> > > > My test seems to be working here (4.4.6-301.fc23 kernel) same qemu
> > > > version.
> > > > This is with an f20 guest running google stressapptest.
> > > >
> > > > What's your last working version?
> > > >
> > > 
> > > This is my first try of post-copy since the related patches were merged.
> > > I will double check and get back to you.
> > > 
> > > Thanks!
> > > Liang
> > > 
> > > > Dave
> > 
> > I tried the v4.4 upstream kernel and the issue disappeared. It must be some change between kernel v4.4 and v4.5
> > that breaks post-copy.
> 
> Oh, fun.  cc'ing in Andrea.

OK, I can confirm this bug on Fedora24 (4.5.0-302);  see below for
the postcopy test I've written that I intend to add to qemu; it works
on my f23 host but not in f24.

Dave


From 304829b6414dbd070b08ff03c1f155d229b5c492 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Wed, 13 Apr 2016 12:35:41 +0100
Subject: [PATCH] test: Postcopy

This is a postcopy test (x86 only) that actually runs the guest
and checks the memory contents.

The test runs from an x86 boot block with the hex embedded in the test;
the source for this is:

...........

.code16
.org 0x7c00
	.file	"fill.s"
	.text
	.globl	start
	.type	start, @function
start:             # at 0x7c00 ?
        cli
        lgdt gdtdesc
        mov $1,%eax
        mov %eax,%cr0  # Protected mode enable
        data32 ljmp $8,$0x7c20

.org 0x7c20
.code32
        # A20 enable - not sure I actually need this
        inb $0x92,%al
        or  $2,%al
        outb %al, $0x92

        # set up DS for the whole of RAM (needed on KVM)
        mov $16,%eax
        mov %eax,%ds

        # write an 'A' (65) to the serial port at 0x3f8
        mov $65,%ax
        mov $0x3f8,%dx
        outb %al,%dx

        # bl keeps a counter so we limit the output speed
        mov $0, %bl
mainloop:
        # Start from 1MB
        mov $(1024*1024),%eax
innerloop:
        incb (%eax)
        add $4096,%eax
        cmp $(100*1024*1024),%eax
        jl innerloop

        inc %bl
        jnz mainloop

        # after 256 passes over the memory, write a 'B' (66) to the serial port
        mov $66,%ax
        mov $0x3f8,%dx
        outb %al,%dx

	jmp mainloop

        # GDT magic from old (GPLv2)  Grub startup.S
        .p2align        2       /* force 4-byte alignment */
gdt:
        .word   0, 0
        .byte   0, 0, 0, 0

        /* -- code segment --
         * base = 0x00000000, limit = 0xFFFFF (4 KiB Granularity), present
         * type = 32bit code execute/read, DPL = 0
         */
        .word   0xFFFF, 0
        .byte   0, 0x9A, 0xCF, 0

        /* -- data segment --
         * base = 0x00000000, limit 0xFFFFF (4 KiB Granularity), present
         * type = 32 bit data read/write, DPL = 0
         */
        .word   0xFFFF, 0
        .byte   0, 0x92, 0xCF, 0

gdtdesc:
        .word   0x27                    /* limit */
        .long   gdt                     /* addr */

/* I'm a bootable disk */
.org 0x7dfe
        .byte 0x55
        .byte 0xAA

...........

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 tests/Makefile        |   1 +
 tests/postcopy-test.c | 419 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 420 insertions(+)
 create mode 100644 tests/postcopy-test.c

Comments

Dr. David Alan Gilbert April 13, 2016, 12:50 p.m. UTC | #1
* Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:

> +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {

Ahem, that should be 256.
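
For the record, a standalone illustration of the off-by-one (this is not the
test code itself; it just assumes the bytes behave as the 8-bit wrapping
counters the guest loop produces):

#include <stdio.h>

int main(void)
{
    /* The "edge" is where the value drops by one; at the 0xff -> 0x00
     * wrap-around only a mod-256 check recognises it. */
    unsigned char last_byte = 0x00, b = 0xff;
    printf("mod 256: %d\n", ((b + 1) % 256) == last_byte);  /* 1: edge detected */
    printf("mod 255: %d\n", ((b + 1) % 255) == last_byte);  /* 0: missed */
    return 0;
}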

I'm going to bisect the kernel and see where we get to.
Andrea's userfaultfd self-test passes on 4.5, so it's something more
subtle.

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Andrea Arcangeli April 13, 2016, 8:51 p.m. UTC | #2
On Wed, Apr 13, 2016 at 01:50:53PM +0100, Dr. David Alan Gilbert wrote:
> * Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
> 
> > +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
> 
> Ahem, that should be 256.
> 
> I'm going to bisect the kernel and see where we get to.
> Andrea's userfaultfd self-test passes on 2.5, so it's something more
> subtle.
> 

David already tracked down 1df59b8497f47495e873c23abd6d3d290c730505
good and 984065055e6e39f8dd812529e11922374bd39352 bad.

git diff 1df59b8497f47495e873c23abd6d3d290c730505..984065055e6e39f8dd812529e11922374bd39352 fs/userfaultfd.c mm/userfaultfd.c

Nothing that could break it in the diff of the relevant two files.

The only other userfault-related change in this commit range that
comes to mind is in fixup_user_fault, but if that was buggy you
couldn't notice, since postcopy doesn't userfault into futexes, and
the only other user of that is s390.

The next suspect is the massive THP refcounting change that went
upstream recently:

 mm/filemap.c         |   34 +-
 mm/slab.c            |   48 +-
 mm/hugetlb.c         |   51 +-
 mm/util.c            |   55 +-
 mm/vmscan.c          |   56 +-
 mm/swapfile.c        |   57 +-
 mm/internal.h        |   70 +-
 mm/memblock.c        |   73 +-
 mm/mempolicy.c       |   75 ++-
 mm/sparse-vmemmap.c  |   76 ++-
 mm/vmpressure.c      |   78 ++-
 mm/vmstat.c          |   86 ++-
 mm/ksm.c             |   89 +--
 mm/mmap.c            |  106 +--
 mm/memory_hotplug.c  |  107 ++-
 mm/memory-failure.c  |  125 ++--
 mm/memory.c          |  148 ++--
 mm/gup.c             |  172 +++--
 mm/madvise.c         |  201 ++++++
 mm/page_alloc.c      |  205 +++---
 mm/shmem.c           |  289 ++++----
 mm/swap.c            |  319 ++-------
 mm/rmap.c            |  387 +++++++----
 mm/memcontrol.c      |  478 +++++++------
 mm/huge_memory.c     | 1814 ++++++++++++++++++++++++++++++++------------------

As a further debugging hint, can you try disabling THP and see if that
makes the problem go away?

Thanks,
Andrea
Dr. David Alan Gilbert April 14, 2016, 10:13 a.m. UTC | #3
* Andrea Arcangeli (aarcange@redhat.com) wrote:
> On Wed, Apr 13, 2016 at 01:50:53PM +0100, Dr. David Alan Gilbert wrote:
> > * Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
> > 
> > > +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
> > 
> > Ahem, that should be 256.
> > 
> > I'm going to bisect the kernel and see where we get to.
> > Andrea's userfaultfd self-test passes on 2.5, so it's something more
> > subtle.
> > 
> 
> David already tracked down 1df59b8497f47495e873c23abd6d3d290c730505
> good and 984065055e6e39f8dd812529e11922374bd39352 bad.
> 
> git diff 1df59b8497f47495e873c23abd6d3d290c730505..984065055e6e39f8dd812529e11922374bd39352 fs/userfaultfd.c mm/userfaultfd.c
> 
> Nothing that could break it in the diff of the relevant two files.
> 
> The only other userfault related change in this commit range that
> comes to mind is in fixup_user_fault, but if that was buggy you don't
> userfault into futexes with postcopy so you couldn't notice, so the
> only other user of that is s390.
> 
> The next suspect is the massive THP refcounting change that went
> upstream recently:

...

> As further debug hint, can you try to disable THP and see if that
> makes the problem go away?

Yeh, looks like it is THP.
My bisect is currently at 17ec4cd985780a7e30aa45bb8f272237c12502a4
and with that, from a fresh boot, it fails; if I disable THP it works,
and if I re-enable THP back to madvise it fails.

I spotted that at my previous bisect point it failed before I'd done
the next kernel build but worked after I'd done the build (but before
I rebooted!) - so I guess after the build there were no THPs left for it to use.

Dave

> 
> Thanks,
> Andrea
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 14, 2016, 12:34 p.m. UTC | #4
* Andrea Arcangeli (aarcange@redhat.com) wrote:

> The next suspect is the massive THP refcounting change that went
> upstream recently:

> As further debug hint, can you try to disable THP and see if that
> makes the problem go away?

Yep, this seems to be the problem (cc'ing in Kirill).

122afea9626ab3f717b250a8dd3d5ebf57cdb56c - works (just before Kirill disables THP)
61f5d698cc97600e813ca5cf8e449b1ea1c11492 - breaks (when THP is reenabled)

It's pretty reliable; as you say disabling THP makes it work again
and putting it back to THP/madvise mode makes it break.  And you need
to test on a machine with some free ram to make sure THP has a chance
to have happened.

I'm not sure of all of the rework that happened in that series,
but my reading of it is that splitting of THP pages gets deferred;
so I wonder if when I do the madvise to turn THP off, if it's actually
still got THP pages and thus we end up with a whole THP mapped
when I'm expecting to be userfaulting those pages.

Dave

> 
> Thanks,
> Andrea
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Andrea Arcangeli April 14, 2016, 4:22 p.m. UTC | #5
Adding linux-mm too,

On Thu, Apr 14, 2016 at 01:34:41PM +0100, Dr. David Alan Gilbert wrote:
> * Andrea Arcangeli (aarcange@redhat.com) wrote:
> 
> > The next suspect is the massive THP refcounting change that went
> > upstream recently:
> 
> > As further debug hint, can you try to disable THP and see if that
> > makes the problem go away?
> 
> Yep, this seems to be the problem (cc'ing in Kirill).
> 
> 122afea9626ab3f717b250a8dd3d5ebf57cdb56c - works (just before Kirill disables THP)
> 61f5d698cc97600e813ca5cf8e449b1ea1c11492 - breaks (when THP is reenabled)
> 
> It's pretty reliable; as you say disabling THP makes it work again
> and putting it back to THP/madvise mode makes it break.  And you need
> to test on a machine with some free ram to make sure THP has a chance
> to have happened.
> 
> I'm not sure of all of the rework that happened in that series,
> but my reading of it is that splitting of THP pages gets deferred;
> so I wonder if when I do the madvise to turn THP off, if it's actually
> still got THP pages and thus we end up with a whole THP mapped
> when I'm expecting to be userfaulting those pages.

Good thing at least I didn't make UFFDIO_COPY THP aware yet, so there are
fewer variables (no user was interested in handling userfaults at THP
granularity yet, and from userland such an improvement would be
completely invisible in terms of API, so if a user starts doing that
we can just optimize the kernel for it; criu restore could do that, as
its faults will come from disk I/O, whereas when the network is involved
THP userfaults wouldn't have a great tradeoff with regard to the
increased fault latency).

I suspect there is a handle_userfault missing somewhere in connection
with the trans_huge_pmd splits (no longer THP splits) that you're doing
with MADV_DONTNEED to zap those pages in the destination that got
redirtied in the source during the last precopy stage. Or, more simply,
MADV_DONTNEED isn't zapping all the right ptes after the trans huge
pmd got split.

The fact that the page isn't split shouldn't matter too much; all we care
about is that the pte triggers handle_userfault after MADV_DONTNEED.

Unfortunately the userfaultfd testcase in the kernel isn't exercising this
case; that should probably be improved too, so there is a simpler way to
reproduce this than running precopy before postcopy in qemu.
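
Something along these lines (an untested sketch, with error handling dropped
and the sizes, names and 0xAA fill pattern picked arbitrarily) should exercise
the same MADV_DONTNEED-after-split path standalone: populate a 2MB-aligned
anonymous region so it can be THP backed, register it with userfaultfd, zap a
scattered subrange, then check that reads of the zapped pages come back
through UFFDIO_COPY instead of as stale or zero data:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HUGE   (2UL << 20)
#define REGION (4 * HUGE)
#define PAGE   4096UL

static int uffd;

/* Resolve each MISSING fault with a UFFDIO_COPY of a 0xAA-filled page. */
static void *handler(void *opaque)
{
    static char page[PAGE];
    (void)opaque;
    memset(page, 0xAA, PAGE);
    for (;;) {
        struct uffd_msg msg;
        struct pollfd pfd = { .fd = uffd, .events = POLLIN };
        if (poll(&pfd, 1, -1) <= 0 || read(uffd, &msg, sizeof(msg)) <= 0)
            break;
        if (msg.event == UFFD_EVENT_PAGEFAULT) {
            struct uffdio_copy copy = {
                .dst = msg.arg.pagefault.address & ~(PAGE - 1),
                .src = (unsigned long)page,
                .len = PAGE,
            };
            ioctl(uffd, UFFDIO_COPY, &copy);
        }
    }
    return NULL;
}

int main(void)
{
    /* Over-allocate and align to 2MB so the region can really be THP backed. */
    char *raw = mmap(NULL, REGION + HUGE, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    char *area = (char *)(((unsigned long)raw + HUGE - 1) & ~(HUGE - 1));
    pthread_t thread;
    int bad = 0;

    madvise(area, REGION, MADV_HUGEPAGE);
    memset(area, 1, REGION);            /* fault it in, hopefully as THP */

    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    struct uffdio_api api = { .api = UFFD_API };
    ioctl(uffd, UFFDIO_API, &api);
    struct uffdio_register reg = {
        .range = { .start = (unsigned long)area, .len = REGION },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    ioctl(uffd, UFFDIO_REGISTER, &reg);

    pthread_create(&thread, NULL, handler, NULL);

    /* Scattered zap inside a huge page: this splits the trans_huge_pmd. */
    madvise(area + PAGE, 16 * PAGE, MADV_DONTNEED);

    /* Every zapped page must now come back as 0xAA via the handler;
     * anything else means a fault was missed after the pmd split. */
    for (unsigned long off = PAGE; off < 17 * PAGE; off += PAGE)
        bad |= (unsigned char)area[off] != 0xAA;
    printf(bad ? "FAIL\n" : "OK\n");
    return bad;
}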

Thanks,
Andrea
Kirill A. Shutemov April 15, 2016, 12:52 p.m. UTC | #6
On Thu, Apr 14, 2016 at 12:22:30PM -0400, Andrea Arcangeli wrote:
> Adding linux-mm too,
> 
> On Thu, Apr 14, 2016 at 01:34:41PM +0100, Dr. David Alan Gilbert wrote:
> > * Andrea Arcangeli (aarcange@redhat.com) wrote:
> > 
> > > The next suspect is the massive THP refcounting change that went
> > > upstream recently:
> > 
> > > As further debug hint, can you try to disable THP and see if that
> > > makes the problem go away?
> > 
> > Yep, this seems to be the problem (cc'ing in Kirill).
> > 
> > 122afea9626ab3f717b250a8dd3d5ebf57cdb56c - works (just before Kirill disables THP)
> > 61f5d698cc97600e813ca5cf8e449b1ea1c11492 - breaks (when THP is reenabled)
> > 
> > It's pretty reliable; as you say disabling THP makes it work again
> > and putting it back to THP/madvise mode makes it break.  And you need
> > to test on a machine with some free ram to make sure THP has a chance
> > to have happened.
> > 
> > I'm not sure of all of the rework that happened in that series,
> > but my reading of it is that splitting of THP pages gets deferred;
> > so I wonder if when I do the madvise to turn THP off, if it's actually
> > still got THP pages and thus we end up with a whole THP mapped
> > when I'm expecting to be userfaulting those pages.
> 
> Good thing at least I didn't make UFFDIO_COPY THP aware yet so there's
> less variables (as no user was interested to handle userfaults at THP
> granularity yet, and from userland such an improvement would be
> completely invisible in terms of API, so if an user starts doing that
> we can just optimize the kernel for it, criu restore could do that as
> the faults will come from disk-I/O, when network is involved THP
> userfaults wouldn't have a great tradeoff with regard to the increased
> fault latency).
> 
> I suspect there is an handle_userfault missing somewhere in connection
> with trans_huge_pmd splits (not anymore THP splits) that you're doing
> with MADV_DONTNEED to zap those pages in the destination that got
> redirtied in source during the last precopy stage. Or more simply
> MADV_DONTNEED isn't zapping all the right ptes after the trans huge
> pmd got splitted.
> 
> The fact the page isn't splitted shouldn't matter too much, all we care
> about is the pte triggers handle_userfault after MADV_DONTNEED.
> 
> The userfaultfd testcase in the kernel isn't exercising this case
> unfortunately, that should probably be improved too, so there is a
> simpler way to reproduce than running precopy before postcopy in qemu.

I've tested current Linus' tree and v4.5 using qemu postcopy test case for
both x86-64 and i386 and it never failed for me:

/x86_64/postcopy: first_byte = 7e last_byte = 7d hit_edge = 1 OK
OK
/i386/postcopy: first_byte = f6 last_byte = f5 hit_edge = 1 OK
OK

I've run it directly, setting relevant QTEST_QEMU_BINARY.
Dr. David Alan Gilbert April 15, 2016, 1:42 p.m. UTC | #7
* Kirill A. Shutemov (kirill@shutemov.name) wrote:
> On Thu, Apr 14, 2016 at 12:22:30PM -0400, Andrea Arcangeli wrote:
> > Adding linux-mm too,
> > 
> > On Thu, Apr 14, 2016 at 01:34:41PM +0100, Dr. David Alan Gilbert wrote:
> > > * Andrea Arcangeli (aarcange@redhat.com) wrote:
> > > 
> > > > The next suspect is the massive THP refcounting change that went
> > > > upstream recently:
> > > 
> > > > As further debug hint, can you try to disable THP and see if that
> > > > makes the problem go away?
> > > 
> > > Yep, this seems to be the problem (cc'ing in Kirill).
> > > 
> > > 122afea9626ab3f717b250a8dd3d5ebf57cdb56c - works (just before Kirill disables THP)
> > > 61f5d698cc97600e813ca5cf8e449b1ea1c11492 - breaks (when THP is reenabled)
> > > 
> > > It's pretty reliable; as you say disabling THP makes it work again
> > > and putting it back to THP/madvise mode makes it break.  And you need
> > > to test on a machine with some free ram to make sure THP has a chance
> > > to have happened.
> > > 
> > > I'm not sure of all of the rework that happened in that series,
> > > but my reading of it is that splitting of THP pages gets deferred;
> > > so I wonder if when I do the madvise to turn THP off, if it's actually
> > > still got THP pages and thus we end up with a whole THP mapped
> > > when I'm expecting to be userfaulting those pages.
> > 
> > Good thing at least I didn't make UFFDIO_COPY THP aware yet so there's
> > less variables (as no user was interested to handle userfaults at THP
> > granularity yet, and from userland such an improvement would be
> > completely invisible in terms of API, so if an user starts doing that
> > we can just optimize the kernel for it, criu restore could do that as
> > the faults will come from disk-I/O, when network is involved THP
> > userfaults wouldn't have a great tradeoff with regard to the increased
> > fault latency).
> > 
> > I suspect there is an handle_userfault missing somewhere in connection
> > with trans_huge_pmd splits (not anymore THP splits) that you're doing
> > with MADV_DONTNEED to zap those pages in the destination that got
> > redirtied in source during the last precopy stage. Or more simply
> > MADV_DONTNEED isn't zapping all the right ptes after the trans huge
> > pmd got splitted.
> > 
> > The fact the page isn't splitted shouldn't matter too much, all we care
> > about is the pte triggers handle_userfault after MADV_DONTNEED.
> > 
> > The userfaultfd testcase in the kernel isn't exercising this case
> > unfortunately, that should probably be improved too, so there is a
> > simpler way to reproduce than running precopy before postcopy in qemu.
> 
> I've tested current Linus' tree and v4.5 using qemu postcopy test case for
> both x86-64 and i386 and it never failed for me:
> 
> /x86_64/postcopy: first_byte = 7e last_byte = 7d hit_edge = 1 OK
> OK
> /i386/postcopy: first_byte = f6 last_byte = f5 hit_edge = 1 OK
> OK
> 
> I've run it directly, setting relevant QTEST_QEMU_BINARY.

Interesting; it's failing reliably for me - but only with a reasonably
freshly booted machine (so that the pages get THPd).

Dave

> 
> -- 
>  Kirill A. Shutemov
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Kirill A. Shutemov April 15, 2016, 3:23 p.m. UTC | #8
On Fri, Apr 15, 2016 at 02:42:33PM +0100, Dr. David Alan Gilbert wrote:
> * Kirill A. Shutemov (kirill@shutemov.name) wrote:
> > On Thu, Apr 14, 2016 at 12:22:30PM -0400, Andrea Arcangeli wrote:
> > > Adding linux-mm too,
> > > 
> > > On Thu, Apr 14, 2016 at 01:34:41PM +0100, Dr. David Alan Gilbert wrote:
> > > > * Andrea Arcangeli (aarcange@redhat.com) wrote:
> > > > 
> > > > > The next suspect is the massive THP refcounting change that went
> > > > > upstream recently:
> > > > 
> > > > > As further debug hint, can you try to disable THP and see if that
> > > > > makes the problem go away?
> > > > 
> > > > Yep, this seems to be the problem (cc'ing in Kirill).
> > > > 
> > > > 122afea9626ab3f717b250a8dd3d5ebf57cdb56c - works (just before Kirill disables THP)
> > > > 61f5d698cc97600e813ca5cf8e449b1ea1c11492 - breaks (when THP is reenabled)
> > > > 
> > > > It's pretty reliable; as you say disabling THP makes it work again
> > > > and putting it back to THP/madvise mode makes it break.  And you need
> > > > to test on a machine with some free ram to make sure THP has a chance
> > > > to have happened.
> > > > 
> > > > I'm not sure of all of the rework that happened in that series,
> > > > but my reading of it is that splitting of THP pages gets deferred;
> > > > so I wonder if when I do the madvise to turn THP off, if it's actually
> > > > still got THP pages and thus we end up with a whole THP mapped
> > > > when I'm expecting to be userfaulting those pages.
> > > 
> > > Good thing at least I didn't make UFFDIO_COPY THP aware yet so there's
> > > less variables (as no user was interested to handle userfaults at THP
> > > granularity yet, and from userland such an improvement would be
> > > completely invisible in terms of API, so if an user starts doing that
> > > we can just optimize the kernel for it, criu restore could do that as
> > > the faults will come from disk-I/O, when network is involved THP
> > > userfaults wouldn't have a great tradeoff with regard to the increased
> > > fault latency).
> > > 
> > > I suspect there is an handle_userfault missing somewhere in connection
> > > with trans_huge_pmd splits (not anymore THP splits) that you're doing
> > > with MADV_DONTNEED to zap those pages in the destination that got
> > > redirtied in source during the last precopy stage. Or more simply
> > > MADV_DONTNEED isn't zapping all the right ptes after the trans huge
> > > pmd got splitted.
> > > 
> > > The fact the page isn't splitted shouldn't matter too much, all we care
> > > about is the pte triggers handle_userfault after MADV_DONTNEED.
> > > 
> > > The userfaultfd testcase in the kernel isn't exercising this case
> > > unfortunately, that should probably be improved too, so there is a
> > > simpler way to reproduce than running precopy before postcopy in qemu.
> > 
> > I've tested current Linus' tree and v4.5 using qemu postcopy test case for
> > both x86-64 and i386 and it never failed for me:
> > 
> > /x86_64/postcopy: first_byte = 7e last_byte = 7d hit_edge = 1 OK
> > OK
> > /i386/postcopy: first_byte = f6 last_byte = f5 hit_edge = 1 OK
> > OK
> > 
> > I've run it directly, setting relevant QTEST_QEMU_BINARY.
> 
> Interesting; it's failing reliably for me - but only with a reasonably
> freshly booted machine (so that the pages get THPd).

The same here. Freshly booted machine with 64GiB ram. I've checked
/proc/vmstat: huge pages were allocated
Dr. David Alan Gilbert April 15, 2016, 4:34 p.m. UTC | #9
* Kirill A. Shutemov (kirill@shutemov.name) wrote:
> On Fri, Apr 15, 2016 at 02:42:33PM +0100, Dr. David Alan Gilbert wrote:
> > * Kirill A. Shutemov (kirill@shutemov.name) wrote:
> > > On Thu, Apr 14, 2016 at 12:22:30PM -0400, Andrea Arcangeli wrote:
> > > > Adding linux-mm too,
> > > > 
> > > > On Thu, Apr 14, 2016 at 01:34:41PM +0100, Dr. David Alan Gilbert wrote:
> > > > > * Andrea Arcangeli (aarcange@redhat.com) wrote:
> > > > > 
> > > > > > The next suspect is the massive THP refcounting change that went
> > > > > > upstream recently:
> > > > > 
> > > > > > As further debug hint, can you try to disable THP and see if that
> > > > > > makes the problem go away?
> > > > > 
> > > > > Yep, this seems to be the problem (cc'ing in Kirill).
> > > > > 
> > > > > 122afea9626ab3f717b250a8dd3d5ebf57cdb56c - works (just before Kirill disables THP)
> > > > > 61f5d698cc97600e813ca5cf8e449b1ea1c11492 - breaks (when THP is reenabled)
> > > > > 
> > > > > It's pretty reliable; as you say disabling THP makes it work again
> > > > > and putting it back to THP/madvise mode makes it break.  And you need
> > > > > to test on a machine with some free ram to make sure THP has a chance
> > > > > to have happened.
> > > > > 
> > > > > I'm not sure of all of the rework that happened in that series,
> > > > > but my reading of it is that splitting of THP pages gets deferred;
> > > > > so I wonder if when I do the madvise to turn THP off, if it's actually
> > > > > still got THP pages and thus we end up with a whole THP mapped
> > > > > when I'm expecting to be userfaulting those pages.
> > > > 
> > > > Good thing at least I didn't make UFFDIO_COPY THP aware yet so there's
> > > > less variables (as no user was interested to handle userfaults at THP
> > > > granularity yet, and from userland such an improvement would be
> > > > completely invisible in terms of API, so if an user starts doing that
> > > > we can just optimize the kernel for it, criu restore could do that as
> > > > the faults will come from disk-I/O, when network is involved THP
> > > > userfaults wouldn't have a great tradeoff with regard to the increased
> > > > fault latency).
> > > > 
> > > > I suspect there is an handle_userfault missing somewhere in connection
> > > > with trans_huge_pmd splits (not anymore THP splits) that you're doing
> > > > with MADV_DONTNEED to zap those pages in the destination that got
> > > > redirtied in source during the last precopy stage. Or more simply
> > > > MADV_DONTNEED isn't zapping all the right ptes after the trans huge
> > > > pmd got splitted.
> > > > 
> > > > The fact the page isn't splitted shouldn't matter too much, all we care
> > > > about is the pte triggers handle_userfault after MADV_DONTNEED.
> > > > 
> > > > The userfaultfd testcase in the kernel isn't exercising this case
> > > > unfortunately, that should probably be improved too, so there is a
> > > > simpler way to reproduce than running precopy before postcopy in qemu.
> > > 
> > > I've tested current Linus' tree and v4.5 using qemu postcopy test case for
> > > both x86-64 and i386 and it never failed for me:
> > > 
> > > /x86_64/postcopy: first_byte = 7e last_byte = 7d hit_edge = 1 OK
> > > OK
> > > /i386/postcopy: first_byte = f6 last_byte = f5 hit_edge = 1 OK
> > > OK
> > > 
> > > I've run it directly, setting relevant QTEST_QEMU_BINARY.
> > 
> > Interesting; it's failing reliably for me - but only with a reasonably
> > freshly booted machine (so that the pages get THPd).
> 
> The same here. Freshly booted machine with 64GiB ram. I've checked
> /proc/vmstat: huge pages were allocated

Thanks for testing.

Damn; this is confusing now.  I've got a RHEL7 box with 4.6.0-rc3 where it
works, and a fedora24 VM where it fails (the f24 VM is where I did the bisect,
so it works fine with the older kernel on the f24 userspace in that VM).

So let's see:
   works: Kirill's (64GB machine)
          Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7 userspace and kernel config)
   fails: Dave's f24 VM (4GB RAM, 4 vcpu VM on my laptop; f24 userspace and kernel config)

So it's any of userspace, kernel config, machine hardware or hmm.

My f24 box has transparent_hugepage_madvise, whereas my rhel7 has transparent_hugepage_always
(but it still works if I flip it to madvise at run time).  I'll try and get the configs
closer together.

Liang Li: Can you run my test on your setup which fails the migrate and tell
me what your userspace is?

(If you've not built my test yet, you might find you need to add a :
   tests/postcopy-test$(EXESUF): tests/postcopy-test.o

  to the tests/Makefile)


Dave
> 
> -- 
>  Kirill A. Shutemov
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Andrea Arcangeli April 15, 2016, 10:19 p.m. UTC | #10
On Fri, Apr 15, 2016 at 06:23:30PM +0300, Kirill A. Shutemov wrote:
> The same here. Freshly booted machine with 64GiB ram. I've checked
> /proc/vmstat: huge pages were allocated

I tried the test in a loop and I can't reproduce it here.

Tested with gcc 4.9.3 and glibc 2.21 and glibc 2.22 so far,
qemu&kernel/KVM latest upstream (4.6-rc3..).

You can run this in between each invocation to guarantee all memory is
backed by THP (no need of reboot):

# echo 3 >/proc/sys/vm/drop_caches
# echo >/proc/sys/vm/compact_memory

4.5 kernel built with gcc 5.3.1 run on an older userland worked fine
too.

Next thing to test would be if there's something wrong with qemu built
with gcc 5.3.1 if run on top of a 4.4 kernel?
Dr. David Alan Gilbert April 18, 2016, 9:40 a.m. UTC | #11
* Andrea Arcangeli (aarcange@redhat.com) wrote:
> On Fri, Apr 15, 2016 at 06:23:30PM +0300, Kirill A. Shutemov wrote:
> > The same here. Freshly booted machine with 64GiB ram. I've checked
> > /proc/vmstat: huge pages were allocated
> 
> I tried the test in a loop and I can't reproduce it here.
> 
> Tested with gcc 4.9.3 and glibc 2.21 and glibc 2.22 so far,
> qemu&kernel/KVM latest upstream (4.6-rc3..).
> 
> You can run this in between each invocation to guarantee all memory is
> backed by THP (no need of reboot):
> 
> # echo 3 >/proc/sys/vm/drop_caches
> # echo >/proc/sys/vm/compact_memory
> 
> 4.5 kernel built with gcc 5.3.1 run on a older userland worked fine
> too.
> 
> Next thing to test would be if there's something wrong with qemu built
> with gcc 5.3.1 if run on top of a 4.4 kernel?

It's also working for me on f24 (4.5.0-320 packaged kernel) on a real machine;
so we currently have two setups that break:

   a) Liang Li's setup (that breaks with the migrate of a real VM but I don't
                        think we have any details of the setup);
                        works with 4.4.x breaks with 4.5.x

   b) f24 nested with my test, with THP enabled after Kirill's changes.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Liang Li April 18, 2016, 9:50 a.m. UTC | #12
> > > > I've run it directly, setting relevant QTEST_QEMU_BINARY.
> > >
> > > Interesting; it's failing reliably for me - but only with a
> > > reasonably freshly booted machine (so that the pages get THPd).
> >
> > The same here. Freshly booted machine with 64GiB ram. I've checked
> > /proc/vmstat: huge pages were allocated
> 
> Thanks for testing.
> 
> Damn; this is confusing now.  I've got a RHEL7 box with 4.6.0-rc3 on where it
> works, and a fedora24 VM where it fails (the f24 VM is where I did the bisect
> so it works fine with the older kernel on the f24 userspace in that VM).
> 
> So lets see:
>    works: Kirill's (64GB machine)
>           Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7 userspace and kernel
> config)
>    fails: Dave's f24 VM (4GB RAM, 4 vcpus VM on my laptop24 userspace and
> kernel config)
> 
> So it's any of userspace, kernel config, machine hardware or hmm.
> 
> My f24 box has transparent_hugepage_madvise, where my rhel7 has
> transparent_hugepage_always (but still works if I flip it to madvise at run
> time).  I'll try and get the configs closer together.
> 
> Liang Li: Can you run my test on your setup which fails the migrate and tell
> me what your userspace is?
> 
> (If you've not built my test yet, you might find you need to add a :
>    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> 
>   to the tests/Makefile)
> 

Hi Dave,

  How do I build and run your test? I haven't done that before.

Thanks!
Liang

> 
> Dave
> >
> > --
> >  Kirill A. Shutemov
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 18, 2016, 9:55 a.m. UTC | #13
* Li, Liang Z (liang.z.li@intel.com) wrote:
> > > > > I've run it directly, setting relevant QTEST_QEMU_BINARY.
> > > >
> > > > Interesting; it's failing reliably for me - but only with a
> > > > reasonably freshly booted machine (so that the pages get THPd).
> > >
> > > The same here. Freshly booted machine with 64GiB ram. I've checked
> > > /proc/vmstat: huge pages were allocated
> > 
> > Thanks for testing.
> > 
> > Damn; this is confusing now.  I've got a RHEL7 box with 4.6.0-rc3 on where it
> > works, and a fedora24 VM where it fails (the f24 VM is where I did the bisect
> > so it works fine with the older kernel on the f24 userspace in that VM).
> > 
> > So lets see:
> >    works: Kirill's (64GB machine)
> >           Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7 userspace and kernel
> > config)
> >    fails: Dave's f24 VM (4GB RAM, 4 vcpus VM on my laptop24 userspace and
> > kernel config)
> > 
> > So it's any of userspace, kernel config, machine hardware or hmm.
> > 
> > My f24 box has transparent_hugepage_madvise, where my rhel7 has
> > transparent_hugepage_always (but still works if I flip it to madvise at run
> > time).  I'll try and get the configs closer together.
> > 
> > Liang Li: Can you run my test on your setup which fails the migrate and tell
> > me what your userspace is?
> > 
> > (If you've not built my test yet, you might find you need to add a :
> >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > 
> >   to the tests/Makefile)
> > 
> 
> Hi Dave,
> 
>   How to build and run you test? I didn't do that before.

Apply the code in:
http://lists.gnu.org/archive/html/qemu-devel/2016-04/msg02138.html

fix the:
+            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
to:
+            if ( ((b + 1) % 256) == last_byte && !hit_edge) {

add to tests/Makefile:
   tests/postcopy-test$(EXESUF): tests/postcopy-test.o

and do a:
    make check

in qemu.
Then you can rerun the test with:
    QTEST_QEMU_BINARY=path/to/qemu-system-x86_64 ./tests/postcopy-test

if it works, reboot and check it still works from a fresh boot.

Can you describe the system which your full test failed on? What distro on
the host? What type of host was it tested on?

Dave

> 
> Thanks!
> Liang
> 
> > 
> > Dave
> > >
> > > --
> > >  Kirill A. Shutemov
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Liang Li April 18, 2016, 9:58 a.m. UTC | #14
> * Andrea Arcangeli (aarcange@redhat.com) wrote:
> > On Fri, Apr 15, 2016 at 06:23:30PM +0300, Kirill A. Shutemov wrote:
> > > The same here. Freshly booted machine with 64GiB ram. I've checked
> > > /proc/vmstat: huge pages were allocated
> >
> > I tried the test in a loop and I can't reproduce it here.
> >
> > Tested with gcc 4.9.3 and glibc 2.21 and glibc 2.22 so far,
> > qemu&kernel/KVM latest upstream (4.6-rc3..).
> >
> > You can run this in between each invocation to guarantee all memory is
> > backed by THP (no need of reboot):
> >
> > # echo 3 >/proc/sys/vm/drop_caches
> > # echo >/proc/sys/vm/compact_memory
> >
> > 4.5 kernel built with gcc 5.3.1 run on a older userland worked fine
> > too.
> >
> > Next thing to test would be if there's something wrong with qemu built
> > with gcc 5.3.1 if run on top of a 4.4 kernel?
> 
> It's also working for me on f24 (4.5.0-320 packaged kernel) on a real machine;
> so we currently have two sets that break:
> 
>    a) Liang Li's setup (that breaks with the migrate of a real VM but I don't
>                         think we have any details of the setup);
>                         works with 4.4.x breaks with 4.5.x
> 

The host OS is CentOS 7; I just replaced the kernel with 4.4/4.5. I can provide the kernel .config
if you think it's helpful.
The guest OS is CentOS 6.6, 2.6.32-504.el6.x86_64.

Liang
>    b) f24 nested with my test, with THP enabled after Kirill's changes.
> 
> Dave
> 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Liang Li April 18, 2016, 10:06 a.m. UTC | #15
> * Li, Liang Z (liang.z.li@intel.com) wrote:
> > > > > > I've run it directly, setting relevant QTEST_QEMU_BINARY.
> > > > >
> > > > > Interesting; it's failing reliably for me - but only with a
> > > > > reasonably freshly booted machine (so that the pages get THPd).
> > > >
> > > > The same here. Freshly booted machine with 64GiB ram. I've checked
> > > > /proc/vmstat: huge pages were allocated
> > >
> > > Thanks for testing.
> > >
> > > Damn; this is confusing now.  I've got a RHEL7 box with 4.6.0-rc3 on
> > > where it works, and a fedora24 VM where it fails (the f24 VM is
> > > where I did the bisect so it works fine with the older kernel on the f24
> userspace in that VM).
> > >
> > > So lets see:
> > >    works: Kirill's (64GB machine)
> > >           Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7 userspace
> > > and kernel
> > > config)
> > >    fails: Dave's f24 VM (4GB RAM, 4 vcpus VM on my laptop24
> > > userspace and kernel config)
> > >
> > > So it's any of userspace, kernel config, machine hardware or hmm.
> > >
> > > My f24 box has transparent_hugepage_madvise, where my rhel7 has
> > > transparent_hugepage_always (but still works if I flip it to madvise
> > > at run time).  I'll try and get the configs closer together.
> > >
> > > Liang Li: Can you run my test on your setup which fails the migrate
> > > and tell me what your userspace is?
> > >
> > > (If you've not built my test yet, you might find you need to add a :
> > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > >
> > >   to the tests/Makefile)
> > >
> >
> > Hi Dave,
> >
> >   How to build and run you test? I didn't do that before.
> 
> Apply the code in:
> http://lists.gnu.org/archive/html/qemu-devel/2016-04/msg02138.html
> 
> fix the:
> +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
> to:
> +            if ( ((b + 1) % 256) == last_byte && !hit_edge) {
> 
> to tests/Makefile
>    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> 
> and do a:
>     make check
> 
> in qemu.
> Then you can rerun the test with:
>     QTEST_QEMU_BINARY=path/to/qemu-system-x86_64 ./tests/postcopy-
> test
> 
> if it works, reboot and check it still works from a fresh boot.
> 
> Can you describe the system which your full test failed on? What distro on
> the host? What type of host was it tested on?
> 
> Dave
> 


Thanks, Dave

The host is CentOS 7; its original kernel is 3.10.0-327.el7.x86_64 (CentOS 7.1?).
The hardware platform is HSW-EP with 64GB RAM.


> >
> > Thanks!
> > Liang
> >
> > >
> > > Dave
> > > >
> > > > --
> > > >  Kirill A. Shutemov
> > > --
> > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 18, 2016, 10:15 a.m. UTC | #16
* Li, Liang Z (liang.z.li@intel.com) wrote:
> > * Li, Liang Z (liang.z.li@intel.com) wrote:
> > > > > > > I've run it directly, setting relevant QTEST_QEMU_BINARY.
> > > > > >
> > > > > > Interesting; it's failing reliably for me - but only with a
> > > > > > reasonably freshly booted machine (so that the pages get THPd).
> > > > >
> > > > > The same here. Freshly booted machine with 64GiB ram. I've checked
> > > > > /proc/vmstat: huge pages were allocated
> > > >
> > > > Thanks for testing.
> > > >
> > > > Damn; this is confusing now.  I've got a RHEL7 box with 4.6.0-rc3 on
> > > > where it works, and a fedora24 VM where it fails (the f24 VM is
> > > > where I did the bisect so it works fine with the older kernel on the f24
> > userspace in that VM).
> > > >
> > > > So lets see:
> > > >    works: Kirill's (64GB machine)
> > > >           Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7 userspace
> > > > and kernel
> > > > config)
> > > >    fails: Dave's f24 VM (4GB RAM, 4 vcpus VM on my laptop24
> > > > userspace and kernel config)
> > > >
> > > > So it's any of userspace, kernel config, machine hardware or hmm.
> > > >
> > > > My f24 box has transparent_hugepage_madvise, where my rhel7 has
> > > > transparent_hugepage_always (but still works if I flip it to madvise
> > > > at run time).  I'll try and get the configs closer together.
> > > >
> > > > Liang Li: Can you run my test on your setup which fails the migrate
> > > > and tell me what your userspace is?
> > > >
> > > > (If you've not built my test yet, you might find you need to add a :
> > > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > > >
> > > >   to the tests/Makefile)
> > > >
> > >
> > > Hi Dave,
> > >
> > >   How to build and run you test? I didn't do that before.
> > 
> > Apply the code in:
> > http://lists.gnu.org/archive/html/qemu-devel/2016-04/msg02138.html
> > 
> > fix the:
> > +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
> > to:
> > +            if ( ((b + 1) % 256) == last_byte && !hit_edge) {
> > 
> > to tests/Makefile
> >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > 
> > and do a:
> >     make check
> > 
> > in qemu.
> > Then you can rerun the test with:
> >     QTEST_QEMU_BINARY=path/to/qemu-system-x86_64 ./tests/postcopy-
> > test
> > 
> > if it works, reboot and check it still works from a fresh boot.
> > 
> > Can you describe the system which your full test failed on? What distro on
> > the host? What type of host was it tested on?
> > 
> > Dave
> > 
> 
> 
> Thanks, Dave
> 
> The host is CenOS7, its original kernel is 3.10.0-327.el7.x86_64 (CentOS 7.1?),
> The hardware platform is HSW-EP with 64GB RAM.

OK, so your test fails on real hardware; my guess is that my test will work
there.
Can you try your test with THP disabled on the host:

echo never > /sys/kernel/mm/transparent_hugepage/enabled

Dave

> 
> 
> > >
> > > Thanks!
> > > Liang
> > >
> > > >
> > > > Dave
> > > > >
> > > > > --
> > > > >  Kirill A. Shutemov
> > > > --
> > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Liang Li April 18, 2016, 10:33 a.m. UTC | #17
> > > > > > > Interesting; it's failing reliably for me - but only with a
> > > > > > > reasonably freshly booted machine (so that the pages get THPd).
> > > > > >
> > > > > > The same here. Freshly booted machine with 64GiB ram. I've
> > > > > > checked
> > > > > > /proc/vmstat: huge pages were allocated
> > > > >
> > > > > Thanks for testing.
> > > > >
> > > > > Damn; this is confusing now.  I've got a RHEL7 box with
> > > > > 4.6.0-rc3 on where it works, and a fedora24 VM where it fails
> > > > > (the f24 VM is where I did the bisect so it works fine with the
> > > > > older kernel on the f24
> > > userspace in that VM).
> > > > >
> > > > > So lets see:
> > > > >    works: Kirill's (64GB machine)
> > > > >           Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7
> > > > > userspace and kernel
> > > > > config)
> > > > >    fails: Dave's f24 VM (4GB RAM, 4 vcpus VM on my laptop24
> > > > > userspace and kernel config)
> > > > >
> > > > > So it's any of userspace, kernel config, machine hardware or hmm.
> > > > >
> > > > > My f24 box has transparent_hugepage_madvise, where my rhel7 has
> > > > > transparent_hugepage_always (but still works if I flip it to
> > > > > madvise at run time).  I'll try and get the configs closer together.
> > > > >
> > > > > Liang Li: Can you run my test on your setup which fails the
> > > > > migrate and tell me what your userspace is?
> > > > >
> > > > > (If you've not built my test yet, you might find you need to add a :
> > > > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > > > >
> > > > >   to the tests/Makefile)
> > > > >
> > > >
> > > > Hi Dave,
> > > >
> > > >   How to build and run you test? I didn't do that before.
> > >
> > > Apply the code in:
> > > http://lists.gnu.org/archive/html/qemu-devel/2016-04/msg02138.html
> > >
> > > fix the:
> > > +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
> > > to:
> > > +            if ( ((b + 1) % 256) == last_byte && !hit_edge) {
> > >
> > > to tests/Makefile
> > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > >
> > > and do a:
> > >     make check
> > >
> > > in qemu.
> > > Then you can rerun the test with:
> > >     QTEST_QEMU_BINARY=path/to/qemu-system-
> x86_64 ./tests/postcopy-
> > > test
> > >
> > > if it works, reboot and check it still works from a fresh boot.
> > >
> > > Can you describe the system which your full test failed on? What
> > > distro on the host? What type of host was it tested on?
> > >
> > > Dave
> > >
> >
> >
> > Thanks, Dave
> >
> > The host is CenOS7, its original kernel is 3.10.0-327.el7.x86_64
> > (CentOS 7.1?), The hardware platform is HSW-EP with 64GB RAM.
> 
> OK, so your test fails on real hardware; my guess is that my test will work on
> there.
> Can you try your test with THP disabled on the host:
> 
> echo never > /sys/kernel/mm/transparent_hugepage/enabled
> 

If THP is disabled, there are no failures.
And your test always passed, even when the real post-copy failed.

In my env, the output of 
'cat /sys/kernel/mm/transparent_hugepage/enabled'  is:

 [always] ...

Liang

> Dave
> 
> >
> >
> > > >
> > > > Thanks!
> > > > Liang
> > > >
> > > > >
> > > > > Dave
> > > > > >
> > > > > > --
> > > > > >  Kirill A. Shutemov
> > > > > --
> > > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > > --
> > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 18, 2016, 1:23 p.m. UTC | #18
* Li, Liang Z (liang.z.li@intel.com) wrote:
> > > > > > > > Interesting; it's failing reliably for me - but only with a
> > > > > > > > reasonably freshly booted machine (so that the pages get THPd).
> > > > > > >
> > > > > > > The same here. Freshly booted machine with 64GiB ram. I've
> > > > > > > checked
> > > > > > > /proc/vmstat: huge pages were allocated
> > > > > >
> > > > > > Thanks for testing.
> > > > > >
> > > > > > Damn; this is confusing now.  I've got a RHEL7 box with
> > > > > > 4.6.0-rc3 on where it works, and a fedora24 VM where it fails
> > > > > > (the f24 VM is where I did the bisect so it works fine with the
> > > > > > older kernel on the f24
> > > > userspace in that VM).
> > > > > >
> > > > > > So lets see:
> > > > > >    works: Kirill's (64GB machine)
> > > > > >           Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7
> > > > > > userspace and kernel
> > > > > > config)
> > > > > >    fails: Dave's f24 VM (4GB RAM, 4 vcpus VM on my laptop24
> > > > > > userspace and kernel config)
> > > > > >
> > > > > > So it's any of userspace, kernel config, machine hardware or hmm.
> > > > > >
> > > > > > My f24 box has transparent_hugepage_madvise, where my rhel7 has
> > > > > > transparent_hugepage_always (but still works if I flip it to
> > > > > > madvise at run time).  I'll try and get the configs closer together.
> > > > > >
> > > > > > Liang Li: Can you run my test on your setup which fails the
> > > > > > migrate and tell me what your userspace is?
> > > > > >
> > > > > > (If you've not built my test yet, you might find you need to add a :
> > > > > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > > > > >
> > > > > >   to the tests/Makefile)
> > > > > >
> > > > >
> > > > > Hi Dave,
> > > > >
> > > > >   How to build and run you test? I didn't do that before.
> > > >
> > > > Apply the code in:
> > > > http://lists.gnu.org/archive/html/qemu-devel/2016-04/msg02138.html
> > > >
> > > > fix the:
> > > > +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
> > > > to:
> > > > +            if ( ((b + 1) % 256) == last_byte && !hit_edge) {
> > > >
> > > > to tests/Makefile
> > > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > > >
> > > > and do a:
> > > >     make check
> > > >
> > > > in qemu.
> > > > Then you can rerun the test with:
> > > >     QTEST_QEMU_BINARY=path/to/qemu-system-
> > x86_64 ./tests/postcopy-
> > > > test
> > > >
> > > > if it works, reboot and check it still works from a fresh boot.
> > > >
> > > > Can you describe the system which your full test failed on? What
> > > > distro on the host? What type of host was it tested on?
> > > >
> > > > Dave
> > > >
> > >
> > >
> > > Thanks, Dave
> > >
> > > The host is CenOS7, its original kernel is 3.10.0-327.el7.x86_64
> > > (CentOS 7.1?), The hardware platform is HSW-EP with 64GB RAM.
> > 
> > OK, so your test fails on real hardware; my guess is that my test will work on
> > there.
> > Can you try your test with THP disabled on the host:
> > 
> > echo never > /sys/kernel/mm/transparent_hugepage/enabled
> > 
> 
> If the THP is disabled, no fails.
> And your test was always passed, even when  real post-copy was failed. 
> 
> In my env, the output of 
> 'cat /sys/kernel/mm/transparent_hugepage/enabled'  is:
> 
>  [always] ...

OK, I can't get my test to fail on real hardware - only in a VM; but my
suspicion is we're looking at the same bug: in both of them it goes away
if we disable THP, and both of them work on 4.4.x and fail on 4.5.x.
I'd love to find a nice easy test to be able to give to Andrea
and Kirill.

I've also just confirmed that running (in a VM) a fedora-24 4.5.0 kernel
with a fedora-23 userspace (qemu built under f23) still fails with my test.
So the problem there is definitely triggered by the newer kernel not
the newer userspace.

Dave

> 
> Liang
> 
> > Dave
> > 
> > >
> > >
> > > > >
> > > > > Thanks!
> > > > > Liang
> > > > >
> > > > > >
> > > > > > Dave
> > > > > > >
> > > > > > > --
> > > > > > >  Kirill A. Shutemov
> > > > > > --
> > > > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > > > --
> > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 18, 2016, 5:18 p.m. UTC | #19
* Dr. David Alan Gilbert (dgilbert@redhat.com) wrote:
> * Li, Liang Z (liang.z.li@intel.com) wrote:
> > > > > > > > > Interesting; it's failing reliably for me - but only with a
> > > > > > > > > reasonably freshly booted machine (so that the pages get THPd).
> > > > > > > >
> > > > > > > > The same here. Freshly booted machine with 64GiB ram. I've
> > > > > > > > checked
> > > > > > > > /proc/vmstat: huge pages were allocated
> > > > > > >
> > > > > > > Thanks for testing.
> > > > > > >
> > > > > > > Damn; this is confusing now.  I've got a RHEL7 box with
> > > > > > > 4.6.0-rc3 on where it works, and a fedora24 VM where it fails
> > > > > > > (the f24 VM is where I did the bisect so it works fine with the
> > > > > > > older kernel on the f24
> > > > > userspace in that VM).
> > > > > > >
> > > > > > > So lets see:
> > > > > > >    works: Kirill's (64GB machine)
> > > > > > >           Dave's RHEL7 host (24GB RAM, dual xeon, RHEL7
> > > > > > > userspace and kernel
> > > > > > > config)
> > > > > > >    fails: Dave's f24 VM (4GB RAM, 4 vcpus VM on my laptop24
> > > > > > > userspace and kernel config)
> > > > > > >
> > > > > > > So it's any of userspace, kernel config, machine hardware or hmm.
> > > > > > >
> > > > > > > My f24 box has transparent_hugepage_madvise, where my rhel7 has
> > > > > > > transparent_hugepage_always (but still works if I flip it to
> > > > > > > madvise at run time).  I'll try and get the configs closer together.
> > > > > > >
> > > > > > > Liang Li: Can you run my test on your setup which fails the
> > > > > > > migrate and tell me what your userspace is?
> > > > > > >
> > > > > > > (If you've not built my test yet, you might find you need to add a :
> > > > > > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > > > > > >
> > > > > > >   to the tests/Makefile)
> > > > > > >
> > > > > >
> > > > > > Hi Dave,
> > > > > >
> > > > > >   How to build and run you test? I didn't do that before.
> > > > >
> > > > > Apply the code in:
> > > > > http://lists.gnu.org/archive/html/qemu-devel/2016-04/msg02138.html
> > > > >
> > > > > fix the:
> > > > > +            if ( ((b + 1) % 255) == last_byte && !hit_edge) {
> > > > > to:
> > > > > +            if ( ((b + 1) % 256) == last_byte && !hit_edge) {
> > > > >
> > > > > to tests/Makefile
> > > > >    tests/postcopy-test$(EXESUF): tests/postcopy-test.o
> > > > >
> > > > > and do a:
> > > > >     make check
> > > > >
> > > > > in qemu.
> > > > > Then you can rerun the test with:
> > > > >     QTEST_QEMU_BINARY=path/to/qemu-system-
> > > x86_64 ./tests/postcopy-
> > > > > test
> > > > >
> > > > > if it works, reboot and check it still works from a fresh boot.
> > > > >
> > > > > Can you describe the system which your full test failed on? What
> > > > > distro on the host? What type of host was it tested on?
> > > > >
> > > > > Dave
> > > > >
> > > >
> > > >
> > > > Thanks, Dave
> > > >
> > > > The host is CenOS7, its original kernel is 3.10.0-327.el7.x86_64
> > > > (CentOS 7.1?), The hardware platform is HSW-EP with 64GB RAM.
> > > 
> > > OK, so your test fails on real hardware; my guess is that my test will work on
> > > there.
> > > Can you try your test with THP disabled on the host:
> > > 
> > > echo never > /sys/kernel/mm/transparent_hugepage/enabled
> > > 
> > 
> > If the THP is disabled, no fails.
> > And your test was always passed, even when  real post-copy was failed. 
> > 
> > In my env, the output of 
> > 'cat /sys/kernel/mm/transparent_hugepage/enabled'  is:
> > 
> >  [always] ...
> 
> OK, I can't get my test to fail on real hardware - only in a VM; but my
> suspicion is we're looking at the same bug; both of them it goes away
> if we disable THP, both of them work on 4.4.x and fail on 4.5.x.
> I'd love to be able to find a nice easy test to be able to give to Andrea
> and Kirill
> 
> I've also just confirmed that running (in a VM) a fedora-24 4.5.0 kernel
> with a fedora-23 userspace (qemu built under f23) still fails with my test.
> So the problem there is definitely triggered by the newer kernel not
> the newer userspace.

OK, some more results - I *can* get it to fail on real hardware - it's just
really really rare, and the failure is slightly different than in the nest.

I'm using the following magic:
count=0; while true; do count=$(($count+1)); echo 3 >/proc/sys/vm/drop_caches; echo >/proc/sys/vm/compact_memory; echo "Iteration $count"; QTEST_QEMU_BINARY=./bin/qemu-system-x86_64 ./tests/postcopy-test || break; done

I've had about 4 failures out of about 5000 runs (ouch);

On the real hardware the failure addresses are always 2MB aligned, even though
other than the start address, everything in the test is 4K page based - so again
this is pointing the finger at THP:

/x86_64/postcopy: Memory content inconsistency at 4200000 first_byte = 48 last_byte = 47 current = 1 hit_edge = 1
postcopy-test: /root/git/qemu/tests/postcopy-test.c:274: check_guests_ram: Assertion `0' failed.
/x86_64/postcopy: Memory content inconsistency at 4200000 first_byte = e last_byte = d current = 9b hit_edge = 1
postcopy-test: /root/git/qemu/tests/postcopy-test.c:274: check_guests_ram: Assertion `0' failed.
/x86_64/postcopy: Memory content inconsistency at 4800000 first_byte = 19 last_byte = 18 current = 1 hit_edge = 1
postcopy-test: /root/git/qemu/tests/postcopy-test.c:274: check_guests_ram: Assertion `0' failed.
/x86_64/postcopy: Memory content inconsistency at 5e00000 first_byte = d6 last_byte = d5 current = 1 hit_edge = 1
postcopy-test: /root/git/qemu/tests/postcopy-test.c:274: check_guests_ram: Assertion `0' failed.

(My test host for the real hardware is 2x E5-2640 v3 running fedora 24)

where as in the VM I'm seeing immediate failures with addresses just on any 4k alignment.

You can run a couple in parallel; but if your load is too high the test will fail with an
assertion (postcopy-test.c:196 ...(qdict_haskey(rsp, "return"))) - but that's
my test - so don't worry if you hit that; decreasing the migrate_speed_set value should
avoid it if you're hitting it repeatedly.

(Could this be something like a missing TLB flush?)

Dave


> 
> Dave
> 
> > 
> > Liang
> > 
> > > Dave
> > > 
> > > >
> > > >
> > > > > >
> > > > > > Thanks!
> > > > > > Liang
> > > > > >
> > > > > > >
> > > > > > > Dave
> > > > > > > >
> > > > > > > > --
> > > > > > > >  Kirill A. Shutemov
> > > > > > > --
> > > > > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > > > > --
> > > > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > > --
> > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 20, 2016, 5:27 p.m. UTC | #20
Hi,
  Just a follow up with a little more debug;

I modified the test so it doesn't quit after the first miscomparison (see
diff below), and looking at the failures on real hardware I've seen:

/x86_64/postcopy: Memory content inconsistency at 3800000 first_byte = 30 last_byte = 30 current = 10 hit_edge = 0
                  Memory content inconsistency at 38fe000 first_byte = 30 last_byte = 10 current = 30 hit_edge = 0

and then another time:
/x86_64/postcopy: Memory content inconsistency at 4c00000 first_byte = 9a last_byte = 99 current = 1 hit_edge = 1
                  Memory content inconsistency at 4cec000 first_byte = 9a last_byte = 1 current = 99 hit_edge = 1

so in both cases what we're seeing there is starting on a 2M page boundary, a page
that is read on the destination as zero instead of getting the migrated value -
but somewhere later in the page it starts behaving. (in the first example the counter
had reached 0x30 - except for those pages which hadn't been transferred where
the counter is much lower at 0x10).

Testing it in my VM, I added some debug for where I'd been doing an madvise DONTNEED
previously:

ram_discard_range: pc.ram:0xf51000 for 42094592
ram_discard_range: pc.ram:0x5259000 for 18509824
Memory content inconsistency at f51000 first_byte = 6d last_byte = 6d current = 9e hit_edge = 0
Memory content inconsistency at 1000000 first_byte = 6d last_byte = 9e current = 6d hit_edge = 0

   So that's saying that from f51000..1000000 it was wrong - so not just one page, but up to the THP edge.
(It then got back to the right value - 6d - on the page edge.)  Note how the start corresponds
to the address I'd previously done a discard on, but not the whole discard range - just
up to the THP page boundary.  Nothing in my userspace code knows about THP
(other than turning it off).
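
To spell out the arithmetic (the offsets are taken from the debug output
above; the little helper below is only an illustration): the 2MB huge page
containing the discard start 0xf51000 spans 0xe00000..0x1000000, which is
exactly where the bad range stops:

#include <stdio.h>

int main(void)
{
    unsigned long thp = 2UL << 20;             /* 2MB huge page size */
    unsigned long discard_start = 0xf51000UL;  /* from ram_discard_range above */
    unsigned long thp_start = discard_start & ~(thp - 1);
    unsigned long thp_end   = thp_start + thp;
    /* prints: huge page covering 0xf51000: [0xe00000, 0x1000000) */
    printf("huge page covering %#lx: [%#lx, %#lx)\n",
           discard_start, thp_start, thp_end);
    return 0;
}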

Dave



@@ -251,6 +251,7 @@ static void check_guests_ram(void)
     uint8_t first_byte;
     uint8_t last_byte;
     bool hit_edge = false;
+    bool bad = false;
 
     qtest_memread(global_qtest, start_address, &first_byte, 1);
     last_byte = first_byte;
@@ -271,11 +272,12 @@ static void check_guests_ram(void)
                                 " first_byte = %x last_byte = %x current = %x"
                                 " hit_edge = %x\n",
                                 address, first_byte, last_byte, b, hit_edge);
-                assert(0);
+                bad = true;
             }
         }
         last_byte = b;
     }
+    assert(!bad);
     fprintf(stderr, "first_byte = %x last_byte = %x hit_edge = %x OK\n",
                     first_byte, last_byte, hit_edge);
 }

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 21, 2016, 7:21 p.m. UTC | #21
Hi Andrea,

I'm wondering if this bug is the opposite way around from what I originally
thought it was - I don't think the problem is 0 pages on the destination; I think
it's more subtle.

I added some debug to print the source VM's memory and also
the byte in the destination's 1st page (this is in the nest):

nhp_range: block: pc.ram @ 0x7fc59a800000
Destination 1st byte: e8,df <stop> df

   OK, so that tells us that the destination is running OK, and that it
stops running when we tell it to.

Memory content inconsistency at f79000 first_byte = df last_byte = de current = 9 hit_edge = 1 src_byte = 9

'src_byte' is saying that the source VM had the byte 9 in that page (we've still got the source VM's memory - it's
paused at this point in the test);
  so adding the start of pc.ram gives a host address of 0x7FC59B779000, and in the logs I see:
postcopy_place_page: 0x55ba64503f7d->0x7fc59b779000 copy=4096 1stbyte=9/9

  OK, so that shows that when the destination received the page it was also '9' and after the uffdio_copy
it read as 9 - so the page made it into RAM; it wasn't 0.
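
(For reference, that placement is essentially one UFFDIO_COPY ioctl on the userfaultfd;
a rough sketch of the shape of the call - not qemu's actual postcopy_place_page() - where
'ufd' is assumed to be a userfaultfd already registered on the destination guest RAM:)

/* Atomically place one received page; host_addr is the destination guest-RAM
 * address (e.g. 0x7fc59b779000 above), from is the buffer holding the page
 * received off the migration stream.
 */
#include <stddef.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int place_page(int ufd, void *host_addr, void *from, size_t pagesize)
{
    struct uffdio_copy copy;

    copy.dst  = (unsigned long)host_addr;
    copy.src  = (unsigned long)from;
    copy.len  = pagesize;
    copy.mode = 0;      /* no DONTWAKE: wake any thread faulting on this page */

    if (ioctl(ufd, UFFDIO_COPY, &copy)) {
        return -1;      /* errno describes the failure */
    }
    return 0;
}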

But that also means that page hasn't changed *after* migration; why not?

We can see that the other pages are changing (that Destination 1st byte
line shows the 1st byte of the test memory changed) - so the incrementer
loop has apparently been over the whole of the test memory multiple
times - except these pages are still stuck at the '9' they got when we
placed them atomically.

I've been unable to trigger this bug in a standalone test case that ran
without kvm.

Is it possible that the guest KVM CPU isn't noticing some change to
the mapping?

Dave


--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Andrea Arcangeli April 27, 2016, 2:47 p.m. UTC | #22
Hello Liang,

On Mon, Apr 18, 2016 at 10:33:14AM +0000, Li, Liang Z wrote:
> If THP is disabled, there are no failures.
> And your test always passed, even when the real post-copy failed.
> 
> In my env, the output of 
> 'cat /sys/kernel/mm/transparent_hugepage/enabled'  is:
> 
>  [always] ...
> 

Can you test the fix?
https://marc.info/?l=linux-mm&m=146175869123580&w=2

This was not a breakage in userfaultfd nor in postcopy. userfaultfd
had no bugs and is fully rock solid, with zero chance of
generating undetected memory corruption like what was happening in v4.5.

As I suspected, the same problem would have happened with any THP
pmd_trans_huge split (swapping/inflating the balloon etc.). Postcopy just
makes it easier to reproduce the problem because it does a scattered
MADV_DONTNEED on the destination qemu guest memory for the pages
redirtied during the last precopy pass that ran, or not transferred at all
(to allow THP faults in destination qemu during precopy), just before
starting the guest on the destination node.

Other reports of KVM memory corruption happening on v4.5 with THP
enabled will also be taken care of by the above fix.

I hope I managed to fix this in time for v4.6 final (current is
v4.6-rc5-69), so the only kernel where KVM must not be used with THP
enabled will be v4.5.

On a side note, this MADV_DONTNEED trigger reminded me that, as soon as
the madvisev syscall is merged, loadvm_postcopy_ram_handle_discard should
start using it to reduce the kernel enter/exits to just 1 (or a few
madvisev calls, in case we want to put a limit on the temporary buffer to
avoid the risk of allocating too much temporary RAM for very large
guests) to do the scattered MADV_DONTNEED zapping. Same thing in
virtio_balloon_handle_output.
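
(To make that concrete: today each discarded range costs one madvise() syscall; a rough
sketch of the kind of loop being described - not the actual
loadvm_postcopy_ram_handle_discard() code, and the range list is an illustrative
stand-in - which a vectored madvise could collapse into a single kernel entry:)

/* Sketch of the per-range discard loop under discussion; 'ranges' stands in
 * for the (start, length) pairs extracted from the discard bitmap.  Today
 * this is one madvise() syscall, i.e. one kernel enter/exit, per range.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>

struct discard_range {
    void   *start;      /* host address inside guest RAM */
    size_t  length;     /* multiple of the target page size */
};

int discard_ranges(struct discard_range *ranges, int count)
{
    int i;

    for (i = 0; i < count; i++) {
        /* A vectored call ("madvisev") could submit all of these at once. */
        if (madvise(ranges[i].start, ranges[i].length, MADV_DONTNEED)) {
            perror("madvise");
            return -1;
        }
    }
    return 0;
}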

Thanks,
Andrea
Liang Li April 28, 2016, 2:59 a.m. UTC | #23
> -----Original Message-----
> From: Andrea Arcangeli [mailto:aarcange@redhat.com]
> Sent: Wednesday, April 27, 2016 10:48 PM
> To: Li, Liang Z
> Cc: Dr. David Alan Gilbert; Kirill A. Shutemov; kirill.shutemov@linux.intel.com;
> Amit Shah; qemu-devel@nongnu.org; quintela@redhat.com; linux-
> mm@kvack.org
> Subject: Re: post-copy is broken?
> 
> Hello Liang,
> 
> On Mon, Apr 18, 2016 at 10:33:14AM +0000, Li, Liang Z wrote:
> > If THP is disabled, there are no failures.
> > And your test always passed, even when the real post-copy failed.
> >
> > In my env, the output of
> > 'cat /sys/kernel/mm/transparent_hugepage/enabled'  is:
> >
> >  [always] ...
> >
> 
> Can you test the fix?
> https://marc.info/?l=linux-mm&m=146175869123580&w=2
> 
> This was not a breakage in userfaultfd nor in postcopy. userfaultfd had no
> bugs and is fully rock solid, with zero chance of generating undetected
> memory corruption like what was happening in v4.5.
> 
> As I suspected, the same problem would have happened with any THP
> pmd_trans_huge split (swapping/inflating the balloon etc.). Postcopy just
> makes it easier to reproduce the problem because it does a scattered
> MADV_DONTNEED on the destination qemu guest memory for the pages
> redirtied during the last precopy pass that ran, or not transferred at all (to allow
> THP faults in destination qemu during precopy), just before starting the
> guest on the destination node.
> 
> Other reports of KVM memory corruption happening on v4.5 with THP
> enabled will also be taken care of by the above fix.
> 
> I hope I managed to fix this in time for v4.6 final (current is v4.6-rc5-69), so
> the only kernel where KVM must not be used with THP enabled will be v4.5.
> 
> On a side note, this MADV_DONTNEED trigger reminded me that, as soon as the
> madvisev syscall is merged, loadvm_postcopy_ram_handle_discard should
> start using it to reduce the kernel enter/exits to just 1 (or a few madvisev calls,
> in case we want to put a limit on the temporary buffer to avoid the risk of
> allocating too much temporary RAM for very large
> guests) to do the scattered MADV_DONTNEED zapping. Same thing in
> virtio_balloon_handle_output.
> 

I have tested the patch; the guest doesn't crash anymore, so I think the issue is fixed. Thanks!

Liang
> Thanks,
> Andrea
Dr. David Alan Gilbert April 28, 2016, 8:03 a.m. UTC | #24
* Li, Liang Z (liang.z.li@intel.com) wrote:

> 
> I have tested the patch; the guest doesn't crash anymore, so I think the issue is fixed. Thanks!

Great!  Thanks for reporting it.

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox

Patch

diff --git a/tests/Makefile b/tests/Makefile
index 9de9598..6aebddd 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -222,6 +222,7 @@  endif
 check-qtest-i386-y += tests/test-netfilter$(EXESUF)
 check-qtest-i386-y += tests/test-filter-mirror$(EXESUF)
 check-qtest-i386-y += tests/test-filter-redirector$(EXESUF)
+check-qtest-i386-y += tests/postcopy-test$(EXESUF)
 check-qtest-x86_64-y = $(check-qtest-i386-y)
 gcov-files-i386-y += i386-softmmu/hw/timer/mc146818rtc.c
 gcov-files-x86_64-y = $(subst i386-softmmu/,x86_64-softmmu/,$(gcov-files-i386-y))
diff --git a/tests/postcopy-test.c b/tests/postcopy-test.c
new file mode 100644
index 0000000..5e5940b
--- /dev/null
+++ b/tests/postcopy-test.c
@@ -0,0 +1,419 @@ 
+/*
+ * QTest testcase for postcopy
+ *
+ * Copyright (c) 2016 Red Hat, Inc. and/or its affiliates
+ *   based on the vhost-user-test.c that is:
+ *      Copyright (c) 2014 Virtual Open Systems Sarl.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <glib.h>
+
+#include "libqtest.h"
+#include "qemu/option.h"
+#include "qemu/range.h"
+#include "sysemu/char.h"
+#include "sysemu/sysemu.h"
+
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <qemu/sockets.h>
+
+#if defined(__linux__)
+#include <sys/syscall.h>
+#endif
+
+#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+
+static bool ufd_version_check(void)
+{
+    struct uffdio_api api_struct;
+    uint64_t ioctl_mask;
+
+    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+
+    if (ufd == -1) {
+        g_test_message("Skipping test: userfaultfd not available");
+        return false;
+    }
+
+    api_struct.api = UFFD_API;
+    api_struct.features = 0;
+    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
+        g_test_message("Skipping test: UFFDIO_API failed");
+        return false;
+    }
+
+    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
+                 (__u64)1 << _UFFDIO_UNREGISTER;
+    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+        g_test_message("Skipping test: Missing userfault feature");
+        return false;
+    }
+
+    return true;
+}
+
+#else
+static bool ufd_version_check(void)
+{
+    g_test_message("Skipping test: Userfault not available (builtdtime)");
+    return false;
+}
+
+#endif
+
+/* GLIB version compatibility flags */
+#if !GLIB_CHECK_VERSION(2, 26, 0)
+#define G_TIME_SPAN_SECOND              (G_GINT64_CONSTANT(1000000))
+#endif
+
+#if GLIB_CHECK_VERSION(2, 28, 0)
+#define HAVE_MONOTONIC_TIME
+#endif
+
+
+#if !GLIB_CHECK_VERSION(2, 32, 0)
+static gboolean g_cond_wait_until(CompatGCond cond, CompatGMutex mutex,
+                                  gint64 end_time)
+{
+    gboolean ret = FALSE;
+    end_time -= g_get_monotonic_time();
+    GTimeVal time = { end_time / G_TIME_SPAN_SECOND,
+                      end_time % G_TIME_SPAN_SECOND };
+    ret = g_cond_timed_wait(cond, mutex, &time);
+    return ret;
+}
+#endif
+
+static const char *tmpfs;
+
+/* A simple PC boot sector that modifies memory (1-100MB) quickly
+ * outputting a 'B' every so often if it's still running.
+ */
+unsigned char bootsect[] = {
+  0xfa, 0x0f, 0x01, 0x16, 0x74, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00,
+  0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02,
+  0xe6, 0x92, 0xb8, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xd8, 0x66, 0xb8, 0x41,
+  0x00, 0x66, 0xba, 0xf8, 0x03, 0xee, 0xb3, 0x00, 0xb8, 0x00, 0x00, 0x10,
+  0x00, 0xfe, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x40,
+  0x06, 0x7c, 0xf2, 0xfe, 0xc3, 0x75, 0xe9, 0x66, 0xb8, 0x42, 0x00, 0x66,
+  0xba, 0xf8, 0x03, 0xee, 0xeb, 0xde, 0x66, 0x90, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x9a, 0xcf, 0x00,
+  0xff, 0xff, 0x00, 0x00, 0x00, 0x92, 0xcf, 0x00, 0x27, 0x00, 0x5c, 0x7c,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0xaa
+};
+
+/*
+ * Wait for some output in the serial output file,
+ * we get an 'A' followed by an endless string of 'B's
+ * but on the destination we won't have the A.
+ */
+static void wait_for_serial(const char *side)
+{
+    char *serialpath = g_strdup_printf("%s/%s", tmpfs, side);
+    FILE *serialfile = fopen(serialpath, "r");
+
+    do {
+        int readvalue = fgetc(serialfile);
+
+        switch (readvalue) {
+        case 'A':
+            /* Fine */
+            break;
+
+        case 'B':
+            /* It's alive! */
+            fclose(serialfile);
+            g_free(serialpath);
+            return;
+
+        case EOF:
+            fseek(serialfile, 0, SEEK_SET);
+            usleep(1000);
+            break;
+
+        default:
+            fprintf(stderr, "Unexpected %d on %s serial\n", readvalue, side);
+            assert(0);
+        }
+    } while (true);
+}
+
+/*
+ * It's tricky to use qemu's migration event capability with qtest:
+ * events suddenly appearing confuse the qmp()/hmp() responses,
+ * so wait for a couple of passes to have happened before
+ * going postcopy.
+ */
+
+static uint64_t get_migration_pass(void)
+{
+    QDict *rsp, *rsp_return, *rsp_ram;
+    uint64_t result;
+
+    rsp = qmp("{ 'execute': 'query-migrate' }");
+    g_assert(qdict_haskey(rsp, "return"));
+    rsp_return = qdict_get_qdict(rsp, "return");
+    if (!qdict_haskey(rsp_return, "ram")) {
+        /* Still in setup */
+        result = 0;
+    } else {
+        rsp_ram = qdict_get_qdict(rsp_return, "ram");
+        result = qdict_get_try_int(rsp_ram, "dirty-sync-count", 0);
+    }
+    QDECREF(rsp);
+    return result;
+}
+
+static void wait_for_migration_complete(void)
+{
+    QDict *rsp, *rsp_return;
+    bool completed;
+
+    do {
+        const char *status;
+
+        rsp = qmp("{ 'execute': 'query-migrate' }");
+        rsp_return = qdict_get_qdict(rsp, "return");
+        status = qdict_get_str(rsp_return, "status");
+
+        completed = strcmp(status, "completed") == 0;
+        assert(strcmp(status, "failed"));
+        QDECREF(rsp);
+        usleep(1000*100);
+    } while (!completed);
+}
+
+static void wait_for_migration_pass(void)
+{
+    uint64_t initial_pass = get_migration_pass();
+    uint64_t pass;
+
+    do {
+        usleep(1000*100);
+        pass = get_migration_pass();
+    } while (pass == initial_pass);
+}
+
+static void check_guests_ram(void)
+{
+    const unsigned start_address = 1024 * 1024;
+    const unsigned end_address = 100 * 1024 * 1024;
+    /* Our ASM test will have been incrementing one byte from each page from
+     * 1MB to <100MB in order.
+     * This gives us a constraint that any page's byte should be equal to or one
+     * less than the previous page's byte (mod 256); and they should all be equal
+     * except for one transition at the point where we meet the incrementer.
+     * (We're running this with the guest stopped).
+     */
+    unsigned address;
+    uint8_t first_byte;
+    uint8_t last_byte;
+    bool hit_edge = false;
+
+    qtest_memread(global_qtest, start_address, &first_byte, 1);
+    last_byte = first_byte;
+
+    for (address = start_address + 4096; address < end_address; address += 4096)
+    {
+        uint8_t b;
+        qtest_memread(global_qtest, address, &b, 1);
+        if (b != last_byte) {
+            if (((b + 1) % 256) == last_byte && !hit_edge) {
+                /* This is OK, the guest stopped at the point of
+                 * incrementing the previous page but didn't get
+                 * to us yet.
+                 */
+                hit_edge = true;
+            } else {
+                fprintf(stderr, "Memory content inconsistency at %x"
+                                " first_byte = %x last_byte = %x current = %x"
+                                " hit_edge = %x\n",
+                                address, first_byte, last_byte, b, hit_edge);
+                assert(0);
+            }
+        }
+        last_byte = b;
+    }
+    fprintf(stderr, "first_byte = %x last_byte = %x hit_edge = %x OK\n",
+                    first_byte, last_byte, hit_edge);
+}
+
+static void cleanup(const char *filename)
+{
+    char *path = g_strdup_printf("%s/%s", tmpfs, filename);
+
+    unlink(path);
+}
+
+static void test_migrate(void)
+{
+    char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
+    QTestState *global = global_qtest, *from, *to;
+    gchar *cmd;
+    QDict *rsp;
+
+    char *bootpath = g_strdup_printf("%s/bootsect", tmpfs);
+    FILE *bootfile = fopen(bootpath, "wb");
+
+    assert(fwrite(bootsect, 512, 1, bootfile) == 1);
+    fclose(bootfile);
+
+    cmd = g_strdup_printf("-machine accel=kvm:tcg -m 150M"
+                          " -name pcsource,debug-threads=on"
+                          " -serial file:%s/src_serial"
+                          " -drive file=%s,format=raw",
+                          tmpfs, bootpath);
+    from = qtest_start(cmd);
+    g_free(cmd);
+
+    cmd = g_strdup_printf("-machine accel=kvm:tcg -m 150M"
+                          " -name pcdest,debug-threads=on"
+                          " -serial file:%s/dest_serial"
+                          " -drive file=%s,format=raw"
+                          " -incoming %s",
+                          tmpfs, bootpath, uri);
+    to = qtest_init(cmd);
+    g_free(cmd);
+
+    global_qtest = from;
+    rsp = qmp("{ 'execute': 'migrate-set-capabilities',"
+                  "'arguments': { "
+                      "'capabilities': [ {"
+                          "'capability': 'postcopy-ram',"
+                          "'state': true } ] } }");
+    g_assert(qdict_haskey(rsp, "return"));
+    QDECREF(rsp);
+
+    global_qtest = to;
+    rsp = qmp("{ 'execute': 'migrate-set-capabilities',"
+                  "'arguments': { "
+                      "'capabilities': [ {"
+                          "'capability': 'postcopy-ram',"
+                          "'state': true } ] } }");
+    g_assert(qdict_haskey(rsp, "return"));
+    QDECREF(rsp);
+
+    global_qtest = from;
+    rsp = qmp("{ 'execute': 'migrate_set_speed',"
+              "'arguments': { 'value': 100000000 } }");
+    g_assert(qdict_haskey(rsp, "return"));
+    QDECREF(rsp);
+
+    /* Wait for the first serial output from the source */
+    wait_for_serial("src_serial");
+
+    cmd = g_strdup_printf("{ 'execute': 'migrate',"
+                          "'arguments': { 'uri': '%s' } }",
+                          uri);
+    rsp = qmp(cmd);
+    g_free(cmd);
+    g_assert(qdict_haskey(rsp, "return"));
+    QDECREF(rsp);
+
+    wait_for_migration_pass();
+
+    rsp = qmp("{ 'execute': 'migrate-start-postcopy' }");
+    g_assert(qdict_haskey(rsp, "return"));
+    QDECREF(rsp);
+
+    qmp_eventwait("STOP");
+
+    global_qtest = to;
+    qmp_eventwait("RESUME");
+
+    wait_for_serial("dest_serial");
+    global_qtest = from;
+    wait_for_migration_complete();
+
+    qtest_quit(from);
+
+    global_qtest = to;
+    qmp("{ 'execute' : 'stop'}");
+    check_guests_ram();
+
+    qtest_quit(to);
+    g_free(uri);
+
+    global_qtest = global;
+
+    cleanup("bootsect");
+    cleanup("migsocket");
+    cleanup("src_serial");
+    cleanup("dest_serial");
+}
+
+int main(int argc, char **argv)
+{
+    char template[] = "/tmp/postcopy-test-XXXXXX";
+    int ret;
+
+    g_test_init(&argc, &argv, NULL);
+
+    if (!ufd_version_check()) {
+        return 0;
+    }
+
+    tmpfs = mkdtemp(template);
+    if (!tmpfs) {
+        g_test_message("mkdtemp on path (%s): %s\n", template, strerror(errno));
+    }
+    g_assert(tmpfs);
+
+    module_call_init(MODULE_INIT_QOM);
+
+    qtest_add_func("/postcopy", test_migrate);
+
+    ret = g_test_run();
+
+    g_assert_cmpint(ret, ==, 0);
+
+    ret = rmdir(tmpfs);
+    if (ret != 0) {
+        g_test_message("unable to rmdir: path (%s): %s\n",
+                       tmpfs, strerror(errno));
+    }
+
+    return ret;
+}