diff mbox series

[RFC,14/21] migration: Map hugetlbfs ramblocks twice, and pre-allocate

Message ID 20230117220914.2062125-15-peterx@redhat.com (mailing list archive)
State New, archived
Headers show
Series migration: Support hugetlb doublemaps | expand

Commit Message

Peter Xu Jan. 17, 2023, 10:09 p.m. UTC
Add a RAMBlock.host_mirror for all the hugetlbfs backed guest memories.
It'll be used to remap the same region twice and it'll be used to service
page faults using UFFDIO_CONTINUE.

To make sure all accesses to these ranges will generate minor page faults
not missing page faults, we need to pre-allocate the files to make sure
page cache exist start from the beginning.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/exec/ramblock.h |  7 +++++
 migration/ram.c         | 59 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

Comments

Dr. David Alan Gilbert Jan. 25, 2023, 2:25 p.m. UTC | #1
* Peter Xu (peterx@redhat.com) wrote:
> Add a RAMBlock.host_mirror for all the hugetlbfs backed guest memories.
> It'll be used to remap the same region twice and it'll be used to service
> page faults using UFFDIO_CONTINUE.
> 
> To make sure all accesses to these ranges will generate minor page faults
> not missing page faults, we need to pre-allocate the files to make sure
> page cache exist start from the beginning.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

> ---
>  include/exec/ramblock.h |  7 +++++
>  migration/ram.c         | 59 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 66 insertions(+)
> 
> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> index 3f31ce1591..c76683c3c8 100644
> --- a/include/exec/ramblock.h
> +++ b/include/exec/ramblock.h
> @@ -28,6 +28,13 @@ struct RAMBlock {
>      struct rcu_head rcu;
>      struct MemoryRegion *mr;
>      uint8_t *host;
> +    /*
> +     * This is only used for hugetlbfs ramblocks where doublemap is
> +     * enabled.  The pointer is managed by dest host migration code, and
> +     * should be NULL when migration is finished.  On src host, it should
> +     * always be NULL.
> +     */
> +    uint8_t *host_mirror;
>      uint8_t *colo_cache; /* For colo, VM's ram cache */
>      ram_addr_t offset;
>      ram_addr_t used_length;
> diff --git a/migration/ram.c b/migration/ram.c
> index 2ebf414f5f..37d7b3553a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -3879,6 +3879,57 @@ void colo_release_ram_cache(void)
>      ram_state_cleanup(&ram_state);
>  }
>  
> +static int migrate_hugetlb_doublemap_init(void)
> +{
> +    RAMBlock *rb;
> +    void *addr;
> +    int ret;
> +
> +    if (!migrate_hugetlb_doublemap()) {
> +        return 0;
> +    }
> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
> +        if (qemu_ram_is_hugetlb(rb)) {
> +            /*
> +             * Firstly, we remap the same ramblock into another range of
> +             * virtual address, so that we can write to the pages without
> +             * touching the page tables that directly mapped for the guest.
> +             */
> +            addr = ramblock_file_map(rb);
> +            if (addr == MAP_FAILED) {
> +                ret = -errno;
> +                error_report("%s: Duplicate mapping for hugetlb ramblock '%s'"
> +                             "failed: %s", __func__, qemu_ram_get_idstr(rb),
> +                             strerror(errno));
> +                return ret;
> +            }
> +            rb->host_mirror = addr;
> +
> +            /*
> +             * We need to make sure we pre-allocate the range with
> +             * hugetlbfs pages before hand, so that all the page fault will
> +             * be trapped as MINOR faults always, rather than MISSING
> +             * faults in userfaultfd.
> +             */
> +            ret = qemu_madvise(addr, rb->mmap_length, QEMU_MADV_POPULATE_WRITE);
> +            if (ret) {
> +                error_report("Failed to populate hugetlb ramblock '%s': "
> +                             "%s", qemu_ram_get_idstr(rb), strerror(-ret));
> +                return ret;
> +            }
> +        }
> +    }
> +
> +    /*
> +     * When reach here, it means we've setup the mirror mapping for all the
> +     * hugetlbfs pages.  Hence when page fault happens, we'll be able to
> +     * resolve page faults using UFFDIO_CONTINUE for hugetlbfs pages, but
> +     * we'll keep using UFFDIO_COPY for anonymous pages.
> +     */
> +    return 0;
> +}
> +
>  /**
>   * ram_load_setup: Setup RAM for migration incoming side
>   *
> @@ -3893,6 +3944,10 @@ static int ram_load_setup(QEMUFile *f, void *opaque)
>          return -1;
>      }
>  
> +    if (migrate_hugetlb_doublemap_init()) {
> +        return -1;
> +    }
> +
>      xbzrle_load_setup();
>      ramblock_recv_map_init();
>  
> @@ -3913,6 +3968,10 @@ static int ram_load_cleanup(void *opaque)
>      RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
>          g_free(rb->receivedmap);
>          rb->receivedmap = NULL;
> +        if (rb->host_mirror) {
> +            munmap(rb->host_mirror, rb->mmap_length);
> +            rb->host_mirror = NULL;
> +        }
>      }
>  
>      return 0;
> -- 
> 2.37.3
>
Juan Quintela Jan. 30, 2023, 5:24 a.m. UTC | #2
Peter Xu <peterx@redhat.com> wrote:
> Add a RAMBlock.host_mirror for all the hugetlbfs backed guest memories.
> It'll be used to remap the same region twice and it'll be used to service
> page faults using UFFDIO_CONTINUE.
>
> To make sure all accesses to these ranges will generate minor page faults
> not missing page faults, we need to pre-allocate the files to make sure
> page cache exist start from the beginning.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>

Reviewed-by: Juan Quintela <quintela@redhat.com>

but what about this change

> ---
>  include/exec/ramblock.h |  7 +++++
>  migration/ram.c         | 59 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 66 insertions(+)
>
> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> index 3f31ce1591..c76683c3c8 100644
> --- a/include/exec/ramblock.h
> +++ b/include/exec/ramblock.h
> @@ -28,6 +28,13 @@ struct RAMBlock {
>      struct rcu_head rcu;
>      struct MemoryRegion *mr;
>      uint8_t *host;
> +    /*
> +     * This is only used for hugetlbfs ramblocks where doublemap is
> +     * enabled.  The pointer is managed by dest host migration code, and
> +     * should be NULL when migration is finished.  On src host, it should
> +     * always be NULL.
> +     */
> +    uint8_t *host_mirror;

I would consider here:

    uint8_t *host_doublemap;

as I have not a small name that means
    uint8_t *host_map_smaller_size_pages;

That explains why we need it.


>      uint8_t *colo_cache; /* For colo, VM's ram cache */
>      ram_addr_t offset;
>      ram_addr_t used_length;
> diff --git a/migration/ram.c b/migration/ram.c
> index 2ebf414f5f..37d7b3553a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -3879,6 +3879,57 @@ void colo_release_ram_cache(void)
>      ram_state_cleanup(&ram_state);
>  }
>  
> +static int migrate_hugetlb_doublemap_init(void)
> +{
> +    RAMBlock *rb;
> +    void *addr;
> +    int ret;

Not initialized variables, remove the last two.

> +    if (!migrate_hugetlb_doublemap()) {
> +        return 0;
> +    }
> +

I would move the declaration of the RAMBlock here.

> +    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
> +        if (qemu_ram_is_hugetlb(rb)) {
> +            /*
> +             * Firstly, we remap the same ramblock into another range of
> +             * virtual address, so that we can write to the pages without
> +             * touching the page tables that directly mapped for the guest.
> +             */
> +            addr = ramblock_file_map(rb);

               void *addr = ramblock_file_map(rb);

> +            if (addr == MAP_FAILED) {
> +                ret = -errno;
                   int ret = -errno;
> +                error_report("%s: Duplicate mapping for hugetlb ramblock '%s'"
> +                             "failed: %s", __func__, qemu_ram_get_idstr(rb),
> +                             strerror(errno));
> +                return ret;
> +            }
> +            rb->host_mirror = addr;
> +
> +            /*
> +             * We need to make sure we pre-allocate the range with
> +             * hugetlbfs pages before hand, so that all the page fault will
> +             * be trapped as MINOR faults always, rather than MISSING
> +             * faults in userfaultfd.
> +             */
> +            ret = qemu_madvise(addr, rb->mmap_length, QEMU_MADV_POPULATE_WRITE);

               int ret = qemu_madvise(addr, rb->mmap_length, QEMU_MADV_POPULATE_WRITE);

> +            if (ret) {
> +                error_report("Failed to populate hugetlb ramblock '%s': "
> +                             "%s", qemu_ram_get_idstr(rb), strerror(-ret));
> +                return ret;
> +            }
> +        }
> +    }
> +
> +    /*
> +     * When reach here, it means we've setup the mirror mapping for all the
> +     * hugetlbfs pages.  Hence when page fault happens, we'll be able to
> +     * resolve page faults using UFFDIO_CONTINUE for hugetlbfs pages, but
> +     * we'll keep using UFFDIO_COPY for anonymous pages.
> +     */
> +    return 0;
> +}
Peter Xu Jan. 30, 2023, 10:35 p.m. UTC | #3
On Mon, Jan 30, 2023 at 06:24:20AM +0100, Juan Quintela wrote:
> Peter Xu <peterx@redhat.com> wrote:
> > Add a RAMBlock.host_mirror for all the hugetlbfs backed guest memories.
> > It'll be used to remap the same region twice and it'll be used to service
> > page faults using UFFDIO_CONTINUE.
> >
> > To make sure all accesses to these ranges will generate minor page faults
> > not missing page faults, we need to pre-allocate the files to make sure
> > page cache exist start from the beginning.
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> Reviewed-by: Juan Quintela <quintela@redhat.com>
> 
> but what about this change
> 
> > ---
> >  include/exec/ramblock.h |  7 +++++
> >  migration/ram.c         | 59 +++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 66 insertions(+)
> >
> > diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> > index 3f31ce1591..c76683c3c8 100644
> > --- a/include/exec/ramblock.h
> > +++ b/include/exec/ramblock.h
> > @@ -28,6 +28,13 @@ struct RAMBlock {
> >      struct rcu_head rcu;
> >      struct MemoryRegion *mr;
> >      uint8_t *host;
> > +    /*
> > +     * This is only used for hugetlbfs ramblocks where doublemap is
> > +     * enabled.  The pointer is managed by dest host migration code, and
> > +     * should be NULL when migration is finished.  On src host, it should
> > +     * always be NULL.
> > +     */
> > +    uint8_t *host_mirror;
> 
> I would consider here:
> 
>     uint8_t *host_doublemap;
> 
> as I have not a small name that means
>     uint8_t *host_map_smaller_size_pages;
> 
> That explains why we need it.

Sure, I can rename this one if it helps.

One thing worth mention is that, it's not mapping things in small page size
here with host_doublemap but in huge page size only.

It's just that UFFDIO_CONTINUE needs another mapping to resolve the page
faults. It'll be the guest hugetlb ramblocks that will be mapped in small
pages during postcopy.

> 
> 
> >      uint8_t *colo_cache; /* For colo, VM's ram cache */
> >      ram_addr_t offset;
> >      ram_addr_t used_length;
> > diff --git a/migration/ram.c b/migration/ram.c
> > index 2ebf414f5f..37d7b3553a 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -3879,6 +3879,57 @@ void colo_release_ram_cache(void)
> >      ram_state_cleanup(&ram_state);
> >  }
> >  
> > +static int migrate_hugetlb_doublemap_init(void)
> > +{
> > +    RAMBlock *rb;
> > +    void *addr;
> > +    int ret;
> 
> Not initialized variables, remove the last two.

I can do this.

> 
> > +    if (!migrate_hugetlb_doublemap()) {
> > +        return 0;
> > +    }
> > +
> 
> I would move the declaration of the RAMBlock here.

But isn't QEMU in most cases declaring variables at the start of any code
block, rather than after or in the middle of any code segments?  IIRC some
compiler should start to fail with it, even though not on the modern ones.

Thanks,
Juan Quintela Feb. 1, 2023, 6:53 p.m. UTC | #4
Peter Xu <peterx@redhat.com> wrote:
> On Mon, Jan 30, 2023 at 06:24:20AM +0100, Juan Quintela wrote:
>> I would consider here:
>> 
>>     uint8_t *host_doublemap;
>> 
>> as I have not a small name that means
>>     uint8_t *host_map_smaller_size_pages;
>> 
>> That explains why we need it.
>
> Sure, I can rename this one if it helps.
>
> One thing worth mention is that, it's not mapping things in small page size
> here with host_doublemap but in huge page size only.

Thanks.


> It's just that UFFDIO_CONTINUE needs another mapping to resolve the page
> faults. It'll be the guest hugetlb ramblocks that will be mapped in small
> pages during postcopy.

ok
>> Not initialized variables, remove the last two.
>
> I can do this.
>
>> 
>> > +    if (!migrate_hugetlb_doublemap()) {
>> > +        return 0;
>> > +    }
>> > +
>> 
>> I would move the declaration of the RAMBlock here.
>
> But isn't QEMU in most cases declaring variables at the start of any code
> block, rather than after or in the middle of any code segments?  IIRC some
> compiler should start to fail with it, even though not on the modern ones.

We can declare variables since c99.  Only 24 years have passed O:-)

Anyways:

Exhibit A: We already have that kind of code

static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
{
    MultiFDPages_t *pages = p->pages;

    for (int i = 0; i < p->normal_num; i++) {
        p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i];
        p->iov[p->iovs_num].iov_len = p->page_size;
        p->iovs_num++;
    }

    p->next_packet_size = p->normal_num * p->page_size;
    p->flags |= MULTIFD_FLAG_NOCOMP;
    return nocomp;
}

Exhibit B:

from configure:

#if defined(__clang_major__) && defined(__clang_minor__)
# ifdef __apple_build_version__
#  if __clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 0)
#   error You need at least XCode Clang v10.0 to compile QEMU
#  endif
# else
#  if __clang_major__ < 6 || (__clang_major__ == 6 && __clang_minor__ < 0)
#   error You need at least Clang v6.0 to compile QEMU
#  endif
# endif
#elif defined(__GNUC__) && defined(__GNUC_MINOR__)
# if __GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ < 4)
#  error You need at least GCC v7.4.0 to compile QEMU
# endif
#else
# error You either need GCC or Clang to compiler QEMU
#endif
int main (void) { return 0; }
EOF

gcc-7.4.0: supports C11, so we are good here
https://gcc.gnu.org/onlinedocs/gcc-7.4.0/gcc/Standards.html#C-Language

clang 6.0: supports c11 and c17 standard
https://releases.llvm.org/6.0.0/tools/clang/docs/ReleaseNotes.html


So as far as I can see, we are good here.

Later, Juan.
Peter Xu Feb. 6, 2023, 9:40 p.m. UTC | #5
On Wed, Feb 01, 2023 at 07:53:28PM +0100, Juan Quintela wrote:
> Peter Xu <peterx@redhat.com> wrote:
> > On Mon, Jan 30, 2023 at 06:24:20AM +0100, Juan Quintela wrote:
> >> I would consider here:
> >> 
> >>     uint8_t *host_doublemap;
> >> 
> >> as I have not a small name that means
> >>     uint8_t *host_map_smaller_size_pages;
> >> 
> >> That explains why we need it.
> >
> > Sure, I can rename this one if it helps.
> >
> > One thing worth mention is that, it's not mapping things in small page size
> > here with host_doublemap but in huge page size only.
> 
> Thanks.
> 
> 
> > It's just that UFFDIO_CONTINUE needs another mapping to resolve the page
> > faults. It'll be the guest hugetlb ramblocks that will be mapped in small
> > pages during postcopy.
> 
> ok
> >> Not initialized variables, remove the last two.
> >
> > I can do this.
> >
> >> 
> >> > +    if (!migrate_hugetlb_doublemap()) {
> >> > +        return 0;
> >> > +    }
> >> > +
> >> 
> >> I would move the declaration of the RAMBlock here.
> >
> > But isn't QEMU in most cases declaring variables at the start of any code
> > block, rather than after or in the middle of any code segments?  IIRC some
> > compiler should start to fail with it, even though not on the modern ones.
> 
> We can declare variables since c99.  Only 24 years have passed O:-)

Oh OK :)

> 
> Anyways:
> 
> Exhibit A: We already have that kind of code
> 
> static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> {
>     MultiFDPages_t *pages = p->pages;
> 
>     for (int i = 0; i < p->normal_num; i++) {
>         p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i];
>         p->iov[p->iovs_num].iov_len = p->page_size;
>         p->iovs_num++;
>     }
> 
>     p->next_packet_size = p->normal_num * p->page_size;
>     p->flags |= MULTIFD_FLAG_NOCOMP;
>     return nocomp;
> }
> 
> Exhibit B:
> 
> from configure:
> 
> #if defined(__clang_major__) && defined(__clang_minor__)
> # ifdef __apple_build_version__
> #  if __clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 0)
> #   error You need at least XCode Clang v10.0 to compile QEMU
> #  endif
> # else
> #  if __clang_major__ < 6 || (__clang_major__ == 6 && __clang_minor__ < 0)
> #   error You need at least Clang v6.0 to compile QEMU
> #  endif
> # endif
> #elif defined(__GNUC__) && defined(__GNUC_MINOR__)
> # if __GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ < 4)
> #  error You need at least GCC v7.4.0 to compile QEMU
> # endif
> #else
> # error You either need GCC or Clang to compiler QEMU
> #endif
> int main (void) { return 0; }
> EOF
> 
> gcc-7.4.0: supports C11, so we are good here
> https://gcc.gnu.org/onlinedocs/gcc-7.4.0/gcc/Standards.html#C-Language
> 
> clang 6.0: supports c11 and c17 standard
> https://releases.llvm.org/6.0.0/tools/clang/docs/ReleaseNotes.html
> 
> 
> So as far as I can see, we are good here.

Thanks, I'll switch over.
diff mbox series

Patch

diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 3f31ce1591..c76683c3c8 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -28,6 +28,13 @@  struct RAMBlock {
     struct rcu_head rcu;
     struct MemoryRegion *mr;
     uint8_t *host;
+    /*
+     * This is only used for hugetlbfs ramblocks where doublemap is
+     * enabled.  The pointer is managed by dest host migration code, and
+     * should be NULL when migration is finished.  On src host, it should
+     * always be NULL.
+     */
+    uint8_t *host_mirror;
     uint8_t *colo_cache; /* For colo, VM's ram cache */
     ram_addr_t offset;
     ram_addr_t used_length;
diff --git a/migration/ram.c b/migration/ram.c
index 2ebf414f5f..37d7b3553a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3879,6 +3879,57 @@  void colo_release_ram_cache(void)
     ram_state_cleanup(&ram_state);
 }
 
+static int migrate_hugetlb_doublemap_init(void)
+{
+    RAMBlock *rb;
+    void *addr;
+    int ret;
+
+    if (!migrate_hugetlb_doublemap()) {
+        return 0;
+    }
+
+    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
+        if (qemu_ram_is_hugetlb(rb)) {
+            /*
+             * Firstly, we remap the same ramblock into another range of
+             * virtual address, so that we can write to the pages without
+             * touching the page tables that directly mapped for the guest.
+             */
+            addr = ramblock_file_map(rb);
+            if (addr == MAP_FAILED) {
+                ret = -errno;
+                error_report("%s: Duplicate mapping for hugetlb ramblock '%s'"
+                             "failed: %s", __func__, qemu_ram_get_idstr(rb),
+                             strerror(errno));
+                return ret;
+            }
+            rb->host_mirror = addr;
+
+            /*
+             * We need to make sure we pre-allocate the range with
+             * hugetlbfs pages before hand, so that all the page fault will
+             * be trapped as MINOR faults always, rather than MISSING
+             * faults in userfaultfd.
+             */
+            ret = qemu_madvise(addr, rb->mmap_length, QEMU_MADV_POPULATE_WRITE);
+            if (ret) {
+                error_report("Failed to populate hugetlb ramblock '%s': "
+                             "%s", qemu_ram_get_idstr(rb), strerror(-ret));
+                return ret;
+            }
+        }
+    }
+
+    /*
+     * When reach here, it means we've setup the mirror mapping for all the
+     * hugetlbfs pages.  Hence when page fault happens, we'll be able to
+     * resolve page faults using UFFDIO_CONTINUE for hugetlbfs pages, but
+     * we'll keep using UFFDIO_COPY for anonymous pages.
+     */
+    return 0;
+}
+
 /**
  * ram_load_setup: Setup RAM for migration incoming side
  *
@@ -3893,6 +3944,10 @@  static int ram_load_setup(QEMUFile *f, void *opaque)
         return -1;
     }
 
+    if (migrate_hugetlb_doublemap_init()) {
+        return -1;
+    }
+
     xbzrle_load_setup();
     ramblock_recv_map_init();
 
@@ -3913,6 +3968,10 @@  static int ram_load_cleanup(void *opaque)
     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
         g_free(rb->receivedmap);
         rb->receivedmap = NULL;
+        if (rb->host_mirror) {
+            munmap(rb->host_mirror, rb->mmap_length);
+            rb->host_mirror = NULL;
+        }
     }
 
     return 0;