diff mbox series

[v3,03/20] multifd: Zero pages transmission

Message ID 20240104004452.324068-4-hao.xiang@bytedance.com (mailing list archive)
State New, archived
Headers show
Series Use Intel DSA accelerator to offload zero page checking in multifd live migration. | expand

Commit Message

Hao Xiang Jan. 4, 2024, 12:44 a.m. UTC
From: Juan Quintela <quintela@redhat.com>

This implements the zero page dection and handling.

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 migration/multifd.c | 41 +++++++++++++++++++++++++++++++++++++++--
 migration/multifd.h |  5 +++++
 2 files changed, 44 insertions(+), 2 deletions(-)

Comments

Shivam Kumar Jan. 15, 2024, 7:01 a.m. UTC | #1
> On 04-Jan-2024, at 6:14 AM, Hao Xiang <hao.xiang@bytedance.com> wrote:
> 
> From: Juan Quintela <quintela@redhat.com>
> 
> This implements the zero page dection and handling.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> ---
> migration/multifd.c | 41 +++++++++++++++++++++++++++++++++++++++--
> migration/multifd.h |  5 +++++
> 2 files changed, 44 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 5a1f50c7e8..756673029d 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -11,6 +11,7 @@
>  */
> 
> #include "qemu/osdep.h"
> +#include "qemu/cutils.h"
> #include "qemu/rcu.h"
> #include "exec/target_page.h"
> #include "sysemu/sysemu.h"
> @@ -279,6 +280,12 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
> 
>         packet->offset[i] = cpu_to_be64(temp);
>     }
> +    for (i = 0; i < p->zero_num; i++) {
> +        /* there are architectures where ram_addr_t is 32 bit */
> +        uint64_t temp = p->zero[i];
> +
> +        packet->offset[p->normal_num + i] = cpu_to_be64(temp);
> +    }
> }
> 
> static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> @@ -361,6 +368,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>         p->normal[i] = offset;
>     }
> 
> +    for (i = 0; i < p->zero_num; i++) {
> +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> +
> +        if (offset > (p->block->used_length - p->page_size)) {
> +            error_setg(errp, "multifd: offset too long %" PRIu64
> +                       " (max " RAM_ADDR_FMT ")",
> +                       offset, p->block->used_length);
> +            return -1;
> +        }
> +        p->zero[i] = offset;
> +    }
> +
>     return 0;
> }
> 
> @@ -664,6 +683,8 @@ static void *multifd_send_thread(void *opaque)
>     MultiFDSendParams *p = opaque;
>     MigrationThread *thread = NULL;
>     Error *local_err = NULL;
> +    /* qemu older than 8.2 don't understand zero page on multifd channel */
> +    bool use_zero_page = !migrate_use_main_zero_page();
>     int ret = 0;
>     bool use_zero_copy_send = migrate_zero_copy_send();
> 
> @@ -689,6 +710,7 @@ static void *multifd_send_thread(void *opaque)
>         qemu_mutex_lock(&p->mutex);
> 
>         if (p->pending_job) {
> +            RAMBlock *rb = p->pages->block;
>             uint64_t packet_num = p->packet_num;
>             uint32_t flags;
>             p->normal_num = 0;
> @@ -701,8 +723,16 @@ static void *multifd_send_thread(void *opaque)
>             }
> 
>             for (int i = 0; i < p->pages->num; i++) {
> -                p->normal[p->normal_num] = p->pages->offset[i];
> -                p->normal_num++;
> +                uint64_t offset = p->pages->offset[i];
> +                if (use_zero_page &&
> +                    buffer_is_zero(rb->host + offset, p->page_size)) {
> +                    p->zero[p->zero_num] = offset;
> +                    p->zero_num++;
> +                    ram_release_page(rb->idstr, offset);
> +                } else {
> +                    p->normal[p->normal_num] = offset;
> +                    p->normal_num++;
> +                }
>             }
> 
>             if (p->normal_num) {
> @@ -1155,6 +1185,13 @@ static void *multifd_recv_thread(void *opaque)
>             }
>         }
> 
> +        for (int i = 0; i < p->zero_num; i++) {
> +            void *page = p->host + p->zero[i];
> +            if (!buffer_is_zero(page, p->page_size)) {
> +                memset(page, 0, p->page_size);
> +            }
> +        }
> +
I am wondering if zeroing the zero page on the destination can also be offloaded to DSA. Can it help in reducing cpu consumption on the destination in case of multifd-based migration?
>         if (flags & MULTIFD_FLAG_SYNC) {
>             qemu_sem_post(&multifd_recv_state->sem_sync);
>             qemu_sem_wait(&p->sem_sync);
> diff --git a/migration/multifd.h b/migration/multifd.h
> index d587b0e19c..13762900d4 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -53,6 +53,11 @@ typedef struct {
>     uint32_t unused32[1];    /* Reserved for future use */
>     uint64_t unused64[3];    /* Reserved for future use */
>     char ramblock[256];
> +    /*
> +     * This array contains the pointers to:
> +     *  - normal pages (initial normal_pages entries)
> +     *  - zero pages (following zero_pages entries)
> +     */
>     uint64_t offset[];
> } __attribute__((packed)) MultiFDPacket_t;
> 
> -- 
> 2.30.2
> 
> 
>
Hao Xiang Jan. 23, 2024, 12:46 a.m. UTC | #2
On Sun, Jan 14, 2024 at 11:01 PM Shivam Kumar <shivam.kumar1@nutanix.com> wrote:
>
>
>
> > On 04-Jan-2024, at 6:14 AM, Hao Xiang <hao.xiang@bytedance.com> wrote:
> >
> > From: Juan Quintela <quintela@redhat.com>
> >
> > This implements the zero page dection and handling.
> >
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > ---
> > migration/multifd.c | 41 +++++++++++++++++++++++++++++++++++++++--
> > migration/multifd.h |  5 +++++
> > 2 files changed, 44 insertions(+), 2 deletions(-)
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 5a1f50c7e8..756673029d 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -11,6 +11,7 @@
> >  */
> >
> > #include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> > #include "qemu/rcu.h"
> > #include "exec/target_page.h"
> > #include "sysemu/sysemu.h"
> > @@ -279,6 +280,12 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
> >
> >         packet->offset[i] = cpu_to_be64(temp);
> >     }
> > +    for (i = 0; i < p->zero_num; i++) {
> > +        /* there are architectures where ram_addr_t is 32 bit */
> > +        uint64_t temp = p->zero[i];
> > +
> > +        packet->offset[p->normal_num + i] = cpu_to_be64(temp);
> > +    }
> > }
> >
> > static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> > @@ -361,6 +368,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >         p->normal[i] = offset;
> >     }
> >
> > +    for (i = 0; i < p->zero_num; i++) {
> > +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> > +
> > +        if (offset > (p->block->used_length - p->page_size)) {
> > +            error_setg(errp, "multifd: offset too long %" PRIu64
> > +                       " (max " RAM_ADDR_FMT ")",
> > +                       offset, p->block->used_length);
> > +            return -1;
> > +        }
> > +        p->zero[i] = offset;
> > +    }
> > +
> >     return 0;
> > }
> >
> > @@ -664,6 +683,8 @@ static void *multifd_send_thread(void *opaque)
> >     MultiFDSendParams *p = opaque;
> >     MigrationThread *thread = NULL;
> >     Error *local_err = NULL;
> > +    /* qemu older than 8.2 don't understand zero page on multifd channel */
> > +    bool use_zero_page = !migrate_use_main_zero_page();
> >     int ret = 0;
> >     bool use_zero_copy_send = migrate_zero_copy_send();
> >
> > @@ -689,6 +710,7 @@ static void *multifd_send_thread(void *opaque)
> >         qemu_mutex_lock(&p->mutex);
> >
> >         if (p->pending_job) {
> > +            RAMBlock *rb = p->pages->block;
> >             uint64_t packet_num = p->packet_num;
> >             uint32_t flags;
> >             p->normal_num = 0;
> > @@ -701,8 +723,16 @@ static void *multifd_send_thread(void *opaque)
> >             }
> >
> >             for (int i = 0; i < p->pages->num; i++) {
> > -                p->normal[p->normal_num] = p->pages->offset[i];
> > -                p->normal_num++;
> > +                uint64_t offset = p->pages->offset[i];
> > +                if (use_zero_page &&
> > +                    buffer_is_zero(rb->host + offset, p->page_size)) {
> > +                    p->zero[p->zero_num] = offset;
> > +                    p->zero_num++;
> > +                    ram_release_page(rb->idstr, offset);
> > +                } else {
> > +                    p->normal[p->normal_num] = offset;
> > +                    p->normal_num++;
> > +                }
> >             }
> >
> >             if (p->normal_num) {
> > @@ -1155,6 +1185,13 @@ static void *multifd_recv_thread(void *opaque)
> >             }
> >         }
> >
> > +        for (int i = 0; i < p->zero_num; i++) {
> > +            void *page = p->host + p->zero[i];
> > +            if (!buffer_is_zero(page, p->page_size)) {
> > +                memset(page, 0, p->page_size);
> > +            }
> > +        }
> > +
> I am wondering if zeroing the zero page on the destination can also be offloaded to DSA. Can it help in reducing cpu consumption on the destination in case of multifd-based migration?

I have that change coded up and I have tested for performance. It's
not saving as much CPU cycles as I hoped. The problem is that on the
sender side, we run zero page detection on every single page but on
the receiver side, we only zero-ing the pages if the sender tells us
so. For instance, if a multifd packet with 128 pages are all zero
pages, we can offload the zero-ing pages on the entire 128 pages in a
single DSA operation and that's the best case. In a worse case, if a
multifd packet with 128 pages only has say 10 zero pages, we can
offload the zero-ing pages on only the 10 pages instead of the entire
128 pages. Considering DSA software overhead, that is not a good deal.

In the future, if we refactor the multifd path to separate zero pages
in its own packet, it will be a different story. For instance, if
there are 1024 non-contiguous zero pages detected, they will be sent
across multiple packets. With a new packet format, those pages can be
put into the same packet (and we can put more than 1024 zero pages in
a packet) and DSA offloading can become much more effective. We can
add that function after we have the new packet format implemented.

> >         if (flags & MULTIFD_FLAG_SYNC) {
> >             qemu_sem_post(&multifd_recv_state->sem_sync);
> >             qemu_sem_wait(&p->sem_sync);
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index d587b0e19c..13762900d4 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -53,6 +53,11 @@ typedef struct {
> >     uint32_t unused32[1];    /* Reserved for future use */
> >     uint64_t unused64[3];    /* Reserved for future use */
> >     char ramblock[256];
> > +    /*
> > +     * This array contains the pointers to:
> > +     *  - normal pages (initial normal_pages entries)
> > +     *  - zero pages (following zero_pages entries)
> > +     */
> >     uint64_t offset[];
> > } __attribute__((packed)) MultiFDPacket_t;
> >
> > --
> > 2.30.2
> >
> >
> >
>
Peter Xu Feb. 1, 2024, 5:22 a.m. UTC | #3
On Thu, Jan 04, 2024 at 12:44:35AM +0000, Hao Xiang wrote:
> From: Juan Quintela <quintela@redhat.com>
> 
> This implements the zero page dection and handling.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> ---
>  migration/multifd.c | 41 +++++++++++++++++++++++++++++++++++++++--
>  migration/multifd.h |  5 +++++
>  2 files changed, 44 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 5a1f50c7e8..756673029d 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -11,6 +11,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/cutils.h"
>  #include "qemu/rcu.h"
>  #include "exec/target_page.h"
>  #include "sysemu/sysemu.h"
> @@ -279,6 +280,12 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
>  
>          packet->offset[i] = cpu_to_be64(temp);
>      }
> +    for (i = 0; i < p->zero_num; i++) {
> +        /* there are architectures where ram_addr_t is 32 bit */
> +        uint64_t temp = p->zero[i];
> +
> +        packet->offset[p->normal_num + i] = cpu_to_be64(temp);
> +    }
>  }

I think changes like this needs to be moved into the previous patch.  I got
quite confused when reading previous one and only understood what happens
until now.  Fabiano, if you're going to pick these ones out and post
separately, please also consider.  Perhaps squashing them together?
Hao Xiang Feb. 1, 2024, 11:24 p.m. UTC | #4
On Wed, Jan 31, 2024 at 9:22 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Jan 04, 2024 at 12:44:35AM +0000, Hao Xiang wrote:
> > From: Juan Quintela <quintela@redhat.com>
> >
> > This implements the zero page dection and handling.
> >
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > ---
> >  migration/multifd.c | 41 +++++++++++++++++++++++++++++++++++++++--
> >  migration/multifd.h |  5 +++++
> >  2 files changed, 44 insertions(+), 2 deletions(-)
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 5a1f50c7e8..756673029d 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -11,6 +11,7 @@
> >   */
> >
> >  #include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> >  #include "qemu/rcu.h"
> >  #include "exec/target_page.h"
> >  #include "sysemu/sysemu.h"
> > @@ -279,6 +280,12 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
> >
> >          packet->offset[i] = cpu_to_be64(temp);
> >      }
> > +    for (i = 0; i < p->zero_num; i++) {
> > +        /* there are architectures where ram_addr_t is 32 bit */
> > +        uint64_t temp = p->zero[i];
> > +
> > +        packet->offset[p->normal_num + i] = cpu_to_be64(temp);
> > +    }
> >  }
>
> I think changes like this needs to be moved into the previous patch.  I got
> quite confused when reading previous one and only understood what happens
> until now.  Fabiano, if you're going to pick these ones out and post
> separately, please also consider.  Perhaps squashing them together?
>

Discussed with Fabiano on a separate thread here
https://lore.kernel.org/all/CAAYibXi=WB5wfvLFM0b=d9oJf66Lb7FTGoNzzZ-tvK4RbBXxDw@mail.gmail.com/

I am moving the original multifd zero page checking changes into a
seperate patchset. There is some necessary refactoring work on the top
of the original series. I will send that out this week.
> --
> Peter Xu
>
diff mbox series

Patch

diff --git a/migration/multifd.c b/migration/multifd.c
index 5a1f50c7e8..756673029d 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -11,6 +11,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qemu/rcu.h"
 #include "exec/target_page.h"
 #include "sysemu/sysemu.h"
@@ -279,6 +280,12 @@  static void multifd_send_fill_packet(MultiFDSendParams *p)
 
         packet->offset[i] = cpu_to_be64(temp);
     }
+    for (i = 0; i < p->zero_num; i++) {
+        /* there are architectures where ram_addr_t is 32 bit */
+        uint64_t temp = p->zero[i];
+
+        packet->offset[p->normal_num + i] = cpu_to_be64(temp);
+    }
 }
 
 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
@@ -361,6 +368,18 @@  static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
         p->normal[i] = offset;
     }
 
+    for (i = 0; i < p->zero_num; i++) {
+        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
+
+        if (offset > (p->block->used_length - p->page_size)) {
+            error_setg(errp, "multifd: offset too long %" PRIu64
+                       " (max " RAM_ADDR_FMT ")",
+                       offset, p->block->used_length);
+            return -1;
+        }
+        p->zero[i] = offset;
+    }
+
     return 0;
 }
 
@@ -664,6 +683,8 @@  static void *multifd_send_thread(void *opaque)
     MultiFDSendParams *p = opaque;
     MigrationThread *thread = NULL;
     Error *local_err = NULL;
+    /* qemu older than 8.2 don't understand zero page on multifd channel */
+    bool use_zero_page = !migrate_use_main_zero_page();
     int ret = 0;
     bool use_zero_copy_send = migrate_zero_copy_send();
 
@@ -689,6 +710,7 @@  static void *multifd_send_thread(void *opaque)
         qemu_mutex_lock(&p->mutex);
 
         if (p->pending_job) {
+            RAMBlock *rb = p->pages->block;
             uint64_t packet_num = p->packet_num;
             uint32_t flags;
             p->normal_num = 0;
@@ -701,8 +723,16 @@  static void *multifd_send_thread(void *opaque)
             }
 
             for (int i = 0; i < p->pages->num; i++) {
-                p->normal[p->normal_num] = p->pages->offset[i];
-                p->normal_num++;
+                uint64_t offset = p->pages->offset[i];
+                if (use_zero_page &&
+                    buffer_is_zero(rb->host + offset, p->page_size)) {
+                    p->zero[p->zero_num] = offset;
+                    p->zero_num++;
+                    ram_release_page(rb->idstr, offset);
+                } else {
+                    p->normal[p->normal_num] = offset;
+                    p->normal_num++;
+                }
             }
 
             if (p->normal_num) {
@@ -1155,6 +1185,13 @@  static void *multifd_recv_thread(void *opaque)
             }
         }
 
+        for (int i = 0; i < p->zero_num; i++) {
+            void *page = p->host + p->zero[i];
+            if (!buffer_is_zero(page, p->page_size)) {
+                memset(page, 0, p->page_size);
+            }
+        }
+
         if (flags & MULTIFD_FLAG_SYNC) {
             qemu_sem_post(&multifd_recv_state->sem_sync);
             qemu_sem_wait(&p->sem_sync);
diff --git a/migration/multifd.h b/migration/multifd.h
index d587b0e19c..13762900d4 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -53,6 +53,11 @@  typedef struct {
     uint32_t unused32[1];    /* Reserved for future use */
     uint64_t unused64[3];    /* Reserved for future use */
     char ramblock[256];
+    /*
+     * This array contains the pointers to:
+     *  - normal pages (initial normal_pages entries)
+     *  - zero pages (following zero_pages entries)
+     */
     uint64_t offset[];
 } __attribute__((packed)) MultiFDPacket_t;