diff mbox series

[v2,12/17] migration/multifd: Device state transfer support - send side

Message ID fdcfd68dfcf3b20278a4495eb639905b2a8e8ff3.1724701542.git.maciej.szmigiero@oracle.com (mailing list archive)
State New
Headers show
Series Multifd | expand

Commit Message

Maciej S. Szmigiero Aug. 27, 2024, 5:54 p.m. UTC
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>

A new function multifd_queue_device_state() is provided for device to queue
its state for transmission via a multifd channel.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
---
 include/migration/misc.h         |  4 ++
 migration/meson.build            |  1 +
 migration/multifd-device-state.c | 99 ++++++++++++++++++++++++++++++++
 migration/multifd-nocomp.c       |  6 +-
 migration/multifd-qpl.c          |  2 +-
 migration/multifd-uadk.c         |  2 +-
 migration/multifd-zlib.c         |  2 +-
 migration/multifd-zstd.c         |  2 +-
 migration/multifd.c              | 65 +++++++++++++++------
 migration/multifd.h              | 29 +++++++++-
 10 files changed, 184 insertions(+), 28 deletions(-)
 create mode 100644 migration/multifd-device-state.c

Comments

Fabiano Rosas Aug. 29, 2024, 12:41 a.m. UTC | #1
"Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes:

> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>
> A new function multifd_queue_device_state() is provided for device to queue
> its state for transmission via a multifd channel.
>
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
>  include/migration/misc.h         |  4 ++
>  migration/meson.build            |  1 +
>  migration/multifd-device-state.c | 99 ++++++++++++++++++++++++++++++++
>  migration/multifd-nocomp.c       |  6 +-
>  migration/multifd-qpl.c          |  2 +-
>  migration/multifd-uadk.c         |  2 +-
>  migration/multifd-zlib.c         |  2 +-
>  migration/multifd-zstd.c         |  2 +-
>  migration/multifd.c              | 65 +++++++++++++++------
>  migration/multifd.h              | 29 +++++++++-
>  10 files changed, 184 insertions(+), 28 deletions(-)
>  create mode 100644 migration/multifd-device-state.c
>
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index bfadc5613bac..7266b1b77d1f 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -111,4 +111,8 @@ bool migration_in_bg_snapshot(void);
>  /* migration/block-dirty-bitmap.c */
>  void dirty_bitmap_mig_init(void);
>  
> +/* migration/multifd-device-state.c */
> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> +                                char *data, size_t len);
> +
>  #endif
> diff --git a/migration/meson.build b/migration/meson.build
> index 77f3abf08eb1..00853595894f 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -21,6 +21,7 @@ system_ss.add(files(
>    'migration-hmp-cmds.c',
>    'migration.c',
>    'multifd.c',
> +  'multifd-device-state.c',
>    'multifd-nocomp.c',
>    'multifd-zlib.c',
>    'multifd-zero-page.c',
> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
> new file mode 100644
> index 000000000000..c9b44f0b5ab9
> --- /dev/null
> +++ b/migration/multifd-device-state.c
> @@ -0,0 +1,99 @@
> +/*
> + * Multifd device state migration
> + *
> + * Copyright (C) 2024 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/lockable.h"
> +#include "migration/misc.h"
> +#include "multifd.h"
> +
> +static QemuMutex queue_job_mutex;
> +
> +static MultiFDSendData *device_state_send;
> +
> +size_t multifd_device_state_payload_size(void)
> +{
> +    return sizeof(MultiFDDeviceState_t);
> +}

This will not be necessary because the payload size is the same as the
data type. We only need it for the special case where the MultiFDPages_t
is smaller than the total ram payload size.

> +
> +void multifd_device_state_save_setup(void)

s/save/send/. The ram ones are only called "save" because they're called
from ram_save_setup(), but we then have the proper nocomp_send_setup
hook.

> +{
> +    qemu_mutex_init(&queue_job_mutex);
> +
> +    device_state_send = multifd_send_data_alloc();
> +}
> +
> +void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
> +{
> +    g_clear_pointer(&device_state->idstr, g_free);
> +    g_clear_pointer(&device_state->buf, g_free);
> +}
> +
> +void multifd_device_state_save_cleanup(void)

s/save/send/

> +{
> +    g_clear_pointer(&device_state_send, multifd_send_data_free);
> +
> +    qemu_mutex_destroy(&queue_job_mutex);
> +}
> +
> +static void multifd_device_state_fill_packet(MultiFDSendParams *p)
> +{
> +    MultiFDDeviceState_t *device_state = &p->data->u.device_state;
> +    MultiFDPacketDeviceState_t *packet = p->packet_device_state;
> +
> +    packet->hdr.flags = cpu_to_be32(p->flags);
> +    strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr));
> +    packet->instance_id = cpu_to_be32(device_state->instance_id);
> +    packet->next_packet_size = cpu_to_be32(p->next_packet_size);
> +}
> +
> +void multifd_device_state_send_prepare(MultiFDSendParams *p)
> +{
> +    MultiFDDeviceState_t *device_state = &p->data->u.device_state;
> +
> +    assert(multifd_payload_device_state(p->data));
> +
> +    multifd_send_prepare_header_device_state(p);
> +
> +    assert(!(p->flags & MULTIFD_FLAG_SYNC));
> +
> +    p->next_packet_size = device_state->buf_len;
> +    if (p->next_packet_size > 0) {
> +        p->iov[p->iovs_num].iov_base = device_state->buf;
> +        p->iov[p->iovs_num].iov_len = p->next_packet_size;
> +        p->iovs_num++;
> +    }
> +
> +    p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE;
> +
> +    multifd_device_state_fill_packet(p);
> +}
> +
> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> +                                char *data, size_t len)
> +{
> +    /* Device state submissions can come from multiple threads */
> +    QEMU_LOCK_GUARD(&queue_job_mutex);
> +    MultiFDDeviceState_t *device_state;
> +
> +    assert(multifd_payload_empty(device_state_send));
> +
> +    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
> +    device_state = &device_state_send->u.device_state;
> +    device_state->idstr = g_strdup(idstr);
> +    device_state->instance_id = instance_id;
> +    device_state->buf = g_memdup2(data, len);
> +    device_state->buf_len = len;
> +
> +    if (!multifd_send(&device_state_send)) {
> +        multifd_send_data_clear(device_state_send);
> +        return false;
> +    }
> +
> +    return true;
> +}
> diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> index 39eb77c9b3b7..0b7b543f44db 100644
> --- a/migration/multifd-nocomp.c
> +++ b/migration/multifd-nocomp.c
> @@ -116,13 +116,13 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>           * Only !zerocopy needs the header in IOV; zerocopy will
>           * send it separately.
>           */
> -        multifd_send_prepare_header(p);
> +        multifd_send_prepare_header_ram(p);
>      }
>  
>      multifd_send_prepare_iovs(p);
>      p->flags |= MULTIFD_FLAG_NOCOMP;
>  
> -    multifd_send_fill_packet(p);
> +    multifd_send_fill_packet_ram(p);
>  
>      if (use_zero_copy_send) {
>          /* Send header first, without zerocopy */
> @@ -371,7 +371,7 @@ bool multifd_send_prepare_common(MultiFDSendParams *p)
>          return false;
>      }
>  
> -    multifd_send_prepare_header(p);
> +    multifd_send_prepare_header_ram(p);
>  
>      return true;
>  }
> diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> index 75041a4c4dfe..bd6b5b6a3868 100644
> --- a/migration/multifd-qpl.c
> +++ b/migration/multifd-qpl.c
> @@ -490,7 +490,7 @@ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp)
>  
>  out:
>      p->flags |= MULTIFD_FLAG_QPL;
> -    multifd_send_fill_packet(p);
> +    multifd_send_fill_packet_ram(p);
>      return 0;
>  }
>  
> diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c
> index db2549f59bfe..6e2d26010742 100644
> --- a/migration/multifd-uadk.c
> +++ b/migration/multifd-uadk.c
> @@ -198,7 +198,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp)
>      }
>  out:
>      p->flags |= MULTIFD_FLAG_UADK;
> -    multifd_send_fill_packet(p);
> +    multifd_send_fill_packet_ram(p);
>      return 0;
>  }
>  
> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> index 6787538762d2..62a1fe59ad3e 100644
> --- a/migration/multifd-zlib.c
> +++ b/migration/multifd-zlib.c
> @@ -156,7 +156,7 @@ static int multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>  
>  out:
>      p->flags |= MULTIFD_FLAG_ZLIB;
> -    multifd_send_fill_packet(p);
> +    multifd_send_fill_packet_ram(p);
>      return 0;
>  }
>  
> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> index 1576b1e2adc6..f98b07e7f9f5 100644
> --- a/migration/multifd-zstd.c
> +++ b/migration/multifd-zstd.c
> @@ -143,7 +143,7 @@ static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>  
>  out:
>      p->flags |= MULTIFD_FLAG_ZSTD;
> -    multifd_send_fill_packet(p);
> +    multifd_send_fill_packet_ram(p);
>      return 0;
>  }
>  
> diff --git a/migration/multifd.c b/migration/multifd.c
> index a74e8a5cc891..bebe5b5a9b9c 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -12,6 +12,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/cutils.h"
> +#include "qemu/iov.h"
>  #include "qemu/rcu.h"
>  #include "exec/target_page.h"
>  #include "sysemu/sysemu.h"
> @@ -19,6 +20,7 @@
>  #include "qemu/error-report.h"
>  #include "qapi/error.h"
>  #include "file.h"
> +#include "migration/misc.h"
>  #include "migration.h"
>  #include "migration-stats.h"
>  #include "savevm.h"
> @@ -107,7 +109,9 @@ MultiFDSendData *multifd_send_data_alloc(void)
>       * added to the union in the future are larger than
>       * (MultiFDPages_t + flex array).
>       */
> -    max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload));
> +    max_payload_size = MAX(multifd_ram_payload_size(),
> +                           multifd_device_state_payload_size());

This is not needed, the sizeof(MultiFDPayload) below already has the
same effect.

> +    max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload));
>  
>      /*
>       * Account for any holes the compiler might insert. We can't pack
> @@ -126,6 +130,9 @@ void multifd_send_data_clear(MultiFDSendData *data)
>      }
>  
>      switch (data->type) {
> +    case MULTIFD_PAYLOAD_DEVICE_STATE:
> +        multifd_device_state_clear(&data->u.device_state);
> +        break;
>      default:
>          /* Nothing to do */
>          break;
> @@ -228,7 +235,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
>      return msg.id;
>  }
>  
> -void multifd_send_fill_packet(MultiFDSendParams *p)
> +void multifd_send_fill_packet_ram(MultiFDSendParams *p)

Do we need this change if there's no counterpart for device_state? It
might be less confusing to just leave this one as it is.

>  {
>      MultiFDPacket_t *packet = p->packet;
>      uint64_t packet_num;
> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data)
>  
>          p = &multifd_send_state->params[i];
>          /*
> -         * Lockless read to p->pending_job is safe, because only multifd
> -         * sender thread can clear it.
> +         * Lockless RMW on p->pending_job_preparing is safe, because only multifd
> +         * sender thread can clear it after it had seen p->pending_job being set.
> +         *
> +         * Pairs with qatomic_store_release() in multifd_send_thread().
>           */
> -        if (qatomic_read(&p->pending_job) == false) {
> +        if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) {

What's the motivation for this change? It would be better to have it in
a separate patch with a proper justification.

>              break;
>          }
>      }
>  
> -    /*
> -     * Make sure we read p->pending_job before all the rest.  Pairs with
> -     * qatomic_store_release() in multifd_send_thread().
> -     */
> -    smp_mb_acquire();
> -
>      assert(multifd_payload_empty(p->data));
>  
>      /*
> @@ -534,6 +537,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
>      p->name = NULL;
>      g_clear_pointer(&p->data, multifd_send_data_free);
>      p->packet_len = 0;
> +    g_clear_pointer(&p->packet_device_state, g_free);
>      g_free(p->packet);
>      p->packet = NULL;
>      multifd_send_state->ops->send_cleanup(p, errp);
> @@ -545,6 +549,7 @@ static void multifd_send_cleanup_state(void)
>  {
>      file_cleanup_outgoing_migration();
>      socket_cleanup_outgoing_migration();
> +    multifd_device_state_save_cleanup();
>      qemu_sem_destroy(&multifd_send_state->channels_created);
>      qemu_sem_destroy(&multifd_send_state->channels_ready);
>      g_free(multifd_send_state->params);
> @@ -670,19 +675,29 @@ static void *multifd_send_thread(void *opaque)
>           * qatomic_store_release() in multifd_send().
>           */
>          if (qatomic_load_acquire(&p->pending_job)) {
> +            bool is_device_state = multifd_payload_device_state(p->data);
> +            size_t total_size;
> +
>              p->flags = 0;
>              p->iovs_num = 0;
>              assert(!multifd_payload_empty(p->data));
>  
> -            ret = multifd_send_state->ops->send_prepare(p, &local_err);
> -            if (ret != 0) {
> -                break;
> +            if (is_device_state) {
> +                multifd_device_state_send_prepare(p);
> +            } else {
> +                ret = multifd_send_state->ops->send_prepare(p, &local_err);
> +                if (ret != 0) {
> +                    break;
> +                }
>              }
>  
>              if (migrate_mapped_ram()) {
> +                assert(!is_device_state);
> +
>                  ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
>                                                &p->data->u.ram, &local_err);
>              } else {
> +                total_size = iov_size(p->iov, p->iovs_num);
>                  ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
>                                                    NULL, 0, p->write_flags,
>                                                    &local_err);
> @@ -692,18 +707,27 @@ static void *multifd_send_thread(void *opaque)
>                  break;
>              }
>  
> -            stat64_add(&mig_stats.multifd_bytes,
> -                       p->next_packet_size + p->packet_len);
> +            if (is_device_state) {
> +                stat64_add(&mig_stats.multifd_bytes, total_size);
> +            } else {
> +                /*
> +                 * Can't just always add total_size since IOVs do not include
> +                 * packet header in the zerocopy RAM case.
> +                 */
> +                stat64_add(&mig_stats.multifd_bytes,
> +                           p->next_packet_size + p->packet_len);

You could set total_size for both branches after send_prepare and use it
here unconditionally.

> +            }
>  
>              p->next_packet_size = 0;
>              multifd_send_data_clear(p->data);
>  
>              /*
>               * Making sure p->data is published before saying "we're
> -             * free".  Pairs with the smp_mb_acquire() in
> +             * free".  Pairs with the qatomic_cmpxchg() in
>               * multifd_send().
>               */
>              qatomic_store_release(&p->pending_job, false);
> +            qatomic_store_release(&p->pending_job_preparing, false);
>          } else {
>              /*
>               * If not a normal job, must be a sync request.  Note that
> @@ -714,7 +738,7 @@ static void *multifd_send_thread(void *opaque)
>  
>              if (use_packets) {
>                  p->flags = MULTIFD_FLAG_SYNC;
> -                multifd_send_fill_packet(p);
> +                multifd_send_fill_packet_ram(p);
>                  ret = qio_channel_write_all(p->c, (void *)p->packet,
>                                              p->packet_len, &local_err);
>                  if (ret != 0) {
> @@ -910,6 +934,9 @@ bool multifd_send_setup(void)
>              p->packet_len = sizeof(MultiFDPacket_t)
>                            + sizeof(uint64_t) * page_count;
>              p->packet = g_malloc0(p->packet_len);
> +            p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state));
> +            p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC);
> +            p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION);
>          }
>          p->name = g_strdup_printf("mig/src/send_%d", i);
>          p->write_flags = 0;
> @@ -944,6 +971,8 @@ bool multifd_send_setup(void)
>          }
>      }
>  
> +    multifd_device_state_save_setup();
> +
>      return true;
>  
>  err:
> diff --git a/migration/multifd.h b/migration/multifd.h
> index a0853622153e..c15c83104c8b 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -120,10 +120,12 @@ typedef struct {
>  typedef enum {
>      MULTIFD_PAYLOAD_NONE,
>      MULTIFD_PAYLOAD_RAM,
> +    MULTIFD_PAYLOAD_DEVICE_STATE,
>  } MultiFDPayloadType;
>  
>  typedef union MultiFDPayload {
>      MultiFDPages_t ram;
> +    MultiFDDeviceState_t device_state;
>  } MultiFDPayload;
>  
>  struct MultiFDSendData {
> @@ -136,6 +138,11 @@ static inline bool multifd_payload_empty(MultiFDSendData *data)
>      return data->type == MULTIFD_PAYLOAD_NONE;
>  }
>  
> +static inline bool multifd_payload_device_state(MultiFDSendData *data)
> +{
> +    return data->type == MULTIFD_PAYLOAD_DEVICE_STATE;
> +}
> +
>  static inline void multifd_set_payload_type(MultiFDSendData *data,
>                                              MultiFDPayloadType type)
>  {
> @@ -182,13 +189,15 @@ typedef struct {
>       * cleared by the multifd sender threads.
>       */
>      bool pending_job;
> +    bool pending_job_preparing;
>      bool pending_sync;
>      MultiFDSendData *data;
>  
>      /* thread local variables. No locking required */
>  
> -    /* pointer to the packet */
> +    /* pointers to the possible packet types */
>      MultiFDPacket_t *packet;
> +    MultiFDPacketDeviceState_t *packet_device_state;
>      /* size of the next packet that contains pages */
>      uint32_t next_packet_size;
>      /* packets sent through this channel */
> @@ -276,18 +285,25 @@ typedef struct {
>  } MultiFDMethods;
>  
>  void multifd_register_ops(int method, MultiFDMethods *ops);
> -void multifd_send_fill_packet(MultiFDSendParams *p);
> +void multifd_send_fill_packet_ram(MultiFDSendParams *p);
>  bool multifd_send_prepare_common(MultiFDSendParams *p);
>  void multifd_send_zero_page_detect(MultiFDSendParams *p);
>  void multifd_recv_zero_page_process(MultiFDRecvParams *p);
>  
> -static inline void multifd_send_prepare_header(MultiFDSendParams *p)
> +static inline void multifd_send_prepare_header_ram(MultiFDSendParams *p)

This could instead go to multifd-nocomp.c and become multifd_ram_prepare_header.

>  {
>      p->iov[0].iov_len = p->packet_len;
>      p->iov[0].iov_base = p->packet;
>      p->iovs_num++;
>  }
>  
> +static inline void multifd_send_prepare_header_device_state(MultiFDSendParams *p)

Seem like this could also move to multifd-device-state.c and drop the
"send" part.

> +{
> +    p->iov[0].iov_len = sizeof(*p->packet_device_state);
> +    p->iov[0].iov_base = p->packet_device_state;
> +    p->iovs_num++;
> +}
> +
>  void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc);
>  bool multifd_send(MultiFDSendData **send_data);
>  MultiFDSendData *multifd_send_data_alloc(void);
> @@ -310,4 +326,11 @@ int multifd_ram_flush_and_sync(void);
>  size_t multifd_ram_payload_size(void);
>  void multifd_ram_fill_packet(MultiFDSendParams *p);
>  int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
> +
> +size_t multifd_device_state_payload_size(void);
> +void multifd_device_state_save_setup(void);
> +void multifd_device_state_clear(MultiFDDeviceState_t *device_state);
> +void multifd_device_state_save_cleanup(void);
> +void multifd_device_state_send_prepare(MultiFDSendParams *p);
> +
>  #endif
Maciej S. Szmigiero Aug. 29, 2024, 8:03 p.m. UTC | #2
On 29.08.2024 02:41, Fabiano Rosas wrote:
> "Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes:
> 
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> A new function multifd_queue_device_state() is provided for device to queue
>> its state for transmission via a multifd channel.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>>   include/migration/misc.h         |  4 ++
>>   migration/meson.build            |  1 +
>>   migration/multifd-device-state.c | 99 ++++++++++++++++++++++++++++++++
>>   migration/multifd-nocomp.c       |  6 +-
>>   migration/multifd-qpl.c          |  2 +-
>>   migration/multifd-uadk.c         |  2 +-
>>   migration/multifd-zlib.c         |  2 +-
>>   migration/multifd-zstd.c         |  2 +-
>>   migration/multifd.c              | 65 +++++++++++++++------
>>   migration/multifd.h              | 29 +++++++++-
>>   10 files changed, 184 insertions(+), 28 deletions(-)
>>   create mode 100644 migration/multifd-device-state.c
>>
>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>> index bfadc5613bac..7266b1b77d1f 100644
>> --- a/include/migration/misc.h
>> +++ b/include/migration/misc.h
>> @@ -111,4 +111,8 @@ bool migration_in_bg_snapshot(void);
>>   /* migration/block-dirty-bitmap.c */
>>   void dirty_bitmap_mig_init(void);
>>   
>> +/* migration/multifd-device-state.c */
>> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
>> +                                char *data, size_t len);
>> +
>>   #endif
>> diff --git a/migration/meson.build b/migration/meson.build
>> index 77f3abf08eb1..00853595894f 100644
>> --- a/migration/meson.build
>> +++ b/migration/meson.build
>> @@ -21,6 +21,7 @@ system_ss.add(files(
>>     'migration-hmp-cmds.c',
>>     'migration.c',
>>     'multifd.c',
>> +  'multifd-device-state.c',
>>     'multifd-nocomp.c',
>>     'multifd-zlib.c',
>>     'multifd-zero-page.c',
>> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
>> new file mode 100644
>> index 000000000000..c9b44f0b5ab9
>> --- /dev/null
>> +++ b/migration/multifd-device-state.c
>> @@ -0,0 +1,99 @@
>> +/*
>> + * Multifd device state migration
>> + *
>> + * Copyright (C) 2024 Oracle and/or its affiliates.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/lockable.h"
>> +#include "migration/misc.h"
>> +#include "multifd.h"
>> +
>> +static QemuMutex queue_job_mutex;
>> +
>> +static MultiFDSendData *device_state_send;
>> +
>> +size_t multifd_device_state_payload_size(void)
>> +{
>> +    return sizeof(MultiFDDeviceState_t);
>> +}
> 
> This will not be necessary because the payload size is the same as the
> data type. We only need it for the special case where the MultiFDPages_t
> is smaller than the total ram payload size.

I know - I just wanted to make the API consistent with the one RAM
handler provides since these multifd_send_data_alloc() calls are done
just once per migration - it isn't any kind of a hot path.

>> +
>> +void multifd_device_state_save_setup(void)
> 
> s/save/send/. The ram ones are only called "save" because they're called
> from ram_save_setup(), but we then have the proper nocomp_send_setup
> hook.

Ack.

>> +{
>> +    qemu_mutex_init(&queue_job_mutex);
>> +
>> +    device_state_send = multifd_send_data_alloc();
>> +}
>> +
>> +void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
>> +{
>> +    g_clear_pointer(&device_state->idstr, g_free);
>> +    g_clear_pointer(&device_state->buf, g_free);
>> +}
>> +
>> +void multifd_device_state_save_cleanup(void)
> 
> s/save/send/

Ack.

>> +{
>> +    g_clear_pointer(&device_state_send, multifd_send_data_free);
>> +
>> +    qemu_mutex_destroy(&queue_job_mutex);
>> +}
>> +
>> +static void multifd_device_state_fill_packet(MultiFDSendParams *p)
>> +{
>> +    MultiFDDeviceState_t *device_state = &p->data->u.device_state;
>> +    MultiFDPacketDeviceState_t *packet = p->packet_device_state;
>> +
>> +    packet->hdr.flags = cpu_to_be32(p->flags);
>> +    strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr));
>> +    packet->instance_id = cpu_to_be32(device_state->instance_id);
>> +    packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>> +}
>> +
>> +void multifd_device_state_send_prepare(MultiFDSendParams *p)
>> +{
>> +    MultiFDDeviceState_t *device_state = &p->data->u.device_state;
>> +
>> +    assert(multifd_payload_device_state(p->data));
>> +
>> +    multifd_send_prepare_header_device_state(p);
>> +
>> +    assert(!(p->flags & MULTIFD_FLAG_SYNC));
>> +
>> +    p->next_packet_size = device_state->buf_len;
>> +    if (p->next_packet_size > 0) {
>> +        p->iov[p->iovs_num].iov_base = device_state->buf;
>> +        p->iov[p->iovs_num].iov_len = p->next_packet_size;
>> +        p->iovs_num++;
>> +    }
>> +
>> +    p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE;
>> +
>> +    multifd_device_state_fill_packet(p);
>> +}
>> +
>> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
>> +                                char *data, size_t len)
>> +{
>> +    /* Device state submissions can come from multiple threads */
>> +    QEMU_LOCK_GUARD(&queue_job_mutex);
>> +    MultiFDDeviceState_t *device_state;
>> +
>> +    assert(multifd_payload_empty(device_state_send));
>> +
>> +    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
>> +    device_state = &device_state_send->u.device_state;
>> +    device_state->idstr = g_strdup(idstr);
>> +    device_state->instance_id = instance_id;
>> +    device_state->buf = g_memdup2(data, len);
>> +    device_state->buf_len = len;
>> +
>> +    if (!multifd_send(&device_state_send)) {
>> +        multifd_send_data_clear(device_state_send);
>> +        return false;
>> +    }
>> +
>> +    return true;
>> +}
>> diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
>> index 39eb77c9b3b7..0b7b543f44db 100644
>> --- a/migration/multifd-nocomp.c
>> +++ b/migration/multifd-nocomp.c
>> @@ -116,13 +116,13 @@ static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>>            * Only !zerocopy needs the header in IOV; zerocopy will
>>            * send it separately.
>>            */
>> -        multifd_send_prepare_header(p);
>> +        multifd_send_prepare_header_ram(p);
>>       }
>>   
>>       multifd_send_prepare_iovs(p);
>>       p->flags |= MULTIFD_FLAG_NOCOMP;
>>   
>> -    multifd_send_fill_packet(p);
>> +    multifd_send_fill_packet_ram(p);
>>   
>>       if (use_zero_copy_send) {
>>           /* Send header first, without zerocopy */
>> @@ -371,7 +371,7 @@ bool multifd_send_prepare_common(MultiFDSendParams *p)
>>           return false;
>>       }
>>   
>> -    multifd_send_prepare_header(p);
>> +    multifd_send_prepare_header_ram(p);
>>   
>>       return true;
>>   }
>> diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
>> index 75041a4c4dfe..bd6b5b6a3868 100644
>> --- a/migration/multifd-qpl.c
>> +++ b/migration/multifd-qpl.c
>> @@ -490,7 +490,7 @@ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp)
>>   
>>   out:
>>       p->flags |= MULTIFD_FLAG_QPL;
>> -    multifd_send_fill_packet(p);
>> +    multifd_send_fill_packet_ram(p);
>>       return 0;
>>   }
>>   
>> diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c
>> index db2549f59bfe..6e2d26010742 100644
>> --- a/migration/multifd-uadk.c
>> +++ b/migration/multifd-uadk.c
>> @@ -198,7 +198,7 @@ static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp)
>>       }
>>   out:
>>       p->flags |= MULTIFD_FLAG_UADK;
>> -    multifd_send_fill_packet(p);
>> +    multifd_send_fill_packet_ram(p);
>>       return 0;
>>   }
>>   
>> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
>> index 6787538762d2..62a1fe59ad3e 100644
>> --- a/migration/multifd-zlib.c
>> +++ b/migration/multifd-zlib.c
>> @@ -156,7 +156,7 @@ static int multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>>   
>>   out:
>>       p->flags |= MULTIFD_FLAG_ZLIB;
>> -    multifd_send_fill_packet(p);
>> +    multifd_send_fill_packet_ram(p);
>>       return 0;
>>   }
>>   
>> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
>> index 1576b1e2adc6..f98b07e7f9f5 100644
>> --- a/migration/multifd-zstd.c
>> +++ b/migration/multifd-zstd.c
>> @@ -143,7 +143,7 @@ static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>>   
>>   out:
>>       p->flags |= MULTIFD_FLAG_ZSTD;
>> -    multifd_send_fill_packet(p);
>> +    multifd_send_fill_packet_ram(p);
>>       return 0;
>>   }
>>   
>> diff --git a/migration/multifd.c b/migration/multifd.c
>> index a74e8a5cc891..bebe5b5a9b9c 100644
>> --- a/migration/multifd.c
>> +++ b/migration/multifd.c
>> @@ -12,6 +12,7 @@
>>   
>>   #include "qemu/osdep.h"
>>   #include "qemu/cutils.h"
>> +#include "qemu/iov.h"
>>   #include "qemu/rcu.h"
>>   #include "exec/target_page.h"
>>   #include "sysemu/sysemu.h"
>> @@ -19,6 +20,7 @@
>>   #include "qemu/error-report.h"
>>   #include "qapi/error.h"
>>   #include "file.h"
>> +#include "migration/misc.h"
>>   #include "migration.h"
>>   #include "migration-stats.h"
>>   #include "savevm.h"
>> @@ -107,7 +109,9 @@ MultiFDSendData *multifd_send_data_alloc(void)
>>        * added to the union in the future are larger than
>>        * (MultiFDPages_t + flex array).
>>        */
>> -    max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload));
>> +    max_payload_size = MAX(multifd_ram_payload_size(),
>> +                           multifd_device_state_payload_size());
> 
> This is not needed, the sizeof(MultiFDPayload) below already has the
> same effect.

Same as above, I think it's good for consistency, but I don't
mind removing it either (maybe by replacing it with a comment
describing that it isn't currently needed).

>> +    max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload));
>>   
>>       /*
>>        * Account for any holes the compiler might insert. We can't pack
>> @@ -126,6 +130,9 @@ void multifd_send_data_clear(MultiFDSendData *data)
>>       }
>>   
>>       switch (data->type) {
>> +    case MULTIFD_PAYLOAD_DEVICE_STATE:
>> +        multifd_device_state_clear(&data->u.device_state);
>> +        break;
>>       default:
>>           /* Nothing to do */
>>           break;
>> @@ -228,7 +235,7 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
>>       return msg.id;
>>   }
>>   
>> -void multifd_send_fill_packet(MultiFDSendParams *p)
>> +void multifd_send_fill_packet_ram(MultiFDSendParams *p)
> 
> Do we need this change if there's no counterpart for device_state? It
> might be less confusing to just leave this one as it is.

Not really, will drop this change in the next patch set version.

>>   {
>>       MultiFDPacket_t *packet = p->packet;
>>       uint64_t packet_num;
>> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data)
>>   
>>           p = &multifd_send_state->params[i];
>>           /*
>> -         * Lockless read to p->pending_job is safe, because only multifd
>> -         * sender thread can clear it.
>> +         * Lockless RMW on p->pending_job_preparing is safe, because only multifd
>> +         * sender thread can clear it after it had seen p->pending_job being set.
>> +         *
>> +         * Pairs with qatomic_store_release() in multifd_send_thread().
>>            */
>> -        if (qatomic_read(&p->pending_job) == false) {
>> +        if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) {
> 
> What's the motivation for this change? It would be better to have it in
> a separate patch with a proper justification.

The original RFC patch set used dedicated device state multifd channels.

Peter and other people wanted this functionality removed, however this caused
a performance (downtime) regression.

One of the things that seemed to help mitigate this regression was making
the multifd channel selection more fair via this change.

But I can split out it to a separate commit in the next patch set version and
then see what performance improvement it currently brings.

>>               break;
>>           }
>>       }
>>   
>> -    /*
>> -     * Make sure we read p->pending_job before all the rest.  Pairs with
>> -     * qatomic_store_release() in multifd_send_thread().
>> -     */
>> -    smp_mb_acquire();
>> -
>>       assert(multifd_payload_empty(p->data));
>>   
>>       /*
>> @@ -534,6 +537,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
>>       p->name = NULL;
>>       g_clear_pointer(&p->data, multifd_send_data_free);
>>       p->packet_len = 0;
>> +    g_clear_pointer(&p->packet_device_state, g_free);
>>       g_free(p->packet);
>>       p->packet = NULL;
>>       multifd_send_state->ops->send_cleanup(p, errp);
>> @@ -545,6 +549,7 @@ static void multifd_send_cleanup_state(void)
>>   {
>>       file_cleanup_outgoing_migration();
>>       socket_cleanup_outgoing_migration();
>> +    multifd_device_state_save_cleanup();
>>       qemu_sem_destroy(&multifd_send_state->channels_created);
>>       qemu_sem_destroy(&multifd_send_state->channels_ready);
>>       g_free(multifd_send_state->params);
>> @@ -670,19 +675,29 @@ static void *multifd_send_thread(void *opaque)
>>            * qatomic_store_release() in multifd_send().
>>            */
>>           if (qatomic_load_acquire(&p->pending_job)) {
>> +            bool is_device_state = multifd_payload_device_state(p->data);
>> +            size_t total_size;
>> +
>>               p->flags = 0;
>>               p->iovs_num = 0;
>>               assert(!multifd_payload_empty(p->data));
>>   
>> -            ret = multifd_send_state->ops->send_prepare(p, &local_err);
>> -            if (ret != 0) {
>> -                break;
>> +            if (is_device_state) {
>> +                multifd_device_state_send_prepare(p);
>> +            } else {
>> +                ret = multifd_send_state->ops->send_prepare(p, &local_err);
>> +                if (ret != 0) {
>> +                    break;
>> +                }
>>               }
>>   
>>               if (migrate_mapped_ram()) {
>> +                assert(!is_device_state);
>> +
>>                   ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
>>                                                 &p->data->u.ram, &local_err);
>>               } else {
>> +                total_size = iov_size(p->iov, p->iovs_num);
>>                   ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
>>                                                     NULL, 0, p->write_flags,
>>                                                     &local_err);
>> @@ -692,18 +707,27 @@ static void *multifd_send_thread(void *opaque)
>>                   break;
>>               }
>>   
>> -            stat64_add(&mig_stats.multifd_bytes,
>> -                       p->next_packet_size + p->packet_len);
>> +            if (is_device_state) {
>> +                stat64_add(&mig_stats.multifd_bytes, total_size);
>> +            } else {
>> +                /*
>> +                 * Can't just always add total_size since IOVs do not include
>> +                 * packet header in the zerocopy RAM case.
>> +                 */
>> +                stat64_add(&mig_stats.multifd_bytes,
>> +                           p->next_packet_size + p->packet_len);
> 
> You could set total_size for both branches after send_prepare and use it
> here unconditionally.

Ack.

>> +            }
>>   
>>               p->next_packet_size = 0;
>>               multifd_send_data_clear(p->data);
>>   
>>               /*
>>                * Making sure p->data is published before saying "we're
>> -             * free".  Pairs with the smp_mb_acquire() in
>> +             * free".  Pairs with the qatomic_cmpxchg() in
>>                * multifd_send().
>>                */
>>               qatomic_store_release(&p->pending_job, false);
>> +            qatomic_store_release(&p->pending_job_preparing, false);
>>           } else {
>>               /*
>>                * If not a normal job, must be a sync request.  Note that
>> @@ -714,7 +738,7 @@ static void *multifd_send_thread(void *opaque)
>>   
>>               if (use_packets) {
>>                   p->flags = MULTIFD_FLAG_SYNC;
>> -                multifd_send_fill_packet(p);
>> +                multifd_send_fill_packet_ram(p);
>>                   ret = qio_channel_write_all(p->c, (void *)p->packet,
>>                                               p->packet_len, &local_err);
>>                   if (ret != 0) {
>> @@ -910,6 +934,9 @@ bool multifd_send_setup(void)
>>               p->packet_len = sizeof(MultiFDPacket_t)
>>                             + sizeof(uint64_t) * page_count;
>>               p->packet = g_malloc0(p->packet_len);
>> +            p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state));
>> +            p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC);
>> +            p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION);
>>           }
>>           p->name = g_strdup_printf("mig/src/send_%d", i);
>>           p->write_flags = 0;
>> @@ -944,6 +971,8 @@ bool multifd_send_setup(void)
>>           }
>>       }
>>   
>> +    multifd_device_state_save_setup();
>> +
>>       return true;
>>   
>>   err:
>> diff --git a/migration/multifd.h b/migration/multifd.h
>> index a0853622153e..c15c83104c8b 100644
>> --- a/migration/multifd.h
>> +++ b/migration/multifd.h
>> @@ -120,10 +120,12 @@ typedef struct {
>>   typedef enum {
>>       MULTIFD_PAYLOAD_NONE,
>>       MULTIFD_PAYLOAD_RAM,
>> +    MULTIFD_PAYLOAD_DEVICE_STATE,
>>   } MultiFDPayloadType;
>>   
>>   typedef union MultiFDPayload {
>>       MultiFDPages_t ram;
>> +    MultiFDDeviceState_t device_state;
>>   } MultiFDPayload;
>>   
>>   struct MultiFDSendData {
>> @@ -136,6 +138,11 @@ static inline bool multifd_payload_empty(MultiFDSendData *data)
>>       return data->type == MULTIFD_PAYLOAD_NONE;
>>   }
>>   
>> +static inline bool multifd_payload_device_state(MultiFDSendData *data)
>> +{
>> +    return data->type == MULTIFD_PAYLOAD_DEVICE_STATE;
>> +}
>> +
>>   static inline void multifd_set_payload_type(MultiFDSendData *data,
>>                                               MultiFDPayloadType type)
>>   {
>> @@ -182,13 +189,15 @@ typedef struct {
>>        * cleared by the multifd sender threads.
>>        */
>>       bool pending_job;
>> +    bool pending_job_preparing;
>>       bool pending_sync;
>>       MultiFDSendData *data;
>>   
>>       /* thread local variables. No locking required */
>>   
>> -    /* pointer to the packet */
>> +    /* pointers to the possible packet types */
>>       MultiFDPacket_t *packet;
>> +    MultiFDPacketDeviceState_t *packet_device_state;
>>       /* size of the next packet that contains pages */
>>       uint32_t next_packet_size;
>>       /* packets sent through this channel */
>> @@ -276,18 +285,25 @@ typedef struct {
>>   } MultiFDMethods;
>>   
>>   void multifd_register_ops(int method, MultiFDMethods *ops);
>> -void multifd_send_fill_packet(MultiFDSendParams *p);
>> +void multifd_send_fill_packet_ram(MultiFDSendParams *p);
>>   bool multifd_send_prepare_common(MultiFDSendParams *p);
>>   void multifd_send_zero_page_detect(MultiFDSendParams *p);
>>   void multifd_recv_zero_page_process(MultiFDRecvParams *p);
>>   
>> -static inline void multifd_send_prepare_header(MultiFDSendParams *p)
>> +static inline void multifd_send_prepare_header_ram(MultiFDSendParams *p)
> 
> This could instead go to multifd-nocomp.c and become multifd_ram_prepare_header.

Ack.

>>   {
>>       p->iov[0].iov_len = p->packet_len;
>>       p->iov[0].iov_base = p->packet;
>>       p->iovs_num++;
>>   }
>>   
>> +static inline void multifd_send_prepare_header_device_state(MultiFDSendParams *p)
> 
> Seem like this could also move to multifd-device-state.c and drop the
> "send" part.
> 

Ack.

Thanks,
Maciej
Fabiano Rosas Aug. 30, 2024, 1:02 p.m. UTC | #3
"Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes:

> On 29.08.2024 02:41, Fabiano Rosas wrote:
>> "Maciej S. Szmigiero" <mail@maciej.szmigiero.name> writes:
>> 
>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>
>>> A new function multifd_queue_device_state() is provided for device to queue
>>> its state for transmission via a multifd channel.
>>>
>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>> ---
>>>   include/migration/misc.h         |  4 ++
>>>   migration/meson.build            |  1 +
>>>   migration/multifd-device-state.c | 99 ++++++++++++++++++++++++++++++++
>>>   migration/multifd-nocomp.c       |  6 +-
>>>   migration/multifd-qpl.c          |  2 +-
>>>   migration/multifd-uadk.c         |  2 +-
>>>   migration/multifd-zlib.c         |  2 +-
>>>   migration/multifd-zstd.c         |  2 +-
>>>   migration/multifd.c              | 65 +++++++++++++++------
>>>   migration/multifd.h              | 29 +++++++++-
>>>   10 files changed, 184 insertions(+), 28 deletions(-)
>>>   create mode 100644 migration/multifd-device-state.c
>>>
>>> diff --git a/include/migration/misc.h b/include/migration/misc.h
>>> index bfadc5613bac..7266b1b77d1f 100644
>>> --- a/include/migration/misc.h
>>> +++ b/include/migration/misc.h
>>> @@ -111,4 +111,8 @@ bool migration_in_bg_snapshot(void);
>>>   /* migration/block-dirty-bitmap.c */
>>>   void dirty_bitmap_mig_init(void);
>>>   
>>> +/* migration/multifd-device-state.c */
>>> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
>>> +                                char *data, size_t len);
>>> +
>>>   #endif
>>> diff --git a/migration/meson.build b/migration/meson.build
>>> index 77f3abf08eb1..00853595894f 100644
>>> --- a/migration/meson.build
>>> +++ b/migration/meson.build
>>> @@ -21,6 +21,7 @@ system_ss.add(files(
>>>     'migration-hmp-cmds.c',
>>>     'migration.c',
>>>     'multifd.c',
>>> +  'multifd-device-state.c',
>>>     'multifd-nocomp.c',
>>>     'multifd-zlib.c',
>>>     'multifd-zero-page.c',
>>> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
>>> new file mode 100644
>>> index 000000000000..c9b44f0b5ab9
>>> --- /dev/null
>>> +++ b/migration/multifd-device-state.c
>>> @@ -0,0 +1,99 @@
>>> +/*
>>> + * Multifd device state migration
>>> + *
>>> + * Copyright (C) 2024 Oracle and/or its affiliates.
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>>> + * See the COPYING file in the top-level directory.
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qemu/lockable.h"
>>> +#include "migration/misc.h"
>>> +#include "multifd.h"
>>> +
>>> +static QemuMutex queue_job_mutex;
>>> +
>>> +static MultiFDSendData *device_state_send;
>>> +
>>> +size_t multifd_device_state_payload_size(void)
>>> +{
>>> +    return sizeof(MultiFDDeviceState_t);
>>> +}
>> 
>> This will not be necessary because the payload size is the same as the
>> data type. We only need it for the special case where the MultiFDPages_t
>> is smaller than the total ram payload size.
>
> I know - I just wanted to make the API consistent with the one RAM
> handler provides since these multifd_send_data_alloc() calls are done
> just once per migration - it isn't any kind of a hot path.
>

I think the array at the end of MultiFDPages_t should be considered
enough of a hack that we might want to keep anything related to it
outside of the interface. Other clients shouldn't have to think about
that at all.

>>> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data)
>>>   
>>>           p = &multifd_send_state->params[i];
>>>           /*
>>> -         * Lockless read to p->pending_job is safe, because only multifd
>>> -         * sender thread can clear it.
>>> +         * Lockless RMW on p->pending_job_preparing is safe, because only multifd
>>> +         * sender thread can clear it after it had seen p->pending_job being set.
>>> +         *
>>> +         * Pairs with qatomic_store_release() in multifd_send_thread().
>>>            */
>>> -        if (qatomic_read(&p->pending_job) == false) {
>>> +        if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) {
>> 
>> What's the motivation for this change? It would be better to have it in
>> a separate patch with a proper justification.
>
> The original RFC patch set used dedicated device state multifd channels.
>
> Peter and other people wanted this functionality removed, however this caused
> a performance (downtime) regression.
>
> One of the things that seemed to help mitigate this regression was making
> the multifd channel selection more fair via this change.
>
> But I can split out it to a separate commit in the next patch set version and
> then see what performance improvement it currently brings.

Yes, better to have it separate if anything for documentation of the
rationale.
Peter Xu Sept. 9, 2024, 7:40 p.m. UTC | #4
On Fri, Aug 30, 2024 at 10:02:40AM -0300, Fabiano Rosas wrote:
> >>> @@ -397,20 +404,16 @@ bool multifd_send(MultiFDSendData **send_data)
> >>>   
> >>>           p = &multifd_send_state->params[i];
> >>>           /*
> >>> -         * Lockless read to p->pending_job is safe, because only multifd
> >>> -         * sender thread can clear it.
> >>> +         * Lockless RMW on p->pending_job_preparing is safe, because only multifd
> >>> +         * sender thread can clear it after it had seen p->pending_job being set.
> >>> +         *
> >>> +         * Pairs with qatomic_store_release() in multifd_send_thread().
> >>>            */
> >>> -        if (qatomic_read(&p->pending_job) == false) {
> >>> +        if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) {
> >> 
> >> What's the motivation for this change? It would be better to have it in
> >> a separate patch with a proper justification.
> >
> > The original RFC patch set used dedicated device state multifd channels.
> >
> > Peter and other people wanted this functionality removed, however this caused
> > a performance (downtime) regression.
> >
> > One of the things that seemed to help mitigate this regression was making
> > the multifd channel selection more fair via this change.
> >
> > But I can split out it to a separate commit in the next patch set version and
> > then see what performance improvement it currently brings.
> 
> Yes, better to have it separate if anything for documentation of the
> rationale.

And when drafting that patch, please add a comment explaining the field.
Currently it's missing:

    /*
     * The sender thread has work to do if either of below boolean is set.
     *
     * @pending_job:  a job is pending
     * @pending_sync: a sync request is pending
     *
     * For both of these fields, they're only set by the requesters, and
     * cleared by the multifd sender threads.
     */
    bool pending_job;
    bool pending_job_preparing;
    bool pending_sync;
Peter Xu Sept. 10, 2024, 4:06 p.m. UTC | #5
On Tue, Aug 27, 2024 at 07:54:31PM +0200, Maciej S. Szmigiero wrote:
> +bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> +                                char *data, size_t len)
> +{
> +    /* Device state submissions can come from multiple threads */
> +    QEMU_LOCK_GUARD(&queue_job_mutex);

Ah, just notice there's the mutex.

So please consider the reply in the other thread, IIUC we can make it for
multifd_send() to be a generic mutex to simplify the other patch too, then
drop here.

I assume the ram code should be fine taking one more mutex even without
vfio, if it only takes once for each ~128 pages to enqueue, and only take
in the main thread, then each update should be also in the hot path
(e.g. no cache bouncing).

> +    MultiFDDeviceState_t *device_state;
> +
> +    assert(multifd_payload_empty(device_state_send));
> +
> +    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
> +    device_state = &device_state_send->u.device_state;
> +    device_state->idstr = g_strdup(idstr);
> +    device_state->instance_id = instance_id;
> +    device_state->buf = g_memdup2(data, len);
> +    device_state->buf_len = len;
> +
> +    if (!multifd_send(&device_state_send)) {
> +        multifd_send_data_clear(device_state_send);
> +        return false;
> +    }
> +
> +    return true;
> +}
Peter Xu Sept. 10, 2024, 7:48 p.m. UTC | #6
On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
> > +size_t multifd_device_state_payload_size(void)
> > +{
> > +    return sizeof(MultiFDDeviceState_t);
> > +}
> 
> This will not be necessary because the payload size is the same as the
> data type. We only need it for the special case where the MultiFDPages_t
> is smaller than the total ram payload size.

Today I was thinking maybe we should really clean this up, as the current
multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
that feeling stronger.

I think we should change it now perhaps, otherwise we'll need to introduce
other helpers to e.g. reset the device buffers, and that's not only slow
but also not good looking, IMO.

So I went ahead with the idea in previous discussion, that I managed to
change the SendData union into struct; the memory consumption is not super
important yet, IMHO, but we should still stick with the object model where
multifd enqueue thread switch buffer with multifd, as it still sounds a
sane way to do.

Then when that patch is ready, I further tried to make VFIO reuse multifd
buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we
don't allocate it every time we enqueue.

I hope it'll also work for VFIO.  VFIO has a specialty on being able to
dump the config space so it's more complex (and I noticed Maciej's current
design requires the final chunk of VFIO config data be migrated in one
packet.. that is also part of the complexity there).  So I allowed that
part to allocate a buffer but only that.  IOW, I made some API (see below)
that can either reuse preallocated buffer, or use a separate one only for
the final bulk.

In short, could both of you have a look at what I came up with below?  I
did that in patches because I think it's too much to comment, so patches
may work better.  No concern if any of below could be good changes to you,
then either Maciej can squash whatever into existing patches (and I feel
like some existing patches in this series can go away with below design),
or I can post pre-requisite patch but only if any of you prefer that.

Anyway, let me know, the patches apply on top of this whole series applied
first.

I also wonder whether there can be any perf difference already (I tested
all multifd qtest with below, but no VFIO I can run), perhaps not that
much, but just to mention below should avoid both buffer allocations and
one round of copy (so VFIO read() directly writes to the multifd buffers
now).

Thanks,

==========8<==========
From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 10 Sep 2024 12:10:59 -0400
Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct

The newly introduced device state buffer can be used for either storing
VFIO's read() raw data, but already also possible to store generic device
states.  After noticing that device states may not easily provide a max
buffer size (also the fact that RAM MultiFDPages_t after all also want to
have flexibility on managing offset[] array), it may not be a good idea to
stick with union on MultiFDSendData.. as it won't play well with such
flexibility.

Switch MultiFDSendData to a struct.

It won't consume a lot more space in reality, after all the real buffers
were already dynamically allocated, so it's so far only about the two
structs (pages, device_state) that will be duplicated, but they're small.

With this, we can remove the pretty hard to understand alloc size logic.
Because now we can allocate offset[] together with the SendData, and
properly free it when the SendData is freed.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/multifd.h              | 16 +++++++++++-----
 migration/multifd-device-state.c |  8 ++++++--
 migration/multifd-nocomp.c       | 13 ++++++-------
 migration/multifd.c              | 25 ++++++-------------------
 4 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/migration/multifd.h b/migration/multifd.h
index c15c83104c..47203334b9 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -98,9 +98,13 @@ typedef struct {
     uint32_t num;
     /* number of normal pages */
     uint32_t normal_num;
+    /*
+     * Pointer to the ramblock.  NOTE: it's caller's responsibility to make
+     * sure the pointer is always valid!
+     */
     RAMBlock *block;
-    /* offset of each page */
-    ram_addr_t offset[];
+    /* offset array of each page, managed by multifd */
+    ram_addr_t *offset;
 } MultiFDPages_t;
 
 struct MultiFDRecvData {
@@ -123,7 +127,7 @@ typedef enum {
     MULTIFD_PAYLOAD_DEVICE_STATE,
 } MultiFDPayloadType;
 
-typedef union MultiFDPayload {
+typedef struct MultiFDPayload {
     MultiFDPages_t ram;
     MultiFDDeviceState_t device_state;
 } MultiFDPayload;
@@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void)
 void multifd_ram_save_setup(void);
 void multifd_ram_save_cleanup(void);
 int multifd_ram_flush_and_sync(void);
-size_t multifd_ram_payload_size(void);
+void multifd_ram_payload_alloc(MultiFDPages_t *pages);
+void multifd_ram_payload_free(MultiFDPages_t *pages);
 void multifd_ram_fill_packet(MultiFDSendParams *p);
 int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
 
-size_t multifd_device_state_payload_size(void);
+void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state);
+void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state);
 void multifd_device_state_save_setup(void);
 void multifd_device_state_clear(MultiFDDeviceState_t *device_state);
 void multifd_device_state_save_cleanup(void);
diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
index 9b364e8ef3..72b72b6e62 100644
--- a/migration/multifd-device-state.c
+++ b/migration/multifd-device-state.c
@@ -22,9 +22,13 @@ bool send_threads_abort;
 
 static MultiFDSendData *device_state_send;
 
-size_t multifd_device_state_payload_size(void)
+/* TODO: use static buffers for idstr and buf */
+void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state)
+{
+}
+
+void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state)
 {
-    return sizeof(MultiFDDeviceState_t);
 }
 
 void multifd_device_state_save_setup(void)
diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
index 0b7b543f44..c1b95fee0d 100644
--- a/migration/multifd-nocomp.c
+++ b/migration/multifd-nocomp.c
@@ -22,15 +22,14 @@
 
 static MultiFDSendData *multifd_ram_send;
 
-size_t multifd_ram_payload_size(void)
+void multifd_ram_payload_alloc(MultiFDPages_t *pages)
 {
-    uint32_t n = multifd_ram_page_count();
+    pages->offset = g_new0(ram_addr_t, multifd_ram_page_count());
+}
 
-    /*
-     * We keep an array of page offsets at the end of MultiFDPages_t,
-     * add space for it in the allocation.
-     */
-    return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t);
+void multifd_ram_payload_free(MultiFDPages_t *pages)
+{
+    g_clear_pointer(&pages->offset, g_free);
 }
 
 void multifd_ram_save_setup(void)
diff --git a/migration/multifd.c b/migration/multifd.c
index bebe5b5a9b..5a20b831cf 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -101,26 +101,12 @@ struct {
 
 MultiFDSendData *multifd_send_data_alloc(void)
 {
-    size_t max_payload_size, size_minus_payload;
+    MultiFDSendData *new = g_new0(MultiFDSendData, 1);
 
-    /*
-     * MultiFDPages_t has a flexible array at the end, account for it
-     * when allocating MultiFDSendData. Use max() in case other types
-     * added to the union in the future are larger than
-     * (MultiFDPages_t + flex array).
-     */
-    max_payload_size = MAX(multifd_ram_payload_size(),
-                           multifd_device_state_payload_size());
-    max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload));
-
-    /*
-     * Account for any holes the compiler might insert. We can't pack
-     * the structure because that misaligns the members and triggers
-     * Waddress-of-packed-member.
-     */
-    size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload);
+    multifd_ram_payload_alloc(&new->u.ram);
+    multifd_device_state_payload_alloc(&new->u.device_state);
 
-    return g_malloc0(size_minus_payload + max_payload_size);
+    return new;
 }
 
 void multifd_send_data_clear(MultiFDSendData *data)
@@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data)
         return;
     }
 
-    multifd_send_data_clear(data);
+    multifd_ram_payload_free(&data->u.ram);
+    multifd_device_state_payload_free(&data->u.device_state);
 
     g_free(data);
 }
Fabiano Rosas Sept. 12, 2024, 6:43 p.m. UTC | #7
Peter Xu <peterx@redhat.com> writes:

Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you
understand the rework is a little frustrating.

> On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
>> > +size_t multifd_device_state_payload_size(void)
>> > +{
>> > +    return sizeof(MultiFDDeviceState_t);
>> > +}
>> 
>> This will not be necessary because the payload size is the same as the
>> data type. We only need it for the special case where the MultiFDPages_t
>> is smaller than the total ram payload size.
>
> Today I was thinking maybe we should really clean this up, as the current
> multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
> that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
> and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
> that feeling stronger.

If we're going to commit bad code and then rewrite it a week later, we
could have just let the original series from Maciej merge without any of
this. I already suggested it a couple of times, we shouldn't be doing
core refactorings underneath contributors' patches, this is too
fragile. Just let people contribute their code and we can change it
later.

This is also why I've been trying hard to separate core multifd
functionality from migration code that uses multifd to transmit their
data.

My original RFC plus the suggestion to extend multifd_ops for device
state would have (almost) made it so that no client code would be left
in multifd. We could have been turning this thing upside down and it
wouldn't affect anyone in terms of code conflicts.

The ship has already sailed, so your patches below are fine, I have just
some small comments.

>
> I think we should change it now perhaps, otherwise we'll need to introduce
> other helpers to e.g. reset the device buffers, and that's not only slow
> but also not good looking, IMO.

I agree that part is kind of a sore thumb.

>
> So I went ahead with the idea in previous discussion, that I managed to
> change the SendData union into struct; the memory consumption is not super
> important yet, IMHO, but we should still stick with the object model where
> multifd enqueue thread switch buffer with multifd, as it still sounds a
> sane way to do.
>
> Then when that patch is ready, I further tried to make VFIO reuse multifd
> buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we
> don't allocate it every time we enqueue.
>
> I hope it'll also work for VFIO.  VFIO has a specialty on being able to
> dump the config space so it's more complex (and I noticed Maciej's current
> design requires the final chunk of VFIO config data be migrated in one
> packet.. that is also part of the complexity there).  So I allowed that
> part to allocate a buffer but only that.  IOW, I made some API (see below)
> that can either reuse preallocated buffer, or use a separate one only for
> the final bulk.
>
> In short, could both of you have a look at what I came up with below?  I
> did that in patches because I think it's too much to comment, so patches
> may work better.  No concern if any of below could be good changes to you,
> then either Maciej can squash whatever into existing patches (and I feel
> like some existing patches in this series can go away with below design),
> or I can post pre-requisite patch but only if any of you prefer that.
>
> Anyway, let me know, the patches apply on top of this whole series applied
> first.
>
> I also wonder whether there can be any perf difference already (I tested
> all multifd qtest with below, but no VFIO I can run), perhaps not that
> much, but just to mention below should avoid both buffer allocations and
> one round of copy (so VFIO read() directly writes to the multifd buffers
> now).
>
> Thanks,
>
> ==========8<==========
> From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Tue, 10 Sep 2024 12:10:59 -0400
> Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct
>
> The newly introduced device state buffer can be used for either storing
> VFIO's read() raw data, but already also possible to store generic device
> states.  After noticing that device states may not easily provide a max
> buffer size (also the fact that RAM MultiFDPages_t after all also want to
> have flexibility on managing offset[] array), it may not be a good idea to
> stick with union on MultiFDSendData.. as it won't play well with such
> flexibility.
>
> Switch MultiFDSendData to a struct.
>
> It won't consume a lot more space in reality, after all the real buffers
> were already dynamically allocated, so it's so far only about the two
> structs (pages, device_state) that will be duplicated, but they're small.
>
> With this, we can remove the pretty hard to understand alloc size logic.
> Because now we can allocate offset[] together with the SendData, and
> properly free it when the SendData is freed.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/multifd.h              | 16 +++++++++++-----
>  migration/multifd-device-state.c |  8 ++++++--
>  migration/multifd-nocomp.c       | 13 ++++++-------
>  migration/multifd.c              | 25 ++++++-------------------
>  4 files changed, 29 insertions(+), 33 deletions(-)
>
> diff --git a/migration/multifd.h b/migration/multifd.h
> index c15c83104c..47203334b9 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -98,9 +98,13 @@ typedef struct {
>      uint32_t num;
>      /* number of normal pages */
>      uint32_t normal_num;
> +    /*
> +     * Pointer to the ramblock.  NOTE: it's caller's responsibility to make
> +     * sure the pointer is always valid!
> +     */

This could use some rewording, it's not clear what "caller" means here.

>      RAMBlock *block;
> -    /* offset of each page */
> -    ram_addr_t offset[];
> +    /* offset array of each page, managed by multifd */

I'd drop the part after the comma, it's not very accurate and also gives
an impression that something sophisticated is being done to this.

> +    ram_addr_t *offset;
>  } MultiFDPages_t;
>  
>  struct MultiFDRecvData {
> @@ -123,7 +127,7 @@ typedef enum {
>      MULTIFD_PAYLOAD_DEVICE_STATE,
>  } MultiFDPayloadType;
>  
> -typedef union MultiFDPayload {
> +typedef struct MultiFDPayload {
>      MultiFDPages_t ram;
>      MultiFDDeviceState_t device_state;
>  } MultiFDPayload;
> @@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void)
>  void multifd_ram_save_setup(void);
>  void multifd_ram_save_cleanup(void);
>  int multifd_ram_flush_and_sync(void);
> -size_t multifd_ram_payload_size(void);
> +void multifd_ram_payload_alloc(MultiFDPages_t *pages);
> +void multifd_ram_payload_free(MultiFDPages_t *pages);
>  void multifd_ram_fill_packet(MultiFDSendParams *p);
>  int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
>  
> -size_t multifd_device_state_payload_size(void);
> +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state);
> +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state);
>  void multifd_device_state_save_setup(void);
>  void multifd_device_state_clear(MultiFDDeviceState_t *device_state);
>  void multifd_device_state_save_cleanup(void);
> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
> index 9b364e8ef3..72b72b6e62 100644
> --- a/migration/multifd-device-state.c
> +++ b/migration/multifd-device-state.c
> @@ -22,9 +22,13 @@ bool send_threads_abort;
>  
>  static MultiFDSendData *device_state_send;
>  
> -size_t multifd_device_state_payload_size(void)
> +/* TODO: use static buffers for idstr and buf */
> +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state)
> +{
> +}
> +
> +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state)
>  {
> -    return sizeof(MultiFDDeviceState_t);
>  }
>  
>  void multifd_device_state_save_setup(void)
> diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> index 0b7b543f44..c1b95fee0d 100644
> --- a/migration/multifd-nocomp.c
> +++ b/migration/multifd-nocomp.c
> @@ -22,15 +22,14 @@
>  
>  static MultiFDSendData *multifd_ram_send;
>  
> -size_t multifd_ram_payload_size(void)
> +void multifd_ram_payload_alloc(MultiFDPages_t *pages)
>  {
> -    uint32_t n = multifd_ram_page_count();
> +    pages->offset = g_new0(ram_addr_t, multifd_ram_page_count());
> +}
>  
> -    /*
> -     * We keep an array of page offsets at the end of MultiFDPages_t,
> -     * add space for it in the allocation.
> -     */
> -    return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t);
> +void multifd_ram_payload_free(MultiFDPages_t *pages)
> +{
> +    g_clear_pointer(&pages->offset, g_free);
>  }
>  
>  void multifd_ram_save_setup(void)
> diff --git a/migration/multifd.c b/migration/multifd.c
> index bebe5b5a9b..5a20b831cf 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -101,26 +101,12 @@ struct {
>  
>  MultiFDSendData *multifd_send_data_alloc(void)
>  {
> -    size_t max_payload_size, size_minus_payload;
> +    MultiFDSendData *new = g_new0(MultiFDSendData, 1);
>  
> -    /*
> -     * MultiFDPages_t has a flexible array at the end, account for it
> -     * when allocating MultiFDSendData. Use max() in case other types
> -     * added to the union in the future are larger than
> -     * (MultiFDPages_t + flex array).
> -     */
> -    max_payload_size = MAX(multifd_ram_payload_size(),
> -                           multifd_device_state_payload_size());
> -    max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload));
> -
> -    /*
> -     * Account for any holes the compiler might insert. We can't pack
> -     * the structure because that misaligns the members and triggers
> -     * Waddress-of-packed-member.
> -     */
> -    size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload);
> +    multifd_ram_payload_alloc(&new->u.ram);
> +    multifd_device_state_payload_alloc(&new->u.device_state);
>  
> -    return g_malloc0(size_minus_payload + max_payload_size);
> +    return new;
>  }
>  
>  void multifd_send_data_clear(MultiFDSendData *data)
> @@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data)
>          return;
>      }
>  
> -    multifd_send_data_clear(data);
> +    multifd_ram_payload_free(&data->u.ram);
> +    multifd_device_state_payload_free(&data->u.device_state);

The "u" needs to be dropped now. Could just change to "p". Or can we now
move the whole struct inside MultiFDSendData?

>  
>      g_free(data);
>  }
> -- 
> 2.45.0
>
>
>
> From 6695d134c0818f42183f5ea03c21e6d56e7b57ea Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Tue, 10 Sep 2024 12:24:14 -0400
> Subject: [PATCH 2/3] migration/multifd: Optimize device_state->idstr updates
>
> The duplication / allocation of idstr for each VFIO blob is an overkill, as
> idstr isn't something that changes frequently.  Also, the idstr always came
> from the upper layer of se->idstr so it's always guaranteed to
> exist (e.g. no device unplug allowed during migration).
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/multifd.h              |  4 ++++
>  migration/multifd-device-state.c | 10 +++++++---
>  2 files changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/migration/multifd.h b/migration/multifd.h
> index 47203334b9..1eaa5d4496 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -115,6 +115,10 @@ struct MultiFDRecvData {
>  };
>  
>  typedef struct {
> +    /*
> +     * Name of the owner device.  NOTE: it's caller's responsibility to
> +     * make sure the pointer is always valid!
> +     */

Same comment as the other one here.

>      char *idstr;
>      uint32_t instance_id;
>      char *buf;
> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
> index 72b72b6e62..cfd0465bac 100644
> --- a/migration/multifd-device-state.c
> +++ b/migration/multifd-device-state.c
> @@ -44,7 +44,7 @@ void multifd_device_state_save_setup(void)
>  
>  void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
>  {
> -    g_clear_pointer(&device_state->idstr, g_free);
> +    device_state->idstr = NULL;
>      g_clear_pointer(&device_state->buf, g_free);
>  }
>  
> @@ -100,7 +100,12 @@ bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
>  
>      multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
>      device_state = &device_state_send->u.device_state;
> -    device_state->idstr = g_strdup(idstr);
> +    /*
> +     * NOTE: here we must use a static idstr (e.g. of a savevm state
> +     * entry) rather than any dynamically allocated buffer, because multifd
> +     * assumes this pointer is always valid!
> +     */
> +    device_state->idstr = idstr;
>      device_state->instance_id = instance_id;
>      device_state->buf = g_memdup2(data, len);
>      device_state->buf_len = len;
> @@ -137,7 +142,6 @@ static void multifd_device_state_save_thread_data_free(void *opaque)
>  {
>      struct MultiFDDSSaveThreadData *data = opaque;
>  
> -    g_clear_pointer(&data->idstr, g_free);
>      g_free(data);
>  }
>  
> -- 
> 2.45.0
>
>
> From abfea9698ff46ad0e0175e1dcc6e005e0b2ece2a Mon Sep 17 00:00:00 2001
> From: Peter Xu <peterx@redhat.com>
> Date: Tue, 10 Sep 2024 12:27:49 -0400
> Subject: [PATCH 3/3] migration/multifd: Optimize device_state buffer
>  allocations
>
> Provide a device_state->buf_prealloc so that the buffers can be reused if
> possible.  Provide a set of APIs to use it right.  Please see the
> documentation for the API in the code.
>
> The default buffer size came from VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE as of
> now.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/hw/vfio/vfio-common.h    |  9 ++++
>  include/migration/misc.h         | 12 ++++-
>  migration/multifd.h              | 11 +++-
>  hw/vfio/migration.c              | 43 ++++++++-------
>  migration/multifd-device-state.c | 93 +++++++++++++++++++++++++-------
>  migration/multifd.c              |  9 ----
>  6 files changed, 126 insertions(+), 51 deletions(-)
>
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 4578a0ca6a..c1f2f4ae55 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -61,6 +61,13 @@ typedef struct VFIORegion {
>      uint8_t nr; /* cache the region number for debug */
>  } VFIORegion;
>  
> +typedef struct VFIODeviceStatePacket {
> +    uint32_t version;
> +    uint32_t idx;
> +    uint32_t flags;
> +    uint8_t data[0];
> +} QEMU_PACKED VFIODeviceStatePacket;
> +
>  typedef struct VFIOMigration {
>      struct VFIODevice *vbasedev;
>      VMChangeStateEntry *vm_state;
> @@ -168,6 +175,8 @@ typedef struct VFIODevice {
>      int devid;
>      IOMMUFDBackend *iommufd;
>      VFIOIOASHwpt *hwpt;
> +    /* Only used on sender side when multifd is enabled */
> +    VFIODeviceStatePacket *multifd_packet;
>      QLIST_ENTRY(VFIODevice) hwpt_next;
>  } VFIODevice;
>  
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index 26f7f3140f..1a8676ed3d 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -112,8 +112,16 @@ bool migration_in_bg_snapshot(void);
>  void dirty_bitmap_mig_init(void);
>  
>  /* migration/multifd-device-state.c */
> -bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> -                                char *data, size_t len);
> +struct MultiFDDeviceState_t;
> +typedef struct MultiFDDeviceState_t MultiFDDeviceState_t;
> +
> +MultiFDDeviceState_t *
> +multifd_device_state_prepare(char *idstr, uint32_t instance_id);
> +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s,
> +                                      int64_t *buf_len);
> +bool multifd_device_state_finish(MultiFDDeviceState_t *state,
> +                                 void *buf, int64_t buf_len);
> +
>  bool migration_has_device_state_support(void);
>  
>  void
> diff --git a/migration/multifd.h b/migration/multifd.h
> index 1eaa5d4496..1ccdeeb8c5 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -15,6 +15,7 @@
>  
>  #include "exec/target_page.h"
>  #include "ram.h"
> +#include "migration/misc.h"
>  
>  typedef struct MultiFDRecvData MultiFDRecvData;
>  typedef struct MultiFDSendData MultiFDSendData;
> @@ -114,16 +115,22 @@ struct MultiFDRecvData {
>      off_t file_offset;
>  };
>  
> -typedef struct {
> +struct MultiFDDeviceState_t {
>      /*
>       * Name of the owner device.  NOTE: it's caller's responsibility to
>       * make sure the pointer is always valid!
>       */
>      char *idstr;
>      uint32_t instance_id;
> +    /*
> +     * Points to the buffer to send via multifd.  Normally it's the same as
> +     * buf_prealloc, otherwise the caller needs to make sure the buffer is
> +     * avaliable through multifd running.

"throughout multifd runtime" maybe.

> +     */
>      char *buf;
> +    char *buf_prealloc;
>      size_t buf_len;
> -} MultiFDDeviceState_t;
> +};
>  
>  typedef enum {
>      MULTIFD_PAYLOAD_NONE,
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 67996aa2df..e36422b7c5 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -59,13 +59,6 @@
>  
>  #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
>  
> -typedef struct VFIODeviceStatePacket {
> -    uint32_t version;
> -    uint32_t idx;
> -    uint32_t flags;
> -    uint8_t data[0];
> -} QEMU_PACKED VFIODeviceStatePacket;
> -
>  static int64_t bytes_transferred;
>  
>  static const char *mig_state_to_str(enum vfio_device_mig_state state)
> @@ -741,6 +734,9 @@ static void vfio_save_cleanup(void *opaque)
>      migration->initial_data_sent = false;
>      vfio_migration_cleanup(vbasedev);
>      trace_vfio_save_cleanup(vbasedev->name);
> +    if (vbasedev->multifd_packet) {
> +        g_clear_pointer(&vbasedev->multifd_packet, g_free);
> +    }
>  }
>  
>  static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
> @@ -892,7 +888,8 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas
>      g_autoptr(QIOChannelBuffer) bioc = NULL;
>      QEMUFile *f = NULL;
>      int ret;
> -    g_autofree VFIODeviceStatePacket *packet = NULL;
> +    VFIODeviceStatePacket *packet;
> +    MultiFDDeviceState_t *state;
>      size_t packet_len;
>  
>      bioc = qio_channel_buffer_new(0);
> @@ -911,13 +908,19 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas
>      }
>  
>      packet_len = sizeof(*packet) + bioc->usage;
> -    packet = g_malloc0(packet_len);
> +
> +    state = multifd_device_state_prepare(idstr, instance_id);
> +    /*
> +     * Do not reuse multifd buffer, but use our own due to random size.
> +     * The buffer will be freed only when save cleanup.
> +     */
> +    vbasedev->multifd_packet = g_malloc0(packet_len);
> +    packet = vbasedev->multifd_packet;
>      packet->idx = idx;
>      packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
>      memcpy(&packet->data, bioc->data, bioc->usage);
>  
> -    if (!multifd_queue_device_state(idstr, instance_id,
> -                                    (char *)packet, packet_len)) {
> +    if (!multifd_device_state_finish(state, packet, packet_len)) {
>          ret = -1;
>      }
>  
> @@ -936,7 +939,6 @@ static int vfio_save_complete_precopy_thread(char *idstr,
>      VFIODevice *vbasedev = opaque;
>      VFIOMigration *migration = vbasedev->migration;
>      int ret;
> -    g_autofree VFIODeviceStatePacket *packet = NULL;
>      uint32_t idx;
>  
>      if (!migration->multifd_transfer) {
> @@ -954,21 +956,25 @@ static int vfio_save_complete_precopy_thread(char *idstr,
>          goto ret_finish;
>      }
>  
> -    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
> -
>      for (idx = 0; ; idx++) {
> +        VFIODeviceStatePacket *packet;
> +        MultiFDDeviceState_t *state;
>          ssize_t data_size;
>          size_t packet_size;
> +        int64_t buf_size;
>  
>          if (qatomic_read(abort_flag)) {
>              ret = -ECANCELED;
>              goto ret_finish;
>          }
>  
> +        state = multifd_device_state_prepare(idstr, instance_id);
> +        packet = multifd_device_state_get_buffer(state, &buf_size);
>          data_size = read(migration->data_fd, &packet->data,
> -                         migration->data_buffer_size);
> +                         buf_size - sizeof(*packet));
>          if (data_size < 0) {
>              if (errno != ENOMSG) {
> +                multifd_device_state_finish(state, NULL, 0);
>                  ret = -errno;
>                  goto ret_finish;
>              }
> @@ -980,14 +986,15 @@ static int vfio_save_complete_precopy_thread(char *idstr,
>              data_size = 0;
>          }
>  
> -        if (data_size == 0)
> +        if (data_size == 0) {
> +            multifd_device_state_finish(state, NULL, 0);
>              break;
> +        }
>  
>          packet->idx = idx;
>          packet_size = sizeof(*packet) + data_size;
>  
> -        if (!multifd_queue_device_state(idstr, instance_id,
> -                                        (char *)packet, packet_size)) {
> +        if (!multifd_device_state_finish(state, packet, packet_size)) {
>              ret = -1;
>              goto ret_finish;
>          }
> diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
> index cfd0465bac..6f0259426d 100644
> --- a/migration/multifd-device-state.c
> +++ b/migration/multifd-device-state.c
> @@ -20,15 +20,18 @@ ThreadPool *send_threads;
>  int send_threads_ret;
>  bool send_threads_abort;
>  
> +#define  MULTIFD_DEVICE_STATE_BUFLEN  (1UL << 20)
> +
>  static MultiFDSendData *device_state_send;
>  
> -/* TODO: use static buffers for idstr and buf */
>  void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state)
>  {
> +    device_state->buf_prealloc = g_malloc0(MULTIFD_DEVICE_STATE_BUFLEN);
>  }
>  
>  void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state)
>  {
> +    g_clear_pointer(&device_state->buf_prealloc, g_free);
>  }
>  
>  void multifd_device_state_save_setup(void)
> @@ -42,12 +45,6 @@ void multifd_device_state_save_setup(void)
>      send_threads_abort = false;
>  }
>  
> -void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
> -{
> -    device_state->idstr = NULL;
> -    g_clear_pointer(&device_state->buf, g_free);
> -}
> -
>  void multifd_device_state_save_cleanup(void)
>  {
>      g_clear_pointer(&send_threads, thread_pool_free);
> @@ -89,33 +86,89 @@ void multifd_device_state_send_prepare(MultiFDSendParams *p)
>      multifd_device_state_fill_packet(p);
>  }
>  
> -bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> -                                char *data, size_t len)
> +/*
> + * Prepare to send some device state via multifd.  Returns the current idle
> + * MultiFDDeviceState_t*.
> + *
> + * As a follow up, the caller must call multifd_device_state_finish() to
> + * release the resources.
> + *
> + * One example usage of the API:
> + *
> + *   // Fetch a free multifd device state object
> + *   state = multifd_device_state_prepare(idstr, instance_id);
> + *
> + *   // Optional: fetch the buffer to reuse
> + *   buf = multifd_device_state_get_buffer(state, &buf_size);
> + *
> + *   // Here len>0 means success, otherwise failure
> + *   len = buffer_fill(buf, buf_size);
> + *
> + *   // Finish the transaction, either enqueue or cancel the request.  Here
> + *   // len>0 will enqueue, <=0 will cancel.
> + *   multifd_device_state_finish(state, buf, len);
> + */
> +MultiFDDeviceState_t *
> +multifd_device_state_prepare(char *idstr, uint32_t instance_id)
>  {
> -    /* Device state submissions can come from multiple threads */
> -    QEMU_LOCK_GUARD(&queue_job_mutex);
>      MultiFDDeviceState_t *device_state;
>  
>      assert(multifd_payload_empty(device_state_send));
>  
> -    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
> +    /*
> +     * TODO: The lock name may need change, but I'm reusing just for
> +     * simplicity.
> +     */
> +    qemu_mutex_lock(&queue_job_mutex);
> +
>      device_state = &device_state_send->u.device_state;
>      /*
> -     * NOTE: here we must use a static idstr (e.g. of a savevm state
> -     * entry) rather than any dynamically allocated buffer, because multifd
> +     * NOTE: here we must use a static idstr (e.g. of a savevm state entry)
> +     * rather than any dynamically allocated buffer, because multifd
>       * assumes this pointer is always valid!
>       */
>      device_state->idstr = idstr;
>      device_state->instance_id = instance_id;
> -    device_state->buf = g_memdup2(data, len);
> -    device_state->buf_len = len;
>  
> -    if (!multifd_send(&device_state_send)) {
> -        multifd_send_data_clear(device_state_send);
> -        return false;
> +    return &device_state_send->u.device_state;
> +}
> +
> +/*
> + * Need to be used after a previous call to multifd_device_state_prepare(),
> + * the buffer must not be used after invoke multifd_device_state_finish().
> + */
> +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s,
> +                                      int64_t *buf_len)
> +{
> +    *buf_len = MULTIFD_DEVICE_STATE_BUFLEN;
> +    return s->buf_prealloc;
> +}
> +
> +/*
> + * Need to be used only in pair with a previous call to
> + * multifd_device_state_prepare().  Returns true if enqueue successful,
> + * false otherwise.
> + */
> +bool multifd_device_state_finish(MultiFDDeviceState_t *state,
> +                                 void *buf, int64_t buf_len)
> +{
> +    bool result = false;
> +
> +    /* Currently we only have one global free buffer */
> +    assert(state == &device_state_send->u.device_state);
> +
> +    if (buf_len < 0) {
> +        goto out;
>      }
>  
> -    return true;
> +    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
> +    /* This normally will be the state->buf_prealloc, but not required */
> +    state->buf = buf;
> +    state->buf_len = buf_len;
> +    result = multifd_send(&device_state_send);
> +out:
> +    qemu_mutex_unlock(&queue_job_mutex);
> +    return result;
>  }
>  
>  bool migration_has_device_state_support(void)
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 5a20b831cf..2b5185e298 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -115,15 +115,6 @@ void multifd_send_data_clear(MultiFDSendData *data)
>          return;
>      }
>  
> -    switch (data->type) {
> -    case MULTIFD_PAYLOAD_DEVICE_STATE:
> -        multifd_device_state_clear(&data->u.device_state);
> -        break;
> -    default:
> -        /* Nothing to do */
> -        break;
> -    }
> -
>      data->type = MULTIFD_PAYLOAD_NONE;
>  }
>  
> -- 
> 2.45.0
Peter Xu Sept. 13, 2024, 12:23 a.m. UTC | #8
On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you
> understand the rework is a little frustrating.

That's OK.

[For some reason my email sync program decided to give up working for
 hours.  I got more time looking at a tsc bug, which is good, but then I
 miss a lot of emails..]

> 
> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
> >> > +size_t multifd_device_state_payload_size(void)
> >> > +{
> >> > +    return sizeof(MultiFDDeviceState_t);
> >> > +}
> >> 
> >> This will not be necessary because the payload size is the same as the
> >> data type. We only need it for the special case where the MultiFDPages_t
> >> is smaller than the total ram payload size.
> >
> > Today I was thinking maybe we should really clean this up, as the current
> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
> > that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
> > that feeling stronger.
> 
> If we're going to commit bad code and then rewrite it a week later, we
> could have just let the original series from Maciej merge without any of
> this.

Why it's "bad code"?

It runs pretty well, I don't think it's bad code.  You wrote it, and I
don't think it's bad at all.

But now we're rethinking after reading Maciej's new series.  Personally I
don't think it's a major problem.

Note that we're not changing the design back: what was initially proposed
was the client submitting an array of multifd objects.  I still don't think
that's right.

What the change goes so far is we make the union a struct, however that's
still N+2 objects not 2*N, where 2 came from RAM+VFIO.  I think the
important bits are still there (from your previous refactor series).

> I already suggested it a couple of times, we shouldn't be doing
> core refactorings underneath contributors' patches, this is too
> fragile. Just let people contribute their code and we can change it
> later.

I sincerely don't think a lot needs changing... only patch 1.  Basically
patch 1 on top of your previous rework series will be at least what I want,
but I'm open to comments from you guys.

Note that patch 2-3 will be on top of Maciej's changes and they're totally
not relevant to what we merged so far.  Hence, nothing relevant there to
what you worked.  And this is the diff of patch 1:

 migration/multifd.h              | 16 +++++++++++-----
 migration/multifd-device-state.c |  8 ++++++--
 migration/multifd-nocomp.c       | 13 ++++++-------
 migration/multifd.c              | 25 ++++++-------------------
 4 files changed, 29 insertions(+), 33 deletions(-)

It's only 33 lines removed (many of which are comments..), it's not a huge
lot.  I don't know why you feel so bad at this...

It's probably because we maintain migration together, or we can keep our
own way of work.  I don't think we did anything wrong yet so far.

We can definitely talk about this in next 1:1.

> 
> This is also why I've been trying hard to separate core multifd
> functionality from migration code that uses multifd to transmit their
> data.
> 
> My original RFC plus the suggestion to extend multifd_ops for device
> state would have (almost) made it so that no client code would be left
> in multifd. We could have been turning this thing upside down and it
> wouldn't affect anyone in terms of code conflicts.

Do you mean you preferred the 2*N approach?

> 
> The ship has already sailed, so your patches below are fine, I have just
> some small comments.

I'm not sure what you meant about "ship sailed", but we should merge code
whenever we think is the most correct.  I hope you meant after below all
things look the best, or please shoot.  That's exactly what I'm requesting
for as comments.

> 
> >
> > I think we should change it now perhaps, otherwise we'll need to introduce
> > other helpers to e.g. reset the device buffers, and that's not only slow
> > but also not good looking, IMO.
> 
> I agree that part is kind of a sore thumb.
> 
> >
> > So I went ahead with the idea in previous discussion, that I managed to
> > change the SendData union into struct; the memory consumption is not super
> > important yet, IMHO, but we should still stick with the object model where
> > multifd enqueue thread switch buffer with multifd, as it still sounds a
> > sane way to do.
> >
> > Then when that patch is ready, I further tried to make VFIO reuse multifd
> > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we
> > don't allocate it every time we enqueue.
> >
> > I hope it'll also work for VFIO.  VFIO has a specialty on being able to
> > dump the config space so it's more complex (and I noticed Maciej's current
> > design requires the final chunk of VFIO config data be migrated in one
> > packet.. that is also part of the complexity there).  So I allowed that
> > part to allocate a buffer but only that.  IOW, I made some API (see below)
> > that can either reuse preallocated buffer, or use a separate one only for
> > the final bulk.
> >
> > In short, could both of you have a look at what I came up with below?  I
> > did that in patches because I think it's too much to comment, so patches
> > may work better.  No concern if any of below could be good changes to you,
> > then either Maciej can squash whatever into existing patches (and I feel
> > like some existing patches in this series can go away with below design),
> > or I can post pre-requisite patch but only if any of you prefer that.
> >
> > Anyway, let me know, the patches apply on top of this whole series applied
> > first.
> >
> > I also wonder whether there can be any perf difference already (I tested
> > all multifd qtest with below, but no VFIO I can run), perhaps not that
> > much, but just to mention below should avoid both buffer allocations and
> > one round of copy (so VFIO read() directly writes to the multifd buffers
> > now).
> >
> > Thanks,
> >
> > ==========8<==========
> > From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001
> > From: Peter Xu <peterx@redhat.com>
> > Date: Tue, 10 Sep 2024 12:10:59 -0400
> > Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct
> >
> > The newly introduced device state buffer can be used for either storing
> > VFIO's read() raw data, but already also possible to store generic device
> > states.  After noticing that device states may not easily provide a max
> > buffer size (also the fact that RAM MultiFDPages_t after all also want to
> > have flexibility on managing offset[] array), it may not be a good idea to
> > stick with union on MultiFDSendData.. as it won't play well with such
> > flexibility.
> >
> > Switch MultiFDSendData to a struct.
> >
> > It won't consume a lot more space in reality, after all the real buffers
> > were already dynamically allocated, so it's so far only about the two
> > structs (pages, device_state) that will be duplicated, but they're small.
> >
> > With this, we can remove the pretty hard to understand alloc size logic.
> > Because now we can allocate offset[] together with the SendData, and
> > properly free it when the SendData is freed.
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  migration/multifd.h              | 16 +++++++++++-----
> >  migration/multifd-device-state.c |  8 ++++++--
> >  migration/multifd-nocomp.c       | 13 ++++++-------
> >  migration/multifd.c              | 25 ++++++-------------------
> >  4 files changed, 29 insertions(+), 33 deletions(-)
> >
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index c15c83104c..47203334b9 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -98,9 +98,13 @@ typedef struct {
> >      uint32_t num;
> >      /* number of normal pages */
> >      uint32_t normal_num;
> > +    /*
> > +     * Pointer to the ramblock.  NOTE: it's caller's responsibility to make
> > +     * sure the pointer is always valid!
> > +     */
> 
> This could use some rewording, it's not clear what "caller" means here.
> 
> >      RAMBlock *block;
> > -    /* offset of each page */
> > -    ram_addr_t offset[];
> > +    /* offset array of each page, managed by multifd */
> 
> I'd drop the part after the comma, it's not very accurate and also gives
> an impression that something sophisticated is being done to this.
> 
> > +    ram_addr_t *offset;
> >  } MultiFDPages_t;
> >  
> >  struct MultiFDRecvData {
> > @@ -123,7 +127,7 @@ typedef enum {
> >      MULTIFD_PAYLOAD_DEVICE_STATE,
> >  } MultiFDPayloadType;
> >  
> > -typedef union MultiFDPayload {
> > +typedef struct MultiFDPayload {
> >      MultiFDPages_t ram;
> >      MultiFDDeviceState_t device_state;
> >  } MultiFDPayload;
> > @@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void)
> >  void multifd_ram_save_setup(void);
> >  void multifd_ram_save_cleanup(void);
> >  int multifd_ram_flush_and_sync(void);
> > -size_t multifd_ram_payload_size(void);
> > +void multifd_ram_payload_alloc(MultiFDPages_t *pages);
> > +void multifd_ram_payload_free(MultiFDPages_t *pages);
> >  void multifd_ram_fill_packet(MultiFDSendParams *p);
> >  int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
> >  
> > -size_t multifd_device_state_payload_size(void);
> > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state);
> > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state);
> >  void multifd_device_state_save_setup(void);
> >  void multifd_device_state_clear(MultiFDDeviceState_t *device_state);
> >  void multifd_device_state_save_cleanup(void);
> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
> > index 9b364e8ef3..72b72b6e62 100644
> > --- a/migration/multifd-device-state.c
> > +++ b/migration/multifd-device-state.c
> > @@ -22,9 +22,13 @@ bool send_threads_abort;
> >  
> >  static MultiFDSendData *device_state_send;
> >  
> > -size_t multifd_device_state_payload_size(void)
> > +/* TODO: use static buffers for idstr and buf */
> > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state)
> > +{
> > +}
> > +
> > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state)
> >  {
> > -    return sizeof(MultiFDDeviceState_t);
> >  }
> >  
> >  void multifd_device_state_save_setup(void)
> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
> > index 0b7b543f44..c1b95fee0d 100644
> > --- a/migration/multifd-nocomp.c
> > +++ b/migration/multifd-nocomp.c
> > @@ -22,15 +22,14 @@
> >  
> >  static MultiFDSendData *multifd_ram_send;
> >  
> > -size_t multifd_ram_payload_size(void)
> > +void multifd_ram_payload_alloc(MultiFDPages_t *pages)
> >  {
> > -    uint32_t n = multifd_ram_page_count();
> > +    pages->offset = g_new0(ram_addr_t, multifd_ram_page_count());
> > +}
> >  
> > -    /*
> > -     * We keep an array of page offsets at the end of MultiFDPages_t,
> > -     * add space for it in the allocation.
> > -     */
> > -    return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t);
> > +void multifd_ram_payload_free(MultiFDPages_t *pages)
> > +{
> > +    g_clear_pointer(&pages->offset, g_free);
> >  }
> >  
> >  void multifd_ram_save_setup(void)
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index bebe5b5a9b..5a20b831cf 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -101,26 +101,12 @@ struct {
> >  
> >  MultiFDSendData *multifd_send_data_alloc(void)
> >  {
> > -    size_t max_payload_size, size_minus_payload;
> > +    MultiFDSendData *new = g_new0(MultiFDSendData, 1);
> >  
> > -    /*
> > -     * MultiFDPages_t has a flexible array at the end, account for it
> > -     * when allocating MultiFDSendData. Use max() in case other types
> > -     * added to the union in the future are larger than
> > -     * (MultiFDPages_t + flex array).
> > -     */
> > -    max_payload_size = MAX(multifd_ram_payload_size(),
> > -                           multifd_device_state_payload_size());
> > -    max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload));
> > -
> > -    /*
> > -     * Account for any holes the compiler might insert. We can't pack
> > -     * the structure because that misaligns the members and triggers
> > -     * Waddress-of-packed-member.
> > -     */
> > -    size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload);
> > +    multifd_ram_payload_alloc(&new->u.ram);
> > +    multifd_device_state_payload_alloc(&new->u.device_state);
> >  
> > -    return g_malloc0(size_minus_payload + max_payload_size);
> > +    return new;
> >  }
> >  
> >  void multifd_send_data_clear(MultiFDSendData *data)
> > @@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data)
> >          return;
> >      }
> >  
> > -    multifd_send_data_clear(data);
> > +    multifd_ram_payload_free(&data->u.ram);
> > +    multifd_device_state_payload_free(&data->u.device_state);
> 
> The "u" needs to be dropped now. Could just change to "p". Or can we now
> move the whole struct inside MultiFDSendData?

Yep, all your comments look good to me.

A note here: I intentionally didn't touch "u." as that requires more
changes which doesn't help as me leaving "patch-styled comment".  As I
said, feel free to see the patches as comments not patches for merging yet.
I / Maciej can prepare patch but only if the idea in general can be
accepted.

For me as I mentioned patch 2-3 do not relevant much to current master
branch, afaiu, so if you guys like I can repost patch 1 with a formal one,
but only if Maciej thinks it's easier for him.

> 
> >  
> >      g_free(data);
> >  }
> > -- 
> > 2.45.0
> >
> >
> >
> > From 6695d134c0818f42183f5ea03c21e6d56e7b57ea Mon Sep 17 00:00:00 2001
> > From: Peter Xu <peterx@redhat.com>
> > Date: Tue, 10 Sep 2024 12:24:14 -0400
> > Subject: [PATCH 2/3] migration/multifd: Optimize device_state->idstr updates
> >
> > The duplication / allocation of idstr for each VFIO blob is an overkill, as
> > idstr isn't something that changes frequently.  Also, the idstr always came
> > from the upper layer of se->idstr so it's always guaranteed to
> > exist (e.g. no device unplug allowed during migration).
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  migration/multifd.h              |  4 ++++
> >  migration/multifd-device-state.c | 10 +++++++---
> >  2 files changed, 11 insertions(+), 3 deletions(-)
> >
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index 47203334b9..1eaa5d4496 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -115,6 +115,10 @@ struct MultiFDRecvData {
> >  };
> >  
> >  typedef struct {
> > +    /*
> > +     * Name of the owner device.  NOTE: it's caller's responsibility to
> > +     * make sure the pointer is always valid!
> > +     */
> 
> Same comment as the other one here.
> 
> >      char *idstr;
> >      uint32_t instance_id;
> >      char *buf;
> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
> > index 72b72b6e62..cfd0465bac 100644
> > --- a/migration/multifd-device-state.c
> > +++ b/migration/multifd-device-state.c
> > @@ -44,7 +44,7 @@ void multifd_device_state_save_setup(void)
> >  
> >  void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
> >  {
> > -    g_clear_pointer(&device_state->idstr, g_free);
> > +    device_state->idstr = NULL;
> >      g_clear_pointer(&device_state->buf, g_free);
> >  }
> >  
> > @@ -100,7 +100,12 @@ bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> >  
> >      multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
> >      device_state = &device_state_send->u.device_state;
> > -    device_state->idstr = g_strdup(idstr);
> > +    /*
> > +     * NOTE: here we must use a static idstr (e.g. of a savevm state
> > +     * entry) rather than any dynamically allocated buffer, because multifd
> > +     * assumes this pointer is always valid!
> > +     */
> > +    device_state->idstr = idstr;
> >      device_state->instance_id = instance_id;
> >      device_state->buf = g_memdup2(data, len);
> >      device_state->buf_len = len;
> > @@ -137,7 +142,6 @@ static void multifd_device_state_save_thread_data_free(void *opaque)
> >  {
> >      struct MultiFDDSSaveThreadData *data = opaque;
> >  
> > -    g_clear_pointer(&data->idstr, g_free);
> >      g_free(data);
> >  }
> >  
> > -- 
> > 2.45.0
> >
> >
> > From abfea9698ff46ad0e0175e1dcc6e005e0b2ece2a Mon Sep 17 00:00:00 2001
> > From: Peter Xu <peterx@redhat.com>
> > Date: Tue, 10 Sep 2024 12:27:49 -0400
> > Subject: [PATCH 3/3] migration/multifd: Optimize device_state buffer
> >  allocations
> >
> > Provide a device_state->buf_prealloc so that the buffers can be reused if
> > possible.  Provide a set of APIs to use it right.  Please see the
> > documentation for the API in the code.
> >
> > The default buffer size came from VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE as of
> > now.
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  include/hw/vfio/vfio-common.h    |  9 ++++
> >  include/migration/misc.h         | 12 ++++-
> >  migration/multifd.h              | 11 +++-
> >  hw/vfio/migration.c              | 43 ++++++++-------
> >  migration/multifd-device-state.c | 93 +++++++++++++++++++++++++-------
> >  migration/multifd.c              |  9 ----
> >  6 files changed, 126 insertions(+), 51 deletions(-)
> >
> > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> > index 4578a0ca6a..c1f2f4ae55 100644
> > --- a/include/hw/vfio/vfio-common.h
> > +++ b/include/hw/vfio/vfio-common.h
> > @@ -61,6 +61,13 @@ typedef struct VFIORegion {
> >      uint8_t nr; /* cache the region number for debug */
> >  } VFIORegion;
> >  
> > +typedef struct VFIODeviceStatePacket {
> > +    uint32_t version;
> > +    uint32_t idx;
> > +    uint32_t flags;
> > +    uint8_t data[0];
> > +} QEMU_PACKED VFIODeviceStatePacket;
> > +
> >  typedef struct VFIOMigration {
> >      struct VFIODevice *vbasedev;
> >      VMChangeStateEntry *vm_state;
> > @@ -168,6 +175,8 @@ typedef struct VFIODevice {
> >      int devid;
> >      IOMMUFDBackend *iommufd;
> >      VFIOIOASHwpt *hwpt;
> > +    /* Only used on sender side when multifd is enabled */
> > +    VFIODeviceStatePacket *multifd_packet;
> >      QLIST_ENTRY(VFIODevice) hwpt_next;
> >  } VFIODevice;
> >  
> > diff --git a/include/migration/misc.h b/include/migration/misc.h
> > index 26f7f3140f..1a8676ed3d 100644
> > --- a/include/migration/misc.h
> > +++ b/include/migration/misc.h
> > @@ -112,8 +112,16 @@ bool migration_in_bg_snapshot(void);
> >  void dirty_bitmap_mig_init(void);
> >  
> >  /* migration/multifd-device-state.c */
> > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> > -                                char *data, size_t len);
> > +struct MultiFDDeviceState_t;
> > +typedef struct MultiFDDeviceState_t MultiFDDeviceState_t;
> > +
> > +MultiFDDeviceState_t *
> > +multifd_device_state_prepare(char *idstr, uint32_t instance_id);
> > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s,
> > +                                      int64_t *buf_len);
> > +bool multifd_device_state_finish(MultiFDDeviceState_t *state,
> > +                                 void *buf, int64_t buf_len);
> > +
> >  bool migration_has_device_state_support(void);
> >  
> >  void
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index 1eaa5d4496..1ccdeeb8c5 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -15,6 +15,7 @@
> >  
> >  #include "exec/target_page.h"
> >  #include "ram.h"
> > +#include "migration/misc.h"
> >  
> >  typedef struct MultiFDRecvData MultiFDRecvData;
> >  typedef struct MultiFDSendData MultiFDSendData;
> > @@ -114,16 +115,22 @@ struct MultiFDRecvData {
> >      off_t file_offset;
> >  };
> >  
> > -typedef struct {
> > +struct MultiFDDeviceState_t {
> >      /*
> >       * Name of the owner device.  NOTE: it's caller's responsibility to
> >       * make sure the pointer is always valid!
> >       */
> >      char *idstr;
> >      uint32_t instance_id;
> > +    /*
> > +     * Points to the buffer to send via multifd.  Normally it's the same as
> > +     * buf_prealloc, otherwise the caller needs to make sure the buffer is
> > +     * avaliable through multifd running.
> 
> "throughout multifd runtime" maybe.
> 
> > +     */
> >      char *buf;
> > +    char *buf_prealloc;
> >      size_t buf_len;
> > -} MultiFDDeviceState_t;
> > +};
> >  
> >  typedef enum {
> >      MULTIFD_PAYLOAD_NONE,
> > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> > index 67996aa2df..e36422b7c5 100644
> > --- a/hw/vfio/migration.c
> > +++ b/hw/vfio/migration.c
> > @@ -59,13 +59,6 @@
> >  
> >  #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
> >  
> > -typedef struct VFIODeviceStatePacket {
> > -    uint32_t version;
> > -    uint32_t idx;
> > -    uint32_t flags;
> > -    uint8_t data[0];
> > -} QEMU_PACKED VFIODeviceStatePacket;
> > -
> >  static int64_t bytes_transferred;
> >  
> >  static const char *mig_state_to_str(enum vfio_device_mig_state state)
> > @@ -741,6 +734,9 @@ static void vfio_save_cleanup(void *opaque)
> >      migration->initial_data_sent = false;
> >      vfio_migration_cleanup(vbasedev);
> >      trace_vfio_save_cleanup(vbasedev->name);
> > +    if (vbasedev->multifd_packet) {
> > +        g_clear_pointer(&vbasedev->multifd_packet, g_free);
> > +    }
> >  }
> >  
> >  static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
> > @@ -892,7 +888,8 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas
> >      g_autoptr(QIOChannelBuffer) bioc = NULL;
> >      QEMUFile *f = NULL;
> >      int ret;
> > -    g_autofree VFIODeviceStatePacket *packet = NULL;
> > +    VFIODeviceStatePacket *packet;
> > +    MultiFDDeviceState_t *state;
> >      size_t packet_len;
> >  
> >      bioc = qio_channel_buffer_new(0);
> > @@ -911,13 +908,19 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas
> >      }
> >  
> >      packet_len = sizeof(*packet) + bioc->usage;
> > -    packet = g_malloc0(packet_len);
> > +
> > +    state = multifd_device_state_prepare(idstr, instance_id);
> > +    /*
> > +     * Do not reuse multifd buffer, but use our own due to random size.
> > +     * The buffer will be freed only when save cleanup.
> > +     */
> > +    vbasedev->multifd_packet = g_malloc0(packet_len);
> > +    packet = vbasedev->multifd_packet;
> >      packet->idx = idx;
> >      packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
> >      memcpy(&packet->data, bioc->data, bioc->usage);
> >  
> > -    if (!multifd_queue_device_state(idstr, instance_id,
> > -                                    (char *)packet, packet_len)) {
> > +    if (!multifd_device_state_finish(state, packet, packet_len)) {
> >          ret = -1;
> >      }
> >  
> > @@ -936,7 +939,6 @@ static int vfio_save_complete_precopy_thread(char *idstr,
> >      VFIODevice *vbasedev = opaque;
> >      VFIOMigration *migration = vbasedev->migration;
> >      int ret;
> > -    g_autofree VFIODeviceStatePacket *packet = NULL;
> >      uint32_t idx;
> >  
> >      if (!migration->multifd_transfer) {
> > @@ -954,21 +956,25 @@ static int vfio_save_complete_precopy_thread(char *idstr,
> >          goto ret_finish;
> >      }
> >  
> > -    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
> > -
> >      for (idx = 0; ; idx++) {
> > +        VFIODeviceStatePacket *packet;
> > +        MultiFDDeviceState_t *state;
> >          ssize_t data_size;
> >          size_t packet_size;
> > +        int64_t buf_size;
> >  
> >          if (qatomic_read(abort_flag)) {
> >              ret = -ECANCELED;
> >              goto ret_finish;
> >          }
> >  
> > +        state = multifd_device_state_prepare(idstr, instance_id);
> > +        packet = multifd_device_state_get_buffer(state, &buf_size);
> >          data_size = read(migration->data_fd, &packet->data,
> > -                         migration->data_buffer_size);
> > +                         buf_size - sizeof(*packet));
> >          if (data_size < 0) {
> >              if (errno != ENOMSG) {
> > +                multifd_device_state_finish(state, NULL, 0);
> >                  ret = -errno;
> >                  goto ret_finish;
> >              }
> > @@ -980,14 +986,15 @@ static int vfio_save_complete_precopy_thread(char *idstr,
> >              data_size = 0;
> >          }
> >  
> > -        if (data_size == 0)
> > +        if (data_size == 0) {
> > +            multifd_device_state_finish(state, NULL, 0);
> >              break;
> > +        }
> >  
> >          packet->idx = idx;
> >          packet_size = sizeof(*packet) + data_size;
> >  
> > -        if (!multifd_queue_device_state(idstr, instance_id,
> > -                                        (char *)packet, packet_size)) {
> > +        if (!multifd_device_state_finish(state, packet, packet_size)) {
> >              ret = -1;
> >              goto ret_finish;
> >          }
> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
> > index cfd0465bac..6f0259426d 100644
> > --- a/migration/multifd-device-state.c
> > +++ b/migration/multifd-device-state.c
> > @@ -20,15 +20,18 @@ ThreadPool *send_threads;
> >  int send_threads_ret;
> >  bool send_threads_abort;
> >  
> > +#define  MULTIFD_DEVICE_STATE_BUFLEN  (1UL << 20)
> > +
> >  static MultiFDSendData *device_state_send;
> >  
> > -/* TODO: use static buffers for idstr and buf */
> >  void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state)
> >  {
> > +    device_state->buf_prealloc = g_malloc0(MULTIFD_DEVICE_STATE_BUFLEN);
> >  }
> >  
> >  void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state)
> >  {
> > +    g_clear_pointer(&device_state->buf_prealloc, g_free);
> >  }
> >  
> >  void multifd_device_state_save_setup(void)
> > @@ -42,12 +45,6 @@ void multifd_device_state_save_setup(void)
> >      send_threads_abort = false;
> >  }
> >  
> > -void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
> > -{
> > -    device_state->idstr = NULL;
> > -    g_clear_pointer(&device_state->buf, g_free);
> > -}
> > -
> >  void multifd_device_state_save_cleanup(void)
> >  {
> >      g_clear_pointer(&send_threads, thread_pool_free);
> > @@ -89,33 +86,89 @@ void multifd_device_state_send_prepare(MultiFDSendParams *p)
> >      multifd_device_state_fill_packet(p);
> >  }
> >  
> > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
> > -                                char *data, size_t len)
> > +/*
> > + * Prepare to send some device state via multifd.  Returns the current idle
> > + * MultiFDDeviceState_t*.
> > + *
> > + * As a follow up, the caller must call multifd_device_state_finish() to
> > + * release the resources.
> > + *
> > + * One example usage of the API:
> > + *
> > + *   // Fetch a free multifd device state object
> > + *   state = multifd_device_state_prepare(idstr, instance_id);
> > + *
> > + *   // Optional: fetch the buffer to reuse
> > + *   buf = multifd_device_state_get_buffer(state, &buf_size);
> > + *
> > + *   // Here len>0 means success, otherwise failure
> > + *   len = buffer_fill(buf, buf_size);
> > + *
> > + *   // Finish the transaction, either enqueue or cancel the request.  Here
> > + *   // len>0 will enqueue, <=0 will cancel.
> > + *   multifd_device_state_finish(state, buf, len);
> > + */
> > +MultiFDDeviceState_t *
> > +multifd_device_state_prepare(char *idstr, uint32_t instance_id)
> >  {
> > -    /* Device state submissions can come from multiple threads */
> > -    QEMU_LOCK_GUARD(&queue_job_mutex);
> >      MultiFDDeviceState_t *device_state;
> >  
> >      assert(multifd_payload_empty(device_state_send));
> >  
> > -    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
> > +    /*
> > +     * TODO: The lock name may need change, but I'm reusing just for
> > +     * simplicity.
> > +     */
> > +    qemu_mutex_lock(&queue_job_mutex);
> > +
> >      device_state = &device_state_send->u.device_state;
> >      /*
> > -     * NOTE: here we must use a static idstr (e.g. of a savevm state
> > -     * entry) rather than any dynamically allocated buffer, because multifd
> > +     * NOTE: here we must use a static idstr (e.g. of a savevm state entry)
> > +     * rather than any dynamically allocated buffer, because multifd
> >       * assumes this pointer is always valid!
> >       */
> >      device_state->idstr = idstr;
> >      device_state->instance_id = instance_id;
> > -    device_state->buf = g_memdup2(data, len);
> > -    device_state->buf_len = len;
> >  
> > -    if (!multifd_send(&device_state_send)) {
> > -        multifd_send_data_clear(device_state_send);
> > -        return false;
> > +    return &device_state_send->u.device_state;
> > +}
> > +
> > +/*
> > + * Need to be used after a previous call to multifd_device_state_prepare(),
> > + * the buffer must not be used after invoke multifd_device_state_finish().
> > + */
> > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s,
> > +                                      int64_t *buf_len)
> > +{
> > +    *buf_len = MULTIFD_DEVICE_STATE_BUFLEN;
> > +    return s->buf_prealloc;
> > +}
> > +
> > +/*
> > + * Need to be used only in pair with a previous call to
> > + * multifd_device_state_prepare().  Returns true if enqueue successful,
> > + * false otherwise.
> > + */
> > +bool multifd_device_state_finish(MultiFDDeviceState_t *state,
> > +                                 void *buf, int64_t buf_len)
> > +{
> > +    bool result = false;
> > +
> > +    /* Currently we only have one global free buffer */
> > +    assert(state == &device_state_send->u.device_state);
> > +
> > +    if (buf_len < 0) {
> > +        goto out;
> >      }
> >  
> > -    return true;
> > +    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
> > +    /* This normally will be the state->buf_prealloc, but not required */
> > +    state->buf = buf;
> > +    state->buf_len = buf_len;
> > +    result = multifd_send(&device_state_send);
> > +out:
> > +    qemu_mutex_unlock(&queue_job_mutex);
> > +    return result;
> >  }
> >  
> >  bool migration_has_device_state_support(void)
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index 5a20b831cf..2b5185e298 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -115,15 +115,6 @@ void multifd_send_data_clear(MultiFDSendData *data)
> >          return;
> >      }
> >  
> > -    switch (data->type) {
> > -    case MULTIFD_PAYLOAD_DEVICE_STATE:
> > -        multifd_device_state_clear(&data->u.device_state);
> > -        break;
> > -    default:
> > -        /* Nothing to do */
> > -        break;
> > -    }
> > -
> >      data->type = MULTIFD_PAYLOAD_NONE;
> >  }
> >  
> > -- 
> > 2.45.0
>
Fabiano Rosas Sept. 13, 2024, 1:21 p.m. UTC | #9
Peter Xu <peterx@redhat.com> writes:

> On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote:
>> Peter Xu <peterx@redhat.com> writes:
>> 
>> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you
>> understand the rework is a little frustrating.
>
> That's OK.
>
> [For some reason my email sync program decided to give up working for
>  hours.  I got more time looking at a tsc bug, which is good, but then I
>  miss a lot of emails..]
>
>> 
>> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
>> >> > +size_t multifd_device_state_payload_size(void)
>> >> > +{
>> >> > +    return sizeof(MultiFDDeviceState_t);
>> >> > +}
>> >> 
>> >> This will not be necessary because the payload size is the same as the
>> >> data type. We only need it for the special case where the MultiFDPages_t
>> >> is smaller than the total ram payload size.
>> >
>> > Today I was thinking maybe we should really clean this up, as the current
>> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
>> > that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
>> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
>> > that feeling stronger.
>> 
>> If we're going to commit bad code and then rewrite it a week later, we
>> could have just let the original series from Maciej merge without any of
>> this.
>
> Why it's "bad code"?
>
> It runs pretty well, I don't think it's bad code.  You wrote it, and I
> don't think it's bad at all.

Code that forces us to do arithmetic in order to properly allocate
memory and comes with a big comment explaining how we're dodging
compiler warnings is bad code in my book.

>
> But now we're rethinking after reading Maciej's new series.
>Personally I don't think it's a major problem.
>
> Note that we're not changing the design back: what was initially proposed
> was the client submitting an array of multifd objects.  I still don't think
> that's right.
>
> What the change goes so far is we make the union a struct, however that's
> still N+2 objects not 2*N, where 2 came from RAM+VFIO.  I think the
> important bits are still there (from your previous refactor series).
>

You fail to appreciate that before the RFC series, multifd already
allocated N for the pages. The device state adds another client, so that
would be another N anyway. The problem the RFC tried to solve was that
multifd channels owned that 2N, so the array was added to move the
memory into the client's ownership. IOW, it wasn't even the RFC series
that made it 2N, that was the multifd design all along. Now in hindsight
I don't think we should have went with the memory saving discussion.

>> I already suggested it a couple of times, we shouldn't be doing
>> core refactorings underneath contributors' patches, this is too
>> fragile. Just let people contribute their code and we can change it
>> later.
>
> I sincerely don't think a lot needs changing... only patch 1.  Basically
> patch 1 on top of your previous rework series will be at least what I want,
> but I'm open to comments from you guys.

Don't get me wrong, I'm very much in favor of what you're doing
here. However, I don't think it's ok to be backtracking on our design
while other people have series in flight that depend on it. You
certainly know the feeling of trying to merge a feature and having
maintainers ask you to rewrite a bunch code just to be able to start
working. That's not ideal.

I tried to quickly insert the RFC series before the device state series
progressed too much, but it's 3 months later and we're still discussing
it, maybe we don't need to do it this way.

And ok, let's consider the current situation a special case. But I would
like to avoid in the future this kind of uncertainty. 

>
> Note that patch 2-3 will be on top of Maciej's changes and they're totally
> not relevant to what we merged so far.  Hence, nothing relevant there to
> what you worked.  And this is the diff of patch 1:
>
>  migration/multifd.h              | 16 +++++++++++-----
>  migration/multifd-device-state.c |  8 ++++++--
>  migration/multifd-nocomp.c       | 13 ++++++-------
>  migration/multifd.c              | 25 ++++++-------------------
>  4 files changed, 29 insertions(+), 33 deletions(-)
>
> It's only 33 lines removed (many of which are comments..), it's not a huge
> lot.  I don't know why you feel so bad at this...
>
> It's probably because we maintain migration together, or we can keep our
> own way of work.  I don't think we did anything wrong yet so far.
>
> We can definitely talk about this in next 1:1.
>
>> 
>> This is also why I've been trying hard to separate core multifd
>> functionality from migration code that uses multifd to transmit their
>> data.
>> 
>> My original RFC plus the suggestion to extend multifd_ops for device
>> state would have (almost) made it so that no client code would be left
>> in multifd. We could have been turning this thing upside down and it
>> wouldn't affect anyone in terms of code conflicts.
>
> Do you mean you preferred the 2*N approach?
>

2*N, where N is usually not larger than 32 and the payload size is
1k. Yes, I'd trade that off no problem.

>> 
>> The ship has already sailed, so your patches below are fine, I have just
>> some small comments.
>
> I'm not sure what you meant about "ship sailed", but we should merge code
> whenever we think is the most correct.

As you put above, I agree that the important bits of the original series
have been preserved, but other secondary goals were lost, such as the
more abstract separation between multifd & client code and that is the
ship that has sailed.

That series was not: "introduce this array for no reason", we also lost
the ability to abstract the payload from the multifd threads when we
dropped the .alloc_fn callback for instance. The last patch you posted
here now adds multifd_device_state_prepare, somewhat ignoring that the
ram code also has the same pattern and it could be made to use the same
API.

I did accept your premise that ram+compression is one thing while
device_state is another, so I'm not asking it to be changed, just
pointing out that the RFC series also addressed those issues. I might
not have made that clear back then.

> I hope you meant after below all things look the best, or please shoot.
> That's exactly what I'm requesting for as comments.

What you have here is certainly an improvement from the current
state. I'm just ranting about the path we took here.

>> 
>> >
>> > I think we should change it now perhaps, otherwise we'll need to introduce
>> > other helpers to e.g. reset the device buffers, and that's not only slow
>> > but also not good looking, IMO.
>> 
>> I agree that part is kind of a sore thumb.
>> 
>> >
>> > So I went ahead with the idea in previous discussion, that I managed to
>> > change the SendData union into struct; the memory consumption is not super
>> > important yet, IMHO, but we should still stick with the object model where
>> > multifd enqueue thread switch buffer with multifd, as it still sounds a
>> > sane way to do.
>> >
>> > Then when that patch is ready, I further tried to make VFIO reuse multifd
>> > buffers just like what we do with MultiFDPages_t->offset[]: in RAM code we
>> > don't allocate it every time we enqueue.
>> >
>> > I hope it'll also work for VFIO.  VFIO has a specialty on being able to
>> > dump the config space so it's more complex (and I noticed Maciej's current
>> > design requires the final chunk of VFIO config data be migrated in one
>> > packet.. that is also part of the complexity there).  So I allowed that
>> > part to allocate a buffer but only that.  IOW, I made some API (see below)
>> > that can either reuse preallocated buffer, or use a separate one only for
>> > the final bulk.
>> >
>> > In short, could both of you have a look at what I came up with below?  I
>> > did that in patches because I think it's too much to comment, so patches
>> > may work better.  No concern if any of below could be good changes to you,
>> > then either Maciej can squash whatever into existing patches (and I feel
>> > like some existing patches in this series can go away with below design),
>> > or I can post pre-requisite patch but only if any of you prefer that.
>> >
>> > Anyway, let me know, the patches apply on top of this whole series applied
>> > first.
>> >
>> > I also wonder whether there can be any perf difference already (I tested
>> > all multifd qtest with below, but no VFIO I can run), perhaps not that
>> > much, but just to mention below should avoid both buffer allocations and
>> > one round of copy (so VFIO read() directly writes to the multifd buffers
>> > now).
>> >
>> > Thanks,
>> >
>> > ==========8<==========
>> > From a6cbcf692b2376e72cc053219d67bb32eabfb7a6 Mon Sep 17 00:00:00 2001
>> > From: Peter Xu <peterx@redhat.com>
>> > Date: Tue, 10 Sep 2024 12:10:59 -0400
>> > Subject: [PATCH 1/3] migration/multifd: Make MultiFDSendData a struct
>> >
>> > The newly introduced device state buffer can be used for either storing
>> > VFIO's read() raw data, but already also possible to store generic device
>> > states.  After noticing that device states may not easily provide a max
>> > buffer size (also the fact that RAM MultiFDPages_t after all also want to
>> > have flexibility on managing offset[] array), it may not be a good idea to
>> > stick with union on MultiFDSendData.. as it won't play well with such
>> > flexibility.
>> >
>> > Switch MultiFDSendData to a struct.
>> >
>> > It won't consume a lot more space in reality, after all the real buffers
>> > were already dynamically allocated, so it's so far only about the two
>> > structs (pages, device_state) that will be duplicated, but they're small.
>> >
>> > With this, we can remove the pretty hard to understand alloc size logic.
>> > Because now we can allocate offset[] together with the SendData, and
>> > properly free it when the SendData is freed.
>> >
>> > Signed-off-by: Peter Xu <peterx@redhat.com>
>> > ---
>> >  migration/multifd.h              | 16 +++++++++++-----
>> >  migration/multifd-device-state.c |  8 ++++++--
>> >  migration/multifd-nocomp.c       | 13 ++++++-------
>> >  migration/multifd.c              | 25 ++++++-------------------
>> >  4 files changed, 29 insertions(+), 33 deletions(-)
>> >
>> > diff --git a/migration/multifd.h b/migration/multifd.h
>> > index c15c83104c..47203334b9 100644
>> > --- a/migration/multifd.h
>> > +++ b/migration/multifd.h
>> > @@ -98,9 +98,13 @@ typedef struct {
>> >      uint32_t num;
>> >      /* number of normal pages */
>> >      uint32_t normal_num;
>> > +    /*
>> > +     * Pointer to the ramblock.  NOTE: it's caller's responsibility to make
>> > +     * sure the pointer is always valid!
>> > +     */
>> 
>> This could use some rewording, it's not clear what "caller" means here.
>> 
>> >      RAMBlock *block;
>> > -    /* offset of each page */
>> > -    ram_addr_t offset[];
>> > +    /* offset array of each page, managed by multifd */
>> 
>> I'd drop the part after the comma, it's not very accurate and also gives
>> an impression that something sophisticated is being done to this.
>> 
>> > +    ram_addr_t *offset;
>> >  } MultiFDPages_t;
>> >  
>> >  struct MultiFDRecvData {
>> > @@ -123,7 +127,7 @@ typedef enum {
>> >      MULTIFD_PAYLOAD_DEVICE_STATE,
>> >  } MultiFDPayloadType;
>> >  
>> > -typedef union MultiFDPayload {
>> > +typedef struct MultiFDPayload {
>> >      MultiFDPages_t ram;
>> >      MultiFDDeviceState_t device_state;
>> >  } MultiFDPayload;
>> > @@ -323,11 +327,13 @@ static inline uint32_t multifd_ram_page_count(void)
>> >  void multifd_ram_save_setup(void);
>> >  void multifd_ram_save_cleanup(void);
>> >  int multifd_ram_flush_and_sync(void);
>> > -size_t multifd_ram_payload_size(void);
>> > +void multifd_ram_payload_alloc(MultiFDPages_t *pages);
>> > +void multifd_ram_payload_free(MultiFDPages_t *pages);
>> >  void multifd_ram_fill_packet(MultiFDSendParams *p);
>> >  int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
>> >  
>> > -size_t multifd_device_state_payload_size(void);
>> > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state);
>> > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state);
>> >  void multifd_device_state_save_setup(void);
>> >  void multifd_device_state_clear(MultiFDDeviceState_t *device_state);
>> >  void multifd_device_state_save_cleanup(void);
>> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
>> > index 9b364e8ef3..72b72b6e62 100644
>> > --- a/migration/multifd-device-state.c
>> > +++ b/migration/multifd-device-state.c
>> > @@ -22,9 +22,13 @@ bool send_threads_abort;
>> >  
>> >  static MultiFDSendData *device_state_send;
>> >  
>> > -size_t multifd_device_state_payload_size(void)
>> > +/* TODO: use static buffers for idstr and buf */
>> > +void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state)
>> > +{
>> > +}
>> > +
>> > +void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state)
>> >  {
>> > -    return sizeof(MultiFDDeviceState_t);
>> >  }
>> >  
>> >  void multifd_device_state_save_setup(void)
>> > diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
>> > index 0b7b543f44..c1b95fee0d 100644
>> > --- a/migration/multifd-nocomp.c
>> > +++ b/migration/multifd-nocomp.c
>> > @@ -22,15 +22,14 @@
>> >  
>> >  static MultiFDSendData *multifd_ram_send;
>> >  
>> > -size_t multifd_ram_payload_size(void)
>> > +void multifd_ram_payload_alloc(MultiFDPages_t *pages)
>> >  {
>> > -    uint32_t n = multifd_ram_page_count();
>> > +    pages->offset = g_new0(ram_addr_t, multifd_ram_page_count());
>> > +}
>> >  
>> > -    /*
>> > -     * We keep an array of page offsets at the end of MultiFDPages_t,
>> > -     * add space for it in the allocation.
>> > -     */
>> > -    return sizeof(MultiFDPages_t) + n * sizeof(ram_addr_t);
>> > +void multifd_ram_payload_free(MultiFDPages_t *pages)
>> > +{
>> > +    g_clear_pointer(&pages->offset, g_free);
>> >  }
>> >  
>> >  void multifd_ram_save_setup(void)
>> > diff --git a/migration/multifd.c b/migration/multifd.c
>> > index bebe5b5a9b..5a20b831cf 100644
>> > --- a/migration/multifd.c
>> > +++ b/migration/multifd.c
>> > @@ -101,26 +101,12 @@ struct {
>> >  
>> >  MultiFDSendData *multifd_send_data_alloc(void)
>> >  {
>> > -    size_t max_payload_size, size_minus_payload;
>> > +    MultiFDSendData *new = g_new0(MultiFDSendData, 1);
>> >  
>> > -    /*
>> > -     * MultiFDPages_t has a flexible array at the end, account for it
>> > -     * when allocating MultiFDSendData. Use max() in case other types
>> > -     * added to the union in the future are larger than
>> > -     * (MultiFDPages_t + flex array).
>> > -     */
>> > -    max_payload_size = MAX(multifd_ram_payload_size(),
>> > -                           multifd_device_state_payload_size());
>> > -    max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload));
>> > -
>> > -    /*
>> > -     * Account for any holes the compiler might insert. We can't pack
>> > -     * the structure because that misaligns the members and triggers
>> > -     * Waddress-of-packed-member.
>> > -     */
>> > -    size_minus_payload = sizeof(MultiFDSendData) - sizeof(MultiFDPayload);
>> > +    multifd_ram_payload_alloc(&new->u.ram);
>> > +    multifd_device_state_payload_alloc(&new->u.device_state);
>> >  
>> > -    return g_malloc0(size_minus_payload + max_payload_size);
>> > +    return new;
>> >  }
>> >  
>> >  void multifd_send_data_clear(MultiFDSendData *data)
>> > @@ -147,7 +133,8 @@ void multifd_send_data_free(MultiFDSendData *data)
>> >          return;
>> >      }
>> >  
>> > -    multifd_send_data_clear(data);
>> > +    multifd_ram_payload_free(&data->u.ram);
>> > +    multifd_device_state_payload_free(&data->u.device_state);
>> 
>> The "u" needs to be dropped now. Could just change to "p". Or can we now
>> move the whole struct inside MultiFDSendData?
>
> Yep, all your comments look good to me.
>
> A note here: I intentionally didn't touch "u." as that requires more
> changes which doesn't help as me leaving "patch-styled comment".  As I
> said, feel free to see the patches as comments not patches for merging yet.
> I / Maciej can prepare patch but only if the idea in general can be
> accepted.
>
> For me as I mentioned patch 2-3 do not relevant much to current master
> branch, afaiu, so if you guys like I can repost patch 1 with a formal one,
> but only if Maciej thinks it's easier for him.
>

I don't mind either way. If it were a proper series, it could be fetched
with b4, maybe that helps.

>> 
>> >  
>> >      g_free(data);
>> >  }
>> > -- 
>> > 2.45.0
>> >
>> >
>> >
>> > From 6695d134c0818f42183f5ea03c21e6d56e7b57ea Mon Sep 17 00:00:00 2001
>> > From: Peter Xu <peterx@redhat.com>
>> > Date: Tue, 10 Sep 2024 12:24:14 -0400
>> > Subject: [PATCH 2/3] migration/multifd: Optimize device_state->idstr updates
>> >
>> > The duplication / allocation of idstr for each VFIO blob is an overkill, as
>> > idstr isn't something that changes frequently.  Also, the idstr always came
>> > from the upper layer of se->idstr so it's always guaranteed to
>> > exist (e.g. no device unplug allowed during migration).
>> >
>> > Signed-off-by: Peter Xu <peterx@redhat.com>
>> > ---
>> >  migration/multifd.h              |  4 ++++
>> >  migration/multifd-device-state.c | 10 +++++++---
>> >  2 files changed, 11 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/migration/multifd.h b/migration/multifd.h
>> > index 47203334b9..1eaa5d4496 100644
>> > --- a/migration/multifd.h
>> > +++ b/migration/multifd.h
>> > @@ -115,6 +115,10 @@ struct MultiFDRecvData {
>> >  };
>> >  
>> >  typedef struct {
>> > +    /*
>> > +     * Name of the owner device.  NOTE: it's caller's responsibility to
>> > +     * make sure the pointer is always valid!
>> > +     */
>> 
>> Same comment as the other one here.
>> 
>> >      char *idstr;
>> >      uint32_t instance_id;
>> >      char *buf;
>> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
>> > index 72b72b6e62..cfd0465bac 100644
>> > --- a/migration/multifd-device-state.c
>> > +++ b/migration/multifd-device-state.c
>> > @@ -44,7 +44,7 @@ void multifd_device_state_save_setup(void)
>> >  
>> >  void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
>> >  {
>> > -    g_clear_pointer(&device_state->idstr, g_free);
>> > +    device_state->idstr = NULL;
>> >      g_clear_pointer(&device_state->buf, g_free);
>> >  }
>> >  
>> > @@ -100,7 +100,12 @@ bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
>> >  
>> >      multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
>> >      device_state = &device_state_send->u.device_state;
>> > -    device_state->idstr = g_strdup(idstr);
>> > +    /*
>> > +     * NOTE: here we must use a static idstr (e.g. of a savevm state
>> > +     * entry) rather than any dynamically allocated buffer, because multifd
>> > +     * assumes this pointer is always valid!
>> > +     */
>> > +    device_state->idstr = idstr;
>> >      device_state->instance_id = instance_id;
>> >      device_state->buf = g_memdup2(data, len);
>> >      device_state->buf_len = len;
>> > @@ -137,7 +142,6 @@ static void multifd_device_state_save_thread_data_free(void *opaque)
>> >  {
>> >      struct MultiFDDSSaveThreadData *data = opaque;
>> >  
>> > -    g_clear_pointer(&data->idstr, g_free);
>> >      g_free(data);
>> >  }
>> >  
>> > -- 
>> > 2.45.0
>> >
>> >
>> > From abfea9698ff46ad0e0175e1dcc6e005e0b2ece2a Mon Sep 17 00:00:00 2001
>> > From: Peter Xu <peterx@redhat.com>
>> > Date: Tue, 10 Sep 2024 12:27:49 -0400
>> > Subject: [PATCH 3/3] migration/multifd: Optimize device_state buffer
>> >  allocations
>> >
>> > Provide a device_state->buf_prealloc so that the buffers can be reused if
>> > possible.  Provide a set of APIs to use it right.  Please see the
>> > documentation for the API in the code.
>> >
>> > The default buffer size came from VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE as of
>> > now.
>> >
>> > Signed-off-by: Peter Xu <peterx@redhat.com>
>> > ---
>> >  include/hw/vfio/vfio-common.h    |  9 ++++
>> >  include/migration/misc.h         | 12 ++++-
>> >  migration/multifd.h              | 11 +++-
>> >  hw/vfio/migration.c              | 43 ++++++++-------
>> >  migration/multifd-device-state.c | 93 +++++++++++++++++++++++++-------
>> >  migration/multifd.c              |  9 ----
>> >  6 files changed, 126 insertions(+), 51 deletions(-)
>> >
>> > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> > index 4578a0ca6a..c1f2f4ae55 100644
>> > --- a/include/hw/vfio/vfio-common.h
>> > +++ b/include/hw/vfio/vfio-common.h
>> > @@ -61,6 +61,13 @@ typedef struct VFIORegion {
>> >      uint8_t nr; /* cache the region number for debug */
>> >  } VFIORegion;
>> >  
>> > +typedef struct VFIODeviceStatePacket {
>> > +    uint32_t version;
>> > +    uint32_t idx;
>> > +    uint32_t flags;
>> > +    uint8_t data[0];
>> > +} QEMU_PACKED VFIODeviceStatePacket;
>> > +
>> >  typedef struct VFIOMigration {
>> >      struct VFIODevice *vbasedev;
>> >      VMChangeStateEntry *vm_state;
>> > @@ -168,6 +175,8 @@ typedef struct VFIODevice {
>> >      int devid;
>> >      IOMMUFDBackend *iommufd;
>> >      VFIOIOASHwpt *hwpt;
>> > +    /* Only used on sender side when multifd is enabled */
>> > +    VFIODeviceStatePacket *multifd_packet;
>> >      QLIST_ENTRY(VFIODevice) hwpt_next;
>> >  } VFIODevice;
>> >  
>> > diff --git a/include/migration/misc.h b/include/migration/misc.h
>> > index 26f7f3140f..1a8676ed3d 100644
>> > --- a/include/migration/misc.h
>> > +++ b/include/migration/misc.h
>> > @@ -112,8 +112,16 @@ bool migration_in_bg_snapshot(void);
>> >  void dirty_bitmap_mig_init(void);
>> >  
>> >  /* migration/multifd-device-state.c */
>> > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
>> > -                                char *data, size_t len);
>> > +struct MultiFDDeviceState_t;
>> > +typedef struct MultiFDDeviceState_t MultiFDDeviceState_t;
>> > +
>> > +MultiFDDeviceState_t *
>> > +multifd_device_state_prepare(char *idstr, uint32_t instance_id);
>> > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s,
>> > +                                      int64_t *buf_len);
>> > +bool multifd_device_state_finish(MultiFDDeviceState_t *state,
>> > +                                 void *buf, int64_t buf_len);
>> > +
>> >  bool migration_has_device_state_support(void);
>> >  
>> >  void
>> > diff --git a/migration/multifd.h b/migration/multifd.h
>> > index 1eaa5d4496..1ccdeeb8c5 100644
>> > --- a/migration/multifd.h
>> > +++ b/migration/multifd.h
>> > @@ -15,6 +15,7 @@
>> >  
>> >  #include "exec/target_page.h"
>> >  #include "ram.h"
>> > +#include "migration/misc.h"
>> >  
>> >  typedef struct MultiFDRecvData MultiFDRecvData;
>> >  typedef struct MultiFDSendData MultiFDSendData;
>> > @@ -114,16 +115,22 @@ struct MultiFDRecvData {
>> >      off_t file_offset;
>> >  };
>> >  
>> > -typedef struct {
>> > +struct MultiFDDeviceState_t {
>> >      /*
>> >       * Name of the owner device.  NOTE: it's caller's responsibility to
>> >       * make sure the pointer is always valid!
>> >       */
>> >      char *idstr;
>> >      uint32_t instance_id;
>> > +    /*
>> > +     * Points to the buffer to send via multifd.  Normally it's the same as
>> > +     * buf_prealloc, otherwise the caller needs to make sure the buffer is
>> > +     * avaliable through multifd running.
>> 
>> "throughout multifd runtime" maybe.
>> 
>> > +     */
>> >      char *buf;
>> > +    char *buf_prealloc;
>> >      size_t buf_len;
>> > -} MultiFDDeviceState_t;
>> > +};
>> >  
>> >  typedef enum {
>> >      MULTIFD_PAYLOAD_NONE,
>> > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>> > index 67996aa2df..e36422b7c5 100644
>> > --- a/hw/vfio/migration.c
>> > +++ b/hw/vfio/migration.c
>> > @@ -59,13 +59,6 @@
>> >  
>> >  #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
>> >  
>> > -typedef struct VFIODeviceStatePacket {
>> > -    uint32_t version;
>> > -    uint32_t idx;
>> > -    uint32_t flags;
>> > -    uint8_t data[0];
>> > -} QEMU_PACKED VFIODeviceStatePacket;
>> > -
>> >  static int64_t bytes_transferred;
>> >  
>> >  static const char *mig_state_to_str(enum vfio_device_mig_state state)
>> > @@ -741,6 +734,9 @@ static void vfio_save_cleanup(void *opaque)
>> >      migration->initial_data_sent = false;
>> >      vfio_migration_cleanup(vbasedev);
>> >      trace_vfio_save_cleanup(vbasedev->name);
>> > +    if (vbasedev->multifd_packet) {
>> > +        g_clear_pointer(&vbasedev->multifd_packet, g_free);
>> > +    }
>> >  }
>> >  
>> >  static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
>> > @@ -892,7 +888,8 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas
>> >      g_autoptr(QIOChannelBuffer) bioc = NULL;
>> >      QEMUFile *f = NULL;
>> >      int ret;
>> > -    g_autofree VFIODeviceStatePacket *packet = NULL;
>> > +    VFIODeviceStatePacket *packet;
>> > +    MultiFDDeviceState_t *state;
>> >      size_t packet_len;
>> >  
>> >      bioc = qio_channel_buffer_new(0);
>> > @@ -911,13 +908,19 @@ static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbas
>> >      }
>> >  
>> >      packet_len = sizeof(*packet) + bioc->usage;
>> > -    packet = g_malloc0(packet_len);
>> > +
>> > +    state = multifd_device_state_prepare(idstr, instance_id);
>> > +    /*
>> > +     * Do not reuse multifd buffer, but use our own due to random size.
>> > +     * The buffer will be freed only when save cleanup.
>> > +     */
>> > +    vbasedev->multifd_packet = g_malloc0(packet_len);
>> > +    packet = vbasedev->multifd_packet;
>> >      packet->idx = idx;
>> >      packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
>> >      memcpy(&packet->data, bioc->data, bioc->usage);
>> >  
>> > -    if (!multifd_queue_device_state(idstr, instance_id,
>> > -                                    (char *)packet, packet_len)) {
>> > +    if (!multifd_device_state_finish(state, packet, packet_len)) {
>> >          ret = -1;
>> >      }
>> >  
>> > @@ -936,7 +939,6 @@ static int vfio_save_complete_precopy_thread(char *idstr,
>> >      VFIODevice *vbasedev = opaque;
>> >      VFIOMigration *migration = vbasedev->migration;
>> >      int ret;
>> > -    g_autofree VFIODeviceStatePacket *packet = NULL;
>> >      uint32_t idx;
>> >  
>> >      if (!migration->multifd_transfer) {
>> > @@ -954,21 +956,25 @@ static int vfio_save_complete_precopy_thread(char *idstr,
>> >          goto ret_finish;
>> >      }
>> >  
>> > -    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
>> > -
>> >      for (idx = 0; ; idx++) {
>> > +        VFIODeviceStatePacket *packet;
>> > +        MultiFDDeviceState_t *state;
>> >          ssize_t data_size;
>> >          size_t packet_size;
>> > +        int64_t buf_size;
>> >  
>> >          if (qatomic_read(abort_flag)) {
>> >              ret = -ECANCELED;
>> >              goto ret_finish;
>> >          }
>> >  
>> > +        state = multifd_device_state_prepare(idstr, instance_id);
>> > +        packet = multifd_device_state_get_buffer(state, &buf_size);
>> >          data_size = read(migration->data_fd, &packet->data,
>> > -                         migration->data_buffer_size);
>> > +                         buf_size - sizeof(*packet));
>> >          if (data_size < 0) {
>> >              if (errno != ENOMSG) {
>> > +                multifd_device_state_finish(state, NULL, 0);
>> >                  ret = -errno;
>> >                  goto ret_finish;
>> >              }
>> > @@ -980,14 +986,15 @@ static int vfio_save_complete_precopy_thread(char *idstr,
>> >              data_size = 0;
>> >          }
>> >  
>> > -        if (data_size == 0)
>> > +        if (data_size == 0) {
>> > +            multifd_device_state_finish(state, NULL, 0);
>> >              break;
>> > +        }
>> >  
>> >          packet->idx = idx;
>> >          packet_size = sizeof(*packet) + data_size;
>> >  
>> > -        if (!multifd_queue_device_state(idstr, instance_id,
>> > -                                        (char *)packet, packet_size)) {
>> > +        if (!multifd_device_state_finish(state, packet, packet_size)) {
>> >              ret = -1;
>> >              goto ret_finish;
>> >          }
>> > diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
>> > index cfd0465bac..6f0259426d 100644
>> > --- a/migration/multifd-device-state.c
>> > +++ b/migration/multifd-device-state.c
>> > @@ -20,15 +20,18 @@ ThreadPool *send_threads;
>> >  int send_threads_ret;
>> >  bool send_threads_abort;
>> >  
>> > +#define  MULTIFD_DEVICE_STATE_BUFLEN  (1UL << 20)
>> > +
>> >  static MultiFDSendData *device_state_send;
>> >  
>> > -/* TODO: use static buffers for idstr and buf */
>> >  void multifd_device_state_payload_alloc(MultiFDDeviceState_t *device_state)
>> >  {
>> > +    device_state->buf_prealloc = g_malloc0(MULTIFD_DEVICE_STATE_BUFLEN);
>> >  }
>> >  
>> >  void multifd_device_state_payload_free(MultiFDDeviceState_t *device_state)
>> >  {
>> > +    g_clear_pointer(&device_state->buf_prealloc, g_free);
>> >  }
>> >  
>> >  void multifd_device_state_save_setup(void)
>> > @@ -42,12 +45,6 @@ void multifd_device_state_save_setup(void)
>> >      send_threads_abort = false;
>> >  }
>> >  
>> > -void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
>> > -{
>> > -    device_state->idstr = NULL;
>> > -    g_clear_pointer(&device_state->buf, g_free);
>> > -}
>> > -
>> >  void multifd_device_state_save_cleanup(void)
>> >  {
>> >      g_clear_pointer(&send_threads, thread_pool_free);
>> > @@ -89,33 +86,89 @@ void multifd_device_state_send_prepare(MultiFDSendParams *p)
>> >      multifd_device_state_fill_packet(p);
>> >  }
>> >  
>> > -bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
>> > -                                char *data, size_t len)
>> > +/*
>> > + * Prepare to send some device state via multifd.  Returns the current idle
>> > + * MultiFDDeviceState_t*.
>> > + *
>> > + * As a follow up, the caller must call multifd_device_state_finish() to
>> > + * release the resources.
>> > + *
>> > + * One example usage of the API:
>> > + *
>> > + *   // Fetch a free multifd device state object
>> > + *   state = multifd_device_state_prepare(idstr, instance_id);
>> > + *
>> > + *   // Optional: fetch the buffer to reuse
>> > + *   buf = multifd_device_state_get_buffer(state, &buf_size);
>> > + *
>> > + *   // Here len>0 means success, otherwise failure
>> > + *   len = buffer_fill(buf, buf_size);
>> > + *
>> > + *   // Finish the transaction, either enqueue or cancel the request.  Here
>> > + *   // len>0 will enqueue, <=0 will cancel.
>> > + *   multifd_device_state_finish(state, buf, len);
>> > + */
>> > +MultiFDDeviceState_t *
>> > +multifd_device_state_prepare(char *idstr, uint32_t instance_id)
>> >  {
>> > -    /* Device state submissions can come from multiple threads */
>> > -    QEMU_LOCK_GUARD(&queue_job_mutex);
>> >      MultiFDDeviceState_t *device_state;
>> >  
>> >      assert(multifd_payload_empty(device_state_send));
>> >  
>> > -    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
>> > +    /*
>> > +     * TODO: The lock name may need change, but I'm reusing just for
>> > +     * simplicity.
>> > +     */
>> > +    qemu_mutex_lock(&queue_job_mutex);
>> > +
>> >      device_state = &device_state_send->u.device_state;
>> >      /*
>> > -     * NOTE: here we must use a static idstr (e.g. of a savevm state
>> > -     * entry) rather than any dynamically allocated buffer, because multifd
>> > +     * NOTE: here we must use a static idstr (e.g. of a savevm state entry)
>> > +     * rather than any dynamically allocated buffer, because multifd
>> >       * assumes this pointer is always valid!
>> >       */
>> >      device_state->idstr = idstr;
>> >      device_state->instance_id = instance_id;
>> > -    device_state->buf = g_memdup2(data, len);
>> > -    device_state->buf_len = len;
>> >  
>> > -    if (!multifd_send(&device_state_send)) {
>> > -        multifd_send_data_clear(device_state_send);
>> > -        return false;
>> > +    return &device_state_send->u.device_state;
>> > +}
>> > +
>> > +/*
>> > + * Need to be used after a previous call to multifd_device_state_prepare(),
>> > + * the buffer must not be used after invoke multifd_device_state_finish().
>> > + */
>> > +void *multifd_device_state_get_buffer(MultiFDDeviceState_t *s,
>> > +                                      int64_t *buf_len)
>> > +{
>> > +    *buf_len = MULTIFD_DEVICE_STATE_BUFLEN;
>> > +    return s->buf_prealloc;
>> > +}
>> > +
>> > +/*
>> > + * Need to be used only in pair with a previous call to
>> > + * multifd_device_state_prepare().  Returns true if enqueue successful,
>> > + * false otherwise.
>> > + */
>> > +bool multifd_device_state_finish(MultiFDDeviceState_t *state,
>> > +                                 void *buf, int64_t buf_len)
>> > +{
>> > +    bool result = false;
>> > +
>> > +    /* Currently we only have one global free buffer */
>> > +    assert(state == &device_state_send->u.device_state);
>> > +
>> > +    if (buf_len < 0) {
>> > +        goto out;
>> >      }
>> >  
>> > -    return true;
>> > +    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
>> > +    /* This normally will be the state->buf_prealloc, but not required */
>> > +    state->buf = buf;
>> > +    state->buf_len = buf_len;
>> > +    result = multifd_send(&device_state_send);
>> > +out:
>> > +    qemu_mutex_unlock(&queue_job_mutex);
>> > +    return result;
>> >  }
>> >  
>> >  bool migration_has_device_state_support(void)
>> > diff --git a/migration/multifd.c b/migration/multifd.c
>> > index 5a20b831cf..2b5185e298 100644
>> > --- a/migration/multifd.c
>> > +++ b/migration/multifd.c
>> > @@ -115,15 +115,6 @@ void multifd_send_data_clear(MultiFDSendData *data)
>> >          return;
>> >      }
>> >  
>> > -    switch (data->type) {
>> > -    case MULTIFD_PAYLOAD_DEVICE_STATE:
>> > -        multifd_device_state_clear(&data->u.device_state);
>> > -        break;
>> > -    default:
>> > -        /* Nothing to do */
>> > -        break;
>> > -    }
>> > -
>> >      data->type = MULTIFD_PAYLOAD_NONE;
>> >  }
>> >  
>> > -- 
>> > 2.45.0
>>
Peter Xu Sept. 13, 2024, 2:19 p.m. UTC | #10
On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote:
> >> Peter Xu <peterx@redhat.com> writes:
> >> 
> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you
> >> understand the rework is a little frustrating.
> >
> > That's OK.
> >
> > [For some reason my email sync program decided to give up working for
> >  hours.  I got more time looking at a tsc bug, which is good, but then I
> >  miss a lot of emails..]
> >
> >> 
> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
> >> >> > +size_t multifd_device_state_payload_size(void)
> >> >> > +{
> >> >> > +    return sizeof(MultiFDDeviceState_t);
> >> >> > +}
> >> >> 
> >> >> This will not be necessary because the payload size is the same as the
> >> >> data type. We only need it for the special case where the MultiFDPages_t
> >> >> is smaller than the total ram payload size.
> >> >
> >> > Today I was thinking maybe we should really clean this up, as the current
> >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
> >> > that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
> >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
> >> > that feeling stronger.
> >> 
> >> If we're going to commit bad code and then rewrite it a week later, we
> >> could have just let the original series from Maciej merge without any of
> >> this.
> >
> > Why it's "bad code"?
> >
> > It runs pretty well, I don't think it's bad code.  You wrote it, and I
> > don't think it's bad at all.
> 
> Code that forces us to do arithmetic in order to properly allocate
> memory and comes with a big comment explaining how we're dodging
> compiler warnings is bad code in my book.
> 
> >
> > But now we're rethinking after reading Maciej's new series.
> >Personally I don't think it's a major problem.
> >
> > Note that we're not changing the design back: what was initially proposed
> > was the client submitting an array of multifd objects.  I still don't think
> > that's right.
> >
> > What the change goes so far is we make the union a struct, however that's
> > still N+2 objects not 2*N, where 2 came from RAM+VFIO.  I think the
> > important bits are still there (from your previous refactor series).
> >
> 
> You fail to appreciate that before the RFC series, multifd already
> allocated N for the pages.

It depends on how you see it, IMHO.  I think it allocates N not for the
"pages" but for the "threads", because the threads can be busy with those
buffers, no matter if it's "page" or "device data".

> The device state adds another client, so that
> would be another N anyway. The problem the RFC tried to solve was that
> multifd channels owned that 2N, so the array was added to move the
> memory into the client's ownership. IOW, it wasn't even the RFC series
> that made it 2N, that was the multifd design all along. Now in hindsight
> I don't think we should have went with the memory saving discussion.

I think I could have made that feeling that I only wanted to save memory,
if so, I'm sorry.  But do you still remember I mentioned "we can make it a
struct, too" before your series landed?  Then you think it's ok to keep
using union, and I'm ok too! At least at that time.  I don't think that's a
huge deal.  I don't think each route we go must be perfect, but we try the
best to make it as good.

I don't think any discussion must not happen.  I agree memory consumption
is not the 1st thing to worry, but I don't see why it can't be discussed.

Note that I never said we can't save those memory either - I plan to have
follow up patches (for this, after Maciej's series land.. that's why I even
didn't yet mention) to allow modules report device state buffer size.  I
just didn't say, yet, and don't plan to worry before vfio series lands.
When with that, we'll save 1M*N when no vfio device attached (but we'll
need to handle hotplug).  So I think we don't need to lose any finally.

However I think we need to get straight with the base design on how vfio
should use multifd, because it's important bit and I don't want to rework
important bits after a huge feature, if we know a better directions.

I don't even think what I proposed patch 1-3 here is a must to me, I should
be clear again here just in case we have similar discussions
afterwards.. that I'm ok with below be done after Maciej's:

  - Avoid memory allocations per-packet (done in patch 2-3)
  - Avoid unnecessary data copy (done in patch 2-3)
  - Avoid allocate device buffers when no device will use (not proposed)

But I'm not ok building everything on top of the idea of not using multifd
buffers in the current way, because that can involve a lot of changes:
that's where buffer passes from up to down or backwards, and the interface
needs change a lot too.  We already have that in master so it's not a
problem now.

> 
> >> I already suggested it a couple of times, we shouldn't be doing
> >> core refactorings underneath contributors' patches, this is too
> >> fragile. Just let people contribute their code and we can change it
> >> later.
> >
> > I sincerely don't think a lot needs changing... only patch 1.  Basically
> > patch 1 on top of your previous rework series will be at least what I want,
> > but I'm open to comments from you guys.
> 
> Don't get me wrong, I'm very much in favor of what you're doing
> here. However, I don't think it's ok to be backtracking on our design
> while other people have series in flight that depend on it. You
> certainly know the feeling of trying to merge a feature and having
> maintainers ask you to rewrite a bunch code just to be able to start
> working. That's not ideal.

I as a patch writer always like to do that when it's essential.  Normally
the case is I don't have enough reviewer resources to help me get a better
design, or discuss about it.

When vfio is the new user of multifd vfio needs to do the heavy lifting to
draft the api.

> 
> I tried to quickly insert the RFC series before the device state series
> progressed too much, but it's 3 months later and we're still discussing
> it, maybe we don't need to do it this way.

Can I get that of your feeling from when you were working on mapped-ram?
That series does take long enough, I agree.  Not so bad yet with the VFIO
series - it's good to have you around because you provide great reviews.
I'm also trying the best to not let a series dangle for more than a year.
I don't think 3 months is long with this feature: this is the 1st multifd
extrenal user (and file mapping is also in another angle), it can take some
time.

Sorry if it's so, but sorry again I don't think I get convinced: I think we
need to go this way to build blocks one by one, and we need to make sure
lower blocks are hopefully solid enough to take the upper ones.  Again I'm
ok with small things that go against it, but not major designs.  We
shouldn't go rewrite major designs if we seem to know a better one.

> 
> And ok, let's consider the current situation a special case. But I would
> like to avoid in the future this kind of uncertainty. 
> 
> >
> > Note that patch 2-3 will be on top of Maciej's changes and they're totally
> > not relevant to what we merged so far.  Hence, nothing relevant there to
> > what you worked.  And this is the diff of patch 1:
> >
> >  migration/multifd.h              | 16 +++++++++++-----
> >  migration/multifd-device-state.c |  8 ++++++--
> >  migration/multifd-nocomp.c       | 13 ++++++-------
> >  migration/multifd.c              | 25 ++++++-------------------
> >  4 files changed, 29 insertions(+), 33 deletions(-)
> >
> > It's only 33 lines removed (many of which are comments..), it's not a huge
> > lot.  I don't know why you feel so bad at this...
> >
> > It's probably because we maintain migration together, or we can keep our
> > own way of work.  I don't think we did anything wrong yet so far.
> >
> > We can definitely talk about this in next 1:1.
> >
> >> 
> >> This is also why I've been trying hard to separate core multifd
> >> functionality from migration code that uses multifd to transmit their
> >> data.
> >> 
> >> My original RFC plus the suggestion to extend multifd_ops for device
> >> state would have (almost) made it so that no client code would be left
> >> in multifd. We could have been turning this thing upside down and it
> >> wouldn't affect anyone in terms of code conflicts.
> >
> > Do you mean you preferred the 2*N approach?
> >
> 
> 2*N, where N is usually not larger than 32 and the payload size is
> 1k. Yes, I'd trade that off no problem.

I think it's a problem.

Vdpa when involved with exactly the same pattern of how vfio uses it (as
they're really alike underneath) then vdpa will need its own array of
buffers, or it'll need to take the same vfio lock which doesn't make sense
to me.

N+2, or, N+M (M is the user) is the minimum buffers we need.  N because
multifd can be worst case 100% busy on all threads occupying the buffers.
M because M users can be worst case 100% pre-filling.  It's either about
memory consumption, or about logical sensibility.

> 
> >> 
> >> The ship has already sailed, so your patches below are fine, I have just
> >> some small comments.
> >
> > I'm not sure what you meant about "ship sailed", but we should merge code
> > whenever we think is the most correct.
> 
> As you put above, I agree that the important bits of the original series
> have been preserved, but other secondary goals were lost, such as the
> more abstract separation between multifd & client code and that is the
> ship that has sailed.
> 
> That series was not: "introduce this array for no reason", we also lost
> the ability to abstract the payload from the multifd threads when we
> dropped the .alloc_fn callback for instance. The last patch you posted

I don't remember the details there, but my memory was that it was too
flexible while we seem to reach the consensus that we only process either
RAM or device, nothing else.

> here now adds multifd_device_state_prepare, somewhat ignoring that the
> ram code also has the same pattern and it could be made to use the same
> API.

I need some further elaborations to understand.

multifd_device_state_prepare currently does a few things: taking ownership
of the temp device state object, fill in idstr / instance_id, taking the
lock (so far is needed because we only have one device state object).  None
of them seems to be needed for RAM yet.

Feel free to send a rfc patch if that helps.

> 
> I did accept your premise that ram+compression is one thing while
> device_state is another, so I'm not asking it to be changed, just
> pointing out that the RFC series also addressed those issues. I might
> not have made that clear back then.
> 
> > I hope you meant after below all things look the best, or please shoot.
> > That's exactly what I'm requesting for as comments.
> 
> What you have here is certainly an improvement from the current
> state. I'm just ranting about the path we took here.
Fabiano Rosas Sept. 13, 2024, 3:04 p.m. UTC | #11
Peter Xu <peterx@redhat.com> writes:

> On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote:
>> Peter Xu <peterx@redhat.com> writes:
>> 
>> > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote:
>> >> Peter Xu <peterx@redhat.com> writes:
>> >> 
>> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you
>> >> understand the rework is a little frustrating.
>> >
>> > That's OK.
>> >
>> > [For some reason my email sync program decided to give up working for
>> >  hours.  I got more time looking at a tsc bug, which is good, but then I
>> >  miss a lot of emails..]
>> >
>> >> 
>> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
>> >> >> > +size_t multifd_device_state_payload_size(void)
>> >> >> > +{
>> >> >> > +    return sizeof(MultiFDDeviceState_t);
>> >> >> > +}
>> >> >> 
>> >> >> This will not be necessary because the payload size is the same as the
>> >> >> data type. We only need it for the special case where the MultiFDPages_t
>> >> >> is smaller than the total ram payload size.
>> >> >
>> >> > Today I was thinking maybe we should really clean this up, as the current
>> >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
>> >> > that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
>> >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
>> >> > that feeling stronger.
>> >> 
>> >> If we're going to commit bad code and then rewrite it a week later, we
>> >> could have just let the original series from Maciej merge without any of
>> >> this.
>> >
>> > Why it's "bad code"?
>> >
>> > It runs pretty well, I don't think it's bad code.  You wrote it, and I
>> > don't think it's bad at all.
>> 
>> Code that forces us to do arithmetic in order to properly allocate
>> memory and comes with a big comment explaining how we're dodging
>> compiler warnings is bad code in my book.
>> 
>> >
>> > But now we're rethinking after reading Maciej's new series.
>> >Personally I don't think it's a major problem.
>> >
>> > Note that we're not changing the design back: what was initially proposed
>> > was the client submitting an array of multifd objects.  I still don't think
>> > that's right.
>> >
>> > What the change goes so far is we make the union a struct, however that's
>> > still N+2 objects not 2*N, where 2 came from RAM+VFIO.  I think the
>> > important bits are still there (from your previous refactor series).
>> >
>> 
>> You fail to appreciate that before the RFC series, multifd already
>> allocated N for the pages.
>
> It depends on how you see it, IMHO.  I think it allocates N not for the
> "pages" but for the "threads", because the threads can be busy with those
> buffers, no matter if it's "page" or "device data".

Each MultiFD*Params had a p->pages, so N channels, N p->pages. The
device state series would add p->device_state, one per channel. So 2N +
2 (for the extra slot).

>
>> The device state adds another client, so that
>> would be another N anyway. The problem the RFC tried to solve was that
>> multifd channels owned that 2N, so the array was added to move the
>> memory into the client's ownership. IOW, it wasn't even the RFC series
>> that made it 2N, that was the multifd design all along. Now in hindsight
>> I don't think we should have went with the memory saving discussion.
>
> I think I could have made that feeling that I only wanted to save memory,
> if so, I'm sorry.  But do you still remember I mentioned "we can make it a
> struct, too" before your series landed?  Then you think it's ok to keep
> using union, and I'm ok too! At least at that time.  I don't think that's a
> huge deal.  I don't think each route we go must be perfect, but we try the
> best to make it as good.

Yep, I did agree with all of this. I'm just saying I now think I
shouldn't have.

>
> I don't think any discussion must not happen.  I agree memory consumption
> is not the 1st thing to worry, but I don't see why it can't be discussed.

It can be discussed, sure, but then 3 months pass and we're still
talking about it. Saved ~64k and spent 3 months. We could have just as
well said: "let's do a pass to optimize memory consumption after the
device state series is in".

>
> Note that I never said we can't save those memory either - I plan to have
> follow up patches (for this, after Maciej's series land.. that's why I even
> didn't yet mention) to allow modules report device state buffer size.  I
> just didn't say, yet, and don't plan to worry before vfio series lands.
> When with that, we'll save 1M*N when no vfio device attached (but we'll
> need to handle hotplug).  So I think we don't need to lose any finally.
>
> However I think we need to get straight with the base design on how vfio
> should use multifd, because it's important bit and I don't want to rework
> important bits after a huge feature, if we know a better directions.
>
> I don't even think what I proposed patch 1-3 here is a must to me, I should
> be clear again here just in case we have similar discussions
> afterwards.. that I'm ok with below be done after Maciej's:
>
>   - Avoid memory allocations per-packet (done in patch 2-3)
>   - Avoid unnecessary data copy (done in patch 2-3)
>   - Avoid allocate device buffers when no device will use (not proposed)
>
> But I'm not ok building everything on top of the idea of not using multifd
> buffers in the current way, because that can involve a lot of changes:
> that's where buffer passes from up to down or backwards, and the interface
> needs change a lot too.  We already have that in master so it's not a
> problem now.
>
>> 
>> >> I already suggested it a couple of times, we shouldn't be doing
>> >> core refactorings underneath contributors' patches, this is too
>> >> fragile. Just let people contribute their code and we can change it
>> >> later.
>> >
>> > I sincerely don't think a lot needs changing... only patch 1.  Basically
>> > patch 1 on top of your previous rework series will be at least what I want,
>> > but I'm open to comments from you guys.
>> 
>> Don't get me wrong, I'm very much in favor of what you're doing
>> here. However, I don't think it's ok to be backtracking on our design
>> while other people have series in flight that depend on it. You
>> certainly know the feeling of trying to merge a feature and having
>> maintainers ask you to rewrite a bunch code just to be able to start
>> working. That's not ideal.
>
> I as a patch writer always like to do that when it's essential.  Normally
> the case is I don't have enough reviewer resources to help me get a better
> design, or discuss about it.

Right, but we can't keep providing a moving target. See the thread pool
discussion for an example. It's hard to work that way. The discussion
here is similar, we introduced the union, now we're moving to the
struct. And you're right that the changes here are small, so let's not
get caught in that.

>
> When vfio is the new user of multifd vfio needs to do the heavy lifting to
> draft the api.

Well, multifd could have provided a flexible API to being with. That's
entirely on us. I have been toying with allowing more clients since at
least 1 year ago. We just couldn't get there in time.

>
>> 
>> I tried to quickly insert the RFC series before the device state series
>> progressed too much, but it's 3 months later and we're still discussing
>> it, maybe we don't need to do it this way.
>
> Can I get that of your feeling from when you were working on
> mapped-ram?

At that time I had already committed to helping maintain the code, so
the time spent there already went into the maintainer bucket anyway. If
I were instead just trying to drive-by, then that would have been a
pain.

> That series does take long enough, I agree.  Not so bad yet with the VFIO
> series - it's good to have you around because you provide great reviews.
> I'm also trying the best to not let a series dangle for more than a year.
> I don't think 3 months is long with this feature: this is the 1st multifd
> extrenal user (and file mapping is also in another angle), it can take some
> time.

Oh, I don't mean the VFIO series is taking long. That's a complex
feature indeed. I just mean going from p->pages to p->data could have
taken less time. I'm suggesting we might have overdone there a bit.

>
> Sorry if it's so, but sorry again I don't think I get convinced: I think we
> need to go this way to build blocks one by one, and we need to make sure
> lower blocks are hopefully solid enough to take the upper ones.  Again I'm
> ok with small things that go against it, but not major designs.  We
> shouldn't go rewrite major designs if we seem to know a better one.
>
>> 
>> And ok, let's consider the current situation a special case. But I would
>> like to avoid in the future this kind of uncertainty. 
>> 
>> >
>> > Note that patch 2-3 will be on top of Maciej's changes and they're totally
>> > not relevant to what we merged so far.  Hence, nothing relevant there to
>> > what you worked.  And this is the diff of patch 1:
>> >
>> >  migration/multifd.h              | 16 +++++++++++-----
>> >  migration/multifd-device-state.c |  8 ++++++--
>> >  migration/multifd-nocomp.c       | 13 ++++++-------
>> >  migration/multifd.c              | 25 ++++++-------------------
>> >  4 files changed, 29 insertions(+), 33 deletions(-)
>> >
>> > It's only 33 lines removed (many of which are comments..), it's not a huge
>> > lot.  I don't know why you feel so bad at this...
>> >
>> > It's probably because we maintain migration together, or we can keep our
>> > own way of work.  I don't think we did anything wrong yet so far.
>> >
>> > We can definitely talk about this in next 1:1.
>> >
>> >> 
>> >> This is also why I've been trying hard to separate core multifd
>> >> functionality from migration code that uses multifd to transmit their
>> >> data.
>> >> 
>> >> My original RFC plus the suggestion to extend multifd_ops for device
>> >> state would have (almost) made it so that no client code would be left
>> >> in multifd. We could have been turning this thing upside down and it
>> >> wouldn't affect anyone in terms of code conflicts.
>> >
>> > Do you mean you preferred the 2*N approach?
>> >
>> 
>> 2*N, where N is usually not larger than 32 and the payload size is
>> 1k. Yes, I'd trade that off no problem.
>
> I think it's a problem.
>
> Vdpa when involved with exactly the same pattern of how vfio uses it (as
> they're really alike underneath) then vdpa will need its own array of
> buffers, or it'll need to take the same vfio lock which doesn't make sense
> to me.
>
> N+2, or, N+M (M is the user) is the minimum buffers we need.  N because
> multifd can be worst case 100% busy on all threads occupying the buffers.
> M because M users can be worst case 100% pre-filling.  It's either about
> memory consumption, or about logical sensibility.

I'm aware of the memory consumption. Still, we're not forced to use the
minimum amount of space we can. If using more memory can lead to a
better design in the medium term, we're allowed to make that choice.

Hey, I'm not even saying we *should* have gone with 2N. I think it's
good that we're now N+M. But I think we also lost some design
flexibility due to that.

>
>> 
>> >> 
>> >> The ship has already sailed, so your patches below are fine, I have just
>> >> some small comments.
>> >
>> > I'm not sure what you meant about "ship sailed", but we should merge code
>> > whenever we think is the most correct.
>> 
>> As you put above, I agree that the important bits of the original series
>> have been preserved, but other secondary goals were lost, such as the
>> more abstract separation between multifd & client code and that is the
>> ship that has sailed.
>> 
>> That series was not: "introduce this array for no reason", we also lost
>> the ability to abstract the payload from the multifd threads when we
>> dropped the .alloc_fn callback for instance. The last patch you posted
>
> I don't remember the details there, but my memory was that it was too
> flexible while we seem to reach the consensus that we only process either
> RAM or device, nothing else.

Indeed. I'm being unfair here, sorry.

>
>> here now adds multifd_device_state_prepare, somewhat ignoring that the
>> ram code also has the same pattern and it could be made to use the same
>> API.
>
> I need some further elaborations to understand.
>
> multifd_device_state_prepare currently does a few things: taking ownership
> of the temp device state object, fill in idstr / instance_id, taking the
> lock (so far is needed because we only have one device state object).  None
> of them seems to be needed for RAM yet.
>
> Feel free to send a rfc patch if that helps.

What if I don't send a patch, wait for it to get merged and then send a
refactoring on top so we don't add yet another detour to this
conversation? =)

>
>> 
>> I did accept your premise that ram+compression is one thing while
>> device_state is another, so I'm not asking it to be changed, just
>> pointing out that the RFC series also addressed those issues. I might
>> not have made that clear back then.
>> 
>> > I hope you meant after below all things look the best, or please shoot.
>> > That's exactly what I'm requesting for as comments.
>> 
>> What you have here is certainly an improvement from the current
>> state. I'm just ranting about the path we took here.
Peter Xu Sept. 13, 2024, 3:22 p.m. UTC | #12
On Fri, Sep 13, 2024 at 12:04:00PM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote:
> >> Peter Xu <peterx@redhat.com> writes:
> >> 
> >> > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote:
> >> >> Peter Xu <peterx@redhat.com> writes:
> >> >> 
> >> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you
> >> >> understand the rework is a little frustrating.
> >> >
> >> > That's OK.
> >> >
> >> > [For some reason my email sync program decided to give up working for
> >> >  hours.  I got more time looking at a tsc bug, which is good, but then I
> >> >  miss a lot of emails..]
> >> >
> >> >> 
> >> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
> >> >> >> > +size_t multifd_device_state_payload_size(void)
> >> >> >> > +{
> >> >> >> > +    return sizeof(MultiFDDeviceState_t);
> >> >> >> > +}
> >> >> >> 
> >> >> >> This will not be necessary because the payload size is the same as the
> >> >> >> data type. We only need it for the special case where the MultiFDPages_t
> >> >> >> is smaller than the total ram payload size.
> >> >> >
> >> >> > Today I was thinking maybe we should really clean this up, as the current
> >> >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
> >> >> > that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
> >> >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
> >> >> > that feeling stronger.
> >> >> 
> >> >> If we're going to commit bad code and then rewrite it a week later, we
> >> >> could have just let the original series from Maciej merge without any of
> >> >> this.
> >> >
> >> > Why it's "bad code"?
> >> >
> >> > It runs pretty well, I don't think it's bad code.  You wrote it, and I
> >> > don't think it's bad at all.
> >> 
> >> Code that forces us to do arithmetic in order to properly allocate
> >> memory and comes with a big comment explaining how we're dodging
> >> compiler warnings is bad code in my book.
> >> 
> >> >
> >> > But now we're rethinking after reading Maciej's new series.
> >> >Personally I don't think it's a major problem.
> >> >
> >> > Note that we're not changing the design back: what was initially proposed
> >> > was the client submitting an array of multifd objects.  I still don't think
> >> > that's right.
> >> >
> >> > What the change goes so far is we make the union a struct, however that's
> >> > still N+2 objects not 2*N, where 2 came from RAM+VFIO.  I think the
> >> > important bits are still there (from your previous refactor series).
> >> >
> >> 
> >> You fail to appreciate that before the RFC series, multifd already
> >> allocated N for the pages.
> >
> > It depends on how you see it, IMHO.  I think it allocates N not for the
> > "pages" but for the "threads", because the threads can be busy with those
> > buffers, no matter if it's "page" or "device data".
> 
> Each MultiFD*Params had a p->pages, so N channels, N p->pages. The
> device state series would add p->device_state, one per channel. So 2N +
> 2 (for the extra slot).

Then it makes sense to have SendData covering pages+device_state.  I think
it's what we have now, but maybe I missed the point.

> 
> >
> >> The device state adds another client, so that
> >> would be another N anyway. The problem the RFC tried to solve was that
> >> multifd channels owned that 2N, so the array was added to move the
> >> memory into the client's ownership. IOW, it wasn't even the RFC series
> >> that made it 2N, that was the multifd design all along. Now in hindsight
> >> I don't think we should have went with the memory saving discussion.
> >
> > I think I could have made that feeling that I only wanted to save memory,
> > if so, I'm sorry.  But do you still remember I mentioned "we can make it a
> > struct, too" before your series landed?  Then you think it's ok to keep
> > using union, and I'm ok too! At least at that time.  I don't think that's a
> > huge deal.  I don't think each route we go must be perfect, but we try the
> > best to make it as good.
> 
> Yep, I did agree with all of this. I'm just saying I now think I
> shouldn't have.
> 
> >
> > I don't think any discussion must not happen.  I agree memory consumption
> > is not the 1st thing to worry, but I don't see why it can't be discussed.
> 
> It can be discussed, sure, but then 3 months pass and we're still
> talking about it. Saved ~64k and spent 3 months. We could have just as
> well said: "let's do a pass to optimize memory consumption after the
> device state series is in".

We didn't discuss 3 months on discussing memory consumption only!  It's
unfair to think it like that.

> 
> >
> > Note that I never said we can't save those memory either - I plan to have
> > follow up patches (for this, after Maciej's series land.. that's why I even
> > didn't yet mention) to allow modules report device state buffer size.  I
> > just didn't say, yet, and don't plan to worry before vfio series lands.
> > When with that, we'll save 1M*N when no vfio device attached (but we'll
> > need to handle hotplug).  So I think we don't need to lose any finally.
> >
> > However I think we need to get straight with the base design on how vfio
> > should use multifd, because it's important bit and I don't want to rework
> > important bits after a huge feature, if we know a better directions.
> >
> > I don't even think what I proposed patch 1-3 here is a must to me, I should
> > be clear again here just in case we have similar discussions
> > afterwards.. that I'm ok with below be done after Maciej's:
> >
> >   - Avoid memory allocations per-packet (done in patch 2-3)
> >   - Avoid unnecessary data copy (done in patch 2-3)
> >   - Avoid allocate device buffers when no device will use (not proposed)
> >
> > But I'm not ok building everything on top of the idea of not using multifd
> > buffers in the current way, because that can involve a lot of changes:
> > that's where buffer passes from up to down or backwards, and the interface
> > needs change a lot too.  We already have that in master so it's not a
> > problem now.
> >
> >> 
> >> >> I already suggested it a couple of times, we shouldn't be doing
> >> >> core refactorings underneath contributors' patches, this is too
> >> >> fragile. Just let people contribute their code and we can change it
> >> >> later.
> >> >
> >> > I sincerely don't think a lot needs changing... only patch 1.  Basically
> >> > patch 1 on top of your previous rework series will be at least what I want,
> >> > but I'm open to comments from you guys.
> >> 
> >> Don't get me wrong, I'm very much in favor of what you're doing
> >> here. However, I don't think it's ok to be backtracking on our design
> >> while other people have series in flight that depend on it. You
> >> certainly know the feeling of trying to merge a feature and having
> >> maintainers ask you to rewrite a bunch code just to be able to start
> >> working. That's not ideal.
> >
> > I as a patch writer always like to do that when it's essential.  Normally
> > the case is I don't have enough reviewer resources to help me get a better
> > design, or discuss about it.
> 
> Right, but we can't keep providing a moving target. See the thread pool
> discussion for an example. It's hard to work that way. The discussion
> here is similar, we introduced the union, now we're moving to the
> struct. And you're right that the changes here are small, so let's not
> get caught in that.

What's your suggestion on the thread pool?  Should we merge the change
where vfio creates the threads on its own (assuming vfio maintainers are ok
with it)?

I would say no, that's what I suggested.  I'd start with reusing
ThreadPool, then we found issue when Stefan reported worry on abusing the
API.  All these discussions seem sensible to me so far.  We can't rush on
these until we figure things out step by step.  I don't see a way.

I saw Cedric suggesting to not even create a thread on recv side.  I am not
sure whether that's easy, but I'd agree with Cedric if possible.  I think
Maciej could have a point where it can block mutlifd threads, aka, IO
threads, which might be unwanted.

However said that, I still think device (even if threads needed) should not
randomly create threads during migration.  It'll be a nightmare.

> 
> >
> > When vfio is the new user of multifd vfio needs to do the heavy lifting to
> > draft the api.
> 
> Well, multifd could have provided a flexible API to being with. That's
> entirely on us. I have been toying with allowing more clients since at
> least 1 year ago. We just couldn't get there in time.
> 
> >
> >> 
> >> I tried to quickly insert the RFC series before the device state series
> >> progressed too much, but it's 3 months later and we're still discussing
> >> it, maybe we don't need to do it this way.
> >
> > Can I get that of your feeling from when you were working on
> > mapped-ram?
> 
> At that time I had already committed to helping maintain the code, so
> the time spent there already went into the maintainer bucket anyway. If
> I were instead just trying to drive-by, then that would have been a
> pain.

I don't think you became a maintainer changed how I would review mapped-ram
series.

OTOH, "I became a maintainer" could, because I know I am more responsible
to a chunk of code until I leave (and please let me know any time when you
think you're ready to take migration on your own).  That's a real
difference to me.

> 
> > That series does take long enough, I agree.  Not so bad yet with the VFIO
> > series - it's good to have you around because you provide great reviews.
> > I'm also trying the best to not let a series dangle for more than a year.
> > I don't think 3 months is long with this feature: this is the 1st multifd
> > extrenal user (and file mapping is also in another angle), it can take some
> > time.
> 
> Oh, I don't mean the VFIO series is taking long. That's a complex
> feature indeed. I just mean going from p->pages to p->data could have
> taken less time. I'm suggesting we might have overdone there a bit.
> 
> >
> > Sorry if it's so, but sorry again I don't think I get convinced: I think we
> > need to go this way to build blocks one by one, and we need to make sure
> > lower blocks are hopefully solid enough to take the upper ones.  Again I'm
> > ok with small things that go against it, but not major designs.  We
> > shouldn't go rewrite major designs if we seem to know a better one.
> >
> >> 
> >> And ok, let's consider the current situation a special case. But I would
> >> like to avoid in the future this kind of uncertainty. 
> >> 
> >> >
> >> > Note that patch 2-3 will be on top of Maciej's changes and they're totally
> >> > not relevant to what we merged so far.  Hence, nothing relevant there to
> >> > what you worked.  And this is the diff of patch 1:
> >> >
> >> >  migration/multifd.h              | 16 +++++++++++-----
> >> >  migration/multifd-device-state.c |  8 ++++++--
> >> >  migration/multifd-nocomp.c       | 13 ++++++-------
> >> >  migration/multifd.c              | 25 ++++++-------------------
> >> >  4 files changed, 29 insertions(+), 33 deletions(-)
> >> >
> >> > It's only 33 lines removed (many of which are comments..), it's not a huge
> >> > lot.  I don't know why you feel so bad at this...
> >> >
> >> > It's probably because we maintain migration together, or we can keep our
> >> > own way of work.  I don't think we did anything wrong yet so far.
> >> >
> >> > We can definitely talk about this in next 1:1.
> >> >
> >> >> 
> >> >> This is also why I've been trying hard to separate core multifd
> >> >> functionality from migration code that uses multifd to transmit their
> >> >> data.
> >> >> 
> >> >> My original RFC plus the suggestion to extend multifd_ops for device
> >> >> state would have (almost) made it so that no client code would be left
> >> >> in multifd. We could have been turning this thing upside down and it
> >> >> wouldn't affect anyone in terms of code conflicts.
> >> >
> >> > Do you mean you preferred the 2*N approach?
> >> >
> >> 
> >> 2*N, where N is usually not larger than 32 and the payload size is
> >> 1k. Yes, I'd trade that off no problem.
> >
> > I think it's a problem.
> >
> > Vdpa when involved with exactly the same pattern of how vfio uses it (as
> > they're really alike underneath) then vdpa will need its own array of
> > buffers, or it'll need to take the same vfio lock which doesn't make sense
> > to me.
> >
> > N+2, or, N+M (M is the user) is the minimum buffers we need.  N because
> > multifd can be worst case 100% busy on all threads occupying the buffers.
> > M because M users can be worst case 100% pre-filling.  It's either about
> > memory consumption, or about logical sensibility.
> 
> I'm aware of the memory consumption. Still, we're not forced to use the
> minimum amount of space we can. If using more memory can lead to a
> better design in the medium term, we're allowed to make that choice.
> 
> Hey, I'm not even saying we *should* have gone with 2N. I think it's
> good that we're now N+M. But I think we also lost some design
> flexibility due to that.
> 
> >
> >> 
> >> >> 
> >> >> The ship has already sailed, so your patches below are fine, I have just
> >> >> some small comments.
> >> >
> >> > I'm not sure what you meant about "ship sailed", but we should merge code
> >> > whenever we think is the most correct.
> >> 
> >> As you put above, I agree that the important bits of the original series
> >> have been preserved, but other secondary goals were lost, such as the
> >> more abstract separation between multifd & client code and that is the
> >> ship that has sailed.
> >> 
> >> That series was not: "introduce this array for no reason", we also lost
> >> the ability to abstract the payload from the multifd threads when we
> >> dropped the .alloc_fn callback for instance. The last patch you posted
> >
> > I don't remember the details there, but my memory was that it was too
> > flexible while we seem to reach the consensus that we only process either
> > RAM or device, nothing else.
> 
> Indeed. I'm being unfair here, sorry.
> 
> >
> >> here now adds multifd_device_state_prepare, somewhat ignoring that the
> >> ram code also has the same pattern and it could be made to use the same
> >> API.
> >
> > I need some further elaborations to understand.
> >
> > multifd_device_state_prepare currently does a few things: taking ownership
> > of the temp device state object, fill in idstr / instance_id, taking the
> > lock (so far is needed because we only have one device state object).  None
> > of them seems to be needed for RAM yet.
> >
> > Feel free to send a rfc patch if that helps.
> 
> What if I don't send a patch, wait for it to get merged and then send a
> refactoring on top so we don't add yet another detour to this
> conversation? =)

I thought it shouldn't conflict much if with ram only, and I used to mean
that can be a "comment in the form of patch".  But yeah, sure thing.
Fabiano Rosas Sept. 13, 2024, 6:26 p.m. UTC | #13
Peter Xu <peterx@redhat.com> writes:

> On Fri, Sep 13, 2024 at 12:04:00PM -0300, Fabiano Rosas wrote:
>> Peter Xu <peterx@redhat.com> writes:
>> 
>> > On Fri, Sep 13, 2024 at 10:21:39AM -0300, Fabiano Rosas wrote:
>> >> Peter Xu <peterx@redhat.com> writes:
>> >> 
>> >> > On Thu, Sep 12, 2024 at 03:43:39PM -0300, Fabiano Rosas wrote:
>> >> >> Peter Xu <peterx@redhat.com> writes:
>> >> >> 
>> >> >> Hi Peter, sorry if I'm not very enthusiastic by this, I'm sure you
>> >> >> understand the rework is a little frustrating.
>> >> >
>> >> > That's OK.
>> >> >
>> >> > [For some reason my email sync program decided to give up working for
>> >> >  hours.  I got more time looking at a tsc bug, which is good, but then I
>> >> >  miss a lot of emails..]
>> >> >
>> >> >> 
>> >> >> > On Wed, Aug 28, 2024 at 09:41:17PM -0300, Fabiano Rosas wrote:
>> >> >> >> > +size_t multifd_device_state_payload_size(void)
>> >> >> >> > +{
>> >> >> >> > +    return sizeof(MultiFDDeviceState_t);
>> >> >> >> > +}
>> >> >> >> 
>> >> >> >> This will not be necessary because the payload size is the same as the
>> >> >> >> data type. We only need it for the special case where the MultiFDPages_t
>> >> >> >> is smaller than the total ram payload size.
>> >> >> >
>> >> >> > Today I was thinking maybe we should really clean this up, as the current
>> >> >> > multifd_send_data_alloc() is indeed too tricky (blame me.. who requested
>> >> >> > that more or less).  Knowing that VFIO can use dynamic buffers with ->idstr
>> >> >> > and ->buf (I was thinking it could be buf[1M].. but I was wrong...) made
>> >> >> > that feeling stronger.
>> >> >> 
>> >> >> If we're going to commit bad code and then rewrite it a week later, we
>> >> >> could have just let the original series from Maciej merge without any of
>> >> >> this.
>> >> >
>> >> > Why it's "bad code"?
>> >> >
>> >> > It runs pretty well, I don't think it's bad code.  You wrote it, and I
>> >> > don't think it's bad at all.
>> >> 
>> >> Code that forces us to do arithmetic in order to properly allocate
>> >> memory and comes with a big comment explaining how we're dodging
>> >> compiler warnings is bad code in my book.
>> >> 
>> >> >
>> >> > But now we're rethinking after reading Maciej's new series.
>> >> >Personally I don't think it's a major problem.
>> >> >
>> >> > Note that we're not changing the design back: what was initially proposed
>> >> > was the client submitting an array of multifd objects.  I still don't think
>> >> > that's right.
>> >> >
>> >> > What the change goes so far is we make the union a struct, however that's
>> >> > still N+2 objects not 2*N, where 2 came from RAM+VFIO.  I think the
>> >> > important bits are still there (from your previous refactor series).
>> >> >
>> >> 
>> >> You fail to appreciate that before the RFC series, multifd already
>> >> allocated N for the pages.
>> >
>> > It depends on how you see it, IMHO.  I think it allocates N not for the
>> > "pages" but for the "threads", because the threads can be busy with those
>> > buffers, no matter if it's "page" or "device data".
>> 
>> Each MultiFD*Params had a p->pages, so N channels, N p->pages. The
>> device state series would add p->device_state, one per channel. So 2N +
>> 2 (for the extra slot).
>
> Then it makes sense to have SendData covering pages+device_state.  I think
> it's what we have now, but maybe I missed the point.

I misunderstood you. You're saying that you see the N as per-thread
instead of per-client-per-thread. That's one perspective indeed. It was
not what the device state series had going for it, so I still think that
this could have been a separate discussion independent of the p->pages
-> p->data change. But let's not drag this argument on, it's been
discussed and the code has been merged.

>
>> 
>> >
>> >> The device state adds another client, so that
>> >> would be another N anyway. The problem the RFC tried to solve was that
>> >> multifd channels owned that 2N, so the array was added to move the
>> >> memory into the client's ownership. IOW, it wasn't even the RFC series
>> >> that made it 2N, that was the multifd design all along. Now in hindsight
>> >> I don't think we should have went with the memory saving discussion.
>> >
>> > I think I could have made that feeling that I only wanted to save memory,
>> > if so, I'm sorry.  But do you still remember I mentioned "we can make it a
>> > struct, too" before your series landed?  Then you think it's ok to keep
>> > using union, and I'm ok too! At least at that time.  I don't think that's a
>> > huge deal.  I don't think each route we go must be perfect, but we try the
>> > best to make it as good.
>> 
>> Yep, I did agree with all of this. I'm just saying I now think I
>> shouldn't have.
>> 
>> >
>> > I don't think any discussion must not happen.  I agree memory consumption
>> > is not the 1st thing to worry, but I don't see why it can't be discussed.
>> 
>> It can be discussed, sure, but then 3 months pass and we're still
>> talking about it. Saved ~64k and spent 3 months. We could have just as
>> well said: "let's do a pass to optimize memory consumption after the
>> device state series is in".
>
> We didn't discuss 3 months on discussing memory consumption only!  It's
> unfair to think it like that.

Ok, you're right.

>
>> 
>> >
>> > Note that I never said we can't save those memory either - I plan to have
>> > follow up patches (for this, after Maciej's series land.. that's why I even
>> > didn't yet mention) to allow modules report device state buffer size.  I
>> > just didn't say, yet, and don't plan to worry before vfio series lands.
>> > When with that, we'll save 1M*N when no vfio device attached (but we'll
>> > need to handle hotplug).  So I think we don't need to lose any finally.
>> >
>> > However I think we need to get straight with the base design on how vfio
>> > should use multifd, because it's important bit and I don't want to rework
>> > important bits after a huge feature, if we know a better directions.
>> >
>> > I don't even think what I proposed patch 1-3 here is a must to me, I should
>> > be clear again here just in case we have similar discussions
>> > afterwards.. that I'm ok with below be done after Maciej's:
>> >
>> >   - Avoid memory allocations per-packet (done in patch 2-3)
>> >   - Avoid unnecessary data copy (done in patch 2-3)
>> >   - Avoid allocate device buffers when no device will use (not proposed)
>> >
>> > But I'm not ok building everything on top of the idea of not using multifd
>> > buffers in the current way, because that can involve a lot of changes:
>> > that's where buffer passes from up to down or backwards, and the interface
>> > needs change a lot too.  We already have that in master so it's not a
>> > problem now.
>> >
>> >> 
>> >> >> I already suggested it a couple of times, we shouldn't be doing
>> >> >> core refactorings underneath contributors' patches, this is too
>> >> >> fragile. Just let people contribute their code and we can change it
>> >> >> later.
>> >> >
>> >> > I sincerely don't think a lot needs changing... only patch 1.  Basically
>> >> > patch 1 on top of your previous rework series will be at least what I want,
>> >> > but I'm open to comments from you guys.
>> >> 
>> >> Don't get me wrong, I'm very much in favor of what you're doing
>> >> here. However, I don't think it's ok to be backtracking on our design
>> >> while other people have series in flight that depend on it. You
>> >> certainly know the feeling of trying to merge a feature and having
>> >> maintainers ask you to rewrite a bunch code just to be able to start
>> >> working. That's not ideal.
>> >
>> > I as a patch writer always like to do that when it's essential.  Normally
>> > the case is I don't have enough reviewer resources to help me get a better
>> > design, or discuss about it.
>> 
>> Right, but we can't keep providing a moving target. See the thread pool
>> discussion for an example. It's hard to work that way. The discussion
>> here is similar, we introduced the union, now we're moving to the
>> struct. And you're right that the changes here are small, so let's not
>> get caught in that.
>
> What's your suggestion on the thread pool?  Should we merge the change
> where vfio creates the threads on its own (assuming vfio maintainers are ok
> with it)?

This is not a simple answer and I'm not exactly sure where to draw the
line, but in this case I'm inclined to say: yes.

>
> I would say no, that's what I suggested.  I'd start with reusing
> ThreadPool, then we found issue when Stefan reported worry on abusing the
> API.  All these discussions seem sensible to me so far.  We can't rush on
> these until we figure things out step by step.  I don't see a way.

The problem is that using a thread pool is something that we've agreed
on for a while and is even in the migration TODO list. It's not
something that came up as a result of the device state series. I know
this is not anyone's intent, but it starts to feel like gatekeeping.

The fact that migration lacks a thread pool, that multifd threads have
historically caused issues and that what's in util/thread-pool.c is only
useful to the block layer are all preexisting problems of this code
base. We could be (and are) working to improve those regardless of what
new features are being contributed. I'm not sure it's productive to pick
problems we have had for a while and present those as prerequisites for
merging new code.

But as I said, there's a line to be drawn somewhere and I don't know
exactly where it lies. I understand the points that were brought up in
favor of first figuring out the thread pool situation, those are not at
all unreasonable.

>
> I saw Cedric suggesting to not even create a thread on recv side.  I am not
> sure whether that's easy, but I'd agree with Cedric if possible.  I think
> Maciej could have a point where it can block mutlifd threads, aka, IO
> threads, which might be unwanted.
>
> However said that, I still think device (even if threads needed) should not
> randomly create threads during migration.  It'll be a nightmare.

The thread-pool approach is being looked at to solve this particular
problem, but we have also discussed in the past that multifd threads
themselves should be managed by a thread pool. Will we add this
requirement to what is being done now? Otherwise, don't we risk having
an implementation that doesn't serve the rest of multifd? Do we even
know what the requirements are? Keep in mind that we're already not
modifying the existing ThreadPool, but planning to write a new one.

>
>> 
>> >
>> > When vfio is the new user of multifd vfio needs to do the heavy lifting to
>> > draft the api.
>> 
>> Well, multifd could have provided a flexible API to being with. That's
>> entirely on us. I have been toying with allowing more clients since at
>> least 1 year ago. We just couldn't get there in time.
>> 
>> >
>> >> 
>> >> I tried to quickly insert the RFC series before the device state series
>> >> progressed too much, but it's 3 months later and we're still discussing
>> >> it, maybe we don't need to do it this way.
>> >
>> > Can I get that of your feeling from when you were working on
>> > mapped-ram?
>> 
>> At that time I had already committed to helping maintain the code, so
>> the time spent there already went into the maintainer bucket anyway. If
>> I were instead just trying to drive-by, then that would have been a
>> pain.
>
> I don't think you became a maintainer changed how I would review mapped-ram
> series.
>
> OTOH, "I became a maintainer" could, because I know I am more responsible
> to a chunk of code until I leave (and please let me know any time when you
> think you're ready to take migration on your own).  That's a real
> difference to me.

Right, that's my point. I don't mind that what I'm doing takes time. I'm
not going anywhere. I do mind that because we're not going anywhere, we
start to drag people into a constant state of improving the next little
thing. Again, our definition of what constitutes "a little thing" is of
course different.

>
>> 
>> > That series does take long enough, I agree.  Not so bad yet with the VFIO
>> > series - it's good to have you around because you provide great reviews.
>> > I'm also trying the best to not let a series dangle for more than a year.
>> > I don't think 3 months is long with this feature: this is the 1st multifd
>> > extrenal user (and file mapping is also in another angle), it can take some
>> > time.
>> 
>> Oh, I don't mean the VFIO series is taking long. That's a complex
>> feature indeed. I just mean going from p->pages to p->data could have
>> taken less time. I'm suggesting we might have overdone there a bit.
>> 
>> >
>> > Sorry if it's so, but sorry again I don't think I get convinced: I think we
>> > need to go this way to build blocks one by one, and we need to make sure
>> > lower blocks are hopefully solid enough to take the upper ones.  Again I'm
>> > ok with small things that go against it, but not major designs.  We
>> > shouldn't go rewrite major designs if we seem to know a better one.
>> >
>> >> 
>> >> And ok, let's consider the current situation a special case. But I would
>> >> like to avoid in the future this kind of uncertainty. 
>> >> 
>> >> >
>> >> > Note that patch 2-3 will be on top of Maciej's changes and they're totally
>> >> > not relevant to what we merged so far.  Hence, nothing relevant there to
>> >> > what you worked.  And this is the diff of patch 1:
>> >> >
>> >> >  migration/multifd.h              | 16 +++++++++++-----
>> >> >  migration/multifd-device-state.c |  8 ++++++--
>> >> >  migration/multifd-nocomp.c       | 13 ++++++-------
>> >> >  migration/multifd.c              | 25 ++++++-------------------
>> >> >  4 files changed, 29 insertions(+), 33 deletions(-)
>> >> >
>> >> > It's only 33 lines removed (many of which are comments..), it's not a huge
>> >> > lot.  I don't know why you feel so bad at this...
>> >> >
>> >> > It's probably because we maintain migration together, or we can keep our
>> >> > own way of work.  I don't think we did anything wrong yet so far.
>> >> >
>> >> > We can definitely talk about this in next 1:1.
>> >> >
>> >> >> 
>> >> >> This is also why I've been trying hard to separate core multifd
>> >> >> functionality from migration code that uses multifd to transmit their
>> >> >> data.
>> >> >> 
>> >> >> My original RFC plus the suggestion to extend multifd_ops for device
>> >> >> state would have (almost) made it so that no client code would be left
>> >> >> in multifd. We could have been turning this thing upside down and it
>> >> >> wouldn't affect anyone in terms of code conflicts.
>> >> >
>> >> > Do you mean you preferred the 2*N approach?
>> >> >
>> >> 
>> >> 2*N, where N is usually not larger than 32 and the payload size is
>> >> 1k. Yes, I'd trade that off no problem.
>> >
>> > I think it's a problem.
>> >
>> > Vdpa when involved with exactly the same pattern of how vfio uses it (as
>> > they're really alike underneath) then vdpa will need its own array of
>> > buffers, or it'll need to take the same vfio lock which doesn't make sense
>> > to me.
>> >
>> > N+2, or, N+M (M is the user) is the minimum buffers we need.  N because
>> > multifd can be worst case 100% busy on all threads occupying the buffers.
>> > M because M users can be worst case 100% pre-filling.  It's either about
>> > memory consumption, or about logical sensibility.
>> 
>> I'm aware of the memory consumption. Still, we're not forced to use the
>> minimum amount of space we can. If using more memory can lead to a
>> better design in the medium term, we're allowed to make that choice.
>> 
>> Hey, I'm not even saying we *should* have gone with 2N. I think it's
>> good that we're now N+M. But I think we also lost some design
>> flexibility due to that.
>> 
>> >
>> >> 
>> >> >> 
>> >> >> The ship has already sailed, so your patches below are fine, I have just
>> >> >> some small comments.
>> >> >
>> >> > I'm not sure what you meant about "ship sailed", but we should merge code
>> >> > whenever we think is the most correct.
>> >> 
>> >> As you put above, I agree that the important bits of the original series
>> >> have been preserved, but other secondary goals were lost, such as the
>> >> more abstract separation between multifd & client code and that is the
>> >> ship that has sailed.
>> >> 
>> >> That series was not: "introduce this array for no reason", we also lost
>> >> the ability to abstract the payload from the multifd threads when we
>> >> dropped the .alloc_fn callback for instance. The last patch you posted
>> >
>> > I don't remember the details there, but my memory was that it was too
>> > flexible while we seem to reach the consensus that we only process either
>> > RAM or device, nothing else.
>> 
>> Indeed. I'm being unfair here, sorry.
>> 
>> >
>> >> here now adds multifd_device_state_prepare, somewhat ignoring that the
>> >> ram code also has the same pattern and it could be made to use the same
>> >> API.
>> >
>> > I need some further elaborations to understand.
>> >
>> > multifd_device_state_prepare currently does a few things: taking ownership
>> > of the temp device state object, fill in idstr / instance_id, taking the
>> > lock (so far is needed because we only have one device state object).  None
>> > of them seems to be needed for RAM yet.
>> >
>> > Feel free to send a rfc patch if that helps.
>> 
>> What if I don't send a patch, wait for it to get merged and then send a
>> refactoring on top so we don't add yet another detour to this
>> conversation? =)
>
> I thought it shouldn't conflict much if with ram only, and I used to mean
> that can be a "comment in the form of patch".  But yeah, sure thing.
diff mbox series

Patch

diff --git a/include/migration/misc.h b/include/migration/misc.h
index bfadc5613bac..7266b1b77d1f 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -111,4 +111,8 @@  bool migration_in_bg_snapshot(void);
 /* migration/block-dirty-bitmap.c */
 void dirty_bitmap_mig_init(void);
 
+/* migration/multifd-device-state.c */
+bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
+                                char *data, size_t len);
+
 #endif
diff --git a/migration/meson.build b/migration/meson.build
index 77f3abf08eb1..00853595894f 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -21,6 +21,7 @@  system_ss.add(files(
   'migration-hmp-cmds.c',
   'migration.c',
   'multifd.c',
+  'multifd-device-state.c',
   'multifd-nocomp.c',
   'multifd-zlib.c',
   'multifd-zero-page.c',
diff --git a/migration/multifd-device-state.c b/migration/multifd-device-state.c
new file mode 100644
index 000000000000..c9b44f0b5ab9
--- /dev/null
+++ b/migration/multifd-device-state.c
@@ -0,0 +1,99 @@ 
+/*
+ * Multifd device state migration
+ *
+ * Copyright (C) 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/lockable.h"
+#include "migration/misc.h"
+#include "multifd.h"
+
+static QemuMutex queue_job_mutex;
+
+static MultiFDSendData *device_state_send;
+
+size_t multifd_device_state_payload_size(void)
+{
+    return sizeof(MultiFDDeviceState_t);
+}
+
+void multifd_device_state_save_setup(void)
+{
+    qemu_mutex_init(&queue_job_mutex);
+
+    device_state_send = multifd_send_data_alloc();
+}
+
+void multifd_device_state_clear(MultiFDDeviceState_t *device_state)
+{
+    g_clear_pointer(&device_state->idstr, g_free);
+    g_clear_pointer(&device_state->buf, g_free);
+}
+
+void multifd_device_state_save_cleanup(void)
+{
+    g_clear_pointer(&device_state_send, multifd_send_data_free);
+
+    qemu_mutex_destroy(&queue_job_mutex);
+}
+
+static void multifd_device_state_fill_packet(MultiFDSendParams *p)
+{
+    MultiFDDeviceState_t *device_state = &p->data->u.device_state;
+    MultiFDPacketDeviceState_t *packet = p->packet_device_state;
+
+    packet->hdr.flags = cpu_to_be32(p->flags);
+    strncpy(packet->idstr, device_state->idstr, sizeof(packet->idstr));
+    packet->instance_id = cpu_to_be32(device_state->instance_id);
+    packet->next_packet_size = cpu_to_be32(p->next_packet_size);
+}
+
+void multifd_device_state_send_prepare(MultiFDSendParams *p)
+{
+    MultiFDDeviceState_t *device_state = &p->data->u.device_state;
+
+    assert(multifd_payload_device_state(p->data));
+
+    multifd_send_prepare_header_device_state(p);
+
+    assert(!(p->flags & MULTIFD_FLAG_SYNC));
+
+    p->next_packet_size = device_state->buf_len;
+    if (p->next_packet_size > 0) {
+        p->iov[p->iovs_num].iov_base = device_state->buf;
+        p->iov[p->iovs_num].iov_len = p->next_packet_size;
+        p->iovs_num++;
+    }
+
+    p->flags |= MULTIFD_FLAG_NOCOMP | MULTIFD_FLAG_DEVICE_STATE;
+
+    multifd_device_state_fill_packet(p);
+}
+
+bool multifd_queue_device_state(char *idstr, uint32_t instance_id,
+                                char *data, size_t len)
+{
+    /* Device state submissions can come from multiple threads */
+    QEMU_LOCK_GUARD(&queue_job_mutex);
+    MultiFDDeviceState_t *device_state;
+
+    assert(multifd_payload_empty(device_state_send));
+
+    multifd_set_payload_type(device_state_send, MULTIFD_PAYLOAD_DEVICE_STATE);
+    device_state = &device_state_send->u.device_state;
+    device_state->idstr = g_strdup(idstr);
+    device_state->instance_id = instance_id;
+    device_state->buf = g_memdup2(data, len);
+    device_state->buf_len = len;
+
+    if (!multifd_send(&device_state_send)) {
+        multifd_send_data_clear(device_state_send);
+        return false;
+    }
+
+    return true;
+}
diff --git a/migration/multifd-nocomp.c b/migration/multifd-nocomp.c
index 39eb77c9b3b7..0b7b543f44db 100644
--- a/migration/multifd-nocomp.c
+++ b/migration/multifd-nocomp.c
@@ -116,13 +116,13 @@  static int multifd_nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
          * Only !zerocopy needs the header in IOV; zerocopy will
          * send it separately.
          */
-        multifd_send_prepare_header(p);
+        multifd_send_prepare_header_ram(p);
     }
 
     multifd_send_prepare_iovs(p);
     p->flags |= MULTIFD_FLAG_NOCOMP;
 
-    multifd_send_fill_packet(p);
+    multifd_send_fill_packet_ram(p);
 
     if (use_zero_copy_send) {
         /* Send header first, without zerocopy */
@@ -371,7 +371,7 @@  bool multifd_send_prepare_common(MultiFDSendParams *p)
         return false;
     }
 
-    multifd_send_prepare_header(p);
+    multifd_send_prepare_header_ram(p);
 
     return true;
 }
diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
index 75041a4c4dfe..bd6b5b6a3868 100644
--- a/migration/multifd-qpl.c
+++ b/migration/multifd-qpl.c
@@ -490,7 +490,7 @@  static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp)
 
 out:
     p->flags |= MULTIFD_FLAG_QPL;
-    multifd_send_fill_packet(p);
+    multifd_send_fill_packet_ram(p);
     return 0;
 }
 
diff --git a/migration/multifd-uadk.c b/migration/multifd-uadk.c
index db2549f59bfe..6e2d26010742 100644
--- a/migration/multifd-uadk.c
+++ b/migration/multifd-uadk.c
@@ -198,7 +198,7 @@  static int multifd_uadk_send_prepare(MultiFDSendParams *p, Error **errp)
     }
 out:
     p->flags |= MULTIFD_FLAG_UADK;
-    multifd_send_fill_packet(p);
+    multifd_send_fill_packet_ram(p);
     return 0;
 }
 
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 6787538762d2..62a1fe59ad3e 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -156,7 +156,7 @@  static int multifd_zlib_send_prepare(MultiFDSendParams *p, Error **errp)
 
 out:
     p->flags |= MULTIFD_FLAG_ZLIB;
-    multifd_send_fill_packet(p);
+    multifd_send_fill_packet_ram(p);
     return 0;
 }
 
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index 1576b1e2adc6..f98b07e7f9f5 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -143,7 +143,7 @@  static int multifd_zstd_send_prepare(MultiFDSendParams *p, Error **errp)
 
 out:
     p->flags |= MULTIFD_FLAG_ZSTD;
-    multifd_send_fill_packet(p);
+    multifd_send_fill_packet_ram(p);
     return 0;
 }
 
diff --git a/migration/multifd.c b/migration/multifd.c
index a74e8a5cc891..bebe5b5a9b9c 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -12,6 +12,7 @@ 
 
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
+#include "qemu/iov.h"
 #include "qemu/rcu.h"
 #include "exec/target_page.h"
 #include "sysemu/sysemu.h"
@@ -19,6 +20,7 @@ 
 #include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "file.h"
+#include "migration/misc.h"
 #include "migration.h"
 #include "migration-stats.h"
 #include "savevm.h"
@@ -107,7 +109,9 @@  MultiFDSendData *multifd_send_data_alloc(void)
      * added to the union in the future are larger than
      * (MultiFDPages_t + flex array).
      */
-    max_payload_size = MAX(multifd_ram_payload_size(), sizeof(MultiFDPayload));
+    max_payload_size = MAX(multifd_ram_payload_size(),
+                           multifd_device_state_payload_size());
+    max_payload_size = MAX(max_payload_size, sizeof(MultiFDPayload));
 
     /*
      * Account for any holes the compiler might insert. We can't pack
@@ -126,6 +130,9 @@  void multifd_send_data_clear(MultiFDSendData *data)
     }
 
     switch (data->type) {
+    case MULTIFD_PAYLOAD_DEVICE_STATE:
+        multifd_device_state_clear(&data->u.device_state);
+        break;
     default:
         /* Nothing to do */
         break;
@@ -228,7 +235,7 @@  static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
     return msg.id;
 }
 
-void multifd_send_fill_packet(MultiFDSendParams *p)
+void multifd_send_fill_packet_ram(MultiFDSendParams *p)
 {
     MultiFDPacket_t *packet = p->packet;
     uint64_t packet_num;
@@ -397,20 +404,16 @@  bool multifd_send(MultiFDSendData **send_data)
 
         p = &multifd_send_state->params[i];
         /*
-         * Lockless read to p->pending_job is safe, because only multifd
-         * sender thread can clear it.
+         * Lockless RMW on p->pending_job_preparing is safe, because only multifd
+         * sender thread can clear it after it had seen p->pending_job being set.
+         *
+         * Pairs with qatomic_store_release() in multifd_send_thread().
          */
-        if (qatomic_read(&p->pending_job) == false) {
+        if (qatomic_cmpxchg(&p->pending_job_preparing, false, true) == false) {
             break;
         }
     }
 
-    /*
-     * Make sure we read p->pending_job before all the rest.  Pairs with
-     * qatomic_store_release() in multifd_send_thread().
-     */
-    smp_mb_acquire();
-
     assert(multifd_payload_empty(p->data));
 
     /*
@@ -534,6 +537,7 @@  static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp)
     p->name = NULL;
     g_clear_pointer(&p->data, multifd_send_data_free);
     p->packet_len = 0;
+    g_clear_pointer(&p->packet_device_state, g_free);
     g_free(p->packet);
     p->packet = NULL;
     multifd_send_state->ops->send_cleanup(p, errp);
@@ -545,6 +549,7 @@  static void multifd_send_cleanup_state(void)
 {
     file_cleanup_outgoing_migration();
     socket_cleanup_outgoing_migration();
+    multifd_device_state_save_cleanup();
     qemu_sem_destroy(&multifd_send_state->channels_created);
     qemu_sem_destroy(&multifd_send_state->channels_ready);
     g_free(multifd_send_state->params);
@@ -670,19 +675,29 @@  static void *multifd_send_thread(void *opaque)
          * qatomic_store_release() in multifd_send().
          */
         if (qatomic_load_acquire(&p->pending_job)) {
+            bool is_device_state = multifd_payload_device_state(p->data);
+            size_t total_size;
+
             p->flags = 0;
             p->iovs_num = 0;
             assert(!multifd_payload_empty(p->data));
 
-            ret = multifd_send_state->ops->send_prepare(p, &local_err);
-            if (ret != 0) {
-                break;
+            if (is_device_state) {
+                multifd_device_state_send_prepare(p);
+            } else {
+                ret = multifd_send_state->ops->send_prepare(p, &local_err);
+                if (ret != 0) {
+                    break;
+                }
             }
 
             if (migrate_mapped_ram()) {
+                assert(!is_device_state);
+
                 ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num,
                                               &p->data->u.ram, &local_err);
             } else {
+                total_size = iov_size(p->iov, p->iovs_num);
                 ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num,
                                                   NULL, 0, p->write_flags,
                                                   &local_err);
@@ -692,18 +707,27 @@  static void *multifd_send_thread(void *opaque)
                 break;
             }
 
-            stat64_add(&mig_stats.multifd_bytes,
-                       p->next_packet_size + p->packet_len);
+            if (is_device_state) {
+                stat64_add(&mig_stats.multifd_bytes, total_size);
+            } else {
+                /*
+                 * Can't just always add total_size since IOVs do not include
+                 * packet header in the zerocopy RAM case.
+                 */
+                stat64_add(&mig_stats.multifd_bytes,
+                           p->next_packet_size + p->packet_len);
+            }
 
             p->next_packet_size = 0;
             multifd_send_data_clear(p->data);
 
             /*
              * Making sure p->data is published before saying "we're
-             * free".  Pairs with the smp_mb_acquire() in
+             * free".  Pairs with the qatomic_cmpxchg() in
              * multifd_send().
              */
             qatomic_store_release(&p->pending_job, false);
+            qatomic_store_release(&p->pending_job_preparing, false);
         } else {
             /*
              * If not a normal job, must be a sync request.  Note that
@@ -714,7 +738,7 @@  static void *multifd_send_thread(void *opaque)
 
             if (use_packets) {
                 p->flags = MULTIFD_FLAG_SYNC;
-                multifd_send_fill_packet(p);
+                multifd_send_fill_packet_ram(p);
                 ret = qio_channel_write_all(p->c, (void *)p->packet,
                                             p->packet_len, &local_err);
                 if (ret != 0) {
@@ -910,6 +934,9 @@  bool multifd_send_setup(void)
             p->packet_len = sizeof(MultiFDPacket_t)
                           + sizeof(uint64_t) * page_count;
             p->packet = g_malloc0(p->packet_len);
+            p->packet_device_state = g_malloc0(sizeof(*p->packet_device_state));
+            p->packet_device_state->hdr.magic = cpu_to_be32(MULTIFD_MAGIC);
+            p->packet_device_state->hdr.version = cpu_to_be32(MULTIFD_VERSION);
         }
         p->name = g_strdup_printf("mig/src/send_%d", i);
         p->write_flags = 0;
@@ -944,6 +971,8 @@  bool multifd_send_setup(void)
         }
     }
 
+    multifd_device_state_save_setup();
+
     return true;
 
 err:
diff --git a/migration/multifd.h b/migration/multifd.h
index a0853622153e..c15c83104c8b 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -120,10 +120,12 @@  typedef struct {
 typedef enum {
     MULTIFD_PAYLOAD_NONE,
     MULTIFD_PAYLOAD_RAM,
+    MULTIFD_PAYLOAD_DEVICE_STATE,
 } MultiFDPayloadType;
 
 typedef union MultiFDPayload {
     MultiFDPages_t ram;
+    MultiFDDeviceState_t device_state;
 } MultiFDPayload;
 
 struct MultiFDSendData {
@@ -136,6 +138,11 @@  static inline bool multifd_payload_empty(MultiFDSendData *data)
     return data->type == MULTIFD_PAYLOAD_NONE;
 }
 
+static inline bool multifd_payload_device_state(MultiFDSendData *data)
+{
+    return data->type == MULTIFD_PAYLOAD_DEVICE_STATE;
+}
+
 static inline void multifd_set_payload_type(MultiFDSendData *data,
                                             MultiFDPayloadType type)
 {
@@ -182,13 +189,15 @@  typedef struct {
      * cleared by the multifd sender threads.
      */
     bool pending_job;
+    bool pending_job_preparing;
     bool pending_sync;
     MultiFDSendData *data;
 
     /* thread local variables. No locking required */
 
-    /* pointer to the packet */
+    /* pointers to the possible packet types */
     MultiFDPacket_t *packet;
+    MultiFDPacketDeviceState_t *packet_device_state;
     /* size of the next packet that contains pages */
     uint32_t next_packet_size;
     /* packets sent through this channel */
@@ -276,18 +285,25 @@  typedef struct {
 } MultiFDMethods;
 
 void multifd_register_ops(int method, MultiFDMethods *ops);
-void multifd_send_fill_packet(MultiFDSendParams *p);
+void multifd_send_fill_packet_ram(MultiFDSendParams *p);
 bool multifd_send_prepare_common(MultiFDSendParams *p);
 void multifd_send_zero_page_detect(MultiFDSendParams *p);
 void multifd_recv_zero_page_process(MultiFDRecvParams *p);
 
-static inline void multifd_send_prepare_header(MultiFDSendParams *p)
+static inline void multifd_send_prepare_header_ram(MultiFDSendParams *p)
 {
     p->iov[0].iov_len = p->packet_len;
     p->iov[0].iov_base = p->packet;
     p->iovs_num++;
 }
 
+static inline void multifd_send_prepare_header_device_state(MultiFDSendParams *p)
+{
+    p->iov[0].iov_len = sizeof(*p->packet_device_state);
+    p->iov[0].iov_base = p->packet_device_state;
+    p->iovs_num++;
+}
+
 void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc);
 bool multifd_send(MultiFDSendData **send_data);
 MultiFDSendData *multifd_send_data_alloc(void);
@@ -310,4 +326,11 @@  int multifd_ram_flush_and_sync(void);
 size_t multifd_ram_payload_size(void);
 void multifd_ram_fill_packet(MultiFDSendParams *p);
 int multifd_ram_unfill_packet(MultiFDRecvParams *p, Error **errp);
+
+size_t multifd_device_state_payload_size(void);
+void multifd_device_state_save_setup(void);
+void multifd_device_state_clear(MultiFDDeviceState_t *device_state);
+void multifd_device_state_save_cleanup(void);
+void multifd_device_state_send_prepare(MultiFDSendParams *p);
+
 #endif