diff mbox series

[v2,17/17] vfio/migration: Multifd device state transfer support - send side

Message ID 1429fc59079d99ca035b31303892d807868dc6c0.1724701542.git.maciej.szmigiero@oracle.com (mailing list archive)
State New
Headers show
Series Multifd | expand

Commit Message

Maciej S. Szmigiero Aug. 27, 2024, 5:54 p.m. UTC
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>

Implement the multifd device state transfer via additional per-device
thread inside save_live_complete_precopy_thread handler.

Switch between doing the data transfer in the new handler and doing it
in the old save_state handler depending on the
x-migration-multifd-transfer device property value.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
---
 hw/vfio/migration.c           | 169 ++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |   2 +
 include/hw/vfio/vfio-common.h |   1 +
 3 files changed, 172 insertions(+)

Comments

Avihai Horon Sept. 9, 2024, 11:41 a.m. UTC | #1
On 27/08/2024 20:54, Maciej S. Szmigiero wrote:
> External email: Use caution opening links or attachments
>
>
> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>
> Implement the multifd device state transfer via additional per-device
> thread inside save_live_complete_precopy_thread handler.
>
> Switch between doing the data transfer in the new handler and doing it
> in the old save_state handler depending on the
> x-migration-multifd-transfer device property value.
>
> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
> ---
>   hw/vfio/migration.c           | 169 ++++++++++++++++++++++++++++++++++
>   hw/vfio/trace-events          |   2 +
>   include/hw/vfio/vfio-common.h |   1 +
>   3 files changed, 172 insertions(+)
>
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 57c1542528dc..67996aa2df8b 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
>       uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
>       int ret;
>
> +    /* Make a copy of this setting at the start in case it is changed mid-migration */
> +    migration->multifd_transfer = vbasedev->migration_multifd_transfer;

Should VFIO multifd be controlled by main migration multifd capability, 
and let the per VFIO device migration_multifd_transfer property be 
immutable and enabled by default?
Then we would have a single point of configuration (and an extra one per 
VFIO device just to disable for backward compatibility).
Unless there are other benefits to have this property configurable?

> +
> +    if (migration->multifd_transfer && !migration_has_device_state_support()) {
> +        error_setg(errp,
> +                   "%s: Multifd device transfer requested but unsupported in the current config",
> +                   vbasedev->name);
> +        return -EINVAL;
> +    }
> +
>       qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
>
>       vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
> @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>   static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>   {
>       VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
>       ssize_t data_size;
>       int ret;
>       Error *local_err = NULL;
>
> +    if (migration->multifd_transfer) {
> +        /*
> +         * Emit dummy NOP data, vfio_save_complete_precopy_thread()
> +         * does the actual transfer.
> +         */
> +        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

There are three places where we send this dummy end of state, maybe 
worth extracting it to a helper? I.e., vfio_send_end_of_state() and then 
document there the rationale.

> +        return 0;
> +    }
> +
>       trace_vfio_save_complete_precopy_started(vbasedev->name);
>
>       /* We reach here with device state STOP or STOP_COPY only */
> @@ -864,12 +884,159 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>       return ret;
>   }
>
> +static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev,
> +                                                                char *idstr,
> +                                                                uint32_t instance_id,
> +                                                                uint32_t idx)
> +{
> +    g_autoptr(QIOChannelBuffer) bioc = NULL;
> +    QEMUFile *f = NULL;
> +    int ret;
> +    g_autofree VFIODeviceStatePacket *packet = NULL;
> +    size_t packet_len;
> +
> +    bioc = qio_channel_buffer_new(0);
> +    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");
> +
> +    f = qemu_file_new_output(QIO_CHANNEL(bioc));
> +
> +    ret = vfio_save_device_config_state(f, vbasedev, NULL);
> +    if (ret) {
> +        return ret;

Need to close f in this case.

> +    }
> +
> +    ret = qemu_fflush(f);
> +    if (ret) {
> +        goto ret_close_file;
> +    }
> +
> +    packet_len = sizeof(*packet) + bioc->usage;
> +    packet = g_malloc0(packet_len);
> +    packet->idx = idx;
> +    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
> +    memcpy(&packet->data, bioc->data, bioc->usage);
> +
> +    if (!multifd_queue_device_state(idstr, instance_id,
> +                                    (char *)packet, packet_len)) {
> +        ret = -1;

goto ret_close_file?

> +    }
> +
> +    bytes_transferred += packet_len;

bytes_transferred is a global variable. Now that we access it from 
multiple threads it should be protected.
Note that now the VFIO device data is reported also in multifd stats (if 
I am not mistaken), is this the behavior we want? Maybe we should 
enhance multifd stats to distinguish between RAM data and device data?

> +
> +ret_close_file:

Rename to "out" as we only have one exit point?

> +    g_clear_pointer(&f, qemu_fclose);

f is a local variable, wouldn't qemu_fclose(f) be enough here?

> +    return ret;
> +}
> +
> +static int vfio_save_complete_precopy_thread(char *idstr,
> +                                             uint32_t instance_id,
> +                                             bool *abort_flag,
> +                                             void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
> +    int ret;
> +    g_autofree VFIODeviceStatePacket *packet = NULL;
> +    uint32_t idx;
> +
> +    if (!migration->multifd_transfer) {
> +        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
> +        return 0;
> +    }
> +
> +    trace_vfio_save_complete_precopy_thread_started(vbasedev->name,
> +                                                    idstr, instance_id);
> +
> +    /* We reach here with device state STOP or STOP_COPY only */
> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
> +                                   VFIO_DEVICE_STATE_STOP, NULL);
> +    if (ret) {
> +        goto ret_finish;
> +    }
> +
> +    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
> +
> +    for (idx = 0; ; idx++) {
> +        ssize_t data_size;
> +        size_t packet_size;
> +
> +        if (qatomic_read(abort_flag)) {
> +            ret = -ECANCELED;
> +            goto ret_finish;
> +        }
> +
> +        data_size = read(migration->data_fd, &packet->data,
> +                         migration->data_buffer_size);
> +        if (data_size < 0) {
> +            if (errno != ENOMSG) {
> +                ret = -errno;
> +                goto ret_finish;
> +            }
> +
> +            /*
> +             * Pre-copy emptied all the device state for now. For more information,
> +             * please refer to the Linux kernel VFIO uAPI.
> +             */
> +            data_size = 0;

According to VFIO uAPI, ENOMSG can only be returned in the PRE_COPY 
device states.
Here we are in STOP_COPY, so we can drop the ENOMSG handling.

Thanks.

> +        }
> +
> +        if (data_size == 0)
> +            break;
> +
> +        packet->idx = idx;
> +        packet_size = sizeof(*packet) + data_size;
> +
> +        if (!multifd_queue_device_state(idstr, instance_id,
> +                                        (char *)packet, packet_size)) {
> +            ret = -1;
> +            goto ret_finish;
> +        }
> +
> +        bytes_transferred += packet_size;
> +    }
> +
> +    ret = vfio_save_complete_precopy_async_thread_config_state(vbasedev, idstr,
> +                                                               instance_id,
> +                                                               idx);
> +
> +ret_finish:
> +    trace_vfio_save_complete_precopy_thread_finished(vbasedev->name, ret);
> +
> +    return ret;
> +}
> +
> +static int vfio_save_complete_precopy_begin(QEMUFile *f,
> +                                            char *idstr, uint32_t instance_id,
> +                                            void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
> +
> +    if (!migration->multifd_transfer) {
> +        /* Emit dummy NOP data */
> +        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> +        return 0;
> +    }
> +
> +    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE);
> +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> +
> +    return qemu_fflush(f);
> +}
> +
>   static void vfio_save_state(QEMUFile *f, void *opaque)
>   {
>       VFIODevice *vbasedev = opaque;
> +    VFIOMigration *migration = vbasedev->migration;
>       Error *local_err = NULL;
>       int ret;
>
> +    if (migration->multifd_transfer) {
> +        /* Emit dummy NOP data */
> +        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> +        return;
> +    }
> +
>       ret = vfio_save_device_config_state(f, opaque, &local_err);
>       if (ret) {
>           error_prepend(&local_err,
> @@ -1119,7 +1286,9 @@ static const SaveVMHandlers savevm_vfio_handlers = {
>       .state_pending_exact = vfio_state_pending_exact,
>       .is_active_iterate = vfio_is_active_iterate,
>       .save_live_iterate = vfio_save_iterate,
> +    .save_live_complete_precopy_begin = vfio_save_complete_precopy_begin,
>       .save_live_complete_precopy = vfio_save_complete_precopy,
> +    .save_live_complete_precopy_thread = vfio_save_complete_precopy_thread,
>       .save_state = vfio_save_state,
>       .load_setup = vfio_load_setup,
>       .load_cleanup = vfio_load_cleanup,
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 9d2519a28a7e..b1d9c9d5f2e1 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -167,6 +167,8 @@ vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
>   vfio_save_cleanup(const char *name) " (%s)"
>   vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
>   vfio_save_complete_precopy_started(const char *name) " (%s)"
> +vfio_save_complete_precopy_thread_started(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32
> +vfio_save_complete_precopy_thread_finished(const char *name, int ret) " (%s) ret %d"
>   vfio_save_device_config_state(const char *name) " (%s)"
>   vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
>   vfio_save_iterate_started(const char *name) " (%s)"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index fe05acb9a5d1..4578a0ca6a5c 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -72,6 +72,7 @@ typedef struct VFIOMigration {
>       uint64_t mig_flags;
>       uint64_t precopy_init_size;
>       uint64_t precopy_dirty_size;
> +    bool multifd_transfer;
>       bool initial_data_sent;
>
>       bool save_iterate_run;
Maciej S. Szmigiero Sept. 9, 2024, 6:07 p.m. UTC | #2
On 9.09.2024 13:41, Avihai Horon wrote:
> 
> On 27/08/2024 20:54, Maciej S. Szmigiero wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>
>> Implement the multifd device state transfer via additional per-device
>> thread inside save_live_complete_precopy_thread handler.
>>
>> Switch between doing the data transfer in the new handler and doing it
>> in the old save_state handler depending on the
>> x-migration-multifd-transfer device property value.
>>
>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>> ---
>>   hw/vfio/migration.c           | 169 ++++++++++++++++++++++++++++++++++
>>   hw/vfio/trace-events          |   2 +
>>   include/hw/vfio/vfio-common.h |   1 +
>>   3 files changed, 172 insertions(+)
>>
>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>> index 57c1542528dc..67996aa2df8b 100644
>> --- a/hw/vfio/migration.c
>> +++ b/hw/vfio/migration.c
>> @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
>>       uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
>>       int ret;
>>
>> +    /* Make a copy of this setting at the start in case it is changed mid-migration */
>> +    migration->multifd_transfer = vbasedev->migration_multifd_transfer;
> 
> Should VFIO multifd be controlled by main migration multifd capability, and let the per VFIO device migration_multifd_transfer property be immutable and enabled by default?
> Then we would have a single point of configuration (and an extra one per VFIO device just to disable for backward compatibility).
> Unless there are other benefits to have this property configurable?

We want multifd device state transfer property to be configurable per-device
in case in the future we add another device type (besides VFIO) that supports
multifd device state transfer.

In this case, we might need to enable the multifd device state transfer just
for VFIO devices, but not for this new device type when we are migrating to a
QEMU target that supports just the VFIO multifd device state transfer.

TBH, I'm not opposed to adding a additional global multifd device state transfer
switch (if we keep the per-device ones too) but I am not sure what value it adds.

>> +
>> +    if (migration->multifd_transfer && !migration_has_device_state_support()) {
>> +        error_setg(errp,
>> +                   "%s: Multifd device transfer requested but unsupported in the current config",
>> +                   vbasedev->name);
>> +        return -EINVAL;
>> +    }
>> +
>>       qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
>>
>>       vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
>> @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>>   static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>   {
>>       VFIODevice *vbasedev = opaque;
>> +    VFIOMigration *migration = vbasedev->migration;
>>       ssize_t data_size;
>>       int ret;
>>       Error *local_err = NULL;
>>
>> +    if (migration->multifd_transfer) {
>> +        /*
>> +         * Emit dummy NOP data, vfio_save_complete_precopy_thread()
>> +         * does the actual transfer.
>> +         */
>> +        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> 
> There are three places where we send this dummy end of state, maybe worth extracting it to a helper? I.e., vfio_send_end_of_state() and then document there the rationale.

I'm not totally against it but it's wrapping just a single line of code in
a separate function?

>> +        return 0;
>> +    }
>> +
>>       trace_vfio_save_complete_precopy_started(vbasedev->name);
>>
>>       /* We reach here with device state STOP or STOP_COPY only */
>> @@ -864,12 +884,159 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>       return ret;
>>   }
>>
>> +static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev,
>> +                                                                char *idstr,
>> +                                                                uint32_t instance_id,
>> +                                                                uint32_t idx)
>> +{
>> +    g_autoptr(QIOChannelBuffer) bioc = NULL;
>> +    QEMUFile *f = NULL;
>> +    int ret;
>> +    g_autofree VFIODeviceStatePacket *packet = NULL;
>> +    size_t packet_len;
>> +
>> +    bioc = qio_channel_buffer_new(0);
>> +    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");
>> +
>> +    f = qemu_file_new_output(QIO_CHANNEL(bioc));
>> +
>> +    ret = vfio_save_device_config_state(f, vbasedev, NULL);
>> +    if (ret) {
>> +        return ret;
> 
> Need to close f in this case.

Right - by the way, that's a good example why RAII
helps avoid such mistakes.

>> +    }
>> +
>> +    ret = qemu_fflush(f);
>> +    if (ret) {
>> +        goto ret_close_file;
>> +    }
>> +
>> +    packet_len = sizeof(*packet) + bioc->usage;
>> +    packet = g_malloc0(packet_len);
>> +    packet->idx = idx;
>> +    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
>> +    memcpy(&packet->data, bioc->data, bioc->usage);
>> +
>> +    if (!multifd_queue_device_state(idstr, instance_id,
>> +                                    (char *)packet, packet_len)) {
>> +        ret = -1;
> 
> goto ret_close_file?

Right, it would be better not to increment the counter in this case.

>> +    }
>> +
>> +    bytes_transferred += packet_len;
> 
> bytes_transferred is a global variable. Now that we access it from multiple threads it should be protected.

Right, this stat needs some concurrent access protection.

> Note that now the VFIO device data is reported also in multifd stats (if I am not mistaken), is this the behavior we want? Maybe we should enhance multifd stats to distinguish between RAM data and device data?

Multifd stats report total size of data transferred via multifd so
they should include device state too.

It may make sense to add a dedicated device state transfer counter
at some time though.

>> +
>> +ret_close_file:
> 
> Rename to "out" as we only have one exit point?
> 
>> +    g_clear_pointer(&f, qemu_fclose);
> 
> f is a local variable, wouldn't qemu_fclose(f) be enough here?

Sure, but why leave a dangling pointer?

Currently, it is obviously a NOP (probably deleted by dead store
elimination anyway) but the code might get refactored at some point
and I think it's good practice to always NULL pointers after freeing
them where possible and so be on the safe side.

>> +    return ret;
>> +}
>> +
>> +static int vfio_save_complete_precopy_thread(char *idstr,
>> +                                             uint32_t instance_id,
>> +                                             bool *abort_flag,
>> +                                             void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    VFIOMigration *migration = vbasedev->migration;
>> +    int ret;
>> +    g_autofree VFIODeviceStatePacket *packet = NULL;
>> +    uint32_t idx;
>> +
>> +    if (!migration->multifd_transfer) {
>> +        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
>> +        return 0;
>> +    }
>> +
>> +    trace_vfio_save_complete_precopy_thread_started(vbasedev->name,
>> +                                                    idstr, instance_id);
>> +
>> +    /* We reach here with device state STOP or STOP_COPY only */
>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
>> +                                   VFIO_DEVICE_STATE_STOP, NULL);
>> +    if (ret) {
>> +        goto ret_finish;
>> +    }
>> +
>> +    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
>> +
>> +    for (idx = 0; ; idx++) {
>> +        ssize_t data_size;
>> +        size_t packet_size;
>> +
>> +        if (qatomic_read(abort_flag)) {
>> +            ret = -ECANCELED;
>> +            goto ret_finish;
>> +        }
>> +
>> +        data_size = read(migration->data_fd, &packet->data,
>> +                         migration->data_buffer_size);
>> +        if (data_size < 0) {
>> +            if (errno != ENOMSG) {
>> +                ret = -errno;
>> +                goto ret_finish;
>> +            }
>> +
>> +            /*
>> +             * Pre-copy emptied all the device state for now. For more information,
>> +             * please refer to the Linux kernel VFIO uAPI.
>> +             */
>> +            data_size = 0;
> 
> According to VFIO uAPI, ENOMSG can only be returned in the PRE_COPY device states.
> Here we are in STOP_COPY, so we can drop the ENOMSG handling.

Will drop this ENOMSG handling.

> Thanks.

Thanks,
Maciej
Avihai Horon Sept. 12, 2024, 8:26 a.m. UTC | #3
On 09/09/2024 21:07, Maciej S. Szmigiero wrote:
> External email: Use caution opening links or attachments
>
>
> On 9.09.2024 13:41, Avihai Horon wrote:
>>
>> On 27/08/2024 20:54, Maciej S. Szmigiero wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>
>>> Implement the multifd device state transfer via additional per-device
>>> thread inside save_live_complete_precopy_thread handler.
>>>
>>> Switch between doing the data transfer in the new handler and doing it
>>> in the old save_state handler depending on the
>>> x-migration-multifd-transfer device property value.
>>>
>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>> ---
>>>   hw/vfio/migration.c           | 169 
>>> ++++++++++++++++++++++++++++++++++
>>>   hw/vfio/trace-events          |   2 +
>>>   include/hw/vfio/vfio-common.h |   1 +
>>>   3 files changed, 172 insertions(+)
>>>
>>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>>> index 57c1542528dc..67996aa2df8b 100644
>>> --- a/hw/vfio/migration.c
>>> +++ b/hw/vfio/migration.c
>>> @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void 
>>> *opaque, Error **errp)
>>>       uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
>>>       int ret;
>>>
>>> +    /* Make a copy of this setting at the start in case it is 
>>> changed mid-migration */
>>> +    migration->multifd_transfer = 
>>> vbasedev->migration_multifd_transfer;
>>
>> Should VFIO multifd be controlled by main migration multifd 
>> capability, and let the per VFIO device migration_multifd_transfer 
>> property be immutable and enabled by default?
>> Then we would have a single point of configuration (and an extra one 
>> per VFIO device just to disable for backward compatibility).
>> Unless there are other benefits to have this property configurable?
>
> We want multifd device state transfer property to be configurable 
> per-device
> in case in the future we add another device type (besides VFIO) that 
> supports
> multifd device state transfer.
>
> In this case, we might need to enable the multifd device state 
> transfer just
> for VFIO devices, but not for this new device type when we are 
> migrating to a
> QEMU target that supports just the VFIO multifd device state transfer.

I think for this case we can use hw/core/machine.c:hw_compat_X_Y arrays [1].

[1] 
https://www.qemu.org/docs/master/devel/migration/compatibility.html#how-backwards-compatibility-works

>
> TBH, I'm not opposed to adding a additional global multifd device 
> state transfer
> switch (if we keep the per-device ones too) but I am not sure what 
> value it adds.
>
>>> +
>>> +    if (migration->multifd_transfer && 
>>> !migration_has_device_state_support()) {
>>> +        error_setg(errp,
>>> +                   "%s: Multifd device transfer requested but 
>>> unsupported in the current config",
>>> +                   vbasedev->name);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>>       qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
>>>
>>>       vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
>>> @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void 
>>> *opaque)
>>>   static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>>   {
>>>       VFIODevice *vbasedev = opaque;
>>> +    VFIOMigration *migration = vbasedev->migration;
>>>       ssize_t data_size;
>>>       int ret;
>>>       Error *local_err = NULL;
>>>
>>> +    if (migration->multifd_transfer) {
>>> +        /*
>>> +         * Emit dummy NOP data, vfio_save_complete_precopy_thread()
>>> +         * does the actual transfer.
>>> +         */
>>> +        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>>
>> There are three places where we send this dummy end of state, maybe 
>> worth extracting it to a helper? I.e., vfio_send_end_of_state() and 
>> then document there the rationale.
>
> I'm not totally against it but it's wrapping just a single line of 
> code in
> a separate function?

Yes, it's more for self-documentation purpose and for not duplicating 
comments.
I guess it's a matter of taste, so we can go either way and let 
maintainer decide.

>
>>> +        return 0;
>>> +    }
>>> +
>>> trace_vfio_save_complete_precopy_started(vbasedev->name);
>>>
>>>       /* We reach here with device state STOP or STOP_COPY only */
>>> @@ -864,12 +884,159 @@ static int 
>>> vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>>       return ret;
>>>   }
>>>
>>> +static int 
>>> vfio_save_complete_precopy_async_thread_config_state(VFIODevice 
>>> *vbasedev,
>>> +                                                                
>>> char *idstr,
>>> + uint32_t instance_id,
>>> + uint32_t idx)
>>> +{
>>> +    g_autoptr(QIOChannelBuffer) bioc = NULL;
>>> +    QEMUFile *f = NULL;
>>> +    int ret;
>>> +    g_autofree VFIODeviceStatePacket *packet = NULL;
>>> +    size_t packet_len;
>>> +
>>> +    bioc = qio_channel_buffer_new(0);
>>> +    qio_channel_set_name(QIO_CHANNEL(bioc), 
>>> "vfio-device-config-save");
>>> +
>>> +    f = qemu_file_new_output(QIO_CHANNEL(bioc));
>>> +
>>> +    ret = vfio_save_device_config_state(f, vbasedev, NULL);
>>> +    if (ret) {
>>> +        return ret;
>>
>> Need to close f in this case.
>
> Right - by the way, that's a good example why RAII
> helps avoid such mistakes.

Agreed :)

>
>>> +    }
>>> +
>>> +    ret = qemu_fflush(f);
>>> +    if (ret) {
>>> +        goto ret_close_file;
>>> +    }
>>> +
>>> +    packet_len = sizeof(*packet) + bioc->usage;
>>> +    packet = g_malloc0(packet_len);
>>> +    packet->idx = idx;
>>> +    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
>>> +    memcpy(&packet->data, bioc->data, bioc->usage);
>>> +
>>> +    if (!multifd_queue_device_state(idstr, instance_id,
>>> +                                    (char *)packet, packet_len)) {
>>> +        ret = -1;
>>
>> goto ret_close_file?
>
> Right, it would be better not to increment the counter in this case.
>
>>> +    }
>>> +
>>> +    bytes_transferred += packet_len;
>>
>> bytes_transferred is a global variable. Now that we access it from 
>> multiple threads it should be protected.
>
> Right, this stat needs some concurrent access protection.
>
>> Note that now the VFIO device data is reported also in multifd stats 
>> (if I am not mistaken), is this the behavior we want? Maybe we should 
>> enhance multifd stats to distinguish between RAM data and device data?
>
> Multifd stats report total size of data transferred via multifd so
> they should include device state too.

Yes I agree. But now we are reporting double the amount of VFIO data 
that we actually transfer (once in "vfio device transferred" and another 
in multifd stats) and this may be misleading.
So maybe we should add a dedicated multifd device state counter and 
report VFIO multifd bytes there instead of in bytes_transferred?
We can wait for other people's opinion about that.

>
> It may make sense to add a dedicated device state transfer counter
> at some time though.
>
>>> +
>>> +ret_close_file:
>>
>> Rename to "out" as we only have one exit point?
>>
>>> +    g_clear_pointer(&f, qemu_fclose);
>>
>> f is a local variable, wouldn't qemu_fclose(f) be enough here?
>
> Sure, but why leave a dangling pointer?
>
> Currently, it is obviously a NOP (probably deleted by dead store
> elimination anyway) but the code might get refactored at some point
> and I think it's good practice to always NULL pointers after freeing
> them where possible and so be on the safe side.

Ack.

Thanks.
Cédric Le Goater Sept. 12, 2024, 8:57 a.m. UTC | #4
On 9/12/24 10:26, Avihai Horon wrote:
> 
> On 09/09/2024 21:07, Maciej S. Szmigiero wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 9.09.2024 13:41, Avihai Horon wrote:
>>>
>>> On 27/08/2024 20:54, Maciej S. Szmigiero wrote:
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
>>>>
>>>> Implement the multifd device state transfer via additional per-device
>>>> thread inside save_live_complete_precopy_thread handler.
>>>>
>>>> Switch between doing the data transfer in the new handler and doing it
>>>> in the old save_state handler depending on the
>>>> x-migration-multifd-transfer device property value.
>>>>
>>>> Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
>>>> ---
>>>>   hw/vfio/migration.c           | 169 ++++++++++++++++++++++++++++++++++
>>>>   hw/vfio/trace-events          |   2 +
>>>>   include/hw/vfio/vfio-common.h |   1 +
>>>>   3 files changed, 172 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>>>> index 57c1542528dc..67996aa2df8b 100644
>>>> --- a/hw/vfio/migration.c
>>>> +++ b/hw/vfio/migration.c
>>>> @@ -655,6 +655,16 @@ static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
>>>>       uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
>>>>       int ret;
>>>>
>>>> +    /* Make a copy of this setting at the start in case it is changed mid-migration */
>>>> +    migration->multifd_transfer = vbasedev->migration_multifd_transfer;
>>>
>>> Should VFIO multifd be controlled by main migration multifd capability, and let the per VFIO device migration_multifd_transfer property be immutable and enabled by default?
>>> Then we would have a single point of configuration (and an extra one per VFIO device just to disable for backward compatibility).
>>> Unless there are other benefits to have this property configurable?
>>
>> We want multifd device state transfer property to be configurable per-device
>> in case in the future we add another device type (besides VFIO) that supports
>> multifd device state transfer.
>>
>> In this case, we might need to enable the multifd device state transfer just
>> for VFIO devices, but not for this new device type when we are migrating to a
>> QEMU target that supports just the VFIO multifd device state transfer.
> 
> I think for this case we can use hw/core/machine.c:hw_compat_X_Y arrays [1].
> 
> [1] https://www.qemu.org/docs/master/devel/migration/compatibility.html#how-backwards-compatibility-works
> 
>>
>> TBH, I'm not opposed to adding a additional global multifd device state transfer
>> switch (if we keep the per-device ones too) but I am not sure what value it adds.
>>
>>>> +
>>>> +    if (migration->multifd_transfer && !migration_has_device_state_support()) {
>>>> +        error_setg(errp,
>>>> +                   "%s: Multifd device transfer requested but unsupported in the current config",
>>>> +                   vbasedev->name);
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>>       qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
>>>>
>>>>       vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
>>>> @@ -835,10 +845,20 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>>>>   static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>>>   {
>>>>       VFIODevice *vbasedev = opaque;
>>>> +    VFIOMigration *migration = vbasedev->migration;
>>>>       ssize_t data_size;
>>>>       int ret;
>>>>       Error *local_err = NULL;
>>>>
>>>> +    if (migration->multifd_transfer) {
>>>> +        /*
>>>> +         * Emit dummy NOP data, vfio_save_complete_precopy_thread()
>>>> +         * does the actual transfer.
>>>> +         */
>>>> +        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>>>
>>> There are three places where we send this dummy end of state, maybe worth extracting it to a helper? I.e., vfio_send_end_of_state() and then document there the rationale.
>>
>> I'm not totally against it but it's wrapping just a single line of code in
>> a separate function?
> 
> Yes, it's more for self-documentation purpose and for not duplicating comments.
> I guess it's a matter of taste, so we can go either way and let maintainer decide.

I'd prefer an helper too. This comment applies to all additions
in pre-existing code. Ideally new routines should have a
'vfio_{migration,save,load}_multifd' prefix so that the reader
understands what the code is for.


Thanks,

C.


> 
>>
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> trace_vfio_save_complete_precopy_started(vbasedev->name);
>>>>
>>>>       /* We reach here with device state STOP or STOP_COPY only */
>>>> @@ -864,12 +884,159 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>>>       return ret;
>>>>   }
>>>>
>>>> +static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev,
>>>> + char *idstr,
>>>> + uint32_t instance_id,
>>>> + uint32_t idx)
>>>> +{
>>>> +    g_autoptr(QIOChannelBuffer) bioc = NULL;
>>>> +    QEMUFile *f = NULL;
>>>> +    int ret;
>>>> +    g_autofree VFIODeviceStatePacket *packet = NULL;
>>>> +    size_t packet_len;
>>>> +
>>>> +    bioc = qio_channel_buffer_new(0);
>>>> +    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");
>>>> +
>>>> +    f = qemu_file_new_output(QIO_CHANNEL(bioc));
>>>> +
>>>> +    ret = vfio_save_device_config_state(f, vbasedev, NULL);
>>>> +    if (ret) {
>>>> +        return ret;
>>>
>>> Need to close f in this case.
>>
>> Right - by the way, that's a good example why RAII
>> helps avoid such mistakes.
> 
> Agreed :)
> 
>>
>>>> +    }
>>>> +
>>>> +    ret = qemu_fflush(f);
>>>> +    if (ret) {
>>>> +        goto ret_close_file;
>>>> +    }
>>>> +
>>>> +    packet_len = sizeof(*packet) + bioc->usage;
>>>> +    packet = g_malloc0(packet_len);
>>>> +    packet->idx = idx;
>>>> +    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
>>>> +    memcpy(&packet->data, bioc->data, bioc->usage);
>>>> +
>>>> +    if (!multifd_queue_device_state(idstr, instance_id,
>>>> +                                    (char *)packet, packet_len)) {
>>>> +        ret = -1;
>>>
>>> goto ret_close_file?
>>
>> Right, it would be better not to increment the counter in this case.
>>
>>>> +    }
>>>> +
>>>> +    bytes_transferred += packet_len;
>>>
>>> bytes_transferred is a global variable. Now that we access it from multiple threads it should be protected.
>>
>> Right, this stat needs some concurrent access protection.
>>
>>> Note that now the VFIO device data is reported also in multifd stats (if I am not mistaken), is this the behavior we want? Maybe we should enhance multifd stats to distinguish between RAM data and device data?
>>
>> Multifd stats report total size of data transferred via multifd so
>> they should include device state too.
> 
> Yes I agree. But now we are reporting double the amount of VFIO data that we actually transfer (once in "vfio device transferred" and another in multifd stats) and this may be misleading.
> So maybe we should add a dedicated multifd device state counter and report VFIO multifd bytes there instead of in bytes_transferred?
> We can wait for other people's opinion about that.
> 
>>
>> It may make sense to add a dedicated device state transfer counter
>> at some time though.
>>
>>>> +
>>>> +ret_close_file:
>>>
>>> Rename to "out" as we only have one exit point?
>>>
>>>> +    g_clear_pointer(&f, qemu_fclose);
>>>
>>> f is a local variable, wouldn't qemu_fclose(f) be enough here?
>>
>> Sure, but why leave a dangling pointer?
>>
>> Currently, it is obviously a NOP (probably deleted by dead store
>> elimination anyway) but the code might get refactored at some point
>> and I think it's good practice to always NULL pointers after freeing
>> them where possible and so be on the safe side.
> 
> Ack.
> 
> Thanks.
>
diff mbox series

Patch

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 57c1542528dc..67996aa2df8b 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -655,6 +655,16 @@  static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
     uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
     int ret;
 
+    /* Make a copy of this setting at the start in case it is changed mid-migration */
+    migration->multifd_transfer = vbasedev->migration_multifd_transfer;
+
+    if (migration->multifd_transfer && !migration_has_device_state_support()) {
+        error_setg(errp,
+                   "%s: Multifd device transfer requested but unsupported in the current config",
+                   vbasedev->name);
+        return -EINVAL;
+    }
+
     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
 
     vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
@@ -835,10 +845,20 @@  static int vfio_save_iterate(QEMUFile *f, void *opaque)
 static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
     ssize_t data_size;
     int ret;
     Error *local_err = NULL;
 
+    if (migration->multifd_transfer) {
+        /*
+         * Emit dummy NOP data, vfio_save_complete_precopy_thread()
+         * does the actual transfer.
+         */
+        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+        return 0;
+    }
+
     trace_vfio_save_complete_precopy_started(vbasedev->name);
 
     /* We reach here with device state STOP or STOP_COPY only */
@@ -864,12 +884,159 @@  static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
     return ret;
 }
 
+static int vfio_save_complete_precopy_async_thread_config_state(VFIODevice *vbasedev,
+                                                                char *idstr,
+                                                                uint32_t instance_id,
+                                                                uint32_t idx)
+{
+    g_autoptr(QIOChannelBuffer) bioc = NULL;
+    QEMUFile *f = NULL;
+    int ret;
+    g_autofree VFIODeviceStatePacket *packet = NULL;
+    size_t packet_len;
+
+    bioc = qio_channel_buffer_new(0);
+    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");
+
+    f = qemu_file_new_output(QIO_CHANNEL(bioc));
+
+    ret = vfio_save_device_config_state(f, vbasedev, NULL);
+    if (ret) {
+        return ret;
+    }
+
+    ret = qemu_fflush(f);
+    if (ret) {
+        goto ret_close_file;
+    }
+
+    packet_len = sizeof(*packet) + bioc->usage;
+    packet = g_malloc0(packet_len);
+    packet->idx = idx;
+    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
+    memcpy(&packet->data, bioc->data, bioc->usage);
+
+    if (!multifd_queue_device_state(idstr, instance_id,
+                                    (char *)packet, packet_len)) {
+        ret = -1;
+    }
+
+    bytes_transferred += packet_len;
+
+ret_close_file:
+    g_clear_pointer(&f, qemu_fclose);
+    return ret;
+}
+
+static int vfio_save_complete_precopy_thread(char *idstr,
+                                             uint32_t instance_id,
+                                             bool *abort_flag,
+                                             void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    int ret;
+    g_autofree VFIODeviceStatePacket *packet = NULL;
+    uint32_t idx;
+
+    if (!migration->multifd_transfer) {
+        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
+        return 0;
+    }
+
+    trace_vfio_save_complete_precopy_thread_started(vbasedev->name,
+                                                    idstr, instance_id);
+
+    /* We reach here with device state STOP or STOP_COPY only */
+    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
+                                   VFIO_DEVICE_STATE_STOP, NULL);
+    if (ret) {
+        goto ret_finish;
+    }
+
+    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
+
+    for (idx = 0; ; idx++) {
+        ssize_t data_size;
+        size_t packet_size;
+
+        if (qatomic_read(abort_flag)) {
+            ret = -ECANCELED;
+            goto ret_finish;
+        }
+
+        data_size = read(migration->data_fd, &packet->data,
+                         migration->data_buffer_size);
+        if (data_size < 0) {
+            if (errno != ENOMSG) {
+                ret = -errno;
+                goto ret_finish;
+            }
+
+            /*
+             * Pre-copy emptied all the device state for now. For more information,
+             * please refer to the Linux kernel VFIO uAPI.
+             */
+            data_size = 0;
+        }
+
+        if (data_size == 0)
+            break;
+
+        packet->idx = idx;
+        packet_size = sizeof(*packet) + data_size;
+
+        if (!multifd_queue_device_state(idstr, instance_id,
+                                        (char *)packet, packet_size)) {
+            ret = -1;
+            goto ret_finish;
+        }
+
+        bytes_transferred += packet_size;
+    }
+
+    ret = vfio_save_complete_precopy_async_thread_config_state(vbasedev, idstr,
+                                                               instance_id,
+                                                               idx);
+
+ret_finish:
+    trace_vfio_save_complete_precopy_thread_finished(vbasedev->name, ret);
+
+    return ret;
+}
+
+static int vfio_save_complete_precopy_begin(QEMUFile *f,
+                                            char *idstr, uint32_t instance_id,
+                                            void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+
+    if (!migration->multifd_transfer) {
+        /* Emit dummy NOP data */
+        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+        return 0;
+    }
+
+    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE_COMPLETE);
+    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+
+    return qemu_fflush(f);
+}
+
 static void vfio_save_state(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
     Error *local_err = NULL;
     int ret;
 
+    if (migration->multifd_transfer) {
+        /* Emit dummy NOP data */
+        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+        return;
+    }
+
     ret = vfio_save_device_config_state(f, opaque, &local_err);
     if (ret) {
         error_prepend(&local_err,
@@ -1119,7 +1286,9 @@  static const SaveVMHandlers savevm_vfio_handlers = {
     .state_pending_exact = vfio_state_pending_exact,
     .is_active_iterate = vfio_is_active_iterate,
     .save_live_iterate = vfio_save_iterate,
+    .save_live_complete_precopy_begin = vfio_save_complete_precopy_begin,
     .save_live_complete_precopy = vfio_save_complete_precopy,
+    .save_live_complete_precopy_thread = vfio_save_complete_precopy_thread,
     .save_state = vfio_save_state,
     .load_setup = vfio_load_setup,
     .load_cleanup = vfio_load_cleanup,
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 9d2519a28a7e..b1d9c9d5f2e1 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -167,6 +167,8 @@  vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
 vfio_save_cleanup(const char *name) " (%s)"
 vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
 vfio_save_complete_precopy_started(const char *name) " (%s)"
+vfio_save_complete_precopy_thread_started(const char *name, const char *idstr, uint32_t instance_id) " (%s) idstr %s instance %"PRIu32
+vfio_save_complete_precopy_thread_finished(const char *name, int ret) " (%s) ret %d"
 vfio_save_device_config_state(const char *name) " (%s)"
 vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
 vfio_save_iterate_started(const char *name) " (%s)"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index fe05acb9a5d1..4578a0ca6a5c 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -72,6 +72,7 @@  typedef struct VFIOMigration {
     uint64_t mig_flags;
     uint64_t precopy_init_size;
     uint64_t precopy_dirty_size;
+    bool multifd_transfer;
     bool initial_data_sent;
 
     bool save_iterate_run;