diff mbox series

[v2,07/11] vfio/migration: Implement VFIO migration protocol v2

Message ID 20220530170739.19072-8-avihaih@nvidia.com (mailing list archive)
State New, archived
Headers show
Series vfio/migration: Implement VFIO migration protocol v2 | expand

Commit Message

Avihai Horon May 30, 2022, 5:07 p.m. UTC
Add implementation of VFIO migration protocol v2. The two protocols, v1
and v2, will co-exist and in next patch v1 protocol will be removed.

There are several main differences between v1 and v2 protocols:
- VFIO device state is now represented as a finite state machine instead
  of a bitmap.

- Migration interface with kernel is now done using VFIO_DEVICE_FEATURE
  ioctl and normal read() and write() instead of the migration region.

- VFIO migration protocol v2 currently doesn't support the pre-copy
  phase of migration.

Detailed information about VFIO migration protocol v2 and difference
compared to v1 can be found here [1].

[1]
https://lore.kernel.org/all/20220224142024.147653-10-yishaih@nvidia.com/

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 hw/vfio/common.c              |  19 +-
 hw/vfio/migration.c           | 365 ++++++++++++++++++++++++++++++----
 hw/vfio/trace-events          |   2 +
 include/hw/vfio/vfio-common.h |   5 +
 4 files changed, 354 insertions(+), 37 deletions(-)

Comments

Joao Martins June 14, 2022, 11:08 a.m. UTC | #1
On 5/30/22 18:07, Avihai Horon wrote:
> +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    enum vfio_device_mig_state recover_state;
> +    int ret;
> +
> +    /* We reach here with device state STOP or STOP_COPY only */
> +    recover_state = VFIO_DEVICE_STATE_STOP;
> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
> +                                   recover_state);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    do {
> +        ret = vfio_save_block(f, vbasedev->migration);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    } while (!ret);
> +
> +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
> +    ret = qemu_file_get_error(f);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
> +                                   recover_state);

Is it expected that you are setting VFIO_DEVICE_STATE_STOP while
@recover_state is the same value (VFIO_DEVICE_STATE_STOP) ?

> +    if (ret) {
> +        return ret;
> +    }
> +
> +    trace_vfio_save_complete_precopy(vbasedev->name);
> +
> +    return 0;
> +}
> +
>  static int vfio_v1_save_complete_precopy(QEMUFile *f, void *opaque)
>  {
>      VFIODevice *vbasedev = opaque;
> @@ -593,6 +775,14 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
>      }
>  }
>  
> +static int vfio_load_setup(QEMUFile *f, void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +
> +    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
> +                                   vbasedev->migration->device_state);
> +}
> +
>  static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>  {
>      VFIODevice *vbasedev = opaque;
> @@ -620,6 +810,15 @@ static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>      return ret;
>  }
>  
> +static int vfio_load_cleanup(void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +
> +    vfio_migration_cleanup(vbasedev);
> +    trace_vfio_load_cleanup(vbasedev->name);
> +    return 0;
> +}
> +
>  static int vfio_v1_load_cleanup(void *opaque)
>  {
>      VFIODevice *vbasedev = opaque;
> @@ -662,7 +861,11 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>              uint64_t data_size = qemu_get_be64(f);
>  
>              if (data_size) {
> -                ret = vfio_v1_load_buffer(f, vbasedev, data_size);
> +                if (vbasedev->migration->v2) {
> +                    ret = vfio_load_buffer(f, vbasedev, data_size);
> +                } else {
> +                    ret = vfio_v1_load_buffer(f, vbasedev, data_size);
> +                }
>                  if (ret < 0) {
>                      return ret;
>                  }
> @@ -683,6 +886,16 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>      return ret;
>  }
>  
> +static SaveVMHandlers savevm_vfio_handlers = {
> +    .save_setup = vfio_save_setup,
> +    .save_cleanup = vfio_save_cleanup,
> +    .save_live_complete_precopy = vfio_save_complete_precopy,
> +    .save_state = vfio_save_state,
> +    .load_setup = vfio_load_setup,
> +    .load_cleanup = vfio_load_cleanup,
> +    .load_state = vfio_load_state,
> +};
> +
>  static SaveVMHandlers savevm_vfio_v1_handlers = {
>      .save_setup = vfio_v1_save_setup,
>      .save_cleanup = vfio_v1_save_cleanup,
> @@ -697,6 +910,34 @@ static SaveVMHandlers savevm_vfio_v1_handlers = {
>  
>  /* ---------------------------------------------------------------------- */
>  
> +static void vfio_vmstate_change(void *opaque, bool running, RunState state)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    enum vfio_device_mig_state new_state;
> +    int ret;
> +
> +    if (running) {
> +        new_state = VFIO_DEVICE_STATE_RUNNING;
> +    } else {
> +        new_state = VFIO_DEVICE_STATE_STOP;
> +    }
> +
> +    ret = vfio_migration_set_state(vbasedev, new_state,
> +                                   VFIO_DEVICE_STATE_ERROR);
> +    if (ret) {
> +        /*
> +         * Migration should be aborted in this case, but vm_state_notify()
> +         * currently does not support reporting failures.
> +         */
> +        if (migrate_get_current()->to_dst_file) {
> +            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
> +        }
> +    }
> +
> +    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
> +                              new_state);
> +}
> +
>  static void vfio_v1_vmstate_change(void *opaque, bool running, RunState state)
>  {
>      VFIODevice *vbasedev = opaque;
> @@ -770,12 +1011,17 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
>      case MIGRATION_STATUS_CANCELLED:
>      case MIGRATION_STATUS_FAILED:
>          bytes_transferred = 0;
> -        ret = vfio_migration_v1_set_state(vbasedev,
> -                                          ~(VFIO_DEVICE_STATE_V1_SAVING |
> -                                            VFIO_DEVICE_STATE_V1_RESUMING),
> -                                          VFIO_DEVICE_STATE_V1_RUNNING);
> -        if (ret) {
> -            error_report("%s: Failed to set state RUNNING", vbasedev->name);
> +        if (migration->v2) {
> +            vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
> +                                     VFIO_DEVICE_STATE_ERROR);

Perhaps you are discarding the error?

Shouldn't it be:

	err =  vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
					VFIO_DEVICE_STATE_ERROR);

> +        } else {
> +            ret = vfio_migration_v1_set_state(vbasedev,
> +                                              ~(VFIO_DEVICE_STATE_V1_SAVING |
> +                                                VFIO_DEVICE_STATE_V1_RESUMING),
> +                                              VFIO_DEVICE_STATE_V1_RUNNING);
> +            if (ret) {
> +                error_report("%s: Failed to set state RUNNING", vbasedev->name);
> +            }

Perhaps this error_report and condition is in the wrong scope?

Shouldn't it be more like this:

if (migration->v2) {
	ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
				 VFIO_DEVICE_STATE_ERROR);
} else {
        ret = vfio_migration_v1_set_state(vbasedev,
                                          ~(VFIO_DEVICE_STATE_V1_SAVING |
                                            VFIO_DEVICE_STATE_V1_RESUMING),
                                          VFIO_DEVICE_STATE_V1_RUNNING);
}


if (ret) {
    error_report("%s: Failed to set state RUNNING", vbasedev->name);
}

>          }
>      }
>  }
> @@ -784,12 +1030,35 @@ static void vfio_migration_exit(VFIODevice *vbasedev)
>  {
>      VFIOMigration *migration = vbasedev->migration;
>  
> -    vfio_region_exit(&migration->region);
> -    vfio_region_finalize(&migration->region);
> +    if (migration->v2) {
> +        g_free(migration->data_buffer);
> +    } else {
> +        vfio_region_exit(&migration->region);
> +        vfio_region_finalize(&migration->region);
> +    }
>      g_free(vbasedev->migration);
>      vbasedev->migration = NULL;
>  }
>  
> +static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
> +{
> +    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
> +                                  sizeof(struct vfio_device_feature_migration),
> +                              sizeof(uint64_t))] = {};
> +    struct vfio_device_feature *feature = (void *)buf;
> +    struct vfio_device_feature_migration *mig = (void *)feature->data;
> +
> +    feature->argsz = sizeof(buf);
> +    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
> +    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
> +        return -EOPNOTSUPP;
> +    }
> +
> +    *mig_flags = mig->flags;
> +
> +    return 0;
> +}
> +
>  static int vfio_migration_init(VFIODevice *vbasedev)
>  {
>      int ret;
> @@ -798,6 +1067,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
>      char id[256] = "";
>      g_autofree char *path = NULL, *oid = NULL;
>      struct vfio_region_info *info = NULL;
> +    uint64_t mig_flags;
>  
>      if (!vbasedev->ops->vfio_get_object) {
>          return -EINVAL;
> @@ -808,32 +1078,48 @@ static int vfio_migration_init(VFIODevice *vbasedev)
>          return -EINVAL;
>      }
>  
> -    ret = vfio_get_dev_region_info(vbasedev,
> -                                   VFIO_REGION_TYPE_MIGRATION_DEPRECATED,
> -                                   VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED,
> -                                   &info);
> -    if (ret) {
> -        return ret;
> -    }
> +    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
> +    if (!ret) {
> +        /* Migration v2 */
> +        /* Basic migration functionality must be supported */
> +        if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
> +            return -EOPNOTSUPP;
> +        }
> +        vbasedev->migration = g_new0(VFIOMigration, 1);
> +        vbasedev->migration->data_buffer_size = VFIO_MIG_DATA_BUFFER_SIZE;
> +        vbasedev->migration->data_buffer =
> +            g_malloc0(vbasedev->migration->data_buffer_size);
> +        vbasedev->migration->data_fd = -1;
> +        vbasedev->migration->v2 = true;
> +    } else {
> +        /* Migration v1 */
> +        ret = vfio_get_dev_region_info(vbasedev,
> +                                       VFIO_REGION_TYPE_MIGRATION_DEPRECATED,
> +                                       VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED,
> +                                       &info);
> +        if (ret) {
> +            return ret;
> +        }
>  
> -    vbasedev->migration = g_new0(VFIOMigration, 1);
> +        vbasedev->migration = g_new0(VFIOMigration, 1);
>  
> -    ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
> -                            info->index, "migration");
> -    if (ret) {
> -        error_report("%s: Failed to setup VFIO migration region %d: %s",
> -                     vbasedev->name, info->index, strerror(-ret));
> -        goto err;
> -    }
> +        ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
> +                                info->index, "migration");
> +        if (ret) {
> +            error_report("%s: Failed to setup VFIO migration region %d: %s",
> +                         vbasedev->name, info->index, strerror(-ret));
> +            goto err;
> +        }
>  
> -    if (!vbasedev->migration->region.size) {
> -        error_report("%s: Invalid zero-sized VFIO migration region %d",
> -                     vbasedev->name, info->index);
> -        ret = -EINVAL;
> -        goto err;
> -    }
> +        if (!vbasedev->migration->region.size) {
> +            error_report("%s: Invalid zero-sized VFIO migration region %d",
> +                         vbasedev->name, info->index);
> +            ret = -EINVAL;
> +            goto err;
> +        }
>  
> -    g_free(info);
> +        g_free(info);
> +    }
>  
>      migration = vbasedev->migration;
>      migration->vbasedev = vbasedev;
> @@ -846,11 +1132,20 @@ static int vfio_migration_init(VFIODevice *vbasedev)
>      }
>      strpadcpy(id, sizeof(id), path, '\0');
>  
> -    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
> -                         &savevm_vfio_v1_handlers, vbasedev);
> +    if (migration->v2) {
> +        register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
> +                             &savevm_vfio_handlers, vbasedev);
> +
> +        migration->vm_state = qdev_add_vm_change_state_handler(
> +            vbasedev->dev, vfio_vmstate_change, vbasedev);
> +    } else {
> +        register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
> +                             &savevm_vfio_v1_handlers, vbasedev);
> +
> +        migration->vm_state = qdev_add_vm_change_state_handler(
> +            vbasedev->dev, vfio_v1_vmstate_change, vbasedev);
> +    }
>  
> -    migration->vm_state = qdev_add_vm_change_state_handler(
> -        vbasedev->dev, vfio_v1_vmstate_change, vbasedev);
>      migration->migration_state.notify = vfio_migration_state_notifier;
>      add_migration_state_change_notifier(&migration->migration_state);
>      return 0;
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index ac8b04f52a..6e8c5958b9 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -163,6 +163,8 @@ vfio_save_complete_precopy(const char *name) " (%s)"
>  vfio_load_device_config_state(const char *name) " (%s)"
>  vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
>  vfio_v1_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
> +vfio_load_state_device_data(const char *name, uint64_t data_size) " (%s) size 0x%"PRIx64
>  vfio_load_cleanup(const char *name) " (%s)"
>  vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
>  vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
> +vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index bbaf72ba00..2ec3346fea 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -66,6 +66,11 @@ typedef struct VFIOMigration {
>      int vm_running;
>      Notifier migration_state;
>      uint64_t pending_bytes;
> +    enum vfio_device_mig_state device_state;
> +    int data_fd;
> +    void *data_buffer;
> +    size_t data_buffer_size;
> +    bool v2;
>  } VFIOMigration;
>  
>  typedef struct VFIOAddressSpace {
Avihai Horon June 14, 2022, 4:34 p.m. UTC | #2
On 6/14/2022 2:08 PM, Joao Martins wrote:
> External email: Use caution opening links or attachments
>
>
> On 5/30/22 18:07, Avihai Horon wrote:
>> +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    enum vfio_device_mig_state recover_state;
>> +    int ret;
>> +
>> +    /* We reach here with device state STOP or STOP_COPY only */
>> +    recover_state = VFIO_DEVICE_STATE_STOP;
>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
>> +                                   recover_state);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    do {
>> +        ret = vfio_save_block(f, vbasedev->migration);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    } while (!ret);
>> +
>> +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>> +    ret = qemu_file_get_error(f);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
>> +                                   recover_state);
> Is it expected that you are setting VFIO_DEVICE_STATE_STOP while
> @recover_state is the same value (VFIO_DEVICE_STATE_STOP) ?


Yes.
Transitioning to any other state from STOP_COPY will first go through 
STOP state (this is done internally by kernel).
So there is no better option for the recover state but STOP.

>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    trace_vfio_save_complete_precopy(vbasedev->name);
>> +
>> +    return 0;
>> +}
>> +
>>   static int vfio_v1_save_complete_precopy(QEMUFile *f, void *opaque)
>>   {
>>       VFIODevice *vbasedev = opaque;
>> @@ -593,6 +775,14 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
>>       }
>>   }
>>
>> +static int vfio_load_setup(QEMUFile *f, void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +
>> +    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
>> +                                   vbasedev->migration->device_state);
>> +}
>> +
>>   static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>>   {
>>       VFIODevice *vbasedev = opaque;
>> @@ -620,6 +810,15 @@ static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>>       return ret;
>>   }
>>
>> +static int vfio_load_cleanup(void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +
>> +    vfio_migration_cleanup(vbasedev);
>> +    trace_vfio_load_cleanup(vbasedev->name);
>> +    return 0;
>> +}
>> +
>>   static int vfio_v1_load_cleanup(void *opaque)
>>   {
>>       VFIODevice *vbasedev = opaque;
>> @@ -662,7 +861,11 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>               uint64_t data_size = qemu_get_be64(f);
>>
>>               if (data_size) {
>> -                ret = vfio_v1_load_buffer(f, vbasedev, data_size);
>> +                if (vbasedev->migration->v2) {
>> +                    ret = vfio_load_buffer(f, vbasedev, data_size);
>> +                } else {
>> +                    ret = vfio_v1_load_buffer(f, vbasedev, data_size);
>> +                }
>>                   if (ret < 0) {
>>                       return ret;
>>                   }
>> @@ -683,6 +886,16 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>       return ret;
>>   }
>>
>> +static SaveVMHandlers savevm_vfio_handlers = {
>> +    .save_setup = vfio_save_setup,
>> +    .save_cleanup = vfio_save_cleanup,
>> +    .save_live_complete_precopy = vfio_save_complete_precopy,
>> +    .save_state = vfio_save_state,
>> +    .load_setup = vfio_load_setup,
>> +    .load_cleanup = vfio_load_cleanup,
>> +    .load_state = vfio_load_state,
>> +};
>> +
>>   static SaveVMHandlers savevm_vfio_v1_handlers = {
>>       .save_setup = vfio_v1_save_setup,
>>       .save_cleanup = vfio_v1_save_cleanup,
>> @@ -697,6 +910,34 @@ static SaveVMHandlers savevm_vfio_v1_handlers = {
>>
>>   /* ---------------------------------------------------------------------- */
>>
>> +static void vfio_vmstate_change(void *opaque, bool running, RunState state)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    enum vfio_device_mig_state new_state;
>> +    int ret;
>> +
>> +    if (running) {
>> +        new_state = VFIO_DEVICE_STATE_RUNNING;
>> +    } else {
>> +        new_state = VFIO_DEVICE_STATE_STOP;
>> +    }
>> +
>> +    ret = vfio_migration_set_state(vbasedev, new_state,
>> +                                   VFIO_DEVICE_STATE_ERROR);
>> +    if (ret) {
>> +        /*
>> +         * Migration should be aborted in this case, but vm_state_notify()
>> +         * currently does not support reporting failures.
>> +         */
>> +        if (migrate_get_current()->to_dst_file) {
>> +            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
>> +        }
>> +    }
>> +
>> +    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
>> +                              new_state);
>> +}
>> +
>>   static void vfio_v1_vmstate_change(void *opaque, bool running, RunState state)
>>   {
>>       VFIODevice *vbasedev = opaque;
>> @@ -770,12 +1011,17 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
>>       case MIGRATION_STATUS_CANCELLED:
>>       case MIGRATION_STATUS_FAILED:
>>           bytes_transferred = 0;
>> -        ret = vfio_migration_v1_set_state(vbasedev,
>> -                                          ~(VFIO_DEVICE_STATE_V1_SAVING |
>> -                                            VFIO_DEVICE_STATE_V1_RESUMING),
>> -                                          VFIO_DEVICE_STATE_V1_RUNNING);
>> -        if (ret) {
>> -            error_report("%s: Failed to set state RUNNING", vbasedev->name);
>> +        if (migration->v2) {
>> +            vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>> +                                     VFIO_DEVICE_STATE_ERROR);
> Perhaps you are discarding the error?
>
> Shouldn't it be:
>
>          err =  vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>                                          VFIO_DEVICE_STATE_ERROR);
>
>> +        } else {
>> +            ret = vfio_migration_v1_set_state(vbasedev,
>> +                                              ~(VFIO_DEVICE_STATE_V1_SAVING |
>> +                                                VFIO_DEVICE_STATE_V1_RESUMING),
>> +                                              VFIO_DEVICE_STATE_V1_RUNNING);
>> +            if (ret) {
>> +                error_report("%s: Failed to set state RUNNING", vbasedev->name);
>> +            }
> Perhaps this error_report and condition is in the wrong scope?
>
> Shouldn't it be more like this:
>
> if (migration->v2) {
>          ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>                                   VFIO_DEVICE_STATE_ERROR);
> } else {
>          ret = vfio_migration_v1_set_state(vbasedev,
>                                            ~(VFIO_DEVICE_STATE_V1_SAVING |
>                                              VFIO_DEVICE_STATE_V1_RESUMING),
>                                            VFIO_DEVICE_STATE_V1_RUNNING);
> }
>
>
> if (ret) {
>      error_report("%s: Failed to set state RUNNING", vbasedev->name);
> }


It was intentionally discarded.
The return value is used by v1 code to determine whether to print an 
error message or not.
In v2 code the error message print is done inside 
vfio_migration_set_state(), so there is no
need for the return value here.

>>           }
>>       }
>>   }
>> @@ -784,12 +1030,35 @@ static void vfio_migration_exit(VFIODevice *vbasedev)
>>   {
>>       VFIOMigration *migration = vbasedev->migration;
>>
>> -    vfio_region_exit(&migration->region);
>> -    vfio_region_finalize(&migration->region);
>> +    if (migration->v2) {
>> +        g_free(migration->data_buffer);
>> +    } else {
>> +        vfio_region_exit(&migration->region);
>> +        vfio_region_finalize(&migration->region);
>> +    }
>>       g_free(vbasedev->migration);
>>       vbasedev->migration = NULL;
>>   }
>>
>> +static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
>> +{
>> +    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
>> +                                  sizeof(struct vfio_device_feature_migration),
>> +                              sizeof(uint64_t))] = {};
>> +    struct vfio_device_feature *feature = (void *)buf;
>> +    struct vfio_device_feature_migration *mig = (void *)feature->data;
>> +
>> +    feature->argsz = sizeof(buf);
>> +    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
>> +    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
>> +        return -EOPNOTSUPP;
>> +    }
>> +
>> +    *mig_flags = mig->flags;
>> +
>> +    return 0;
>> +}
>> +
>>   static int vfio_migration_init(VFIODevice *vbasedev)
>>   {
>>       int ret;
>> @@ -798,6 +1067,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
>>       char id[256] = "";
>>       g_autofree char *path = NULL, *oid = NULL;
>>       struct vfio_region_info *info = NULL;
>> +    uint64_t mig_flags;
>>
>>       if (!vbasedev->ops->vfio_get_object) {
>>           return -EINVAL;
>> @@ -808,32 +1078,48 @@ static int vfio_migration_init(VFIODevice *vbasedev)
>>           return -EINVAL;
>>       }
>>
>> -    ret = vfio_get_dev_region_info(vbasedev,
>> -                                   VFIO_REGION_TYPE_MIGRATION_DEPRECATED,
>> -                                   VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED,
>> -                                   &info);
>> -    if (ret) {
>> -        return ret;
>> -    }
>> +    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
>> +    if (!ret) {
>> +        /* Migration v2 */
>> +        /* Basic migration functionality must be supported */
>> +        if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
>> +            return -EOPNOTSUPP;
>> +        }
>> +        vbasedev->migration = g_new0(VFIOMigration, 1);
>> +        vbasedev->migration->data_buffer_size = VFIO_MIG_DATA_BUFFER_SIZE;
>> +        vbasedev->migration->data_buffer =
>> +            g_malloc0(vbasedev->migration->data_buffer_size);
>> +        vbasedev->migration->data_fd = -1;
>> +        vbasedev->migration->v2 = true;
>> +    } else {
>> +        /* Migration v1 */
>> +        ret = vfio_get_dev_region_info(vbasedev,
>> +                                       VFIO_REGION_TYPE_MIGRATION_DEPRECATED,
>> +                                       VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED,
>> +                                       &info);
>> +        if (ret) {
>> +            return ret;
>> +        }
>>
>> -    vbasedev->migration = g_new0(VFIOMigration, 1);
>> +        vbasedev->migration = g_new0(VFIOMigration, 1);
>>
>> -    ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
>> -                            info->index, "migration");
>> -    if (ret) {
>> -        error_report("%s: Failed to setup VFIO migration region %d: %s",
>> -                     vbasedev->name, info->index, strerror(-ret));
>> -        goto err;
>> -    }
>> +        ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
>> +                                info->index, "migration");
>> +        if (ret) {
>> +            error_report("%s: Failed to setup VFIO migration region %d: %s",
>> +                         vbasedev->name, info->index, strerror(-ret));
>> +            goto err;
>> +        }
>>
>> -    if (!vbasedev->migration->region.size) {
>> -        error_report("%s: Invalid zero-sized VFIO migration region %d",
>> -                     vbasedev->name, info->index);
>> -        ret = -EINVAL;
>> -        goto err;
>> -    }
>> +        if (!vbasedev->migration->region.size) {
>> +            error_report("%s: Invalid zero-sized VFIO migration region %d",
>> +                         vbasedev->name, info->index);
>> +            ret = -EINVAL;
>> +            goto err;
>> +        }
>>
>> -    g_free(info);
>> +        g_free(info);
>> +    }
>>
>>       migration = vbasedev->migration;
>>       migration->vbasedev = vbasedev;
>> @@ -846,11 +1132,20 @@ static int vfio_migration_init(VFIODevice *vbasedev)
>>       }
>>       strpadcpy(id, sizeof(id), path, '\0');
>>
>> -    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
>> -                         &savevm_vfio_v1_handlers, vbasedev);
>> +    if (migration->v2) {
>> +        register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
>> +                             &savevm_vfio_handlers, vbasedev);
>> +
>> +        migration->vm_state = qdev_add_vm_change_state_handler(
>> +            vbasedev->dev, vfio_vmstate_change, vbasedev);
>> +    } else {
>> +        register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
>> +                             &savevm_vfio_v1_handlers, vbasedev);
>> +
>> +        migration->vm_state = qdev_add_vm_change_state_handler(
>> +            vbasedev->dev, vfio_v1_vmstate_change, vbasedev);
>> +    }
>>
>> -    migration->vm_state = qdev_add_vm_change_state_handler(
>> -        vbasedev->dev, vfio_v1_vmstate_change, vbasedev);
>>       migration->migration_state.notify = vfio_migration_state_notifier;
>>       add_migration_state_change_notifier(&migration->migration_state);
>>       return 0;
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index ac8b04f52a..6e8c5958b9 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -163,6 +163,8 @@ vfio_save_complete_precopy(const char *name) " (%s)"
>>   vfio_load_device_config_state(const char *name) " (%s)"
>>   vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
>>   vfio_v1_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
>> +vfio_load_state_device_data(const char *name, uint64_t data_size) " (%s) size 0x%"PRIx64
>>   vfio_load_cleanup(const char *name) " (%s)"
>>   vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
>>   vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index bbaf72ba00..2ec3346fea 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -66,6 +66,11 @@ typedef struct VFIOMigration {
>>       int vm_running;
>>       Notifier migration_state;
>>       uint64_t pending_bytes;
>> +    enum vfio_device_mig_state device_state;
>> +    int data_fd;
>> +    void *data_buffer;
>> +    size_t data_buffer_size;
>> +    bool v2;
>>   } VFIOMigration;
>>
>>   typedef struct VFIOAddressSpace {
Joao Martins June 14, 2022, 5:24 p.m. UTC | #3
On 6/14/22 17:34, Avihai Horon wrote:
> 
> On 6/14/2022 2:08 PM, Joao Martins wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 5/30/22 18:07, Avihai Horon wrote:
>>> +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>> +{
>>> +    VFIODevice *vbasedev = opaque;
>>> +    enum vfio_device_mig_state recover_state;
>>> +    int ret;
>>> +
>>> +    /* We reach here with device state STOP or STOP_COPY only */
>>> +    recover_state = VFIO_DEVICE_STATE_STOP;
>>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
>>> +                                   recover_state);
>>> +    if (ret) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    do {
>>> +        ret = vfio_save_block(f, vbasedev->migration);
>>> +        if (ret < 0) {
>>> +            return ret;
>>> +        }
>>> +    } while (!ret);
>>> +
>>> +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>>> +    ret = qemu_file_get_error(f);
>>> +    if (ret) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
>>> +                                   recover_state);
>> Is it expected that you are setting VFIO_DEVICE_STATE_STOP while
>> @recover_state is the same value (VFIO_DEVICE_STATE_STOP) ?
> 
> 
> Yes.
> Transitioning to any other state from STOP_COPY will first go through 
> STOP state (this is done internally by kernel).
> So there is no better option for the recover state but STOP.
> 
I was think about ERROR state given that you can transition there
from any state, but wasn't quite sure if it's appropriate to make that arc
while in stop copy migration phase.

>>> +    if (ret) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    trace_vfio_save_complete_precopy(vbasedev->name);
>>> +
>>> +    return 0;

just a cosmetic nit: you could probably rewrite these last couple of lines as:

	if (!ret) {
	    trace_vfio_save_complete_precopy(vbasedev->name);
	}

	return ret;

Let's you avoid the double return path.

>>> +}
>>> +
>>>   static int vfio_v1_save_complete_precopy(QEMUFile *f, void *opaque)
>>>   {
>>>       VFIODevice *vbasedev = opaque;
>>> @@ -593,6 +775,14 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
>>>       }
>>>   }
>>>
>>> +static int vfio_load_setup(QEMUFile *f, void *opaque)
>>> +{
>>> +    VFIODevice *vbasedev = opaque;
>>> +
>>> +    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
>>> +                                   vbasedev->migration->device_state);
>>> +}
>>> +
>>>   static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>>>   {
>>>       VFIODevice *vbasedev = opaque;
>>> @@ -620,6 +810,15 @@ static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>>>       return ret;
>>>   }
>>>
>>> +static int vfio_load_cleanup(void *opaque)
>>> +{
>>> +    VFIODevice *vbasedev = opaque;
>>> +
>>> +    vfio_migration_cleanup(vbasedev);
>>> +    trace_vfio_load_cleanup(vbasedev->name);
>>> +    return 0;
>>> +}
>>> +
>>>   static int vfio_v1_load_cleanup(void *opaque)
>>>   {
>>>       VFIODevice *vbasedev = opaque;
>>> @@ -662,7 +861,11 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>>               uint64_t data_size = qemu_get_be64(f);
>>>
>>>               if (data_size) {
>>> -                ret = vfio_v1_load_buffer(f, vbasedev, data_size);
>>> +                if (vbasedev->migration->v2) {
>>> +                    ret = vfio_load_buffer(f, vbasedev, data_size);
>>> +                } else {
>>> +                    ret = vfio_v1_load_buffer(f, vbasedev, data_size);
>>> +                }
>>>                   if (ret < 0) {
>>>                       return ret;
>>>                   }
>>> @@ -683,6 +886,16 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>>       return ret;
>>>   }
>>>
>>> +static SaveVMHandlers savevm_vfio_handlers = {
>>> +    .save_setup = vfio_save_setup,
>>> +    .save_cleanup = vfio_save_cleanup,
>>> +    .save_live_complete_precopy = vfio_save_complete_precopy,
>>> +    .save_state = vfio_save_state,
>>> +    .load_setup = vfio_load_setup,
>>> +    .load_cleanup = vfio_load_cleanup,
>>> +    .load_state = vfio_load_state,
>>> +};
>>> +
>>>   static SaveVMHandlers savevm_vfio_v1_handlers = {
>>>       .save_setup = vfio_v1_save_setup,
>>>       .save_cleanup = vfio_v1_save_cleanup,
>>> @@ -697,6 +910,34 @@ static SaveVMHandlers savevm_vfio_v1_handlers = {
>>>
>>>   /* ---------------------------------------------------------------------- */
>>>
>>> +static void vfio_vmstate_change(void *opaque, bool running, RunState state)
>>> +{
>>> +    VFIODevice *vbasedev = opaque;
>>> +    enum vfio_device_mig_state new_state;
>>> +    int ret;
>>> +
>>> +    if (running) {
>>> +        new_state = VFIO_DEVICE_STATE_RUNNING;
>>> +    } else {
>>> +        new_state = VFIO_DEVICE_STATE_STOP;
>>> +    }
>>> +
>>> +    ret = vfio_migration_set_state(vbasedev, new_state,
>>> +                                   VFIO_DEVICE_STATE_ERROR);
>>> +    if (ret) {
>>> +        /*
>>> +         * Migration should be aborted in this case, but vm_state_notify()
>>> +         * currently does not support reporting failures.
>>> +         */
>>> +        if (migrate_get_current()->to_dst_file) {
>>> +            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
>>> +        }
>>> +    }
>>> +
>>> +    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
>>> +                              new_state);
>>> +}
>>> +
>>>   static void vfio_v1_vmstate_change(void *opaque, bool running, RunState state)
>>>   {
>>>       VFIODevice *vbasedev = opaque;
>>> @@ -770,12 +1011,17 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
>>>       case MIGRATION_STATUS_CANCELLED:
>>>       case MIGRATION_STATUS_FAILED:
>>>           bytes_transferred = 0;
>>> -        ret = vfio_migration_v1_set_state(vbasedev,
>>> -                                          ~(VFIO_DEVICE_STATE_V1_SAVING |
>>> -                                            VFIO_DEVICE_STATE_V1_RESUMING),
>>> -                                          VFIO_DEVICE_STATE_V1_RUNNING);
>>> -        if (ret) {
>>> -            error_report("%s: Failed to set state RUNNING", vbasedev->name);
>>> +        if (migration->v2) {
>>> +            vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>>> +                                     VFIO_DEVICE_STATE_ERROR);
>> Perhaps you are discarding the error?
>>
>> Shouldn't it be:
>>
>>          err =  vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>>                                          VFIO_DEVICE_STATE_ERROR);
>>
>>> +        } else {
>>> +            ret = vfio_migration_v1_set_state(vbasedev,
>>> +                                              ~(VFIO_DEVICE_STATE_V1_SAVING |
>>> +                                                VFIO_DEVICE_STATE_V1_RESUMING),
>>> +                                              VFIO_DEVICE_STATE_V1_RUNNING);
>>> +            if (ret) {
>>> +                error_report("%s: Failed to set state RUNNING", vbasedev->name);
>>> +            }
>> Perhaps this error_report and condition is in the wrong scope?
>>
>> Shouldn't it be more like this:
>>
>> if (migration->v2) {
>>          ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>>                                   VFIO_DEVICE_STATE_ERROR);
>> } else {
>>          ret = vfio_migration_v1_set_state(vbasedev,
>>                                            ~(VFIO_DEVICE_STATE_V1_SAVING |
>>                                              VFIO_DEVICE_STATE_V1_RESUMING),
>>                                            VFIO_DEVICE_STATE_V1_RUNNING);
>> }
>>
>>
>> if (ret) {
>>      error_report("%s: Failed to set state RUNNING", vbasedev->name);
>> }
> 
> 
> It was intentionally discarded.
> The return value is used by v1 code to determine whether to print an 
> error message or not.
> In v2 code the error message print is done inside 
> vfio_migration_set_state(), so there is no
> need for the return value here.
> 
Oh yes, I forgot that other print.
Avihai Horon June 15, 2022, 6:40 a.m. UTC | #4
On 6/14/2022 8:24 PM, Joao Martins wrote:
> External email: Use caution opening links or attachments
>
>
> On 6/14/22 17:34, Avihai Horon wrote:
>> On 6/14/2022 2:08 PM, Joao Martins wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> On 5/30/22 18:07, Avihai Horon wrote:
>>>> +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>>>> +{
>>>> +    VFIODevice *vbasedev = opaque;
>>>> +    enum vfio_device_mig_state recover_state;
>>>> +    int ret;
>>>> +
>>>> +    /* We reach here with device state STOP or STOP_COPY only */
>>>> +    recover_state = VFIO_DEVICE_STATE_STOP;
>>>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
>>>> +                                   recover_state);
>>>> +    if (ret) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    do {
>>>> +        ret = vfio_save_block(f, vbasedev->migration);
>>>> +        if (ret < 0) {
>>>> +            return ret;
>>>> +        }
>>>> +    } while (!ret);
>>>> +
>>>> +    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>>>> +    ret = qemu_file_get_error(f);
>>>> +    if (ret) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
>>>> +                                   recover_state);
>>> Is it expected that you are setting VFIO_DEVICE_STATE_STOP while
>>> @recover_state is the same value (VFIO_DEVICE_STATE_STOP) ?
>>
>> Yes.
>> Transitioning to any other state from STOP_COPY will first go through
>> STOP state (this is done internally by kernel).
>> So there is no better option for the recover state but STOP.
>>
> I was think about ERROR state given that you can transition there
> from any state, but wasn't quite sure if it's appropriate to make that arc
> while in stop copy migration phase.

Moving to ERROR is possible but it will just fail, triggering a 
hw_error() (and with following patch triggering a device reset).
Failing to move to STOP recover state will go the same path - trigger a 
hw_error() and with following patch a device reset.

The only difference is that by moving to STOP recover state we try one 
more time to set it to STOP.
Probably it's a useless try, since we failed the first time.

We can change the recover state to ERROR if it makes the code clearer 
and avoids the extra try.


>>>> +    if (ret) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    trace_vfio_save_complete_precopy(vbasedev->name);
>>>> +
>>>> +    return 0;
> just a cosmetic nit: you could probably rewrite these last couple of lines as:
>
>          if (!ret) {
>              trace_vfio_save_complete_precopy(vbasedev->name);
>          }
>
>          return ret;
>
> Let's you avoid the double return path.

Ah thanks! Will change.

>>>> +}
>>>> +
>>>>    static int vfio_v1_save_complete_precopy(QEMUFile *f, void *opaque)
>>>>    {
>>>>        VFIODevice *vbasedev = opaque;
>>>> @@ -593,6 +775,14 @@ static void vfio_save_state(QEMUFile *f, void *opaque)
>>>>        }
>>>>    }
>>>>
>>>> +static int vfio_load_setup(QEMUFile *f, void *opaque)
>>>> +{
>>>> +    VFIODevice *vbasedev = opaque;
>>>> +
>>>> +    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
>>>> +                                   vbasedev->migration->device_state);
>>>> +}
>>>> +
>>>>    static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>>>>    {
>>>>        VFIODevice *vbasedev = opaque;
>>>> @@ -620,6 +810,15 @@ static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
>>>>        return ret;
>>>>    }
>>>>
>>>> +static int vfio_load_cleanup(void *opaque)
>>>> +{
>>>> +    VFIODevice *vbasedev = opaque;
>>>> +
>>>> +    vfio_migration_cleanup(vbasedev);
>>>> +    trace_vfio_load_cleanup(vbasedev->name);
>>>> +    return 0;
>>>> +}
>>>> +
>>>>    static int vfio_v1_load_cleanup(void *opaque)
>>>>    {
>>>>        VFIODevice *vbasedev = opaque;
>>>> @@ -662,7 +861,11 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>>>                uint64_t data_size = qemu_get_be64(f);
>>>>
>>>>                if (data_size) {
>>>> -                ret = vfio_v1_load_buffer(f, vbasedev, data_size);
>>>> +                if (vbasedev->migration->v2) {
>>>> +                    ret = vfio_load_buffer(f, vbasedev, data_size);
>>>> +                } else {
>>>> +                    ret = vfio_v1_load_buffer(f, vbasedev, data_size);
>>>> +                }
>>>>                    if (ret < 0) {
>>>>                        return ret;
>>>>                    }
>>>> @@ -683,6 +886,16 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
>>>>        return ret;
>>>>    }
>>>>
>>>> +static SaveVMHandlers savevm_vfio_handlers = {
>>>> +    .save_setup = vfio_save_setup,
>>>> +    .save_cleanup = vfio_save_cleanup,
>>>> +    .save_live_complete_precopy = vfio_save_complete_precopy,
>>>> +    .save_state = vfio_save_state,
>>>> +    .load_setup = vfio_load_setup,
>>>> +    .load_cleanup = vfio_load_cleanup,
>>>> +    .load_state = vfio_load_state,
>>>> +};
>>>> +
>>>>    static SaveVMHandlers savevm_vfio_v1_handlers = {
>>>>        .save_setup = vfio_v1_save_setup,
>>>>        .save_cleanup = vfio_v1_save_cleanup,
>>>> @@ -697,6 +910,34 @@ static SaveVMHandlers savevm_vfio_v1_handlers = {
>>>>
>>>>    /* ---------------------------------------------------------------------- */
>>>>
>>>> +static void vfio_vmstate_change(void *opaque, bool running, RunState state)
>>>> +{
>>>> +    VFIODevice *vbasedev = opaque;
>>>> +    enum vfio_device_mig_state new_state;
>>>> +    int ret;
>>>> +
>>>> +    if (running) {
>>>> +        new_state = VFIO_DEVICE_STATE_RUNNING;
>>>> +    } else {
>>>> +        new_state = VFIO_DEVICE_STATE_STOP;
>>>> +    }
>>>> +
>>>> +    ret = vfio_migration_set_state(vbasedev, new_state,
>>>> +                                   VFIO_DEVICE_STATE_ERROR);
>>>> +    if (ret) {
>>>> +        /*
>>>> +         * Migration should be aborted in this case, but vm_state_notify()
>>>> +         * currently does not support reporting failures.
>>>> +         */
>>>> +        if (migrate_get_current()->to_dst_file) {
>>>> +            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
>>>> +                              new_state);
>>>> +}
>>>> +
>>>>    static void vfio_v1_vmstate_change(void *opaque, bool running, RunState state)
>>>>    {
>>>>        VFIODevice *vbasedev = opaque;
>>>> @@ -770,12 +1011,17 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
>>>>        case MIGRATION_STATUS_CANCELLED:
>>>>        case MIGRATION_STATUS_FAILED:
>>>>            bytes_transferred = 0;
>>>> -        ret = vfio_migration_v1_set_state(vbasedev,
>>>> -                                          ~(VFIO_DEVICE_STATE_V1_SAVING |
>>>> -                                            VFIO_DEVICE_STATE_V1_RESUMING),
>>>> -                                          VFIO_DEVICE_STATE_V1_RUNNING);
>>>> -        if (ret) {
>>>> -            error_report("%s: Failed to set state RUNNING", vbasedev->name);
>>>> +        if (migration->v2) {
>>>> +            vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>>>> +                                     VFIO_DEVICE_STATE_ERROR);
>>> Perhaps you are discarding the error?
>>>
>>> Shouldn't it be:
>>>
>>>           err =  vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>>>                                           VFIO_DEVICE_STATE_ERROR);
>>>
>>>> +        } else {
>>>> +            ret = vfio_migration_v1_set_state(vbasedev,
>>>> +                                              ~(VFIO_DEVICE_STATE_V1_SAVING |
>>>> +                                                VFIO_DEVICE_STATE_V1_RESUMING),
>>>> +                                              VFIO_DEVICE_STATE_V1_RUNNING);
>>>> +            if (ret) {
>>>> +                error_report("%s: Failed to set state RUNNING", vbasedev->name);
>>>> +            }
>>> Perhaps this error_report and condition is in the wrong scope?
>>>
>>> Shouldn't it be more like this:
>>>
>>> if (migration->v2) {
>>>           ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
>>>                                    VFIO_DEVICE_STATE_ERROR);
>>> } else {
>>>           ret = vfio_migration_v1_set_state(vbasedev,
>>>                                             ~(VFIO_DEVICE_STATE_V1_SAVING |
>>>                                               VFIO_DEVICE_STATE_V1_RESUMING),
>>>                                             VFIO_DEVICE_STATE_V1_RUNNING);
>>> }
>>>
>>>
>>> if (ret) {
>>>       error_report("%s: Failed to set state RUNNING", vbasedev->name);
>>> }
>>
>> It was intentionally discarded.
>> The return value is used by v1 code to determine whether to print an
>> error message or not.
>> In v2 code the error message print is done inside
>> vfio_migration_set_state(), so there is no
>> need for the return value here.
>>
> Oh yes, I forgot that other print.
Jason Gunthorpe July 18, 2022, 3:12 p.m. UTC | #5
On Mon, May 30, 2022 at 08:07:35PM +0300, Avihai Horon wrote:

> +/* Returns 1 if end-of-stream is reached, 0 if more data and -1 if error */
> +static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
> +{
> +    ssize_t data_size;
> +
> +    data_size = read(migration->data_fd, migration->data_buffer,
> +                     migration->data_buffer_size);
> +    if (data_size < 0) {
> +        return -1;
> +    }
> +    if (data_size == 0) {
> +        return 1;
> +    }
> +
> +    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
> +    qemu_put_be64(f, data_size);
> +    qemu_put_buffer_async(f, migration->data_buffer, data_size, false);
> +    qemu_fflush(f);
> +    bytes_transferred += data_size;
> +
> +    trace_vfio_save_block(migration->vbasedev->name, data_size);
> +
> +    return qemu_file_get_error(f);
> +}

We looked at this from an eye to "how much data is transfered" per
callback.

The above function is the basic data mover, and
'migration->data_buffer_size' is set to 1MB at the moment.

So, we product up to 1MB VFIO_MIG_FLAG_DEV_DATA_STATE sections.

This series does not include the precopy support, but that will
include a precopy 'save_live_iterate' function like this:

static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = vfio_save_block(f, migration);
    if (ret < 0) {
        return ret;
    }
    if (ret == 1) {
        return 1;
    }
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    return 0;
}

Thus, during precopy this will never do more than 1MB per callback.

> +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
> +{
> +    VFIODevice *vbasedev = opaque;
> +    enum vfio_device_mig_state recover_state;
> +    int ret;
> +
> +    /* We reach here with device state STOP or STOP_COPY only */
> +    recover_state = VFIO_DEVICE_STATE_STOP;
> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
> +                                   recover_state);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    do {
> +        ret = vfio_save_block(f, vbasedev->migration);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    } while (!ret);

This seems to be the main problem where we chain together 1MB blocks
until the entire completed precopy data is completed. The above is
hooked to 'save_live_complete_precopy'

So, if we want to break the above up into some 'save_iterate' like
function, do you have some advice how to do it? The above do/while
must happen after the VFIO_DEVICE_STATE_STOP_COPY.

For mlx5 the above loop will often be ~10MB's for small VMs and
100MB's for big VMs (big meaning making extensive use of RDMA
functionality), and this will not change with pre-copy support or not.

Is it still a problem?

For other devices, like a GPU, I would imagine pre-copy support is
implemented and this will be a smaller post-precopy residual.

Jason
Avihai Horon July 27, 2022, 3:45 p.m. UTC | #6
On 7/18/2022 6:12 PM, Jason Gunthorpe wrote:
> On Mon, May 30, 2022 at 08:07:35PM +0300, Avihai Horon wrote:
>
>> +/* Returns 1 if end-of-stream is reached, 0 if more data and -1 if error */
>> +static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
>> +{
>> +    ssize_t data_size;
>> +
>> +    data_size = read(migration->data_fd, migration->data_buffer,
>> +                     migration->data_buffer_size);
>> +    if (data_size < 0) {
>> +        return -1;
>> +    }
>> +    if (data_size == 0) {
>> +        return 1;
>> +    }
>> +
>> +    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
>> +    qemu_put_be64(f, data_size);
>> +    qemu_put_buffer_async(f, migration->data_buffer, data_size, false);
>> +    qemu_fflush(f);
>> +    bytes_transferred += data_size;
>> +
>> +    trace_vfio_save_block(migration->vbasedev->name, data_size);
>> +
>> +    return qemu_file_get_error(f);
>> +}
> We looked at this from an eye to "how much data is transfered" per
> callback.
>
> The above function is the basic data mover, and
> 'migration->data_buffer_size' is set to 1MB at the moment.
>
> So, we product up to 1MB VFIO_MIG_FLAG_DEV_DATA_STATE sections.
>
> This series does not include the precopy support, but that will
> include a precopy 'save_live_iterate' function like this:
>
> static int vfio_save_iterate(QEMUFile *f, void *opaque)
> {
>      VFIODevice *vbasedev = opaque;
>      VFIOMigration *migration = vbasedev->migration;
>      int ret;
>
>      ret = vfio_save_block(f, migration);
>      if (ret < 0) {
>          return ret;
>      }
>      if (ret == 1) {
>          return 1;
>      }
>      qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>      return 0;
> }
>
> Thus, during precopy this will never do more than 1MB per callback.
>
>> +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
>> +{
>> +    VFIODevice *vbasedev = opaque;
>> +    enum vfio_device_mig_state recover_state;
>> +    int ret;
>> +
>> +    /* We reach here with device state STOP or STOP_COPY only */
>> +    recover_state = VFIO_DEVICE_STATE_STOP;
>> +    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
>> +                                   recover_state);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    do {
>> +        ret = vfio_save_block(f, vbasedev->migration);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    } while (!ret);
> This seems to be the main problem where we chain together 1MB blocks
> until the entire completed precopy data is completed. The above is
> hooked to 'save_live_complete_precopy'
>
> So, if we want to break the above up into some 'save_iterate' like
> function, do you have some advice how to do it? The above do/while
> must happen after the VFIO_DEVICE_STATE_STOP_COPY.

Ping.

Juan, AFAIU (and correct me if I am wrong) the problem on source side is 
that save_live_complete_precopy handlers are called with iothread 
locked, so during this time QEMU is non-responsive.
On destination side, we don't yield every now and then like RAM code 
does, so QEMU is non-responsive there as well.

Is it possible to solve this problem by letting the VFIO 
save_live_complete_precopy handler run outside the iothread lock?

For example, add a function to SaveVMHandlers that indicates whether 
this specific save_live_complete_precopy handler should run 
inside/outside iothread lock?
Or add a save_live_complete_precopy_nonblocking handler that runs 
outside the iothread lock?

On destination side, since VFIO data is sent in chunks of 1MB, we can 
yield every now and then.

What do you think?

Thanks.

> For mlx5 the above loop will often be ~10MB's for small VMs and
> 100MB's for big VMs (big meaning making extensive use of RDMA
> functionality), and this will not change with pre-copy support or not.
>
> Is it still a problem?
>
> For other devices, like a GPU, I would imagine pre-copy support is
> implemented and this will be a smaller post-precopy residual.
>
> Jason
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index a3dd8221ed..5541133ec9 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -355,10 +355,18 @@  static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
                 return false;
             }
 
-            if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
+            if (!migration->v2 &&
+                (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
                 (migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING)) {
                 return false;
             }
+
+            if (migration->v2 &&
+                (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
+                (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+                 migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+                return false;
+            }
         }
     }
     return true;
@@ -385,7 +393,14 @@  static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
                 return false;
             }
 
-            if (migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING) {
+            if (!migration->v2 &&
+                migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING) {
+                continue;
+            }
+
+            if (migration->v2 &&
+                (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+                 migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P)) {
                 continue;
             } else {
                 return false;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index e40aa0ad80..de68eadb09 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -44,8 +44,83 @@ 
 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
 #define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
 
+#define VFIO_MIG_DATA_BUFFER_SIZE (1024 * 1024)
+
 static int64_t bytes_transferred;
 
+static const char *mig_state_to_str(enum vfio_device_mig_state state)
+{
+    switch (state) {
+    case VFIO_DEVICE_STATE_ERROR:
+        return "ERROR";
+    case VFIO_DEVICE_STATE_STOP:
+        return "STOP";
+    case VFIO_DEVICE_STATE_RUNNING:
+        return "RUNNING";
+    case VFIO_DEVICE_STATE_STOP_COPY:
+        return "STOP_COPY";
+    case VFIO_DEVICE_STATE_RESUMING:
+        return "RESUMING";
+    case VFIO_DEVICE_STATE_RUNNING_P2P:
+        return "RUNNING_P2P";
+    default:
+        return "UNKNOWN STATE";
+    }
+}
+
+static int vfio_migration_set_state(VFIODevice *vbasedev,
+                                    enum vfio_device_mig_state new_state,
+                                    enum vfio_device_mig_state recover_state)
+{
+    VFIOMigration *migration = vbasedev->migration;
+    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+                              sizeof(struct vfio_device_feature_mig_state),
+                              sizeof(uint64_t))] = {};
+    struct vfio_device_feature *feature = (void *)buf;
+    struct vfio_device_feature_mig_state *mig_state = (void *)feature->data;
+    int ret;
+
+    feature->argsz = sizeof(buf);
+    feature->flags =
+        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
+    mig_state->device_state = new_state;
+    ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+    if (ret) {
+        /* Try to put the device in some good state */
+        mig_state->device_state = recover_state;
+        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+            hw_error("%s: Device in error state, can't recover",
+                     vbasedev->name);
+        }
+
+        error_report("%s: Failed changing device state to %s", vbasedev->name,
+                     mig_state_to_str(new_state));
+        migration->device_state = recover_state;
+
+        return -1;
+    }
+
+    if (mig_state->data_fd != -1) {
+        if (migration->data_fd != -1) {
+            /*
+             * This can happen if the device is asynchronously reset and
+             * terminates a data transfer.
+             */
+            error_report("%s: data_fd out of sync", vbasedev->name);
+            close(mig_state->data_fd);
+
+            return -1;
+        }
+
+        migration->data_fd = mig_state->data_fd;
+    }
+    migration->device_state = new_state;
+
+    trace_vfio_migration_set_state(vbasedev->name, new_state);
+
+    return 0;
+}
+
 static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,
                                   off_t off, bool iswrite)
 {
@@ -260,6 +335,22 @@  static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size)
     return ret;
 }
 
+static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
+                            uint64_t data_size)
+{
+    VFIOMigration *migration = vbasedev->migration;
+    int ret;
+
+    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
+    if (ret) {
+        return ret;
+    }
+
+    trace_vfio_load_state_device_data(vbasedev->name, data_size);
+
+    return 0;
+}
+
 static int vfio_v1_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                                uint64_t data_size)
 {
@@ -394,6 +485,14 @@  static int vfio_load_device_config_state(QEMUFile *f, void *opaque)
     return qemu_file_get_error(f);
 }
 
+static void vfio_migration_cleanup(VFIODevice *vbasedev)
+{
+    VFIOMigration *migration = vbasedev->migration;
+
+    close(migration->data_fd);
+    migration->data_fd = -1;
+}
+
 static void vfio_migration_v1_cleanup(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
@@ -405,6 +504,18 @@  static void vfio_migration_v1_cleanup(VFIODevice *vbasedev)
 
 /* ---------------------------------------------------------------------- */
 
+static int vfio_save_setup(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    trace_vfio_save_setup(vbasedev->name);
+
+    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);
+    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+
+    return qemu_file_get_error(f);
+}
+
 static int vfio_v1_save_setup(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
@@ -448,6 +559,14 @@  static int vfio_v1_save_setup(QEMUFile *f, void *opaque)
     return 0;
 }
 
+static void vfio_save_cleanup(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    vfio_migration_cleanup(vbasedev);
+    trace_vfio_save_cleanup(vbasedev->name);
+}
+
 static void vfio_v1_save_cleanup(void *opaque)
 {
     VFIODevice *vbasedev = opaque;
@@ -524,6 +643,69 @@  static int vfio_save_iterate(QEMUFile *f, void *opaque)
     return 0;
 }
 
+/* Returns 1 if end-of-stream is reached, 0 if more data and -1 if error */
+static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
+{
+    ssize_t data_size;
+
+    data_size = read(migration->data_fd, migration->data_buffer,
+                     migration->data_buffer_size);
+    if (data_size < 0) {
+        return -1;
+    }
+    if (data_size == 0) {
+        return 1;
+    }
+
+    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
+    qemu_put_be64(f, data_size);
+    qemu_put_buffer_async(f, migration->data_buffer, data_size, false);
+    qemu_fflush(f);
+    bytes_transferred += data_size;
+
+    trace_vfio_save_block(migration->vbasedev->name, data_size);
+
+    return qemu_file_get_error(f);
+}
+
+static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    enum vfio_device_mig_state recover_state;
+    int ret;
+
+    /* We reach here with device state STOP or STOP_COPY only */
+    recover_state = VFIO_DEVICE_STATE_STOP;
+    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
+                                   recover_state);
+    if (ret) {
+        return ret;
+    }
+
+    do {
+        ret = vfio_save_block(f, vbasedev->migration);
+        if (ret < 0) {
+            return ret;
+        }
+    } while (!ret);
+
+    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+    ret = qemu_file_get_error(f);
+    if (ret) {
+        return ret;
+    }
+
+    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP,
+                                   recover_state);
+    if (ret) {
+        return ret;
+    }
+
+    trace_vfio_save_complete_precopy(vbasedev->name);
+
+    return 0;
+}
+
 static int vfio_v1_save_complete_precopy(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
@@ -593,6 +775,14 @@  static void vfio_save_state(QEMUFile *f, void *opaque)
     }
 }
 
+static int vfio_load_setup(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    return vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
+                                   vbasedev->migration->device_state);
+}
+
 static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
@@ -620,6 +810,15 @@  static int vfio_v1_load_setup(QEMUFile *f, void *opaque)
     return ret;
 }
 
+static int vfio_load_cleanup(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    vfio_migration_cleanup(vbasedev);
+    trace_vfio_load_cleanup(vbasedev->name);
+    return 0;
+}
+
 static int vfio_v1_load_cleanup(void *opaque)
 {
     VFIODevice *vbasedev = opaque;
@@ -662,7 +861,11 @@  static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
             uint64_t data_size = qemu_get_be64(f);
 
             if (data_size) {
-                ret = vfio_v1_load_buffer(f, vbasedev, data_size);
+                if (vbasedev->migration->v2) {
+                    ret = vfio_load_buffer(f, vbasedev, data_size);
+                } else {
+                    ret = vfio_v1_load_buffer(f, vbasedev, data_size);
+                }
                 if (ret < 0) {
                     return ret;
                 }
@@ -683,6 +886,16 @@  static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
     return ret;
 }
 
+static SaveVMHandlers savevm_vfio_handlers = {
+    .save_setup = vfio_save_setup,
+    .save_cleanup = vfio_save_cleanup,
+    .save_live_complete_precopy = vfio_save_complete_precopy,
+    .save_state = vfio_save_state,
+    .load_setup = vfio_load_setup,
+    .load_cleanup = vfio_load_cleanup,
+    .load_state = vfio_load_state,
+};
+
 static SaveVMHandlers savevm_vfio_v1_handlers = {
     .save_setup = vfio_v1_save_setup,
     .save_cleanup = vfio_v1_save_cleanup,
@@ -697,6 +910,34 @@  static SaveVMHandlers savevm_vfio_v1_handlers = {
 
 /* ---------------------------------------------------------------------- */
 
+static void vfio_vmstate_change(void *opaque, bool running, RunState state)
+{
+    VFIODevice *vbasedev = opaque;
+    enum vfio_device_mig_state new_state;
+    int ret;
+
+    if (running) {
+        new_state = VFIO_DEVICE_STATE_RUNNING;
+    } else {
+        new_state = VFIO_DEVICE_STATE_STOP;
+    }
+
+    ret = vfio_migration_set_state(vbasedev, new_state,
+                                   VFIO_DEVICE_STATE_ERROR);
+    if (ret) {
+        /*
+         * Migration should be aborted in this case, but vm_state_notify()
+         * currently does not support reporting failures.
+         */
+        if (migrate_get_current()->to_dst_file) {
+            qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
+        }
+    }
+
+    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
+                              new_state);
+}
+
 static void vfio_v1_vmstate_change(void *opaque, bool running, RunState state)
 {
     VFIODevice *vbasedev = opaque;
@@ -770,12 +1011,17 @@  static void vfio_migration_state_notifier(Notifier *notifier, void *data)
     case MIGRATION_STATUS_CANCELLED:
     case MIGRATION_STATUS_FAILED:
         bytes_transferred = 0;
-        ret = vfio_migration_v1_set_state(vbasedev,
-                                          ~(VFIO_DEVICE_STATE_V1_SAVING |
-                                            VFIO_DEVICE_STATE_V1_RESUMING),
-                                          VFIO_DEVICE_STATE_V1_RUNNING);
-        if (ret) {
-            error_report("%s: Failed to set state RUNNING", vbasedev->name);
+        if (migration->v2) {
+            vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING,
+                                     VFIO_DEVICE_STATE_ERROR);
+        } else {
+            ret = vfio_migration_v1_set_state(vbasedev,
+                                              ~(VFIO_DEVICE_STATE_V1_SAVING |
+                                                VFIO_DEVICE_STATE_V1_RESUMING),
+                                              VFIO_DEVICE_STATE_V1_RUNNING);
+            if (ret) {
+                error_report("%s: Failed to set state RUNNING", vbasedev->name);
+            }
         }
     }
 }
@@ -784,12 +1030,35 @@  static void vfio_migration_exit(VFIODevice *vbasedev)
 {
     VFIOMigration *migration = vbasedev->migration;
 
-    vfio_region_exit(&migration->region);
-    vfio_region_finalize(&migration->region);
+    if (migration->v2) {
+        g_free(migration->data_buffer);
+    } else {
+        vfio_region_exit(&migration->region);
+        vfio_region_finalize(&migration->region);
+    }
     g_free(vbasedev->migration);
     vbasedev->migration = NULL;
 }
 
+static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
+{
+    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
+                                  sizeof(struct vfio_device_feature_migration),
+                              sizeof(uint64_t))] = {};
+    struct vfio_device_feature *feature = (void *)buf;
+    struct vfio_device_feature_migration *mig = (void *)feature->data;
+
+    feature->argsz = sizeof(buf);
+    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
+    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
+        return -EOPNOTSUPP;
+    }
+
+    *mig_flags = mig->flags;
+
+    return 0;
+}
+
 static int vfio_migration_init(VFIODevice *vbasedev)
 {
     int ret;
@@ -798,6 +1067,7 @@  static int vfio_migration_init(VFIODevice *vbasedev)
     char id[256] = "";
     g_autofree char *path = NULL, *oid = NULL;
     struct vfio_region_info *info = NULL;
+    uint64_t mig_flags;
 
     if (!vbasedev->ops->vfio_get_object) {
         return -EINVAL;
@@ -808,32 +1078,48 @@  static int vfio_migration_init(VFIODevice *vbasedev)
         return -EINVAL;
     }
 
-    ret = vfio_get_dev_region_info(vbasedev,
-                                   VFIO_REGION_TYPE_MIGRATION_DEPRECATED,
-                                   VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED,
-                                   &info);
-    if (ret) {
-        return ret;
-    }
+    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
+    if (!ret) {
+        /* Migration v2 */
+        /* Basic migration functionality must be supported */
+        if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
+            return -EOPNOTSUPP;
+        }
+        vbasedev->migration = g_new0(VFIOMigration, 1);
+        vbasedev->migration->data_buffer_size = VFIO_MIG_DATA_BUFFER_SIZE;
+        vbasedev->migration->data_buffer =
+            g_malloc0(vbasedev->migration->data_buffer_size);
+        vbasedev->migration->data_fd = -1;
+        vbasedev->migration->v2 = true;
+    } else {
+        /* Migration v1 */
+        ret = vfio_get_dev_region_info(vbasedev,
+                                       VFIO_REGION_TYPE_MIGRATION_DEPRECATED,
+                                       VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED,
+                                       &info);
+        if (ret) {
+            return ret;
+        }
 
-    vbasedev->migration = g_new0(VFIOMigration, 1);
+        vbasedev->migration = g_new0(VFIOMigration, 1);
 
-    ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
-                            info->index, "migration");
-    if (ret) {
-        error_report("%s: Failed to setup VFIO migration region %d: %s",
-                     vbasedev->name, info->index, strerror(-ret));
-        goto err;
-    }
+        ret = vfio_region_setup(obj, vbasedev, &vbasedev->migration->region,
+                                info->index, "migration");
+        if (ret) {
+            error_report("%s: Failed to setup VFIO migration region %d: %s",
+                         vbasedev->name, info->index, strerror(-ret));
+            goto err;
+        }
 
-    if (!vbasedev->migration->region.size) {
-        error_report("%s: Invalid zero-sized VFIO migration region %d",
-                     vbasedev->name, info->index);
-        ret = -EINVAL;
-        goto err;
-    }
+        if (!vbasedev->migration->region.size) {
+            error_report("%s: Invalid zero-sized VFIO migration region %d",
+                         vbasedev->name, info->index);
+            ret = -EINVAL;
+            goto err;
+        }
 
-    g_free(info);
+        g_free(info);
+    }
 
     migration = vbasedev->migration;
     migration->vbasedev = vbasedev;
@@ -846,11 +1132,20 @@  static int vfio_migration_init(VFIODevice *vbasedev)
     }
     strpadcpy(id, sizeof(id), path, '\0');
 
-    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
-                         &savevm_vfio_v1_handlers, vbasedev);
+    if (migration->v2) {
+        register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
+                             &savevm_vfio_handlers, vbasedev);
+
+        migration->vm_state = qdev_add_vm_change_state_handler(
+            vbasedev->dev, vfio_vmstate_change, vbasedev);
+    } else {
+        register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
+                             &savevm_vfio_v1_handlers, vbasedev);
+
+        migration->vm_state = qdev_add_vm_change_state_handler(
+            vbasedev->dev, vfio_v1_vmstate_change, vbasedev);
+    }
 
-    migration->vm_state = qdev_add_vm_change_state_handler(
-        vbasedev->dev, vfio_v1_vmstate_change, vbasedev);
     migration->migration_state.notify = vfio_migration_state_notifier;
     add_migration_state_change_notifier(&migration->migration_state);
     return 0;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index ac8b04f52a..6e8c5958b9 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -163,6 +163,8 @@  vfio_save_complete_precopy(const char *name) " (%s)"
 vfio_load_device_config_state(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_v1_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64
+vfio_load_state_device_data(const char *name, uint64_t data_size) " (%s) size 0x%"PRIx64
 vfio_load_cleanup(const char *name) " (%s)"
 vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
+vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bbaf72ba00..2ec3346fea 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -66,6 +66,11 @@  typedef struct VFIOMigration {
     int vm_running;
     Notifier migration_state;
     uint64_t pending_bytes;
+    enum vfio_device_mig_state device_state;
+    int data_fd;
+    void *data_buffer;
+    size_t data_buffer_size;
+    bool v2;
 } VFIOMigration;
 
 typedef struct VFIOAddressSpace {