diff mbox series

[V1,30/32] vfio-pci: save and restore

Message ID 1596122076-341293-31-git-send-email-steven.sistare@oracle.com (mailing list archive)
State New, archived
Headers show
Series Live Update | expand

Commit Message

Steven Sistare July 30, 2020, 3:14 p.m. UTC
Enable vfio-pci devices to be saved and restored across an exec restart
of qemu.

At vfio creation time, save the value of vfio container, group, and device
descriptors in the environment.

In cprsave, save the msi message area as part of vfio-pci vmstate, and
clear the close-on-exec flag for the vfio descriptors.  The flag is not
cleared earlier because the descriptors should not persist across misc
fork and exec calls that may be performed during normal operation.

On qemu restart, vfio_realize() finds the descriptor env vars, uses
the descriptors, and notes that the device is being reused.  Device and
iommu state is already configured, so operations in vfio_realize that
would modify the configuration are skipped for a reused device, including
vfio ioctl's and writes to PCI configuration space.  The result is that
vfio_realize constructs qemu data structures that reflect the current
state of the device.  However, the reconstruction is not complete until
cprload is called, and vfio_pci_post_load uses the msi data to rebuild
interrupt structures and attach the interrupts to the new KVM instance.
Lastly, vfio device reset is suppressed when the VM is started.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 hw/pci/pci.c                  |  4 ++
 hw/vfio/common.c              | 99 ++++++++++++++++++++++++++++++++++---------
 hw/vfio/pci.c                 | 79 ++++++++++++++++++++++++++++++++--
 hw/vfio/platform.c            |  2 +-
 include/hw/pci/pci.h          |  1 +
 include/hw/vfio/vfio-common.h |  4 +-
 migration/savevm.c            |  2 +-
 7 files changed, 163 insertions(+), 28 deletions(-)

Comments

Jason Zeng Aug. 6, 2020, 10:22 a.m. UTC | #1
Hi Steve,

On Thu, Jul 30, 2020 at 08:14:34AM -0700, Steve Sistare wrote:
> @@ -3182,6 +3207,51 @@ static Property vfio_pci_dev_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> +static int vfio_pci_post_load(void *opaque, int version_id)
> +{
> +    int vector;
> +    MSIMessage msg;
> +    Error *err = 0;
> +    VFIOPCIDevice *vdev = opaque;
> +    PCIDevice *pdev = &vdev->pdev;
> +
> +    if (msix_enabled(pdev)) {
> +        vfio_msix_enable(vdev);
> +        pdev->msix_function_masked = false;
> +
> +        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
> +            if (!msix_is_masked(pdev, vector)) {
> +                msg = msix_get_message(pdev, vector);
> +                vfio_msix_vector_use(pdev, vector, msg);
> +            }
> +        }

It looks to me MSIX re-init here may lose device IRQs and impact
device hardware state?

The re-init will cause the kernel vfio driver to connect the device
MSIX vectors to new eventfds and KVM instance. But before that, device
IRQs will be routed to previous eventfd. Looks these IRQs will be lost.

And the re-init will make the device go through the procedure of
disabling MSIX, enabling INTX, and re-enabling MSIX and vectors.
So if the device is active, its hardware state will be impacted?


Thanks,
Jason

> +
> +    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
> +        vfio_intx_enable(vdev, &err);
> +        if (err) {
> +            error_report_err(err);
> +        }
> +    }
> +
> +    vdev->vbasedev.group->container->reused = false;
> +    vdev->pdev.reused = false;
> +
> +    return 0;
> +}
> +
> +static const VMStateDescription vfio_pci_vmstate = {
> +    .name = "vfio-pci",
> +    .unmigratable = 1,
> +    .mode_mask = VMS_RESTART,
> +    .version_id = 0,
> +    .minimum_version_id = 0,
> +    .post_load = vfio_pci_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_MSIX(pdev, VFIOPCIDevice),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
> @@ -3189,6 +3259,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>  
>      dc->reset = vfio_pci_reset;
>      device_class_set_props(dc, vfio_pci_dev_properties);
> +    dc->vmsd = &vfio_pci_vmstate;
>      dc->desc = "VFIO-based PCI device assignment";
>      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>      pdc->realize = vfio_realize;
> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> index ac2cefc..e6e1a5d 100644
> --- a/hw/vfio/platform.c
> +++ b/hw/vfio/platform.c
> @@ -592,7 +592,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
>              return -EBUSY;
>          }
>      }
> -    ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
> +    ret = vfio_get_device(group, vbasedev->name, vbasedev, 0, errp);
>      if (ret) {
>          vfio_put_group(group);
>          return ret;
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index bd07c86..c926a24 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -358,6 +358,7 @@ struct PCIDevice {
>  
>      /* ID of standby device in net_failover pair */
>      char *failover_pair_id;
> +    bool reused;
>  };
>  
>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index c78f3ff..4e2a332 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -73,6 +73,8 @@ typedef struct VFIOContainer {
>      unsigned iommu_type;
>      Error *error;
>      bool initialized;
> +    bool reused;
> +    int cid;
>      unsigned long pgsizes;
>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
> @@ -177,7 +179,7 @@ void vfio_reset_handler(void *opaque);
>  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
>  void vfio_put_group(VFIOGroup *group);
>  int vfio_get_device(VFIOGroup *group, const char *name,
> -                    VFIODevice *vbasedev, Error **errp);
> +                    VFIODevice *vbasedev, bool *reused, Error **errp);
>  
>  extern const MemoryRegionOps vfio_region_ops;
>  typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 881dc13..2606cf0 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1568,7 +1568,7 @@ static int qemu_savevm_state(QEMUFile *f, VMStateMode mode, Error **errp)
>          return -EINVAL;
>      }
>  
> -    if (migrate_use_block()) {
> +    if ((mode & (VMS_SNAPSHOT | VMS_MIGRATE)) && migrate_use_block()) {
>          error_setg(errp, "Block migration and snapshots are incompatible");
>          return -EINVAL;
>      }
> -- 
> 1.8.3.1
> 
>
Steven Sistare Aug. 7, 2020, 8:38 p.m. UTC | #2
On 8/6/2020 6:22 AM, Jason Zeng wrote:
> Hi Steve,
> 
> On Thu, Jul 30, 2020 at 08:14:34AM -0700, Steve Sistare wrote:
>> @@ -3182,6 +3207,51 @@ static Property vfio_pci_dev_properties[] = {
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>  
>> +static int vfio_pci_post_load(void *opaque, int version_id)
>> +{
>> +    int vector;
>> +    MSIMessage msg;
>> +    Error *err = 0;
>> +    VFIOPCIDevice *vdev = opaque;
>> +    PCIDevice *pdev = &vdev->pdev;
>> +
>> +    if (msix_enabled(pdev)) {
>> +        vfio_msix_enable(vdev);
>> +        pdev->msix_function_masked = false;
>> +
>> +        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
>> +            if (!msix_is_masked(pdev, vector)) {
>> +                msg = msix_get_message(pdev, vector);
>> +                vfio_msix_vector_use(pdev, vector, msg);
>> +            }
>> +        }
> 
> It looks to me MSIX re-init here may lose device IRQs and impact
> device hardware state?
> 
> The re-init will cause the kernel vfio driver to connect the device
> MSIX vectors to new eventfds and KVM instance. But before that, device
> IRQs will be routed to previous eventfd. Looks these IRQs will be lost.

Thanks Jason, that sounds like a problem.  I could try reading and saving an 
event from eventfd before shutdown, and injecting it into the eventfd after
restart, but that would be racy unless I disable interrupts.  Or, unconditionally
inject a spurious interrupt after restart to kick it, in case an interrupt 
was lost.

Do you have any other ideas?

> And the re-init will make the device go through the procedure of
> disabling MSIX, enabling INTX, and re-enabling MSIX and vectors.
> So if the device is active, its hardware state will be impacted?

Again thanks.  vfio_msix_enable() does indeed call vfio_disable_interrupts().
For a quick experiment, I deleted that call for the post_load code path, and 
it seems to work fine, but I need to study it more.

- Steve
 
>> +
>> +    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
>> +        vfio_intx_enable(vdev, &err);
>> +        if (err) {
>> +            error_report_err(err);
>> +        }
>> +    }
>> +
>> +    vdev->vbasedev.group->container->reused = false;
>> +    vdev->pdev.reused = false;
>> +
>> +    return 0;
>> +}
>> +
>> +static const VMStateDescription vfio_pci_vmstate = {
>> +    .name = "vfio-pci",
>> +    .unmigratable = 1,
>> +    .mode_mask = VMS_RESTART,
>> +    .version_id = 0,
>> +    .minimum_version_id = 0,
>> +    .post_load = vfio_pci_post_load,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_MSIX(pdev, VFIOPCIDevice),
>> +        VMSTATE_END_OF_LIST()
>> +    }
>> +};
>> +
>>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>>  {
>>      DeviceClass *dc = DEVICE_CLASS(klass);
>> @@ -3189,6 +3259,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>>  
>>      dc->reset = vfio_pci_reset;
>>      device_class_set_props(dc, vfio_pci_dev_properties);
>> +    dc->vmsd = &vfio_pci_vmstate;
>>      dc->desc = "VFIO-based PCI device assignment";
>>      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>>      pdc->realize = vfio_realize;
>> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
>> index ac2cefc..e6e1a5d 100644
>> --- a/hw/vfio/platform.c
>> +++ b/hw/vfio/platform.c
>> @@ -592,7 +592,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
>>              return -EBUSY;
>>          }
>>      }
>> -    ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
>> +    ret = vfio_get_device(group, vbasedev->name, vbasedev, 0, errp);
>>      if (ret) {
>>          vfio_put_group(group);
>>          return ret;
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index bd07c86..c926a24 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -358,6 +358,7 @@ struct PCIDevice {
>>  
>>      /* ID of standby device in net_failover pair */
>>      char *failover_pair_id;
>> +    bool reused;
>>  };
>>  
>>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index c78f3ff..4e2a332 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -73,6 +73,8 @@ typedef struct VFIOContainer {
>>      unsigned iommu_type;
>>      Error *error;
>>      bool initialized;
>> +    bool reused;
>> +    int cid;
>>      unsigned long pgsizes;
>>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>> @@ -177,7 +179,7 @@ void vfio_reset_handler(void *opaque);
>>  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
>>  void vfio_put_group(VFIOGroup *group);
>>  int vfio_get_device(VFIOGroup *group, const char *name,
>> -                    VFIODevice *vbasedev, Error **errp);
>> +                    VFIODevice *vbasedev, bool *reused, Error **errp);
>>  
>>  extern const MemoryRegionOps vfio_region_ops;
>>  typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
>> diff --git a/migration/savevm.c b/migration/savevm.c
>> index 881dc13..2606cf0 100644
>> --- a/migration/savevm.c
>> +++ b/migration/savevm.c
>> @@ -1568,7 +1568,7 @@ static int qemu_savevm_state(QEMUFile *f, VMStateMode mode, Error **errp)
>>          return -EINVAL;
>>      }
>>  
>> -    if (migrate_use_block()) {
>> +    if ((mode & (VMS_SNAPSHOT | VMS_MIGRATE)) && migrate_use_block()) {
>>          error_setg(errp, "Block migration and snapshots are incompatible");
>>          return -EINVAL;
>>      }
>> -- 
>> 1.8.3.1
>>
>>
Jason Zeng Aug. 10, 2020, 3:50 a.m. UTC | #3
On Fri, Aug 07, 2020 at 04:38:12PM -0400, Steven Sistare wrote:
> On 8/6/2020 6:22 AM, Jason Zeng wrote:
> > Hi Steve,
> > 
> > On Thu, Jul 30, 2020 at 08:14:34AM -0700, Steve Sistare wrote:
> >> @@ -3182,6 +3207,51 @@ static Property vfio_pci_dev_properties[] = {
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>  
> >> +static int vfio_pci_post_load(void *opaque, int version_id)
> >> +{
> >> +    int vector;
> >> +    MSIMessage msg;
> >> +    Error *err = 0;
> >> +    VFIOPCIDevice *vdev = opaque;
> >> +    PCIDevice *pdev = &vdev->pdev;
> >> +
> >> +    if (msix_enabled(pdev)) {
> >> +        vfio_msix_enable(vdev);
> >> +        pdev->msix_function_masked = false;
> >> +
> >> +        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
> >> +            if (!msix_is_masked(pdev, vector)) {
> >> +                msg = msix_get_message(pdev, vector);
> >> +                vfio_msix_vector_use(pdev, vector, msg);
> >> +            }
> >> +        }
> > 
> > It looks to me MSIX re-init here may lose device IRQs and impact
> > device hardware state?
> > 
> > The re-init will cause the kernel vfio driver to connect the device
> > MSIX vectors to new eventfds and KVM instance. But before that, device
> > IRQs will be routed to previous eventfd. Looks these IRQs will be lost.
> 
> Thanks Jason, that sounds like a problem.  I could try reading and saving an 
> event from eventfd before shutdown, and injecting it into the eventfd after
> restart, but that would be racy unless I disable interrupts.  Or, unconditionally
> inject a spurious interrupt after restart to kick it, in case an interrupt 
> was lost.
> 
> Do you have any other ideas?

Maybe we can consider also handing over the eventfd file descriptor, or
even the KVM fds, to the new Qemu?

If the KVM fds can be preserved, we will just need to restore Qemu KVM
side states. But not sure how complicated the implementation would be.

If we only preserve the eventfd fd, we can attach the old eventfd to
vfio devices. But it looks like it may turn out that we always inject an interrupt
unconditionally, because kernel KVM irqfd eventfd handling is a bit
different than normal user land eventfd read/write. It doesn't decrease
the counter in the eventfd context. So if we read the eventfd from new
Qemu, it looks will always have a non-zero counter, which requires an
interrupt injection.

> 
> > And the re-init will make the device go through the procedure of
> > disabling MSIX, enabling INTX, and re-enabling MSIX and vectors.
> > So if the device is active, its hardware state will be impacted?
> 
> Again thanks.  vfio_msix_enable() does indeed call vfio_disable_interrupts().
> For a quick experiment, I deleted that call in for the post_load code path, and 
> it seems to work fine, but I need to study it more.

vfio_msix_vector_use() will also trigger this procedure in the kernel.

Looks like we shouldn't trigger any kernel vfio actions here? Because we
preserve vfio fds, so its kernel state shouldn't be touched. Here we
may only need to restore Qemu states. Re-connect to KVM instance should
be done automatically when we setup the KVM irqfds with the same eventfd.

BTW, if I remember correctly, it is not enough to only save MSIX state
in the snapshot. We should also save the Qemu side pci config space
cache to the snapshot, because Qemu's copy is not exactly the same as
the kernel's copy. I encountered this before, but I don't remember which
field it was.

And another question, why don't we support MSI? I see the code only
handles MSIX?

Thanks,
Jason


> 
> - Steve
>  
> >> +
> >> +    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
> >> +        vfio_intx_enable(vdev, &err);
> >> +        if (err) {
> >> +            error_report_err(err);
> >> +        }
> >> +    }
> >> +
> >> +    vdev->vbasedev.group->container->reused = false;
> >> +    vdev->pdev.reused = false;
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static const VMStateDescription vfio_pci_vmstate = {
> >> +    .name = "vfio-pci",
> >> +    .unmigratable = 1,
> >> +    .mode_mask = VMS_RESTART,
> >> +    .version_id = 0,
> >> +    .minimum_version_id = 0,
> >> +    .post_load = vfio_pci_post_load,
> >> +    .fields = (VMStateField[]) {
> >> +        VMSTATE_MSIX(pdev, VFIOPCIDevice),
> >> +        VMSTATE_END_OF_LIST()
> >> +    }
> >> +};
> >> +
> >>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> >>  {
> >>      DeviceClass *dc = DEVICE_CLASS(klass);
> >> @@ -3189,6 +3259,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> >>  
> >>      dc->reset = vfio_pci_reset;
> >>      device_class_set_props(dc, vfio_pci_dev_properties);
> >> +    dc->vmsd = &vfio_pci_vmstate;
> >>      dc->desc = "VFIO-based PCI device assignment";
> >>      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> >>      pdc->realize = vfio_realize;
> >> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> >> index ac2cefc..e6e1a5d 100644
> >> --- a/hw/vfio/platform.c
> >> +++ b/hw/vfio/platform.c
> >> @@ -592,7 +592,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
> >>              return -EBUSY;
> >>          }
> >>      }
> >> -    ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
> >> +    ret = vfio_get_device(group, vbasedev->name, vbasedev, 0, errp);
> >>      if (ret) {
> >>          vfio_put_group(group);
> >>          return ret;
> >> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >> index bd07c86..c926a24 100644
> >> --- a/include/hw/pci/pci.h
> >> +++ b/include/hw/pci/pci.h
> >> @@ -358,6 +358,7 @@ struct PCIDevice {
> >>  
> >>      /* ID of standby device in net_failover pair */
> >>      char *failover_pair_id;
> >> +    bool reused;
> >>  };
> >>  
> >>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
> >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >> index c78f3ff..4e2a332 100644
> >> --- a/include/hw/vfio/vfio-common.h
> >> +++ b/include/hw/vfio/vfio-common.h
> >> @@ -73,6 +73,8 @@ typedef struct VFIOContainer {
> >>      unsigned iommu_type;
> >>      Error *error;
> >>      bool initialized;
> >> +    bool reused;
> >> +    int cid;
> >>      unsigned long pgsizes;
> >>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
> >>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
> >> @@ -177,7 +179,7 @@ void vfio_reset_handler(void *opaque);
> >>  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
> >>  void vfio_put_group(VFIOGroup *group);
> >>  int vfio_get_device(VFIOGroup *group, const char *name,
> >> -                    VFIODevice *vbasedev, Error **errp);
> >> +                    VFIODevice *vbasedev, bool *reused, Error **errp);
> >>  
> >>  extern const MemoryRegionOps vfio_region_ops;
> >>  typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
> >> diff --git a/migration/savevm.c b/migration/savevm.c
> >> index 881dc13..2606cf0 100644
> >> --- a/migration/savevm.c
> >> +++ b/migration/savevm.c
> >> @@ -1568,7 +1568,7 @@ static int qemu_savevm_state(QEMUFile *f, VMStateMode mode, Error **errp)
> >>          return -EINVAL;
> >>      }
> >>  
> >> -    if (migrate_use_block()) {
> >> +    if ((mode & (VMS_SNAPSHOT | VMS_MIGRATE)) && migrate_use_block()) {
> >>          error_setg(errp, "Block migration and snapshots are incompatible");
> >>          return -EINVAL;
> >>      }
> >> -- 
> >> 1.8.3.1
> >>
> >>
Steven Sistare Aug. 19, 2020, 9:15 p.m. UTC | #4
On 8/9/2020 11:50 PM, Jason Zeng wrote:
> On Fri, Aug 07, 2020 at 04:38:12PM -0400, Steven Sistare wrote:
>> On 8/6/2020 6:22 AM, Jason Zeng wrote:
>>> Hi Steve,
>>>
>>> On Thu, Jul 30, 2020 at 08:14:34AM -0700, Steve Sistare wrote:
>>>> @@ -3182,6 +3207,51 @@ static Property vfio_pci_dev_properties[] = {
>>>>      DEFINE_PROP_END_OF_LIST(),
>>>>  };
>>>>  
>>>> +static int vfio_pci_post_load(void *opaque, int version_id)
>>>> +{
>>>> +    int vector;
>>>> +    MSIMessage msg;
>>>> +    Error *err = 0;
>>>> +    VFIOPCIDevice *vdev = opaque;
>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>> +
>>>> +    if (msix_enabled(pdev)) {
>>>> +        vfio_msix_enable(vdev);
>>>> +        pdev->msix_function_masked = false;
>>>> +
>>>> +        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
>>>> +            if (!msix_is_masked(pdev, vector)) {
>>>> +                msg = msix_get_message(pdev, vector);
>>>> +                vfio_msix_vector_use(pdev, vector, msg);
>>>> +            }
>>>> +        }
>>>
>>> It looks to me MSIX re-init here may lose device IRQs and impact
>>> device hardware state?
>>>
>>> The re-init will cause the kernel vfio driver to connect the device
>>> MSIX vectors to new eventfds and KVM instance. But before that, device
>>> IRQs will be routed to previous eventfd. Looks these IRQs will be lost.
>>
>> Thanks Jason, that sounds like a problem.  I could try reading and saving an 
>> event from eventfd before shutdown, and injecting it into the eventfd after
>> restart, but that would be racy unless I disable interrupts.  Or, unconditionally
>> inject a spurious interrupt after restart to kick it, in case an interrupt 
>> was lost.
>>
>> Do you have any other ideas?
> 
> Maybe we can consider to also hand over the eventfd file descriptor, or

I believe preserving this descriptor in isolation is not sufficient.  We would
also need to preserve the KVM instance which it is linked to.

> or even the KVM fds to the new Qemu?
> 
> If the KVM fds can be preserved, we will just need to restore Qemu KVM
> side states. But not sure how complicated the implementation would be.

That should work, but I fear it would require many code changes in QEMU
to re-use descriptors at object creation time and suppress the initial 
configuration ioctl's, so it's not my first choice for a solution.

> If we only preserve the eventfd fd, we can attach the old eventfd to
> vfio devices. But looks it may turn out we always inject an interrupt
> unconditionally, because kernel KVM irqfd eventfd handling is a bit
> different than normal user land eventfd read/write. It doesn't decrease
> the counter in the eventfd context. So if we read the eventfd from new
> Qemu, it looks will always have a non-zero counter, which requires an
> interrupt injection.

Good to know, thanks.

I will try creating a new eventfd and injecting an interrupt unconditionally.
I need a test case to demonstrate losing an interrupt, and fixing it with
injection.  Any advice?  My stress tests with a virtual function nic and a
directly assigned nvme block device have never failed across live update.

>>> And the re-init will make the device go through the procedure of
>>> disabling MSIX, enabling INTX, and re-enabling MSIX and vectors.
>>> So if the device is active, its hardware state will be impacted?
>>
>> Again thanks.  vfio_msix_enable() does indeed call vfio_disable_interrupts().
>> For a quick experiment, I deleted that call in for the post_load code path, and 
>> it seems to work fine, but I need to study it more.
> 
> vfio_msix_vector_use() will also trigger this procedure in the kernel.

Because that code path calls VFIO_DEVICE_SET_IRQS? Or something else?
Can you point to what it triggers in the kernel?

> Looks we shouldn't trigger any kernel vfio actions here? Because we
> preserve vfio fds, so its kernel state shouldn't be touched. Here we
> may only need to restore Qemu states. Re-connect to KVM instance should
> be done automatically when we setup the KVM irqfds with the same eventfd.
> 
> BTW, if I remember correctly, it is not enough to only save MSIX state
> in the snapshot. We should also save the Qemu side pci config space
> cache to the snapshot, because Qemu's copy is not exactly the same as
> the kernel's copy. I encountered this before, but I don't remember which
> field it was.

FYI all, Jason told me offline that qemu may emulate some pci capabilities and
hence keeps state in the shadow config that is never written to the kernel.
I need to study that.

> And another question, why don't we support MSI? I see the code only
> handles MSIX?

Yes, needs more code for MSI.

- Steve
  
>>>> +
>>>> +    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
>>>> +        vfio_intx_enable(vdev, &err);
>>>> +        if (err) {
>>>> +            error_report_err(err);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    vdev->vbasedev.group->container->reused = false;
>>>> +    vdev->pdev.reused = false;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static const VMStateDescription vfio_pci_vmstate = {
>>>> +    .name = "vfio-pci",
>>>> +    .unmigratable = 1,
>>>> +    .mode_mask = VMS_RESTART,
>>>> +    .version_id = 0,
>>>> +    .minimum_version_id = 0,
>>>> +    .post_load = vfio_pci_post_load,
>>>> +    .fields = (VMStateField[]) {
>>>> +        VMSTATE_MSIX(pdev, VFIOPCIDevice),
>>>> +        VMSTATE_END_OF_LIST()
>>>> +    }
>>>> +};
>>>> +
>>>>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>>>>  {
>>>>      DeviceClass *dc = DEVICE_CLASS(klass);
>>>> @@ -3189,6 +3259,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
>>>>  
>>>>      dc->reset = vfio_pci_reset;
>>>>      device_class_set_props(dc, vfio_pci_dev_properties);
>>>> +    dc->vmsd = &vfio_pci_vmstate;
>>>>      dc->desc = "VFIO-based PCI device assignment";
>>>>      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
>>>>      pdc->realize = vfio_realize;
>>>> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
>>>> index ac2cefc..e6e1a5d 100644
>>>> --- a/hw/vfio/platform.c
>>>> +++ b/hw/vfio/platform.c
>>>> @@ -592,7 +592,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
>>>>              return -EBUSY;
>>>>          }
>>>>      }
>>>> -    ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
>>>> +    ret = vfio_get_device(group, vbasedev->name, vbasedev, 0, errp);
>>>>      if (ret) {
>>>>          vfio_put_group(group);
>>>>          return ret;
>>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>>>> index bd07c86..c926a24 100644
>>>> --- a/include/hw/pci/pci.h
>>>> +++ b/include/hw/pci/pci.h
>>>> @@ -358,6 +358,7 @@ struct PCIDevice {
>>>>  
>>>>      /* ID of standby device in net_failover pair */
>>>>      char *failover_pair_id;
>>>> +    bool reused;
>>>>  };
>>>>  
>>>>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
>>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>>>> index c78f3ff..4e2a332 100644
>>>> --- a/include/hw/vfio/vfio-common.h
>>>> +++ b/include/hw/vfio/vfio-common.h
>>>> @@ -73,6 +73,8 @@ typedef struct VFIOContainer {
>>>>      unsigned iommu_type;
>>>>      Error *error;
>>>>      bool initialized;
>>>> +    bool reused;
>>>> +    int cid;
>>>>      unsigned long pgsizes;
>>>>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>>>>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>>>> @@ -177,7 +179,7 @@ void vfio_reset_handler(void *opaque);
>>>>  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
>>>>  void vfio_put_group(VFIOGroup *group);
>>>>  int vfio_get_device(VFIOGroup *group, const char *name,
>>>> -                    VFIODevice *vbasedev, Error **errp);
>>>> +                    VFIODevice *vbasedev, bool *reused, Error **errp);
>>>>  
>>>>  extern const MemoryRegionOps vfio_region_ops;
>>>>  typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
>>>> diff --git a/migration/savevm.c b/migration/savevm.c
>>>> index 881dc13..2606cf0 100644
>>>> --- a/migration/savevm.c
>>>> +++ b/migration/savevm.c
>>>> @@ -1568,7 +1568,7 @@ static int qemu_savevm_state(QEMUFile *f, VMStateMode mode, Error **errp)
>>>>          return -EINVAL;
>>>>      }
>>>>  
>>>> -    if (migrate_use_block()) {
>>>> +    if ((mode & (VMS_SNAPSHOT | VMS_MIGRATE)) && migrate_use_block()) {
>>>>          error_setg(errp, "Block migration and snapshots are incompatible");
>>>>          return -EINVAL;
>>>>      }
>>>> -- 
>>>> 1.8.3.1
>>>>
>>>>
Jason Zeng Aug. 20, 2020, 10:33 a.m. UTC | #5
On Wed, Aug 19, 2020 at 05:15:11PM -0400, Steven Sistare wrote:
> On 8/9/2020 11:50 PM, Jason Zeng wrote:
> > On Fri, Aug 07, 2020 at 04:38:12PM -0400, Steven Sistare wrote:
> >> On 8/6/2020 6:22 AM, Jason Zeng wrote:
> >>> Hi Steve,
> >>>
> >>> On Thu, Jul 30, 2020 at 08:14:34AM -0700, Steve Sistare wrote:
> >>>> @@ -3182,6 +3207,51 @@ static Property vfio_pci_dev_properties[] = {
> >>>>      DEFINE_PROP_END_OF_LIST(),
> >>>>  };
> >>>>  
> >>>> +static int vfio_pci_post_load(void *opaque, int version_id)
> >>>> +{
> >>>> +    int vector;
> >>>> +    MSIMessage msg;
> >>>> +    Error *err = 0;
> >>>> +    VFIOPCIDevice *vdev = opaque;
> >>>> +    PCIDevice *pdev = &vdev->pdev;
> >>>> +
> >>>> +    if (msix_enabled(pdev)) {
> >>>> +        vfio_msix_enable(vdev);
> >>>> +        pdev->msix_function_masked = false;
> >>>> +
> >>>> +        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
> >>>> +            if (!msix_is_masked(pdev, vector)) {
> >>>> +                msg = msix_get_message(pdev, vector);
> >>>> +                vfio_msix_vector_use(pdev, vector, msg);
> >>>> +            }
> >>>> +        }
> >>>
> >>> It looks to me MSIX re-init here may lose device IRQs and impact
> >>> device hardware state?
> >>>
> >>> The re-init will cause the kernel vfio driver to connect the device
> >>> MSIX vectors to new eventfds and KVM instance. But before that, device
> >>> IRQs will be routed to previous eventfd. Looks these IRQs will be lost.
> >>
> >> Thanks Jason, that sounds like a problem.  I could try reading and saving an 
> >> event from eventfd before shutdown, and injecting it into the eventfd after
> >> restart, but that would be racy unless I disable interrupts.  Or, unconditionally
> >> inject a spurious interrupt after restart to kick it, in case an interrupt 
> >> was lost.
> >>
> >> Do you have any other ideas?
> > 
> > Maybe we can consider to also hand over the eventfd file descriptor, or
> 
> I believe preserving this descriptor in isolation is not sufficient.  We would
> also need to preserve the KVM instance which it is linked to.
> 
> > or even the KVM fds to the new Qemu?
> > 
> > If the KVM fds can be preserved, we will just need to restore Qemu KVM
> > side states. But not sure how complicated the implementation would be.
> 
> That should work, but I fear it would require many code changes in QEMU
> to re-use descriptors at object creation time and suppress the initial 
> configuration ioctl's, so it's not my first choice for a solution.
> 
> > If we only preserve the eventfd fd, we can attach the old eventfd to
> > vfio devices. But looks it may turn out we always inject an interrupt
> > unconditionally, because kernel KVM irqfd eventfd handling is a bit
> > different than normal user land eventfd read/write. It doesn't decrease
> > the counter in the eventfd context. So if we read the eventfd from new
> > Qemu, it looks will always have a non-zero counter, which requires an
> > interrupt injection.
> 
> Good to know, thanks.
> 
> I will try creating a new eventfd and injecting an interrupt unconditionally.
> I need a test case to demonstrate losing an interrupt, and fixing it with
> injection.  Any advice?  My stress tests with a virtual function nic and a
> directly assigned nvme block device have never failed across live update.
> 

I am not familiar with nvme devices. For nic device, to my understanding,
stress nic testing will not have many IRQs, because nic driver usually
enables NAPI, which only take the first interrupt, then disable interrupt
and start polling. It will only re-enable interrupt after some packet
quota reached or the traffic quiesces for a while. But anyway, if the
test runs for long enough, the number of IRQs should also be large; I am
not sure why it doesn't trigger any issue. Maybe we can have some study on
the IRQ pattern for the testing and see how we can design a test case?
or see if our assumption is wrong?


> >>> And the re-init will make the device go through the procedure of
> >>> disabling MSIX, enabling INTX, and re-enabling MSIX and vectors.
> >>> So if the device is active, its hardware state will be impacted?
> >>
> >> Again thanks.  vfio_msix_enable() does indeed call vfio_disable_interrupts().
> >> For a quick experiment, I deleted that call for the post_load code path, and 
> >> it seems to work fine, but I need to study it more.
> > 
> > vfio_msix_vector_use() will also trigger this procedure in the kernel.
> 
> Because that code path calls VFIO_DEVICE_SET_IRQS? Or something else?
> Can you point to what it triggers in the kernel?


In vfio_msix_vector_use(), I see vfio_disable_irqindex() will be invoked
if "vdev->nr_vectors < nr + 1" is true. Since the 'vdev' is re-inited,
so this condition should be true, and vfio_disable_irqindex() will
trigger VFIO_DEVICE_SET_IRQS with VFIO_IRQ_SET_DATA_NONE, which will
cause kernel to disable MSIX.

> 
> > Looks we shouldn't trigger any kernel vfio actions here? Because we
> > preserve vfio fds, so its kernel state shouldn't be touched. Here we
> > may only need to restore Qemu states. Re-connect to KVM instance should
> > be done automatically when we setup the KVM irqfds with the same eventfd.
> > 
> > BTW, if I remember correctly, it is not enough to only save MSIX state
> > in the snapshot. We should also save the Qemu side pci config space
> > cache to the snapshot, because Qemu's copy is not exactly the same as
> > the kernel's copy. I encountered this before, but I don't remember which
> > field it was.
> 
> FYI all, Jason told me offline that qemu may emulate some pci capabilities and
> hence keeps state in the shadow config that is never written to the kernel.
> I need to study that.
> 

Sorry, I read the code again, see Qemu does write all config-space-write
to kernel in vfio_pci_write_config(). Now I am also confused about what
I was seeing previously :(. But it seems we still need to look at kernel
code to see if mismatch is possible for config space cache between Qemu
and kernel.

FYI. Some discussion about the VFIO PCI config space saving/restoring in
live migration scenario:
https://lists.gnu.org/archive/html/qemu-devel/2020-06/msg06964.html

thanks,
Jason


> > And another question, why don't we support MSI? I see the code only
> > handles MSIX?
> 
> Yes, needs more code for MSI.
> 
> - Steve
>   
> >>>> +
> >>>> +    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
> >>>> +        vfio_intx_enable(vdev, &err);
> >>>> +        if (err) {
> >>>> +            error_report_err(err);
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    vdev->vbasedev.group->container->reused = false;
> >>>> +    vdev->pdev.reused = false;
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>> +static const VMStateDescription vfio_pci_vmstate = {
> >>>> +    .name = "vfio-pci",
> >>>> +    .unmigratable = 1,
> >>>> +    .mode_mask = VMS_RESTART,
> >>>> +    .version_id = 0,
> >>>> +    .minimum_version_id = 0,
> >>>> +    .post_load = vfio_pci_post_load,
> >>>> +    .fields = (VMStateField[]) {
> >>>> +        VMSTATE_MSIX(pdev, VFIOPCIDevice),
> >>>> +        VMSTATE_END_OF_LIST()
> >>>> +    }
> >>>> +};
> >>>> +
> >>>>  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> >>>>  {
> >>>>      DeviceClass *dc = DEVICE_CLASS(klass);
> >>>> @@ -3189,6 +3259,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> >>>>  
> >>>>      dc->reset = vfio_pci_reset;
> >>>>      device_class_set_props(dc, vfio_pci_dev_properties);
> >>>> +    dc->vmsd = &vfio_pci_vmstate;
> >>>>      dc->desc = "VFIO-based PCI device assignment";
> >>>>      set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> >>>>      pdc->realize = vfio_realize;
> >>>> diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
> >>>> index ac2cefc..e6e1a5d 100644
> >>>> --- a/hw/vfio/platform.c
> >>>> +++ b/hw/vfio/platform.c
> >>>> @@ -592,7 +592,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
> >>>>              return -EBUSY;
> >>>>          }
> >>>>      }
> >>>> -    ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
> >>>> +    ret = vfio_get_device(group, vbasedev->name, vbasedev, 0, errp);
> >>>>      if (ret) {
> >>>>          vfio_put_group(group);
> >>>>          return ret;
> >>>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> >>>> index bd07c86..c926a24 100644
> >>>> --- a/include/hw/pci/pci.h
> >>>> +++ b/include/hw/pci/pci.h
> >>>> @@ -358,6 +358,7 @@ struct PCIDevice {
> >>>>  
> >>>>      /* ID of standby device in net_failover pair */
> >>>>      char *failover_pair_id;
> >>>> +    bool reused;
> >>>>  };
> >>>>  
> >>>>  void pci_register_bar(PCIDevice *pci_dev, int region_num,
> >>>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> >>>> index c78f3ff..4e2a332 100644
> >>>> --- a/include/hw/vfio/vfio-common.h
> >>>> +++ b/include/hw/vfio/vfio-common.h
> >>>> @@ -73,6 +73,8 @@ typedef struct VFIOContainer {
> >>>>      unsigned iommu_type;
> >>>>      Error *error;
> >>>>      bool initialized;
> >>>> +    bool reused;
> >>>> +    int cid;
> >>>>      unsigned long pgsizes;
> >>>>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
> >>>>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
> >>>> @@ -177,7 +179,7 @@ void vfio_reset_handler(void *opaque);
> >>>>  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
> >>>>  void vfio_put_group(VFIOGroup *group);
> >>>>  int vfio_get_device(VFIOGroup *group, const char *name,
> >>>> -                    VFIODevice *vbasedev, Error **errp);
> >>>> +                    VFIODevice *vbasedev, bool *reused, Error **errp);
> >>>>  
> >>>>  extern const MemoryRegionOps vfio_region_ops;
> >>>>  typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
> >>>> diff --git a/migration/savevm.c b/migration/savevm.c
> >>>> index 881dc13..2606cf0 100644
> >>>> --- a/migration/savevm.c
> >>>> +++ b/migration/savevm.c
> >>>> @@ -1568,7 +1568,7 @@ static int qemu_savevm_state(QEMUFile *f, VMStateMode mode, Error **errp)
> >>>>          return -EINVAL;
> >>>>      }
> >>>>  
> >>>> -    if (migrate_use_block()) {
> >>>> +    if ((mode & (VMS_SNAPSHOT | VMS_MIGRATE)) && migrate_use_block()) {
> >>>>          error_setg(errp, "Block migration and snapshots are incompatible");
> >>>>          return -EINVAL;
> >>>>      }
> >>>> -- 
> >>>> 1.8.3.1
> >>>>
> >>>>
Steven Sistare Oct. 7, 2020, 9:25 p.m. UTC | #6
On 8/20/2020 6:33 AM, Jason Zeng wrote:
> On Wed, Aug 19, 2020 at 05:15:11PM -0400, Steven Sistare wrote:
>> On 8/9/2020 11:50 PM, Jason Zeng wrote:
>>> On Fri, Aug 07, 2020 at 04:38:12PM -0400, Steven Sistare wrote:
>>>> On 8/6/2020 6:22 AM, Jason Zeng wrote:
>>>>> Hi Steve,
>>>>>
>>>>> On Thu, Jul 30, 2020 at 08:14:34AM -0700, Steve Sistare wrote:
>>>>>> @@ -3182,6 +3207,51 @@ static Property vfio_pci_dev_properties[] = {
>>>>>>      DEFINE_PROP_END_OF_LIST(),
>>>>>>  };
>>>>>>  
>>>>>> +static int vfio_pci_post_load(void *opaque, int version_id)
>>>>>> +{
>>>>>> +    int vector;
>>>>>> +    MSIMessage msg;
>>>>>> +    Error *err = 0;
>>>>>> +    VFIOPCIDevice *vdev = opaque;
>>>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>>>> +
>>>>>> +    if (msix_enabled(pdev)) {
>>>>>> +        vfio_msix_enable(vdev);
>>>>>> +        pdev->msix_function_masked = false;
>>>>>> +
>>>>>> +        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
>>>>>> +            if (!msix_is_masked(pdev, vector)) {
>>>>>> +                msg = msix_get_message(pdev, vector);
>>>>>> +                vfio_msix_vector_use(pdev, vector, msg);
>>>>>> +            }
>>>>>> +        }
>>>>>
>>>>> It looks to me MSIX re-init here may lose device IRQs and impact
>>>>> device hardware state?
>>>>>
>>>>> The re-init will cause the kernel vfio driver to connect the device
>>>>> MSIX vectors to new eventfds and KVM instance. But before that, device
>>>>> IRQs will be routed to previous eventfd. Looks these IRQs will be lost.
>>>>
>>>> Thanks Jason, that sounds like a problem.  I could try reading and saving an 
>>>> event from eventfd before shutdown, and injecting it into the eventfd after
>>>> restart, but that would be racy unless I disable interrupts.  Or, unconditionally
>>>> inject a spurious interrupt after restart to kick it, in case an interrupt 
>>>> was lost.
>>>>
>>>> Do you have any other ideas?
>>>
>>> Maybe we can consider to also hand over the eventfd file descriptor, or
>>
>> I believe preserving this descriptor in isolation is not sufficient.  We would
>> also need to preserve the KVM instance which it is linked to.
>>
>>> or even the KVM fds to the new Qemu?
>>>
>>> If the KVM fds can be preserved, we will just need to restore Qemu KVM
>>> side states. But not sure how complicated the implementation would be.
>>
>> That should work, but I fear it would require many code changes in QEMU
>> to re-use descriptors at object creation time and suppress the initial 
>> configuration ioctl's, so it's not my first choice for a solution.
>>
>>> If we only preserve the eventfd fd, we can attach the old eventfd to
>>> vfio devices. But looks it may turn out we always inject an interrupt
>>> unconditionally, because kernel KVM irqfd eventfd handling is a bit
>>> different than normal user land eventfd read/write. It doesn't decrease
>>> the counter in the eventfd context. So if we read the eventfd from new
>>> Qemu, it looks will always have a non-zero counter, which requires an
>>> interrupt injection.
>>
>> Good to know, thanks.
>>
>> I will try creating a new eventfd and injecting an interrupt unconditionally.
>> I need a test case to demonstrate losing an interrupt, and fixing it with
>> injection.  Any advice?  My stress tests with a virtual function nic and a
>> directly assigned nvme block device have never failed across live update.
>>
> 
> I am not familiar with nvme devices. For nic device, to my understanding,
> stress nic testing will not have many IRQs, because nic driver usually
> enables NAPI, which only take the first interrupt, then disable interrupt
> and start polling. It will only re-enable interrupt after some packet
> quota reached or the traffic quiesces for a while. But anyway, if the
> test runs for long enough, the number of IRQs should also be large; I am
> not sure why it doesn't trigger any issue. Maybe we can have some study on
> the IRQ pattern for the testing and see how we can design a test case?
> or see if our assumption is wrong?
> 
> 
>>>>> And the re-init will make the device go through the procedure of
>>>>> disabling MSIX, enabling INTX, and re-enabling MSIX and vectors.
>>>>> So if the device is active, its hardware state will be impacted?
>>>>
>>>> Again thanks.  vfio_msix_enable() does indeed call vfio_disable_interrupts().
>>>> For a quick experiment, I deleted that call for the post_load code path, and 
>>>> it seems to work fine, but I need to study it more.
>>>
>>> vfio_msix_vector_use() will also trigger this procedure in the kernel.
>>
>> Because that code path calls VFIO_DEVICE_SET_IRQS? Or something else?
>> Can you point to what it triggers in the kernel?
> 
> 
> In vfio_msix_vector_use(), I see vfio_disable_irqindex() will be invoked
> if "vdev->nr_vectors < nr + 1" is true. Since the 'vdev' is re-inited,
> so this condition should be true, and vfio_disable_irqindex() will
> trigger VFIO_DEVICE_SET_IRQS with VFIO_IRQ_SET_DATA_NONE, which will
> cause kernel to disable MSIX.
> 
>>
>>> Looks we shouldn't trigger any kernel vfio actions here? Because we
>>> preserve vfio fds, so its kernel state shouldn't be touched. Here we
>>> may only need to restore Qemu states. Re-connect to KVM instance should
>>> be done automatically when we setup the KVM irqfds with the same eventfd.
>>>
>>> BTW, if I remember correctly, it is not enough to only save MSIX state
>>> in the snapshot. We should also save the Qemu side pci config space
>>> cache to the snapshot, because Qemu's copy is not exactly the same as
>>> the kernel's copy. I encountered this before, but I don't remember which
>>> field it was.
>>
>> FYI all, Jason told me offline that qemu may emulate some pci capabilities and
>> hence keeps state in the shadow config that is never written to the kernel.
>> I need to study that.
>>
> 
> Sorry, I read the code again, see Qemu does write all config-space-write
> to kernel in vfio_pci_write_config(). Now I am also confused about what
> I was seeing previously :(. But it seems we still need to look at kernel
> code to see if mismatch is possible for config space cache between Qemu
> and kernel.
> 
> FYI. Some discussion about the VFIO PCI config space saving/restoring in
> live migration scenario:
> https://lists.gnu.org/archive/html/qemu-devel/2020-06/msg06964.html
> 

I have coded a solution for much of the "lost interrupts" issue.
cprsave preserves the vfio err, req, and msi irq eventfd's across exec:
  vdev->err_notifier
  vdev->req_notifier
  vdev->msi_vectors[i].interrupt
  vdev->msi_vectors[i].kvm_interrupt

The KVM instance is destroyed and recreated as before.
The eventfd descriptors are found and reused during vfio_realize using
event_notifier_init_fd.  No calls to VFIO_DEVICE_SET_IRQS are made before or
after the exec.  The descriptors are attached to the new KVM instance via the
usual ioctl's on the existing code paths.

It works.  I issue cprsave, send an interrupt, wait a few seconds, then issue cprload.
The interrupt fires immediately after cprload.  I tested interrupt delivery to the 
kvm_irqchip and to qemu.

It does not support Posted Interrupts, as that involves state attached to the
VMCS, which is destroyed with the KVM instance.  That needs more study and
a likely kernel enhancement.

I will post the full code as part of the V2 patch series.

- Steve
diff mbox series

Patch

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 7343e00..c2e1509 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -291,6 +291,10 @@  static void pci_do_device_reset(PCIDevice *dev)
 {
     int r;
 
+    if (dev->reused) {
+        return;
+    }
+
     pci_device_deassert_intx(dev);
     assert(dev->irq_state == 0);
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 3335714..a51a093 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -37,6 +37,7 @@ 
 #include "sysemu/reset.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "qemu/cutils.h"
 
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -299,6 +300,10 @@  static int vfio_dma_unmap(VFIOContainer *container,
         .size = size,
     };
 
+    if (container->reused) {
+        return 0;
+    }
+
     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
         /*
          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
@@ -336,6 +341,10 @@  static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
         .size = size,
     };
 
+    if (container->reused) {
+        return 0;
+    }
+
     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
     }
@@ -1179,25 +1188,27 @@  static int vfio_init_container(VFIOContainer *container, int group_fd,
         return iommu_type;
     }
 
-    ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
-    if (ret) {
-        error_setg_errno(errp, errno, "Failed to set group container");
-        return -errno;
-    }
+    if (!container->reused) {
+        ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to set group container");
+            return -errno;
+        }
 
-    while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
-        if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
-            /*
-             * On sPAPR, despite the IOMMU subdriver always advertises v1 and
-             * v2, the running platform may not support v2 and there is no
-             * way to guess it until an IOMMU group gets added to the container.
-             * So in case it fails with v2, try v1 as a fallback.
-             */
-            iommu_type = VFIO_SPAPR_TCE_IOMMU;
-            continue;
+        while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
+            if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+                /*
+                 * On sPAPR, despite the IOMMU subdriver always advertises v1
+                 * and v2, the running platform may not support v2 and there is
+                 * no way to guess it until an IOMMU group gets added to the
+                 * container. So in case it fails with v2, try v1 as a fallback.
+                 */
+                iommu_type = VFIO_SPAPR_TCE_IOMMU;
+                continue;
+            }
+            error_setg_errno(errp, errno, "Failed to set iommu for container");
+            return -errno;
         }
-        error_setg_errno(errp, errno, "Failed to set iommu for container");
-        return -errno;
     }
 
     container->iommu_type = iommu_type;
@@ -1210,6 +1221,8 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     VFIOContainer *container;
     int ret, fd;
     VFIOAddressSpace *space;
+    char name[40];
+    bool reused;
 
     space = vfio_get_address_space(as);
 
@@ -1254,7 +1267,13 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
         }
     }
 
-    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    snprintf(name, sizeof(name), "vfio_container_%d", group->groupid);
+    fd = getenv_fd(name);
+    reused = (fd >= 0);
+    if (fd < 0) {
+        fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    }
+
     if (fd < 0) {
         error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
         ret = -errno;
@@ -1272,6 +1291,8 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     container = g_malloc0(sizeof(*container));
     container->space = space;
     container->fd = fd;
+    container->cid = group->groupid;
+    container->reused = reused;
     container->error = NULL;
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
@@ -1395,6 +1416,10 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
 
     container->initialized = true;
 
+    if (!reused) {
+        setenv_fd(name, fd);
+    }
+
     return 0;
 listener_release_exit:
     QLIST_REMOVE(group, container_next);
@@ -1418,6 +1443,7 @@  put_space_exit:
 static void vfio_disconnect_container(VFIOGroup *group)
 {
     VFIOContainer *container = group->container;
+    char name[40];
 
     QLIST_REMOVE(group, container_next);
     group->container = NULL;
@@ -1450,6 +1476,8 @@  static void vfio_disconnect_container(VFIOGroup *group)
         }
 
         trace_vfio_disconnect_container(container->fd);
+        snprintf(name, sizeof(name), "vfio_container_%d", container->cid);
+        unsetenv_fd(name);
         close(container->fd);
         g_free(container);
 
@@ -1462,6 +1490,7 @@  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
     VFIOGroup *group;
     char path[32];
     struct vfio_group_status status = { .argsz = sizeof(status) };
+    bool reused;
 
     QLIST_FOREACH(group, &vfio_group_list, next) {
         if (group->groupid == groupid) {
@@ -1479,7 +1508,13 @@  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
     group = g_malloc0(sizeof(*group));
 
     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
-    group->fd = qemu_open(path, O_RDWR);
+
+    group->fd = getenv_fd(path);
+    reused = (group->fd >= 0);
+    if (group->fd < 0) {
+        group->fd = qemu_open(path, O_RDWR);
+    }
+
     if (group->fd < 0) {
         error_setg_errno(errp, errno, "failed to open %s", path);
         goto free_group_exit;
@@ -1513,6 +1548,10 @@  VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
 
     QLIST_INSERT_HEAD(&vfio_group_list, group, next);
 
+    if (!reused) {
+        setenv_fd(path, group->fd);
+    }
+
     return group;
 
 close_fd_exit:
@@ -1526,6 +1565,8 @@  free_group_exit:
 
 void vfio_put_group(VFIOGroup *group)
 {
+    char path[32];
+
     if (!group || !QLIST_EMPTY(&group->device_list)) {
         return;
     }
@@ -1537,6 +1578,8 @@  void vfio_put_group(VFIOGroup *group)
     vfio_disconnect_container(group);
     QLIST_REMOVE(group, next);
     trace_vfio_put_group(group->fd);
+    snprintf(path, sizeof(path), "/dev/vfio/%d", group->groupid);
+    unsetenv_fd(path);
     close(group->fd);
     g_free(group);
 
@@ -1546,12 +1589,18 @@  void vfio_put_group(VFIOGroup *group)
 }
 
 int vfio_get_device(VFIOGroup *group, const char *name,
-                    VFIODevice *vbasedev, Error **errp)
+                    VFIODevice *vbasedev, bool *reusedp, Error **errp)
 {
     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
     int ret, fd;
+    bool reused;
+
+    fd = getenv_fd(name);
+    reused = (fd >= 0);
+    if (fd < 0) {
+        fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+    }
 
-    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
     if (fd < 0) {
         error_setg_errno(errp, errno, "error getting device from group %d",
                          group->groupid);
@@ -1601,6 +1650,13 @@  int vfio_get_device(VFIOGroup *group, const char *name,
                           dev_info.num_irqs);
 
     vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+
+    if (!reused) {
+        setenv_fd(name, fd);
+    }
+    if (reusedp) {
+        *reusedp = reused;
+    }
     return 0;
 }
 
@@ -1612,6 +1668,7 @@  void vfio_put_base_device(VFIODevice *vbasedev)
     QLIST_REMOVE(vbasedev, next);
     vbasedev->group = NULL;
     trace_vfio_put_base_device(vbasedev->fd);
+    unsetenv_fd(vbasedev->name);
     close(vbasedev->fd);
 }
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 2e561c0..5743807 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -49,6 +49,7 @@ 
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static const VMStateDescription vfio_pci_vmstate;
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -1585,6 +1586,14 @@  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled)
     }
 }
 
+static void vfio_config_sync(VFIOPCIDevice *vdev, uint32_t offset, size_t len)
+{
+    if (pread(vdev->vbasedev.fd, vdev->pdev.config + offset, len,
+          vdev->config_offset + offset) != len) {
+        error_report("vfio_config_sync pread failed");
+    }
+}
+
 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
@@ -1626,6 +1635,7 @@  static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
     char *name;
+    PCIDevice *pdev = &vdev->pdev;
 
     if (!bar->size) {
         return;
@@ -1646,6 +1656,9 @@  static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
     }
 
     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
+    if (pdev->reused) {
+        vfio_config_sync(vdev, pci_bar(pdev, nr), 8);
+    }
 }
 
 static void vfio_bars_register(VFIOPCIDevice *vdev)
@@ -2805,7 +2818,8 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
         goto error;
     }
 
-    ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
+    ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev,
+                          &pdev->reused, errp);
     if (ret) {
         vfio_put_group(group);
         goto error;
@@ -2972,9 +2986,11 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
                                              vfio_intx_routing_notifier);
         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
-        ret = vfio_intx_enable(vdev, errp);
-        if (ret) {
-            goto out_deregister;
+        if (!pdev->reused) {
+            ret = vfio_intx_enable(vdev, errp);
+            if (ret) {
+                goto out_deregister;
+            }
         }
     }
 
@@ -3017,6 +3033,11 @@  static void vfio_realize(PCIDevice *pdev, Error **errp)
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
 
+    vfio_config_sync(vdev, pdev->msix_cap + PCI_MSIX_FLAGS, 2);
+    if (pdev->reused) {
+        pci_update_mappings(pdev);
+    }
+
     return;
 
 out_deregister:
@@ -3080,6 +3101,10 @@  static void vfio_pci_reset(DeviceState *dev)
 {
     VFIOPCIDevice *vdev = PCI_VFIO(dev);
 
+    if (vdev->pdev.reused) {
+        return;
+    }
+
     trace_vfio_pci_reset(vdev->vbasedev.name);
 
     vfio_pci_pre_reset(vdev);
@@ -3182,6 +3207,51 @@  static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+static int vfio_pci_post_load(void *opaque, int version_id)
+{
+    int vector;
+    MSIMessage msg;
+    Error *err = 0;
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *pdev = &vdev->pdev;
+
+    if (msix_enabled(pdev)) {
+        vfio_msix_enable(vdev);
+        pdev->msix_function_masked = false;
+
+        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
+            if (!msix_is_masked(pdev, vector)) {
+                msg = msix_get_message(pdev, vector);
+                vfio_msix_vector_use(pdev, vector, msg);
+            }
+        }
+
+    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
+        vfio_intx_enable(vdev, &err);
+        if (err) {
+            error_report_err(err);
+        }
+    }
+
+    vdev->vbasedev.group->container->reused = false;
+    vdev->pdev.reused = false;
+
+    return 0;
+}
+
+static const VMStateDescription vfio_pci_vmstate = {
+    .name = "vfio-pci",
+    .unmigratable = 1,
+    .mode_mask = VMS_RESTART,
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .post_load = vfio_pci_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_MSIX(pdev, VFIOPCIDevice),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -3189,6 +3259,7 @@  static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
 
     dc->reset = vfio_pci_reset;
     device_class_set_props(dc, vfio_pci_dev_properties);
+    dc->vmsd = &vfio_pci_vmstate;
     dc->desc = "VFIO-based PCI device assignment";
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     pdc->realize = vfio_realize;
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index ac2cefc..e6e1a5d 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -592,7 +592,7 @@  static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
             return -EBUSY;
         }
     }
-    ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
+    ret = vfio_get_device(group, vbasedev->name, vbasedev, 0, errp);
     if (ret) {
         vfio_put_group(group);
         return ret;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index bd07c86..c926a24 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -358,6 +358,7 @@  struct PCIDevice {
 
     /* ID of standby device in net_failover pair */
     char *failover_pair_id;
+    bool reused;
 };
 
 void pci_register_bar(PCIDevice *pci_dev, int region_num,
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c78f3ff..4e2a332 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -73,6 +73,8 @@  typedef struct VFIOContainer {
     unsigned iommu_type;
     Error *error;
     bool initialized;
+    bool reused;
+    int cid;
     unsigned long pgsizes;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
@@ -177,7 +179,7 @@  void vfio_reset_handler(void *opaque);
 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
 void vfio_put_group(VFIOGroup *group);
 int vfio_get_device(VFIOGroup *group, const char *name,
-                    VFIODevice *vbasedev, Error **errp);
+                    VFIODevice *vbasedev, bool *reused, Error **errp);
 
 extern const MemoryRegionOps vfio_region_ops;
 typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
diff --git a/migration/savevm.c b/migration/savevm.c
index 881dc13..2606cf0 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1568,7 +1568,7 @@  static int qemu_savevm_state(QEMUFile *f, VMStateMode mode, Error **errp)
         return -EINVAL;
     }
 
-    if (migrate_use_block()) {
+    if ((mode & (VMS_SNAPSHOT | VMS_MIGRATE)) && migrate_use_block()) {
         error_setg(errp, "Block migration and snapshots are incompatible");
         return -EINVAL;
     }