diff mbox series

[v16,QEMU,04/16] vfio: Add save and load functions for VFIO PCI devices

Message ID 1585084154-29461-5-git-send-email-kwankhede@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Add migration support for VFIO devices | expand

Commit Message

Kirti Wankhede March 24, 2020, 9:09 p.m. UTC
These functions save and restore PCI device specific data - config
space of PCI device.
Tested save and restore with MSI and MSIX type.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-common.h |   2 +
 2 files changed, 165 insertions(+)

Comments

Alex Williamson March 25, 2020, 7:56 p.m. UTC | #1
On Wed, 25 Mar 2020 02:39:02 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> These functions save and restore PCI device specific data - config
> space of PCI device.
> Tested save and restore with MSI and MSIX type.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>  include/hw/vfio/vfio-common.h |   2 +
>  2 files changed, 165 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 6c77c12e44b9..8deb11e87ef7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -41,6 +41,7 @@
>  #include "trace.h"
>  #include "qapi/error.h"
>  #include "migration/blocker.h"
> +#include "migration/qemu-file.h"
>  
>  #define TYPE_VFIO_PCI "vfio-pci"
>  #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>      }
>  }
>  
> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    uint64_t addr;
> +    uint32_t addr_lo, addr_hi = 0;
> +
> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> +    if (!bar->size) {
> +        return 0;
> +    }
> +
> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> +
> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> +                                       PCI_BASE_ADDRESS_MEM_MASK);

Nit: use &= or combine with the previous assignment.

> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +        addr_hi = pci_default_read_config(pdev,
> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> +    }
> +
> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;

Could we use a union?

> +
> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> +        return -EINVAL;
> +    }

What specifically are we validating here?  This should be true no
matter what we wrote to the BAR or else BAR emulation is broken.  The
bits that could make this unaligned are not implemented in the BAR.

> +
> +    return 0;
> +}
> +
> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> +{
> +    int i, ret;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        ret = vfio_bar_validate(vdev, i);
> +        if (ret) {
> +            error_report("vfio: BAR address %d validation failed", i);
> +            return ret;
> +        }
> +    }
> +    return 0;
> +}
> +
>  static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>  {
>      VFIOBAR *bar = &vdev->bars[nr];
> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>      return OBJECT(vdev);
>  }
>  
> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint16_t pci_cmd;
> +    int i;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        uint32_t bar;
> +
> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> +        qemu_put_be32(f, bar);
> +    }
> +
> +    qemu_put_be32(f, vdev->interrupt);
> +    if (vdev->interrupt == VFIO_INT_MSI) {
> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +        bool msi_64bit;
> +
> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                                            2);
> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +        msi_addr_lo = pci_default_read_config(pdev,
> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> +        qemu_put_be32(f, msi_addr_lo);
> +
> +        if (msi_64bit) {
> +            msi_addr_hi = pci_default_read_config(pdev,
> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                             4);
> +        }
> +        qemu_put_be32(f, msi_addr_hi);
> +
> +        msi_data = pci_default_read_config(pdev,
> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +                2);
> +        qemu_put_be32(f, msi_data);

Isn't the data field only a u16?

> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> +        uint16_t offset;
> +
> +        /* save enable bit and maskall bit */
> +        offset = pci_default_read_config(pdev,
> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> +        qemu_put_be16(f, offset);
> +        msix_save(pdev, f);
> +    }
> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    qemu_put_be16(f, pci_cmd);
> +}
> +
> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint32_t interrupt_type;
> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +    uint16_t pci_cmd;
> +    bool msi_64bit;
> +    int i, ret;
> +
> +    /* retore pci bar configuration */
> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        uint32_t bar = qemu_get_be32(f);
> +
> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> +    }
> +
> +    ret = vfio_bars_validate(vdev);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    interrupt_type = qemu_get_be32(f);
> +
> +    if (interrupt_type == VFIO_INT_MSI) {
> +        /* restore msi configuration */
> +        msi_flags = pci_default_read_config(pdev,
> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> +
> +        msi_addr_lo = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> +                              msi_addr_lo, 4);
> +
> +        msi_addr_hi = qemu_get_be32(f);
> +        if (msi_64bit) {
> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                  msi_addr_hi, 4);
> +        }
> +        msi_data = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev,
> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +                msi_data, 2);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> +        uint16_t offset = qemu_get_be16(f);
> +
> +        /* load enable bit and maskall bit */
> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> +                              offset, 2);
> +        msix_load(pdev, f);
> +    }
> +    pci_cmd = qemu_get_be16(f);
> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> +    return 0;
> +}

It always seems like there should be a lot more state than this, and I
probably sound like a broken record because I ask every time, but maybe
that's a good indication that we (or at least I) need a comment
explaining why we only care about these.  For example, what if we
migrate a device in the D3 power state, don't we need to account for
the state stored in the PM capability or does the device wake up into
D0 auto-magically after migration?  I think we could repeat that
question for every capability that can be modified.  Even for the MSI/X
cases, the interrupt may not be active, but there could be state in
virtual config space that would be different on the target.  For
example, if we migrate with a device in INTx mode where the guest had
written vector fields on the source, but only writes the enable bit on
the target, can we seamlessly figure out the rest?  For other
capabilities, that state may represent config space changes written
through to the physical device and represent a functional difference on
the target.  Thanks,

Alex

> +
>  static VFIODeviceOps vfio_pci_ops = {
>      .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
>      .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
>      .vfio_eoi = vfio_intx_eoi,
>      .vfio_get_object = vfio_pci_get_object,
> +    .vfio_save_config = vfio_pci_save_config,
> +    .vfio_load_config = vfio_pci_load_config,
>  };
>  
>  int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 74261feaeac9..d69a7f3ae31e 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -120,6 +120,8 @@ struct VFIODeviceOps {
>      int (*vfio_hot_reset_multi)(VFIODevice *vdev);
>      void (*vfio_eoi)(VFIODevice *vdev);
>      Object *(*vfio_get_object)(VFIODevice *vdev);
> +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
> +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>  };
>  
>  typedef struct VFIOGroup {
Dr. David Alan Gilbert March 26, 2020, 5:29 p.m. UTC | #2
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Wed, 25 Mar 2020 02:39:02 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > These functions save and restore PCI device specific data - config
> > space of PCI device.
> > Tested save and restore with MSI and MSIX type.
> > 
> > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > ---
> >  hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
> >  include/hw/vfio/vfio-common.h |   2 +
> >  2 files changed, 165 insertions(+)
> > 
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index 6c77c12e44b9..8deb11e87ef7 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -41,6 +41,7 @@
> >  #include "trace.h"
> >  #include "qapi/error.h"
> >  #include "migration/blocker.h"
> > +#include "migration/qemu-file.h"
> >  
> >  #define TYPE_VFIO_PCI "vfio-pci"
> >  #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
> >      }
> >  }
> >  
> > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> > +{
> > +    PCIDevice *pdev = &vdev->pdev;
> > +    VFIOBAR *bar = &vdev->bars[nr];
> > +    uint64_t addr;
> > +    uint32_t addr_lo, addr_hi = 0;
> > +
> > +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> > +    if (!bar->size) {
> > +        return 0;
> > +    }
> > +
> > +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> > +
> > +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> > +                                       PCI_BASE_ADDRESS_MEM_MASK);
> 
> Nit, &= or combine with previous set.
> 
> > +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +        addr_hi = pci_default_read_config(pdev,
> > +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> > +    }
> > +
> > +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
> 
> Could we use a union?
> 
> > +
> > +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> > +        return -EINVAL;
> > +    }
> 
> What specifically are we validating here?  This should be true no
> matter what we wrote to the BAR or else BAR emulation is broken.  The
> bits that could make this unaligned are not implemented in the BAR.

That, I think, is based on a comment I made a few versions back.
Remember the value being checked here is a value loaded from the
migration stream; it could be garbage, so it's good to do whatever
checks you can.

> > +
> > +    return 0;
> > +}
> > +
> > +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> > +{
> > +    int i, ret;
> > +
> > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > +        ret = vfio_bar_validate(vdev, i);
> > +        if (ret) {
> > +            error_report("vfio: BAR address %d validation failed", i);
> > +            return ret;
> > +        }
> > +    }
> > +    return 0;
> > +}
> > +
> >  static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> >  {
> >      VFIOBAR *bar = &vdev->bars[nr];
> > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
> >      return OBJECT(vdev);
> >  }
> >  
> > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> > +{
> > +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> > +    PCIDevice *pdev = &vdev->pdev;
> > +    uint16_t pci_cmd;
> > +    int i;
> > +
> > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > +        uint32_t bar;
> > +
> > +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> > +        qemu_put_be32(f, bar);
> > +    }
> > +
> > +    qemu_put_be32(f, vdev->interrupt);
> > +    if (vdev->interrupt == VFIO_INT_MSI) {
> > +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> > +        bool msi_64bit;
> > +
> > +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > +                                            2);
> > +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> > +
> > +        msi_addr_lo = pci_default_read_config(pdev,
> > +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> > +        qemu_put_be32(f, msi_addr_lo);
> > +
> > +        if (msi_64bit) {
> > +            msi_addr_hi = pci_default_read_config(pdev,
> > +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > +                                             4);
> > +        }
> > +        qemu_put_be32(f, msi_addr_hi);
> > +
> > +        msi_data = pci_default_read_config(pdev,
> > +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> > +                2);
> > +        qemu_put_be32(f, msi_data);
> 
> Isn't the data field only a u16?
> 
> > +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> > +        uint16_t offset;
> > +
> > +        /* save enable bit and maskall bit */
> > +        offset = pci_default_read_config(pdev,
> > +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> > +        qemu_put_be16(f, offset);
> > +        msix_save(pdev, f);
> > +    }
> > +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> > +    qemu_put_be16(f, pci_cmd);
> > +}
> > +
> > +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> > +{
> > +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> > +    PCIDevice *pdev = &vdev->pdev;
> > +    uint32_t interrupt_type;
> > +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> > +    uint16_t pci_cmd;
> > +    bool msi_64bit;
> > +    int i, ret;
> > +
> > +    /* retore pci bar configuration */
> > +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> > +    vfio_pci_write_config(pdev, PCI_COMMAND,
> > +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > +        uint32_t bar = qemu_get_be32(f);
> > +
> > +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> > +    }
> > +
> > +    ret = vfio_bars_validate(vdev);
> > +    if (ret) {
> > +        return ret;
> > +    }
> > +
> > +    interrupt_type = qemu_get_be32(f);
> > +
> > +    if (interrupt_type == VFIO_INT_MSI) {
> > +        /* restore msi configuration */
> > +        msi_flags = pci_default_read_config(pdev,
> > +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> > +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> > +
> > +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> > +
> > +        msi_addr_lo = qemu_get_be32(f);
> > +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> > +                              msi_addr_lo, 4);
> > +
> > +        msi_addr_hi = qemu_get_be32(f);
> > +        if (msi_64bit) {
> > +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > +                                  msi_addr_hi, 4);
> > +        }
> > +        msi_data = qemu_get_be32(f);
> > +        vfio_pci_write_config(pdev,
> > +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> > +                msi_data, 2);
> > +
> > +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> > +    } else if (interrupt_type == VFIO_INT_MSIX) {
> > +        uint16_t offset = qemu_get_be16(f);
> > +
> > +        /* load enable bit and maskall bit */
> > +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> > +                              offset, 2);
> > +        msix_load(pdev, f);
> > +    }
> > +    pci_cmd = qemu_get_be16(f);
> > +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> > +    return 0;
> > +}
> 
> It always seems like there should be a lot more state than this, and I
> probably sound like a broken record because I ask every time, but maybe
> that's a good indication that we (or at least I) need a comment
> explaining why we only care about these.  For example, what if we
> migrate a device in the D3 power state, don't we need to account for
> the state stored in the PM capability or does the device wake up into
> D0 auto-magically after migration?  I think we could repeat that
> question for every capability that can be modified.  Even for the MSI/X
> cases, the interrupt may not be active, but there could be state in
> virtual config space that would be different on the target.  For
> example, if we migrate with a device in INTx mode where the guest had
> written vector fields on the source, but only writes the enable bit on
> the target, can we seamlessly figure out the rest?  For other
> capabilities, that state may represent config space changes written
> through to the physical device and represent a functional difference on
> the target.  Thanks,
> 
> Alex
> 
> > +
> >  static VFIODeviceOps vfio_pci_ops = {
> >      .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
> >      .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
> >      .vfio_eoi = vfio_intx_eoi,
> >      .vfio_get_object = vfio_pci_get_object,
> > +    .vfio_save_config = vfio_pci_save_config,
> > +    .vfio_load_config = vfio_pci_load_config,
> >  };
> >  
> >  int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> > index 74261feaeac9..d69a7f3ae31e 100644
> > --- a/include/hw/vfio/vfio-common.h
> > +++ b/include/hw/vfio/vfio-common.h
> > @@ -120,6 +120,8 @@ struct VFIODeviceOps {
> >      int (*vfio_hot_reset_multi)(VFIODevice *vdev);
> >      void (*vfio_eoi)(VFIODevice *vdev);
> >      Object *(*vfio_get_object)(VFIODevice *vdev);
> > +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
> > +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
> >  };
> >  
> >  typedef struct VFIOGroup {
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Alex Williamson March 26, 2020, 5:38 p.m. UTC | #3
On Thu, 26 Mar 2020 17:29:26 +0000
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> * Alex Williamson (alex.williamson@redhat.com) wrote:
> > On Wed, 25 Mar 2020 02:39:02 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> > > These functions save and restore PCI device specific data - config
> > > space of PCI device.
> > > Tested save and restore with MSI and MSIX type.
> > > 
> > > Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > > Reviewed-by: Neo Jia <cjia@nvidia.com>
> > > ---
> > >  hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
> > >  include/hw/vfio/vfio-common.h |   2 +
> > >  2 files changed, 165 insertions(+)
> > > 
> > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > index 6c77c12e44b9..8deb11e87ef7 100644
> > > --- a/hw/vfio/pci.c
> > > +++ b/hw/vfio/pci.c
> > > @@ -41,6 +41,7 @@
> > >  #include "trace.h"
> > >  #include "qapi/error.h"
> > >  #include "migration/blocker.h"
> > > +#include "migration/qemu-file.h"
> > >  
> > >  #define TYPE_VFIO_PCI "vfio-pci"
> > >  #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> > > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
> > >      }
> > >  }
> > >  
> > > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> > > +{
> > > +    PCIDevice *pdev = &vdev->pdev;
> > > +    VFIOBAR *bar = &vdev->bars[nr];
> > > +    uint64_t addr;
> > > +    uint32_t addr_lo, addr_hi = 0;
> > > +
> > > +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> > > +    if (!bar->size) {
> > > +        return 0;
> > > +    }
> > > +
> > > +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> > > +
> > > +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> > > +                                       PCI_BASE_ADDRESS_MEM_MASK);  
> > 
> > Nit, &= or combine with previous set.
> >   
> > > +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > +        addr_hi = pci_default_read_config(pdev,
> > > +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> > > +    }
> > > +
> > > +    addr = ((uint64_t)addr_hi << 32) | addr_lo;  
> > 
> > Could we use a union?
> >   
> > > +
> > > +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> > > +        return -EINVAL;
> > > +    }  
> > 
> > What specifically are we validating here?  This should be true no
> > matter what we wrote to the BAR or else BAR emulation is broken.  The
> > bits that could make this unaligned are not implemented in the BAR.  
> 
> That I think is based on a comment I asked a few versions back.
> Remember the value being checked here is a value loaded from the
> migration stream; it could be garbage, so it's good to do whatever
> checks you can.

It's not the migration stream, though; we're reading it from config
space emulation.  The migration stream could have written absolutely
anything to the device BAR and this test should still be ok.  PCI BARs
are naturally aligned by definition.  The address bits that could make
the value unaligned are not implemented.  This is why we can determine
the size of the BAR by writing -1 to it.  Thanks,

Alex

> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> > > +{
> > > +    int i, ret;
> > > +
> > > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > > +        ret = vfio_bar_validate(vdev, i);
> > > +        if (ret) {
> > > +            error_report("vfio: BAR address %d validation failed", i);
> > > +            return ret;
> > > +        }
> > > +    }
> > > +    return 0;
> > > +}
> > > +
> > >  static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> > >  {
> > >      VFIOBAR *bar = &vdev->bars[nr];
> > > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
> > >      return OBJECT(vdev);
> > >  }
> > >  
> > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> > > +{
> > > +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> > > +    PCIDevice *pdev = &vdev->pdev;
> > > +    uint16_t pci_cmd;
> > > +    int i;
> > > +
> > > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > > +        uint32_t bar;
> > > +
> > > +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> > > +        qemu_put_be32(f, bar);
> > > +    }
> > > +
> > > +    qemu_put_be32(f, vdev->interrupt);
> > > +    if (vdev->interrupt == VFIO_INT_MSI) {
> > > +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> > > +        bool msi_64bit;
> > > +
> > > +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > > +                                            2);
> > > +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> > > +
> > > +        msi_addr_lo = pci_default_read_config(pdev,
> > > +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> > > +        qemu_put_be32(f, msi_addr_lo);
> > > +
> > > +        if (msi_64bit) {
> > > +            msi_addr_hi = pci_default_read_config(pdev,
> > > +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > > +                                             4);
> > > +        }
> > > +        qemu_put_be32(f, msi_addr_hi);
> > > +
> > > +        msi_data = pci_default_read_config(pdev,
> > > +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> > > +                2);
> > > +        qemu_put_be32(f, msi_data);  
> > 
> > Isn't the data field only a u16?
> >   
> > > +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> > > +        uint16_t offset;
> > > +
> > > +        /* save enable bit and maskall bit */
> > > +        offset = pci_default_read_config(pdev,
> > > +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> > > +        qemu_put_be16(f, offset);
> > > +        msix_save(pdev, f);
> > > +    }
> > > +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> > > +    qemu_put_be16(f, pci_cmd);
> > > +}
> > > +
> > > +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> > > +{
> > > +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> > > +    PCIDevice *pdev = &vdev->pdev;
> > > +    uint32_t interrupt_type;
> > > +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> > > +    uint16_t pci_cmd;
> > > +    bool msi_64bit;
> > > +    int i, ret;
> > > +
> > > +    /* retore pci bar configuration */
> > > +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> > > +    vfio_pci_write_config(pdev, PCI_COMMAND,
> > > +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> > > +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > > +        uint32_t bar = qemu_get_be32(f);
> > > +
> > > +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> > > +    }
> > > +
> > > +    ret = vfio_bars_validate(vdev);
> > > +    if (ret) {
> > > +        return ret;
> > > +    }
> > > +
> > > +    interrupt_type = qemu_get_be32(f);
> > > +
> > > +    if (interrupt_type == VFIO_INT_MSI) {
> > > +        /* restore msi configuration */
> > > +        msi_flags = pci_default_read_config(pdev,
> > > +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> > > +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> > > +
> > > +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > > +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> > > +
> > > +        msi_addr_lo = qemu_get_be32(f);
> > > +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> > > +                              msi_addr_lo, 4);
> > > +
> > > +        msi_addr_hi = qemu_get_be32(f);
> > > +        if (msi_64bit) {
> > > +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > > +                                  msi_addr_hi, 4);
> > > +        }
> > > +        msi_data = qemu_get_be32(f);
> > > +        vfio_pci_write_config(pdev,
> > > +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> > > +                msi_data, 2);
> > > +
> > > +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > > +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> > > +    } else if (interrupt_type == VFIO_INT_MSIX) {
> > > +        uint16_t offset = qemu_get_be16(f);
> > > +
> > > +        /* load enable bit and maskall bit */
> > > +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> > > +                              offset, 2);
> > > +        msix_load(pdev, f);
> > > +    }
> > > +    pci_cmd = qemu_get_be16(f);
> > > +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> > > +    return 0;
> > > +}  
> > 
> > It always seems like there should be a lot more state than this, and I
> > probably sound like a broken record because I ask every time, but maybe
> > that's a good indication that we (or at least I) need a comment
> > explaining why we only care about these.  For example, what if we
> > migrate a device in the D3 power state, don't we need to account for
> > the state stored in the PM capability or does the device wake up into
> > D0 auto-magically after migration?  I think we could repeat that
> > question for every capability that can be modified.  Even for the MSI/X
> > cases, the interrupt may not be active, but there could be state in
> > virtual config space that would be different on the target.  For
> > example, if we migrate with a device in INTx mode where the guest had
> > written vector fields on the source, but only writes the enable bit on
> > the target, can we seamlessly figure out the rest?  For other
> > capabilities, that state may represent config space changes written
> > through to the physical device and represent a functional difference on
> > the target.  Thanks,
> > 
> > Alex
> >   
> > > +
> > >  static VFIODeviceOps vfio_pci_ops = {
> > >      .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
> > >      .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
> > >      .vfio_eoi = vfio_intx_eoi,
> > >      .vfio_get_object = vfio_pci_get_object,
> > > +    .vfio_save_config = vfio_pci_save_config,
> > > +    .vfio_load_config = vfio_pci_load_config,
> > >  };
> > >  
> > >  int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> > > index 74261feaeac9..d69a7f3ae31e 100644
> > > --- a/include/hw/vfio/vfio-common.h
> > > +++ b/include/hw/vfio/vfio-common.h
> > > @@ -120,6 +120,8 @@ struct VFIODeviceOps {
> > >      int (*vfio_hot_reset_multi)(VFIODevice *vdev);
> > >      void (*vfio_eoi)(VFIODevice *vdev);
> > >      Object *(*vfio_get_object)(VFIODevice *vdev);
> > > +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
> > > +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
> > >  };
> > >  
> > >  typedef struct VFIOGroup {  
> >   
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert March 26, 2020, 5:46 p.m. UTC | #4
* Kirti Wankhede (kwankhede@nvidia.com) wrote:
> These functions save and restore PCI device specific data - config
> space of PCI device.
> Tested save and restore with MSI and MSIX type.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>  include/hw/vfio/vfio-common.h |   2 +
>  2 files changed, 165 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 6c77c12e44b9..8deb11e87ef7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -41,6 +41,7 @@
>  #include "trace.h"
>  #include "qapi/error.h"
>  #include "migration/blocker.h"
> +#include "migration/qemu-file.h"
>  
>  #define TYPE_VFIO_PCI "vfio-pci"
>  #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>      }
>  }
>  
> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    uint64_t addr;
> +    uint32_t addr_lo, addr_hi = 0;
> +
> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> +    if (!bar->size) {
> +        return 0;
> +    }
> +
> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> +
> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> +                                       PCI_BASE_ADDRESS_MEM_MASK);
> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +        addr_hi = pci_default_read_config(pdev,
> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> +    }
> +
> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
> +
> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> +{
> +    int i, ret;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        ret = vfio_bar_validate(vdev, i);
> +        if (ret) {
> +            error_report("vfio: BAR address %d validation failed", i);
> +            return ret;
> +        }
> +    }
> +    return 0;
> +}
> +
>  static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>  {
>      VFIOBAR *bar = &vdev->bars[nr];
> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>      return OBJECT(vdev);
>  }
>  
> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint16_t pci_cmd;
> +    int i;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        uint32_t bar;
> +
> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> +        qemu_put_be32(f, bar);
> +    }
> +
> +    qemu_put_be32(f, vdev->interrupt);
> +    if (vdev->interrupt == VFIO_INT_MSI) {
> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +        bool msi_64bit;
> +
> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                                            2);
> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +        msi_addr_lo = pci_default_read_config(pdev,
> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> +        qemu_put_be32(f, msi_addr_lo);
> +
> +        if (msi_64bit) {
> +            msi_addr_hi = pci_default_read_config(pdev,
> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                             4);
> +        }
> +        qemu_put_be32(f, msi_addr_hi);
> +
> +        msi_data = pci_default_read_config(pdev,
> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +                2);
> +        qemu_put_be32(f, msi_data);
> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> +        uint16_t offset;
> +
> +        /* save enable bit and maskall bit */
> +        offset = pci_default_read_config(pdev,
> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> +        qemu_put_be16(f, offset);
> +        msix_save(pdev, f);
> +    }
> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    qemu_put_be16(f, pci_cmd);
> +}
> +
> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint32_t interrupt_type;
> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +    uint16_t pci_cmd;
> +    bool msi_64bit;
> +    int i, ret;
> +
> +    /* retore pci bar configuration */
> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        uint32_t bar = qemu_get_be32(f);
> +
> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> +    }
> +
> +    ret = vfio_bars_validate(vdev);

This isn't quite what I'd expected, since that validate is reading what
you read back; I'd have thought you'd validate the bar value before
writing it to the device.
(I'm also surprised you're only reading 32bit here?)

> +    if (ret) {
> +        return ret;
> +    }
> +
> +    interrupt_type = qemu_get_be32(f);
> +
> +    if (interrupt_type == VFIO_INT_MSI) {
> +        /* restore msi configuration */
> +        msi_flags = pci_default_read_config(pdev,
> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> +
> +        msi_addr_lo = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> +                              msi_addr_lo, 4);
> +
> +        msi_addr_hi = qemu_get_be32(f);
> +        if (msi_64bit) {
> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                  msi_addr_hi, 4);
> +        }
> +        msi_data = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev,
> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +                msi_data, 2);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> +        uint16_t offset = qemu_get_be16(f);
> +
> +        /* load enable bit and maskall bit */
> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> +                              offset, 2);
> +        msix_load(pdev, f);
> +    }
> +    pci_cmd = qemu_get_be16(f);
> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> +    return 0;
> +}
> +

While I don't know PCI as well as Alex, I share the worry about what
happens when you decide to want to save more information about the
device; you've not got any place holders where you can add anything; and
since it's all hand-coded (rather than using vmstate) it's only going to
get hairier.

Dave

>  static VFIODeviceOps vfio_pci_ops = {
>      .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
>      .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
>      .vfio_eoi = vfio_intx_eoi,
>      .vfio_get_object = vfio_pci_get_object,
> +    .vfio_save_config = vfio_pci_save_config,
> +    .vfio_load_config = vfio_pci_load_config,
>  };
>  
>  int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 74261feaeac9..d69a7f3ae31e 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -120,6 +120,8 @@ struct VFIODeviceOps {
>      int (*vfio_hot_reset_multi)(VFIODevice *vdev);
>      void (*vfio_eoi)(VFIODevice *vdev);
>      Object *(*vfio_get_object)(VFIODevice *vdev);
> +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
> +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>  };
>  
>  typedef struct VFIOGroup {
> -- 
> 2.7.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Longpeng(Mike) April 7, 2020, 4:10 a.m. UTC | #5
On 2020/3/25 5:09, Kirti Wankhede wrote:
> These functions save and restore PCI device specific data - config
> space of PCI device.
> Tested save and restore with MSI and MSIX type.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>  include/hw/vfio/vfio-common.h |   2 +
>  2 files changed, 165 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 6c77c12e44b9..8deb11e87ef7 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -41,6 +41,7 @@
>  #include "trace.h"
>  #include "qapi/error.h"
>  #include "migration/blocker.h"
> +#include "migration/qemu-file.h"
>  
>  #define TYPE_VFIO_PCI "vfio-pci"
>  #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>      }
>  }
>  
> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> +{
> +    PCIDevice *pdev = &vdev->pdev;
> +    VFIOBAR *bar = &vdev->bars[nr];
> +    uint64_t addr;
> +    uint32_t addr_lo, addr_hi = 0;
> +
> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> +    if (!bar->size) {
> +        return 0;
> +    }
> +
> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> +
> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> +                                       PCI_BASE_ADDRESS_MEM_MASK);
> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +        addr_hi = pci_default_read_config(pdev,
> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> +    }
> +
> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
> +
> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> +{
> +    int i, ret;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        ret = vfio_bar_validate(vdev, i);
> +        if (ret) {
> +            error_report("vfio: BAR address %d validation failed", i);
> +            return ret;
> +        }
> +    }
> +    return 0;
> +}
> +
>  static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>  {
>      VFIOBAR *bar = &vdev->bars[nr];
> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>      return OBJECT(vdev);
>  }
>  
> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint16_t pci_cmd;
> +    int i;
> +
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        uint32_t bar;
> +
> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> +        qemu_put_be32(f, bar);
> +    }
> +
> +    qemu_put_be32(f, vdev->interrupt);
> +    if (vdev->interrupt == VFIO_INT_MSI) {
> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +        bool msi_64bit;
> +
> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                                            2);
> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +        msi_addr_lo = pci_default_read_config(pdev,
> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> +        qemu_put_be32(f, msi_addr_lo);
> +
> +        if (msi_64bit) {
> +            msi_addr_hi = pci_default_read_config(pdev,
> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                             4);
> +        }
> +        qemu_put_be32(f, msi_addr_hi);
> +
> +        msi_data = pci_default_read_config(pdev,
> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +                2);
> +        qemu_put_be32(f, msi_data);
> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> +        uint16_t offset;
> +
> +        /* save enable bit and maskall bit */
> +        offset = pci_default_read_config(pdev,
> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> +        qemu_put_be16(f, offset);
> +        msix_save(pdev, f);
> +    }
> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    qemu_put_be16(f, pci_cmd);
> +}
> +
> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint32_t interrupt_type;
> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +    uint16_t pci_cmd;
> +    bool msi_64bit;
> +    int i, ret;
> +
> +    /* retore pci bar configuration */
> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        uint32_t bar = qemu_get_be32(f);
> +
> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> +    }
> +
> +    ret = vfio_bars_validate(vdev);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    interrupt_type = qemu_get_be32(f);
> +
> +    if (interrupt_type == VFIO_INT_MSI) {
> +        /* restore msi configuration */
> +        msi_flags = pci_default_read_config(pdev,
> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> +
> +        msi_addr_lo = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> +                              msi_addr_lo, 4);
> +
> +        msi_addr_hi = qemu_get_be32(f);
> +        if (msi_64bit) {
> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                  msi_addr_hi, 4);
> +        }
> +        msi_data = qemu_get_be32(f);
> +        vfio_pci_write_config(pdev,
> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +                msi_data, 2);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> +        uint16_t offset = qemu_get_be16(f);
> +
> +        /* load enable bit and maskall bit */
> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> +                              offset, 2);
> +        msix_load(pdev, f);
Hi Kirti, Alex

'msix_load' here may increase the downtime. Our migrate-cap device has 128 MSI-X
interrupts and the guest OS enables all of them, so it costs a lot of time (nearly
1s) to do 'msix_load', because in 'vfio_msix_vector_do_use' we need to disable all
old interrupts and then append a new one.

What's your opinion?

> +    }
> +    pci_cmd = qemu_get_be16(f);
> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> +    return 0;
> +}
> +
>  static VFIODeviceOps vfio_pci_ops = {
>      .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
>      .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
>      .vfio_eoi = vfio_intx_eoi,
>      .vfio_get_object = vfio_pci_get_object,
> +    .vfio_save_config = vfio_pci_save_config,
> +    .vfio_load_config = vfio_pci_load_config,
>  };
>  
>  int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 74261feaeac9..d69a7f3ae31e 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -120,6 +120,8 @@ struct VFIODeviceOps {
>      int (*vfio_hot_reset_multi)(VFIODevice *vdev);
>      void (*vfio_eoi)(VFIODevice *vdev);
>      Object *(*vfio_get_object)(VFIODevice *vdev);
> +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
> +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>  };
>  
>  typedef struct VFIOGroup {
>
Kirti Wankhede May 4, 2020, 11:18 p.m. UTC | #6
On 3/26/2020 1:26 AM, Alex Williamson wrote:
> On Wed, 25 Mar 2020 02:39:02 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> These functions save and restore PCI device specific data - config
>> space of PCI device.
>> Tested save and restore with MSI and MSIX type.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>>   include/hw/vfio/vfio-common.h |   2 +
>>   2 files changed, 165 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 6c77c12e44b9..8deb11e87ef7 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -41,6 +41,7 @@
>>   #include "trace.h"
>>   #include "qapi/error.h"
>>   #include "migration/blocker.h"
>> +#include "migration/qemu-file.h"
>>   
>>   #define TYPE_VFIO_PCI "vfio-pci"
>>   #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
>> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>>       }
>>   }
>>   
>> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
>> +{
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    VFIOBAR *bar = &vdev->bars[nr];
>> +    uint64_t addr;
>> +    uint32_t addr_lo, addr_hi = 0;
>> +
>> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
>> +    if (!bar->size) {
>> +        return 0;
>> +    }
>> +
>> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
>> +
>> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
>> +                                       PCI_BASE_ADDRESS_MEM_MASK);
> 
> Nit, &= or combine with previous set.
> 
>> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
>> +        addr_hi = pci_default_read_config(pdev,
>> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
>> +    }
>> +
>> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
> 
> Could we use a union?
> 
>> +
>> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
>> +        return -EINVAL;
>> +    }
> 
> What specifically are we validating here?  This should be true no
> matter what we wrote to the BAR or else BAR emulation is broken.  The
> bits that could make this unaligned are not implemented in the BAR.
> 
>> +
>> +    return 0;
>> +}
>> +
>> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
>> +{
>> +    int i, ret;
>> +
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        ret = vfio_bar_validate(vdev, i);
>> +        if (ret) {
>> +            error_report("vfio: BAR address %d validation failed", i);
>> +            return ret;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>>   static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>>   {
>>       VFIOBAR *bar = &vdev->bars[nr];
>> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>>       return OBJECT(vdev);
>>   }
>>   
>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    uint16_t pci_cmd;
>> +    int i;
>> +
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        uint32_t bar;
>> +
>> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
>> +        qemu_put_be32(f, bar);
>> +    }
>> +
>> +    qemu_put_be32(f, vdev->interrupt);
>> +    if (vdev->interrupt == VFIO_INT_MSI) {
>> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>> +        bool msi_64bit;
>> +
>> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                                            2);
>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>> +
>> +        msi_addr_lo = pci_default_read_config(pdev,
>> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
>> +        qemu_put_be32(f, msi_addr_lo);
>> +
>> +        if (msi_64bit) {
>> +            msi_addr_hi = pci_default_read_config(pdev,
>> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>> +                                             4);
>> +        }
>> +        qemu_put_be32(f, msi_addr_hi);
>> +
>> +        msi_data = pci_default_read_config(pdev,
>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>> +                2);
>> +        qemu_put_be32(f, msi_data);
> 
> Isn't the data field only a u16?
> 

Yes, fixing it.

>> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
>> +        uint16_t offset;
>> +
>> +        /* save enable bit and maskall bit */
>> +        offset = pci_default_read_config(pdev,
>> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
>> +        qemu_put_be16(f, offset);
>> +        msix_save(pdev, f);
>> +    }
>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>> +    qemu_put_be16(f, pci_cmd);
>> +}
>> +
>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    uint32_t interrupt_type;
>> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>> +    uint16_t pci_cmd;
>> +    bool msi_64bit;
>> +    int i, ret;
>> +
>> +    /* retore pci bar configuration */
>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>> +    vfio_pci_write_config(pdev, PCI_COMMAND,
>> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        uint32_t bar = qemu_get_be32(f);
>> +
>> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
>> +    }
>> +
>> +    ret = vfio_bars_validate(vdev);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    interrupt_type = qemu_get_be32(f);
>> +
>> +    if (interrupt_type == VFIO_INT_MSI) {
>> +        /* restore msi configuration */
>> +        msi_flags = pci_default_read_config(pdev,
>> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>> +
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
>> +
>> +        msi_addr_lo = qemu_get_be32(f);
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
>> +                              msi_addr_lo, 4);
>> +
>> +        msi_addr_hi = qemu_get_be32(f);
>> +        if (msi_64bit) {
>> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>> +                                  msi_addr_hi, 4);
>> +        }
>> +        msi_data = qemu_get_be32(f);
>> +        vfio_pci_write_config(pdev,
>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>> +                msi_data, 2);
>> +
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
>> +    } else if (interrupt_type == VFIO_INT_MSIX) {
>> +        uint16_t offset = qemu_get_be16(f);
>> +
>> +        /* load enable bit and maskall bit */
>> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
>> +                              offset, 2);
>> +        msix_load(pdev, f);
>> +    }
>> +    pci_cmd = qemu_get_be16(f);
>> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
>> +    return 0;
>> +}
> 
> It always seems like there should be a lot more state than this, and I
> probably sound like a broken record because I ask every time, but maybe
> that's a good indication that we (or at least I) need a comment
> explaining why we only care about these.  For example, what if we
> migrate a device in the D3 power state, don't we need to account for
> the state stored in the PM capability or does the device wake up into
> D0 auto-magically after migration?  I think we could repeat that
> question for every capability that can be modified.  Even for the MSI/X
> cases, the interrupt may not be active, but there could be state in
> virtual config space that would be different on the target.  For
> example, if we migrate with a device in INTx mode where the guest had
> written vector fields on the source, but only writes the enable bit on
> the target, can we seamlessly figure out the rest?  For other
> capabilities, that state may represent config space changes written
> through to the physical device and represent a functional difference on
> the target.  Thanks,
>

These are a very basic set of registers from config state. The others are
more vendor specific, and the vendor driver can save and restore them in its
own data. I don't think we have to take care of all those vendor-specific
fields here.

Thanks,
Kirti

> Alex
> 
>> +
>>   static VFIODeviceOps vfio_pci_ops = {
>>       .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
>>       .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
>>       .vfio_eoi = vfio_intx_eoi,
>>       .vfio_get_object = vfio_pci_get_object,
>> +    .vfio_save_config = vfio_pci_save_config,
>> +    .vfio_load_config = vfio_pci_load_config,
>>   };
>>   
>>   int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 74261feaeac9..d69a7f3ae31e 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -120,6 +120,8 @@ struct VFIODeviceOps {
>>       int (*vfio_hot_reset_multi)(VFIODevice *vdev);
>>       void (*vfio_eoi)(VFIODevice *vdev);
>>       Object *(*vfio_get_object)(VFIODevice *vdev);
>> +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
>> +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>>   };
>>   
>>   typedef struct VFIOGroup {
>
Kirti Wankhede May 4, 2020, 11:19 p.m. UTC | #7
On 3/26/2020 11:16 PM, Dr. David Alan Gilbert wrote:
> * Kirti Wankhede (kwankhede@nvidia.com) wrote:
>> These functions save and restore PCI device specific data - config
>> space of PCI device.
>> Tested save and restore with MSI and MSIX type.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>>   include/hw/vfio/vfio-common.h |   2 +
>>   2 files changed, 165 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 6c77c12e44b9..8deb11e87ef7 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -41,6 +41,7 @@
>>   #include "trace.h"
>>   #include "qapi/error.h"
>>   #include "migration/blocker.h"
>> +#include "migration/qemu-file.h"
>>   
>>   #define TYPE_VFIO_PCI "vfio-pci"
>>   #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
>> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>>       }
>>   }
>>   
>> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
>> +{
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    VFIOBAR *bar = &vdev->bars[nr];
>> +    uint64_t addr;
>> +    uint32_t addr_lo, addr_hi = 0;
>> +
>> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
>> +    if (!bar->size) {
>> +        return 0;
>> +    }
>> +
>> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
>> +
>> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
>> +                                       PCI_BASE_ADDRESS_MEM_MASK);
>> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
>> +        addr_hi = pci_default_read_config(pdev,
>> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
>> +    }
>> +
>> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
>> +
>> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
>> +{
>> +    int i, ret;
>> +
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        ret = vfio_bar_validate(vdev, i);
>> +        if (ret) {
>> +            error_report("vfio: BAR address %d validation failed", i);
>> +            return ret;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>>   static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>>   {
>>       VFIOBAR *bar = &vdev->bars[nr];
>> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>>       return OBJECT(vdev);
>>   }
>>   
>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    uint16_t pci_cmd;
>> +    int i;
>> +
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        uint32_t bar;
>> +
>> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
>> +        qemu_put_be32(f, bar);
>> +    }
>> +
>> +    qemu_put_be32(f, vdev->interrupt);
>> +    if (vdev->interrupt == VFIO_INT_MSI) {
>> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>> +        bool msi_64bit;
>> +
>> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                                            2);
>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>> +
>> +        msi_addr_lo = pci_default_read_config(pdev,
>> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
>> +        qemu_put_be32(f, msi_addr_lo);
>> +
>> +        if (msi_64bit) {
>> +            msi_addr_hi = pci_default_read_config(pdev,
>> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>> +                                             4);
>> +        }
>> +        qemu_put_be32(f, msi_addr_hi);
>> +
>> +        msi_data = pci_default_read_config(pdev,
>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>> +                2);
>> +        qemu_put_be32(f, msi_data);
>> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
>> +        uint16_t offset;
>> +
>> +        /* save enable bit and maskall bit */
>> +        offset = pci_default_read_config(pdev,
>> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
>> +        qemu_put_be16(f, offset);
>> +        msix_save(pdev, f);
>> +    }
>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>> +    qemu_put_be16(f, pci_cmd);
>> +}
>> +
>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    uint32_t interrupt_type;
>> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>> +    uint16_t pci_cmd;
>> +    bool msi_64bit;
>> +    int i, ret;
>> +
>> +    /* retore pci bar configuration */
>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>> +    vfio_pci_write_config(pdev, PCI_COMMAND,
>> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        uint32_t bar = qemu_get_be32(f);
>> +
>> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
>> +    }
>> +
>> +    ret = vfio_bars_validate(vdev);
> 
> This isn't quite what I'd expected, since that validate is reading what
> you read back; I'd have thought you'd validate the bar value before
> writing it to the device.
> (I'm also surprised you're only reading 32bit here?)
> 

Sorry, then I didn't understand exactly what validation needs to be done here.

Reading 32 bits here because all PCI_BASE_ADDRESS_* registers are 32-bit.
When a BAR is 64-bit, two adjacent BAR registers are combined to hold its
high and low halves. For example, if BAR0 is 32-bit and BAR1 is 64-bit,
PCI_BASE_ADDRESS_0 is used for BAR0, PCI_BASE_ADDRESS_1 is BAR1_lo and
PCI_BASE_ADDRESS_2 is BAR1_hi; then PCI_BASE_ADDRESS_3 will have the
BAR2 address. This is according to the PCIe specs.


>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    interrupt_type = qemu_get_be32(f);
>> +
>> +    if (interrupt_type == VFIO_INT_MSI) {
>> +        /* restore msi configuration */
>> +        msi_flags = pci_default_read_config(pdev,
>> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>> +
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
>> +
>> +        msi_addr_lo = qemu_get_be32(f);
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
>> +                              msi_addr_lo, 4);
>> +
>> +        msi_addr_hi = qemu_get_be32(f);
>> +        if (msi_64bit) {
>> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>> +                                  msi_addr_hi, 4);
>> +        }
>> +        msi_data = qemu_get_be32(f);
>> +        vfio_pci_write_config(pdev,
>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>> +                msi_data, 2);
>> +
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
>> +    } else if (interrupt_type == VFIO_INT_MSIX) {
>> +        uint16_t offset = qemu_get_be16(f);
>> +
>> +        /* load enable bit and maskall bit */
>> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
>> +                              offset, 2);
>> +        msix_load(pdev, f);
>> +    }
>> +    pci_cmd = qemu_get_be16(f);
>> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
>> +    return 0;
>> +}
>> +
> 
> While I don't know PCI as well as Alex, I share the worry about what
> happens when you decide to want to save more information about the
> device; you've not got any place holders where you can add anything; and
> since it's all hand-coded (rather than using vmstate) it's only going to
> get hairier.
> 

These are very basic registers of PCI config space. When there are more,
which are generally vendor specific, the vendor driver should take care of
saving and restoring the rest of the data in config space.

Thanks,
Kirti

> Dave
> 
>>   static VFIODeviceOps vfio_pci_ops = {
>>       .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
>>       .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
>>       .vfio_eoi = vfio_intx_eoi,
>>       .vfio_get_object = vfio_pci_get_object,
>> +    .vfio_save_config = vfio_pci_save_config,
>> +    .vfio_load_config = vfio_pci_load_config,
>>   };
>>   
>>   int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 74261feaeac9..d69a7f3ae31e 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -120,6 +120,8 @@ struct VFIODeviceOps {
>>       int (*vfio_hot_reset_multi)(VFIODevice *vdev);
>>       void (*vfio_eoi)(VFIODevice *vdev);
>>       Object *(*vfio_get_object)(VFIODevice *vdev);
>> +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
>> +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>>   };
>>   
>>   typedef struct VFIOGroup {
>> -- 
>> 2.7.0
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Kirti Wankhede May 4, 2020, 11:21 p.m. UTC | #8
On 4/7/2020 9:40 AM, Longpeng (Mike, Cloud Infrastructure Service 
Product Dept.) wrote:
> 
> 
> On 2020/3/25 5:09, Kirti Wankhede wrote:
>> These functions save and restore PCI device specific data - config
>> space of PCI device.
>> Tested save and restore with MSI and MSIX type.
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>>   include/hw/vfio/vfio-common.h |   2 +
>>   2 files changed, 165 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index 6c77c12e44b9..8deb11e87ef7 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -41,6 +41,7 @@
>>   #include "trace.h"
>>   #include "qapi/error.h"
>>   #include "migration/blocker.h"
>> +#include "migration/qemu-file.h"
>>   
>>   #define TYPE_VFIO_PCI "vfio-pci"
>>   #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
>> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>>       }
>>   }
>>   
>> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
>> +{
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    VFIOBAR *bar = &vdev->bars[nr];
>> +    uint64_t addr;
>> +    uint32_t addr_lo, addr_hi = 0;
>> +
>> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
>> +    if (!bar->size) {
>> +        return 0;
>> +    }
>> +
>> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
>> +
>> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
>> +                                       PCI_BASE_ADDRESS_MEM_MASK);
>> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
>> +        addr_hi = pci_default_read_config(pdev,
>> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
>> +    }
>> +
>> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
>> +
>> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
>> +{
>> +    int i, ret;
>> +
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        ret = vfio_bar_validate(vdev, i);
>> +        if (ret) {
>> +            error_report("vfio: BAR address %d validation failed", i);
>> +            return ret;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>>   static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>>   {
>>       VFIOBAR *bar = &vdev->bars[nr];
>> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>>       return OBJECT(vdev);
>>   }
>>   
>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    uint16_t pci_cmd;
>> +    int i;
>> +
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        uint32_t bar;
>> +
>> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
>> +        qemu_put_be32(f, bar);
>> +    }
>> +
>> +    qemu_put_be32(f, vdev->interrupt);
>> +    if (vdev->interrupt == VFIO_INT_MSI) {
>> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>> +        bool msi_64bit;
>> +
>> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                                            2);
>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>> +
>> +        msi_addr_lo = pci_default_read_config(pdev,
>> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
>> +        qemu_put_be32(f, msi_addr_lo);
>> +
>> +        if (msi_64bit) {
>> +            msi_addr_hi = pci_default_read_config(pdev,
>> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>> +                                             4);
>> +        }
>> +        qemu_put_be32(f, msi_addr_hi);
>> +
>> +        msi_data = pci_default_read_config(pdev,
>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>> +                2);
>> +        qemu_put_be32(f, msi_data);
>> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
>> +        uint16_t offset;
>> +
>> +        /* save enable bit and maskall bit */
>> +        offset = pci_default_read_config(pdev,
>> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
>> +        qemu_put_be16(f, offset);
>> +        msix_save(pdev, f);
>> +    }
>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>> +    qemu_put_be16(f, pci_cmd);
>> +}
>> +
>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>> +{
>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>> +    PCIDevice *pdev = &vdev->pdev;
>> +    uint32_t interrupt_type;
>> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>> +    uint16_t pci_cmd;
>> +    bool msi_64bit;
>> +    int i, ret;
>> +
>> +    /* retore pci bar configuration */
>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>> +    vfio_pci_write_config(pdev, PCI_COMMAND,
>> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>> +        uint32_t bar = qemu_get_be32(f);
>> +
>> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
>> +    }
>> +
>> +    ret = vfio_bars_validate(vdev);
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    interrupt_type = qemu_get_be32(f);
>> +
>> +    if (interrupt_type == VFIO_INT_MSI) {
>> +        /* restore msi configuration */
>> +        msi_flags = pci_default_read_config(pdev,
>> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>> +
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
>> +
>> +        msi_addr_lo = qemu_get_be32(f);
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
>> +                              msi_addr_lo, 4);
>> +
>> +        msi_addr_hi = qemu_get_be32(f);
>> +        if (msi_64bit) {
>> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>> +                                  msi_addr_hi, 4);
>> +        }
>> +        msi_data = qemu_get_be32(f);
>> +        vfio_pci_write_config(pdev,
>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>> +                msi_data, 2);
>> +
>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
>> +    } else if (interrupt_type == VFIO_INT_MSIX) {
>> +        uint16_t offset = qemu_get_be16(f);
>> +
>> +        /* load enable bit and maskall bit */
>> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
>> +                              offset, 2);
>> +        msix_load(pdev, f);
> Hi Kirti, Alex
> 
> 'msix_load' here may increase the downtime. Our migration-capable device has 128
> MSI-X interrupts and the guest OS enables all of them, so 'msix_load' costs a lot
> of time (nearly 1s), because in 'vfio_msix_vector_do_use' we need to disable all
> the old interrupts and then append a new one.
> 
> What's your opinion?
> 

This will be at the destination, so the device should not be running, which
means interrupts should already be disabled, right?
You only need to append the new ones.

Thanks,
Kirti

>> +    }
>> +    pci_cmd = qemu_get_be16(f);
>> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
>> +    return 0;
>> +}
>> +
>>   static VFIODeviceOps vfio_pci_ops = {
>>       .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
>>       .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
>>       .vfio_eoi = vfio_intx_eoi,
>>       .vfio_get_object = vfio_pci_get_object,
>> +    .vfio_save_config = vfio_pci_save_config,
>> +    .vfio_load_config = vfio_pci_load_config,
>>   };
>>   
>>   int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 74261feaeac9..d69a7f3ae31e 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -120,6 +120,8 @@ struct VFIODeviceOps {
>>       int (*vfio_hot_reset_multi)(VFIODevice *vdev);
>>       void (*vfio_eoi)(VFIODevice *vdev);
>>       Object *(*vfio_get_object)(VFIODevice *vdev);
>> +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
>> +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>>   };
>>   
>>   typedef struct VFIOGroup {
>>
>
Alex Williamson May 5, 2020, 4:37 a.m. UTC | #9
On Tue, 5 May 2020 04:48:37 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 3/26/2020 1:26 AM, Alex Williamson wrote:
> > On Wed, 25 Mar 2020 02:39:02 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> These functions save and restore PCI device specific data - config
> >> space of PCI device.
> >> Tested save and restore with MSI and MSIX type.
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
> >>   include/hw/vfio/vfio-common.h |   2 +
> >>   2 files changed, 165 insertions(+)
> >>
> >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >> index 6c77c12e44b9..8deb11e87ef7 100644
> >> --- a/hw/vfio/pci.c
> >> +++ b/hw/vfio/pci.c
> >> @@ -41,6 +41,7 @@
> >>   #include "trace.h"
> >>   #include "qapi/error.h"
> >>   #include "migration/blocker.h"
> >> +#include "migration/qemu-file.h"
> >>   
> >>   #define TYPE_VFIO_PCI "vfio-pci"
> >>   #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> >> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
> >>       }
> >>   }
> >>   
> >> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> >> +{
> >> +    PCIDevice *pdev = &vdev->pdev;
> >> +    VFIOBAR *bar = &vdev->bars[nr];
> >> +    uint64_t addr;
> >> +    uint32_t addr_lo, addr_hi = 0;
> >> +
> >> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> >> +    if (!bar->size) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> >> +
> >> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> >> +                                       PCI_BASE_ADDRESS_MEM_MASK);  
> > 
> > Nit, &= or combine with previous set.
> >   
> >> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> >> +        addr_hi = pci_default_read_config(pdev,
> >> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> >> +    }
> >> +
> >> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;  
> > 
> > Could we use a union?
> >   
> >> +
> >> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> >> +        return -EINVAL;
> >> +    }  
> > 
> > What specifically are we validating here?  This should be true no
> > matter what we wrote to the BAR or else BAR emulation is broken.  The
> > bits that could make this unaligned are not implemented in the BAR.
> >   
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> >> +{
> >> +    int i, ret;
> >> +
> >> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >> +        ret = vfio_bar_validate(vdev, i);
> >> +        if (ret) {
> >> +            error_report("vfio: BAR address %d validation failed", i);
> >> +            return ret;
> >> +        }
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >>   static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> >>   {
> >>       VFIOBAR *bar = &vdev->bars[nr];
> >> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
> >>       return OBJECT(vdev);
> >>   }
> >>   
> >> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> >> +{
> >> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> >> +    PCIDevice *pdev = &vdev->pdev;
> >> +    uint16_t pci_cmd;
> >> +    int i;
> >> +
> >> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >> +        uint32_t bar;
> >> +
> >> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> >> +        qemu_put_be32(f, bar);
> >> +    }
> >> +
> >> +    qemu_put_be32(f, vdev->interrupt);
> >> +    if (vdev->interrupt == VFIO_INT_MSI) {
> >> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> >> +        bool msi_64bit;
> >> +
> >> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >> +                                            2);
> >> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> >> +
> >> +        msi_addr_lo = pci_default_read_config(pdev,
> >> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> >> +        qemu_put_be32(f, msi_addr_lo);
> >> +
> >> +        if (msi_64bit) {
> >> +            msi_addr_hi = pci_default_read_config(pdev,
> >> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> >> +                                             4);
> >> +        }
> >> +        qemu_put_be32(f, msi_addr_hi);
> >> +
> >> +        msi_data = pci_default_read_config(pdev,
> >> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> >> +                2);
> >> +        qemu_put_be32(f, msi_data);  
> > 
> > Isn't the data field only a u16?
> >   
> 
> Yes, fixing it.
> 
> >> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> >> +        uint16_t offset;
> >> +
> >> +        /* save enable bit and maskall bit */
> >> +        offset = pci_default_read_config(pdev,
> >> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> >> +        qemu_put_be16(f, offset);
> >> +        msix_save(pdev, f);
> >> +    }
> >> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> >> +    qemu_put_be16(f, pci_cmd);
> >> +}
> >> +
> >> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> >> +{
> >> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> >> +    PCIDevice *pdev = &vdev->pdev;
> >> +    uint32_t interrupt_type;
> >> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> >> +    uint16_t pci_cmd;
> >> +    bool msi_64bit;
> >> +    int i, ret;
> >> +
> >> +    /* retore pci bar configuration */
> >> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> >> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> >> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> >> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >> +        uint32_t bar = qemu_get_be32(f);
> >> +
> >> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> >> +    }
> >> +
> >> +    ret = vfio_bars_validate(vdev);
> >> +    if (ret) {
> >> +        return ret;
> >> +    }
> >> +
> >> +    interrupt_type = qemu_get_be32(f);
> >> +
> >> +    if (interrupt_type == VFIO_INT_MSI) {
> >> +        /* restore msi configuration */
> >> +        msi_flags = pci_default_read_config(pdev,
> >> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> >> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> >> +
> >> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> >> +
> >> +        msi_addr_lo = qemu_get_be32(f);
> >> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> >> +                              msi_addr_lo, 4);
> >> +
> >> +        msi_addr_hi = qemu_get_be32(f);
> >> +        if (msi_64bit) {
> >> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> >> +                                  msi_addr_hi, 4);
> >> +        }
> >> +        msi_data = qemu_get_be32(f);
> >> +        vfio_pci_write_config(pdev,
> >> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> >> +                msi_data, 2);
> >> +
> >> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> >> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> >> +        uint16_t offset = qemu_get_be16(f);
> >> +
> >> +        /* load enable bit and maskall bit */
> >> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> >> +                              offset, 2);
> >> +        msix_load(pdev, f);
> >> +    }
> >> +    pci_cmd = qemu_get_be16(f);
> >> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> >> +    return 0;
> >> +}  
> > 
> > It always seems like there should be a lot more state than this, and I
> > probably sound like a broken record because I ask every time, but maybe
> > that's a good indication that we (or at least I) need a comment
> > explaining why we only care about these.  For example, what if we
> > migrate a device in the D3 power state, don't we need to account for
> > the state stored in the PM capability or does the device wake up into
> > D0 auto-magically after migration?  I think we could repeat that
> > question for every capability that can be modified.  Even for the MSI/X
> > cases, the interrupt may not be active, but there could be state in
> > virtual config space that would be different on the target.  For
> > example, if we migrate with a device in INTx mode where the guest had
> > written vector fields on the source, but only writes the enable bit on
> > the target, can we seamlessly figure out the rest?  For other
> > capabilities, that state may represent config space changes written
> > through to the physical device and represent a functional difference on
> > the target.  Thanks,
> >  
> 
> These are very basic set of registers from config state. Other are more 
> of vendor specific which vendor driver can save and restore in their own 
> data. I don't think we have to take care of all those vendor specific 
> fields here.

That had not been clear to me.  Intel folks, is this your understanding
regarding the responsibility of the user to save and restore config
space of the device as part of the vendor provided migration stream
data?  Thanks,

Alex
Yan Zhao May 6, 2020, 6:11 a.m. UTC | #10
On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:
> On Tue, 5 May 2020 04:48:37 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 3/26/2020 1:26 AM, Alex Williamson wrote:
> > > On Wed, 25 Mar 2020 02:39:02 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> These functions save and restore PCI device specific data - config
> > >> space of PCI device.
> > >> Tested save and restore with MSI and MSIX type.
> > >>
> > >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >> ---
> > >>   hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
> > >>   include/hw/vfio/vfio-common.h |   2 +
> > >>   2 files changed, 165 insertions(+)
> > >>
> > >> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > >> index 6c77c12e44b9..8deb11e87ef7 100644
> > >> --- a/hw/vfio/pci.c
> > >> +++ b/hw/vfio/pci.c
> > >> @@ -41,6 +41,7 @@
> > >>   #include "trace.h"
> > >>   #include "qapi/error.h"
> > >>   #include "migration/blocker.h"
> > >> +#include "migration/qemu-file.h"
> > >>   
> > >>   #define TYPE_VFIO_PCI "vfio-pci"
> > >>   #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> > >> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
> > >>       }
> > >>   }
> > >>   
> > >> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> > >> +{
> > >> +    PCIDevice *pdev = &vdev->pdev;
> > >> +    VFIOBAR *bar = &vdev->bars[nr];
> > >> +    uint64_t addr;
> > >> +    uint32_t addr_lo, addr_hi = 0;
> > >> +
> > >> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> > >> +    if (!bar->size) {
> > >> +        return 0;
> > >> +    }
> > >> +
> > >> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> > >> +
> > >> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> > >> +                                       PCI_BASE_ADDRESS_MEM_MASK);  
> > > 
> > > Nit, &= or combine with previous set.
> > >   
> > >> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > >> +        addr_hi = pci_default_read_config(pdev,
> > >> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> > >> +    }
> > >> +
> > >> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;  
> > > 
> > > Could we use a union?
> > >   
> > >> +
> > >> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> > >> +        return -EINVAL;
> > >> +    }  
> > > 
> > > What specifically are we validating here?  This should be true no
> > > matter what we wrote to the BAR or else BAR emulation is broken.  The
> > > bits that could make this unaligned are not implemented in the BAR.
> > >   
> > >> +
> > >> +    return 0;
> > >> +}
> > >> +
> > >> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> > >> +{
> > >> +    int i, ret;
> > >> +
> > >> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > >> +        ret = vfio_bar_validate(vdev, i);
> > >> +        if (ret) {
> > >> +            error_report("vfio: BAR address %d validation failed", i);
> > >> +            return ret;
> > >> +        }
> > >> +    }
> > >> +    return 0;
> > >> +}
> > >> +
> > >>   static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> > >>   {
> > >>       VFIOBAR *bar = &vdev->bars[nr];
> > >> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
> > >>       return OBJECT(vdev);
> > >>   }
> > >>   
> > >> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> > >> +{
> > >> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> > >> +    PCIDevice *pdev = &vdev->pdev;
> > >> +    uint16_t pci_cmd;
> > >> +    int i;
> > >> +
> > >> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > >> +        uint32_t bar;
> > >> +
> > >> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> > >> +        qemu_put_be32(f, bar);
> > >> +    }
> > >> +
> > >> +    qemu_put_be32(f, vdev->interrupt);
> > >> +    if (vdev->interrupt == VFIO_INT_MSI) {
> > >> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> > >> +        bool msi_64bit;
> > >> +
> > >> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > >> +                                            2);
> > >> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> > >> +
> > >> +        msi_addr_lo = pci_default_read_config(pdev,
> > >> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> > >> +        qemu_put_be32(f, msi_addr_lo);
> > >> +
> > >> +        if (msi_64bit) {
> > >> +            msi_addr_hi = pci_default_read_config(pdev,
> > >> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > >> +                                             4);
> > >> +        }
> > >> +        qemu_put_be32(f, msi_addr_hi);
> > >> +
> > >> +        msi_data = pci_default_read_config(pdev,
> > >> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> > >> +                2);
> > >> +        qemu_put_be32(f, msi_data);  
> > > 
> > > Isn't the data field only a u16?
> > >   
> > 
> > Yes, fixing it.
> > 
> > >> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> > >> +        uint16_t offset;
> > >> +
> > >> +        /* save enable bit and maskall bit */
> > >> +        offset = pci_default_read_config(pdev,
> > >> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> > >> +        qemu_put_be16(f, offset);
> > >> +        msix_save(pdev, f);
> > >> +    }
> > >> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> > >> +    qemu_put_be16(f, pci_cmd);
> > >> +}
> > >> +
> > >> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> > >> +{
> > >> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> > >> +    PCIDevice *pdev = &vdev->pdev;
> > >> +    uint32_t interrupt_type;
> > >> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> > >> +    uint16_t pci_cmd;
> > >> +    bool msi_64bit;
> > >> +    int i, ret;
> > >> +
> > >> +    /* retore pci bar configuration */
> > >> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> > >> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> > >> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> > >> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> > >> +        uint32_t bar = qemu_get_be32(f);
> > >> +
> > >> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> > >> +    }
> > >> +
> > >> +    ret = vfio_bars_validate(vdev);
> > >> +    if (ret) {
> > >> +        return ret;
> > >> +    }
> > >> +
> > >> +    interrupt_type = qemu_get_be32(f);
> > >> +
> > >> +    if (interrupt_type == VFIO_INT_MSI) {
> > >> +        /* restore msi configuration */
> > >> +        msi_flags = pci_default_read_config(pdev,
> > >> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> > >> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> > >> +
> > >> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > >> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> > >> +
> > >> +        msi_addr_lo = qemu_get_be32(f);
> > >> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> > >> +                              msi_addr_lo, 4);
> > >> +
> > >> +        msi_addr_hi = qemu_get_be32(f);
> > >> +        if (msi_64bit) {
> > >> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> > >> +                                  msi_addr_hi, 4);
> > >> +        }
> > >> +        msi_data = qemu_get_be32(f);
> > >> +        vfio_pci_write_config(pdev,
> > >> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> > >> +                msi_data, 2);
> > >> +
> > >> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> > >> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> > >> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> > >> +        uint16_t offset = qemu_get_be16(f);
> > >> +
> > >> +        /* load enable bit and maskall bit */
> > >> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> > >> +                              offset, 2);
> > >> +        msix_load(pdev, f);
> > >> +    }
> > >> +    pci_cmd = qemu_get_be16(f);
> > >> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> > >> +    return 0;
> > >> +}  
> > > 
> > > It always seems like there should be a lot more state than this, and I
> > > probably sound like a broken record because I ask every time, but maybe
> > > that's a good indication that we (or at least I) need a comment
> > > explaining why we only care about these.  For example, what if we
> > > migrate a device in the D3 power state, don't we need to account for
> > > the state stored in the PM capability or does the device wake up into
> > > D0 auto-magically after migration?  I think we could repeat that
> > > question for every capability that can be modified.  Even for the MSI/X
> > > cases, the interrupt may not be active, but there could be state in
> > > virtual config space that would be different on the target.  For
> > > example, if we migrate with a device in INTx mode where the guest had
> > > written vector fields on the source, but only writes the enable bit on
> > > the target, can we seamlessly figure out the rest?  For other
> > > capabilities, that state may represent config space changes written
> > > through to the physical device and represent a functional difference on
> > > the target.  Thanks,
> > >  
> > 
> > These are very basic set of registers from config state. Other are more 
> > of vendor specific which vendor driver can save and restore in their own 
> > data. I don't think we have to take care of all those vendor specific 
> > fields here.
> 
> That had not been clear to me.  Intel folks, is this your understanding
> regarding the responsibility of the user to save and restore config
> space of the device as part of the vendor provided migration stream
> data?  Thanks,
> 
Currently, the code works for us, but I agree with you that there should
be more state to save, at least for emulated config bits.
I think we should call pci_device_save() to serve that purpose.

Thanks
Yan
Kirti Wankhede May 6, 2020, 7:48 p.m. UTC | #11
On 5/6/2020 11:41 AM, Yan Zhao wrote:
> On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:
>> On Tue, 5 May 2020 04:48:37 +0530
>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>
>>> On 3/26/2020 1:26 AM, Alex Williamson wrote:
>>>> On Wed, 25 Mar 2020 02:39:02 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>    
>>>>> These functions save and restore PCI device specific data - config
>>>>> space of PCI device.
>>>>> Tested save and restore with MSI and MSIX type.
>>>>>
>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>>> ---
>>>>>    hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>>>>>    include/hw/vfio/vfio-common.h |   2 +
>>>>>    2 files changed, 165 insertions(+)
>>>>>
>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>> index 6c77c12e44b9..8deb11e87ef7 100644
>>>>> --- a/hw/vfio/pci.c
>>>>> +++ b/hw/vfio/pci.c
>>>>> @@ -41,6 +41,7 @@
>>>>>    #include "trace.h"
>>>>>    #include "qapi/error.h"
>>>>>    #include "migration/blocker.h"
>>>>> +#include "migration/qemu-file.h"
>>>>>    
>>>>>    #define TYPE_VFIO_PCI "vfio-pci"
>>>>>    #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
>>>>> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>>>>>        }
>>>>>    }
>>>>>    
>>>>> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
>>>>> +{
>>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>>> +    VFIOBAR *bar = &vdev->bars[nr];
>>>>> +    uint64_t addr;
>>>>> +    uint32_t addr_lo, addr_hi = 0;
>>>>> +
>>>>> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
>>>>> +    if (!bar->size) {
>>>>> +        return 0;
>>>>> +    }
>>>>> +
>>>>> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
>>>>> +
>>>>> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
>>>>> +                                       PCI_BASE_ADDRESS_MEM_MASK);
>>>>
>>>> Nit, &= or combine with previous set.
>>>>    
>>>>> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
>>>>> +        addr_hi = pci_default_read_config(pdev,
>>>>> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
>>>>> +    }
>>>>> +
>>>>> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
>>>>
>>>> Could we use a union?
>>>>    
>>>>> +
>>>>> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
>>>>> +        return -EINVAL;
>>>>> +    }
>>>>
>>>> What specifically are we validating here?  This should be true no
>>>> matter what we wrote to the BAR or else BAR emulation is broken.  The
>>>> bits that could make this unaligned are not implemented in the BAR.
>>>>    
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
>>>>> +{
>>>>> +    int i, ret;
>>>>> +
>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>>>>> +        ret = vfio_bar_validate(vdev, i);
>>>>> +        if (ret) {
>>>>> +            error_report("vfio: BAR address %d validation failed", i);
>>>>> +            return ret;
>>>>> +        }
>>>>> +    }
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>>    static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>>>>>    {
>>>>>        VFIOBAR *bar = &vdev->bars[nr];
>>>>> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>>>>>        return OBJECT(vdev);
>>>>>    }
>>>>>    
>>>>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
>>>>> +{
>>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>>> +    uint16_t pci_cmd;
>>>>> +    int i;
>>>>> +
>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>>>>> +        uint32_t bar;
>>>>> +
>>>>> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
>>>>> +        qemu_put_be32(f, bar);
>>>>> +    }
>>>>> +
>>>>> +    qemu_put_be32(f, vdev->interrupt);
>>>>> +    if (vdev->interrupt == VFIO_INT_MSI) {
>>>>> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>>>>> +        bool msi_64bit;
>>>>> +
>>>>> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>>>>> +                                            2);
>>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>>>>> +
>>>>> +        msi_addr_lo = pci_default_read_config(pdev,
>>>>> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
>>>>> +        qemu_put_be32(f, msi_addr_lo);
>>>>> +
>>>>> +        if (msi_64bit) {
>>>>> +            msi_addr_hi = pci_default_read_config(pdev,
>>>>> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>>>>> +                                             4);
>>>>> +        }
>>>>> +        qemu_put_be32(f, msi_addr_hi);
>>>>> +
>>>>> +        msi_data = pci_default_read_config(pdev,
>>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>>>>> +                2);
>>>>> +        qemu_put_be32(f, msi_data);
>>>>
>>>> Isn't the data field only a u16?
>>>>    
>>>
>>> Yes, fixing it.
>>>
>>>>> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
>>>>> +        uint16_t offset;
>>>>> +
>>>>> +        /* save enable bit and maskall bit */
>>>>> +        offset = pci_default_read_config(pdev,
>>>>> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
>>>>> +        qemu_put_be16(f, offset);
>>>>> +        msix_save(pdev, f);
>>>>> +    }
>>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>>>>> +    qemu_put_be16(f, pci_cmd);
>>>>> +}
>>>>> +
>>>>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>>>>> +{
>>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>>> +    uint32_t interrupt_type;
>>>>> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>>>>> +    uint16_t pci_cmd;
>>>>> +    bool msi_64bit;
>>>>> +    int i, ret;
>>>>> +
>>>>> +    /* retore pci bar configuration */
>>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND,
>>>>> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>>>>> +        uint32_t bar = qemu_get_be32(f);
>>>>> +
>>>>> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
>>>>> +    }
>>>>> +
>>>>> +    ret = vfio_bars_validate(vdev);
>>>>> +    if (ret) {
>>>>> +        return ret;
>>>>> +    }
>>>>> +
>>>>> +    interrupt_type = qemu_get_be32(f);
>>>>> +
>>>>> +    if (interrupt_type == VFIO_INT_MSI) {
>>>>> +        /* restore msi configuration */
>>>>> +        msi_flags = pci_default_read_config(pdev,
>>>>> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
>>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>>>>> +
>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>>>>> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
>>>>> +
>>>>> +        msi_addr_lo = qemu_get_be32(f);
>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
>>>>> +                              msi_addr_lo, 4);
>>>>> +
>>>>> +        msi_addr_hi = qemu_get_be32(f);
>>>>> +        if (msi_64bit) {
>>>>> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>>>>> +                                  msi_addr_hi, 4);
>>>>> +        }
>>>>> +        msi_data = qemu_get_be32(f);
>>>>> +        vfio_pci_write_config(pdev,
>>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>>>>> +                msi_data, 2);
>>>>> +
>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>>>>> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
>>>>> +    } else if (interrupt_type == VFIO_INT_MSIX) {
>>>>> +        uint16_t offset = qemu_get_be16(f);
>>>>> +
>>>>> +        /* load enable bit and maskall bit */
>>>>> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
>>>>> +                              offset, 2);
>>>>> +        msix_load(pdev, f);
>>>>> +    }
>>>>> +    pci_cmd = qemu_get_be16(f);
>>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
>>>>> +    return 0;
>>>>> +}
>>>>
>>>> It always seems like there should be a lot more state than this, and I
>>>> probably sound like a broken record because I ask every time, but maybe
>>>> that's a good indication that we (or at least I) need a comment
>>>> explaining why we only care about these.  For example, what if we
>>>> migrate a device in the D3 power state, don't we need to account for
>>>> the state stored in the PM capability or does the device wake up into
>>>> D0 auto-magically after migration?  I think we could repeat that
>>>> question for every capability that can be modified.  Even for the MSI/X
>>>> cases, the interrupt may not be active, but there could be state in
>>>> virtual config space that would be different on the target.  For
>>>> example, if we migrate with a device in INTx mode where the guest had
>>>> written vector fields on the source, but only writes the enable bit on
>>>> the target, can we seamlessly figure out the rest?  For other
>>>> capabilities, that state may represent config space changes written
>>>> through to the physical device and represent a functional difference on
>>>> the target.  Thanks,
>>>>   
>>>
>>> These are very basic set of registers from config state. Other are more
>>> of vendor specific which vendor driver can save and restore in their own
>>> data. I don't think we have to take care of all those vendor specific
>>> fields here.
>>
>> That had not been clear to me.  Intel folks, is this your understanding
>> regarding the responsibility of the user to save and restore config
>> space of the device as part of the vendor provided migration stream
>> data?  Thanks,
>>
> Currently, the code works for us. but I agree with you that there should
> be more states to save, at least for emulated config bits.
> I think we should call pci_device_save() to serve that purpose.
> 

If the vendor driver can restore all vendor-specific config space, then
adding it again in QEMU might be redundant. As an example, I had mailed
the mtty sample code, in which config space has vendor-specific information
and that is restored in an easy way.

Thanks,
Kirti
Alex Williamson May 6, 2020, 8:03 p.m. UTC | #12
On Thu, 7 May 2020 01:18:19 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 5/6/2020 11:41 AM, Yan Zhao wrote:
> > On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:  
> >> On Tue, 5 May 2020 04:48:37 +0530
> >> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>  
> >>> On 3/26/2020 1:26 AM, Alex Williamson wrote:  
> >>>> On Wed, 25 Mar 2020 02:39:02 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>      
> >>>>> These functions save and restore PCI device specific data - config
> >>>>> space of PCI device.
> >>>>> Tested save and restore with MSI and MSIX type.
> >>>>>
> >>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>>> ---
> >>>>>    hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
> >>>>>    include/hw/vfio/vfio-common.h |   2 +
> >>>>>    2 files changed, 165 insertions(+)
> >>>>>
> >>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >>>>> index 6c77c12e44b9..8deb11e87ef7 100644
> >>>>> --- a/hw/vfio/pci.c
> >>>>> +++ b/hw/vfio/pci.c
> >>>>> @@ -41,6 +41,7 @@
> >>>>>    #include "trace.h"
> >>>>>    #include "qapi/error.h"
> >>>>>    #include "migration/blocker.h"
> >>>>> +#include "migration/qemu-file.h"
> >>>>>    
> >>>>>    #define TYPE_VFIO_PCI "vfio-pci"
> >>>>>    #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> >>>>> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
> >>>>>        }
> >>>>>    }
> >>>>>    
> >>>>> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> >>>>> +{
> >>>>> +    PCIDevice *pdev = &vdev->pdev;
> >>>>> +    VFIOBAR *bar = &vdev->bars[nr];
> >>>>> +    uint64_t addr;
> >>>>> +    uint32_t addr_lo, addr_hi = 0;
> >>>>> +
> >>>>> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> >>>>> +    if (!bar->size) {
> >>>>> +        return 0;
> >>>>> +    }
> >>>>> +
> >>>>> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> >>>>> +
> >>>>> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> >>>>> +                                       PCI_BASE_ADDRESS_MEM_MASK);  
> >>>>
> >>>> Nit, &= or combine with previous set.
> >>>>      
> >>>>> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> >>>>> +        addr_hi = pci_default_read_config(pdev,
> >>>>> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> >>>>> +    }
> >>>>> +
> >>>>> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;  
> >>>>
> >>>> Could we use a union?
> >>>>      
> >>>>> +
> >>>>> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> >>>>> +        return -EINVAL;
> >>>>> +    }  
> >>>>
> >>>> What specifically are we validating here?  This should be true no
> >>>> matter what we wrote to the BAR or else BAR emulation is broken.  The
> >>>> bits that could make this unaligned are not implemented in the BAR.
> >>>>      
> >>>>> +
> >>>>> +    return 0;
> >>>>> +}
> >>>>> +
> >>>>> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> >>>>> +{
> >>>>> +    int i, ret;
> >>>>> +
> >>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >>>>> +        ret = vfio_bar_validate(vdev, i);
> >>>>> +        if (ret) {
> >>>>> +            error_report("vfio: BAR address %d validation failed", i);
> >>>>> +            return ret;
> >>>>> +        }
> >>>>> +    }
> >>>>> +    return 0;
> >>>>> +}
> >>>>> +
> >>>>>    static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> >>>>>    {
> >>>>>        VFIOBAR *bar = &vdev->bars[nr];
> >>>>> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
> >>>>>        return OBJECT(vdev);
> >>>>>    }
> >>>>>    
> >>>>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> >>>>> +{
> >>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> >>>>> +    PCIDevice *pdev = &vdev->pdev;
> >>>>> +    uint16_t pci_cmd;
> >>>>> +    int i;
> >>>>> +
> >>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >>>>> +        uint32_t bar;
> >>>>> +
> >>>>> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> >>>>> +        qemu_put_be32(f, bar);
> >>>>> +    }
> >>>>> +
> >>>>> +    qemu_put_be32(f, vdev->interrupt);
> >>>>> +    if (vdev->interrupt == VFIO_INT_MSI) {
> >>>>> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> >>>>> +        bool msi_64bit;
> >>>>> +
> >>>>> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >>>>> +                                            2);
> >>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> >>>>> +
> >>>>> +        msi_addr_lo = pci_default_read_config(pdev,
> >>>>> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> >>>>> +        qemu_put_be32(f, msi_addr_lo);
> >>>>> +
> >>>>> +        if (msi_64bit) {
> >>>>> +            msi_addr_hi = pci_default_read_config(pdev,
> >>>>> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> >>>>> +                                             4);
> >>>>> +        }
> >>>>> +        qemu_put_be32(f, msi_addr_hi);
> >>>>> +
> >>>>> +        msi_data = pci_default_read_config(pdev,
> >>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> >>>>> +                2);
> >>>>> +        qemu_put_be32(f, msi_data);  
> >>>>
> >>>> Isn't the data field only a u16?
> >>>>      
> >>>
> >>> Yes, fixing it.
> >>>  
> >>>>> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> >>>>> +        uint16_t offset;
> >>>>> +
> >>>>> +        /* save enable bit and maskall bit */
> >>>>> +        offset = pci_default_read_config(pdev,
> >>>>> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> >>>>> +        qemu_put_be16(f, offset);
> >>>>> +        msix_save(pdev, f);
> >>>>> +    }
> >>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> >>>>> +    qemu_put_be16(f, pci_cmd);
> >>>>> +}
> >>>>> +
> >>>>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> >>>>> +{
> >>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> >>>>> +    PCIDevice *pdev = &vdev->pdev;
> >>>>> +    uint32_t interrupt_type;
> >>>>> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> >>>>> +    uint16_t pci_cmd;
> >>>>> +    bool msi_64bit;
> >>>>> +    int i, ret;
> >>>>> +
> >>>>> +    /* retore pci bar configuration */
> >>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> >>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> >>>>> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> >>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >>>>> +        uint32_t bar = qemu_get_be32(f);
> >>>>> +
> >>>>> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> >>>>> +    }
> >>>>> +
> >>>>> +    ret = vfio_bars_validate(vdev);
> >>>>> +    if (ret) {
> >>>>> +        return ret;
> >>>>> +    }
> >>>>> +
> >>>>> +    interrupt_type = qemu_get_be32(f);
> >>>>> +
> >>>>> +    if (interrupt_type == VFIO_INT_MSI) {
> >>>>> +        /* restore msi configuration */
> >>>>> +        msi_flags = pci_default_read_config(pdev,
> >>>>> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> >>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> >>>>> +
> >>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >>>>> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> >>>>> +
> >>>>> +        msi_addr_lo = qemu_get_be32(f);
> >>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> >>>>> +                              msi_addr_lo, 4);
> >>>>> +
> >>>>> +        msi_addr_hi = qemu_get_be32(f);
> >>>>> +        if (msi_64bit) {
> >>>>> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> >>>>> +                                  msi_addr_hi, 4);
> >>>>> +        }
> >>>>> +        msi_data = qemu_get_be32(f);
> >>>>> +        vfio_pci_write_config(pdev,
> >>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> >>>>> +                msi_data, 2);
> >>>>> +
> >>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >>>>> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> >>>>> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> >>>>> +        uint16_t offset = qemu_get_be16(f);
> >>>>> +
> >>>>> +        /* load enable bit and maskall bit */
> >>>>> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> >>>>> +                              offset, 2);
> >>>>> +        msix_load(pdev, f);
> >>>>> +    }
> >>>>> +    pci_cmd = qemu_get_be16(f);
> >>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> >>>>> +    return 0;
> >>>>> +}  
> >>>>
> >>>> It always seems like there should be a lot more state than this, and I
> >>>> probably sound like a broken record because I ask every time, but maybe
> >>>> that's a good indication that we (or at least I) need a comment
> >>>> explaining why we only care about these.  For example, what if we
> >>>> migrate a device in the D3 power state, don't we need to account for
> >>>> the state stored in the PM capability or does the device wake up into
> >>>> D0 auto-magically after migration?  I think we could repeat that
> >>>> question for every capability that can be modified.  Even for the MSI/X
> >>>> cases, the interrupt may not be active, but there could be state in
> >>>> virtual config space that would be different on the target.  For
> >>>> example, if we migrate with a device in INTx mode where the guest had
> >>>> written vector fields on the source, but only writes the enable bit on
> >>>> the target, can we seamlessly figure out the rest?  For other
> >>>> capabilities, that state may represent config space changes written
> >>>> through to the physical device and represent a functional difference on
> >>>> the target.  Thanks,
> >>>>     
> >>>
> >>> These are very basic set of registers from config state. Other are more
> >>> of vendor specific which vendor driver can save and restore in their own
> >>> data. I don't think we have to take care of all those vendor specific
> >>> fields here.  
> >>
> >> That had not been clear to me.  Intel folks, is this your understanding
> >> regarding the responsibility of the user to save and restore config
> >> space of the device as part of the vendor provided migration stream
> >> data?  Thanks,
> >>  
> > Currently, the code works for us. but I agree with you that there should
> > be more states to save, at least for emulated config bits.
> > I think we should call pci_device_save() to serve that purpose.
> >   
> 
> If vendor driver can restore all vendor specific config space, then 
> adding it again in QEMU might be redundant. As an example, I had mailed 
> mtty sample code, in which config space has vendor specific information 
> and that is restored in easy way.

The redundancy is implementing it in each vendor driver.  Thanks,

Alex
Kirti Wankhede May 7, 2020, 5:40 a.m. UTC | #13
On 5/7/2020 1:33 AM, Alex Williamson wrote:
> On Thu, 7 May 2020 01:18:19 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 5/6/2020 11:41 AM, Yan Zhao wrote:
>>> On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:
>>>> On Tue, 5 May 2020 04:48:37 +0530
>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>   
>>>>> On 3/26/2020 1:26 AM, Alex Williamson wrote:
>>>>>> On Wed, 25 Mar 2020 02:39:02 +0530
>>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>>       
>>>>>>> These functions save and restore PCI device specific data - config
>>>>>>> space of PCI device.
>>>>>>> Tested save and restore with MSI and MSIX type.
>>>>>>>
>>>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>>>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>>>>>>> ---
>>>>>>>     hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
>>>>>>>     include/hw/vfio/vfio-common.h |   2 +
>>>>>>>     2 files changed, 165 insertions(+)
>>>>>>>
>>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>>> index 6c77c12e44b9..8deb11e87ef7 100644
>>>>>>> --- a/hw/vfio/pci.c
>>>>>>> +++ b/hw/vfio/pci.c
>>>>>>> @@ -41,6 +41,7 @@
>>>>>>>     #include "trace.h"
>>>>>>>     #include "qapi/error.h"
>>>>>>>     #include "migration/blocker.h"
>>>>>>> +#include "migration/qemu-file.h"
>>>>>>>     
>>>>>>>     #define TYPE_VFIO_PCI "vfio-pci"
>>>>>>>     #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
>>>>>>> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
>>>>>>>         }
>>>>>>>     }
>>>>>>>     
>>>>>>> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
>>>>>>> +{
>>>>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>>>>> +    VFIOBAR *bar = &vdev->bars[nr];
>>>>>>> +    uint64_t addr;
>>>>>>> +    uint32_t addr_lo, addr_hi = 0;
>>>>>>> +
>>>>>>> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
>>>>>>> +    if (!bar->size) {
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
>>>>>>> +
>>>>>>> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
>>>>>>> +                                       PCI_BASE_ADDRESS_MEM_MASK);
>>>>>>
>>>>>> Nit, &= or combine with previous set.
>>>>>>       
>>>>>>> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
>>>>>>> +        addr_hi = pci_default_read_config(pdev,
>>>>>>> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;
>>>>>>
>>>>>> Could we use a union?
>>>>>>       
>>>>>>> +
>>>>>>> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
>>>>>>> +        return -EINVAL;
>>>>>>> +    }
>>>>>>
>>>>>> What specifically are we validating here?  This should be true no
>>>>>> matter what we wrote to the BAR or else BAR emulation is broken.  The
>>>>>> bits that could make this unaligned are not implemented in the BAR.
>>>>>>       
>>>>>>> +
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
>>>>>>> +{
>>>>>>> +    int i, ret;
>>>>>>> +
>>>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>>>>>>> +        ret = vfio_bar_validate(vdev, i);
>>>>>>> +        if (ret) {
>>>>>>> +            error_report("vfio: BAR address %d validation failed", i);
>>>>>>> +            return ret;
>>>>>>> +        }
>>>>>>> +    }
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>>     static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
>>>>>>>     {
>>>>>>>         VFIOBAR *bar = &vdev->bars[nr];
>>>>>>> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>>>>>>>         return OBJECT(vdev);
>>>>>>>     }
>>>>>>>     
>>>>>>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
>>>>>>> +{
>>>>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>>>>> +    uint16_t pci_cmd;
>>>>>>> +    int i;
>>>>>>> +
>>>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>>>>>>> +        uint32_t bar;
>>>>>>> +
>>>>>>> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
>>>>>>> +        qemu_put_be32(f, bar);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    qemu_put_be32(f, vdev->interrupt);
>>>>>>> +    if (vdev->interrupt == VFIO_INT_MSI) {
>>>>>>> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>>>>>>> +        bool msi_64bit;
>>>>>>> +
>>>>>>> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>>>>>>> +                                            2);
>>>>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>>>>>>> +
>>>>>>> +        msi_addr_lo = pci_default_read_config(pdev,
>>>>>>> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
>>>>>>> +        qemu_put_be32(f, msi_addr_lo);
>>>>>>> +
>>>>>>> +        if (msi_64bit) {
>>>>>>> +            msi_addr_hi = pci_default_read_config(pdev,
>>>>>>> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>>>>>>> +                                             4);
>>>>>>> +        }
>>>>>>> +        qemu_put_be32(f, msi_addr_hi);
>>>>>>> +
>>>>>>> +        msi_data = pci_default_read_config(pdev,
>>>>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>>>>>>> +                2);
>>>>>>> +        qemu_put_be32(f, msi_data);
>>>>>>
>>>>>> Isn't the data field only a u16?
>>>>>>       
>>>>>
>>>>> Yes, fixing it.
>>>>>   
>>>>>>> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
>>>>>>> +        uint16_t offset;
>>>>>>> +
>>>>>>> +        /* save enable bit and maskall bit */
>>>>>>> +        offset = pci_default_read_config(pdev,
>>>>>>> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
>>>>>>> +        qemu_put_be16(f, offset);
>>>>>>> +        msix_save(pdev, f);
>>>>>>> +    }
>>>>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>>>>>>> +    qemu_put_be16(f, pci_cmd);
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
>>>>>>> +{
>>>>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
>>>>>>> +    PCIDevice *pdev = &vdev->pdev;
>>>>>>> +    uint32_t interrupt_type;
>>>>>>> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
>>>>>>> +    uint16_t pci_cmd;
>>>>>>> +    bool msi_64bit;
>>>>>>> +    int i, ret;
>>>>>>> +
>>>>>>> +    /* retore pci bar configuration */
>>>>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
>>>>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND,
>>>>>>> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
>>>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
>>>>>>> +        uint32_t bar = qemu_get_be32(f);
>>>>>>> +
>>>>>>> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    ret = vfio_bars_validate(vdev);
>>>>>>> +    if (ret) {
>>>>>>> +        return ret;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    interrupt_type = qemu_get_be32(f);
>>>>>>> +
>>>>>>> +    if (interrupt_type == VFIO_INT_MSI) {
>>>>>>> +        /* restore msi configuration */
>>>>>>> +        msi_flags = pci_default_read_config(pdev,
>>>>>>> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
>>>>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
>>>>>>> +
>>>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>>>>>>> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
>>>>>>> +
>>>>>>> +        msi_addr_lo = qemu_get_be32(f);
>>>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
>>>>>>> +                              msi_addr_lo, 4);
>>>>>>> +
>>>>>>> +        msi_addr_hi = qemu_get_be32(f);
>>>>>>> +        if (msi_64bit) {
>>>>>>> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
>>>>>>> +                                  msi_addr_hi, 4);
>>>>>>> +        }
>>>>>>> +        msi_data = qemu_get_be32(f);
>>>>>>> +        vfio_pci_write_config(pdev,
>>>>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
>>>>>>> +                msi_data, 2);
>>>>>>> +
>>>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
>>>>>>> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
>>>>>>> +    } else if (interrupt_type == VFIO_INT_MSIX) {
>>>>>>> +        uint16_t offset = qemu_get_be16(f);
>>>>>>> +
>>>>>>> +        /* load enable bit and maskall bit */
>>>>>>> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
>>>>>>> +                              offset, 2);
>>>>>>> +        msix_load(pdev, f);
>>>>>>> +    }
>>>>>>> +    pci_cmd = qemu_get_be16(f);
>>>>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
>>>>>>> +    return 0;
>>>>>>> +}
>>>>>>
>>>>>> It always seems like there should be a lot more state than this, and I
>>>>>> probably sound like a broken record because I ask every time, but maybe
>>>>>> that's a good indication that we (or at least I) need a comment
>>>>>> explaining why we only care about these.  For example, what if we
>>>>>> migrate a device in the D3 power state, don't we need to account for
>>>>>> the state stored in the PM capability or does the device wake up into
>>>>>> D0 auto-magically after migration?  I think we could repeat that
>>>>>> question for every capability that can be modified.  Even for the MSI/X
>>>>>> cases, the interrupt may not be active, but there could be state in
>>>>>> virtual config space that would be different on the target.  For
>>>>>> example, if we migrate with a device in INTx mode where the guest had
>>>>>> written vector fields on the source, but only writes the enable bit on
>>>>>> the target, can we seamlessly figure out the rest?  For other
>>>>>> capabilities, that state may represent config space changes written
>>>>>> through to the physical device and represent a functional difference on
>>>>>> the target.  Thanks,
>>>>>>      
>>>>>
>>>>> These are very basic set of registers from config state. Other are more
>>>>> of vendor specific which vendor driver can save and restore in their own
>>>>> data. I don't think we have to take care of all those vendor specific
>>>>> fields here.
>>>>
>>>> That had not been clear to me.  Intel folks, is this your understanding
>>>> regarding the responsibility of the user to save and restore config
>>>> space of the device as part of the vendor provided migration stream
>>>> data?  Thanks,
>>>>   
>>> Currently, the code works for us. but I agree with you that there should
>>> be more states to save, at least for emulated config bits.
>>> I think we should call pci_device_save() to serve that purpose.
>>>    
>>
>> If vendor driver can restore all vendor specific config space, then
>> adding it again in QEMU might be redundant. As an example, I had mailed
>> mtty sample code, in which config space has vendor specific information
>> and that is restored in easy way.
> 
> The redundancy is implementing it in each vendor driver.  Thanks,
> 

Vendor driver knows better about vendor specific configs, isn't it 
better vendor driver handle those at their end rather than adding vendor 
specific quirks in QEMU?

Thanks,
Kirti
Alex Williamson May 7, 2020, 6:14 p.m. UTC | #14
On Thu, 7 May 2020 11:10:40 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 5/7/2020 1:33 AM, Alex Williamson wrote:
> > On Thu, 7 May 2020 01:18:19 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 5/6/2020 11:41 AM, Yan Zhao wrote:  
> >>> On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:  
> >>>> On Tue, 5 May 2020 04:48:37 +0530
> >>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>     
> >>>>> On 3/26/2020 1:26 AM, Alex Williamson wrote:  
> >>>>>> On Wed, 25 Mar 2020 02:39:02 +0530
> >>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>>         
> >>>>>>> These functions save and restore PCI device specific data - config
> >>>>>>> space of PCI device.
> >>>>>>> Tested save and restore with MSI and MSIX type.
> >>>>>>>
> >>>>>>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >>>>>>> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >>>>>>> ---
> >>>>>>>     hw/vfio/pci.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++
> >>>>>>>     include/hw/vfio/vfio-common.h |   2 +
> >>>>>>>     2 files changed, 165 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> >>>>>>> index 6c77c12e44b9..8deb11e87ef7 100644
> >>>>>>> --- a/hw/vfio/pci.c
> >>>>>>> +++ b/hw/vfio/pci.c
> >>>>>>> @@ -41,6 +41,7 @@
> >>>>>>>     #include "trace.h"
> >>>>>>>     #include "qapi/error.h"
> >>>>>>>     #include "migration/blocker.h"
> >>>>>>> +#include "migration/qemu-file.h"
> >>>>>>>     
> >>>>>>>     #define TYPE_VFIO_PCI "vfio-pci"
> >>>>>>>     #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> >>>>>>> @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
> >>>>>>>         }
> >>>>>>>     }
> >>>>>>>     
> >>>>>>> +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> >>>>>>> +{
> >>>>>>> +    PCIDevice *pdev = &vdev->pdev;
> >>>>>>> +    VFIOBAR *bar = &vdev->bars[nr];
> >>>>>>> +    uint64_t addr;
> >>>>>>> +    uint32_t addr_lo, addr_hi = 0;
> >>>>>>> +
> >>>>>>> +    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
> >>>>>>> +    if (!bar->size) {
> >>>>>>> +        return 0;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
> >>>>>>> +
> >>>>>>> +    addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> >>>>>>> +                                       PCI_BASE_ADDRESS_MEM_MASK);  
> >>>>>>
> >>>>>> Nit, &= or combine with previous set.
> >>>>>>         
> >>>>>>> +    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> >>>>>>> +        addr_hi = pci_default_read_config(pdev,
> >>>>>>> +                                         PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    addr = ((uint64_t)addr_hi << 32) | addr_lo;  
> >>>>>>
> >>>>>> Could we use a union?
> >>>>>>         
> >>>>>>> +
> >>>>>>> +    if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> >>>>>>> +        return -EINVAL;
> >>>>>>> +    }  
> >>>>>>
> >>>>>> What specifically are we validating here?  This should be true no
> >>>>>> matter what we wrote to the BAR or else BAR emulation is broken.  The
> >>>>>> bits that could make this unaligned are not implemented in the BAR.
> >>>>>>         
> >>>>>>> +
> >>>>>>> +    return 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> >>>>>>> +{
> >>>>>>> +    int i, ret;
> >>>>>>> +
> >>>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >>>>>>> +        ret = vfio_bar_validate(vdev, i);
> >>>>>>> +        if (ret) {
> >>>>>>> +            error_report("vfio: BAR address %d validation failed", i);
> >>>>>>> +            return ret;
> >>>>>>> +        }
> >>>>>>> +    }
> >>>>>>> +    return 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>>     static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> >>>>>>>     {
> >>>>>>>         VFIOBAR *bar = &vdev->bars[nr];
> >>>>>>> @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
> >>>>>>>         return OBJECT(vdev);
> >>>>>>>     }
> >>>>>>>     
> >>>>>>> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> >>>>>>> +{
> >>>>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> >>>>>>> +    PCIDevice *pdev = &vdev->pdev;
> >>>>>>> +    uint16_t pci_cmd;
> >>>>>>> +    int i;
> >>>>>>> +
> >>>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >>>>>>> +        uint32_t bar;
> >>>>>>> +
> >>>>>>> +        bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> >>>>>>> +        qemu_put_be32(f, bar);
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    qemu_put_be32(f, vdev->interrupt);
> >>>>>>> +    if (vdev->interrupt == VFIO_INT_MSI) {
> >>>>>>> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> >>>>>>> +        bool msi_64bit;
> >>>>>>> +
> >>>>>>> +        msi_flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >>>>>>> +                                            2);
> >>>>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> >>>>>>> +
> >>>>>>> +        msi_addr_lo = pci_default_read_config(pdev,
> >>>>>>> +                                         pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> >>>>>>> +        qemu_put_be32(f, msi_addr_lo);
> >>>>>>> +
> >>>>>>> +        if (msi_64bit) {
> >>>>>>> +            msi_addr_hi = pci_default_read_config(pdev,
> >>>>>>> +                                             pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> >>>>>>> +                                             4);
> >>>>>>> +        }
> >>>>>>> +        qemu_put_be32(f, msi_addr_hi);
> >>>>>>> +
> >>>>>>> +        msi_data = pci_default_read_config(pdev,
> >>>>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> >>>>>>> +                2);
> >>>>>>> +        qemu_put_be32(f, msi_data);  
> >>>>>>
> >>>>>> Isn't the data field only a u16?
> >>>>>>         
> >>>>>
> >>>>> Yes, fixing it.
> >>>>>     
> >>>>>>> +    } else if (vdev->interrupt == VFIO_INT_MSIX) {
> >>>>>>> +        uint16_t offset;
> >>>>>>> +
> >>>>>>> +        /* save enable bit and maskall bit */
> >>>>>>> +        offset = pci_default_read_config(pdev,
> >>>>>>> +                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> >>>>>>> +        qemu_put_be16(f, offset);
> >>>>>>> +        msix_save(pdev, f);
> >>>>>>> +    }
> >>>>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> >>>>>>> +    qemu_put_be16(f, pci_cmd);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> >>>>>>> +{
> >>>>>>> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> >>>>>>> +    PCIDevice *pdev = &vdev->pdev;
> >>>>>>> +    uint32_t interrupt_type;
> >>>>>>> +    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> >>>>>>> +    uint16_t pci_cmd;
> >>>>>>> +    bool msi_64bit;
> >>>>>>> +    int i, ret;
> >>>>>>> +
> >>>>>>> +    /* retore pci bar configuration */
> >>>>>>> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> >>>>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> >>>>>>> +                        pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);
> >>>>>>> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> >>>>>>> +        uint32_t bar = qemu_get_be32(f);
> >>>>>>> +
> >>>>>>> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    ret = vfio_bars_validate(vdev);
> >>>>>>> +    if (ret) {
> >>>>>>> +        return ret;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    interrupt_type = qemu_get_be32(f);
> >>>>>>> +
> >>>>>>> +    if (interrupt_type == VFIO_INT_MSI) {
> >>>>>>> +        /* restore msi configuration */
> >>>>>>> +        msi_flags = pci_default_read_config(pdev,
> >>>>>>> +                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
> >>>>>>> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> >>>>>>> +
> >>>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >>>>>>> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> >>>>>>> +
> >>>>>>> +        msi_addr_lo = qemu_get_be32(f);
> >>>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> >>>>>>> +                              msi_addr_lo, 4);
> >>>>>>> +
> >>>>>>> +        msi_addr_hi = qemu_get_be32(f);
> >>>>>>> +        if (msi_64bit) {
> >>>>>>> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> >>>>>>> +                                  msi_addr_hi, 4);
> >>>>>>> +        }
> >>>>>>> +        msi_data = qemu_get_be32(f);
> >>>>>>> +        vfio_pci_write_config(pdev,
> >>>>>>> +                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> >>>>>>> +                msi_data, 2);
> >>>>>>> +
> >>>>>>> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> >>>>>>> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> >>>>>>> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> >>>>>>> +        uint16_t offset = qemu_get_be16(f);
> >>>>>>> +
> >>>>>>> +        /* load enable bit and maskall bit */
> >>>>>>> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> >>>>>>> +                              offset, 2);
> >>>>>>> +        msix_load(pdev, f);
> >>>>>>> +    }
> >>>>>>> +    pci_cmd = qemu_get_be16(f);
> >>>>>>> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> >>>>>>> +    return 0;
> >>>>>>> +}  
> >>>>>>
> >>>>>> It always seems like there should be a lot more state than this, and I
> >>>>>> probably sound like a broken record because I ask every time, but maybe
> >>>>>> that's a good indication that we (or at least I) need a comment
> >>>>>> explaining why we only care about these.  For example, what if we
> >>>>>> migrate a device in the D3 power state, don't we need to account for
> >>>>>> the state stored in the PM capability or does the device wake up into
> >>>>>> D0 auto-magically after migration?  I think we could repeat that
> >>>>>> question for every capability that can be modified.  Even for the MSI/X
> >>>>>> cases, the interrupt may not be active, but there could be state in
> >>>>>> virtual config space that would be different on the target.  For
> >>>>>> example, if we migrate with a device in INTx mode where the guest had
> >>>>>> written vector fields on the source, but only writes the enable bit on
> >>>>>> the target, can we seamlessly figure out the rest?  For other
> >>>>>> capabilities, that state may represent config space changes written
> >>>>>> through to the physical device and represent a functional difference on
> >>>>>> the target.  Thanks,
> >>>>>>        
> >>>>>
> >>>>> These are very basic set of registers from config state. Other are more
> >>>>> of vendor specific which vendor driver can save and restore in their own
> >>>>> data. I don't think we have to take care of all those vendor specific
> >>>>> fields here.  
> >>>>
> >>>> That had not been clear to me.  Intel folks, is this your understanding
> >>>> regarding the responsibility of the user to save and restore config
> >>>> space of the device as part of the vendor provided migration stream
> >>>> data?  Thanks,
> >>>>     
> >>> Currently, the code works for us. but I agree with you that there should
> >>> be more states to save, at least for emulated config bits.
> >>> I think we should call pci_device_save() to serve that purpose.
> >>>      
> >>
> >> If vendor driver can restore all vendor specific config space, then
> >> adding it again in QEMU might be redundant. As an example, I had mailed
> >> mtty sample code, in which config space has vendor specific information
> >> and that is restored in easy way.  
> > 
> > The redundancy is implementing it in each vendor driver.  Thanks,
> >   
> 
> Vendor driver knows better about vendor specific configs, isn't it 
> better vendor driver handle those at their end rather than adding vendor 
> specific quirks in QEMU?

Some capabilities, e.g. the vendor-specific capability, interact with
hardware in ways that are opaque to QEMU, so the vendor driver needs to
include that underlying state as part of its migration stream.  Other
capabilities are completely standard, for example the PM capability.
Do we really want every vendor driver to implement their own power
state save and restore?  I think we want to centralize anything we can
in the save/restore process so that we don't see different behavior and
different bugs from every single vendor driver.  Thanks,

Alex
diff mbox series

Patch

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6c77c12e44b9..8deb11e87ef7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -41,6 +41,7 @@ 
 #include "trace.h"
 #include "qapi/error.h"
 #include "migration/blocker.h"
+#include "migration/qemu-file.h"
 
 #define TYPE_VFIO_PCI "vfio-pci"
 #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -1632,6 +1633,50 @@  static void vfio_bars_prepare(VFIOPCIDevice *vdev)
     }
 }
 
+/*
+ * Check that the address currently held in BAR @nr of @vdev's config space
+ * is aligned to the size recorded for that BAR.
+ *
+ * Returns 0 when the BAR has no size recorded or its address is aligned,
+ * -EINVAL otherwise.
+ */
+static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    VFIOBAR *bar = &vdev->bars[nr];
+    uint32_t lo, hi = 0;
+    uint64_t base;
+
+    /* Skip unimplemented BARs and the upper half of 64bit BARS. */
+    if (bar->size == 0) {
+        return 0;
+    }
+
+    /* Low dword, keeping only the bits inside the I/O or memory mask. */
+    lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
+    lo &= (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
+                         PCI_BASE_ADDRESS_MEM_MASK);
+
+    /* A 64bit memory BAR takes its upper dword from the following slot. */
+    if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+        hi = pci_default_read_config(pdev,
+                                     PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
+    }
+
+    base = ((uint64_t)hi << 32) | lo;
+
+    return QEMU_IS_ALIGNED(base, bar->size) ? 0 : -EINVAL;
+}
+
+/*
+ * Run vfio_bar_validate() on every BAR slot below PCI_ROM_SLOT, stopping at
+ * the first failure.  On failure the offending slot index is reported and
+ * the error code is returned; returns 0 when every slot passes.
+ */
+static int vfio_bars_validate(VFIOPCIDevice *vdev)
+{
+    int nr;
+
+    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
+        int err = vfio_bar_validate(vdev, nr);
+
+        if (err != 0) {
+            error_report("vfio: BAR address %d validation failed", nr);
+            return err;
+        }
+    }
+
+    return 0;
+}
+
 static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
@@ -2414,11 +2459,129 @@  static Object *vfio_pci_get_object(VFIODevice *vbasedev)
     return OBJECT(vdev);
 }
 
+/*
+ * Write the PCI state of the device behind @vbasedev to migration stream @f.
+ *
+ * Stream layout, in order:
+ *   - each BAR register below PCI_ROM_SLOT as a be32
+ *   - vdev->interrupt as a be32
+ *   - VFIO_INT_MSI only: address low (be32), address high (be32, written as
+ *     0 when PCI_MSI_FLAGS_64BIT is clear), data (be32)
+ *   - VFIO_INT_MSIX only: a be16 read at PCI_MSIX_FLAGS + 1, then the output
+ *     of msix_save()
+ *   - PCI_COMMAND as a be16
+ */
+static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+    PCIDevice *pdev = &vdev->pdev;
+    uint16_t cmd;
+    int nr;
+
+    for (nr = 0; nr < PCI_ROM_SLOT; nr++) {
+        uint32_t val = pci_default_read_config(pdev,
+                                               PCI_BASE_ADDRESS_0 + nr * 4, 4);
+
+        qemu_put_be32(f, val);
+    }
+
+    qemu_put_be32(f, vdev->interrupt);
+
+    if (vdev->interrupt == VFIO_INT_MSI) {
+        uint32_t flags, lo, hi = 0, data;
+        int data_reg = PCI_MSI_DATA_32;
+
+        flags = pci_default_read_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+                                        2);
+
+        lo = pci_default_read_config(pdev,
+                                     pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
+        qemu_put_be32(f, lo);
+
+        /* 64bit capable: fetch the upper dword, use the DATA_64 offset. */
+        if (flags & PCI_MSI_FLAGS_64BIT) {
+            hi = pci_default_read_config(pdev,
+                                         pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+                                         4);
+            data_reg = PCI_MSI_DATA_64;
+        }
+        /* Always emitted so the stream layout does not depend on 64bit. */
+        qemu_put_be32(f, hi);
+
+        data = pci_default_read_config(pdev, pdev->msi_cap + data_reg, 2);
+        qemu_put_be32(f, data);
+    } else if (vdev->interrupt == VFIO_INT_MSIX) {
+        uint16_t ctrl;
+
+        /* save enable bit and maskall bit */
+        ctrl = pci_default_read_config(pdev,
+                                       pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
+        qemu_put_be16(f, ctrl);
+        msix_save(pdev, f);
+    }
+
+    cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+    qemu_put_be16(f, cmd);
+}
+
+/*
+ * Restore the PCI state of the device behind @vbasedev from migration stream
+ * @f, consuming the fields in the order vfio_pci_save_config() emits them and
+ * replaying each one through vfio_pci_write_config().
+ *
+ * Returns 0 on success, or the error from vfio_bars_validate() if a restored
+ * BAR fails validation; in that case the rest of the stream is left unread.
+ */
+static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+    PCIDevice *pdev = &vdev->pdev;
+    uint32_t interrupt_type;
+    uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+    uint16_t pci_cmd;
+    bool msi_64bit;
+    int i, ret;
+
+    /*
+     * Restore pci bar configuration.  Clear only the I/O and memory enable
+     * bits while the BARs are rewritten; this needs the bitwise complement
+     * (~), since logical negation (!) of a non-zero mask is 0 and would
+     * zero the whole command register.
+     */
+    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+    vfio_pci_write_config(pdev, PCI_COMMAND,
+                        pci_cmd & ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY), 2);
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        uint32_t bar = qemu_get_be32(f);
+
+        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
+    }
+
+    ret = vfio_bars_validate(vdev);
+    if (ret) {
+        return ret;
+    }
+
+    interrupt_type = qemu_get_be32(f);
+
+    if (interrupt_type == VFIO_INT_MSI) {
+        /* restore msi configuration */
+        msi_flags = pci_default_read_config(pdev,
+                                            pdev->msi_cap + PCI_MSI_FLAGS, 2);
+        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
+
+        /* Drop just the enable bit (~, not !) while address/data change. */
+        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+                              msi_flags & ~PCI_MSI_FLAGS_ENABLE, 2);
+
+        msi_addr_lo = qemu_get_be32(f);
+        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
+                              msi_addr_lo, 4);
+
+        /* The high dword is always in the stream; only 64bit caps use it. */
+        msi_addr_hi = qemu_get_be32(f);
+        if (msi_64bit) {
+            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
+                                  msi_addr_hi, 4);
+        }
+        /* Data travels as a be32 but is written as a 2-byte register. */
+        msi_data = qemu_get_be32(f);
+        vfio_pci_write_config(pdev,
+                pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
+                msi_data, 2);
+
+        /* Turn MSI back on now that address and data are in place. */
+        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
+                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
+    } else if (interrupt_type == VFIO_INT_MSIX) {
+        uint16_t offset = qemu_get_be16(f);
+
+        /* load enable bit and maskall bit */
+        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
+                              offset, 2);
+        msix_load(pdev, f);
+    }
+    /* The command word saved by the source is applied last. */
+    pci_cmd = qemu_get_be16(f);
+    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
+    return 0;
+}
+
 static VFIODeviceOps vfio_pci_ops = {
     .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
     .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
     .vfio_eoi = vfio_intx_eoi,
     .vfio_get_object = vfio_pci_get_object,
+    .vfio_save_config = vfio_pci_save_config,
+    .vfio_load_config = vfio_pci_load_config,
 };
 
 int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 74261feaeac9..d69a7f3ae31e 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -120,6 +120,8 @@  struct VFIODeviceOps {
     int (*vfio_hot_reset_multi)(VFIODevice *vdev);
     void (*vfio_eoi)(VFIODevice *vdev);
     Object *(*vfio_get_object)(VFIODevice *vdev);
+    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
+    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
 };
 
 typedef struct VFIOGroup {