diff mbox series

[net-next,v2,09/14] sfc: implement device status related vdpa config operations

Message ID 20230307113621.64153-10-gautam.dawar@amd.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series sfc: add vDPA support for EF100 devices | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 21 this patch: 21
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 20 this patch: 20
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 471 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Gautam Dawar March 7, 2023, 11:36 a.m. UTC
vDPA config operations to handle get/set device status and device
reset have been implemented. The .suspend config operation is also
implemented to support Live Migration.

Signed-off-by: Gautam Dawar <gautam.dawar@amd.com>
---
 drivers/net/ethernet/sfc/ef100_vdpa.c     |  16 +-
 drivers/net/ethernet/sfc/ef100_vdpa.h     |   2 +
 drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 367 ++++++++++++++++++++--
 3 files changed, 355 insertions(+), 30 deletions(-)

Comments

Jason Wang March 10, 2023, 5:05 a.m. UTC | #1
On Tue, Mar 7, 2023 at 7:38 PM Gautam Dawar <gautam.dawar@amd.com> wrote:
>
> vDPA config operations to handle get/set device status and device
> reset have been implemented. The .suspend config operation is also
> implemented to support Live Migration.
>
> Signed-off-by: Gautam Dawar <gautam.dawar@amd.com>
> ---
>  drivers/net/ethernet/sfc/ef100_vdpa.c     |  16 +-
>  drivers/net/ethernet/sfc/ef100_vdpa.h     |   2 +
>  drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 367 ++++++++++++++++++++--
>  3 files changed, 355 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
> index c66e5aef69ea..4ba57827a6cd 100644
> --- a/drivers/net/ethernet/sfc/ef100_vdpa.c
> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
> @@ -68,9 +68,14 @@ static int vdpa_allocate_vis(struct efx_nic *efx, unsigned int *allocated_vis)
>
>  static void ef100_vdpa_delete(struct efx_nic *efx)
>  {
> +       struct vdpa_device *vdpa_dev;
> +
>         if (efx->vdpa_nic) {
> +               vdpa_dev = &efx->vdpa_nic->vdpa_dev;
> +               ef100_vdpa_reset(vdpa_dev);
> +
>                 /* replace with _vdpa_unregister_device later */
> -               put_device(&efx->vdpa_nic->vdpa_dev.dev);
> +               put_device(&vdpa_dev->dev);
>         }
>         efx_mcdi_free_vis(efx);
>  }
> @@ -171,6 +176,15 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
>                 }
>         }
>
> +       rc = devm_add_action_or_reset(&efx->pci_dev->dev,
> +                                     ef100_vdpa_irq_vectors_free,
> +                                     efx->pci_dev);
> +       if (rc) {
> +               pci_err(efx->pci_dev,
> +                       "Failed adding devres for freeing irq vectors\n");
> +               goto err_put_device;
> +       }
> +
>         rc = get_net_config(vdpa_nic);
>         if (rc)
>                 goto err_put_device;
> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
> index 348ca8a7404b..58791402e454 100644
> --- a/drivers/net/ethernet/sfc/ef100_vdpa.h
> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
> @@ -149,6 +149,8 @@ int ef100_vdpa_register_mgmtdev(struct efx_nic *efx);
>  void ef100_vdpa_unregister_mgmtdev(struct efx_nic *efx);
>  void ef100_vdpa_irq_vectors_free(void *data);
>  int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx);
> +void ef100_vdpa_irq_vectors_free(void *data);
> +int ef100_vdpa_reset(struct vdpa_device *vdev);
>
>  static inline bool efx_vdpa_is_little_endian(struct ef100_vdpa_nic *vdpa_nic)
>  {
> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
> index 0051c4c0e47c..95a2177f85a2 100644
> --- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
> +++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
> @@ -22,11 +22,6 @@ static struct ef100_vdpa_nic *get_vdpa_nic(struct vdpa_device *vdev)
>         return container_of(vdev, struct ef100_vdpa_nic, vdpa_dev);
>  }
>
> -void ef100_vdpa_irq_vectors_free(void *data)
> -{
> -       pci_free_irq_vectors(data);
> -}
> -
>  static int create_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>  {
>         struct efx_vring_ctx *vring_ctx;
> @@ -52,14 +47,6 @@ static void delete_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>         vdpa_nic->vring[idx].vring_ctx = NULL;
>  }
>
> -static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
> -{
> -       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
> -       vdpa_nic->vring[idx].vring_state = 0;
> -       vdpa_nic->vring[idx].last_avail_idx = 0;
> -       vdpa_nic->vring[idx].last_used_idx = 0;
> -}
> -
>  int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>  {
>         u32 offset;
> @@ -103,6 +90,236 @@ static bool is_qid_invalid(struct ef100_vdpa_nic *vdpa_nic, u16 idx,
>         return false;
>  }
>
> +static void irq_vring_fini(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
> +{
> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
> +
> +       devm_free_irq(&pci_dev->dev, vring->irq, vring);
> +       vring->irq = -EINVAL;
> +}
> +
> +static irqreturn_t vring_intr_handler(int irq, void *arg)
> +{
> +       struct ef100_vdpa_vring_info *vring = arg;
> +
> +       if (vring->cb.callback)
> +               return vring->cb.callback(vring->cb.private);
> +
> +       return IRQ_NONE;
> +}
> +
> +static int ef100_vdpa_irq_vectors_alloc(struct pci_dev *pci_dev, u16 nvqs)
> +{
> +       int rc;
> +
> +       rc = pci_alloc_irq_vectors(pci_dev, nvqs, nvqs, PCI_IRQ_MSIX);
> +       if (rc < 0)
> +               pci_err(pci_dev,
> +                       "Failed to alloc %d IRQ vectors, err:%d\n", nvqs, rc);
> +       return rc;
> +}
> +
> +void ef100_vdpa_irq_vectors_free(void *data)
> +{
> +       pci_free_irq_vectors(data);
> +}
> +
> +static int irq_vring_init(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
> +{
> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
> +       int irq;
> +       int rc;
> +
> +       snprintf(vring->msix_name, 256, "x_vdpa[%s]-%d\n",
> +                pci_name(pci_dev), idx);
> +       irq = pci_irq_vector(pci_dev, idx);
> +       rc = devm_request_irq(&pci_dev->dev, irq, vring_intr_handler, 0,
> +                             vring->msix_name, vring);
> +       if (rc)
> +               pci_err(pci_dev,
> +                       "devm_request_irq failed for vring %d, rc %d\n",
> +                       idx, rc);
> +       else
> +               vring->irq = irq;
> +
> +       return rc;
> +}
> +
> +static int delete_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
> +{
> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
> +       int rc;
> +
> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
> +               return 0;
> +
> +       rc = efx_vdpa_vring_destroy(vdpa_nic->vring[idx].vring_ctx,
> +                                   &vring_dyn_cfg);
> +       if (rc)
> +               dev_err(&vdpa_nic->vdpa_dev.dev,
> +                       "%s: delete vring failed index:%u, err:%d\n",
> +                       __func__, idx, rc);
> +       vdpa_nic->vring[idx].last_avail_idx = vring_dyn_cfg.avail_idx;
> +       vdpa_nic->vring[idx].last_used_idx = vring_dyn_cfg.used_idx;
> +       vdpa_nic->vring[idx].vring_state &= ~EF100_VRING_CREATED;
> +
> +       irq_vring_fini(vdpa_nic, idx);
> +
> +       return rc;
> +}
> +
> +static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
> +{
> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> +       u32 idx_val;
> +
> +       if (is_qid_invalid(vdpa_nic, idx, __func__))
> +               return;
> +
> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
> +               return;
> +
> +       idx_val = idx;
> +       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
> +                   vdpa_nic->vring[idx].doorbell_offset);
> +}
> +
> +static bool can_create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
> +{
> +       if (vdpa_nic->vring[idx].vring_state == EF100_VRING_CONFIGURED &&
> +           vdpa_nic->status & VIRTIO_CONFIG_S_DRIVER_OK &&
> +           !(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
> +               return true;
> +
> +       return false;
> +}
> +
> +static int create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
> +{
> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
> +       struct efx_vring_cfg vring_cfg;
> +       int rc;
> +
> +       rc = irq_vring_init(vdpa_nic, idx);
> +       if (rc) {
> +               dev_err(&vdpa_nic->vdpa_dev.dev,
> +                       "%s: irq_vring_init failed. index:%u, err:%d\n",
> +                       __func__, idx, rc);
> +               return rc;
> +       }
> +       vring_cfg.desc = vdpa_nic->vring[idx].desc;
> +       vring_cfg.avail = vdpa_nic->vring[idx].avail;
> +       vring_cfg.used = vdpa_nic->vring[idx].used;
> +       vring_cfg.size = vdpa_nic->vring[idx].size;
> +       vring_cfg.features = vdpa_nic->features;
> +       vring_cfg.msix_vector = idx;
> +       vring_dyn_cfg.avail_idx = vdpa_nic->vring[idx].last_avail_idx;
> +       vring_dyn_cfg.used_idx = vdpa_nic->vring[idx].last_used_idx;
> +
> +       rc = efx_vdpa_vring_create(vdpa_nic->vring[idx].vring_ctx,
> +                                  &vring_cfg, &vring_dyn_cfg);
> +       if (rc) {
> +               dev_err(&vdpa_nic->vdpa_dev.dev,
> +                       "%s: vring_create failed index:%u, err:%d\n",
> +                       __func__, idx, rc);
> +               goto err_vring_create;
> +       }
> +       vdpa_nic->vring[idx].vring_state |= EF100_VRING_CREATED;
> +
> +       /* A VQ kick allows the device to read the avail_idx, which will be
> +        * required at the destination after live migration.
> +        */
> +       ef100_vdpa_kick_vq(&vdpa_nic->vdpa_dev, idx);
> +
> +       return 0;
> +
> +err_vring_create:
> +       irq_vring_fini(vdpa_nic, idx);
> +       return rc;
> +}
> +
> +static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
> +{
> +       delete_vring(vdpa_nic, idx);
> +       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
> +       vdpa_nic->vring[idx].vring_state = 0;
> +       vdpa_nic->vring[idx].last_avail_idx = 0;
> +       vdpa_nic->vring[idx].last_used_idx = 0;
> +}
> +
> +static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
> +{
> +       int i;
> +
> +       WARN_ON(!mutex_is_locked(&vdpa_nic->lock));
> +
> +       if (!vdpa_nic->status)
> +               return;
> +
> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
> +       vdpa_nic->status = 0;
> +       vdpa_nic->features = 0;
> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
> +               reset_vring(vdpa_nic, i);
> +       ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
> +}
> +
> +/* May be called under the rtnl lock */
> +int ef100_vdpa_reset(struct vdpa_device *vdev)
> +{
> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> +
> +       /* vdpa device can be deleted anytime but the bar_config
> +        * could still be vdpa and hence efx->state would be STATE_VDPA.
> +        * Accordingly, ensure vdpa device exists before reset handling
> +        */
> +       if (!vdpa_nic)
> +               return -ENODEV;
> +
> +       mutex_lock(&vdpa_nic->lock);
> +       ef100_reset_vdpa_device(vdpa_nic);
> +       mutex_unlock(&vdpa_nic->lock);
> +       return 0;
> +}
> +
> +static int start_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
> +{
> +       struct efx_nic *efx = vdpa_nic->efx;
> +       struct ef100_nic_data *nic_data;
> +       int i, j;
> +       int rc;
> +
> +       nic_data = efx->nic_data;
> +       rc = ef100_vdpa_irq_vectors_alloc(efx->pci_dev,
> +                                         vdpa_nic->max_queue_pairs * 2);
> +       if (rc < 0) {
> +               pci_err(efx->pci_dev,
> +                       "vDPA IRQ alloc failed for vf: %u err:%d\n",
> +                       nic_data->vf_index, rc);
> +               return rc;
> +       }
> +
> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
> +               if (can_create_vring(vdpa_nic, i)) {
> +                       rc = create_vring(vdpa_nic, i);
> +                       if (rc)
> +                               goto clear_vring;
> +               }
> +       }
> +
> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_STARTED;

It looks to me that this duplicates the DRIVER_OK status bit.

> +       return 0;
> +
> +clear_vring:
> +       for (j = 0; j < i; j++)
> +               delete_vring(vdpa_nic, j);
> +
> +       ef100_vdpa_irq_vectors_free(efx->pci_dev);
> +       return rc;
> +}
> +
>  static int ef100_vdpa_set_vq_address(struct vdpa_device *vdev,
>                                      u16 idx, u64 desc_area, u64 driver_area,
>                                      u64 device_area)
> @@ -144,22 +361,6 @@ static void ef100_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
>         mutex_unlock(&vdpa_nic->lock);
>  }
>
> -static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
> -{
> -       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> -       u32 idx_val;
> -
> -       if (is_qid_invalid(vdpa_nic, idx, __func__))
> -               return;
> -
> -       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
> -               return;
> -
> -       idx_val = idx;
> -       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
> -                   vdpa_nic->vring[idx].doorbell_offset);
> -}
> -
>  static void ef100_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx,
>                                  struct vdpa_callback *cb)
>  {
> @@ -176,6 +377,7 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>                                     bool ready)
>  {
>         struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> +       int rc;
>
>         if (is_qid_invalid(vdpa_nic, idx, __func__))
>                 return;
> @@ -184,9 +386,21 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>         if (ready) {
>                 vdpa_nic->vring[idx].vring_state |=
>                                         EF100_VRING_READY_CONFIGURED;
> +               if (vdpa_nic->vdpa_state == EF100_VDPA_STATE_STARTED &&
> +                   can_create_vring(vdpa_nic, idx)) {
> +                       rc = create_vring(vdpa_nic, idx);
> +                       if (rc)
> +                               /* Rollback ready configuration
> +                                * So that the above layer driver
> +                                * can make another attempt to set ready
> +                                */
> +                               vdpa_nic->vring[idx].vring_state &=
> +                                       ~EF100_VRING_READY_CONFIGURED;
> +               }
>         } else {
>                 vdpa_nic->vring[idx].vring_state &=
>                                         ~EF100_VRING_READY_CONFIGURED;
> +               delete_vring(vdpa_nic, idx);
>         }
>         mutex_unlock(&vdpa_nic->lock);
>  }
> @@ -296,6 +510,12 @@ static u64 ef100_vdpa_get_device_features(struct vdpa_device *vdev)
>         }
>
>         features |= BIT_ULL(VIRTIO_NET_F_MAC);
> +       /* As QEMU SVQ doesn't implement the following features,
> +        * masking them off to allow Live Migration
> +        */
> +       features &= ~BIT_ULL(VIRTIO_F_IN_ORDER);
> +       features &= ~BIT_ULL(VIRTIO_F_ORDER_PLATFORM);

It's better not to work around userspace bugs in the kernel. We should
fix Qemu instead.

> +
>         return features;
>  }
>
> @@ -356,6 +576,77 @@ static u32 ef100_vdpa_get_vendor_id(struct vdpa_device *vdev)
>         return EF100_VDPA_VENDOR_ID;
>  }
>
> +static u8 ef100_vdpa_get_status(struct vdpa_device *vdev)
> +{
> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> +       u8 status;
> +
> +       mutex_lock(&vdpa_nic->lock);
> +       status = vdpa_nic->status;
> +       mutex_unlock(&vdpa_nic->lock);
> +       return status;
> +}
> +
> +static void ef100_vdpa_set_status(struct vdpa_device *vdev, u8 status)
> +{
> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> +       u8 new_status;
> +       int rc;
> +
> +       mutex_lock(&vdpa_nic->lock);
> +       if (!status) {
> +               dev_info(&vdev->dev,
> +                        "%s: Status received is 0. Device reset being done\n",
> +                        __func__);

This is triggerable by userspace. It might be better to use
dev_dbg() instead.
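
For illustration, a minimal sketch of that change (same message, lower
severity) could be:

	dev_dbg(&vdev->dev,
		"%s: status is 0, device reset being done\n", __func__);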

> +               ef100_reset_vdpa_device(vdpa_nic);
> +               goto unlock_return;
> +       }
> +       new_status = status & ~vdpa_nic->status;
> +       if (new_status == 0) {
> +               dev_info(&vdev->dev,
> +                        "%s: New status same as current status\n", __func__);

Same here.

> +               goto unlock_return;
> +       }
> +       if (new_status & VIRTIO_CONFIG_S_FAILED) {
> +               ef100_reset_vdpa_device(vdpa_nic);
> +               goto unlock_return;
> +       }
> +
> +       if (new_status & VIRTIO_CONFIG_S_ACKNOWLEDGE) {
> +               vdpa_nic->status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
> +               new_status &= ~VIRTIO_CONFIG_S_ACKNOWLEDGE;
> +       }
> +       if (new_status & VIRTIO_CONFIG_S_DRIVER) {
> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER;
> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER;
> +       }
> +       if (new_status & VIRTIO_CONFIG_S_FEATURES_OK) {
> +               vdpa_nic->status |= VIRTIO_CONFIG_S_FEATURES_OK;
> +               vdpa_nic->vdpa_state = EF100_VDPA_STATE_NEGOTIATED;

It might be better to explain the reason we need to track another
state in vdpa_state instead of simply using the device status.

> +               new_status &= ~VIRTIO_CONFIG_S_FEATURES_OK;
> +       }
> +       if (new_status & VIRTIO_CONFIG_S_DRIVER_OK &&
> +           vdpa_nic->vdpa_state == EF100_VDPA_STATE_NEGOTIATED) {
> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER_OK;
> +               rc = start_vdpa_device(vdpa_nic);
> +               if (rc) {
> +                       dev_err(&vdpa_nic->vdpa_dev.dev,
> +                               "%s: vDPA device failed:%d\n", __func__, rc);
> +                       vdpa_nic->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
> +                       goto unlock_return;
> +               }
> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
> +       }
> +       if (new_status) {
> +               dev_warn(&vdev->dev,
> +                        "%s: Mismatch Status: %x & State: %u\n",
> +                        __func__, new_status, vdpa_nic->vdpa_state);
> +       }
> +
> +unlock_return:
> +       mutex_unlock(&vdpa_nic->lock);
> +}
> +
>  static size_t ef100_vdpa_get_config_size(struct vdpa_device *vdev)
>  {
>         return sizeof(struct virtio_net_config);
> @@ -393,6 +684,20 @@ static void ef100_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
>                 vdpa_nic->mac_configured = true;
>  }
>
> +static int ef100_vdpa_suspend(struct vdpa_device *vdev)
> +{
> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> +       int i, rc;
> +
> +       mutex_lock(&vdpa_nic->lock);
> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
> +               rc = delete_vring(vdpa_nic, i);

Note that the suspension matters for the whole device. It means the
config space should not be changed. But the code here only suspends
the vring, is this intended?

Reset may have the same issue.
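
As an illustration only (the "suspended" flag below is hypothetical and
not part of this patch), config writes could be rejected once the
device is suspended, e.g.:

	static void ef100_vdpa_set_config(struct vdpa_device *vdev,
					  unsigned int offset,
					  const void *buf, unsigned int len)
	{
		struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);

		mutex_lock(&vdpa_nic->lock);
		/* hypothetical flag, set by ef100_vdpa_suspend() */
		if (vdpa_nic->suspended) {
			mutex_unlock(&vdpa_nic->lock);
			return;
		}
		/* ... existing config-space update ... */
		mutex_unlock(&vdpa_nic->lock);
	}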

Thanks


> +               if (rc)
> +                       break;
> +       }
> +       mutex_unlock(&vdpa_nic->lock);
> +       return rc;
> +}
>  static void ef100_vdpa_free(struct vdpa_device *vdev)
>  {
>         struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
> @@ -428,9 +733,13 @@ const struct vdpa_config_ops ef100_vdpa_config_ops = {
>         .get_vq_num_max      = ef100_vdpa_get_vq_num_max,
>         .get_device_id       = ef100_vdpa_get_device_id,
>         .get_vendor_id       = ef100_vdpa_get_vendor_id,
> +       .get_status          = ef100_vdpa_get_status,
> +       .set_status          = ef100_vdpa_set_status,
> +       .reset               = ef100_vdpa_reset,
>         .get_config_size     = ef100_vdpa_get_config_size,
>         .get_config          = ef100_vdpa_get_config,
>         .set_config          = ef100_vdpa_set_config,
>         .get_generation      = NULL,
> +       .suspend             = ef100_vdpa_suspend,
>         .free                = ef100_vdpa_free,
>  };
> --
> 2.30.1
>
Gautam Dawar March 13, 2023, 12:10 p.m. UTC | #2
On 3/10/23 10:35, Jason Wang wrote:
>
> On Tue, Mar 7, 2023 at 7:38 PM Gautam Dawar <gautam.dawar@amd.com> wrote:
>> vDPA config operations to handle get/set device status and device
>> reset have been implemented. The .suspend config operation is also
>> implemented to support Live Migration.
>>
>> Signed-off-by: Gautam Dawar <gautam.dawar@amd.com>
>> ---
>>   drivers/net/ethernet/sfc/ef100_vdpa.c     |  16 +-
>>   drivers/net/ethernet/sfc/ef100_vdpa.h     |   2 +
>>   drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 367 ++++++++++++++++++++--
>>   3 files changed, 355 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
>> index c66e5aef69ea..4ba57827a6cd 100644
>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.c
>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
>> @@ -68,9 +68,14 @@ static int vdpa_allocate_vis(struct efx_nic *efx, unsigned int *allocated_vis)
>>
>>   static void ef100_vdpa_delete(struct efx_nic *efx)
>>   {
>> +       struct vdpa_device *vdpa_dev;
>> +
>>          if (efx->vdpa_nic) {
>> +               vdpa_dev = &efx->vdpa_nic->vdpa_dev;
>> +               ef100_vdpa_reset(vdpa_dev);
>> +
>>                  /* replace with _vdpa_unregister_device later */
>> -               put_device(&efx->vdpa_nic->vdpa_dev.dev);
>> +               put_device(&vdpa_dev->dev);
>>          }
>>          efx_mcdi_free_vis(efx);
>>   }
>> @@ -171,6 +176,15 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
>>                  }
>>          }
>>
>> +       rc = devm_add_action_or_reset(&efx->pci_dev->dev,
>> +                                     ef100_vdpa_irq_vectors_free,
>> +                                     efx->pci_dev);
>> +       if (rc) {
>> +               pci_err(efx->pci_dev,
>> +                       "Failed adding devres for freeing irq vectors\n");
>> +               goto err_put_device;
>> +       }
>> +
>>          rc = get_net_config(vdpa_nic);
>>          if (rc)
>>                  goto err_put_device;
>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
>> index 348ca8a7404b..58791402e454 100644
>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.h
>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
>> @@ -149,6 +149,8 @@ int ef100_vdpa_register_mgmtdev(struct efx_nic *efx);
>>   void ef100_vdpa_unregister_mgmtdev(struct efx_nic *efx);
>>   void ef100_vdpa_irq_vectors_free(void *data);
>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx);
>> +void ef100_vdpa_irq_vectors_free(void *data);
>> +int ef100_vdpa_reset(struct vdpa_device *vdev);
>>
>>   static inline bool efx_vdpa_is_little_endian(struct ef100_vdpa_nic *vdpa_nic)
>>   {
>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>> index 0051c4c0e47c..95a2177f85a2 100644
>> --- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>> @@ -22,11 +22,6 @@ static struct ef100_vdpa_nic *get_vdpa_nic(struct vdpa_device *vdev)
>>          return container_of(vdev, struct ef100_vdpa_nic, vdpa_dev);
>>   }
>>
>> -void ef100_vdpa_irq_vectors_free(void *data)
>> -{
>> -       pci_free_irq_vectors(data);
>> -}
>> -
>>   static int create_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>   {
>>          struct efx_vring_ctx *vring_ctx;
>> @@ -52,14 +47,6 @@ static void delete_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>          vdpa_nic->vring[idx].vring_ctx = NULL;
>>   }
>>
>> -static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>> -{
>> -       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>> -       vdpa_nic->vring[idx].vring_state = 0;
>> -       vdpa_nic->vring[idx].last_avail_idx = 0;
>> -       vdpa_nic->vring[idx].last_used_idx = 0;
>> -}
>> -
>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>   {
>>          u32 offset;
>> @@ -103,6 +90,236 @@ static bool is_qid_invalid(struct ef100_vdpa_nic *vdpa_nic, u16 idx,
>>          return false;
>>   }
>>
>> +static void irq_vring_fini(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>> +{
>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>> +
>> +       devm_free_irq(&pci_dev->dev, vring->irq, vring);
>> +       vring->irq = -EINVAL;
>> +}
>> +
>> +static irqreturn_t vring_intr_handler(int irq, void *arg)
>> +{
>> +       struct ef100_vdpa_vring_info *vring = arg;
>> +
>> +       if (vring->cb.callback)
>> +               return vring->cb.callback(vring->cb.private);
>> +
>> +       return IRQ_NONE;
>> +}
>> +
>> +static int ef100_vdpa_irq_vectors_alloc(struct pci_dev *pci_dev, u16 nvqs)
>> +{
>> +       int rc;
>> +
>> +       rc = pci_alloc_irq_vectors(pci_dev, nvqs, nvqs, PCI_IRQ_MSIX);
>> +       if (rc < 0)
>> +               pci_err(pci_dev,
>> +                       "Failed to alloc %d IRQ vectors, err:%d\n", nvqs, rc);
>> +       return rc;
>> +}
>> +
>> +void ef100_vdpa_irq_vectors_free(void *data)
>> +{
>> +       pci_free_irq_vectors(data);
>> +}
>> +
>> +static int irq_vring_init(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>> +{
>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>> +       int irq;
>> +       int rc;
>> +
>> +       snprintf(vring->msix_name, 256, "x_vdpa[%s]-%d\n",
>> +                pci_name(pci_dev), idx);
>> +       irq = pci_irq_vector(pci_dev, idx);
>> +       rc = devm_request_irq(&pci_dev->dev, irq, vring_intr_handler, 0,
>> +                             vring->msix_name, vring);
>> +       if (rc)
>> +               pci_err(pci_dev,
>> +                       "devm_request_irq failed for vring %d, rc %d\n",
>> +                       idx, rc);
>> +       else
>> +               vring->irq = irq;
>> +
>> +       return rc;
>> +}
>> +
>> +static int delete_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>> +{
>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>> +       int rc;
>> +
>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>> +               return 0;
>> +
>> +       rc = efx_vdpa_vring_destroy(vdpa_nic->vring[idx].vring_ctx,
>> +                                   &vring_dyn_cfg);
>> +       if (rc)
>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>> +                       "%s: delete vring failed index:%u, err:%d\n",
>> +                       __func__, idx, rc);
>> +       vdpa_nic->vring[idx].last_avail_idx = vring_dyn_cfg.avail_idx;
>> +       vdpa_nic->vring[idx].last_used_idx = vring_dyn_cfg.used_idx;
>> +       vdpa_nic->vring[idx].vring_state &= ~EF100_VRING_CREATED;
>> +
>> +       irq_vring_fini(vdpa_nic, idx);
>> +
>> +       return rc;
>> +}
>> +
>> +static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>> +{
>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> +       u32 idx_val;
>> +
>> +       if (is_qid_invalid(vdpa_nic, idx, __func__))
>> +               return;
>> +
>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>> +               return;
>> +
>> +       idx_val = idx;
>> +       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>> +                   vdpa_nic->vring[idx].doorbell_offset);
>> +}
>> +
>> +static bool can_create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>> +{
>> +       if (vdpa_nic->vring[idx].vring_state == EF100_VRING_CONFIGURED &&
>> +           vdpa_nic->status & VIRTIO_CONFIG_S_DRIVER_OK &&
>> +           !(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>> +               return true;
>> +
>> +       return false;
>> +}
>> +
>> +static int create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>> +{
>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>> +       struct efx_vring_cfg vring_cfg;
>> +       int rc;
>> +
>> +       rc = irq_vring_init(vdpa_nic, idx);
>> +       if (rc) {
>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>> +                       "%s: irq_vring_init failed. index:%u, err:%d\n",
>> +                       __func__, idx, rc);
>> +               return rc;
>> +       }
>> +       vring_cfg.desc = vdpa_nic->vring[idx].desc;
>> +       vring_cfg.avail = vdpa_nic->vring[idx].avail;
>> +       vring_cfg.used = vdpa_nic->vring[idx].used;
>> +       vring_cfg.size = vdpa_nic->vring[idx].size;
>> +       vring_cfg.features = vdpa_nic->features;
>> +       vring_cfg.msix_vector = idx;
>> +       vring_dyn_cfg.avail_idx = vdpa_nic->vring[idx].last_avail_idx;
>> +       vring_dyn_cfg.used_idx = vdpa_nic->vring[idx].last_used_idx;
>> +
>> +       rc = efx_vdpa_vring_create(vdpa_nic->vring[idx].vring_ctx,
>> +                                  &vring_cfg, &vring_dyn_cfg);
>> +       if (rc) {
>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>> +                       "%s: vring_create failed index:%u, err:%d\n",
>> +                       __func__, idx, rc);
>> +               goto err_vring_create;
>> +       }
>> +       vdpa_nic->vring[idx].vring_state |= EF100_VRING_CREATED;
>> +
>> +       /* A VQ kick allows the device to read the avail_idx, which will be
>> +        * required at the destination after live migration.
>> +        */
>> +       ef100_vdpa_kick_vq(&vdpa_nic->vdpa_dev, idx);
>> +
>> +       return 0;
>> +
>> +err_vring_create:
>> +       irq_vring_fini(vdpa_nic, idx);
>> +       return rc;
>> +}
>> +
>> +static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>> +{
>> +       delete_vring(vdpa_nic, idx);
>> +       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>> +       vdpa_nic->vring[idx].vring_state = 0;
>> +       vdpa_nic->vring[idx].last_avail_idx = 0;
>> +       vdpa_nic->vring[idx].last_used_idx = 0;
>> +}
>> +
>> +static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>> +{
>> +       int i;
>> +
>> +       WARN_ON(!mutex_is_locked(&vdpa_nic->lock));
>> +
>> +       if (!vdpa_nic->status)
>> +               return;
>> +
>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
>> +       vdpa_nic->status = 0;
>> +       vdpa_nic->features = 0;
>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
>> +               reset_vring(vdpa_nic, i);
>> +       ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
>> +}
>> +
>> +/* May be called under the rtnl lock */
>> +int ef100_vdpa_reset(struct vdpa_device *vdev)
>> +{
>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> +
>> +       /* vdpa device can be deleted anytime but the bar_config
>> +        * could still be vdpa and hence efx->state would be STATE_VDPA.
>> +        * Accordingly, ensure vdpa device exists before reset handling
>> +        */
>> +       if (!vdpa_nic)
>> +               return -ENODEV;
>> +
>> +       mutex_lock(&vdpa_nic->lock);
>> +       ef100_reset_vdpa_device(vdpa_nic);
>> +       mutex_unlock(&vdpa_nic->lock);
>> +       return 0;
>> +}
>> +
>> +static int start_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>> +{
>> +       struct efx_nic *efx = vdpa_nic->efx;
>> +       struct ef100_nic_data *nic_data;
>> +       int i, j;
>> +       int rc;
>> +
>> +       nic_data = efx->nic_data;
>> +       rc = ef100_vdpa_irq_vectors_alloc(efx->pci_dev,
>> +                                         vdpa_nic->max_queue_pairs * 2);
>> +       if (rc < 0) {
>> +               pci_err(efx->pci_dev,
>> +                       "vDPA IRQ alloc failed for vf: %u err:%d\n",
>> +                       nic_data->vf_index, rc);
>> +               return rc;
>> +       }
>> +
>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>> +               if (can_create_vring(vdpa_nic, i)) {
>> +                       rc = create_vring(vdpa_nic, i);
>> +                       if (rc)
>> +                               goto clear_vring;
>> +               }
>> +       }
>> +
>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_STARTED;
> It looks to me that this duplicates the DRIVER_OK status bit.
vdpa_state is set to EF100_VDPA_STATE_STARTED during DRIVER_OK
handling. See my later response for its purpose.
>
>> +       return 0;
>> +
>> +clear_vring:
>> +       for (j = 0; j < i; j++)
>> +               delete_vring(vdpa_nic, j);
>> +
>> +       ef100_vdpa_irq_vectors_free(efx->pci_dev);
>> +       return rc;
>> +}
>> +
>>   static int ef100_vdpa_set_vq_address(struct vdpa_device *vdev,
>>                                       u16 idx, u64 desc_area, u64 driver_area,
>>                                       u64 device_area)
>> @@ -144,22 +361,6 @@ static void ef100_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
>>          mutex_unlock(&vdpa_nic->lock);
>>   }
>>
>> -static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>> -{
>> -       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> -       u32 idx_val;
>> -
>> -       if (is_qid_invalid(vdpa_nic, idx, __func__))
>> -               return;
>> -
>> -       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>> -               return;
>> -
>> -       idx_val = idx;
>> -       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>> -                   vdpa_nic->vring[idx].doorbell_offset);
>> -}
>> -
>>   static void ef100_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx,
>>                                   struct vdpa_callback *cb)
>>   {
>> @@ -176,6 +377,7 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>                                      bool ready)
>>   {
>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> +       int rc;
>>
>>          if (is_qid_invalid(vdpa_nic, idx, __func__))
>>                  return;
>> @@ -184,9 +386,21 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>          if (ready) {
>>                  vdpa_nic->vring[idx].vring_state |=
>>                                          EF100_VRING_READY_CONFIGURED;
>> +               if (vdpa_nic->vdpa_state == EF100_VDPA_STATE_STARTED &&
>> +                   can_create_vring(vdpa_nic, idx)) {
>> +                       rc = create_vring(vdpa_nic, idx);
>> +                       if (rc)
>> +                               /* Rollback ready configuration
>> +                                * So that the above layer driver
>> +                                * can make another attempt to set ready
>> +                                */
>> +                               vdpa_nic->vring[idx].vring_state &=
>> +                                       ~EF100_VRING_READY_CONFIGURED;
>> +               }
>>          } else {
>>                  vdpa_nic->vring[idx].vring_state &=
>>                                          ~EF100_VRING_READY_CONFIGURED;
>> +               delete_vring(vdpa_nic, idx);
>>          }
>>          mutex_unlock(&vdpa_nic->lock);
>>   }
>> @@ -296,6 +510,12 @@ static u64 ef100_vdpa_get_device_features(struct vdpa_device *vdev)
>>          }
>>
>>          features |= BIT_ULL(VIRTIO_NET_F_MAC);
>> +       /* As QEMU SVQ doesn't implement the following features,
>> +        * masking them off to allow Live Migration
>> +        */
>> +       features &= ~BIT_ULL(VIRTIO_F_IN_ORDER);
>> +       features &= ~BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
> It's better not to work around userspace bugs in the kernel. We should
> fix Qemu instead.

There's already a QEMU patch [1] submitted to support
VIRTIO_F_ORDER_PLATFORM but it hasn't concluded yet. Also, there is no
support for VIRTIO_F_IN_ORDER in the kernel virtio driver. The
motivation for this change is to have VM Live Migration working with
the in-tree kernel driver without requiring any changes.

Once QEMU is able to handle these features, we can submit a patch to 
undo these changes.

>
>> +
>>          return features;
>>   }
>>
>> @@ -356,6 +576,77 @@ static u32 ef100_vdpa_get_vendor_id(struct vdpa_device *vdev)
>>          return EF100_VDPA_VENDOR_ID;
>>   }
>>
>> +static u8 ef100_vdpa_get_status(struct vdpa_device *vdev)
>> +{
>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> +       u8 status;
>> +
>> +       mutex_lock(&vdpa_nic->lock);
>> +       status = vdpa_nic->status;
>> +       mutex_unlock(&vdpa_nic->lock);
>> +       return status;
>> +}
>> +
>> +static void ef100_vdpa_set_status(struct vdpa_device *vdev, u8 status)
>> +{
>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> +       u8 new_status;
>> +       int rc;
>> +
>> +       mutex_lock(&vdpa_nic->lock);
>> +       if (!status) {
>> +               dev_info(&vdev->dev,
>> +                        "%s: Status received is 0. Device reset being done\n",
>> +                        __func__);
> This is triggerable by userspace. It might be better to use
> dev_dbg() instead.
Will change.
>
>> +               ef100_reset_vdpa_device(vdpa_nic);
>> +               goto unlock_return;
>> +       }
>> +       new_status = status & ~vdpa_nic->status;
>> +       if (new_status == 0) {
>> +               dev_info(&vdev->dev,
>> +                        "%s: New status same as current status\n", __func__);
> Same here.
Ok.
>
>> +               goto unlock_return;
>> +       }
>> +       if (new_status & VIRTIO_CONFIG_S_FAILED) {
>> +               ef100_reset_vdpa_device(vdpa_nic);
>> +               goto unlock_return;
>> +       }
>> +
>> +       if (new_status & VIRTIO_CONFIG_S_ACKNOWLEDGE) {
>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
>> +               new_status &= ~VIRTIO_CONFIG_S_ACKNOWLEDGE;
>> +       }
>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER) {
>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER;
>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER;
>> +       }
>> +       if (new_status & VIRTIO_CONFIG_S_FEATURES_OK) {
>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_FEATURES_OK;
>> +               vdpa_nic->vdpa_state = EF100_VDPA_STATE_NEGOTIATED;
> It might be better to explain the reason we need to track another
> state in vdpa_state instead of simply using the device status.
vdpa_state helps ensure correct status transitions in the .set_status
callback and safeguards against an incorrect or malicious user-space
driver.
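
Roughly, the transitions enforced via vdpa_state are (state names as in
this patch):

	/*
	 * EF100_VDPA_STATE_INITIALIZED --FEATURES_OK--> EF100_VDPA_STATE_NEGOTIATED
	 * EF100_VDPA_STATE_NEGOTIATED  --DRIVER_OK----> EF100_VDPA_STATE_STARTED
	 * any state --(reset or FAILED)---------------> EF100_VDPA_STATE_INITIALIZED
	 *
	 * DRIVER_OK is acted upon only in the NEGOTIATED state, so a
	 * driver that skips feature negotiation cannot start the device.
	 */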
>
>> +               new_status &= ~VIRTIO_CONFIG_S_FEATURES_OK;
>> +       }
>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER_OK &&
>> +           vdpa_nic->vdpa_state == EF100_VDPA_STATE_NEGOTIATED) {
>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER_OK;
>> +               rc = start_vdpa_device(vdpa_nic);
>> +               if (rc) {
>> +                       dev_err(&vdpa_nic->vdpa_dev.dev,
>> +                               "%s: vDPA device failed:%d\n", __func__, rc);
>> +                       vdpa_nic->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>> +                       goto unlock_return;
>> +               }
>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>> +       }
>> +       if (new_status) {
>> +               dev_warn(&vdev->dev,
>> +                        "%s: Mismatch Status: %x & State: %u\n",
>> +                        __func__, new_status, vdpa_nic->vdpa_state);
>> +       }
>> +
>> +unlock_return:
>> +       mutex_unlock(&vdpa_nic->lock);
>> +}
>> +
>>   static size_t ef100_vdpa_get_config_size(struct vdpa_device *vdev)
>>   {
>>          return sizeof(struct virtio_net_config);
>> @@ -393,6 +684,20 @@ static void ef100_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
>>                  vdpa_nic->mac_configured = true;
>>   }
>>
>> +static int ef100_vdpa_suspend(struct vdpa_device *vdev)
>> +{
>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> +       int i, rc;
>> +
>> +       mutex_lock(&vdpa_nic->lock);
>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>> +               rc = delete_vring(vdpa_nic, i);
> Note that the suspension matters for the whole device. It means the
> config space should not be changed. But the code here only suspends
> the vring, is this intended?
Are you referring to the possibility of updating the device configuration
(e.g. MAC address) using .set_config() after the suspend operation? Is
there any other user-triggered operation that falls in this category?
>
> Reset may have the same issue.
Could you please elaborate on the requirement during device reset?
>
> Thanks
[1] https://patchew.org/QEMU/20230213191929.1547497-1-eperezma@redhat.com/
>
>> +               if (rc)
>> +                       break;
>> +       }
>> +       mutex_unlock(&vdpa_nic->lock);
>> +       return rc;
>> +}
>>   static void ef100_vdpa_free(struct vdpa_device *vdev)
>>   {
>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>> @@ -428,9 +733,13 @@ const struct vdpa_config_ops ef100_vdpa_config_ops = {
>>          .get_vq_num_max      = ef100_vdpa_get_vq_num_max,
>>          .get_device_id       = ef100_vdpa_get_device_id,
>>          .get_vendor_id       = ef100_vdpa_get_vendor_id,
>> +       .get_status          = ef100_vdpa_get_status,
>> +       .set_status          = ef100_vdpa_set_status,
>> +       .reset               = ef100_vdpa_reset,
>>          .get_config_size     = ef100_vdpa_get_config_size,
>>          .get_config          = ef100_vdpa_get_config,
>>          .set_config          = ef100_vdpa_set_config,
>>          .get_generation      = NULL,
>> +       .suspend             = ef100_vdpa_suspend,
>>          .free                = ef100_vdpa_free,
>>   };
>> --
>> 2.30.1
>>
Jason Wang March 15, 2023, 5 a.m. UTC | #3
On 2023/3/13 20:10, Gautam Dawar wrote:
>
> On 3/10/23 10:35, Jason Wang wrote:
>>
>> On Tue, Mar 7, 2023 at 7:38 PM Gautam Dawar <gautam.dawar@amd.com> wrote:
>>> vDPA config operations to handle get/set device status and device
>>> reset have been implemented. The .suspend config operation is also
>>> implemented to support Live Migration.
>>>
>>> Signed-off-by: Gautam Dawar <gautam.dawar@amd.com>
>>> ---
>>>   drivers/net/ethernet/sfc/ef100_vdpa.c     |  16 +-
>>>   drivers/net/ethernet/sfc/ef100_vdpa.h     |   2 +
>>>   drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 367 ++++++++++++++++++++--
>>>   3 files changed, 355 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
>>> index c66e5aef69ea..4ba57827a6cd 100644
>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.c
>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
>>> @@ -68,9 +68,14 @@ static int vdpa_allocate_vis(struct efx_nic *efx, unsigned int *allocated_vis)
>>>
>>>   static void ef100_vdpa_delete(struct efx_nic *efx)
>>>   {
>>> +       struct vdpa_device *vdpa_dev;
>>> +
>>>          if (efx->vdpa_nic) {
>>> +               vdpa_dev = &efx->vdpa_nic->vdpa_dev;
>>> +               ef100_vdpa_reset(vdpa_dev);
>>> +
>>>                  /* replace with _vdpa_unregister_device later */
>>> -               put_device(&efx->vdpa_nic->vdpa_dev.dev);
>>> +               put_device(&vdpa_dev->dev);
>>>          }
>>>          efx_mcdi_free_vis(efx);
>>>   }
>>> @@ -171,6 +176,15 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
>>>                  }
>>>          }
>>>
>>> +       rc = devm_add_action_or_reset(&efx->pci_dev->dev,
>>> +                                     ef100_vdpa_irq_vectors_free,
>>> +                                     efx->pci_dev);
>>> +       if (rc) {
>>> +               pci_err(efx->pci_dev,
>>> +                       "Failed adding devres for freeing irq vectors\n");
>>> +               goto err_put_device;
>>> +       }
>>> +
>>>          rc = get_net_config(vdpa_nic);
>>>          if (rc)
>>>                  goto err_put_device;
>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
>>> index 348ca8a7404b..58791402e454 100644
>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.h
>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
>>> @@ -149,6 +149,8 @@ int ef100_vdpa_register_mgmtdev(struct efx_nic *efx);
>>>   void ef100_vdpa_unregister_mgmtdev(struct efx_nic *efx);
>>>   void ef100_vdpa_irq_vectors_free(void *data);
>>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx);
>>> +void ef100_vdpa_irq_vectors_free(void *data);
>>> +int ef100_vdpa_reset(struct vdpa_device *vdev);
>>>
>>>   static inline bool efx_vdpa_is_little_endian(struct ef100_vdpa_nic *vdpa_nic)
>>>   {
>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>> index 0051c4c0e47c..95a2177f85a2 100644
>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>> @@ -22,11 +22,6 @@ static struct ef100_vdpa_nic *get_vdpa_nic(struct vdpa_device *vdev)
>>>          return container_of(vdev, struct ef100_vdpa_nic, vdpa_dev);
>>>   }
>>>
>>> -void ef100_vdpa_irq_vectors_free(void *data)
>>> -{
>>> -       pci_free_irq_vectors(data);
>>> -}
>>> -
>>>   static int create_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>   {
>>>          struct efx_vring_ctx *vring_ctx;
>>> @@ -52,14 +47,6 @@ static void delete_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>          vdpa_nic->vring[idx].vring_ctx = NULL;
>>>   }
>>>
>>> -static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>> -{
>>> -       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>>> -       vdpa_nic->vring[idx].vring_state = 0;
>>> -       vdpa_nic->vring[idx].last_avail_idx = 0;
>>> -       vdpa_nic->vring[idx].last_used_idx = 0;
>>> -}
>>> -
>>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>   {
>>>          u32 offset;
>>> @@ -103,6 +90,236 @@ static bool is_qid_invalid(struct ef100_vdpa_nic *vdpa_nic, u16 idx,
>>>          return false;
>>>   }
>>>
>>> +static void irq_vring_fini(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>> +{
>>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>>> +
>>> +       devm_free_irq(&pci_dev->dev, vring->irq, vring);
>>> +       vring->irq = -EINVAL;
>>> +}
>>> +
>>> +static irqreturn_t vring_intr_handler(int irq, void *arg)
>>> +{
>>> +       struct ef100_vdpa_vring_info *vring = arg;
>>> +
>>> +       if (vring->cb.callback)
>>> +               return vring->cb.callback(vring->cb.private);
>>> +
>>> +       return IRQ_NONE;
>>> +}
>>> +
>>> +static int ef100_vdpa_irq_vectors_alloc(struct pci_dev *pci_dev, u16 nvqs)
>>> +{
>>> +       int rc;
>>> +
>>> +       rc = pci_alloc_irq_vectors(pci_dev, nvqs, nvqs, PCI_IRQ_MSIX);
>>> +       if (rc < 0)
>>> +               pci_err(pci_dev,
>>> +                       "Failed to alloc %d IRQ vectors, err:%d\n", nvqs, rc);
>>> +       return rc;
>>> +}
>>> +
>>> +void ef100_vdpa_irq_vectors_free(void *data)
>>> +{
>>> +       pci_free_irq_vectors(data);
>>> +}
>>> +
>>> +static int irq_vring_init(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>> +{
>>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>>> +       int irq;
>>> +       int rc;
>>> +
>>> +       snprintf(vring->msix_name, 256, "x_vdpa[%s]-%d\n",
>>> +                pci_name(pci_dev), idx);
>>> +       irq = pci_irq_vector(pci_dev, idx);
>>> +       rc = devm_request_irq(&pci_dev->dev, irq, vring_intr_handler, 0,
>>> +                             vring->msix_name, vring);
>>> +       if (rc)
>>> +               pci_err(pci_dev,
>>> +                       "devm_request_irq failed for vring %d, rc %d\n",
>>> +                       idx, rc);
>>> +       else
>>> +               vring->irq = irq;
>>> +
>>> +       return rc;
>>> +}
>>> +
>>> +static int delete_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>> +{
>>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>>> +       int rc;
>>> +
>>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>> +               return 0;
>>> +
>>> +       rc = efx_vdpa_vring_destroy(vdpa_nic->vring[idx].vring_ctx,
>>> +                                   &vring_dyn_cfg);
>>> +       if (rc)
>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>> +                       "%s: delete vring failed index:%u, err:%d\n",
>>> +                       __func__, idx, rc);
>>> +       vdpa_nic->vring[idx].last_avail_idx = vring_dyn_cfg.avail_idx;
>>> +       vdpa_nic->vring[idx].last_used_idx = vring_dyn_cfg.used_idx;
>>> +       vdpa_nic->vring[idx].vring_state &= ~EF100_VRING_CREATED;
>>> +
>>> +       irq_vring_fini(vdpa_nic, idx);
>>> +
>>> +       return rc;
>>> +}
>>> +
>>> +static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>>> +{
>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> +       u32 idx_val;
>>> +
>>> +       if (is_qid_invalid(vdpa_nic, idx, __func__))
>>> +               return;
>>> +
>>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>> +               return;
>>> +
>>> +       idx_val = idx;
>>> +       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>>> +                   vdpa_nic->vring[idx].doorbell_offset);
>>> +}
>>> +
>>> +static bool can_create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>> +{
>>> +       if (vdpa_nic->vring[idx].vring_state == EF100_VRING_CONFIGURED &&
>>> +           vdpa_nic->status & VIRTIO_CONFIG_S_DRIVER_OK &&
>>> +           !(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>> +               return true;
>>> +
>>> +       return false;
>>> +}
>>> +
>>> +static int create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>> +{
>>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>>> +       struct efx_vring_cfg vring_cfg;
>>> +       int rc;
>>> +
>>> +       rc = irq_vring_init(vdpa_nic, idx);
>>> +       if (rc) {
>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>> +                       "%s: irq_vring_init failed. index:%u, err:%d\n",
>>> +                       __func__, idx, rc);
>>> +               return rc;
>>> +       }
>>> +       vring_cfg.desc = vdpa_nic->vring[idx].desc;
>>> +       vring_cfg.avail = vdpa_nic->vring[idx].avail;
>>> +       vring_cfg.used = vdpa_nic->vring[idx].used;
>>> +       vring_cfg.size = vdpa_nic->vring[idx].size;
>>> +       vring_cfg.features = vdpa_nic->features;
>>> +       vring_cfg.msix_vector = idx;
>>> +       vring_dyn_cfg.avail_idx = vdpa_nic->vring[idx].last_avail_idx;
>>> +       vring_dyn_cfg.used_idx = vdpa_nic->vring[idx].last_used_idx;
>>> +
>>> +       rc = efx_vdpa_vring_create(vdpa_nic->vring[idx].vring_ctx,
>>> +                                  &vring_cfg, &vring_dyn_cfg);
>>> +       if (rc) {
>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>> +                       "%s: vring_create failed index:%u, err:%d\n",
>>> +                       __func__, idx, rc);
>>> +               goto err_vring_create;
>>> +       }
>>> +       vdpa_nic->vring[idx].vring_state |= EF100_VRING_CREATED;
>>> +
>>> +       /* A VQ kick allows the device to read the avail_idx, which will be
>>> +        * required at the destination after live migration.
>>> +        */
>>> +       ef100_vdpa_kick_vq(&vdpa_nic->vdpa_dev, idx);
>>> +
>>> +       return 0;
>>> +
>>> +err_vring_create:
>>> +       irq_vring_fini(vdpa_nic, idx);
>>> +       return rc;
>>> +}
>>> +
>>> +static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>> +{
>>> +       delete_vring(vdpa_nic, idx);
>>> +       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>>> +       vdpa_nic->vring[idx].vring_state = 0;
>>> +       vdpa_nic->vring[idx].last_avail_idx = 0;
>>> +       vdpa_nic->vring[idx].last_used_idx = 0;
>>> +}
>>> +
>>> +static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>>> +{
>>> +       int i;
>>> +
>>> +       WARN_ON(!mutex_is_locked(&vdpa_nic->lock));
>>> +
>>> +       if (!vdpa_nic->status)
>>> +               return;
>>> +
>>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
>>> +       vdpa_nic->status = 0;
>>> +       vdpa_nic->features = 0;
>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
>>> +               reset_vring(vdpa_nic, i);
>>> +       ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
>>> +}
>>> +
>>> +/* May be called under the rtnl lock */
>>> +int ef100_vdpa_reset(struct vdpa_device *vdev)
>>> +{
>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> +
>>> +       /* vdpa device can be deleted anytime but the bar_config
>>> +        * could still be vdpa and hence efx->state would be STATE_VDPA.
>>> +        * Accordingly, ensure vdpa device exists before reset handling
>>> +        */
>>> +       if (!vdpa_nic)
>>> +               return -ENODEV;
>>> +
>>> +       mutex_lock(&vdpa_nic->lock);
>>> +       ef100_reset_vdpa_device(vdpa_nic);
>>> +       mutex_unlock(&vdpa_nic->lock);
>>> +       return 0;
>>> +}
>>> +
>>> +static int start_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>>> +{
>>> +       struct efx_nic *efx = vdpa_nic->efx;
>>> +       struct ef100_nic_data *nic_data;
>>> +       int i, j;
>>> +       int rc;
>>> +
>>> +       nic_data = efx->nic_data;
>>> +       rc = ef100_vdpa_irq_vectors_alloc(efx->pci_dev,
>>> +                                         vdpa_nic->max_queue_pairs * 2);
>>> +       if (rc < 0) {
>>> +               pci_err(efx->pci_dev,
>>> +                       "vDPA IRQ alloc failed for vf: %u err:%d\n",
>>> +                       nic_data->vf_index, rc);
>>> +               return rc;
>>> +       }
>>> +
>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>>> +               if (can_create_vring(vdpa_nic, i)) {
>>> +                       rc = create_vring(vdpa_nic, i);
>>> +                       if (rc)
>>> +                               goto clear_vring;
>>> +               }
>>> +       }
>>> +
>>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_STARTED;
>> It looks to me that this duplicates the DRIVER_OK status bit.
> vdpa_state is set to EF100_VDPA_STATE_STARTED during DRIVER_OK
> handling. See my later response for its purpose.
>>
>>> +       return 0;
>>> +
>>> +clear_vring:
>>> +       for (j = 0; j < i; j++)
>>> +               delete_vring(vdpa_nic, j);
>>> +
>>> +       ef100_vdpa_irq_vectors_free(efx->pci_dev);
>>> +       return rc;
>>> +}
>>> +
>>>   static int ef100_vdpa_set_vq_address(struct vdpa_device *vdev,
>>>                                       u16 idx, u64 desc_area, u64 driver_area,
>>>                                       u64 device_area)
>>> @@ -144,22 +361,6 @@ static void ef100_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
>>>          mutex_unlock(&vdpa_nic->lock);
>>>   }
>>>
>>> -static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>>> -{
>>> -       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> -       u32 idx_val;
>>> -
>>> -       if (is_qid_invalid(vdpa_nic, idx, __func__))
>>> -               return;
>>> -
>>> -       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>> -               return;
>>> -
>>> -       idx_val = idx;
>>> -       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>>> -                   vdpa_nic->vring[idx].doorbell_offset);
>>> -}
>>> -
>>>   static void ef100_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx,
>>>                                   struct vdpa_callback *cb)
>>>   {
>>> @@ -176,6 +377,7 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>>                                      bool ready)
>>>   {
>>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> +       int rc;
>>>
>>>          if (is_qid_invalid(vdpa_nic, idx, __func__))
>>>                  return;
>>> @@ -184,9 +386,21 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>>          if (ready) {
>>>                  vdpa_nic->vring[idx].vring_state |=
>>>                          EF100_VRING_READY_CONFIGURED;
>>> +               if (vdpa_nic->vdpa_state == EF100_VDPA_STATE_STARTED &&
>>> +                   can_create_vring(vdpa_nic, idx)) {
>>> +                       rc = create_vring(vdpa_nic, idx);
>>> +                       if (rc)
>>> +                               /* Rollback ready configuration
>>> +                                * So that the above layer driver
>>> +                                * can make another attempt to set ready
>>> +                                */
>>> +                               vdpa_nic->vring[idx].vring_state &=
>>> +                                       ~EF100_VRING_READY_CONFIGURED;
>>> +               }
>>>          } else {
>>>                  vdpa_nic->vring[idx].vring_state &=
>>>                          ~EF100_VRING_READY_CONFIGURED;
>>> +               delete_vring(vdpa_nic, idx);
>>>          }
>>>          mutex_unlock(&vdpa_nic->lock);
>>>   }
>>> @@ -296,6 +510,12 @@ static u64 ef100_vdpa_get_device_features(struct vdpa_device *vdev)
>>>          }
>>>
>>>          features |= BIT_ULL(VIRTIO_NET_F_MAC);
>>> +       /* As QEMU SVQ doesn't implement the following features,
>>> +        * masking them off to allow Live Migration
>>> +        */
>>> +       features &= ~BIT_ULL(VIRTIO_F_IN_ORDER);
>>> +       features &= ~BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
>> It's better not to work around userspace bugs in the kernel. We should
>> fix Qemu instead.
>
> There's already a QEMU patch [1] submitted to support 
> VIRTIO_F_ORDER_PLATFORM but it hasn't concluded yet. Also, there is no 
> support for VIRTIO_F_IN_ORDER in the kernel virtio driver. The motive 
> of this change is to have VM Live Migration working with the kernel 
> in-tree driver without requiring any changes.
>
> Once QEMU is able to handle these features, we can submit a patch to 
> undo these changes.


I can understand the motivation, but that works for prototyping, not 
formal kernel code (especially considering SVQ is not mature and still 
under development). What's more, we cannot assume Qemu is the only 
user; we have other users like DPDK and cloud-hypervisors.

Thanks


>
>>
>>> +
>>>          return features;
>>>   }
>>>
>>> @@ -356,6 +576,77 @@ static u32 ef100_vdpa_get_vendor_id(struct vdpa_device *vdev)
>>>          return EF100_VDPA_VENDOR_ID;
>>>   }
>>>
>>> +static u8 ef100_vdpa_get_status(struct vdpa_device *vdev)
>>> +{
>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> +       u8 status;
>>> +
>>> +       mutex_lock(&vdpa_nic->lock);
>>> +       status = vdpa_nic->status;
>>> +       mutex_unlock(&vdpa_nic->lock);
>>> +       return status;
>>> +}
>>> +
>>> +static void ef100_vdpa_set_status(struct vdpa_device *vdev, u8 status)
>>> +{
>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> +       u8 new_status;
>>> +       int rc;
>>> +
>>> +       mutex_lock(&vdpa_nic->lock);
>>> +       if (!status) {
>>> +               dev_info(&vdev->dev,
>>> +                        "%s: Status received is 0. Device reset 
>>> being done\n",
>>> +                        __func__);
>> This is trigger-able by the userspace. It might be better to use
>> dev_dbg() instead.
> Will change.
>>
>>> +               ef100_reset_vdpa_device(vdpa_nic);
>>> +               goto unlock_return;
>>> +       }
>>> +       new_status = status & ~vdpa_nic->status;
>>> +       if (new_status == 0) {
>>> +               dev_info(&vdev->dev,
>>> +                        "%s: New status same as current status\n", 
>>> __func__);
>> Same here.
> Ok.
>>
>>> +               goto unlock_return;
>>> +       }
>>> +       if (new_status & VIRTIO_CONFIG_S_FAILED) {
>>> +               ef100_reset_vdpa_device(vdpa_nic);
>>> +               goto unlock_return;
>>> +       }
>>> +
>>> +       if (new_status & VIRTIO_CONFIG_S_ACKNOWLEDGE) {
>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
>>> +               new_status &= ~VIRTIO_CONFIG_S_ACKNOWLEDGE;
>>> +       }
>>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER) {
>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER;
>>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER;
>>> +       }
>>> +       if (new_status & VIRTIO_CONFIG_S_FEATURES_OK) {
>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_FEATURES_OK;
>>> +               vdpa_nic->vdpa_state = EF100_VDPA_STATE_NEGOTIATED;
>> It might be better to explain the reason we need to track another
>> state in vdpa_state instead of simply using the device status.
> vdpa_state helps to ensure correct status transitions in the 
> .set_status callback and safeguards against an incorrect/malicious 
> user-space driver.


Ok, let's document this in the definition of vdpa_state.
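Something along these lines would do (a rough sketch only; the state
names are taken from this series, but the enum tag and kernel-doc
wording are illustrative, not a requirement):

/**
 * enum ef100_vdpa_state - bring-up state machine enforced by .set_status()
 *
 * @EF100_VDPA_STATE_INITIALIZED: device created or reset, nothing
 *	negotiated yet
 * @EF100_VDPA_STATE_NEGOTIATED: FEATURES_OK handled, vring setup
 *	may proceed
 * @EF100_VDPA_STATE_STARTED: DRIVER_OK handled and vrings created
 *
 * Tracked separately from the raw status byte so that out-of-order
 * transitions from a buggy or malicious user-space driver (e.g.
 * DRIVER_OK before FEATURES_OK) can be detected and rejected.
 */
enum ef100_vdpa_state {
	EF100_VDPA_STATE_INITIALIZED,
	EF100_VDPA_STATE_NEGOTIATED,
	EF100_VDPA_STATE_STARTED,
};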


>>
>>> +               new_status &= ~VIRTIO_CONFIG_S_FEATURES_OK;
>>> +       }
>>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER_OK &&
>>> +           vdpa_nic->vdpa_state == EF100_VDPA_STATE_NEGOTIATED) {
>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER_OK;
>>> +               rc = start_vdpa_device(vdpa_nic);
>>> +               if (rc) {
>>> +                       dev_err(&vdpa_nic->vdpa_dev.dev,
>>> +                               "%s: vDPA device failed:%d\n", __func__, rc);
>>> +                       vdpa_nic->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>>> +                       goto unlock_return;
>>> +               }
>>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>>> +       }
>>> +       if (new_status) {
>>> +               dev_warn(&vdev->dev,
>>> +                        "%s: Mismatch Status: %x & State: %u\n",
>>> +                        __func__, new_status, vdpa_nic->vdpa_state);
>>> +       }
>>> +
>>> +unlock_return:
>>> +       mutex_unlock(&vdpa_nic->lock);
>>> +}
>>> +
>>>   static size_t ef100_vdpa_get_config_size(struct vdpa_device *vdev)
>>>   {
>>>          return sizeof(struct virtio_net_config);
>>> @@ -393,6 +684,20 @@ static void ef100_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
>>>                  vdpa_nic->mac_configured = true;
>>>   }
>>>
>>> +static int ef100_vdpa_suspend(struct vdpa_device *vdev)
>>> +{
>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> +       int i, rc;
>>> +
>>> +       mutex_lock(&vdpa_nic->lock);
>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>>> +               rc = delete_vring(vdpa_nic, i);
>> Note that the suspension matters for the whole device. It means the
>> config space should not be changed. But the code here only suspends
>> the vring, is this intended?
> Are you referring to the possibility of updating device configuration 
> (eg. MAC address) using .set_config() after suspend operation? Is 
> there any other user triggered operation that falls in this category?


Updating the MAC should be prohibited; one typical use case is the link status.


>>
>> Reset may have the same issue.
> Could you pls elaborate on the requirement during device reset?


I meant ef100_reset_vdpa_device() may suffer from the same issue:

It only resets all the vrings but not the config space?

Thanks


>>
>> Thanks
> [1] 
> https://patchew.org/QEMU/20230213191929.1547497-1-eperezma@redhat.com/
>>
>>> +               if (rc)
>>> +                       break;
>>> +       }
>>> +       mutex_unlock(&vdpa_nic->lock);
>>> +       return rc;
>>> +}
>>>   static void ef100_vdpa_free(struct vdpa_device *vdev)
>>>   {
>>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>> @@ -428,9 +733,13 @@ const struct vdpa_config_ops ef100_vdpa_config_ops = {
>>>          .get_vq_num_max      = ef100_vdpa_get_vq_num_max,
>>>          .get_device_id       = ef100_vdpa_get_device_id,
>>>          .get_vendor_id       = ef100_vdpa_get_vendor_id,
>>> +       .get_status          = ef100_vdpa_get_status,
>>> +       .set_status          = ef100_vdpa_set_status,
>>> +       .reset               = ef100_vdpa_reset,
>>>          .get_config_size     = ef100_vdpa_get_config_size,
>>>          .get_config          = ef100_vdpa_get_config,
>>>          .set_config          = ef100_vdpa_set_config,
>>>          .get_generation      = NULL,
>>> +       .suspend             = ef100_vdpa_suspend,
>>>          .free                = ef100_vdpa_free,
>>>   };
>>> -- 
>>> 2.30.1
>>>
>
Gautam Dawar March 15, 2023, 5:18 p.m. UTC | #4
On 3/15/23 10:30, Jason Wang wrote:
>
>
> On 2023/3/13 20:10, Gautam Dawar wrote:
>>
>> On 3/10/23 10:35, Jason Wang wrote:
>>>
>>>
>>> On Tue, Mar 7, 2023 at 7:38 PM Gautam Dawar <gautam.dawar@amd.com>
>>> wrote:
>>>> vDPA config operations to handle get/set device status and device
>>>> reset have been implemented. Also .suspend config operation is
>>>> implemented to support Live Migration.
>>>>
>>>> Signed-off-by: Gautam Dawar <gautam.dawar@amd.com>
>>>> ---
>>>>   drivers/net/ethernet/sfc/ef100_vdpa.c     |  16 +-
>>>>   drivers/net/ethernet/sfc/ef100_vdpa.h     |   2 +
>>>>   drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 367
>>>> ++++++++++++++++++++--
>>>>   3 files changed, 355 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
>>>> index c66e5aef69ea..4ba57827a6cd 100644
>>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.c
>>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
>>>> @@ -68,9 +68,14 @@ static int vdpa_allocate_vis(struct efx_nic *efx, unsigned int *allocated_vis)
>>>>
>>>>   static void ef100_vdpa_delete(struct efx_nic *efx)
>>>>   {
>>>> +       struct vdpa_device *vdpa_dev;
>>>> +
>>>>          if (efx->vdpa_nic) {
>>>> +               vdpa_dev = &efx->vdpa_nic->vdpa_dev;
>>>> +               ef100_vdpa_reset(vdpa_dev);
>>>> +
>>>>                  /* replace with _vdpa_unregister_device later */
>>>> -               put_device(&efx->vdpa_nic->vdpa_dev.dev);
>>>> +               put_device(&vdpa_dev->dev);
>>>>          }
>>>>          efx_mcdi_free_vis(efx);
>>>>   }
>>>> @@ -171,6 +176,15 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
>>>>                  }
>>>>          }
>>>>
>>>> +       rc = devm_add_action_or_reset(&efx->pci_dev->dev,
>>>> +                                     ef100_vdpa_irq_vectors_free,
>>>> +                                     efx->pci_dev);
>>>> +       if (rc) {
>>>> +               pci_err(efx->pci_dev,
>>>> +                       "Failed adding devres for freeing irq
>>>> vectors\n");
>>>> +               goto err_put_device;
>>>> +       }
>>>> +
>>>>          rc = get_net_config(vdpa_nic);
>>>>          if (rc)
>>>>                  goto err_put_device;
>>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
>>>> index 348ca8a7404b..58791402e454 100644
>>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.h
>>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
>>>> @@ -149,6 +149,8 @@ int ef100_vdpa_register_mgmtdev(struct efx_nic *efx);
>>>>   void ef100_vdpa_unregister_mgmtdev(struct efx_nic *efx);
>>>>   void ef100_vdpa_irq_vectors_free(void *data);
>>>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx);
>>>> +void ef100_vdpa_irq_vectors_free(void *data);
>>>> +int ef100_vdpa_reset(struct vdpa_device *vdev);
>>>>
>>>>   static inline bool efx_vdpa_is_little_endian(struct ef100_vdpa_nic *vdpa_nic)
>>>>   {
>>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>>> index 0051c4c0e47c..95a2177f85a2 100644
>>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>>> @@ -22,11 +22,6 @@ static struct ef100_vdpa_nic *get_vdpa_nic(struct vdpa_device *vdev)
>>>>          return container_of(vdev, struct ef100_vdpa_nic, vdpa_dev);
>>>>   }
>>>>
>>>> -void ef100_vdpa_irq_vectors_free(void *data)
>>>> -{
>>>> -       pci_free_irq_vectors(data);
>>>> -}
>>>> -
>>>>   static int create_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>   {
>>>>          struct efx_vring_ctx *vring_ctx;
>>>> @@ -52,14 +47,6 @@ static void delete_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>          vdpa_nic->vring[idx].vring_ctx = NULL;
>>>>   }
>>>>
>>>> -static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>> -{
>>>> -       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>>>> -       vdpa_nic->vring[idx].vring_state = 0;
>>>> -       vdpa_nic->vring[idx].last_avail_idx = 0;
>>>> -       vdpa_nic->vring[idx].last_used_idx = 0;
>>>> -}
>>>> -
>>>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>   {
>>>>          u32 offset;
>>>> @@ -103,6 +90,236 @@ static bool is_qid_invalid(struct ef100_vdpa_nic *vdpa_nic, u16 idx,
>>>>          return false;
>>>>   }
>>>>
>>>> +static void irq_vring_fini(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>> +{
>>>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>>>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>>>> +
>>>> +       devm_free_irq(&pci_dev->dev, vring->irq, vring);
>>>> +       vring->irq = -EINVAL;
>>>> +}
>>>> +
>>>> +static irqreturn_t vring_intr_handler(int irq, void *arg)
>>>> +{
>>>> +       struct ef100_vdpa_vring_info *vring = arg;
>>>> +
>>>> +       if (vring->cb.callback)
>>>> +               return vring->cb.callback(vring->cb.private);
>>>> +
>>>> +       return IRQ_NONE;
>>>> +}
>>>> +
>>>> +static int ef100_vdpa_irq_vectors_alloc(struct pci_dev *pci_dev, u16 nvqs)
>>>> +{
>>>> +       int rc;
>>>> +
>>>> +       rc = pci_alloc_irq_vectors(pci_dev, nvqs, nvqs, PCI_IRQ_MSIX);
>>>> +       if (rc < 0)
>>>> +               pci_err(pci_dev,
>>>> +                       "Failed to alloc %d IRQ vectors, err:%d\n",
>>>> nvqs, rc);
>>>> +       return rc;
>>>> +}
>>>> +
>>>> +void ef100_vdpa_irq_vectors_free(void *data)
>>>> +{
>>>> +       pci_free_irq_vectors(data);
>>>> +}
>>>> +
>>>> +static int irq_vring_init(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>> +{
>>>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>>>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>>>> +       int irq;
>>>> +       int rc;
>>>> +
>>>> +       snprintf(vring->msix_name, 256, "x_vdpa[%s]-%d\n",
>>>> +                pci_name(pci_dev), idx);
>>>> +       irq = pci_irq_vector(pci_dev, idx);
>>>> +       rc = devm_request_irq(&pci_dev->dev, irq, vring_intr_handler, 0,
>>>> +                             vring->msix_name, vring);
>>>> +       if (rc)
>>>> +               pci_err(pci_dev,
>>>> +                       "devm_request_irq failed for vring %d, rc
>>>> %d\n",
>>>> +                       idx, rc);
>>>> +       else
>>>> +               vring->irq = irq;
>>>> +
>>>> +       return rc;
>>>> +}
>>>> +
>>>> +static int delete_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>> +{
>>>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>>>> +       int rc;
>>>> +
>>>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>> +               return 0;
>>>> +
>>>> +       rc = efx_vdpa_vring_destroy(vdpa_nic->vring[idx].vring_ctx,
>>>> +                                   &vring_dyn_cfg);
>>>> +       if (rc)
>>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>>> +                       "%s: delete vring failed index:%u, err:%d\n",
>>>> +                       __func__, idx, rc);
>>>> +       vdpa_nic->vring[idx].last_avail_idx = vring_dyn_cfg.avail_idx;
>>>> +       vdpa_nic->vring[idx].last_used_idx = vring_dyn_cfg.used_idx;
>>>> +       vdpa_nic->vring[idx].vring_state &= ~EF100_VRING_CREATED;
>>>> +
>>>> +       irq_vring_fini(vdpa_nic, idx);
>>>> +
>>>> +       return rc;
>>>> +}
>>>> +
>>>> +static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>>>> +{
>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> +       u32 idx_val;
>>>> +
>>>> +       if (is_qid_invalid(vdpa_nic, idx, __func__))
>>>> +               return;
>>>> +
>>>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>> +               return;
>>>> +
>>>> +       idx_val = idx;
>>>> +       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>>>> +                   vdpa_nic->vring[idx].doorbell_offset);
>>>> +}
>>>> +
>>>> +static bool can_create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>> +{
>>>> +       if (vdpa_nic->vring[idx].vring_state == EF100_VRING_CONFIGURED &&
>>>> +           vdpa_nic->status & VIRTIO_CONFIG_S_DRIVER_OK &&
>>>> +           !(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>> +               return true;
>>>> +
>>>> +       return false;
>>>> +}
>>>> +
>>>> +static int create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>> +{
>>>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>>>> +       struct efx_vring_cfg vring_cfg;
>>>> +       int rc;
>>>> +
>>>> +       rc = irq_vring_init(vdpa_nic, idx);
>>>> +       if (rc) {
>>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>>> +                       "%s: irq_vring_init failed. index:%u,
>>>> err:%d\n",
>>>> +                       __func__, idx, rc);
>>>> +               return rc;
>>>> +       }
>>>> +       vring_cfg.desc = vdpa_nic->vring[idx].desc;
>>>> +       vring_cfg.avail = vdpa_nic->vring[idx].avail;
>>>> +       vring_cfg.used = vdpa_nic->vring[idx].used;
>>>> +       vring_cfg.size = vdpa_nic->vring[idx].size;
>>>> +       vring_cfg.features = vdpa_nic->features;
>>>> +       vring_cfg.msix_vector = idx;
>>>> +       vring_dyn_cfg.avail_idx = vdpa_nic->vring[idx].last_avail_idx;
>>>> +       vring_dyn_cfg.used_idx = vdpa_nic->vring[idx].last_used_idx;
>>>> +
>>>> +       rc = efx_vdpa_vring_create(vdpa_nic->vring[idx].vring_ctx,
>>>> +                                  &vring_cfg, &vring_dyn_cfg);
>>>> +       if (rc) {
>>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>>> +                       "%s: vring_create failed index:%u, err:%d\n",
>>>> +                       __func__, idx, rc);
>>>> +               goto err_vring_create;
>>>> +       }
>>>> +       vdpa_nic->vring[idx].vring_state |= EF100_VRING_CREATED;
>>>> +
>>>> +       /* A VQ kick allows the device to read the avail_idx, which will be
>>>> +        * required at the destination after live migration.
>>>> +        */
>>>> +       ef100_vdpa_kick_vq(&vdpa_nic->vdpa_dev, idx);
>>>> +
>>>> +       return 0;
>>>> +
>>>> +err_vring_create:
>>>> +       irq_vring_fini(vdpa_nic, idx);
>>>> +       return rc;
>>>> +}
>>>> +
>>>> +static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>> +{
>>>> +       delete_vring(vdpa_nic, idx);
>>>> +       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>>>> +       vdpa_nic->vring[idx].vring_state = 0;
>>>> +       vdpa_nic->vring[idx].last_avail_idx = 0;
>>>> +       vdpa_nic->vring[idx].last_used_idx = 0;
>>>> +}
>>>> +
>>>> +static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>>>> +{
>>>> +       int i;
>>>> +
>>>> +       WARN_ON(!mutex_is_locked(&vdpa_nic->lock));
>>>> +
>>>> +       if (!vdpa_nic->status)
>>>> +               return;
>>>> +
>>>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
>>>> +       vdpa_nic->status = 0;
>>>> +       vdpa_nic->features = 0;
>>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
>>>> +               reset_vring(vdpa_nic, i);
>>>> +       ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
>>>> +}
>>>> +
>>>> +/* May be called under the rtnl lock */
>>>> +int ef100_vdpa_reset(struct vdpa_device *vdev)
>>>> +{
>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> +
>>>> +       /* vdpa device can be deleted anytime but the bar_config
>>>> +        * could still be vdpa and hence efx->state would be STATE_VDPA.
>>>> +        * Accordingly, ensure vdpa device exists before reset handling
>>>> +        */
>>>> +       if (!vdpa_nic)
>>>> +               return -ENODEV;
>>>> +
>>>> +       mutex_lock(&vdpa_nic->lock);
>>>> +       ef100_reset_vdpa_device(vdpa_nic);
>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>> +       return 0;
>>>> +}
>>>> +
>>>> +static int start_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>>>> +{
>>>> +       struct efx_nic *efx = vdpa_nic->efx;
>>>> +       struct ef100_nic_data *nic_data;
>>>> +       int i, j;
>>>> +       int rc;
>>>> +
>>>> +       nic_data = efx->nic_data;
>>>> +       rc = ef100_vdpa_irq_vectors_alloc(efx->pci_dev,
>>>> +                                         vdpa_nic->max_queue_pairs * 2);
>>>> +       if (rc < 0) {
>>>> +               pci_err(efx->pci_dev,
>>>> +                       "vDPA IRQ alloc failed for vf: %u err:%d\n",
>>>> +                       nic_data->vf_index, rc);
>>>> +               return rc;
>>>> +       }
>>>> +
>>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>>>> +               if (can_create_vring(vdpa_nic, i)) {
>>>> +                       rc = create_vring(vdpa_nic, i);
>>>> +                       if (rc)
>>>> +                               goto clear_vring;
>>>> +               }
>>>> +       }
>>>> +
>>>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_STARTED;
>>> It looks to me that this duplicates the DRIVER_OK status bit.
>> vdpa_state is set to EF100_VDPA_STATE_STARTED during DRIVER_OK handling.
>> See my later response for its purpose.
>>>
>>>> +       return 0;
>>>> +
>>>> +clear_vring:
>>>> +       for (j = 0; j < i; j++)
>>>> +               delete_vring(vdpa_nic, j);
>>>> +
>>>> +       ef100_vdpa_irq_vectors_free(efx->pci_dev);
>>>> +       return rc;
>>>> +}
>>>> +
>>>>   static int ef100_vdpa_set_vq_address(struct vdpa_device *vdev,
>>>>                                       u16 idx, u64 desc_area, u64 driver_area,
>>>>                                       u64 device_area)
>>>> @@ -144,22 +361,6 @@ static void ef100_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
>>>>          mutex_unlock(&vdpa_nic->lock);
>>>>   }
>>>>
>>>> -static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>>>> -{
>>>> -       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> -       u32 idx_val;
>>>> -
>>>> -       if (is_qid_invalid(vdpa_nic, idx, __func__))
>>>> -               return;
>>>> -
>>>> -       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>> -               return;
>>>> -
>>>> -       idx_val = idx;
>>>> -       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>>>> -                   vdpa_nic->vring[idx].doorbell_offset);
>>>> -}
>>>> -
>>>>   static void ef100_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx,
>>>>                                   struct vdpa_callback *cb)
>>>>   {
>>>> @@ -176,6 +377,7 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>>>                                      bool ready)
>>>>   {
>>>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> +       int rc;
>>>>
>>>>          if (is_qid_invalid(vdpa_nic, idx, __func__))
>>>>                  return;
>>>> @@ -184,9 +386,21 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>>>          if (ready) {
>>>>                  vdpa_nic->vring[idx].vring_state |=
>>>>                          EF100_VRING_READY_CONFIGURED;
>>>> +               if (vdpa_nic->vdpa_state == EF100_VDPA_STATE_STARTED &&
>>>> +                   can_create_vring(vdpa_nic, idx)) {
>>>> +                       rc = create_vring(vdpa_nic, idx);
>>>> +                       if (rc)
>>>> +                               /* Rollback ready configuration
>>>> +                                * So that the above layer driver
>>>> +                                * can make another attempt to set ready
>>>> +                                */
>>>> +                               vdpa_nic->vring[idx].vring_state &=
>>>> +                                       ~EF100_VRING_READY_CONFIGURED;
>>>> +               }
>>>>          } else {
>>>>                  vdpa_nic->vring[idx].vring_state &=
>>>>                          ~EF100_VRING_READY_CONFIGURED;
>>>> +               delete_vring(vdpa_nic, idx);
>>>>          }
>>>>          mutex_unlock(&vdpa_nic->lock);
>>>>   }
>>>> @@ -296,6 +510,12 @@ static u64 ef100_vdpa_get_device_features(struct vdpa_device *vdev)
>>>>          }
>>>>
>>>>          features |= BIT_ULL(VIRTIO_NET_F_MAC);
>>>> +       /* As QEMU SVQ doesn't implement the following features,
>>>> +        * masking them off to allow Live Migration
>>>> +        */
>>>> +       features &= ~BIT_ULL(VIRTIO_F_IN_ORDER);
>>>> +       features &= ~BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
>>> It's better not to work around userspace bugs in the kernel. We should
>>> fix Qemu instead.
>>
>> There's already a QEMU patch [1] submitted to support
>> VIRTIO_F_ORDER_PLATFORM but it hasn't concluded yet. Also, there is no
>> support for VIRTIO_F_IN_ORDER in the kernel virtio driver. The motive
>> of this change is to have VM Live Migration working with the kernel
>> in-tree driver without requiring any changes.
>>
>> Once QEMU is able to handle these features, we can submit a patch to
>> undo these changes.
>
>
> I can understand the motivation, but that works for prototyping, not
> formal kernel code (especially considering SVQ is not mature and still
> under development). What's more, we cannot assume Qemu is the only
> user; we have other users like DPDK and cloud-hypervisors.

Ok, if the expectation is to have the user deal with the issues and make 
required changes on the in-tree driver to make it work, I'll remove this 
part.

>
> Thanks
>
>
>>
>>>
>>>> +
>>>>          return features;
>>>>   }
>>>>
>>>> @@ -356,6 +576,77 @@ static u32 ef100_vdpa_get_vendor_id(struct vdpa_device *vdev)
>>>>          return EF100_VDPA_VENDOR_ID;
>>>>   }
>>>>
>>>> +static u8 ef100_vdpa_get_status(struct vdpa_device *vdev)
>>>> +{
>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> +       u8 status;
>>>> +
>>>> +       mutex_lock(&vdpa_nic->lock);
>>>> +       status = vdpa_nic->status;
>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>> +       return status;
>>>> +}
>>>> +
>>>> +static void ef100_vdpa_set_status(struct vdpa_device *vdev, u8 status)
>>>> +{
>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> +       u8 new_status;
>>>> +       int rc;
>>>> +
>>>> +       mutex_lock(&vdpa_nic->lock);
>>>> +       if (!status) {
>>>> +               dev_info(&vdev->dev,
>>>> +                        "%s: Status received is 0. Device reset
>>>> being done\n",
>>>> +                        __func__);
>>> This is trigger-able by the userspace. It might be better to use
>>> dev_dbg() instead.
>> Will change.
>>>
>>>> +               ef100_reset_vdpa_device(vdpa_nic);
>>>> +               goto unlock_return;
>>>> +       }
>>>> +       new_status = status & ~vdpa_nic->status;
>>>> +       if (new_status == 0) {
>>>> +               dev_info(&vdev->dev,
>>>> +                        "%s: New status same as current status\n",
>>>> __func__);
>>> Same here.
>> Ok.
>>>
>>>> +               goto unlock_return;
>>>> +       }
>>>> +       if (new_status & VIRTIO_CONFIG_S_FAILED) {
>>>> +               ef100_reset_vdpa_device(vdpa_nic);
>>>> +               goto unlock_return;
>>>> +       }
>>>> +
>>>> +       if (new_status & VIRTIO_CONFIG_S_ACKNOWLEDGE) {
>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
>>>> +               new_status &= ~VIRTIO_CONFIG_S_ACKNOWLEDGE;
>>>> +       }
>>>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER) {
>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER;
>>>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER;
>>>> +       }
>>>> +       if (new_status & VIRTIO_CONFIG_S_FEATURES_OK) {
>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_FEATURES_OK;
>>>> +               vdpa_nic->vdpa_state = EF100_VDPA_STATE_NEGOTIATED;
>>> It might be better to explain the reason we need to track another
>>> state in vdpa_state instead of simply using the device status.
>> vdpa_state helps to ensure correct status transitions in the
>> .set_status callback and safeguards against an incorrect/malicious
>> user-space driver.
>
>
> Ok, let's document this in the definition of vdpa_state.
Sure, will do.
>
>
>>>
>>>> +               new_status &= ~VIRTIO_CONFIG_S_FEATURES_OK;
>>>> +       }
>>>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER_OK &&
>>>> +           vdpa_nic->vdpa_state == EF100_VDPA_STATE_NEGOTIATED) {
>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER_OK;
>>>> +               rc = start_vdpa_device(vdpa_nic);
>>>> +               if (rc) {
>>>> +                       dev_err(&vdpa_nic->vdpa_dev.dev,
>>>> +                               "%s: vDPA device failed:%d\n", __func__, rc);
>>>> +                       vdpa_nic->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>>>> +                       goto unlock_return;
>>>> +               }
>>>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>>>> +       }
>>>> +       if (new_status) {
>>>> +               dev_warn(&vdev->dev,
>>>> +                        "%s: Mismatch Status: %x & State: %u\n",
>>>> +                        __func__, new_status, vdpa_nic->vdpa_state);
>>>> +       }
>>>> +
>>>> +unlock_return:
>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>> +}
>>>> +
>>>>   static size_t ef100_vdpa_get_config_size(struct vdpa_device *vdev)
>>>>   {
>>>>          return sizeof(struct virtio_net_config);
>>>> @@ -393,6 +684,20 @@ static void ef100_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
>>>>                  vdpa_nic->mac_configured = true;
>>>>   }
>>>>
>>>> +static int ef100_vdpa_suspend(struct vdpa_device *vdev)
>>>> +{
>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> +       int i, rc;
>>>> +
>>>> +       mutex_lock(&vdpa_nic->lock);
>>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>>>> +               rc = delete_vring(vdpa_nic, i);
>>> Note that the suspension matters for the whole device. It means the
>>> config space should not be changed. But the code here only suspends
>>> the vring, is this intended?
>> Are you referring to the possibility of updating device configuration
>> (eg. MAC address) using .set_config() after suspend operation? Is
>> there any other user triggered operation that falls in this category?
>
>
> Updating the MAC should be prohibited; one typical use case is the link status.
I think this can be dealt with by an additional SUSPEND state, which can 
be used to avoid any changes to the config space. An illustrative sketch 
follows.
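
A minimal sketch, assuming the helpers already present in this series;
the "suspended" flag itself is hypothetical and not part of this patch:

static int ef100_vdpa_suspend(struct vdpa_device *vdev)
{
	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
	int i, rc = 0;

	mutex_lock(&vdpa_nic->lock);
	for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
		rc = delete_vring(vdpa_nic, i);
		if (rc)
			break;
	}
	/* Hypothetical flag: freezes the config space as well */
	vdpa_nic->suspended = true;
	mutex_unlock(&vdpa_nic->lock);
	return rc;
}

.set_config() (and any other config space writer) would then bail out
early while the flag is set, e.g.:

	if (vdpa_nic->suspended)
		return;
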
>
>
>>>
>>> Reset may have the same issue.
>> Could you pls elaborate on the requirement during device reset?
>
>
> I meant ef100_reset_vdpa_device() may suffer from the same issue:
>
> It only resets all the vrings but not the config space?

Ok, I think resetting the config space would basically be clearing the 
memory for vdpa_nic->net_config, which is the initial state after vdpa 
device allocation.

Gautam

>
> Thanks
>
>
>>>
>>> Thanks
>> [1]
>> https://patchew.org/QEMU/20230213191929.1547497-1-eperezma@redhat.com/
>>>
>>>> +               if (rc)
>>>> +                       break;
>>>> +       }
>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>> +       return rc;
>>>> +}
>>>>   static void ef100_vdpa_free(struct vdpa_device *vdev)
>>>>   {
>>>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>> @@ -428,9 +733,13 @@ const struct vdpa_config_ops ef100_vdpa_config_ops = {
>>>>          .get_vq_num_max      = ef100_vdpa_get_vq_num_max,
>>>>          .get_device_id       = ef100_vdpa_get_device_id,
>>>>          .get_vendor_id       = ef100_vdpa_get_vendor_id,
>>>> +       .get_status          = ef100_vdpa_get_status,
>>>> +       .set_status          = ef100_vdpa_set_status,
>>>> +       .reset               = ef100_vdpa_reset,
>>>>          .get_config_size     = ef100_vdpa_get_config_size,
>>>>          .get_config          = ef100_vdpa_get_config,
>>>>          .set_config          = ef100_vdpa_set_config,
>>>>          .get_generation      = NULL,
>>>> +       .suspend             = ef100_vdpa_suspend,
>>>>          .free                = ef100_vdpa_free,
>>>>   };
>>>> -- 
>>>> 2.30.1
>>>>
>>
>
Gautam Dawar March 17, 2023, 11:01 a.m. UTC | #5
On 3/15/23 22:48, Gautam Dawar wrote:
>
> On 3/15/23 10:30, Jason Wang wrote:
>>
>>
>> On 2023/3/13 20:10, Gautam Dawar wrote:
>>>
>>> On 3/10/23 10:35, Jason Wang wrote:
>>>>
>>>>
>>>> On Tue, Mar 7, 2023 at 7:38 PM Gautam Dawar <gautam.dawar@amd.com>
>>>> wrote:
>>>>> vDPA config operations to handle get/set device status and device
>>>>> reset have been implemented. Also .suspend config operation is
>>>>> implemented to support Live Migration.
>>>>>
>>>>> Signed-off-by: Gautam Dawar <gautam.dawar@amd.com>
>>>>> ---
>>>>>   drivers/net/ethernet/sfc/ef100_vdpa.c     |  16 +-
>>>>>   drivers/net/ethernet/sfc/ef100_vdpa.h     |   2 +
>>>>>   drivers/net/ethernet/sfc/ef100_vdpa_ops.c | 367
>>>>> ++++++++++++++++++++--
>>>>>   3 files changed, 355 insertions(+), 30 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
>>>>> index c66e5aef69ea..4ba57827a6cd 100644
>>>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.c
>>>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
>>>>> @@ -68,9 +68,14 @@ static int vdpa_allocate_vis(struct efx_nic *efx, unsigned int *allocated_vis)
>>>>>
>>>>>   static void ef100_vdpa_delete(struct efx_nic *efx)
>>>>>   {
>>>>> +       struct vdpa_device *vdpa_dev;
>>>>> +
>>>>>          if (efx->vdpa_nic) {
>>>>> +               vdpa_dev = &efx->vdpa_nic->vdpa_dev;
>>>>> +               ef100_vdpa_reset(vdpa_dev);
>>>>> +
>>>>>                  /* replace with _vdpa_unregister_device later */
>>>>> -               put_device(&efx->vdpa_nic->vdpa_dev.dev);
>>>>> +               put_device(&vdpa_dev->dev);
>>>>>          }
>>>>>          efx_mcdi_free_vis(efx);
>>>>>   }
>>>>> @@ -171,6 +176,15 @@ static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
>>>>>                  }
>>>>>          }
>>>>>
>>>>> +       rc = devm_add_action_or_reset(&efx->pci_dev->dev,
>>>>> +                                     ef100_vdpa_irq_vectors_free,
>>>>> +                                     efx->pci_dev);
>>>>> +       if (rc) {
>>>>> +               pci_err(efx->pci_dev,
>>>>> +                       "Failed adding devres for freeing irq
>>>>> vectors\n");
>>>>> +               goto err_put_device;
>>>>> +       }
>>>>> +
>>>>>          rc = get_net_config(vdpa_nic);
>>>>>          if (rc)
>>>>>                  goto err_put_device;
>>>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
>>>>> index 348ca8a7404b..58791402e454 100644
>>>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa.h
>>>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
>>>>> @@ -149,6 +149,8 @@ int ef100_vdpa_register_mgmtdev(struct efx_nic *efx);
>>>>>   void ef100_vdpa_unregister_mgmtdev(struct efx_nic *efx);
>>>>>   void ef100_vdpa_irq_vectors_free(void *data);
>>>>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx);
>>>>> +void ef100_vdpa_irq_vectors_free(void *data);
>>>>> +int ef100_vdpa_reset(struct vdpa_device *vdev);
>>>>>
>>>>>   static inline bool efx_vdpa_is_little_endian(struct ef100_vdpa_nic *vdpa_nic)
>>>>>   {
>>>>> diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>>>> index 0051c4c0e47c..95a2177f85a2 100644
>>>>> --- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>>>> +++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
>>>>> @@ -22,11 +22,6 @@ static struct ef100_vdpa_nic *get_vdpa_nic(struct vdpa_device *vdev)
>>>>>          return container_of(vdev, struct ef100_vdpa_nic, vdpa_dev);
>>>>>   }
>>>>>
>>>>> -void ef100_vdpa_irq_vectors_free(void *data)
>>>>> -{
>>>>> -       pci_free_irq_vectors(data);
>>>>> -}
>>>>> -
>>>>>   static int create_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>>   {
>>>>>          struct efx_vring_ctx *vring_ctx;
>>>>> @@ -52,14 +47,6 @@ static void delete_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>>          vdpa_nic->vring[idx].vring_ctx = NULL;
>>>>>   }
>>>>>
>>>>> -static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>> -{
>>>>> -       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>>>>> -       vdpa_nic->vring[idx].vring_state = 0;
>>>>> -       vdpa_nic->vring[idx].last_avail_idx = 0;
>>>>> -       vdpa_nic->vring[idx].last_used_idx = 0;
>>>>> -}
>>>>> -
>>>>>   int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>>   {
>>>>>          u32 offset;
>>>>> @@ -103,6 +90,236 @@ static bool is_qid_invalid(struct ef100_vdpa_nic *vdpa_nic, u16 idx,
>>>>>          return false;
>>>>>   }
>>>>>
>>>>> +static void irq_vring_fini(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>> +{
>>>>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>>>>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>>>>> +
>>>>> +       devm_free_irq(&pci_dev->dev, vring->irq, vring);
>>>>> +       vring->irq = -EINVAL;
>>>>> +}
>>>>> +
>>>>> +static irqreturn_t vring_intr_handler(int irq, void *arg)
>>>>> +{
>>>>> +       struct ef100_vdpa_vring_info *vring = arg;
>>>>> +
>>>>> +       if (vring->cb.callback)
>>>>> +               return vring->cb.callback(vring->cb.private);
>>>>> +
>>>>> +       return IRQ_NONE;
>>>>> +}
>>>>> +
>>>>> +static int ef100_vdpa_irq_vectors_alloc(struct pci_dev *pci_dev, u16 nvqs)
>>>>> +{
>>>>> +       int rc;
>>>>> +
>>>>> +       rc = pci_alloc_irq_vectors(pci_dev, nvqs, nvqs, PCI_IRQ_MSIX);
>>>>> +       if (rc < 0)
>>>>> +               pci_err(pci_dev,
>>>>> +                       "Failed to alloc %d IRQ vectors, err:%d\n", nvqs, rc);
>>>>> +       return rc;
>>>>> +}
>>>>> +
>>>>> +void ef100_vdpa_irq_vectors_free(void *data)
>>>>> +{
>>>>> +       pci_free_irq_vectors(data);
>>>>> +}
>>>>> +
>>>>> +static int irq_vring_init(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>> +{
>>>>> +       struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
>>>>> +       struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
>>>>> +       int irq;
>>>>> +       int rc;
>>>>> +
>>>>> +       snprintf(vring->msix_name, 256, "x_vdpa[%s]-%d\n",
>>>>> +                pci_name(pci_dev), idx);
>>>>> +       irq = pci_irq_vector(pci_dev, idx);
>>>>> +       rc = devm_request_irq(&pci_dev->dev, irq, vring_intr_handler, 0,
>>>>> +                             vring->msix_name, vring);
>>>>> +       if (rc)
>>>>> +               pci_err(pci_dev,
>>>>> +                       "devm_request_irq failed for vring %d, rc %d\n",
>>>>> +                       idx, rc);
>>>>> +       else
>>>>> +               vring->irq = irq;
>>>>> +
>>>>> +       return rc;
>>>>> +}
>>>>> +
>>>>> +static int delete_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>> +{
>>>>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>>>>> +       int rc;
>>>>> +
>>>>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>>> +               return 0;
>>>>> +
>>>>> +       rc = efx_vdpa_vring_destroy(vdpa_nic->vring[idx].vring_ctx,
>>>>> +                                   &vring_dyn_cfg);
>>>>> +       if (rc)
>>>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>>>> +                       "%s: delete vring failed index:%u, err:%d\n",
>>>>> +                       __func__, idx, rc);
>>>>> +       vdpa_nic->vring[idx].last_avail_idx = vring_dyn_cfg.avail_idx;
>>>>> +       vdpa_nic->vring[idx].last_used_idx = vring_dyn_cfg.used_idx;
>>>>> +       vdpa_nic->vring[idx].vring_state &= ~EF100_VRING_CREATED;
>>>>> +
>>>>> +       irq_vring_fini(vdpa_nic, idx);
>>>>> +
>>>>> +       return rc;
>>>>> +}
>>>>> +
>>>>> +static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>>>>> +{
>>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> +       u32 idx_val;
>>>>> +
>>>>> +       if (is_qid_invalid(vdpa_nic, idx, __func__))
>>>>> +               return;
>>>>> +
>>>>> +       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>>> +               return;
>>>>> +
>>>>> +       idx_val = idx;
>>>>> +       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>>>>> +                   vdpa_nic->vring[idx].doorbell_offset);
>>>>> +}
>>>>> +
>>>>> +static bool can_create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>> +{
>>>>> +       if (vdpa_nic->vring[idx].vring_state == EF100_VRING_CONFIGURED &&
>>>>> +           vdpa_nic->status & VIRTIO_CONFIG_S_DRIVER_OK &&
>>>>> +           !(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>>> +               return true;
>>>>> +
>>>>> +       return false;
>>>>> +}
>>>>> +
>>>>> +static int create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>> +{
>>>>> +       struct efx_vring_dyn_cfg vring_dyn_cfg;
>>>>> +       struct efx_vring_cfg vring_cfg;
>>>>> +       int rc;
>>>>> +
>>>>> +       rc = irq_vring_init(vdpa_nic, idx);
>>>>> +       if (rc) {
>>>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>>>> +                       "%s: irq_vring_init failed. index:%u,
>>>>> err:%d\n",
>>>>> +                       __func__, idx, rc);
>>>>> +               return rc;
>>>>> +       }
>>>>> +       vring_cfg.desc = vdpa_nic->vring[idx].desc;
>>>>> +       vring_cfg.avail = vdpa_nic->vring[idx].avail;
>>>>> +       vring_cfg.used = vdpa_nic->vring[idx].used;
>>>>> +       vring_cfg.size = vdpa_nic->vring[idx].size;
>>>>> +       vring_cfg.features = vdpa_nic->features;
>>>>> +       vring_cfg.msix_vector = idx;
>>>>> +       vring_dyn_cfg.avail_idx = vdpa_nic->vring[idx].last_avail_idx;
>>>>> +       vring_dyn_cfg.used_idx = vdpa_nic->vring[idx].last_used_idx;
>>>>> +
>>>>> +       rc = efx_vdpa_vring_create(vdpa_nic->vring[idx].vring_ctx,
>>>>> +                                  &vring_cfg, &vring_dyn_cfg);
>>>>> +       if (rc) {
>>>>> +               dev_err(&vdpa_nic->vdpa_dev.dev,
>>>>> +                       "%s: vring_create failed index:%u, err:%d\n",
>>>>> +                       __func__, idx, rc);
>>>>> +               goto err_vring_create;
>>>>> +       }
>>>>> +       vdpa_nic->vring[idx].vring_state |= EF100_VRING_CREATED;
>>>>> +
>>>>> +       /* A VQ kick allows the device to read the avail_idx, which will be
>>>>> +        * required at the destination after live migration.
>>>>> +        */
>>>>> +       ef100_vdpa_kick_vq(&vdpa_nic->vdpa_dev, idx);
>>>>> +
>>>>> +       return 0;
>>>>> +
>>>>> +err_vring_create:
>>>>> +       irq_vring_fini(vdpa_nic, idx);
>>>>> +       return rc;
>>>>> +}
>>>>> +
>>>>> +static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
>>>>> +{
>>>>> +       delete_vring(vdpa_nic, idx);
>>>>> +       vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
>>>>> +       vdpa_nic->vring[idx].vring_state = 0;
>>>>> +       vdpa_nic->vring[idx].last_avail_idx = 0;
>>>>> +       vdpa_nic->vring[idx].last_used_idx = 0;
>>>>> +}
>>>>> +
>>>>> +static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>>>>> +{
>>>>> +       int i;
>>>>> +
>>>>> +       WARN_ON(!mutex_is_locked(&vdpa_nic->lock));
>>>>> +
>>>>> +       if (!vdpa_nic->status)
>>>>> +               return;
>>>>> +
>>>>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
>>>>> +       vdpa_nic->status = 0;
>>>>> +       vdpa_nic->features = 0;
>>>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
>>>>> +               reset_vring(vdpa_nic, i);
>>>>> +       ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
>>>>> +}
>>>>> +
>>>>> +/* May be called under the rtnl lock */
>>>>> +int ef100_vdpa_reset(struct vdpa_device *vdev)
>>>>> +{
>>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> +
>>>>> +       /* vdpa device can be deleted anytime but the bar_config
>>>>> +        * could still be vdpa and hence efx->state would be STATE_VDPA.
>>>>> +        * Accordingly, ensure vdpa device exists before reset handling
>>>>> +        */
>>>>> +       if (!vdpa_nic)
>>>>> +               return -ENODEV;
>>>>> +
>>>>> +       mutex_lock(&vdpa_nic->lock);
>>>>> +       ef100_reset_vdpa_device(vdpa_nic);
>>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>>> +       return 0;
>>>>> +}
>>>>> +
>>>>> +static int start_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
>>>>> +{
>>>>> +       struct efx_nic *efx = vdpa_nic->efx;
>>>>> +       struct ef100_nic_data *nic_data;
>>>>> +       int i, j;
>>>>> +       int rc;
>>>>> +
>>>>> +       nic_data = efx->nic_data;
>>>>> +       rc = ef100_vdpa_irq_vectors_alloc(efx->pci_dev,
>>>>> +                                         vdpa_nic->max_queue_pairs * 2);
>>>>> +       if (rc < 0) {
>>>>> +               pci_err(efx->pci_dev,
>>>>> +                       "vDPA IRQ alloc failed for vf: %u err:%d\n",
>>>>> +                       nic_data->vf_index, rc);
>>>>> +               return rc;
>>>>> +       }
>>>>> +
>>>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>>>>> +               if (can_create_vring(vdpa_nic, i)) {
>>>>> +                       rc = create_vring(vdpa_nic, i);
>>>>> +                       if (rc)
>>>>> +                               goto clear_vring;
>>>>> +               }
>>>>> +       }
>>>>> +
>>>>> +       vdpa_nic->vdpa_state = EF100_VDPA_STATE_STARTED;
>>>> It looks to me that this duplicates the DRIVER_OK status bit.
>>> vdpa_state is set to EF100_VDPA_STATE_STARTED during DRIVER_OK handling.
>>> See my later response for its purpose.
>>>>
>>>>> +       return 0;
>>>>> +
>>>>> +clear_vring:
>>>>> +       for (j = 0; j < i; j++)
>>>>> +               delete_vring(vdpa_nic, j);
>>>>> +
>>>>> +       ef100_vdpa_irq_vectors_free(efx->pci_dev);
>>>>> +       return rc;
>>>>> +}
>>>>> +
>>>>>   static int ef100_vdpa_set_vq_address(struct vdpa_device *vdev,
>>>>>                                       u16 idx, u64 desc_area, u64 driver_area,
>>>>>                                       u64 device_area)
>>>>> @@ -144,22 +361,6 @@ static void ef100_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
>>>>>          mutex_unlock(&vdpa_nic->lock);
>>>>>   }
>>>>>
>>>>> -static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
>>>>> -{
>>>>> -       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> -       u32 idx_val;
>>>>> -
>>>>> -       if (is_qid_invalid(vdpa_nic, idx, __func__))
>>>>> -               return;
>>>>> -
>>>>> -       if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
>>>>> -               return;
>>>>> -
>>>>> -       idx_val = idx;
>>>>> -       _efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
>>>>> -                   vdpa_nic->vring[idx].doorbell_offset);
>>>>> -}
>>>>> -
>>>>>   static void ef100_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx,
>>>>>                                   struct vdpa_callback *cb)
>>>>>   {
>>>>> @@ -176,6 +377,7 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>>>>                                      bool ready)
>>>>>   {
>>>>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> +       int rc;
>>>>>
>>>>>          if (is_qid_invalid(vdpa_nic, idx, __func__))
>>>>>                  return;
>>>>> @@ -184,9 +386,21 @@ static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
>>>>>          if (ready) {
>>>>>                  vdpa_nic->vring[idx].vring_state |=
>>>>>                          EF100_VRING_READY_CONFIGURED;
>>>>> +               if (vdpa_nic->vdpa_state == EF100_VDPA_STATE_STARTED &&
>>>>> +                   can_create_vring(vdpa_nic, idx)) {
>>>>> +                       rc = create_vring(vdpa_nic, idx);
>>>>> +                       if (rc)
>>>>> +                               /* Rollback ready configuration
>>>>> +                                * So that the above layer driver
>>>>> +                                * can make another attempt to set ready
>>>>> +                                */
>>>>> +                               vdpa_nic->vring[idx].vring_state &=
>>>>> +                                       ~EF100_VRING_READY_CONFIGURED;
>>>>> +               }
>>>>>          } else {
>>>>>                  vdpa_nic->vring[idx].vring_state &=
>>>>>                          ~EF100_VRING_READY_CONFIGURED;
>>>>> +               delete_vring(vdpa_nic, idx);
>>>>>          }
>>>>>          mutex_unlock(&vdpa_nic->lock);
>>>>>   }
>>>>> @@ -296,6 +510,12 @@ static u64 ef100_vdpa_get_device_features(struct vdpa_device *vdev)
>>>>>          }
>>>>>
>>>>>          features |= BIT_ULL(VIRTIO_NET_F_MAC);
>>>>> +       /* As QEMU SVQ doesn't implement the following features,
>>>>> +        * masking them off to allow Live Migration
>>>>> +        */
>>>>> +       features &= ~BIT_ULL(VIRTIO_F_IN_ORDER);
>>>>> +       features &= ~BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
>>>> It's better not to work around userspace bugs in the kernel. We should
>>>> fix Qemu instead.
>>>
>>> There's already a QEMU patch [1] submitted to support
>>> VIRTIO_F_ORDER_PLATFORM but it hasn't concluded yet. Also, there is no
>>> support for VIRTIO_F_IN_ORDER in the kernel virtio driver. The motive
>>> of this change is to have VM Live Migration working with the kernel
>>> in-tree driver without requiring any changes.
>>>
>>> Once QEMU is able to handle these features, we can submit a patch to
>>> undo these changes.
>>
>>
>> I can understand the motivation, but that works for prototyping, not
>> formal kernel code (especially considering SVQ is not mature and still
>> under development). What's more, we cannot assume Qemu is the only
>> user; we have other users like DPDK and cloud-hypervisors.
>
> Ok, if the expectation is to have the user deal with the issues and 
> make required changes on the in-tree driver to make it work, I'll 
> remove this part.
>
>>
>> Thanks
>>
>>
>>>
>>>>
>>>>> +
>>>>>          return features;
>>>>>   }
>>>>>
>>>>> @@ -356,6 +576,77 @@ static u32 ef100_vdpa_get_vendor_id(struct vdpa_device *vdev)
>>>>>          return EF100_VDPA_VENDOR_ID;
>>>>>   }
>>>>>
>>>>> +static u8 ef100_vdpa_get_status(struct vdpa_device *vdev)
>>>>> +{
>>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> +       u8 status;
>>>>> +
>>>>> +       mutex_lock(&vdpa_nic->lock);
>>>>> +       status = vdpa_nic->status;
>>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>>> +       return status;
>>>>> +}
>>>>> +
>>>>> +static void ef100_vdpa_set_status(struct vdpa_device *vdev, u8 status)
>>>>> +{
>>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> +       u8 new_status;
>>>>> +       int rc;
>>>>> +
>>>>> +       mutex_lock(&vdpa_nic->lock);
>>>>> +       if (!status) {
>>>>> +               dev_info(&vdev->dev,
>>>>> +                        "%s: Status received is 0. Device reset
>>>>> being done\n",
>>>>> +                        __func__);
>>>> This is trigger-able by the userspace. It might be better to use
>>>> dev_dbg() instead.
>>> Will change.
>>>>
>>>>> +               ef100_reset_vdpa_device(vdpa_nic);
>>>>> +               goto unlock_return;
>>>>> +       }
>>>>> +       new_status = status & ~vdpa_nic->status;
>>>>> +       if (new_status == 0) {
>>>>> +               dev_info(&vdev->dev,
>>>>> +                        "%s: New status same as current status\n",
>>>>> __func__);
>>>> Same here.
>>> Ok.
>>>>
>>>>> +               goto unlock_return;
>>>>> +       }
>>>>> +       if (new_status & VIRTIO_CONFIG_S_FAILED) {
>>>>> +               ef100_reset_vdpa_device(vdpa_nic);
>>>>> +               goto unlock_return;
>>>>> +       }
>>>>> +
>>>>> +       if (new_status & VIRTIO_CONFIG_S_ACKNOWLEDGE) {
>>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
>>>>> +               new_status &= ~VIRTIO_CONFIG_S_ACKNOWLEDGE;
>>>>> +       }
>>>>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER) {
>>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER;
>>>>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER;
>>>>> +       }
>>>>> +       if (new_status & VIRTIO_CONFIG_S_FEATURES_OK) {
>>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_FEATURES_OK;
>>>>> +               vdpa_nic->vdpa_state = EF100_VDPA_STATE_NEGOTIATED;
>>>> It might be better to explain the reason we need to track another
>>>> state in vdpa_state instead of simply using the device status.
>>> vdpa_state helps to ensure correct status transitions in the
>>> .set_status callback and safeguards against an incorrect/malicious
>>> user-space driver.
>>
>>
>> Ok, let's document this in the definition of vdpa_state.
> Sure, will do.
>>
>>
>>>>
>>>>> +               new_status &= ~VIRTIO_CONFIG_S_FEATURES_OK;
>>>>> +       }
>>>>> +       if (new_status & VIRTIO_CONFIG_S_DRIVER_OK &&
>>>>> +           vdpa_nic->vdpa_state == EF100_VDPA_STATE_NEGOTIATED) {
>>>>> +               vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER_OK;
>>>>> +               rc = start_vdpa_device(vdpa_nic);
>>>>> +               if (rc) {
>>>>> +                       dev_err(&vdpa_nic->vdpa_dev.dev,
>>>>> +                               "%s: vDPA device failed:%d\n", __func__, rc);
>>>>> +                       vdpa_nic->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>>>>> +                       goto unlock_return;
>>>>> +               }
>>>>> +               new_status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
>>>>> +       }
>>>>> +       if (new_status) {
>>>>> +               dev_warn(&vdev->dev,
>>>>> +                        "%s: Mismatch Status: %x & State: %u\n",
>>>>> +                        __func__, new_status, vdpa_nic->vdpa_state);
>>>>> +       }
>>>>> +
>>>>> +unlock_return:
>>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>>> +}
>>>>> +
>>>>>   static size_t ef100_vdpa_get_config_size(struct vdpa_device *vdev)
>>>>>   {
>>>>>          return sizeof(struct virtio_net_config);
>>>>> @@ -393,6 +684,20 @@ static void ef100_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
>>>>>                  vdpa_nic->mac_configured = true;
>>>>>   }
>>>>>
>>>>> +static int ef100_vdpa_suspend(struct vdpa_device *vdev)
>>>>> +{
>>>>> +       struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> +       int i, rc;
>>>>> +
>>>>> +       mutex_lock(&vdpa_nic->lock);
>>>>> +       for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
>>>>> +               rc = delete_vring(vdpa_nic, i);
>>>> Note that the suspension matters for the whole device. It means the
>>>> config space should not be changed. But the code here only suspends
>>>> the vring, is this intended?
>>> Are you referring to the possibility of updating device configuration
>>> (eg. MAC address) using .set_config() after suspend operation? Is
>>> there any other user triggered operation that falls in this category?
>>
>>
>> Updating the MAC should be prohibited; one typical use case is the link status.
> I think this can be dealt with by an additional SUSPEND state, which
> can be used to avoid any changes to the config space.
>>
>>
>>>>
>>>> Reset may have the same issue.
>>> Could you pls elaborate on the requirement during device reset?
>>
>>
>> I meant ef100_reset_vdpa_device() may suffer from the same issue:
>>
>> It only resets all the vrings but not the config space?
>
> Ok, I think resetting the config space would basically be clearing the
> memory for vdpa_nic->net_config, which is the initial state after vdpa
> device allocation.

On second thought, the virtio_net_config data except the MAC is static 
(it is set during vdpa device creation) and must not be cleared during 
device reset. Also, as the MAC address of an existing vdpa device 
currently can't be updated, it too should not be cleared. A sketch of 
what the reset then preserves follows.
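
For illustration only -- this is ef100_reset_vdpa_device() as it stands
in this patch, with a comment on what is deliberately left untouched
(the comment wording is a sketch, not final code):

static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
{
	int i;

	WARN_ON(!mutex_is_locked(&vdpa_nic->lock));

	if (!vdpa_nic->status)
		return;

	vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
	vdpa_nic->status = 0;
	vdpa_nic->features = 0;
	for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
		reset_vring(vdpa_nic, i);
	ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
	/* vdpa_nic->net_config is deliberately not cleared: everything in
	 * it except the MAC is static per-device data set at creation
	 * time, and the MAC of an existing device cannot be changed, so
	 * there is nothing to roll back to.
	 */
}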

Gautam

>
>
> Gautam
>
>>
>> Thanks
>>
>>
>>>>
>>>> Thanks
>>> [1]
>>> https://patchew.org/QEMU/20230213191929.1547497-1-eperezma@redhat.com/
>>>>
>>>>> +               if (rc)
>>>>> +                       break;
>>>>> +       }
>>>>> +       mutex_unlock(&vdpa_nic->lock);
>>>>> +       return rc;
>>>>> +}
>>>>>   static void ef100_vdpa_free(struct vdpa_device *vdev)
>>>>>   {
>>>>>          struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
>>>>> @@ -428,9 +733,13 @@ const struct vdpa_config_ops
>>>>> ef100_vdpa_config_ops = {
>>>>>          .get_vq_num_max      = ef100_vdpa_get_vq_num_max,
>>>>>          .get_device_id       = ef100_vdpa_get_device_id,
>>>>>          .get_vendor_id       = ef100_vdpa_get_vendor_id,
>>>>> +       .get_status          = ef100_vdpa_get_status,
>>>>> +       .set_status          = ef100_vdpa_set_status,
>>>>> +       .reset               = ef100_vdpa_reset,
>>>>>          .get_config_size     = ef100_vdpa_get_config_size,
>>>>>          .get_config          = ef100_vdpa_get_config,
>>>>>          .set_config          = ef100_vdpa_set_config,
>>>>>          .get_generation      = NULL,
>>>>> +       .suspend             = ef100_vdpa_suspend,
>>>>>          .free                = ef100_vdpa_free,
>>>>>   };
>>>>> -- 
>>>>> 2.30.1
>>>>>
>>>
>>
diff mbox series

Patch

diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.c b/drivers/net/ethernet/sfc/ef100_vdpa.c
index c66e5aef69ea..4ba57827a6cd 100644
--- a/drivers/net/ethernet/sfc/ef100_vdpa.c
+++ b/drivers/net/ethernet/sfc/ef100_vdpa.c
@@ -68,9 +68,14 @@  static int vdpa_allocate_vis(struct efx_nic *efx, unsigned int *allocated_vis)
 
 static void ef100_vdpa_delete(struct efx_nic *efx)
 {
+	struct vdpa_device *vdpa_dev;
+
 	if (efx->vdpa_nic) {
+		vdpa_dev = &efx->vdpa_nic->vdpa_dev;
+		ef100_vdpa_reset(vdpa_dev);
+
 		/* replace with _vdpa_unregister_device later */
-		put_device(&efx->vdpa_nic->vdpa_dev.dev);
+		put_device(&vdpa_dev->dev);
 	}
 	efx_mcdi_free_vis(efx);
 }
@@ -171,6 +176,15 @@  static struct ef100_vdpa_nic *ef100_vdpa_create(struct efx_nic *efx,
 		}
 	}
 
+	rc = devm_add_action_or_reset(&efx->pci_dev->dev,
+				      ef100_vdpa_irq_vectors_free,
+				      efx->pci_dev);
+	if (rc) {
+		pci_err(efx->pci_dev,
+			"Failed adding devres for freeing irq vectors\n");
+		goto err_put_device;
+	}
+
 	rc = get_net_config(vdpa_nic);
 	if (rc)
 		goto err_put_device;
diff --git a/drivers/net/ethernet/sfc/ef100_vdpa.h b/drivers/net/ethernet/sfc/ef100_vdpa.h
index 348ca8a7404b..58791402e454 100644
--- a/drivers/net/ethernet/sfc/ef100_vdpa.h
+++ b/drivers/net/ethernet/sfc/ef100_vdpa.h
@@ -149,6 +149,7 @@  int ef100_vdpa_register_mgmtdev(struct efx_nic *efx);
 void ef100_vdpa_unregister_mgmtdev(struct efx_nic *efx);
 void ef100_vdpa_irq_vectors_free(void *data);
 int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx);
+int ef100_vdpa_reset(struct vdpa_device *vdev);
 
 static inline bool efx_vdpa_is_little_endian(struct ef100_vdpa_nic *vdpa_nic)
 {
diff --git a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
index 0051c4c0e47c..95a2177f85a2 100644
--- a/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
+++ b/drivers/net/ethernet/sfc/ef100_vdpa_ops.c
@@ -22,11 +22,6 @@  static struct ef100_vdpa_nic *get_vdpa_nic(struct vdpa_device *vdev)
 	return container_of(vdev, struct ef100_vdpa_nic, vdpa_dev);
 }
 
-void ef100_vdpa_irq_vectors_free(void *data)
-{
-	pci_free_irq_vectors(data);
-}
-
 static int create_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
 {
 	struct efx_vring_ctx *vring_ctx;
@@ -52,14 +47,6 @@  static void delete_vring_ctx(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
 	vdpa_nic->vring[idx].vring_ctx = NULL;
 }
 
-static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
-{
-	vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
-	vdpa_nic->vring[idx].vring_state = 0;
-	vdpa_nic->vring[idx].last_avail_idx = 0;
-	vdpa_nic->vring[idx].last_used_idx = 0;
-}
-
 int ef100_vdpa_init_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
 {
 	u32 offset;
@@ -103,6 +90,236 @@  static bool is_qid_invalid(struct ef100_vdpa_nic *vdpa_nic, u16 idx,
 	return false;
 }
 
+static void irq_vring_fini(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+	struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
+	struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
+
+	devm_free_irq(&pci_dev->dev, vring->irq, vring);
+	vring->irq = -EINVAL;
+}
+
+static irqreturn_t vring_intr_handler(int irq, void *arg)
+{
+	struct ef100_vdpa_vring_info *vring = arg;
+
+	if (vring->cb.callback)
+		return vring->cb.callback(vring->cb.private);
+
+	return IRQ_NONE;
+}
+
+static int ef100_vdpa_irq_vectors_alloc(struct pci_dev *pci_dev, u16 nvqs)
+{
+	int rc;
+
+	rc = pci_alloc_irq_vectors(pci_dev, nvqs, nvqs, PCI_IRQ_MSIX);
+	if (rc < 0)
+		pci_err(pci_dev,
+			"Failed to alloc %d IRQ vectors, err:%d\n", nvqs, rc);
+	return rc;
+}
+
+void ef100_vdpa_irq_vectors_free(void *data)
+{
+	pci_free_irq_vectors(data);
+}
+
+static int irq_vring_init(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+	struct ef100_vdpa_vring_info *vring = &vdpa_nic->vring[idx];
+	struct pci_dev *pci_dev = vdpa_nic->efx->pci_dev;
+	int irq;
+	int rc;
+
+	snprintf(vring->msix_name, 256, "x_vdpa[%s]-%d",
+		 pci_name(pci_dev), idx);
+	irq = pci_irq_vector(pci_dev, idx);
+	rc = devm_request_irq(&pci_dev->dev, irq, vring_intr_handler, 0,
+			      vring->msix_name, vring);
+	if (rc)
+		pci_err(pci_dev,
+			"devm_request_irq failed for vring %d, rc %d\n",
+			idx, rc);
+	else
+		vring->irq = irq;
+
+	return rc;
+}
+
+static int delete_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+	struct efx_vring_dyn_cfg vring_dyn_cfg;
+	int rc;
+
+	if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
+		return 0;
+
+	rc = efx_vdpa_vring_destroy(vdpa_nic->vring[idx].vring_ctx,
+				    &vring_dyn_cfg);
+	if (rc)
+		dev_err(&vdpa_nic->vdpa_dev.dev,
+			"%s: delete vring failed index:%u, err:%d\n",
+			__func__, idx, rc);
+	vdpa_nic->vring[idx].last_avail_idx = vring_dyn_cfg.avail_idx;
+	vdpa_nic->vring[idx].last_used_idx = vring_dyn_cfg.used_idx;
+	vdpa_nic->vring[idx].vring_state &= ~EF100_VRING_CREATED;
+
+	irq_vring_fini(vdpa_nic, idx);
+
+	return rc;
+}
+
+static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
+{
+	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+	u32 idx_val;
+
+	if (is_qid_invalid(vdpa_nic, idx, __func__))
+		return;
+
+	if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
+		return;
+
+	idx_val = idx;
+	_efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
+		    vdpa_nic->vring[idx].doorbell_offset);
+}
+
+static bool can_create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+	if (vdpa_nic->vring[idx].vring_state == EF100_VRING_CONFIGURED &&
+	    vdpa_nic->status & VIRTIO_CONFIG_S_DRIVER_OK &&
+	    !(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
+		return true;
+
+	return false;
+}
+
+static int create_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+	struct efx_vring_dyn_cfg vring_dyn_cfg;
+	struct efx_vring_cfg vring_cfg;
+	int rc;
+
+	rc = irq_vring_init(vdpa_nic, idx);
+	if (rc) {
+		dev_err(&vdpa_nic->vdpa_dev.dev,
+			"%s: irq_vring_init failed. index:%u, err:%d\n",
+			__func__, idx, rc);
+		return rc;
+	}
+	vring_cfg.desc = vdpa_nic->vring[idx].desc;
+	vring_cfg.avail = vdpa_nic->vring[idx].avail;
+	vring_cfg.used = vdpa_nic->vring[idx].used;
+	vring_cfg.size = vdpa_nic->vring[idx].size;
+	vring_cfg.features = vdpa_nic->features;
+	vring_cfg.msix_vector = idx;
+	vring_dyn_cfg.avail_idx = vdpa_nic->vring[idx].last_avail_idx;
+	vring_dyn_cfg.used_idx = vdpa_nic->vring[idx].last_used_idx;
+
+	rc = efx_vdpa_vring_create(vdpa_nic->vring[idx].vring_ctx,
+				   &vring_cfg, &vring_dyn_cfg);
+	if (rc) {
+		dev_err(&vdpa_nic->vdpa_dev.dev,
+			"%s: vring_create failed index:%u, err:%d\n",
+			__func__, idx, rc);
+		goto err_vring_create;
+	}
+	vdpa_nic->vring[idx].vring_state |= EF100_VRING_CREATED;
+
+	/* A VQ kick allows the device to read the avail_idx, which will be
+	 * required at the destination after live migration.
+	 */
+	ef100_vdpa_kick_vq(&vdpa_nic->vdpa_dev, idx);
+
+	return 0;
+
+err_vring_create:
+	irq_vring_fini(vdpa_nic, idx);
+	return rc;
+}
+
+static void reset_vring(struct ef100_vdpa_nic *vdpa_nic, u16 idx)
+{
+	delete_vring(vdpa_nic, idx);
+	vdpa_nic->vring[idx].vring_type = EF100_VDPA_VQ_NTYPES;
+	vdpa_nic->vring[idx].vring_state = 0;
+	vdpa_nic->vring[idx].last_avail_idx = 0;
+	vdpa_nic->vring[idx].last_used_idx = 0;
+}
+
+static void ef100_reset_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
+{
+	int i;
+
+	WARN_ON(!mutex_is_locked(&vdpa_nic->lock));
+
+	if (!vdpa_nic->status)
+		return;
+
+	vdpa_nic->vdpa_state = EF100_VDPA_STATE_INITIALIZED;
+	vdpa_nic->status = 0;
+	vdpa_nic->features = 0;
+	for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++)
+		reset_vring(vdpa_nic, i);
+	ef100_vdpa_irq_vectors_free(vdpa_nic->efx->pci_dev);
+}
+
+/* May be called under the rtnl lock */
+int ef100_vdpa_reset(struct vdpa_device *vdev)
+{
+	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+
+	/* vdpa device can be deleted anytime but the bar_config
+	 * could still be vdpa and hence efx->state would be STATE_VDPA.
+	 * Accordingly, ensure vdpa device exists before reset handling
+	 */
+	if (!vdpa_nic)
+		return -ENODEV;
+
+	mutex_lock(&vdpa_nic->lock);
+	ef100_reset_vdpa_device(vdpa_nic);
+	mutex_unlock(&vdpa_nic->lock);
+	return 0;
+}
+
+static int start_vdpa_device(struct ef100_vdpa_nic *vdpa_nic)
+{
+	struct efx_nic *efx = vdpa_nic->efx;
+	struct ef100_nic_data *nic_data;
+	int i, j;
+	int rc;
+
+	nic_data = efx->nic_data;
+	rc = ef100_vdpa_irq_vectors_alloc(efx->pci_dev,
+					  vdpa_nic->max_queue_pairs * 2);
+	if (rc < 0) {
+		pci_err(efx->pci_dev,
+			"vDPA IRQ alloc failed for vf: %u err:%d\n",
+			nic_data->vf_index, rc);
+		return rc;
+	}
+
+	for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
+		if (can_create_vring(vdpa_nic, i)) {
+			rc = create_vring(vdpa_nic, i);
+			if (rc)
+				goto clear_vring;
+		}
+	}
+
+	vdpa_nic->vdpa_state = EF100_VDPA_STATE_STARTED;
+	return 0;
+
+clear_vring:
+	for (j = 0; j < i; j++)
+		delete_vring(vdpa_nic, j);
+
+	ef100_vdpa_irq_vectors_free(efx->pci_dev);
+	return rc;
+}
+
 static int ef100_vdpa_set_vq_address(struct vdpa_device *vdev,
 				     u16 idx, u64 desc_area, u64 driver_area,
 				     u64 device_area)
@@ -144,22 +361,6 @@  static void ef100_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
 	mutex_unlock(&vdpa_nic->lock);
 }
 
-static void ef100_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
-{
-	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
-	u32 idx_val;
-
-	if (is_qid_invalid(vdpa_nic, idx, __func__))
-		return;
-
-	if (!(vdpa_nic->vring[idx].vring_state & EF100_VRING_CREATED))
-		return;
-
-	idx_val = idx;
-	_efx_writed(vdpa_nic->efx, cpu_to_le32(idx_val),
-		    vdpa_nic->vring[idx].doorbell_offset);
-}
-
 static void ef100_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx,
 				 struct vdpa_callback *cb)
 {
@@ -176,6 +377,7 @@  static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
 				    bool ready)
 {
 	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+	int rc;
 
 	if (is_qid_invalid(vdpa_nic, idx, __func__))
 		return;
@@ -184,9 +386,21 @@  static void ef100_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx,
 	if (ready) {
 		vdpa_nic->vring[idx].vring_state |=
 					EF100_VRING_READY_CONFIGURED;
+		if (vdpa_nic->vdpa_state == EF100_VDPA_STATE_STARTED &&
+		    can_create_vring(vdpa_nic, idx)) {
+			rc = create_vring(vdpa_nic, idx);
+			if (rc)
+				/* Roll back the ready configuration
+				 * so that the upper layer driver
+				 * can make another attempt to set ready.
+				 */
+				vdpa_nic->vring[idx].vring_state &=
+					~EF100_VRING_READY_CONFIGURED;
+		}
 	} else {
 		vdpa_nic->vring[idx].vring_state &=
 					~EF100_VRING_READY_CONFIGURED;
+		delete_vring(vdpa_nic, idx);
 	}
 	mutex_unlock(&vdpa_nic->lock);
 }
@@ -296,6 +510,12 @@  static u64 ef100_vdpa_get_device_features(struct vdpa_device *vdev)
 	}
 
 	features |= BIT_ULL(VIRTIO_NET_F_MAC);
+	/* As QEMU SVQ doesn't implement the following features,
+	 * masking them off to allow Live Migration
+	 */
+	features &= ~BIT_ULL(VIRTIO_F_IN_ORDER);
+	features &= ~BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
+
 	return features;
 }
 
@@ -356,6 +576,77 @@  static u32 ef100_vdpa_get_vendor_id(struct vdpa_device *vdev)
 	return EF100_VDPA_VENDOR_ID;
 }
 
+static u8 ef100_vdpa_get_status(struct vdpa_device *vdev)
+{
+	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+	u8 status;
+
+	mutex_lock(&vdpa_nic->lock);
+	status = vdpa_nic->status;
+	mutex_unlock(&vdpa_nic->lock);
+	return status;
+}
+
+static void ef100_vdpa_set_status(struct vdpa_device *vdev, u8 status)
+{
+	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+	u8 new_status;
+	int rc;
+
+	mutex_lock(&vdpa_nic->lock);
+	if (!status) {
+		dev_info(&vdev->dev,
+			 "%s: Status received is 0. Device reset being done\n",
+			 __func__);
+		ef100_reset_vdpa_device(vdpa_nic);
+		goto unlock_return;
+	}
+	new_status = status & ~vdpa_nic->status;
+	if (new_status == 0) {
+		dev_info(&vdev->dev,
+			 "%s: New status same as current status\n", __func__);
+		goto unlock_return;
+	}
+	if (new_status & VIRTIO_CONFIG_S_FAILED) {
+		ef100_reset_vdpa_device(vdpa_nic);
+		goto unlock_return;
+	}
+
+	if (new_status & VIRTIO_CONFIG_S_ACKNOWLEDGE) {
+		vdpa_nic->status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
+		new_status &= ~VIRTIO_CONFIG_S_ACKNOWLEDGE;
+	}
+	if (new_status & VIRTIO_CONFIG_S_DRIVER) {
+		vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER;
+		new_status &= ~VIRTIO_CONFIG_S_DRIVER;
+	}
+	if (new_status & VIRTIO_CONFIG_S_FEATURES_OK) {
+		vdpa_nic->status |= VIRTIO_CONFIG_S_FEATURES_OK;
+		vdpa_nic->vdpa_state = EF100_VDPA_STATE_NEGOTIATED;
+		new_status &= ~VIRTIO_CONFIG_S_FEATURES_OK;
+	}
+	if (new_status & VIRTIO_CONFIG_S_DRIVER_OK &&
+	    vdpa_nic->vdpa_state == EF100_VDPA_STATE_NEGOTIATED) {
+		vdpa_nic->status |= VIRTIO_CONFIG_S_DRIVER_OK;
+		rc = start_vdpa_device(vdpa_nic);
+		if (rc) {
+			dev_err(&vdpa_nic->vdpa_dev.dev,
+				"%s: vDPA device failed:%d\n", __func__, rc);
+			vdpa_nic->status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
+			goto unlock_return;
+		}
+		new_status &= ~VIRTIO_CONFIG_S_DRIVER_OK;
+	}
+	if (new_status) {
+		dev_warn(&vdev->dev,
+			 "%s: Mismatch Status: %x & State: %u\n",
+			 __func__, new_status, vdpa_nic->vdpa_state);
+	}
+
+unlock_return:
+	mutex_unlock(&vdpa_nic->lock);
+}
+
 static size_t ef100_vdpa_get_config_size(struct vdpa_device *vdev)
 {
 	return sizeof(struct virtio_net_config);
@@ -393,6 +684,20 @@  static void ef100_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
 		vdpa_nic->mac_configured = true;
 }
 
+static int ef100_vdpa_suspend(struct vdpa_device *vdev)
+{
+	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
+	int i, rc;
+
+	mutex_lock(&vdpa_nic->lock);
+	for (i = 0; i < (vdpa_nic->max_queue_pairs * 2); i++) {
+		rc = delete_vring(vdpa_nic, i);
+		if (rc)
+			break;
+	}
+	mutex_unlock(&vdpa_nic->lock);
+	return rc;
+}
 static void ef100_vdpa_free(struct vdpa_device *vdev)
 {
 	struct ef100_vdpa_nic *vdpa_nic = get_vdpa_nic(vdev);
@@ -428,9 +733,13 @@  const struct vdpa_config_ops ef100_vdpa_config_ops = {
 	.get_vq_num_max      = ef100_vdpa_get_vq_num_max,
 	.get_device_id	     = ef100_vdpa_get_device_id,
 	.get_vendor_id	     = ef100_vdpa_get_vendor_id,
+	.get_status	     = ef100_vdpa_get_status,
+	.set_status	     = ef100_vdpa_set_status,
+	.reset               = ef100_vdpa_reset,
 	.get_config_size     = ef100_vdpa_get_config_size,
 	.get_config	     = ef100_vdpa_get_config,
 	.set_config	     = ef100_vdpa_set_config,
 	.get_generation      = NULL,
+	.suspend	     = ef100_vdpa_suspend,
 	.free	             = ef100_vdpa_free,
 };