@@ -20,6 +20,104 @@
#define PDS_VFIO_DRV_DESCRIPTION "AMD/Pensando VFIO Device Driver"
#define PCI_VENDOR_ID_PENSANDO 0x1dd8
+static void
+pds_vfio_recovery_work(struct work_struct *work)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(work, struct pds_vfio_pci_device, work);
+ bool deferred_reset_needed = false;
+
+ /* Documentation states that the kernel migration driver must not
+ * generate asynchronous device state transitions outside of
+ * manipulation by the user or the VFIO_DEVICE_RESET ioctl.
+ *
+ * Since recovery is an asynchronous event received from the device,
+ * initiate a deferred reset. Issue the deferred reset in the following
+ * situations:
+ *   1. Migration is in progress, which will cause the next step of
+ *      the migration to fail.
+ *   2. The device is in a state that will be set to
+ *      VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is
+ *      shutdown and device is in VFIO_DEVICE_STATE_STOP), since the
+ *      reset issued when the VM starts back up will clear
+ *      VFIO_DEVICE_STATE_ERROR.
+ */
+ mutex_lock(&pds_vfio->state_mutex);
+ if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
+ (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio_dirty_is_enabled(pds_vfio)))
+ deferred_reset_needed = true;
+ mutex_unlock(&pds_vfio->state_mutex);
+
+ /* On the next user initiated state transition, the device will
+ * transition to VFIO_DEVICE_STATE_ERROR. At that point it's the user's
+ * responsibility to reset the device.
+ *
+ * If a VFIO_DEVICE_RESET is requested post recovery and before the next
+ * state transition, then the deferred reset state will be set to
+ * VFIO_DEVICE_STATE_RUNNING.
+ */
+ if (deferred_reset_needed)
+ pds_vfio_deferred_reset(pds_vfio, VFIO_DEVICE_STATE_ERROR);
+}
+
+static int
+pds_vfio_pci_notify_handler(struct notifier_block *nb,
+ unsigned long ecode,
+ void *data)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(nb, struct pds_vfio_pci_device, nb);
+ union pds_core_notifyq_comp *event = data;
+
+ dev_dbg(pds_vfio->coredev, "%s: event code %lu\n", __func__, ecode);
+
+ /* Nothing needs to be done for RESET state==0, as there is no notify
+ * or feedback mechanism available and it is possible that a state==0
+ * event is never seen.
+ *
+ * Any request from VFIO while state==0 will fail, returning an error
+ * that may cause the migration to fail.
+ */
+ if (ecode == PDS_EVENT_RESET) {
+ dev_info(pds_vfio->coredev, "%s: PDS_EVENT_RESET event received, state==%d\n",
+ __func__, event->reset.state);
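+ /* Only a state==1 notification requires action; the recovery
+ * worker decides whether a deferred reset is needed.
+ */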
+ if (event->reset.state == 1)
+ schedule_work(&pds_vfio->work);
+ }
+
+ return 0;
+}
+
+static int
+pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct notifier_block *nb = &pds_vfio->nb;
+ int err;
+
+ if (!nb->notifier_call) {
+ nb->notifier_call = pds_vfio_pci_notify_handler;
+ err = pdsc_register_notify(nb);
+ if (err) {
+ nb->notifier_call = NULL;
+ dev_err(pds_vfio->coredev, "failed to register pds event handler: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+ dev_dbg(pds_vfio->coredev, "pds event handler registered\n");
+ }
+
+ return 0;
+}
+
+static void
+pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (pds_vfio->nb.notifier_call) {
+ pdsc_unregister_notify(&pds_vfio->nb);
+ pds_vfio->nb.notifier_call = NULL;
+ }
+}
+
static int
pds_vfio_pci_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
@@ -44,14 +142,22 @@ pds_vfio_pci_probe(struct pci_dev *pdev,
goto out_put_vdev;
}
- err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev);
+ INIT_WORK(&pds_vfio->work, pds_vfio_recovery_work);
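+ /* set up recovery handling before the device is visible to userspace */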
+ err = pds_vfio_pci_register_event_handler(pds_vfio);
if (err)
goto out_unreg_client;
+ err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev);
+ if (err)
+ goto out_unreg_notify;
+
return 0;
+out_unreg_notify:
+ pds_vfio_pci_unregister_event_handler(pds_vfio);
out_unreg_client:
pds_vfio_unregister_client_cmd(pds_vfio);
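+ /* harmless if the recovery work was never scheduled */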
+ cancel_work_sync(&pds_vfio->work);
out_put_vdev:
vfio_put_device(&pds_vfio->vfio_coredev.vdev);
return err;
@@ -62,6 +168,8 @@ pds_vfio_pci_remove(struct pci_dev *pdev)
{
struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
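+ /* stop event delivery first so no new recovery work can be queued */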
+ pds_vfio_pci_unregister_event_handler(pds_vfio);
+ cancel_work_sync(&pds_vfio->work);
vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
vfio_put_device(&pds_vfio->vfio_coredev.vdev);
}
@@ -25,10 +25,17 @@ pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
if (pds_vfio->deferred_reset) {
pds_vfio->deferred_reset = false;
if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
- pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+ dev_dbg(&pds_vfio->pdev->dev, "Transitioning from VFIO_DEVICE_STATE_ERROR to %s\n",
+ pds_vfio_lm_state(pds_vfio->deferred_reset_state));
+ pds_vfio->state = pds_vfio->deferred_reset_state;
pds_vfio_put_restore_file(pds_vfio);
pds_vfio_put_save_file(pds_vfio);
+ } else if (pds_vfio->deferred_reset_state == VFIO_DEVICE_STATE_ERROR) {
+ dev_dbg(&pds_vfio->pdev->dev, "Transitioning from %s to VFIO_DEVICE_STATE_ERROR based on deferred_reset request\n",
+ pds_vfio_lm_state(pds_vfio->state));
+ pds_vfio->state = VFIO_DEVICE_STATE_ERROR;
}
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
spin_unlock(&pds_vfio->reset_lock);
goto again;
}
@@ -41,6 +48,7 @@ pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
{
spin_lock(&pds_vfio->reset_lock);
pds_vfio->deferred_reset = true;
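+ /* a VFIO_DEVICE_RESET always recovers the device to RUNNING */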
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
if (!mutex_trylock(&pds_vfio->state_mutex)) {
spin_unlock(&pds_vfio->reset_lock);
return;
@@ -49,6 +57,18 @@ pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
pds_vfio_state_mutex_unlock(pds_vfio);
}
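+/* Record an asynchronous reset request. The state change itself is
+ * applied the next time pds_vfio_state_mutex_unlock() runs, so the
+ * transition happens under the state_mutex rather than asynchronously
+ * here.
+ */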
+void
+pds_vfio_deferred_reset(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state reset_state)
+{
+ dev_info(&pds_vfio->pdev->dev, "Requesting deferred_reset to state %s\n",
+ pds_vfio_lm_state(reset_state));
+ spin_lock(&pds_vfio->reset_lock);
+ pds_vfio->deferred_reset = true;
+ pds_vfio->deferred_reset_state = reset_state;
+ spin_unlock(&pds_vfio->reset_lock);
+}
+
static struct file *
pds_vfio_set_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state new_state)
@@ -59,7 +79,13 @@ pds_vfio_set_device_state(struct vfio_device *vdev,
struct file *res = NULL;
mutex_lock(&pds_vfio->state_mutex);
- while (new_state != pds_vfio->state) {
+ /* The only way to transition out of VFIO_DEVICE_STATE_ERROR is via
+ * VFIO_DEVICE_RESET, so prevent the state machine from running since
+ * vfio_mig_get_next_state() will trigger a WARN_ON() when transitioning
+ * from VFIO_DEVICE_STATE_ERROR to any other state.
+ */
+ while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR &&
+ new_state != pds_vfio->state) {
enum vfio_device_mig_state next_state;
int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
@@ -81,6 +107,9 @@ pds_vfio_set_device_state(struct vfio_device *vdev,
}
}
pds_vfio_state_mutex_unlock(pds_vfio);
+ /* an ERROR state here means a deferred_reset is still pending; fail
+ * so the user knows to issue a VFIO_DEVICE_RESET
+ */
+ if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
+ res = ERR_PTR(-EIO);
return res;
}
@@ -165,6 +194,7 @@ pds_vfio_open_device(struct vfio_device *vdev)
dev_dbg(&pds_vfio->pdev->dev, "%s: %s => VFIO_DEVICE_STATE_RUNNING\n",
__func__, pds_vfio_lm_state(pds_vfio->state));
pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
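+ /* any deferred reset issued during this open defaults to RUNNING */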
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
@@ -23,6 +23,9 @@ struct pds_vfio_pci_device {
enum vfio_device_mig_state state;
spinlock_t reset_lock; /* protect reset_done flow */
u8 deferred_reset;
+ enum vfio_device_mig_state deferred_reset_state; /* state to enter on deferred reset */
+ struct work_struct work; /* recovery work */
+ struct notifier_block nb; /* pds_core event notifier */
int vf_id;
int pci_id;
@@ -32,5 +35,6 @@ struct pds_vfio_pci_device {
const struct vfio_device_ops *pds_vfio_ops_info(void);
struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
-
+void pds_vfio_deferred_reset(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state reset_state);
#endif /* _VFIO_DEV_H_ */