@@ -19,6 +19,113 @@
#define PDS_VFIO_DRV_DESCRIPTION "AMD/Pensando VFIO Device Driver"
#define PCI_VENDOR_ID_PENSANDO 0x1dd8
+static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
+{
+ bool deferred_reset_needed = false;
+
+ /*
+ * Documentation states that the kernel migration driver must not
+ * generate asynchronous device state transitions outside of
+ * manipulation by the user or the VFIO_DEVICE_RESET ioctl.
+ *
+ * Since recovery is an asynchronous event received from the device,
+ * initiate a deferred reset. Issue a deferred reset in the following
+ * situations:
+ * 1. Migration is in progress, which will cause the next step of
+ * the migration to fail.
+ * 2. If the device is in a state that will be set to
+ * VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is
+ * shutdown and device is in VFIO_DEVICE_STATE_STOP).
+ */
+ mutex_lock(&pds_vfio->state_mutex);
+ if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
+ (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio_dirty_is_enabled(pds_vfio)))
+ deferred_reset_needed = true;
+ mutex_unlock(&pds_vfio->state_mutex);
+
+ /*
+ * On the next user initiated state transition, the device will
+ * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's
+ * responsibility to reset the device.
+ *
+ * If a VFIO_DEVICE_RESET is requested post recovery and before the next
+ * state transition, then the deferred reset state will be set to
+ * VFIO_DEVICE_STATE_RUNNING.
+ */
+ if (deferred_reset_needed) {
+ spin_lock(&pds_vfio->reset_lock);
+ pds_vfio->deferred_reset = true;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
+ spin_unlock(&pds_vfio->reset_lock);
+ }
+}
+
+static int pds_vfio_pci_notify_handler(struct notifier_block *nb,
+ unsigned long ecode, void *data)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(nb, struct pds_vfio_pci_device, nb);
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_notifyq_comp *event = data;
+
+ dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);
+
+ /*
+ * We don't need to do anything for RESET state==0 as there is no notify
+ * or feedback mechanism available, and it is possible that we won't
+ * even see a state==0 event since the pds_core recovery is pending.
+ *
+ * Any requests from VFIO while state==0 will fail, which will return
+ * error and may cause migration to fail.
+ */
+ if (ecode == PDS_EVENT_RESET) {
+ dev_info(dev, "%s: PDS_EVENT_RESET event received, state==%d\n",
+ __func__, event->reset.state);
+ /*
+ * pds_core device finished recovery and sent us the
+ * notification (state == 1) to allow us to recover
+ */
+ if (event->reset.state == 1)
+ pds_vfio_recovery(pds_vfio);
+ }
+
+ return 0;
+}
+
+static int
+pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ struct notifier_block *nb = &pds_vfio->nb;
+ int err;
+
+ if (!nb->notifier_call) {
+ nb->notifier_call = pds_vfio_pci_notify_handler;
+ err = pdsc_register_notify(nb);
+ if (err) {
+ nb->notifier_call = NULL;
+ dev_err(dev,
+ "failed to register pds event handler: %pe\n",
+ ERR_PTR(err));
+ return -EINVAL;
+ }
+ dev_dbg(dev, "pds event handler registered\n");
+ }
+
+ return 0;
+}
+
+static void
+pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (pds_vfio->nb.notifier_call) {
+ pdsc_unregister_notify(&pds_vfio->nb);
+ pds_vfio->nb.notifier_call = NULL;
+ }
+}
+
static int pds_vfio_pci_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
@@ -48,8 +155,14 @@ static int pds_vfio_pci_probe(struct pci_dev *pdev,
goto out_unregister_coredev;
}
+ err = pds_vfio_pci_register_event_handler(pds_vfio);
+ if (err)
+ goto out_unregister_client;
+
return 0;
+out_unregister_client:
+ pds_vfio_unregister_client_cmd(pds_vfio);
out_unregister_coredev:
vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
out_put_vdev:
@@ -61,6 +174,7 @@ static void pds_vfio_pci_remove(struct pci_dev *pdev)
{
struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+ pds_vfio_pci_unregister_event_handler(pds_vfio);
pds_vfio_unregister_client_cmd(pds_vfio);
vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
vfio_put_device(&pds_vfio->vfio_coredev.vdev);
@@ -33,11 +33,12 @@ void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
if (pds_vfio->deferred_reset) {
pds_vfio->deferred_reset = false;
if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
- pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
pds_vfio_put_restore_file(pds_vfio);
pds_vfio_put_save_file(pds_vfio);
pds_vfio_dirty_disable(pds_vfio, false);
}
+ pds_vfio->state = pds_vfio->deferred_reset_state;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
spin_unlock(&pds_vfio->reset_lock);
goto again;
}
@@ -49,6 +50,7 @@ void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
{
spin_lock(&pds_vfio->reset_lock);
pds_vfio->deferred_reset = true;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
if (!mutex_trylock(&pds_vfio->state_mutex)) {
spin_unlock(&pds_vfio->reset_lock);
return;
@@ -67,7 +69,14 @@ pds_vfio_set_device_state(struct vfio_device *vdev,
struct file *res = NULL;
mutex_lock(&pds_vfio->state_mutex);
- while (new_state != pds_vfio->state) {
+ /*
+ * only way to transition out of VFIO_DEVICE_STATE_ERROR is via
+ * VFIO_DEVICE_RESET, so prevent the state machine from running since
+ * vfio_mig_get_next_state() will throw a WARN_ON() when transitioning
+ * from VFIO_DEVICE_STATE_ERROR to any other state
+ */
+ while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR &&
+ new_state != pds_vfio->state) {
enum vfio_device_mig_state next_state;
int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
@@ -89,6 +98,9 @@ pds_vfio_set_device_state(struct vfio_device *vdev,
}
}
pds_vfio_state_mutex_unlock(pds_vfio);
+ /* still waiting on a deferred_reset */
+ if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
+ res = ERR_PTR(-EIO);
return res;
}
@@ -169,6 +181,7 @@ static int pds_vfio_open_device(struct vfio_device *vdev)
mutex_init(&pds_vfio->state_mutex);
pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
@@ -23,6 +23,8 @@ struct pds_vfio_pci_device {
enum vfio_device_mig_state state;
spinlock_t reset_lock; /* protect reset_done flow */
u8 deferred_reset;
+ enum vfio_device_mig_state deferred_reset_state;
+ struct notifier_block nb;
int vf_id;
u16 client_id;