| Message ID | 20211019105838.227569-15-yishaih@nvidia.com (mailing list archive) |
|---|---|
| State | Superseded |
| Delegated to | Bjorn Helgaas |
| Series | Add mlx5 live migration driver |
On Tue, 19 Oct 2021 13:58:38 +0300
Yishai Hadas <yishaih@nvidia.com> wrote:

> Register its own handler for pci_error_handlers.reset_done and update
> state accordingly.
>
> Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>  drivers/vfio/pci/mlx5/main.c | 33 ++++++++++++++++++++++++++++++++-
>  1 file changed, 32 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
> index 621b7fc60544..b759c6e153b6 100644
> --- a/drivers/vfio/pci/mlx5/main.c
> +++ b/drivers/vfio/pci/mlx5/main.c
> @@ -58,6 +58,7 @@ struct mlx5vf_pci_core_device {
>          /* protect migration state */
>          struct mutex state_mutex;
>          struct mlx5vf_pci_migration_info vmig;
> +        struct work_struct work;
>  };
>
>  static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev)
> @@ -615,6 +616,27 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
>          .match = vfio_pci_core_match,
>  };
>
> +static void mlx5vf_reset_work_handler(struct work_struct *work)
> +{
> +        struct mlx5vf_pci_core_device *mvdev =
> +                container_of(work, struct mlx5vf_pci_core_device, work);
> +
> +        mutex_lock(&mvdev->state_mutex);
> +        mlx5vf_reset_mig_state(mvdev);

I see this calls mlx5vf_reset_vhca_state() but how does that unfreeze
and unquiesce the device as necessary to get back to _RUNNING?

> +        mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
> +        mutex_unlock(&mvdev->state_mutex);
> +}
> +
> +static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +        struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
> +
> +        if (!mvdev->migrate_cap)
> +                return;
> +
> +        schedule_work(&mvdev->work);

This seems troublesome, how long does userspace poll the device state
after reset to get back to _RUNNING?  Seems we at least need a
flush_work() call when userspace reads the device state.  Thanks,

Alex

> +}
> +
>  static int mlx5vf_pci_probe(struct pci_dev *pdev,
>                              const struct pci_device_id *id)
>  {
> @@ -634,6 +656,8 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev,
>                  if (MLX5_CAP_GEN(mdev, migration)) {
>                          mvdev->migrate_cap = 1;
>                          mutex_init(&mvdev->state_mutex);
> +                        INIT_WORK(&mvdev->work,
> +                                  mlx5vf_reset_work_handler);
>                  }
>                  mlx5_vf_put_core_dev(mdev);
>          }
> @@ -656,6 +680,8 @@ static void mlx5vf_pci_remove(struct pci_dev *pdev)
>  {
>          struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
>
> +        if (mvdev->migrate_cap)
> +                cancel_work_sync(&mvdev->work);
>          vfio_pci_core_unregister_device(&mvdev->core_device);
>          vfio_pci_core_uninit_device(&mvdev->core_device);
>          kfree(mvdev);
> @@ -668,12 +694,17 @@ static const struct pci_device_id mlx5vf_pci_table[] = {
>
>  MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
>
> +const struct pci_error_handlers mlx5vf_err_handlers = {
> +        .reset_done = mlx5vf_pci_aer_reset_done,
> +        .error_detected = vfio_pci_aer_err_detected,
> +};
> +
>  static struct pci_driver mlx5vf_pci_driver = {
>          .name = KBUILD_MODNAME,
>          .id_table = mlx5vf_pci_table,
>          .probe = mlx5vf_pci_probe,
>          .remove = mlx5vf_pci_remove,
> -        .err_handler = &vfio_pci_core_err_handlers,
> +        .err_handler = &mlx5vf_err_handlers,
>  };
>
>  static void __exit mlx5vf_pci_cleanup(void)
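As a minimal sketch of the flush_work() suggestion above: the posted patch does
not show the path that reports device_state to userspace, so the handler name
and arguments below are illustrative only; the point is flushing the pending
reset work before returning the state, so a read issued right after reset
cannot observe a stale pre-reset value.

/*
 * Hypothetical read path inside drivers/vfio/pci/mlx5/main.c.
 * Only the flush_work() call is the suggestion being illustrated.
 */
static ssize_t mlx5vf_read_device_state(struct mlx5vf_pci_core_device *mvdev,
                                        char __user *buf, size_t count)
{
        u32 state;

        /* Wait for a reset_done work queued by AER/FLR to finish first. */
        flush_work(&mvdev->work);

        mutex_lock(&mvdev->state_mutex);
        state = mvdev->vmig.vfio_dev_state;
        mutex_unlock(&mvdev->state_mutex);

        if (count < sizeof(state))
                return -EINVAL;
        if (copy_to_user(buf, &state, sizeof(state)))
                return -EFAULT;
        return sizeof(state);
}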
On Tue, Oct 19, 2021 at 12:55:13PM -0600, Alex Williamson wrote:

> > +static void mlx5vf_reset_work_handler(struct work_struct *work)
> > +{
> > +        struct mlx5vf_pci_core_device *mvdev =
> > +                container_of(work, struct mlx5vf_pci_core_device, work);
> > +
> > +        mutex_lock(&mvdev->state_mutex);
> > +        mlx5vf_reset_mig_state(mvdev);
>
> I see this calls mlx5vf_reset_vhca_state() but how does that unfreeze
> and unquiesce the device as necessary to get back to _RUNNING?

FLR of the function does it.

Same flow as if userspace attaches the vfio migration, freezes the
device, then closes the FD. The FLR puts everything in the device right
and the next open will see a functional, unfrozen, blank device.

> > +        mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
> > +        mutex_unlock(&mvdev->state_mutex);
> > +}
> > +
> > +static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
> > +{
> > +        struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
> > +
> > +        if (!mvdev->migrate_cap)
> > +                return;
> > +
> > +        schedule_work(&mvdev->work);
>
> This seems troublesome, how long does userspace poll the device state
> after reset to get back to _RUNNING?  Seems we at least need a
> flush_work() call when userspace reads the device state.  Thanks,

The locking is very troubled here because the higher VFIO layers are
holding locks across reset and using those same locks with the mm_lock.

The delay is a good point :(

The other algorithm that can rescue this is to defer the cleanup work
to the mutex unlock, whichever context happens to get to it:

 reset_done:
   spin_lock(spin)
   deferred_reset = true;
   if (!mutex_trylock(&state_mutex)) {
      spin_unlock(spin)
      return
   }
   spin_unlock(spin)

   state_mutex_unlock()

 state_mutex_unlock:
  again:
   spin_lock(spin)
   if (deferred_reset) {
      deferred_reset = false;
      spin_unlock(spin)
      do_post_reset;
      goto again;
   }
   mutex_unlock(state_mutex);
   spin_unlock(spin)

and call state_mutex_unlock() in all unlock cases.

It is a much more complicated algorithm than the work.

Yishai, this should also have had a comment explaining why this is
needed, as nobody is going to guess an ABBA deadlock on mm_lock is the
reason.

Jason
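To make the proposal above concrete, a hedged C sketch of the deferred-reset
scheme in the context of this driver. It assumes a spinlock 'reset_lock' and a
bool 'deferred_reset' are added to struct mlx5vf_pci_core_device; both names
are illustrative, not taken from the posted patch.

/*
 * reset_done() cannot sleep on state_mutex (ABBA with mm_lock via the
 * VFIO core), so it either grabs the mutex with a trylock or marks the
 * reset as deferred; whichever context releases state_mutex last
 * performs the post-reset cleanup.
 */
static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
        spin_lock(&mvdev->reset_lock);
        if (mvdev->deferred_reset) {
                mvdev->deferred_reset = false;
                spin_unlock(&mvdev->reset_lock);
                mlx5vf_reset_mig_state(mvdev);
                mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
                goto again;
        }
        mutex_unlock(&mvdev->state_mutex);
        spin_unlock(&mvdev->reset_lock);
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
        struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);

        if (!mvdev->migrate_cap)
                return;

        spin_lock(&mvdev->reset_lock);
        mvdev->deferred_reset = true;
        if (!mutex_trylock(&mvdev->state_mutex)) {
                /* The state_mutex holder does the cleanup on unlock. */
                spin_unlock(&mvdev->reset_lock);
                return;
        }
        spin_unlock(&mvdev->reset_lock);
        mlx5vf_state_mutex_unlock(mvdev);
}

Every path that currently calls mutex_unlock(&mvdev->state_mutex) would call
mlx5vf_state_mutex_unlock() instead, so the reset cleanup is observed
synchronously by the next userspace access and no workqueue is needed.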
On 10/19/2021 10:10 PM, Jason Gunthorpe wrote:
> On Tue, Oct 19, 2021 at 12:55:13PM -0600, Alex Williamson wrote:
>
>>> +static void mlx5vf_reset_work_handler(struct work_struct *work)
>>> +{
>>> +        struct mlx5vf_pci_core_device *mvdev =
>>> +                container_of(work, struct mlx5vf_pci_core_device, work);
>>> +
>>> +        mutex_lock(&mvdev->state_mutex);
>>> +        mlx5vf_reset_mig_state(mvdev);
>> I see this calls mlx5vf_reset_vhca_state() but how does that unfreeze
>> and unquiesce the device as necessary to get back to _RUNNING?
> FLR of the function does it.
>
> Same flow as if userspace attaches the vfio migration, freezes the
> device, then closes the FD. The FLR puts everything in the device right
> and the next open will see a functional, unfrozen, blank device.

Right

>>> +        mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
>>> +        mutex_unlock(&mvdev->state_mutex);
>>> +}
>>> +
>>> +static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
>>> +{
>>> +        struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
>>> +
>>> +        if (!mvdev->migrate_cap)
>>> +                return;
>>> +
>>> +        schedule_work(&mvdev->work);
>> This seems troublesome, how long does userspace poll the device state
>> after reset to get back to _RUNNING? Seems we at least need a
>> flush_work() call when userspace reads the device state. Thanks,
> The locking is very troubled here because the higher VFIO layers are
> holding locks across reset and using those same locks with the mm_lock.
>
> The delay is a good point :(

What is the expectation for a reasonable delay? We may expect this
system WQ to run only short tasks and be very responsive.

See
https://elixir.bootlin.com/linux/v5.15-rc6/source/include/linux/workqueue.h#L355

> The other algorithm that can rescue this is to defer the cleanup work
> to the mutex unlock, whichever context happens to get to it:
>
>  reset_done:
>    spin_lock(spin)
>    deferred_reset = true;
>    if (!mutex_trylock(&state_mutex)) {
>       spin_unlock(spin)
>       return
>    }
>    spin_unlock(spin)
>
>    state_mutex_unlock()
>
>  state_mutex_unlock:
>   again:
>    spin_lock(spin)
>    if (deferred_reset) {
>       deferred_reset = false;
>       spin_unlock(spin)
>       do_post_reset;
>       goto again;
>    }
>    mutex_unlock(state_mutex);
>    spin_unlock(spin)
>
> and call state_mutex_unlock() in all unlock cases.
>
> It is a much more complicated algorithm than the work.

Right, it seems much more complicated compared to the current code.

In addition, I'd need to get into the details of the above algorithm;
I'm not sure that I got all of them yet.

> Yishai, this should also have had a comment explaining why this is
> needed, as nobody is going to guess an ABBA deadlock on mm_lock is the
> reason.

Sure, this can be done.

Are we fine to start/stay with the current simple approach that I wrote
and tested so far?

Yishai
On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:

> What is the expectation for a reasonable delay? We may expect this
> system WQ to run only short tasks and be very responsive.

If the expectation is that qemu will see the error return and then turn
around and issue an FLR followed by another state operation, then it
does seem strange that there would be a delay.

On the other hand, this doesn't seem that useful. If qemu tries to
migrate and the device fails, then the migration operation is toast and
possibly the device is wrecked. It can't really issue an FLR without
coordinating with the VM, and it cannot resume the VM as the device is
now irrecoverably messed up.

If we look at this from a RAS perspective, what would be useful here is
a way for qemu to request fail-safe migration data. This must always be
available and cannot fail.

When the fail-safe is loaded into the device it would trigger the
device's built-in RAS features to co-ordinate with the VM driver and
recover. Perhaps qemu would also have to inject an AER or something.

Basically, instead of the device starting in an "empty, ready to use"
state, it would start in a "failure detected, needs recovery" state.

Not hitless, but it preserves overall availability vs a failed
migration == VM crash.

That said, it is just a thought, and I don't know if anyone has put any
resources into what to do if migration operations fail right now.

But failure is possible, ie the physical device could have crashed and
perhaps the migration is to move the VMs off the broken HW. In this
scenario all the migration operations will time out and fail in the
driver.

However, since the guest VM could issue an FLR at any time, we really
shouldn't have this kind of operation floating around in the
background. Things must be made deterministic for qemu, eg if qemu gets
a guest request for FLR during the pre-copy stage it really should
abort the pre-copy, issue the FLR and then restart the migration. I
think it is unreasonable to ask a device to be able to maintain
pre-copy across FLR.

To make this work, the restarting of the migration must not race with a
scheduled work wiping out all the state.

So, regrettably, something is needed here.

Ideally more of this logic would be in shared code, but I'm not sure I
have a good feeling for what that should look like at this point.
Something to attempt once there are a few more implementations. For
instance, the if-predicate ladder I mentioned in the last email should
be shared code, not driver code, as it is fundamental ABI.

Jason
On Wed, 20 Oct 2021 13:46:29 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:
>
> > What is the expectation for a reasonable delay? We may expect this
> > system WQ to run only short tasks and be very responsive.
>
> If the expectation is that qemu will see the error return and then turn
> around and issue an FLR followed by another state operation, then it
> does seem strange that there would be a delay.
>
> On the other hand, this doesn't seem that useful. If qemu tries to
> migrate and the device fails, then the migration operation is toast and
> possibly the device is wrecked. It can't really issue an FLR without
> coordinating with the VM, and it cannot resume the VM as the device is
> now irrecoverably messed up.
>
> If we look at this from a RAS perspective, what would be useful here is
> a way for qemu to request fail-safe migration data. This must always be
> available and cannot fail.
>
> When the fail-safe is loaded into the device it would trigger the
> device's built-in RAS features to co-ordinate with the VM driver and
> recover. Perhaps qemu would also have to inject an AER or something.
>
> Basically, instead of the device starting in an "empty, ready to use"
> state, it would start in a "failure detected, needs recovery" state.

The "fail-safe recovery state" is essentially the reset state of the
device. If a device enters an error state during migration, I would
think the ultimate recovery procedure would be to abort the migration,
send an AER to the VM, whereby the guest would trigger a reset, and the
RAS capabilities of the guest would handle failing over to a multipath
device, ejecting the failing device, etc.

However, regardless of the migration recovery strategy, userspace needs
a means to get the device back into an initial state in a deterministic
way without closing and re-opening the device (or polling for an
arbitrary length of time). That's the minimum viable product here.
Thanks,

Alex
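As an illustration of the deterministic behaviour being asked for, a hedged
userspace-side sketch against the v1 VFIO migration uAPI. The migration-region
discovery through VFIO_DEVICE_GET_REGION_INFO is assumed and omitted;
'device_fd' and 'mig_region_offset' are placeholders for that setup.

/*
 * After VFIO_DEVICE_RESET returns, a single read of device_state from
 * the migration region should already report _RUNNING, with no polling
 * window.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int reset_and_expect_running(int device_fd, off_t mig_region_offset)
{
        uint32_t device_state;

        if (ioctl(device_fd, VFIO_DEVICE_RESET))
                return -1;

        /* device_state is the first field of vfio_device_migration_info */
        if (pread(device_fd, &device_state, sizeof(device_state),
                  mig_region_offset) != (ssize_t)sizeof(device_state))
                return -1;

        return device_state == VFIO_DEVICE_STATE_RUNNING ? 0 : -1;
}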
On Wed, Oct 20, 2021 at 11:45:14AM -0600, Alex Williamson wrote:
> On Wed, 20 Oct 2021 13:46:29 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> > On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:
> >
> > > What is the expectation for a reasonable delay? We may expect this
> > > system WQ to run only short tasks and be very responsive.
> >
> > If the expectation is that qemu will see the error return and then turn
> > around and issue an FLR followed by another state operation, then it
> > does seem strange that there would be a delay.
> >
> > On the other hand, this doesn't seem that useful. If qemu tries to
> > migrate and the device fails, then the migration operation is toast and
> > possibly the device is wrecked. It can't really issue an FLR without
> > coordinating with the VM, and it cannot resume the VM as the device is
> > now irrecoverably messed up.
> >
> > If we look at this from a RAS perspective, what would be useful here is
> > a way for qemu to request fail-safe migration data. This must always be
> > available and cannot fail.
> >
> > When the fail-safe is loaded into the device it would trigger the
> > device's built-in RAS features to co-ordinate with the VM driver and
> > recover. Perhaps qemu would also have to inject an AER or something.
> >
> > Basically, instead of the device starting in an "empty, ready to use"
> > state, it would start in a "failure detected, needs recovery" state.
>
> The "fail-safe recovery state" is essentially the reset state of the
> device.

This is only the case if qemu does work to isolate the recently FLR'd
device from the VM until the VM acknowledges that it understands it is
FLR'd.

At least it would have to remove it from CPU access and the IOMMU, as
though the memory enable bit was cleared.

Is it reasonable to do this using just qemu, AER and no device support?

> If a device enters an error state during migration, I would
> think the ultimate recovery procedure would be to abort the migration,
> send an AER to the VM, whereby the guest would trigger a reset, and
> the RAS capabilities of the guest would handle failing over to a
> multipath device, ejecting the failing device, etc.

Yes, this is my thinking, except I would not abort the migration but
continue on to the new hypervisor and then do the RAS recovery with the
new device.

Jason
On Wed, 20 Oct 2021 15:57:21 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Oct 20, 2021 at 11:45:14AM -0600, Alex Williamson wrote:
> > On Wed, 20 Oct 2021 13:46:29 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >
> > > On Wed, Oct 20, 2021 at 11:46:07AM +0300, Yishai Hadas wrote:
> > >
> > > > What is the expectation for a reasonable delay? We may expect this
> > > > system WQ to run only short tasks and be very responsive.
> > >
> > > If the expectation is that qemu will see the error return and then turn
> > > around and issue an FLR followed by another state operation, then it
> > > does seem strange that there would be a delay.
> > >
> > > On the other hand, this doesn't seem that useful. If qemu tries to
> > > migrate and the device fails, then the migration operation is toast and
> > > possibly the device is wrecked. It can't really issue an FLR without
> > > coordinating with the VM, and it cannot resume the VM as the device is
> > > now irrecoverably messed up.
> > >
> > > If we look at this from a RAS perspective, what would be useful here is
> > > a way for qemu to request fail-safe migration data. This must always be
> > > available and cannot fail.
> > >
> > > When the fail-safe is loaded into the device it would trigger the
> > > device's built-in RAS features to co-ordinate with the VM driver and
> > > recover. Perhaps qemu would also have to inject an AER or something.
> > >
> > > Basically, instead of the device starting in an "empty, ready to use"
> > > state, it would start in a "failure detected, needs recovery" state.
> >
> > The "fail-safe recovery state" is essentially the reset state of the
> > device.
>
> This is only the case if qemu does work to isolate the recently FLR'd
> device from the VM until the VM acknowledges that it understands it is
> FLR'd.
>
> At least it would have to remove it from CPU access and the IOMMU, as
> though the memory enable bit was cleared.
>
> Is it reasonable to do this using just qemu, AER and no device
> support?

I suspect yes, worst case could be a surprise hot-remove or DPC event,
but IIRC Linux will reset a device on a fatal AER error regardless of
the driver.

> > If a device enters an error state during migration, I would
> > think the ultimate recovery procedure would be to abort the migration,
> > send an AER to the VM, whereby the guest would trigger a reset, and
> > the RAS capabilities of the guest would handle failing over to a
> > multipath device, ejecting the failing device, etc.
>
> Yes, this is my thinking, except I would not abort the migration but
> continue on to the new hypervisor and then do the RAS recovery with
> the new device.

Potentially a valid option. QEMU might optionally insert a subsection
in the migration stream to indicate the device failed during the
migration process. The option might also allow migrating devices that
don't support migration, i.e. the recovery process on the target is the
same. This is essentially a policy decision, and I think QEMU probably
leans more towards failing the migration and letting a management tool
decide on the next course of action. Thanks,

Alex
On 10/20/2021 8:45 PM, Alex Williamson wrote:
> However, regardless of the migration recovery strategy, userspace needs
> a means to get the device back into an initial state in a deterministic
> way without closing and re-opening the device (or polling for an
> arbitrary length of time). That's the minimum viable product here.
> Thanks,
>
> Alex

OK, in V3 I'll use the algorithm that Jason has suggested in this mail
thread to let things be done in a deterministic way without using a WQ.

Thanks,
Yishai
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 621b7fc60544..b759c6e153b6 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -58,6 +58,7 @@ struct mlx5vf_pci_core_device {
         /* protect migration state */
         struct mutex state_mutex;
         struct mlx5vf_pci_migration_info vmig;
+        struct work_struct work;
 };
 
 static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev)
@@ -615,6 +616,27 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
         .match = vfio_pci_core_match,
 };
 
+static void mlx5vf_reset_work_handler(struct work_struct *work)
+{
+        struct mlx5vf_pci_core_device *mvdev =
+                container_of(work, struct mlx5vf_pci_core_device, work);
+
+        mutex_lock(&mvdev->state_mutex);
+        mlx5vf_reset_mig_state(mvdev);
+        mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING;
+        mutex_unlock(&mvdev->state_mutex);
+}
+
+static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+        struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
+
+        if (!mvdev->migrate_cap)
+                return;
+
+        schedule_work(&mvdev->work);
+}
+
 static int mlx5vf_pci_probe(struct pci_dev *pdev,
                             const struct pci_device_id *id)
 {
@@ -634,6 +656,8 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev,
                 if (MLX5_CAP_GEN(mdev, migration)) {
                         mvdev->migrate_cap = 1;
                         mutex_init(&mvdev->state_mutex);
+                        INIT_WORK(&mvdev->work,
+                                  mlx5vf_reset_work_handler);
                 }
                 mlx5_vf_put_core_dev(mdev);
         }
@@ -656,6 +680,8 @@ static void mlx5vf_pci_remove(struct pci_dev *pdev)
 {
         struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);
 
+        if (mvdev->migrate_cap)
+                cancel_work_sync(&mvdev->work);
         vfio_pci_core_unregister_device(&mvdev->core_device);
         vfio_pci_core_uninit_device(&mvdev->core_device);
         kfree(mvdev);
@@ -668,12 +694,17 @@ static const struct pci_device_id mlx5vf_pci_table[] = {
 
 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
 
+const struct pci_error_handlers mlx5vf_err_handlers = {
+        .reset_done = mlx5vf_pci_aer_reset_done,
+        .error_detected = vfio_pci_aer_err_detected,
+};
+
 static struct pci_driver mlx5vf_pci_driver = {
         .name = KBUILD_MODNAME,
         .id_table = mlx5vf_pci_table,
         .probe = mlx5vf_pci_probe,
         .remove = mlx5vf_pci_remove,
-        .err_handler = &vfio_pci_core_err_handlers,
+        .err_handler = &mlx5vf_err_handlers,
 };
 
 static void __exit mlx5vf_pci_cleanup(void)