Message ID: 20211019105838.227569-13-yishaih@nvidia.com
State: Superseded
Delegated to: Bjorn Helgaas
Series: Add mlx5 live migration driver
On Tue, 19 Oct 2021 13:58:36 +0300 Yishai Hadas <yishaih@nvidia.com> wrote: > This patch adds support for vfio_pci driver for mlx5 devices. > > It uses vfio_pci_core to register to the VFIO subsystem and then > implements the mlx5 specific logic in the migration area. > > The migration implementation follows the definition from uapi/vfio.h and > uses the mlx5 VF->PF command channel to achieve it. > > This patch implements the suspend/resume flows. > > Signed-off-by: Yishai Hadas <yishaih@nvidia.com> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > MAINTAINERS | 6 + > drivers/vfio/pci/Kconfig | 3 + > drivers/vfio/pci/Makefile | 2 + > drivers/vfio/pci/mlx5/Kconfig | 10 + > drivers/vfio/pci/mlx5/Makefile | 4 + > drivers/vfio/pci/mlx5/main.c | 696 +++++++++++++++++++++++++++++++++ > 6 files changed, 721 insertions(+) > create mode 100644 drivers/vfio/pci/mlx5/Kconfig > create mode 100644 drivers/vfio/pci/mlx5/Makefile > create mode 100644 drivers/vfio/pci/mlx5/main.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index abdcbcfef73d..e824bfab4a01 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -19699,6 +19699,12 @@ L: kvm@vger.kernel.org > S: Maintained > F: drivers/vfio/platform/ > > +VFIO MLX5 PCI DRIVER > +M: Yishai Hadas <yishaih@nvidia.com> > +L: kvm@vger.kernel.org > +S: Maintained > +F: drivers/vfio/pci/mlx5/ > + > VGA_SWITCHEROO > R: Lukas Wunner <lukas@wunner.de> > S: Maintained > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig > index 860424ccda1b..187b9c259944 100644 > --- a/drivers/vfio/pci/Kconfig > +++ b/drivers/vfio/pci/Kconfig > @@ -43,4 +43,7 @@ config VFIO_PCI_IGD > > To enable Intel IGD assignment through vfio-pci, say Y. > endif > + > +source "drivers/vfio/pci/mlx5/Kconfig" > + > endif > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile > index 349d68d242b4..ed9d6f2e0555 100644 > --- a/drivers/vfio/pci/Makefile > +++ b/drivers/vfio/pci/Makefile > @@ -7,3 +7,5 @@ obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o > vfio-pci-y := vfio_pci.o > vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o > obj-$(CONFIG_VFIO_PCI) += vfio-pci.o > + > +obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/ > diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig > new file mode 100644 > index 000000000000..119712656400 > --- /dev/null > +++ b/drivers/vfio/pci/mlx5/Kconfig > @@ -0,0 +1,10 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +config MLX5_VFIO_PCI > + tristate "VFIO support for MLX5 PCI devices" > + depends on MLX5_CORE > + select VFIO_PCI_CORE > + help > + This provides a migration support for MLX5 devices using the VFIO s/ a// > + framework. > + > + If you don't know what to do here, say N. > diff --git a/drivers/vfio/pci/mlx5/Makefile b/drivers/vfio/pci/mlx5/Makefile > new file mode 100644 > index 000000000000..689627da7ff5 > --- /dev/null > +++ b/drivers/vfio/pci/mlx5/Makefile > @@ -0,0 +1,4 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5-vfio-pci.o > +mlx5-vfio-pci-y := main.o cmd.o > + > diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c > new file mode 100644 > index 000000000000..621b7fc60544 > --- /dev/null > +++ b/drivers/vfio/pci/mlx5/main.c > @@ -0,0 +1,696 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved > + */ > + > +#include <linux/device.h> > +#include <linux/eventfd.h> > +#include <linux/file.h> > +#include <linux/interrupt.h> > +#include <linux/iommu.h> > +#include <linux/module.h> > +#include <linux/mutex.h> > +#include <linux/notifier.h> > +#include <linux/pci.h> > +#include <linux/pm_runtime.h> > +#include <linux/types.h> > +#include <linux/uaccess.h> > +#include <linux/vfio.h> > +#include <linux/sched/mm.h> > +#include <linux/vfio_pci_core.h> > + > +#include "cmd.h" > + > +enum { > + MLX5VF_PCI_FREEZED = 1 << 0, > +}; > + > +enum { > + MLX5VF_REGION_PENDING_BYTES = 1 << 0, > + MLX5VF_REGION_DATA_SIZE = 1 << 1, > +}; > + > +enum { > + MLX5VF_SUPPORTED_DEVICE_STATES = VFIO_DEVICE_STATE_RUNNING | > + VFIO_DEVICE_STATE_SAVING | > + VFIO_DEVICE_STATE_RESUMING, > +}; > + > +#define MLX5VF_MIG_REGION_DATA_SIZE SZ_128K > +/* Data section offset from migration region */ > +#define MLX5VF_MIG_REGION_DATA_OFFSET \ > + (sizeof(struct vfio_device_migration_info)) > + > +#define VFIO_DEVICE_MIGRATION_OFFSET(x) \ > + (offsetof(struct vfio_device_migration_info, x)) > + > +struct mlx5vf_pci_migration_info { > + u32 vfio_dev_state; /* VFIO_DEVICE_STATE_XXX */ > + u32 dev_state; /* device migration state */ > + u32 region_state; /* Use MLX5VF_REGION_XXX */ > + u16 vhca_id; > + struct mlx5_vhca_state_data vhca_state_data; > +}; > + > +struct mlx5vf_pci_core_device { > + struct vfio_pci_core_device core_device; > + u8 migrate_cap:1; > + /* protect migartion state */ s/migartion/migration/ > + struct mutex state_mutex; > + struct mlx5vf_pci_migration_info vmig; > +}; > + > +static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev) > +{ > + return mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev, > + mvdev->vmig.vhca_id, > + MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_MASTER); > +} > + > +static int mlx5vf_pci_quiesce_device(struct mlx5vf_pci_core_device *mvdev) > +{ > + return mlx5vf_cmd_suspend_vhca( > + mvdev->core_device.pdev, mvdev->vmig.vhca_id, > + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_MASTER); > +} > + > +static int mlx5vf_pci_unfreeze_device(struct mlx5vf_pci_core_device *mvdev) > +{ > + int ret; > + > + ret = mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev, > + mvdev->vmig.vhca_id, > + MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_SLAVE); > + if (ret) > + return ret; > + > + mvdev->vmig.dev_state &= ~MLX5VF_PCI_FREEZED; > + return 0; > +} > + > +static int mlx5vf_pci_freeze_device(struct mlx5vf_pci_core_device *mvdev) > +{ > + int ret; > + > + ret = mlx5vf_cmd_suspend_vhca( > + mvdev->core_device.pdev, mvdev->vmig.vhca_id, > + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_SLAVE); > + if (ret) > + return ret; > + > + mvdev->vmig.dev_state |= MLX5VF_PCI_FREEZED; > + return 0; > +} > + > +static int mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) > +{ > + u32 state_size = 0; > + int ret; > + > + if (!(mvdev->vmig.dev_state & MLX5VF_PCI_FREEZED)) > + return -EFAULT; > + > + /* If we already read state no reason to re-read */ > + if (mvdev->vmig.vhca_state_data.state_size) > + return 0; > + > + ret = mlx5vf_cmd_query_vhca_migration_state( > + mvdev->core_device.pdev, mvdev->vmig.vhca_id, &state_size); > + if (ret) > + return ret; > + > + return mlx5vf_cmd_save_vhca_state(mvdev->core_device.pdev, > + mvdev->vmig.vhca_id, state_size, > + &mvdev->vmig.vhca_state_data); > +} > + > +static int mlx5vf_pci_new_write_window(struct mlx5vf_pci_core_device *mvdev) > +{ > + struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data; > + u32 num_pages_needed; > + u64 
allocated_ready; > + u32 bytes_needed; > + > + /* Check how many bytes are available from previous flows */ > + WARN_ON(state_data->num_pages * PAGE_SIZE < > + state_data->win_start_offset); > + allocated_ready = (state_data->num_pages * PAGE_SIZE) - > + state_data->win_start_offset; > + WARN_ON(allocated_ready > MLX5VF_MIG_REGION_DATA_SIZE); > + > + bytes_needed = MLX5VF_MIG_REGION_DATA_SIZE - allocated_ready; > + if (!bytes_needed) > + return 0; > + > + num_pages_needed = DIV_ROUND_UP_ULL(bytes_needed, PAGE_SIZE); > + return mlx5vf_add_migration_pages(state_data, num_pages_needed); > +} > + > +static ssize_t > +mlx5vf_pci_handle_migration_data_size(struct mlx5vf_pci_core_device *mvdev, > + char __user *buf, bool iswrite) > +{ > + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; > + u64 data_size; > + int ret; > + > + if (iswrite) { > + /* data_size is writable only during resuming state */ > + if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_RESUMING) > + return -EINVAL; > + > + ret = copy_from_user(&data_size, buf, sizeof(data_size)); > + if (ret) > + return -EFAULT; > + > + vmig->vhca_state_data.state_size += data_size; > + vmig->vhca_state_data.win_start_offset += data_size; > + ret = mlx5vf_pci_new_write_window(mvdev); > + if (ret) > + return ret; > + > + } else { > + if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_SAVING) > + return -EINVAL; > + > + data_size = min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE, > + vmig->vhca_state_data.state_size - > + vmig->vhca_state_data.win_start_offset); > + ret = copy_to_user(buf, &data_size, sizeof(data_size)); > + if (ret) > + return -EFAULT; > + } > + > + vmig->region_state |= MLX5VF_REGION_DATA_SIZE; > + return sizeof(data_size); > +} > + > +static ssize_t > +mlx5vf_pci_handle_migration_data_offset(struct mlx5vf_pci_core_device *mvdev, > + char __user *buf, bool iswrite) > +{ > + static const u64 data_offset = MLX5VF_MIG_REGION_DATA_OFFSET; > + int ret; > + > + /* RO field */ > + if (iswrite) > + return -EFAULT; > + > + ret = copy_to_user(buf, &data_offset, sizeof(data_offset)); > + if (ret) > + return -EFAULT; > + > + return sizeof(data_offset); > +} > + > +static ssize_t > +mlx5vf_pci_handle_migration_pending_bytes(struct mlx5vf_pci_core_device *mvdev, > + char __user *buf, bool iswrite) > +{ > + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; > + u64 pending_bytes; > + int ret; > + > + /* RO field */ > + if (iswrite) > + return -EFAULT; > + > + if (vmig->vfio_dev_state == (VFIO_DEVICE_STATE_SAVING | > + VFIO_DEVICE_STATE_RUNNING)) { > + /* In pre-copy state we have no data to return for now, > + * return 0 pending bytes > + */ > + pending_bytes = 0; > + } else { > + if (!vmig->vhca_state_data.state_size) > + return 0; > + pending_bytes = vmig->vhca_state_data.state_size - > + vmig->vhca_state_data.win_start_offset; > + } > + > + ret = copy_to_user(buf, &pending_bytes, sizeof(pending_bytes)); > + if (ret) > + return -EFAULT; > + > + /* Window moves forward once data from previous iteration was read */ > + if (vmig->region_state & MLX5VF_REGION_DATA_SIZE) > + vmig->vhca_state_data.win_start_offset += > + min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE, pending_bytes); > + > + WARN_ON(vmig->vhca_state_data.win_start_offset > > + vmig->vhca_state_data.state_size); > + > + /* New iteration started */ > + vmig->region_state = MLX5VF_REGION_PENDING_BYTES; > + return sizeof(pending_bytes); > +} > + > +static int mlx5vf_load_state(struct mlx5vf_pci_core_device *mvdev) > +{ > + if (!mvdev->vmig.vhca_state_data.state_size) > + return 0; > + > + return 
mlx5vf_cmd_load_vhca_state(mvdev->core_device.pdev, > + mvdev->vmig.vhca_id, > + &mvdev->vmig.vhca_state_data); > +} > + > +static void mlx5vf_reset_mig_state(struct mlx5vf_pci_core_device *mvdev) > +{ > + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; > + > + vmig->region_state = 0; > + mlx5vf_reset_vhca_state(&vmig->vhca_state_data); > +} > + > +static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev, > + u32 state) > +{ > + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; > + u32 old_state = vmig->vfio_dev_state; > + int ret = 0; > + > + if (old_state == VFIO_DEVICE_STATE_ERROR || > + !VFIO_DEVICE_STATE_VALID(state) || > + (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > + return -EINVAL; > + > + /* Running switches off */ > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) && > + (old_state & VFIO_DEVICE_STATE_RUNNING)) { > + ret = mlx5vf_pci_quiesce_device(mvdev); > + if (ret) > + return ret; > + ret = mlx5vf_pci_freeze_device(mvdev); > + if (ret) { > + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; > + return ret; > + } > + } > + > + /* Resuming switches off */ > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && > + (old_state & VFIO_DEVICE_STATE_RESUMING)) { > + /* deserialize state into the device */ > + ret = mlx5vf_load_state(mvdev); > + if (ret) { > + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; > + return ret; > + } > + } > + > + /* Resuming switches on */ > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && > + (state & VFIO_DEVICE_STATE_RESUMING)) { > + mlx5vf_reset_mig_state(mvdev); > + ret = mlx5vf_pci_new_write_window(mvdev); > + if (ret) > + return ret; > + } A couple nits here... Perhaps: if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) { /* Resuming bit cleared */ if (old_state & VFIO_DEVICE_STATE_RESUMING) { ... } else { /* Resuming bit set */ ... } } Also u32 flipped_bits = old_state ^ state; or similar would simplify all these cases slightly. > + > + /* Saving switches on */ > + if (((old_state ^ state) & VFIO_DEVICE_STATE_SAVING) && > + (state & VFIO_DEVICE_STATE_SAVING)) { > + if (!(state & VFIO_DEVICE_STATE_RUNNING)) { > + /* serialize post copy */ > + ret = mlx5vf_pci_save_device_data(mvdev); > + if (ret) > + return ret; > + } > + } This doesn't catch all the cases, and in fact misses the most expected case where userspace clears the _RUNNING bit while _SAVING is already enabled. Does that mean this hasn't actually been tested with QEMU? It seems like there also needs to be a clause in the case where _RUNNING switches off to test if _SAVING is already set and has not toggled. > + > + /* Running switches on */ > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) && > + (state & VFIO_DEVICE_STATE_RUNNING)) { > + ret = mlx5vf_pci_unfreeze_device(mvdev); > + if (ret) > + return ret; > + ret = mlx5vf_pci_unquiesce_device(mvdev); > + if (ret) { > + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; > + return ret; > + } > + } Per previous discussion, I understand that freeze and quiesce are loosely stop-responding-to-dma and stop-sending-dma, respectively. Once we're quiesced and frozen, device state doesn't change. What are the implications to userspace that we don't expose a quiesce state (yet)? I'm wondering if this needs to be resolved before we introduce our first in-tree user of the uAPI (and before QEMU support becomes non-experimental). Thanks, Alex
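For reference, the rework sketched in the review above (a flipped-bits helper plus handling the missed case where _RUNNING clears while _SAVING stays set) could look roughly like the following; the helper names are the ones introduced by this patch, and the remaining transition clauses plus the final state assignment are abbreviated:

static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev,
				       u32 state)
{
	struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig;
	u32 old_state = vmig->vfio_dev_state;
	u32 flipped_bits = old_state ^ state;	/* bits changed by this write */
	int ret;

	if (old_state == VFIO_DEVICE_STATE_ERROR ||
	    !VFIO_DEVICE_STATE_VALID(state) ||
	    (state & ~MLX5VF_SUPPORTED_DEVICE_STATES))
		return -EINVAL;

	/* _RUNNING cleared: quiesce, freeze, then serialize if _SAVING is set */
	if ((flipped_bits & VFIO_DEVICE_STATE_RUNNING) &&
	    !(state & VFIO_DEVICE_STATE_RUNNING)) {
		ret = mlx5vf_pci_quiesce_device(mvdev);
		if (ret)
			return ret;
		ret = mlx5vf_pci_freeze_device(mvdev);
		if (ret)
			goto err;
		if (state & VFIO_DEVICE_STATE_SAVING) {
			/* stop-and-copy: serialize the frozen device state */
			ret = mlx5vf_pci_save_device_data(mvdev);
			if (ret)
				goto err;
		}
	}

	/*
	 * The _RESUMING set/clear and _RUNNING set clauses from the patch
	 * follow here, each keyed off flipped_bits in the same way.
	 */

	vmig->vfio_dev_state = state;
	return 0;

err:
	vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
	return ret;
}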
On Tue, Oct 19, 2021 at 12:43:52PM -0600, Alex Williamson wrote: > > + /* Running switches on */ > > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) && > > + (state & VFIO_DEVICE_STATE_RUNNING)) { > > + ret = mlx5vf_pci_unfreeze_device(mvdev); > > + if (ret) > > + return ret; > > + ret = mlx5vf_pci_unquiesce_device(mvdev); > > + if (ret) { > > + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; > > + return ret; > > + } > > + } > > Per previous discussion, I understand that freeze and quiesce are > loosely stop-responding-to-dma and stop-sending-dma, respectively. > Once we're quiesced and frozen, device state doesn't change. What are > the implications to userspace that we don't expose a quiesce state > (yet)? I'm wondering if this needs to be resolved before we introduce > our first in-tree user of the uAPI (and before QEMU support becomes > non-experimental). Thanks, The prototype patch I saw added a 4th bit to the state which was 1 == 'not dma initiating' As you suggested I think a cap bit someplace should be defined if the driver supports the 4th bit. Otherwise, I think it is backwards compatible, the new logic would be two ifs if ((flipped & STATE_NDMA) && (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_NDMA | STATE_RUNNING) mlx5vf_pci _quiesce_device() [..] if ((flipped == (STATE_NDMA)) && (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_RUNNING) mlx5vf_pci_unquiesce_device() Sequenced before/after the other calls to quiesce_device So if userspace doesn't use it then the same driver behavior is kept, as it never sees STATE_NDMA flip Asking for STATE_NDMA !STATE_RUNNING is just ignored because !RUNNING already implies NDMA .. and some optimization of the logic to avoid duplicated work Jason
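Filling in that outline, the two extra predicates might slot into the same set_device_state flow as below. VFIO_DEVICE_STATE_NDMA is purely hypothetical here (no uAPI bit or capability exists yet), and the second operand of each test is written against the requested new state:

/* Hypothetical bit; would need a uAPI definition plus a capability so
 * userspace can discover support. */
#define VFIO_DEVICE_STATE_NDMA	(1 << 3)

	/* NDMA set while _RUNNING stays set: stop initiating DMA but keep
	 * responding to incoming DMA (quiesce without freeze). */
	if ((flipped_bits & VFIO_DEVICE_STATE_NDMA) &&
	    (state & (VFIO_DEVICE_STATE_NDMA | VFIO_DEVICE_STATE_RUNNING)) ==
	    (VFIO_DEVICE_STATE_NDMA | VFIO_DEVICE_STATE_RUNNING)) {
		ret = mlx5vf_pci_quiesce_device(mvdev);
		if (ret)
			return ret;
	}

	/* ... existing _RUNNING/_SAVING/_RESUMING clauses sit here ... */

	/* NDMA cleared while _RUNNING stays set: resume initiating DMA. */
	if ((flipped_bits & VFIO_DEVICE_STATE_NDMA) &&
	    (state & (VFIO_DEVICE_STATE_NDMA | VFIO_DEVICE_STATE_RUNNING)) ==
	    VFIO_DEVICE_STATE_RUNNING) {
		ret = mlx5vf_pci_unquiesce_device(mvdev);
		if (ret) {
			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
			return ret;
		}
	}

	/* Requesting NDMA together with !_RUNNING needs no extra work here:
	 * clearing _RUNNING already quiesces the device. */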
On Tue, 19 Oct 2021 16:23:28 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Oct 19, 2021 at 12:43:52PM -0600, Alex Williamson wrote: > > > + /* Running switches on */ > > > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) && > > > + (state & VFIO_DEVICE_STATE_RUNNING)) { > > > + ret = mlx5vf_pci_unfreeze_device(mvdev); > > > + if (ret) > > > + return ret; > > > + ret = mlx5vf_pci_unquiesce_device(mvdev); > > > + if (ret) { > > > + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; > > > + return ret; > > > + } > > > + } > > > > Per previous discussion, I understand that freeze and quiesce are > > loosely stop-responding-to-dma and stop-sending-dma, respectively. > > Once we're quiesced and frozen, device state doesn't change. What are > > the implications to userspace that we don't expose a quiesce state > > (yet)? I'm wondering if this needs to be resolved before we introduce > > our first in-tree user of the uAPI (and before QEMU support becomes > > non-experimental). Thanks, > > The prototype patch I saw added a 4th bit to the state which was > 1 == 'not dma initiating' > As you suggested I think a cap bit someplace should be defined if the > driver supports the 4th bit. > > Otherwise, I think it is backwards compatible, the new logic would be > two ifs > > if ((flipped & STATE_NDMA) && > (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_NDMA | STATE_RUNNING) > mlx5vf_pci _quiesce_device() > > [..] > > if ((flipped == (STATE_NDMA)) && > (flipped & (STATE_NDMA | STATE_RUNNING)) == STATE_RUNNING) > mlx5vf_pci_unquiesce_device() > > Sequenced before/after the other calls to quiesce_device > > So if userspace doesn't use it then the same driver behavior is kept, > as it never sees STATE_NDMA flip > > Asking for STATE_NDMA !STATE_RUNNING is just ignored because !RUNNING > already implies NDMA > > .. and some optimization of the logic to avoid duplicated work Ok, so this new bit just augments how the device interprets _RUNNING, it's essentially a don't-care relative to _SAVING or _RESTORING. I think that gives us this table: | NDMA | RESUMING | SAVING | RUNNING | +----------+----------+----------+----------+ --- | X | 0 | 0 | 0 | ^ +----------+----------+----------+----------+ | | 0 | 0 | 0 | 1 | | +----------+----------+----------+----------+ | | X | 0 | 1 | 0 | +----------+----------+----------+----------+ NDMA value is either compatible | 0 | 0 | 1 | 1 | to existing behavior or don't +----------+----------+----------+----------+ care due to redundancy vs | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR +----------+----------+----------+----------+ | X | 1 | 0 | 1 | | +----------+----------+----------+----------+ | | X | 1 | 1 | 0 | | +----------+----------+----------+----------+ | | X | 1 | 1 | 1 | v +----------+----------+----------+----------+ --- | 1 | 0 | 0 | 1 | ^ +----------+----------+----------+----------+ Desired new useful cases | 1 | 0 | 1 | 1 | v +----------+----------+----------+----------+ --- Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can set which are simply redundant to the NDMA = 0 cases. Row 6 remains invalid due to lack of support for pre-copy (_RESUMING | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 are error states and cannot be set by userspace. Like other bits, setting the bit should be effective at the completion of writing device state. Therefore the device would need to flush any outbound DMA queues before returning. 
The question I was really trying to get to though is whether we have a supportable interface without such an extension. There's currently only an experimental version of vfio migration support for PCI devices in QEMU (afaik), so it seems like we could make use of the bus-master bit to fill this gap in QEMU currently, before we claim non-experimental support, but this new device agnostic extension would be required for non-PCI device support (and PCI support should adopt it as available). Does that sound right? Thanks, Alex
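Encoded as a check, the writable rows of the table above reduce to something like the sketch below (NDMA again hypothetical at bit 3); this is essentially what the existing VFIO_DEVICE_STATE_VALID() macro already enforces, as noted in the follow-up:

/* Which states userspace may set, per the table above (sketch only). */
static bool mlx5vf_state_settable(u32 state)
{
	u32 mig_bits = state & (VFIO_DEVICE_STATE_RESUMING |
				VFIO_DEVICE_STATE_SAVING |
				VFIO_DEVICE_STATE_RUNNING);

	/*
	 * Row 6 (_RESUMING | _RUNNING, i.e. pre-copy resume) is unsupported
	 * and rows 7-8 (_RESUMING together with _SAVING) are error states,
	 * so _RESUMING is only accepted on its own. The NDMA bit is a
	 * don't-care for validity: rows 1-5 stay redundant with it set,
	 * rows 9-10 are the new useful cases.
	 */
	if ((state & VFIO_DEVICE_STATE_RESUMING) &&
	    mig_bits != VFIO_DEVICE_STATE_RESUMING)
		return false;

	return true;
}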
On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote: > I think that gives us this table: > > | NDMA | RESUMING | SAVING | RUNNING | > +----------+----------+----------+----------+ --- > | X | 0 | 0 | 0 | ^ > +----------+----------+----------+----------+ | > | 0 | 0 | 0 | 1 | | > +----------+----------+----------+----------+ | > | X | 0 | 1 | 0 | > +----------+----------+----------+----------+ NDMA value is either compatible > | 0 | 0 | 1 | 1 | to existing behavior or don't > +----------+----------+----------+----------+ care due to redundancy vs > | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR > +----------+----------+----------+----------+ > | X | 1 | 0 | 1 | | > +----------+----------+----------+----------+ | > | X | 1 | 1 | 0 | | > +----------+----------+----------+----------+ | > | X | 1 | 1 | 1 | v > +----------+----------+----------+----------+ --- > | 1 | 0 | 0 | 1 | ^ > +----------+----------+----------+----------+ Desired new useful cases > | 1 | 0 | 1 | 1 | v > +----------+----------+----------+----------+ --- > > Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can > set which are simply redundant to the NDMA = 0 cases. It seems right > Row 6 remains invalid due to lack of support for pre-copy (_RESUMING > | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 > are error states and cannot be set by userspace. I wonder, did Yishai's series capture this row 6 restriction? Yishai? > Like other bits, setting the bit should be effective at the completion > of writing device state. Therefore the device would need to flush any > outbound DMA queues before returning. Yes, the device commands are expected to achieve this. > The question I was really trying to get to though is whether we have a > supportable interface without such an extension. There's currently > only an experimental version of vfio migration support for PCI devices > in QEMU (afaik), If I recall this only matters if you have a VM that is causing migratable devices to interact with each other. So long as the devices are only interacting with the CPU this extra step is not strictly needed. So, single device cases can be fine as-is IMHO the multi-device case the VMM should probably demand this support from the migration drivers, otherwise it cannot know if it is safe for sure. A config option to override the block if the admin knows there is no use case to cause devices to interact - eg two NVMe devices without CMB do not have a useful interaction. > so it seems like we could make use of the bus-master bit to fill > this gap in QEMU currently, before we claim non-experimental > support, but this new device agnostic extension would be required > for non-PCI device support (and PCI support should adopt it as > available). Does that sound right? Thanks, I don't think the bus master support is really a substitute, tripping bus master will stop DMA but it will not do so in a clean way and is likely to be non-transparent to the VM's driver. The single-device-assigned case is a cleaner restriction, IMHO. Alternatively we can add the 4th bit and insist that migration drivers support all the states. I'm just unsure what other HW can do, I get the feeling people have been designing to the migration description in the header file for a while and this is a new idea. Jason
On 10/19/2021 9:43 PM, Alex Williamson wrote: > >> + >> + /* Resuming switches off */ >> + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && >> + (old_state & VFIO_DEVICE_STATE_RESUMING)) { >> + /* deserialize state into the device */ >> + ret = mlx5vf_load_state(mvdev); >> + if (ret) { >> + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; >> + return ret; >> + } >> + } >> + >> + /* Resuming switches on */ >> + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && >> + (state & VFIO_DEVICE_STATE_RESUMING)) { >> + mlx5vf_reset_mig_state(mvdev); >> + ret = mlx5vf_pci_new_write_window(mvdev); >> + if (ret) >> + return ret; >> + } > A couple nits here... > > Perhaps: > > if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) { > /* Resuming bit cleared */ > if (old_state & VFIO_DEVICE_STATE_RESUMING) { > ... > } else { /* Resuming bit set */ > ... > } > } I tried to avoid nested 'if's as of some previous notes. The 'resuming' two cases are handled already above so functional wise the code covers this. Jason/Alex, Please recommend what is the preferred way, both options seems to be fine for me. > > Also > > u32 flipped_bits = old_state ^ state; > > or similar would simplify all these cases slightly. > Sure, will use it in V3. >> + >> + /* Saving switches on */ >> + if (((old_state ^ state) & VFIO_DEVICE_STATE_SAVING) && >> + (state & VFIO_DEVICE_STATE_SAVING)) { >> + if (!(state & VFIO_DEVICE_STATE_RUNNING)) { >> + /* serialize post copy */ >> + ret = mlx5vf_pci_save_device_data(mvdev); >> + if (ret) >> + return ret; >> + } >> + } > This doesn't catch all the cases, and in fact misses the most expected > case where userspace clears the _RUNNING bit while _SAVING is already > enabled. Does that mean this hasn't actually been tested with QEMU? I run QEMU with 'x-pre-copy-dirty-page-tracking=off' as current driver doesn't support dirty-pages. As so, it seems that this flow wasn't triggered by QEMU in my save/load test. > It seems like there also needs to be a clause in the case where > _RUNNING switches off to test if _SAVING is already set and has not > toggled. > This can be achieved by adding the below to current code, this assumes that we are fine with nested 'if's coding. Seems OK ? @@ -269,6 +269,7 @@ static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev, { struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; u32 old_state = vmig->vfio_dev_state; + u32 flipped_bits = old_state ^ state; int ret = 0; if (old_state == VFIO_DEVICE_STATE_ERROR || @@ -277,7 +278,7 @@ static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev, return -EINVAL; /* Running switches off */ - if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) && + if ((flipped_bits & VFIO_DEVICE_STATE_RUNNING) && (old_state & VFIO_DEVICE_STATE_RUNNING)) { ret = mlx5vf_pci_quiesce_device(mvdev); if (ret) @@ -287,10 +288,18 @@ static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev, vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; return ret; } + if (state & VFIO_DEVICE_STATE_SAVING) { + /* serialize post copy */ + ret = mlx5vf_pci_save_device_data(mvdev); + if (ret) { + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; + return ret; + } + } } Yishai
On 10/20/2021 2:04 AM, Jason Gunthorpe wrote: > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote: >> I think that gives us this table: >> >> | NDMA | RESUMING | SAVING | RUNNING | >> +----------+----------+----------+----------+ --- >> | X | 0 | 0 | 0 | ^ >> +----------+----------+----------+----------+ | >> | 0 | 0 | 0 | 1 | | >> +----------+----------+----------+----------+ | >> | X | 0 | 1 | 0 | >> +----------+----------+----------+----------+ NDMA value is either compatible >> | 0 | 0 | 1 | 1 | to existing behavior or don't >> +----------+----------+----------+----------+ care due to redundancy vs >> | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR >> +----------+----------+----------+----------+ >> | X | 1 | 0 | 1 | | >> +----------+----------+----------+----------+ | >> | X | 1 | 1 | 0 | | >> +----------+----------+----------+----------+ | >> | X | 1 | 1 | 1 | v >> +----------+----------+----------+----------+ --- >> | 1 | 0 | 0 | 1 | ^ >> +----------+----------+----------+----------+ Desired new useful cases >> | 1 | 0 | 1 | 1 | v >> +----------+----------+----------+----------+ --- >> >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can >> set which are simply redundant to the NDMA = 0 cases. > It seems right > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING >> | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 >> are error states and cannot be set by userspace. > I wonder, did Yishai's series capture this row 6 restriction? Yishai? It seems so, by using the below check which includes the !VFIO_DEVICE_STATE_VALID clause. if (old_state == VFIO_DEVICE_STATE_ERROR || !VFIO_DEVICE_STATE_VALID(state) || (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) return -EINVAL; Which is: #define VFIO_DEVICE_STATE_VALID(state) \ (state & VFIO_DEVICE_STATE_RESUMING ? \ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) > >> Like other bits, setting the bit should be effective at the completion >> of writing device state. Therefore the device would need to flush any >> outbound DMA queues before returning. > Yes, the device commands are expected to achieve this. > >> The question I was really trying to get to though is whether we have a >> supportable interface without such an extension. There's currently >> only an experimental version of vfio migration support for PCI devices >> in QEMU (afaik), > If I recall this only matters if you have a VM that is causing > migratable devices to interact with each other. So long as the devices > are only interacting with the CPU this extra step is not strictly > needed. > > So, single device cases can be fine as-is > > IMHO the multi-device case the VMM should probably demand this support > from the migration drivers, otherwise it cannot know if it is safe for > sure. > > A config option to override the block if the admin knows there is no > use case to cause devices to interact - eg two NVMe devices without > CMB do not have a useful interaction. > >> so it seems like we could make use of the bus-master bit to fill >> this gap in QEMU currently, before we claim non-experimental >> support, but this new device agnostic extension would be required >> for non-PCI device support (and PCI support should adopt it as >> available). Does that sound right? Thanks, > I don't think the bus master support is really a substitute, tripping > bus master will stop DMA but it will not do so in a clean way and is > likely to be non-transparent to the VM's driver. 
> > The single-device-assigned case is a cleaner restriction, IMHO. > > Alternatively we can add the 4th bit and insist that migration drivers > support all the states. I'm just unsure what other HW can do, I get > the feeling people have been designing to the migration description in > the header file for a while and this is a new idea. > > Jason Just to be sure, We refer here to some future functionality support with this extra 4th bit but it doesn't enforce any change in the submitted code, right ? The below code uses the (state & ~MLX5VF_SUPPORTED_DEVICE_STATES) clause which fails any usage of a non-supported bit as of this one. if (old_state == VFIO_DEVICE_STATE_ERROR || !VFIO_DEVICE_STATE_VALID(state) || (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) return -EINVAL; Yishai
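As a quick worked example of why no change is forced today, a write carrying the (hypothetical) extra bit already fails the supported-states mask:

	u32 state = VFIO_DEVICE_STATE_RUNNING | (1 << 3);	/* hypothetical NDMA bit */

	/*
	 * MLX5VF_SUPPORTED_DEVICE_STATES is _RUNNING | _SAVING | _RESUMING,
	 * so state & ~MLX5VF_SUPPORTED_DEVICE_STATES == (1 << 3) != 0 and
	 * mlx5vf_pci_set_device_state() returns -EINVAL until the driver
	 * advertises and handles the new bit.
	 */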
On Wed, Oct 20, 2021 at 11:01:01AM +0300, Yishai Hadas wrote: > On 10/19/2021 9:43 PM, Alex Williamson wrote: > > > > > + > > > + /* Resuming switches off */ > > > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && > > > + (old_state & VFIO_DEVICE_STATE_RESUMING)) { > > > + /* deserialize state into the device */ > > > + ret = mlx5vf_load_state(mvdev); > > > + if (ret) { > > > + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; > > > + return ret; > > > + } > > > + } > > > + > > > + /* Resuming switches on */ > > > + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && > > > + (state & VFIO_DEVICE_STATE_RESUMING)) { > > > + mlx5vf_reset_mig_state(mvdev); > > > + ret = mlx5vf_pci_new_write_window(mvdev); > > > + if (ret) > > > + return ret; > > > + } > > A couple nits here... > > > > Perhaps: > > > > if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) { > > /* Resuming bit cleared */ > > if (old_state & VFIO_DEVICE_STATE_RESUMING) { > > ... > > } else { /* Resuming bit set */ > > ... > > } > > } > > I tried to avoid nested 'if's as of some previous notes. The layout of the if blocks must follow a logical progression of the actions toward the chip: start precopy tracking quiesce freeze stop precopy tracking save state to buffer clear state buffer load state from buffer unfreeze unquiesce The order of the if blocks sets the precendence of the requested actions, because the bit-field view means userspace can request multiple actions concurrently. When adding the precopy actions into the above list we can see that the current patches ordering is backwards, save/load should be swapped. Idiomatically each action needs a single edge triggred predicate listed in the right order. The predicates we define here will become the true ABI for qemu to follow to operate the HW Also, the ordering of the predicates influences what usable state transitions exist. So, it is really important to get this right. > I run QEMU with 'x-pre-copy-dirty-page-tracking=off' as current driver > doesn't support dirty-pages. > > As so, it seems that this flow wasn't triggered by QEMU in my save/load > test. > > > It seems like there also needs to be a clause in the case where > > _RUNNING switches off to test if _SAVING is already set and has not > > toggled. > > > > This can be achieved by adding the below to current code, this assumes that > we are fine with nested 'if's coding. Like this: if ((flipped_bits & (RUNNING | SAVING)) && ((old_state & (RUNNING | SAVING)) == (RUNNING|SAVING)) /* enter pre-copy state */ if ((flipped_bits & (RUNNING | SAVING)) && ((old_state & (RUNNING | SAVING)) != (RUNNING|SAVING)) /* exit pre-copy state */ if ((flipped_bits & (RUNNING | SAVING)) && ((old_state & (RUNNING | SAVING)) == SAVING)) mlx5vf_pci_save_device_data() Jason
[Cc +dgilbert, +cohuck] On Wed, 20 Oct 2021 11:28:04 +0300 Yishai Hadas <yishaih@nvidia.com> wrote: > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote: > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote: > >> I think that gives us this table: > >> > >> | NDMA | RESUMING | SAVING | RUNNING | > >> +----------+----------+----------+----------+ --- > >> | X | 0 | 0 | 0 | ^ > >> +----------+----------+----------+----------+ | > >> | 0 | 0 | 0 | 1 | | > >> +----------+----------+----------+----------+ | > >> | X | 0 | 1 | 0 | > >> +----------+----------+----------+----------+ NDMA value is either compatible > >> | 0 | 0 | 1 | 1 | to existing behavior or don't > >> +----------+----------+----------+----------+ care due to redundancy vs > >> | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR > >> +----------+----------+----------+----------+ > >> | X | 1 | 0 | 1 | | > >> +----------+----------+----------+----------+ | > >> | X | 1 | 1 | 0 | | > >> +----------+----------+----------+----------+ | > >> | X | 1 | 1 | 1 | v > >> +----------+----------+----------+----------+ --- > >> | 1 | 0 | 0 | 1 | ^ > >> +----------+----------+----------+----------+ Desired new useful cases > >> | 1 | 0 | 1 | 1 | v > >> +----------+----------+----------+----------+ --- > >> > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can > >> set which are simply redundant to the NDMA = 0 cases. > > It seems right > > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING > >> | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 > >> are error states and cannot be set by userspace. > > I wonder, did Yishai's series capture this row 6 restriction? Yishai? > > > It seems so, by using the below check which includes the > !VFIO_DEVICE_STATE_VALID clause. > > if (old_state == VFIO_DEVICE_STATE_ERROR || > !VFIO_DEVICE_STATE_VALID(state) || > (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > return -EINVAL; > > Which is: > > #define VFIO_DEVICE_STATE_VALID(state) \ > (state & VFIO_DEVICE_STATE_RESUMING ? \ > (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) > > > > >> Like other bits, setting the bit should be effective at the completion > >> of writing device state. Therefore the device would need to flush any > >> outbound DMA queues before returning. > > Yes, the device commands are expected to achieve this. > > > >> The question I was really trying to get to though is whether we have a > >> supportable interface without such an extension. There's currently > >> only an experimental version of vfio migration support for PCI devices > >> in QEMU (afaik), > > If I recall this only matters if you have a VM that is causing > > migratable devices to interact with each other. So long as the devices > > are only interacting with the CPU this extra step is not strictly > > needed. > > > > So, single device cases can be fine as-is > > > > IMHO the multi-device case the VMM should probably demand this support > > from the migration drivers, otherwise it cannot know if it is safe for > > sure. > > > > A config option to override the block if the admin knows there is no > > use case to cause devices to interact - eg two NVMe devices without > > CMB do not have a useful interaction. > > > >> so it seems like we could make use of the bus-master bit to fill > >> this gap in QEMU currently, before we claim non-experimental > >> support, but this new device agnostic extension would be required > >> for non-PCI device support (and PCI support should adopt it as > >> available). 
Does that sound right? Thanks, > > I don't think the bus master support is really a substitute, tripping > > bus master will stop DMA but it will not do so in a clean way and is > > likely to be non-transparent to the VM's driver. > > > > The single-device-assigned case is a cleaner restriction, IMHO. > > > > Alternatively we can add the 4th bit and insist that migration drivers > > support all the states. I'm just unsure what other HW can do, I get > > the feeling people have been designing to the migration description in > > the header file for a while and this is a new idea. I'm wondering if we're imposing extra requirements on the !_RUNNING state that don't need to be there. For example, if we can assume that all devices within a userspace context are !_RUNNING before any of the devices begin to retrieve final state, then clearing of the _RUNNING bit becomes the device quiesce point and the beginning of reading device data is the point at which the device state is frozen and serialized. No new states required and essentially works with a slight rearrangement of the callbacks in this series. Why can't we do that? Maybe a clarification of the uAPI spec is sufficient to achieve this, ex. !_RUNNING devices may still update their internal state machine based on external access. Userspace is expected to quiesce all external access prior to initiating the retrieval of the final device state from the data section of the migration region. Failure to do so may result in inconsistent device state or optionally the device driver may induce a fault if a quiescent state is not maintained. > Just to be sure, > > We refer here to some future functionality support with this extra 4th > bit but it doesn't enforce any change in the submitted code, right ? > > The below code uses the (state & ~MLX5VF_SUPPORTED_DEVICE_STATES) clause > which fails any usage of a non-supported bit as of this one. > > if (old_state == VFIO_DEVICE_STATE_ERROR || > !VFIO_DEVICE_STATE_VALID(state) || > (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > return -EINVAL; Correct, userspace shouldn't be setting any extra bits unless we advertise support, such as via a capability or flag. Drivers need to continue to sanitize user input to validate yet-to-be-defined bits are not accepted from userspace or else we risk not being able to define them later without breaking userspace. Thanks, Alex
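In userspace terms the proposed ordering would look roughly like the sketch below, where mig_region_offset() and read_device_migration_data() are hypothetical helpers wrapping VFIO_DEVICE_GET_REGION_INFO for the migration region and the pending_bytes/data_offset/data_size read cycle:

/* Sketch: quiesce every device in the user context before reading final
 * state from any of them. */
static int stop_and_copy_all(int *device_fds, int ndevs)
{
	u32 state = VFIO_DEVICE_STATE_SAVING;	/* _RUNNING cleared */
	int i;

	/* Phase 1: stop all devices; external (p2p) access must end here. */
	for (i = 0; i < ndevs; i++) {
		off_t off = mig_region_offset(device_fds[i]) +
			    offsetof(struct vfio_device_migration_info,
				     device_state);

		if (pwrite(device_fds[i], &state, sizeof(state), off) !=
		    sizeof(state))
			return -errno;
	}

	/* Phase 2: only now collect the final device state from each one. */
	for (i = 0; i < ndevs; i++) {
		int ret = read_device_migration_data(device_fds[i]);

		if (ret)
			return ret;
	}
	return 0;
}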
On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote: > I'm wondering if we're imposing extra requirements on the !_RUNNING > state that don't need to be there. For example, if we can assume that > all devices within a userspace context are !_RUNNING before any of the > devices begin to retrieve final state, then clearing of the _RUNNING > bit becomes the device quiesce point and the beginning of reading > device data is the point at which the device state is frozen and > serialized. No new states required and essentially works with a slight > rearrangement of the callbacks in this series. Why can't we do that? It sounds worth checking carefully. I didn't come up with a major counter scenario. We would need to specifically define which user action triggers the device to freeze and serialize. Reading pending_bytes I suppose? Since freeze is a device command we need to be able to report failure and to move the state to error, that feels bad hidden inside reading pending_bytes. > Maybe a clarification of the uAPI spec is sufficient to achieve this, > ex. !_RUNNING devices may still update their internal state machine > based on external access. Userspace is expected to quiesce all external > access prior to initiating the retrieval of the final device state from > the data section of the migration region. Failure to do so may result > in inconsistent device state or optionally the device driver may induce > a fault if a quiescent state is not maintained. But on the other hand this seem so subtle and tricky way to design a uAPI that devices and users are unlikely to implement it completely correctly. IMHO the point of the state is to control the current behavior of the device - yes we can control behavior on other operations, but it feels obfuscated. Especially when the userspace that needs to know about this isn't even deployed yet, let's just do it cleanly? Jason
On Wed, 20 Oct 2021 15:59:19 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote: > > > I'm wondering if we're imposing extra requirements on the !_RUNNING > > state that don't need to be there. For example, if we can assume that > > all devices within a userspace context are !_RUNNING before any of the > > devices begin to retrieve final state, then clearing of the _RUNNING > > bit becomes the device quiesce point and the beginning of reading > > device data is the point at which the device state is frozen and > > serialized. No new states required and essentially works with a slight > > rearrangement of the callbacks in this series. Why can't we do that? > > It sounds worth checking carefully. I didn't come up with a major > counter scenario. > > We would need to specifically define which user action triggers the > device to freeze and serialize. Reading pending_bytes I suppose? The first read of pending_bytes after clearing the _RUNNING bit would be the logical place to do this since that's what we define as the start of the cycle for reading the device state. "Freezing" the device is a valid implementation, but I don't think it's strictly required per the uAPI. For instance there's no requirement that pending_bytes is reduced by data_size on each iteratio; we specifically only define that the state is complete when the user reads a pending_bytes value of zero. So a driver could restart the device state if the device continues to change (though it's debatable whether triggering an -errno on the next migration region access might be a more supportable approach to enforce that userspace has quiesced external access). > Since freeze is a device command we need to be able to report failure > and to move the state to error, that feels bad hidden inside reading > pending_bytes. That seems like the wrong model. Reading pending_bytes can return -errno should an error occur during freeze/serialize, but device_state is never implicitly modified. Upon encountering an error reading pending_bytes, userspace would abort the migration and move the device_state to a new value. This is the point at which the driver would return another -errno to indicate an unrecoverable internal error, or honor the requested new state. > > Maybe a clarification of the uAPI spec is sufficient to achieve this, > > ex. !_RUNNING devices may still update their internal state machine > > based on external access. Userspace is expected to quiesce all external > > access prior to initiating the retrieval of the final device state from > > the data section of the migration region. Failure to do so may result > > in inconsistent device state or optionally the device driver may induce > > a fault if a quiescent state is not maintained. > > But on the other hand this seem so subtle and tricky way to design a > uAPI that devices and users are unlikely to implement it completely > correctly. I think it's only tricky if device_state is implicitly modified by other actions, stopping all devices before collecting the state of any of them seems like a reasonable requirement. It's the same requirement we'd add with an NDMA bit. > IMHO the point of the state is to control the current behavior of the > device - yes we can control behavior on other operations, but it feels > obfuscated. It is, don't do that. Fail the migration region operation, fail the next device state transition if the internal error is irrecoverable. 
> Especially when the userspace that needs to know about this isn't even > deployed yet, let's just do it cleanly? That's an option, but again I'm wondering if we're imposing a requirement on the !_RUNNING state that doesn't really exist and a clean solution exists already. Thanks, Alex
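Under that model the first pending_bytes read after _RUNNING clears becomes the serialization point, roughly as sketched below; saving_started and mlx5vf_pci_freeze_and_save() are hypothetical names for illustration:

	/* In the migration-region read handler, at the pending_bytes offset,
	 * !iswrite path: */
	if (!mvdev->vmig.saving_started) {
		/* First read after _RUNNING was cleared: freeze and
		 * serialize the device state on demand. */
		ret = mlx5vf_pci_freeze_and_save(mvdev);
		if (ret)
			return ret;	/* surfaces as -errno to userspace;
					 * device_state itself is untouched */
		mvdev->vmig.saving_started = true;
	}
	/* Userspace reacts by writing a new device_state; an unrecoverable
	 * internal error is then reported by failing that write. */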
On Wed, Oct 20 2021, Alex Williamson <alex.williamson@redhat.com> wrote: > On Wed, 20 Oct 2021 15:59:19 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > >> On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote: >> >> > I'm wondering if we're imposing extra requirements on the !_RUNNING >> > state that don't need to be there. For example, if we can assume that >> > all devices within a userspace context are !_RUNNING before any of the >> > devices begin to retrieve final state, then clearing of the _RUNNING >> > bit becomes the device quiesce point and the beginning of reading >> > device data is the point at which the device state is frozen and >> > serialized. No new states required and essentially works with a slight >> > rearrangement of the callbacks in this series. Why can't we do that? >> >> It sounds worth checking carefully. I didn't come up with a major >> counter scenario. >> >> We would need to specifically define which user action triggers the >> device to freeze and serialize. Reading pending_bytes I suppose? > > The first read of pending_bytes after clearing the _RUNNING bit would > be the logical place to do this since that's what we define as the start > of the cycle for reading the device state. > > "Freezing" the device is a valid implementation, but I don't think it's > strictly required per the uAPI. For instance there's no requirement > that pending_bytes is reduced by data_size on each iteratio; we > specifically only define that the state is complete when the user reads > a pending_bytes value of zero. So a driver could restart the device > state if the device continues to change (though it's debatable whether > triggering an -errno on the next migration region access might be a > more supportable approach to enforce that userspace has quiesced > external access). Hm, not so sure. From my reading of the uAPI, transitioning from pre-copy to stop-and-copy (i.e. clearing _RUNNING) implies that we freeze the device (at least, that's how I interpret "On state transition from pre-copy to stop-and-copy, the driver must stop the device, save the device state and send it to the user application through the migration region.") Maybe the uAPI is simply not yet clear enough.
On 10/20/2021 7:25 PM, Jason Gunthorpe wrote: > On Wed, Oct 20, 2021 at 11:01:01AM +0300, Yishai Hadas wrote: >> On 10/19/2021 9:43 PM, Alex Williamson wrote: >>>> + >>>> + /* Resuming switches off */ >>>> + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && >>>> + (old_state & VFIO_DEVICE_STATE_RESUMING)) { >>>> + /* deserialize state into the device */ >>>> + ret = mlx5vf_load_state(mvdev); >>>> + if (ret) { >>>> + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; >>>> + return ret; >>>> + } >>>> + } >>>> + >>>> + /* Resuming switches on */ >>>> + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && >>>> + (state & VFIO_DEVICE_STATE_RESUMING)) { >>>> + mlx5vf_reset_mig_state(mvdev); >>>> + ret = mlx5vf_pci_new_write_window(mvdev); >>>> + if (ret) >>>> + return ret; >>>> + } >>> A couple nits here... >>> >>> Perhaps: >>> >>> if ((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING)) { >>> /* Resuming bit cleared */ >>> if (old_state & VFIO_DEVICE_STATE_RESUMING) { >>> ... >>> } else { /* Resuming bit set */ >>> ... >>> } >>> } >> I tried to avoid nested 'if's as of some previous notes. > The layout of the if blocks must follow a logical progression of > the actions toward the chip: > > start precopy tracking > quiesce > freeze > stop precopy tracking > save state to buffer > clear state buffer > load state from buffer > unfreeze > unquiesce > > The order of the if blocks sets the precendence of the requested > actions, because the bit-field view means userspace can request > multiple actions concurrently. > > When adding the precopy actions into the above list we can see that > the current patches ordering is backwards, save/load should be > swapped. > > Idiomatically each action needs a single edge triggred predicate > listed in the right order. > > The predicates we define here will become the true ABI for qemu to > follow to operate the HW > > Also, the ordering of the predicates influences what usable state > transitions exist. > > So, it is really important to get this right. > >> I run QEMU with 'x-pre-copy-dirty-page-tracking=off' as current driver >> doesn't support dirty-pages. >> >> As so, it seems that this flow wasn't triggered by QEMU in my save/load >> test. >> >>> It seems like there also needs to be a clause in the case where >>> _RUNNING switches off to test if _SAVING is already set and has not >>> toggled. >>> >> This can be achieved by adding the below to current code, this assumes that >> we are fine with nested 'if's coding. > Like this: > > if ((flipped_bits & (RUNNING | SAVING)) && > ((old_state & (RUNNING | SAVING)) == (RUNNING|SAVING)) > /* enter pre-copy state */ > > if ((flipped_bits & (RUNNING | SAVING)) && > ((old_state & (RUNNING | SAVING)) != (RUNNING|SAVING)) > /* exit pre-copy state */ > > if ((flipped_bits & (RUNNING | SAVING)) && > ((old_state & (RUNNING | SAVING)) == SAVING)) > mlx5vf_pci_save_device_data() > > Jason OK, V3 will follow that. Note: In the above old_state needs to be the new state. Thanks, Yishai
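Spelling out those predicates against the new state value, a sketch (run_save is just a local shorthand, and the block is assumed to sit after the quiesce/freeze clause so the device state is already stable):

	u32 run_save = VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING;

	if ((flipped_bits & run_save) && (state & run_save) == run_save) {
		/* enter pre-copy: nothing to start for this driver yet;
		 * pending_bytes simply reports 0 while in this state */
	}

	/* stop-and-copy: _SAVING set with _RUNNING clear */
	if ((flipped_bits & run_save) &&
	    (state & run_save) == VFIO_DEVICE_STATE_SAVING) {
		ret = mlx5vf_pci_save_device_data(mvdev);
		if (ret) {
			vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR;
			return ret;
		}
	}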
On Thu, 21 Oct 2021 11:34:00 +0200 Cornelia Huck <cohuck@redhat.com> wrote: > On Wed, Oct 20 2021, Alex Williamson <alex.williamson@redhat.com> wrote: > > > On Wed, 20 Oct 2021 15:59:19 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > >> On Wed, Oct 20, 2021 at 10:52:30AM -0600, Alex Williamson wrote: > >> > >> > I'm wondering if we're imposing extra requirements on the !_RUNNING > >> > state that don't need to be there. For example, if we can assume that > >> > all devices within a userspace context are !_RUNNING before any of the > >> > devices begin to retrieve final state, then clearing of the _RUNNING > >> > bit becomes the device quiesce point and the beginning of reading > >> > device data is the point at which the device state is frozen and > >> > serialized. No new states required and essentially works with a slight > >> > rearrangement of the callbacks in this series. Why can't we do that? > >> > >> It sounds worth checking carefully. I didn't come up with a major > >> counter scenario. > >> > >> We would need to specifically define which user action triggers the > >> device to freeze and serialize. Reading pending_bytes I suppose? > > > > The first read of pending_bytes after clearing the _RUNNING bit would > > be the logical place to do this since that's what we define as the start > > of the cycle for reading the device state. > > > > "Freezing" the device is a valid implementation, but I don't think it's > > strictly required per the uAPI. For instance there's no requirement > > that pending_bytes is reduced by data_size on each iteratio; we > > specifically only define that the state is complete when the user reads > > a pending_bytes value of zero. So a driver could restart the device > > state if the device continues to change (though it's debatable whether > > triggering an -errno on the next migration region access might be a > > more supportable approach to enforce that userspace has quiesced > > external access). > > Hm, not so sure. From my reading of the uAPI, transitioning from > pre-copy to stop-and-copy (i.e. clearing _RUNNING) implies that we > freeze the device (at least, that's how I interpret "On state transition > from pre-copy to stop-and-copy, the driver must stop the device, save > the device state and send it to the user application through the > migration region.") "[S]end it to the user application through the migration region" is certainly not something that's encompassed just by clearing the _RUNNING bit. There's a sequence of operations there. If the device is quiesced for outbound DMA and frozen from inbound DMA (or can reasonably expect no further inbound DMA) before the user reads the data, I think that meets the description. We can certainly clarify the spec in the process if we agree that we can do this without adding another state bit. I recall that we previously suggested a very strict interpretation of clearing the _RUNNING bit, but again I'm questioning if that's a real requirement or simply a nice-to-have feature for some undefined debugging capability. In raising the p2p DMA issue, we can see that a hard stop independent of other devices is not really practical but I also don't see that introducing a new state bit solves this problem any more elegantly than proposed here. Thanks, Alex
On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote: > I recall that we previously suggested a very strict interpretation of > clearing the _RUNNING bit, but again I'm questioning if that's a real > requirement or simply a nice-to-have feature for some undefined > debugging capability. In raising the p2p DMA issue, we can see that a > hard stop independent of other devices is not really practical but I > also don't see that introducing a new state bit solves this problem any > more elegantly than proposed here. Thanks, I still disagree with this - the level of 'frozenness' of a device is something that belongs in the defined state exposed to userspace, not as a hidden internal state that userspace can't see. It makes the state transitions asymmetric between suspend/resume as resume does have a defined uAPI state for each level of frozeness and suspend does not. With the extra bit resume does: 0000, 0100, 1000, 0001 And suspend does: 0001, 1001, 0010, 0000 However, without the extra bit suspend is only 001, 010, 000 With hidden state inside the 010 Jason
On Mon, 25 Oct 2021 09:29:38 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote: > > I recall that we previously suggested a very strict interpretation of > > clearing the _RUNNING bit, but again I'm questioning if that's a real > > requirement or simply a nice-to-have feature for some undefined > > debugging capability. In raising the p2p DMA issue, we can see that a > > hard stop independent of other devices is not really practical but I > > also don't see that introducing a new state bit solves this problem any > > more elegantly than proposed here. Thanks, > > I still disagree with this - the level of 'frozenness' of a device is > something that belongs in the defined state exposed to userspace, not > as a hidden internal state that userspace can't see. > > It makes the state transitions asymmetric between suspend/resume as > resume does have a defined uAPI state for each level of frozeness and > suspend does not. > > With the extra bit resume does: > > 0000, 0100, 1000, 0001 > > And suspend does: > > 0001, 1001, 0010, 0000 > > However, without the extra bit suspend is only > > 001, 010, 000 > > With hidden state inside the 010 And what is the device supposed to do if it receives a DMA while in this strictly defined stopped state? If it generates an unsupported request, that can trigger a fatal platform error. If it silently drops the DMA, then we have data loss. We're defining a catch-22 scenario for drivers versus placing the onus on the user to quiesce the set of devices in order to consider the migration status as valid. Thanks, Alex
On Mon, Oct 25, 2021 at 08:28:57AM -0600, Alex Williamson wrote: > On Mon, 25 Oct 2021 09:29:38 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote: > > > I recall that we previously suggested a very strict interpretation of > > > clearing the _RUNNING bit, but again I'm questioning if that's a real > > > requirement or simply a nice-to-have feature for some undefined > > > debugging capability. In raising the p2p DMA issue, we can see that a > > > hard stop independent of other devices is not really practical but I > > > also don't see that introducing a new state bit solves this problem any > > > more elegantly than proposed here. Thanks, > > > > I still disagree with this - the level of 'frozenness' of a device is > > something that belongs in the defined state exposed to userspace, not > > as a hidden internal state that userspace can't see. > > > > It makes the state transitions asymmetric between suspend/resume as > > resume does have a defined uAPI state for each level of frozeness and > > suspend does not. > > > > With the extra bit resume does: > > > > 0000, 0100, 1000, 0001 > > > > And suspend does: > > > > 0001, 1001, 0010, 0000 > > > > However, without the extra bit suspend is only > > > > 001, 010, 000 > > > > With hidden state inside the 010 > > And what is the device supposed to do if it receives a DMA while in > this strictly defined stopped state? If it generates an unsupported > request, that can trigger a fatal platform error. I don't see that this question changes anything, we always have a state where the device is unable to respond to incoming DMA. In all cases entry to this state is triggered only by user space action, if userspace does the ioctls in the wrong order then it will hit it. > If it silently drops the DMA, then we have data loss. We're > defining a catch-22 scenario for drivers versus placing the onus on > the user to quiesce the set of devices in order to consider the > migration status as valid. The device should error the TLP. Userspace must globally fence access to the device before it enters the device into the state where it errors TLPs. This is also why I don't like it being so transparent as it is something userspace needs to care about - especially if the HW cannot support such a thing, if we intend to allow that. Jason
* Alex Williamson (alex.williamson@redhat.com) wrote: > [Cc +dgilbert, +cohuck] > > On Wed, 20 Oct 2021 11:28:04 +0300 > Yishai Hadas <yishaih@nvidia.com> wrote: > > > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote: > > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote: > > >> I think that gives us this table: > > >> > > >> | NDMA | RESUMING | SAVING | RUNNING | > > >> +----------+----------+----------+----------+ --- > > >> | X | 0 | 0 | 0 | ^ > > >> +----------+----------+----------+----------+ | > > >> | 0 | 0 | 0 | 1 | | > > >> +----------+----------+----------+----------+ | > > >> | X | 0 | 1 | 0 | > > >> +----------+----------+----------+----------+ NDMA value is either compatible > > >> | 0 | 0 | 1 | 1 | to existing behavior or don't > > >> +----------+----------+----------+----------+ care due to redundancy vs > > >> | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR > > >> +----------+----------+----------+----------+ > > >> | X | 1 | 0 | 1 | | > > >> +----------+----------+----------+----------+ | > > >> | X | 1 | 1 | 0 | | > > >> +----------+----------+----------+----------+ | > > >> | X | 1 | 1 | 1 | v > > >> +----------+----------+----------+----------+ --- > > >> | 1 | 0 | 0 | 1 | ^ > > >> +----------+----------+----------+----------+ Desired new useful cases > > >> | 1 | 0 | 1 | 1 | v > > >> +----------+----------+----------+----------+ --- > > >> > > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can > > >> set which are simply redundant to the NDMA = 0 cases. > > > It seems right > > > > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING > > >> | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 > > >> are error states and cannot be set by userspace. > > > I wonder, did Yishai's series capture this row 6 restriction? Yishai? > > > > > > It seems so, by using the below check which includes the > > !VFIO_DEVICE_STATE_VALID clause. > > > > if (old_state == VFIO_DEVICE_STATE_ERROR || > > !VFIO_DEVICE_STATE_VALID(state) || > > (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > > return -EINVAL; > > > > Which is: > > > > #define VFIO_DEVICE_STATE_VALID(state) \ > > (state & VFIO_DEVICE_STATE_RESUMING ? \ > > (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) > > > > > > > >> Like other bits, setting the bit should be effective at the completion > > >> of writing device state. Therefore the device would need to flush any > > >> outbound DMA queues before returning. > > > Yes, the device commands are expected to achieve this. > > > > > >> The question I was really trying to get to though is whether we have a > > >> supportable interface without such an extension. There's currently > > >> only an experimental version of vfio migration support for PCI devices > > >> in QEMU (afaik), > > > If I recall this only matters if you have a VM that is causing > > > migratable devices to interact with each other. So long as the devices > > > are only interacting with the CPU this extra step is not strictly > > > needed. > > > > > > So, single device cases can be fine as-is > > > > > > IMHO the multi-device case the VMM should probably demand this support > > > from the migration drivers, otherwise it cannot know if it is safe for > > > sure. > > > > > > A config option to override the block if the admin knows there is no > > > use case to cause devices to interact - eg two NVMe devices without > > > CMB do not have a useful interaction. 
> > > > > >> so it seems like we could make use of the bus-master bit to fill > > >> this gap in QEMU currently, before we claim non-experimental > > >> support, but this new device agnostic extension would be required > > >> for non-PCI device support (and PCI support should adopt it as > > >> available). Does that sound right? Thanks, > > > I don't think the bus master support is really a substitute, tripping > > > bus master will stop DMA but it will not do so in a clean way and is > > > likely to be non-transparent to the VM's driver. > > > > > > The single-device-assigned case is a cleaner restriction, IMHO. > > > > > > Alternatively we can add the 4th bit and insist that migration drivers > > > support all the states. I'm just unsure what other HW can do, I get > > > the feeling people have been designing to the migration description in > > > the header file for a while and this is a new idea. > > I'm wondering if we're imposing extra requirements on the !_RUNNING > state that don't need to be there. For example, if we can assume that > all devices within a userspace context are !_RUNNING before any of the > devices begin to retrieve final state, then clearing of the _RUNNING > bit becomes the device quiesce point and the beginning of reading > device data is the point at which the device state is frozen and > serialized. No new states required and essentially works with a slight > rearrangement of the callbacks in this series. Why can't we do that? So without me actually understanding your bit encodings that closely, I think the problem is we have to asusme that any transition takes time. From the QEMU point of view I think the requirement is when we stop the machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in migration_completion) that at the point that call returns (with no error) all devices are idle. That means you need a way to command the device to go into the stopped state, and probably another to make sure it's got there. Now, you could be a *little* more sloppy; you could allow a device carry on doing stuff purely with it's own internal state up until the point it needs to serialise; but that would have to be strictly internal state only - if it can change any other devices state (or issue an interrupt, change RAM etc) then you get into ordering issues on the serialisation of multiple devices. Dave > Maybe a clarification of the uAPI spec is sufficient to achieve this, > ex. !_RUNNING devices may still update their internal state machine > based on external access. Userspace is expected to quiesce all external > access prior to initiating the retrieval of the final device state from > the data section of the migration region. Failure to do so may result > in inconsistent device state or optionally the device driver may induce > a fault if a quiescent state is not maintained. > > > Just to be sure, > > > > We refer here to some future functionality support with this extra 4th > > bit but it doesn't enforce any change in the submitted code, right ? > > > > The below code uses the (state & ~MLX5VF_SUPPORTED_DEVICE_STATES) clause > > which fails any usage of a non-supported bit as of this one. > > > > if (old_state == VFIO_DEVICE_STATE_ERROR || > > !VFIO_DEVICE_STATE_VALID(state) || > > (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > > return -EINVAL; > > Correct, userspace shouldn't be setting any extra bits unless we > advertise support, such as via a capability or flag. 
Drivers need to > continue to sanitize user input to validate yet-to-be-defined bits are > not accepted from userspace or else we risk not being able to define > them later without breaking userspace. Thanks, > > Alex >
On Mon, 25 Oct 2021 17:34:01 +0100 "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote: > * Alex Williamson (alex.williamson@redhat.com) wrote: > > [Cc +dgilbert, +cohuck] > > > > On Wed, 20 Oct 2021 11:28:04 +0300 > > Yishai Hadas <yishaih@nvidia.com> wrote: > > > > > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote: > > > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote: > > > >> I think that gives us this table: > > > >> > > > >> | NDMA | RESUMING | SAVING | RUNNING | > > > >> +----------+----------+----------+----------+ --- > > > >> | X | 0 | 0 | 0 | ^ > > > >> +----------+----------+----------+----------+ | > > > >> | 0 | 0 | 0 | 1 | | > > > >> +----------+----------+----------+----------+ | > > > >> | X | 0 | 1 | 0 | > > > >> +----------+----------+----------+----------+ NDMA value is either compatible > > > >> | 0 | 0 | 1 | 1 | to existing behavior or don't > > > >> +----------+----------+----------+----------+ care due to redundancy vs > > > >> | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR > > > >> +----------+----------+----------+----------+ > > > >> | X | 1 | 0 | 1 | | > > > >> +----------+----------+----------+----------+ | > > > >> | X | 1 | 1 | 0 | | > > > >> +----------+----------+----------+----------+ | > > > >> | X | 1 | 1 | 1 | v > > > >> +----------+----------+----------+----------+ --- > > > >> | 1 | 0 | 0 | 1 | ^ > > > >> +----------+----------+----------+----------+ Desired new useful cases > > > >> | 1 | 0 | 1 | 1 | v > > > >> +----------+----------+----------+----------+ --- > > > >> > > > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can > > > >> set which are simply redundant to the NDMA = 0 cases. > > > > It seems right > > > > > > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING > > > >> | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 > > > >> are error states and cannot be set by userspace. > > > > I wonder, did Yishai's series capture this row 6 restriction? Yishai? > > > > > > > > > It seems so, by using the below check which includes the > > > !VFIO_DEVICE_STATE_VALID clause. > > > > > > if (old_state == VFIO_DEVICE_STATE_ERROR || > > > !VFIO_DEVICE_STATE_VALID(state) || > > > (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > > > return -EINVAL; > > > > > > Which is: > > > > > > #define VFIO_DEVICE_STATE_VALID(state) \ > > > (state & VFIO_DEVICE_STATE_RESUMING ? \ > > > (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) > > > > > > > > > > >> Like other bits, setting the bit should be effective at the completion > > > >> of writing device state. Therefore the device would need to flush any > > > >> outbound DMA queues before returning. > > > > Yes, the device commands are expected to achieve this. > > > > > > > >> The question I was really trying to get to though is whether we have a > > > >> supportable interface without such an extension. There's currently > > > >> only an experimental version of vfio migration support for PCI devices > > > >> in QEMU (afaik), > > > > If I recall this only matters if you have a VM that is causing > > > > migratable devices to interact with each other. So long as the devices > > > > are only interacting with the CPU this extra step is not strictly > > > > needed. > > > > > > > > So, single device cases can be fine as-is > > > > > > > > IMHO the multi-device case the VMM should probably demand this support > > > > from the migration drivers, otherwise it cannot know if it is safe for > > > > sure. 
> > > > > > > > A config option to override the block if the admin knows there is no > > > > use case to cause devices to interact - eg two NVMe devices without > > > > CMB do not have a useful interaction. > > > > > > > >> so it seems like we could make use of the bus-master bit to fill > > > >> this gap in QEMU currently, before we claim non-experimental > > > >> support, but this new device agnostic extension would be required > > > >> for non-PCI device support (and PCI support should adopt it as > > > >> available). Does that sound right? Thanks, > > > > I don't think the bus master support is really a substitute, tripping > > > > bus master will stop DMA but it will not do so in a clean way and is > > > > likely to be non-transparent to the VM's driver. > > > > > > > > The single-device-assigned case is a cleaner restriction, IMHO. > > > > > > > > Alternatively we can add the 4th bit and insist that migration drivers > > > > support all the states. I'm just unsure what other HW can do, I get > > > > the feeling people have been designing to the migration description in > > > > the header file for a while and this is a new idea. > > > > I'm wondering if we're imposing extra requirements on the !_RUNNING > > state that don't need to be there. For example, if we can assume that > > all devices within a userspace context are !_RUNNING before any of the > > devices begin to retrieve final state, then clearing of the _RUNNING > > bit becomes the device quiesce point and the beginning of reading > > device data is the point at which the device state is frozen and > > serialized. No new states required and essentially works with a slight > > rearrangement of the callbacks in this series. Why can't we do that? > > So without me actually understanding your bit encodings that closely, I > think the problem is we have to asusme that any transition takes time. > From the QEMU point of view I think the requirement is when we stop the > machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in > migration_completion) that at the point that call returns (with no > error) all devices are idle. That means you need a way to command the > device to go into the stopped state, and probably another to make sure > it's got there. In a way. We're essentially recognizing that we cannot stop a single device in isolation of others that might participate in peer-to-peer DMA with that device, so we need to make a pass to quiesce each device before we can ask the device to fully stop. This new device state bit is meant to be that quiescent point, devices can accept incoming DMA but should cease to generate any. Once all device are quiesced then we can safely stop them. > Now, you could be a *little* more sloppy; you could allow a device carry > on doing stuff purely with it's own internal state up until the point > it needs to serialise; but that would have to be strictly internal state > only - if it can change any other devices state (or issue an interrupt, > change RAM etc) then you get into ordering issues on the serialisation > of multiple devices. Yep, that's the proposal that doesn't require a uAPI change, we loosen the definition of stopped to mean the device can no longer generate DMA or interrupts and all internal processing outside or responding to incoming DMA should halt (essentially the same as the new quiescent state above). Once all devices are in this state, there should be no incoming DMA and we can safely collect per device migration data. 
If state changes occur beyond the point in time where userspace has initiated the collection of migration data, drivers have options for generating errors when userspace consumes that data. AFAICT, the two approaches are equally valid. If we modify the uAPI to include this new quiescent state then userspace needs to make some hard choices about what configurations they support without such a feature. The majority of configurations are likely not exercising p2p between assigned devices, but the hypervisor can't know that. If we work within the existing uAPI, well there aren't any open source driver implementations yet anyway and any non-upstream implementations would need to be updated for this clarification. Existing userspace works better with no change, so long as they already follow the guideline that all devices in the userspace context must be stopped before the migration data of any device can be considered valid. Thanks, Alex
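For comparison with the NDMA sketch earlier in the thread, the ordering Alex describes for the existing uAPI looks roughly like the following, reusing the hypothetical mig_dev and set_device_state helpers from that sketch; the only contract is that every device is moved out of _RUNNING before any device's migration data is read.

/*
 * Sketch of the "no new state bit" protocol.  Reading pending_bytes is
 * the first point at which a driver may treat its device as frozen, so
 * that read must not happen until every device is !_RUNNING.
 */
static int save_all_devices(struct mig_dev *devs, int n)
{
	uint64_t pending;
	int i;

	/* Pass 1: stop everything (clear _RUNNING, set _SAVING). */
	for (i = 0; i < n; i++)
		if (set_device_state(&devs[i], VFIO_DEVICE_STATE_SAVING))
			return -1;

	/* Pass 2: only now start pulling per-device migration data. */
	for (i = 0; i < n; i++) {
		off_t off = devs[i].mig_offset +
			    offsetof(struct vfio_device_migration_info,
				     pending_bytes);

		if (pread(devs[i].fd, &pending, sizeof(pending), off) !=
		    sizeof(pending))
			return -1;
		/* ... then loop over data_offset/data_size until pending == 0 ... */
	}
	return 0;
}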
* Alex Williamson (alex.williamson@redhat.com) wrote: > On Mon, 25 Oct 2021 17:34:01 +0100 > "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote: > > > * Alex Williamson (alex.williamson@redhat.com) wrote: > > > [Cc +dgilbert, +cohuck] > > > > > > On Wed, 20 Oct 2021 11:28:04 +0300 > > > Yishai Hadas <yishaih@nvidia.com> wrote: > > > > > > > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote: > > > > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote: > > > > >> I think that gives us this table: > > > > >> > > > > >> | NDMA | RESUMING | SAVING | RUNNING | > > > > >> +----------+----------+----------+----------+ --- > > > > >> | X | 0 | 0 | 0 | ^ > > > > >> +----------+----------+----------+----------+ | > > > > >> | 0 | 0 | 0 | 1 | | > > > > >> +----------+----------+----------+----------+ | > > > > >> | X | 0 | 1 | 0 | > > > > >> +----------+----------+----------+----------+ NDMA value is either compatible > > > > >> | 0 | 0 | 1 | 1 | to existing behavior or don't > > > > >> +----------+----------+----------+----------+ care due to redundancy vs > > > > >> | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR > > > > >> +----------+----------+----------+----------+ > > > > >> | X | 1 | 0 | 1 | | > > > > >> +----------+----------+----------+----------+ | > > > > >> | X | 1 | 1 | 0 | | > > > > >> +----------+----------+----------+----------+ | > > > > >> | X | 1 | 1 | 1 | v > > > > >> +----------+----------+----------+----------+ --- > > > > >> | 1 | 0 | 0 | 1 | ^ > > > > >> +----------+----------+----------+----------+ Desired new useful cases > > > > >> | 1 | 0 | 1 | 1 | v > > > > >> +----------+----------+----------+----------+ --- > > > > >> > > > > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can > > > > >> set which are simply redundant to the NDMA = 0 cases. > > > > > It seems right > > > > > > > > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING > > > > >> | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 > > > > >> are error states and cannot be set by userspace. > > > > > I wonder, did Yishai's series capture this row 6 restriction? Yishai? > > > > > > > > > > > > It seems so, by using the below check which includes the > > > > !VFIO_DEVICE_STATE_VALID clause. > > > > > > > > if (old_state == VFIO_DEVICE_STATE_ERROR || > > > > !VFIO_DEVICE_STATE_VALID(state) || > > > > (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > > > > return -EINVAL; > > > > > > > > Which is: > > > > > > > > #define VFIO_DEVICE_STATE_VALID(state) \ > > > > (state & VFIO_DEVICE_STATE_RESUMING ? \ > > > > (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) > > > > > > > > > > > > > >> Like other bits, setting the bit should be effective at the completion > > > > >> of writing device state. Therefore the device would need to flush any > > > > >> outbound DMA queues before returning. > > > > > Yes, the device commands are expected to achieve this. > > > > > > > > > >> The question I was really trying to get to though is whether we have a > > > > >> supportable interface without such an extension. There's currently > > > > >> only an experimental version of vfio migration support for PCI devices > > > > >> in QEMU (afaik), > > > > > If I recall this only matters if you have a VM that is causing > > > > > migratable devices to interact with each other. So long as the devices > > > > > are only interacting with the CPU this extra step is not strictly > > > > > needed. 
> > > > > > > > > > So, single device cases can be fine as-is > > > > > > > > > > IMHO the multi-device case the VMM should probably demand this support > > > > > from the migration drivers, otherwise it cannot know if it is safe for > > > > > sure. > > > > > > > > > > A config option to override the block if the admin knows there is no > > > > > use case to cause devices to interact - eg two NVMe devices without > > > > > CMB do not have a useful interaction. > > > > > > > > > >> so it seems like we could make use of the bus-master bit to fill > > > > >> this gap in QEMU currently, before we claim non-experimental > > > > >> support, but this new device agnostic extension would be required > > > > >> for non-PCI device support (and PCI support should adopt it as > > > > >> available). Does that sound right? Thanks, > > > > > I don't think the bus master support is really a substitute, tripping > > > > > bus master will stop DMA but it will not do so in a clean way and is > > > > > likely to be non-transparent to the VM's driver. > > > > > > > > > > The single-device-assigned case is a cleaner restriction, IMHO. > > > > > > > > > > Alternatively we can add the 4th bit and insist that migration drivers > > > > > support all the states. I'm just unsure what other HW can do, I get > > > > > the feeling people have been designing to the migration description in > > > > > the header file for a while and this is a new idea. > > > > > > I'm wondering if we're imposing extra requirements on the !_RUNNING > > > state that don't need to be there. For example, if we can assume that > > > all devices within a userspace context are !_RUNNING before any of the > > > devices begin to retrieve final state, then clearing of the _RUNNING > > > bit becomes the device quiesce point and the beginning of reading > > > device data is the point at which the device state is frozen and > > > serialized. No new states required and essentially works with a slight > > > rearrangement of the callbacks in this series. Why can't we do that? > > > > So without me actually understanding your bit encodings that closely, I > > think the problem is we have to asusme that any transition takes time. > > From the QEMU point of view I think the requirement is when we stop the > > machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in > > migration_completion) that at the point that call returns (with no > > error) all devices are idle. That means you need a way to command the > > device to go into the stopped state, and probably another to make sure > > it's got there. > > In a way. We're essentially recognizing that we cannot stop a single > device in isolation of others that might participate in peer-to-peer > DMA with that device, so we need to make a pass to quiesce each device > before we can ask the device to fully stop. This new device state bit > is meant to be that quiescent point, devices can accept incoming DMA > but should cease to generate any. Once all device are quiesced then we > can safely stop them. It may need some further refinement; for example in that quiesed state do counters still tick? will a NIC still respond to packets that don't get forwarded to the host? Note I still think you need a way to know when you have actually reached these states; setting a bit in a register is asking nicely for a device to go into a state - has it got there? 
> > Now, you could be a *little* more sloppy; you could allow a device carry > > on doing stuff purely with it's own internal state up until the point > > it needs to serialise; but that would have to be strictly internal state > > only - if it can change any other devices state (or issue an interrupt, > > change RAM etc) then you get into ordering issues on the serialisation > > of multiple devices. > > Yep, that's the proposal that doesn't require a uAPI change, we loosen > the definition of stopped to mean the device can no longer generate DMA > or interrupts and all internal processing outside or responding to > incoming DMA should halt (essentially the same as the new quiescent > state above). Once all devices are in this state, there should be no > incoming DMA and we can safely collect per device migration data. If > state changes occur beyond the point in time where userspace has > initiated the collection of migration data, drivers have options for > generating errors when userspace consumes that data. How do you know that last device has actually gone into that state? Also be careful; it feels much more delicate where something might accidentally start a transaction. > AFAICT, the two approaches are equally valid. If we modify the uAPI to > include this new quiescent state then userspace needs to make some hard > choices about what configurations they support without such a feature. > The majority of configurations are likely not exercising p2p between > assigned devices, but the hypervisor can't know that. If we work > within the existing uAPI, well there aren't any open source driver > implementations yet anyway and any non-upstream implementations would > need to be updated for this clarification. Existing userspace works > better with no change, so long as they already follow the guideline > that all devices in the userspace context must be stopped before the > migration data of any device can be considered valid. Thanks, Dave > Alex >
On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote: > It may need some further refinement; for example in that quiesed state > do counters still tick? will a NIC still respond to packets that don't > get forwarded to the host? At least for the mlx5 NIC the two states are 'able to issue outbound DMA' and 'all internal memories and state are frozen and unchanging'. The latter means it should not respond/count/etc to network packets either. Roughly it is 'able to mutate external state' / 'able to mutate internal state'. The invariant should be that successive calls to serialize the internal state while frozen should return the same serialization. We may find that migratable PCI functions must have some limited functionality. Ie anything with a realtime component - for instance a sync-ethernet clock synchronization control loop - may become challenging to migrate without creating a serious functional distortion. Jason
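As a rough model of the two levels described here, the split can be pictured as two independent driver-side transitions. This is a sketch only: hw_quiesce() and hw_freeze() are illustrative stand-ins, not actual mlx5 firmware commands.

#include <stdbool.h>

/* Hypothetical hardware hooks, assumed to block until complete. */
extern int hw_quiesce(void);	/* fence outbound DMA, flush send queues */
extern int hw_freeze(void);	/* stop all internal state machines      */

struct mig_hw_state {
	bool can_mutate_external;	/* may issue DMA/interrupts     */
	bool can_mutate_internal;	/* internal memories may change */
};

static int enter_quiesced(struct mig_hw_state *s)
{
	int ret = hw_quiesce();

	if (ret)
		return ret;
	/* Internal state (counters, etc.) may still change until frozen. */
	s->can_mutate_external = false;
	return 0;
}

static int enter_frozen(struct mig_hw_state *s)
{
	int ret = hw_freeze();

	if (ret)
		return ret;
	s->can_mutate_internal = false;
	/*
	 * Invariant while frozen: serializing the internal state twice,
	 * with no RUNNING/RESUMING transition in between, must produce
	 * identical bytes.
	 */
	return 0;
}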
* Jason Gunthorpe (jgg@nvidia.com) wrote: > On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote: > > > It may need some further refinement; for example in that quiesed state > > do counters still tick? will a NIC still respond to packets that don't > > get forwarded to the host? > > At least for the mlx5 NIC the two states are 'able to issue outbound > DMA' and 'all internal memories and state are frozen and unchanging'. Yeh, so my point was just that if you're adding a new state to this process, you need to define the details like that. Dave > The later means it should not respond/count/etc to network packets > either. > > Roughly it is > 'able to mutate external state' / 'able to mutate internal state' > > The invariant should be that successive calls to serialize the > internal state while frozen should return the same serialization. > > We may find that migratable PCI functions must have some limited > functionality. Ie anything with a realtime compoment - for instance a > sync-ethernet clock synchronization control loop, may become > challenging to migrate without creating a serious functional > distortion. > > Jason >
On Tue, Oct 26, 2021 at 09:40:34AM +0100, Dr. David Alan Gilbert wrote: > * Jason Gunthorpe (jgg@nvidia.com) wrote: > > On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote: > > > > > It may need some further refinement; for example in that quiesed state > > > do counters still tick? will a NIC still respond to packets that don't > > > get forwarded to the host? > > > > At least for the mlx5 NIC the two states are 'able to issue outbound > > DMA' and 'all internal memories and state are frozen and unchanging'. > > Yeh, so my point was just that if you're adding a new state to this > process, you need to define the details like that. We are not planning to propose any patches/uAPI specification for this problem until after the mlx5 vfio driver is merged.. Jason
On Mon, 25 Oct 2021 19:47:29 +0100 "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote: > * Alex Williamson (alex.williamson@redhat.com) wrote: > > On Mon, 25 Oct 2021 17:34:01 +0100 > > "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote: > > > > > * Alex Williamson (alex.williamson@redhat.com) wrote: > > > > [Cc +dgilbert, +cohuck] > > > > > > > > On Wed, 20 Oct 2021 11:28:04 +0300 > > > > Yishai Hadas <yishaih@nvidia.com> wrote: > > > > > > > > > On 10/20/2021 2:04 AM, Jason Gunthorpe wrote: > > > > > > On Tue, Oct 19, 2021 at 02:58:56PM -0600, Alex Williamson wrote: > > > > > >> I think that gives us this table: > > > > > >> > > > > > >> | NDMA | RESUMING | SAVING | RUNNING | > > > > > >> +----------+----------+----------+----------+ --- > > > > > >> | X | 0 | 0 | 0 | ^ > > > > > >> +----------+----------+----------+----------+ | > > > > > >> | 0 | 0 | 0 | 1 | | > > > > > >> +----------+----------+----------+----------+ | > > > > > >> | X | 0 | 1 | 0 | > > > > > >> +----------+----------+----------+----------+ NDMA value is either compatible > > > > > >> | 0 | 0 | 1 | 1 | to existing behavior or don't > > > > > >> +----------+----------+----------+----------+ care due to redundancy vs > > > > > >> | X | 1 | 0 | 0 | !_RUNNING/INVALID/ERROR > > > > > >> +----------+----------+----------+----------+ > > > > > >> | X | 1 | 0 | 1 | | > > > > > >> +----------+----------+----------+----------+ | > > > > > >> | X | 1 | 1 | 0 | | > > > > > >> +----------+----------+----------+----------+ | > > > > > >> | X | 1 | 1 | 1 | v > > > > > >> +----------+----------+----------+----------+ --- > > > > > >> | 1 | 0 | 0 | 1 | ^ > > > > > >> +----------+----------+----------+----------+ Desired new useful cases > > > > > >> | 1 | 0 | 1 | 1 | v > > > > > >> +----------+----------+----------+----------+ --- > > > > > >> > > > > > >> Specifically, rows 1, 3, 5 with NDMA = 1 are valid states a user can > > > > > >> set which are simply redundant to the NDMA = 0 cases. > > > > > > It seems right > > > > > > > > > > > >> Row 6 remains invalid due to lack of support for pre-copy (_RESUMING > > > > > >> | _RUNNING) and therefore cannot be set by userspace. Rows 7 & 8 > > > > > >> are error states and cannot be set by userspace. > > > > > > I wonder, did Yishai's series capture this row 6 restriction? Yishai? > > > > > > > > > > > > > > > It seems so, by using the below check which includes the > > > > > !VFIO_DEVICE_STATE_VALID clause. > > > > > > > > > > if (old_state == VFIO_DEVICE_STATE_ERROR || > > > > > !VFIO_DEVICE_STATE_VALID(state) || > > > > > (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) > > > > > return -EINVAL; > > > > > > > > > > Which is: > > > > > > > > > > #define VFIO_DEVICE_STATE_VALID(state) \ > > > > > (state & VFIO_DEVICE_STATE_RESUMING ? \ > > > > > (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1) > > > > > > > > > > > > > > > > >> Like other bits, setting the bit should be effective at the completion > > > > > >> of writing device state. Therefore the device would need to flush any > > > > > >> outbound DMA queues before returning. > > > > > > Yes, the device commands are expected to achieve this. > > > > > > > > > > > >> The question I was really trying to get to though is whether we have a > > > > > >> supportable interface without such an extension. 
There's currently > > > > > >> only an experimental version of vfio migration support for PCI devices > > > > > >> in QEMU (afaik), > > > > > > If I recall this only matters if you have a VM that is causing > > > > > > migratable devices to interact with each other. So long as the devices > > > > > > are only interacting with the CPU this extra step is not strictly > > > > > > needed. > > > > > > > > > > > > So, single device cases can be fine as-is > > > > > > > > > > > > IMHO the multi-device case the VMM should probably demand this support > > > > > > from the migration drivers, otherwise it cannot know if it is safe for > > > > > > sure. > > > > > > > > > > > > A config option to override the block if the admin knows there is no > > > > > > use case to cause devices to interact - eg two NVMe devices without > > > > > > CMB do not have a useful interaction. > > > > > > > > > > > >> so it seems like we could make use of the bus-master bit to fill > > > > > >> this gap in QEMU currently, before we claim non-experimental > > > > > >> support, but this new device agnostic extension would be required > > > > > >> for non-PCI device support (and PCI support should adopt it as > > > > > >> available). Does that sound right? Thanks, > > > > > > I don't think the bus master support is really a substitute, tripping > > > > > > bus master will stop DMA but it will not do so in a clean way and is > > > > > > likely to be non-transparent to the VM's driver. > > > > > > > > > > > > The single-device-assigned case is a cleaner restriction, IMHO. > > > > > > > > > > > > Alternatively we can add the 4th bit and insist that migration drivers > > > > > > support all the states. I'm just unsure what other HW can do, I get > > > > > > the feeling people have been designing to the migration description in > > > > > > the header file for a while and this is a new idea. > > > > > > > > I'm wondering if we're imposing extra requirements on the !_RUNNING > > > > state that don't need to be there. For example, if we can assume that > > > > all devices within a userspace context are !_RUNNING before any of the > > > > devices begin to retrieve final state, then clearing of the _RUNNING > > > > bit becomes the device quiesce point and the beginning of reading > > > > device data is the point at which the device state is frozen and > > > > serialized. No new states required and essentially works with a slight > > > > rearrangement of the callbacks in this series. Why can't we do that? > > > > > > So without me actually understanding your bit encodings that closely, I > > > think the problem is we have to asusme that any transition takes time. > > > From the QEMU point of view I think the requirement is when we stop the > > > machine (vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) in > > > migration_completion) that at the point that call returns (with no > > > error) all devices are idle. That means you need a way to command the > > > device to go into the stopped state, and probably another to make sure > > > it's got there. > > > > In a way. We're essentially recognizing that we cannot stop a single > > device in isolation of others that might participate in peer-to-peer > > DMA with that device, so we need to make a pass to quiesce each device > > before we can ask the device to fully stop. This new device state bit > > is meant to be that quiescent point, devices can accept incoming DMA > > but should cease to generate any. Once all device are quiesced then we > > can safely stop them. 
> > It may need some further refinement; for example in that quiesed state > do counters still tick? will a NIC still respond to packets that don't > get forwarded to the host? I'd think no, but I imagine it's largely device specific to what extent a device can be fully halted yet minimally handle incoming DMA. > Note I still think you need a way to know when you have actually reached > these states; setting a bit in a register is asking nicely for a device > to go into a state - has it got there? It's more than asking nicely, we define the device_state bits as synchronous, the device needs to enter the state before returning from the write operation or return an errno. > > > Now, you could be a *little* more sloppy; you could allow a device carry > > > on doing stuff purely with it's own internal state up until the point > > > it needs to serialise; but that would have to be strictly internal state > > > only - if it can change any other devices state (or issue an interrupt, > > > change RAM etc) then you get into ordering issues on the serialisation > > > of multiple devices. > > > > Yep, that's the proposal that doesn't require a uAPI change, we loosen > > the definition of stopped to mean the device can no longer generate DMA > > or interrupts and all internal processing outside or responding to > > incoming DMA should halt (essentially the same as the new quiescent > > state above). Once all devices are in this state, there should be no > > incoming DMA and we can safely collect per device migration data. If > > state changes occur beyond the point in time where userspace has > > initiated the collection of migration data, drivers have options for > > generating errors when userspace consumes that data. > > How do you know that last device has actually gone into that state? Each device cannot, the burden is on the user to make sure all devices are stopped before proceeding to read migration data. > Also be careful; it feels much more delicate where something might > accidentally start a transaction. This sounds like a discussion of theoretically broken drivers. Like the above device_state, drivers still have a synchronization point when the user reads the pending_bytes field to initiate retrieving the device state. If the implementation requires the device to be fully stopped to snapshot the device state to provide to the user, this is where that would happen. Thanks, Alex
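To make the "synchronous device_state write" contract concrete, here is a hedged driver-side sketch; my_mig_dev, MY_SUPPORTED_STATES and set_state_in_hw() are invented names, while the VFIO_DEVICE_STATE_* macros are from the uAPI header and the validity check mirrors the mlx5 snippet quoted earlier in the thread.

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/vfio.h>		/* state bits, VFIO_DEVICE_STATE_VALID() */

#define MY_SUPPORTED_STATES	(VFIO_DEVICE_STATE_RUNNING | \
				 VFIO_DEVICE_STATE_SAVING  | \
				 VFIO_DEVICE_STATE_RESUMING)

struct my_mig_dev {
	u32 state;
};

/* Hypothetical: issues a device command and returns only once the
 * hardware has actually reached the requested state. */
extern int set_state_in_hw(struct my_mig_dev *mdev, u32 state);

static ssize_t handle_device_state_write(struct my_mig_dev *mdev, u32 new_state)
{
	int ret;

	if (mdev->state == VFIO_DEVICE_STATE_ERROR ||
	    !VFIO_DEVICE_STATE_VALID(new_state) ||
	    (new_state & ~MY_SUPPORTED_STATES))
		return -EINVAL;

	ret = set_state_in_hw(mdev, new_state);
	if (ret) {
		/* Could not reach the state: report ERROR so userspace
		 * knows the device is no longer trustworthy. */
		mdev->state = VFIO_DEVICE_STATE_ERROR;
		return ret;
	}

	mdev->state = new_state;
	return sizeof(new_state);	/* the write completes only now */
}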
On Mon, 25 Oct 2021 11:56:46 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Mon, Oct 25, 2021 at 08:28:57AM -0600, Alex Williamson wrote: > > On Mon, 25 Oct 2021 09:29:38 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Thu, Oct 21, 2021 at 03:47:29PM -0600, Alex Williamson wrote: > > > > I recall that we previously suggested a very strict interpretation of > > > > clearing the _RUNNING bit, but again I'm questioning if that's a real > > > > requirement or simply a nice-to-have feature for some undefined > > > > debugging capability. In raising the p2p DMA issue, we can see that a > > > > hard stop independent of other devices is not really practical but I > > > > also don't see that introducing a new state bit solves this problem any > > > > more elegantly than proposed here. Thanks, > > > > > > I still disagree with this - the level of 'frozenness' of a device is > > > something that belongs in the defined state exposed to userspace, not > > > as a hidden internal state that userspace can't see. > > > > > > It makes the state transitions asymmetric between suspend/resume as > > > resume does have a defined uAPI state for each level of frozeness and > > > suspend does not. > > > > > > With the extra bit resume does: > > > > > > 0000, 0100, 1000, 0001 > > > > > > And suspend does: > > > > > > 0001, 1001, 0010, 0000 > > > > > > However, without the extra bit suspend is only > > > > > > 001, 010, 000 > > > > > > With hidden state inside the 010 > > > > And what is the device supposed to do if it receives a DMA while in > > this strictly defined stopped state? If it generates an unsupported > > request, that can trigger a fatal platform error. > > I don't see that this question changes anything, we always have a > state where the device is unable to respond to incoming DMA. I think that depends on the device implementation. If all devices can receive incoming DMA, but all devices are also quiesced not to send DMA, there's not necessarily a need to put the device in a state where it errors TLPs. This ventures into conversations about why assigning VFs can be considered safer than assigning PFs, users cannot disable memory space of VFs and therefore cannot generate URs on writes to MMIO, which may generate fatal faults on some platforms. If we create a uAPI that requires dropping TLPs, then we provide userspace with a means to specifically generate those faults. > In all cases entry to this state is triggered only by user space > action, if userspace does the ioctls in the wrong order then it will > hit it. And if userspace does not quiesce DMA and gets an intermediate device state, that's a failure to follow the protocol. > > If it silently drops the DMA, then we have data loss. We're > > defining a catch-22 scenario for drivers versus placing the onus on > > the user to quiesce the set of devices in order to consider the > > migration status as valid. > > The device should error the TLP. That's a bit of a landmine as outlined above. > Userspace must globally fence access to the device before it enters > the device into the state where it errors TLPs. > > This is also why I don't like it being so transparent as it is > something userspace needs to care about - especially if the HW cannot > support such a thing, if we intend to allow that. Userspace does need to care, but userspace's concern over this should not be able to compromise the platform and therefore making VF assignment more susceptible to fatal error conditions to comply with a migration uAPI is troublesome for me. 
Thanks, Alex
* Alex Williamson (alex.williamson@redhat.com) wrote: > On Mon, 25 Oct 2021 19:47:29 +0100 > "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote: > > > * Alex Williamson (alex.williamson@redhat.com) wrote: > > > On Mon, 25 Oct 2021 17:34:01 +0100 > > > "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote: > > > > > > > * Alex Williamson (alex.williamson@redhat.com) wrote: > > > > > [Cc +dgilbert, +cohuck] > > > > > > > > > > On Wed, 20 Oct 2021 11:28:04 +0300 > > > > > Yishai Hadas <yishaih@nvidia.com> wrote: > > > > > <snip> > > > In a way. We're essentially recognizing that we cannot stop a single > > > device in isolation of others that might participate in peer-to-peer > > > DMA with that device, so we need to make a pass to quiesce each device > > > before we can ask the device to fully stop. This new device state bit > > > is meant to be that quiescent point, devices can accept incoming DMA > > > but should cease to generate any. Once all device are quiesced then we > > > can safely stop them. > > > > It may need some further refinement; for example in that quiesed state > > do counters still tick? will a NIC still respond to packets that don't > > get forwarded to the host? > > I'd think no, but I imagine it's largely device specific to what extent > a device can be fully halted yet minimally handle incoming DMA. That's what worries me; we're adding a new state here as we understand more about trying to implement a device; but it seems that we need to nail something down as to what the state means. > > Note I still think you need a way to know when you have actually reached > > these states; setting a bit in a register is asking nicely for a device > > to go into a state - has it got there? > > It's more than asking nicely, we define the device_state bits as > synchronous, the device needs to enter the state before returning from > the write operation or return an errno. I don't see how it can be synchronous in practice; can it really wait to complete if it has to take many cycles to finish off an inflight DMA before it transitions? > > > > Now, you could be a *little* more sloppy; you could allow a device carry > > > > on doing stuff purely with it's own internal state up until the point > > > > it needs to serialise; but that would have to be strictly internal state > > > > only - if it can change any other devices state (or issue an interrupt, > > > > change RAM etc) then you get into ordering issues on the serialisation > > > > of multiple devices. > > > > > > Yep, that's the proposal that doesn't require a uAPI change, we loosen > > > the definition of stopped to mean the device can no longer generate DMA > > > or interrupts and all internal processing outside or responding to > > > incoming DMA should halt (essentially the same as the new quiescent > > > state above). Once all devices are in this state, there should be no > > > incoming DMA and we can safely collect per device migration data. If > > > state changes occur beyond the point in time where userspace has > > > initiated the collection of migration data, drivers have options for > > > generating errors when userspace consumes that data. > > > > How do you know that last device has actually gone into that state? > > Each device cannot, the burden is on the user to make sure all devices > are stopped before proceeding to read migration data. Yeh this really ties to the previous question; if it's synchronous you're OK. 
> > Also be careful; it feels much more delicate where something might > > accidentally start a transaction. > > This sounds like a discussion of theoretically broken drivers. Like > the above device_state, drivers still have a synchronization point when > the user reads the pending_bytes field to initiate retrieving the > device state. If the implementation requires the device to be fully > stopped to snapshot the device state to provide to the user, this is > where that would happen. Thanks, Yes, but I worry that some ways of definining it are harder to get right in drivers, so less likely to be theoretical. Dave > Alex >
On Tue, 26 Oct 2021 09:13:53 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Oct 26, 2021 at 09:40:34AM +0100, Dr. David Alan Gilbert wrote: > > * Jason Gunthorpe (jgg@nvidia.com) wrote: > > > On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote: > > > > > > > It may need some further refinement; for example in that quiesed state > > > > do counters still tick? will a NIC still respond to packets that don't > > > > get forwarded to the host? > > > > > > At least for the mlx5 NIC the two states are 'able to issue outbound > > > DMA' and 'all internal memories and state are frozen and unchanging'. > > > > Yeh, so my point was just that if you're adding a new state to this > > process, you need to define the details like that. > > We are not planning to propose any patches/uAPI specification for this > problem until after the mlx5 vfio driver is merged.. I'm not super comfortable with that. If we're expecting to add a new bit to define a quiescent state prior to clearing the running flag and this is an optional device feature that userspace migration needs to be aware of and it's really not clear from a hypervisor when p2p DMA might be in use, I think that leaves userspace in a pickle how and when they'd impose restrictions on assignment with multiple assigned devices. It's likely that the majority of initial use cases wouldn't need this feature, which would make it difficult to arbitrarily impose later. OTOH, if we define !_RUNNING as quiescent and userspace reading pending_bytes as the point by which the user is responsible for quiescing all devices and the device state becomes stable (or drivers can generate errors during collection of device state if that proves otherwise), then I think existing userspace doesn't care about this issue. Thanks, Alex
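The "drivers can generate errors during collection of device state" option mentioned above could look roughly like this on the driver side, reusing the hypothetical my_mig_dev sketch from earlier (extended with a snapshotted flag and a bytes_remaining counter); snapshot_hw() and hw_state_dirty() are invented helpers.

/* Hypothetical helpers: freeze and serialize the device internally, and
 * report whether the device state has changed since that snapshot. */
extern int snapshot_hw(struct my_mig_dev *mdev);
extern bool hw_state_dirty(struct my_mig_dev *mdev);

static ssize_t handle_pending_bytes_read(struct my_mig_dev *mdev, u64 *val)
{
	if (!mdev->snapshotted) {
		int ret = snapshot_hw(mdev);	/* sync point: state frozen here */

		if (ret)
			return ret;
		mdev->snapshotted = true;
	}

	/*
	 * If the device mutated after the snapshot (e.g. it absorbed a
	 * p2p DMA the user failed to quiesce), fail the save rather
	 * than hand back inconsistent data.
	 */
	if (hw_state_dirty(mdev))
		return -EIO;

	*val = mdev->bytes_remaining;
	return sizeof(*val);
}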
On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > This is also why I don't like it being so transparent as it is > > something userspace needs to care about - especially if the HW cannot > > support such a thing, if we intend to allow that. > > Userspace does need to care, but userspace's concern over this should > not be able to compromise the platform and therefore making VF > assignment more susceptible to fatal error conditions to comply with a > migration uAPI is troublesome for me. It is an interesting scenario. I think it points that we are not implementing this fully properly. The !RUNNING state should be like your reset efforts. All access to the MMIO memories from userspace should be revoked during !RUNNING. All VMAs zap'd. All IOMMU peer mappings invalidated. The kernel should directly block userspace from causing a MMIO TLP before the device driver goes to !RUNNING. Then the question of what the device does at this edge is not relevant as hostile userspace cannot trigger it. The logical way to implement this is to key off running and block/unblock MMIO access when !RUNNING. To me this strongly suggests that the extra bit is the correct way forward as the driver is much simpler to implement and understand if RUNNING directly controls the availability of MMIO instead of having an irregular case where !RUNNING still allows MMIO but only until a pending_bytes read. Given the complexity of this, can we move ahead with the current mlx5_vfio and Yishai&co can come with some followup proposal to split the freeze/quiesce and block MMIO? Jason
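A sketch of what "key off RUNNING and block/unblock MMIO" might look like, building on the same hypothetical my_mig_dev and set_state_in_hw() as before; block_mmio_access() and allow_mmio_access() stand in for core support that would zap userspace BAR mappings and invalidate IOMMU peer-to-peer mappings, and are not existing vfio-pci-core interfaces.

/* Hypothetical MMIO gates. */
extern int block_mmio_access(struct my_mig_dev *mdev);
extern void allow_mmio_access(struct my_mig_dev *mdev);

static int transition_device_state(struct my_mig_dev *mdev, u32 new_state)
{
	bool was_running = mdev->state & VFIO_DEVICE_STATE_RUNNING;
	bool now_running = new_state & VFIO_DEVICE_STATE_RUNNING;
	int ret;

	if (was_running && !now_running) {
		/* Revoke MMIO first so userspace cannot make a stopped
		 * device emit or absorb a TLP it cannot handle cleanly. */
		ret = block_mmio_access(mdev);
		if (ret)
			return ret;
	}

	ret = set_state_in_hw(mdev, new_state);
	if (ret) {
		if (was_running && !now_running)
			allow_mmio_access(mdev);	/* roll back the gate */
		return ret;
	}

	if (!was_running && now_running)
		allow_mmio_access(mdev);

	mdev->state = new_state;
	return 0;
}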
On Tue, Oct 26, 2021 at 03:51:21PM +0100, Dr. David Alan Gilbert wrote: > > It's more than asking nicely, we define the device_state bits as > > synchronous, the device needs to enter the state before returning from > > the write operation or return an errno. > > I don't see how it can be synchronous in practice; can it really wait to > complete if it has to take many cycles to finish off an inflight DMA > before it transitions? The fencing of outbound DMAs in the device must be synchronous, how could anything work if it isn't? Jason
On Tue, Oct 26, 2021 at 08:52:10AM -0600, Alex Williamson wrote: > On Tue, 26 Oct 2021 09:13:53 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Tue, Oct 26, 2021 at 09:40:34AM +0100, Dr. David Alan Gilbert wrote: > > > * Jason Gunthorpe (jgg@nvidia.com) wrote: > > > > On Mon, Oct 25, 2021 at 07:47:29PM +0100, Dr. David Alan Gilbert wrote: > > > > > > > > > It may need some further refinement; for example in that quiesed state > > > > > do counters still tick? will a NIC still respond to packets that don't > > > > > get forwarded to the host? > > > > > > > > At least for the mlx5 NIC the two states are 'able to issue outbound > > > > DMA' and 'all internal memories and state are frozen and unchanging'. > > > > > > Yeh, so my point was just that if you're adding a new state to this > > > process, you need to define the details like that. > > > > We are not planning to propose any patches/uAPI specification for this > > problem until after the mlx5 vfio driver is merged.. > > I'm not super comfortable with that. If we're expecting to add a new > bit to define a quiescent state prior to clearing the running flag and > this is an optional device feature that userspace migration needs to be > aware of and it's really not clear from a hypervisor when p2p DMA might > be in use, I think that leaves userspace in a pickle how and when > they'd impose restrictions on assignment with multiple assigned > devices. It's likely that the majority of initial use cases wouldn't > need this feature, which would make it difficult to arbitrarily impose > later. Either supporting a freeze/quiesce split is an optional feature, and current mlx5_vfio is an example of a driver that does not implement it Or, freeze/quiesce is a mandatory HW feature and no VFIO driver will be merged that doesn't implement it - even if the HW cannot support it. Frankly, I think we will see HW that doesn't support this and VFIO will be required to have it as an optional feature What does userspace do? I don't know - but it does need to be aware if the optional HW feature is available and it does need to make choices if the VM will be migratable or not. I don't see any path where userspace truly cannot care unless HW support for this is mandatory. Jason
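If the freeze/quiesce split does end up as an optional feature, userspace needs a way to discover it. Purely as an illustration of the shape such a discovery mechanism could take (none of this exists in the uAPI), a new region-info capability on the migration region would let a VMM refuse multi-device migration when the capability is absent:

#include <linux/vfio.h>

/* Hypothetical capability ID and layout - not part of the current uAPI. */
#define VFIO_REGION_INFO_CAP_MIG_NDMA_HYPOTHETICAL	16

struct vfio_region_info_cap_mig_ndma {
	struct vfio_info_cap_header header;
	__u32 flags;		/* e.g. bit 0: device supports quiesce (NDMA)
				   separately from freeze (!RUNNING) */
	__u32 reserved;
};

Userspace would walk the capability chain returned by VFIO_DEVICE_GET_REGION_INFO for the migration region and only allow more than one migratable device per container when every device advertises the flag.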
On Tue, 26 Oct 2021 12:18:51 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > > > This is also why I don't like it being so transparent as it is > > > something userspace needs to care about - especially if the HW cannot > > > support such a thing, if we intend to allow that. > > > > Userspace does need to care, but userspace's concern over this should > > not be able to compromise the platform and therefore making VF > > assignment more susceptible to fatal error conditions to comply with a > > migration uAPI is troublesome for me. > > It is an interesting scenario. > > I think it points that we are not implementing this fully properly. > > The !RUNNING state should be like your reset efforts. > > All access to the MMIO memories from userspace should be revoked > during !RUNNING > > All VMAs zap'd. > > All IOMMU peer mappings invalidated. > > The kernel should directly block userspace from causing a MMIO TLP > before the device driver goes to !RUNNING. > > Then the question of what the device does at this edge is not > relevant as hostile userspace cannot trigger it. > > The logical way to implement this is to key off running and > block/unblock MMIO access when !RUNNING. > > To me this strongly suggests that the extra bit is the correct way > forward as the driver is much simpler to implement and understand if > RUNNING directly controls the availability of MMIO instead of having > an irregular case where !RUNNING still allows MMIO but only until a > pending_bytes read. > > Given the complexity of this can we move ahead with the current > mlx5_vfio and Yishai&co can come with some followup proposal to split > the freeze/queice and block MMIO? I know how much we want this driver in, but I'm surprised that you're advocating to cut-and-run with an implementation where we've identified a potentially significant gap with some hand waving that we'll resolve it later. Deciding at some point in the future to forcefully block device MMIO access from userspace when the device stops running is clearly a user visible change and therefore subject to the don't-break-userspace clause. It also seems to presume that the device relies on the vfio-core to block device access, whereas device implementations may not require such if they're able to snapshot device state. That might also indicate that "freeze" is only an implementation specific requirement. Thanks, Alex
On Tue, Oct 26, 2021 at 01:50:46PM -0600, Alex Williamson wrote: > On Tue, 26 Oct 2021 12:18:51 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > > > > > This is also why I don't like it being so transparent as it is > > > > something userspace needs to care about - especially if the HW cannot > > > > support such a thing, if we intend to allow that. > > > > > > Userspace does need to care, but userspace's concern over this should > > > not be able to compromise the platform and therefore making VF > > > assignment more susceptible to fatal error conditions to comply with a > > > migration uAPI is troublesome for me. > > > > It is an interesting scenario. > > > > I think it points that we are not implementing this fully properly. > > > > The !RUNNING state should be like your reset efforts. > > > > All access to the MMIO memories from userspace should be revoked > > during !RUNNING > > > > All VMAs zap'd. > > > > All IOMMU peer mappings invalidated. > > > > The kernel should directly block userspace from causing a MMIO TLP > > before the device driver goes to !RUNNING. > > > > Then the question of what the device does at this edge is not > > relevant as hostile userspace cannot trigger it. > > > > The logical way to implement this is to key off running and > > block/unblock MMIO access when !RUNNING. > > > > To me this strongly suggests that the extra bit is the correct way > > forward as the driver is much simpler to implement and understand if > > RUNNING directly controls the availability of MMIO instead of having > > an irregular case where !RUNNING still allows MMIO but only until a > > pending_bytes read. > > > > Given the complexity of this can we move ahead with the current > > mlx5_vfio and Yishai&co can come with some followup proposal to split > > the freeze/queice and block MMIO? > > I know how much we want this driver in, but I'm surprised that you're > advocating to cut-and-run with an implementation where we've identified > a potentially significant gap with some hand waving that we'll resolve > it later. I don't view it as cut and run, but making reasonable quanta of progress with patch series of reviewable size and scope. At a certain point we need to get the base level of functionality, that matches the currently defined ABI merged so we can talk about where the ABI needs to go. If you are uncomfortable about this from a uABI stability perspective then mark the driver EXPERIMENTAL and do not merge any other migration drivers until the two details are fully sorted out. As far as the actual issue, if you hadn't just discovered it now nobody would have known we have this gap - much like how the very similar reset issue was present in VFIO for so many years until you plugged it. > Deciding at some point in the future to forcefully block device MMIO > access from userspace when the device stops running is clearly a user > visible change and therefore subject to the don't-break-userspace > clause. I don't think so, this was done for reset retroactively after all. Well behaved qmeu should have silenced all MMIO touches as part of the ABI contract anyhow. The "don't-break-userspace" is not an absolute prohibition, Linus has been very clear this limitation is about direct, ideally demonstrable, breakage to actually deployed software. > That might also indicate that "freeze" is only an implementation > specific requirement. 
Thanks, It doesn't matter if a theoretical device can exist that doesn't need "freeze" - this device does, and so it is the lowest common denominator for the uAPI contract and userspace must abide by the restriction. Further, I can think of no utility to exposing some non-freeze route as an optional feature. If someone has a scenario where this is useful they should come with a uAPI design and feature bit to describe how it works and what it does. This driver will probably not ever set that feature. Jason
On Tue, 26 Oct 2021 20:43:00 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Oct 26, 2021 at 01:50:46PM -0600, Alex Williamson wrote: > > On Tue, 26 Oct 2021 12:18:51 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > > > > > > > This is also why I don't like it being so transparent as it is > > > > > something userspace needs to care about - especially if the HW cannot > > > > > support such a thing, if we intend to allow that. > > > > > > > > Userspace does need to care, but userspace's concern over this should > > > > not be able to compromise the platform and therefore making VF > > > > assignment more susceptible to fatal error conditions to comply with a > > > > migration uAPI is troublesome for me. > > > > > > It is an interesting scenario. > > > > > > I think it points that we are not implementing this fully properly. > > > > > > The !RUNNING state should be like your reset efforts. > > > > > > All access to the MMIO memories from userspace should be revoked > > > during !RUNNING > > > > > > All VMAs zap'd. > > > > > > All IOMMU peer mappings invalidated. > > > > > > The kernel should directly block userspace from causing a MMIO TLP > > > before the device driver goes to !RUNNING. > > > > > > Then the question of what the device does at this edge is not > > > relevant as hostile userspace cannot trigger it. > > > > > > The logical way to implement this is to key off running and > > > block/unblock MMIO access when !RUNNING. > > > > > > To me this strongly suggests that the extra bit is the correct way > > > forward as the driver is much simpler to implement and understand if > > > RUNNING directly controls the availability of MMIO instead of having > > > an irregular case where !RUNNING still allows MMIO but only until a > > > pending_bytes read. > > > > > > Given the complexity of this can we move ahead with the current > > > mlx5_vfio and Yishai&co can come with some followup proposal to split > > > the freeze/queice and block MMIO? > > > > I know how much we want this driver in, but I'm surprised that you're > > advocating to cut-and-run with an implementation where we've identified > > a potentially significant gap with some hand waving that we'll resolve > > it later. > > I don't view it as cut and run, but making reasonable quanta of > progress with patch series of reviewable size and scope. > > At a certain point we need to get the base level of functionality, > that matches the currently defined ABI merged so we can talk about > where the ABI needs to go. > > If you are uncomfortable about this from a uABI stability perspective > then mark the driver EXPERIMENTAL and do not merge any other migration > drivers until the two details are fully sorted out. > > As far as the actual issue, if you hadn't just discovered it now > nobody would have known we have this gap - much like how the very > similar reset issue was present in VFIO for so many years until you > plugged it. But the fact that we did discover it is hugely important. We've identified that the potential use case is significantly limited and that userspace doesn't have a good mechanism to determine when to expose that limitation to the user. We're tossing around solutions that involve extensions, if not changes to the uAPI. It's Wednesday of rc7. 
I feel like we've already been burned by making one of these "reasonable quanta of progress" to accept and mark experimental decisions with where we stand between defining the uAPI in the kernel and accepting an experimental implementation in QEMU. Now we have multiple closed driver implementations (none of which are contributing to this discussion), but thankfully we're not committed to supporting them because we have no open implementations. I think we could get away with ripping up the uAPI if we really needed to. > > Deciding at some point in the future to forcefully block device MMIO > > access from userspace when the device stops running is clearly a user > > visible change and therefore subject to the don't-break-userspace > > clause. > > I don't think so, this was done for reset retroactively after > all. Well behaved qmeu should have silenced all MMIO touches as part > of the ABI contract anyhow. That's not obvious to me and I think it conflates access to the device and execution of the device. If it's QEMU's responsibility to quiesce access to the device anyway, why does the kernel need to impose this restriction. I'd think we'd generally only impose such a restriction if the alternative allows the user to invoke bad behavior outside the scope of their use of the device or consistency of the migration data. It appears that any such behavior would be implementation specific here. > The "don't-break-userspace" is not an absolute prohibition, Linus has > been very clear this limitation is about direct, ideally demonstrable, > breakage to actually deployed software. And if we introduce an open driver that unblocks QEMU support to become non-experimental, I think that's where we stand. > > That might also indicate that "freeze" is only an implementation > > specific requirement. Thanks, > > It doesn't matter if a theoretical device can exist that doesn't need > "freeze" - this device does, and so it is the lowest common > denominator for the uAPI contract and userspace must abide by the > restriction. Sorry, "to the victor go the spoils" is not really how I strictly want to define a uAPI contract with userspace. If we're claiming that userspace is responsible for quiescing devices and we're providing a means for that to occur, and userspace is already responsible for managing MMIO access, then the only reason the kernel would forcefully impose such a restriction itself would be to protect the host and the implementation of that would depend on whether this is expected to be a universal or device specific limitation. Thanks, Alex
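The protocol under discussion here is the v1 migration region uAPI, where device_state is a bitfield at the head of the migration region; abridged from include/uapi/linux/vfio.h of this era ("!RUNNING" throughout the thread simply means the _RUNNING bit is clear):

    struct vfio_device_migration_info {
            __u32 device_state;         /* VFIO device state */
    #define VFIO_DEVICE_STATE_STOP      (0)
    #define VFIO_DEVICE_STATE_RUNNING   (1 << 0)
    #define VFIO_DEVICE_STATE_SAVING    (1 << 1)
    #define VFIO_DEVICE_STATE_RESUMING  (1 << 2)
            __u32 reserved;
            __u64 pending_bytes;
            __u64 data_offset;
            __u64 data_size;
    };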
On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: > > As far as the actual issue, if you hadn't just discovered it now > > nobody would have known we have this gap - much like how the very > > similar reset issue was present in VFIO for so many years until you > > plugged it. > > But the fact that we did discover it is hugely important. We've > identified that the potential use case is significantly limited and > that userspace doesn't have a good mechanism to determine when to > expose that limitation to the user. Huh? We've identified that, depending on device behavior, the kernel may need to revoke MMIO access to protect itself from hostile userspace triggering TLP Errors or something. Well behaved userspace must already stop touching the MMIO on the device when !RUNNING - I see no compelling argument against that position. We've been investigating how the mlx5 HW will behave in corner cases, and currently it looks like mlx5 vfio will not generate error TLPs, or corrupt the device itself due to MMIO operations when !RUNNING. So the driver itself, as written, probably does not currently have a bug here, or need changes. > We're tossing around solutions that involve extensions, if not > changes to the uAPI. It's Wednesday of rc7. The P2P issue is seperate, and as I keep saying, unless you want to block support for any HW that does not have freeze&queice userspace must be aware of this ability and it is logical to design it as an extension from where we are now. > I feel like we've already been burned by making one of these > "reasonable quanta of progress" to accept and mark experimental > decisions with where we stand between defining the uAPI in the kernel > and accepting an experimental implementation in QEMU. I won't argue there.. > Now we have multiple closed driver implementations (none of which > are contributing to this discussion), but thankfully we're not > committed to supporting them because we have no open > implementations. I think we could get away with ripping up the uAPI > if we really needed to. Do we need to? > > > Deciding at some point in the future to forcefully block device MMIO > > > access from userspace when the device stops running is clearly a user > > > visible change and therefore subject to the don't-break-userspace > > > clause. > > > > I don't think so, this was done for reset retroactively after > > all. Well behaved qmeu should have silenced all MMIO touches as part > > of the ABI contract anyhow. > > That's not obvious to me and I think it conflates access to the device > and execution of the device. If it's QEMU's responsibility to quiesce > access to the device anyway, why does the kernel need to impose this > restriction. I'd think we'd generally only impose such a restriction > if the alternative allows the user to invoke bad behavior outside the > scope of their use of the device or consistency of the migration data. > It appears that any such behavior would be implementation specific here. I think if an implementation has a problem, like error TLPs, then yes, it must fence. The conservative specification of the uAPI is that userspace should not allow MMIO when !RUNNING. If we ever get any implementation that needs this to fence then we should do it for all implementations just out of consistency. > > The "don't-break-userspace" is not an absolute prohibition, Linus has > > been very clear this limitation is about direct, ideally demonstrable, > > breakage to actually deployed software. 
> > And if we introduce an open driver that unblocks QEMU support to become > non-experimental, I think that's where we stand. Yes, if qemu becomes deployed, but our testing shows qemu support needs a lot of work before it is deployable, so that doesn't seem to be an immediate risk. > > > That might also indicate that "freeze" is only an implementation > > > specific requirement. Thanks, > > > > It doesn't matter if a theoretical device can exist that doesn't need > > "freeze" - this device does, and so it is the lowest common > > denominator for the uAPI contract and userspace must abide by the > > restriction. > > Sorry, "to the victor go the spoils" is not really how I strictly want > to define a uAPI contract with userspace. This is not the "victor go the spoils" this is meeting the least common denominator of HW we have today. If some fictional HW can be more advanced and can snapshot not freeze, that is great, but it doesn't change one bit that mlx5 cannot and will not work that way. Since mlx5 must be supported, there is no choice but to define the uAPI around its limitations. snapshot devices are strictly a superset of freeze devices, they can emulate freeze by doing snapshot at the freeze operation. In all cases userspace should not touch the device when !RUNNING to preserve generality to all implementations. > If we're claiming that userspace is responsible for quiescing > devices and we're providing a means for that to occur, and userspace > is already responsible for managing MMIO access, then the only > reason the kernel would forcefully impose such a restriction itself > would be to protect the host and the implementation of that would > depend on whether this is expected to be a universal or device > specific limitation. I think the best way forward is to allow for revoke to happen if we ever need it (by specification), and not implement it right now. So, I am not left with a clear idea what is still open that you see as blocking. Can you summarize? Jason
On Wed, Oct 27 2021, Jason Gunthorpe <jgg@nvidia.com> wrote: > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: >> We're tossing around solutions that involve extensions, if not >> changes to the uAPI. It's Wednesday of rc7. > > The P2P issue is seperate, and as I keep saying, unless you want to > block support for any HW that does not have freeze&queice userspace > must be aware of this ability and it is logical to design it as an > extension from where we are now. I think the very fact that we're still discussing whether something needs to be changed/documented or not already shows that this is nothing that should go in right now. Actually, I'd already consider it too late even if we agreed now; I would expect a change like this to get at least two weeks in linux-next before the merge window. >> > The "don't-break-userspace" is not an absolute prohibition, Linus has >> > been very clear this limitation is about direct, ideally demonstrable, >> > breakage to actually deployed software. >> >> And if we introduce an open driver that unblocks QEMU support to become >> non-experimental, I think that's where we stand. > > Yes, if qemu becomes deployed, but our testing shows qemu support > needs a lot of work before it is deployable, so that doesn't seem to > be an immediate risk. Do you have any patches/problem reports you can share? If you already identified that there is work to be done in QEMU, I think that speaks even more for delaying this. What if we notice that uapi changes are needed while fixing QEMU?
On Wed, 27 Oct 2021 16:23:45 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: > > > > As far as the actual issue, if you hadn't just discovered it now > > > nobody would have known we have this gap - much like how the very > > > similar reset issue was present in VFIO for so many years until you > > > plugged it. > > > > But the fact that we did discover it is hugely important. We've > > identified that the potential use case is significantly limited and > > that userspace doesn't have a good mechanism to determine when to > > expose that limitation to the user. > > Huh? > > We've identified that, depending on device behavior, the kernel may > need to revoke MMIO access to protect itself from hostile userspace > triggering TLP Errors or something. > > Well behaved userspace must already stop touching the MMIO on the > device when !RUNNING - I see no compelling argument against that > position. Not touching MMIO is not specified in our uAPI protocol, nor is it an obvious assumption to me, nor is it sufficient to assume well behaved userspace in the implementation of a kernel interface. > We've been investigating how the mlx5 HW will behave in corner cases, > and currently it looks like mlx5 vfio will not generate error TLPs, or > corrupt the device itself due to MMIO operations when !RUNNING. So the > driver itself, as written, probably does not currently have a bug > here, or need changes. This is a system level observation or is it actually looking at the bus? An Unsupported Request on MMIO write won't even generate an AER on some systems, but others can trigger a fatal error on others. That sounds like potentially good news, but either way we're still also discussing a fundamental gap in the uAPI for quiescing multiple devices in a coordinated way and how we actually define !_RUNNING. > > We're tossing around solutions that involve extensions, if not > > changes to the uAPI. It's Wednesday of rc7. > > The P2P issue is seperate, and as I keep saying, unless you want to > block support for any HW that does not have freeze&queice userspace > must be aware of this ability and it is logical to design it as an > extension from where we are now. Is this essentially suggesting that the uAPI be clarified to state that the base implementation is only applicable to userspace contexts with a single migratable vfio device instance? Does that need to preemptively include /dev/iommu generically, ie. anything that could potentially have an IOMMU mapping to the device? I agree that it would be easier to add a capability to expose multi-device compatibility than to try to retrofit one to expose a restriction. > > I feel like we've already been burned by making one of these > > "reasonable quanta of progress" to accept and mark experimental > > decisions with where we stand between defining the uAPI in the kernel > > and accepting an experimental implementation in QEMU. > > I won't argue there.. > > > Now we have multiple closed driver implementations (none of which > > are contributing to this discussion), but thankfully we're not > > committed to supporting them because we have no open > > implementations. I think we could get away with ripping up the uAPI > > if we really needed to. > > Do we need to? I'd prefer not. 
> > > > Deciding at some point in the future to forcefully block device MMIO > > > > access from userspace when the device stops running is clearly a user > > > > visible change and therefore subject to the don't-break-userspace > > > > clause. > > > > > > I don't think so, this was done for reset retroactively after > > > all. Well behaved qmeu should have silenced all MMIO touches as part > > > of the ABI contract anyhow. > > > > That's not obvious to me and I think it conflates access to the device > > and execution of the device. If it's QEMU's responsibility to quiesce > > access to the device anyway, why does the kernel need to impose this > > restriction. I'd think we'd generally only impose such a restriction > > if the alternative allows the user to invoke bad behavior outside the > > scope of their use of the device or consistency of the migration data. > > It appears that any such behavior would be implementation specific here. > > I think if an implementation has a problem, like error TLPs, then yes, > it must fence. The conservative specification of the uAPI is that > userspace should not allow MMIO when !RUNNING. > > If we ever get any implementation that needs this to fence then we > should do it for all implementations just out of consistency. Like I've indicated, this is not an obvious corollary of the !_RUNNING state to me. I'd tend more towards letting userspace do what they want and only restrict as necessary to protect the host. For example the state of the device when !_RUNNING may be changed by external stimuli, including MMIO and DMA accesses, but the device does not independently advance state. Also, I think we necessarily require config space read-access to support migration, which begs the question specifically which regions, if any, are restricted when !_RUNNING? Could we get away with zapping mmaps (sigbus on fault) but allowing r/w access? > > > The "don't-break-userspace" is not an absolute prohibition, Linus has > > > been very clear this limitation is about direct, ideally demonstrable, > > > breakage to actually deployed software. > > > > And if we introduce an open driver that unblocks QEMU support to become > > non-experimental, I think that's where we stand. > > Yes, if qemu becomes deployed, but our testing shows qemu support > needs a lot of work before it is deployable, so that doesn't seem to > be an immediate risk. Good news... I guess... but do we know what other uAPI changes might be lurking without completing that effort? > > > > That might also indicate that "freeze" is only an implementation > > > > specific requirement. Thanks, > > > > > > It doesn't matter if a theoretical device can exist that doesn't need > > > "freeze" - this device does, and so it is the lowest common > > > denominator for the uAPI contract and userspace must abide by the > > > restriction. > > > > Sorry, "to the victor go the spoils" is not really how I strictly want > > to define a uAPI contract with userspace. > > This is not the "victor go the spoils" this is meeting the least > common denominator of HW we have today. > > If some fictional HW can be more advanced and can snapshot not freeze, > that is great, but it doesn't change one bit that mlx5 cannot and will > not work that way. Since mlx5 must be supported, there is no choice > but to define the uAPI around its limitations. But it seems like you've found that mlx5 is resilient to these things that you're also deeming necessary to restrict. 
> snapshot devices are strictly a superset of freeze devices, they can > emulate freeze by doing snapshot at the freeze operation. True. > In all cases userspace should not touch the device when !RUNNING to > preserve generality to all implementations. Not an obvious conclusion to me. > > If we're claiming that userspace is responsible for quiescing > > devices and we're providing a means for that to occur, and userspace > > is already responsible for managing MMIO access, then the only > > reason the kernel would forcefully impose such a restriction itself > > would be to protect the host and the implementation of that would > > depend on whether this is expected to be a universal or device > > specific limitation. > I think the best way forward is to allow for revoke to happen if we > ever need it (by specification), and not implement it right now. > > So, I am not left with a clear idea what is still open that you see as > blocking. Can you summarize? It seems we have numerous uAPI questions floating around, including whether the base specification is limited to a single physical device within the user's IOMMU context, what the !_RUNNING state actually implies about the device state, expectations around userspace access to device regions while in this state, and who is responsible for limiting such access, and uncertainty about what other uAPI changes are necessary as QEMU support is stabilized. Why should we rush a driver in just before the merge window and potentially increase our experimental driver debt load rather than continue to co-develop kernel and userspace drivers and maybe also get input from the owners of the existing out-of-tree drivers? Thanks, Alex
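A rough sketch of the "zap mmaps (sigbus on fault)" option raised above: a vfio-pci variant driver could tear down existing userspace BAR mappings on the transition out of RUNNING and have its fault handler refuse to re-establish them. Apart from core kernel helpers (unmap_mapping_range(), the VM_FAULT_* codes), the field and function names below are assumptions for illustration, not code from this series:

    /* Hypothetical: revoke userspace BAR mappings on the transition to !RUNNING. */
    static void vfio_pci_block_mmio(struct vfio_pci_core_device *vdev)
    {
            down_write(&vdev->memory_lock);
            /* tear down all existing userspace mappings of the BARs */
            unmap_mapping_range(vdev->mmap_mapping, 0, 0, true);
            vdev->mmio_blocked = true;              /* assumed flag */
            up_write(&vdev->memory_lock);
    }

    static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
    {
            struct vfio_pci_core_device *vdev = vmf->vma->vm_private_data;

            if (vdev->mmio_blocked)
                    return VM_FAULT_SIGBUS;         /* no MMIO while !RUNNING */
            /* ... otherwise establish the PFN mapping as usual ... */
            return VM_FAULT_NOPAGE;
    }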
On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote: > On Wed, 27 Oct 2021 16:23:45 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: > > > > > > As far as the actual issue, if you hadn't just discovered it now > > > > nobody would have known we have this gap - much like how the very > > > > similar reset issue was present in VFIO for so many years until you > > > > plugged it. > > > > > > But the fact that we did discover it is hugely important. We've > > > identified that the potential use case is significantly limited and > > > that userspace doesn't have a good mechanism to determine when to > > > expose that limitation to the user. > > > > Huh? > > > > We've identified that, depending on device behavior, the kernel may > > need to revoke MMIO access to protect itself from hostile userspace > > triggering TLP Errors or something. > > > > Well behaved userspace must already stop touching the MMIO on the > > device when !RUNNING - I see no compelling argument against that > > position. > > Not touching MMIO is not specified in our uAPI protocol, To be frank, not much is specified in the uAPI comment, certainly not a detailed meaning of RUNNING. > nor is it an obvious assumption to me, nor is it sufficient to > assume well behaved userspace in the implementation of a kernel > interface. I view two aspects to !RUNNING: 1) the kernel must protect itself from hostile userspace. This means preventing loss of kernel or device integrity (ie no error TLPs, no crashing/corrupting the device/etc) 2) userspace must follow the rules so that the migration is not corrupted. We want to set the rules in a way that gives the greatest freedom of HW implementation Regarding 1, I think we all agree on this, and currently we believe mlx5 is meeting this goal as-is. Regarding 2, I think about possible implementations and come to the conclusion that !RUNNING must mean no MMIO. For several major reasons - For whatever reason a poor device may become harmed by MMIO during !RUNNING and so we may someday need to revoke MMIO like we do for reset. This implies that segfault on MMIO during !RUNNING is an option we should keep open. - A simple DMA queue device, kind of like the HNS driver, could implement migration without HW support. Transition to !RUNNING only needs to wait for the device to fully drained the DMA queue. Any empty DMA queue with no MMIOs means a quiet and migration ready device. However, if further MMIOs poke at the device it may resume operating and issue DMAs, which would corrupt the migration. - We cannot define what MMIO during !RUNNING should even do. What should a write do? What should a read return? The mlx5 version is roughly discard the write and return garbage on read. While this does not corrupt the migration it is also not useful behavior to define. In several of these case I'm happy if broken userspace harms itself and corrupts the migration. That does not impact the integrity of the kernel, and is just buggy userspace. > > We've been investigating how the mlx5 HW will behave in corner cases, > > and currently it looks like mlx5 vfio will not generate error TLPs, or > > corrupt the device itself due to MMIO operations when !RUNNING. So the > > driver itself, as written, probably does not currently have a bug > > here, or need changes. > > This is a system level observation or is it actually looking at the > bus? 
An Unsupported Request on MMIO write won't even generate an AER > on some systems, but others can trigger a fatal error on others. At this point this information is a design analysis from the HW people. > > > We're tossing around solutions that involve extensions, if not > > > changes to the uAPI. It's Wednesday of rc7. > > > > The P2P issue is seperate, and as I keep saying, unless you want to > > block support for any HW that does not have freeze&queice userspace > > must be aware of this ability and it is logical to design it as an > > extension from where we are now. > > Is this essentially suggesting that the uAPI be clarified to state > that the base implementation is only applicable to userspace contexts > with a single migratable vfio device instance? That is one way to look at it, yes. It is not just a uAPI limitation but a HW limitation as the NDMA state does require direct HW support to continue accepting MMIO/etc but not issue DMA. A simple DMA queue device as I imagine above couldn't implement it. > Does that need to preemptively include /dev/iommu generically, > ie. anything that could potentially have an IOMMU mapping to the > device? Going back to the top, for #1 the kernel must protect its integrity. So, like reset, if we have a driver where revoke is required then the revoke must extend to /dev/iommu as well. For #2 - it is up to userspace. If userspace plugs the device into /dev/iommu and keeps operating it then the migration can be corrupted. Buggy userspace. > I agree that it would be easier to add a capability to expose > multi-device compatibility than to try to retrofit one to expose a > restriction. Yes, me too. What we have here is a realization that the current interface does not support P2P scenarios. There is a wide universe of applications that don't need P2P. The realization is that qemu has a bug in that it allows the VM to execute P2P operations which are incompatible with my above definition of !RUNNING. The result is the #2 case: migration corruption. qemu should protect itself from a VM causing corruption of the migration. Either by only supporting migration with a single VFIO device, or directly blocking P2P scenarios using the IOMMU. To support P2P will require new kernel support, capability and HW support. Which, in concrete terms, means we need to write a new uAPI spec, update the mlx5 vfio driver, implement qemu patches, and test the full solution. Right now we are focused on the non-P2P cases, which I think is a reasonable starting limitation. > Like I've indicated, this is not an obvious corollary of the !_RUNNING > state to me. I'd tend more towards letting userspace do what they want > and only restrict as necessary to protect the host. For example the > state of the device when !_RUNNING may be changed by external stimuli, > including MMIO and DMA accesses, but the device does not independently > advance state. As above this is mixing #1 and #2 - it is fine to allow the device to do whatever as long as it doesn't harm the host - however that doesn't define the conditions userspace must follow to have a successful migration. > Also, I think we necessarily require config space read-access to > support migration, which begs the question specifically which regions, > if any, are restricted when !_RUNNING? Could we get away with zapping > mmaps (sigbus on fault) but allowing r/w access? Ideally we would define exactly what device operations are allowed during !RUNNING such that the migration will be successful. 
Operations outside that list should be considered things that could corrupt the migration. This list should be as narrow as possible to allow the broadest range of HW designs. > > Yes, if qemu becomes deployed, but our testing shows qemu support > > needs a lot of work before it is deployable, so that doesn't seem to > > be an immediate risk. > > Good news... I guess... but do we know what other uAPI changes might > be lurking without completing that effort? Well, I would say this patch series is approximately the midpoint of the project. We are about 60 patches into kernel changes at this point. What is left is approximately: - fix bugs in qemu so single-device operation is robust - dirty page tracking using the system iommu (via iommufd I suppose?) - dirty page tracking using the device iommu - migration with P2P ongoing: uAPI spec, kernel implementation and qemu implementation Then we might have a product.. I also know the mlx5 device was designed with knowledge of other operating systems and our team believes the device interface meets all needs. So, is the uAPI OK? I'd say provisionally yes. It works within its limitations and several vendors have implemented it, even if only two are heading toward in-tree. Is it clearly specified and covers all scenarios? No.. > > If some fictional HW can be more advanced and can snapshot not freeze, > > that is great, but it doesn't change one bit that mlx5 cannot and will > > not work that way. Since mlx5 must be supported, there is no choice > > but to define the uAPI around its limitations. > > But it seems like you've found that mlx5 is resilient to these things > that you're also deeming necessary to restrict. Here I am talking about freeze/quiesce as a HW design choice, not the mmio stuff. > > So, I am not left with a clear idea what is still open that you see as > blocking. Can you summarize? > > It seems we have numerous uAPI questions floating around, including > whether the base specification is limited to a single physical device > within the user's IOMMU context, what the !_RUNNING state actually > implies about the device state, expectations around userspace access > to device regions while in this state, and who is responsible for > limiting such access, and uncertainty what other uAPI changes are > necessary as QEMU support is stabilized. I think these questions have straightforward answers. I've tried to explain my view above. > Why should we rush a driver in just before the merge window and > potentially increase our experimental driver debt load rather than > continue to co-develop kernel and userspace drivers and maybe also > get input from the owners of the existing out-of-tree drivers? Thanks, It is not a big deal to defer things to rc1, though merging a leaf-driver that has been on-list over a month is certainly not rushing either. We are not here doing all this work because we want to co-develop kernel and user space drivers out of tree for ages. Why merge it? Because there is still lots of work to do, and making progress on the next bits requires agreeing to the basic stuff first! So, let's have some actionable feedback on what you need to see for an rc1 merge, please. Since there are currently no unaddressed comments on the patches, I assume you want to see more work done; please define it. Thanks, Jason
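To make the "simple DMA queue device" argument above concrete: for such hardware, leaving the RUNNING state could amount to nothing more than draining the queue, which is exactly why a later MMIO poke could restart DMA and corrupt the saved state. This is a purely hypothetical sketch; none of the dmaq_* helpers exist in any driver:

    /* Hypothetical DMA-queue device: clearing RUNNING only waits for a drain. */
    static int dmaq_set_device_state(struct dmaq_vfio_device *dev, u32 new_state)
    {
            if (!(new_state & VFIO_DEVICE_STATE_RUNNING)) {
                    dmaq_disable_doorbell(dev);     /* stop accepting new work */
                    /* quiet (and migration ready) once in-flight DMA completes */
                    return dmaq_wait_queue_empty(dev, msecs_to_jiffies(5000));
            }
            /* a stray MMIO doorbell write before this point would restart DMA */
            dmaq_enable_doorbell(dev);
            return 0;
    }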
On Thu, Oct 28, 2021 at 05:08:11PM +0200, Cornelia Huck wrote: > that should go in right now. Actually, I'd already consider it too late > even if we agreed now; I would expect a change like this to get at least > two weeks in linux-next before the merge window. Usually linux-next is about sorting out integration problems so we have an orderly merge window. Nobody is going to test this code just because it is in linux-next, it isn't mm or something with coverage there. > > Yes, if qemu becomes deployed, but our testing shows qemu support > > needs a lot of work before it is deployable, so that doesn't seem to > > be an immediate risk. > > Do you have any patches/problem reports you can share? Yishai has some stuff, he was doing failure injection testing and other interesting things. I think we are hoping to start looking at it. > If you already identified that there is work to be done in QEMU, I think > that speaks even more for delaying this. What if we notice that uapi > changes are needed while fixing QEMU? I don't think it is those kinds of bugs. Jason
On Thu, Oct 28 2021, Jason Gunthorpe <jgg@nvidia.com> wrote: > On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote: >> On Wed, 27 Oct 2021 16:23:45 -0300 >> Jason Gunthorpe <jgg@nvidia.com> wrote: >> >> > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: >> > >> > > > As far as the actual issue, if you hadn't just discovered it now >> > > > nobody would have known we have this gap - much like how the very >> > > > similar reset issue was present in VFIO for so many years until you >> > > > plugged it. >> > > >> > > But the fact that we did discover it is hugely important. We've >> > > identified that the potential use case is significantly limited and >> > > that userspace doesn't have a good mechanism to determine when to >> > > expose that limitation to the user. >> > >> > Huh? >> > >> > We've identified that, depending on device behavior, the kernel may >> > need to revoke MMIO access to protect itself from hostile userspace >> > triggering TLP Errors or something. >> > >> > Well behaved userspace must already stop touching the MMIO on the >> > device when !RUNNING - I see no compelling argument against that >> > position. >> >> Not touching MMIO is not specified in our uAPI protocol, > > To be frank, not much is specified in the uAPI comment, certainly not > a detailed meaning of RUNNING. Yes! And I think that means we need to improve that comment before the first in-tree driver to use it is merged, just to make sure we all agree on the protocol, and future drivers can rely on that understanding as well.
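As a starting point for that comment improvement, the clarifications argued for in this thread could be captured along the following lines; this is illustrative wording only, not an agreed text:

    /*
     * Possible addition to the device_state documentation in
     * include/uapi/linux/vfio.h:
     *
     * While _RUNNING is cleared, userspace must not access the device's
     * regions other than the migration region and PCI config space, and
     * must ensure that no DMA (including peer-to-peer DMA from other
     * devices) reaches the device.  Accesses made anyway may be
     * discarded, may return undefined data, and may corrupt the
     * migration data; drivers are additionally permitted to revoke MMIO
     * mappings while !_RUNNING.
     */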
On 10/29/2021 3:26 AM, Jason Gunthorpe wrote: > On Thu, Oct 28, 2021 at 05:08:11PM +0200, Cornelia Huck wrote: > >> that should go in right now. Actually, I'd already consider it too late >> even if we agreed now; I would expect a change like this to get at least >> two weeks in linux-next before the merge window. > Usually linux-next is about sorting out integration problems so we > have an orderly merge window. Nobody is going to test this code just > because it is in linux-next, it isn't mm or something with coverage > there. Right, in addition, the series has been on-list over a month to let people review and comment. V5 has no specific comments on it; I believe that we are at a good point to move forward with it. >>> Yes, if qemu becomes deployed, but our testing shows qemu support >>> needs a lot of work before it is deployable, so that doesn't seem to >>> be an immediate risk. >> Do you have any patches/problem reports you can share? > Yishai has some stuff, he was doing failure injection testing and > other interesting things. I think we are hoping to start looking at > it. Correct, I encountered some QEMU crashes upon failure injection / error flows. For example, - Unbinding the VF then trying to run savevm. - Moving the mlx5 device to ERROR state; my expectation was to see some recovery flow from QEMU, such as calling the RESET ioctl to let it run again, however it crashed. - etc. Yes, we have some plans to start looking at it. >> If you already identified that there is work to be done in QEMU, I think >> that speaks even more for delaying this. What if we notice that uapi >> changes are needed while fixing QEMU? > I don't think it is those kinds of bugs. Right, it doesn't seem as if uAPI changes are required; we need to debug and fix QEMU. Yishai
On 10/29/2021 9:57 AM, Cornelia Huck wrote: > On Thu, Oct 28 2021, Jason Gunthorpe <jgg@nvidia.com> wrote: > >> On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote: >>> On Wed, 27 Oct 2021 16:23:45 -0300 >>> Jason Gunthorpe <jgg@nvidia.com> wrote: >>> >>>> On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: >>>> >>>>>> As far as the actual issue, if you hadn't just discovered it now >>>>>> nobody would have known we have this gap - much like how the very >>>>>> similar reset issue was present in VFIO for so many years until you >>>>>> plugged it. >>>>> But the fact that we did discover it is hugely important. We've >>>>> identified that the potential use case is significantly limited and >>>>> that userspace doesn't have a good mechanism to determine when to >>>>> expose that limitation to the user. >>>> Huh? >>>> >>>> We've identified that, depending on device behavior, the kernel may >>>> need to revoke MMIO access to protect itself from hostile userspace >>>> triggering TLP Errors or something. >>>> >>>> Well behaved userspace must already stop touching the MMIO on the >>>> device when !RUNNING - I see no compelling argument against that >>>> position. >>> Not touching MMIO is not specified in our uAPI protocol, >> To be frank, not much is specified in the uAPI comment, certainly not >> a detailed meaning of RUNNING. > Yes! And I think that means we need to improve that comment before the > first in-tree driver to use it is merged, just to make sure we all agree > on the protocol, and future drivers can rely on that understanding as > well. > This can done by a follow-up patch as part of the the RC cycles, once we agree on the exact comment. Alternatively, We can come with V6 on Sunday if we can agree on the comment here soon. In any case, we don't expect for now code changes in mlx5 as of that. Frankly, I don't believe that this should block the series from being merged. Yishai
Hi Jason, > -----Original Message----- > From: Jason Gunthorpe [mailto:jgg@nvidia.com] > Sent: 29 October 2021 00:48 > To: Alex Williamson <alex.williamson@redhat.com> > Cc: Cornelia Huck <cohuck@redhat.com>; Yishai Hadas <yishaih@nvidia.com>; > bhelgaas@google.com; saeedm@nvidia.com; linux-pci@vger.kernel.org; > kvm@vger.kernel.org; netdev@vger.kernel.org; kuba@kernel.org; > leonro@nvidia.com; kwankhede@nvidia.com; mgurtovoy@nvidia.com; > maorg@nvidia.com; Dr. David Alan Gilbert <dgilbert@redhat.com> > Subject: Re: [PATCH V2 mlx5-next 12/14] vfio/mlx5: Implement vfio_pci driver > for mlx5 devices > > On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote: > > On Wed, 27 Oct 2021 16:23:45 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: > > > > > > > > As far as the actual issue, if you hadn't just discovered it now > > > > > nobody would have known we have this gap - much like how the very > > > > > similar reset issue was present in VFIO for so many years until you > > > > > plugged it. > > > > > > > > But the fact that we did discover it is hugely important. We've > > > > identified that the potential use case is significantly limited and > > > > that userspace doesn't have a good mechanism to determine when to > > > > expose that limitation to the user. > > > > > > Huh? > > > > > > We've identified that, depending on device behavior, the kernel may > > > need to revoke MMIO access to protect itself from hostile userspace > > > triggering TLP Errors or something. > > > > > > Well behaved userspace must already stop touching the MMIO on the > > > device when !RUNNING - I see no compelling argument against that > > > position. > > > > Not touching MMIO is not specified in our uAPI protocol, > > To be frank, not much is specified in the uAPI comment, certainly not > a detailed meaning of RUNNING. > > > nor is it an obvious assumption to me, nor is it sufficient to > > assume well behaved userspace in the implementation of a kernel > > interface. > > I view two aspects to !RUNNING: > > 1) the kernel must protect itself from hostile userspace. This means > preventing loss of kernel or device integrity (ie no error TLPs, no > crashing/corrupting the device/etc) > > 2) userspace must follow the rules so that the migration is not > corrupted. We want to set the rules in a way that gives the > greatest freedom of HW implementation > > Regarding 1, I think we all agree on this, and currently we believe > mlx5 is meeting this goal as-is. > > Regarding 2, I think about possible implementations and come to the > conclusion that !RUNNING must mean no MMIO. For several major reasons > > - For whatever reason a poor device may become harmed by MMIO during > !RUNNING and so we may someday need to revoke MMIO like we do for > reset. This implies that segfault on MMIO during !RUNNING > is an option we should keep open. > > - A simple DMA queue device, kind of like the HNS driver, could > implement migration without HW support. Transition to !RUNNING only > needs to wait for the device to fully drained the DMA queue. > > Any empty DMA queue with no MMIOs means a quiet and migration ready > device. > > However, if further MMIOs poke at the device it may resume > operating and issue DMAs, which would corrupt the migration. Just trying to follow the thread here. The above situation where MMIOs can be further poked in the !RUNNING state, do you mean poking through P2P or Guest/Qemu can still do that? 
I am just trying to see if the current uAPI fits well within the use case where we don't have any P2P scenario and the assigned dev is behind the IOMMU. Thanks, Shameer > > - We cannot define what MMIO during !RUNNING should even do. What > should a write do? What should a read return? The mlx5 version is > roughly discard the write and return garbage on read. While this > does not corrupt the migration it is also not useful behavior to > define. > > In several of these case I'm happy if broken userspace harms itself > and corrupts the migration. That does not impact the integrity of the > kernel, and is just buggy userspace. > > > > We've been investigating how the mlx5 HW will behave in corner cases, > > > and currently it looks like mlx5 vfio will not generate error TLPs, or > > > corrupt the device itself due to MMIO operations when !RUNNING. So the > > > driver itself, as written, probably does not currently have a bug > > > here, or need changes. > > > > This is a system level observation or is it actually looking at the > > bus? An Unsupported Request on MMIO write won't even generate an AER > > on some systems, but others can trigger a fatal error on others. > > At this point this information is a design analysis from the HW > people. > > > > > We're tossing around solutions that involve extensions, if not > > > > changes to the uAPI. It's Wednesday of rc7. > > > > > > The P2P issue is seperate, and as I keep saying, unless you want to > > > block support for any HW that does not have freeze&queice userspace > > > must be aware of this ability and it is logical to design it as an > > > extension from where we are now. > > > > Is this essentially suggesting that the uAPI be clarified to state > > that the base implementation is only applicable to userspace contexts > > with a single migratable vfio device instance? > > That is one way to look at it, yes. It is not just a uAPI limitation > but a HW limitation as the NDMA state does require direct HW support > to continue accepting MMIO/etc but not issue DMA. A simple DMA queue > device as I imagine above couldn't implement it. > > > Does that need to preemptively include /dev/iommu generically, > > ie. anything that could potentially have an IOMMU mapping to the > > device? > > Going back to the top, for #1 the kernel must protect its > integrity. So, like reset, if we have a driver where revoke is > required then the revoke must extend to /dev/iommu as well. > > For #2 - it is up to userspace. If userspace plugs the device into > /dev/iommu and keeps operating it then the migration can be > corrupted. Buggy userspace. > > > I agree that it would be easier to add a capability to expose > > multi-device compatibility than to try to retrofit one to expose a > > restriction. > > Yes, me too. What we have here is a realization that the current > interface does not support P2P scenarios. There is a wide universe of > applications that don't need P2P. > > The realization is that qemu has a bug in that it allows the VM to > execute P2P operations which are incompatible with my above definition > of !RUNNING. The result is the #2 case: migration corruption. > > qemu should protect itself from a VM causing corruption of the > migration. Either by only supporting migration with a single VFIO > device, or directly blocking P2P scenarios using the IOMMU. > > To support P2P will require new kernel support, capability and HW > support. 
Which, in concrete terms, means we need to write a new uAPI > spec, update the mlx5 vfio driver, implement qemu patches, and test > the full solution. > > Right now we are focused on the non-P2P cases, which I think is a > reasonable starting limitation. > > > Like I've indicated, this is not an obvious corollary of the !_RUNNING > > state to me. I'd tend more towards letting userspace do what they want > > and only restrict as necessary to protect the host. For example the > > state of the device when !_RUNNING may be changed by external stimuli, > > including MMIO and DMA accesses, but the device does not independently > > advance state. > > As above this is mixing #1 and #2 - it is fine to allow the device to > do whatever as long as it doesn't harm the host - however that doesn't > define the conditions userspace must follow to have a successful > migration. > > > Also, I think we necessarily require config space read-access to > > support migration, which begs the question specifically which regions, > > if any, are restricted when !_RUNNING? Could we get away with zapping > > mmaps (sigbus on fault) but allowing r/w access? > > Ideally we would define exactly what device operations are allowed > during !RUNNING such that the migration will be successful. Operations > outside that list should be considered things that could corrupt the > migration. > > This list should be as narrow as possible to allow the broadest range > of HW designs. > > > > Yes, if qemu becomes deployed, but our testing shows qemu support > > > needs a lot of work before it is deployable, so that doesn't seem to > > > be an immediate risk. > > > > Good news... I guess... but do we know what other uAPI changes might > > be lurking without completing that effort? > > Well, I would say this patch series is approximately the mid point of > the project. We are about 60 patches into kernel changes at this > point. What is left is approximately: > > - fix bugs in qemu so single-device operation is robust > - dirty page tracking using the system iommu (via iommufd I suppose?) > - dirty page tracking using the device iommu > - migration with P2P ongoing: uAPI spec, kernel implementation > and qemu implementation > > Then we might have a product.. > > I also know the mlx5 device was designed with knowledge of other > operating systems and our team believes the device interface meets all > needs. > > So, is the uAPI OK? I'd say provisionally yes. It works within its > limitations and several vendors have implemented it, even if only two > are heading toward in-tree. > > Is it clearly specified and covers all scenarios? No.. > > > > If some fictional HW can be more advanced and can snapshot not freeze, > > > that is great, but it doesn't change one bit that mlx5 cannot and will > > > not work that way. Since mlx5 must be supported, there is no choice > > > but to define the uAPI around its limitations. > > > > But it seems like you've found that mlx5 is resilient to these things > > that you're also deeming necessary to restrict. > > Here I am talking about freeze/quiesce as a HW design choice, not the > mmio stuff. > > > > So, I am not left with a clear idea what is still open that you see as > > > blocking. Can you summarize? 
> > > > It seems we have numerous uAPI questions floating around, including > > whether the base specification is limited to a single physical device > > within the user's IOMMU context, what the !_RUNNING state actually > > implies about the device state, expectations around userspace access > > to device regions while in this state, and who is responsible for > > limiting such access, and uncertainty what other uAPI changes are > > necessary as QEMU support is stabilized. > > I think these questions have straightfoward answers. I've tried to > explain my view above. > > > Why should we rush a driver in just before the merge window and > > potentially increase our experimental driver debt load rather than > > continue to co-develop kernel and userspace drivers and maybe also > > get input from the owners of the existing out-of-tree drivers? Thanks, > > It is not a big deal to defer things to rc1, though merging a > leaf-driver that has been on-list over a month is certainly not > rushing either. > > We are not here doing all this work because we want to co-develop > kernel and user space drivers out of tree for ages. > > Why to merge it? Because there is still lots of work to do, and to > make progress on the next bits require agreeing to the basic stuff > first! > > So, lets have some actional feedback on what you need to see for an > rc1 merging please. > > Since there are currently no unaddressed comments on the patches, I > assume you want to see more work done, please define it. > > Thanks, > Jason
On Fri, Oct 29, 2021 at 10:32:03AM +0000, Shameerali Kolothum Thodi wrote: > Just trying to follow the thread here. The above situation where MMIOs can be > further poked in the !RUNNING state, do you mean poking through P2P or > Guest/Qemu can still do that? I am just trying to see if the current uAPI fits well > within the use case where we don't have any P2P scenario and the assigned dev > is behind the IOMMU. The way it is today any VFIO user can poke MMIO while in the !RUNNING state, no P2P needed, just write to the mmap. Jason
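Concretely, nothing in today's interface stops a user from doing something like the following. The ioctl and mmap below are the existing VFIO userspace API; only the final write, issued after the driver has been moved to !RUNNING, is the problematic behaviour being described (a sketch, not code from QEMU or this series):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/vfio.h>

    /* Map BAR0 of an already-opened VFIO device fd and poke a register. */
    static void poke_bar0(int device_fd)
    {
            struct vfio_region_info reg = {
                    .argsz = sizeof(reg),
                    .index = VFIO_PCI_BAR0_REGION_INDEX,
            };
            ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);

            volatile uint32_t *bar0 = mmap(NULL, reg.size, PROT_READ | PROT_WRITE,
                                           MAP_SHARED, device_fd, reg.offset);

            /* ... device_state is now switched to _SAVING only (!RUNNING) ... */

            /* The mapping is not revoked today, so this still reaches the device. */
            bar0[0] = 0x1;
    }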
On Thu, 28 Oct 2021 20:47:50 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Thu, Oct 28, 2021 at 09:30:35AM -0600, Alex Williamson wrote: > > On Wed, 27 Oct 2021 16:23:45 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Wed, Oct 27, 2021 at 01:05:20PM -0600, Alex Williamson wrote: > > > > > > > > As far as the actual issue, if you hadn't just discovered it now > > > > > nobody would have known we have this gap - much like how the very > > > > > similar reset issue was present in VFIO for so many years until you > > > > > plugged it. > > > > > > > > But the fact that we did discover it is hugely important. We've > > > > identified that the potential use case is significantly limited and > > > > that userspace doesn't have a good mechanism to determine when to > > > > expose that limitation to the user. > > > > > > Huh? > > > > > > We've identified that, depending on device behavior, the kernel may > > > need to revoke MMIO access to protect itself from hostile userspace > > > triggering TLP Errors or something. > > > > > > Well behaved userspace must already stop touching the MMIO on the > > > device when !RUNNING - I see no compelling argument against that > > > position. > > > > Not touching MMIO is not specified in our uAPI protocol, > > To be frank, not much is specified in the uAPI comment, certainly not > a detailed meaning of RUNNING. > > > nor is it an obvious assumption to me, nor is it sufficient to > > assume well behaved userspace in the implementation of a kernel > > interface. > > I view two aspects to !RUNNING: > > 1) the kernel must protect itself from hostile userspace. This means > preventing loss of kernel or device integrity (ie no error TLPs, no > crashing/corrupting the device/etc) > > 2) userspace must follow the rules so that the migration is not > corrupted. We want to set the rules in a way that gives the > greatest freedom of HW implementation > > Regarding 1, I think we all agree on this, and currently we believe > mlx5 is meeting this goal as-is. > > Regarding 2, I think about possible implementations and come to the > conclusion that !RUNNING must mean no MMIO. For several major reasons > > - For whatever reason a poor device may become harmed by MMIO during > !RUNNING and so we may someday need to revoke MMIO like we do for > reset. This implies that segfault on MMIO during !RUNNING > is an option we should keep open. > > - A simple DMA queue device, kind of like the HNS driver, could > implement migration without HW support. Transition to !RUNNING only > needs to wait for the device to fully drained the DMA queue. > > Any empty DMA queue with no MMIOs means a quiet and migration ready > device. > > However, if further MMIOs poke at the device it may resume > operating and issue DMAs, which would corrupt the migration. > > - We cannot define what MMIO during !RUNNING should even do. What > should a write do? What should a read return? The mlx5 version is > roughly discard the write and return garbage on read. While this > does not corrupt the migration it is also not useful behavior to > define. > > In several of these case I'm happy if broken userspace harms itself > and corrupts the migration. That does not impact the integrity of the > kernel, and is just buggy userspace. > > > > We've been investigating how the mlx5 HW will behave in corner cases, > > > and currently it looks like mlx5 vfio will not generate error TLPs, or > > > corrupt the device itself due to MMIO operations when !RUNNING. 
So the > > > driver itself, as written, probably does not currently have a bug > > > here, or need changes. > > > > This is a system level observation or is it actually looking at the > > bus? An Unsupported Request on MMIO write won't even generate an AER > > on some systems, but others can trigger a fatal error on others. > > At this point this information is a design analysis from the HW > people. > > > > > We're tossing around solutions that involve extensions, if not > > > > changes to the uAPI. It's Wednesday of rc7. > > > > > > The P2P issue is seperate, and as I keep saying, unless you want to > > > block support for any HW that does not have freeze&queice userspace > > > must be aware of this ability and it is logical to design it as an > > > extension from where we are now. > > > > Is this essentially suggesting that the uAPI be clarified to state > > that the base implementation is only applicable to userspace contexts > > with a single migratable vfio device instance? > > That is one way to look at it, yes. It is not just a uAPI limitation > but a HW limitation as the NDMA state does require direct HW support > to continue accepting MMIO/etc but not issue DMA. A simple DMA queue > device as I imagine above couldn't implement it. > > > Does that need to preemptively include /dev/iommu generically, > > ie. anything that could potentially have an IOMMU mapping to the > > device? > > Going back to the top, for #1 the kernel must protect its > integrity. So, like reset, if we have a driver where revoke is > required then the revoke must extend to /dev/iommu as well. > > For #2 - it is up to userspace. If userspace plugs the device into > /dev/iommu and keeps operating it then the migration can be > corrupted. Buggy userspace. > > > I agree that it would be easier to add a capability to expose > > multi-device compatibility than to try to retrofit one to expose a > > restriction. > > Yes, me too. What we have here is a realization that the current > interface does not support P2P scenarios. There is a wide universe of > applications that don't need P2P. > > The realization is that qemu has a bug in that it allows the VM to > execute P2P operations which are incompatible with my above definition > of !RUNNING. The result is the #2 case: migration corruption. > > qemu should protect itself from a VM causing corruption of the > migration. Either by only supporting migration with a single VFIO > device, or directly blocking P2P scenarios using the IOMMU. > > To support P2P will require new kernel support, capability and HW > support. Which, in concrete terms, means we need to write a new uAPI > spec, update the mlx5 vfio driver, implement qemu patches, and test > the full solution. > > Right now we are focused on the non-P2P cases, which I think is a > reasonable starting limitation. It's a reasonable starting point iff we know that we need to support devices that cannot themselves support a quiescent state. Otherwise it would make sense to go back to work on the uAPI because I suspect the implications to userspace are not going to be as simple as "oops, can't migrate, there are two devices." As you say, there's a universe of devices that run together that don't care about p2p and QEMU will be pressured to support migration of those configurations. QEMU currently already supports p2p between assigned devices, which means the user or management tool is going to need to choose whether they prefer migration or legacy p2p compatibility. The DMA mapping aspects of this get complicated. 
Ideally we could skip p2p DMA mappings for any individual device that doesn't support this future quiescent state, but we don't do mappings based on individual devices, we do them based on the container. There's a least common denominator among the devices in a container, but we also support hotplug and we can't suddenly decide to tear down p2p mappings because a new device is added. That probably means that we can't automatically enable both p2p and migration, even if we initially only have devices that support this new quiescent migration state. We'd need to design the QEMU options so we can't have a subset of devices that want p2p and another set that want migration. If we ever want both migration and p2p, QEMU would need to reject any device that can't comply. If we're moving forward without an independent quiescent state, the uAPI should include clarification specifying that it's the user's responsibility to independently prevent DMA to devices while the device is !_RUNNING. Whether that's because DMA to the device is always blocked at the IOMMU or a configuration restriction to prevent additional devices that could generate DMA is left to the user. Support would need to be added in the kernel to tear down these mappings if there were a scenario where the user failing to prevent DMA to the device could cause misbehavior classified under your #1 above, harm to host. > > Like I've indicated, this is not an obvious corollary of the !_RUNNING > > state to me. I'd tend more towards letting userspace do what they want > > and only restrict as necessary to protect the host. For example the > > state of the device when !_RUNNING may be changed by external stimuli, > > including MMIO and DMA accesses, but the device does not independently > > advance state. > > As above this is mixing #1 and #2 - it is fine to allow the device to > do whatever as long as it doesn't harm the host - however that doesn't > define the conditions userspace must follow to have a successful > migration. > > > Also, I think we necessarily require config space read-access to > > support migration, which begs the question specifically which regions, > > if any, are restricted when !_RUNNING? Could we get away with zapping > > mmaps (sigbus on fault) but allowing r/w access? > > Ideally we would define exactly what device operations are allowed > during !RUNNING such that the migration will be successful. Operations > outside that list should be considered things that could corrupt the > migration. > > This list should be as narrow as possible to allow the broadest range > of HW designs. So we need a proposal of that list. > > > Yes, if qemu becomes deployed, but our testing shows qemu support > > > needs a lot of work before it is deployable, so that doesn't seem to > > > be an immediate risk. > > > > Good news... I guess... but do we know what other uAPI changes might > > be lurking without completing that effort? > > Well, I would say this patch series is approximately the mid point of > the project. We are about 60 patches into kernel changes at this > point. What is left is approximately: > > - fix bugs in qemu so single-device operation is robust > - dirty page tracking using the system iommu (via iommufd I suppose?) > - dirty page tracking using the device iommu > - migration with P2P ongoing: uAPI spec, kernel implementation > and qemu implementation > > Then we might have a product.. 
> > I also know the mlx5 device was designed with knowledge of other > operating systems and our team believes the device interface meets all > needs. > > So, is the uAPI OK? I'd say provisionally yes. It works within its > limitations and several vendors have implemented it, even if only two > are heading toward in-tree. > > Is it clearly specified and covers all scenarios? No.. > > > > If some fictional HW can be more advanced and can snapshot not freeze, > > > that is great, but it doesn't change one bit that mlx5 cannot and will > > > not work that way. Since mlx5 must be supported, there is no choice > > > but to define the uAPI around its limitations. > > > > But it seems like you've found that mlx5 is resilient to these things > > that you're also deeming necessary to restrict. > > Here I am talking about freeze/quiesce as a HW design choice, not the > mmio stuff. > > > > So, I am not left with a clear idea what is still open that you see as > > > blocking. Can you summarize? > > > > It seems we have numerous uAPI questions floating around, including > > whether the base specification is limited to a single physical device > > within the user's IOMMU context, what the !_RUNNING state actually > > implies about the device state, expectations around userspace access > > to device regions while in this state, and who is responsible for > > limiting such access, and uncertainty what other uAPI changes are > > necessary as QEMU support is stabilized. > > I think these questions have straightfoward answers. I've tried to > explain my view above. > > > Why should we rush a driver in just before the merge window and > > potentially increase our experimental driver debt load rather than > > continue to co-develop kernel and userspace drivers and maybe also > > get input from the owners of the existing out-of-tree drivers? Thanks, > > It is not a big deal to defer things to rc1, though merging a > leaf-driver that has been on-list over a month is certainly not > rushing either. If "on-list over a month" is meant to imply that it's well vetted, it does not. That's a pretty quick time frame given the uAPI viability discussions that it's generated. > We are not here doing all this work because we want to co-develop > kernel and user space drivers out of tree for ages. > > Why to merge it? Because there is still lots of work to do, and to > make progress on the next bits require agreeing to the basic stuff > first! > > So, lets have some actional feedback on what you need to see for an > rc1 merging please. > > Since there are currently no unaddressed comments on the patches, I > assume you want to see more work done, please define it. I'm tending to agree that there's value in moving forward, but there's a lot we're defining here that's not in the uAPI, so I'd like to see those things become formalized. I think this version is defining that it's the user's responsibility to prevent external DMA to devices while in the !_RUNNING state. This resolves the condition that we have no means to coordinate quiescing multiple devices. We shouldn't necessarily prescribe a single device solution in the uAPI if the same can be equally achieved through configuration of DMA mapping. I was almost on board with blocking MMIO, especially as p2p is just DMA mapping of MMIO, but what about MSI-X? During _RESUME we must access the MSI-X vector table via the SET_IRQS ioctl to configure interrupts. Is this exempt because the access occurs in the host? 
In any case, it requires that the device cannot be absolutely static while !_RUNNING. Does (_RESUMING) have different rules than (_SAVING)? So I'm still unclear how the uAPI needs to be updated relative to region access. We need that list of what the user is allowed to access, which seems like minimally config space and MSI-X table space, but are these implicitly known for vfio-pci devices or do we need region flags or capabilities to describe? We can't generally know the disposition of device specific regions relative to this access. Thanks, Alex
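As a point of reference for the config-space case, a sketch of how userspace already reaches config space on a vfio-pci device (error handling elided): the access goes through the device fd and the config region, not a BAR mmap, so it would be unaffected by any zapping of MMIO mappings.

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Sketch: config space is accessed through the vfio-pci config
 * region with pread()/pwrite() on the device fd, not through a BAR
 * mmap, so it could remain available while MMIO access is revoked.
 */
static ssize_t read_config(int device_fd, void *buf, size_t len, uint64_t pos)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info) < 0)
		return -1;

	return pread(device_fd, buf, len, info.offset + pos);
}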
On Fri, Oct 29, 2021 at 04:06:21PM -0600, Alex Williamson wrote: > > Right now we are focused on the non-P2P cases, which I think is a > > reasonable starting limitation. > > It's a reasonable starting point iff we know that we need to support > devices that cannot themselves support a quiescent state. Otherwise it > would make sense to go back to work on the uAPI because I suspect the > implications to userspace are not going to be as simple as "oops, can't > migrate, there are two devices." As you say, there's a universe of > devices that run together that don't care about p2p and QEMU will be > pressured to support migration of those configurations. I agree with this, but I also think what I saw in the proposed hns driver suggests it's HW cannot do quiescent, if so this is the first counter-example to the notion it is a universal ability? hns people: Can you put your device in a state where it is operating, able to accept and respond to MMIO, and yet guarentees it generates no DMA transactions? > want migration. If we ever want both migration and p2p, QEMU would > need to reject any device that can't comply. Yes, it looks like a complicated task on the qemu side to get this resolved > > It is not a big deal to defer things to rc1, though merging a > > leaf-driver that has been on-list over a month is certainly not > > rushing either. > > If "on-list over a month" is meant to imply that it's well vetted, it > does not. That's a pretty quick time frame given the uAPI viability > discussions that it's generated. I only said rushed :) > I'm tending to agree that there's value in moving forward, but there's > a lot we're defining here that's not in the uAPI, so I'd like to see > those things become formalized. Ok, lets come up with a documentation patch then to define !RUNNING as I outlined and start to come up with the allowed list of actions.. I think I would like to have a proper rst file for documenting the uapi as well. > I think this version is defining that it's the user's responsibility to > prevent external DMA to devices while in the !_RUNNING state. This > resolves the condition that we have no means to coordinate quiescing > multiple devices. We shouldn't necessarily prescribe a single device > solution in the uAPI if the same can be equally achieved through > configuration of DMA mapping. I'm not sure what this means? > I was almost on board with blocking MMIO, especially as p2p is just DMA > mapping of MMIO, but what about MSI-X? During _RESUME we must access > the MSI-X vector table via the SET_IRQS ioctl to configure interrupts. > Is this exempt because the access occurs in the host? s/in the host/in the kernel/ SET_IRQS is a kernel ioctl that uses the core MSIX code to do the mmio, so it would not be impacted by MMIO zap. Looks like you've already marked these points with the vfio_pci_memory_lock_and_enable(), so a zap for migration would have to be a little different than a zap for reset. Still, this is something that needs clear definition, I would expect the SET_IRQS to happen after resuming clears but before running sets to give maximum HW flexibility and symmetry with saving. And we should really define clearly what a device is supposed to do with the interrupt vectors during migration. Obviously there are races here. > In any case, it requires that the device cannot be absolutely static > while !_RUNNING. Does (_RESUMING) have different rules than > (_SAVING)? 
I'd prefer to avoid all device touches during both resuming and
saving, and do them during !RUNNING

> So I'm still unclear how the uAPI needs to be updated relative to
> region access.  We need that list of what the user is allowed to
> access, which seems like minimally config space and MSI-X table space,
> but are these implicitly known for vfio-pci devices or do we need
> region flags or capabilities to describe?  We can't generally know the
> disposition of device specific regions relative to this access. Thanks,

I'd prefer to be general and have the spec forbid
everything.  Specifying things like VFIO_DEVICE_SET_IRQS covers all the
bus types.

Other bus types should get spec updates before any other bus type
driver is merged.

Thanks,
Jason
> -----Original Message----- > From: Jason Gunthorpe [mailto:jgg@nvidia.com] > Sent: 01 November 2021 17:25 > To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum > Thodi <shameerali.kolothum.thodi@huawei.com> > Cc: Cornelia Huck <cohuck@redhat.com>; Yishai Hadas <yishaih@nvidia.com>; > bhelgaas@google.com; saeedm@nvidia.com; linux-pci@vger.kernel.org; > kvm@vger.kernel.org; netdev@vger.kernel.org; kuba@kernel.org; > leonro@nvidia.com; kwankhede@nvidia.com; mgurtovoy@nvidia.com; > maorg@nvidia.com; Dr. David Alan Gilbert <dgilbert@redhat.com> > Subject: Re: [PATCH V2 mlx5-next 12/14] vfio/mlx5: Implement vfio_pci driver > for mlx5 devices > > On Fri, Oct 29, 2021 at 04:06:21PM -0600, Alex Williamson wrote: > > > > Right now we are focused on the non-P2P cases, which I think is a > > > reasonable starting limitation. > > > > It's a reasonable starting point iff we know that we need to support > > devices that cannot themselves support a quiescent state. Otherwise it > > would make sense to go back to work on the uAPI because I suspect the > > implications to userspace are not going to be as simple as "oops, can't > > migrate, there are two devices." As you say, there's a universe of > > devices that run together that don't care about p2p and QEMU will be > > pressured to support migration of those configurations. > > I agree with this, but I also think what I saw in the proposed hns > driver suggests it's HW cannot do quiescent, if so this is the first > counter-example to the notion it is a universal ability? > > hns people: Can you put your device in a state where it is operating, > able to accept and respond to MMIO, and yet guarentees it generates no > DMA transactions? AFAIK, I am afraid we cannot guarantee that as per our current implementation. At present in !RUNNING state we are putting the device in to a PAUSE state so it will complete the current request and keep the remaining ones in queue. But it can still receive a new request which will trigger the PAUSE state exit and resume the operation. So I guess, it is possible to corrupt the migration if user space misbehaves. Thanks, Shameer
On Mon, 1 Nov 2021 14:25:06 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Fri, Oct 29, 2021 at 04:06:21PM -0600, Alex Williamson wrote: > > > > Right now we are focused on the non-P2P cases, which I think is a > > > reasonable starting limitation. > > > > It's a reasonable starting point iff we know that we need to support > > devices that cannot themselves support a quiescent state. Otherwise it > > would make sense to go back to work on the uAPI because I suspect the > > implications to userspace are not going to be as simple as "oops, can't > > migrate, there are two devices." As you say, there's a universe of > > devices that run together that don't care about p2p and QEMU will be > > pressured to support migration of those configurations. > > I agree with this, but I also think what I saw in the proposed hns > driver suggests it's HW cannot do quiescent, if so this is the first > counter-example to the notion it is a universal ability? > > hns people: Can you put your device in a state where it is operating, > able to accept and respond to MMIO, and yet guarentees it generates no > DMA transactions? > > > want migration. If we ever want both migration and p2p, QEMU would > > need to reject any device that can't comply. > > Yes, it looks like a complicated task on the qemu side to get this > resolved > > > > It is not a big deal to defer things to rc1, though merging a > > > leaf-driver that has been on-list over a month is certainly not > > > rushing either. > > > > If "on-list over a month" is meant to imply that it's well vetted, it > > does not. That's a pretty quick time frame given the uAPI viability > > discussions that it's generated. > > I only said rushed :) To push forward regardless of unresolved questions is rushing regardless of how long it's been on-list. > > I'm tending to agree that there's value in moving forward, but there's > > a lot we're defining here that's not in the uAPI, so I'd like to see > > those things become formalized. > > Ok, lets come up with a documentation patch then to define !RUNNING as > I outlined and start to come up with the allowed list of actions.. > > I think I would like to have a proper rst file for documenting the > uapi as well. > > > I think this version is defining that it's the user's responsibility to > > prevent external DMA to devices while in the !_RUNNING state. This > > resolves the condition that we have no means to coordinate quiescing > > multiple devices. We shouldn't necessarily prescribe a single device > > solution in the uAPI if the same can be equally achieved through > > configuration of DMA mapping. > > I'm not sure what this means? I'm just trying to avoid the uAPI calling out a single-device restriction if there are other ways that userspace can quiesce external DMA outside of the uAPI, such as by limiting p2p DMA mappings at the IOMMU, ie. define the userspace requirements but don't dictate a specific solution. > > I was almost on board with blocking MMIO, especially as p2p is just DMA > > mapping of MMIO, but what about MSI-X? During _RESUME we must access > > the MSI-X vector table via the SET_IRQS ioctl to configure interrupts. > > Is this exempt because the access occurs in the host? > > s/in the host/in the kernel/ SET_IRQS is a kernel ioctl that uses the > core MSIX code to do the mmio, so it would not be impacted by MMIO > zap. AIUI, "zap" is just the proposed userspace manifestation that the device cannot accept MMIO writes while !_RUNNING, but these writes must occur in that state. 
> Looks like you've already marked these points with the
> vfio_pci_memory_lock_and_enable(), so a zap for migration would have
> to be a little different than a zap for reset.
>
> Still, this is something that needs clear definition, I would expect
> the SET_IRQS to happen after resuming clears but before running sets
> to give maximum HW flexibility and symmetry with saving.

There's no requirement that the device enters a null state (!_RESUMING
| !_SAVING | !_RUNNING), the uAPI even specifies the flows as _RESUMING
transitioning to _RUNNING.  There's no point at which we can do
SET_IRQS other than in the _RESUMING state.  Generally SET_IRQS ioctls
are coordinated with the guest driver based on actions to the device;
we can't be mucking with IRQs while the device is presumed running and
already generating interrupt conditions.

> And we should really define clearly what a device is supposed to do
> with the interrupt vectors during migration. Obviously there are races
> here.

The device should not be generating interrupts while !_RUNNING;
pending interrupts should be held until the device is _RUNNING.  To me
this means the sequence must be that INTx/MSI/MSI-X are restored while
in the !_RUNNING state.

> > In any case, it requires that the device cannot be absolutely static
> > while !_RUNNING. Does (_RESUMING) have different rules than
> > (_SAVING)?
>
> I'd prefer to avoid all device touches during both resuming and
> saving, and do them during !RUNNING

There's no such state required by the uAPI.

> > So I'm still unclear how the uAPI needs to be updated relative to
> > region access.  We need that list of what the user is allowed to
> > access, which seems like minimally config space and MSI-X table space,
> > but are these implicitly known for vfio-pci devices or do we need
> > region flags or capabilities to describe?  We can't generally know the
> > disposition of device specific regions relative to this access. Thanks,
>
> I'd prefer to be general and have the spec forbid
> everything. Specifying things like VFIO_DEVICE_SET_IRQS covers all the
> bus types.

AFAICT, SET_IRQS while _RESUMING is a requirement, as is some degree
of access to config space.  It seems you're proposing a new required
null state, which is contradictory to the existing uAPI.  Thanks,

Alex
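A rough sketch of the resume flow as described here, against the v1 migration region layout in <linux/vfio.h>; the migration stream load is abbreviated, error handling is elided, and mig_off, nvec, and the eventfds are assumed to be supplied by the VMM.

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Sketch of the resume flow as implemented by today's QEMU, per the
 * discussion above: MSI-X is restored via SET_IRQS while _RESUMING is
 * still set, then the device goes directly to _RUNNING.  mig_off is
 * the file offset of the v1 migration region; the eventfds and vector
 * count come from the separately restored config space.
 */
static void resume_device(int device_fd, uint64_t mig_off,
			  int32_t *msix_eventfds, int nvec)
{
	uint32_t state = VFIO_DEVICE_STATE_RESUMING;
	struct vfio_irq_set *irqs;
	size_t argsz;

	/* 1. Enter _RESUMING and load the device migration stream. */
	pwrite(device_fd, &state, sizeof(state),
	       mig_off + offsetof(struct vfio_device_migration_info,
				  device_state));
	/* ...write the migration stream into the data section... */

	/* 2. Restore MSI-X, still in _RESUMING, based on config space. */
	argsz = sizeof(*irqs) + nvec * sizeof(int32_t);
	irqs = calloc(1, argsz);
	irqs->argsz = argsz;
	irqs->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	irqs->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irqs->start = 0;
	irqs->count = nvec;
	memcpy(irqs->data, msix_eventfds, nvec * sizeof(int32_t));
	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, irqs);
	free(irqs);

	/* 3. Go straight from _RESUMING to _RUNNING (no null state). */
	state = VFIO_DEVICE_STATE_RUNNING;
	pwrite(device_fd, &state, sizeof(state),
	       mig_off + offsetof(struct vfio_device_migration_info,
				  device_state));
}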
On Tue, Nov 02, 2021 at 08:56:51AM -0600, Alex Williamson wrote:

> > Still, this is something that needs clear definition, I would expect
> > the SET_IRQS to happen after resuming clears but before running sets
> > to give maximum HW flexibility and symmetry with saving.
>
> There's no requirement that the device enters a null state (!_RESUMING
> | !_SAVING | !_RUNNING), the uAPI even specifies the flows as _RESUMING
> transitioning to _RUNNING.

If the device saves the MSI-X state inside its migration data (as
apparently would be convenient for other OSs) then when RESUMING
clears and the migration data is de-serialized the device will
overwrite the MSI-X data.

Since Linux as an OS wants to control the MSI-X it needs to load it
after RESUMING, but before RUNNING.

> There's no point at which we can do SET_IRQS other than in the
> _RESUMING state.  Generally SET_IRQS ioctls are coordinated with the
> guest driver based on actions to the device, we can't be mucking
> with IRQs while the device is presumed running and already
> generating interrupt conditions.

We need to do it in state 000

ie resume should go

000 -> 100 -> 000 -> 001

With SET_IRQS and any other fixing done during the 2nd 000, after the
migration data has been loaded into the device.

> > And we should really define clearly what a device is supposed to do
> > with the interrupt vectors during migration. Obviously there are races
> > here.
>
> The device should not be generating interrupts while !_RUNNING, pending
> interrupts should be held until the device is _RUNNING.  To me this
> means the sequence must be that INTx/MSI/MSI-X are restored while in
> the !_RUNNING state.

Yes

> > > In any case, it requires that the device cannot be absolutely static
> > > while !_RUNNING. Does (_RESUMING) have different rules than
> > > (_SAVING)?
> >
> > I'd prefer to avoid all device touches during both resuming and
> > saving, and do them during !RUNNING
>
> There's no such state required by the uAPI.

The uAPI comment does not define when to do the SET_IRQS; it seems
this has been missed.

We really should fix it, unless you feel strongly that the
experimental API in qemu shouldn't be changed.

Jason
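Spelled out against the v1 device_state bits (RUNNING = 001b, SAVING = 010b, RESUMING = 100b), the ordering proposed above would look roughly like the following sketch; set_device_state() is just an illustrative pwrite() wrapper, and this is not what current QEMU does.

#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Hypothetical helper: write device_state in the v1 migration region. */
static void set_device_state(int device_fd, uint64_t mig_off, uint32_t state)
{
	pwrite(device_fd, &state, sizeof(state),
	       mig_off + offsetof(struct vfio_device_migration_info,
				  device_state));
}

/*
 * The ordering proposed above: pass through the null state after the
 * migration data has been loaded, do SET_IRQS and any other external
 * fix-ups there, and only then set _RUNNING.
 */
static void resume_with_null_state(int device_fd, uint64_t mig_off)
{
	set_device_state(device_fd, mig_off, 0);			/* 000 */
	set_device_state(device_fd, mig_off,
			 VFIO_DEVICE_STATE_RESUMING);			/* 100 */
	/* ...load the device migration stream... */
	set_device_state(device_fd, mig_off, 0);			/* 000 */
	/* ...VFIO_DEVICE_SET_IRQS and other external device state... */
	set_device_state(device_fd, mig_off,
			 VFIO_DEVICE_STATE_RUNNING);			/* 001 */
}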
On Tue, 2 Nov 2021 12:54:20 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Nov 02, 2021 at 08:56:51AM -0600, Alex Williamson wrote: > > > > Still, this is something that needs clear definition, I would expect > > > the SET_IRQS to happen after resuming clears but before running sets > > > to give maximum HW flexibility and symmetry with saving. > > > > There's no requirement that the device enters a null state (!_RESUMING > > | !_SAVING | !_RUNNING), the uAPI even species the flows as _RESUMING > > transitioning to _RUNNING. > > If the device saves the MSI-X state inside it's migration data (as > apparently would be convenient for other OSs) then when RESUMING > clears and the migration data is de-serialized the device will > overwrite the MSI-X data. > > Since Linux as an OS wants to control the MSI-X it needs to load it > after RESUMING, but before RUNNING. This is not how it works today, QEMU enables MSI/X based on the config space information, which is also outside of the device migration stream. > > There's no point at which we can do SET_IRQS other than in the > > _RESUMING state. Generally SET_IRQS ioctls are coordinated with the > > guest driver based on actions to the device, we can't be mucking > > with IRQs while the device is presumed running and already > > generating interrupt conditions. > > We need to do it in state 000 > > ie resume should go > > 000 -> 100 -> 000 -> 001 > > With SET_IRQS and any other fixing done during the 2nd 000, after the > migration data has been loaded into the device. Again, this is not how QEMU works today. > > > And we should really define clearly what a device is supposed to do > > > with the interrupt vectors during migration. Obviously there are races > > > here. > > > > The device should not be generating interrupts while !_RUNNING, pending > > interrupts should be held until the device is _RUNNING. To me this > > means the sequence must be that INTx/MSI/MSI-X are restored while in > > the !_RUNNING state. > > Yes Except I suppose them to be restored while _RESUMING is set. > > > > In any case, it requires that the device cannot be absolutely static > > > > while !_RUNNING. Does (_RESUMING) have different rules than > > > > (_SAVING)? > > > > > > I'd prever to avoid all device touches during both resuming and > > > saving, and do them during !RUNNING > > > > There's no such state required by the uAPI. > > The uAPI comment does not define when to do the SET_IRQS, it seems > this has been missed. > > We really should fix it, unless you feel strongly that the > experimental API in qemu shouldn't be changed. I think the QEMU implementation fills in some details of how the uAPI is expected to work. MSI/X is expected to be restored while _RESUMING based on the config space of the device, there is no intermediate step between _RESUMING and _RUNNING. Introducing such a requirement precludes the option of a post-copy implementation of (_RESUMING | _RUNNING). Thanks, Alex
On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote: > > > There's no point at which we can do SET_IRQS other than in the > > > _RESUMING state. Generally SET_IRQS ioctls are coordinated with the > > > guest driver based on actions to the device, we can't be mucking > > > with IRQs while the device is presumed running and already > > > generating interrupt conditions. > > > > We need to do it in state 000 > > > > ie resume should go > > > > 000 -> 100 -> 000 -> 001 > > > > With SET_IRQS and any other fixing done during the 2nd 000, after the > > migration data has been loaded into the device. > > Again, this is not how QEMU works today. I know, I think it is a poor choice to carve out certain changes to the device that must be preserved across loading the migration state. > > The uAPI comment does not define when to do the SET_IRQS, it seems > > this has been missed. > > > > We really should fix it, unless you feel strongly that the > > experimental API in qemu shouldn't be changed. > > I think the QEMU implementation fills in some details of how the uAPI > is expected to work. Well, we already know QEMU has problems, like the P2P thing. Is this a bug, or a preferred limitation as designed? > MSI/X is expected to be restored while _RESUMING based on the > config space of the device, there is no intermediate step between > _RESUMING and _RUNNING. Introducing such a requirement precludes > the option of a post-copy implementation of (_RESUMING | _RUNNING). Not precluded, a new state bit would be required to implement some future post-copy. 0000 -> 1100 -> 1000 -> 1001 -> 0001 Instead of overloading the meaning of RUNNING. I think this is cleaner anyhow. (though I don't know how we'd structure the save side to get two bitstreams) Thanks, Jason
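Making the bit positions in that 4-bit sequence explicit, with the low three bits as the existing v1 flags and the top bit as a purely hypothetical post-copy flag that does not exist in the uAPI:

#include <stdint.h>
#include <linux/vfio.h>

/* Purely hypothetical bit, for illustration only; not in the uAPI. */
#define STATE_POSTCOPY	(1u << 3)

/* 0000 -> 1100 -> 1000 -> 1001 -> 0001, as proposed above. */
static const uint32_t postcopy_resume_sequence[] = {
	0,						/* 0000: stopped */
	STATE_POSTCOPY | VFIO_DEVICE_STATE_RESUMING,	/* 1100: load minimal state */
	STATE_POSTCOPY,					/* 1000: external fix-ups */
	STATE_POSTCOPY | VFIO_DEVICE_STATE_RUNNING,	/* 1001: run, remaining state follows */
	VFIO_DEVICE_STATE_RUNNING,			/* 0001: post-copy complete */
};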
On Tue, 2 Nov 2021 13:36:10 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote: > > > > > There's no point at which we can do SET_IRQS other than in the > > > > _RESUMING state. Generally SET_IRQS ioctls are coordinated with the > > > > guest driver based on actions to the device, we can't be mucking > > > > with IRQs while the device is presumed running and already > > > > generating interrupt conditions. > > > > > > We need to do it in state 000 > > > > > > ie resume should go > > > > > > 000 -> 100 -> 000 -> 001 > > > > > > With SET_IRQS and any other fixing done during the 2nd 000, after the > > > migration data has been loaded into the device. > > > > Again, this is not how QEMU works today. > > I know, I think it is a poor choice to carve out certain changes to > the device that must be preserved across loading the migration state. > > > > The uAPI comment does not define when to do the SET_IRQS, it seems > > > this has been missed. > > > > > > We really should fix it, unless you feel strongly that the > > > experimental API in qemu shouldn't be changed. > > > > I think the QEMU implementation fills in some details of how the uAPI > > is expected to work. > > Well, we already know QEMU has problems, like the P2P thing. Is this a > bug, or a preferred limitation as designed? > > > MSI/X is expected to be restored while _RESUMING based on the > > config space of the device, there is no intermediate step between > > _RESUMING and _RUNNING. Introducing such a requirement precludes > > the option of a post-copy implementation of (_RESUMING | _RUNNING). > > Not precluded, a new state bit would be required to implement some > future post-copy. > > 0000 -> 1100 -> 1000 -> 1001 -> 0001 > > Instead of overloading the meaning of RUNNING. > > I think this is cleaner anyhow. > > (though I don't know how we'd structure the save side to get two > bitstreams) The way this is supposed to work is that the device migration stream contains the device internal state. QEMU is then responsible for restoring the external state of the device, including the DMA mappings, interrupts, and config space. It's not possible for the migration driver to reestablish these things. So there is a necessary division of device state between QEMU and the migration driver. If we don't think the uAPI includes the necessary states, doesn't sufficiently define the states, and we're not following the existing QEMU implementation as the guide for the intentions of the uAPI spec, then what exactly is the proposed mlx5 migration driver implementing and why would we even considering including it at this point? Thanks, Alex
On Tue, Nov 02, 2021 at 02:15:47PM -0600, Alex Williamson wrote: > On Tue, 2 Nov 2021 13:36:10 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote: > > > > > > > There's no point at which we can do SET_IRQS other than in the > > > > > _RESUMING state. Generally SET_IRQS ioctls are coordinated with the > > > > > guest driver based on actions to the device, we can't be mucking > > > > > with IRQs while the device is presumed running and already > > > > > generating interrupt conditions. > > > > > > > > We need to do it in state 000 > > > > > > > > ie resume should go > > > > > > > > 000 -> 100 -> 000 -> 001 > > > > > > > > With SET_IRQS and any other fixing done during the 2nd 000, after the > > > > migration data has been loaded into the device. > > > > > > Again, this is not how QEMU works today. > > > > I know, I think it is a poor choice to carve out certain changes to > > the device that must be preserved across loading the migration state. > > > > > > The uAPI comment does not define when to do the SET_IRQS, it seems > > > > this has been missed. > > > > > > > > We really should fix it, unless you feel strongly that the > > > > experimental API in qemu shouldn't be changed. > > > > > > I think the QEMU implementation fills in some details of how the uAPI > > > is expected to work. > > > > Well, we already know QEMU has problems, like the P2P thing. Is this a > > bug, or a preferred limitation as designed? > > > > > MSI/X is expected to be restored while _RESUMING based on the > > > config space of the device, there is no intermediate step between > > > _RESUMING and _RUNNING. Introducing such a requirement precludes > > > the option of a post-copy implementation of (_RESUMING | _RUNNING). > > > > Not precluded, a new state bit would be required to implement some > > future post-copy. > > > > 0000 -> 1100 -> 1000 -> 1001 -> 0001 > > > > Instead of overloading the meaning of RUNNING. > > > > I think this is cleaner anyhow. > > > > (though I don't know how we'd structure the save side to get two > > bitstreams) > > The way this is supposed to work is that the device migration stream > contains the device internal state. QEMU is then responsible for > restoring the external state of the device, including the DMA mappings, > interrupts, and config space. It's not possible for the migration > driver to reestablish these things. So there is a necessary division > of device state between QEMU and the migration driver. > > If we don't think the uAPI includes the necessary states, doesn't > sufficiently define the states, and we're not following the existing > QEMU implementation as the guide for the intentions of the uAPI spec, > then what exactly is the proposed mlx5 migration driver implementing > and why would we even considering including it at this point? Thanks, The driver posting follows the undocumented behaviors of QEMU You asked that these all be documented, evaluated and formalized as a precondition to merging it. So, what do you want? A critical review of the uAPI design or documenting whatever behvaior is coded in qemu? A critical review suggest SET_IRQ should not happen during RESUMING, but mlx5 today doesn't care either way. Jason
On Wed, 3 Nov 2021 09:09:55 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Nov 02, 2021 at 02:15:47PM -0600, Alex Williamson wrote: > > On Tue, 2 Nov 2021 13:36:10 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Tue, Nov 02, 2021 at 10:22:36AM -0600, Alex Williamson wrote: > > > > > > > > > There's no point at which we can do SET_IRQS other than in the > > > > > > _RESUMING state. Generally SET_IRQS ioctls are coordinated with the > > > > > > guest driver based on actions to the device, we can't be mucking > > > > > > with IRQs while the device is presumed running and already > > > > > > generating interrupt conditions. > > > > > > > > > > We need to do it in state 000 > > > > > > > > > > ie resume should go > > > > > > > > > > 000 -> 100 -> 000 -> 001 > > > > > > > > > > With SET_IRQS and any other fixing done during the 2nd 000, after the > > > > > migration data has been loaded into the device. > > > > > > > > Again, this is not how QEMU works today. > > > > > > I know, I think it is a poor choice to carve out certain changes to > > > the device that must be preserved across loading the migration state. > > > > > > > > The uAPI comment does not define when to do the SET_IRQS, it seems > > > > > this has been missed. > > > > > > > > > > We really should fix it, unless you feel strongly that the > > > > > experimental API in qemu shouldn't be changed. > > > > > > > > I think the QEMU implementation fills in some details of how the uAPI > > > > is expected to work. > > > > > > Well, we already know QEMU has problems, like the P2P thing. Is this a > > > bug, or a preferred limitation as designed? > > > > > > > MSI/X is expected to be restored while _RESUMING based on the > > > > config space of the device, there is no intermediate step between > > > > _RESUMING and _RUNNING. Introducing such a requirement precludes > > > > the option of a post-copy implementation of (_RESUMING | _RUNNING). > > > > > > Not precluded, a new state bit would be required to implement some > > > future post-copy. > > > > > > 0000 -> 1100 -> 1000 -> 1001 -> 0001 > > > > > > Instead of overloading the meaning of RUNNING. > > > > > > I think this is cleaner anyhow. > > > > > > (though I don't know how we'd structure the save side to get two > > > bitstreams) > > > > The way this is supposed to work is that the device migration stream > > contains the device internal state. QEMU is then responsible for > > restoring the external state of the device, including the DMA mappings, > > interrupts, and config space. It's not possible for the migration > > driver to reestablish these things. So there is a necessary division > > of device state between QEMU and the migration driver. > > > > If we don't think the uAPI includes the necessary states, doesn't > > sufficiently define the states, and we're not following the existing > > QEMU implementation as the guide for the intentions of the uAPI spec, > > then what exactly is the proposed mlx5 migration driver implementing > > and why would we even considering including it at this point? 
Thanks,

> > The driver posting follows the undocumented behaviors of QEMU

In one email I read that QEMU clearly should not be performing SET_IRQS
while the device is _RESUMING (which it does) and we need to require an
interim state before the device becomes _RUNNING to poke at the device
(which QEMU doesn't do and the uAPI doesn't require), and the next I
read that we should proceed with some useful quanta of work despite
that we clearly don't intend to retain much of the protocol of the
current uAPI long term...

> You asked that these all be documented, evaluated and formalized as a
> precondition to merging it.
>
> So, what do you want? A critical review of the uAPI design or
> documenting whatever behvaior is coded in qemu?

Too much is in flux and we're only getting breadcrumbs of the changes
to come.  It's becoming more evident that we're likely to sufficiently
modify the uAPI to the point where I'd probably suggest a new "v2"
subtype for the region.

> A critical review suggest SET_IRQ should not happen during RESUMING,
> but mlx5 today doesn't care either way.

But if it can't happen during _RESUMING, and once the device is
_RUNNING it's too late, then we're demanding an interim state that is
not required by the existing protocol.  We're redefining the uAPI such
that operations currently performed on the device while in _RESUMING
can no longer occur in that state.  That's more than uAPI
clarification.  Thanks,

Alex
On Wed, Nov 03, 2021 at 09:44:09AM -0600, Alex Williamson wrote: > In one email I read that QEMU clearly should not be performing SET_IRQS > while the device is _RESUMING (which it does) and we need to require an > interim state before the device becomes _RUNNING to poke at the device > (which QEMU doesn't do and the uAPI doesn't require), and the next I > read that we should proceed with some useful quanta of work despite > that we clearly don't intend to retain much of the protocol of the > current uAPI long term... mlx5 implements the protocol as is today, in a way that is compatible with today's qemu. Qemu has various problems like the P2P issue we talked about, but it is something working. If you want to do a full re-review of the protocol and make changes, then fine, let's do that, but everything should be on the table, and changing qemu shouldn't be a blocker. In one email you are are saying we need to document and decide things as a pre-condition to move the driver forward, and then in the next email you say whatever qemu does is the specification, and can't change it. Part of this messy discussion is my fault as I've been a little unclear in mixing my "community view" of how the protocol should be designed to maximize future HW support and then switching to topics that have direct relevance to mlx5 itself. I want to see devices like hns be supportable and, from experience, I'm very skeptical about placing HW design restrictions into a uAPI. So I don't like those things. However, mlx5's HW is robust and more functional than hns, and doesn't care which way things are decided. > Too much is in flux and we're only getting breadcrumbs of the > changes to come. We have no intention to go in and change the uapi after merging beyond solving the P2P issue. Since we now have confirmation that hns cannot do P2P I see no issue to keep the current design as the non-p2p baseline that hns will implement and the P2P upgrade should be designed separately. > It's becoming more evident that we're likely to sufficiently modify > the uAPI to the point where I'd probably suggest a new "v2" subtype > for the region. I don't think this is evident. It is really your/community choice what to do in VFIO. If vfio sticks with the uAPI "as is" then it places additional requirements on future HW designs. If you want to relax these requirements before stabilizing the uAPI, then we need to make those changes now. It is your decision. I don't know of any upcoming HW designs that have a problem with any of the choices. Thanks, Jason
On Wed, 3 Nov 2021 13:10:19 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Wed, Nov 03, 2021 at 09:44:09AM -0600, Alex Williamson wrote: > > > In one email I read that QEMU clearly should not be performing SET_IRQS > > while the device is _RESUMING (which it does) and we need to require an > > interim state before the device becomes _RUNNING to poke at the device > > (which QEMU doesn't do and the uAPI doesn't require), and the next I > > read that we should proceed with some useful quanta of work despite > > that we clearly don't intend to retain much of the protocol of the > > current uAPI long term... > > mlx5 implements the protocol as is today, in a way that is compatible > with today's qemu. Qemu has various problems like the P2P issue we > talked about, but it is something working. > > If you want to do a full re-review of the protocol and make changes, > then fine, let's do that, but everything should be on the table, and > changing qemu shouldn't be a blocker. I don't think changing QEMU is a blocker, but QEMU should be seen as the closest thing we currently have to a reference user implementation against the uAPI and therefore may define de facto behaviors that are not sufficiently clear in the uAPI. So if we see issues with the QEMU implementation, that's a reflection on gaps and disagreements in the uAPI itself. If we think we need new device states and protocols to handle the issues being raised, we need plans to incrementally add those to the uAPI, otherwise we should halt and reevaluate the existing uAPI for a full overhaul. We agreed that it's easier to add a feature than a restriction in a uAPI, so how do we resolve that some future device may require a new state in order to apply the SET_IRQS configuration? Existing userspace would fail with such a device. > In one email you are are saying we need to document and decide things > as a pre-condition to move the driver forward, and then in the next > email you say whatever qemu does is the specification, and can't > change it. I don't think I ever said we can't change it. I'm being presented with new information, new requirements, new protocols that existing QEMU code does not follow. We can change QEMU, but as I noted before we're getting dangerously close to having a formal, non-experimental user while we're poking holes in the uAPI and we need to consider how the uAPI extends to fill those holes and remains backwards compatible to the current implementation. > Part of this messy discussion is my fault as I've been a little > unclear in mixing my "community view" of how the protocol should be > designed to maximize future HW support and then switching to topics > that have direct relevance to mlx5 itself. Better sooner than later to evaluate the limitations and compatibility issues against what we think is reasonable hardware behavior with respect to migration states and transitions. > I want to see devices like hns be supportable and, from experience, > I'm very skeptical about placing HW design restrictions into a > uAPI. So I don't like those things. > > However, mlx5's HW is robust and more functional than hns, and doesn't > care which way things are decided. Regardless, the issues are already out on the table. We want migration for mlx5, but we also want it to be as reasonably close to what we think can support any device designed for this use case. You seem to have far more visibility into that than I do. > > Too much is in flux and we're only getting breadcrumbs of the > > changes to come. 
>
> We have no intention to go in and change the uapi after merging beyond
> solving the P2P issue.

Then I'm confused where we're at with the notion that we shouldn't be
calling SET_IRQS while in the _RESUMING state.

> Since we now have confirmation that hns cannot do P2P I see no issue
> to keep the current design as the non-p2p baseline that hns will
> implement and the P2P upgrade should be designed separately.
>
> > It's becoming more evident that we're likely to sufficiently modify
> > the uAPI to the point where I'd probably suggest a new "v2" subtype
> > for the region.
>
> I don't think this is evident. It is really your/community choice what
> to do in VFIO.
>
> If vfio sticks with the uAPI "as is" then it places additional
> requirements on future HW designs.
>
> If you want to relax these requirements before stabilizing the uAPI,
> then we need to make those changes now.
>
> It is your decision. I don't know of any upcoming HW designs that have
> a problem with any of the choices.

If we're going to move forward with the existing uAPI, then we're going
to need to start factoring compatibility into our discussions of
missing states and protocols.  For example, requiring that the device
is "quiesced" when the _RUNNING bit is cleared and "frozen" when
pending_bytes is read has certain compatibility advantages versus
defining a new state bit.  Likewise, it might be fair to define that
userspace should not touch device MMIO during _RESUMING until after
the last bit of the device migration stream has been written, and then
it's free to touch MMIO before transitioning directly to the _RUNNING
state.

IOW, we at least need to entertain methods to achieve the
clarifications we're trying for within the existing uAPI rather than
toss out new device states and protocols at every turn for the sake of
API purity.  The rate at which we're proposing new states and required
transitions without a plan for the uAPI is not where I want to be for
adding the driver that could lock us into a supported uAPI.  Thanks,

Alex
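For reference, the save-side sequence being reinterpreted here, sketched against the v1 uAPI (data copy abbreviated, error handling elided); under this reading, clearing _RUNNING is the "quiesce" point and the first pending_bytes read is the "freeze" point.

#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <linux/vfio.h>

/*
 * Sketch of the v1 stop-and-copy flow.  Under the interpretation
 * above, the device would quiesce when _RUNNING is cleared and
 * freeze on the first pending_bytes read.
 */
static void save_device(int device_fd, uint64_t mig_off)
{
	uint32_t state = VFIO_DEVICE_STATE_SAVING;	/* _SAVING, !_RUNNING */
	uint64_t pending, data_offset, data_size;

	pwrite(device_fd, &state, sizeof(state),
	       mig_off + offsetof(struct vfio_device_migration_info,
				  device_state));

	for (;;) {
		pread(device_fd, &pending, sizeof(pending),
		      mig_off + offsetof(struct vfio_device_migration_info,
					 pending_bytes));
		if (!pending)
			break;

		pread(device_fd, &data_offset, sizeof(data_offset),
		      mig_off + offsetof(struct vfio_device_migration_info,
					 data_offset));
		pread(device_fd, &data_size, sizeof(data_size),
		      mig_off + offsetof(struct vfio_device_migration_info,
					 data_size));

		/* ...copy data_size bytes starting at mig_off + data_offset... */
	}
}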
So, I doubt that I'm the only person trying to follow this discussion who has lost the overview about issues and possible solutions here. I think it would be a good idea to summarize what has been brought up so far outside of this thread. To that effect, I've created an etherpad at https://etherpad.opendev.org/p/VFIOMigrationDiscussions and started filling it with some points. It would be great if others could fill in the blanks so that everyone has a chance to see what is on the table so far, so that we can hopefully discuss this on-list and come up with something that works.
On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote:

> We agreed that it's easier to add a feature than a restriction in a
> uAPI, so how do we resolve that some future device may require a new
> state in order to apply the SET_IRQS configuration?

I would say don't support those devices.  If there is even a hint that
they could maybe exist then we should fix it now.  Once the uapi is set
and documented we should expect device makers to consider it when
building their devices.

As for SET_IRQs, I have been looking at making documentation and I
don't like the way the documentation has to be written because of
this.

What I see as understandable, clear documentation is:

 - SAVING set - no device touches allowed beyond migration operations
   and reset via XX
   Must be set with !RUNNING

 - RESUMING set - same as SAVING

 - RUNNING cleared - limited device touches in this list: SET_IRQs, XX
   config, XX.
   Device may assume no touches outside the above. (ie no MMIO)
   Implies NDMA

 - NDMA set - full device touches
   Device may not issue DMA or interrupts (??)
   Device may not dirty pages

 - RUNNING set - full functionality

 * In no state may a device generate an error TLP, device
   hang/integrity failure or kernel integrity failure, no matter
   what userspace does.
   The device is permitted to corrupt the migration/VM or SEGV
   userspace if userspace doesn't follow the rules.

(we are trying to figure out what the XX's are right now, would
appreciate any help)

This is something I think we could expect a HW engineering team to
follow and implement in devices.  It doesn't complicate things.

Overall, at this moment, I would prioritize documentation clarity over
strict compatibility with qemu, because people have to follow this
documentation and make their devices long into the future.  If the
documentation is convoluted for compatibility reasons HW people are
more likely to get it wrong.  When HW people get it wrong they are more
likely to ask for "quirks" in the uAPI to fix their mistakes.

The pending_bytes P2P idea is also quite complicated to document, as
now we have to describe an HW state not in terms of an NDMA control
bit, but in terms of a bunch of implicit operations in a protocol.  Not
so nice.

So, here is what I propose.  Let us work on some documentation and come
up with the sort of HW-centric docs like the above, and we can then
decide if we want to make the qemu changes it will imply, or not.
We'll include the P2P stuff, as we see it, so it shows a whole picture.

I think that will help everyone participate fully in the discussion.

> If we're going to move forward with the existing uAPI, then we're going
> to need to start factoring compatibility into our discussions of
> missing states and protocols.  For example, requiring that the device
> is "quiesced" when the _RUNNING bit is cleared and "frozen" when
> pending_bytes is read has certain compatibility advantages versus
> defining a new state bit.

Not entirely; to support P2P, going from RESUMING directly to RUNNING
is not possible.  There must be an in-between state that all devices
reach before they go to RUNNING.  It seems P2P cannot be bolted into
the existing qemu flow with a kernel-only change?

> clarifications we're trying for within the existing uAPI rather than
> toss out new device states and protocols at every turn for the sake of
> API purity.  The rate at which we're proposing new states and required
> transitions without a plan for the uAPI is not where I want to be for
> adding the driver that could lock us into a supported uAPI.
Thanks,

Well, to be fair, the other cases where I suggested new states were
when you asked about features we don't have at all today (like
post-copy).  I think adding new states is a very reasonable way to
approach adding new features.  As long as new features can be
supported with new states we have a forward compatibility story.

Thanks,
Jason
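To illustrate why an NDMA-style state helps the multi-device case discussed in this thread, a rough sketch of a two-phase stop; the NDMA bit, the mig_dev structure, and set_device_state() are all hypothetical, for illustration only.

#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Hypothetical bit for illustration only; NDMA is not in the uAPI today. */
#define VFIO_DEVICE_STATE_NDMA	(1u << 3)

struct mig_dev {		/* illustrative per-device handle */
	int fd;
	uint64_t mig_off;	/* file offset of the migration region */
};

static void set_device_state(int fd, uint64_t mig_off, uint32_t state)
{
	pwrite(fd, &state, sizeof(state),
	       mig_off + offsetof(struct vfio_device_migration_info,
				  device_state));
}

/*
 * Two-phase stop of a multi-device (p2p-capable) VM: first every
 * device stops generating DMA while still accepting MMIO (so p2p
 * writes from not-yet-stopped peers cannot corrupt anything), then
 * each device is moved to stop-and-copy.
 */
static void stop_all_devices(struct mig_dev *devs, int n)
{
	int i;

	for (i = 0; i < n; i++)
		set_device_state(devs[i].fd, devs[i].mig_off,
				 VFIO_DEVICE_STATE_NDMA |
				 VFIO_DEVICE_STATE_RUNNING);

	for (i = 0; i < n; i++)
		set_device_state(devs[i].fd, devs[i].mig_off,
				 VFIO_DEVICE_STATE_SAVING);
}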
On Fri, 5 Nov 2021 10:24:04 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote: > > > We agreed that it's easier to add a feature than a restriction in a > > uAPI, so how do we resolve that some future device may require a new > > state in order to apply the SET_IRQS configuration? > > I would say don't support those devices. If there is even a hint that > they could maybe exist then we should fix it now. Once the uapi is set > and documented we should expect device makers to consider it when > building their devices. > > As for SET_IRQs, I have been looking at making documentation and I > don't like the way the documentation has to be wrriten because of > this. > > What I see as an understandable, clear, documentation is: > > - SAVING set - no device touches allowed beyond migration operations > and reset via XX I'd suggest defining reset via ioctl only. > Must be set with !RUNNING Not sure what this means. Pre-copy requires SAVING and RUNNING together, is this only suggesting that to get the final device state we need to do so in a !RUNNING state? > - RESUMING set - same as SAVING I take it then that we're defining a new protocol if we can't do SET_IRQS here. > - RUNNING cleared - limited device touches in this list: SET_IRQs, XX > config, XX. > Device may assume no touches outside the above. (ie no MMIO) > Implies NDMA SET_IRQS is MMIO, is the distinction userspace vs kernel? > - NDMA set - full device touches > Device may not issue DMA or interrupts (??) > Device may not dirty pages Is this achievable? We can't bound the time where incoming DMA is possible, devices don't have infinite buffers. > - RUNNING set - full functionality > * In no state may a device generate an error TLP, device > hang/integrity failure or kernel intergity failure, no matter > what userspace does. > The device is permitted to corrupt the migration/VM or SEGV > userspace if userspace doesn't follow the rules. > > (we are trying to figure out what the XX's are right now, would > appreciate any help) > > This is something I think we could expect a HW engineering team to > follow and implement in devices. It doesn't complicate things. > > Overall, at this moment, I would prioritize documentation clarity over > strict compatability with qemu, because people have to follow this > documentation and make their devices long into the future. If the > documentation is convoluted for compatibility reasons HW people are > more likely to get it wrong. When HW people get it wrong they are more > likely to ask for "quirks" in the uAPI to fix their mistakes. I might still suggest a v2 migration sub-type, we'll just immediately deprecate the original as we have no users and QEMU would modify all support to find only the new sub-type as code is updated. "v1" never really materialized, but we can avoid future confusion if it's never produced by in-tree drivers and never consume by mainstream userspace. > The pending_bytes P2P idea is also quite complicated to document as > now we have to describe an HW state not in terms of a NDMA control > bit, but in terms of a bunch of implicit operations in a protocol. Not > so nice. > > So, here is what I propose. Let us work on some documentation and come > up with the sort of HW centric docs like above and we can then decide > if we want to make the qemu changes it will imply, or not. We'll > include the P2P stuff, as we see it, so it shows a whole picture. > > I think that will help everyone participate fully in the discussion. 
Good plan.

> > If we're going to move forward with the existing uAPI, then we're going
> > to need to start factoring compatibility into our discussions of
> > missing states and protocols.  For example, requiring that the device
> > is "quiesced" when the _RUNNING bit is cleared and "frozen" when
> > pending_bytes is read has certain compatibility advantages versus
> > defining a new state bit.
>
> Not entirely; to support P2P, going from RESUMING directly to RUNNING
> is not possible.  There must be an in-between state that all devices
> reach before they go to RUNNING.  It seems P2P cannot be bolted into
> the existing qemu flow with a kernel-only change?

Perhaps, yes.

> > clarifications we're trying for within the existing uAPI rather than
> > toss out new device states and protocols at every turn for the sake of
> > API purity.  The rate at which we're proposing new states and required
> > transitions without a plan for the uAPI is not where I want to be for
> > adding the driver that could lock us into a supported uAPI.  Thanks,
>
> Well, to be fair, the other cases where I suggested new states were
> when you asked about features we don't have at all today (like
> post-copy).  I think adding new states is a very reasonable way to
> approach adding new features.  As long as new features can be
> supported with new states we have a forward compatibility story.

That has a viable upgrade path; I'm on board with that.  A device that
imposes that it can't do SET_IRQS while RESUMING, when we have no
required state in between RESUMING and RUNNING, is the sort of issue
that I'm going to get hung up on.  I take it from the above that you're
building that state transition requirement into the uAPI now.  Thanks,

Alex
On Thu, Nov 04 2021, Cornelia Huck <cohuck@redhat.com> wrote: > So, I doubt that I'm the only person trying to follow this discussion > who has lost the overview about issues and possible solutions here. I > think it would be a good idea to summarize what has been brought up so > far outside of this thread. > > To that effect, I've created an etherpad at > https://etherpad.opendev.org/p/VFIOMigrationDiscussions and started > filling it with some points. It would be great if others could fill in > the blanks so that everyone has a chance to see what is on the table so > far, so that we can hopefully discuss this on-list and come up with > something that works. ...just to clarify, my idea was that we could have a writeup of the various issues and proposed solutions on the etherpad, and then post the contents on-list next week as a starting point for a discussion that is not hidden deeply inside the discussion on a patch set. So, please continue adding points :)
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Tuesday, October 26, 2021 11:19 PM > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > > > This is also why I don't like it being so transparent as it is > > > something userspace needs to care about - especially if the HW cannot > > > support such a thing, if we intend to allow that. > > > > Userspace does need to care, but userspace's concern over this should > > not be able to compromise the platform and therefore making VF > > assignment more susceptible to fatal error conditions to comply with a > > migration uAPI is troublesome for me. > > It is an interesting scenario. > > I think it points that we are not implementing this fully properly. > > The !RUNNING state should be like your reset efforts. > > All access to the MMIO memories from userspace should be revoked > during !RUNNING This assumes that vCPUs must be stopped before !RUNNING is entered in virtualization case. and it is true today. But it may not hold when talking about guest SVA and I/O page fault [1]. The problem is that the pending requests may trigger I/O page faults on guest page tables. W/o running vCPUs to handle those faults, the quiesce command cannot complete draining the pending requests if the device doesn't support preempt-on-fault (at least it's the case for some Intel and Huawei devices, possibly true for most initial SVA implementations). Of course migrating guest SVA requires more changes as discussed in [1]. Here just want to point out this forward-looking requirement so any definition change in this thread won't break that usage. [1] https://lore.kernel.org/qemu-devel/06cb5bfd-f6f8-b61b-1a7e-60a9ae2f8fac@nvidia.com/T/ (p.s. 'stop device' in [1] means 'quiesce device' in this thread) Thanks, Kevin > > All VMAs zap'd. > > All IOMMU peer mappings invalidated. > > The kernel should directly block userspace from causing a MMIO TLP > before the device driver goes to !RUNNING. > > Then the question of what the device does at this edge is not > relevant as hostile userspace cannot trigger it. > > The logical way to implement this is to key off running and > block/unblock MMIO access when !RUNNING. > > To me this strongly suggests that the extra bit is the correct way > forward as the driver is much simpler to implement and understand if > RUNNING directly controls the availability of MMIO instead of having > an irregular case where !RUNNING still allows MMIO but only until a > pending_bytes read. > > Given the complexity of this can we move ahead with the current > mlx5_vfio and Yishai&co can come with some followup proposal to split > the freeze/queice and block MMIO? > > Jason
On Mon, Nov 08, 2021 at 08:53:20AM +0000, Tian, Kevin wrote: > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Tuesday, October 26, 2021 11:19 PM > > > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > > > > > This is also why I don't like it being so transparent as it is > > > > something userspace needs to care about - especially if the HW cannot > > > > support such a thing, if we intend to allow that. > > > > > > Userspace does need to care, but userspace's concern over this should > > > not be able to compromise the platform and therefore making VF > > > assignment more susceptible to fatal error conditions to comply with a > > > migration uAPI is troublesome for me. > > > > It is an interesting scenario. > > > > I think it points that we are not implementing this fully properly. > > > > The !RUNNING state should be like your reset efforts. > > > > All access to the MMIO memories from userspace should be revoked > > during !RUNNING > > This assumes that vCPUs must be stopped before !RUNNING is entered > in virtualization case. and it is true today. > > But it may not hold when talking about guest SVA and I/O page fault [1]. > The problem is that the pending requests may trigger I/O page faults > on guest page tables. W/o running vCPUs to handle those faults, the > quiesce command cannot complete draining the pending requests > if the device doesn't support preempt-on-fault (at least it's the case for > some Intel and Huawei devices, possibly true for most initial SVA > implementations). It cannot be ordered any other way. vCPUs must be stopped first, then the PCI devices must be stopped after, otherwise the vCPU can touch a stopped a device while handling a fault which is unreasonable. However, migrating a pending IOMMU fault does seem unreasonable as well. The NDA state can potentially solve this: RUNNING | VCPU RUNNING - Normal NDMA | RUNNING | VCPU RUNNING - Halt and flush DMA, and thus all faults NDMA | RUNNING - Halt all MMIO access 0 - Halted everything Though this may be more disruptive to the vCPUs as they could spin on DMA/interrupts that will not come. Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Monday, November 8, 2021 8:36 PM > > On Mon, Nov 08, 2021 at 08:53:20AM +0000, Tian, Kevin wrote: > > > From: Jason Gunthorpe <jgg@nvidia.com> > > > Sent: Tuesday, October 26, 2021 11:19 PM > > > > > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > > > > > > > This is also why I don't like it being so transparent as it is > > > > > something userspace needs to care about - especially if the HW > cannot > > > > > support such a thing, if we intend to allow that. > > > > > > > > Userspace does need to care, but userspace's concern over this should > > > > not be able to compromise the platform and therefore making VF > > > > assignment more susceptible to fatal error conditions to comply with a > > > > migration uAPI is troublesome for me. > > > > > > It is an interesting scenario. > > > > > > I think it points that we are not implementing this fully properly. > > > > > > The !RUNNING state should be like your reset efforts. > > > > > > All access to the MMIO memories from userspace should be revoked > > > during !RUNNING > > > > This assumes that vCPUs must be stopped before !RUNNING is entered > > in virtualization case. and it is true today. > > > > But it may not hold when talking about guest SVA and I/O page fault [1]. > > The problem is that the pending requests may trigger I/O page faults > > on guest page tables. W/o running vCPUs to handle those faults, the > > quiesce command cannot complete draining the pending requests > > if the device doesn't support preempt-on-fault (at least it's the case for > > some Intel and Huawei devices, possibly true for most initial SVA > > implementations). > > It cannot be ordered any other way. > > vCPUs must be stopped first, then the PCI devices must be stopped > after, otherwise the vCPU can touch a stopped a device while handling > a fault which is unreasonable. > > However, migrating a pending IOMMU fault does seem unreasonable as well. > > The NDA state can potentially solve this: > > RUNNING | VCPU RUNNING - Normal > NDMA | RUNNING | VCPU RUNNING - Halt and flush DMA, and thus all > faults > NDMA | RUNNING - Halt all MMIO access should be two steps? NDMA | RUNNING - vCPU stops access to the device NDMA - halt all MMIO access by revoking mapping > 0 - Halted everything yes, adding a new state sounds better than reordering the vcpu/device stop sequence. > > Though this may be more disruptive to the vCPUs as they could spin on > DMA/interrupts that will not come. it's inevitable regardless how we define the migration states. the actual impact depends on how long 'Halt and flush DMA' will take. Thanks Kevin
On Tue, Nov 09, 2021 at 12:58:26AM +0000, Tian, Kevin wrote: > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Monday, November 8, 2021 8:36 PM > > > > On Mon, Nov 08, 2021 at 08:53:20AM +0000, Tian, Kevin wrote: > > > > From: Jason Gunthorpe <jgg@nvidia.com> > > > > Sent: Tuesday, October 26, 2021 11:19 PM > > > > > > > > On Tue, Oct 26, 2021 at 08:42:12AM -0600, Alex Williamson wrote: > > > > > > > > > > This is also why I don't like it being so transparent as it is > > > > > > something userspace needs to care about - especially if the HW > > cannot > > > > > > support such a thing, if we intend to allow that. > > > > > > > > > > Userspace does need to care, but userspace's concern over this should > > > > > not be able to compromise the platform and therefore making VF > > > > > assignment more susceptible to fatal error conditions to comply with a > > > > > migration uAPI is troublesome for me. > > > > > > > > It is an interesting scenario. > > > > > > > > I think it points that we are not implementing this fully properly. > > > > > > > > The !RUNNING state should be like your reset efforts. > > > > > > > > All access to the MMIO memories from userspace should be revoked > > > > during !RUNNING > > > > > > This assumes that vCPUs must be stopped before !RUNNING is entered > > > in virtualization case. and it is true today. > > > > > > But it may not hold when talking about guest SVA and I/O page fault [1]. > > > The problem is that the pending requests may trigger I/O page faults > > > on guest page tables. W/o running vCPUs to handle those faults, the > > > quiesce command cannot complete draining the pending requests > > > if the device doesn't support preempt-on-fault (at least it's the case for > > > some Intel and Huawei devices, possibly true for most initial SVA > > > implementations). > > > > It cannot be ordered any other way. > > > > vCPUs must be stopped first, then the PCI devices must be stopped > > after, otherwise the vCPU can touch a stopped a device while handling > > a fault which is unreasonable. > > > > However, migrating a pending IOMMU fault does seem unreasonable as well. > > > > The NDA state can potentially solve this: > > > > RUNNING | VCPU RUNNING - Normal > > NDMA | RUNNING | VCPU RUNNING - Halt and flush DMA, and thus all > > faults > > NDMA | RUNNING - Halt all MMIO access > > should be two steps? > > NDMA | RUNNING - vCPU stops access to the device > NDMA - halt all MMIO access by revoking mapping No, NDMA without running is equivalent to 0, which is the next step: > > 0 - Halted everything Jason
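A minimal sketch of the device_state arithmetic in the exchange above, for orientation. The RUNNING/SAVING/RESUMING bits are the existing v1 migration region definitions from <linux/vfio.h> of this era; the NDMA bit is only the proposal being discussed in this thread and is assumed here, it does not exist in any header:

/* Existing v1 migration bits (uapi <linux/vfio.h> of this era). */
#define VFIO_DEVICE_STATE_RUNNING       (1 << 0)
#define VFIO_DEVICE_STATE_SAVING        (1 << 1)
#define VFIO_DEVICE_STATE_RESUMING      (1 << 2)

/* Hypothetical bit proposed in this thread; not part of the uapi. */
#define VFIO_DEVICE_STATE_NDMA          (1 << 3)

/*
 * Stop sequence sketched above, from the VMM's point of view:
 *
 *   RUNNING              vCPUs running, device fully live
 *   NDMA | RUNNING       vCPUs still running; device halts and flushes
 *                        DMA, so outstanding I/O page faults can still
 *                        be serviced by the guest
 *   NDMA | RUNNING       vCPUs stopped; MMIO access may now be revoked
 *   0                    everything halted (NDMA without RUNNING is
 *                        equivalent to this state)
 */

The two NDMA | RUNNING rows differ only in whether the vCPUs are still scheduled, which is a VMM-side distinction rather than an extra device_state bit.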
On Fri, Nov 05, 2021 at 09:31:45AM -0600, Alex Williamson wrote: > On Fri, 5 Nov 2021 10:24:04 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote: > > > > > We agreed that it's easier to add a feature than a restriction in a > > > uAPI, so how do we resolve that some future device may require a new > > > state in order to apply the SET_IRQS configuration? > > > > I would say don't support those devices. If there is even a hint that > > they could maybe exist then we should fix it now. Once the uapi is set > > and documented we should expect device makers to consider it when > > building their devices. > > > > As for SET_IRQs, I have been looking at making documentation and I > > don't like the way the documentation has to be wrriten because of > > this. > > > > What I see as an understandable, clear, documentation is: > > > > - SAVING set - no device touches allowed beyond migration operations > > and reset via XX > > I'd suggest defining reset via ioctl only. > > > Must be set with !RUNNING > > Not sure what this means. Pre-copy requires SAVING and RUNNING > together, is this only suggesting that to get the final device state we > need to do so in a !RUNNING state? Sorry, I did not think about pre-copy here, mlx5 doesn't do it so I'm not as familiar > > - RESUMING set - same as SAVING > > I take it then that we're defining a new protocol if we can't do > SET_IRQS here. We've been working on some documentation and one of the challenges turns out that all the PCI device state owned by other subsystems (eg the PCI core, the interrupt code, power management, etc) must be kept in sync. No matter what RESUMING cannot just async change device state that the kernel assumes it is controlling. So, in practice, this necessarily requires forbidding the device from touching the MSI table, and other stuff, during RESUMING. Further, since we can't just halt all the other kernel subsystems during SAVING/RESUMING the device must be able to accept touches in those areas, for completely unrelated reasons, (eg a MSI addr/data being changed) safely. Seems like no need to change SET_IRQs. > > - NDMA set - full device touches > > Device may not issue DMA or interrupts (??) > > Device may not dirty pages > > Is this achievable? We can't bound the time where incoming DMA is > possible, devices don't have infinite buffers. It is a necessary evil for migration. The device cannot know how long it will be suspended for and must cope. With networking discarded packets can be resent, but the reality is that real deployments need a QOS that the device will not be paused for too long otherwise the peers may declare the node dead. > > Not entirely, to support P2P going from RESUMING directly to RUNNING > > is not possible. There must be an in between state that all devices > > reach before they go to RUNNING. It seems P2P cannot be bolted into > > the existing qmeu flow with a kernel only change? > > Perhaps, yes. We have also been looking at dirty tracking and we are wondering how that should work. (Dirty tracking will be another followup) If we look at mlx5, it will have built in dirty tracking, and when used with a newer IOMMUs there is also system dirty tracking available. I think userspace should decide if it wants to use mlx5 built in or the system IOMMU to do dirty tracking. Presumably the system IOMMU is turned on via VFIO_IOMMU_DIRTY_PAGES_FLAG_START, but what controls if the mlx5 mechanism should be used or not? 
mlx5 also has no way to return the dirty log. If the system IOMMU is not used then VFIO_IOMMU_DIRTY_PAGES_FLAG_START should not be done; however, that is what controls all the logic under the two GET_BITMAP APIs. (Even if that were fixed, I don't really like the idea of the IOMMU extracting this data from the migration driver in the context of iommufd.) Further, how does mlx5 even report that it has dirty tracking? Was there some plan here we are missing? In light of all this I'm wondering if device dirty tracking should exist as new ioctls on the device FD, and reserve the type1 code for IOMMU dirty tracking only. Jason
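For readers who want to see the container-level interface being referred to, a minimal userspace sketch of the existing type1 dirty tracking ioctls; error handling is omitted and the 4 KiB granularity and caller-provided bitmap are illustrative choices, not anything mandated by the thread:

#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <stdlib.h>

/* Enable container-level tracking; from this point every mapped page is
 * considered dirty until something (e.g. page pinning) narrows the scope. */
static int start_dirty_tracking(int container_fd)
{
        struct vfio_iommu_type1_dirty_bitmap dirty = {
                .argsz = sizeof(dirty),
                .flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
        };

        return ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
}

/* Fetch the dirty bitmap for one mapped IOVA range at 4 KiB granularity;
 * bitmap must be caller-allocated and large enough for one bit per page. */
static int get_dirty_bitmap(int container_fd, __u64 iova, __u64 size, __u64 *bitmap)
{
        size_t argsz = sizeof(struct vfio_iommu_type1_dirty_bitmap) +
                       sizeof(struct vfio_iommu_type1_dirty_bitmap_get);
        struct vfio_iommu_type1_dirty_bitmap *dbitmap = calloc(1, argsz);
        struct vfio_iommu_type1_dirty_bitmap_get *range = (void *)dbitmap->data;
        int ret;

        dbitmap->argsz = argsz;
        dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
        range->iova = iova;
        range->size = size;
        range->bitmap.pgsize = 4096;                      /* one bit per 4 KiB page */
        range->bitmap.size = (size / 4096 + 63) / 64 * 8; /* bitmap bytes, 64-bit aligned */
        range->bitmap.data = bitmap;

        ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
        free(dbitmap);
        return ret;
}

There is no hook in this flow for a device-internal tracker such as the one mlx5 provides, which is what the questions above are getting at.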
On Fri, Nov 05 2021, Cornelia Huck <cohuck@redhat.com> wrote: > On Thu, Nov 04 2021, Cornelia Huck <cohuck@redhat.com> wrote: > >> So, I doubt that I'm the only person trying to follow this discussion >> who has lost the overview about issues and possible solutions here. I >> think it would be a good idea to summarize what has been brought up so >> far outside of this thread. >> >> To that effect, I've created an etherpad at >> https://etherpad.opendev.org/p/VFIOMigrationDiscussions and started >> filling it with some points. It would be great if others could fill in >> the blanks so that everyone has a chance to see what is on the table so >> far, so that we can hopefully discuss this on-list and come up with >> something that works. > > ...just to clarify, my idea was that we could have a writeup of the > various issues and proposed solutions on the etherpad, and then post the > contents on-list next week as a starting point for a discussion that is > not hidden deeply inside the discussion on a patch set. > > So, please continue adding points :) Last call for anything you want to add to the etherpad; I'll post the contents tomorrow. [Yes, I wanted to post it last week, but got sidetracked.]
On Mon, 15 Nov 2021 19:29:21 -0400 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Fri, Nov 05, 2021 at 09:31:45AM -0600, Alex Williamson wrote: > > On Fri, 5 Nov 2021 10:24:04 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Wed, Nov 03, 2021 at 12:04:11PM -0600, Alex Williamson wrote: > > > > > > > We agreed that it's easier to add a feature than a restriction in a > > > > uAPI, so how do we resolve that some future device may require a new > > > > state in order to apply the SET_IRQS configuration? > > > > > > I would say don't support those devices. If there is even a hint that > > > they could maybe exist then we should fix it now. Once the uapi is set > > > and documented we should expect device makers to consider it when > > > building their devices. > > > > > > As for SET_IRQs, I have been looking at making documentation and I > > > don't like the way the documentation has to be wrriten because of > > > this. > > > > > > What I see as an understandable, clear, documentation is: > > > > > > - SAVING set - no device touches allowed beyond migration operations > > > and reset via XX > > > > I'd suggest defining reset via ioctl only. > > > > > Must be set with !RUNNING > > > > Not sure what this means. Pre-copy requires SAVING and RUNNING > > together, is this only suggesting that to get the final device state we > > need to do so in a !RUNNING state? > > Sorry, I did not think about pre-copy here, mlx5 doesn't do it so I'm > not as familiar > > > > - RESUMING set - same as SAVING > > > > I take it then that we're defining a new protocol if we can't do > > SET_IRQS here. > > We've been working on some documentation and one of the challenges > turns out that all the PCI device state owned by other subsystems (eg > the PCI core, the interrupt code, power management, etc) must be kept > in sync. No matter what RESUMING cannot just async change device state > that the kernel assumes it is controlling. > > So, in practice, this necessarily requires forbidding the device from > touching the MSI table, and other stuff, during RESUMING. > > Further, since we can't just halt all the other kernel subsystems > during SAVING/RESUMING the device must be able to accept touches in > those areas, for completely unrelated reasons, (eg a MSI addr/data > being changed) safely. > > Seems like no need to change SET_IRQs. > > > > > - NDMA set - full device touches > > > Device may not issue DMA or interrupts (??) > > > Device may not dirty pages > > > > Is this achievable? We can't bound the time where incoming DMA is > > possible, devices don't have infinite buffers. > > It is a necessary evil for migration. > > The device cannot know how long it will be suspended for and must > cope. With networking discarded packets can be resent, but the reality > is that real deployments need a QOS that the device will not be paused > for too long otherwise the peers may declare the node dead. > > > > Not entirely, to support P2P going from RESUMING directly to RUNNING > > > is not possible. There must be an in between state that all devices > > > reach before they go to RUNNING. It seems P2P cannot be bolted into > > > the existing qmeu flow with a kernel only change? > > > > Perhaps, yes. > > We have also been looking at dirty tracking and we are wondering how > that should work. (Dirty tracking will be another followup) > > If we look at mlx5, it will have built in dirty tracking, and when > used with a newer IOMMUs there is also system dirty tracking > available. 
> > I think userspace should decide if it wants to use mlx5 built in or > the system IOMMU to do dirty tracking. What information does userspace use to inform such a decision? Ultimately userspace just wants the finest granularity of tracking, shouldn't that guide our decisions which to provide? > Presumably the system IOMMU is turned on via > VFIO_IOMMU_DIRTY_PAGES_FLAG_START, but what controls if the mlx5 > mechanism should be used or not? > > mlx5 also has no way to return the dirty log. If the system IOMMU is > not used then VFIO_IOMMU_DIRTY_PAGES_FLAG_START should not be done, > however that is what controls all the logic under the two GET_BITMAP > APIs. (even if fixed I don't really like the idea of the IOMMU > extracting this data from the migration driver in the context of > iommufd) > > Further how does mlx5 even report that it has dirty tracking? > > Was there some plan here we are missing? I believe the intended progression of dirty tracking is that by default all mapped ranges are dirty. If the device supports page pinning, then we reduce the set of dirty pages to those pages which are pinned. A device that doesn't otherwise need page pinning, such as a fully IOMMU backed device, would use gratuitous page pinning triggered by the _SAVING state activation on the device. It sounds like mlx5 could use this existing support today. We had also discussed variants to page pinning that might be more useful as device dirty page support improves. For example calls to mark pages dirty once rather than the perpetual dirtying of pinned pages, calls to pin pages for read vs write, etc. We didn't dive much into system IOMMU dirtying, but presumably we'd have a fault handler triggered if a page is written by the device and go from there. > In light of all this I'm wondering if device dirty tracking should > exist as new ioctls on the device FD and reserve the type1 code to > only work the IOMMU dirty tracking. Our existing model is working towards the IOMMU, ie. container, interface aggregating dirty page context. For example when page pinning is used, it's only when all devices within the container are using page pinning that we can report the pinned subset as dirty. Otherwise userspace needs to poll each device, which I suppose enables your idea that userspace decides which source to use, but why? Does the IOMMU dirty page tracking exclude devices if the user queries the device separately? How would it know? What's the advantage? It seems like this creates too many support paths that all need to converge on the same answer. Consolidating DMA dirty page tracking to the DMA mapping interface for all devices within a DMA context makes more sense to me. Thanks, Alex
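A conceptual sketch of the "least common denominator" aggregation described above: the container only narrows its report to the pinned subset once every attached group has opted into pinned-page dirty scope. This is illustration only, not the actual vfio_iommu_type1 code; the structure and field names are placeholders:

#include <stdbool.h>

struct group {
        bool pinned_page_dirty_scope;   /* group reports DMA writes via page pinning */
};

/* One un-enlightened device forces the worst case for the whole container. */
static bool container_uses_pinned_scope(const struct group *groups, int ngroups)
{
        for (int i = 0; i < ngroups; i++)
                if (!groups[i].pinned_page_dirty_scope)
                        return false;
        return true;
}

static bool page_reported_dirty(const struct group *groups, int ngroups,
                                bool page_was_pinned)
{
        if (!container_uses_pinned_scope(groups, ngroups))
                return true;            /* worst case: every mapped page is dirty */
        return page_was_pinned;
}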
On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote: > > I think userspace should decide if it wants to use mlx5 built in or > > the system IOMMU to do dirty tracking. > > What information does userspace use to inform such a decision? Kernel can't know which approach performs better. Operators should benchmark and make a choice for their deployment HW. Maybe device tracking severely impacts device performance or vice versa. Kernel doesn't easily know what userspace has done, maybe one device supports migration driver dirty tracking and one device does not. Is user space going to use a system IOMMU for both devices? Is it going to put the simple device in NDMA early and continue to dirty track to shutdown the other devices? > Ultimately userspace just wants the finest granularity of tracking, > shouldn't that guide our decisions which to provide? At least for mlx5 there is going to some trade off curve of device performance, dirty tracking page size, and working set. Even lower is better is not necessarily true. After overheads on a 400GB RDMA NIC there is not such a big difference between doing a 4k and 16k scatter transfer. The CPU work to process all the extra bitmap data may not be a net win compared to block transfer times. Conversly someone doing 1G TCP transfers probably cares a lot to minimize block size. Overall, I think there is far too much up in the air and unmeasured to firmly commit the kernel to a fixed policy. So, I would like to see userspace control most of the policy aspects, including the dirty track provider. > I believe the intended progression of dirty tracking is that by default > all mapped ranges are dirty. If the device supports page pinning, then > we reduce the set of dirty pages to those pages which are pinned. A > device that doesn't otherwise need page pinning, such as a fully IOMMU How does userspace know if dirty tracking works or not? All I see VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocs some bitmaps. I'm surprised it doesn't check that only NO_IOMMU's devices are attached to the container and refuse to dirty track otherwise - since it doesn't work.. > backed device, would use gratuitous page pinning triggered by the > _SAVING state activation on the device. It sounds like mlx5 could use > this existing support today. How does mlx5 know if it should turn on its dirty page tracking on SAVING or if the system IOMMU covers it? Or for some reason userspace doesn't want dirty tracking but is doing pre-copy? When we mix dirty track with pre-copy, the progression seems to be: DITRY TRACKING | RUNNING Copy every page to the remote DT | SAVING | RUNNING Copy pre-copy migration data to the remote SAVING | NDMA | RUNNING Read and clear dirty track device bitmap DT | SAVING | RUNNING Copy new dirtied data (maybe loop back to NDMA a few times?) SAVING | NDMA | RUNNING P2P grace state 0 Read the dirty track and copy data Read and send the migration state Can we do something so complex using only SAVING? .. and along the lines of the above how do we mix in NDMA to the iommu container, and how does it work if only some devices support NDMA? > We had also discussed variants to page pinning that might be more > useful as device dirty page support improves. For example calls to > mark pages dirty once rather than the perpetual dirtying of pinned > pages, calls to pin pages for read vs write, etc. We didn't dive much > into system IOMMU dirtying, but presumably we'd have a fault handler > triggered if a page is written by the device and go from there. 
Would be interesting to know for sure what current IOMMU HW has done. I'm supposing the easiest implementation is to write a dirty bit to the IO PTE the same as the CPU writes a dirty bit the normal PTE. > > In light of all this I'm wondering if device dirty tracking should > > exist as new ioctls on the device FD and reserve the type1 code to > > only work the IOMMU dirty tracking. > > Our existing model is working towards the IOMMU, ie. container, > interface aggregating dirty page context. This creates inefficiencies in the kernel, we copy from the mlx5 formed data structure to new memory in the iommu through a very ineffficent API and then again we do an ioctl to copy it once more and throw all the extra work away. It does not seem good for something where we want performance. > For example when page pinning is used, it's only when all devices > within the container are using page pinning that we can report the > pinned subset as dirty. Otherwise userspace needs to poll each > device, which I suppose enables your idea that userspace decides > which source to use, but why? Efficiency, and user selectable policy. Userspace can just allocate an all zeros bitmap and feed it to each of the providers in the kernel using a 'or in your dirty' semantic. No redundant kernel data marshaling, userspace gets to decide which tracking provider to use, and it is simple to implement in the kernel. Userspace has to do this anyhow if it has configurations with multiple containers. For instance because it was forced to split the containers due to one device not supporting NDMA. > Does the IOMMU dirty page tracking exclude devices if the user > queries the device separately? What makes sense to me is multiple tracking providers. Each can be turned on and off. If the container tracking provider says it supports tracking then it means it can track DMA from every device it is connected to (unlike today?). eg by using IOMMU HW that naturally does this, or by only having only NO_IOMMU devices. If the migration driver says it supports tracking, then it only tracks DMA from that device. > How would it know? What's the advantage? It seems like this > creates too many support paths that all need to converge on the same > answer. Consolidating DMA dirty page tracking to the DMA mapping > interface for all devices within a DMA context makes more sense to > me. What I see is a lot of questions and limitations with this approach. If we stick to funneling everything through the iommu then answering the questions seem to create a large amount of kernel work. Enough to ask if it is worthwhile.. .. and then we have to ask how does this all work in IOMMUFD where it is not so reasonable to tightly couple the migration driver and the IOAS and I get more questions :) Jason
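To make the "or in your dirty" idea concrete, a rough userspace-side sketch. None of the per-provider calls exist today; or_in_dirty() is a placeholder for whatever per-container or per-device ioctl such a proposal would end up defining:

#include <stdint.h>

struct tracker {
        int fd;                         /* container fd or device fd */
        int (*or_in_dirty)(int fd, uint64_t iova, uint64_t size,
                           uint64_t pgsize, uint64_t *bitmap);
};

/* Userspace supplies one pre-zeroed bitmap and asks each enabled tracking
 * provider to OR in the pages it saw dirtied. */
static int collect_dirty(const struct tracker *trackers, int ntrackers,
                         uint64_t iova, uint64_t size, uint64_t pgsize,
                         uint64_t *bitmap /* pre-zeroed by the caller */)
{
        for (int i = 0; i < ntrackers; i++)
                if (trackers[i].or_in_dirty(trackers[i].fd, iova, size,
                                            pgsize, bitmap))
                        return -1;
        /* bitmap is now the union of what every provider saw as dirty */
        return 0;
}

The policy question of which providers end up in the trackers[] array is left entirely to userspace, which is the point of the proposal.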
On Tue, 16 Nov 2021 15:25:05 -0400 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote: > > > > I think userspace should decide if it wants to use mlx5 built in or > > > the system IOMMU to do dirty tracking. > > > > What information does userspace use to inform such a decision? > > Kernel can't know which approach performs better. Operators should > benchmark and make a choice for their deployment HW. Maybe device > tracking severely impacts device performance or vice versa. I'm all for keeping policy decisions out of the kernel, but it's pretty absurd to expect a userspace operator to benchmark various combination and wire various knobs through the user interface for this. It seems to me that the kernel, ie. the vfio variant driver, *can* know the best default. We can design in interfaces so that the driver may, for example, know whether to pin pages or defer to the system IOMMU dirty tracking. The driver provider can provide quirks for IOMMU implementations that perform poorly versus device provided alternatives. The driver can know if a device iotlb cache provides the better result. The driver can provide module options or devlink tweaks to change the behavior. This seems like something userspace doesn't want to care about in the common path. > Kernel doesn't easily know what userspace has done, maybe one device > supports migration driver dirty tracking and one device does not. And that's exactly why the current type1 implementation exposes the least common denominator to userspace, ie. pinned pages only if all devices in the container have enabled this degree of granularity. > Is user space going to use a system IOMMU for both devices? If the system IOMMU supports it and none of the drivers have opt'd to report via other means, yes. > Is it going to put the simple device in NDMA early and continue to > dirty track to shutdown the other devices? Yes, the current model could account for this, the device entering NDMA mode effectively becomes enlightened because it knows that it is no longer dirtying pages. It'd be the same as a driver turning on page pinning with _SAVING, we'd just need a way to do that without actually pinning a page. > > Ultimately userspace just wants the finest granularity of tracking, > > shouldn't that guide our decisions which to provide? > > At least for mlx5 there is going to some trade off curve of device > performance, dirty tracking page size, and working set. > > Even lower is better is not necessarily true. After overheads on a > 400GB RDMA NIC there is not such a big difference between doing a 4k > and 16k scatter transfer. The CPU work to process all the extra bitmap > data may not be a net win compared to block transfer times. > > Conversly someone doing 1G TCP transfers probably cares a lot to > minimize block size. > > Overall, I think there is far too much up in the air and unmeasured to > firmly commit the kernel to a fixed policy. > > So, I would like to see userspace control most of the policy aspects, > including the dirty track provider. This sounds like device specific migration parameter tuning via a devlink interface to me, tbh. How would you propose a generic vfio/iommufd interface to tune this sort of thing? > > I believe the intended progression of dirty tracking is that by default > > all mapped ranges are dirty. If the device supports page pinning, then > > we reduce the set of dirty pages to those pages which are pinned. 
A > > device that doesn't otherwise need page pinning, such as a fully IOMMU > > How does userspace know if dirty tracking works or not? All I see > VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocs some bitmaps. IIRC, it's always supported by type1. In the worst case we always report all mapped pages as dirty. > I'm surprised it doesn't check that only NO_IOMMU's devices are > attached to the container and refuse to dirty track otherwise - since > it doesn't work.. No-IOMMU doesn't use type1, the ioctl returns errno. > > backed device, would use gratuitous page pinning triggered by the > > _SAVING state activation on the device. It sounds like mlx5 could use > > this existing support today. > > How does mlx5 know if it should turn on its dirty page tracking on > SAVING or if the system IOMMU covers it? Or for some reason userspace > doesn't want dirty tracking but is doing pre-copy? Likely there'd be some sort of IOMMU property the driver could check, type1 would need to figure out the same. The type1/iommufd interfaces to the driver could evolve so that the driver can know if DMA dirty tracking is enabled by the user. > When we mix dirty track with pre-copy, the progression seems to be: > > DITRY TRACKING | RUNNING > Copy every page to the remote > DT | SAVING | RUNNING > Copy pre-copy migration data to the remote > SAVING | NDMA | RUNNING > Read and clear dirty track device bitmap > DT | SAVING | RUNNING > Copy new dirtied data > (maybe loop back to NDMA a few times?) > SAVING | NDMA | RUNNING > P2P grace state > 0 > Read the dirty track and copy data > Read and send the migration state > > Can we do something so complex using only SAVING? I'm not demanding that triggering device dirty tracking on saving is how this must be done, I'm only stating that's an idea that was discussed. If we need more complicated triggers between the IOMMU and device, let's define those, but I don't see that doing so negates the benefits of aggregated dirty bitmaps in the IOMMU context. > .. and along the lines of the above how do we mix in NDMA to the iommu > container, and how does it work if only some devices support NDMA? As above, turning on NDMA effectively enlightens the device, we'd need a counter interface to de-enlighten, the IOMMU dirty context dynamically works at the least common denominator at the time. > > We had also discussed variants to page pinning that might be more > > useful as device dirty page support improves. For example calls to > > mark pages dirty once rather than the perpetual dirtying of pinned > > pages, calls to pin pages for read vs write, etc. We didn't dive much > > into system IOMMU dirtying, but presumably we'd have a fault handler > > triggered if a page is written by the device and go from there. > > Would be interesting to know for sure what current IOMMU HW has > done. I'm supposing the easiest implementation is to write a dirty bit > to the IO PTE the same as the CPU writes a dirty bit the normal PTE. > > > > In light of all this I'm wondering if device dirty tracking should > > > exist as new ioctls on the device FD and reserve the type1 code to > > > only work the IOMMU dirty tracking. > > > > Our existing model is working towards the IOMMU, ie. container, > > interface aggregating dirty page context. > > This creates inefficiencies in the kernel, we copy from the mlx5 > formed data structure to new memory in the iommu through a very > ineffficent API and then again we do an ioctl to copy it once more and > throw all the extra work away. 
It does not seem good for something > where we want performance. So maybe the dirty bitmaps for the IOMMU context need to be exposed to and directly modifiable by the drivers using atomic bitmap ops. Maybe those same bitmaps can be mmap'd to userspace. These bitmaps are not insignificant, do we want every driver managing their own copies? > > For example when page pinning is used, it's only when all devices > > within the container are using page pinning that we can report the > > pinned subset as dirty. Otherwise userspace needs to poll each > > device, which I suppose enables your idea that userspace decides > > which source to use, but why? > > Efficiency, and user selectable policy. > > Userspace can just allocate an all zeros bitmap and feed it to each of > the providers in the kernel using a 'or in your dirty' semantic. > > No redundant kernel data marshaling, userspace gets to decide which > tracking provider to use, and it is simple to implement in the kernel. > > Userspace has to do this anyhow if it has configurations with multiple > containers. For instance because it was forced to split the containers > due to one device not supporting NDMA. Huh? When did that become a requirement? I feel like there are a lot of excuses listed here, but nothing that really demands a per device interface, or at least a per device interface that we have any hope of specifying. Shared infrastructure in the IOMMU allows both kernel-side and userspace-side consolidation of these bitmaps. It's pretty clear that our current interfaces are rudimentary, but we've got to start somewhere. > > Does the IOMMU dirty page tracking exclude devices if the user > > queries the device separately? > > What makes sense to me is multiple tracking providers. Each can be > turned on and off. > > If the container tracking provider says it supports tracking then it > means it can track DMA from every device it is connected to (unlike > today?). eg by using IOMMU HW that naturally does this, or by only > having only NO_IOMMU devices. Let's kick No-IOMMU out of this conversation, it doesn't claim to have any of these features, it never will. Type1 can always provide dirty tracking with the default being to mark all mapped pages perpetually dirty. It doesn't make much sense for userspace to get a concise dirty bitmap from one device and a full dirty bitmap from another. We've optimized that userspace gets the least common denominator for the entire IOMMU context. > If the migration driver says it supports tracking, then it only tracks > DMA from that device. I don't see what this buys us. Userspace is only going to do a migration if all devices support the per device migration region. At that point we need the best representation of the dirty bitmap we can provide per IOMMU context. It makes sense to me to aggregate per device dirtying into that one context. > > How would it know? What's the advantage? It seems like this > > creates too many support paths that all need to converge on the same > > answer. Consolidating DMA dirty page tracking to the DMA mapping > > interface for all devices within a DMA context makes more sense to > > me. > > What I see is a lot of questions and limitations with this > approach. If we stick to funneling everything through the iommu then > answering the questions seem to create a large amount of kernel > work. Enough to ask if it is worthwhile.. 
If we need a common userspace IOMMU subsystem like IOMMUfd that can handle driver page pinning, IOMMU faults, and dirty tracking, why does it suddenly become an unbearable burden to allow other means besides page pinning for a driver to relay DMA page writes? OTOH, aggregating these features in the IOMMU reduces both overhead of per device bitmaps and user operations to create their own consolidated view. > .. and then we have to ask how does this all work in IOMMUFD where it > is not so reasonable to tightly couple the migration driver and the > IOAS and I get more questions :) To me, the per device option seems pretty ad-hoc, cumbersome and complicated for userspace, and invents problems with the aggregated bitmap that either don't really exist (imo) or where interfaces could be refined. Maybe start with what uAPI visible knobs really make sense and provide a benefit for per-device dirty bitmap tuning and how a device agnostic userspace like QEMU is going to make intelligent decisions about those knobs. Otherwise I see the knobs as out-of-band and most of the other arguments tending towards NIH. Thanks, Alex
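For a rough sense of scale behind "these bitmaps are not insignificant": at one bit per 4 KiB page a dirty bitmap costs 32 KiB per GiB of guest memory, i.e. about 32 MiB for a 1 TiB guest, per copy. A quick check of the arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t guest_mem = 1ULL << 40;                /* 1 TiB of guest memory */
        uint64_t pgsize = 4096;                         /* 4 KiB tracking granularity */
        uint64_t bitmap_bytes = guest_mem / pgsize / 8; /* one bit per page */

        /* Prints: dirty bitmap: 33554432 bytes (32 MiB) per tracker copy */
        printf("dirty bitmap: %llu bytes (%llu MiB) per tracker copy\n",
               (unsigned long long)bitmap_bytes,
               (unsigned long long)(bitmap_bytes >> 20));
        return 0;
}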
On Tue, Nov 16, 2021 at 02:10:31PM -0700, Alex Williamson wrote: > On Tue, 16 Nov 2021 15:25:05 -0400 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote: > > > > > > I think userspace should decide if it wants to use mlx5 built in or > > > > the system IOMMU to do dirty tracking. > > > > > > What information does userspace use to inform such a decision? > > > > Kernel can't know which approach performs better. Operators should > > benchmark and make a choice for their deployment HW. Maybe device > > tracking severely impacts device performance or vice versa. > > I'm all for keeping policy decisions out of the kernel, but it's pretty > absurd to expect a userspace operator to benchmark various combination > and wire various knobs through the user interface for this. Huh? This is the standard operating procedure in netdev land, mm and others. Max performance requires tunables. And here we really do care alot about migration time. I've seen the mm complexity supporting the normal vCPU migration to know this is a high priority for many people. > It seems to me that the kernel, ie. the vfio variant driver, *can* > know the best default. If the kernel can have an algorithm to find the best default then qemu can implement the same algorithm too. I'm also skeptical about this claim the best is knowable. > > Kernel doesn't easily know what userspace has done, maybe one device > > supports migration driver dirty tracking and one device does not. > > And that's exactly why the current type1 implementation exposes the > least common denominator to userspace, ie. pinned pages only if all > devices in the container have enabled this degree of granularity. The current implementation forces no dirty tracking. That is a policy choice that denies userspace the option to run one device with dirty track and simply suspend the other. > > So, I would like to see userspace control most of the policy aspects, > > including the dirty track provider. > > This sounds like device specific migration parameter tuning via a > devlink interface to me, tbh. How would you propose a generic > vfio/iommufd interface to tune this sort of thing? As I said, if each page track provider has its own interface it is straightforward to make policy in userspace. The only tunable beyond that is the dirty page tracking granularity. That is already provided by userspace, but not as an parameter during START. I don't see why we'd need something as complicated as devlink just yet. > > How does userspace know if dirty tracking works or not? All I see > > VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocs some bitmaps. > > IIRC, it's always supported by type1. In the worst case we always > report all mapped pages as dirty. Again this denies userspace a policy choice. Why do dirty tracking gyrations if they don't work? Just directly suspend the devices and then copy. > > I'm surprised it doesn't check that only NO_IOMMU's devices are > > attached to the container and refuse to dirty track otherwise - since > > it doesn't work.. > > No-IOMMU doesn't use type1, the ioctl returns errno. 
Sorry, I mistyped that, I ment emulated iommu, as HCH has called it: vfio_register_emulated_iommu_dev() > > When we mix dirty track with pre-copy, the progression seems to be: > > > > DITRY TRACKING | RUNNING > > Copy every page to the remote > > DT | SAVING | RUNNING > > Copy pre-copy migration data to the remote > > SAVING | NDMA | RUNNING > > Read and clear dirty track device bitmap > > DT | SAVING | RUNNING > > Copy new dirtied data > > (maybe loop back to NDMA a few times?) > > SAVING | NDMA | RUNNING > > P2P grace state > > 0 > > Read the dirty track and copy data > > Read and send the migration state > > > > Can we do something so complex using only SAVING? > > I'm not demanding that triggering device dirty tracking on saving is > how this must be done, I'm only stating that's an idea that was > discussed. If we need more complicated triggers between the IOMMU and > device, let's define those, but I don't see that doing so negates the > benefits of aggregated dirty bitmaps in the IOMMU context. Okay. As far as your request to document things as we seem them upcoming I belive we should have some idea how dirty tracking control fits in. I agree that is not related to how the bitmap is reported. We will continue to think about dirty tracking as not connected to SAVING. > > This creates inefficiencies in the kernel, we copy from the mlx5 > > formed data structure to new memory in the iommu through a very > > ineffficent API and then again we do an ioctl to copy it once more and > > throw all the extra work away. It does not seem good for something > > where we want performance. > > So maybe the dirty bitmaps for the IOMMU context need to be exposed to > and directly modifiable by the drivers using atomic bitmap ops. Maybe > those same bitmaps can be mmap'd to userspace. These bitmaps are not > insignificant, do we want every driver managing their own copies? If we look at mlx5 for example there is no choice. The dirty log is in some device specific format and is not sharable. We must allocate memory to work with it. What I don't need is the bitmap memory in the iommu container, that is all useless for mlx5. So, down this path we need some API for the iommu context to not allocate its memory at all and refer to storage from the tracking provider for cases where that makes sense. > > Userspace has to do this anyhow if it has configurations with multiple > > containers. For instance because it was forced to split the containers > > due to one device not supporting NDMA. > > Huh? When did that become a requirement? It is not a requirement, it is something userspace can do, if it wants. And we talked about this, if NDMA isn't supported the P2P can't work, and a way to enforce that is to not include P2P in the IOMMU mapping. Except if you have an asymmetric NDMA you may want an asymmetric IOMMU mapping too where the NDMA devices can do P2P and the others don't. That is two containers and two dirty bitmaps. Who knows, it isn't the job of the kernel to make these choices, the kernel just provides tools. > I feel like there are a lot of excuses listed here, but nothing that > really demands a per device interface, "excuses" and "NIH" is a bit rude Alex. From my side, you are asking for a lot work from us (who else?) to define and implement a wack of missing kernel functionality. I think it is very reasonable to ask what the return to the community is for this work. "makes more sense to me" is not something that is really compelling. 
So, I'd rather you tell me concretely why doing this work, in this way, is a good idea. > > If the migration driver says it supports tracking, then it only tracks > > DMA from that device. > > I don't see what this buys us. Userspace is only going to do a > migration if all devices support the per device migration region. At > that point we need the best representation of the dirty bitmap we can > provide per IOMMU context. It makes sense to me to aggregate per > device dirtying into that one context. Again, there are policy choices userspace can make, like just suspending the device that doesn't do dirty tracking and continuing to dirty track the ones that do. This might be very logical if the non-dirty-tracking device is something like IDXD, where suspending it just causes higher request latency, while the dirty-tracking device is mlx5, where suspending would cause externally visible artifacts. My point is *we don't know* what people will want. I also think you are too focused on a one-size solution that fits into a qemu sort of enterprise product. While I can agree with your position relative to an enterprise style customer, NVIDIA has different customers, many that have their own customized VMMs that are tuned for their HW choices. For these customers I do like to see that the kernel allows options, and I don't think it is appropriate to be so dismissive of this perspective. > > What I see is a lot of questions and limitations with this > > approach. If we stick to funneling everything through the iommu then > > answering the questions seem to create a large amount of kernel > > work. Enough to ask if it is worthwhile.. > > If we need a common userspace IOMMU subsystem like IOMMUfd that can > handle driver page pinning, IOMMU faults, and dirty tracking, why does > it suddenly become an unbearable burden to allow other means besides > page pinning for a driver to relay DMA page writes? I can see some concrete reasons for iommufd, like it allows this code to be shared between multiple subsystems that need it. Its design supports more functionality than the vfio container can. Here, I don't quite see it. If userspace does

 for (i = 0; i != num_trackers; i++)
     ioctl(merge dirty bitmap, i, &bitmap)

Or

 ioctl(read dirty bitmap, &bitmap)

Hardly seems decisive. What bothers me is the overhead and kernel complexity. If we split them it looks quite simple:

 - new ioctl on vfio device to read & clear dirty bitmap
   & extension flag to show this is supported
 - DIRTY_TRACK migration state bit to start/stop
   Simple logic that read is only possible in NDMA/!RUNNING
 - new ioctl on vfio device to report dirty tracking capability flags:
    - internal dirty track supported
    - real dirty track through attached container supported
      (only mdev can set this today)

Scenarios:

 a. If userspace has a mlx5 and a mdev then it does a for loop as above to start and read the dirty bitmaps.
 b. If userspace has only mlx5 it reads one bitmap
 c. If userspace has mlx5 and some other PCI device it can activate mlx5 and leave the container alone. Suspend the PCI device early. Or directly give up on dirty track

For funneling through the container ioctls.. Humm:

 - Still track per device/container connection if internal/external dirty track is supported. Add a new ioctl so userspace can have this info for (c)
 - For external dirty track have some ops callbacks for start/stop, read bitmap and clear. (So, no migration state flag?)
 - On start make the policy choice if the external/internal will be used, then negotiate some uniform tracking size and iterate over all externals to call start
 - On read.. to avoid overheads iterate over the internal bitmap and read ops on all external bitmaps and OR them together, then copy to user. Just ignore NDMA and rely on userspace to do it right?
 - Clear iterates and zeros bitmaps
 - Change logic to not allocate tracking bitmaps if no mdevs

 a. works the same, kernel turns on both trackers
 b. works the same, kernel turns on only mlx5
 c. Hum. We have to disable the 'no tracker report everything as dirty' feature somehow so we can access only the mlx5 tracker without forcing every page to be seen as dirty. Needs more details.

And figure out how this relates to the ongoing iommufd project (which, BTW, we have now invested a lot in too). I'm not as convinced as you are that the 2nd is obviously better, and on principle I don't like avoidable policy choices baked into the kernel. And I don't see that it makes userspace really much simpler anyhow. On the other hand it looks like a lot more kernel work.. > OTOH, aggregating these features in the IOMMU reduces both overhead > of per device bitmaps and user operations to create their own > consolidated view. I don't understand this, funneling will not reduce overhead; at best, with some work, we can almost break even by not allocating the SW bitmaps. Jason
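To visualize the split proposed above (a capability query, a DIRTY_TRACK device_state bit, and a per-device read-and-clear ioctl), a rough sketch of what such a uapi might look like. Everything here is hypothetical; none of these structs, flags, or bits exist in <linux/vfio.h>:

#include <linux/types.h>

/* Hypothetical per-device dirty tracking uapi -- illustration only. */

/* Capability report: which tracking provider(s) can cover this device. */
struct vfio_device_dirty_caps {
        __u32 argsz;
        __u32 flags;
#define VFIO_DEVICE_DIRTY_INTERNAL      (1 << 0) /* migration driver tracks its own DMA */
#define VFIO_DEVICE_DIRTY_CONTAINER     (1 << 1) /* attached container/IOMMU covers it */
};

/* Started/stopped via a (hypothetical) device_state bit; the bitmap would
 * only be readable while the device is in NDMA or !RUNNING. */
#define VFIO_DEVICE_STATE_DIRTY_TRACK   (1 << 4)

/* Read-and-clear of the device's own dirty log for one IOVA range. */
struct vfio_device_dirty_bitmap {
        __u32 argsz;
        __u32 flags;
        __u64 iova;
        __u64 size;
        __u64 pgsize;           /* tracking granularity */
        __u64 bitmap_size;      /* bytes at *data */
        __u64 data;             /* userspace pointer to the bitmap */
};

With a shape like this, scenario (a) above is the same start/read loop run once per device fd, and scenario (c) never has to touch the container interface at all.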
On Tue, 16 Nov 2021 21:48:31 -0400 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Tue, Nov 16, 2021 at 02:10:31PM -0700, Alex Williamson wrote: > > On Tue, 16 Nov 2021 15:25:05 -0400 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote: > > > > > > > > I think userspace should decide if it wants to use mlx5 built in or > > > > > the system IOMMU to do dirty tracking. > > > > > > > > What information does userspace use to inform such a decision? > > > > > > Kernel can't know which approach performs better. Operators should > > > benchmark and make a choice for their deployment HW. Maybe device > > > tracking severely impacts device performance or vice versa. > > > > I'm all for keeping policy decisions out of the kernel, but it's pretty > > absurd to expect a userspace operator to benchmark various combination > > and wire various knobs through the user interface for this. > > Huh? This is the standard operating procedure in netdev land, mm and > others. Max performance requires tunables. And here we really do care > alot about migration time. I've seen the mm complexity supporting the > normal vCPU migration to know this is a high priority for many people. Per previous reply: "Maybe start with what uAPI visible knobs really make sense and provide a benefit for per-device dirty bitmap tuning and how a device agnostic userspace like QEMU is going to make intelligent decisions about those knobs." > > It seems to me that the kernel, ie. the vfio variant driver, *can* > > know the best default. > > If the kernel can have an algorithm to find the best default then qemu > can implement the same algorithm too. I'm also skeptical about this > claim the best is knowable. QEMU is device agnostic and has no visibility to the system IOMMU capabilities. > > > Kernel doesn't easily know what userspace has done, maybe one device > > > supports migration driver dirty tracking and one device does not. > > > > And that's exactly why the current type1 implementation exposes the > > least common denominator to userspace, ie. pinned pages only if all > > devices in the container have enabled this degree of granularity. > > The current implementation forces no dirty tracking. That is a policy > choice that denies userspace the option to run one device with dirty > track and simply suspend the other. "Forces no dirty tracking"? I think you're suggesting that if a device within an IOMMU context doesn't support dirty tracking that we're forced to always report all mappings within that context as dirty, because for some reason we're not allowed to evolve how devices can interact with a shared dirty context, but we are allowed to invent a new per-device uAPI? > > > So, I would like to see userspace control most of the policy aspects, > > > including the dirty track provider. > > > > This sounds like device specific migration parameter tuning via a > > devlink interface to me, tbh. How would you propose a generic > > vfio/iommufd interface to tune this sort of thing? > > As I said, if each page track provider has its own interface it is > straightforward to make policy in userspace. The only tunable beyond > that is the dirty page tracking granularity. That is already > provided by userspace, but not as an parameter during START. > > I don't see why we'd need something as complicated as devlink just > yet. The approaches aren't that different. 
It seems like you want userspace to be able to select which devices to get dirty page info from while I'm suggesting that we can develop ways that the user can manage how the device interacts with a shared dirty page state. Per-device dirty page state doesn't preclude a shared dirty state, we still expect that any IOMMU interface is the source of truth when we have multiple devices sharing an IOMMU context. What if we have a scenario where devices optionally have per device dirty page tracking. A) mlx5 w/ device level dirty page tracking, vGPU w/ page pinning vs B) mlx5 w/ device level dirty page tracking, NIC with system IOMMU dirty page tracking Compare and contrast in which case the shared IOMMU context dirty page tracking reports both device's versus only the devices without per-device tracking. Is this obvious to both userspace and the shared dirty page tracking if it's the same or different in both cases? > > > How does userspace know if dirty tracking works or not? All I see > > > VFIO_IOMMU_DIRTY_PAGES_FLAG_START unconditionally allocs some bitmaps. > > > > IIRC, it's always supported by type1. In the worst case we always > > report all mapped pages as dirty. > > Again this denies userspace a policy choice. Why do dirty tracking > gyrations if they don't work? Just directly suspend the devices and > then copy. As above, it seems like you're freezing one interface and allowing the other to evolve. We can create ways that a device without dirty tracking can be marked as not generating DMA within a shared dirty page context. If maybe what you're really trying to get at all along is visibility to the per-device dirty page contribution, that does sound interesting. But I also don't know how that interacts with system IOMMU based reporting. Does the IOMMU report to the driver that reports to userspace the per-device dirty pages? > > > I'm surprised it doesn't check that only NO_IOMMU's devices are > > > attached to the container and refuse to dirty track otherwise - since > > > it doesn't work.. > > > > No-IOMMU doesn't use type1, the ioctl returns errno. > > Sorry, I mistyped that, I ment emulated iommu, as HCH has called it: > > vfio_register_emulated_iommu_dev() Ok, so that's essentially the vfio IOMMU driver just keeping track of userspace mappings in order to provide page pinning, where a device supporting page pinning is our first step in more fine grained dirty tracking. Type1 keeps track of pages pinned across all devices sharing the IOMMU container, thus we have container based dirty page reporting. > > > When we mix dirty track with pre-copy, the progression seems to be: > > > > > > DITRY TRACKING | RUNNING > > > Copy every page to the remote > > > DT | SAVING | RUNNING > > > Copy pre-copy migration data to the remote > > > SAVING | NDMA | RUNNING > > > Read and clear dirty track device bitmap > > > DT | SAVING | RUNNING > > > Copy new dirtied data > > > (maybe loop back to NDMA a few times?) > > > SAVING | NDMA | RUNNING > > > P2P grace state > > > 0 > > > Read the dirty track and copy data > > > Read and send the migration state > > > > > > Can we do something so complex using only SAVING? > > > > I'm not demanding that triggering device dirty tracking on saving is > > how this must be done, I'm only stating that's an idea that was > > discussed. If we need more complicated triggers between the IOMMU and > > device, let's define those, but I don't see that doing so negates the > > benefits of aggregated dirty bitmaps in the IOMMU context. > > Okay. 
As far as your request to document things as we seem them > upcoming I belive we should have some idea how dirty tracking control > fits in. I agree that is not related to how the bitmap is reported. We > will continue to think about dirty tracking as not connected to > SAVING. Ok. > > > This creates inefficiencies in the kernel, we copy from the mlx5 > > > formed data structure to new memory in the iommu through a very > > > ineffficent API and then again we do an ioctl to copy it once more and > > > throw all the extra work away. It does not seem good for something > > > where we want performance. > > > > So maybe the dirty bitmaps for the IOMMU context need to be exposed to > > and directly modifiable by the drivers using atomic bitmap ops. Maybe > > those same bitmaps can be mmap'd to userspace. These bitmaps are not > > insignificant, do we want every driver managing their own copies? > > If we look at mlx5 for example there is no choice. The dirty log is in > some device specific format and is not sharable. We must allocate > memory to work with it. > > What I don't need is the bitmap memory in the iommu container, that is > all useless for mlx5. > > So, down this path we need some API for the iommu context to not > allocate its memory at all and refer to storage from the tracking > provider for cases where that makes sense. That depends whether there are other devices in the context and if the container dirty context is meant to include all devices or if the driver is opt'ing out of the shared tracking... if that's a thing. Alternatively, drivers could register callbacks to report their dirty pages into the shared tracking for ranges requested by the user. We need to figure out how per-device tracking an system IOMMU tracking get along if that's where we're headed. > > > Userspace has to do this anyhow if it has configurations with multiple > > > containers. For instance because it was forced to split the containers > > > due to one device not supporting NDMA. > > > > Huh? When did that become a requirement? > > It is not a requirement, it is something userspace can do, if it > wants. And we talked about this, if NDMA isn't supported the P2P can't > work, and a way to enforce that is to not include P2P in the IOMMU > mapping. Except if you have an asymmetric NDMA you may want an > asymmetric IOMMU mapping too where the NDMA devices can do P2P and the > others don't. That is two containers and two dirty bitmaps. I'm not sure how I was supposed to infer that use case from "...forced to split the containers due to one device not support NDMA". That's certainly an option, but not a requirement. In fact, if QEMU were to do something like that, then we have some devices that can do p2p and some devices that cannot... all or none seems like a much more deterministic choice for QEMU. How do guest drivers currently test p2p? > Who knows, it isn't the job of the kernel to make these choices, the > kernel just provides tools. Agreed, but I don't see that userspace choosing to use separate contexts either negates the value of the kernel aggregating dirty pages within a container or clearly makes the case for per-device dirty pages. > > I feel like there are a lot of excuses listed here, but nothing that > > really demands a per device interface, > > "excuses" and "NIH" is a bit rude Alex. > > From my side, you are asking for a lot work from us (who else?) to > define and implement a wack of missing kernel functionality. 
> > I think it is very reasonable to ask what the return to the community > is for this work. "makes more sense to me" is not something that > is really compelling. > > So, I'd rather you tell me concretely why doing this work, in this > way, is a good idea. On my end, we have a defined dirty page tracking interface at the IOMMU context with some, admittedly rough, plans on how we intend to evolve that interface for both device level and IOMMU level tracking feeding into a shared dirty page interfaces. The example scenarios presented mostly seem to have solutions within that design framework if we put a little effort into it. There are unanswered questions down either path, but one of those paths is the path we previously chose. The greater burden is to switch paths, so rather I need to understand why this is the better path, with due diligence to explore what the same looks like with the current design. It's only finally here in the thread that we're seeing some of the mlx5 implementation details that might favor a per-device solution, hints that per device page granularity might be useful, and maybe that exposing per-device dirty page footprints to userspace is underlying this change of course. > > > If the migration driver says it supports tracking, then it only tracks > > > DMA from that device. > > > > I don't see what this buys us. Userspace is only going to do a > > migration if all devices support the per device migration region. At > > that point we need the best representation of the dirty bitmap we can > > provide per IOMMU context. It makes sense to me to aggregate per > > device dirtying into that one context. > > Again, there are policy choices userspace can make, like just > suspending the device that doesn't do dirty tracking and continuing to > dirty track the ones that do. > > This might be very logical if a non-dirty tracking device is something > like IDXD that will just cause higher request latency and the dirty > tracking is mlx5 that will cause externally visable artifacts. > > My point is *we don't know* what people will want. > > I also think you are too focused on a one-size solution that fits into > a qemu sort of enteprise product. While I can agree with your position > relative to an enterprise style customer, NVIDIA has different > customers, many that have their own customized VMMs that are tuned for > their HW choices. For these customers I do like to see that the kernel > allows options, and I don't think it is appropriate to be so > dismissive of this perspective. So provide the justification I asked for previously and quoted above, what are the things we want to be able to tune that cannot be done through reasonable extensions of the current design? I'm not trying to be dismissive, I'm lacking facts and evidence of due diligence that the current design is incapable of meeting our goals. > > > What I see is a lot of questions and limitations with this > > > approach. If we stick to funneling everything through the iommu then > > > answering the questions seem to create a large amount of kernel > > > work. Enough to ask if it is worthwhile.. > > > > If we need a common userspace IOMMU subsystem like IOMMUfd that can > > handle driver page pinning, IOMMU faults, and dirty tracking, why does > > it suddenly become an unbearable burden to allow other means besides > > page pinning for a driver to relay DMA page writes? > > I can see some concrete reasons for iommufd, like it allows this code > to be shared between mutliple subsystems that need it. 
> Its design supports more functionality than the vfio container can.
>
> Here, I don't quite see it. If userspace does
>
>   for (i = 0; i != num_trackers; i++)
>      ioctl(merge dirty bitmap, i, &bitmap)
>
> Or
>   ioctl(read dirty bitmap, &bitmap)
>
> Hardly seems decisive.

On the same order as "makes more sense to me" ;)

> What bothers me is the overhead and kernel complexity.
>
> If we split them it looks quite simple:
>  - new ioctl on vfio device to read & clear dirty bitmap
>    & extension flag to show this is supported
>  - DIRTY_TRACK migration state bit to start/stop

Is this another device_state bit?

>    Simple logic that read is only possible in NDMA/!RUNNING
>  - new ioctl on vfio device to report dirty tracking capability flags:
>     - internal dirty track supported
>     - real dirty track through attached container supported
>       (only mdev can set this today)

How does system IOMMU dirty tracking work?

> Scenarios:
>
> a. If userspace has an mlx5 and an mdev then it does a for loop as
>    above to start and read the dirty bitmaps.
>
> b. If userspace has only mlx5 it reads one bitmap
>
> c. If userspace has mlx5 and some other PCI device it can activate
>    mlx5 and leave the container alone. Suspend the PCI device early.
>    Or directly give up on dirty track
>
> For funneling through the container ioctls.. Humm:
>  - Still track per device/container connection if internal/external
>    dirty track is supported. Add a new ioctl so userspace can have
>    this info for (c)
>  - For external dirty track have some ops callbacks for start/stop,
>    read bitmap and clear.

(So, no migration state flag?)

>  - On start make the policy choice if the external/internal will be
>    used, then negotiate some uniform tracking size and iterate over
>    all externals to call start
>  - On read.. to avoid overheads iterate over the internal bitmap
>    and read ops on all external bitmaps and OR them together
>    then copy to user. Just ignore NDMA and rely on userspace to
>    do it right?
>  - Clear iterates and zeros bitmaps
>  - Change logic to not allocate tracking bitmaps if no mdevs

I don't understand what's being described here; I'm glad an attempt is
being made to see what this might look like with the current interface,
but at the same time the outline seems biased towards a complicated
portrayal.

It's a relatively simple progression: with no outside information the
container exposes all mapped pages as dirty. Individual devices in the
container can expose themselves as enlightened in various ways;
initially we mark devices that pin pages as enlightened, and when all
devices in the container are enlightened we can reduce the reported
bitmap. As devices add their own dirty tracking we can add interfaces
to allow devices to register their status, mark dirty pages, and
potentially poll devices to update their dirty pages against ranges
requested by the user. A device that's suspended, for example in PCI D3
state, might mark itself as becoming enlightened and report no page
dirtying; a device ioctl might allow a more explicit instruction for
the same. As IOMMU dirty page tracking comes online, the IOMMU can
essentially enlighten all the devices attached to the context.

> a. works the same, kernel turns on both trackers
> b. works the same, kernel turns on only mlx5
> c. Hum. We have to disable the 'no tracker report everything as
>    dirty' feature somehow so we can access only the mlx5 tracker
>    without forcing every page seen as dirty. Needs details

Doesn't seem as complicated in my rendition.
> And figure out how this relates to the ongoing iommufd project (which,
> BTW, we have now invested a lot in too).
>
> I'm not as convinced as you are that the 2nd is obviously better, and
> on principle I don't like avoidable policy choices baked into the
> kernel.

Userspace has the same degree of policy decision in my version; what's
missing from my version is userspace visibility into the per-device
dirty footprint and the newly mentioned desire for per-device page
granularity.

> And I don't see that it makes userspace really much simpler anyhow. On
> the other hand it looks like a lot more kernel work.

There's kernel work either way.

> > OTOH, aggregating these features in the IOMMU reduces both overhead
> > of per device bitmaps and user operations to create their own
> > consolidated view.
>
> I don't understand this, funneling will not reduce overhead, at best
> with some work we can almost break even by not allocating the SW
> bitmaps.

The moment we have more than one device that requires a bitmap, where
the device doesn't have visibility of the extent of the bitmap, we
introduce both space and time overhead versus a shared bitmap that can
be pre-allocated.

What's being asked for here is a change from the current plan of
record; that entails work and justification, including tangible
benefits versus a fair exploration of how the same might work in the
current design.

Anyway, as Connie mentioned off-list, we're deep into a comment thread
that's becoming more and more tangential to the original patch, I'm not
sure how many relevant contributors are still following this, and it's
probably best to propose a change in course to dirty bitmap tracking in
a new thread. Thanks,

Alex
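[Editor's note: for reference, a minimal userspace sketch of the
container-level interface described above as the current plan of record
(the type1 VFIO_IOMMU_DIRTY_PAGES ioctl). The helper name and calling
convention are illustrative only, and error handling is trimmed.]

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Start container-level tracking, then fetch the dirty bitmap for one
 * previously mapped IOVA range. 'data' holds one bit per page of size
 * 'pgsize' and is 'data_bytes' long. */
static int container_dirty_bitmap(int container_fd, __u64 iova, __u64 size,
				  __u64 pgsize, __u64 *data, __u64 data_bytes)
{
	struct vfio_iommu_type1_dirty_bitmap start = {
		.argsz = sizeof(start),
		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
	};
	struct vfio_iommu_type1_dirty_bitmap *dbitmap;
	struct vfio_iommu_type1_dirty_bitmap_get *range;
	size_t argsz = sizeof(*dbitmap) + sizeof(*range);
	int ret;

	/* Turn tracking on for the whole container. */
	if (ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, &start))
		return -1;

	dbitmap = calloc(1, argsz);
	if (!dbitmap)
		return -1;
	dbitmap->argsz = argsz;
	dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
	range->iova = iova;
	range->size = size;
	range->bitmap.pgsize = pgsize;     /* must match a supported pgsize */
	range->bitmap.size = data_bytes;   /* bytes available in 'data' */
	range->bitmap.data = data;

	/* With no enlightened members in the container, every mapped page in
	 * the range comes back dirty; page pinning (and any future per-device
	 * reporting) is what lets the kernel narrow this down. */
	ret = ioctl(container_fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
	free(dbitmap);
	return ret;
}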
On Thu, Nov 18, 2021 at 11:15:55AM -0700, Alex Williamson wrote:
> On Tue, 16 Nov 2021 21:48:31 -0400
> Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> > On Tue, Nov 16, 2021 at 02:10:31PM -0700, Alex Williamson wrote:
> > > On Tue, 16 Nov 2021 15:25:05 -0400
> > > Jason Gunthorpe <jgg@nvidia.com> wrote:
> > >
> > > > On Tue, Nov 16, 2021 at 10:57:36AM -0700, Alex Williamson wrote:
> > > >
> > > > > > I think userspace should decide if it wants to use mlx5 built in or
> > > > > > the system IOMMU to do dirty tracking.
> > > > >
> > > > > What information does userspace use to inform such a decision?
> > > >
> > > > Kernel can't know which approach performs better. Operators should
> > > > benchmark and make a choice for their deployment HW. Maybe device
> > > > tracking severely impacts device performance or vice versa.
> > >
> > > I'm all for keeping policy decisions out of the kernel, but it's pretty
> > > absurd to expect a userspace operator to benchmark various combinations
> > > and wire various knobs through the user interface for this.
> >
> > Huh? This is the standard operating procedure in netdev land, mm and
> > others. Max performance requires tunables. And here we really do care
> > a lot about migration time. I've seen the mm complexity supporting the
> > normal vCPU migration to know this is a high priority for many people.
>
> Per previous reply:
>
> "Maybe start with what uAPI visible knobs really make sense
> and provide a benefit for per-device dirty bitmap tuning and how a
> device agnostic userspace like QEMU is going to make intelligent
> decisions about those knobs."

Well, the main knob is to pick which dirty tracker to use, and a
secondary knob is to perhaps configure it (e.g. granule size or
something, but I have nothing concrete right now).

How to do that in the current uapi? I don't know. Beyond some very ugly
'do not use system iommu' flag that doesn't fit every need, I don't
have any suggestion.

IMHO, generic qemu is fine to do something simple like always prefer
the system IOMMU tracker.

> QEMU is device agnostic and has no visibility to the system IOMMU
> capabilities.

Ah, there is a lot going on here. We are adding iommufd, and iommufd's
design has a specific place to expose the iommu_domain, its unique
capabilities, and uAPI-visible connections from the devices. This is
the big uAPI design upgrade that is needed to get to all the features
we want to see on the iommu side.

For what we have now it is quite easy to add a few ioctls for dirty
track query/start/stop/read&clear that wire directly 1:1 to an ops on
the iommu_domain.

So, qemu does a simple 'query' to the iommu_domain, sees dirty track is
supported and doesn't even do anything device specific. If that fails
then it queries each vfio_device for a tracker, if that fails it
operates with no dirty track. Since each domain and device is a uAPI
object all policy is trivially in userspace hands with a simple uAPI
design.

> > The current implementation forces no dirty tracking. That is a policy
> > choice that denies userspace the option to run one device with dirty
> > track and simply suspend the other.
>
> "Forces no dirty tracking"? I think you're suggesting that if a device
> within an IOMMU context doesn't support dirty tracking that we're
> forced to always report all mappings within that context as dirty,
> because for some reason we're not allowed to evolve how devices can
> interact with a shared dirty context, but we are allowed to invent a new
> per-device uAPI?

Basically yes.
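[Editor's note: a rough illustration of the iommu_domain-side split
described above. These ops are hypothetical, nothing of the sort exists
in the kernel at this point; they only show how iommufd ioctls could
wire 1:1 into the domain without involving the migration driver.]

#include <linux/iommu.h>
#include <linux/types.h>

/* Hypothetical per-domain dirty tracking ops that an iommufd
 * query/start/stop/read&clear uAPI could call directly. */
struct iommu_dirty_ops {
	/* Enable or disable dirty logging on the whole domain. */
	int (*set_dirty_tracking)(struct iommu_domain *domain, bool enable);
	/* Read and clear dirty state for an IOVA range into a bitmap,
	 * one bit per IOMMU page. */
	int (*read_and_clear_dirty)(struct iommu_domain *domain,
				    unsigned long iova, size_t size,
				    unsigned long __user *bitmap);
};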
If we are changing the uAPI then let's change it to something that
makes sense for the HW implementations we actually have.

If you have a clever idea how to do this with the current API I'm
interested to hear it, but I don't have anything in mind right now that
isn't much more complicated for everyone.

> What if we have a scenario where devices optionally have per device
> dirty page tracking.
>
> A) mlx5 w/ device level dirty page tracking, vGPU w/ page pinning
>
> vs
>
> B) mlx5 w/ device level dirty page tracking, NIC with system IOMMU
>    dirty page tracking
>
> Compare and contrast in which case the shared IOMMU context dirty page
> tracking reports both device's versus only the devices without
> per-device tracking. Is this obvious to both userspace and the shared
> dirty page tracking if it's the same or different in both cases?

Neither A nor B has 'shared dirty page tracking'.

The only case where we have some kind of shared tracker is when there
are multiple mdevs with migration support, and the mdev HW can support
a shared bitmap.

In iommufd we still have a reasonable place to put a shared mdev dirty
tracker - the "emulated iommu" is also exposed as a uAPI object that
can hold it - however I wouldn't want to implement that in iommufd
until we have some in-tree drivers that need it.

All the out of tree mdev drivers can use the device specific ioctl. It
only means they can't share dirty bitmaps. Which I think is a
reasonable penalty for out of tree drivers.

> If maybe what you're really trying to get at all along is visibility to
> the per-device dirty page contribution, that does sound interesting.

Visibility and control, yes. These are the requests I've had from our
internal teams.

> But I also don't know how that interacts with system IOMMU based
> reporting. Does the IOMMU report to the driver that reports to
> userspace the per-device dirty pages?

No, the system IOMMU reports through iommufd on the iommu_domain.
Completely unconnected from the migration driver.

Completely unconnected is the design simplification, because I don't
need to make kernel infrastructure to build that connection and uAPI to
manage it explicitly.

> That depends on whether there are other devices in the context and if
> the container dirty context is meant to include all devices or if the
> driver is opting out of the shared tracking...

Right now, *today*, I know of nothing that needs or can make good use
of shared state bitmaps.

> Alternatively, drivers could register callbacks to report their dirty
> pages into the shared tracking for ranges requested by the user. We
> need to figure out how per-device tracking and system IOMMU tracking
> get along if that's where we're headed.

I don't want shared tracking, it is a waste of CPU and memory
resources. With a single API the only thing that would be OK is no
shared state and the kernel iterates over all the trackers. This is
quite far from what is there right now.

> do something like that, then we have some devices that can do p2p and
> some devices that cannot... all or none seems like a much more
> deterministic choice for QEMU. How do guest drivers currently test p2p?

There is a 'p2p allowed' call inside the kernel pci_p2p layer. It could
conceivably be extended to consult some firmware table provided by the
hypervisor. It is another one of these cases, like IMS, where guest
transparency is not possible :(

However, you can easily see this need arising - eg a GPU is frequently
a P2P target but rarely a P2P initiator.
It is another case where I can see people making custom VMMs deviating
from what makes sense in enterprise qemu.

> Agreed, but I don't see that userspace choosing to use separate
> contexts either negates the value of the kernel aggregating dirty pages
> within a container or clearly makes the case for per-device dirty pages.

I bring it up because I don't see a clear way to support it at all with
the current API design. Happy to hear ideas.

> It's only finally here in the thread that we're seeing some of the mlx5
> implementation details that might favor a per-device solution, hints
> that per-device page granularity might be useful, and maybe that
> exposing per-device dirty page footprints to userspace is underlying
> this change of course.

Well, yes, I mean we've thought about this quite a lot internally, we
are not suggesting things randomly for "NIH" as you said. We have lots
of different use cases, several customers, multiple devices and more.

> So provide the justification I asked for previously and quoted above:
> what are the things we want to be able to tune that cannot be done
> through reasonable extensions of the current design? I'm not trying to
> be dismissive, I'm lacking facts and evidence of due diligence that the
> current design is incapable of meeting our goals.

It is not a question of capable or not, it is ROI. I'm sure we can hack
the current design into something, with a lot of work and complexity.
I'm saying we see a much simpler version that has a simpler kernel
side.

> > If we split them it looks quite simple:
> >  - new ioctl on vfio device to read & clear dirty bitmap
> >    & extension flag to show this is supported
> >  - DIRTY_TRACK migration state bit to start/stop
>
> Is this another device_state bit?

Probably, or a device ioctl - semantically the same.

> >    Simple logic that read is only possible in NDMA/!RUNNING
> >  - new ioctl on vfio device to report dirty tracking capability flags:
> >     - internal dirty track supported
> >     - real dirty track through attached container supported
> >       (only mdev can set this today)
>
> How does system IOMMU dirty tracking work?

As above, it is parallel and completely contained in iommufd. It is
split: iommufd only does system iommu tracking, the device FD only does
device integral tracking. They never cross interfaces internally, so I
don't need to make a kernel framework to support, aggregate and control
multiple dirty trackers.

> I don't understand what's being described here, I'm glad an attempt is
> being made to see what this might look like with the current interface,
> but at the same time the outline seems biased towards a complicated
> portrayal.

Well, not understanding is a problem :|

> > a. works the same, kernel turns on both trackers
> > b. works the same, kernel turns on only mlx5
> > c. Hum. We have to disable the 'no tracker report everything as
> >    dirty' feature somehow so we can access only the mlx5 tracker
> >    without forcing every page seen as dirty. Needs details
>
> Doesn't seem as complicated in my rendition.

I don't see your path through the code.

> > And I don't see that it makes userspace really much simpler anyhow. On
> > the other hand it looks like a lot more kernel work.
>
> There's kernel work either way.

Yes, more vs less :)

> > > OTOH, aggregating these features in the IOMMU reduces both overhead
> > > of per device bitmaps and user operations to create their own
> > > consolidated view.
> > I don't understand this, funneling will not reduce overhead, at best
> > with some work we can almost break even by not allocating the SW
> > bitmaps.
>
> The moment we have more than one device that requires a bitmap, where
> the device doesn't have visibility of the extent of the bitmap, we
> introduce both space and time overhead versus a shared bitmap that can
> be pre-allocated.

The HW designs we can see right now: mlx5, AMD IOMMU, and hns IOMMU
(though less clear on this one) track the full IOVA space, they store
the tracking data in their own memory and they allow reading each
page's dirty bit with an 'atomic test and clear' semantic.

So currently, we have *zero* devices that require the shared bitmap.

Why should I invest in maintaining and extending this code that has no
current user and no known future purpose?

> What's being asked for here is a change from the current plan of
> record; that entails work and justification, including tangible
> benefits versus a fair exploration of how the same might work in the
> current design.

So, we will make patches for our plan to split them.

We'll show an mlx5 implementation using an ioctl on the vfio device.

I'll sketch how a system iommu works through iommu_domain ops in
iommufd (my goal is to get an AMD implementation at least too).

I can write a paragraph on how to fit a shared mdev bitmap tracker into
iommufd if an in-tree user comes.

Can someone provide an equally complete view of how to extend the
current API and solve the same set of use cases?

We can revisit this in a month or so when we might have an mlx5 patch.

Jason
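[Editor's note: to make the proposed split concrete, a hypothetical
userspace sketch of the per-device flow discussed in this sub-thread.
The ioctl number, structure, and helper below do not exist anywhere;
they only illustrate userspace iterating over the trackers it chose to
enable and OR-ing the results, with no shared kernel bitmap behind
them.]

#include <stdint.h>
#include <sys/ioctl.h>

struct vfio_device_dirty_bitmap {		/* hypothetical */
	uint32_t argsz;
	uint32_t flags;				/* e.g. a READ_AND_CLEAR flag */
	uint64_t iova;
	uint64_t size;
	uint64_t pgsize;
	uint64_t *data;				/* one bit per page */
};
#define VFIO_DEVICE_DIRTY_BITMAP _IO(';', 200)	/* hypothetical request */

/* Merge the dirty reports of every enabled device-level tracker. */
static int merge_device_trackers(const int *device_fds, int num_trackers,
				 uint64_t iova, uint64_t size, uint64_t pgsize,
				 uint64_t *merged, uint64_t *scratch,
				 uint64_t bitmap_words)
{
	struct vfio_device_dirty_bitmap arg = {
		.argsz = sizeof(arg),
		.iova = iova,
		.size = size,
		.pgsize = pgsize,
		.data = scratch,
	};

	for (int i = 0; i < num_trackers; i++) {
		/* Each tracker reports (and clears) its own device log. */
		if (ioctl(device_fds[i], VFIO_DEVICE_DIRTY_BITMAP, &arg))
			return -1;
		for (uint64_t w = 0; w < bitmap_words; w++)
			merged[w] |= scratch[w];
	}
	return 0;
}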
diff --git a/MAINTAINERS b/MAINTAINERS index abdcbcfef73d..e824bfab4a01 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19699,6 +19699,12 @@ L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/platform/ +VFIO MLX5 PCI DRIVER +M: Yishai Hadas <yishaih@nvidia.com> +L: kvm@vger.kernel.org +S: Maintained +F: drivers/vfio/pci/mlx5/ + VGA_SWITCHEROO R: Lukas Wunner <lukas@wunner.de> S: Maintained diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 860424ccda1b..187b9c259944 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -43,4 +43,7 @@ config VFIO_PCI_IGD To enable Intel IGD assignment through vfio-pci, say Y. endif + +source "drivers/vfio/pci/mlx5/Kconfig" + endif diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 349d68d242b4..ed9d6f2e0555 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o + +obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/ diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig new file mode 100644 index 000000000000..119712656400 --- /dev/null +++ b/drivers/vfio/pci/mlx5/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config MLX5_VFIO_PCI + tristate "VFIO support for MLX5 PCI devices" + depends on MLX5_CORE + select VFIO_PCI_CORE + help + This provides a migration support for MLX5 devices using the VFIO + framework. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/mlx5/Makefile b/drivers/vfio/pci/mlx5/Makefile new file mode 100644 index 000000000000..689627da7ff5 --- /dev/null +++ b/drivers/vfio/pci/mlx5/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5-vfio-pci.o +mlx5-vfio-pci-y := main.o cmd.o + diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c new file mode 100644 index 000000000000..621b7fc60544 --- /dev/null +++ b/drivers/vfio/pci/mlx5/main.c @@ -0,0 +1,696 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#include <linux/device.h> +#include <linux/eventfd.h> +#include <linux/file.h> +#include <linux/interrupt.h> +#include <linux/iommu.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/pci.h> +#include <linux/pm_runtime.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/sched/mm.h> +#include <linux/vfio_pci_core.h> + +#include "cmd.h" + +enum { + MLX5VF_PCI_FREEZED = 1 << 0, +}; + +enum { + MLX5VF_REGION_PENDING_BYTES = 1 << 0, + MLX5VF_REGION_DATA_SIZE = 1 << 1, +}; + +enum { + MLX5VF_SUPPORTED_DEVICE_STATES = VFIO_DEVICE_STATE_RUNNING | + VFIO_DEVICE_STATE_SAVING | + VFIO_DEVICE_STATE_RESUMING, +}; + +#define MLX5VF_MIG_REGION_DATA_SIZE SZ_128K +/* Data section offset from migration region */ +#define MLX5VF_MIG_REGION_DATA_OFFSET \ + (sizeof(struct vfio_device_migration_info)) + +#define VFIO_DEVICE_MIGRATION_OFFSET(x) \ + (offsetof(struct vfio_device_migration_info, x)) + +struct mlx5vf_pci_migration_info { + u32 vfio_dev_state; /* VFIO_DEVICE_STATE_XXX */ + u32 dev_state; /* device migration state */ + u32 region_state; /* Use MLX5VF_REGION_XXX */ + u16 vhca_id; + struct mlx5_vhca_state_data vhca_state_data; +}; + +struct mlx5vf_pci_core_device { + struct vfio_pci_core_device core_device; + u8 migrate_cap:1; + /* protect migartion state */ + struct mutex state_mutex; + struct mlx5vf_pci_migration_info vmig; +}; + +static int mlx5vf_pci_unquiesce_device(struct mlx5vf_pci_core_device *mvdev) +{ + return mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev, + mvdev->vmig.vhca_id, + MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_MASTER); +} + +static int mlx5vf_pci_quiesce_device(struct mlx5vf_pci_core_device *mvdev) +{ + return mlx5vf_cmd_suspend_vhca( + mvdev->core_device.pdev, mvdev->vmig.vhca_id, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_MASTER); +} + +static int mlx5vf_pci_unfreeze_device(struct mlx5vf_pci_core_device *mvdev) +{ + int ret; + + ret = mlx5vf_cmd_resume_vhca(mvdev->core_device.pdev, + mvdev->vmig.vhca_id, + MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_SLAVE); + if (ret) + return ret; + + mvdev->vmig.dev_state &= ~MLX5VF_PCI_FREEZED; + return 0; +} + +static int mlx5vf_pci_freeze_device(struct mlx5vf_pci_core_device *mvdev) +{ + int ret; + + ret = mlx5vf_cmd_suspend_vhca( + mvdev->core_device.pdev, mvdev->vmig.vhca_id, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_SLAVE); + if (ret) + return ret; + + mvdev->vmig.dev_state |= MLX5VF_PCI_FREEZED; + return 0; +} + +static int mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +{ + u32 state_size = 0; + int ret; + + if (!(mvdev->vmig.dev_state & MLX5VF_PCI_FREEZED)) + return -EFAULT; + + /* If we already read state no reason to re-read */ + if (mvdev->vmig.vhca_state_data.state_size) + return 0; + + ret = mlx5vf_cmd_query_vhca_migration_state( + mvdev->core_device.pdev, mvdev->vmig.vhca_id, &state_size); + if (ret) + return ret; + + return mlx5vf_cmd_save_vhca_state(mvdev->core_device.pdev, + mvdev->vmig.vhca_id, state_size, + &mvdev->vmig.vhca_state_data); +} + +static int mlx5vf_pci_new_write_window(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data; + u32 num_pages_needed; + u64 allocated_ready; + u32 bytes_needed; + + /* Check how many bytes are available from previous flows */ + WARN_ON(state_data->num_pages * PAGE_SIZE < + state_data->win_start_offset); + allocated_ready = (state_data->num_pages * PAGE_SIZE) - + state_data->win_start_offset; + 
WARN_ON(allocated_ready > MLX5VF_MIG_REGION_DATA_SIZE); + + bytes_needed = MLX5VF_MIG_REGION_DATA_SIZE - allocated_ready; + if (!bytes_needed) + return 0; + + num_pages_needed = DIV_ROUND_UP_ULL(bytes_needed, PAGE_SIZE); + return mlx5vf_add_migration_pages(state_data, num_pages_needed); +} + +static ssize_t +mlx5vf_pci_handle_migration_data_size(struct mlx5vf_pci_core_device *mvdev, + char __user *buf, bool iswrite) +{ + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; + u64 data_size; + int ret; + + if (iswrite) { + /* data_size is writable only during resuming state */ + if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_RESUMING) + return -EINVAL; + + ret = copy_from_user(&data_size, buf, sizeof(data_size)); + if (ret) + return -EFAULT; + + vmig->vhca_state_data.state_size += data_size; + vmig->vhca_state_data.win_start_offset += data_size; + ret = mlx5vf_pci_new_write_window(mvdev); + if (ret) + return ret; + + } else { + if (vmig->vfio_dev_state != VFIO_DEVICE_STATE_SAVING) + return -EINVAL; + + data_size = min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE, + vmig->vhca_state_data.state_size - + vmig->vhca_state_data.win_start_offset); + ret = copy_to_user(buf, &data_size, sizeof(data_size)); + if (ret) + return -EFAULT; + } + + vmig->region_state |= MLX5VF_REGION_DATA_SIZE; + return sizeof(data_size); +} + +static ssize_t +mlx5vf_pci_handle_migration_data_offset(struct mlx5vf_pci_core_device *mvdev, + char __user *buf, bool iswrite) +{ + static const u64 data_offset = MLX5VF_MIG_REGION_DATA_OFFSET; + int ret; + + /* RO field */ + if (iswrite) + return -EFAULT; + + ret = copy_to_user(buf, &data_offset, sizeof(data_offset)); + if (ret) + return -EFAULT; + + return sizeof(data_offset); +} + +static ssize_t +mlx5vf_pci_handle_migration_pending_bytes(struct mlx5vf_pci_core_device *mvdev, + char __user *buf, bool iswrite) +{ + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; + u64 pending_bytes; + int ret; + + /* RO field */ + if (iswrite) + return -EFAULT; + + if (vmig->vfio_dev_state == (VFIO_DEVICE_STATE_SAVING | + VFIO_DEVICE_STATE_RUNNING)) { + /* In pre-copy state we have no data to return for now, + * return 0 pending bytes + */ + pending_bytes = 0; + } else { + if (!vmig->vhca_state_data.state_size) + return 0; + pending_bytes = vmig->vhca_state_data.state_size - + vmig->vhca_state_data.win_start_offset; + } + + ret = copy_to_user(buf, &pending_bytes, sizeof(pending_bytes)); + if (ret) + return -EFAULT; + + /* Window moves forward once data from previous iteration was read */ + if (vmig->region_state & MLX5VF_REGION_DATA_SIZE) + vmig->vhca_state_data.win_start_offset += + min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE, pending_bytes); + + WARN_ON(vmig->vhca_state_data.win_start_offset > + vmig->vhca_state_data.state_size); + + /* New iteration started */ + vmig->region_state = MLX5VF_REGION_PENDING_BYTES; + return sizeof(pending_bytes); +} + +static int mlx5vf_load_state(struct mlx5vf_pci_core_device *mvdev) +{ + if (!mvdev->vmig.vhca_state_data.state_size) + return 0; + + return mlx5vf_cmd_load_vhca_state(mvdev->core_device.pdev, + mvdev->vmig.vhca_id, + &mvdev->vmig.vhca_state_data); +} + +static void mlx5vf_reset_mig_state(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; + + vmig->region_state = 0; + mlx5vf_reset_vhca_state(&vmig->vhca_state_data); +} + +static int mlx5vf_pci_set_device_state(struct mlx5vf_pci_core_device *mvdev, + u32 state) +{ + struct mlx5vf_pci_migration_info *vmig = &mvdev->vmig; + u32 old_state = vmig->vfio_dev_state; + 
int ret = 0; + + if (old_state == VFIO_DEVICE_STATE_ERROR || + !VFIO_DEVICE_STATE_VALID(state) || + (state & ~MLX5VF_SUPPORTED_DEVICE_STATES)) + return -EINVAL; + + /* Running switches off */ + if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) && + (old_state & VFIO_DEVICE_STATE_RUNNING)) { + ret = mlx5vf_pci_quiesce_device(mvdev); + if (ret) + return ret; + ret = mlx5vf_pci_freeze_device(mvdev); + if (ret) { + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; + return ret; + } + } + + /* Resuming switches off */ + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && + (old_state & VFIO_DEVICE_STATE_RESUMING)) { + /* deserialize state into the device */ + ret = mlx5vf_load_state(mvdev); + if (ret) { + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; + return ret; + } + } + + /* Resuming switches on */ + if (((old_state ^ state) & VFIO_DEVICE_STATE_RESUMING) && + (state & VFIO_DEVICE_STATE_RESUMING)) { + mlx5vf_reset_mig_state(mvdev); + ret = mlx5vf_pci_new_write_window(mvdev); + if (ret) + return ret; + } + + /* Saving switches on */ + if (((old_state ^ state) & VFIO_DEVICE_STATE_SAVING) && + (state & VFIO_DEVICE_STATE_SAVING)) { + if (!(state & VFIO_DEVICE_STATE_RUNNING)) { + /* serialize post copy */ + ret = mlx5vf_pci_save_device_data(mvdev); + if (ret) + return ret; + } + } + + /* Running switches on */ + if (((old_state ^ state) & VFIO_DEVICE_STATE_RUNNING) && + (state & VFIO_DEVICE_STATE_RUNNING)) { + ret = mlx5vf_pci_unfreeze_device(mvdev); + if (ret) + return ret; + ret = mlx5vf_pci_unquiesce_device(mvdev); + if (ret) { + vmig->vfio_dev_state = VFIO_DEVICE_STATE_ERROR; + return ret; + } + } + + vmig->vfio_dev_state = state; + return 0; +} + +static ssize_t +mlx5vf_pci_handle_migration_device_state(struct mlx5vf_pci_core_device *mvdev, + char __user *buf, bool iswrite) +{ + size_t count = sizeof(mvdev->vmig.vfio_dev_state); + int ret; + + if (iswrite) { + u32 device_state; + + ret = copy_from_user(&device_state, buf, count); + if (ret) + return -EFAULT; + + ret = mlx5vf_pci_set_device_state(mvdev, device_state); + if (ret) + return ret; + } else { + ret = copy_to_user(buf, &mvdev->vmig.vfio_dev_state, count); + if (ret) + return -EFAULT; + } + + return count; +} + +static ssize_t +mlx5vf_pci_copy_user_data_to_device_state(struct mlx5vf_pci_core_device *mvdev, + char __user *buf, size_t count, + u64 offset) +{ + struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data; + char __user *from_buff = buf; + u32 curr_offset; + u32 win_page_offset; + u32 copy_count; + struct page *page; + char *to_buff; + int ret; + + curr_offset = state_data->win_start_offset + offset; + + do { + page = mlx5vf_get_migration_page(&state_data->mig_data, + curr_offset); + if (!page) + return -EINVAL; + + win_page_offset = curr_offset % PAGE_SIZE; + copy_count = min_t(u32, PAGE_SIZE - win_page_offset, count); + + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + win_page_offset, from_buff, + copy_count); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + from_buff += copy_count; + curr_offset += copy_count; + count -= copy_count; + } while (count > 0); + + return 0; +} + +static ssize_t +mlx5vf_pci_copy_device_state_to_user(struct mlx5vf_pci_core_device *mvdev, + char __user *buf, u64 offset, size_t count) +{ + struct mlx5_vhca_state_data *state_data = &mvdev->vmig.vhca_state_data; + char __user *to_buff = buf; + u32 win_available_bytes; + u32 win_page_offset; + u32 copy_count; + u32 curr_offset; + char *from_buff; + struct page *page; + int ret; + + 
win_available_bytes = + min_t(u64, MLX5VF_MIG_REGION_DATA_SIZE, + mvdev->vmig.vhca_state_data.state_size - + mvdev->vmig.vhca_state_data.win_start_offset); + + if (count + offset > win_available_bytes) + return -EINVAL; + + curr_offset = state_data->win_start_offset + offset; + + do { + page = mlx5vf_get_migration_page(&state_data->mig_data, + curr_offset); + if (!page) + return -EINVAL; + + win_page_offset = curr_offset % PAGE_SIZE; + copy_count = min_t(u32, PAGE_SIZE - win_page_offset, count); + + from_buff = kmap_local_page(page); + ret = copy_to_user(buf, from_buff + win_page_offset, + copy_count); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + + curr_offset += copy_count; + count -= copy_count; + to_buff += copy_count; + } while (count); + + return 0; +} + +static ssize_t +mlx5vf_pci_migration_data_rw(struct mlx5vf_pci_core_device *mvdev, + char __user *buf, size_t count, u64 offset, + bool iswrite) +{ + int ret; + + if (offset + count > MLX5VF_MIG_REGION_DATA_SIZE) + return -EINVAL; + + if (iswrite) + ret = mlx5vf_pci_copy_user_data_to_device_state(mvdev, buf, + count, offset); + else + ret = mlx5vf_pci_copy_device_state_to_user(mvdev, buf, offset, + count); + if (ret) + return ret; + return count; +} + +static ssize_t mlx5vf_pci_mig_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) +{ + struct mlx5vf_pci_core_device *mvdev = + container_of(vdev, struct mlx5vf_pci_core_device, core_device); + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + int ret; + + mutex_lock(&mvdev->state_mutex); + /* Copy to/from the migration region data section */ + if (pos >= MLX5VF_MIG_REGION_DATA_OFFSET) { + ret = mlx5vf_pci_migration_data_rw( + mvdev, buf, count, pos - MLX5VF_MIG_REGION_DATA_OFFSET, + iswrite); + goto end; + } + + switch (pos) { + case VFIO_DEVICE_MIGRATION_OFFSET(device_state): + /* This is RW field. */ + if (count != sizeof(mvdev->vmig.vfio_dev_state)) { + ret = -EINVAL; + break; + } + ret = mlx5vf_pci_handle_migration_device_state(mvdev, buf, + iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(pending_bytes): + /* + * The number of pending bytes still to be migrated from the + * vendor driver. This is RO field. + * Reading this field indicates on the start of a new iteration + * to get device data. + * + */ + ret = mlx5vf_pci_handle_migration_pending_bytes(mvdev, buf, + iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(data_offset): + /* + * The user application should read data_offset field from the + * migration region. The user application should read the + * device data from this offset within the migration region + * during the _SAVING mode or write the device data during the + * _RESUMING mode. This is RO field. + */ + ret = mlx5vf_pci_handle_migration_data_offset(mvdev, buf, + iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(data_size): + /* + * The user application should read data_size to get the size + * in bytes of the data copied to the migration region during + * the _SAVING state by the device. The user application should + * write the size in bytes of the data that was copied to + * the migration region during the _RESUMING state by the user. + * This is RW field. 
+ */ + ret = mlx5vf_pci_handle_migration_data_size(mvdev, buf, + iswrite); + break; + default: + ret = -EFAULT; + break; + } + +end: + mutex_unlock(&mvdev->state_mutex); + return ret; +} + +static struct vfio_pci_regops migration_ops = { + .rw = mlx5vf_pci_mig_rw, +}; + +static int mlx5vf_pci_open_device(struct vfio_device *core_vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &mvdev->core_device; + int vf_id; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + if (!mvdev->migrate_cap) { + vfio_pci_core_finish_enable(vdev); + return 0; + } + + vf_id = pci_iov_vf_id(vdev->pdev); + if (vf_id < 0) { + ret = vf_id; + goto out_disable; + } + + ret = mlx5vf_cmd_get_vhca_id(vdev->pdev, vf_id + 1, + &mvdev->vmig.vhca_id); + if (ret) + goto out_disable; + + ret = vfio_pci_register_dev_region(vdev, VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, + &migration_ops, + MLX5VF_MIG_REGION_DATA_OFFSET + + MLX5VF_MIG_REGION_DATA_SIZE, + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE, + NULL); + if (ret) + goto out_disable; + + mvdev->vmig.vfio_dev_state = VFIO_DEVICE_STATE_RUNNING; + vfio_pci_core_finish_enable(vdev); + return 0; +out_disable: + vfio_pci_core_disable(vdev); + return ret; +} + +static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); + + vfio_pci_core_close_device(core_vdev); + mlx5vf_reset_mig_state(mvdev); +} + +static const struct vfio_device_ops mlx5vf_pci_ops = { + .name = "mlx5-vfio-pci", + .open_device = mlx5vf_pci_open_device, + .close_device = mlx5vf_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, +}; + +static int mlx5vf_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct mlx5vf_pci_core_device *mvdev; + int ret; + + mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL); + if (!mvdev) + return -ENOMEM; + vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); + + if (pdev->is_virtfn) { + struct mlx5_core_dev *mdev = + mlx5_vf_get_core_dev(pdev); + + if (mdev) { + if (MLX5_CAP_GEN(mdev, migration)) { + mvdev->migrate_cap = 1; + mutex_init(&mvdev->state_mutex); + } + mlx5_vf_put_core_dev(mdev); + } + } + + ret = vfio_pci_core_register_device(&mvdev->core_device); + if (ret) + goto out_free; + + dev_set_drvdata(&pdev->dev, mvdev); + return 0; + +out_free: + vfio_pci_core_uninit_device(&mvdev->core_device); + kfree(mvdev); + return ret; +} + +static void mlx5vf_pci_remove(struct pci_dev *pdev) +{ + struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev); + + vfio_pci_core_unregister_device(&mvdev->core_device); + vfio_pci_core_uninit_device(&mvdev->core_device); + kfree(mvdev); +} + +static const struct pci_device_id mlx5vf_pci_table[] = { + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */ + {} +}; + +MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table); + +static struct pci_driver mlx5vf_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = mlx5vf_pci_table, + .probe = mlx5vf_pci_probe, + .remove = mlx5vf_pci_remove, + .err_handler = &vfio_pci_core_err_handlers, +}; + +static void __exit mlx5vf_pci_cleanup(void) +{ + 
pci_unregister_driver(&mlx5vf_pci_driver); +} + +static int __init mlx5vf_pci_init(void) +{ + return pci_register_driver(&mlx5vf_pci_driver); +} + +module_init(mlx5vf_pci_init); +module_exit(mlx5vf_pci_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); +MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); +MODULE_DESCRIPTION( + "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
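
[Editor's note: for context on the region layout implemented above, a
rough userspace sketch of one save iteration against the v1 migration
uAPI (struct vfio_device_migration_info in uapi/vfio.h). It assumes
device_state was already set to _SAVING (with _RUNNING cleared for the
stop-and-copy phase), that the migration region's file offset 'mig_off'
was located earlier via VFIO_DEVICE_GET_REGION_INFO, and that error
handling is trimmed; the helper name is illustrative.]

#include <stddef.h>
#include <unistd.h>
#include <linux/vfio.h>

/* One save iteration: read pending_bytes (which starts a new iteration
 * in this driver), then data_offset and data_size, then copy the device
 * state out of the region's data section. 'buf' must hold at least the
 * region data size (128 KiB in this driver). */
static ssize_t save_one_iteration(int device_fd, off_t mig_off,
				  void *buf, size_t buf_size)
{
	__u64 pending, data_offset, data_size;

	pread(device_fd, &pending, sizeof(pending), mig_off +
	      offsetof(struct vfio_device_migration_info, pending_bytes));
	if (!pending)
		return 0;	/* nothing (more) to transfer */

	pread(device_fd, &data_offset, sizeof(data_offset), mig_off +
	      offsetof(struct vfio_device_migration_info, data_offset));
	pread(device_fd, &data_size, sizeof(data_size), mig_off +
	      offsetof(struct vfio_device_migration_info, data_size));
	if (data_size > buf_size)
		return -1;

	return pread(device_fd, buf, data_size, mig_off + data_offset);
}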