diff mbox series

[10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF devices

Message ID 20240201153337.4033490-11-xin.zeng@intel.com (mailing list archive)
State New, archived
Headers show
Series crypto: qat - enable SRIOV VF live migration for

Commit Message

Xin Zeng Feb. 1, 2024, 3:33 p.m. UTC
Add vfio pci driver for Intel QAT VF devices.

This driver uses vfio_pci_core to register to the VFIO subsystem. It
acts as a vfio agent and interacts with the QAT PF driver to implement
VF live migration.

Co-developed-by: Yahui Cao <yahui.cao@intel.com>
Signed-off-by: Yahui Cao <yahui.cao@intel.com>
Signed-off-by: Xin Zeng <xin.zeng@intel.com>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
---
 MAINTAINERS                         |   8 +
 drivers/vfio/pci/Kconfig            |   2 +
 drivers/vfio/pci/Makefile           |   2 +
 drivers/vfio/pci/intel/qat/Kconfig  |  13 +
 drivers/vfio/pci/intel/qat/Makefile |   4 +
 drivers/vfio/pci/intel/qat/main.c   | 572 ++++++++++++++++++++++++++++
 6 files changed, 601 insertions(+)
 create mode 100644 drivers/vfio/pci/intel/qat/Kconfig
 create mode 100644 drivers/vfio/pci/intel/qat/Makefile
 create mode 100644 drivers/vfio/pci/intel/qat/main.c

Comments

Jason Gunthorpe Feb. 6, 2024, 12:55 p.m. UTC | #1
On Thu, Feb 01, 2024 at 11:33:37PM +0800, Xin Zeng wrote:

> +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +	struct qat_migdev_ops *ops;
> +	struct qat_mig_dev *mdev;
> +	struct pci_dev *parent;
> +	int ret, vf_id;
> +
> +	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
> +	core_vdev->mig_ops = &qat_vf_pci_mig_ops;
> +
> +	ret = vfio_pci_core_init_dev(core_vdev);
> +	if (ret)
> +		return ret;
> +
> +	mutex_init(&qat_vdev->state_mutex);
> +	spin_lock_init(&qat_vdev->reset_lock);
> +
> +	parent = qat_vdev->core_device.pdev->physfn;
> +	vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
> +	if (!parent || vf_id < 0) {
> +		ret = -ENODEV;
> +		goto err_rel;
> +	}
> +
> +	mdev = qat_vfmig_create(parent, vf_id);
> +	if (IS_ERR(mdev)) {
> +		ret = PTR_ERR(mdev);
> +		goto err_rel;
> +	}
> +
> +	ops = mdev->ops;
> +	if (!ops || !ops->init || !ops->cleanup ||
> +	    !ops->open || !ops->close ||
> +	    !ops->save_state || !ops->load_state ||
> +	    !ops->suspend || !ops->resume) {
> +		ret = -EIO;
> +		dev_err(&parent->dev, "Incomplete device migration ops structure!");
> +		goto err_destroy;
> +	}

Why are there ops pointers here? I would expect this should just be
direct function calls to the PF QAT driver.
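For context, that suggestion amounts to the PF QAT driver exporting named
entry points which the variant driver calls directly instead of going
through mdev->ops. A rough sketch (qat_vfmig_open() below is a hypothetical
export, not something defined in this series):

static int qat_vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct qat_vf_core_device *qat_vdev =
		container_of(core_vdev, struct qat_vf_core_device,
			     core_device.vdev);
	struct vfio_pci_core_device *vdev = &qat_vdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	/* Direct call into the PF driver instead of mdev->ops->open() */
	ret = qat_vfmig_open(qat_vdev->mdev);
	if (ret) {
		vfio_pci_core_disable(vdev);
		return ret;
	}
	qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);

	return 0;
}

With exported calls like that, the mdev->ops validity check above would
disappear entirely.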

> +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
> +
> +	if (!qat_vdev->core_device.vdev.mig_ops)
> +		return;
> +
> +	/*
> +	 * As the higher VFIO layers are holding locks across reset and using
> +	 * those same locks with the mm_lock we need to prevent ABBA deadlock
> +	 * with the state_mutex and mm_lock.
> +	 * In case the state_mutex was taken already we defer the cleanup work
> +	 * to the unlock flow of the other running context.
> +	 */
> +	spin_lock(&qat_vdev->reset_lock);
> +	qat_vdev->deferred_reset = true;
> +	if (!mutex_trylock(&qat_vdev->state_mutex)) {
> +		spin_unlock(&qat_vdev->reset_lock);
> +		return;
> +	}
> +	spin_unlock(&qat_vdev->reset_lock);
> +	qat_vf_state_mutex_unlock(qat_vdev);
> +}

Do you really need this? I thought this ugly thing was going to be a
uniquely mlx5 thing..

Jason
Alex Williamson Feb. 6, 2024, 9:14 p.m. UTC | #2
On Thu,  1 Feb 2024 23:33:37 +0800
Xin Zeng <xin.zeng@intel.com> wrote:

> Add vfio pci driver for Intel QAT VF devices.
> 
> This driver uses vfio_pci_core to register to the VFIO subsystem. It
> acts as a vfio agent and interacts with the QAT PF driver to implement
> VF live migration.
> 
> Co-developed-by: Yahui Cao <yahui.cao@intel.com>
> Signed-off-by: Yahui Cao <yahui.cao@intel.com>
> Signed-off-by: Xin Zeng <xin.zeng@intel.com>
> Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
> ---
>  MAINTAINERS                         |   8 +
>  drivers/vfio/pci/Kconfig            |   2 +
>  drivers/vfio/pci/Makefile           |   2 +
>  drivers/vfio/pci/intel/qat/Kconfig  |  13 +
>  drivers/vfio/pci/intel/qat/Makefile |   4 +
>  drivers/vfio/pci/intel/qat/main.c   | 572 ++++++++++++++++++++++++++++
>  6 files changed, 601 insertions(+)
>  create mode 100644 drivers/vfio/pci/intel/qat/Kconfig
>  create mode 100644 drivers/vfio/pci/intel/qat/Makefile
>  create mode 100644 drivers/vfio/pci/intel/qat/main.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 8d1052fa6a69..c1d3e4cb3892 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -23095,6 +23095,14 @@ S:	Maintained
>  F:	Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst
>  F:	drivers/vfio/pci/pds/
>  
> +VFIO QAT PCI DRIVER
> +M:	Xin Zeng <xin.zeng@intel.com>
> +M:	Giovanni Cabiddu <giovanni.cabiddu@intel.com>
> +L:	kvm@vger.kernel.org
> +L:	qat-linux@intel.com
> +S:	Supported
> +F:	drivers/vfio/pci/intel/qat/
> +
>  VFIO PLATFORM DRIVER
>  M:	Eric Auger <eric.auger@redhat.com>
>  L:	kvm@vger.kernel.org
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 18c397df566d..329d25c53274 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig"
>  
>  source "drivers/vfio/pci/virtio/Kconfig"
>  
> +source "drivers/vfio/pci/intel/qat/Kconfig"
> +
>  endmenu
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index 046139a4eca5..a87b6b43ce1c 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
>  obj-$(CONFIG_PDS_VFIO_PCI) += pds/
>  
>  obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
> +
> +obj-$(CONFIG_QAT_VFIO_PCI) += intel/qat/
> diff --git a/drivers/vfio/pci/intel/qat/Kconfig b/drivers/vfio/pci/intel/qat/Kconfig
> new file mode 100644
> index 000000000000..71b28ac0bf6a
> --- /dev/null
> +++ b/drivers/vfio/pci/intel/qat/Kconfig
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config QAT_VFIO_PCI
> +	tristate "VFIO support for QAT VF PCI devices"
> +	select VFIO_PCI_CORE
> +	depends on CRYPTO_DEV_QAT
> +	depends on CRYPTO_DEV_QAT_4XXX

CRYPTO_DEV_QAT_4XXX selects CRYPTO_DEV_QAT, is it necessary to depend
on CRYPTO_DEV_QAT here?

> +	help
> +	  This provides migration support for Intel(R) QAT Virtual Function
> +	  using the VFIO framework.
> +
> +	  To compile this as a module, choose M here: the module
> +	  will be called qat_vfio_pci. If you don't know what to do here,
> +	  say N.
> diff --git a/drivers/vfio/pci/intel/qat/Makefile b/drivers/vfio/pci/intel/qat/Makefile
> new file mode 100644
> index 000000000000..9289ae4c51bf
> --- /dev/null
> +++ b/drivers/vfio/pci/intel/qat/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o
> +qat_vfio_pci-y := main.o
> +

Blank line at end of file.

> diff --git a/drivers/vfio/pci/intel/qat/main.c b/drivers/vfio/pci/intel/qat/main.c
> new file mode 100644
> index 000000000000..85d0ed701397
> --- /dev/null
> +++ b/drivers/vfio/pci/intel/qat/main.c
> @@ -0,0 +1,572 @@
...
> +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +	struct qat_migdev_ops *ops;
> +	struct qat_mig_dev *mdev;
> +	struct pci_dev *parent;
> +	int ret, vf_id;
> +
> +	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;

AFAICT it's always good practice to support VFIO_MIGRATION_PRE_COPY,
even if only to leave yourself an option to mark an incompatible ABI
change in the data stream that can be checked before the stop-copy
phase of migration.  See for example the mtty sample driver.  Thanks,

Alex
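For reference, advertising the optional phase is a one-line change at init
time. A minimal sketch, not part of this series (a driver doing this would
also need to implement the RUNNING -> PRE_COPY and PRE_COPY -> STOP_COPY
arcs and service VFIO_MIG_GET_PRECOPY_INFO on the saving file, as the mtty
sample referenced above does):

	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
				     VFIO_MIGRATION_P2P |
				     VFIO_MIGRATION_PRE_COPY;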

> +	core_vdev->mig_ops = &qat_vf_pci_mig_ops;
> +
> +	ret = vfio_pci_core_init_dev(core_vdev);
> +	if (ret)
> +		return ret;
> +
> +	mutex_init(&qat_vdev->state_mutex);
> +	spin_lock_init(&qat_vdev->reset_lock);
> +
> +	parent = qat_vdev->core_device.pdev->physfn;
> +	vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
> +	if (!parent || vf_id < 0) {
> +		ret = -ENODEV;
> +		goto err_rel;
> +	}
> +
> +	mdev = qat_vfmig_create(parent, vf_id);
> +	if (IS_ERR(mdev)) {
> +		ret = PTR_ERR(mdev);
> +		goto err_rel;
> +	}
> +
> +	ops = mdev->ops;
> +	if (!ops || !ops->init || !ops->cleanup ||
> +	    !ops->open || !ops->close ||
> +	    !ops->save_state || !ops->load_state ||
> +	    !ops->suspend || !ops->resume) {
> +		ret = -EIO;
> +		dev_err(&parent->dev, "Incomplete device migration ops structure!");
> +		goto err_destroy;
> +	}
> +	ret = ops->init(mdev);
> +	if (ret)
> +		goto err_destroy;
> +
> +	qat_vdev->mdev = mdev;
> +
> +	return 0;
> +
> +err_destroy:
> +	qat_vfmig_destroy(mdev);
> +err_rel:
> +	vfio_pci_core_release_dev(core_vdev);
> +	return ret;
> +}
Shameerali Kolothum Thodi Feb. 8, 2024, 12:17 p.m. UTC | #3
> -----Original Message-----
> From: Xin Zeng <xin.zeng@intel.com>
> Sent: Thursday, February 1, 2024 3:34 PM
> To: herbert@gondor.apana.org.au; alex.williamson@redhat.com;
> jgg@nvidia.com; yishaih@nvidia.com; Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; kevin.tian@intel.com
> Cc: linux-crypto@vger.kernel.org; kvm@vger.kernel.org; qat-
> linux@intel.com; Xin Zeng <xin.zeng@intel.com>; Yahui Cao
> <yahui.cao@intel.com>
> Subject: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF devices
> 
> Add vfio pci driver for Intel QAT VF devices.
> 
> This driver uses vfio_pci_core to register to the VFIO subsystem. It
> acts as a vfio agent and interacts with the QAT PF driver to implement
> VF live migration.
> 
> Co-developed-by: Yahui Cao <yahui.cao@intel.com>
> Signed-off-by: Yahui Cao <yahui.cao@intel.com>
> Signed-off-by: Xin Zeng <xin.zeng@intel.com>
> Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
> ---
>  MAINTAINERS                         |   8 +
>  drivers/vfio/pci/Kconfig            |   2 +
>  drivers/vfio/pci/Makefile           |   2 +
>  drivers/vfio/pci/intel/qat/Kconfig  |  13 +
>  drivers/vfio/pci/intel/qat/Makefile |   4 +
>  drivers/vfio/pci/intel/qat/main.c   | 572 ++++++++++++++++++++++++++++
>  6 files changed, 601 insertions(+)
>  create mode 100644 drivers/vfio/pci/intel/qat/Kconfig
>  create mode 100644 drivers/vfio/pci/intel/qat/Makefile
>  create mode 100644 drivers/vfio/pci/intel/qat/main.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 8d1052fa6a69..c1d3e4cb3892 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -23095,6 +23095,14 @@ S:	Maintained
>  F:
> 	Documentation/networking/device_drivers/ethernet/amd/pds_vfio_
> pci.rst
>  F:	drivers/vfio/pci/pds/
> 
> +VFIO QAT PCI DRIVER
> +M:	Xin Zeng <xin.zeng@intel.com>
> +M:	Giovanni Cabiddu <giovanni.cabiddu@intel.com>
> +L:	kvm@vger.kernel.org
> +L:	qat-linux@intel.com
> +S:	Supported
> +F:	drivers/vfio/pci/intel/qat/
> +
>  VFIO PLATFORM DRIVER
>  M:	Eric Auger <eric.auger@redhat.com>
>  L:	kvm@vger.kernel.org
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 18c397df566d..329d25c53274 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig"
> 
>  source "drivers/vfio/pci/virtio/Kconfig"
> 
> +source "drivers/vfio/pci/intel/qat/Kconfig"
> +
>  endmenu
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index 046139a4eca5..a87b6b43ce1c 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
>  obj-$(CONFIG_PDS_VFIO_PCI) += pds/
> 
>  obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
> +
> +obj-$(CONFIG_QAT_VFIO_PCI) += intel/qat/
> diff --git a/drivers/vfio/pci/intel/qat/Kconfig
> b/drivers/vfio/pci/intel/qat/Kconfig
> new file mode 100644
> index 000000000000..71b28ac0bf6a
> --- /dev/null
> +++ b/drivers/vfio/pci/intel/qat/Kconfig
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config QAT_VFIO_PCI
> +	tristate "VFIO support for QAT VF PCI devices"
> +	select VFIO_PCI_CORE
> +	depends on CRYPTO_DEV_QAT
> +	depends on CRYPTO_DEV_QAT_4XXX
> +	help
> +	  This provides migration support for Intel(R) QAT Virtual Function
> +	  using the VFIO framework.
> +
> +	  To compile this as a module, choose M here: the module
> +	  will be called qat_vfio_pci. If you don't know what to do here,
> +	  say N.
> diff --git a/drivers/vfio/pci/intel/qat/Makefile
> b/drivers/vfio/pci/intel/qat/Makefile
> new file mode 100644
> index 000000000000..9289ae4c51bf
> --- /dev/null
> +++ b/drivers/vfio/pci/intel/qat/Makefile
> @@ -0,0 +1,4 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o
> +qat_vfio_pci-y := main.o
> +
> diff --git a/drivers/vfio/pci/intel/qat/main.c
> b/drivers/vfio/pci/intel/qat/main.c
> new file mode 100644
> index 000000000000..85d0ed701397
> --- /dev/null
> +++ b/drivers/vfio/pci/intel/qat/main.c
> @@ -0,0 +1,572 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright(c) 2024 Intel Corporation */
> +
> +#include <linux/anon_inodes.h>
> +#include <linux/container_of.h>
> +#include <linux/device.h>
> +#include <linux/file.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/pci.h>
> +#include <linux/sizes.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio_pci_core.h>
> +#include <linux/qat/qat_mig_dev.h>
> +
> +struct qat_vf_migration_file {
> +	struct file *filp;
> +	/* protects migration region context */
> +	struct mutex lock;
> +	bool disabled;
> +	struct qat_mig_dev *mdev;
> +};
> +
> +struct qat_vf_core_device {
> +	struct vfio_pci_core_device core_device;
> +	struct qat_mig_dev *mdev;
> +	/* protects migration state */
> +	struct mutex state_mutex;
> +	enum vfio_device_mig_state mig_state;
> +	/* protects reset down flow */
> +	spinlock_t reset_lock;
> +	bool deferred_reset;
> +	struct qat_vf_migration_file *resuming_migf;
> +	struct qat_vf_migration_file *saving_migf;
> +};
> +
> +static int qat_vf_pci_open_device(struct vfio_device *core_vdev)
> +{
> +	struct qat_vf_core_device *qat_vdev =
> +		container_of(core_vdev, struct qat_vf_core_device,
> +			     core_device.vdev);
> +	struct vfio_pci_core_device *vdev = &qat_vdev->core_device;
> +	int ret;
> +
> +	ret = vfio_pci_core_enable(vdev);
> +	if (ret)
> +		return ret;
> +
> +	ret = qat_vdev->mdev->ops->open(qat_vdev->mdev);
> +	if (ret) {
> +		vfio_pci_core_disable(vdev);
> +		return ret;
> +	}
> +	qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
> +
> +	vfio_pci_core_finish_enable(vdev);
> +
> +	return 0;
> +}
> +
> +static void qat_vf_disable_fd(struct qat_vf_migration_file *migf)
> +{
> +	mutex_lock(&migf->lock);
> +	migf->disabled = true;
> +	migf->filp->f_pos = 0;
> +	mutex_unlock(&migf->lock);
> +}
> +
> +static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev)
> +{
> +	if (qat_vdev->resuming_migf) {
> +		qat_vf_disable_fd(qat_vdev->resuming_migf);
> +		fput(qat_vdev->resuming_migf->filp);
> +		qat_vdev->resuming_migf = NULL;
> +	}
> +
> +	if (qat_vdev->saving_migf) {
> +		qat_vf_disable_fd(qat_vdev->saving_migf);
> +		fput(qat_vdev->saving_migf->filp);
> +		qat_vdev->saving_migf = NULL;
> +	}
> +}
> +
> +static void qat_vf_pci_close_device(struct vfio_device *core_vdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +
> +	qat_vdev->mdev->ops->close(qat_vdev->mdev);
> +	qat_vf_disable_fds(qat_vdev);
> +	vfio_pci_core_close_device(core_vdev);
> +}
> +
> +static ssize_t qat_vf_save_read(struct file *filp, char __user *buf,
> +				size_t len, loff_t *pos)
> +{
> +	struct qat_vf_migration_file *migf = filp->private_data;
> +	ssize_t done = 0;
> +	loff_t *offs;
> +	int ret;
> +
> +	if (pos)
> +		return -ESPIPE;
> +	offs = &filp->f_pos;
> +
> +	mutex_lock(&migf->lock);
> +	if (*offs > migf->mdev->state_size || *offs < 0) {
> +		done = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	if (migf->disabled) {
> +		done = -ENODEV;
> +		goto out_unlock;
> +	}
> +
> +	len = min_t(size_t, migf->mdev->state_size - *offs, len);
> +	if (len) {
> +		ret = copy_to_user(buf, migf->mdev->state + *offs, len);
> +		if (ret) {
> +			done = -EFAULT;
> +			goto out_unlock;
> +		}
> +		*offs += len;
> +		done = len;
> +	}
> +
> +out_unlock:
> +	mutex_unlock(&migf->lock);
> +	return done;
> +}
> +
> +static int qat_vf_release_file(struct inode *inode, struct file *filp)
> +{
> +	struct qat_vf_migration_file *migf = filp->private_data;
> +
> +	qat_vf_disable_fd(migf);
> +	mutex_destroy(&migf->lock);
> +	kfree(migf);
> +
> +	return 0;
> +}
> +
> +static const struct file_operations qat_vf_save_fops = {
> +	.owner = THIS_MODULE,
> +	.read = qat_vf_save_read,
> +	.release = qat_vf_release_file,
> +	.llseek = no_llseek,
> +};
> +
> +static struct qat_vf_migration_file *
> +qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev)
> +{
> +	struct qat_vf_migration_file *migf;
> +	int ret;
> +
> +	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
> +	if (!migf)
> +		return ERR_PTR(-ENOMEM);
> +
> +	migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops,
> migf, O_RDONLY);
> +	ret = PTR_ERR_OR_ZERO(migf->filp);
> +	if (ret) {
> +		kfree(migf);
> +		return ERR_PTR(ret);
> +	}
> +
> +	stream_open(migf->filp->f_inode, migf->filp);
> +	mutex_init(&migf->lock);
> +
> +	ret = qat_vdev->mdev->ops->save_state(qat_vdev->mdev);
> +	if (ret) {
> +		fput(migf->filp);
> +		kfree(migf);

Probably don't need that kfree(migf) here, as fput() --> qat_vf_release_file() will do that.

> +		return ERR_PTR(ret);
> +	}
> +
> +	migf->mdev = qat_vdev->mdev;
> +
> +	return migf;
> +}
> +
> +static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf,
> +				   size_t len, loff_t *pos)
> +{
> +	struct qat_vf_migration_file *migf = filp->private_data;
> +	loff_t end, *offs;
> +	ssize_t done = 0;
> +	int ret;
> +
> +	if (pos)
> +		return -ESPIPE;
> +	offs = &filp->f_pos;
> +
> +	if (*offs < 0 ||
> +	    check_add_overflow((loff_t)len, *offs, &end))
> +		return -EOVERFLOW;
> +
> +	if (end > migf->mdev->state_size)
> +		return -ENOMEM;
> +
> +	mutex_lock(&migf->lock);
> +	if (migf->disabled) {
> +		done = -ENODEV;
> +		goto out_unlock;
> +	}
> +
> +	ret = copy_from_user(migf->mdev->state + *offs, buf, len);
> +	if (ret) {
> +		done = -EFAULT;
> +		goto out_unlock;
> +	}
> +	*offs += len;
> +	done = len;
> +
> +out_unlock:
> +	mutex_unlock(&migf->lock);
> +	return done;
> +}
> +
> +static const struct file_operations qat_vf_resume_fops = {
> +	.owner = THIS_MODULE,
> +	.write = qat_vf_resume_write,
> +	.release = qat_vf_release_file,
> +	.llseek = no_llseek,
> +};
> +
> +static struct qat_vf_migration_file *
> +qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev)
> +{
> +	struct qat_vf_migration_file *migf;
> +	int ret;
> +
> +	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
> +	if (!migf)
> +		return ERR_PTR(-ENOMEM);
> +
> +	migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops,
> migf, O_WRONLY);
> +	ret = PTR_ERR_OR_ZERO(migf->filp);
> +	if (ret) {
> +		kfree(migf);
> +		return ERR_PTR(ret);
> +	}
> +
> +	migf->mdev = qat_vdev->mdev;
> +	stream_open(migf->filp->f_inode, migf->filp);
> +	mutex_init(&migf->lock);
> +
> +	return migf;
> +}
> +
> +static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev)
> +{
> +	return qat_vdev->mdev->ops->load_state(qat_vdev->mdev);
> +}
> +
> +static struct file *qat_vf_pci_step_device_state(struct qat_vf_core_device
> *qat_vdev, u32 new)
> +{
> +	u32 cur = qat_vdev->mig_state;
> +	int ret;
> +
> +	if ((cur == VFIO_DEVICE_STATE_RUNNING && new ==
> VFIO_DEVICE_STATE_RUNNING_P2P)) {
> +		ret = qat_vdev->mdev->ops->suspend(qat_vdev->mdev);
> +		if (ret)
> +			return ERR_PTR(ret);
> +		return NULL;
> +	}
> +
> +	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new ==
> VFIO_DEVICE_STATE_STOP) ||
> +	    (cur == VFIO_DEVICE_STATE_STOP && new ==
> VFIO_DEVICE_STATE_RUNNING_P2P))
> +		return NULL;
> +
> +	if (cur == VFIO_DEVICE_STATE_STOP && new ==
> VFIO_DEVICE_STATE_STOP_COPY) {
> +		struct qat_vf_migration_file *migf;
> +
> +		migf = qat_vf_save_device_data(qat_vdev);
> +		if (IS_ERR(migf))
> +			return ERR_CAST(migf);
> +		get_file(migf->filp);
> +		qat_vdev->saving_migf = migf;
> +		return migf->filp;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new ==
> VFIO_DEVICE_STATE_STOP) {
> +		qat_vf_disable_fds(qat_vdev);
> +		return NULL;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_STOP && new ==
> VFIO_DEVICE_STATE_RESUMING) {
> +		struct qat_vf_migration_file *migf;
> +
> +		migf = qat_vf_resume_device_data(qat_vdev);
> +		if (IS_ERR(migf))
> +			return ERR_CAST(migf);
> +		get_file(migf->filp);
> +		qat_vdev->resuming_migf = migf;
> +		return migf->filp;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_RESUMING && new ==
> VFIO_DEVICE_STATE_STOP) {
> +		ret = qat_vf_load_device_data(qat_vdev);
> +		if (ret)
> +			return ERR_PTR(ret);
> +
> +		qat_vf_disable_fds(qat_vdev);
> +		return NULL;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new ==
> VFIO_DEVICE_STATE_RUNNING) {
> +		qat_vdev->mdev->ops->resume(qat_vdev->mdev);
> +		return NULL;
> +	}
> +
> +	/* vfio_mig_get_next_state() does not use arcs other than the above
> */
> +	WARN_ON(true);
> +	return ERR_PTR(-EINVAL);
> +}
> +
> +static void qat_vf_state_mutex_unlock(struct qat_vf_core_device
> *qat_vdev)
> +{
> +again:
> +	spin_lock(&qat_vdev->reset_lock);
> +	if (qat_vdev->deferred_reset) {
> +		qat_vdev->deferred_reset = false;
> +		spin_unlock(&qat_vdev->reset_lock);
> +		qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
> +		qat_vf_disable_fds(qat_vdev);
> +		goto again;
> +	}
> +	mutex_unlock(&qat_vdev->state_mutex);
> +	spin_unlock(&qat_vdev->reset_lock);
> +}
> +
> +static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev,
> +						enum vfio_device_mig_state
> new_state)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +	enum vfio_device_mig_state next_state;
> +	struct file *res = NULL;
> +	int ret;
> +
> +	mutex_lock(&qat_vdev->state_mutex);
> +	while (new_state != qat_vdev->mig_state) {
> +		ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state,
> +					      new_state, &next_state);
> +		if (ret) {
> +			res = ERR_PTR(ret);
> +			break;
> +		}
> +		res = qat_vf_pci_step_device_state(qat_vdev, next_state);
> +		if (IS_ERR(res))
> +			break;
> +		qat_vdev->mig_state = next_state;
> +		if (WARN_ON(res && new_state != qat_vdev->mig_state)) {
> +			fput(res);
> +			res = ERR_PTR(-EINVAL);
> +			break;
> +		}
> +	}
> +	qat_vf_state_mutex_unlock(qat_vdev);
> +
> +	return res;
> +}
> +
> +static int qat_vf_pci_get_device_state(struct vfio_device *vdev,
> +				       enum vfio_device_mig_state *curr_state)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +
> +	mutex_lock(&qat_vdev->state_mutex);
> +	*curr_state = qat_vdev->mig_state;
> +	qat_vf_state_mutex_unlock(qat_vdev);
> +
> +	return 0;
> +}
> +
> +static int qat_vf_pci_get_data_size(struct vfio_device *vdev,
> +				    unsigned long *stop_copy_length)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +
> +	*stop_copy_length = qat_vdev->mdev->state_size;

Do we need a lock here, or is this not changing?

> +	return 0;
> +}
> +
> +static const struct vfio_migration_ops qat_vf_pci_mig_ops = {
> +	.migration_set_state = qat_vf_pci_set_device_state,
> +	.migration_get_state = qat_vf_pci_get_device_state,
> +	.migration_get_data_size = qat_vf_pci_get_data_size,
> +};
> +
> +static void qat_vf_pci_release_dev(struct vfio_device *core_vdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +
> +	qat_vdev->mdev->ops->cleanup(qat_vdev->mdev);
> +	qat_vfmig_destroy(qat_vdev->mdev);
> +	mutex_destroy(&qat_vdev->state_mutex);
> +	vfio_pci_core_release_dev(core_vdev);
> +}
> +
> +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> +			struct qat_vf_core_device, core_device.vdev);
> +	struct qat_migdev_ops *ops;
> +	struct qat_mig_dev *mdev;
> +	struct pci_dev *parent;
> +	int ret, vf_id;
> +
> +	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
> VFIO_MIGRATION_P2P;
> +	core_vdev->mig_ops = &qat_vf_pci_mig_ops;
> +
> +	ret = vfio_pci_core_init_dev(core_vdev);
> +	if (ret)
> +		return ret;
> +
> +	mutex_init(&qat_vdev->state_mutex);
> +	spin_lock_init(&qat_vdev->reset_lock);
> +
> +	parent = qat_vdev->core_device.pdev->physfn;

Can we use pci_physfn() here?

> +	vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
> +	if (!parent || vf_id < 0) {

Also, if pci_iov_vf_id() returns success, I don't think you need to
check for parent and can use it directly below.
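Something along these lines (an untested sketch): a successful
pci_iov_vf_id() already implies the device is a VF, and pci_physfn()
resolves the parent, so the explicit physfn check can go away.

	vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
	if (vf_id < 0) {
		ret = -ENODEV;
		goto err_rel;
	}

	/* A VF always has a physfn, so no separate NULL check is needed. */
	parent = pci_physfn(qat_vdev->core_device.pdev);
	mdev = qat_vfmig_create(parent, vf_id);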

> +		ret = -ENODEV;
> +		goto err_rel;
> +	}
> +
> +	mdev = qat_vfmig_create(parent, vf_id);
> +	if (IS_ERR(mdev)) {
> +		ret = PTR_ERR(mdev);
> +		goto err_rel;
> +	}
> +
> +	ops = mdev->ops;
> +	if (!ops || !ops->init || !ops->cleanup ||
> +	    !ops->open || !ops->close ||
> +	    !ops->save_state || !ops->load_state ||
> +	    !ops->suspend || !ops->resume) {
> +		ret = -EIO;
> +		dev_err(&parent->dev, "Incomplete device migration ops
> structure!");
> +		goto err_destroy;
> +	}

If all these ops are a must, why can't we move the check inside qat_vfmig_create()?
Or rather call them explicitly, as suggested by Jason.

Thanks,
Shameer

> +	ret = ops->init(mdev);
> +	if (ret)
> +		goto err_destroy;
> +
> +	qat_vdev->mdev = mdev;
> +
> +	return 0;
> +
> +err_destroy:
> +	qat_vfmig_destroy(mdev);
> +err_rel:
> +	vfio_pci_core_release_dev(core_vdev);
> +	return ret;
> +}
> +
> +static const struct vfio_device_ops qat_vf_pci_ops = {
> +	.name = "qat-vf-vfio-pci",
> +	.init = qat_vf_pci_init_dev,
> +	.release = qat_vf_pci_release_dev,
> +	.open_device = qat_vf_pci_open_device,
> +	.close_device = qat_vf_pci_close_device,
> +	.ioctl = vfio_pci_core_ioctl,
> +	.read = vfio_pci_core_read,
> +	.write = vfio_pci_core_write,
> +	.mmap = vfio_pci_core_mmap,
> +	.request = vfio_pci_core_request,
> +	.match = vfio_pci_core_match,
> +	.bind_iommufd = vfio_iommufd_physical_bind,
> +	.unbind_iommufd = vfio_iommufd_physical_unbind,
> +	.attach_ioas = vfio_iommufd_physical_attach_ioas,
> +	.detach_ioas = vfio_iommufd_physical_detach_ioas,
> +};
> +
> +static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev)
> +{
> +	struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev);
> +
> +	return container_of(core_device, struct qat_vf_core_device,
> core_device);
> +}
> +
> +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
> +
> +	if (!qat_vdev->core_device.vdev.mig_ops)
> +		return;
> +
> +	/*
> +	 * As the higher VFIO layers are holding locks across reset and using
> +	 * those same locks with the mm_lock we need to prevent ABBA
> deadlock
> +	 * with the state_mutex and mm_lock.
> +	 * In case the state_mutex was taken already we defer the cleanup
> work
> +	 * to the unlock flow of the other running context.
> +	 */
> +	spin_lock(&qat_vdev->reset_lock);
> +	qat_vdev->deferred_reset = true;
> +	if (!mutex_trylock(&qat_vdev->state_mutex)) {
> +		spin_unlock(&qat_vdev->reset_lock);
> +		return;
> +	}
> +	spin_unlock(&qat_vdev->reset_lock);
> +	qat_vf_state_mutex_unlock(qat_vdev);
> +}
> +
> +static int
> +qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct qat_vf_core_device *qat_vdev;
> +	int ret;
> +
> +	qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev,
> dev, &qat_vf_pci_ops);
> +	if (IS_ERR(qat_vdev))
> +		return PTR_ERR(qat_vdev);
> +
> +	pci_set_drvdata(pdev, &qat_vdev->core_device);
> +	ret = vfio_pci_core_register_device(&qat_vdev->core_device);
> +	if (ret)
> +		goto out_put_device;
> +
> +	return 0;
> +
> +out_put_device:
> +	vfio_put_device(&qat_vdev->core_device.vdev);
> +	return ret;
> +}
> +
> +static void qat_vf_vfio_pci_remove(struct pci_dev *pdev)
> +{
> +	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
> +
> +	vfio_pci_core_unregister_device(&qat_vdev->core_device);
> +	vfio_put_device(&qat_vdev->core_device.vdev);
> +}
> +
> +static const struct pci_device_id qat_vf_vfio_pci_table[] = {
> +	/* Intel QAT GEN4 4xxx VF device */
> +	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL,
> 0x4941) },
> +	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL,
> 0x4943) },
> +	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL,
> 0x4945) },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table);
> +
> +static const struct pci_error_handlers qat_vf_err_handlers = {
> +	.reset_done = qat_vf_pci_aer_reset_done,
> +	.error_detected = vfio_pci_core_aer_err_detected,
> +};
> +
> +static struct pci_driver qat_vf_vfio_pci_driver = {
> +	.name = "qat_vfio_pci",
> +	.id_table = qat_vf_vfio_pci_table,
> +	.probe = qat_vf_vfio_pci_probe,
> +	.remove = qat_vf_vfio_pci_remove,
> +	.err_handler = &qat_vf_err_handlers,
> +	.driver_managed_dma = true,
> +};
> +module_pci_driver(qat_vf_vfio_pci_driver)
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Intel Corporation");
> +MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration
> support for Intel(R) QAT GEN4 device family");
> +MODULE_IMPORT_NS(CRYPTO_QAT);
> --
> 2.18.2
Xin Zeng Feb. 9, 2024, 8:23 a.m. UTC | #4
Thanks for your comments, Jason.
On Tuesday, February 6, 2024 8:55 PM, Jason Gunthorpe <jgg@nvidia.com> wrote:
> > +
> > +	ops = mdev->ops;
> > +	if (!ops || !ops->init || !ops->cleanup ||
> > +	    !ops->open || !ops->close ||
> > +	    !ops->save_state || !ops->load_state ||
> > +	    !ops->suspend || !ops->resume) {
> > +		ret = -EIO;
> > +		dev_err(&parent->dev, "Incomplete device migration ops
> structure!");
> > +		goto err_destroy;
> > +	}
> 
> Why are there ops pointers here? I would expect this should just be
> direct function calls to the PF QAT driver.

I did have a version where the direct function calls were implemented in
the QAT driver, but when I looked at those functions, most of them
only translated the interface to the ops pointer. That's why I put
the ops pointers directly into the vfio variant driver.

> 
> > +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
> > +{
> > +	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
> > +
> > +	if (!qat_vdev->core_device.vdev.mig_ops)
> > +		return;
> > +
> > +	/*
> > +	 * As the higher VFIO layers are holding locks across reset and using
> > +	 * those same locks with the mm_lock we need to prevent ABBA
> deadlock
> > +	 * with the state_mutex and mm_lock.
> > +	 * In case the state_mutex was taken already we defer the cleanup work
> > +	 * to the unlock flow of the other running context.
> > +	 */
> > +	spin_lock(&qat_vdev->reset_lock);
> > +	qat_vdev->deferred_reset = true;
> > +	if (!mutex_trylock(&qat_vdev->state_mutex)) {
> > +		spin_unlock(&qat_vdev->reset_lock);
> > +		return;
> > +	}
> > +	spin_unlock(&qat_vdev->reset_lock);
> > +	qat_vf_state_mutex_unlock(qat_vdev);
> > +}
> 
> Do you really need this? I thought this ugly thing was going to be a
> uniquely mlx5 thing..

I think that's still required to keep the migration state synchronized
if the VF is reset by other VFIO emulation paths. Is that the case?
BTW, this implementation is not only in the mlx5 driver, but also in other
vfio pci variant drivers such as the hisilicon acc driver and the pds driver.
Thanks,
Xin
Jason Gunthorpe Feb. 9, 2024, 12:10 p.m. UTC | #5
On Fri, Feb 09, 2024 at 08:23:32AM +0000, Zeng, Xin wrote:
> Thanks for your comments, Jason.
> On Tuesday, February 6, 2024 8:55 PM, Jason Gunthorpe <jgg@nvidia.com> wrote:
> > > +
> > > +	ops = mdev->ops;
> > > +	if (!ops || !ops->init || !ops->cleanup ||
> > > +	    !ops->open || !ops->close ||
> > > +	    !ops->save_state || !ops->load_state ||
> > > +	    !ops->suspend || !ops->resume) {
> > > +		ret = -EIO;
> > > +		dev_err(&parent->dev, "Incomplete device migration ops
> > structure!");
> > > +		goto err_destroy;
> > > +	}
> > 
> > Why are there ops pointers here? I would expect this should just be
> > direct function calls to the PF QAT driver.
> 
> I indeed had a version where the direct function calls are Implemented in
> QAT driver, while when I look at the functions, most of them 
> only translate the interface to the ops pointer. That's why I put
> ops pointers directly into vfio variant driver.

But why is there an ops indirection at all? Are there more than one
ops?

> > > +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
> > > +{
> > > +	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
> > > +
> > > +	if (!qat_vdev->core_device.vdev.mig_ops)
> > > +		return;
> > > +
> > > +	/*
> > > +	 * As the higher VFIO layers are holding locks across reset and using
> > > +	 * those same locks with the mm_lock we need to prevent ABBA
> > deadlock
> > > +	 * with the state_mutex and mm_lock.
> > > +	 * In case the state_mutex was taken already we defer the cleanup work
> > > +	 * to the unlock flow of the other running context.
> > > +	 */
> > > +	spin_lock(&qat_vdev->reset_lock);
> > > +	qat_vdev->deferred_reset = true;
> > > +	if (!mutex_trylock(&qat_vdev->state_mutex)) {
> > > +		spin_unlock(&qat_vdev->reset_lock);
> > > +		return;
> > > +	}
> > > +	spin_unlock(&qat_vdev->reset_lock);
> > > +	qat_vf_state_mutex_unlock(qat_vdev);
> > > +}
> > 
> > Do you really need this? I thought this ugly thing was going to be a
> > uniquely mlx5 thing..
> 
> I think that's still required to make the migration state synchronized
> if the VF is reset by other VFIO emulation paths. Is it the case? 
> BTW, this implementation is not only in mlx5 driver, but also in other
> Vfio pci variant drivers such as hisilicon acc driver and pds
> driver.

It had to specifically do with the mm lock interaction that, I
thought, was going to be unique to the mlx driver. Otherwise you could
just directly hold the state_mutex here.

Yishai do you remember the exact trace for the mmlock entanglement?

Jason
Xin Zeng Feb. 9, 2024, 4:16 p.m. UTC | #6
Thanks for your comments, Alex.
On Wednesday, February 7, 2024 5:15 AM, Alex Williamson <alex.williamson@redhat.com> wrote:
> > diff --git a/drivers/vfio/pci/intel/qat/Kconfig
> b/drivers/vfio/pci/intel/qat/Kconfig
> > new file mode 100644
> > index 000000000000..71b28ac0bf6a
> > --- /dev/null
> > +++ b/drivers/vfio/pci/intel/qat/Kconfig
> > @@ -0,0 +1,13 @@
> > +# SPDX-License-Identifier: GPL-2.0-only
> > +config QAT_VFIO_PCI
> > +	tristate "VFIO support for QAT VF PCI devices"
> > +	select VFIO_PCI_CORE
> > +	depends on CRYPTO_DEV_QAT
> > +	depends on CRYPTO_DEV_QAT_4XXX
> 
> CRYPTO_DEV_QAT_4XXX selects CRYPTO_DEV_QAT, is it necessary to depend
> on CRYPTO_DEV_QAT here?

You are right, we don't need it, will update it in next version.

> 
> > +	help
> > +	  This provides migration support for Intel(R) QAT Virtual Function
> > +	  using the VFIO framework.
> > +
> > +	  To compile this as a module, choose M here: the module
> > +	  will be called qat_vfio_pci. If you don't know what to do here,
> > +	  say N.
> > diff --git a/drivers/vfio/pci/intel/qat/Makefile
> b/drivers/vfio/pci/intel/qat/Makefile
> > new file mode 100644
> > index 000000000000..9289ae4c51bf
> > --- /dev/null
> > +++ b/drivers/vfio/pci/intel/qat/Makefile
> > @@ -0,0 +1,4 @@
> > +# SPDX-License-Identifier: GPL-2.0-only
> > +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o
> > +qat_vfio_pci-y := main.o
> > +
> 
> Blank line at end of file.

Good catch, will update it in next version.

> 
> > diff --git a/drivers/vfio/pci/intel/qat/main.c
> b/drivers/vfio/pci/intel/qat/main.c
> > new file mode 100644
> > index 000000000000..85d0ed701397
> > --- /dev/null
> > +++ b/drivers/vfio/pci/intel/qat/main.c
> > @@ -0,0 +1,572 @@
> ...
> > +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
> > +{
> > +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> > +			struct qat_vf_core_device, core_device.vdev);
> > +	struct qat_migdev_ops *ops;
> > +	struct qat_mig_dev *mdev;
> > +	struct pci_dev *parent;
> > +	int ret, vf_id;
> > +
> > +	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
> VFIO_MIGRATION_P2P;
> 
> AFAICT it's always good practice to support VFIO_MIGRATION_PRE_COPY,
> even if only to leave yourself an option to mark an incompatible ABI
> change in the data stream that can be checked before the stop-copy
> phase of migration.  See for example the mtty sample driver.  Thanks,
> Alex

If the purpose of that good practice is to check migration stream
compatibility, then we do incorporate additional information as part of the
device state for ABI compatibility testing, such as a magic value, version
and device capability, in the stop-copy phase. Can we ignore the optional
VFIO_MIGRATION_PRE_COPY support?
Thanks,
Xin
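For illustration, the kind of header described here, placed at the start of
the stop-copy stream ahead of the opaque device state, might look like this
(field names are hypothetical, not taken from this series). Alex's point is
that with PRE_COPY the same header could be transferred and validated
before the source device is ever stopped:

	struct qat_vf_mig_hdr {
		__le32 magic;		/* identifies a QAT VF migration stream */
		__le16 major_ver;	/* bumped on incompatible layout changes */
		__le16 minor_ver;	/* bumped on compatible additions */
		__le32 caps;		/* capabilities the destination must support */
		__le32 state_size;	/* size of the device state that follows */
	};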
Yishai Hadas Feb. 11, 2024, 8:17 a.m. UTC | #7
On 09/02/2024 14:10, Jason Gunthorpe wrote:
> On Fri, Feb 09, 2024 at 08:23:32AM +0000, Zeng, Xin wrote:
>> Thanks for your comments, Jason.
>> On Tuesday, February 6, 2024 8:55 PM, Jason Gunthorpe <jgg@nvidia.com> wrote:
>>>> +
>>>> +	ops = mdev->ops;
>>>> +	if (!ops || !ops->init || !ops->cleanup ||
>>>> +	    !ops->open || !ops->close ||
>>>> +	    !ops->save_state || !ops->load_state ||
>>>> +	    !ops->suspend || !ops->resume) {
>>>> +		ret = -EIO;
>>>> +		dev_err(&parent->dev, "Incomplete device migration ops
>>> structure!");
>>>> +		goto err_destroy;
>>>> +	}
>>>
>>> Why are there ops pointers here? I would expect this should just be
>>> direct function calls to the PF QAT driver.
>>
>> I indeed had a version where the direct function calls are Implemented in
>> QAT driver, while when I look at the functions, most of them
>> only translate the interface to the ops pointer. That's why I put
>> ops pointers directly into vfio variant driver.
> 
> But why is there an ops indirection at all? Are there more than one
> ops?
> 
>>>> +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
>>>> +{
>>>> +	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
>>>> +
>>>> +	if (!qat_vdev->core_device.vdev.mig_ops)
>>>> +		return;
>>>> +
>>>> +	/*
>>>> +	 * As the higher VFIO layers are holding locks across reset and using
>>>> +	 * those same locks with the mm_lock we need to prevent ABBA
>>> deadlock
>>>> +	 * with the state_mutex and mm_lock.
>>>> +	 * In case the state_mutex was taken already we defer the cleanup work
>>>> +	 * to the unlock flow of the other running context.
>>>> +	 */
>>>> +	spin_lock(&qat_vdev->reset_lock);
>>>> +	qat_vdev->deferred_reset = true;
>>>> +	if (!mutex_trylock(&qat_vdev->state_mutex)) {
>>>> +		spin_unlock(&qat_vdev->reset_lock);
>>>> +		return;
>>>> +	}
>>>> +	spin_unlock(&qat_vdev->reset_lock);
>>>> +	qat_vf_state_mutex_unlock(qat_vdev);
>>>> +}
>>>
>>> Do you really need this? I thought this ugly thing was going to be a
>>> uniquely mlx5 thing..
>>
>> I think that's still required to make the migration state synchronized
>> if the VF is reset by other VFIO emulation paths. Is it the case?
>> BTW, this implementation is not only in mlx5 driver, but also in other
>> Vfio pci variant drivers such as hisilicon acc driver and pds
>> driver.
> 
> It had to specifically do with the mm lock interaction that, I
> thought, was going to be unique to the mlx driver. Otherwise you could
> just directly hold the state_mutex here.
> 
> Yishai do you remember the exact trace for the mmlock entanglement?
> 

I found in my in-box (from more than 2.5 years ago) the below [1] 
lockdep WARNING when the state_mutex was used directly.

Once we moved to the 'deferred_reset' mode it solved that.

[1]
[  +1.063822] ======================================================
[  +0.000732] WARNING: possible circular locking dependency detected
[  +0.000747] 5.15.0-rc3 #236 Not tainted
[  +0.000556] ------------------------------------------------------
[  +0.000714] qemu-system-x86/7731 is trying to acquire lock:
[  +0.000659] ffff888126c64b78 (&vdev->vma_lock){+.+.}-{3:3}, at: 
vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
[  +0.001127]
               but task is already holding lock:
[  +0.000803] ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at: 
vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1]
[  +0.001119]
               which lock already depends on the new lock.

[  +0.001086]
               the existing dependency chain (in reverse order) is:
[  +0.000910]
               -> #3 (&mm->mmap_lock#2){++++}-{3:3}:
[  +0.000844]        __might_fault+0x56/0x80
[  +0.000572]        _copy_to_user+0x1e/0x80
[  +0.000556]        mlx5vf_pci_mig_rw.cold+0xa1/0x24f [mlx5_vfio_pci]
[  +0.000732]        vfs_read+0xa8/0x1c0
[  +0.000547]        __x64_sys_pread64+0x8c/0xc0
[  +0.000580]        do_syscall_64+0x3d/0x90
[  +0.000566]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  +0.000682]
               -> #2 (&mvdev->state_mutex){+.+.}-{3:3}:
[  +0.000899]        __mutex_lock+0x80/0x9d0
[  +0.000566]        mlx5vf_reset_done+0x2c/0x40 [mlx5_vfio_pci]
[  +0.000697]        vfio_pci_core_ioctl+0x585/0x1020 [vfio_pci_core]
[  +0.000721]        __x64_sys_ioctl+0x436/0x9a0
[  +0.000588]        do_syscall_64+0x3d/0x90
[  +0.000584]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  +0.000674]
               -> #1 (&vdev->memory_lock){+.+.}-{3:3}:
[  +0.000843]        down_write+0x38/0x70
[  +0.000544]        vfio_pci_zap_and_down_write_memory_lock+0x1c/0x30 
[vfio_pci_core]
[  +0.003705]        vfio_basic_config_write+0x1e7/0x280 [vfio_pci_core]
[  +0.000744]        vfio_pci_config_rw+0x1b7/0x3af [vfio_pci_core]
[  +0.000716]        vfs_write+0xe6/0x390
[  +0.000539]        __x64_sys_pwrite64+0x8c/0xc0
[  +0.000603]        do_syscall_64+0x3d/0x90
[  +0.000572]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  +0.000661]
               -> #0 (&vdev->vma_lock){+.+.}-{3:3}:
[  +0.000828]        __lock_acquire+0x1244/0x2280
[  +0.000596]        lock_acquire+0xc2/0x2c0
[  +0.000556]        __mutex_lock+0x80/0x9d0
[  +0.000580]        vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
[  +0.000709]        __do_fault+0x32/0xa0
[  +0.000556]        __handle_mm_fault+0xbe8/0x1450
[  +0.000606]        handle_mm_fault+0x6c/0x140
[  +0.000624]        fixup_user_fault+0x6b/0x100
[  +0.000600]        vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1]
[  +0.000726]        vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1]
[  +0.000736]        vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1]
[  +0.000752]        __x64_sys_ioctl+0x436/0x9a0
[  +0.000588]        do_syscall_64+0x3d/0x90
[  +0.000571]        entry_SYSCALL_64_after_hwframe+0x44/0xae
[  +0.000677]
               other info that might help us debug this:

[  +0.001073] Chain exists of:
                 &vdev->vma_lock --> &mvdev->state_mutex --> 
&mm->mmap_lock#2

[  +0.001285]  Possible unsafe locking scenario:

[  +0.000808]        CPU0                    CPU1
[  +0.000599]        ----                    ----
[  +0.000593]   lock(&mm->mmap_lock#2);
[  +0.000530]                                lock(&mvdev->state_mutex);
[  +0.000725]                                lock(&mm->mmap_lock#2);
[  +0.000712]   lock(&vdev->vma_lock);
[  +0.000532]
                *** DEADLOCK ***

[  +0.000922] 2 locks held by qemu-system-x86/7731:
[  +0.000597]  #0: ffff88810c9bec88 (&iommu->lock#2){+.+.}-{3:3}, at: 
vfio_iommu_type1_ioctl+0x189/0x15a0 [vfio_iommu_type1]
[  +0.001177]  #1: ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at: 
vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1]
[  +0.001153]
               stack backtrace:
[  +0.000689] CPU: 1 PID: 7731 Comm: qemu-system-x86 Not tainted 
5.15.0-rc3 #236
[  +0.000932] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[  +0.001192] Call Trace:
[  +0.000454]  dump_stack_lvl+0x45/0x59
[  +0.000527]  check_noncircular+0xf2/0x110
[  +0.000557]  __lock_acquire+0x1244/0x2280
[  +0.002462]  lock_acquire+0xc2/0x2c0
[  +0.000534]  ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
[  +0.000684]  ? lock_is_held_type+0x98/0x110
[  +0.000565]  __mutex_lock+0x80/0x9d0
[  +0.000542]  ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
[  +0.000676]  ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
[  +0.000681]  ? mark_held_locks+0x49/0x70
[  +0.000551]  ? lock_is_held_type+0x98/0x110
[  +0.000575]  vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
[  +0.000679]  __do_fault+0x32/0xa0
[  +0.000700]  __handle_mm_fault+0xbe8/0x1450
[  +0.000571]  handle_mm_fault+0x6c/0x140
[  +0.000564]  fixup_user_fault+0x6b/0x100
[  +0.000556]  vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1]
[  +0.000666]  ? lock_is_held_type+0x98/0x110
[  +0.000572]  vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1]
[  +0.000710]  vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1]
[  +0.001050]  ? find_held_lock+0x2b/0x80
[  +0.000561]  ? lock_release+0xc2/0x2a0
[  +0.000537]  ? __fget_files+0xdc/0x1d0
[  +0.000541]  __x64_sys_ioctl+0x436/0x9a0
[  +0.000556]  ? lockdep_hardirqs_on_prepare+0xd4/0x170
[  +0.000641]  do_syscall_64+0x3d/0x90
[  +0.000529]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  +0.000670] RIP: 0033:0x7f54255bb45b
[  +0.000541] Code: 0f 1e fa 48 8b 05 2d aa 2c 00 64 c7 00 26 00 00 00 
48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fd a9 2c 00 f7 d8 64 89 01 48
[  +0.001843] RSP: 002b:00007f5415ca3d38 EFLAGS: 00000206 ORIG_RAX: 
0000000000000010
[  +0.000929] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 
00007f54255bb45b
[  +0.000781] RDX: 00007f5415ca3d70 RSI: 0000000000003b71 RDI: 
0000000000000012
[  +0.000775] RBP: 00007f5415ca3da0 R08: 0000000000000000 R09: 
0000000000000000
[  +0.000777] R10: 00000000fe002000 R11: 0000000000000206 R12: 
0000000000000004
[  +0.000775] R13: 0000000000000000 R14: 00000000fe000000 R15: 
00007f5415ca47c0

Yishai
Xin Zeng Feb. 13, 2024, 1:08 p.m. UTC | #8
Thanks for the comments, Shameer.

> -----Original Message-----
> From: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Sent: Thursday, February 8, 2024 8:17 PM
> To: Zeng, Xin <xin.zeng@intel.com>; herbert@gondor.apana.org.au;
> alex.williamson@redhat.com; jgg@nvidia.com; yishaih@nvidia.com; Tian, Kevin
> <kevin.tian@intel.com>
> Cc: linux-crypto@vger.kernel.org; kvm@vger.kernel.org; qat-linux <qat-
> linux@intel.com>; Cao, Yahui <yahui.cao@intel.com>
> Subject: RE: [PATCH 10/10] vfio/qat: Add vfio_pci driver for Intel QAT VF devices
> 
> 
> > +static struct qat_vf_migration_file *
> > +qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev)
> > +{
> > +	struct qat_vf_migration_file *migf;
> > +	int ret;
> > +
> > +	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
> > +	if (!migf)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops,
> > migf, O_RDONLY);
> > +	ret = PTR_ERR_OR_ZERO(migf->filp);
> > +	if (ret) {
> > +		kfree(migf);
> > +		return ERR_PTR(ret);
> > +	}
> > +
> > +	stream_open(migf->filp->f_inode, migf->filp);
> > +	mutex_init(&migf->lock);
> > +
> > +	ret = qat_vdev->mdev->ops->save_state(qat_vdev->mdev);
> > +	if (ret) {
> > +		fput(migf->filp);
> > +		kfree(migf);
> 
> Probably don't need that kfree(migf) here as fput() -->  qat_vf_release_file () will
> do that.

Thanks, it's redundant, will update it in next version.

> 
> > +static int qat_vf_pci_get_data_size(struct vfio_device *vdev,
> > +				    unsigned long *stop_copy_length)
> > +{
> > +	struct qat_vf_core_device *qat_vdev = container_of(vdev,
> > +			struct qat_vf_core_device, core_device.vdev);
> > +
> > +	*stop_copy_length = qat_vdev->mdev->state_size;
> 
> Do we need a lock here or this is not changing?

Yes, will update it in next version.

> 
> > +	return 0;
> > +}
> > +
> > +static const struct vfio_migration_ops qat_vf_pci_mig_ops = {
> > +	.migration_set_state = qat_vf_pci_set_device_state,
> > +	.migration_get_state = qat_vf_pci_get_device_state,
> > +	.migration_get_data_size = qat_vf_pci_get_data_size,
> > +};
> > +
> > +static void qat_vf_pci_release_dev(struct vfio_device *core_vdev)
> > +{
> > +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> > +			struct qat_vf_core_device, core_device.vdev);
> > +
> > +	qat_vdev->mdev->ops->cleanup(qat_vdev->mdev);
> > +	qat_vfmig_destroy(qat_vdev->mdev);
> > +	mutex_destroy(&qat_vdev->state_mutex);
> > +	vfio_pci_core_release_dev(core_vdev);
> > +}
> > +
> > +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
> > +{
> > +	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
> > +			struct qat_vf_core_device, core_device.vdev);
> > +	struct qat_migdev_ops *ops;
> > +	struct qat_mig_dev *mdev;
> > +	struct pci_dev *parent;
> > +	int ret, vf_id;
> > +
> > +	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
> > VFIO_MIGRATION_P2P;
> > +	core_vdev->mig_ops = &qat_vf_pci_mig_ops;
> > +
> > +	ret = vfio_pci_core_init_dev(core_vdev);
> > +	if (ret)
> > +		return ret;
> > +
> > +	mutex_init(&qat_vdev->state_mutex);
> > +	spin_lock_init(&qat_vdev->reset_lock);
> > +
> > +	parent = qat_vdev->core_device.pdev->physfn;
> 
> Can we use pci_physfn() here?

Sure, will update it in next version.

> 
> > +	vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
> > +	if (!parent || vf_id < 0) {
> 
> Also if the pci_iov_vf_id() return success I don't think you need to
> check for parent and can use directly below.

OK, will update it in next version.

> 
> > +		ret = -ENODEV;
> > +		goto err_rel;
> > +	}
> > +
> > +	mdev = qat_vfmig_create(parent, vf_id);
> > +	if (IS_ERR(mdev)) {
> > +		ret = PTR_ERR(mdev);
> > +		goto err_rel;
> > +	}
> > +
> > +	ops = mdev->ops;
> > +	if (!ops || !ops->init || !ops->cleanup ||
> > +	    !ops->open || !ops->close ||
> > +	    !ops->save_state || !ops->load_state ||
> > +	    !ops->suspend || !ops->resume) {
> > +		ret = -EIO;
> > +		dev_err(&parent->dev, "Incomplete device migration ops
> > structure!");
> > +		goto err_destroy;
> > +	}
> 
> If all these ops are a must why cant we move the check inside the
> qat_vfmig_create()?
> Or rather call them explicitly as suggested by Jason.

We can do it, but it might make sense to leave the check to the API's users,
as some of these ops interfaces might be optional for other QAT variant drivers
in the future.

Thanks,
Xin
Jason Gunthorpe Feb. 13, 2024, 2:55 p.m. UTC | #9
On Tue, Feb 13, 2024 at 01:08:47PM +0000, Zeng, Xin wrote:
> > > +		ret = -ENODEV;
> > > +		goto err_rel;
> > > +	}
> > > +
> > > +	mdev = qat_vfmig_create(parent, vf_id);
> > > +	if (IS_ERR(mdev)) {
> > > +		ret = PTR_ERR(mdev);
> > > +		goto err_rel;
> > > +	}
> > > +
> > > +	ops = mdev->ops;
> > > +	if (!ops || !ops->init || !ops->cleanup ||
> > > +	    !ops->open || !ops->close ||
> > > +	    !ops->save_state || !ops->load_state ||
> > > +	    !ops->suspend || !ops->resume) {
> > > +		ret = -EIO;
> > > +		dev_err(&parent->dev, "Incomplete device migration ops
> > > structure!");
> > > +		goto err_destroy;
> > > +	}
> > 
> > If all these ops are a must why cant we move the check inside the
> > qat_vfmig_create()?
> > Or rather call them explicitly as suggested by Jason.
> 
> We can do it, but it might make sense to leave the check to the APIs' user
> as some of these ops interfaces might be optional for other QAT variant driver
> in future.

If you have patches already that need ops then you can leave them, but
otherwise they should be removed and added back if you come with
future patches that require another implementation.

Jason
Xin Zeng Feb. 17, 2024, 4:20 p.m. UTC | #10
On Sunday, February 11, 2024 4:17 PM, Yishai Hadas <yishaih@nvidia.com> wrote:
> >>>> +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
> >>>> +{
> >>>> +	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
> >>>> +
> >>>> +	if (!qat_vdev->core_device.vdev.mig_ops)
> >>>> +		return;
> >>>> +
> >>>> +	/*
> >>>> +	 * As the higher VFIO layers are holding locks across reset and using
> >>>> +	 * those same locks with the mm_lock we need to prevent ABBA
> >>> deadlock
> >>>> +	 * with the state_mutex and mm_lock.
> >>>> +	 * In case the state_mutex was taken already we defer the cleanup work
> >>>> +	 * to the unlock flow of the other running context.
> >>>> +	 */
> >>>> +	spin_lock(&qat_vdev->reset_lock);
> >>>> +	qat_vdev->deferred_reset = true;
> >>>> +	if (!mutex_trylock(&qat_vdev->state_mutex)) {
> >>>> +		spin_unlock(&qat_vdev->reset_lock);
> >>>> +		return;
> >>>> +	}
> >>>> +	spin_unlock(&qat_vdev->reset_lock);
> >>>> +	qat_vf_state_mutex_unlock(qat_vdev);
> >>>> +}
> >>>
> >>> Do you really need this? I thought this ugly thing was going to be a
> >>> uniquely mlx5 thing..
> >>
> >> I think that's still required to make the migration state synchronized
> >> if the VF is reset by other VFIO emulation paths. Is it the case?
> >> BTW, this implementation is not only in mlx5 driver, but also in other
> >> Vfio pci variant drivers such as hisilicon acc driver and pds
> >> driver.
> >
> > It had to specifically do with the mm lock interaction that, I
> > thought, was going to be unique to the mlx driver. Otherwise you could
> > just directly hold the state_mutex here.
> >
> > Yishai do you remember the exact trace for the mmlock entanglement?
> >
> 
> I found in my in-box (from more than 2.5 years ago) the below [1]
> lockdep WARNING when the state_mutex was used directly.
> 
> Once we moved to the 'deferred_reset' mode it solved that.
> 
> [1]
> [  +1.063822] ======================================================
> [  +0.000732] WARNING: possible circular locking dependency detected
> [  +0.000747] 5.15.0-rc3 #236 Not tainted
> [  +0.000556] ------------------------------------------------------
> [  +0.000714] qemu-system-x86/7731 is trying to acquire lock:
> [  +0.000659] ffff888126c64b78 (&vdev->vma_lock){+.+.}-{3:3}, at:
> vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
> [  +0.001127]
>                but task is already holding lock:
> [  +0.000803] ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at:
> vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1]
> [  +0.001119]
>                which lock already depends on the new lock.
> 
> [  +0.001086]
>                the existing dependency chain (in reverse order) is:
> [  +0.000910]
>                -> #3 (&mm->mmap_lock#2){++++}-{3:3}:
> [  +0.000844]        __might_fault+0x56/0x80
> [  +0.000572]        _copy_to_user+0x1e/0x80
> [  +0.000556]        mlx5vf_pci_mig_rw.cold+0xa1/0x24f [mlx5_vfio_pci]
> [  +0.000732]        vfs_read+0xa8/0x1c0
> [  +0.000547]        __x64_sys_pread64+0x8c/0xc0
> [  +0.000580]        do_syscall_64+0x3d/0x90
> [  +0.000566]        entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  +0.000682]
>                -> #2 (&mvdev->state_mutex){+.+.}-{3:3}:
> [  +0.000899]        __mutex_lock+0x80/0x9d0
> [  +0.000566]        mlx5vf_reset_done+0x2c/0x40 [mlx5_vfio_pci]
> [  +0.000697]        vfio_pci_core_ioctl+0x585/0x1020 [vfio_pci_core]
> [  +0.000721]        __x64_sys_ioctl+0x436/0x9a0
> [  +0.000588]        do_syscall_64+0x3d/0x90
> [  +0.000584]        entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  +0.000674]
>                -> #1 (&vdev->memory_lock){+.+.}-{3:3}:
> [  +0.000843]        down_write+0x38/0x70
> [  +0.000544]        vfio_pci_zap_and_down_write_memory_lock+0x1c/0x30
> [vfio_pci_core]
> [  +0.003705]        vfio_basic_config_write+0x1e7/0x280 [vfio_pci_core]
> [  +0.000744]        vfio_pci_config_rw+0x1b7/0x3af [vfio_pci_core]
> [  +0.000716]        vfs_write+0xe6/0x390
> [  +0.000539]        __x64_sys_pwrite64+0x8c/0xc0
> [  +0.000603]        do_syscall_64+0x3d/0x90
> [  +0.000572]        entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  +0.000661]
>                -> #0 (&vdev->vma_lock){+.+.}-{3:3}:
> [  +0.000828]        __lock_acquire+0x1244/0x2280
> [  +0.000596]        lock_acquire+0xc2/0x2c0
> [  +0.000556]        __mutex_lock+0x80/0x9d0
> [  +0.000580]        vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
> [  +0.000709]        __do_fault+0x32/0xa0
> [  +0.000556]        __handle_mm_fault+0xbe8/0x1450
> [  +0.000606]        handle_mm_fault+0x6c/0x140
> [  +0.000624]        fixup_user_fault+0x6b/0x100
> [  +0.000600]        vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1]
> [  +0.000726]        vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1]
> [  +0.000736]        vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1]
> [  +0.000752]        __x64_sys_ioctl+0x436/0x9a0
> [  +0.000588]        do_syscall_64+0x3d/0x90
> [  +0.000571]        entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  +0.000677]
>                other info that might help us debug this:
> 
> [  +0.001073] Chain exists of:
>                  &vdev->vma_lock --> &mvdev->state_mutex -->
> &mm->mmap_lock#2
> 
> [  +0.001285]  Possible unsafe locking scenario:
> 
> [  +0.000808]        CPU0                    CPU1
> [  +0.000599]        ----                    ----
> [  +0.000593]   lock(&mm->mmap_lock#2);
> [  +0.000530]                                lock(&mvdev->state_mutex);
> [  +0.000725]                                lock(&mm->mmap_lock#2);
> [  +0.000712]   lock(&vdev->vma_lock);
> [  +0.000532]
>                 *** DEADLOCK ***

Thanks for this information, but it is not clear to me why this flow
causes a deadlock. In this flow, CPU0 is not waiting for any resource
held by CPU1, so once CPU0 releases mmap_lock, CPU1 can continue
to run. Am I missing something?
Meanwhile, it seems the trace above was triggered with migration
protocol v1, and the CPU1 context listed also looks protocol-v1
specific. Is that the case?
Thanks,
Xin

> 
> [  +0.000922] 2 locks held by qemu-system-x86/7731:
> [  +0.000597]  #0: ffff88810c9bec88 (&iommu->lock#2){+.+.}-{3:3}, at:
> vfio_iommu_type1_ioctl+0x189/0x15a0 [vfio_iommu_type1]
> [  +0.001177]  #1: ffff888105f4c5d8 (&mm->mmap_lock#2){++++}-{3:3}, at:
> vaddr_get_pfns+0x64/0x240 [vfio_iommu_type1]
> [  +0.001153]
>                stack backtrace:
> [  +0.000689] CPU: 1 PID: 7731 Comm: qemu-system-x86 Not tainted
> 5.15.0-rc3 #236
> [  +0.000932] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
> rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
> [  +0.001192] Call Trace:
> [  +0.000454]  dump_stack_lvl+0x45/0x59
> [  +0.000527]  check_noncircular+0xf2/0x110
> [  +0.000557]  __lock_acquire+0x1244/0x2280
> [  +0.002462]  lock_acquire+0xc2/0x2c0
> [  +0.000534]  ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
> [  +0.000684]  ? lock_is_held_type+0x98/0x110
> [  +0.000565]  __mutex_lock+0x80/0x9d0
> [  +0.000542]  ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
> [  +0.000676]  ? vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
> [  +0.000681]  ? mark_held_locks+0x49/0x70
> [  +0.000551]  ? lock_is_held_type+0x98/0x110
> [  +0.000575]  vfio_pci_mmap_fault+0x35/0x140 [vfio_pci_core]
> [  +0.000679]  __do_fault+0x32/0xa0
> [  +0.000700]  __handle_mm_fault+0xbe8/0x1450
> [  +0.000571]  handle_mm_fault+0x6c/0x140
> [  +0.000564]  fixup_user_fault+0x6b/0x100
> [  +0.000556]  vaddr_get_pfns+0x108/0x240 [vfio_iommu_type1]
> [  +0.000666]  ? lock_is_held_type+0x98/0x110
> [  +0.000572]  vfio_pin_pages_remote+0x326/0x460 [vfio_iommu_type1]
> [  +0.000710]  vfio_iommu_type1_ioctl+0x43b/0x15a0 [vfio_iommu_type1]
> [  +0.001050]  ? find_held_lock+0x2b/0x80
> [  +0.000561]  ? lock_release+0xc2/0x2a0
> [  +0.000537]  ? __fget_files+0xdc/0x1d0
> [  +0.000541]  __x64_sys_ioctl+0x436/0x9a0
> [  +0.000556]  ? lockdep_hardirqs_on_prepare+0xd4/0x170
> [  +0.000641]  do_syscall_64+0x3d/0x90
> [  +0.000529]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  +0.000670] RIP: 0033:0x7f54255bb45b
> [  +0.000541] Code: 0f 1e fa 48 8b 05 2d aa 2c 00 64 c7 00 26 00 00 00
> 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f
> 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d fd a9 2c 00 f7 d8 64 89 01 48
> [  +0.001843] RSP: 002b:00007f5415ca3d38 EFLAGS: 00000206 ORIG_RAX:
> 0000000000000010
> [  +0.000929] RAX: ffffffffffffffda RBX: 0000000000000000 RCX:
> 00007f54255bb45b
> [  +0.000781] RDX: 00007f5415ca3d70 RSI: 0000000000003b71 RDI:
> 0000000000000012
> [  +0.000775] RBP: 00007f5415ca3da0 R08: 0000000000000000 R09:
> 0000000000000000
> [  +0.000777] R10: 00000000fe002000 R11: 0000000000000206 R12:
> 0000000000000004
> [  +0.000775] R13: 0000000000000000 R14: 00000000fe000000 R15:
> 00007f5415ca47c0
> 
> Yishai
Jason Gunthorpe Feb. 20, 2024, 1:24 p.m. UTC | #11
On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote:

> Thanks for this information, but this flow is not clear to me why it 
> cause deadlock. From this flow, CPU0 is not waiting for any resource
> held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue
> to run. Am I missing something?

At some point it was calling copy_to_user() under the state
mutex. These days it doesn't.

copy_to_user() would nest the mm_lock under the state mutex which is a
locking inversion.

So I wonder if we still have this problem now that the copy_to_user()
is not under the mutex?

Jason
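
For reference, a minimal sketch of the ordering issue being discussed, using illustrative names only (example_dev and the two example_precopy_* helpers are not real driver symbols): copy_to_user() may fault and take mmap_lock, so calling it under state_mutex nests mmap_lock inside state_mutex and closes the cycle shown in the lockdep chain above (vma_lock --> state_mutex --> mmap_lock, with mmap_lock already held in the fault path that wants vma_lock). Snapshotting under the lock and copying to user space afterwards, as the v2 code quoted below does, avoids the nesting.

#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

/* Sketch only, not actual mlx5/qat code. */
struct example_dev {
	struct mutex state_mutex;
	unsigned long pending_bytes;
};

/* Problematic: copy_to_user() can fault, nesting mmap_lock inside state_mutex */
static long example_precopy_bad(struct example_dev *dev,
				struct vfio_precopy_info __user *arg)
{
	struct vfio_precopy_info info = {};
	long ret = 0;

	mutex_lock(&dev->state_mutex);
	info.dirty_bytes = dev->pending_bytes;
	if (copy_to_user(arg, &info, sizeof(info)))	/* may take mmap_lock */
		ret = -EFAULT;
	mutex_unlock(&dev->state_mutex);
	return ret;
}

/* Safer: snapshot under the lock, copy to user space with no state_mutex held */
static long example_precopy_good(struct example_dev *dev,
				 struct vfio_precopy_info __user *arg)
{
	struct vfio_precopy_info info = {};

	mutex_lock(&dev->state_mutex);
	info.dirty_bytes = dev->pending_bytes;
	mutex_unlock(&dev->state_mutex);

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;
	return 0;
}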
Xin Zeng Feb. 20, 2024, 3:53 p.m. UTC | #12
On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote:
> 
> On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote:
> 
> > Thanks for this information, but this flow is not clear to me why it
> > cause deadlock. From this flow, CPU0 is not waiting for any resource
> > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue
> > to run. Am I missing something?
> 
> At some point it was calling copy_to_user() under the state
> mutex. These days it doesn't.
> 
> copy_to_user() would nest the mm_lock under the state mutex which is a
> locking inversion.
> 
> So I wonder if we still have this problem now that the copy_to_user()
> is not under the mutex?

In protocol v2, we still have the scenario in precopy_ioctl where copy_to_user is
called under state_mutex.

Thanks,
Xin
Jason Gunthorpe Feb. 20, 2024, 5:03 p.m. UTC | #13
On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote:
> On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote:
> > 
> > On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote:
> > 
> > > Thanks for this information, but this flow is not clear to me why it
> > > cause deadlock. From this flow, CPU0 is not waiting for any resource
> > > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue
> > > to run. Am I missing something?
> > 
> > At some point it was calling copy_to_user() under the state
> > mutex. These days it doesn't.
> > 
> > copy_to_user() would nest the mm_lock under the state mutex which is a
> > locking inversion.
> > 
> > So I wonder if we still have this problem now that the copy_to_user()
> > is not under the mutex?
> 
> In protocol v2, we still have the scenario in precopy_ioctl where copy_to_user is
> called under state_mutex.

Why? Does mlx5 do that? It looked Ok to me:

        mlx5vf_state_mutex_unlock(mvdev);
        if (copy_to_user((void __user *)arg, &info, minsz))
                return -EFAULT;

Jason
Xin Zeng Feb. 21, 2024, 8:44 a.m. UTC | #14
On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote:
> On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote:
> > On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote:
> > >
> > > On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote:
> > >
> > > > Thanks for this information, but this flow is not clear to me why it
> > > > cause deadlock. From this flow, CPU0 is not waiting for any resource
> > > > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue
> > > > to run. Am I missing something?
> > >
> > > At some point it was calling copy_to_user() under the state
> > > mutex. These days it doesn't.
> > >
> > > copy_to_user() would nest the mm_lock under the state mutex which is
> a
> > > locking inversion.
> > >
> > > So I wonder if we still have this problem now that the copy_to_user()
> > > is not under the mutex?
> >
> > In protocol v2, we still have the scenario in precopy_ioctl where
> copy_to_user is
> > called under state_mutex.
> 
> Why? Does mlx5 do that? It looked Ok to me:
> 
>         mlx5vf_state_mutex_unlock(mvdev);
>         if (copy_to_user((void __user *)arg, &info, minsz))
>                 return -EFAULT;

Indeed, thanks, Jason. BTW, is there any reason why the "deferred_reset" mode
is still implemented in the mlx5 driver, given that this deadlock condition has
been avoided with the migration protocol v2 implementation?
Anyway, I'll use state_mutex directly instead of the "deferred_reset" mode in the
qat variant driver and update it in the next version soon; please help to review.
Thanks,
Xin
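
A rough sketch of what that simplification could look like in the reset_done handler, assuming no user copies ever run under state_mutex; this is not the posted follow-up code, just an illustration reusing the symbols from the patch below (qat_vf_core_device, qat_vf_drvdata(), qat_vf_disable_fds()):

/*
 * Illustration only, not the posted follow-up: if nothing can fault on
 * user memory while state_mutex is held, reset_done can take the mutex
 * directly and the deferred_reset/reset_lock machinery goes away.
 */
static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);

	if (!qat_vdev->core_device.vdev.mig_ops)
		return;

	mutex_lock(&qat_vdev->state_mutex);
	qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	qat_vf_disable_fds(qat_vdev);
	mutex_unlock(&qat_vdev->state_mutex);
}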
Jason Gunthorpe Feb. 21, 2024, 1:18 p.m. UTC | #15
On Wed, Feb 21, 2024 at 08:44:31AM +0000, Zeng, Xin wrote:
> On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote:
> > On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote:
> > > On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote:
> > > >
> > > > On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote:
> > > >
> > > > > Thanks for this information, but this flow is not clear to me why it
> > > > > cause deadlock. From this flow, CPU0 is not waiting for any resource
> > > > > held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue
> > > > > to run. Am I missing something?
> > > >
> > > > At some point it was calling copy_to_user() under the state
> > > > mutex. These days it doesn't.
> > > >
> > > > copy_to_user() would nest the mm_lock under the state mutex which is
> > a
> > > > locking inversion.
> > > >
> > > > So I wonder if we still have this problem now that the copy_to_user()
> > > > is not under the mutex?
> > >
> > > In protocol v2, we still have the scenario in precopy_ioctl where
> > copy_to_user is
> > > called under state_mutex.
> > 
> > Why? Does mlx5 do that? It looked Ok to me:
> > 
> >         mlx5vf_state_mutex_unlock(mvdev);
> >         if (copy_to_user((void __user *)arg, &info, minsz))
> >                 return -EFAULT;
> 
> Indeed, thanks, Jason. BTW, is there any reason why was "deferred_reset" mode
> still implemented in mlx5 driver given this deadlock condition has been avoided
> with migration protocol v2 implementation.

I do not remember. Yishai? 

Jason
Yishai Hadas Feb. 21, 2024, 3:11 p.m. UTC | #16
On 21/02/2024 15:18, Jason Gunthorpe wrote:
> On Wed, Feb 21, 2024 at 08:44:31AM +0000, Zeng, Xin wrote:
>> On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote:
>>> On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote:
>>>> On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote:
>>>>>
>>>>> On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote:
>>>>>
>>>>>> Thanks for this information, but this flow is not clear to me why it
>>>>>> cause deadlock. From this flow, CPU0 is not waiting for any resource
>>>>>> held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue
>>>>>> to run. Am I missing something?
>>>>>
>>>>> At some point it was calling copy_to_user() under the state
>>>>> mutex. These days it doesn't.
>>>>>
>>>>> copy_to_user() would nest the mm_lock under the state mutex which is
>>> a
>>>>> locking inversion.
>>>>>
>>>>> So I wonder if we still have this problem now that the copy_to_user()
>>>>> is not under the mutex?
>>>>
>>>> In protocol v2, we still have the scenario in precopy_ioctl where
>>> copy_to_user is
>>>> called under state_mutex.
>>>
>>> Why? Does mlx5 do that? It looked Ok to me:
>>>
>>>          mlx5vf_state_mutex_unlock(mvdev);
>>>          if (copy_to_user((void __user *)arg, &info, minsz))
>>>                  return -EFAULT;
>>
>> Indeed, thanks, Jason. BTW, is there any reason why was "deferred_reset" mode
>> still implemented in mlx5 driver given this deadlock condition has been avoided
>> with migration protocol v2 implementation.
> 
> I do not remember. Yishai?
> 

A long time has passed... I also don't fully remember whether this was the 
only potential problem here; maybe yes.

My plan is to prepare a cleanup patch for mlx5 and put it into our 
regression for a while; if all goes well, I may send it for the next 
kernel cycle.

By the way, there are other drivers around (e.g. hisi and mtty) that 
still run copy_to_user() under the state mutex and might hit the problem 
without the 'deferred_reset'; see [1] and [2].

If we reach the conclusion that the only reason for that mechanism 
was the copy_to_user() under the state_mutex, those drivers can change 
their code easily and also send a patch to clean up the 'deferred_reset'.

[1] 
https://elixir.bootlin.com/linux/v6.8-rc5/source/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c#L808
[2] 
https://elixir.bootlin.com/linux/v6.8-rc5/source/samples/vfio-mdev/mtty.c#L878

Yishai
Yishai Hadas Feb. 22, 2024, 9:39 a.m. UTC | #17
On 21/02/2024 17:11, Yishai Hadas wrote:
> On 21/02/2024 15:18, Jason Gunthorpe wrote:
>> On Wed, Feb 21, 2024 at 08:44:31AM +0000, Zeng, Xin wrote:
>>> On Wednesday, February 21, 2024 1:03 AM, Jason Gunthorpe wrote:
>>>> On Tue, Feb 20, 2024 at 03:53:08PM +0000, Zeng, Xin wrote:
>>>>> On Tuesday, February 20, 2024 9:25 PM, Jason Gunthorpe wrote:
>>>>>>
>>>>>> On Sat, Feb 17, 2024 at 04:20:20PM +0000, Zeng, Xin wrote:
>>>>>>
>>>>>>> Thanks for this information, but this flow is not clear to me why it
>>>>>>> cause deadlock. From this flow, CPU0 is not waiting for any resource
>>>>>>> held by CPU1, so after CPU0 releases mmap_lock, CPU1 can continue
>>>>>>> to run. Am I missing something?
>>>>>>
>>>>>> At some point it was calling copy_to_user() under the state
>>>>>> mutex. These days it doesn't.
>>>>>>
>>>>>> copy_to_user() would nest the mm_lock under the state mutex which is
>>>> a
>>>>>> locking inversion.
>>>>>>
>>>>>> So I wonder if we still have this problem now that the copy_to_user()
>>>>>> is not under the mutex?
>>>>>
>>>>> In protocol v2, we still have the scenario in precopy_ioctl where
>>>> copy_to_user is
>>>>> called under state_mutex.
>>>>
>>>> Why? Does mlx5 do that? It looked Ok to me:
>>>>
>>>>          mlx5vf_state_mutex_unlock(mvdev);
>>>>          if (copy_to_user((void __user *)arg, &info, minsz))
>>>>                  return -EFAULT;
>>>
>>> Indeed, thanks, Jason. BTW, is there any reason why was 
>>> "deferred_reset" mode
>>> still implemented in mlx5 driver given this deadlock condition has 
>>> been avoided
>>> with migration protocol v2 implementation.
>>
>> I do not remember. Yishai?
>>
> 
> Long time passed.., I also don't fully remember whether this was the 
> only potential problem here, maybe Yes.
> 
> My plan is to prepare a cleanup patch for mlx5 and put it into our 
> regression for a while, if all will be good, I may send it for the next 
> kernel cycle.
> 
> By the way, there are other drivers around (i.e. hisi and mtty) that 
> still run copy_to_user() under the state mutex and might hit the problem 
> without the 'deferred_rest', see here[1].
> 
> If we'll reach to the conclusion that the only reason for that mechanism 
> was the copy_to_user() under the state_mutex, those drivers can change 
> their code easily and also send a patch to cleanup the 'deferred_reset'.
> 
> [1] 
> https://elixir.bootlin.com/linux/v6.8-rc5/source/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c#L808
> [2] 
> https://elixir.bootlin.com/linux/v6.8-rc5/source/samples/vfio-mdev/mtty.c#L878
> 
> Yishai
> 

From an extra round of code review, it looks as if we still need the 
'deferred_reset' flow in mlx5.

For example, we call copy_from_user() as part of 
mlx5vf_resume_read_header(), which is called by mlx5vf_resume_write() 
under the state mutex.

So, no change is expected in mlx5.

Yishai
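
To make that contrast concrete, a rough sketch (illustrative names only, not the actual mlx5 code) of a resume write path where the user copy genuinely has to run under state_mutex because the incoming data is parsed under the same lock; in that situation the deferred-reset aware unlock, same shape as qat_vf_state_mutex_unlock() in the patch below, is what keeps reset_done from needing the mutex directly:

#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>

/* Sketch only; example_dev and its helpers are illustrative. */
struct example_dev {
	struct mutex state_mutex;
	spinlock_t reset_lock;
	bool deferred_reset;
	u8 *state_buf;			/* staging buffer for the device state */
};

/* deferred-reset aware unlock, mirroring qat_vf_state_mutex_unlock() below */
static void example_state_mutex_unlock(struct example_dev *dev)
{
again:
	spin_lock(&dev->reset_lock);
	if (dev->deferred_reset) {
		dev->deferred_reset = false;
		spin_unlock(&dev->reset_lock);
		/* ...reset the migration state here... */
		goto again;
	}
	mutex_unlock(&dev->state_mutex);
	spin_unlock(&dev->reset_lock);
}

static ssize_t example_resume_write(struct example_dev *dev,
				    const char __user *buf, size_t len,
				    loff_t *pos)
{
	ssize_t done = len;

	mutex_lock(&dev->state_mutex);
	/*
	 * The incoming state is consumed while it is written, so this
	 * copy_from_user() (which may fault and take mmap_lock) runs with
	 * state_mutex held; reset_done therefore cannot simply take the
	 * mutex, and the deferred_reset scheme stays in place.
	 */
	if (copy_from_user(dev->state_buf + *pos, buf, len))
		done = -EFAULT;
	else
		*pos += len;
	example_state_mutex_unlock(dev);
	return done;
}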
Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 8d1052fa6a69..c1d3e4cb3892 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23095,6 +23095,14 @@  S:	Maintained
 F:	Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst
 F:	drivers/vfio/pci/pds/
 
+VFIO QAT PCI DRIVER
+M:	Xin Zeng <xin.zeng@intel.com>
+M:	Giovanni Cabiddu <giovanni.cabiddu@intel.com>
+L:	kvm@vger.kernel.org
+L:	qat-linux@intel.com
+S:	Supported
+F:	drivers/vfio/pci/intel/qat/
+
 VFIO PLATFORM DRIVER
 M:	Eric Auger <eric.auger@redhat.com>
 L:	kvm@vger.kernel.org
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 18c397df566d..329d25c53274 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -67,4 +67,6 @@  source "drivers/vfio/pci/pds/Kconfig"
 
 source "drivers/vfio/pci/virtio/Kconfig"
 
+source "drivers/vfio/pci/intel/qat/Kconfig"
+
 endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 046139a4eca5..a87b6b43ce1c 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -15,3 +15,5 @@  obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
 obj-$(CONFIG_PDS_VFIO_PCI) += pds/
 
 obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/
+
+obj-$(CONFIG_QAT_VFIO_PCI) += intel/qat/
diff --git a/drivers/vfio/pci/intel/qat/Kconfig b/drivers/vfio/pci/intel/qat/Kconfig
new file mode 100644
index 000000000000..71b28ac0bf6a
--- /dev/null
+++ b/drivers/vfio/pci/intel/qat/Kconfig
@@ -0,0 +1,13 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+config QAT_VFIO_PCI
+	tristate "VFIO support for QAT VF PCI devices"
+	select VFIO_PCI_CORE
+	depends on CRYPTO_DEV_QAT
+	depends on CRYPTO_DEV_QAT_4XXX
+	help
+	  This provides migration support for Intel(R) QAT Virtual Function
+	  using the VFIO framework.
+
+	  To compile this as a module, choose M here: the module
+	  will be called qat_vfio_pci. If you don't know what to do here,
+	  say N.
diff --git a/drivers/vfio/pci/intel/qat/Makefile b/drivers/vfio/pci/intel/qat/Makefile
new file mode 100644
index 000000000000..9289ae4c51bf
--- /dev/null
+++ b/drivers/vfio/pci/intel/qat/Makefile
@@ -0,0 +1,4 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o
+qat_vfio_pci-y := main.o
+
diff --git a/drivers/vfio/pci/intel/qat/main.c b/drivers/vfio/pci/intel/qat/main.c
new file mode 100644
index 000000000000..85d0ed701397
--- /dev/null
+++ b/drivers/vfio/pci/intel/qat/main.c
@@ -0,0 +1,572 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation */
+
+#include <linux/anon_inodes.h>
+#include <linux/container_of.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/qat/qat_mig_dev.h>
+
+struct qat_vf_migration_file {
+	struct file *filp;
+	/* protects migration region context */
+	struct mutex lock;
+	bool disabled;
+	struct qat_mig_dev *mdev;
+};
+
+struct qat_vf_core_device {
+	struct vfio_pci_core_device core_device;
+	struct qat_mig_dev *mdev;
+	/* protects migration state */
+	struct mutex state_mutex;
+	enum vfio_device_mig_state mig_state;
+	/* protects reset down flow */
+	spinlock_t reset_lock;
+	bool deferred_reset;
+	struct qat_vf_migration_file *resuming_migf;
+	struct qat_vf_migration_file *saving_migf;
+};
+
+static int qat_vf_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct qat_vf_core_device *qat_vdev =
+		container_of(core_vdev, struct qat_vf_core_device,
+			     core_device.vdev);
+	struct vfio_pci_core_device *vdev = &qat_vdev->core_device;
+	int ret;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	ret = qat_vdev->mdev->ops->open(qat_vdev->mdev);
+	if (ret) {
+		vfio_pci_core_disable(vdev);
+		return ret;
+	}
+	qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+
+	vfio_pci_core_finish_enable(vdev);
+
+	return 0;
+}
+
+static void qat_vf_disable_fd(struct qat_vf_migration_file *migf)
+{
+	mutex_lock(&migf->lock);
+	migf->disabled = true;
+	migf->filp->f_pos = 0;
+	mutex_unlock(&migf->lock);
+}
+
+static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev)
+{
+	if (qat_vdev->resuming_migf) {
+		qat_vf_disable_fd(qat_vdev->resuming_migf);
+		fput(qat_vdev->resuming_migf->filp);
+		qat_vdev->resuming_migf = NULL;
+	}
+
+	if (qat_vdev->saving_migf) {
+		qat_vf_disable_fd(qat_vdev->saving_migf);
+		fput(qat_vdev->saving_migf->filp);
+		qat_vdev->saving_migf = NULL;
+	}
+}
+
+static void qat_vf_pci_close_device(struct vfio_device *core_vdev)
+{
+	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+			struct qat_vf_core_device, core_device.vdev);
+
+	qat_vdev->mdev->ops->close(qat_vdev->mdev);
+	qat_vf_disable_fds(qat_vdev);
+	vfio_pci_core_close_device(core_vdev);
+}
+
+static ssize_t qat_vf_save_read(struct file *filp, char __user *buf,
+				size_t len, loff_t *pos)
+{
+	struct qat_vf_migration_file *migf = filp->private_data;
+	ssize_t done = 0;
+	loff_t *offs;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	offs = &filp->f_pos;
+
+	mutex_lock(&migf->lock);
+	if (*offs > migf->mdev->state_size || *offs < 0) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (migf->disabled) {
+		done = -ENODEV;
+		goto out_unlock;
+	}
+
+	len = min_t(size_t, migf->mdev->state_size - *offs, len);
+	if (len) {
+		ret = copy_to_user(buf, migf->mdev->state + *offs, len);
+		if (ret) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*offs += len;
+		done = len;
+	}
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static int qat_vf_release_file(struct inode *inode, struct file *filp)
+{
+	struct qat_vf_migration_file *migf = filp->private_data;
+
+	qat_vf_disable_fd(migf);
+	mutex_destroy(&migf->lock);
+	kfree(migf);
+
+	return 0;
+}
+
+static const struct file_operations qat_vf_save_fops = {
+	.owner = THIS_MODULE,
+	.read = qat_vf_save_read,
+	.release = qat_vf_release_file,
+	.llseek = no_llseek,
+};
+
+static struct qat_vf_migration_file *
+qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev)
+{
+	struct qat_vf_migration_file *migf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops, migf, O_RDONLY);
+	ret = PTR_ERR_OR_ZERO(migf->filp);
+	if (ret) {
+		kfree(migf);
+		return ERR_PTR(ret);
+	}
+
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+
+	ret = qat_vdev->mdev->ops->save_state(qat_vdev->mdev);
+	if (ret) {
+		fput(migf->filp);
+		kfree(migf);
+		return ERR_PTR(ret);
+	}
+
+	migf->mdev = qat_vdev->mdev;
+
+	return migf;
+}
+
+static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf,
+				   size_t len, loff_t *pos)
+{
+	struct qat_vf_migration_file *migf = filp->private_data;
+	loff_t end, *offs;
+	ssize_t done = 0;
+	int ret;
+
+	if (pos)
+		return -ESPIPE;
+	offs = &filp->f_pos;
+
+	if (*offs < 0 ||
+	    check_add_overflow((loff_t)len, *offs, &end))
+		return -EOVERFLOW;
+
+	if (end > migf->mdev->state_size)
+		return -ENOMEM;
+
+	mutex_lock(&migf->lock);
+	if (migf->disabled) {
+		done = -ENODEV;
+		goto out_unlock;
+	}
+
+	ret = copy_from_user(migf->mdev->state + *offs, buf, len);
+	if (ret) {
+		done = -EFAULT;
+		goto out_unlock;
+	}
+	*offs += len;
+	done = len;
+
+out_unlock:
+	mutex_unlock(&migf->lock);
+	return done;
+}
+
+static const struct file_operations qat_vf_resume_fops = {
+	.owner = THIS_MODULE,
+	.write = qat_vf_resume_write,
+	.release = qat_vf_release_file,
+	.llseek = no_llseek,
+};
+
+static struct qat_vf_migration_file *
+qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev)
+{
+	struct qat_vf_migration_file *migf;
+	int ret;
+
+	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+	if (!migf)
+		return ERR_PTR(-ENOMEM);
+
+	migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops, migf, O_WRONLY);
+	ret = PTR_ERR_OR_ZERO(migf->filp);
+	if (ret) {
+		kfree(migf);
+		return ERR_PTR(ret);
+	}
+
+	migf->mdev = qat_vdev->mdev;
+	stream_open(migf->filp->f_inode, migf->filp);
+	mutex_init(&migf->lock);
+
+	return migf;
+}
+
+static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev)
+{
+	return qat_vdev->mdev->ops->load_state(qat_vdev->mdev);
+}
+
+static struct file *qat_vf_pci_step_device_state(struct qat_vf_core_device *qat_vdev, u32 new)
+{
+	u32 cur = qat_vdev->mig_state;
+	int ret;
+
+	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+		ret = qat_vdev->mdev->ops->suspend(qat_vdev->mdev);
+		if (ret)
+			return ERR_PTR(ret);
+		return NULL;
+	}
+
+	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
+	    (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
+		return NULL;
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+		struct qat_vf_migration_file *migf;
+
+		migf = qat_vf_save_device_data(qat_vdev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		qat_vdev->saving_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
+		qat_vf_disable_fds(qat_vdev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+		struct qat_vf_migration_file *migf;
+
+		migf = qat_vf_resume_device_data(qat_vdev);
+		if (IS_ERR(migf))
+			return ERR_CAST(migf);
+		get_file(migf->filp);
+		qat_vdev->resuming_migf = migf;
+		return migf->filp;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+		ret = qat_vf_load_device_data(qat_vdev);
+		if (ret)
+			return ERR_PTR(ret);
+
+		qat_vf_disable_fds(qat_vdev);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+		qat_vdev->mdev->ops->resume(qat_vdev->mdev);
+		return NULL;
+	}
+
+	/* vfio_mig_get_next_state() does not use arcs other than the above */
+	WARN_ON(true);
+	return ERR_PTR(-EINVAL);
+}
+
+static void qat_vf_state_mutex_unlock(struct qat_vf_core_device *qat_vdev)
+{
+again:
+	spin_lock(&qat_vdev->reset_lock);
+	if (qat_vdev->deferred_reset) {
+		qat_vdev->deferred_reset = false;
+		spin_unlock(&qat_vdev->reset_lock);
+		qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+		qat_vf_disable_fds(qat_vdev);
+		goto again;
+	}
+	mutex_unlock(&qat_vdev->state_mutex);
+	spin_unlock(&qat_vdev->reset_lock);
+}
+
+static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev,
+						enum vfio_device_mig_state new_state)
+{
+	struct qat_vf_core_device *qat_vdev = container_of(vdev,
+			struct qat_vf_core_device, core_device.vdev);
+	enum vfio_device_mig_state next_state;
+	struct file *res = NULL;
+	int ret;
+
+	mutex_lock(&qat_vdev->state_mutex);
+	while (new_state != qat_vdev->mig_state) {
+		ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state,
+					      new_state, &next_state);
+		if (ret) {
+			res = ERR_PTR(ret);
+			break;
+		}
+		res = qat_vf_pci_step_device_state(qat_vdev, next_state);
+		if (IS_ERR(res))
+			break;
+		qat_vdev->mig_state = next_state;
+		if (WARN_ON(res && new_state != qat_vdev->mig_state)) {
+			fput(res);
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	qat_vf_state_mutex_unlock(qat_vdev);
+
+	return res;
+}
+
+static int qat_vf_pci_get_device_state(struct vfio_device *vdev,
+				       enum vfio_device_mig_state *curr_state)
+{
+	struct qat_vf_core_device *qat_vdev = container_of(vdev,
+			struct qat_vf_core_device, core_device.vdev);
+
+	mutex_lock(&qat_vdev->state_mutex);
+	*curr_state = qat_vdev->mig_state;
+	qat_vf_state_mutex_unlock(qat_vdev);
+
+	return 0;
+}
+
+static int qat_vf_pci_get_data_size(struct vfio_device *vdev,
+				    unsigned long *stop_copy_length)
+{
+	struct qat_vf_core_device *qat_vdev = container_of(vdev,
+			struct qat_vf_core_device, core_device.vdev);
+
+	*stop_copy_length = qat_vdev->mdev->state_size;
+	return 0;
+}
+
+static const struct vfio_migration_ops qat_vf_pci_mig_ops = {
+	.migration_set_state = qat_vf_pci_set_device_state,
+	.migration_get_state = qat_vf_pci_get_device_state,
+	.migration_get_data_size = qat_vf_pci_get_data_size,
+};
+
+static void qat_vf_pci_release_dev(struct vfio_device *core_vdev)
+{
+	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+			struct qat_vf_core_device, core_device.vdev);
+
+	qat_vdev->mdev->ops->cleanup(qat_vdev->mdev);
+	qat_vfmig_destroy(qat_vdev->mdev);
+	mutex_destroy(&qat_vdev->state_mutex);
+	vfio_pci_core_release_dev(core_vdev);
+}
+
+static int qat_vf_pci_init_dev(struct vfio_device *core_vdev)
+{
+	struct qat_vf_core_device *qat_vdev = container_of(core_vdev,
+			struct qat_vf_core_device, core_device.vdev);
+	struct qat_migdev_ops *ops;
+	struct qat_mig_dev *mdev;
+	struct pci_dev *parent;
+	int ret, vf_id;
+
+	core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+	core_vdev->mig_ops = &qat_vf_pci_mig_ops;
+
+	ret = vfio_pci_core_init_dev(core_vdev);
+	if (ret)
+		return ret;
+
+	mutex_init(&qat_vdev->state_mutex);
+	spin_lock_init(&qat_vdev->reset_lock);
+
+	parent = qat_vdev->core_device.pdev->physfn;
+	vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev);
+	if (!parent || vf_id < 0) {
+		ret = -ENODEV;
+		goto err_rel;
+	}
+
+	mdev = qat_vfmig_create(parent, vf_id);
+	if (IS_ERR(mdev)) {
+		ret = PTR_ERR(mdev);
+		goto err_rel;
+	}
+
+	ops = mdev->ops;
+	if (!ops || !ops->init || !ops->cleanup ||
+	    !ops->open || !ops->close ||
+	    !ops->save_state || !ops->load_state ||
+	    !ops->suspend || !ops->resume) {
+		ret = -EIO;
+		dev_err(&parent->dev, "Incomplete device migration ops structure!");
+		goto err_destroy;
+	}
+	ret = ops->init(mdev);
+	if (ret)
+		goto err_destroy;
+
+	qat_vdev->mdev = mdev;
+
+	return 0;
+
+err_destroy:
+	qat_vfmig_destroy(mdev);
+err_rel:
+	vfio_pci_core_release_dev(core_vdev);
+	return ret;
+}
+
+static const struct vfio_device_ops qat_vf_pci_ops = {
+	.name = "qat-vf-vfio-pci",
+	.init = qat_vf_pci_init_dev,
+	.release = qat_vf_pci_release_dev,
+	.open_device = qat_vf_pci_open_device,
+	.close_device = qat_vf_pci_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.read = vfio_pci_core_read,
+	.write = vfio_pci_core_write,
+	.mmap = vfio_pci_core_mmap,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+	.bind_iommufd = vfio_iommufd_physical_bind,
+	.unbind_iommufd = vfio_iommufd_physical_unbind,
+	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev);
+
+	return container_of(core_device, struct qat_vf_core_device, core_device);
+}
+
+static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
+
+	if (!qat_vdev->core_device.vdev.mig_ops)
+		return;
+
+	/*
+	 * As the higher VFIO layers are holding locks across reset and using
+	 * those same locks with the mm_lock we need to prevent ABBA deadlock
+	 * with the state_mutex and mm_lock.
+	 * In case the state_mutex was taken already we defer the cleanup work
+	 * to the unlock flow of the other running context.
+	 */
+	spin_lock(&qat_vdev->reset_lock);
+	qat_vdev->deferred_reset = true;
+	if (!mutex_trylock(&qat_vdev->state_mutex)) {
+		spin_unlock(&qat_vdev->reset_lock);
+		return;
+	}
+	spin_unlock(&qat_vdev->reset_lock);
+	qat_vf_state_mutex_unlock(qat_vdev);
+}
+
+static int
+qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	struct qat_vf_core_device *qat_vdev;
+	int ret;
+
+	qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev, dev, &qat_vf_pci_ops);
+	if (IS_ERR(qat_vdev))
+		return PTR_ERR(qat_vdev);
+
+	pci_set_drvdata(pdev, &qat_vdev->core_device);
+	ret = vfio_pci_core_register_device(&qat_vdev->core_device);
+	if (ret)
+		goto out_put_device;
+
+	return 0;
+
+out_put_device:
+	vfio_put_device(&qat_vdev->core_device.vdev);
+	return ret;
+}
+
+static void qat_vf_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev);
+
+	vfio_pci_core_unregister_device(&qat_vdev->core_device);
+	vfio_put_device(&qat_vdev->core_device.vdev);
+}
+
+static const struct pci_device_id qat_vf_vfio_pci_table[] = {
+	/* Intel QAT GEN4 4xxx VF device */
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) },
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) },
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) },
+	{}
+};
+MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table);
+
+static const struct pci_error_handlers qat_vf_err_handlers = {
+	.reset_done = qat_vf_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver qat_vf_vfio_pci_driver = {
+	.name = "qat_vfio_pci",
+	.id_table = qat_vf_vfio_pci_table,
+	.probe = qat_vf_vfio_pci_probe,
+	.remove = qat_vf_vfio_pci_remove,
+	.err_handler = &qat_vf_err_handlers,
+	.driver_managed_dma = true,
+};
+module_pci_driver(qat_vf_vfio_pci_driver)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family");
+MODULE_IMPORT_NS(CRYPTO_QAT);