
[RFC,v2,vfio,3/7] vfio/pds: Add VFIO live migration support

Message ID 20221214232136.64220-4-brett.creeley@amd.com (mailing list archive)
State RFC
Series pds vfio driver

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Brett Creeley Dec. 14, 2022, 11:21 p.m. UTC
Add live migration support via the VFIO subsystem. The migration
implementation aligns with the definition from uapi/vfio.h and uses the
auxiliary bus connection between pds_core/pds_vfio to communicate with
the device.

The ability to suspend, resume, and transfer VF device state data is
included along with the required admin queue command structures and
implementations.

PDS_LM_CMD_SUSPEND and PDS_LM_CMD_SUSPEND_STATUS are added to support
the VF device suspend operation.

PDS_LM_CMD_RESUME is added to support the VF device resume operation.

PDS_LM_CMD_STATUS is added to determine the exact size of the VF
device state data.

PDS_LM_CMD_SAVE is added to get the VF device state data.

PDS_LM_CMD_RESTORE is added to restore the VF device with the
previously saved data from PDS_LM_CMD_SAVE.

PDS_LM_CMD_HOST_VF_STATUS is added to notify the device when
a migration is in/not-in progress from the host's perspective.

Signed-off-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
---
 drivers/vfio/pci/pds/Makefile   |   1 +
 drivers/vfio/pci/pds/aux_drv.c  |   1 +
 drivers/vfio/pci/pds/cmds.c     | 312 ++++++++++++++++++++
 drivers/vfio/pci/pds/cmds.h     |  15 +
 drivers/vfio/pci/pds/lm.c       | 491 ++++++++++++++++++++++++++++++++
 drivers/vfio/pci/pds/lm.h       |  43 +++
 drivers/vfio/pci/pds/pci_drv.c  |  15 +
 drivers/vfio/pci/pds/vfio_dev.c | 119 +++++++-
 drivers/vfio/pci/pds/vfio_dev.h |  12 +
 include/linux/pds/pds_lm.h      | 206 ++++++++++++++
 10 files changed, 1214 insertions(+), 1 deletion(-)
 create mode 100644 drivers/vfio/pci/pds/lm.c
 create mode 100644 drivers/vfio/pci/pds/lm.h

Comments

Longfang Liu Jan. 3, 2023, 12:25 p.m. UTC | #1
On 2022/12/15 7:21, Brett Creeley wrote:
> Add live migration support via the VFIO subsystem. The migration
> implementation aligns with the definition from uapi/vfio.h and uses the
> auxiliary bus connection between pds_core/pds_vfio to communicate with
> the device.
> 
> The ability to suspend, resume, and transfer VF device state data is
> included along with the required admin queue command structures and
> implementations.
> 
> PDS_LM_CMD_SUSPEND and PDS_LM_CMD_SUSPEND_STATUS are added to support
> the VF device suspend operation.
> 
> PDS_LM_CMD_RESUME is added to support the VF device resume operation.
> 
> PDS_LM_CMD_STATUS is added to determine the exact size of the VF
> device state data.
> 
> PDS_LM_CMD_SAVE is added to get the VF device state data.
> 

Hi, this PDS device is an NVMe storage disk. When it supports live
migration, the amount of data stored on the disk can be very large.
How do you migrate such a large amount of disk data in a short period
of time?

Thanks,
Longfang.

> PDS_LM_CMD_RESTORE is added to restore the VF device with the
> previously saved data from PDS_LM_CMD_SAVE.
> 
> PDS_LM_CMD_HOST_VF_STATUS is added to notify the device when
> a migration is in/not-in progress from the host's perspective.
> 
> Signed-off-by: Brett Creeley <brett.creeley@amd.com>
> Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
> ---
>  drivers/vfio/pci/pds/Makefile   |   1 +
>  drivers/vfio/pci/pds/aux_drv.c  |   1 +
>  drivers/vfio/pci/pds/cmds.c     | 312 ++++++++++++++++++++
>  drivers/vfio/pci/pds/cmds.h     |  15 +
>  drivers/vfio/pci/pds/lm.c       | 491 ++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/pds/lm.h       |  43 +++
>  drivers/vfio/pci/pds/pci_drv.c  |  15 +
>  drivers/vfio/pci/pds/vfio_dev.c | 119 +++++++-
>  drivers/vfio/pci/pds/vfio_dev.h |  12 +
>  include/linux/pds/pds_lm.h      | 206 ++++++++++++++
>  10 files changed, 1214 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/vfio/pci/pds/lm.c
>  create mode 100644 drivers/vfio/pci/pds/lm.h
> 
> diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile
> index 49bf8289739b..b4980323cff4 100644
> --- a/drivers/vfio/pci/pds/Makefile
> +++ b/drivers/vfio/pci/pds/Makefile
> @@ -4,5 +4,6 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds_vfio.o
>  pds_vfio-y := \
>  	aux_drv.o	\
>  	cmds.o		\
> +	lm.o		\
>  	pci_drv.o	\
>  	vfio_dev.o
> diff --git a/drivers/vfio/pci/pds/aux_drv.c b/drivers/vfio/pci/pds/aux_drv.c
> index 494551894926..59b43075e85f 100644
> --- a/drivers/vfio/pci/pds/aux_drv.c
> +++ b/drivers/vfio/pci/pds/aux_drv.c
> @@ -77,6 +77,7 @@ pds_vfio_aux_probe(struct auxiliary_device *aux_dev,
>  	dev_dbg(dev, "%s: id %#04x busnr %#x devfn %#x bus %p pds_vfio %p\n",
>  		__func__, padev->id, busnr, devfn, bus, vfio_aux->pds_vfio);
>  
> +	vfio_aux->pds_vfio->coredev = aux_dev->dev.parent;
>  	vfio_aux->pds_vfio->vfio_aux = vfio_aux;
>  
>  	vfio_aux->padrv.event_handler = pds_vfio_aux_notify_handler;
> diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c
> index 5a3fadcd38d8..11823d824ccc 100644
> --- a/drivers/vfio/pci/pds/cmds.c
> +++ b/drivers/vfio/pci/pds/cmds.c
> @@ -3,9 +3,11 @@
>  
>  #include <linux/io.h>
>  #include <linux/types.h>
> +#include <linux/delay.h>
>  
>  #include <linux/pds/pds_core_if.h>
>  #include <linux/pds/pds_adminq.h>
> +#include <linux/pds/pds_lm.h>
>  #include <linux/pds/pds_auxbus.h>
>  
>  #include "vfio_dev.h"
> @@ -28,3 +30,313 @@ pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
>  
>  	padev->ops->unregister_client(padev);
>  }
> +
> +#define SUSPEND_TIMEOUT_S		5
> +#define SUSPEND_CHECK_INTERVAL_MS	1
> +
> +static int
> +pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	struct pds_lm_suspend_status_cmd cmd = {
> +		.opcode = PDS_LM_CMD_SUSPEND_STATUS,
> +		.vf_id	= cpu_to_le16(pds_vfio->vf_id),
> +	};
> +	struct pci_dev *pdev = pds_vfio->pdev;
> +	struct pds_lm_comp comp = { 0 };
> +	struct pds_auxiliary_dev *padev;
> +	unsigned long time_limit;
> +	unsigned long time_start;
> +	unsigned long time_done;
> +	int err;
> +
> +	padev = pds_vfio->vfio_aux->padev;
> +
> +	time_start = jiffies;
> +	time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
> +	do {
> +		err = padev->ops->adminq_cmd(padev,
> +					     (union pds_core_adminq_cmd *)&cmd,
> +					     sizeof(cmd),
> +					     (union pds_core_adminq_comp *)&comp,
> +					      PDS_AQ_FLAG_FASTPOLL);
> +		if (err != -EAGAIN)
> +			break;
> +
> +		msleep(SUSPEND_CHECK_INTERVAL_MS);
> +	} while (time_before(jiffies, time_limit));
> +
> +	time_done = jiffies;
> +	dev_dbg(&pdev->dev, "%s: vf%u: Suspend comp received in %d msecs\n",
> +		__func__, pds_vfio->vf_id,
> +		jiffies_to_msecs(time_done - time_start));
> +
> +	/* Check the results */
> +	if (time_after_eq(time_done, time_limit)) {
> +		dev_err(&pdev->dev, "%s: vf%u: Suspend comp timeout\n", __func__,
> +			pds_vfio->vf_id);
> +		err = -ETIMEDOUT;
> +	}
> +
> +	return err;
> +}
> +
> +int
> +pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	struct pds_lm_suspend_cmd cmd = {
> +		.opcode = PDS_LM_CMD_SUSPEND,
> +		.vf_id = cpu_to_le16(pds_vfio->vf_id),
> +	};
> +	struct pds_lm_suspend_comp comp = {0};
> +	struct pci_dev *pdev = pds_vfio->pdev;
> +	struct pds_auxiliary_dev *padev;
> +	int err;
> +
> +	dev_dbg(&pdev->dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
> +
> +	padev = pds_vfio->vfio_aux->padev;
> +	err = padev->ops->adminq_cmd(padev,
> +				     (union pds_core_adminq_cmd *)&cmd,
> +				     sizeof(cmd),
> +				     (union pds_core_adminq_comp *)&comp,
> +				     PDS_AQ_FLAG_FASTPOLL);
> +	if (err) {
> +		dev_err(&pdev->dev, "vf%u: Suspend failed: %pe\n",
> +			pds_vfio->vf_id, ERR_PTR(err));
> +		return err;
> +	}
> +
> +	return pds_vfio_suspend_wait_device_cmd(pds_vfio);
> +}
> +
> +int
> +pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	struct pds_lm_resume_cmd cmd = {
> +		.opcode = PDS_LM_CMD_RESUME,
> +		.vf_id = cpu_to_le16(pds_vfio->vf_id),
> +	};
> +	struct pds_auxiliary_dev *padev;
> +	struct pds_lm_comp comp = {0};
> +
> +	dev_dbg(&pds_vfio->pdev->dev, "vf%u: Resume device\n", pds_vfio->vf_id);
> +
> +	padev = pds_vfio->vfio_aux->padev;
> +	return padev->ops->adminq_cmd(padev,
> +				     (union pds_core_adminq_cmd *)&cmd,
> +				     sizeof(cmd),
> +				     (union pds_core_adminq_comp *)&comp,
> +				     0);
> +}
> +
> +int
> +pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
> +{
> +	struct pds_lm_status_cmd cmd = {
> +		.opcode = PDS_LM_CMD_STATUS,
> +		.vf_id = cpu_to_le16(pds_vfio->vf_id),
> +	};
> +	struct pds_lm_status_comp comp = {0};
> +	struct pds_auxiliary_dev *padev;
> +	int err = 0;
> +
> +	dev_dbg(&pds_vfio->pdev->dev, "vf%u: Get migration status\n",
> +		pds_vfio->vf_id);
> +
> +	padev = pds_vfio->vfio_aux->padev;
> +	err = padev->ops->adminq_cmd(padev,
> +				     (union pds_core_adminq_cmd *)&cmd,
> +				     sizeof(cmd),
> +				     (union pds_core_adminq_comp *)&comp,
> +				     0);
> +	if (err)
> +		return err;
> +
> +	*size = le64_to_cpu(comp.size);
> +	return 0;
> +}
> +
> +static int
> +pds_vfio_dma_map_lm_file(struct device *dev, enum dma_data_direction dir,
> +			 struct pds_vfio_lm_file *lm_file)
> +{
> +	struct pds_lm_sg_elem *sgl, *sge;
> +	struct scatterlist *sg;
> +	int err = 0;
> +	int i;
> +
> +	if (!lm_file)
> +		return -EINVAL;
> +
> +	/* dma map file pages */
> +	err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
> +	if (err)
> +		goto err_dma_map_sg;
> +
> +	lm_file->num_sge = lm_file->sg_table.nents;
> +
> +	/* alloc sgl */
> +	sgl = dma_alloc_coherent(dev, lm_file->num_sge *
> +				 sizeof(struct pds_lm_sg_elem),
> +				 &lm_file->sgl_addr, GFP_KERNEL);
> +	if (!sgl) {
> +		err = -ENOMEM;
> +		goto err_alloc_sgl;
> +	}
> +
> +	lm_file->sgl = sgl;
> +
> +	/* fill sgl */
> +	sge = sgl;
> +	for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
> +		sge->addr = cpu_to_le64(sg_dma_address(sg));
> +		sge->len  = cpu_to_le32(sg_dma_len(sg));
> +		dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
> +		sge++;
> +	}
> +
> +	return 0;
> +
> +err_alloc_sgl:
> +	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
> +err_dma_map_sg:
> +	return err;
> +}
> +
> +static void
> +pds_vfio_dma_unmap_lm_file(struct device *dev, enum dma_data_direction dir,
> +			   struct pds_vfio_lm_file *lm_file)
> +{
> +	if (!lm_file)
> +		return;
> +
> +	/* free sgl */
> +	if (lm_file->sgl) {
> +		dma_free_coherent(dev, lm_file->num_sge *
> +				  sizeof(struct pds_lm_sg_elem),
> +				  lm_file->sgl, lm_file->sgl_addr);
> +		lm_file->sgl = NULL;
> +		lm_file->sgl_addr = DMA_MAPPING_ERROR;
> +		lm_file->num_sge = 0;
> +	}
> +
> +	/* dma unmap file pages */
> +	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
> +}
> +
> +int
> +pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	struct pds_lm_save_cmd cmd = {
> +		.opcode = PDS_LM_CMD_SAVE,
> +		.vf_id = cpu_to_le16(pds_vfio->vf_id),
> +	};
> +	struct pci_dev *pdev = pds_vfio->pdev;
> +	struct pds_vfio_lm_file *lm_file;
> +	struct pds_auxiliary_dev *padev;
> +	struct pds_lm_comp comp = {0};
> +	int err;
> +
> +	dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
> +
> +	lm_file = pds_vfio->save_file;
> +
> +	padev = pds_vfio->vfio_aux->padev;
> +	err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
> +	if (err) {
> +		dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
> +			ERR_PTR(err));
> +		return err;
> +	}
> +
> +	cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
> +	cmd.num_sge = cpu_to_le32(lm_file->num_sge);
> +
> +	err = padev->ops->adminq_cmd(padev,
> +				     (union pds_core_adminq_cmd *)&cmd,
> +				     sizeof(cmd),
> +				     (union pds_core_adminq_comp *)&comp,
> +				     0);
> +	if (err)
> +		dev_err(&pdev->dev, "failed to get migration state: %pe\n",
> +			ERR_PTR(err));
> +
> +	pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
> +
> +	return err;
> +}
> +
> +int
> +pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	struct pds_lm_restore_cmd cmd = {
> +		.opcode = PDS_LM_CMD_RESTORE,
> +		.vf_id = cpu_to_le16(pds_vfio->vf_id),
> +	};
> +	struct pci_dev *pdev = pds_vfio->pdev;
> +	struct pds_vfio_lm_file *lm_file;
> +	struct pds_auxiliary_dev *padev;
> +	struct pds_lm_comp comp = {0};
> +	int err;
> +
> +	dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
> +
> +	lm_file = pds_vfio->restore_file;
> +
> +	padev = pds_vfio->vfio_aux->padev;
> +	err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
> +	if (err) {
> +		dev_err(&pdev->dev, "failed to map restore migration file: %pe\n",
> +			ERR_PTR(err));
> +		return err;
> +	}
> +
> +	cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
> +	cmd.num_sge = cpu_to_le32(lm_file->num_sge);
> +
> +	err = padev->ops->adminq_cmd(padev,
> +				     (union pds_core_adminq_cmd *)&cmd,
> +				     sizeof(cmd),
> +				     (union pds_core_adminq_comp *)&comp,
> +				     0);
> +	if (err)
> +		dev_err(&pdev->dev, "failed to set migration state: %pe\n",
> +			ERR_PTR(err));
> +
> +	pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
> +
> +	return err;
> +}
> +
> +void
> +pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
> +				    enum pds_lm_host_vf_status vf_status)
> +{
> +	struct pds_auxiliary_dev *padev = pds_vfio->vfio_aux->padev;
> +	struct pds_lm_host_vf_status_cmd cmd = {
> +		.opcode = PDS_LM_CMD_HOST_VF_STATUS,
> +		.vf_id = cpu_to_le16(pds_vfio->vf_id),
> +		.status = vf_status,
> +	};
> +	struct pci_dev *pdev = pds_vfio->pdev;
> +	struct pds_lm_comp comp = {0};
> +	int err;
> +
> +	dev_dbg(&pdev->dev, "vf%u: Set host VF LM status: %u",
> +		pds_vfio->vf_id, cmd.status);
> +	if (vf_status != PDS_LM_STA_IN_PROGRESS &&
> +	    vf_status != PDS_LM_STA_NONE) {
> +		dev_warn(&pdev->dev, "Invalid host VF migration status, %d\n",
> +			 vf_status);
> +		return;
> +	}
> +
> +	err = padev->ops->adminq_cmd(padev,
> +				     (union pds_core_adminq_cmd *)&cmd,
> +				     sizeof(cmd),
> +				     (union pds_core_adminq_comp *)&comp,
> +				     0);
> +	if (err)
> +		dev_warn(&pdev->dev, "failed to send host VF migration status: %pe\n",
> +			 ERR_PTR(err));
> +}
> diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h
> index 7fe2d1efd894..5f9ac45ee5a3 100644
> --- a/drivers/vfio/pci/pds/cmds.h
> +++ b/drivers/vfio/pci/pds/cmds.h
> @@ -4,11 +4,26 @@
>  #ifndef _CMDS_H_
>  #define _CMDS_H_
>  
> +#include <linux/pds/pds_lm.h>
> +
>  struct pds_vfio_pci_device;
>  
>  int
>  pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
>  void
>  pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
> +int
> +pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio);
> +int
> +pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio);
> +int
> +pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
> +int
> +pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
> +int
> +pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
> +void
> +pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
> +				    enum pds_lm_host_vf_status vf_status);
>  
>  #endif /* _CMDS_H_ */
> diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
> new file mode 100644
> index 000000000000..86152b18a5b7
> --- /dev/null
> +++ b/drivers/vfio/pci/pds/lm.c
> @@ -0,0 +1,491 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2022 Pensando Systems, Inc */
> +
> +#include <linux/anon_inodes.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/highmem.h>
> +#include <linux/vfio.h>
> +#include <linux/vfio_pci_core.h>
> +
> +#include "cmds.h"
> +#include "vfio_dev.h"
> +
> +#define PDS_VFIO_LM_FILENAME	"pds_vfio_lm"
> +
> +const char *
> +pds_vfio_lm_state(enum vfio_device_mig_state state)
> +{
> +	switch (state) {
> +	case VFIO_DEVICE_STATE_ERROR:
> +		return "VFIO_DEVICE_STATE_ERROR";
> +	case VFIO_DEVICE_STATE_STOP:
> +		return "VFIO_DEVICE_STATE_STOP";
> +	case VFIO_DEVICE_STATE_RUNNING:
> +		return "VFIO_DEVICE_STATE_RUNNING";
> +	case VFIO_DEVICE_STATE_STOP_COPY:
> +		return "VFIO_DEVICE_STATE_STOP_COPY";
> +	case VFIO_DEVICE_STATE_RESUMING:
> +		return "VFIO_DEVICE_STATE_RESUMING";
> +	case VFIO_DEVICE_STATE_RUNNING_P2P:
> +		return "VFIO_DEVICE_STATE_RUNNING_P2P";
> +	default:
> +		return "VFIO_DEVICE_STATE_INVALID";
> +	}
> +
> +	return "VFIO_DEVICE_STATE_INVALID";
> +}
> +
> +static struct pds_vfio_lm_file *
> +pds_vfio_get_lm_file(const char *name, const struct file_operations *fops,
> +		     int flags, u64 size)
> +{
> +	struct pds_vfio_lm_file *lm_file = NULL;
> +	unsigned long long npages;
> +	unsigned long long i;
> +	struct page **pages;
> +	int err = 0;
> +
> +	if (!size)
> +		return NULL;
> +
> +	/* Alloc file structure */
> +	lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
> +	if (!lm_file)
> +		return NULL;
> +
> +	/* Create file */
> +	lm_file->filep = anon_inode_getfile(name, fops, lm_file, flags);
> +	if (!lm_file->filep)
> +		goto err_get_file;
> +
> +	stream_open(lm_file->filep->f_inode, lm_file->filep);
> +	mutex_init(&lm_file->lock);
> +
> +	lm_file->size = size;
> +
> +	/* Allocate memory for file pages */
> +	npages = DIV_ROUND_UP_ULL(lm_file->size, PAGE_SIZE);
> +
> +	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
> +	if (!pages)
> +		goto err_alloc_pages;
> +
> +	for (i = 0; i < npages; i++) {
> +		pages[i] = alloc_page(GFP_KERNEL);
> +		if (!pages[i])
> +			goto err_alloc_page;
> +	}
> +
> +	lm_file->pages = pages;
> +	lm_file->npages = npages;
> +	lm_file->alloc_size = npages * PAGE_SIZE;
> +
> +	/* Create scatterlist of file pages to use for DMA mapping later */
> +	err = sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages,
> +					0, size, GFP_KERNEL);
> +	if (err)
> +		goto err_alloc_sg_table;
> +
> +	return lm_file;
> +
> +err_alloc_sg_table:
> +err_alloc_page:
> +	/* free allocated pages */
> +	for (i = 0; i < npages && pages[i]; i++)
> +		__free_page(pages[i]);
> +	kfree(pages);
> +err_alloc_pages:
> +	fput(lm_file->filep);
> +	mutex_destroy(&lm_file->lock);
> +err_get_file:
> +	kfree(lm_file);
> +
> +	return NULL;
> +}
> +
> +static void
> +pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
> +{
> +	unsigned long long i;
> +
> +	mutex_lock(&lm_file->lock);
> +
> +	lm_file->size = 0;
> +	lm_file->alloc_size = 0;
> +
> +	/* Free scatter list of file pages*/
> +	sg_free_table(&lm_file->sg_table);
> +
> +	/* Free allocated file pages */
> +	for (i = 0; i < lm_file->npages && lm_file->pages[i]; i++)
> +		__free_page(lm_file->pages[i]);
> +	kfree(lm_file->pages);
> +	lm_file->pages = NULL;
> +
> +	/* Delete file */
> +	fput(lm_file->filep);
> +	lm_file->filep = NULL;
> +
> +	mutex_unlock(&lm_file->lock);
> +
> +	mutex_destroy(&lm_file->lock);
> +
> +	/* Free file structure */
> +	kfree(lm_file);
> +}
> +
> +void
> +pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	if (!pds_vfio->save_file)
> +		return;
> +
> +	pds_vfio_put_lm_file(pds_vfio->save_file);
> +	pds_vfio->save_file = NULL;
> +}
> +
> +void
> +pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	if (!pds_vfio->restore_file)
> +		return;
> +
> +	pds_vfio_put_lm_file(pds_vfio->restore_file);
> +	pds_vfio->restore_file = NULL;
> +}
> +
> +static struct page *
> +pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
> +		       unsigned long offset)
> +{
> +	unsigned long cur_offset = 0;
> +	struct scatterlist *sg;
> +	unsigned int i;
> +
> +	/* All accesses are sequential */
> +	if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
> +		lm_file->last_offset = 0;
> +		lm_file->last_offset_sg = lm_file->sg_table.sgl;
> +		lm_file->sg_last_entry = 0;
> +	}
> +
> +	cur_offset = lm_file->last_offset;
> +
> +	for_each_sg(lm_file->last_offset_sg, sg,
> +		    lm_file->sg_table.orig_nents - lm_file->sg_last_entry,
> +		    i) {
> +		if (offset < sg->length + cur_offset) {
> +			lm_file->last_offset_sg = sg;
> +			lm_file->sg_last_entry += i;
> +			lm_file->last_offset = cur_offset;
> +			return nth_page(sg_page(sg),
> +					(offset - cur_offset) / PAGE_SIZE);
> +		}
> +		cur_offset += sg->length;
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +pds_vfio_release_file(struct inode *inode, struct file *filp)
> +{
> +	struct pds_vfio_lm_file *lm_file = filp->private_data;
> +
> +	lm_file->size = 0;
> +
> +	return 0;
> +}
> +
> +static ssize_t
> +pds_vfio_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
> +{
> +	struct pds_vfio_lm_file *lm_file = filp->private_data;
> +	ssize_t done = 0;
> +
> +	if (pos)
> +		return -ESPIPE;
> +	pos = &filp->f_pos;
> +
> +	mutex_lock(&lm_file->lock);
> +	if (*pos > lm_file->size) {
> +		done = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	len = min_t(size_t, lm_file->size - *pos, len);
> +	while (len) {
> +		size_t page_offset;
> +		struct page *page;
> +		size_t page_len;
> +		u8 *from_buff;
> +		int err;
> +
> +		page_offset = (*pos) % PAGE_SIZE;
> +		page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
> +		if (!page) {
> +			if (done == 0)
> +				done = -EINVAL;
> +			goto out_unlock;
> +		}
> +
> +		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
> +		from_buff = kmap_local_page(page);
> +		err = copy_to_user(buf, from_buff + page_offset, page_len);
> +		kunmap_local(from_buff);
> +		if (err) {
> +			done = -EFAULT;
> +			goto out_unlock;
> +		}
> +		*pos += page_len;
> +		len -= page_len;
> +		done += page_len;
> +		buf += page_len;
> +	}
> +
> +out_unlock:
> +	mutex_unlock(&lm_file->lock);
> +	return done;
> +}
> +
> +static const struct file_operations
> +pds_vfio_save_fops = {
> +	.owner = THIS_MODULE,
> +	.read = pds_vfio_save_read,
> +	.release = pds_vfio_release_file,
> +	.llseek = no_llseek,
> +};
> +
> +static int
> +pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	struct pci_dev *pdev = pds_vfio->pdev;
> +	struct pds_vfio_lm_file *lm_file;
> +	int err = 0;
> +	u64 size;
> +
> +	/* Get live migration state size in this state */
> +	err = pds_vfio_get_lm_status_cmd(pds_vfio, &size);
> +	if (err) {
> +		dev_err(&pdev->dev, "failed to get save status: %pe\n",
> +			ERR_PTR(err));
> +		goto err_get_lm_status;
> +	}
> +
> +	dev_dbg(&pdev->dev, "save status, size = %lld\n", size);
> +
> +	if (!size) {
> +		err = -EIO;
> +		dev_err(&pdev->dev, "invalid state size\n");
> +		goto err_get_lm_status;
> +	}
> +
> +	lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
> +				       &pds_vfio_save_fops,
> +				       O_RDONLY, size);
> +	if (!lm_file) {
> +		err = -ENOENT;
> +		dev_err(&pdev->dev, "failed to create save file\n");
> +		goto err_get_lm_file;
> +	}
> +
> +	dev_dbg(&pdev->dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
> +		lm_file->size, lm_file->alloc_size, lm_file->npages);
> +
> +	pds_vfio->save_file = lm_file;
> +
> +	return 0;
> +
> +err_get_lm_file:
> +err_get_lm_status:
> +	return err;
> +}
> +
> +static ssize_t
> +pds_vfio_restore_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos)
> +{
> +	struct pds_vfio_lm_file *lm_file = filp->private_data;
> +	loff_t requested_length;
> +	ssize_t done = 0;
> +
> +	if (pos)
> +		return -ESPIPE;
> +
> +	pos = &filp->f_pos;
> +
> +	if (*pos < 0 ||
> +	    check_add_overflow((loff_t)len, *pos, &requested_length))
> +		return -EINVAL;
> +
> +	mutex_lock(&lm_file->lock);
> +
> +	while (len) {
> +		size_t page_offset;
> +		struct page *page;
> +		size_t page_len;
> +		u8 *to_buff;
> +		int err;
> +
> +		page_offset = (*pos) % PAGE_SIZE;
> +		page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
> +		if (!page) {
> +			if (done == 0)
> +				done = -EINVAL;
> +			goto out_unlock;
> +		}
> +
> +		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
> +		to_buff = kmap_local_page(page);
> +		err = copy_from_user(to_buff + page_offset, buf, page_len);
> +		kunmap_local(to_buff);
> +		if (err) {
> +			done = -EFAULT;
> +			goto out_unlock;
> +		}
> +		*pos += page_len;
> +		len -= page_len;
> +		done += page_len;
> +		buf += page_len;
> +		lm_file->size += page_len;
> +	}
> +out_unlock:
> +	mutex_unlock(&lm_file->lock);
> +	return done;
> +}
> +
> +static const struct file_operations
> +pds_vfio_restore_fops = {
> +	.owner = THIS_MODULE,
> +	.write = pds_vfio_restore_write,
> +	.release = pds_vfio_release_file,
> +	.llseek = no_llseek,
> +};
> +
> +static int
> +pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	struct pci_dev *pdev = pds_vfio->pdev;
> +	struct pds_vfio_lm_file *lm_file;
> +	int err = 0;
> +	u64 size;
> +
> +	size = sizeof(union pds_lm_dev_state);
> +
> +	dev_dbg(&pdev->dev, "restore status, size = %lld\n", size);
> +
> +	if (!size) {
> +		err = -EIO;
> +		dev_err(&pdev->dev, "invalid state size");
> +		goto err_get_lm_status;
> +	}
> +
> +	lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
> +				       &pds_vfio_restore_fops,
> +				       O_WRONLY, size);
> +	if (!lm_file) {
> +		err = -ENOENT;
> +		dev_err(&pdev->dev, "failed to create restore file");
> +		goto err_get_lm_file;
> +	}
> +	pds_vfio->restore_file = lm_file;
> +
> +	return 0;
> +
> +err_get_lm_file:
> +err_get_lm_status:
> +	return err;
> +}
> +
> +struct file *
> +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
> +				  enum vfio_device_mig_state next)
> +{
> +	enum vfio_device_mig_state cur = pds_vfio->state;
> +	struct device *dev = &pds_vfio->pdev->dev;
> +	int err = 0;
> +
> +	dev_dbg(dev, "%s => %s\n",
> +		pds_vfio_lm_state(cur), pds_vfio_lm_state(next));
> +
> +	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
> +		/* Device is already stopped
> +		 * create save device data file & get device state from firmware
> +		 */
> +		err = pds_vfio_get_save_file(pds_vfio);
> +		if (err)
> +			return ERR_PTR(err);
> +
> +		/* Get device state */
> +		err = pds_vfio_get_lm_state_cmd(pds_vfio);
> +		if (err) {
> +			pds_vfio_put_save_file(pds_vfio);
> +			return ERR_PTR(err);
> +		}
> +
> +		return pds_vfio->save_file->filep;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
> +		/* Device is already stopped
> +		 * delete the save device state file
> +		 */
> +		pds_vfio_put_save_file(pds_vfio);
> +		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
> +						    PDS_LM_STA_NONE);
> +		return NULL;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
> +		/* create resume device data file */
> +		err = pds_vfio_get_restore_file(pds_vfio);
> +		if (err)
> +			return ERR_PTR(err);
> +
> +		return pds_vfio->restore_file->filep;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
> +		/* Set device state */
> +		err = pds_vfio_set_lm_state_cmd(pds_vfio);
> +		if (err)
> +			return ERR_PTR(err);
> +
> +		/* delete resume device data file */
> +		pds_vfio_put_restore_file(pds_vfio);
> +		return NULL;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
> +		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
> +		/* Device should be stopped
> +		 * no interrupts, dma or change in internal state
> +		 */
> +		err = pds_vfio_suspend_device_cmd(pds_vfio);
> +		if (err)
> +			return ERR_PTR(err);
> +
> +		return NULL;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
> +		/* Device should be functional
> +		 * interrupts, dma, mmio or changes to internal state is allowed
> +		 */
> +		err = pds_vfio_resume_device_cmd(pds_vfio);
> +		if (err)
> +			return ERR_PTR(err);
> +
> +		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
> +						    PDS_LM_STA_NONE);
> +		return NULL;
> +	}
> +
> +	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P)
> +		return NULL;
> +
> +	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP)
> +		return NULL;
> +
> +	return ERR_PTR(-EINVAL);
> +}
> diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h
> new file mode 100644
> index 000000000000..3dd97b807db6
> --- /dev/null
> +++ b/drivers/vfio/pci/pds/lm.h
> @@ -0,0 +1,43 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2022 Pensando Systems, Inc */
> +
> +#ifndef _LM_H_
> +#define _LM_H_
> +
> +#include <linux/fs.h>
> +#include <linux/mutex.h>
> +#include <linux/scatterlist.h>
> +#include <linux/types.h>
> +#include <linux/vfio.h>
> +
> +#include <linux/pds/pds_lm.h>
> +
> +struct pds_vfio_lm_file {
> +	struct file *filep;
> +	struct mutex lock;	/* protect live migration data file */
> +	u64 size;		/* Size with valid data */
> +	u64 alloc_size;		/* Total allocated size. Always >= len */
> +	struct page **pages;	/* Backing pages for file */
> +	unsigned long long npages;
> +	struct sg_table sg_table;	/* SG table for backing pages */
> +	struct pds_lm_sg_elem *sgl;	/* DMA mapping */
> +	dma_addr_t sgl_addr;
> +	u16 num_sge;
> +	struct scatterlist *last_offset_sg;	/* Iterator */
> +	unsigned int sg_last_entry;
> +	unsigned long last_offset;
> +};
> +
> +struct pds_vfio_pci_device;
> +
> +struct file *
> +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
> +				  enum vfio_device_mig_state next);
> +const char *
> +pds_vfio_lm_state(enum vfio_device_mig_state state);
> +void
> +pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
> +void
> +pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);
> +
> +#endif /* _LM_H_ */
> diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
> index d6ad15719ec4..aafa013ee8c2 100644
> --- a/drivers/vfio/pci/pds/pci_drv.c
> +++ b/drivers/vfio/pci/pds/pci_drv.c
> @@ -67,12 +67,27 @@ pds_vfio_pci_table[] = {
>  };
>  MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);
>  
> +static void
> +pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +	struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
> +
> +	pds_vfio_reset(pds_vfio);
> +}
> +
> +static const struct
> +pci_error_handlers pds_vfio_pci_err_handlers = {
> +	.reset_done = pds_vfio_pci_aer_reset_done,
> +	.error_detected = vfio_pci_core_aer_err_detected,
> +};
> +
>  static struct pci_driver
>  pds_vfio_pci_driver = {
>  	.name = PDS_VFIO_DRV_NAME,
>  	.id_table = pds_vfio_pci_table,
>  	.probe = pds_vfio_pci_probe,
>  	.remove = pds_vfio_pci_remove,
> +	.err_handler = &pds_vfio_pci_err_handlers,
>  	.driver_managed_dma = true,
>  };
>  
> diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
> index 30c3bb47a2be..d28f96f93097 100644
> --- a/drivers/vfio/pci/pds/vfio_dev.c
> +++ b/drivers/vfio/pci/pds/vfio_dev.c
> @@ -4,6 +4,7 @@
>  #include <linux/vfio.h>
>  #include <linux/vfio_pci_core.h>
>  
> +#include "lm.h"
>  #include "vfio_dev.h"
>  #include "aux_drv.h"
>  
> @@ -16,6 +17,97 @@ pds_vfio_pci_drvdata(struct pci_dev *pdev)
>  			    vfio_coredev);
>  }
>  
> +static void
> +pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
> +{
> +again:
> +	spin_lock(&pds_vfio->reset_lock);
> +	if (pds_vfio->deferred_reset) {
> +		pds_vfio->deferred_reset = false;
> +		if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
> +			pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
> +			pds_vfio_put_restore_file(pds_vfio);
> +			pds_vfio_put_save_file(pds_vfio);
> +		}
> +		spin_unlock(&pds_vfio->reset_lock);
> +		goto again;
> +	}
> +	mutex_unlock(&pds_vfio->state_mutex);
> +	spin_unlock(&pds_vfio->reset_lock);
> +}
> +
> +void
> +pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
> +{
> +	spin_lock(&pds_vfio->reset_lock);
> +	pds_vfio->deferred_reset = true;
> +	if (!mutex_trylock(&pds_vfio->state_mutex)) {
> +		spin_unlock(&pds_vfio->reset_lock);
> +		return;
> +	}
> +	spin_unlock(&pds_vfio->reset_lock);
> +	pds_vfio_state_mutex_unlock(pds_vfio);
> +}
> +
> +static struct file *
> +pds_vfio_set_device_state(struct vfio_device *vdev,
> +			  enum vfio_device_mig_state new_state)
> +{
> +	struct pds_vfio_pci_device *pds_vfio =
> +		container_of(vdev, struct pds_vfio_pci_device,
> +			     vfio_coredev.vdev);
> +	struct file *res = NULL;
> +
> +	if (!pds_vfio->vfio_aux)
> +		return ERR_PTR(-ENODEV);
> +
> +	mutex_lock(&pds_vfio->state_mutex);
> +	while (new_state != pds_vfio->state) {
> +		enum vfio_device_mig_state next_state;
> +
> +		int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
> +						  new_state, &next_state);
> +		if (err) {
> +			res = ERR_PTR(err);
> +			break;
> +		}
> +
> +		res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
> +		if (IS_ERR(res))
> +			break;
> +
> +		pds_vfio->state = next_state;
> +
> +		if (WARN_ON(res && new_state != pds_vfio->state)) {
> +			res = ERR_PTR(-EINVAL);
> +			break;
> +		}
> +	}
> +	pds_vfio_state_mutex_unlock(pds_vfio);
> +
> +	return res;
> +}
> +
> +static int
> +pds_vfio_get_device_state(struct vfio_device *vdev,
> +			  enum vfio_device_mig_state *current_state)
> +{
> +	struct pds_vfio_pci_device *pds_vfio =
> +		container_of(vdev, struct pds_vfio_pci_device,
> +			     vfio_coredev.vdev);
> +
> +	mutex_lock(&pds_vfio->state_mutex);
> +	*current_state = pds_vfio->state;
> +	pds_vfio_state_mutex_unlock(pds_vfio);
> +	return 0;
> +}
> +
> +static const struct vfio_migration_ops
> +pds_vfio_lm_ops = {
> +	.migration_set_state = pds_vfio_set_device_state,
> +	.migration_get_state = pds_vfio_get_device_state
> +};
> +
>  static int
>  pds_vfio_init_device(struct vfio_device *vdev)
>  {
> @@ -35,10 +127,19 @@ pds_vfio_init_device(struct vfio_device *vdev)
>  	vfio_aux = pds_vfio_aux_get_drvdata(pds_vfio->pci_id);
>  	if (vfio_aux) {
>  		vfio_aux->pds_vfio = pds_vfio;
> +		pds_vfio->coredev = vfio_aux->padev->aux_dev.dev.parent;
>  		pds_vfio->vfio_aux = vfio_aux;
>  		pds_vfio_put_aux_dev(vfio_aux);
>  	}
>  
> +	vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
> +	vdev->mig_ops = &pds_vfio_lm_ops;
> +
> +	dev_dbg(&pdev->dev, "%s: PF %#04x VF %#04x (%d) vf_id %d domain %d vfio_aux %p pds_vfio %p\n",
> +		__func__, pci_dev_id(pdev->physfn),
> +		pds_vfio->pci_id, pds_vfio->pci_id, pds_vfio->vf_id,
> +		pci_domain_nr(pdev->bus), pds_vfio->vfio_aux, pds_vfio);
> +
>  	return 0;
>  }
>  
> @@ -54,18 +155,34 @@ pds_vfio_open_device(struct vfio_device *vdev)
>  	if (err)
>  		return err;
>  
> +	mutex_init(&pds_vfio->state_mutex);
> +	dev_dbg(&pds_vfio->pdev->dev, "%s: %s => VFIO_DEVICE_STATE_RUNNING\n",
> +		__func__, pds_vfio_lm_state(pds_vfio->state));
> +	pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
> +
>  	vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
>  
>  	return 0;
>  }
>  
> +static void
> +pds_vfio_close_device(struct vfio_device *vdev)
> +{
> +	struct pds_vfio_pci_device *pds_vfio =
> +		container_of(vdev, struct pds_vfio_pci_device,
> +			     vfio_coredev.vdev);
> +
> +	mutex_destroy(&pds_vfio->state_mutex);
> +	vfio_pci_core_close_device(vdev);
> +}
> +
>  static const struct vfio_device_ops
>  pds_vfio_ops = {
>  	.name = "pds-vfio",
>  	.init = pds_vfio_init_device,
>  	.release = vfio_pci_core_release_dev,
>  	.open_device = pds_vfio_open_device,
> -	.close_device = vfio_pci_core_close_device,
> +	.close_device = pds_vfio_close_device,
>  	.ioctl = vfio_pci_core_ioctl,
>  	.device_feature = vfio_pci_core_ioctl_feature,
>  	.read = vfio_pci_core_read,
> diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
> index b16668693e1f..a09570eec6fa 100644
> --- a/drivers/vfio/pci/pds/vfio_dev.h
> +++ b/drivers/vfio/pci/pds/vfio_dev.h
> @@ -7,10 +7,20 @@
>  #include <linux/pci.h>
>  #include <linux/vfio_pci_core.h>
>  
> +#include "lm.h"
> +
>  struct pds_vfio_pci_device {
>  	struct vfio_pci_core_device vfio_coredev;
>  	struct pci_dev *pdev;
>  	struct pds_vfio_aux *vfio_aux;
> +	struct device *coredev;
> +
> +	struct pds_vfio_lm_file *save_file;
> +	struct pds_vfio_lm_file *restore_file;
> +	struct mutex state_mutex; /* protect migration state */
> +	enum vfio_device_mig_state state;
> +	spinlock_t reset_lock; /* protect reset_done flow */
> +	u8 deferred_reset;
>  
>  	int vf_id;
>  	int pci_id;
> @@ -20,5 +30,7 @@ const struct vfio_device_ops *
>  pds_vfio_ops_info(void);
>  struct pds_vfio_pci_device *
>  pds_vfio_pci_drvdata(struct pci_dev *pdev);
> +void
> +pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
>  
>  #endif /* _VFIO_DEV_H_ */
> diff --git a/include/linux/pds/pds_lm.h b/include/linux/pds/pds_lm.h
> index fdaf2bf71d35..28ebd62f7583 100644
> --- a/include/linux/pds/pds_lm.h
> +++ b/include/linux/pds/pds_lm.h
> @@ -8,5 +8,211 @@
>  
>  #define PDS_DEV_TYPE_LM_STR	"LM"
>  #define PDS_LM_DEV_NAME		PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR
> +#define PDS_LM_DEVICE_STATE_LENGTH		65536
> +#define PDS_LM_CHECK_DEVICE_STATE_LENGTH(X)					\
> +			PDS_CORE_SIZE_CHECK(union, PDS_LM_DEVICE_STATE_LENGTH, X)
> +
> +/*
> + * enum pds_lm_cmd_opcode - Live Migration Device commands
> + */
> +enum pds_lm_cmd_opcode {
> +	PDS_LM_CMD_HOST_VF_STATUS  = 1,
> +
> +	/* Device state commands */
> +	PDS_LM_CMD_STATUS          = 16,
> +	PDS_LM_CMD_SUSPEND         = 18,
> +	PDS_LM_CMD_SUSPEND_STATUS  = 19,
> +	PDS_LM_CMD_RESUME          = 20,
> +	PDS_LM_CMD_SAVE            = 21,
> +	PDS_LM_CMD_RESTORE         = 22,
> +};
> +
> +/**
> + * struct pds_lm_cmd - generic command
> + * @opcode:	Opcode
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + * @rsvd2:	Structure padding to 60 Bytes
> + */
> +struct pds_lm_cmd {
> +	u8     opcode;
> +	u8     rsvd;
> +	__le16 vf_id;
> +	u8     rsvd2[56];
> +};
> +
> +/**
> + * struct pds_lm_comp - generic command completion
> + * @status:	Status of the command (enum pds_core_status_code)
> + * @rsvd:	Structure padding to 16 Bytes
> + */
> +struct pds_lm_comp {
> +	u8 status;
> +	u8 rsvd[15];
> +};
> +
> +/**
> + * struct pds_lm_status_cmd - STATUS command
> + * @opcode:	Opcode
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + */
> +struct pds_lm_status_cmd {
> +	u8     opcode;
> +	u8     rsvd;
> +	__le16 vf_id;
> +};
> +
> +/**
> + * struct pds_lm_status_comp - STATUS command completion
> + * @status:		Status of the command (enum pds_core_status_code)
> + * @rsvd:		Word boundary padding
> + * @comp_index:		Index in the desc ring for which this is the completion
> + * @size:		Size of the device state
> + * @rsvd2:		Word boundary padding
> + * @color:		Color bit
> + */
> +struct pds_lm_status_comp {
> +	u8     status;
> +	u8     rsvd;
> +	__le16 comp_index;
> +	union {
> +		__le64 size;
> +		u8     rsvd2[11];
> +	} __packed;
> +	u8     color;
> +};
> +
> +/**
> + * struct pds_lm_suspend_cmd - SUSPEND command
> + * @opcode:	Opcode PDS_LM_CMD_SUSPEND
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + */
> +struct pds_lm_suspend_cmd {
> +	u8     opcode;
> +	u8     rsvd;
> +	__le16 vf_id;
> +};
> +
> +/**
> + * struct pds_lm_suspend_comp - SUSPEND command completion
> + * @status:		Status of the command (enum pds_core_status_code)
> + * @rsvd:		Word boundary padding
> + * @comp_index:		Index in the desc ring for which this is the completion
> + * @state_size:		Size of the device state computed post suspend.
> + * @rsvd2:		Word boundary padding
> + * @color:		Color bit
> + */
> +struct pds_lm_suspend_comp {
> +	u8     status;
> +	u8     rsvd;
> +	__le16 comp_index;
> +	union {
> +		__le64 state_size;
> +		u8     rsvd2[11];
> +	} __packed;
> +	u8     color;
> +};
> +
> +/**
> + * struct pds_lm_suspend_status_cmd - SUSPEND status command
> + * @opcode:	Opcode PDS_AQ_CMD_LM_SUSPEND_STATUS
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + */
> +struct pds_lm_suspend_status_cmd {
> +	u8 opcode;
> +	u8 rsvd;
> +	__le16 vf_id;
> +};
> +
> +/**
> + * struct pds_lm_resume_cmd - RESUME command
> + * @opcode:	Opcode PDS_LM_CMD_RESUME
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + */
> +struct pds_lm_resume_cmd {
> +	u8     opcode;
> +	u8     rsvd;
> +	__le16 vf_id;
> +};
> +
> +/**
> + * struct pds_lm_sg_elem - Transmit scatter-gather (SG) descriptor element
> + * @addr:	DMA address of SG element data buffer
> + * @len:	Length of SG element data buffer, in bytes
> + * @rsvd:	Word boundary padding
> + */
> +struct pds_lm_sg_elem {
> +	__le64 addr;
> +	__le32 len;
> +	__le16 rsvd[2];
> +};
> +
> +/**
> + * struct pds_lm_save_cmd - SAVE command
> + * @opcode:	Opcode PDS_LM_CMD_SAVE
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + * @rsvd2:	Word boundary padding
> + * @sgl_addr:	IOVA address of the SGL to dma the device state
> + * @num_sge:	Total number of SG elements
> + */
> +struct pds_lm_save_cmd {
> +	u8     opcode;
> +	u8     rsvd;
> +	__le16 vf_id;
> +	u8     rsvd2[4];
> +	__le64 sgl_addr;
> +	__le32 num_sge;
> +} __packed;
> +
> +/**
> + * struct pds_lm_restore_cmd - RESTORE command
> + * @opcode:	Opcode PDS_LM_CMD_RESTORE
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + * @rsvd2:	Word boundary padding
> + * @sgl_addr:	IOVA address of the SGL to dma the device state
> + * @num_sge:	Total number of SG elements
> + */
> +struct pds_lm_restore_cmd {
> +	u8     opcode;
> +	u8     rsvd;
> +	__le16 vf_id;
> +	u8     rsvd2[4];
> +	__le64 sgl_addr;
> +	__le32 num_sge;
> +} __packed;
> +
> +/**
> + * union pds_lm_dev_state - device state information
> + * @words:	Device state words
> + */
> +union pds_lm_dev_state {
> +	__le32 words[PDS_LM_DEVICE_STATE_LENGTH / sizeof(__le32)];
> +};
> +
> +enum pds_lm_host_vf_status {
> +	PDS_LM_STA_NONE = 0,
> +	PDS_LM_STA_IN_PROGRESS,
> +	PDS_LM_STA_MAX,
> +};
> +
> +/**
> + * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
> + * @opcode:	Opcode PDS_LM_CMD_HOST_VF_STATUS
> + * @rsvd:	Word boundary padding
> + * @vf_id:	VF id
> + * @status:	Current LM status of host VF driver (enum pds_lm_host_status)
> + */
> +struct pds_lm_host_vf_status_cmd {
> +	u8     opcode;
> +	u8     rsvd;
> +	__le16 vf_id;
> +	u8     status;
> +};
>  
>  #endif /* _PDS_LM_H_ */
>
Nelson, Shannon Jan. 3, 2023, 8:23 p.m. UTC | #2
On 1/3/23 4:25 AM, liulongfang wrote:
> On 2022/12/15 7:21, Brett Creeley wrote:
>> Add live migration support via the VFIO subsystem. The migration
>> implementation aligns with the definition from uapi/vfio.h and uses the
>> auxiliary bus connection between pds_core/pds_vfio to communicate with
>> the device.
>>
>> The ability to suspend, resume, and transfer VF device state data is
>> included along with the required admin queue command structures and
>> implementations.
>>
>> PDS_LM_CMD_SUSPEND and PDS_LM_CMD_SUSPEND_STATUS are added to support
>> the VF device suspend operation.
>>
>> PDS_LM_CMD_RESUME is added to support the VF device resume operation.
>>
>> PDS_LM_CMD_STATUS is added to determine the exact size of the VF
>> device state data.
>>
>> PDS_LM_CMD_SAVE is added to get the VF device state data.
>>
> 
> Hi, this PDS device is an NVMe storage disk. When it supports live
> migration, the amount of data stored on the disk can be very large.
> How do you migrate such a large amount of disk data in a short period
> of time?

In our case we are not providing the actual storage, but a fabric 
initiator to a separate remote target that the migration target can also 
access.

sln


> 
> Thanks,
> Longfang.
> 
>> PDS_LM_CMD_RESTORE is added to restore the VF device with the
>> previously saved data from PDS_LM_CMD_SAVE.
>>
>> PDS_LM_CMD_HOST_VF_STATUS is added to notify the device when
>> a migration is in/not-in progress from the host's perspective.
>>
>> Signed-off-by: Brett Creeley <brett.creeley@amd.com>
>> Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
>> ---
>>   drivers/vfio/pci/pds/Makefile   |   1 +
>>   drivers/vfio/pci/pds/aux_drv.c  |   1 +
>>   drivers/vfio/pci/pds/cmds.c     | 312 ++++++++++++++++++++
>>   drivers/vfio/pci/pds/cmds.h     |  15 +
>>   drivers/vfio/pci/pds/lm.c       | 491 ++++++++++++++++++++++++++++++++
>>   drivers/vfio/pci/pds/lm.h       |  43 +++
>>   drivers/vfio/pci/pds/pci_drv.c  |  15 +
>>   drivers/vfio/pci/pds/vfio_dev.c | 119 +++++++-
>>   drivers/vfio/pci/pds/vfio_dev.h |  12 +
>>   include/linux/pds/pds_lm.h      | 206 ++++++++++++++
>>   10 files changed, 1214 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/vfio/pci/pds/lm.c
>>   create mode 100644 drivers/vfio/pci/pds/lm.h
>>
>> diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile
>> index 49bf8289739b..b4980323cff4 100644
>> --- a/drivers/vfio/pci/pds/Makefile
>> +++ b/drivers/vfio/pci/pds/Makefile
>> @@ -4,5 +4,6 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds_vfio.o
>>   pds_vfio-y := \
>>        aux_drv.o       \
>>        cmds.o          \
>> +     lm.o            \
>>        pci_drv.o       \
>>        vfio_dev.o
>> diff --git a/drivers/vfio/pci/pds/aux_drv.c b/drivers/vfio/pci/pds/aux_drv.c
>> index 494551894926..59b43075e85f 100644
>> --- a/drivers/vfio/pci/pds/aux_drv.c
>> +++ b/drivers/vfio/pci/pds/aux_drv.c
>> @@ -77,6 +77,7 @@ pds_vfio_aux_probe(struct auxiliary_device *aux_dev,
>>        dev_dbg(dev, "%s: id %#04x busnr %#x devfn %#x bus %p pds_vfio %p\n",
>>                __func__, padev->id, busnr, devfn, bus, vfio_aux->pds_vfio);
>>
>> +     vfio_aux->pds_vfio->coredev = aux_dev->dev.parent;
>>        vfio_aux->pds_vfio->vfio_aux = vfio_aux;
>>
>>        vfio_aux->padrv.event_handler = pds_vfio_aux_notify_handler;
>> diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c
>> index 5a3fadcd38d8..11823d824ccc 100644
>> --- a/drivers/vfio/pci/pds/cmds.c
>> +++ b/drivers/vfio/pci/pds/cmds.c
>> @@ -3,9 +3,11 @@
>>
>>   #include <linux/io.h>
>>   #include <linux/types.h>
>> +#include <linux/delay.h>
>>
>>   #include <linux/pds/pds_core_if.h>
>>   #include <linux/pds/pds_adminq.h>
>> +#include <linux/pds/pds_lm.h>
>>   #include <linux/pds/pds_auxbus.h>
>>
>>   #include "vfio_dev.h"
>> @@ -28,3 +30,313 @@ pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
>>
>>        padev->ops->unregister_client(padev);
>>   }
>> +
>> +#define SUSPEND_TIMEOUT_S            5
>> +#define SUSPEND_CHECK_INTERVAL_MS    1
>> +
>> +static int
>> +pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     struct pds_lm_suspend_status_cmd cmd = {
>> +             .opcode = PDS_LM_CMD_SUSPEND_STATUS,
>> +             .vf_id  = cpu_to_le16(pds_vfio->vf_id),
>> +     };
>> +     struct pci_dev *pdev = pds_vfio->pdev;
>> +     struct pds_lm_comp comp = { 0 };
>> +     struct pds_auxiliary_dev *padev;
>> +     unsigned long time_limit;
>> +     unsigned long time_start;
>> +     unsigned long time_done;
>> +     int err;
>> +
>> +     padev = pds_vfio->vfio_aux->padev;
>> +
>> +     time_start = jiffies;
>> +     time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
>> +     do {
>> +             err = padev->ops->adminq_cmd(padev,
>> +                                          (union pds_core_adminq_cmd *)&cmd,
>> +                                          sizeof(cmd),
>> +                                          (union pds_core_adminq_comp *)&comp,
>> +                                           PDS_AQ_FLAG_FASTPOLL);
>> +             if (err != -EAGAIN)
>> +                     break;
>> +
>> +             msleep(SUSPEND_CHECK_INTERVAL_MS);
>> +     } while (time_before(jiffies, time_limit));
>> +
>> +     time_done = jiffies;
>> +     dev_dbg(&pdev->dev, "%s: vf%u: Suspend comp received in %d msecs\n",
>> +             __func__, pds_vfio->vf_id,
>> +             jiffies_to_msecs(time_done - time_start));
>> +
>> +     /* Check the results */
>> +     if (time_after_eq(time_done, time_limit)) {
>> +             dev_err(&pdev->dev, "%s: vf%u: Suspend comp timeout\n", __func__,
>> +                     pds_vfio->vf_id);
>> +             err = -ETIMEDOUT;
>> +     }
>> +
>> +     return err;
>> +}
>> +
>> +int
>> +pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     struct pds_lm_suspend_cmd cmd = {
>> +             .opcode = PDS_LM_CMD_SUSPEND,
>> +             .vf_id = cpu_to_le16(pds_vfio->vf_id),
>> +     };
>> +     struct pds_lm_suspend_comp comp = {0};
>> +     struct pci_dev *pdev = pds_vfio->pdev;
>> +     struct pds_auxiliary_dev *padev;
>> +     int err;
>> +
>> +     dev_dbg(&pdev->dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
>> +
>> +     padev = pds_vfio->vfio_aux->padev;
>> +     err = padev->ops->adminq_cmd(padev,
>> +                                  (union pds_core_adminq_cmd *)&cmd,
>> +                                  sizeof(cmd),
>> +                                  (union pds_core_adminq_comp *)&comp,
>> +                                  PDS_AQ_FLAG_FASTPOLL);
>> +     if (err) {
>> +             dev_err(&pdev->dev, "vf%u: Suspend failed: %pe\n",
>> +                     pds_vfio->vf_id, ERR_PTR(err));
>> +             return err;
>> +     }
>> +
>> +     return pds_vfio_suspend_wait_device_cmd(pds_vfio);
>> +}
>> +
>> +int
>> +pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     struct pds_lm_resume_cmd cmd = {
>> +             .opcode = PDS_LM_CMD_RESUME,
>> +             .vf_id = cpu_to_le16(pds_vfio->vf_id),
>> +     };
>> +     struct pds_auxiliary_dev *padev;
>> +     struct pds_lm_comp comp = {0};
>> +
>> +     dev_dbg(&pds_vfio->pdev->dev, "vf%u: Resume device\n", pds_vfio->vf_id);
>> +
>> +     padev = pds_vfio->vfio_aux->padev;
>> +     return padev->ops->adminq_cmd(padev,
>> +                                  (union pds_core_adminq_cmd *)&cmd,
>> +                                  sizeof(cmd),
>> +                                  (union pds_core_adminq_comp *)&comp,
>> +                                  0);
>> +}
>> +
>> +int
>> +pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
>> +{
>> +     struct pds_lm_status_cmd cmd = {
>> +             .opcode = PDS_LM_CMD_STATUS,
>> +             .vf_id = cpu_to_le16(pds_vfio->vf_id),
>> +     };
>> +     struct pds_lm_status_comp comp = {0};
>> +     struct pds_auxiliary_dev *padev;
>> +     int err = 0;
>> +
>> +     dev_dbg(&pds_vfio->pdev->dev, "vf%u: Get migration status\n",
>> +             pds_vfio->vf_id);
>> +
>> +     padev = pds_vfio->vfio_aux->padev;
>> +     err = padev->ops->adminq_cmd(padev,
>> +                                  (union pds_core_adminq_cmd *)&cmd,
>> +                                  sizeof(cmd),
>> +                                  (union pds_core_adminq_comp *)&comp,
>> +                                  0);
>> +     if (err)
>> +             return err;
>> +
>> +     *size = le64_to_cpu(comp.size);
>> +     return 0;
>> +}
>> +
>> +static int
>> +pds_vfio_dma_map_lm_file(struct device *dev, enum dma_data_direction dir,
>> +                      struct pds_vfio_lm_file *lm_file)
>> +{
>> +     struct pds_lm_sg_elem *sgl, *sge;
>> +     struct scatterlist *sg;
>> +     int err = 0;
>> +     int i;
>> +
>> +     if (!lm_file)
>> +             return -EINVAL;
>> +
>> +     /* dma map file pages */
>> +     err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
>> +     if (err)
>> +             goto err_dma_map_sg;
>> +
>> +     lm_file->num_sge = lm_file->sg_table.nents;
>> +
>> +     /* alloc sgl */
>> +     sgl = dma_alloc_coherent(dev, lm_file->num_sge *
>> +                              sizeof(struct pds_lm_sg_elem),
>> +                              &lm_file->sgl_addr, GFP_KERNEL);
>> +     if (!sgl) {
>> +             err = -ENOMEM;
>> +             goto err_alloc_sgl;
>> +     }
>> +
>> +     lm_file->sgl = sgl;
>> +
>> +     /* fill sgl */
>> +     sge = sgl;
>> +     for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
>> +             sge->addr = cpu_to_le64(sg_dma_address(sg));
>> +             sge->len  = cpu_to_le32(sg_dma_len(sg));
>> +             dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
>> +             sge++;
>> +     }
>> +
>> +     return 0;
>> +
>> +err_alloc_sgl:
>> +     dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
>> +err_dma_map_sg:
>> +     return err;
>> +}
>> +
>> +static void
>> +pds_vfio_dma_unmap_lm_file(struct device *dev, enum dma_data_direction dir,
>> +                        struct pds_vfio_lm_file *lm_file)
>> +{
>> +     if (!lm_file)
>> +             return;
>> +
>> +     /* free sgl */
>> +     if (lm_file->sgl) {
>> +             dma_free_coherent(dev, lm_file->num_sge *
>> +                               sizeof(struct pds_lm_sg_elem),
>> +                               lm_file->sgl, lm_file->sgl_addr);
>> +             lm_file->sgl = NULL;
>> +             lm_file->sgl_addr = DMA_MAPPING_ERROR;
>> +             lm_file->num_sge = 0;
>> +     }
>> +
>> +     /* dma unmap file pages */
>> +     dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
>> +}
>> +
>> +int
>> +pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     struct pds_lm_save_cmd cmd = {
>> +             .opcode = PDS_LM_CMD_SAVE,
>> +             .vf_id = cpu_to_le16(pds_vfio->vf_id),
>> +     };
>> +     struct pci_dev *pdev = pds_vfio->pdev;
>> +     struct pds_vfio_lm_file *lm_file;
>> +     struct pds_auxiliary_dev *padev;
>> +     struct pds_lm_comp comp = {0};
>> +     int err;
>> +
>> +     dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
>> +
>> +     lm_file = pds_vfio->save_file;
>> +
>> +     padev = pds_vfio->vfio_aux->padev;
>> +     err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
>> +     if (err) {
>> +             dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
>> +                     ERR_PTR(err));
>> +             return err;
>> +     }
>> +
>> +     cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
>> +     cmd.num_sge = cpu_to_le32(lm_file->num_sge);
>> +
>> +     err = padev->ops->adminq_cmd(padev,
>> +                                  (union pds_core_adminq_cmd *)&cmd,
>> +                                  sizeof(cmd),
>> +                                  (union pds_core_adminq_comp *)&comp,
>> +                                  0);
>> +     if (err)
>> +             dev_err(&pdev->dev, "failed to get migration state: %pe\n",
>> +                     ERR_PTR(err));
>> +
>> +     pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
>> +
>> +     return err;
>> +}
>> +
>> +int
>> +pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     struct pds_lm_restore_cmd cmd = {
>> +             .opcode = PDS_LM_CMD_RESTORE,
>> +             .vf_id = cpu_to_le16(pds_vfio->vf_id),
>> +     };
>> +     struct pci_dev *pdev = pds_vfio->pdev;
>> +     struct pds_vfio_lm_file *lm_file;
>> +     struct pds_auxiliary_dev *padev;
>> +     struct pds_lm_comp comp = {0};
>> +     int err;
>> +
>> +     dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
>> +
>> +     lm_file = pds_vfio->restore_file;
>> +
>> +     padev = pds_vfio->vfio_aux->padev;
>> +     err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
>> +     if (err) {
>> +             dev_err(&pdev->dev, "failed to map restore migration file: %pe\n",
>> +                     ERR_PTR(err));
>> +             return err;
>> +     }
>> +
>> +     cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
>> +     cmd.num_sge = cpu_to_le32(lm_file->num_sge);
>> +
>> +     err = padev->ops->adminq_cmd(padev,
>> +                                  (union pds_core_adminq_cmd *)&cmd,
>> +                                  sizeof(cmd),
>> +                                  (union pds_core_adminq_comp *)&comp,
>> +                                  0);
>> +     if (err)
>> +             dev_err(&pdev->dev, "failed to set migration state: %pe\n",
>> +                     ERR_PTR(err));
>> +
>> +     pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
>> +
>> +     return err;
>> +}
>> +
>> +void
>> +pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
>> +                                 enum pds_lm_host_vf_status vf_status)
>> +{
>> +     struct pds_auxiliary_dev *padev = pds_vfio->vfio_aux->padev;
>> +     struct pds_lm_host_vf_status_cmd cmd = {
>> +             .opcode = PDS_LM_CMD_HOST_VF_STATUS,
>> +             .vf_id = cpu_to_le16(pds_vfio->vf_id),
>> +             .status = vf_status,
>> +     };
>> +     struct pci_dev *pdev = pds_vfio->pdev;
>> +     struct pds_lm_comp comp = {0};
>> +     int err;
>> +
>> +     dev_dbg(&pdev->dev, "vf%u: Set host VF LM status: %u",
>> +             pds_vfio->vf_id, cmd.status);
>> +     if (vf_status != PDS_LM_STA_IN_PROGRESS &&
>> +         vf_status != PDS_LM_STA_NONE) {
>> +             dev_warn(&pdev->dev, "Invalid host VF migration status, %d\n",
>> +                      vf_status);
>> +             return;
>> +     }
>> +
>> +     err = padev->ops->adminq_cmd(padev,
>> +                                  (union pds_core_adminq_cmd *)&cmd,
>> +                                  sizeof(cmd),
>> +                                  (union pds_core_adminq_comp *)&comp,
>> +                                  0);
>> +     if (err)
>> +             dev_warn(&pdev->dev, "failed to send host VF migration status: %pe\n",
>> +                      ERR_PTR(err));
>> +}
>> diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h
>> index 7fe2d1efd894..5f9ac45ee5a3 100644
>> --- a/drivers/vfio/pci/pds/cmds.h
>> +++ b/drivers/vfio/pci/pds/cmds.h
>> @@ -4,11 +4,26 @@
>>   #ifndef _CMDS_H_
>>   #define _CMDS_H_
>>
>> +#include <linux/pds/pds_lm.h>
>> +
>>   struct pds_vfio_pci_device;
>>
>>   int
>>   pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
>>   void
>>   pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
>> +int
>> +pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio);
>> +int
>> +pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio);
>> +int
>> +pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
>> +int
>> +pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
>> +int
>> +pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
>> +void
>> +pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
>> +                                 enum pds_lm_host_vf_status vf_status);
>>
>>   #endif /* _CMDS_H_ */
>> diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
>> new file mode 100644
>> index 000000000000..86152b18a5b7
>> --- /dev/null
>> +++ b/drivers/vfio/pci/pds/lm.c
>> @@ -0,0 +1,491 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright(c) 2022 Pensando Systems, Inc */
>> +
>> +#include <linux/anon_inodes.h>
>> +#include <linux/file.h>
>> +#include <linux/fs.h>
>> +#include <linux/highmem.h>
>> +#include <linux/vfio.h>
>> +#include <linux/vfio_pci_core.h>
>> +
>> +#include "cmds.h"
>> +#include "vfio_dev.h"
>> +
>> +#define PDS_VFIO_LM_FILENAME "pds_vfio_lm"
>> +
>> +const char *
>> +pds_vfio_lm_state(enum vfio_device_mig_state state)
>> +{
>> +     switch (state) {
>> +     case VFIO_DEVICE_STATE_ERROR:
>> +             return "VFIO_DEVICE_STATE_ERROR";
>> +     case VFIO_DEVICE_STATE_STOP:
>> +             return "VFIO_DEVICE_STATE_STOP";
>> +     case VFIO_DEVICE_STATE_RUNNING:
>> +             return "VFIO_DEVICE_STATE_RUNNING";
>> +     case VFIO_DEVICE_STATE_STOP_COPY:
>> +             return "VFIO_DEVICE_STATE_STOP_COPY";
>> +     case VFIO_DEVICE_STATE_RESUMING:
>> +             return "VFIO_DEVICE_STATE_RESUMING";
>> +     case VFIO_DEVICE_STATE_RUNNING_P2P:
>> +             return "VFIO_DEVICE_STATE_RUNNING_P2P";
>> +     default:
>> +             return "VFIO_DEVICE_STATE_INVALID";
>> +     }
>> +
>> +     return "VFIO_DEVICE_STATE_INVALID";
>> +}
>> +
>> +static struct pds_vfio_lm_file *
>> +pds_vfio_get_lm_file(const char *name, const struct file_operations *fops,
>> +                  int flags, u64 size)
>> +{
>> +     struct pds_vfio_lm_file *lm_file = NULL;
>> +     unsigned long long npages;
>> +     unsigned long long i;
>> +     struct page **pages;
>> +     int err = 0;
>> +
>> +     if (!size)
>> +             return NULL;
>> +
>> +     /* Alloc file structure */
>> +     lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
>> +     if (!lm_file)
>> +             return NULL;
>> +
>> +     /* Create file */
>> +     lm_file->filep = anon_inode_getfile(name, fops, lm_file, flags);
>> +     if (!lm_file->filep)
>> +             goto err_get_file;
>> +
>> +     stream_open(lm_file->filep->f_inode, lm_file->filep);
>> +     mutex_init(&lm_file->lock);
>> +
>> +     lm_file->size = size;
>> +
>> +     /* Allocate memory for file pages */
>> +     npages = DIV_ROUND_UP_ULL(lm_file->size, PAGE_SIZE);
>> +
>> +     pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
>> +     if (!pages)
>> +             goto err_alloc_pages;
>> +
>> +     for (i = 0; i < npages; i++) {
>> +             pages[i] = alloc_page(GFP_KERNEL);
>> +             if (!pages[i])
>> +                     goto err_alloc_page;
>> +     }
>> +
>> +     lm_file->pages = pages;
>> +     lm_file->npages = npages;
>> +     lm_file->alloc_size = npages * PAGE_SIZE;
>> +
>> +     /* Create scatterlist of file pages to use for DMA mapping later */
>> +     err = sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages,
>> +                                     0, size, GFP_KERNEL);
>> +     if (err)
>> +             goto err_alloc_sg_table;
>> +
>> +     return lm_file;
>> +
>> +err_alloc_sg_table:
>> +err_alloc_page:
>> +     /* free allocated pages */
>> +     for (i = 0; i < npages && pages[i]; i++)
>> +             __free_page(pages[i]);
>> +     kfree(pages);
>> +err_alloc_pages:
>> +     fput(lm_file->filep);
>> +     mutex_destroy(&lm_file->lock);
>> +err_get_file:
>> +     kfree(lm_file);
>> +
>> +     return NULL;
>> +}
>> +
>> +static void
>> +pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
>> +{
>> +     unsigned long long i;
>> +
>> +     mutex_lock(&lm_file->lock);
>> +
>> +     lm_file->size = 0;
>> +     lm_file->alloc_size = 0;
>> +
>> +     /* Free scatter list of file pages */
>> +     sg_free_table(&lm_file->sg_table);
>> +
>> +     /* Free allocated file pages */
>> +     for (i = 0; i < lm_file->npages && lm_file->pages[i]; i++)
>> +             __free_page(lm_file->pages[i]);
>> +     kfree(lm_file->pages);
>> +     lm_file->pages = NULL;
>> +
>> +     /* Delete file */
>> +     fput(lm_file->filep);
>> +     lm_file->filep = NULL;
>> +
>> +     mutex_unlock(&lm_file->lock);
>> +
>> +     mutex_destroy(&lm_file->lock);
>> +
>> +     /* Free file structure */
>> +     kfree(lm_file);
>> +}
>> +
>> +void
>> +pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     if (!pds_vfio->save_file)
>> +             return;
>> +
>> +     pds_vfio_put_lm_file(pds_vfio->save_file);
>> +     pds_vfio->save_file = NULL;
>> +}
>> +
>> +void
>> +pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     if (!pds_vfio->restore_file)
>> +             return;
>> +
>> +     pds_vfio_put_lm_file(pds_vfio->restore_file);
>> +     pds_vfio->restore_file = NULL;
>> +}
>> +
>> +static struct page *
>> +pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
>> +                    unsigned long offset)
>> +{
>> +     unsigned long cur_offset = 0;
>> +     struct scatterlist *sg;
>> +     unsigned int i;
>> +
>> +     /* All accesses are sequential */
>> +     if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
>> +             lm_file->last_offset = 0;
>> +             lm_file->last_offset_sg = lm_file->sg_table.sgl;
>> +             lm_file->sg_last_entry = 0;
>> +     }
>> +
>> +     cur_offset = lm_file->last_offset;
>> +
>> +     for_each_sg(lm_file->last_offset_sg, sg,
>> +                 lm_file->sg_table.orig_nents - lm_file->sg_last_entry,
>> +                 i) {
>> +             if (offset < sg->length + cur_offset) {
>> +                     lm_file->last_offset_sg = sg;
>> +                     lm_file->sg_last_entry += i;
>> +                     lm_file->last_offset = cur_offset;
>> +                     return nth_page(sg_page(sg),
>> +                                     (offset - cur_offset) / PAGE_SIZE);
>> +             }
>> +             cur_offset += sg->length;
>> +     }
>> +
>> +     return NULL;
>> +}
>> +
>> +static int
>> +pds_vfio_release_file(struct inode *inode, struct file *filp)
>> +{
>> +     struct pds_vfio_lm_file *lm_file = filp->private_data;
>> +
>> +     lm_file->size = 0;
>> +
>> +     return 0;
>> +}
>> +
>> +static ssize_t
>> +pds_vfio_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
>> +{
>> +     struct pds_vfio_lm_file *lm_file = filp->private_data;
>> +     ssize_t done = 0;
>> +
>> +     if (pos)
>> +             return -ESPIPE;
>> +     pos = &filp->f_pos;
>> +
>> +     mutex_lock(&lm_file->lock);
>> +     if (*pos > lm_file->size) {
>> +             done = -EINVAL;
>> +             goto out_unlock;
>> +     }
>> +
>> +     len = min_t(size_t, lm_file->size - *pos, len);
>> +     while (len) {
>> +             size_t page_offset;
>> +             struct page *page;
>> +             size_t page_len;
>> +             u8 *from_buff;
>> +             int err;
>> +
>> +             page_offset = (*pos) % PAGE_SIZE;
>> +             page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
>> +             if (!page) {
>> +                     if (done == 0)
>> +                             done = -EINVAL;
>> +                     goto out_unlock;
>> +             }
>> +
>> +             page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
>> +             from_buff = kmap_local_page(page);
>> +             err = copy_to_user(buf, from_buff + page_offset, page_len);
>> +             kunmap_local(from_buff);
>> +             if (err) {
>> +                     done = -EFAULT;
>> +                     goto out_unlock;
>> +             }
>> +             *pos += page_len;
>> +             len -= page_len;
>> +             done += page_len;
>> +             buf += page_len;
>> +     }
>> +
>> +out_unlock:
>> +     mutex_unlock(&lm_file->lock);
>> +     return done;
>> +}
>> +
>> +static const struct file_operations
>> +pds_vfio_save_fops = {
>> +     .owner = THIS_MODULE,
>> +     .read = pds_vfio_save_read,
>> +     .release = pds_vfio_release_file,
>> +     .llseek = no_llseek,
>> +};
>> +
>> +static int
>> +pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     struct pci_dev *pdev = pds_vfio->pdev;
>> +     struct pds_vfio_lm_file *lm_file;
>> +     int err = 0;
>> +     u64 size;
>> +
>> +     /* Get live migration state size in this state */
>> +     err = pds_vfio_get_lm_status_cmd(pds_vfio, &size);
>> +     if (err) {
>> +             dev_err(&pdev->dev, "failed to get save status: %pe\n",
>> +                     ERR_PTR(err));
>> +             goto err_get_lm_status;
>> +     }
>> +
>> +     dev_dbg(&pdev->dev, "save status, size = %lld\n", size);
>> +
>> +     if (!size) {
>> +             err = -EIO;
>> +             dev_err(&pdev->dev, "invalid state size\n");
>> +             goto err_get_lm_status;
>> +     }
>> +
>> +     lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
>> +                                    &pds_vfio_save_fops,
>> +                                    O_RDONLY, size);
>> +     if (!lm_file) {
>> +             err = -ENOENT;
>> +             dev_err(&pdev->dev, "failed to create save file\n");
>> +             goto err_get_lm_file;
>> +     }
>> +
>> +     dev_dbg(&pdev->dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
>> +             lm_file->size, lm_file->alloc_size, lm_file->npages);
>> +
>> +     pds_vfio->save_file = lm_file;
>> +
>> +     return 0;
>> +
>> +err_get_lm_file:
>> +err_get_lm_status:
>> +     return err;
>> +}
>> +
>> +static ssize_t
>> +pds_vfio_restore_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos)
>> +{
>> +     struct pds_vfio_lm_file *lm_file = filp->private_data;
>> +     loff_t requested_length;
>> +     ssize_t done = 0;
>> +
>> +     if (pos)
>> +             return -ESPIPE;
>> +
>> +     pos = &filp->f_pos;
>> +
>> +     if (*pos < 0 ||
>> +         check_add_overflow((loff_t)len, *pos, &requested_length))
>> +             return -EINVAL;
>> +
>> +     mutex_lock(&lm_file->lock);
>> +
>> +     while (len) {
>> +             size_t page_offset;
>> +             struct page *page;
>> +             size_t page_len;
>> +             u8 *to_buff;
>> +             int err;
>> +
>> +             page_offset = (*pos) % PAGE_SIZE;
>> +             page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
>> +             if (!page) {
>> +                     if (done == 0)
>> +                             done = -EINVAL;
>> +                     goto out_unlock;
>> +             }
>> +
>> +             page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
>> +             to_buff = kmap_local_page(page);
>> +             err = copy_from_user(to_buff + page_offset, buf, page_len);
>> +             kunmap_local(to_buff);
>> +             if (err) {
>> +                     done = -EFAULT;
>> +                     goto out_unlock;
>> +             }
>> +             *pos += page_len;
>> +             len -= page_len;
>> +             done += page_len;
>> +             buf += page_len;
>> +             lm_file->size += page_len;
>> +     }
>> +out_unlock:
>> +     mutex_unlock(&lm_file->lock);
>> +     return done;
>> +}
>> +
>> +static const struct file_operations
>> +pds_vfio_restore_fops = {
>> +     .owner = THIS_MODULE,
>> +     .write = pds_vfio_restore_write,
>> +     .release = pds_vfio_release_file,
>> +     .llseek = no_llseek,
>> +};
>> +
>> +static int
>> +pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     struct pci_dev *pdev = pds_vfio->pdev;
>> +     struct pds_vfio_lm_file *lm_file;
>> +     int err = 0;
>> +     u64 size;
>> +
>> +     size = sizeof(union pds_lm_dev_state);
>> +
>> +     dev_dbg(&pdev->dev, "restore status, size = %lld\n", size);
>> +
>> +     if (!size) {
>> +             err = -EIO;
>> +             dev_err(&pdev->dev, "invalid state size");
>> +             goto err_get_lm_status;
>> +     }
>> +
>> +     lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
>> +                                    &pds_vfio_restore_fops,
>> +                                    O_WRONLY, size);
>> +     if (!lm_file) {
>> +             err = -ENOENT;
>> +             dev_err(&pdev->dev, "failed to create restore file");
>> +             goto err_get_lm_file;
>> +     }
>> +     pds_vfio->restore_file = lm_file;
>> +
>> +     return 0;
>> +
>> +err_get_lm_file:
>> +err_get_lm_status:
>> +     return err;
>> +}
>> +
>> +struct file *
>> +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
>> +                               enum vfio_device_mig_state next)
>> +{
>> +     enum vfio_device_mig_state cur = pds_vfio->state;
>> +     struct device *dev = &pds_vfio->pdev->dev;
>> +     int err = 0;
>> +
>> +     dev_dbg(dev, "%s => %s\n",
>> +             pds_vfio_lm_state(cur), pds_vfio_lm_state(next));
>> +
>> +     if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
>> +             /* Device is already stopped: create the save device data
>> +              * file and get the device state from the firmware
>> +              */
>> +             err = pds_vfio_get_save_file(pds_vfio);
>> +             if (err)
>> +                     return ERR_PTR(err);
>> +
>> +             /* Get device state */
>> +             err = pds_vfio_get_lm_state_cmd(pds_vfio);
>> +             if (err) {
>> +                     pds_vfio_put_save_file(pds_vfio);
>> +                     return ERR_PTR(err);
>> +             }
>> +
>> +             return pds_vfio->save_file->filep;
>> +     }
>> +
>> +     if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
>> +             /* Device is already stopped:
>> +              * delete the save device state file
>> +              */
>> +             pds_vfio_put_save_file(pds_vfio);
>> +             pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
>> +                                                 PDS_LM_STA_NONE);
>> +             return NULL;
>> +     }
>> +
>> +     if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
>> +             /* create resume device data file */
>> +             err = pds_vfio_get_restore_file(pds_vfio);
>> +             if (err)
>> +                     return ERR_PTR(err);
>> +
>> +             return pds_vfio->restore_file->filep;
>> +     }
>> +
>> +     if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
>> +             /* Set device state */
>> +             err = pds_vfio_set_lm_state_cmd(pds_vfio);
>> +             if (err)
>> +                     return ERR_PTR(err);
>> +
>> +             /* delete resume device data file */
>> +             pds_vfio_put_restore_file(pds_vfio);
>> +             return NULL;
>> +     }
>> +
>> +     if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
>> +             pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
>> +             /* Device should be stopped:
>> +              * no interrupts, DMA, or changes to internal state
>> +              */
>> +             err = pds_vfio_suspend_device_cmd(pds_vfio);
>> +             if (err)
>> +                     return ERR_PTR(err);
>> +
>> +             return NULL;
>> +     }
>> +
>> +     if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
>> +             /* Device should be functional:
>> +              * interrupts, DMA, MMIO, and changes to internal state are allowed
>> +              */
>> +             err = pds_vfio_resume_device_cmd(pds_vfio);
>> +             if (err)
>> +                     return ERR_PTR(err);
>> +
>> +             pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
>> +                                                 PDS_LM_STA_NONE);
>> +             return NULL;
>> +     }
>> +
>> +     if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P)
>> +             return NULL;
>> +
>> +     if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP)
>> +             return NULL;
>> +
>> +     return ERR_PTR(-EINVAL);
>> +}
>> diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h
>> new file mode 100644
>> index 000000000000..3dd97b807db6
>> --- /dev/null
>> +++ b/drivers/vfio/pci/pds/lm.h
>> @@ -0,0 +1,43 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/* Copyright(c) 2022 Pensando Systems, Inc */
>> +
>> +#ifndef _LM_H_
>> +#define _LM_H_
>> +
>> +#include <linux/fs.h>
>> +#include <linux/mutex.h>
>> +#include <linux/scatterlist.h>
>> +#include <linux/types.h>
>> +#include <linux/vfio.h>
>> +
>> +#include <linux/pds/pds_lm.h>
>> +
>> +struct pds_vfio_lm_file {
>> +     struct file *filep;
>> +     struct mutex lock;      /* protect live migration data file */
>> +     u64 size;               /* Size with valid data */
>> +     u64 alloc_size;         /* Total allocated size. Always >= size */
>> +     struct page **pages;    /* Backing pages for file */
>> +     unsigned long long npages;
>> +     struct sg_table sg_table;       /* SG table for backing pages */
>> +     struct pds_lm_sg_elem *sgl;     /* DMA mapping */
>> +     dma_addr_t sgl_addr;
>> +     u16 num_sge;
>> +     struct scatterlist *last_offset_sg;     /* Iterator */
>> +     unsigned int sg_last_entry;
>> +     unsigned long last_offset;
>> +};
>> +
>> +struct pds_vfio_pci_device;
>> +
>> +struct file *
>> +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
>> +                               enum vfio_device_mig_state next);
>> +const char *
>> +pds_vfio_lm_state(enum vfio_device_mig_state state);
>> +void
>> +pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
>> +void
>> +pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);
>> +
>> +#endif /* _LM_H_ */
>> diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
>> index d6ad15719ec4..aafa013ee8c2 100644
>> --- a/drivers/vfio/pci/pds/pci_drv.c
>> +++ b/drivers/vfio/pci/pds/pci_drv.c
>> @@ -67,12 +67,27 @@ pds_vfio_pci_table[] = {
>>   };
>>   MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);
>>
>> +static void
>> +pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
>> +{
>> +     struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
>> +
>> +     pds_vfio_reset(pds_vfio);
>> +}
>> +
>> +static const struct
>> +pci_error_handlers pds_vfio_pci_err_handlers = {
>> +     .reset_done = pds_vfio_pci_aer_reset_done,
>> +     .error_detected = vfio_pci_core_aer_err_detected,
>> +};
>> +
>>   static struct pci_driver
>>   pds_vfio_pci_driver = {
>>        .name = PDS_VFIO_DRV_NAME,
>>        .id_table = pds_vfio_pci_table,
>>        .probe = pds_vfio_pci_probe,
>>        .remove = pds_vfio_pci_remove,
>> +     .err_handler = &pds_vfio_pci_err_handlers,
>>        .driver_managed_dma = true,
>>   };
>>
>> diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
>> index 30c3bb47a2be..d28f96f93097 100644
>> --- a/drivers/vfio/pci/pds/vfio_dev.c
>> +++ b/drivers/vfio/pci/pds/vfio_dev.c
>> @@ -4,6 +4,7 @@
>>   #include <linux/vfio.h>
>>   #include <linux/vfio_pci_core.h>
>>
>> +#include "lm.h"
>>   #include "vfio_dev.h"
>>   #include "aux_drv.h"
>>
>> @@ -16,6 +17,97 @@ pds_vfio_pci_drvdata(struct pci_dev *pdev)
>>                            vfio_coredev);
>>   }
>>
>> +static void
>> +pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +again:
>> +     spin_lock(&pds_vfio->reset_lock);
>> +     if (pds_vfio->deferred_reset) {
>> +             pds_vfio->deferred_reset = false;
>> +             if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
>> +                     pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
>> +                     pds_vfio_put_restore_file(pds_vfio);
>> +                     pds_vfio_put_save_file(pds_vfio);
>> +             }
>> +             spin_unlock(&pds_vfio->reset_lock);
>> +             goto again;
>> +     }
>> +     mutex_unlock(&pds_vfio->state_mutex);
>> +     spin_unlock(&pds_vfio->reset_lock);
>> +}
>> +
>> +void
>> +pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
>> +{
>> +     spin_lock(&pds_vfio->reset_lock);
>> +     pds_vfio->deferred_reset = true;
>> +     if (!mutex_trylock(&pds_vfio->state_mutex)) {
>> +             spin_unlock(&pds_vfio->reset_lock);
>> +             return;
>> +     }
>> +     spin_unlock(&pds_vfio->reset_lock);
>> +     pds_vfio_state_mutex_unlock(pds_vfio);
>> +}
>> +
>> +static struct file *
>> +pds_vfio_set_device_state(struct vfio_device *vdev,
>> +                       enum vfio_device_mig_state new_state)
>> +{
>> +     struct pds_vfio_pci_device *pds_vfio =
>> +             container_of(vdev, struct pds_vfio_pci_device,
>> +                          vfio_coredev.vdev);
>> +     struct file *res = NULL;
>> +
>> +     if (!pds_vfio->vfio_aux)
>> +             return ERR_PTR(-ENODEV);
>> +
>> +     mutex_lock(&pds_vfio->state_mutex);
>> +     while (new_state != pds_vfio->state) {
>> +             enum vfio_device_mig_state next_state;
>> +
>> +             int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
>> +                                               new_state, &next_state);
>> +             if (err) {
>> +                     res = ERR_PTR(err);
>> +                     break;
>> +             }
>> +
>> +             res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
>> +             if (IS_ERR(res))
>> +                     break;
>> +
>> +             pds_vfio->state = next_state;
>> +
>> +             if (WARN_ON(res && new_state != pds_vfio->state)) {
>> +                     res = ERR_PTR(-EINVAL);
>> +                     break;
>> +             }
>> +     }
>> +     pds_vfio_state_mutex_unlock(pds_vfio);
>> +
>> +     return res;
>> +}
>> +
>> +static int
>> +pds_vfio_get_device_state(struct vfio_device *vdev,
>> +                       enum vfio_device_mig_state *current_state)
>> +{
>> +     struct pds_vfio_pci_device *pds_vfio =
>> +             container_of(vdev, struct pds_vfio_pci_device,
>> +                          vfio_coredev.vdev);
>> +
>> +     mutex_lock(&pds_vfio->state_mutex);
>> +     *current_state = pds_vfio->state;
>> +     pds_vfio_state_mutex_unlock(pds_vfio);
>> +     return 0;
>> +}
>> +
>> +static const struct vfio_migration_ops
>> +pds_vfio_lm_ops = {
>> +     .migration_set_state = pds_vfio_set_device_state,
>> +     .migration_get_state = pds_vfio_get_device_state
>> +};
>> +
>>   static int
>>   pds_vfio_init_device(struct vfio_device *vdev)
>>   {
>> @@ -35,10 +127,19 @@ pds_vfio_init_device(struct vfio_device *vdev)
>>        vfio_aux = pds_vfio_aux_get_drvdata(pds_vfio->pci_id);
>>        if (vfio_aux) {
>>                vfio_aux->pds_vfio = pds_vfio;
>> +             pds_vfio->coredev = vfio_aux->padev->aux_dev.dev.parent;
>>                pds_vfio->vfio_aux = vfio_aux;
>>                pds_vfio_put_aux_dev(vfio_aux);
>>        }
>>
>> +     vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
>> +     vdev->mig_ops = &pds_vfio_lm_ops;
>> +
>> +     dev_dbg(&pdev->dev, "%s: PF %#04x VF %#04x (%d) vf_id %d domain %d vfio_aux %p pds_vfio %p\n",
>> +             __func__, pci_dev_id(pdev->physfn),
>> +             pds_vfio->pci_id, pds_vfio->pci_id, pds_vfio->vf_id,
>> +             pci_domain_nr(pdev->bus), pds_vfio->vfio_aux, pds_vfio);
>> +
>>        return 0;
>>   }
>>
>> @@ -54,18 +155,34 @@ pds_vfio_open_device(struct vfio_device *vdev)
>>        if (err)
>>                return err;
>>
>> +     mutex_init(&pds_vfio->state_mutex);
>> +     dev_dbg(&pds_vfio->pdev->dev, "%s: %s => VFIO_DEVICE_STATE_RUNNING\n",
>> +             __func__, pds_vfio_lm_state(pds_vfio->state));
>> +     pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
>> +
>>        vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
>>
>>        return 0;
>>   }
>>
>> +static void
>> +pds_vfio_close_device(struct vfio_device *vdev)
>> +{
>> +     struct pds_vfio_pci_device *pds_vfio =
>> +             container_of(vdev, struct pds_vfio_pci_device,
>> +                          vfio_coredev.vdev);
>> +
>> +     mutex_destroy(&pds_vfio->state_mutex);
>> +     vfio_pci_core_close_device(vdev);
>> +}
>> +
>>   static const struct vfio_device_ops
>>   pds_vfio_ops = {
>>        .name = "pds-vfio",
>>        .init = pds_vfio_init_device,
>>        .release = vfio_pci_core_release_dev,
>>        .open_device = pds_vfio_open_device,
>> -     .close_device = vfio_pci_core_close_device,
>> +     .close_device = pds_vfio_close_device,
>>        .ioctl = vfio_pci_core_ioctl,
>>        .device_feature = vfio_pci_core_ioctl_feature,
>>        .read = vfio_pci_core_read,
>> diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
>> index b16668693e1f..a09570eec6fa 100644
>> --- a/drivers/vfio/pci/pds/vfio_dev.h
>> +++ b/drivers/vfio/pci/pds/vfio_dev.h
>> @@ -7,10 +7,20 @@
>>   #include <linux/pci.h>
>>   #include <linux/vfio_pci_core.h>
>>
>> +#include "lm.h"
>> +
>>   struct pds_vfio_pci_device {
>>        struct vfio_pci_core_device vfio_coredev;
>>        struct pci_dev *pdev;
>>        struct pds_vfio_aux *vfio_aux;
>> +     struct device *coredev;
>> +
>> +     struct pds_vfio_lm_file *save_file;
>> +     struct pds_vfio_lm_file *restore_file;
>> +     struct mutex state_mutex; /* protect migration state */
>> +     enum vfio_device_mig_state state;
>> +     spinlock_t reset_lock; /* protect reset_done flow */
>> +     u8 deferred_reset;
>>
>>        int vf_id;
>>        int pci_id;
>> @@ -20,5 +30,7 @@ const struct vfio_device_ops *
>>   pds_vfio_ops_info(void);
>>   struct pds_vfio_pci_device *
>>   pds_vfio_pci_drvdata(struct pci_dev *pdev);
>> +void
>> +pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
>>
>>   #endif /* _VFIO_DEV_H_ */
>> diff --git a/include/linux/pds/pds_lm.h b/include/linux/pds/pds_lm.h
>> index fdaf2bf71d35..28ebd62f7583 100644
>> --- a/include/linux/pds/pds_lm.h
>> +++ b/include/linux/pds/pds_lm.h
>> @@ -8,5 +8,211 @@
>>
>>   #define PDS_DEV_TYPE_LM_STR  "LM"
>>   #define PDS_LM_DEV_NAME              PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR
>> +#define PDS_LM_DEVICE_STATE_LENGTH           65536
>> +#define PDS_LM_CHECK_DEVICE_STATE_LENGTH(X)                                  \
>> +                     PDS_CORE_SIZE_CHECK(union, PDS_LM_DEVICE_STATE_LENGTH, X)
>> +
>> +/*
>> + * enum pds_lm_cmd_opcode - Live Migration Device commands
>> + */
>> +enum pds_lm_cmd_opcode {
>> +     PDS_LM_CMD_HOST_VF_STATUS  = 1,
>> +
>> +     /* Device state commands */
>> +     PDS_LM_CMD_STATUS          = 16,
>> +     PDS_LM_CMD_SUSPEND         = 18,
>> +     PDS_LM_CMD_SUSPEND_STATUS  = 19,
>> +     PDS_LM_CMD_RESUME          = 20,
>> +     PDS_LM_CMD_SAVE            = 21,
>> +     PDS_LM_CMD_RESTORE         = 22,
>> +};
>> +
>> +/**
>> + * struct pds_lm_cmd - generic command
>> + * @opcode:  Opcode
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + * @rsvd2:   Structure padding to 60 Bytes
>> + */
>> +struct pds_lm_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +     u8     rsvd2[56];
>> +};
>> +
>> +/**
>> + * struct pds_lm_comp - generic command completion
>> + * @status:  Status of the command (enum pds_core_status_code)
>> + * @rsvd:    Structure padding to 16 Bytes
>> + */
>> +struct pds_lm_comp {
>> +     u8 status;
>> +     u8 rsvd[15];
>> +};
>> +
>> +/**
>> + * struct pds_lm_status_cmd - STATUS command
>> + * @opcode:  Opcode
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + */
>> +struct pds_lm_status_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +};
>> +
>> +/**
>> + * struct pds_lm_status_comp - STATUS command completion
>> + * @status:          Status of the command (enum pds_core_status_code)
>> + * @rsvd:            Word boundary padding
>> + * @comp_index:              Index in the desc ring for which this is the completion
>> + * @size:            Size of the device state
>> + * @rsvd2:           Word boundary padding
>> + * @color:           Color bit
>> + */
>> +struct pds_lm_status_comp {
>> +     u8     status;
>> +     u8     rsvd;
>> +     __le16 comp_index;
>> +     union {
>> +             __le64 size;
>> +             u8     rsvd2[11];
>> +     } __packed;
>> +     u8     color;
>> +};
>> +
>> +/**
>> + * struct pds_lm_suspend_cmd - SUSPEND command
>> + * @opcode:  Opcode PDS_LM_CMD_SUSPEND
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + */
>> +struct pds_lm_suspend_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +};
>> +
>> +/**
>> + * struct pds_lm_suspend_comp - SUSPEND command completion
>> + * @status:          Status of the command (enum pds_core_status_code)
>> + * @rsvd:            Word boundary padding
>> + * @comp_index:              Index in the desc ring for which this is the completion
>> + * @state_size:              Size of the device state computed post suspend.
>> + * @rsvd2:           Word boundary padding
>> + * @color:           Color bit
>> + */
>> +struct pds_lm_suspend_comp {
>> +     u8     status;
>> +     u8     rsvd;
>> +     __le16 comp_index;
>> +     union {
>> +             __le64 state_size;
>> +             u8     rsvd2[11];
>> +     } __packed;
>> +     u8     color;
>> +};
>> +
>> +/**
>> + * struct pds_lm_suspend_status_cmd - SUSPEND status command
>> + * @opcode:  Opcode PDS_LM_CMD_SUSPEND_STATUS
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + */
>> +struct pds_lm_suspend_status_cmd {
>> +     u8 opcode;
>> +     u8 rsvd;
>> +     __le16 vf_id;
>> +};
>> +
>> +/**
>> + * struct pds_lm_resume_cmd - RESUME command
>> + * @opcode:  Opcode PDS_LM_CMD_RESUME
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + */
>> +struct pds_lm_resume_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +};
>> +
>> +/**
>> + * struct pds_lm_sg_elem - Transmit scatter-gather (SG) descriptor element
>> + * @addr:    DMA address of SG element data buffer
>> + * @len:     Length of SG element data buffer, in bytes
>> + * @rsvd:    Word boundary padding
>> + */
>> +struct pds_lm_sg_elem {
>> +     __le64 addr;
>> +     __le32 len;
>> +     __le16 rsvd[2];
>> +};
>> +
>> +/**
>> + * struct pds_lm_save_cmd - SAVE command
>> + * @opcode:  Opcode PDS_LM_CMD_SAVE
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + * @rsvd2:   Word boundary padding
>> + * @sgl_addr:        IOVA address of the SGL to dma the device state
>> + * @num_sge: Total number of SG elements
>> + */
>> +struct pds_lm_save_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +     u8     rsvd2[4];
>> +     __le64 sgl_addr;
>> +     __le32 num_sge;
>> +} __packed;
>> +
>> +/**
>> + * struct pds_lm_restore_cmd - RESTORE command
>> + * @opcode:  Opcode PDS_LM_CMD_RESTORE
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + * @rsvd2:   Word boundary padding
>> + * @sgl_addr:        IOVA address of the SGL to dma the device state
>> + * @num_sge: Total number of SG elements
>> + */
>> +struct pds_lm_restore_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +     u8     rsvd2[4];
>> +     __le64 sgl_addr;
>> +     __le32 num_sge;
>> +} __packed;
>> +
>> +/**
>> + * union pds_lm_dev_state - device state information
>> + * @words:   Device state words
>> + */
>> +union pds_lm_dev_state {
>> +     __le32 words[PDS_LM_DEVICE_STATE_LENGTH / sizeof(__le32)];
>> +};
>> +
>> +enum pds_lm_host_vf_status {
>> +     PDS_LM_STA_NONE = 0,
>> +     PDS_LM_STA_IN_PROGRESS,
>> +     PDS_LM_STA_MAX,
>> +};
>> +
>> +/**
>> + * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
>> + * @opcode:  Opcode PDS_LM_CMD_HOST_VF_STATUS
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + * @status:  Current LM status of host VF driver (enum pds_lm_host_vf_status)
>> + */
>> +struct pds_lm_host_vf_status_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +     u8     status;
>> +};
>>
>>   #endif /* _PDS_LM_H_ */
>>
diff mbox series

Patch

diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile
index 49bf8289739b..b4980323cff4 100644
--- a/drivers/vfio/pci/pds/Makefile
+++ b/drivers/vfio/pci/pds/Makefile
@@ -4,5 +4,6 @@  obj-$(CONFIG_PDS_VFIO_PCI) += pds_vfio.o
 pds_vfio-y := \
 	aux_drv.o	\
 	cmds.o		\
+	lm.o		\
 	pci_drv.o	\
 	vfio_dev.o
diff --git a/drivers/vfio/pci/pds/aux_drv.c b/drivers/vfio/pci/pds/aux_drv.c
index 494551894926..59b43075e85f 100644
--- a/drivers/vfio/pci/pds/aux_drv.c
+++ b/drivers/vfio/pci/pds/aux_drv.c
@@ -77,6 +77,7 @@  pds_vfio_aux_probe(struct auxiliary_device *aux_dev,
 	dev_dbg(dev, "%s: id %#04x busnr %#x devfn %#x bus %p pds_vfio %p\n",
 		__func__, padev->id, busnr, devfn, bus, vfio_aux->pds_vfio);
 
+	vfio_aux->pds_vfio->coredev = aux_dev->dev.parent;
 	vfio_aux->pds_vfio->vfio_aux = vfio_aux;
 
 	vfio_aux->padrv.event_handler = pds_vfio_aux_notify_handler;
diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c
index 5a3fadcd38d8..11823d824ccc 100644
--- a/drivers/vfio/pci/pds/cmds.c
+++ b/drivers/vfio/pci/pds/cmds.c
@@ -3,9 +3,11 @@ 
 
 #include <linux/io.h>
 #include <linux/types.h>
+#include <linux/delay.h>
 
 #include <linux/pds/pds_core_if.h>
 #include <linux/pds/pds_adminq.h>
+#include <linux/pds/pds_lm.h>
 #include <linux/pds/pds_auxbus.h>
 
 #include "vfio_dev.h"
@@ -28,3 +30,313 @@  pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
 
 	padev->ops->unregister_client(padev);
 }
+
+#define SUSPEND_TIMEOUT_S		5
+#define SUSPEND_CHECK_INTERVAL_MS	1
+
+static int
+pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pds_lm_suspend_status_cmd cmd = {
+		.opcode = PDS_LM_CMD_SUSPEND_STATUS,
+		.vf_id	= cpu_to_le16(pds_vfio->vf_id),
+	};
+	struct pci_dev *pdev = pds_vfio->pdev;
+	struct pds_lm_comp comp = { 0 };
+	struct pds_auxiliary_dev *padev;
+	unsigned long time_limit;
+	unsigned long time_start;
+	unsigned long time_done;
+	int err;
+
+	padev = pds_vfio->vfio_aux->padev;
+
+	time_start = jiffies;
+	time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
+	do {
+		err = padev->ops->adminq_cmd(padev,
+					     (union pds_core_adminq_cmd *)&cmd,
+					     sizeof(cmd),
+					     (union pds_core_adminq_comp *)&comp,
+					      PDS_AQ_FLAG_FASTPOLL);
+		if (err != -EAGAIN)
+			break;
+
+		msleep(SUSPEND_CHECK_INTERVAL_MS);
+	} while (time_before(jiffies, time_limit));
+
+	time_done = jiffies;
+	dev_dbg(&pdev->dev, "%s: vf%u: Suspend comp received in %d msecs\n",
+		__func__, pds_vfio->vf_id,
+		jiffies_to_msecs(time_done - time_start));
+
+	/* Check the results */
+	if (time_after_eq(time_done, time_limit)) {
+		dev_err(&pdev->dev, "%s: vf%u: Suspend comp timeout\n", __func__,
+			pds_vfio->vf_id);
+		err = -ETIMEDOUT;
+	}
+
+	return err;
+}
+
+int
+pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pds_lm_suspend_cmd cmd = {
+		.opcode = PDS_LM_CMD_SUSPEND,
+		.vf_id = cpu_to_le16(pds_vfio->vf_id),
+	};
+	struct pds_lm_suspend_comp comp = {0};
+	struct pci_dev *pdev = pds_vfio->pdev;
+	struct pds_auxiliary_dev *padev;
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
+
+	padev = pds_vfio->vfio_aux->padev;
+	err = padev->ops->adminq_cmd(padev,
+				     (union pds_core_adminq_cmd *)&cmd,
+				     sizeof(cmd),
+				     (union pds_core_adminq_comp *)&comp,
+				     PDS_AQ_FLAG_FASTPOLL);
+	if (err) {
+		dev_err(&pdev->dev, "vf%u: Suspend failed: %pe\n",
+			pds_vfio->vf_id, ERR_PTR(err));
+		return err;
+	}
+
+	return pds_vfio_suspend_wait_device_cmd(pds_vfio);
+}
+
+int
+pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pds_lm_resume_cmd cmd = {
+		.opcode = PDS_LM_CMD_RESUME,
+		.vf_id = cpu_to_le16(pds_vfio->vf_id),
+	};
+	struct pds_auxiliary_dev *padev;
+	struct pds_lm_comp comp = {0};
+
+	dev_dbg(&pds_vfio->pdev->dev, "vf%u: Resume device\n", pds_vfio->vf_id);
+
+	padev = pds_vfio->vfio_aux->padev;
+	return padev->ops->adminq_cmd(padev,
+				     (union pds_core_adminq_cmd *)&cmd,
+				     sizeof(cmd),
+				     (union pds_core_adminq_comp *)&comp,
+				     0);
+}
+
+int
+pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
+{
+	struct pds_lm_status_cmd cmd = {
+		.opcode = PDS_LM_CMD_STATUS,
+		.vf_id = cpu_to_le16(pds_vfio->vf_id),
+	};
+	struct pds_lm_status_comp comp = {0};
+	struct pds_auxiliary_dev *padev;
+	int err = 0;
+
+	dev_dbg(&pds_vfio->pdev->dev, "vf%u: Get migration status\n",
+		pds_vfio->vf_id);
+
+	padev = pds_vfio->vfio_aux->padev;
+	err = padev->ops->adminq_cmd(padev,
+				     (union pds_core_adminq_cmd *)&cmd,
+				     sizeof(cmd),
+				     (union pds_core_adminq_comp *)&comp,
+				     0);
+	if (err)
+		return err;
+
+	*size = le64_to_cpu(comp.size);
+	return 0;
+}
+
+static int
+pds_vfio_dma_map_lm_file(struct device *dev, enum dma_data_direction dir,
+			 struct pds_vfio_lm_file *lm_file)
+{
+	struct pds_lm_sg_elem *sgl, *sge;
+	struct scatterlist *sg;
+	int err = 0;
+	int i;
+
+	if (!lm_file)
+		return -EINVAL;
+
+	/* dma map file pages */
+	err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
+	if (err)
+		goto err_dma_map_sg;
+
+	lm_file->num_sge = lm_file->sg_table.nents;
+
+	/* alloc sgl */
+	sgl = dma_alloc_coherent(dev, lm_file->num_sge *
+				 sizeof(struct pds_lm_sg_elem),
+				 &lm_file->sgl_addr, GFP_KERNEL);
+	if (!sgl) {
+		err = -ENOMEM;
+		goto err_alloc_sgl;
+	}
+
+	lm_file->sgl = sgl;
+
+	/* fill sgl */
+	sge = sgl;
+	for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
+		sge->addr = cpu_to_le64(sg_dma_address(sg));
+		sge->len  = cpu_to_le32(sg_dma_len(sg));
+		dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
+		sge++;
+	}
+
+	return 0;
+
+err_alloc_sgl:
+	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+err_dma_map_sg:
+	return err;
+}
+
+static void
+pds_vfio_dma_unmap_lm_file(struct device *dev, enum dma_data_direction dir,
+			   struct pds_vfio_lm_file *lm_file)
+{
+	if (!lm_file)
+		return;
+
+	/* free sgl */
+	if (lm_file->sgl) {
+		dma_free_coherent(dev, lm_file->num_sge *
+				  sizeof(struct pds_lm_sg_elem),
+				  lm_file->sgl, lm_file->sgl_addr);
+		lm_file->sgl = NULL;
+		lm_file->sgl_addr = DMA_MAPPING_ERROR;
+		lm_file->num_sge = 0;
+	}
+
+	/* dma unmap file pages */
+	dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+}
+
+int
+pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pds_lm_save_cmd cmd = {
+		.opcode = PDS_LM_CMD_SAVE,
+		.vf_id = cpu_to_le16(pds_vfio->vf_id),
+	};
+	struct pci_dev *pdev = pds_vfio->pdev;
+	struct pds_vfio_lm_file *lm_file;
+	struct pds_auxiliary_dev *padev;
+	struct pds_lm_comp comp = {0};
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
+
+	lm_file = pds_vfio->save_file;
+
+	padev = pds_vfio->vfio_aux->padev;
+	err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
+	if (err) {
+		dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+	cmd.num_sge = cpu_to_le32(lm_file->num_sge);
+
+	err = padev->ops->adminq_cmd(padev,
+				     (union pds_core_adminq_cmd *)&cmd,
+				     sizeof(cmd),
+				     (union pds_core_adminq_comp *)&comp,
+				     0);
+	if (err)
+		dev_err(&pdev->dev, "failed to get migration state: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
+
+	return err;
+}
+
+int
+pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pds_lm_restore_cmd cmd = {
+		.opcode = PDS_LM_CMD_RESTORE,
+		.vf_id = cpu_to_le16(pds_vfio->vf_id),
+	};
+	struct pci_dev *pdev = pds_vfio->pdev;
+	struct pds_vfio_lm_file *lm_file;
+	struct pds_auxiliary_dev *padev;
+	struct pds_lm_comp comp = {0};
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
+
+	lm_file = pds_vfio->restore_file;
+
+	padev = pds_vfio->vfio_aux->padev;
+	err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
+	if (err) {
+		dev_err(&pdev->dev, "failed to map restore migration file: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+	cmd.num_sge = cpu_to_le32(lm_file->num_sge);
+
+	err = padev->ops->adminq_cmd(padev,
+				     (union pds_core_adminq_cmd *)&cmd,
+				     sizeof(cmd),
+				     (union pds_core_adminq_comp *)&comp,
+				     0);
+	if (err)
+		dev_err(&pdev->dev, "failed to set migration state: %pe\n",
+			ERR_PTR(err));
+
+	pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
+
+	return err;
+}
+
+void
+pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+				    enum pds_lm_host_vf_status vf_status)
+{
+	struct pds_auxiliary_dev *padev = pds_vfio->vfio_aux->padev;
+	struct pds_lm_host_vf_status_cmd cmd = {
+		.opcode = PDS_LM_CMD_HOST_VF_STATUS,
+		.vf_id = cpu_to_le16(pds_vfio->vf_id),
+		.status = vf_status,
+	};
+	struct pci_dev *pdev = pds_vfio->pdev;
+	struct pds_lm_comp comp = {0};
+	int err;
+
+	dev_dbg(&pdev->dev, "vf%u: Set host VF LM status: %u",
+		pds_vfio->vf_id, cmd.status);
+	if (vf_status != PDS_LM_STA_IN_PROGRESS &&
+	    vf_status != PDS_LM_STA_NONE) {
+		dev_warn(&pdev->dev, "Invalid host VF migration status, %d\n",
+			 vf_status);
+		return;
+	}
+
+	err = padev->ops->adminq_cmd(padev,
+				     (union pds_core_adminq_cmd *)&cmd,
+				     sizeof(cmd),
+				     (union pds_core_adminq_comp *)&comp,
+				     0);
+	if (err)
+		dev_warn(&pdev->dev, "failed to send host VF migration status: %pe\n",
+			 ERR_PTR(err));
+}
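
The SAVE and RESTORE opcodes never carry state in the command itself; they
hand the device a scatter-gather list describing the migration file's pages,
and the element count comes from the DMA-mapped table rather than from the
raw page count. A minimal sketch of that relationship, not part of the patch
(example_count_device_sges is a hypothetical name), assuming an sg_table that
was already built from the file pages:

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>

static int example_count_device_sges(struct device *dev, struct sg_table *sgt)
{
	int err;

	err = dma_map_sgtable(dev, sgt, DMA_FROM_DEVICE, 0);
	if (err)
		return err;

	/*
	 * orig_nents is the number of entries built from the file pages;
	 * nents is the number of DMA segments after mapping and may be
	 * smaller if the IOMMU merged contiguous pages. The pds_lm_sg_elem
	 * array handed to SAVE/RESTORE is sized and filled from nents, as
	 * pds_vfio_dma_map_lm_file() does above.
	 */
	return sgt->nents;
}
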
diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h
index 7fe2d1efd894..5f9ac45ee5a3 100644
--- a/drivers/vfio/pci/pds/cmds.h
+++ b/drivers/vfio/pci/pds/cmds.h
@@ -4,11 +4,26 @@ 
 #ifndef _CMDS_H_
 #define _CMDS_H_
 
+#include <linux/pds/pds_lm.h>
+
 struct pds_vfio_pci_device;
 
 int
 pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
 void
 pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
+int
+pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+void
+pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+				    enum pds_lm_host_vf_status vf_status);
 
 #endif /* _CMDS_H_ */
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
new file mode 100644
index 000000000000..86152b18a5b7
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.c
@@ -0,0 +1,491 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Pensando Systems, Inc */
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "cmds.h"
+#include "vfio_dev.h"
+
+#define PDS_VFIO_LM_FILENAME	"pds_vfio_lm"
+
+const char *
+pds_vfio_lm_state(enum vfio_device_mig_state state)
+{
+	switch (state) {
+	case VFIO_DEVICE_STATE_ERROR:
+		return "VFIO_DEVICE_STATE_ERROR";
+	case VFIO_DEVICE_STATE_STOP:
+		return "VFIO_DEVICE_STATE_STOP";
+	case VFIO_DEVICE_STATE_RUNNING:
+		return "VFIO_DEVICE_STATE_RUNNING";
+	case VFIO_DEVICE_STATE_STOP_COPY:
+		return "VFIO_DEVICE_STATE_STOP_COPY";
+	case VFIO_DEVICE_STATE_RESUMING:
+		return "VFIO_DEVICE_STATE_RESUMING";
+	case VFIO_DEVICE_STATE_RUNNING_P2P:
+		return "VFIO_DEVICE_STATE_RUNNING_P2P";
+	default:
+		return "VFIO_DEVICE_STATE_INVALID";
+	}
+
+	return "VFIO_DEVICE_STATE_INVALID";
+}
+
+static struct pds_vfio_lm_file *
+pds_vfio_get_lm_file(const char *name, const struct file_operations *fops,
+		     int flags, u64 size)
+{
+	struct pds_vfio_lm_file *lm_file = NULL;
+	unsigned long long npages;
+	unsigned long long i;
+	struct page **pages;
+	int err = 0;
+
+	if (!size)
+		return NULL;
+
+	/* Alloc file structure */
+	lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
+	if (!lm_file)
+		return NULL;
+
+	/* Create file */
+	lm_file->filep = anon_inode_getfile(name, fops, lm_file, flags);
+	if (!lm_file->filep)
+		goto err_get_file;
+
+	stream_open(lm_file->filep->f_inode, lm_file->filep);
+	mutex_init(&lm_file->lock);
+
+	lm_file->size = size;
+
+	/* Allocate memory for file pages */
+	npages = DIV_ROUND_UP_ULL(lm_file->size, PAGE_SIZE);
+
+	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		goto err_alloc_pages;
+
+	for (i = 0; i < npages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto err_alloc_page;
+	}
+
+	lm_file->pages = pages;
+	lm_file->npages = npages;
+	lm_file->alloc_size = npages * PAGE_SIZE;
+
+	/* Create scatterlist of file pages to use for DMA mapping later */
+	err = sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages,
+					0, size, GFP_KERNEL);
+	if (err)
+		goto err_alloc_sg_table;
+
+	return lm_file;
+
+err_alloc_sg_table:
+err_alloc_page:
+	/* free allocated pages */
+	for (i = 0; i < npages && pages[i]; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+err_alloc_pages:
+	fput(lm_file->filep);
+	mutex_destroy(&lm_file->lock);
+err_get_file:
+	kfree(lm_file);
+
+	return NULL;
+}
+
+static void
+pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
+{
+	unsigned long long i;
+
+	mutex_lock(&lm_file->lock);
+
+	lm_file->size = 0;
+	lm_file->alloc_size = 0;
+
+	/* Free scatter list of file pages */
+	sg_free_table(&lm_file->sg_table);
+
+	/* Free allocated file pages */
+	for (i = 0; i < lm_file->npages && lm_file->pages[i]; i++)
+		__free_page(lm_file->pages[i]);
+	kfree(lm_file->pages);
+	lm_file->pages = NULL;
+
+	/* Delete file */
+	fput(lm_file->filep);
+	lm_file->filep = NULL;
+
+	mutex_unlock(&lm_file->lock);
+
+	mutex_destroy(&lm_file->lock);
+
+	/* Free file structure */
+	kfree(lm_file);
+}
+
+void
+pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	if (!pds_vfio->save_file)
+		return;
+
+	pds_vfio_put_lm_file(pds_vfio->save_file);
+	pds_vfio->save_file = NULL;
+}
+
+void
+pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	if (!pds_vfio->restore_file)
+		return;
+
+	pds_vfio_put_lm_file(pds_vfio->restore_file);
+	pds_vfio->restore_file = NULL;
+}
+
+static struct page *
+pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
+		       unsigned long offset)
+{
+	unsigned long cur_offset = 0;
+	struct scatterlist *sg;
+	unsigned int i;
+
+	/* All accesses are sequential */
+	if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
+		lm_file->last_offset = 0;
+		lm_file->last_offset_sg = lm_file->sg_table.sgl;
+		lm_file->sg_last_entry = 0;
+	}
+
+	cur_offset = lm_file->last_offset;
+
+	for_each_sg(lm_file->last_offset_sg, sg,
+		    lm_file->sg_table.orig_nents - lm_file->sg_last_entry,
+		    i) {
+		if (offset < sg->length + cur_offset) {
+			lm_file->last_offset_sg = sg;
+			lm_file->sg_last_entry += i;
+			lm_file->last_offset = cur_offset;
+			return nth_page(sg_page(sg),
+					(offset - cur_offset) / PAGE_SIZE);
+		}
+		cur_offset += sg->length;
+	}
+
+	return NULL;
+}
+
+static int
+pds_vfio_release_file(struct inode *inode, struct file *filp)
+{
+	struct pds_vfio_lm_file *lm_file = filp->private_data;
+
+	lm_file->size = 0;
+
+	return 0;
+}
+
+static ssize_t
+pds_vfio_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
+{
+	struct pds_vfio_lm_file *lm_file = filp->private_data;
+	ssize_t done = 0;
+
+	if (pos)
+		return -ESPIPE;
+	pos = &filp->f_pos;
+
+	mutex_lock(&lm_file->lock);
+	if (*pos > lm_file->size) {
+		done = -EINVAL;
+		goto out_unlock;
+	}
+
+	len = min_t(size_t, lm_file->size - *pos, len);
+	while (len) {
+		size_t page_offset;
+		struct page *page;
+		size_t page_len;
+		u8 *from_buff;
+		int err;
+
+		page_offset = (*pos) % PAGE_SIZE;
+		page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+		if (!page) {
+			if (done == 0)
+				done = -EINVAL;
+			goto out_unlock;
+		}
+
+		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+		from_buff = kmap_local_page(page);
+		err = copy_to_user(buf, from_buff + page_offset, page_len);
+		kunmap_local(from_buff);
+		if (err) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += page_len;
+		len -= page_len;
+		done += page_len;
+		buf += page_len;
+	}
+
+out_unlock:
+	mutex_unlock(&lm_file->lock);
+	return done;
+}
+
+static const struct file_operations
+pds_vfio_save_fops = {
+	.owner = THIS_MODULE,
+	.read = pds_vfio_save_read,
+	.release = pds_vfio_release_file,
+	.llseek = no_llseek,
+};
+
+static int
+pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pci_dev *pdev = pds_vfio->pdev;
+	struct pds_vfio_lm_file *lm_file;
+	int err = 0;
+	u64 size;
+
+	/* Get live migration state size in this state */
+	err = pds_vfio_get_lm_status_cmd(pds_vfio, &size);
+	if (err) {
+		dev_err(&pdev->dev, "failed to get save status: %pe\n",
+			ERR_PTR(err));
+		goto err_get_lm_status;
+	}
+
+	dev_dbg(&pdev->dev, "save status, size = %lld\n", size);
+
+	if (!size) {
+		err = -EIO;
+		dev_err(&pdev->dev, "invalid state size\n");
+		goto err_get_lm_status;
+	}
+
+	lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
+				       &pds_vfio_save_fops,
+				       O_RDONLY, size);
+	if (!lm_file) {
+		err = -ENOENT;
+		dev_err(&pdev->dev, "failed to create save file\n");
+		goto err_get_lm_file;
+	}
+
+	dev_dbg(&pdev->dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
+		lm_file->size, lm_file->alloc_size, lm_file->npages);
+
+	pds_vfio->save_file = lm_file;
+
+	return 0;
+
+err_get_lm_file:
+err_get_lm_status:
+	return err;
+}
+
+static ssize_t
+pds_vfio_restore_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos)
+{
+	struct pds_vfio_lm_file *lm_file = filp->private_data;
+	loff_t requested_length;
+	ssize_t done = 0;
+
+	if (pos)
+		return -ESPIPE;
+
+	pos = &filp->f_pos;
+
+	if (*pos < 0 ||
+	    check_add_overflow((loff_t)len, *pos, &requested_length))
+		return -EINVAL;
+
+	mutex_lock(&lm_file->lock);
+
+	while (len) {
+		size_t page_offset;
+		struct page *page;
+		size_t page_len;
+		u8 *to_buff;
+		int err;
+
+		page_offset = (*pos) % PAGE_SIZE;
+		page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+		if (!page) {
+			if (done == 0)
+				done = -EINVAL;
+			goto out_unlock;
+		}
+
+		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+		to_buff = kmap_local_page(page);
+		err = copy_from_user(to_buff + page_offset, buf, page_len);
+		kunmap_local(to_buff);
+		if (err) {
+			done = -EFAULT;
+			goto out_unlock;
+		}
+		*pos += page_len;
+		len -= page_len;
+		done += page_len;
+		buf += page_len;
+		lm_file->size += page_len;
+	}
+out_unlock:
+	mutex_unlock(&lm_file->lock);
+	return done;
+}
+
+static const struct file_operations
+pds_vfio_restore_fops = {
+	.owner = THIS_MODULE,
+	.write = pds_vfio_restore_write,
+	.release = pds_vfio_release_file,
+	.llseek = no_llseek,
+};
+
+static int
+pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+	struct pci_dev *pdev = pds_vfio->pdev;
+	struct pds_vfio_lm_file *lm_file;
+	int err = 0;
+	u64 size;
+
+	size = sizeof(union pds_lm_dev_state);
+
+	dev_dbg(&pdev->dev, "restore status, size = %lld\n", size);
+
+	if (!size) {
+		err = -EIO;
+		dev_err(&pdev->dev, "invalid state size");
+		goto err_get_lm_status;
+	}
+
+	lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
+				       &pds_vfio_restore_fops,
+				       O_WRONLY, size);
+	if (!lm_file) {
+		err = -ENOENT;
+		dev_err(&pdev->dev, "failed to create restore file");
+		goto err_get_lm_file;
+	}
+	pds_vfio->restore_file = lm_file;
+
+	return 0;
+
+err_get_lm_file:
+err_get_lm_status:
+	return err;
+}
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+				  enum vfio_device_mig_state next)
+{
+	enum vfio_device_mig_state cur = pds_vfio->state;
+	struct device *dev = &pds_vfio->pdev->dev;
+	int err = 0;
+
+	dev_dbg(dev, "%s => %s\n",
+		pds_vfio_lm_state(cur), pds_vfio_lm_state(next));
+
+	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
+		/* Device is already stopped: create the save device data
+		 * file and get the device state from the firmware
+		 */
+		err = pds_vfio_get_save_file(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		/* Get device state */
+		err = pds_vfio_get_lm_state_cmd(pds_vfio);
+		if (err) {
+			pds_vfio_put_save_file(pds_vfio);
+			return ERR_PTR(err);
+		}
+
+		return pds_vfio->save_file->filep;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
+		/* Device is already stopped:
+		 * delete the save device state file
+		 */
+		pds_vfio_put_save_file(pds_vfio);
+		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
+						    PDS_LM_STA_NONE);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
+		/* create resume device data file */
+		err = pds_vfio_get_restore_file(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		return pds_vfio->restore_file->filep;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
+		/* Set device state */
+		err = pds_vfio_set_lm_state_cmd(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		/* delete resume device data file */
+		pds_vfio_put_restore_file(pds_vfio);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
+		/* Device should be stopped:
+		 * no interrupts, DMA, or changes to internal state
+		 */
+		err = pds_vfio_suspend_device_cmd(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
+		/* Device should be functional:
+		 * interrupts, DMA, MMIO, and changes to internal state are allowed
+		 */
+		err = pds_vfio_resume_device_cmd(pds_vfio);
+		if (err)
+			return ERR_PTR(err);
+
+		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
+						    PDS_LM_STA_NONE);
+		return NULL;
+	}
+
+	if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P)
+		return NULL;
+
+	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP)
+		return NULL;
+
+	return ERR_PTR(-EINVAL);
+}
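
Everything in lm.c is reached through the standard VFIO migration uAPI:
userspace moves the device through the migration FSM with VFIO_DEVICE_FEATURE
and then streams the data fd that the driver hands back. A minimal userspace
sketch of the save side, not part of the patch, assuming an already-open VFIO
device fd and the <linux/vfio.h> definitions; error handling is omitted:

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int save_device_state(int device_fd, int out_fd)
{
	struct {
		struct vfio_device_feature feature;
		struct vfio_device_feature_mig_state mig;
	} req = {};
	char buf[4096];
	ssize_t n;

	req.feature.argsz = sizeof(req);
	req.feature.flags = VFIO_DEVICE_FEATURE_SET |
			    VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	req.mig.device_state = VFIO_DEVICE_STATE_STOP_COPY;

	/* The driver walks RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY and
	 * returns the save file created by pds_vfio_get_save_file().
	 */
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &req))
		return -1;

	/* Stream the device state produced by the SAVE admin command */
	while ((n = read(req.mig.data_fd, buf, sizeof(buf))) > 0)
		write(out_fd, buf, n);

	close(req.mig.data_fd);
	return n < 0 ? -1 : 0;
}

A single ioctl straight to STOP_COPY is enough because
pds_vfio_set_device_state() steps through the intermediate states with
vfio_mig_get_next_state().
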
diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h
new file mode 100644
index 000000000000..3dd97b807db6
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.h
@@ -0,0 +1,43 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2022 Pensando Systems, Inc */
+
+#ifndef _LM_H_
+#define _LM_H_
+
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_lm.h>
+
+struct pds_vfio_lm_file {
+	struct file *filep;
+	struct mutex lock;	/* protect live migration data file */
+	u64 size;		/* Size with valid data */
+	u64 alloc_size;		/* Total allocated size. Always >= size */
+	struct page **pages;	/* Backing pages for file */
+	unsigned long long npages;
+	struct sg_table sg_table;	/* SG table for backing pages */
+	struct pds_lm_sg_elem *sgl;	/* DMA mapping */
+	dma_addr_t sgl_addr;
+	u16 num_sge;
+	struct scatterlist *last_offset_sg;	/* Iterator */
+	unsigned int sg_last_entry;
+	unsigned long last_offset;
+};
+
+struct pds_vfio_pci_device;
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+				  enum vfio_device_mig_state next);
+const char *
+pds_vfio_lm_state(enum vfio_device_mig_state state);
+void
+pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
+void
+pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _LM_H_ */
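
The bookkeeping in struct pds_vfio_lm_file covers two views of the same backing memory: pages/npages plus sg_table feed the host-side file read()/write() path (last_offset_sg, sg_last_entry and last_offset cache the most recent iterator position), while sgl/sgl_addr/num_sge describe the DMA-mapped array of pds_lm_sg_elem entries handed to the SAVE/RESTORE commands. A rough sketch of building the host-side table, assuming the stock scatterlist helper is used (the function name is illustrative, not part of this series):

/* Illustrative sketch only: not necessarily how the series builds it */
static int lm_file_build_sgt(struct pds_vfio_lm_file *lm_file)
{
	/* Collapse contiguous backing pages into as few sg entries as possible */
	return sg_alloc_table_from_pages(&lm_file->sg_table, lm_file->pages,
					 lm_file->npages, 0,
					 lm_file->npages * PAGE_SIZE,
					 GFP_KERNEL);
}
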
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index d6ad15719ec4..aafa013ee8c2 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -67,12 +67,27 @@  pds_vfio_pci_table[] = {
 };
 MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);
 
+static void
+pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
+{
+	struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+	pds_vfio_reset(pds_vfio);
+}
+
+static const struct pci_error_handlers
+pds_vfio_pci_err_handlers = {
+	.reset_done = pds_vfio_pci_aer_reset_done,
+	.error_detected = vfio_pci_core_aer_err_detected,
+};
+
 static struct pci_driver
 pds_vfio_pci_driver = {
 	.name = PDS_VFIO_DRV_NAME,
 	.id_table = pds_vfio_pci_table,
 	.probe = pds_vfio_pci_probe,
 	.remove = pds_vfio_pci_remove,
+	.err_handler = &pds_vfio_pci_err_handlers,
 	.driver_managed_dma = true,
 };
 
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index 30c3bb47a2be..d28f96f93097 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -4,6 +4,7 @@ 
 #include <linux/vfio.h>
 #include <linux/vfio_pci_core.h>
 
+#include "lm.h"
 #include "vfio_dev.h"
 #include "aux_drv.h"
 
@@ -16,6 +17,97 @@  pds_vfio_pci_drvdata(struct pci_dev *pdev)
 			    vfio_coredev);
 }
 
+static void
+pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+{
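+	/*
+	 * Before releasing state_mutex, replay any reset that raced with
+	 * the lock holder: pds_vfio_reset() sets deferred_reset when it
+	 * cannot take the mutex, so the holder must act on it here.
+	 */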
+again:
+	spin_lock(&pds_vfio->reset_lock);
+	if (pds_vfio->deferred_reset) {
+		pds_vfio->deferred_reset = false;
+		if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
+			pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+			pds_vfio_put_restore_file(pds_vfio);
+			pds_vfio_put_save_file(pds_vfio);
+		}
+		spin_unlock(&pds_vfio->reset_lock);
+		goto again;
+	}
+	mutex_unlock(&pds_vfio->state_mutex);
+	spin_unlock(&pds_vfio->reset_lock);
+}
+
+void
+pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
+{
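+	/*
+	 * Called from the PCI .reset_done handler.  If a migration ioctl
+	 * currently holds state_mutex, only flag deferred_reset and let
+	 * that thread finish the reset when it releases the mutex;
+	 * otherwise handle it immediately via the unlock helper.
+	 */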
+	spin_lock(&pds_vfio->reset_lock);
+	pds_vfio->deferred_reset = true;
+	if (!mutex_trylock(&pds_vfio->state_mutex)) {
+		spin_unlock(&pds_vfio->reset_lock);
+		return;
+	}
+	spin_unlock(&pds_vfio->reset_lock);
+	pds_vfio_state_mutex_unlock(pds_vfio);
+}
+
+static struct file *
+pds_vfio_set_device_state(struct vfio_device *vdev,
+			  enum vfio_device_mig_state new_state)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+	struct file *res = NULL;
+
+	if (!pds_vfio->vfio_aux)
+		return ERR_PTR(-ENODEV);
+
+	mutex_lock(&pds_vfio->state_mutex);
+	while (new_state != pds_vfio->state) {
+		enum vfio_device_mig_state next_state;
+		int err;
+
+		err = vfio_mig_get_next_state(vdev, pds_vfio->state,
+					      new_state, &next_state);
+		if (err) {
+			res = ERR_PTR(err);
+			break;
+		}
+
+		res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
+		if (IS_ERR(res))
+			break;
+
+		pds_vfio->state = next_state;
+
+		if (WARN_ON(res && new_state != pds_vfio->state)) {
+			res = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	pds_vfio_state_mutex_unlock(pds_vfio);
+
+	return res;
+}
+
+static int
+pds_vfio_get_device_state(struct vfio_device *vdev,
+			  enum vfio_device_mig_state *current_state)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+
+	mutex_lock(&pds_vfio->state_mutex);
+	*current_state = pds_vfio->state;
+	pds_vfio_state_mutex_unlock(pds_vfio);
+	return 0;
+}
+
+static const struct vfio_migration_ops
+pds_vfio_lm_ops = {
+	.migration_set_state = pds_vfio_set_device_state,
+	.migration_get_state = pds_vfio_get_device_state,
+};
+
 static int
 pds_vfio_init_device(struct vfio_device *vdev)
 {
@@ -35,10 +127,19 @@  pds_vfio_init_device(struct vfio_device *vdev)
 	vfio_aux = pds_vfio_aux_get_drvdata(pds_vfio->pci_id);
 	if (vfio_aux) {
 		vfio_aux->pds_vfio = pds_vfio;
+		pds_vfio->coredev = vfio_aux->padev->aux_dev.dev.parent;
 		pds_vfio->vfio_aux = vfio_aux;
 		pds_vfio_put_aux_dev(vfio_aux);
 	}
 
+	vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+	vdev->mig_ops = &pds_vfio_lm_ops;
+
+	dev_dbg(&pdev->dev, "%s: PF %#04x VF %#04x (%d) vf_id %d domain %d vfio_aux %p pds_vfio %p\n",
+		__func__, pci_dev_id(pdev->physfn),
+		pds_vfio->pci_id, pds_vfio->pci_id, pds_vfio->vf_id,
+		pci_domain_nr(pdev->bus), pds_vfio->vfio_aux, pds_vfio);
+
 	return 0;
 }
 
@@ -54,18 +155,34 @@  pds_vfio_open_device(struct vfio_device *vdev)
 	if (err)
 		return err;
 
+	mutex_init(&pds_vfio->state_mutex);
+	dev_dbg(&pds_vfio->pdev->dev, "%s: %s => VFIO_DEVICE_STATE_RUNNING\n",
+		__func__, pds_vfio_lm_state(pds_vfio->state));
+	pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+
 	vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
 
 	return 0;
 }
 
+static void
+pds_vfio_close_device(struct vfio_device *vdev)
+{
+	struct pds_vfio_pci_device *pds_vfio =
+		container_of(vdev, struct pds_vfio_pci_device,
+			     vfio_coredev.vdev);
+
+	mutex_destroy(&pds_vfio->state_mutex);
+	vfio_pci_core_close_device(vdev);
+}
+
 static const struct vfio_device_ops
 pds_vfio_ops = {
 	.name = "pds-vfio",
 	.init = pds_vfio_init_device,
 	.release = vfio_pci_core_release_dev,
 	.open_device = pds_vfio_open_device,
-	.close_device = vfio_pci_core_close_device,
+	.close_device = pds_vfio_close_device,
 	.ioctl = vfio_pci_core_ioctl,
 	.device_feature = vfio_pci_core_ioctl_feature,
 	.read = vfio_pci_core_read,
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
index b16668693e1f..a09570eec6fa 100644
--- a/drivers/vfio/pci/pds/vfio_dev.h
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -7,10 +7,20 @@ 
 #include <linux/pci.h>
 #include <linux/vfio_pci_core.h>
 
+#include "lm.h"
+
 struct pds_vfio_pci_device {
 	struct vfio_pci_core_device vfio_coredev;
 	struct pci_dev *pdev;
 	struct pds_vfio_aux *vfio_aux;
+	struct device *coredev;
+
+	struct pds_vfio_lm_file *save_file;
+	struct pds_vfio_lm_file *restore_file;
+	struct mutex state_mutex; /* protect migration state */
+	enum vfio_device_mig_state state;
+	spinlock_t reset_lock; /* protect reset_done flow */
+	u8 deferred_reset;
 
 	int vf_id;
 	int pci_id;
@@ -20,5 +30,7 @@  const struct vfio_device_ops *
 pds_vfio_ops_info(void);
 struct pds_vfio_pci_device *
 pds_vfio_pci_drvdata(struct pci_dev *pdev);
+void
+pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
 
 #endif /* _VFIO_DEV_H_ */
diff --git a/include/linux/pds/pds_lm.h b/include/linux/pds/pds_lm.h
index fdaf2bf71d35..28ebd62f7583 100644
--- a/include/linux/pds/pds_lm.h
+++ b/include/linux/pds/pds_lm.h
@@ -8,5 +8,211 @@ 
 
 #define PDS_DEV_TYPE_LM_STR	"LM"
 #define PDS_LM_DEV_NAME		PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR
+#define PDS_LM_DEVICE_STATE_LENGTH		65536
+#define PDS_LM_CHECK_DEVICE_STATE_LENGTH(X)					\
+			PDS_CORE_SIZE_CHECK(union, PDS_LM_DEVICE_STATE_LENGTH, X)
+
+/*
+ * enum pds_lm_cmd_opcode - Live Migration Device commands
+ */
+enum pds_lm_cmd_opcode {
+	PDS_LM_CMD_HOST_VF_STATUS  = 1,
+
+	/* Device state commands */
+	PDS_LM_CMD_STATUS          = 16,
+	PDS_LM_CMD_SUSPEND         = 18,
+	PDS_LM_CMD_SUSPEND_STATUS  = 19,
+	PDS_LM_CMD_RESUME          = 20,
+	PDS_LM_CMD_SAVE            = 21,
+	PDS_LM_CMD_RESTORE         = 22,
+};
+
+/**
+ * struct pds_lm_cmd - generic command
+ * @opcode:	Opcode
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ * @rsvd2:	Structure padding to 60 Bytes
+ */
+struct pds_lm_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 vf_id;
+	u8     rsvd2[56];
+};
+
+/**
+ * struct pds_lm_comp - generic command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:	Structure padding to 16 Bytes
+ */
+struct pds_lm_comp {
+	u8 status;
+	u8 rsvd[15];
+};
+
+/**
+ * struct pds_lm_status_cmd - STATUS command
+ * @opcode:	Opcode
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ */
+struct pds_lm_status_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 vf_id;
+};
+
+/**
+ * struct pds_lm_status_comp - STATUS command completion
+ * @status:		Status of the command (enum pds_core_status_code)
+ * @rsvd:		Word boundary padding
+ * @comp_index:		Index in the desc ring for which this is the completion
+ * @size:		Size of the device state
+ * @rsvd2:		Word boundary padding
+ * @color:		Color bit
+ */
+struct pds_lm_status_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	union {
+		__le64 size;
+		u8     rsvd2[11];
+	} __packed;
+	u8     color;
+};
+
+/**
+ * struct pds_lm_suspend_cmd - SUSPEND command
+ * @opcode:	Opcode PDS_LM_CMD_SUSPEND
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ */
+struct pds_lm_suspend_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 vf_id;
+};
+
+/**
+ * struct pds_lm_suspend_comp - SUSPEND command completion
+ * @status:		Status of the command (enum pds_core_status_code)
+ * @rsvd:		Word boundary padding
+ * @comp_index:		Index in the desc ring for which this is the completion
+ * @state_size:		Size of the device state computed post suspend.
+ * @rsvd2:		Word boundary padding
+ * @color:		Color bit
+ */
+struct pds_lm_suspend_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	union {
+		__le64 state_size;
+		u8     rsvd2[11];
+	} __packed;
+	u8     color;
+};
+
+/**
+ * struct pds_lm_suspend_status_cmd - SUSPEND status command
+ * @opcode:	Opcode PDS_LM_CMD_SUSPEND_STATUS
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ */
+struct pds_lm_suspend_status_cmd {
+	u8 opcode;
+	u8 rsvd;
+	__le16 vf_id;
+};
+
+/**
+ * struct pds_lm_resume_cmd - RESUME command
+ * @opcode:	Opcode PDS_LM_CMD_RESUME
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ */
+struct pds_lm_resume_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 vf_id;
+};
+
+/**
+ * struct pds_lm_sg_elem - Transmit scatter-gather (SG) descriptor element
+ * @addr:	DMA address of SG element data buffer
+ * @len:	Length of SG element data buffer, in bytes
+ * @rsvd:	Word boundary padding
+ */
+struct pds_lm_sg_elem {
+	__le64 addr;
+	__le32 len;
+	__le16 rsvd[2];
+};
+
+/**
+ * struct pds_lm_save_cmd - SAVE command
+ * @opcode:	Opcode PDS_LM_CMD_SAVE
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ * @rsvd2:	Word boundary padding
+ * @sgl_addr:	IOVA address of the SGL to dma the device state
+ * @num_sge:	Total number of SG elements
+ */
+struct pds_lm_save_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 vf_id;
+	u8     rsvd2[4];
+	__le64 sgl_addr;
+	__le32 num_sge;
+} __packed;
+
+/**
+ * struct pds_lm_restore_cmd - RESTORE command
+ * @opcode:	Opcode PDS_LM_CMD_RESTORE
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ * @rsvd2:	Word boundary padding
+ * @sgl_addr:	IOVA address of the SGL to dma the device state
+ * @num_sge:	Total number of SG elements
+ */
+struct pds_lm_restore_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 vf_id;
+	u8     rsvd2[4];
+	__le64 sgl_addr;
+	__le32 num_sge;
+} __packed;
+
+/**
+ * union pds_lm_dev_state - device state information
+ * @words:	Device state words
+ */
+union pds_lm_dev_state {
+	__le32 words[PDS_LM_DEVICE_STATE_LENGTH / sizeof(__le32)];
+};
+
+enum pds_lm_host_vf_status {
+	PDS_LM_STA_NONE = 0,
+	PDS_LM_STA_IN_PROGRESS,
+	PDS_LM_STA_MAX,
+};
+
+/**
+ * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
+ * @opcode:	Opcode PDS_LM_CMD_HOST_VF_STATUS
+ * @rsvd:	Word boundary padding
+ * @vf_id:	VF id
+ * @status:	Current LM status of host VF driver (enum pds_lm_host_vf_status)
+ */
+struct pds_lm_host_vf_status_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 vf_id;
+	u8     status;
+};
 
 #endif /* _PDS_LM_H_ */
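
SAVE and RESTORE share the same request layout: the driver DMA-maps an array of pds_lm_sg_elem entries covering the migration file and passes that array's IOVA and element count to the device. A minimal sketch of filling the SAVE request, assuming the SGL has already been mapped (the helper name is illustrative, not part of this series):

/* Illustrative only: populate a SAVE request before posting it to the device */
static void pds_lm_fill_save_cmd(struct pds_lm_save_cmd *cmd, u16 vf_id,
				 dma_addr_t sgl_addr, u32 num_sge)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->opcode = PDS_LM_CMD_SAVE;
	cmd->vf_id = cpu_to_le16(vf_id);
	cmd->sgl_addr = cpu_to_le64(sgl_addr);
	cmd->num_sge = cpu_to_le32(num_sge);
}

A RESTORE request is filled the same way with PDS_LM_CMD_RESTORE, since struct pds_lm_restore_cmd has an identical layout.
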