@@ -6,5 +6,6 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds_vfio.o
pds_vfio-y := \
aux_drv.o \
cmds.o \
+ lm.o \
pci_drv.o \
vfio_dev.o
@@ -77,6 +77,7 @@ pds_vfio_aux_probe(struct auxiliary_device *aux_dev,
dev_dbg(dev, "%s: id %#04x busnr %#x devfn %#x bus %p pds_vfio %p\n",
__func__, padev->id, busnr, devfn, bus, vfio_aux->pds_vfio);
+ vfio_aux->pds_vfio->coredev = aux_dev->dev.parent;
vfio_aux->pds_vfio->vfio_aux = vfio_aux;
vfio_aux->padrv.event_handler = pds_vfio_aux_notify_handler;
@@ -3,9 +3,11 @@
#include <linux/io.h>
#include <linux/types.h>
+#include <linux/delay.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>
+#include <linux/pds/pds_lm.h>
#include <linux/pds/pds_auxbus.h>
#include "vfio_dev.h"
@@ -28,3 +30,313 @@ pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
padev->ops->unregister_client(padev);
}
+
+#define SUSPEND_TIMEOUT_S 5
+#define SUSPEND_CHECK_INTERVAL_MS 1
+
+static int
+pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pds_lm_suspend_status_cmd cmd = {
+ .opcode = PDS_LM_CMD_SUSPEND_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ };
+ struct pci_dev *pdev = pds_vfio->pdev;
+ struct pds_lm_comp comp = { 0 };
+ struct pds_auxiliary_dev *padev;
+ unsigned long time_limit;
+ unsigned long time_start;
+ unsigned long time_done;
+ int err;
+
+ padev = pds_vfio->vfio_aux->padev;
+
+ time_start = jiffies;
+ time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
+ do {
+ err = padev->ops->adminq_cmd(padev,
+ (union pds_core_adminq_cmd *)&cmd,
+ sizeof(cmd),
+ (union pds_core_adminq_comp *)&comp,
+ PDS_AQ_FLAG_FASTPOLL);
+ if (err != -EAGAIN)
+ break;
+
+ msleep(SUSPEND_CHECK_INTERVAL_MS);
+ } while (time_before(jiffies, time_limit));
+
+ time_done = jiffies;
+ dev_dbg(&pdev->dev, "%s: vf%u: Suspend comp received in %d msecs\n",
+ __func__, pds_vfio->vf_id,
+ jiffies_to_msecs(time_done - time_start));
+
+ /* Check the results */
+ if (time_after_eq(time_done, time_limit)) {
+ dev_err(&pdev->dev, "%s: vf%u: Suspend comp timeout\n", __func__,
+ pds_vfio->vf_id);
+ err = -ETIMEDOUT;
+ }
+
+ return err;
+}
+
+int
+pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pds_lm_suspend_cmd cmd = {
+ .opcode = PDS_LM_CMD_SUSPEND,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ };
+ struct pds_lm_suspend_comp comp = {0};
+ struct pci_dev *pdev = pds_vfio->pdev;
+ struct pds_auxiliary_dev *padev;
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
+
+ padev = pds_vfio->vfio_aux->padev;
+ err = padev->ops->adminq_cmd(padev,
+ (union pds_core_adminq_cmd *)&cmd,
+ sizeof(cmd),
+ (union pds_core_adminq_comp *)&comp,
+ PDS_AQ_FLAG_FASTPOLL);
+ if (err) {
+ dev_err(&pdev->dev, "vf%u: Suspend failed: %pe\n",
+ pds_vfio->vf_id, ERR_PTR(err));
+ return err;
+ }
+
+ return pds_vfio_suspend_wait_device_cmd(pds_vfio);
+}
+
+int
+pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pds_lm_resume_cmd cmd = {
+ .opcode = PDS_LM_CMD_RESUME,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ };
+ struct pds_auxiliary_dev *padev;
+ struct pds_lm_comp comp = {0};
+
+ dev_dbg(&pds_vfio->pdev->dev, "vf%u: Resume device\n", pds_vfio->vf_id);
+
+ padev = pds_vfio->vfio_aux->padev;
+ return padev->ops->adminq_cmd(padev,
+ (union pds_core_adminq_cmd *)&cmd,
+ sizeof(cmd),
+ (union pds_core_adminq_comp *)&comp,
+ 0);
+}
+
+int
+pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
+{
+ struct pds_lm_status_cmd cmd = {
+ .opcode = PDS_LM_CMD_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ };
+ struct pds_lm_status_comp comp = {0};
+ struct pds_auxiliary_dev *padev;
+ int err = 0;
+
+ dev_dbg(&pds_vfio->pdev->dev, "vf%u: Get migration status\n",
+ pds_vfio->vf_id);
+
+ padev = pds_vfio->vfio_aux->padev;
+ err = padev->ops->adminq_cmd(padev,
+ (union pds_core_adminq_cmd *)&cmd,
+ sizeof(cmd),
+ (union pds_core_adminq_comp *)&comp,
+ 0);
+ if (err)
+ return err;
+
+ *size = le64_to_cpu(comp.size);
+ return 0;
+}
+
+static int
+pds_vfio_dma_map_lm_file(struct device *dev, enum dma_data_direction dir,
+ struct pds_vfio_lm_file *lm_file)
+{
+ struct pds_lm_sg_elem *sgl, *sge;
+ struct scatterlist *sg;
+ int err = 0;
+ int i;
+
+ if (!lm_file)
+ return -EINVAL;
+
+ /* dma map file pages */
+ err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
+ if (err)
+ goto err_dma_map_sg;
+
+ lm_file->num_sge = lm_file->sg_table.nents;
+
+ /* alloc sgl */
+ sgl = dma_alloc_coherent(dev, lm_file->num_sge *
+ sizeof(struct pds_lm_sg_elem),
+ &lm_file->sgl_addr, GFP_KERNEL);
+ if (!sgl) {
+ err = -ENOMEM;
+ goto err_alloc_sgl;
+ }
+
+ lm_file->sgl = sgl;
+
+ /* fill sgl */
+ sge = sgl;
+ for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
+ sge->addr = cpu_to_le64(sg_dma_address(sg));
+ sge->len = cpu_to_le32(sg_dma_len(sg));
+ dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
+ sge++;
+ }
+
+ return 0;
+
+err_alloc_sgl:
+ dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+err_dma_map_sg:
+ return err;
+}
+
+static void
+pds_vfio_dma_unmap_lm_file(struct device *dev, enum dma_data_direction dir,
+ struct pds_vfio_lm_file *lm_file)
+{
+ if (!lm_file)
+ return;
+
+ /* free sgl */
+ if (lm_file->sgl) {
+ dma_free_coherent(dev, lm_file->num_sge *
+ sizeof(struct pds_lm_sg_elem),
+ lm_file->sgl, lm_file->sgl_addr);
+ lm_file->sgl = NULL;
+ lm_file->sgl_addr = DMA_MAPPING_ERROR;
+ lm_file->num_sge = 0;
+ }
+
+ /* dma unmap file pages */
+ dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+}
+
+int
+pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pds_lm_save_cmd cmd = {
+ .opcode = PDS_LM_CMD_SAVE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ };
+ struct pci_dev *pdev = pds_vfio->pdev;
+ struct pds_vfio_lm_file *lm_file;
+ struct pds_auxiliary_dev *padev;
+ struct pds_lm_comp comp = {0};
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
+
+ lm_file = pds_vfio->save_file;
+
+ padev = pds_vfio->vfio_aux->padev;
+ err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
+ if (err) {
+ dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+ cmd.num_sge = cpu_to_le32(lm_file->num_sge);
+
+ err = padev->ops->adminq_cmd(padev,
+ (union pds_core_adminq_cmd *)&cmd,
+ sizeof(cmd),
+ (union pds_core_adminq_comp *)&comp,
+ 0);
+ if (err)
+ dev_err(&pdev->dev, "failed to get migration state: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_FROM_DEVICE, lm_file);
+
+ return err;
+}
+
+int
+pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pds_lm_restore_cmd cmd = {
+ .opcode = PDS_LM_CMD_RESTORE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ };
+ struct pci_dev *pdev = pds_vfio->pdev;
+ struct pds_vfio_lm_file *lm_file;
+ struct pds_auxiliary_dev *padev;
+ struct pds_lm_comp comp = {0};
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
+
+ lm_file = pds_vfio->restore_file;
+
+ padev = pds_vfio->vfio_aux->padev;
+ err = pds_vfio_dma_map_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
+ if (err) {
+ dev_err(&pdev->dev, "failed to map restore migration file: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ cmd.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+ cmd.num_sge = cpu_to_le32(lm_file->num_sge);
+
+ err = padev->ops->adminq_cmd(padev,
+ (union pds_core_adminq_cmd *)&cmd,
+ sizeof(cmd),
+ (union pds_core_adminq_comp *)&comp,
+ 0);
+ if (err)
+ dev_err(&pdev->dev, "failed to set migration state: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio_dma_unmap_lm_file(pds_vfio->coredev, DMA_TO_DEVICE, lm_file);
+
+ return err;
+}
+
+void
+pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ enum pds_lm_host_vf_status vf_status)
+{
+ struct pds_auxiliary_dev *padev = pds_vfio->vfio_aux->padev;
+ struct pds_lm_host_vf_status_cmd cmd = {
+ .opcode = PDS_LM_CMD_HOST_VF_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .status = vf_status,
+ };
+ struct pci_dev *pdev = pds_vfio->pdev;
+ struct pds_lm_comp comp = {0};
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Set host VF LM status: %u",
+ pds_vfio->vf_id, cmd.status);
+ if (vf_status != PDS_LM_STA_IN_PROGRESS &&
+ vf_status != PDS_LM_STA_NONE) {
+ dev_warn(&pdev->dev, "Invalid host VF migration status, %d\n",
+ vf_status);
+ return;
+ }
+
+ err = padev->ops->adminq_cmd(padev,
+ (union pds_core_adminq_cmd *)&cmd,
+ sizeof(cmd),
+ (union pds_core_adminq_comp *)&comp,
+ 0);
+ if (err)
+ dev_warn(&pdev->dev, "failed to send host VF migration status: %pe\n",
+ ERR_PTR(err));
+}
@@ -4,11 +4,26 @@
#ifndef _CMDS_H_
#define _CMDS_H_
+#include <linux/pds/pds_lm.h>
+
struct pds_vfio_pci_device;
int
pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
void
pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_get_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
+int
+pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+int
+pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+void
+pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ enum pds_lm_host_vf_status vf_status);
#endif /* _CMDS_H_ */
new file mode 100644
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "cmds.h"
+#include "vfio_dev.h"
+
+#define PDS_VFIO_LM_FILENAME "pds_vfio_lm"
+
+const char *
+pds_vfio_lm_state(enum vfio_device_mig_state state)
+{
+ switch (state) {
+ case VFIO_DEVICE_STATE_ERROR:
+ return "VFIO_DEVICE_STATE_ERROR";
+ case VFIO_DEVICE_STATE_STOP:
+ return "VFIO_DEVICE_STATE_STOP";
+ case VFIO_DEVICE_STATE_RUNNING:
+ return "VFIO_DEVICE_STATE_RUNNING";
+ case VFIO_DEVICE_STATE_STOP_COPY:
+ return "VFIO_DEVICE_STATE_STOP_COPY";
+ case VFIO_DEVICE_STATE_RESUMING:
+ return "VFIO_DEVICE_STATE_RESUMING";
+ case VFIO_DEVICE_STATE_RUNNING_P2P:
+ return "VFIO_DEVICE_STATE_RUNNING_P2P";
+ default:
+ return "VFIO_DEVICE_STATE_INVALID";
+ }
+
+ return "VFIO_DEVICE_STATE_INVALID";
+}
+
+static struct pds_vfio_lm_file *
+pds_vfio_get_lm_file(const char *name, const struct file_operations *fops,
+ int flags, u64 size)
+{
+ struct pds_vfio_lm_file *lm_file = NULL;
+ unsigned long long npages;
+ unsigned long long i;
+ struct page **pages;
+ int err = 0;
+
+ if (!size)
+ return NULL;
+
+ /* Alloc file structure */
+ lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
+ if (!lm_file)
+ return NULL;
+
+ /* Create file */
+ lm_file->filep = anon_inode_getfile(name, fops, lm_file, flags);
+ if (!lm_file->filep)
+ goto err_get_file;
+
+ stream_open(lm_file->filep->f_inode, lm_file->filep);
+ mutex_init(&lm_file->lock);
+
+ lm_file->size = size;
+
+ /* Allocate memory for file pages */
+ npages = DIV_ROUND_UP_ULL(lm_file->size, PAGE_SIZE);
+
+ pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ goto err_alloc_pages;
+
+ for (i = 0; i < npages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto err_alloc_page;
+ }
+
+ lm_file->pages = pages;
+ lm_file->npages = npages;
+ lm_file->alloc_size = npages * PAGE_SIZE;
+
+ /* Create scatterlist of file pages to use for DMA mapping later */
+ err = sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages,
+ 0, size, GFP_KERNEL);
+ if (err)
+ goto err_alloc_sg_table;
+
+ return lm_file;
+
+err_alloc_sg_table:
+err_alloc_page:
+ /* free allocated pages */
+ for (i = 0; i < npages && pages[i]; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+err_alloc_pages:
+ fput(lm_file->filep);
+ mutex_destroy(&lm_file->lock);
+err_get_file:
+ kfree(lm_file);
+
+ return NULL;
+}
+
+static void
+pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
+{
+ unsigned long long i;
+
+ mutex_lock(&lm_file->lock);
+
+ lm_file->size = 0;
+ lm_file->alloc_size = 0;
+
+ /* Free scatter list of file pages*/
+ sg_free_table(&lm_file->sg_table);
+
+ /* Free allocated file pages */
+ for (i = 0; i < lm_file->npages && lm_file->pages[i]; i++)
+ __free_page(lm_file->pages[i]);
+ kfree(lm_file->pages);
+ lm_file->pages = NULL;
+
+ /* Delete file */
+ fput(lm_file->filep);
+ lm_file->filep = NULL;
+
+ mutex_unlock(&lm_file->lock);
+
+ mutex_destroy(&lm_file->lock);
+
+ /* Free file structure */
+ kfree(lm_file);
+}
+
+void
+pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (!pds_vfio->save_file)
+ return;
+
+ pds_vfio_put_lm_file(pds_vfio->save_file);
+ pds_vfio->save_file = NULL;
+}
+
+void
+pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (!pds_vfio->restore_file)
+ return;
+
+ pds_vfio_put_lm_file(pds_vfio->restore_file);
+ pds_vfio->restore_file = NULL;
+}
+
+static struct page *
+pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
+ lm_file->last_offset = 0;
+ lm_file->last_offset_sg = lm_file->sg_table.sgl;
+ lm_file->sg_last_entry = 0;
+ }
+
+ cur_offset = lm_file->last_offset;
+
+ for_each_sg(lm_file->last_offset_sg, sg,
+ lm_file->sg_table.orig_nents - lm_file->sg_last_entry,
+ i) {
+ if (offset < sg->length + cur_offset) {
+ lm_file->last_offset_sg = sg;
+ lm_file->sg_last_entry += i;
+ lm_file->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+
+ return NULL;
+}
+
+static int
+pds_vfio_release_file(struct inode *inode, struct file *filp)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+
+ lm_file->size = 0;
+
+ return 0;
+}
+
+static ssize_t
+pds_vfio_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ mutex_lock(&lm_file->lock);
+ if (*pos > lm_file->size) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ len = min_t(size_t, lm_file->size - *pos, len);
+ while (len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int err;
+
+ page_offset = (*pos) % PAGE_SIZE;
+ page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+ if (!page) {
+ if (done == 0)
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ err = copy_to_user(buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (err) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += page_len;
+ len -= page_len;
+ done += page_len;
+ buf += page_len;
+ }
+
+out_unlock:
+ mutex_unlock(&lm_file->lock);
+ return done;
+}
+
+static const struct file_operations
+pds_vfio_save_fops = {
+ .owner = THIS_MODULE,
+ .read = pds_vfio_save_read,
+ .release = pds_vfio_release_file,
+ .llseek = no_llseek,
+};
+
+static int
+pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pci_dev *pdev = pds_vfio->pdev;
+ struct pds_vfio_lm_file *lm_file;
+ int err = 0;
+ u64 size;
+
+ /* Get live migration state size in this state */
+ err = pds_vfio_get_lm_status_cmd(pds_vfio, &size);
+ if (err) {
+ dev_err(&pdev->dev, "failed to get save status: %pe\n",
+ ERR_PTR(err));
+ goto err_get_lm_status;
+ }
+
+ dev_dbg(&pdev->dev, "save status, size = %lld\n", size);
+
+ if (!size) {
+ err = -EIO;
+ dev_err(&pdev->dev, "invalid state size\n");
+ goto err_get_lm_status;
+ }
+
+ lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
+ &pds_vfio_save_fops,
+ O_RDONLY, size);
+ if (!lm_file) {
+ err = -ENOENT;
+ dev_err(&pdev->dev, "failed to create save file\n");
+ goto err_get_lm_file;
+ }
+
+ dev_dbg(&pdev->dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
+ lm_file->size, lm_file->alloc_size, lm_file->npages);
+
+ pds_vfio->save_file = lm_file;
+
+ return 0;
+
+err_get_lm_file:
+err_get_lm_status:
+ return err;
+}
+
+static ssize_t
+pds_vfio_restore_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+ loff_t requested_length;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+
+ if (*pos < 0 ||
+ check_add_overflow((loff_t)len, *pos, &requested_length))
+ return -EINVAL;
+
+ mutex_lock(&lm_file->lock);
+
+ while (len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int err;
+
+ page_offset = (*pos) % PAGE_SIZE;
+ page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+ if (!page) {
+ if (done == 0)
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ err = copy_from_user(to_buff + page_offset, buf, page_len);
+ kunmap_local(to_buff);
+ if (err) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += page_len;
+ len -= page_len;
+ done += page_len;
+ buf += page_len;
+ lm_file->size += page_len;
+ }
+out_unlock:
+ mutex_unlock(&lm_file->lock);
+ return done;
+}
+
+static const struct file_operations
+pds_vfio_restore_fops = {
+ .owner = THIS_MODULE,
+ .write = pds_vfio_restore_write,
+ .release = pds_vfio_release_file,
+ .llseek = no_llseek,
+};
+
+static int
+pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pci_dev *pdev = pds_vfio->pdev;
+ struct pds_vfio_lm_file *lm_file;
+ int err = 0;
+ u64 size;
+
+ size = sizeof(union pds_lm_dev_state);
+
+ dev_dbg(&pdev->dev, "restore status, size = %lld\n", size);
+
+ if (!size) {
+ err = -EIO;
+ dev_err(&pdev->dev, "invalid state size");
+ goto err_get_lm_status;
+ }
+
+ lm_file = pds_vfio_get_lm_file(PDS_VFIO_LM_FILENAME,
+ &pds_vfio_restore_fops,
+ O_WRONLY, size);
+ if (!lm_file) {
+ err = -ENOENT;
+ dev_err(&pdev->dev, "failed to create restore file");
+ goto err_get_lm_file;
+ }
+ pds_vfio->restore_file = lm_file;
+
+ return 0;
+
+err_get_lm_file:
+err_get_lm_status:
+ return err;
+}
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state next)
+{
+ enum vfio_device_mig_state cur = pds_vfio->state;
+ struct device *dev = &pds_vfio->pdev->dev;
+ int err = 0;
+
+ dev_dbg(dev, "%s => %s\n",
+ pds_vfio_lm_state(cur), pds_vfio_lm_state(next));
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
+ /* Device is already stopped
+ * create save device data file & get device state from firmware
+ */
+ err = pds_vfio_get_save_file(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Get device state */
+ err = pds_vfio_get_lm_state_cmd(pds_vfio);
+ if (err) {
+ pds_vfio_put_save_file(pds_vfio);
+ return ERR_PTR(err);
+ }
+
+ return pds_vfio->save_file->filep;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
+ /* Device is already stopped
+ * delete the save device state file
+ */
+ pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
+ PDS_LM_STA_NONE);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
+ /* create resume device data file */
+ err = pds_vfio_get_restore_file(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ return pds_vfio->restore_file->filep;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
+ /* Set device state */
+ err = pds_vfio_set_lm_state_cmd(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ /* delete resume device data file */
+ pds_vfio_put_restore_file(pds_vfio);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
+ /* Device should be stopped
+ * no interrupts, dma or change in internal state
+ */
+ err = pds_vfio_suspend_device_cmd(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
+ /* Device should be functional
+ * interrupts, dma, mmio or changes to internal state is allowed
+ */
+ err = pds_vfio_resume_device_cmd(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
+ PDS_LM_STA_NONE);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P)
+ return NULL;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP)
+ return NULL;
+
+ return ERR_PTR(-EINVAL);
+}
new file mode 100644
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _LM_H_
+#define _LM_H_
+
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_lm.h>
+
+struct pds_vfio_lm_file {
+ struct file *filep;
+ struct mutex lock; /* protect live migration data file */
+ u64 size; /* Size with valid data */
+ u64 alloc_size; /* Total allocated size. Always >= len */
+ struct page **pages; /* Backing pages for file */
+ unsigned long long npages;
+ struct sg_table sg_table; /* SG table for backing pages */
+ struct pds_lm_sg_elem *sgl; /* DMA mapping */
+ dma_addr_t sgl_addr;
+ u16 num_sge;
+ struct scatterlist *last_offset_sg; /* Iterator */
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
+struct pds_vfio_pci_device;
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state next);
+const char *
+pds_vfio_lm_state(enum vfio_device_mig_state state);
+void
+pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
+void
+pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _LM_H_ */
@@ -60,12 +60,27 @@ pds_vfio_pci_table[] = {
};
MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);
+static void
+pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+ pds_vfio_reset(pds_vfio);
+}
+
+static const struct
+pci_error_handlers pds_vfio_pci_err_handlers = {
+ .reset_done = pds_vfio_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
static struct pci_driver
pds_vfio_pci_driver = {
.name = PDS_VFIO_DRV_NAME,
.id_table = pds_vfio_pci_table,
.probe = pds_vfio_pci_probe,
.remove = pds_vfio_pci_remove,
+ .err_handler = &pds_vfio_pci_err_handlers,
.driver_managed_dma = true,
};
@@ -4,6 +4,7 @@
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>
+#include "lm.h"
#include "vfio_dev.h"
#include "aux_drv.h"
@@ -16,6 +17,106 @@ pds_vfio_pci_drvdata(struct pci_dev *pdev)
vfio_coredev);
}
+static void
+pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+{
+again:
+ spin_lock(&pds_vfio->reset_lock);
+ if (pds_vfio->deferred_reset) {
+ pds_vfio->deferred_reset = false;
+ if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
+ pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ }
+ spin_unlock(&pds_vfio->reset_lock);
+ goto again;
+ }
+ mutex_unlock(&pds_vfio->state_mutex);
+ spin_unlock(&pds_vfio->reset_lock);
+}
+
+void
+pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
+{
+ spin_lock(&pds_vfio->reset_lock);
+ pds_vfio->deferred_reset = true;
+ if (!mutex_trylock(&pds_vfio->state_mutex)) {
+ spin_unlock(&pds_vfio->reset_lock);
+ return;
+ }
+ spin_unlock(&pds_vfio->reset_lock);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+}
+
+static struct file *
+pds_vfio_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ struct file *res = NULL;
+
+ if (!pds_vfio->vfio_aux)
+ return ERR_PTR(-ENODEV);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ while (new_state != pds_vfio->state) {
+ enum vfio_device_mig_state next_state;
+
+ int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
+ new_state, &next_state);
+ if (err) {
+ res = ERR_PTR(err);
+ break;
+ }
+
+ res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
+ if (IS_ERR(res))
+ break;
+
+ pds_vfio->state = next_state;
+
+ if (WARN_ON(res && new_state != pds_vfio->state)) {
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ pds_vfio_state_mutex_unlock(pds_vfio);
+
+ return res;
+}
+
+static int
+pds_vfio_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *current_state)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ *current_state = pds_vfio->state;
+ pds_vfio_state_mutex_unlock(pds_vfio);
+ return 0;
+}
+
+static int
+pds_vfio_get_device_state_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ *stop_copy_length = PDS_LM_DEVICE_STATE_LENGTH;
+ return 0;
+}
+
+static const struct vfio_migration_ops
+pds_vfio_lm_ops = {
+ .migration_set_state = pds_vfio_set_device_state,
+ .migration_get_state = pds_vfio_get_device_state,
+ .migration_get_data_size = pds_vfio_get_device_state_size
+};
+
static int
pds_vfio_init_device(struct vfio_device *vdev)
{
@@ -32,6 +133,9 @@ pds_vfio_init_device(struct vfio_device *vdev)
pds_vfio->vf_id = pci_iov_vf_id(pdev);
pds_vfio->pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
+ vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+ vdev->mig_ops = &pds_vfio_lm_ops;
+
dev_dbg(&pdev->dev, "%s: PF %#04x VF %#04x (%d) vf_id %d domain %d pds_vfio %p\n",
__func__, pci_dev_id(pdev->physfn),
pds_vfio->pci_id, pds_vfio->pci_id, pds_vfio->vf_id,
@@ -52,18 +156,34 @@ pds_vfio_open_device(struct vfio_device *vdev)
if (err)
return err;
+ mutex_init(&pds_vfio->state_mutex);
+ dev_dbg(&pds_vfio->pdev->dev, "%s: %s => VFIO_DEVICE_STATE_RUNNING\n",
+ __func__, pds_vfio_lm_state(pds_vfio->state));
+ pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+
vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
return 0;
}
+static void
+pds_vfio_close_device(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_destroy(&pds_vfio->state_mutex);
+ vfio_pci_core_close_device(vdev);
+}
+
static const struct vfio_device_ops
pds_vfio_ops = {
.name = "pds-vfio",
.init = pds_vfio_init_device,
.release = vfio_pci_core_release_dev,
.open_device = pds_vfio_open_device,
- .close_device = vfio_pci_core_close_device,
+ .close_device = pds_vfio_close_device,
.ioctl = vfio_pci_core_ioctl,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
@@ -7,10 +7,20 @@
#include <linux/pci.h>
#include <linux/vfio_pci_core.h>
+#include "lm.h"
+
struct pds_vfio_pci_device {
struct vfio_pci_core_device vfio_coredev;
struct pci_dev *pdev;
struct pds_vfio_aux *vfio_aux;
+ struct device *coredev;
+
+ struct pds_vfio_lm_file *save_file;
+ struct pds_vfio_lm_file *restore_file;
+ struct mutex state_mutex; /* protect migration state */
+ enum vfio_device_mig_state state;
+ spinlock_t reset_lock; /* protect reset_done flow */
+ u8 deferred_reset;
int vf_id;
int pci_id;
@@ -20,5 +30,7 @@ const struct vfio_device_ops *
pds_vfio_ops_info(void);
struct pds_vfio_pci_device *
pds_vfio_pci_drvdata(struct pci_dev *pdev);
+void
+pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
#endif /* _VFIO_DEV_H_ */
@@ -8,5 +8,211 @@
#define PDS_DEV_TYPE_LM_STR "LM"
#define PDS_LM_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR
+#define PDS_LM_DEVICE_STATE_LENGTH 65536
+#define PDS_LM_CHECK_DEVICE_STATE_LENGTH(X) \
+ PDS_CORE_SIZE_CHECK(union, PDS_LM_DEVICE_STATE_LENGTH, X)
+
+/*
+ * enum pds_lm_cmd_opcode - Live Migration Device commands
+ */
+enum pds_lm_cmd_opcode {
+ PDS_LM_CMD_HOST_VF_STATUS = 1,
+
+ /* Device state commands */
+ PDS_LM_CMD_STATUS = 16,
+ PDS_LM_CMD_SUSPEND = 18,
+ PDS_LM_CMD_SUSPEND_STATUS = 19,
+ PDS_LM_CMD_RESUME = 20,
+ PDS_LM_CMD_SAVE = 21,
+ PDS_LM_CMD_RESTORE = 22,
+};
+
+/**
+ * struct pds_lm_cmd - generic command
+ * @opcode: Opcode
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ * @rsvd2: Structure padding to 60 Bytes
+ */
+struct pds_lm_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+ u8 rsvd2[56];
+};
+
+/**
+ * struct pds_lm_comp - generic command completion
+ * @status: Status of the command (enum pds_core_status_code)
+ * @rsvd: Structure padding to 16 Bytes
+ */
+struct pds_lm_comp {
+ u8 status;
+ u8 rsvd[15];
+};
+
+/**
+ * struct pds_lm_status_cmd - STATUS command
+ * @opcode: Opcode
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ */
+struct pds_lm_status_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+};
+
+/**
+ * struct pds_lm_status_comp - STATUS command completion
+ * @status: Status of the command (enum pds_core_status_code)
+ * @rsvd: Word boundary padding
+ * @comp_index: Index in the desc ring for which this is the completion
+ * @size: Size of the device state
+ * @rsvd2: Word boundary padding
+ * @color: Color bit
+ */
+struct pds_lm_status_comp {
+ u8 status;
+ u8 rsvd;
+ __le16 comp_index;
+ union {
+ __le64 size;
+ u8 rsvd2[11];
+ } __packed;
+ u8 color;
+};
+
+/**
+ * struct pds_lm_suspend_cmd - SUSPEND command
+ * @opcode: Opcode PDS_LM_CMD_SUSPEND
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ */
+struct pds_lm_suspend_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+};
+
+/**
+ * struct pds_lm_suspend_comp - SUSPEND command completion
+ * @status: Status of the command (enum pds_core_status_code)
+ * @rsvd: Word boundary padding
+ * @comp_index: Index in the desc ring for which this is the completion
+ * @state_size: Size of the device state computed post suspend.
+ * @rsvd2: Word boundary padding
+ * @color: Color bit
+ */
+struct pds_lm_suspend_comp {
+ u8 status;
+ u8 rsvd;
+ __le16 comp_index;
+ union {
+ __le64 state_size;
+ u8 rsvd2[11];
+ } __packed;
+ u8 color;
+};
+
+/**
+ * struct pds_lm_suspend_status_cmd - SUSPEND status command
+ * @opcode: Opcode PDS_AQ_CMD_LM_SUSPEND_STATUS
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ */
+struct pds_lm_suspend_status_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+};
+
+/**
+ * struct pds_lm_resume_cmd - RESUME command
+ * @opcode: Opcode PDS_LM_CMD_RESUME
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ */
+struct pds_lm_resume_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+};
+
+/**
+ * struct pds_lm_sg_elem - Transmit scatter-gather (SG) descriptor element
+ * @addr: DMA address of SG element data buffer
+ * @len: Length of SG element data buffer, in bytes
+ * @rsvd: Word boundary padding
+ */
+struct pds_lm_sg_elem {
+ __le64 addr;
+ __le32 len;
+ __le16 rsvd[2];
+};
+
+/**
+ * struct pds_lm_save_cmd - SAVE command
+ * @opcode: Opcode PDS_LM_CMD_SAVE
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ * @rsvd2: Word boundary padding
+ * @sgl_addr: IOVA address of the SGL to dma the device state
+ * @num_sge: Total number of SG elements
+ */
+struct pds_lm_save_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+ u8 rsvd2[4];
+ __le64 sgl_addr;
+ __le32 num_sge;
+} __packed;
+
+/**
+ * struct pds_lm_restore_cmd - RESTORE command
+ * @opcode: Opcode PDS_LM_CMD_RESTORE
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ * @rsvd2: Word boundary padding
+ * @sgl_addr: IOVA address of the SGL to dma the device state
+ * @num_sge: Total number of SG elements
+ */
+struct pds_lm_restore_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+ u8 rsvd2[4];
+ __le64 sgl_addr;
+ __le32 num_sge;
+} __packed;
+
+/**
+ * union pds_lm_dev_state - device state information
+ * @words: Device state words
+ */
+union pds_lm_dev_state {
+ __le32 words[PDS_LM_DEVICE_STATE_LENGTH / sizeof(__le32)];
+};
+
+enum pds_lm_host_vf_status {
+ PDS_LM_STA_NONE = 0,
+ PDS_LM_STA_IN_PROGRESS,
+ PDS_LM_STA_MAX,
+};
+
+/**
+ * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
+ * @opcode: Opcode PDS_LM_CMD_HOST_VF_STATUS
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ * @status: Current LM status of host VF driver (enum pds_lm_host_status)
+ */
+struct pds_lm_host_vf_status_cmd {
+ u8 opcode;
+ u8 rsvd;
+ __le16 vf_id;
+ u8 status;
+};
#endif /* _PDS_LM_H_ */