From patchwork Mon May 18 02:43:17 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554619 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id B0B9360D for ; Mon, 18 May 2020 02:53:46 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 9DA12207F9 for ; Mon, 18 May 2020 02:53:46 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726726AbgERCxn (ORCPT ); Sun, 17 May 2020 22:53:43 -0400 Received: from mga06.intel.com ([134.134.136.31]:38912 "EHLO mga06.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726675AbgERCxn (ORCPT ); Sun, 17 May 2020 22:53:43 -0400 IronPort-SDR: v0031KGH8sPzZNW36l48JiqLD/Ypcles9iwOm9NYyPTbJvPwu16AgLt8YEuwUX9YhZBGwVDpLz hI4jkWd3plUw== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 19:53:42 -0700 IronPort-SDR: ajl1mKmNYUq8JZW6h0P0xcT210nfq6Cn+lB1ky8e0KlSTGHH5i3gnWv1CM7K3UqhXBHZhDBz1b Aw4BnjPfMmdg== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411103805" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 19:53:39 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 01/10] vfio/pci: register/unregister vfio_pci_vendor_driver_ops Date: Sun, 17 May 2020 22:43:17 -0400 Message-Id: 
<20200518024317.14055-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org vfio_pci_vendor_driver_ops includes two parts: (1) .probe() and .remove() interface to be called by vfio_pci_probe() and vfio_pci_remove(). (2) pointer to struct vfio_device_ops. It will be registered as ops of vfio device if .probe() succeeds. Suggested-by: Alex Williamson Signed-off-by: Yan Zhao --- drivers/vfio/pci/vfio_pci.c | 102 +++++++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_private.h | 7 ++ include/linux/vfio.h | 9 +++ 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 6c6b37b5c04e..43d10d34cbc2 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -68,6 +68,11 @@ static inline bool vfio_vga_disabled(void) #endif } +static struct vfio_pci { + struct mutex vendor_drivers_lock; + struct list_head vendor_drivers_list; +} vfio_pci; + /* * Our VGA arbiter participation is limited since we don't know anything * about the device itself. 
However, if the device is the only VGA device @@ -1570,6 +1575,35 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, return 0; } +static int probe_vendor_drivers(struct vfio_pci_device *vdev) +{ + struct vfio_pci_vendor_driver *driver; + int ret = -ENODEV; + + request_module("vfio-pci:%x-%x", vdev->pdev->vendor, + vdev->pdev->device); + + mutex_lock(&vfio_pci.vendor_drivers_lock); + list_for_each_entry(driver, &vfio_pci.vendor_drivers_list, next) { + void *data; + + if (!try_module_get(driver->ops->owner)) + continue; + + data = driver->ops->probe(vdev->pdev); + if (IS_ERR(data)) { + module_put(driver->ops->owner); + continue; + } + vdev->vendor_driver = driver; + vdev->vendor_data = data; + ret = 0; + break; + } + mutex_unlock(&vfio_pci.vendor_drivers_lock); + return ret; +} + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_device *vdev; @@ -1609,7 +1643,12 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) mutex_init(&vdev->ioeventfds_lock); INIT_LIST_HEAD(&vdev->ioeventfds_list); - ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + if (probe_vendor_drivers(vdev)) + ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + else + ret = vfio_add_group_dev(&pdev->dev, + vdev->vendor_driver->ops->device_ops, + vdev); if (ret) goto out_free; @@ -1698,6 +1737,11 @@ static void vfio_pci_remove(struct pci_dev *pdev) if (!disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D0); + if (vdev->vendor_driver) { + vdev->vendor_driver->ops->remove(vdev->vendor_data); + module_put(vdev->vendor_driver->ops->owner); + } + kfree(vdev->pm_save); kfree(vdev); @@ -2035,6 +2079,8 @@ static int __init vfio_pci_init(void) vfio_pci_fill_ids(); + mutex_init(&vfio_pci.vendor_drivers_lock); + INIT_LIST_HEAD(&vfio_pci.vendor_drivers_list); return 0; out_driver: @@ -2042,6 +2088,60 @@ static int __init vfio_pci_init(void) return ret; } +int __vfio_pci_register_vendor_driver(struct 
vfio_pci_vendor_driver_ops *ops) +{ + struct vfio_pci_vendor_driver *driver, *tmp; + + if (!ops || !ops->device_ops) + return -EINVAL; + + driver = kzalloc(sizeof(*driver), GFP_KERNEL); + if (!driver) + return -ENOMEM; + + driver->ops = ops; + + mutex_lock(&vfio_pci.vendor_drivers_lock); + + /* Check for duplicates */ + list_for_each_entry(tmp, &vfio_pci.vendor_drivers_list, next) { + if (tmp->ops->device_ops == ops->device_ops) { + mutex_unlock(&vfio_pci.vendor_drivers_lock); + kfree(driver); + return -EINVAL; + } + } + + list_add(&driver->next, &vfio_pci.vendor_drivers_list); + + mutex_unlock(&vfio_pci.vendor_drivers_lock); + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_GPL(__vfio_pci_register_vendor_driver); + +void vfio_pci_unregister_vendor_driver(struct vfio_device_ops *device_ops) +{ + struct vfio_pci_vendor_driver *driver, *tmp; + + mutex_lock(&vfio_pci.vendor_drivers_lock); + list_for_each_entry_safe(driver, tmp, + &vfio_pci.vendor_drivers_list, next) { + if (driver->ops->device_ops == device_ops) { + list_del(&driver->next); + mutex_unlock(&vfio_pci.vendor_drivers_lock); + kfree(driver); + module_put(THIS_MODULE); + return; + } + } + mutex_unlock(&vfio_pci.vendor_drivers_lock); +} +EXPORT_SYMBOL_GPL(vfio_pci_unregister_vendor_driver); + module_init(vfio_pci_init); module_exit(vfio_pci_cleanup); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 36ec69081ecd..7758a20546fa 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -92,6 +92,11 @@ struct vfio_pci_vf_token { int users; }; +struct vfio_pci_vendor_driver { + const struct vfio_pci_vendor_driver_ops *ops; + struct list_head next; +}; + struct vfio_pci_device { struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; @@ -132,6 +137,8 @@ struct vfio_pci_device { struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; struct notifier_block nb; + void *vendor_data; 
+ struct vfio_pci_vendor_driver *vendor_driver; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 38d3c6a8dc7e..3e53deb012b6 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -214,4 +214,13 @@ extern int vfio_virqfd_enable(void *opaque, void *data, struct virqfd **pvirqfd, int fd); extern void vfio_virqfd_disable(struct virqfd **pvirqfd); +struct vfio_pci_vendor_driver_ops { + char *name; + struct module *owner; + void *(*probe)(struct pci_dev *pdev); + void (*remove)(void *vendor_data); + struct vfio_device_ops *device_ops; +}; +int __vfio_pci_register_vendor_driver(struct vfio_pci_vendor_driver_ops *ops); +void vfio_pci_unregister_vendor_driver(struct vfio_device_ops *device_ops); #endif /* VFIO_H */ From patchwork Mon May 18 02:45:10 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554623 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 1545060D for ; Mon, 18 May 2020 02:55:26 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 06DC7207F9 for ; Mon, 18 May 2020 02:55:26 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726939AbgERCzW (ORCPT ); Sun, 17 May 2020 22:55:22 -0400 Received: from mga18.intel.com ([134.134.136.126]:29280 "EHLO mga18.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726639AbgERCzW (ORCPT ); Sun, 17 May 2020 22:55:22 -0400 IronPort-SDR: jEwfjgTW73c42okrEuuT7PBqz32zu76m2cohSCWJFZRDuINEgQgzmNLC3eCO3UCr4XP+cKmxfp iWq+Gw3begWw== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga106.jf.intel.com with 
ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 19:55:21 -0700 IronPort-SDR: OglWgLH78bA92/WGFbFfRAjAgOZ3+ncDSkAz2DVB2civwg7YxMXX52i0cJOEpn49OAm94bmAL6 sKi6ocpyuOWA== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411104064" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 19:55:18 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 02/10] vfio/pci: macros to generate module_init and module_exit for vendor modules Date: Sun, 17 May 2020 22:45:10 -0400 Message-Id: <20200518024510.14115-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org vendor modules call macro module_vfio_pci_register_vendor_handler to generate module_init and module_exit. It is necessary to ensure that vendor modules always call vfio_pci_register_vendor_driver() on driver loading and vfio_pci_unregister_vendor_driver on driver unloading, because (1) at compiling time, there's only a dependency of vendor modules on vfio_pci. (2) at runtime, - vendor modules add refs of vfio_pci on a successful calling of vfio_pci_register_vendor_driver() and deref of vfio_pci on a successful calling of vfio_pci_unregister_vendor_driver(). - vfio_pci only adds refs of vendor module on a successful probe of vendor driver. vfio_pci derefs vendor module when unbinding from a device. So, after vfio_pci is unbound from a device, the vendor module to that device is free to get unloaded. 
However, if that vendor module does not call vfio_pci_unregister_vendor_driver() in its module_exit, vfio_pci may hold a stale pointer to vendor module. Cc: Kevin Tian Suggested-by: Alex Williamson Signed-off-by: Yan Zhao --- include/linux/vfio.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 3e53deb012b6..f3746608c2d9 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -223,4 +223,31 @@ struct vfio_pci_vendor_driver_ops { }; int __vfio_pci_register_vendor_driver(struct vfio_pci_vendor_driver_ops *ops); void vfio_pci_unregister_vendor_driver(struct vfio_device_ops *device_ops); + +#define vfio_pci_register_vendor_driver(__name, __probe, __remove, \ + __device_ops) \ +static struct vfio_pci_vendor_driver_ops __ops ## _node = { \ + .owner = THIS_MODULE, \ + .name = __name, \ + .probe = __probe, \ + .remove = __remove, \ + .device_ops = __device_ops, \ +}; \ +__vfio_pci_register_vendor_driver(&__ops ## _node) + +#define module_vfio_pci_register_vendor_handler(name, probe, remove, \ + device_ops) \ +static int __init device_ops ## _module_init(void) \ +{ \ + vfio_pci_register_vendor_driver(name, probe, remove, \ + device_ops); \ + return 0; \ +}; \ +static void __exit device_ops ## _module_exit(void) \ +{ \ + vfio_pci_unregister_vendor_driver(device_ops); \ +}; \ +module_init(device_ops ## _module_init); \ +module_exit(device_ops ## _module_exit) + #endif /* VFIO_H */ From patchwork Mon May 18 02:49:05 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554627 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id F174D138A for ; Mon, 18 May 2020 02:59:07 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 
DECC520787 for ; Mon, 18 May 2020 02:59:07 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726821AbgERC7E (ORCPT ); Sun, 17 May 2020 22:59:04 -0400 Received: from mga05.intel.com ([192.55.52.43]:18921 "EHLO mga05.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726720AbgERC7E (ORCPT ); Sun, 17 May 2020 22:59:04 -0400 IronPort-SDR: LNprU+NXPqN3d4PfX2SWfRpNy/iTG2wxSvoTwmuQrqN7Diy3suZWGKzyniS75aXDFD8jtoELuQ qRE6cHYngJWg== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 19:59:00 -0700 IronPort-SDR: kqg+56nQW5C1KiOo1+hxJMBYyeF5hU/ZEj0gR+engB0MQZJ+NIoGTmxc9dmkyXjl81SYnZWg0w 4/tdCaUCu0Ig== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411104728" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 19:58:57 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 03/10] vfio/pci: export vendor_data, irq_type, num_regions, pdev and functions in vfio_pci_ops Date: Sun, 17 May 2020 22:49:05 -0400 Message-Id: <20200518024905.14207-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org export functions vfio_pci_vendor_data(), vfio_pci_irq_type(), vfio_pci_num_regions(), vfio_pci_pdev(), and functions in vfio_pci_ops, so they are able to be called from outside modules and make them a kind of inherited by 
vfio_device_ops provided by vendor modules Cc: Kevin Tian Suggested-by: Alex Williamson Signed-off-by: Yan Zhao --- drivers/vfio/pci/vfio_pci.c | 56 +++++++++++++++++++++++++++++++------ include/linux/vfio.h | 18 ++++++++++++ 2 files changed, 66 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 43d10d34cbc2..290b7ab55ecf 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -73,6 +73,38 @@ static struct vfio_pci { struct list_head vendor_drivers_list; } vfio_pci; +struct pci_dev *vfio_pci_pdev(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->pdev; +} +EXPORT_SYMBOL_GPL(vfio_pci_pdev); + +int vfio_pci_num_regions(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->num_regions; +} +EXPORT_SYMBOL_GPL(vfio_pci_num_regions); + +int vfio_pci_irq_type(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->irq_type; +} +EXPORT_SYMBOL_GPL(vfio_pci_irq_type); + +void *vfio_pci_vendor_data(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + return vdev->vendor_data; +} +EXPORT_SYMBOL_GPL(vfio_pci_vendor_data); + /* * Our VGA arbiter participation is limited since we don't know anything * about the device itself. 
However, if the device is the only VGA device @@ -514,7 +546,7 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) vfio_device_put(pf_dev); } -static void vfio_pci_release(void *device_data) +void vfio_pci_release(void *device_data) { struct vfio_pci_device *vdev = device_data; @@ -530,8 +562,9 @@ static void vfio_pci_release(void *device_data) module_put(THIS_MODULE); } +EXPORT_SYMBOL_GPL(vfio_pci_release); -static int vfio_pci_open(void *device_data) +int vfio_pci_open(void *device_data) { struct vfio_pci_device *vdev = device_data; int ret = 0; @@ -556,6 +589,7 @@ static int vfio_pci_open(void *device_data) module_put(THIS_MODULE); return ret; } +EXPORT_SYMBOL_GPL(vfio_pci_open); static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) { @@ -741,7 +775,7 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, return 0; } -static long vfio_pci_ioctl(void *device_data, +long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { struct vfio_pci_device *vdev = device_data; @@ -1253,6 +1287,7 @@ static long vfio_pci_ioctl(void *device_data, return -ENOTTY; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl); static ssize_t vfio_pci_rw(void *device_data, char __user *buf, size_t count, loff_t *ppos, bool iswrite) @@ -1286,7 +1321,7 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, return -EINVAL; } -static ssize_t vfio_pci_read(void *device_data, char __user *buf, +ssize_t vfio_pci_read(void *device_data, char __user *buf, size_t count, loff_t *ppos) { if (!count) @@ -1294,8 +1329,9 @@ static ssize_t vfio_pci_read(void *device_data, char __user *buf, return vfio_pci_rw(device_data, buf, count, ppos, false); } +EXPORT_SYMBOL_GPL(vfio_pci_read); -static ssize_t vfio_pci_write(void *device_data, const char __user *buf, +ssize_t vfio_pci_write(void *device_data, const char __user *buf, size_t count, loff_t *ppos) { if (!count) @@ -1303,8 +1339,9 @@ static ssize_t vfio_pci_write(void 
*device_data, const char __user *buf, return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); } +EXPORT_SYMBOL_GPL(vfio_pci_write); -static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) +int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) { struct vfio_pci_device *vdev = device_data; struct pci_dev *pdev = vdev->pdev; @@ -1365,8 +1402,9 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, req_len, vma->vm_page_prot); } +EXPORT_SYMBOL_GPL(vfio_pci_mmap); -static void vfio_pci_request(void *device_data, unsigned int count) +void vfio_pci_request(void *device_data, unsigned int count) { struct vfio_pci_device *vdev = device_data; struct pci_dev *pdev = vdev->pdev; @@ -1386,6 +1424,7 @@ static void vfio_pci_request(void *device_data, unsigned int count) mutex_unlock(&vdev->igate); } +EXPORT_SYMBOL_GPL(vfio_pci_request); static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, bool vf_token, uuid_t *uuid) @@ -1482,7 +1521,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, #define VF_TOKEN_ARG "vf_token=" -static int vfio_pci_match(void *device_data, char *buf) +int vfio_pci_match(void *device_data, char *buf) { struct vfio_pci_device *vdev = device_data; bool vf_token = false; @@ -1530,6 +1569,7 @@ static int vfio_pci_match(void *device_data, char *buf) return 1; /* Match */ } +EXPORT_SYMBOL_GPL(vfio_pci_match); static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f3746608c2d9..6ededceb1964 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -214,6 +214,24 @@ extern int vfio_virqfd_enable(void *opaque, void *data, struct virqfd **pvirqfd, int fd); extern void vfio_virqfd_disable(struct virqfd **pvirqfd); +extern int vfio_pci_irq_type(void *device_data); +extern int vfio_pci_num_regions(void *device_data); +extern 
struct pci_dev *vfio_pci_pdev(void *device_data); + +extern long vfio_pci_ioctl(void *device_data, + unsigned int cmd, unsigned long arg); +extern ssize_t vfio_pci_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos); +extern ssize_t vfio_pci_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos); +extern int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma); +extern void vfio_pci_request(void *device_data, unsigned int count); +extern int vfio_pci_open(void *device_data); +extern void vfio_pci_release(void *device_data); +extern int vfio_pci_match(void *device_data, char *buf); + +extern void *vfio_pci_vendor_data(void *device_data); + struct vfio_pci_vendor_driver_ops { char *name; struct module *owner; From patchwork Mon May 18 02:49:44 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554629 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 6E502138A for ; Mon, 18 May 2020 02:59:45 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 5F21E207BB for ; Mon, 18 May 2020 02:59:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726957AbgERC7m (ORCPT ); Sun, 17 May 2020 22:59:42 -0400 Received: from mga02.intel.com ([134.134.136.20]:4477 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726730AbgERC7m (ORCPT ); Sun, 17 May 2020 22:59:42 -0400 IronPort-SDR: yjkIprTiIpcMw+Ln4BjgRYBUtvfsRx5iC/RJ1DCz3Dy/DPqenC5/Y78w7Y/+RmWDrT+zMLVpTn WFu5gSdbzwMw== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 19:59:41 -0700 
IronPort-SDR: CLkgpeC95vYKERnqRDBQB3CrlTou/Ki0yED5Gr1dbfqcwXDseZQ8ySd7LVctblNGcSQVVZKt2l KM06JQHYVDuw== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411104839" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 19:59:38 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 04/10] vfio/pci: let vfio_pci know number of vendor regions and vendor irqs Date: Sun, 17 May 2020 22:49:44 -0400 Message-Id: <20200518024944.14263-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org This allows a simpler VFIO_DEVICE_GET_INFO ioctl in vendor driver Cc: Kevin Tian Signed-off-by: Yan Zhao --- drivers/vfio/pci/vfio_pci.c | 23 +++++++++++++++++++++-- drivers/vfio/pci/vfio_pci_private.h | 2 ++ include/linux/vfio.h | 3 +++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 290b7ab55ecf..30137c1c5308 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -105,6 +105,24 @@ void *vfio_pci_vendor_data(void *device_data) } EXPORT_SYMBOL_GPL(vfio_pci_vendor_data); +int vfio_pci_set_vendor_regions(void *device_data, int num_vendor_regions) +{ + struct vfio_pci_device *vdev = device_data; + + vdev->num_vendor_regions = num_vendor_regions; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_set_vendor_regions); + + +int vfio_pci_set_vendor_irqs(void *device_data, int num_vendor_irqs) +{ + struct vfio_pci_device *vdev = device_data; + + 
vdev->num_vendor_irqs = num_vendor_irqs; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_set_vendor_irqs); /* * Our VGA arbiter participation is limited since we don't know anything * about the device itself. However, if the device is the only VGA device @@ -797,8 +815,9 @@ long vfio_pci_ioctl(void *device_data, if (vdev->reset_works) info.flags |= VFIO_DEVICE_FLAGS_RESET; - info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions + + vdev->num_vendor_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS + vdev->num_vendor_irqs; return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 7758a20546fa..c6cfc4605987 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -110,6 +110,8 @@ struct vfio_pci_device { int num_ctx; int irq_type; int num_regions; + int num_vendor_regions; + int num_vendor_irqs; struct vfio_pci_region *region; u8 msi_qmax; u8 msix_bar; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6ededceb1964..6310c53f9d36 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -231,6 +231,9 @@ extern void vfio_pci_release(void *device_data); extern int vfio_pci_match(void *device_data, char *buf); extern void *vfio_pci_vendor_data(void *device_data); +extern int vfio_pci_set_vendor_regions(void *device_data, + int num_vendor_regions); +extern int vfio_pci_set_vendor_irqs(void *device_data, int num_vendor_irqs); struct vfio_pci_vendor_driver_ops { char *name; From patchwork Mon May 18 02:50:16 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554631 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 
E644A618 for ; Mon, 18 May 2020 03:00:14 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id CD9BF20787 for ; Mon, 18 May 2020 03:00:14 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726990AbgERDAL (ORCPT ); Sun, 17 May 2020 23:00:11 -0400 Received: from mga05.intel.com ([192.55.52.43]:18994 "EHLO mga05.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726721AbgERDAL (ORCPT ); Sun, 17 May 2020 23:00:11 -0400 IronPort-SDR: E8nF6okch5IVhBiQVkGQdu2E20vAdSKlISqyXjlWY9db1dCi9g38DnsrgsBSj+gVBctlpMlkPK iB4dacBwArdQ== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by fmsmga105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 20:00:10 -0700 IronPort-SDR: p0H95gVRAi/oYZb5SAEN6kreT+i2s9dECYuTiMr3NwRKY3i2tL8seL7w1i7MENDt7BPCfEZ2W/ OckX267ut7OQ== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411105271" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 20:00:07 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 05/10] vfio/pci: export vfio_pci_get_barmap Date: Sun, 17 May 2020 22:50:16 -0400 Message-Id: <20200518025016.14317-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org This allows vendor driver to read/write to bars directly which is useful in security checking condition. 
Cc: Kevin Tian Signed-off-by: Yan Zhao --- drivers/vfio/pci/vfio_pci_rdwr.c | 10 ++++++++++ include/linux/vfio.h | 1 + 2 files changed, 11 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index a87992892a9f..e4085311ab28 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -153,6 +153,16 @@ static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar) return 0; } +void __iomem *vfio_pci_get_barmap(void *device_data, int bar) +{ + int ret; + struct vfio_pci_device *vdev = device_data; + + ret = vfio_pci_setup_barmap(vdev, bar); + return ret ? ERR_PTR(ret) : vdev->barmap[bar]; +} +EXPORT_SYMBOL_GPL(vfio_pci_get_barmap); + ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6310c53f9d36..0c786fec4602 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -234,6 +234,7 @@ extern void *vfio_pci_vendor_data(void *device_data); extern int vfio_pci_set_vendor_regions(void *device_data, int num_vendor_regions); extern int vfio_pci_set_vendor_irqs(void *device_data, int num_vendor_irqs); +extern void __iomem *vfio_pci_get_barmap(void *device_data, int bar); struct vfio_pci_vendor_driver_ops { char *name; From patchwork Mon May 18 02:50:52 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554633 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id F4044618 for ; Mon, 18 May 2020 03:00:49 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id E5E5D20787 for ; Mon, 18 May 2020 03:00:49 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727017AbgERDAr 
(ORCPT ); Sun, 17 May 2020 23:00:47 -0400 Received: from mga17.intel.com ([192.55.52.151]:34839 "EHLO mga17.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726680AbgERDAr (ORCPT ); Sun, 17 May 2020 23:00:47 -0400 IronPort-SDR: G3+DSFow9qx80B57ytZLh7cpJqSFR5aT88/0bhqJ/HvoQT4lyMxEr9k//UvgfaItjwx9XIHRed knCx3Cp/Bjpw== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by fmsmga107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 20:00:46 -0700 IronPort-SDR: 8hFVjjQ1rWf9g9PItNpkMFb9V3x0CZN8lj2Z4/VZzX/rkoAdE4rdqqUwAGVYLx53UuMj1meEdK wh+K/MufHlIQ== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411105557" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 20:00:43 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Tina Zhang , Eric Auger Subject: [RFC PATCH v4 06/10] vfio: Define device specific irq type capability Date: Sun, 17 May 2020 22:50:52 -0400 Message-Id: <20200518025052.14369-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org From: Tina Zhang Cap the number of irqs with fixed indexes and use capability chains to chain device specific irqs. v2: - Irq capability index starts from 1. 
Signed-off-by: Tina Zhang Signed-off-by: Eric Auger --- include/uapi/linux/vfio.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 0fe7c9a6f211..2d0d85c7c4d4 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -683,11 +683,27 @@ struct vfio_irq_info { #define VFIO_IRQ_INFO_MASKABLE (1 << 1) #define VFIO_IRQ_INFO_AUTOMASKED (1 << 2) #define VFIO_IRQ_INFO_NORESIZE (1 << 3) +#define VFIO_IRQ_INFO_FLAG_CAPS (1 << 4) /* Info supports caps */ __u32 index; /* IRQ index */ __u32 count; /* Number of IRQs within this index */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_IRQ_INFO _IO(VFIO_TYPE, VFIO_BASE + 9) +/* + * The irq type capability allows irqs unique to a specific device or + * class of devices to be exposed. + * + * The structures below define version 1 of this capability. + */ +#define VFIO_IRQ_INFO_CAP_TYPE 1 + +struct vfio_irq_info_cap_type { + struct vfio_info_cap_header header; + __u32 type; /* global per bus driver */ + __u32 subtype; /* type specific */ +}; + /** * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) * @@ -789,7 +805,8 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, - VFIO_PCI_NUM_IRQS + VFIO_PCI_NUM_IRQS = 5 /* Fixed user ABI, IRQ indexes >=5 use */ + /* device specific cap to define content */ }; /* From patchwork Mon May 18 02:52:45 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554635 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 37E79618 for ; Mon, 18 May 2020 03:02:43 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 20516207BB for ; 
Mon, 18 May 2020 03:02:43 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726932AbgERDCk (ORCPT ); Sun, 17 May 2020 23:02:40 -0400 Received: from mga14.intel.com ([192.55.52.115]:21760 "EHLO mga14.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726639AbgERDCk (ORCPT ); Sun, 17 May 2020 23:02:40 -0400 IronPort-SDR: azQmqO2J10SWh3CDZEpUtxNnQChCZqYdKov4OTiyY0QLMpUN2kgrbrLMFBD0TnzZZgrmZUjv/O UZHdkbneOqRg== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by fmsmga103.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 20:02:39 -0700 IronPort-SDR: v8ZzGhIZ1YZDRT0sKRTjnVYJT+D0/Zt0Y3ti7/50UogXp1khFlTEvLd4WraU4XCMk6j1qwl8GY ljDFSQ/Dp/jg== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411106132" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 20:02:36 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 07/10] vfio/pci: introduce a new irq type VFIO_IRQ_TYPE_REMAP_BAR_REGION Date: Sun, 17 May 2020 22:52:45 -0400 Message-Id: <20200518025245.14425-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org This is a virtual irq type. vendor driver triggers this irq when it wants to notify userspace to remap PCI BARs. 1. vendor driver triggers this irq and packs the target bar number in the ctx count. i.e. "1 << bar_number". 
if a bit is set, the corresponding bar is to be remapped. 2. userspace requery the specified PCI BAR from kernel and if flags of the bar regions are changed, it removes the old subregions and attaches subregions according to the new flags. 3. userspace notifies back to kernel by writing one to the eventfd of this irq. Please check the corresponding qemu implementation from the reply of this patch, and a sample usage in vendor driver in patch [10/10]. Cc: Kevin Tian Signed-off-by: Yan Zhao --- include/uapi/linux/vfio.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 2d0d85c7c4d4..55895f75d720 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -704,6 +704,17 @@ struct vfio_irq_info_cap_type { __u32 subtype; /* type specific */ }; +/* Bar Region Query IRQ TYPE */ +#define VFIO_IRQ_TYPE_REMAP_BAR_REGION (1) + +/* sub-types for VFIO_IRQ_TYPE_REMAP_BAR_REGION */ +/* + * This irq notifies userspace to re-query BAR region and remaps the + * subregions. 
+ */ +#define VFIO_IRQ_SUBTYPE_REMAP_BAR_REGION (0) + + /** * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set) * From patchwork Mon May 18 02:53:16 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554637 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 22340138A for ; Mon, 18 May 2020 03:03:17 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 0E2F0207F9 for ; Mon, 18 May 2020 03:03:17 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727055AbgERDDN (ORCPT ); Sun, 17 May 2020 23:03:13 -0400 Received: from mga07.intel.com ([134.134.136.100]:21823 "EHLO mga07.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726979AbgERDDN (ORCPT ); Sun, 17 May 2020 23:03:13 -0400 IronPort-SDR: DRKLOSNyx9S02ZunKEX/2yiIsUN7oZfQ/GbYzULzPahzOFkcEfdcmTKaeJFUPUDy4VJjgmCpFw kcFL3qoarUSQ== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 20:03:11 -0700 IronPort-SDR: ml1VkQIFoGG7qYFcZu/rU80fLMIw6nteTbFsZh23owle36yfqKpUCk9J1SByDEVVvsoof17f3E rxNkpkK4RuQQ== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411106215" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 20:03:08 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 
08/10] i40e/vf_migration: VF live migration - pass-through VF first Date: Sun, 17 May 2020 22:53:16 -0400 Message-Id: <20200518025316.14491-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org This driver intercepts all device operations as long as it's probed successfully by vfio-pci driver. It will process regions and irqs of its interest and then forward operations to default handlers exported from vfio pci if it wishes to. In this patch, this driver does nothing but pass through VFs to guest by calling to exported handlers from driver vfio-pci. Cc: Shaopeng He Signed-off-by: Yan Zhao --- drivers/net/ethernet/intel/Kconfig | 10 ++ drivers/net/ethernet/intel/i40e/Makefile | 2 + .../ethernet/intel/i40e/i40e_vf_migration.c | 165 ++++++++++++++++++ .../ethernet/intel/i40e/i40e_vf_migration.h | 59 +++++++ 4 files changed, 236 insertions(+) create mode 100644 drivers/net/ethernet/intel/i40e/i40e_vf_migration.c create mode 100644 drivers/net/ethernet/intel/i40e/i40e_vf_migration.h diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index ad34e4335df2..31780d9a59f1 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -264,6 +264,16 @@ config I40E_DCB If unsure, say N. +config I40E_VF_MIGRATION + tristate "XL710 Family VF live migration support -- loadable modules only" + depends on I40E && VFIO_PCI && m + help + Say m if you want to enable live migration of + Virtual Functions of Intel(R) Ethernet Controller XL710 + Family of devices. It must be a module. + This module serves as vendor module of module vfio_pci. + VFs bind to module vfio_pci directly. 
+ # this is here to allow seamless migration from I40EVF --> IAVF name # so that CONFIG_IAVF symbol will always mirror the state of CONFIG_I40EVF config IAVF diff --git a/drivers/net/ethernet/intel/i40e/Makefile b/drivers/net/ethernet/intel/i40e/Makefile index 2f21b3e89fd0..b80c224c2602 100644 --- a/drivers/net/ethernet/intel/i40e/Makefile +++ b/drivers/net/ethernet/intel/i40e/Makefile @@ -27,3 +27,5 @@ i40e-objs := i40e_main.o \ i40e_xsk.o i40e-$(CONFIG_I40E_DCB) += i40e_dcb.o i40e_dcb_nl.o + +obj-$(CONFIG_I40E_VF_MIGRATION) += i40e_vf_migration.o diff --git a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c new file mode 100644 index 000000000000..96026dcf5c9d --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2013 - 2019 Intel Corporation. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "i40e.h" +#include "i40e_vf_migration.h" + +#define VERSION_STRING "0.1" +#define DRIVER_AUTHOR "Intel Corporation" + +static int i40e_vf_open(void *device_data) +{ + struct i40e_vf_migration *i40e_vf_dev = + vfio_pci_vendor_data(device_data); + int ret; + struct vfio_device_migration_info *mig_ctl = NULL; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&i40e_vf_dev->reflock); + if (!i40e_vf_dev->refcnt) { + vfio_pci_set_vendor_regions(device_data, 0); + vfio_pci_set_vendor_irqs(device_data, 0); + } + + ret = vfio_pci_open(device_data); + if (ret) + goto error; + + i40e_vf_dev->refcnt++; + mutex_unlock(&i40e_vf_dev->reflock); + return 0; +error: + if (!i40e_vf_dev->refcnt) { + vfio_pci_set_vendor_regions(device_data, 0); + vfio_pci_set_vendor_irqs(device_data, 0); + } + module_put(THIS_MODULE); + mutex_unlock(&i40e_vf_dev->reflock); + return ret; +} + +void i40e_vf_release(void *device_data) +{ + struct i40e_vf_migration *i40e_vf_dev = + 
vfio_pci_vendor_data(device_data); + + mutex_lock(&i40e_vf_dev->reflock); + if (!--i40e_vf_dev->refcnt) { + vfio_pci_set_vendor_regions(device_data, 0); + vfio_pci_set_vendor_irqs(device_data, 0); + } + vfio_pci_release(device_data); + mutex_unlock(&i40e_vf_dev->reflock); + module_put(THIS_MODULE); +} + +static long i40e_vf_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + return vfio_pci_ioctl(device_data, cmd, arg); +} + +static ssize_t i40e_vf_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + return vfio_pci_read(device_data, buf, count, ppos); +} + +static ssize_t i40e_vf_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + return vfio_pci_write(device_data, buf, count, ppos); +} + +static int i40e_vf_mmap(void *device_data, struct vm_area_struct *vma) +{ + return vfio_pci_mmap(device_data, vma); +} + +static void i40e_vf_request(void *device_data, unsigned int count) +{ + vfio_pci_request(device_data, count); +} + +static struct vfio_device_ops i40e_vf_device_ops_node = { + .name = "i40e_vf", + .open = i40e_vf_open, + .release = i40e_vf_release, + .ioctl = i40e_vf_ioctl, + .read = i40e_vf_read, + .write = i40e_vf_write, + .mmap = i40e_vf_mmap, + .request = i40e_vf_request, +}; + +void *i40e_vf_probe(struct pci_dev *pdev) +{ + struct i40e_vf_migration *i40e_vf_dev = NULL; + struct pci_dev *pf_dev, *vf_dev; + struct i40e_pf *pf; + struct i40e_vf *vf; + unsigned int vf_devfn, devfn; + int vf_id = -1; + int i; + + pf_dev = pdev->physfn; + pf = pci_get_drvdata(pf_dev); + vf_dev = pdev; + vf_devfn = vf_dev->devfn; + + for (i = 0; i < pci_num_vf(pf_dev); i++) { + devfn = (pf_dev->devfn + pf_dev->sriov->offset + + pf_dev->sriov->stride * i) & 0xff; + if (devfn == vf_devfn) { + vf_id = i; + break; + } + } + + if (vf_id == -1) + return ERR_PTR(-EINVAL); + + i40e_vf_dev = kzalloc(sizeof(*i40e_vf_dev), GFP_KERNEL); + + if (!i40e_vf_dev) + return ERR_PTR(-ENOMEM); + + i40e_vf_dev->vf_id = vf_id; + 
i40e_vf_dev->vf_vendor = pdev->vendor; + i40e_vf_dev->vf_device = pdev->device; + i40e_vf_dev->pf_dev = pf_dev; + i40e_vf_dev->vf_dev = vf_dev; + mutex_init(&i40e_vf_dev->reflock); + + vf = &pf->vf[vf_id]; + + return i40e_vf_dev; +} + +static void i40e_vf_remove(void *vendor_data) +{ + kfree(vendor_data); +} + +#define i40e_vf_device_ops (&i40e_vf_device_ops_node) +module_vfio_pci_register_vendor_handler("I40E VF", i40e_vf_probe, + i40e_vf_remove, i40e_vf_device_ops); + +MODULE_ALIAS("vfio-pci:8086-154c"); +MODULE_LICENSE("GPL v2"); +MODULE_INFO(supported, "Vendor driver of vfio pci to support VF live migration"); +MODULE_VERSION(VERSION_STRING); +MODULE_AUTHOR(DRIVER_AUTHOR); diff --git a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h new file mode 100644 index 000000000000..696d40601ec3 --- /dev/null +++ b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2013 - 2019 Intel Corporation. 
*/ + +#ifndef I40E_MIG_H +#define I40E_MIG_H + +#include +#include +#include + +#include "i40e.h" +#include "i40e_txrx.h" + +/* helper macros copied from vfio-pci */ +#define VFIO_PCI_OFFSET_SHIFT 40 +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + +/* Single Root I/O Virtualization */ +struct pci_sriov { + int pos; /* Capability position */ + int nres; /* Number of resources */ + u32 cap; /* SR-IOV Capabilities */ + u16 ctrl; /* SR-IOV Control */ + u16 total_VFs; /* Total VFs associated with the PF */ + u16 initial_VFs; /* Initial VFs associated with the PF */ + u16 num_VFs; /* Number of VFs available */ + u16 offset; /* First VF Routing ID offset */ + u16 stride; /* Following VF stride */ + u16 vf_device; /* VF device ID */ + u32 pgsz; /* Page size for BAR alignment */ + u8 link; /* Function Dependency Link */ + u8 max_VF_buses; /* Max buses consumed by VFs */ + u16 driver_max_VFs; /* Max num VFs driver supports */ + struct pci_dev *dev; /* Lowest numbered PF */ + struct pci_dev *self; /* This PF */ + u32 cfg_size; /* VF config space size */ + u32 class; /* VF device */ + u8 hdr_type; /* VF header type */ + u16 subsystem_vendor; /* VF subsystem vendor */ + u16 subsystem_device; /* VF subsystem device */ + resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ + bool drivers_autoprobe; /* Auto probing of VFs by driver */ +}; + +struct i40e_vf_migration { + __u32 vf_vendor; + __u32 vf_device; + __u32 handle; + struct pci_dev *pf_dev; + struct pci_dev *vf_dev; + int vf_id; + int refcnt; + struct mutex reflock; /*mutex protect refcnt */ +}; + +#endif /* I40E_MIG_H */ + From patchwork Mon May 18 02:54:00 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554639 Return-Path: Received: from 
mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 2D03A138A for ; Mon, 18 May 2020 03:04:08 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id 14600207F9 for ; Mon, 18 May 2020 03:04:08 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727005AbgERDEF (ORCPT ); Sun, 17 May 2020 23:04:05 -0400 Received: from mga07.intel.com ([134.134.136.100]:21859 "EHLO mga07.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726720AbgERDEE (ORCPT ); Sun, 17 May 2020 23:04:04 -0400 IronPort-SDR: W/FQOQOyUBGA7wZqZbjkcYderdlSfLBHjmVvQd5BR5b0jR88PLXi82D1Mo550mDrf3I2aLqmj4 o5a2B39YBtjw== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 20:03:56 -0700 IronPort-SDR: AqAXxl2Q+vwSd+sRpwBU79HD5r++WP5aak5eaF/O8uGHoMIMBVI6inKp9LVglc6/nMMtlXlUte 5siBD9lJv4Qw== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411106351" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 20:03:51 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 09/10] i40e/vf_migration: register a migration vendor region Date: Sun, 17 May 2020 22:54:00 -0400 Message-Id: <20200518025400.14547-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk 
List-ID: X-Mailing-List: kvm@vger.kernel.org This patch let the vendor driver register a migration region, so that the migration detection code in userspace will be able to see this region and triggers the migration flow according to VFIO migration protocol. This migration region works based on VFIO migration series with some minor fixes: [1] kernel v17: https://patchwork.kernel.org/cover/11466129/ [2] qemu v16: https://patchwork.kernel.org/cover/11456557/ Cc: Shaopeng He Signed-off-by: Yan Zhao --- .../ethernet/intel/i40e/i40e_vf_migration.c | 429 +++++++++++++++++- .../ethernet/intel/i40e/i40e_vf_migration.h | 34 ++ 2 files changed, 460 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c index 96026dcf5c9d..107a291909b3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c +++ b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c @@ -17,6 +17,351 @@ #define VERSION_STRING "0.1" #define DRIVER_AUTHOR "Intel Corporation" +#define TEST_DIRTY_IOVA_PFN 0 + +static int i40e_vf_iommu_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) { + struct vfio_iommu_type1_dma_unmap *unmap = data; + unsigned long iova_pfn, end_iova_pfn; + + iova_pfn = unmap->iova >> PAGE_SHIFT; + end_iova_pfn = iova_pfn + unmap->size / PAGE_SIZE; + + pr_info("DMA UNMAP iova_pfn=%lx, end=%lx\n", iova_pfn, + end_iova_pfn); + } + + return NOTIFY_OK; +} + +/* transient pinning a page + */ +static int i40e_vf_set_page_dirty(struct i40e_vf_migration *i40e_vf_dev, + unsigned long dirty_iova_pfn) +{ + unsigned long dirty_pfn, cnt = 1; + int ret; + + ret = vfio_group_pin_pages(i40e_vf_dev->vfio_group, + &dirty_iova_pfn, cnt, + IOMMU_READ | IOMMU_WRITE, &dirty_pfn); + if (ret != cnt) { + pr_err("failed to track dirty of page of iova pfn %lx\n", + dirty_iova_pfn); + return ret < 0 ? 
ret : -EFAULT; + } + + vfio_group_unpin_pages(i40e_vf_dev->vfio_group, &dirty_iova_pfn, cnt); + + return 0; +} + +/* alloc dirty page tracking resources and + * do the first round dirty page scanning + */ +static int i40e_vf_prepare_dirty_track(struct i40e_vf_migration *i40e_vf_dev) +{ + struct vfio_group *vfio_group; + unsigned long events; + int ret; + struct device *dev = &i40e_vf_dev->vf_dev->dev; + + if (i40e_vf_dev->in_dirty_track) { + pr_warn("%s, previous dirty track resources found\n", + __func__); + return 0; + } + + i40e_vf_dev->iommu_notifier.notifier_call = i40e_vf_iommu_notifier; + + events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; + ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, + &i40e_vf_dev->iommu_notifier); + if (ret) { + pr_err("failed to register vfio iommu notifier\n"); + return ret; + } + + vfio_group = vfio_group_get_external_user_from_dev(dev); + if (IS_ERR_OR_NULL(vfio_group)) { + ret = PTR_ERR(vfio_group); + pr_err("failed to get vfio group from dev\n"); + goto out; + } + + i40e_vf_dev->vfio_group = vfio_group; + + ret = i40e_vf_set_page_dirty(i40e_vf_dev, TEST_DIRTY_IOVA_PFN); + + if (ret) { + pr_err("failed to set dirty for test page\n"); + goto out_group; + } + + i40e_vf_dev->in_dirty_track = true; + return 0; + +out_group: + vfio_unregister_notifier(dev, VFIO_IOMMU_NOTIFY, + &i40e_vf_dev->iommu_notifier); +out: + vfio_group_put_external_user(i40e_vf_dev->vfio_group); + return ret; +} + +static void i40e_vf_stop_dirty_track(struct i40e_vf_migration *i40e_vf_dev) +{ + if (!i40e_vf_dev->in_dirty_track) + return; + + vfio_unregister_notifier(&i40e_vf_dev->vf_dev->dev, + VFIO_IOMMU_NOTIFY, + &i40e_vf_dev->iommu_notifier); + vfio_group_put_external_user(i40e_vf_dev->vfio_group); + i40e_vf_dev->in_dirty_track = false; +} + +static size_t i40e_vf_set_device_state(struct i40e_vf_migration *i40e_vf_dev, + u32 state) +{ + int ret = 0; + struct vfio_device_migration_info *mig_ctl = i40e_vf_dev->mig_ctl; + + if (state == mig_ctl->device_state) 
+ return 0; + + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: + break; + case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: + ret = i40e_vf_prepare_dirty_track(i40e_vf_dev); + break; + case VFIO_DEVICE_STATE_SAVING: + // do the last round of dirty page scanning + break; + case VFIO_DEVICE_STATE_STOP: + // release dirty page tracking resources + if (mig_ctl->device_state == VFIO_DEVICE_STATE_SAVING) + i40e_vf_stop_dirty_track(i40e_vf_dev); + break; + case VFIO_DEVICE_STATE_RESUMING: + break; + default: + ret = -EFAULT; + } + + if (!ret) + mig_ctl->device_state = state; + + return ret; +} + +static +ssize_t i40e_vf_region_migration_rw(struct i40e_vf_migration *i40e_vf_dev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ +#define VDM_OFFSET(x) offsetof(struct vfio_device_migration_info, x) + struct vfio_device_migration_info *mig_ctl = i40e_vf_dev->mig_ctl; + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + int ret = 0; + + switch (pos) { + case VDM_OFFSET(device_state): + if (count != sizeof(mig_ctl->device_state)) { + ret = -EINVAL; + break; + } + + if (iswrite) { + u32 device_state; + + if (copy_from_user(&device_state, buf, count)) { + ret = -EFAULT; + break; + } + + ret = i40e_vf_set_device_state(i40e_vf_dev, + device_state) ? + ret : count; + } else { + ret = copy_to_user(buf, &mig_ctl->device_state, + count) ? -EFAULT : count; + } + break; + + case VDM_OFFSET(reserved): + ret = -EFAULT; + break; + + case VDM_OFFSET(pending_bytes): + { + if (count != sizeof(mig_ctl->pending_bytes)) { + ret = -EINVAL; + break; + } + + if (iswrite) + ret = -EFAULT; + else + ret = copy_to_user(buf, + &mig_ctl->pending_bytes, + count) ? -EFAULT : count; + + break; + } + + case VDM_OFFSET(data_offset): + { + /* as we don't support device internal dirty data + * and our pending_bytes is always 0, + * return error here. 
+ */ + ret = -EFAULT; + break; + } + case VDM_OFFSET(data_size): + if (count != sizeof(mig_ctl->data_size)) { + ret = -EINVAL; + break; + } + + if (iswrite) + ret = copy_from_user(&mig_ctl->data_size, buf, count) ? + -EFAULT : count; + else + ret = copy_to_user(buf, &mig_ctl->data_size, count) ? + -EFAULT : count; + break; + + default: + ret = -EFAULT; + break; + } + return ret; +} + +static +int i40e_vf_region_migration_mmap(struct i40e_vf_migration *i40e_vf_dev, + struct i40e_vf_region *region, + struct vm_area_struct *vma) +{ + return -EFAULT; +} + +static +void i40e_vf_region_migration_release(struct i40e_vf_migration *i40e_vf_dev, + struct i40e_vf_region *region) +{ + kfree(i40e_vf_dev->mig_ctl); + i40e_vf_dev->mig_ctl = NULL; +} + +static const struct i40e_vf_region_ops i40e_vf_region_ops_migration = { + .rw = i40e_vf_region_migration_rw, + .release = i40e_vf_region_migration_release, + .mmap = i40e_vf_region_migration_mmap, +}; + +static int i40e_vf_register_region(struct i40e_vf_migration *i40e_vf_dev, + unsigned int type, unsigned int subtype, + const struct i40e_vf_region_ops *ops, + size_t size, u32 flags, void *data) +{ + struct i40e_vf_region *regions; + + regions = krealloc(i40e_vf_dev->regions, + (i40e_vf_dev->num_regions + 1) * sizeof(*regions), + GFP_KERNEL); + if (!regions) + return -ENOMEM; + + i40e_vf_dev->regions = regions; + regions[i40e_vf_dev->num_regions].type = type; + regions[i40e_vf_dev->num_regions].subtype = subtype; + regions[i40e_vf_dev->num_regions].ops = ops; + regions[i40e_vf_dev->num_regions].size = size; + regions[i40e_vf_dev->num_regions].flags = flags; + regions[i40e_vf_dev->num_regions].data = data; + i40e_vf_dev->num_regions++; + return 0; +} + +static long i40e_vf_get_region_info(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + int index, ret; + struct vfio_region_info_cap_type cap_type = { + .header.id = 
VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 }; + struct i40e_vf_region *regions; + int num_vdev_regions = vfio_pci_num_regions(device_data); + unsigned long minsz; + struct i40e_vf_migration *i40e_vf_dev = + vfio_pci_vendor_data(device_data); + + minsz = offsetofend(struct vfio_region_info, offset); + + if (cmd != VFIO_DEVICE_GET_REGION_INFO) + return -EINVAL; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + if (info.index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + goto default_handle; + + index = info.index - VFIO_PCI_NUM_REGIONS - num_vdev_regions; + if (index > i40e_vf_dev->num_regions) + return -EINVAL; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + regions = i40e_vf_dev->regions; + info.size = regions[index].size; + info.flags = regions[index].flags; + cap_type.type = regions[index].type; + cap_type.subtype = regions[index].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + + if (regions[index].ops->add_cap) { + ret = regions[index].ops->add_cap(i40e_vf_dev, + ®ions[index], &caps); + if (ret) + return ret; + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? 
+ -EFAULT : 0; + +default_handle: + return vfio_pci_ioctl(device_data, cmd, arg); +} static int i40e_vf_open(void *device_data) { @@ -30,7 +375,26 @@ static int i40e_vf_open(void *device_data) mutex_lock(&i40e_vf_dev->reflock); if (!i40e_vf_dev->refcnt) { - vfio_pci_set_vendor_regions(device_data, 0); + mig_ctl = kzalloc(sizeof(*mig_ctl), GFP_KERNEL); + if (!mig_ctl) { + ret = -ENOMEM; + goto error; + } + + ret = i40e_vf_register_region(i40e_vf_dev, + VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, + &i40e_vf_region_ops_migration, + MIGRATION_REGION_SZ, + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE, + NULL); + if (ret) + goto error; + + i40e_vf_dev->mig_ctl = mig_ctl; + vfio_pci_set_vendor_regions(device_data, + i40e_vf_dev->num_regions); vfio_pci_set_vendor_irqs(device_data, 0); } @@ -43,6 +407,10 @@ static int i40e_vf_open(void *device_data) return 0; error: if (!i40e_vf_dev->refcnt) { + kfree(mig_ctl); + kfree(i40e_vf_dev->regions); + i40e_vf_dev->num_regions = 0; + i40e_vf_dev->regions = NULL; vfio_pci_set_vendor_regions(device_data, 0); vfio_pci_set_vendor_irqs(device_data, 0); } @@ -56,8 +424,17 @@ void i40e_vf_release(void *device_data) struct i40e_vf_migration *i40e_vf_dev = vfio_pci_vendor_data(device_data); + i40e_vf_stop_dirty_track(i40e_vf_dev); mutex_lock(&i40e_vf_dev->reflock); if (!--i40e_vf_dev->refcnt) { + int i; + + for (i = 0; i < i40e_vf_dev->num_regions; i++) + i40e_vf_dev->regions[i].ops->release(i40e_vf_dev, + &i40e_vf_dev->regions[i]); + i40e_vf_dev->num_regions = 0; + kfree(i40e_vf_dev->regions); + i40e_vf_dev->regions = NULL; vfio_pci_set_vendor_regions(device_data, 0); vfio_pci_set_vendor_irqs(device_data, 0); } @@ -69,19 +446,65 @@ void i40e_vf_release(void *device_data) static long i40e_vf_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { + if (cmd == VFIO_DEVICE_GET_REGION_INFO) + return i40e_vf_get_region_info(device_data, cmd, arg); + return vfio_pci_ioctl(device_data, cmd, arg); } static 
ssize_t i40e_vf_read(void *device_data, char __user *buf, size_t count, loff_t *ppos) { - return vfio_pci_read(device_data, buf, count, ppos); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct i40e_vf_migration *i40e_vf_dev = + vfio_pci_vendor_data(device_data); + struct i40e_vf_region *region; + int num_vdev_regions = vfio_pci_num_regions(device_data); + int num_vendor_region = i40e_vf_dev->num_regions; + + if (index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + return vfio_pci_read(device_data, buf, count, ppos); + else if (index >= VFIO_PCI_NUM_REGIONS + num_vdev_regions + + num_vendor_region) + return -EINVAL; + + index -= VFIO_PCI_NUM_REGIONS + num_vdev_regions; + + region = &i40e_vf_dev->regions[index]; + if (!region->ops->rw) + return -EINVAL; + + return region->ops->rw(i40e_vf_dev, buf, count, ppos, false); } static ssize_t i40e_vf_write(void *device_data, const char __user *buf, size_t count, loff_t *ppos) { - return vfio_pci_write(device_data, buf, count, ppos); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct i40e_vf_migration *i40e_vf_dev = + vfio_pci_vendor_data(device_data); + struct i40e_vf_region *region; + int num_vdev_regions = vfio_pci_num_regions(device_data); + int num_vendor_region = i40e_vf_dev->num_regions; + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + ;// scan dirty pages + + if (index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) + return vfio_pci_write(device_data, buf, count, ppos); + else if (index >= VFIO_PCI_NUM_REGIONS + num_vdev_regions + + num_vendor_region) + return -EINVAL; + + index -= VFIO_PCI_NUM_REGIONS + num_vdev_regions; + + region = &i40e_vf_dev->regions[index]; + + if (!region->ops->rw) + return -EINVAL; + + return region->ops->rw(i40e_vf_dev, (char __user *)buf, + count, ppos, true); } static int i40e_vf_mmap(void *device_data, struct vm_area_struct *vma) diff --git a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h index 
696d40601ec3..918ba275d5b5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h +++ b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h @@ -17,6 +17,8 @@ #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) +#define MIGRATION_REGION_SZ (sizeof(struct vfio_device_migration_info)) + /* Single Root I/O Virtualization */ struct pci_sriov { int pos; /* Capability position */ @@ -53,6 +55,38 @@ struct i40e_vf_migration { int vf_id; int refcnt; struct mutex reflock; /*mutex protect refcnt */ + + struct vfio_device_migration_info *mig_ctl; + bool in_dirty_track; + + struct i40e_vf_region *regions; + int num_regions; + struct notifier_block iommu_notifier; + struct vfio_group *vfio_group; + +}; + +struct i40e_vf_region_ops { + ssize_t (*rw)(struct i40e_vf_migration *i40e_vf_dev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + void (*release)(struct i40e_vf_migration *i40e_vf_dev, + struct i40e_vf_region *region); + int (*mmap)(struct i40e_vf_migration *i40e_vf_dev, + struct i40e_vf_region *region, + struct vm_area_struct *vma); + int (*add_cap)(struct i40e_vf_migration *i40e_vf_dev, + struct i40e_vf_region *region, + struct vfio_info_cap *caps); +}; + +struct i40e_vf_region { + u32 type; + u32 subtype; + size_t size; + u32 flags; + const struct i40e_vf_region_ops *ops; + void *data; }; #endif /* I40E_MIG_H */ From patchwork Mon May 18 02:54:41 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Yan Zhao X-Patchwork-Id: 11554641 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id E4665138A for ; Mon, 18 May 2020 03:04:42 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by mail.kernel.org (Postfix) with ESMTP id CA06620825 for ; Mon, 18 May 
2020 03:04:42 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727123AbgERDEj (ORCPT ); Sun, 17 May 2020 23:04:39 -0400 Received: from mga03.intel.com ([134.134.136.65]:25715 "EHLO mga03.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726720AbgERDEj (ORCPT ); Sun, 17 May 2020 23:04:39 -0400 IronPort-SDR: egluIFURTmoNkAcIJFaLKbYv/fLj1DeuA24/MvuwlwtBhmcBtLeRw3WGALJhV8+Yub7qOwgbyN pmuG9DlAiYXg== X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from orsmga004.jf.intel.com ([10.7.209.38]) by orsmga103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 17 May 2020 20:04:37 -0700 IronPort-SDR: Dc7ZD0Txewr9IXncTFuVfkfHwmH4yE52rq7GMqu7GdB9kXfQdTGHEJiDfzqXUA/gz6Qt5fSYhL FGEqfwinhJOw== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.73,405,1583222400"; d="scan'208";a="411106449" Received: from joy-optiplex-7040.sh.intel.com ([10.239.13.16]) by orsmga004.jf.intel.com with ESMTP; 17 May 2020 20:04:34 -0700 From: Yan Zhao To: kvm@vger.kernel.org, linux-kernel@vger.kernel.org Cc: alex.williamson@redhat.com, cohuck@redhat.com, zhenyuw@linux.intel.com, zhi.a.wang@intel.com, kevin.tian@intel.com, shaopeng.he@intel.com, yi.l.liu@intel.com, xin.zeng@intel.com, hang.yuan@intel.com, Yan Zhao Subject: [RFC PATCH v4 10/10] i40e/vf_migration: vendor defined irq_type to support dynamic bar map Date: Sun, 17 May 2020 22:54:41 -0400 Message-Id: <20200518025441.14604-1-yan.y.zhao@intel.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20200518024202.13996-1-yan.y.zhao@intel.com> References: <20200518024202.13996-1-yan.y.zhao@intel.com> Sender: kvm-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: kvm@vger.kernel.org This patch gives an example implementation to support vendor defined irq_type. - on this vendor driver open, it registers an irq of type VFIO_IRQ_TYPE_REMAP_BAR_REGION, and reports to driver vfio-pci there's 1 vendor irq. 
- after userspace detects and enables the irq of type VFIO_IRQ_TYPE_REMAP_BAR_REGION, this vendor driver will setup a virqfd to monitor file write to the fd of this irq. (1) when migration starts (the device state is set to _SAVING & _RUNNING), a. this vendor driver will signal the irq VFIO_IRQ_TYPE_REMAP_BAR_REGION to ask userspace to remap pci bars. It packs the target bar number in the ctx count. i.e. 1 << bar_number. if there are multiple bars to remap, the numbers are or'ed. b. on receiving this eventfd signal, userspace will read the bar number, re-query the bar flags (like READ/WRITE/MMAP/SPARSE ranges), and remap the bar's subregions. c. vendor driver reports bar 0 to be trapped (not MMAP'd). d. after remapping completion, it writes 0 to the eventfd so that the vendor driver waiting for it would complete too. (2) as the bar 0 is remapped to be trapped, vendor driver is able to start tracking dirty pages in software way. (3) when migration stops, similar to what's done in migration start, the vendor driver would signal to remap the bar back to un-trapped (MMAP'd), but it would not wait for the userspace writing back for remapping completion. - on releasing this vendor driver, it frees resources to vendor defined irqs. 
Cc: Kevin Tian Cc: Shaopeng He Signed-off-by: Yan Zhao --- drivers/net/ethernet/intel/Kconfig | 2 +- .../ethernet/intel/i40e/i40e_vf_migration.c | 322 +++++++++++++++++- .../ethernet/intel/i40e/i40e_vf_migration.h | 26 ++ 3 files changed, 346 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 31780d9a59f1..6a52a197c4d8 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -266,7 +266,7 @@ config I40E_DCB config I40E_VF_MIGRATION tristate "XL710 Family VF live migration support -- loadable modules only" - depends on I40E && VFIO_PCI && m + depends on I40E && VFIO_PCI && VFIO_VIRQFD && m help Say m if you want to enable live migration of Virtual Functions of Intel(R) Ethernet Controller XL710 diff --git a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c index 107a291909b3..188829efaa19 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c +++ b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.c @@ -19,6 +19,266 @@ #define DRIVER_AUTHOR "Intel Corporation" #define TEST_DIRTY_IOVA_PFN 0 +static int i40e_vf_remap_bars(struct i40e_vf_migration *i40e_vf_dev, bool wait) +{ + int bar_num = 0; + + if (!i40e_vf_dev->remap_irq_ctx.init) + return -ENODEV; + + /* set cnt to 2 as it will enter wait_handler two times.
+ * one from this eventfd_signal, + * one from userspace ack back + */ + atomic_set(&i40e_vf_dev->remap_irq_ctx.cnt, 2); + eventfd_signal(i40e_vf_dev->remap_irq_ctx.trigger, 1 << bar_num); + + if (!wait) + return 0; + + /* the wait cannot be executed in vcpu threads, as the eventfd write + * from userspace we are waiting for is waiting on the lock vcpu + * threads hold + */ + wait_event_killable(i40e_vf_dev->remap_irq_ctx.waitq, + !atomic_read(&i40e_vf_dev->remap_irq_ctx.cnt)); + + return 0; +} + +static int i40e_vf_remap_bar_wait_handler(void *opaque, void *unused) +{ + struct i40e_vf_migration *i40e_vf_dev = opaque; + + atomic_dec_if_positive(&i40e_vf_dev->remap_irq_ctx.cnt); + wake_up(&i40e_vf_dev->remap_irq_ctx.waitq); + return 0; +} + +static void i40e_vf_disable_remap_bars_irq(struct i40e_vf_migration *vf_dev) +{ + if (!vf_dev->remap_irq_ctx.init) + return; + + if (vf_dev->remap_irq_ctx.sync) + vfio_virqfd_disable(&vf_dev->remap_irq_ctx.sync); + + atomic_set(&vf_dev->remap_irq_ctx.cnt, 0); + wake_up(&vf_dev->remap_irq_ctx.waitq); + + eventfd_ctx_put(vf_dev->remap_irq_ctx.trigger); + vf_dev->remap_irq_ctx.trigger = NULL; + vf_dev->remap_irq_ctx.init = false; +} + +static int i40e_vf_enable_remap_bars_irq(struct i40e_vf_migration *vf_dev, + struct eventfd_ctx *ctx, int32_t fd) +{ + int ret; + + if (vf_dev->remap_irq_ctx.init) + return -EEXIST; + + ret = vfio_virqfd_enable((void *)vf_dev, + i40e_vf_remap_bar_wait_handler, NULL, ctx, + &vf_dev->remap_irq_ctx.sync, fd); + if (ret) { + eventfd_ctx_put(ctx); + return ret; + } + + init_waitqueue_head(&vf_dev->remap_irq_ctx.waitq); + atomic_set(&vf_dev->remap_irq_ctx.cnt, 0); + vf_dev->remap_irq_ctx.init = true; + vf_dev->remap_irq_ctx.trigger = ctx; + return 0; +} + +static int i40e_vf_set_irq_remap_bars(struct i40e_vf_migration *i40e_vf_dev, + u32 flags, unsigned int index, + unsigned int start, unsigned int count, + void *data) +{ + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + 
case VFIO_IRQ_SET_ACTION_UNMASK: + /* XXX Need masking support exported */ + return 0; + case VFIO_IRQ_SET_ACTION_TRIGGER: + break; + default: + return 0; + } + + if (start != 0 || count > 1) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + if (!count) { + i40e_vf_disable_remap_bars_irq(i40e_vf_dev); + return 0; + } + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + return -EINVAL; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int fd; + + if (!count || !data) + return -EINVAL; + + fd = *(int32_t *)data; + if (fd == -1) { + i40e_vf_disable_remap_bars_irq(i40e_vf_dev); + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + i40e_vf_disable_remap_bars_irq(i40e_vf_dev); + + return i40e_vf_enable_remap_bars_irq(i40e_vf_dev, + efdctx, fd); + } + return 0; + } + return -EINVAL; +} + +static const struct i40e_vf_irqops i40e_vf_irqops_remap_bars = { + .set_irqs = i40e_vf_set_irq_remap_bars, +}; + +static long i40e_vf_set_irqs(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_irq_set hdr; + int index, ret; + u8 *data = NULL; + size_t data_size = 0; + unsigned long minsz; + struct i40e_vf_migration *i40e_vf_dev = + vfio_pci_vendor_data(device_data); + + minsz = offsetofend(struct vfio_irq_set, count); + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || + hdr.index >= VFIO_PCI_NUM_IRQS + i40e_vf_dev->num_irqs) + return -EINVAL; + if (hdr.index < VFIO_PCI_NUM_IRQS) + goto default_handle; + + index = hdr.index - VFIO_PCI_NUM_IRQS; + + ret = vfio_set_irqs_validate_and_prepare(&hdr, + i40e_vf_dev->irqs[index].count, + VFIO_PCI_NUM_IRQS + + i40e_vf_dev->num_irqs, + &data_size); + if (ret) + return ret; + + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + ret = i40e_vf_dev->irqs[index].ops->set_irqs(i40e_vf_dev, + hdr.flags, 
hdr.index, + hdr.start, hdr.count, + data); + kfree(data); + return ret; + +default_handle: + return vfio_pci_ioctl(device_data, cmd, arg); +} + +static long i40e_vf_get_irq_info(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_irq_info info; + int index, ret; + unsigned long minsz; + struct vfio_irq_info_cap_type cap_type = { + .header.id = VFIO_IRQ_INFO_CAP_TYPE, + .header.version = 1 + }; + struct i40e_vf_migration *i40e_vf_dev = + vfio_pci_vendor_data(device_data); + + minsz = offsetofend(struct vfio_irq_info, count); + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || + info.index >= VFIO_PCI_NUM_IRQS + i40e_vf_dev->num_irqs) + return -EINVAL; + if (info.index < VFIO_PCI_NUM_IRQS) + goto default_handle; + + index = info.index - VFIO_PCI_NUM_IRQS; + info.flags = i40e_vf_dev->irqs[index].flags; + cap_type.type = i40e_vf_dev->irqs[index].type; + cap_type.subtype = i40e_vf_dev->irqs[index].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + + if (caps.size) { + info.flags |= VFIO_IRQ_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + if (offsetofend(struct vfio_irq_info, cap_offset) > + minsz) + minsz = offsetofend(struct vfio_irq_info, + cap_offset); + } + kfree(caps.buf); + } + return copy_to_user((void __user *)arg, &info, minsz) ? 
-EFAULT : 0; + +default_handle: + return vfio_pci_ioctl(device_data, cmd, arg); +} + +static int i40e_vf_register_irq(struct i40e_vf_migration *i40e_vf_dev, + unsigned int type, unsigned int subtype, + u32 flags, const struct i40e_vf_irqops *ops) +{ + struct i40e_vf_irq *irqs; + + irqs = krealloc(i40e_vf_dev->irqs, + (i40e_vf_dev->num_irqs + 1) * sizeof(*irqs), + GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + i40e_vf_dev->irqs = irqs; + i40e_vf_dev->irqs[i40e_vf_dev->num_irqs].type = type; + i40e_vf_dev->irqs[i40e_vf_dev->num_irqs].subtype = subtype; + i40e_vf_dev->irqs[i40e_vf_dev->num_irqs].count = 1; + i40e_vf_dev->irqs[i40e_vf_dev->num_irqs].flags = flags; + i40e_vf_dev->irqs[i40e_vf_dev->num_irqs].ops = ops; + i40e_vf_dev->num_irqs++; + return 0; +} static int i40e_vf_iommu_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -100,6 +360,12 @@ static int i40e_vf_prepare_dirty_track(struct i40e_vf_migration *i40e_vf_dev) goto out_group; } + /* wait until bar 0 is remapped to read-write */ + ret = i40e_vf_remap_bars(i40e_vf_dev, true); + if (ret) { + pr_err("failed to remap BAR 0\n"); + goto out_group; + } i40e_vf_dev->in_dirty_track = true; return 0; @@ -121,6 +387,8 @@ static void i40e_vf_stop_dirty_track(struct i40e_vf_migration *i40e_vf_dev) &i40e_vf_dev->iommu_notifier); vfio_group_put_external_user(i40e_vf_dev->vfio_group); i40e_vf_dev->in_dirty_track = false; + /* just notify userspace to remap bar0 without waiting */ + i40e_vf_remap_bars(i40e_vf_dev, false); } static size_t i40e_vf_set_device_state(struct i40e_vf_migration *i40e_vf_dev, @@ -134,6 +402,8 @@ static size_t i40e_vf_set_device_state(struct i40e_vf_migration *i40e_vf_dev, switch (state) { case VFIO_DEVICE_STATE_RUNNING: + if (mig_ctl->device_state & VFIO_DEVICE_STATE_SAVING) + i40e_vf_stop_dirty_track(i40e_vf_dev); break; case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: ret = i40e_vf_prepare_dirty_track(i40e_vf_dev); @@ -360,7 +630,25 @@ static long
i40e_vf_get_region_info(void *device_data, -EFAULT : 0; default_handle: - return vfio_pci_ioctl(device_data, cmd, arg); + ret = vfio_pci_ioctl(device_data, cmd, arg); + if (ret) + return ret; + + if (info.index == VFIO_PCI_BAR0_REGION_INDEX) { + if (!i40e_vf_dev->in_dirty_track) + return ret; + + /* read default handler's data back */ + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + /* update customized region info */ + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + } + return ret; } static int i40e_vf_open(void *device_data) @@ -392,10 +680,20 @@ static int i40e_vf_open(void *device_data) if (ret) goto error; + ret = i40e_vf_register_irq(i40e_vf_dev, + VFIO_IRQ_TYPE_REMAP_BAR_REGION, + VFIO_IRQ_SUBTYPE_REMAP_BAR_REGION, + VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_EVENTFD, + &i40e_vf_irqops_remap_bars); + if (ret) + goto error; + i40e_vf_dev->mig_ctl = mig_ctl; vfio_pci_set_vendor_regions(device_data, i40e_vf_dev->num_regions); - vfio_pci_set_vendor_irqs(device_data, 0); + vfio_pci_set_vendor_irqs(device_data, + i40e_vf_dev->num_irqs); } ret = vfio_pci_open(device_data); @@ -413,6 +711,9 @@ static int i40e_vf_open(void *device_data) i40e_vf_dev->regions = NULL; vfio_pci_set_vendor_regions(device_data, 0); vfio_pci_set_vendor_irqs(device_data, 0); + kfree(i40e_vf_dev->irqs); + i40e_vf_dev->irqs = NULL; + i40e_vf_dev->num_irqs = 0; } module_put(THIS_MODULE); mutex_unlock(&i40e_vf_dev->reflock); @@ -436,7 +737,16 @@ void i40e_vf_release(void *device_data) kfree(i40e_vf_dev->regions); i40e_vf_dev->regions = NULL; vfio_pci_set_vendor_regions(device_data, 0); + vfio_pci_set_vendor_irqs(device_data, 0); + for (i = 0; i < i40e_vf_dev->num_irqs; i++) + i40e_vf_dev->irqs[i].ops->set_irqs(i40e_vf_dev, + VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + i, 0, 0, NULL); + kfree(i40e_vf_dev->irqs); + i40e_vf_dev->irqs = NULL; + 
i40e_vf_dev->num_irqs = 0; } vfio_pci_release(device_data); mutex_unlock(&i40e_vf_dev->reflock); @@ -448,6 +758,10 @@ static long i40e_vf_ioctl(void *device_data, { if (cmd == VFIO_DEVICE_GET_REGION_INFO) return i40e_vf_get_region_info(device_data, cmd, arg); + else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) + return i40e_vf_get_irq_info(device_data, cmd, arg); + else if (cmd == VFIO_DEVICE_SET_IRQS) + return i40e_vf_set_irqs(device_data, cmd, arg); return vfio_pci_ioctl(device_data, cmd, arg); } @@ -487,8 +801,10 @@ static ssize_t i40e_vf_write(void *device_data, const char __user *buf, int num_vdev_regions = vfio_pci_num_regions(device_data); int num_vendor_region = i40e_vf_dev->num_regions; - if (index == VFIO_PCI_BAR0_REGION_INDEX) + if (index == VFIO_PCI_BAR0_REGION_INDEX) { + pr_debug("vfio bar 0 write\n"); ;// scan dirty pages + } if (index < VFIO_PCI_NUM_REGIONS + num_vdev_regions) return vfio_pci_write(device_data, buf, count, ppos); diff --git a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h index 918ba275d5b5..2c4d9ebee4ac 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h +++ b/drivers/net/ethernet/intel/i40e/i40e_vf_migration.h @@ -46,6 +46,14 @@ struct pci_sriov { bool drivers_autoprobe; /* Auto probing of VFs by driver */ }; +struct i40e_vf_remap_irq_ctx { + struct eventfd_ctx *trigger; + struct virqfd *sync; + atomic_t cnt; + wait_queue_head_t waitq; + bool init; +}; + struct i40e_vf_migration { __u32 vf_vendor; __u32 vf_device; @@ -58,11 +66,14 @@ struct i40e_vf_migration { struct vfio_device_migration_info *mig_ctl; bool in_dirty_track; + struct i40e_vf_remap_irq_ctx remap_irq_ctx; struct i40e_vf_region *regions; int num_regions; struct notifier_block iommu_notifier; struct vfio_group *vfio_group; + struct i40e_vf_irq *irqs; + int num_irqs; }; @@ -89,5 +100,20 @@ struct i40e_vf_region { void *data; }; +struct i40e_vf_irqops { + int (*set_irqs)(struct i40e_vf_migration *i40e_vf_dev, 
+ u32 flags, unsigned int index, + unsigned int start, unsigned int count, + void *data); +}; + +struct i40e_vf_irq { + u32 type; + u32 subtype; + u32 flags; + u32 count; + const struct i40e_vf_irqops *ops; +}; + #endif /* I40E_MIG_H */