diff mbox series

[v4,07/12] vfio_pci: shrink vfio_pci.c

Message ID 1578398509-26453-8-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series vfio_pci: wrap pci device as a mediated device | expand

Commit Message

Yi Liu Jan. 7, 2020, 12:01 p.m. UTC
This patch removes the common codes in vfio_pci.c, leave the module
specific codes, new vfio_pci.c will leverage the common functions
implemented in vfio_pci_common.c.

Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
---
 drivers/vfio/pci/Makefile           |    3 +-
 drivers/vfio/pci/vfio_pci.c         | 1442 -----------------------------------
 drivers/vfio/pci/vfio_pci_common.c  |    2 +-
 drivers/vfio/pci/vfio_pci_private.h |    2 +
 4 files changed, 5 insertions(+), 1444 deletions(-)

Comments

kernel test robot Jan. 8, 2020, 11:24 a.m. UTC | #1
Hi Liu,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on v5.5-rc5]
[also build test WARNING on next-20200106]
[cannot apply to vfio/next]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Liu-Yi-L/vfio_pci-wrap-pci-device-as-a-mediated-device/20200108-020930
base:    c79f46a282390e0f5b306007bf7b11a46d529538
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.1-129-g341daf20-dirty
        make ARCH=x86_64 allmodconfig
        make C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)

>> drivers/vfio/pci/vfio_pci_common.c:201:25: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:201:43: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:201:56: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:201:65: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:206:25: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:206:44: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:206:57: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:206:66: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:214:39: sparse: sparse: restricted pci_power_t degrades to integer
   drivers/vfio/pci/vfio_pci_common.c:214:58: sparse: sparse: restricted pci_power_t degrades to integer

vim +201 drivers/vfio/pci/vfio_pci_common.c

8db581db521ec0 Liu Yi L 2020-01-07  186  
8db581db521ec0 Liu Yi L 2020-01-07  187  /*
8db581db521ec0 Liu Yi L 2020-01-07  188   * pci_set_power_state() wrapper handling devices which perform a soft reset on
8db581db521ec0 Liu Yi L 2020-01-07  189   * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
8db581db521ec0 Liu Yi L 2020-01-07  190   * restore when returned to D0.  Saved separately from pci_saved_state for use
8db581db521ec0 Liu Yi L 2020-01-07  191   * by PM capability emulation and separately from pci_dev internal saved state
8db581db521ec0 Liu Yi L 2020-01-07  192   * to avoid it being overwritten and consumed around other resets.
8db581db521ec0 Liu Yi L 2020-01-07  193   */
8db581db521ec0 Liu Yi L 2020-01-07  194  int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
8db581db521ec0 Liu Yi L 2020-01-07  195  {
8db581db521ec0 Liu Yi L 2020-01-07  196  	struct pci_dev *pdev = vdev->pdev;
8db581db521ec0 Liu Yi L 2020-01-07  197  	bool needs_restore = false, needs_save = false;
8db581db521ec0 Liu Yi L 2020-01-07  198  	int ret;
8db581db521ec0 Liu Yi L 2020-01-07  199  
8db581db521ec0 Liu Yi L 2020-01-07  200  	if (vdev->needs_pm_restore) {
8db581db521ec0 Liu Yi L 2020-01-07 @201  		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
8db581db521ec0 Liu Yi L 2020-01-07  202  			pci_save_state(pdev);
8db581db521ec0 Liu Yi L 2020-01-07  203  			needs_save = true;
8db581db521ec0 Liu Yi L 2020-01-07  204  		}
8db581db521ec0 Liu Yi L 2020-01-07  205  
8db581db521ec0 Liu Yi L 2020-01-07  206  		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
8db581db521ec0 Liu Yi L 2020-01-07  207  			needs_restore = true;
8db581db521ec0 Liu Yi L 2020-01-07  208  	}
8db581db521ec0 Liu Yi L 2020-01-07  209  
8db581db521ec0 Liu Yi L 2020-01-07  210  	ret = pci_set_power_state(pdev, state);
8db581db521ec0 Liu Yi L 2020-01-07  211  
8db581db521ec0 Liu Yi L 2020-01-07  212  	if (!ret) {
8db581db521ec0 Liu Yi L 2020-01-07  213  		/* D3 might be unsupported via quirk, skip unless in D3 */
8db581db521ec0 Liu Yi L 2020-01-07  214  		if (needs_save && pdev->current_state >= PCI_D3hot) {
8db581db521ec0 Liu Yi L 2020-01-07  215  			vdev->pm_save = pci_store_saved_state(pdev);
8db581db521ec0 Liu Yi L 2020-01-07  216  		} else if (needs_restore) {
8db581db521ec0 Liu Yi L 2020-01-07  217  			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
8db581db521ec0 Liu Yi L 2020-01-07  218  			pci_restore_state(pdev);
8db581db521ec0 Liu Yi L 2020-01-07  219  		}
8db581db521ec0 Liu Yi L 2020-01-07  220  	}
8db581db521ec0 Liu Yi L 2020-01-07  221  
8db581db521ec0 Liu Yi L 2020-01-07  222  	return ret;
8db581db521ec0 Liu Yi L 2020-01-07  223  }
8db581db521ec0 Liu Yi L 2020-01-07  224  

:::::: The code at line 201 was first introduced by commit
:::::: 8db581db521ec047e12946a9c933f085c4d680ba vfio_pci: duplicate vfio_pci.c

:::::: TO: Liu Yi L <yi.l.liu@intel.com>
:::::: CC: 0day robot <lkp@intel.com>

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
Alex Williamson Jan. 9, 2020, 10:48 p.m. UTC | #2
On Tue,  7 Jan 2020 20:01:44 +0800
Liu Yi L <yi.l.liu@intel.com> wrote:

> This patch removes the common codes in vfio_pci.c, leave the module
> specific codes, new vfio_pci.c will leverage the common functions
> implemented in vfio_pci_common.c.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> Cc: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> ---
>  drivers/vfio/pci/Makefile           |    3 +-
>  drivers/vfio/pci/vfio_pci.c         | 1442 -----------------------------------
>  drivers/vfio/pci/vfio_pci_common.c  |    2 +-
>  drivers/vfio/pci/vfio_pci_private.h |    2 +
>  4 files changed, 5 insertions(+), 1444 deletions(-)
> 
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index f027f8a..d94317a 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -1,6 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  
> -vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
> +vfio-pci-y := vfio_pci.o vfio_pci_common.o vfio_pci_intrs.o \
> +		vfio_pci_rdwr.o vfio_pci_config.o
>  vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
>  vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
>  
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 103e493..7e24da2 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c

I think there are a bunch of headers that are no longer needed here
too.  It at least compiles without these:

-#include <linux/eventfd.h>
-#include <linux/file.h>
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/pm_runtime.h>
-#include <linux/uaccess.h>
-#include <linux/nospec.h>


> @@ -54,411 +54,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(disable_idle_d3,
>  		 "Disable using the PCI D3 low power state for idle, unused devices");
>  
> -/*
> - * Our VGA arbiter participation is limited since we don't know anything
> - * about the device itself.  However, if the device is the only VGA device
> - * downstream of a bridge and VFIO VGA support is disabled, then we can
> - * safely return legacy VGA IO and memory as not decoded since the user
> - * has no way to get to it and routing can be disabled externally at the
> - * bridge.
> - */
> -unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
> -{
> -	struct vfio_pci_device *vdev = opaque;
> -	struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
> -	unsigned char max_busnr;
> -	unsigned int decodes;
> -
> -	if (single_vga || !vfio_vga_disabled(vdev) ||
> -		pci_is_root_bus(pdev->bus))
> -		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
> -		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
> -
> -	max_busnr = pci_bus_max_busnr(pdev->bus);
> -	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
> -
> -	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
> -		if (tmp == pdev ||
> -		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
> -		    pci_is_root_bus(tmp->bus))
> -			continue;
> -
> -		if (tmp->bus->number >= pdev->bus->number &&
> -		    tmp->bus->number <= max_busnr) {
> -			pci_dev_put(tmp);
> -			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
> -			break;
> -		}
> -	}
> -
> -	return decodes;
> -}
> -
> -static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> -{
> -	struct resource *res;
> -	int i;
> -	struct vfio_pci_dummy_resource *dummy_res;
> -
> -	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> -
> -	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> -		int bar = i + PCI_STD_RESOURCES;
> -
> -		res = &vdev->pdev->resource[bar];
> -
> -		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> -			goto no_mmap;
> -
> -		if (!(res->flags & IORESOURCE_MEM))
> -			goto no_mmap;
> -
> -		/*
> -		 * The PCI core shouldn't set up a resource with a
> -		 * type but zero size. But there may be bugs that
> -		 * cause us to do that.
> -		 */
> -		if (!resource_size(res))
> -			goto no_mmap;
> -
> -		if (resource_size(res) >= PAGE_SIZE) {
> -			vdev->bar_mmap_supported[bar] = true;
> -			continue;
> -		}
> -
> -		if (!(res->start & ~PAGE_MASK)) {
> -			/*
> -			 * Add a dummy resource to reserve the remainder
> -			 * of the exclusive page in case that hot-add
> -			 * device's bar is assigned into it.
> -			 */
> -			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> -			if (dummy_res == NULL)
> -				goto no_mmap;
> -
> -			dummy_res->resource.name = "vfio sub-page reserved";
> -			dummy_res->resource.start = res->end + 1;
> -			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> -			dummy_res->resource.flags = res->flags;
> -			if (request_resource(res->parent,
> -						&dummy_res->resource)) {
> -				kfree(dummy_res);
> -				goto no_mmap;
> -			}
> -			dummy_res->index = bar;
> -			list_add(&dummy_res->res_next,
> -					&vdev->dummy_resources_list);
> -			vdev->bar_mmap_supported[bar] = true;
> -			continue;
> -		}
> -		/*
> -		 * Here we don't handle the case when the BAR is not page
> -		 * aligned because we can't expect the BAR will be
> -		 * assigned into the same location in a page in guest
> -		 * when we passthrough the BAR. And it's hard to access
> -		 * this BAR in userspace because we have no way to get
> -		 * the BAR's location in a page.
> -		 */
> -no_mmap:
> -		vdev->bar_mmap_supported[bar] = false;
> -	}
> -}
> -
> -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
> -
> -/*
> - * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> - * _and_ the ability detect when the device is asserting INTx via PCI_STATUS.
> - * If a device implements the former but not the latter we would typically
> - * expect broken_intx_masking be set and require an exclusive interrupt.
> - * However since we do have control of the device's ability to assert INTx,
> - * we can instead pretend that the device does not implement INTx, virtualizing
> - * the pin register to report zero and maintaining DisINTx set on the host.
> - */
> -static bool vfio_pci_nointx(struct pci_dev *pdev)
> -{
> -	switch (pdev->vendor) {
> -	case PCI_VENDOR_ID_INTEL:
> -		switch (pdev->device) {
> -		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
> -		case 0x1572:
> -		case 0x1574:
> -		case 0x1580 ... 0x1581:
> -		case 0x1583 ... 0x158b:
> -		case 0x37d0 ... 0x37d2:
> -			return true;
> -		default:
> -			return false;
> -		}
> -	}
> -
> -	return false;
> -}
> -
> -void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
> -{
> -	struct pci_dev *pdev = vdev->pdev;
> -	u16 pmcsr;
> -
> -	if (!pdev->pm_cap)
> -		return;
> -
> -	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
> -
> -	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
> -}
> -
> -/*
> - * pci_set_power_state() wrapper handling devices which perform a soft reset on
> - * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
> - * restore when returned to D0.  Saved separately from pci_saved_state for use
> - * by PM capability emulation and separately from pci_dev internal saved state
> - * to avoid it being overwritten and consumed around other resets.
> - */
> -int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
> -{
> -	struct pci_dev *pdev = vdev->pdev;
> -	bool needs_restore = false, needs_save = false;
> -	int ret;
> -
> -	if (vdev->needs_pm_restore) {
> -		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
> -			pci_save_state(pdev);
> -			needs_save = true;
> -		}
> -
> -		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
> -			needs_restore = true;
> -	}
> -
> -	ret = pci_set_power_state(pdev, state);
> -
> -	if (!ret) {
> -		/* D3 might be unsupported via quirk, skip unless in D3 */
> -		if (needs_save && pdev->current_state >= PCI_D3hot) {
> -			vdev->pm_save = pci_store_saved_state(pdev);
> -		} else if (needs_restore) {
> -			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
> -			pci_restore_state(pdev);
> -		}
> -	}


This gets a bit ugly, vfio_pci_remove() retains:

kfree(vdev->pm_save)

But vfio_pci.c otherwise has no use of this field on the
vfio_pci_device.  I'm afraid we're really just doing a pretty rough
splitting of the code rather than massaging the callbacks between the
modules into an actual API, for example maybe there should be init and
exit callbacks into the common code to handle such things.
ioeventfds_{list,lock} are similar, vfio_pci.c inits and destroys them,
but otherwise doesn't know what they're for.  I wonder how many more
such things exist.  Thanks,

Alex
Yi Liu Jan. 16, 2020, 12:42 p.m. UTC | #3
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Friday, January 10, 2020 6:48 AM
> To: Liu, Yi L <yi.l.liu@intel.com>
> Subject: Re: [PATCH v4 07/12] vfio_pci: shrink vfio_pci.c
> 
> On Tue,  7 Jan 2020 20:01:44 +0800
> Liu Yi L <yi.l.liu@intel.com> wrote:
> 
> > This patch removes the common codes in vfio_pci.c, leave the module
> > specific codes, new vfio_pci.c will leverage the common functions
> > implemented in vfio_pci_common.c.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > Cc: Lu Baolu <baolu.lu@linux.intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > ---
> >  drivers/vfio/pci/Makefile           |    3 +-
> >  drivers/vfio/pci/vfio_pci.c         | 1442 -----------------------------------
> >  drivers/vfio/pci/vfio_pci_common.c  |    2 +-
> >  drivers/vfio/pci/vfio_pci_private.h |    2 +
> >  4 files changed, 5 insertions(+), 1444 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> > index f027f8a..d94317a 100644
> > --- a/drivers/vfio/pci/Makefile
> > +++ b/drivers/vfio/pci/Makefile
> > @@ -1,6 +1,7 @@
> >  # SPDX-License-Identifier: GPL-2.0-only
> >
> > -vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
> > +vfio-pci-y := vfio_pci.o vfio_pci_common.o vfio_pci_intrs.o \
> > +		vfio_pci_rdwr.o vfio_pci_config.o
> >  vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> >  vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
> >
> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index 103e493..7e24da2 100644
> > --- a/drivers/vfio/pci/vfio_pci.c
> > +++ b/drivers/vfio/pci/vfio_pci.c
> 
> I think there are a bunch of headers that are no longer needed here
> too.  It at least compiles without these:
> 
> -#include <linux/eventfd.h>
> -#include <linux/file.h>
> -#include <linux/interrupt.h>
> -#include <linux/notifier.h>
> -#include <linux/pm_runtime.h>
> -#include <linux/uaccess.h>
> -#include <linux/nospec.h>

Got it, let me remove them.

> 
> 
> > @@ -54,411 +54,6 @@ module_param(disable_idle_d3, bool, S_IRUGO |
> S_IWUSR);
> >  MODULE_PARM_DESC(disable_idle_d3,
> >  		 "Disable using the PCI D3 low power state for idle, unused devices");
> >
> > -/*
> > - * Our VGA arbiter participation is limited since we don't know anything
> > - * about the device itself.  However, if the device is the only VGA device
> > - * downstream of a bridge and VFIO VGA support is disabled, then we can
> > - * safely return legacy VGA IO and memory as not decoded since the user
> > - * has no way to get to it and routing can be disabled externally at the
> > - * bridge.
> > - */
> > -unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
> > -{
> > -	struct vfio_pci_device *vdev = opaque;
> > -	struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
> > -	unsigned char max_busnr;
> > -	unsigned int decodes;
> > -
> > -	if (single_vga || !vfio_vga_disabled(vdev) ||
> > -		pci_is_root_bus(pdev->bus))
> > -		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
> > -		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
> > -
> > -	max_busnr = pci_bus_max_busnr(pdev->bus);
> > -	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
> > -
> > -	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
> > -		if (tmp == pdev ||
> > -		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
> > -		    pci_is_root_bus(tmp->bus))
> > -			continue;
> > -
> > -		if (tmp->bus->number >= pdev->bus->number &&
> > -		    tmp->bus->number <= max_busnr) {
> > -			pci_dev_put(tmp);
> > -			decodes |= VGA_RSRC_LEGACY_IO |
> VGA_RSRC_LEGACY_MEM;
> > -			break;
> > -		}
> > -	}
> > -
> > -	return decodes;
> > -}
> > -
> > -static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
> > -{
> > -	struct resource *res;
> > -	int i;
> > -	struct vfio_pci_dummy_resource *dummy_res;
> > -
> > -	INIT_LIST_HEAD(&vdev->dummy_resources_list);
> > -
> > -	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
> > -		int bar = i + PCI_STD_RESOURCES;
> > -
> > -		res = &vdev->pdev->resource[bar];
> > -
> > -		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
> > -			goto no_mmap;
> > -
> > -		if (!(res->flags & IORESOURCE_MEM))
> > -			goto no_mmap;
> > -
> > -		/*
> > -		 * The PCI core shouldn't set up a resource with a
> > -		 * type but zero size. But there may be bugs that
> > -		 * cause us to do that.
> > -		 */
> > -		if (!resource_size(res))
> > -			goto no_mmap;
> > -
> > -		if (resource_size(res) >= PAGE_SIZE) {
> > -			vdev->bar_mmap_supported[bar] = true;
> > -			continue;
> > -		}
> > -
> > -		if (!(res->start & ~PAGE_MASK)) {
> > -			/*
> > -			 * Add a dummy resource to reserve the remainder
> > -			 * of the exclusive page in case that hot-add
> > -			 * device's bar is assigned into it.
> > -			 */
> > -			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
> > -			if (dummy_res == NULL)
> > -				goto no_mmap;
> > -
> > -			dummy_res->resource.name = "vfio sub-page reserved";
> > -			dummy_res->resource.start = res->end + 1;
> > -			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
> > -			dummy_res->resource.flags = res->flags;
> > -			if (request_resource(res->parent,
> > -						&dummy_res->resource)) {
> > -				kfree(dummy_res);
> > -				goto no_mmap;
> > -			}
> > -			dummy_res->index = bar;
> > -			list_add(&dummy_res->res_next,
> > -					&vdev->dummy_resources_list);
> > -			vdev->bar_mmap_supported[bar] = true;
> > -			continue;
> > -		}
> > -		/*
> > -		 * Here we don't handle the case when the BAR is not page
> > -		 * aligned because we can't expect the BAR will be
> > -		 * assigned into the same location in a page in guest
> > -		 * when we passthrough the BAR. And it's hard to access
> > -		 * this BAR in userspace because we have no way to get
> > -		 * the BAR's location in a page.
> > -		 */
> > -no_mmap:
> > -		vdev->bar_mmap_supported[bar] = false;
> > -	}
> > -}
> > -
> > -static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
> > -
> > -/*
> > - * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
> > - * _and_ the ability detect when the device is asserting INTx via PCI_STATUS.
> > - * If a device implements the former but not the latter we would typically
> > - * expect broken_intx_masking be set and require an exclusive interrupt.
> > - * However since we do have control of the device's ability to assert INTx,
> > - * we can instead pretend that the device does not implement INTx, virtualizing
> > - * the pin register to report zero and maintaining DisINTx set on the host.
> > - */
> > -static bool vfio_pci_nointx(struct pci_dev *pdev)
> > -{
> > -	switch (pdev->vendor) {
> > -	case PCI_VENDOR_ID_INTEL:
> > -		switch (pdev->device) {
> > -		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
> > -		case 0x1572:
> > -		case 0x1574:
> > -		case 0x1580 ... 0x1581:
> > -		case 0x1583 ... 0x158b:
> > -		case 0x37d0 ... 0x37d2:
> > -			return true;
> > -		default:
> > -			return false;
> > -		}
> > -	}
> > -
> > -	return false;
> > -}
> > -
> > -void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
> > -{
> > -	struct pci_dev *pdev = vdev->pdev;
> > -	u16 pmcsr;
> > -
> > -	if (!pdev->pm_cap)
> > -		return;
> > -
> > -	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
> > -
> > -	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
> > -}
> > -
> > -/*
> > - * pci_set_power_state() wrapper handling devices which perform a soft reset on
> > - * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
> > - * restore when returned to D0.  Saved separately from pci_saved_state for use
> > - * by PM capability emulation and separately from pci_dev internal saved state
> > - * to avoid it being overwritten and consumed around other resets.
> > - */
> > -int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
> > -{
> > -	struct pci_dev *pdev = vdev->pdev;
> > -	bool needs_restore = false, needs_save = false;
> > -	int ret;
> > -
> > -	if (vdev->needs_pm_restore) {
> > -		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
> > -			pci_save_state(pdev);
> > -			needs_save = true;
> > -		}
> > -
> > -		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
> > -			needs_restore = true;
> > -	}
> > -
> > -	ret = pci_set_power_state(pdev, state);
> > -
> > -	if (!ret) {
> > -		/* D3 might be unsupported via quirk, skip unless in D3 */
> > -		if (needs_save && pdev->current_state >= PCI_D3hot) {
> > -			vdev->pm_save = pci_store_saved_state(pdev);
> > -		} else if (needs_restore) {
> > -			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
> > -			pci_restore_state(pdev);
> > -		}
> > -	}
> 
> 
> This gets a bit ugly, vfio_pci_remove() retains:
> 
> kfree(vdev->pm_save)
> 
> But vfio_pci.c otherwise has no use of this field on the
> vfio_pci_device.  I'm afraid we're really just doing a pretty rough
> splitting of the code rather than massaging the callbacks between the
> modules into an actual API, for example maybe there should be init and
> exit callbacks into the common code to handle such things.
> ioeventfds_{list,lock} are similar, vfio_pci.c inits and destroys them,
> but otherwise doesn't know what they're for.  I wonder how many more
> such things exist.  Thanks,

yeah, I tried to keep the code as what it looks like today. So it is
now much more like a code splitting). But I agree we need to make it
more thorough. I had been considering how to make the code work as
what you described here since I saw your comment last week. I may
need a more detailed investigation on it per your direction, and answer
your question better.

Thanks very much for your guidelines.

Regards,
Yi Liu

> 
> Alex
diff mbox series

Patch

diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index f027f8a..d94317a 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
+vfio-pci-y := vfio_pci.o vfio_pci_common.o vfio_pci_intrs.o \
+		vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
 vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
 
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 103e493..7e24da2 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -54,411 +54,6 @@  module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(disable_idle_d3,
 		 "Disable using the PCI D3 low power state for idle, unused devices");
 
-/*
- * Our VGA arbiter participation is limited since we don't know anything
- * about the device itself.  However, if the device is the only VGA device
- * downstream of a bridge and VFIO VGA support is disabled, then we can
- * safely return legacy VGA IO and memory as not decoded since the user
- * has no way to get to it and routing can be disabled externally at the
- * bridge.
- */
-unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga)
-{
-	struct vfio_pci_device *vdev = opaque;
-	struct pci_dev *tmp = NULL, *pdev = vdev->pdev;
-	unsigned char max_busnr;
-	unsigned int decodes;
-
-	if (single_vga || !vfio_vga_disabled(vdev) ||
-		pci_is_root_bus(pdev->bus))
-		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
-		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
-
-	max_busnr = pci_bus_max_busnr(pdev->bus);
-	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
-
-	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
-		if (tmp == pdev ||
-		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
-		    pci_is_root_bus(tmp->bus))
-			continue;
-
-		if (tmp->bus->number >= pdev->bus->number &&
-		    tmp->bus->number <= max_busnr) {
-			pci_dev_put(tmp);
-			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
-			break;
-		}
-	}
-
-	return decodes;
-}
-
-static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
-{
-	struct resource *res;
-	int i;
-	struct vfio_pci_dummy_resource *dummy_res;
-
-	INIT_LIST_HEAD(&vdev->dummy_resources_list);
-
-	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
-		int bar = i + PCI_STD_RESOURCES;
-
-		res = &vdev->pdev->resource[bar];
-
-		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
-			goto no_mmap;
-
-		if (!(res->flags & IORESOURCE_MEM))
-			goto no_mmap;
-
-		/*
-		 * The PCI core shouldn't set up a resource with a
-		 * type but zero size. But there may be bugs that
-		 * cause us to do that.
-		 */
-		if (!resource_size(res))
-			goto no_mmap;
-
-		if (resource_size(res) >= PAGE_SIZE) {
-			vdev->bar_mmap_supported[bar] = true;
-			continue;
-		}
-
-		if (!(res->start & ~PAGE_MASK)) {
-			/*
-			 * Add a dummy resource to reserve the remainder
-			 * of the exclusive page in case that hot-add
-			 * device's bar is assigned into it.
-			 */
-			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
-			if (dummy_res == NULL)
-				goto no_mmap;
-
-			dummy_res->resource.name = "vfio sub-page reserved";
-			dummy_res->resource.start = res->end + 1;
-			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
-			dummy_res->resource.flags = res->flags;
-			if (request_resource(res->parent,
-						&dummy_res->resource)) {
-				kfree(dummy_res);
-				goto no_mmap;
-			}
-			dummy_res->index = bar;
-			list_add(&dummy_res->res_next,
-					&vdev->dummy_resources_list);
-			vdev->bar_mmap_supported[bar] = true;
-			continue;
-		}
-		/*
-		 * Here we don't handle the case when the BAR is not page
-		 * aligned because we can't expect the BAR will be
-		 * assigned into the same location in a page in guest
-		 * when we passthrough the BAR. And it's hard to access
-		 * this BAR in userspace because we have no way to get
-		 * the BAR's location in a page.
-		 */
-no_mmap:
-		vdev->bar_mmap_supported[bar] = false;
-	}
-}
-
-static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
-
-/*
- * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
- * _and_ the ability detect when the device is asserting INTx via PCI_STATUS.
- * If a device implements the former but not the latter we would typically
- * expect broken_intx_masking be set and require an exclusive interrupt.
- * However since we do have control of the device's ability to assert INTx,
- * we can instead pretend that the device does not implement INTx, virtualizing
- * the pin register to report zero and maintaining DisINTx set on the host.
- */
-static bool vfio_pci_nointx(struct pci_dev *pdev)
-{
-	switch (pdev->vendor) {
-	case PCI_VENDOR_ID_INTEL:
-		switch (pdev->device) {
-		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
-		case 0x1572:
-		case 0x1574:
-		case 0x1580 ... 0x1581:
-		case 0x1583 ... 0x158b:
-		case 0x37d0 ... 0x37d2:
-			return true;
-		default:
-			return false;
-		}
-	}
-
-	return false;
-}
-
-void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
-{
-	struct pci_dev *pdev = vdev->pdev;
-	u16 pmcsr;
-
-	if (!pdev->pm_cap)
-		return;
-
-	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
-
-	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
-}
-
-/*
- * pci_set_power_state() wrapper handling devices which perform a soft reset on
- * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
- * restore when returned to D0.  Saved separately from pci_saved_state for use
- * by PM capability emulation and separately from pci_dev internal saved state
- * to avoid it being overwritten and consumed around other resets.
- */
-int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
-{
-	struct pci_dev *pdev = vdev->pdev;
-	bool needs_restore = false, needs_save = false;
-	int ret;
-
-	if (vdev->needs_pm_restore) {
-		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
-			pci_save_state(pdev);
-			needs_save = true;
-		}
-
-		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
-			needs_restore = true;
-	}
-
-	ret = pci_set_power_state(pdev, state);
-
-	if (!ret) {
-		/* D3 might be unsupported via quirk, skip unless in D3 */
-		if (needs_save && pdev->current_state >= PCI_D3hot) {
-			vdev->pm_save = pci_store_saved_state(pdev);
-		} else if (needs_restore) {
-			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
-			pci_restore_state(pdev);
-		}
-	}
-
-	return ret;
-}
-
-int vfio_pci_enable(struct vfio_pci_device *vdev)
-{
-	struct pci_dev *pdev = vdev->pdev;
-	int ret;
-	u16 cmd;
-	u8 msix_pos;
-
-	vfio_pci_set_power_state(vdev, PCI_D0);
-
-	/* Don't allow our initial saved state to include busmaster */
-	pci_clear_master(pdev);
-
-	ret = pci_enable_device(pdev);
-	if (ret)
-		return ret;
-
-	/* If reset fails because of the device lock, fail this path entirely */
-	ret = pci_try_reset_function(pdev);
-	if (ret == -EAGAIN) {
-		pci_disable_device(pdev);
-		return ret;
-	}
-
-	vdev->reset_works = !ret;
-	pci_save_state(pdev);
-	vdev->pci_saved_state = pci_store_saved_state(pdev);
-	if (!vdev->pci_saved_state)
-		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
-
-	if (likely(!vdev->nointxmask)) {
-		if (vfio_pci_nointx(pdev)) {
-			pci_info(pdev, "Masking broken INTx support\n");
-			vdev->nointx = true;
-			pci_intx(pdev, 0);
-		} else
-			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
-	}
-
-	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
-	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
-		cmd &= ~PCI_COMMAND_INTX_DISABLE;
-		pci_write_config_word(pdev, PCI_COMMAND, cmd);
-	}
-
-	ret = vfio_config_init(vdev);
-	if (ret) {
-		kfree(vdev->pci_saved_state);
-		vdev->pci_saved_state = NULL;
-		pci_disable_device(pdev);
-		return ret;
-	}
-
-	msix_pos = pdev->msix_cap;
-	if (msix_pos) {
-		u16 flags;
-		u32 table;
-
-		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
-		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
-
-		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
-		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
-		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
-	} else
-		vdev->msix_bar = 0xFF;
-
-	if (!vfio_vga_disabled(vdev) && vfio_pci_is_vga(pdev))
-		vdev->has_vga = true;
-
-
-	if (vfio_pci_is_vga(pdev) &&
-	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
-	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
-		ret = vfio_pci_igd_init(vdev);
-		if (ret) {
-			pci_warn(pdev, "Failed to setup Intel IGD regions\n");
-			goto disable_exit;
-		}
-	}
-
-	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
-	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
-		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
-		if (ret && ret != -ENODEV) {
-			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
-			goto disable_exit;
-		}
-	}
-
-	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
-	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
-		ret = vfio_pci_ibm_npu2_init(vdev);
-		if (ret && ret != -ENODEV) {
-			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
-			goto disable_exit;
-		}
-	}
-
-	vfio_pci_probe_mmaps(vdev);
-
-	return 0;
-
-disable_exit:
-	vfio_pci_disable(vdev);
-	return ret;
-}
-
-void vfio_pci_disable(struct vfio_pci_device *vdev)
-{
-	struct pci_dev *pdev = vdev->pdev;
-	struct vfio_pci_dummy_resource *dummy_res, *tmp;
-	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
-	int i, bar;
-
-	/* Stop the device from further DMA */
-	pci_clear_master(pdev);
-
-	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
-				VFIO_IRQ_SET_ACTION_TRIGGER,
-				vdev->irq_type, 0, 0, NULL);
-
-	/* Device closed, don't need mutex here */
-	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
-				 &vdev->ioeventfds_list, next) {
-		vfio_virqfd_disable(&ioeventfd->virqfd);
-		list_del(&ioeventfd->next);
-		kfree(ioeventfd);
-	}
-	vdev->ioeventfds_nr = 0;
-
-	vdev->virq_disabled = false;
-
-	for (i = 0; i < vdev->num_regions; i++)
-		vdev->region[i].ops->release(vdev, &vdev->region[i]);
-
-	vdev->num_regions = 0;
-	kfree(vdev->region);
-	vdev->region = NULL; /* don't krealloc a freed pointer */
-
-	vfio_config_free(vdev);
-
-	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
-		bar = i + PCI_STD_RESOURCES;
-		if (!vdev->barmap[bar])
-			continue;
-		pci_iounmap(pdev, vdev->barmap[bar]);
-		pci_release_selected_regions(pdev, 1 << bar);
-		vdev->barmap[bar] = NULL;
-	}
-
-	list_for_each_entry_safe(dummy_res, tmp,
-				 &vdev->dummy_resources_list, res_next) {
-		list_del(&dummy_res->res_next);
-		release_resource(&dummy_res->resource);
-		kfree(dummy_res);
-	}
-
-	vdev->needs_reset = true;
-
-	/*
-	 * If we have saved state, restore it.  If we can reset the device,
-	 * even better.  Resetting with current state seems better than
-	 * nothing, but saving and restoring current state without reset
-	 * is just busy work.
-	 */
-	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
-		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
-
-		if (!vdev->reset_works)
-			goto out;
-
-		pci_save_state(pdev);
-	}
-
-	/*
-	 * Disable INTx and MSI, presumably to avoid spurious interrupts
-	 * during reset.  Stolen from pci_reset_function()
-	 */
-	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
-
-	/*
-	 * Try to get the locks ourselves to prevent a deadlock. The
-	 * success of this is dependent on being able to lock the device,
-	 * which is not always possible.
-	 * We can not use the "try" reset interface here, which will
-	 * overwrite the previously restored configuration information.
-	 */
-	if (vdev->reset_works && pci_cfg_access_trylock(pdev)) {
-		if (device_trylock(&pdev->dev)) {
-			if (!__pci_reset_function_locked(pdev))
-				vdev->needs_reset = false;
-			device_unlock(&pdev->dev);
-		}
-		pci_cfg_access_unlock(pdev);
-	}
-
-	pci_restore_state(pdev);
-out:
-	pci_disable_device(pdev);
-
-	vfio_pci_try_bus_reset(vdev);
-
-	if (!vdev->disable_idle_d3)
-		vfio_pci_set_power_state(vdev, PCI_D3hot);
-}
-
-void vfio_pci_refresh_config(struct vfio_pci_device *vdev,
-			bool nointxmask, bool disable_idle_d3)
-{
-	vdev->nointxmask = nointxmask;
-	vdev->disable_idle_d3 = disable_idle_d3;
-}
-
 static void vfio_pci_release(void *device_data)
 {
 	struct vfio_pci_device *vdev = device_data;
@@ -502,777 +97,6 @@  static int vfio_pci_open(void *device_data)
 	return ret;
 }
 
-static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
-{
-	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
-		u8 pin;
-
-		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
-		    vdev->nointx || vdev->pdev->is_virtfn)
-			return 0;
-
-		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
-
-		return pin ? 1 : 0;
-	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
-		u8 pos;
-		u16 flags;
-
-		pos = vdev->pdev->msi_cap;
-		if (pos) {
-			pci_read_config_word(vdev->pdev,
-					     pos + PCI_MSI_FLAGS, &flags);
-			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
-		}
-	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
-		u8 pos;
-		u16 flags;
-
-		pos = vdev->pdev->msix_cap;
-		if (pos) {
-			pci_read_config_word(vdev->pdev,
-					     pos + PCI_MSIX_FLAGS, &flags);
-
-			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
-		}
-	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
-		if (pci_is_pcie(vdev->pdev))
-			return 1;
-	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
-		return 1;
-	}
-
-	return 0;
-}
-
-static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
-{
-	(*(int *)data)++;
-	return 0;
-}
-
-struct vfio_pci_fill_info {
-	int max;
-	int cur;
-	struct vfio_pci_dependent_device *devices;
-};
-
-static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
-{
-	struct vfio_pci_fill_info *fill = data;
-	struct iommu_group *iommu_group;
-
-	if (fill->cur == fill->max)
-		return -EAGAIN; /* Something changed, try again */
-
-	iommu_group = iommu_group_get(&pdev->dev);
-	if (!iommu_group)
-		return -EPERM; /* Cannot reset non-isolated devices */
-
-	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
-	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
-	fill->devices[fill->cur].bus = pdev->bus->number;
-	fill->devices[fill->cur].devfn = pdev->devfn;
-	fill->cur++;
-	iommu_group_put(iommu_group);
-	return 0;
-}
-
-struct vfio_pci_group_entry {
-	struct vfio_group *group;
-	int id;
-};
-
-struct vfio_pci_group_info {
-	int count;
-	struct vfio_pci_group_entry *groups;
-};
-
-static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
-{
-	struct vfio_pci_group_info *info = data;
-	struct iommu_group *group;
-	int id, i;
-
-	group = iommu_group_get(&pdev->dev);
-	if (!group)
-		return -EPERM;
-
-	id = iommu_group_id(group);
-
-	for (i = 0; i < info->count; i++)
-		if (info->groups[i].id == id)
-			break;
-
-	iommu_group_put(group);
-
-	return (i == info->count) ? -EINVAL : 0;
-}
-
-static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
-{
-	for (; pdev; pdev = pdev->bus->self)
-		if (pdev->bus == slot->bus)
-			return (pdev->slot == slot);
-	return false;
-}
-
-struct vfio_pci_walk_info {
-	int (*fn)(struct pci_dev *, void *data);
-	void *data;
-	struct pci_dev *pdev;
-	bool slot;
-	int ret;
-};
-
-static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
-{
-	struct vfio_pci_walk_info *walk = data;
-
-	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
-		walk->ret = walk->fn(pdev, walk->data);
-
-	return walk->ret;
-}
-
-static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
-					 int (*fn)(struct pci_dev *,
-						   void *data), void *data,
-					 bool slot)
-{
-	struct vfio_pci_walk_info walk = {
-		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
-	};
-
-	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
-
-	return walk.ret;
-}
-
-static int msix_mmappable_cap(struct vfio_pci_device *vdev,
-			      struct vfio_info_cap *caps)
-{
-	struct vfio_info_cap_header header = {
-		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
-		.version = 1
-	};
-
-	return vfio_info_add_capability(caps, &header, sizeof(header));
-}
-
-int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
-				 unsigned int type, unsigned int subtype,
-				 const struct vfio_pci_regops *ops,
-				 size_t size, u32 flags, void *data)
-{
-	struct vfio_pci_region *region;
-
-	region = krealloc(vdev->region,
-			  (vdev->num_regions + 1) * sizeof(*region),
-			  GFP_KERNEL);
-	if (!region)
-		return -ENOMEM;
-
-	vdev->region = region;
-	vdev->region[vdev->num_regions].type = type;
-	vdev->region[vdev->num_regions].subtype = subtype;
-	vdev->region[vdev->num_regions].ops = ops;
-	vdev->region[vdev->num_regions].size = size;
-	vdev->region[vdev->num_regions].flags = flags;
-	vdev->region[vdev->num_regions].data = data;
-
-	vdev->num_regions++;
-
-	return 0;
-}
-
-long vfio_pci_ioctl(void *device_data,
-		   unsigned int cmd, unsigned long arg)
-{
-	struct vfio_pci_device *vdev = device_data;
-	unsigned long minsz;
-
-	if (cmd == VFIO_DEVICE_GET_INFO) {
-		struct vfio_device_info info;
-
-		minsz = offsetofend(struct vfio_device_info, num_irqs);
-
-		if (copy_from_user(&info, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (info.argsz < minsz)
-			return -EINVAL;
-
-		info.flags = VFIO_DEVICE_FLAGS_PCI;
-
-		if (vdev->reset_works)
-			info.flags |= VFIO_DEVICE_FLAGS_RESET;
-
-		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
-		info.num_irqs = VFIO_PCI_NUM_IRQS;
-
-		return copy_to_user((void __user *)arg, &info, minsz) ?
-			-EFAULT : 0;
-
-	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
-		struct pci_dev *pdev = vdev->pdev;
-		struct vfio_region_info info;
-		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
-		int i, ret;
-
-		minsz = offsetofend(struct vfio_region_info, offset);
-
-		if (copy_from_user(&info, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (info.argsz < minsz)
-			return -EINVAL;
-
-		switch (info.index) {
-		case VFIO_PCI_CONFIG_REGION_INDEX:
-			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-			info.size = pdev->cfg_size;
-			info.flags = VFIO_REGION_INFO_FLAG_READ |
-				     VFIO_REGION_INFO_FLAG_WRITE;
-			break;
-		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
-			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-			info.size = pci_resource_len(pdev, info.index);
-			if (!info.size) {
-				info.flags = 0;
-				break;
-			}
-
-			info.flags = VFIO_REGION_INFO_FLAG_READ |
-				     VFIO_REGION_INFO_FLAG_WRITE;
-			if (vdev->bar_mmap_supported[info.index]) {
-				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
-				if (info.index == vdev->msix_bar) {
-					ret = msix_mmappable_cap(vdev, &caps);
-					if (ret)
-						return ret;
-				}
-			}
-
-			break;
-		case VFIO_PCI_ROM_REGION_INDEX:
-		{
-			void __iomem *io;
-			size_t size;
-			u16 orig_cmd;
-
-			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-			info.flags = 0;
-
-			/* Report the BAR size, not the ROM size */
-			info.size = pci_resource_len(pdev, info.index);
-			if (!info.size) {
-				/* Shadow ROMs appear as PCI option ROMs */
-				if (pdev->resource[PCI_ROM_RESOURCE].flags &
-							IORESOURCE_ROM_SHADOW)
-					info.size = 0x20000;
-				else
-					break;
-			}
-
-			/*
-			 * Is it really there?  Enable memory decode for
-			 * implicit access in pci_map_rom().
-			 */
-			pci_read_config_word(pdev, PCI_COMMAND, &orig_cmd);
-			pci_write_config_word(pdev, PCI_COMMAND,
-					      orig_cmd | PCI_COMMAND_MEMORY);
-
-			io = pci_map_rom(pdev, &size);
-			if (io) {
-				info.flags = VFIO_REGION_INFO_FLAG_READ;
-				pci_unmap_rom(pdev, io);
-			} else {
-				info.size = 0;
-			}
-
-			pci_write_config_word(pdev, PCI_COMMAND, orig_cmd);
-			break;
-		}
-		case VFIO_PCI_VGA_REGION_INDEX:
-			if (!vdev->has_vga)
-				return -EINVAL;
-
-			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-			info.size = 0xc0000;
-			info.flags = VFIO_REGION_INFO_FLAG_READ |
-				     VFIO_REGION_INFO_FLAG_WRITE;
-
-			break;
-		default:
-		{
-			struct vfio_region_info_cap_type cap_type = {
-					.header.id = VFIO_REGION_INFO_CAP_TYPE,
-					.header.version = 1 };
-
-			if (info.index >=
-			    VFIO_PCI_NUM_REGIONS + vdev->num_regions)
-				return -EINVAL;
-			info.index = array_index_nospec(info.index,
-							VFIO_PCI_NUM_REGIONS +
-							vdev->num_regions);
-
-			i = info.index - VFIO_PCI_NUM_REGIONS;
-
-			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
-			info.size = vdev->region[i].size;
-			info.flags = vdev->region[i].flags;
-
-			cap_type.type = vdev->region[i].type;
-			cap_type.subtype = vdev->region[i].subtype;
-
-			ret = vfio_info_add_capability(&caps, &cap_type.header,
-						       sizeof(cap_type));
-			if (ret)
-				return ret;
-
-			if (vdev->region[i].ops->add_capability) {
-				ret = vdev->region[i].ops->add_capability(vdev,
-						&vdev->region[i], &caps);
-				if (ret)
-					return ret;
-			}
-		}
-		}
-
-		if (caps.size) {
-			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
-			if (info.argsz < sizeof(info) + caps.size) {
-				info.argsz = sizeof(info) + caps.size;
-				info.cap_offset = 0;
-			} else {
-				vfio_info_cap_shift(&caps, sizeof(info));
-				if (copy_to_user((void __user *)arg +
-						  sizeof(info), caps.buf,
-						  caps.size)) {
-					kfree(caps.buf);
-					return -EFAULT;
-				}
-				info.cap_offset = sizeof(info);
-			}
-
-			kfree(caps.buf);
-		}
-
-		return copy_to_user((void __user *)arg, &info, minsz) ?
-			-EFAULT : 0;
-
-	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
-		struct vfio_irq_info info;
-
-		minsz = offsetofend(struct vfio_irq_info, count);
-
-		if (copy_from_user(&info, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
-			return -EINVAL;
-
-		switch (info.index) {
-		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
-		case VFIO_PCI_REQ_IRQ_INDEX:
-			break;
-		case VFIO_PCI_ERR_IRQ_INDEX:
-			if (pci_is_pcie(vdev->pdev))
-				break;
-		/* fall through */
-		default:
-			return -EINVAL;
-		}
-
-		info.flags = VFIO_IRQ_INFO_EVENTFD;
-
-		info.count = vfio_pci_get_irq_count(vdev, info.index);
-
-		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
-			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
-				       VFIO_IRQ_INFO_AUTOMASKED);
-		else
-			info.flags |= VFIO_IRQ_INFO_NORESIZE;
-
-		return copy_to_user((void __user *)arg, &info, minsz) ?
-			-EFAULT : 0;
-
-	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
-		struct vfio_irq_set hdr;
-		u8 *data = NULL;
-		int max, ret = 0;
-		size_t data_size = 0;
-
-		minsz = offsetofend(struct vfio_irq_set, count);
-
-		if (copy_from_user(&hdr, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		max = vfio_pci_get_irq_count(vdev, hdr.index);
-
-		ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
-						 VFIO_PCI_NUM_IRQS, &data_size);
-		if (ret)
-			return ret;
-
-		if (data_size) {
-			data = memdup_user((void __user *)(arg + minsz),
-					    data_size);
-			if (IS_ERR(data))
-				return PTR_ERR(data);
-		}
-
-		mutex_lock(&vdev->igate);
-
-		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
-					      hdr.start, hdr.count, data);
-
-		mutex_unlock(&vdev->igate);
-		kfree(data);
-
-		return ret;
-
-	} else if (cmd == VFIO_DEVICE_RESET) {
-		return vdev->reset_works ?
-			pci_try_reset_function(vdev->pdev) : -EINVAL;
-
-	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
-		struct vfio_pci_hot_reset_info hdr;
-		struct vfio_pci_fill_info fill = { 0 };
-		struct vfio_pci_dependent_device *devices = NULL;
-		bool slot = false;
-		int ret = 0;
-
-		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
-
-		if (copy_from_user(&hdr, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (hdr.argsz < minsz)
-			return -EINVAL;
-
-		hdr.flags = 0;
-
-		/* Can we do a slot or bus reset or neither? */
-		if (!pci_probe_reset_slot(vdev->pdev->slot))
-			slot = true;
-		else if (pci_probe_reset_bus(vdev->pdev->bus))
-			return -ENODEV;
-
-		/* How many devices are affected? */
-		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
-						    vfio_pci_count_devs,
-						    &fill.max, slot);
-		if (ret)
-			return ret;
-
-		WARN_ON(!fill.max); /* Should always be at least one */
-
-		/*
-		 * If there's enough space, fill it now, otherwise return
-		 * -ENOSPC and the number of devices affected.
-		 */
-		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
-			ret = -ENOSPC;
-			hdr.count = fill.max;
-			goto reset_info_exit;
-		}
-
-		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
-		if (!devices)
-			return -ENOMEM;
-
-		fill.devices = devices;
-
-		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
-						    vfio_pci_fill_devs,
-						    &fill, slot);
-
-		/*
-		 * If a device was removed between counting and filling,
-		 * we may come up short of fill.max.  If a device was
-		 * added, we'll have a return of -EAGAIN above.
-		 */
-		if (!ret)
-			hdr.count = fill.cur;
-
-reset_info_exit:
-		if (copy_to_user((void __user *)arg, &hdr, minsz))
-			ret = -EFAULT;
-
-		if (!ret) {
-			if (copy_to_user((void __user *)(arg + minsz), devices,
-					 hdr.count * sizeof(*devices)))
-				ret = -EFAULT;
-		}
-
-		kfree(devices);
-		return ret;
-
-	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
-		struct vfio_pci_hot_reset hdr;
-		int32_t *group_fds;
-		struct vfio_pci_group_entry *groups;
-		struct vfio_pci_group_info info;
-		bool slot = false;
-		int i, count = 0, ret = 0;
-
-		minsz = offsetofend(struct vfio_pci_hot_reset, count);
-
-		if (copy_from_user(&hdr, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (hdr.argsz < minsz || hdr.flags)
-			return -EINVAL;
-
-		/* Can we do a slot or bus reset or neither? */
-		if (!pci_probe_reset_slot(vdev->pdev->slot))
-			slot = true;
-		else if (pci_probe_reset_bus(vdev->pdev->bus))
-			return -ENODEV;
-
-		/*
-		 * We can't let userspace give us an arbitrarily large
-		 * buffer to copy, so verify how many we think there
-		 * could be.  Note groups can have multiple devices so
-		 * one group per device is the max.
-		 */
-		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
-						    vfio_pci_count_devs,
-						    &count, slot);
-		if (ret)
-			return ret;
-
-		/* Somewhere between 1 and count is OK */
-		if (!hdr.count || hdr.count > count)
-			return -EINVAL;
-
-		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
-		groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
-		if (!group_fds || !groups) {
-			kfree(group_fds);
-			kfree(groups);
-			return -ENOMEM;
-		}
-
-		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
-				   hdr.count * sizeof(*group_fds))) {
-			kfree(group_fds);
-			kfree(groups);
-			return -EFAULT;
-		}
-
-		/*
-		 * For each group_fd, get the group through the vfio external
-		 * user interface and store the group and iommu ID.  This
-		 * ensures the group is held across the reset.
-		 */
-		for (i = 0; i < hdr.count; i++) {
-			struct vfio_group *group;
-			struct fd f = fdget(group_fds[i]);
-			if (!f.file) {
-				ret = -EBADF;
-				break;
-			}
-
-			group = vfio_group_get_external_user(f.file);
-			fdput(f);
-			if (IS_ERR(group)) {
-				ret = PTR_ERR(group);
-				break;
-			}
-
-			groups[i].group = group;
-			groups[i].id = vfio_external_user_iommu_id(group);
-		}
-
-		kfree(group_fds);
-
-		/* release reference to groups on error */
-		if (ret)
-			goto hot_reset_release;
-
-		info.count = hdr.count;
-		info.groups = groups;
-
-		/*
-		 * Test whether all the affected devices are contained
-		 * by the set of groups provided by the user.
-		 */
-		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
-						    vfio_pci_validate_devs,
-						    &info, slot);
-		if (!ret)
-			/* User has access, do the reset */
-			ret = pci_reset_bus(vdev->pdev);
-
-hot_reset_release:
-		for (i--; i >= 0; i--)
-			vfio_group_put_external_user(groups[i].group);
-
-		kfree(groups);
-		return ret;
-	} else if (cmd == VFIO_DEVICE_IOEVENTFD) {
-		struct vfio_device_ioeventfd ioeventfd;
-		int count;
-
-		minsz = offsetofend(struct vfio_device_ioeventfd, fd);
-
-		if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
-			return -EFAULT;
-
-		if (ioeventfd.argsz < minsz)
-			return -EINVAL;
-
-		if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
-			return -EINVAL;
-
-		count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
-
-		if (hweight8(count) != 1 || ioeventfd.fd < -1)
-			return -EINVAL;
-
-		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
-					  ioeventfd.data, count, ioeventfd.fd);
-	}
-
-	return -ENOTTY;
-}
-
-static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
-			   size_t count, loff_t *ppos, bool iswrite)
-{
-	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
-	struct vfio_pci_device *vdev = device_data;
-
-	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
-		return -EINVAL;
-
-	switch (index) {
-	case VFIO_PCI_CONFIG_REGION_INDEX:
-		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
-
-	case VFIO_PCI_ROM_REGION_INDEX:
-		if (iswrite)
-			return -EINVAL;
-		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
-
-	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
-		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
-
-	case VFIO_PCI_VGA_REGION_INDEX:
-		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
-	default:
-		index -= VFIO_PCI_NUM_REGIONS;
-		return vdev->region[index].ops->rw(vdev, buf,
-						   count, ppos, iswrite);
-	}
-
-	return -EINVAL;
-}
-
-ssize_t vfio_pci_read(void *device_data, char __user *buf,
-			     size_t count, loff_t *ppos)
-{
-	if (!count)
-		return 0;
-
-	return vfio_pci_rw(device_data, buf, count, ppos, false);
-}
-
-ssize_t vfio_pci_write(void *device_data, const char __user *buf,
-			      size_t count, loff_t *ppos)
-{
-	if (!count)
-		return 0;
-
-	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
-}
-
-int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
-{
-	struct vfio_pci_device *vdev = device_data;
-	struct pci_dev *pdev = vdev->pdev;
-	unsigned int index;
-	u64 phys_len, req_len, pgoff, req_start;
-	int ret;
-
-	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
-
-	if (vma->vm_end < vma->vm_start)
-		return -EINVAL;
-	if ((vma->vm_flags & VM_SHARED) == 0)
-		return -EINVAL;
-	if (index >= VFIO_PCI_NUM_REGIONS) {
-		int regnum = index - VFIO_PCI_NUM_REGIONS;
-		struct vfio_pci_region *region = vdev->region + regnum;
-
-		if (region && region->ops && region->ops->mmap &&
-		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
-			return region->ops->mmap(vdev, region, vma);
-		return -EINVAL;
-	}
-	if (index >= VFIO_PCI_ROM_REGION_INDEX)
-		return -EINVAL;
-	if (!vdev->bar_mmap_supported[index])
-		return -EINVAL;
-
-	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
-	req_len = vma->vm_end - vma->vm_start;
-	pgoff = vma->vm_pgoff &
-		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
-	req_start = pgoff << PAGE_SHIFT;
-
-	if (req_start + req_len > phys_len)
-		return -EINVAL;
-
-	/*
-	 * Even though we don't make use of the barmap for the mmap,
-	 * we need to request the region and the barmap tracks that.
-	 */
-	if (!vdev->barmap[index]) {
-		ret = pci_request_selected_regions(pdev,
-						   1 << index, "vfio-pci");
-		if (ret)
-			return ret;
-
-		vdev->barmap[index] = pci_iomap(pdev, index, 0);
-		if (!vdev->barmap[index]) {
-			pci_release_selected_regions(pdev, 1 << index);
-			return -ENOMEM;
-		}
-	}
-
-	vma->vm_private_data = vdev;
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
-
-	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			       req_len, vma->vm_page_prot);
-}
-
-void vfio_pci_request(void *device_data, unsigned int count)
-{
-	struct vfio_pci_device *vdev = device_data;
-	struct pci_dev *pdev = vdev->pdev;
-
-	mutex_lock(&vdev->igate);
-
-	if (vdev->req_trigger) {
-		if (!(count % 10))
-			pci_notice_ratelimited(pdev,
-				"Relaying device request to user (#%u)\n",
-				count);
-		eventfd_signal(vdev->req_trigger, 1);
-	} else if (count == 0) {
-		pci_warn(pdev,
-			"No device request channel registered, blocked until released by user\n");
-	}
-
-	mutex_unlock(&vdev->igate);
-}
-
 static const struct vfio_device_ops vfio_pci_ops = {
 	.name		= "vfio-pci",
 	.open		= vfio_pci_open,
@@ -1396,38 +220,6 @@  static void vfio_pci_remove(struct pci_dev *pdev)
 	}
 }
 
-static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
-						  pci_channel_state_t state)
-{
-	struct vfio_pci_device *vdev;
-	struct vfio_device *device;
-
-	device = vfio_device_get_from_dev(&pdev->dev);
-	if (device == NULL)
-		return PCI_ERS_RESULT_DISCONNECT;
-
-	vdev = vfio_device_data(device);
-	if (vdev == NULL) {
-		vfio_device_put(device);
-		return PCI_ERS_RESULT_DISCONNECT;
-	}
-
-	mutex_lock(&vdev->igate);
-
-	if (vdev->err_trigger)
-		eventfd_signal(vdev->err_trigger, 1);
-
-	mutex_unlock(&vdev->igate);
-
-	vfio_device_put(device);
-
-	return PCI_ERS_RESULT_CAN_RECOVER;
-}
-
-static const struct pci_error_handlers vfio_err_handlers = {
-	.error_detected = vfio_pci_aer_err_detected,
-};
-
 static struct pci_driver vfio_pci_driver = {
 	.name		= "vfio-pci",
 	.id_table	= NULL, /* only dynamic ids */
@@ -1436,246 +228,12 @@  static struct pci_driver vfio_pci_driver = {
 	.err_handler	= &vfio_err_handlers,
 };
 
-static DEFINE_MUTEX(reflck_lock);
-
-static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
-{
-	struct vfio_pci_reflck *reflck;
-
-	reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
-	if (!reflck)
-		return ERR_PTR(-ENOMEM);
-
-	kref_init(&reflck->kref);
-	mutex_init(&reflck->lock);
-
-	return reflck;
-}
-
-static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
-{
-	kref_get(&reflck->kref);
-}
-
-static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
-{
-	struct vfio_pci_device *vdev = data;
-	struct vfio_pci_reflck **preflck = &vdev->reflck;
-	struct vfio_device *device;
-	struct vfio_pci_device *tmp;
-
-	device = vfio_device_get_from_dev(&pdev->dev);
-	if (!device)
-		return 0;
-
-	if (pci_dev_driver(pdev) != pci_dev_driver(vdev->pdev)) {
-		vfio_device_put(device);
-		return 0;
-	}
-
-	tmp = vfio_device_data(device);
-
-	if (tmp->reflck) {
-		vfio_pci_reflck_get(tmp->reflck);
-		*preflck = tmp->reflck;
-		vfio_device_put(device);
-		return 1;
-	}
-
-	vfio_device_put(device);
-	return 0;
-}
-
-int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
-{
-	bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
-
-	mutex_lock(&reflck_lock);
-
-	if (pci_is_root_bus(vdev->pdev->bus) ||
-	    vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
-					  vdev, slot) <= 0)
-		vdev->reflck = vfio_pci_reflck_alloc();
-
-	mutex_unlock(&reflck_lock);
-
-	return PTR_ERR_OR_ZERO(vdev->reflck);
-}
-
-static void vfio_pci_reflck_release(struct kref *kref)
-{
-	struct vfio_pci_reflck *reflck = container_of(kref,
-						      struct vfio_pci_reflck,
-						      kref);
-
-	kfree(reflck);
-	mutex_unlock(&reflck_lock);
-}
-
-void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
-{
-	kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
-}
-
-struct vfio_devices {
-	struct vfio_device **devices;
-	struct vfio_pci_device *vdev;
-	int cur_index;
-	int max_index;
-};
-
-static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
-{
-	struct vfio_devices *devs = data;
-	struct vfio_device *device;
-	struct vfio_pci_device *tmp;
-
-	if (devs->cur_index == devs->max_index)
-		return -ENOSPC;
-
-	device = vfio_device_get_from_dev(&pdev->dev);
-	if (!device)
-		return -EINVAL;
-
-	if (pci_dev_driver(pdev) != pci_dev_driver(devs->vdev->pdev)) {
-		vfio_device_put(device);
-		return -EBUSY;
-	}
-
-	tmp = vfio_device_data(device);
-
-	/* Fault if the device is not unused */
-	if (tmp->refcnt) {
-		vfio_device_put(device);
-		return -EBUSY;
-	}
-
-	devs->devices[devs->cur_index++] = device;
-	return 0;
-}
-
-/*
- * If a bus or slot reset is available for the provided device and:
- *  - All of the devices affected by that bus or slot reset are unused
- *    (!refcnt)
- *  - At least one of the affected devices is marked dirty via
- *    needs_reset (such as by lack of FLR support)
- * Then attempt to perform that bus or slot reset.  Callers are required
- * to hold vdev->reflck->lock, protecting the bus/slot reset group from
- * concurrent opens.  A vfio_device reference is acquired for each device
- * to prevent unbinds during the reset operation.
- *
- * NB: vfio-core considers a group to be viable even if some devices are
- * bound to drivers like pci-stub or pcieport.  Here we require all devices
- * to be bound to vfio_pci since that's the only way we can be sure they
- * stay put.
- */
-static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
-{
-	struct vfio_devices devs = { .vdev = vdev, .cur_index = 0 };
-	int i = 0, ret = -EINVAL;
-	bool slot = false;
-	struct vfio_pci_device *tmp;
-
-	if (!pci_probe_reset_slot(vdev->pdev->slot))
-		slot = true;
-	else if (pci_probe_reset_bus(vdev->pdev->bus))
-		return;
-
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
-					  &i, slot) || !i)
-		return;
-
-	devs.max_index = i;
-	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
-	if (!devs.devices)
-		return;
-
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_get_unused_devs,
-					  &devs, slot))
-		goto put_devs;
-
-	/* Does at least one need a reset? */
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = vfio_device_data(devs.devices[i]);
-		if (tmp->needs_reset) {
-			ret = pci_reset_bus(vdev->pdev);
-			break;
-		}
-	}
-
-put_devs:
-	for (i = 0; i < devs.cur_index; i++) {
-		tmp = vfio_device_data(devs.devices[i]);
-
-		/*
-		 * If reset was successful, affected devices no longer need
-		 * a reset and we should return all the collateral devices
-		 * to low power.  If not successful, we either didn't reset
-		 * the bus or timed out waiting for it, so let's not touch
-		 * the power state.
-		 */
-		if (!ret) {
-			tmp->needs_reset = false;
-
-			if (tmp != vdev && !tmp->disable_idle_d3)
-				vfio_pci_set_power_state(tmp, PCI_D3hot);
-		}
-
-		vfio_device_put(devs.devices[i]);
-	}
-
-	kfree(devs.devices);
-}
-
 static void __exit vfio_pci_cleanup(void)
 {
 	pci_unregister_driver(&vfio_pci_driver);
 	vfio_pci_uninit_perm_bits();
 }
 
-void __init vfio_pci_fill_ids(char *ids, struct pci_driver *driver)
-{
-	char *p, *id;
-	int rc;
-
-	/* no ids passed actually */
-	if (ids[0] == '\0')
-		return;
-
-	/* add ids specified in the module parameter */
-	p = ids;
-	while ((id = strsep(&p, ","))) {
-		unsigned int vendor, device, subvendor = PCI_ANY_ID,
-			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
-		int fields;
-
-		if (!strlen(id))
-			continue;
-
-		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
-				&vendor, &device, &subvendor, &subdevice,
-				&class, &class_mask);
-
-		if (fields < 2) {
-			pr_warn("invalid id string \"%s\"\n", id);
-			continue;
-		}
-
-		rc = pci_add_dynid(driver, vendor, device,
-				   subvendor, subdevice, class, class_mask, 0);
-		if (rc)
-			pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
-				vendor, device, subvendor, subdevice,
-				class, class_mask, rc);
-		else
-			pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
-				vendor, device, subvendor, subdevice,
-				class, class_mask);
-	}
-}
-
 static int __init vfio_pci_init(void)
 {
 	int ret;
diff --git a/drivers/vfio/pci/vfio_pci_common.c b/drivers/vfio/pci/vfio_pci_common.c
index b0894dfc..15d8b55 100644
--- a/drivers/vfio/pci/vfio_pci_common.c
+++ b/drivers/vfio/pci/vfio_pci_common.c
@@ -1234,7 +1234,7 @@  static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
 	return PCI_ERS_RESULT_CAN_RECOVER;
 }
 
-static const struct pci_error_handlers vfio_err_handlers = {
+const struct pci_error_handlers vfio_err_handlers = {
 	.error_detected = vfio_pci_aer_err_detected,
 };
 
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 194d487..499dd04 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -135,6 +135,8 @@  struct vfio_pci_device {
 #define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev)))
 #define irq_is(vdev, type) (vdev->irq_type == type)
 
+extern const struct pci_error_handlers vfio_err_handlers;
+
 static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 {
 	return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;