
[8/9] vfio/pci: export nvlink2 support into vendor vfio_pci drivers

Message ID 20210309083357.65467-9-mgurtovoy@nvidia.com (mailing list archive)
State New, archived
Series: Introduce vfio-pci-core subsystem

Commit Message

Max Gurtovoy March 9, 2021, 8:33 a.m. UTC
The new drivers introduced are nvlink2gpu_vfio_pci.ko and
npu2_vfio_pci.ko.
The former will be responsible for providing special extensions for
NVIDIA GPUs with NVLINK2 support on the P9 platform (and others in the
future). The latter will be responsible for the POWER9 NPU2 unit (the
NVLink2 host bus adapter).

Also, preserve backward compatibility for users who were binding
NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will
be dropped in the future.

Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
---
 drivers/vfio/pci/Kconfig                      |  28 +++-
 drivers/vfio/pci/Makefile                     |   7 +-
 .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c}  | 144 ++++++++++++++++-
 drivers/vfio/pci/npu2_vfio_pci.h              |  24 +++
 ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++-
 drivers/vfio/pci/nvlink2gpu_vfio_pci.h        |  24 +++
 drivers/vfio/pci/vfio_pci.c                   |  61 ++++++-
 drivers/vfio/pci/vfio_pci_core.c              |  18 ---
 drivers/vfio/pci/vfio_pci_core.h              |  14 --
 9 files changed, 422 insertions(+), 47 deletions(-)
 rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%)
 create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h
 rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} (67%)
 create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h

Comments

Alexey Kardashevskiy March 10, 2021, 6:39 a.m. UTC | #1
On 09/03/2021 19:33, Max Gurtovoy wrote:
> The new drivers introduced are nvlink2gpu_vfio_pci.ko and
> npu2_vfio_pci.ko.
> The first will be responsible for providing special extensions for
> NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the
> future). The last will be responsible for POWER9 NPU2 unit (NVLink2 host
> bus adapter).
> 
> Also, preserve backward compatibility for users that were binding
> NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will
> be dropped in the future
> 
> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
> ---
>   drivers/vfio/pci/Kconfig                      |  28 +++-
>   drivers/vfio/pci/Makefile                     |   7 +-
>   .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c}  | 144 ++++++++++++++++-
>   drivers/vfio/pci/npu2_vfio_pci.h              |  24 +++
>   ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++-
>   drivers/vfio/pci/nvlink2gpu_vfio_pci.h        |  24 +++
>   drivers/vfio/pci/vfio_pci.c                   |  61 ++++++-
>   drivers/vfio/pci/vfio_pci_core.c              |  18 ---
>   drivers/vfio/pci/vfio_pci_core.h              |  14 --
>   9 files changed, 422 insertions(+), 47 deletions(-)
>   rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%)
>   create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h
>   rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} (67%)
>   create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h
> 
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 829e90a2e5a3..88c89863a205 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -48,8 +48,30 @@ config VFIO_PCI_IGD
>   
>   	  To enable Intel IGD assignment through vfio-pci, say Y.
>   
> -config VFIO_PCI_NVLINK2
> -	def_bool y
> +config VFIO_PCI_NVLINK2GPU
> +	tristate "VFIO support for NVIDIA NVLINK2 GPUs"
>   	depends on VFIO_PCI_CORE && PPC_POWERNV
>   	help
> -	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
> +	  VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions
> +	  for P9 Witherspoon machine.
> +
> +config VFIO_PCI_NPU2
> +	tristate "VFIO support for IBM NPU host bus adapter on P9"
> +	depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV
> +	help
> +	  VFIO PCI specific extensions for IBM NVLink2 host bus adapter on P9
> +	  Witherspoon machine.
> +
> +config VFIO_PCI_DRIVER_COMPAT
> +	bool "VFIO PCI backward compatibility for vendor specific extensions"
> +	default y
> +	depends on VFIO_PCI
> +	help
> +	  Say Y here if you want to preserve VFIO PCI backward
> +	  compatibility. vfio_pci.ko will continue to automatically use
> +	  the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to
> +	  a compatible device.
> +
> +	  When N is selected the user must bind explicity to the module
> +	  they want to handle the device and vfio_pci.ko will have no
> +	  device specific special behaviors.
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index f539f32c9296..86fb62e271fc 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -2,10 +2,15 @@
>   
>   obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
>   obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
> +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o
> +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o
>   
>   vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
>   vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o vfio_pci_npu2.o
>   vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o
>   
>   vfio-pci-y := vfio_pci.o
> +
> +npu2-vfio-pci-y := npu2_vfio_pci.o
> +
> +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o
> diff --git a/drivers/vfio/pci/vfio_pci_npu2.c b/drivers/vfio/pci/npu2_vfio_pci.c
> similarity index 64%
> rename from drivers/vfio/pci/vfio_pci_npu2.c
> rename to drivers/vfio/pci/npu2_vfio_pci.c
> index 717745256ab3..7071bda0f2b6 100644
> --- a/drivers/vfio/pci/vfio_pci_npu2.c
> +++ b/drivers/vfio/pci/npu2_vfio_pci.c
> @@ -14,19 +14,28 @@
>    *	Author: Alex Williamson <alex.williamson@redhat.com>
>    */
>   
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/module.h>
>   #include <linux/io.h>
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
>   #include <linux/vfio.h>
> +#include <linux/list.h>
>   #include <linux/sched/mm.h>
>   #include <linux/mmu_context.h>
>   #include <asm/kvm_ppc.h>
>   
>   #include "vfio_pci_core.h"
> +#include "npu2_vfio_pci.h"
>   
>   #define CREATE_TRACE_POINTS
>   #include "npu2_trace.h"
>   
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
> +#define DRIVER_DESC     "NPU2 VFIO PCI - User Level meta-driver for POWER9 NPU NVLink2 HBA"
> +
>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
>   
>   struct vfio_pci_npu2_data {
> @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data {
>   	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
>   };
>   
> +struct npu2_vfio_pci_device {
> +	struct vfio_pci_core_device	vdev;
> +};
> +
>   static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev,
>   		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>   {
> @@ -120,7 +133,7 @@ static const struct vfio_pci_regops vfio_pci_npu2_regops = {
>   	.add_capability = vfio_pci_npu2_add_capability,
>   };
>   
> -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
> +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>   {
>   	int ret;
>   	struct vfio_pci_npu2_data *data;
> @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>   
>   	return ret;
>   }
> +
> +static void npu2_vfio_pci_release(void *device_data)
> +{
> +	struct vfio_pci_core_device *vdev = device_data;
> +
> +	mutex_lock(&vdev->reflck->lock);
> +	if (!(--vdev->refcnt)) {
> +		vfio_pci_vf_token_user_add(vdev, -1);
> +		vfio_pci_core_spapr_eeh_release(vdev);
> +		vfio_pci_core_disable(vdev);
> +	}
> +	mutex_unlock(&vdev->reflck->lock);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static int npu2_vfio_pci_open(void *device_data)
> +{
> +	struct vfio_pci_core_device *vdev = device_data;
> +	int ret = 0;
> +
> +	if (!try_module_get(THIS_MODULE))
> +		return -ENODEV;
> +
> +	mutex_lock(&vdev->reflck->lock);
> +
> +	if (!vdev->refcnt) {
> +		ret = vfio_pci_core_enable(vdev);
> +		if (ret)
> +			goto error;
> +
> +		ret = vfio_pci_ibm_npu2_init(vdev);
> +		if (ret && ret != -ENODEV) {
> +			pci_warn(vdev->pdev,
> +				 "Failed to setup NVIDIA NV2 ATSD region\n");
> +			vfio_pci_core_disable(vdev);
> +			goto error;
> +		}
> +		ret = 0;
> +		vfio_pci_probe_mmaps(vdev);
> +		vfio_pci_core_spapr_eeh_open(vdev);
> +		vfio_pci_vf_token_user_add(vdev, 1);
> +	}
> +	vdev->refcnt++;
> +error:
> +	mutex_unlock(&vdev->reflck->lock);
> +	if (ret)
> +		module_put(THIS_MODULE);
> +	return ret;
> +}
> +
> +static const struct vfio_device_ops npu2_vfio_pci_ops = {
> +	.name		= "npu2-vfio-pci",
> +	.open		= npu2_vfio_pci_open,
> +	.release	= npu2_vfio_pci_release,
> +	.ioctl		= vfio_pci_core_ioctl,
> +	.read		= vfio_pci_core_read,
> +	.write		= vfio_pci_core_write,
> +	.mmap		= vfio_pci_core_mmap,
> +	.request	= vfio_pci_core_request,
> +	.match		= vfio_pci_core_match,
> +};
> +
> +static int npu2_vfio_pci_probe(struct pci_dev *pdev,
> +		const struct pci_device_id *id)
> +{
> +	struct npu2_vfio_pci_device *npvdev;
> +	int ret;
> +
> +	npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL);
> +	if (!npvdev)
> +		return -ENOMEM;
> +
> +	ret = vfio_pci_core_register_device(&npvdev->vdev, pdev,
> +			&npu2_vfio_pci_ops);
> +	if (ret)
> +		goto out_free;
> +
> +	return 0;
> +
> +out_free:
> +	kfree(npvdev);
> +	return ret;
> +}
> +
> +static void npu2_vfio_pci_remove(struct pci_dev *pdev)
> +{
> +	struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
> +	struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
> +	struct npu2_vfio_pci_device *npvdev;
> +
> +	npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, vdev);
> +
> +	vfio_pci_core_unregister_device(core_vpdev);
> +	kfree(npvdev);
> +}
> +
> +static const struct pci_device_id npu2_vfio_pci_table[] = {
> +	{ PCI_VDEVICE(IBM, 0x04ea) },
> +	{ 0, }
> +};
> +
> +static struct pci_driver npu2_vfio_pci_driver = {
> +	.name			= "npu2-vfio-pci",
> +	.id_table		= npu2_vfio_pci_table,
> +	.probe			= npu2_vfio_pci_probe,
> +	.remove			= npu2_vfio_pci_remove,
> +#ifdef CONFIG_PCI_IOV
> +	.sriov_configure	= vfio_pci_core_sriov_configure,
> +#endif
> +	.err_handler		= &vfio_pci_core_err_handlers,
> +};
> +
> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
> +{
> +	if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev))
> +		return &npu2_vfio_pci_driver;
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver);
> +#endif
> +
> +module_pci_driver(npu2_vfio_pci_driver);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/pci/npu2_vfio_pci.h b/drivers/vfio/pci/npu2_vfio_pci.h
> new file mode 100644
> index 000000000000..92010d340346
> --- /dev/null
> +++ b/drivers/vfio/pci/npu2_vfio_pci.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights reserved.
> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
> + */
> +
> +#ifndef NPU2_VFIO_PCI_H
> +#define NPU2_VFIO_PCI_H
> +
> +#include <linux/pci.h>
> +#include <linux/module.h>
> +
> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
> +#if defined(CONFIG_VFIO_PCI_NPU2) || defined(CONFIG_VFIO_PCI_NPU2_MODULE)
> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev);
> +#else
> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
> +{
> +	return NULL;
> +}
> +#endif
> +#endif
> +
> +#endif /* NPU2_VFIO_PCI_H */
> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
> similarity index 67%
> rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c
> rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c
> index 6dce1e78ee82..84a5ac1ce8ac 100644
> --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c
> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
> @@ -1,6 +1,6 @@
>   // SPDX-License-Identifier: GPL-2.0-only
>   /*
> - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
> + * VFIO PCI NVIDIA NVLink2 GPUs support.
>    *
>    * Copyright (C) 2018 IBM Corp.  All rights reserved.
>    *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
> @@ -12,6 +12,9 @@
>    *	Author: Alex Williamson <alex.williamson@redhat.com>
>    */
>   
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/module.h>
>   #include <linux/io.h>
>   #include <linux/pci.h>
>   #include <linux/uaccess.h>
> @@ -21,10 +24,15 @@
>   #include <asm/kvm_ppc.h>
>   
>   #include "vfio_pci_core.h"
> +#include "nvlink2gpu_vfio_pci.h"
>   
>   #define CREATE_TRACE_POINTS
>   #include "nvlink2gpu_trace.h"
>   
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
> +#define DRIVER_DESC     "NVLINK2GPU VFIO PCI - User Level meta-driver for NVIDIA NVLink2 GPUs"
> +
>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
>   
> @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data {
>   	struct notifier_block group_notifier;
>   };
>   
> +struct nv_vfio_pci_device {
> +	struct vfio_pci_core_device	vdev;
> +};
> +
>   static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev,
>   		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>   {
> @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
>   	return NOTIFY_OK;
>   }
>   
> -int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
> +static int
> +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
>   {
>   	int ret;
>   	u64 reg[2];
> @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
>   
>   	return ret;
>   }
> +
> +static void nvlink2gpu_vfio_pci_release(void *device_data)
> +{
> +	struct vfio_pci_core_device *vdev = device_data;
> +
> +	mutex_lock(&vdev->reflck->lock);
> +	if (!(--vdev->refcnt)) {
> +		vfio_pci_vf_token_user_add(vdev, -1);
> +		vfio_pci_core_spapr_eeh_release(vdev);
> +		vfio_pci_core_disable(vdev);
> +	}
> +	mutex_unlock(&vdev->reflck->lock);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static int nvlink2gpu_vfio_pci_open(void *device_data)
> +{
> +	struct vfio_pci_core_device *vdev = device_data;
> +	int ret = 0;
> +
> +	if (!try_module_get(THIS_MODULE))
> +		return -ENODEV;
> +
> +	mutex_lock(&vdev->reflck->lock);
> +
> +	if (!vdev->refcnt) {
> +		ret = vfio_pci_core_enable(vdev);
> +		if (ret)
> +			goto error;
> +
> +		ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
> +		if (ret && ret != -ENODEV) {
> +			pci_warn(vdev->pdev,
> +				 "Failed to setup NVIDIA NV2 RAM region\n");
> +			vfio_pci_core_disable(vdev);
> +			goto error;
> +		}
> +		ret = 0;
> +		vfio_pci_probe_mmaps(vdev);
> +		vfio_pci_core_spapr_eeh_open(vdev);
> +		vfio_pci_vf_token_user_add(vdev, 1);
> +	}
> +	vdev->refcnt++;
> +error:
> +	mutex_unlock(&vdev->reflck->lock);
> +	if (ret)
> +		module_put(THIS_MODULE);
> +	return ret;
> +}
> +
> +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = {
> +	.name		= "nvlink2gpu-vfio-pci",
> +	.open		= nvlink2gpu_vfio_pci_open,
> +	.release	= nvlink2gpu_vfio_pci_release,
> +	.ioctl		= vfio_pci_core_ioctl,
> +	.read		= vfio_pci_core_read,
> +	.write		= vfio_pci_core_write,
> +	.mmap		= vfio_pci_core_mmap,
> +	.request	= vfio_pci_core_request,
> +	.match		= vfio_pci_core_match,
> +};
> +
> +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev,
> +		const struct pci_device_id *id)
> +{
> +	struct nv_vfio_pci_device *nvdev;
> +	int ret;
> +
> +	nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL);
> +	if (!nvdev)
> +		return -ENOMEM;
> +
> +	ret = vfio_pci_core_register_device(&nvdev->vdev, pdev,
> +			&nvlink2gpu_vfio_pci_ops);
> +	if (ret)
> +		goto out_free;
> +
> +	return 0;
> +
> +out_free:
> +	kfree(nvdev);
> +	return ret;
> +}
> +
> +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev)
> +{
> +	struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
> +	struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
> +	struct nv_vfio_pci_device *nvdev;
> +
> +	nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, vdev);
> +
> +	vfio_pci_core_unregister_device(core_vpdev);
> +	kfree(nvdev);
> +}
> +
> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
> +	{ PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-16GB */
> +	{ PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-32GB */
> +	{ PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla V100-SXM3-32GB */
> +	{ PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla V100-SXM2-16GB */


Where is this list from?

Also, how is this supposed to work at the boot time? Will the kernel try 
binding let's say this one and nouveau? Which one is going to win?


> +	{ 0, }


Why a comma?

> +};



> +
> +static struct pci_driver nvlink2gpu_vfio_pci_driver = {
> +	.name			= "nvlink2gpu-vfio-pci",
> +	.id_table		= nvlink2gpu_vfio_pci_table,
> +	.probe			= nvlink2gpu_vfio_pci_probe,
> +	.remove			= nvlink2gpu_vfio_pci_remove,
> +#ifdef CONFIG_PCI_IOV
> +	.sriov_configure	= vfio_pci_core_sriov_configure,
> +#endif


What is this IOV business about?


> +	.err_handler		= &vfio_pci_core_err_handlers,
> +};
> +
> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
> +{
> +	if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
> +		return &nvlink2gpu_vfio_pci_driver;


Why do we need matching PCI ids here instead of looking at the FDT which 
will work better?


> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver);
> +#endif
> +
> +module_pci_driver(nvlink2gpu_vfio_pci_driver);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
> new file mode 100644
> index 000000000000..ebd5b600b190
> --- /dev/null
> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights reserved.
> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
> + */
> +
> +#ifndef NVLINK2GPU_VFIO_PCI_H
> +#define NVLINK2GPU_VFIO_PCI_H
> +
> +#include <linux/pci.h>
> +#include <linux/module.h>
> +
> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
> +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE)
> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev);
> +#else
> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
> +{
> +	return NULL;
> +}
> +#endif
> +#endif
> +
> +#endif /* NVLINK2GPU_VFIO_PCI_H */
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index dbc0a6559914..8e81ea039f31 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -27,6 +27,10 @@
>   #include <linux/uaccess.h>
>   
>   #include "vfio_pci_core.h"
> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
> +#include "npu2_vfio_pci.h"
> +#include "nvlink2gpu_vfio_pci.h"
> +#endif
>   
>   #define DRIVER_VERSION  "0.2"
>   #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -142,14 +146,48 @@ static const struct vfio_device_ops vfio_pci_ops = {
>   	.match		= vfio_pci_core_match,
>   };
>   
> +/*
> + * This layer is used for backward compatibility. Hopefully it will be
> + * removed in the future.
> + */
> +static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev *pdev)
> +{
> +	switch (pdev->vendor) {
> +	case PCI_VENDOR_ID_NVIDIA:
> +		switch (pdev->device) {
> +		case 0x1db1:
> +		case 0x1db5:
> +		case 0x1db8:
> +		case 0x1df5:
> +			return get_nvlink2gpu_vfio_pci_driver(pdev);

This does not really need a switch, could simply call these 
get_xxxx_vfio_pci_driver. Thanks,


> +		default:
> +			return NULL;
> +		}
> +	case PCI_VENDOR_ID_IBM:
> +		switch (pdev->device) {
> +		case 0x04ea:
> +			return get_npu2_vfio_pci_driver(pdev);
> +		default:
> +			return NULL;
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
>   static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   {
>   	struct vfio_pci_device *vpdev;
> +	struct pci_driver *driver;
>   	int ret;
>   
>   	if (vfio_pci_is_denylisted(pdev))
>   		return -EINVAL;
>   
> +	driver = vfio_pci_get_compat_driver(pdev);
> +	if (driver)
> +		return driver->probe(pdev, id);
> +
>   	vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
>   	if (!vpdev)
>   		return -ENOMEM;
> @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   
>   static void vfio_pci_remove(struct pci_dev *pdev)
>   {
> -	struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
> -	struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
> -	struct vfio_pci_device *vpdev;
> -
> -	vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
> -
> -	vfio_pci_core_unregister_device(core_vpdev);
> -	kfree(vpdev);
> +	struct pci_driver *driver;
> +
> +	driver = vfio_pci_get_compat_driver(pdev);
> +	if (driver) {
> +		driver->remove(pdev);
> +	} else {
> +		struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
> +		struct vfio_pci_core_device *core_vpdev;
> +		struct vfio_pci_device *vpdev;
> +
> +		core_vpdev = vfio_device_data(vdev);
> +		vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
> +		vfio_pci_core_unregister_device(core_vpdev);
> +		kfree(vpdev);
> +	}
>   }
>   
>   static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 4de8e352df9c..f9b39abe54cb 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
>   		}
>   	}
>   
> -	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
> -	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
> -		ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
> -		if (ret && ret != -ENODEV) {
> -			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
> -			goto disable_exit;
> -		}
> -	}
> -
> -	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
> -	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
> -		ret = vfio_pci_ibm_npu2_init(vdev);
> -		if (ret && ret != -ENODEV) {
> -			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
> -			goto disable_exit;
> -		}
> -	}
> -
>   	return 0;
>   
>   disable_exit:
> diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h
> index 8989443c3086..31f3836e606e 100644
> --- a/drivers/vfio/pci/vfio_pci_core.h
> +++ b/drivers/vfio/pci/vfio_pci_core.h
> @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
>   	return -ENODEV;
>   }
>   #endif
> -#ifdef CONFIG_VFIO_PCI_NVLINK2
> -extern int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev);
> -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev);
> -#else
> -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
> -{
> -	return -ENODEV;
> -}
> -
> -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
> -{
> -	return -ENODEV;
> -}
> -#endif
>   
>   #ifdef CONFIG_S390
>   extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
>
Max Gurtovoy March 10, 2021, 12:57 p.m. UTC | #2
On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote:
>
>
> On 09/03/2021 19:33, Max Gurtovoy wrote:
>> The new drivers introduced are nvlink2gpu_vfio_pci.ko and
>> npu2_vfio_pci.ko.
>> The first will be responsible for providing special extensions for
>> NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the
>> future). The last will be responsible for POWER9 NPU2 unit (NVLink2 host
>> bus adapter).
>>
>> Also, preserve backward compatibility for users that were binding
>> NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will
>> be dropped in the future
>>
>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
>> ---
>>   drivers/vfio/pci/Kconfig                      |  28 +++-
>>   drivers/vfio/pci/Makefile                     |   7 +-
>>   .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c}  | 144 ++++++++++++++++-
>>   drivers/vfio/pci/npu2_vfio_pci.h              |  24 +++
>>   ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++-
>>   drivers/vfio/pci/nvlink2gpu_vfio_pci.h        |  24 +++
>>   drivers/vfio/pci/vfio_pci.c                   |  61 ++++++-
>>   drivers/vfio/pci/vfio_pci_core.c              |  18 ---
>>   drivers/vfio/pci/vfio_pci_core.h              |  14 --
>>   9 files changed, 422 insertions(+), 47 deletions(-)
>>   rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%)
>>   create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h
>>   rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => 
>> nvlink2gpu_vfio_pci.c} (67%)
>>   create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>>
>> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
>> index 829e90a2e5a3..88c89863a205 100644
>> --- a/drivers/vfio/pci/Kconfig
>> +++ b/drivers/vfio/pci/Kconfig
>> @@ -48,8 +48,30 @@ config VFIO_PCI_IGD
>>           To enable Intel IGD assignment through vfio-pci, say Y.
>>   -config VFIO_PCI_NVLINK2
>> -    def_bool y
>> +config VFIO_PCI_NVLINK2GPU
>> +    tristate "VFIO support for NVIDIA NVLINK2 GPUs"
>>       depends on VFIO_PCI_CORE && PPC_POWERNV
>>       help
>> -      VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
>> +      VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions
>> +      for P9 Witherspoon machine.
>> +
>> +config VFIO_PCI_NPU2
>> +    tristate "VFIO support for IBM NPU host bus adapter on P9"
>> +    depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV
>> +    help
>> +      VFIO PCI specific extensions for IBM NVLink2 host bus adapter 
>> on P9
>> +      Witherspoon machine.
>> +
>> +config VFIO_PCI_DRIVER_COMPAT
>> +    bool "VFIO PCI backward compatibility for vendor specific 
>> extensions"
>> +    default y
>> +    depends on VFIO_PCI
>> +    help
>> +      Say Y here if you want to preserve VFIO PCI backward
>> +      compatibility. vfio_pci.ko will continue to automatically use
>> +      the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to
>> +      a compatible device.
>> +
>> +      When N is selected the user must bind explicity to the module
>> +      they want to handle the device and vfio_pci.ko will have no
>> +      device specific special behaviors.
>> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
>> index f539f32c9296..86fb62e271fc 100644
>> --- a/drivers/vfio/pci/Makefile
>> +++ b/drivers/vfio/pci/Makefile
>> @@ -2,10 +2,15 @@
>>     obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
>>   obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
>> +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o
>> +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o
>>     vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o 
>> vfio_pci_rdwr.o vfio_pci_config.o
>>   vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
>> -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o 
>> vfio_pci_npu2.o
>>   vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o
>>     vfio-pci-y := vfio_pci.o
>> +
>> +npu2-vfio-pci-y := npu2_vfio_pci.o
>> +
>> +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o
>> diff --git a/drivers/vfio/pci/vfio_pci_npu2.c 
>> b/drivers/vfio/pci/npu2_vfio_pci.c
>> similarity index 64%
>> rename from drivers/vfio/pci/vfio_pci_npu2.c
>> rename to drivers/vfio/pci/npu2_vfio_pci.c
>> index 717745256ab3..7071bda0f2b6 100644
>> --- a/drivers/vfio/pci/vfio_pci_npu2.c
>> +++ b/drivers/vfio/pci/npu2_vfio_pci.c
>> @@ -14,19 +14,28 @@
>>    *    Author: Alex Williamson <alex.williamson@redhat.com>
>>    */
>>   +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>> +
>> +#include <linux/module.h>
>>   #include <linux/io.h>
>>   #include <linux/pci.h>
>>   #include <linux/uaccess.h>
>>   #include <linux/vfio.h>
>> +#include <linux/list.h>
>>   #include <linux/sched/mm.h>
>>   #include <linux/mmu_context.h>
>>   #include <asm/kvm_ppc.h>
>>     #include "vfio_pci_core.h"
>> +#include "npu2_vfio_pci.h"
>>     #define CREATE_TRACE_POINTS
>>   #include "npu2_trace.h"
>>   +#define DRIVER_VERSION  "0.1"
>> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
>> +#define DRIVER_DESC     "NPU2 VFIO PCI - User Level meta-driver for 
>> POWER9 NPU NVLink2 HBA"
>> +
>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
>>     struct vfio_pci_npu2_data {
>> @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data {
>>       unsigned int link_speed; /* The link speed from DT's 
>> ibm,nvlink-speed */
>>   };
>>   +struct npu2_vfio_pci_device {
>> +    struct vfio_pci_core_device    vdev;
>> +};
>> +
>>   static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev,
>>           char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>>   {
>> @@ -120,7 +133,7 @@ static const struct vfio_pci_regops 
>> vfio_pci_npu2_regops = {
>>       .add_capability = vfio_pci_npu2_add_capability,
>>   };
>>   -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>> +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>>   {
>>       int ret;
>>       struct vfio_pci_npu2_data *data;
>> @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct 
>> vfio_pci_core_device *vdev)
>>         return ret;
>>   }
>> +
>> +static void npu2_vfio_pci_release(void *device_data)
>> +{
>> +    struct vfio_pci_core_device *vdev = device_data;
>> +
>> +    mutex_lock(&vdev->reflck->lock);
>> +    if (!(--vdev->refcnt)) {
>> +        vfio_pci_vf_token_user_add(vdev, -1);
>> +        vfio_pci_core_spapr_eeh_release(vdev);
>> +        vfio_pci_core_disable(vdev);
>> +    }
>> +    mutex_unlock(&vdev->reflck->lock);
>> +
>> +    module_put(THIS_MODULE);
>> +}
>> +
>> +static int npu2_vfio_pci_open(void *device_data)
>> +{
>> +    struct vfio_pci_core_device *vdev = device_data;
>> +    int ret = 0;
>> +
>> +    if (!try_module_get(THIS_MODULE))
>> +        return -ENODEV;
>> +
>> +    mutex_lock(&vdev->reflck->lock);
>> +
>> +    if (!vdev->refcnt) {
>> +        ret = vfio_pci_core_enable(vdev);
>> +        if (ret)
>> +            goto error;
>> +
>> +        ret = vfio_pci_ibm_npu2_init(vdev);
>> +        if (ret && ret != -ENODEV) {
>> +            pci_warn(vdev->pdev,
>> +                 "Failed to setup NVIDIA NV2 ATSD region\n");
>> +            vfio_pci_core_disable(vdev);
>> +            goto error;
>> +        }
>> +        ret = 0;
>> +        vfio_pci_probe_mmaps(vdev);
>> +        vfio_pci_core_spapr_eeh_open(vdev);
>> +        vfio_pci_vf_token_user_add(vdev, 1);
>> +    }
>> +    vdev->refcnt++;
>> +error:
>> +    mutex_unlock(&vdev->reflck->lock);
>> +    if (ret)
>> +        module_put(THIS_MODULE);
>> +    return ret;
>> +}
>> +
>> +static const struct vfio_device_ops npu2_vfio_pci_ops = {
>> +    .name        = "npu2-vfio-pci",
>> +    .open        = npu2_vfio_pci_open,
>> +    .release    = npu2_vfio_pci_release,
>> +    .ioctl        = vfio_pci_core_ioctl,
>> +    .read        = vfio_pci_core_read,
>> +    .write        = vfio_pci_core_write,
>> +    .mmap        = vfio_pci_core_mmap,
>> +    .request    = vfio_pci_core_request,
>> +    .match        = vfio_pci_core_match,
>> +};
>> +
>> +static int npu2_vfio_pci_probe(struct pci_dev *pdev,
>> +        const struct pci_device_id *id)
>> +{
>> +    struct npu2_vfio_pci_device *npvdev;
>> +    int ret;
>> +
>> +    npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL);
>> +    if (!npvdev)
>> +        return -ENOMEM;
>> +
>> +    ret = vfio_pci_core_register_device(&npvdev->vdev, pdev,
>> +            &npu2_vfio_pci_ops);
>> +    if (ret)
>> +        goto out_free;
>> +
>> +    return 0;
>> +
>> +out_free:
>> +    kfree(npvdev);
>> +    return ret;
>> +}
>> +
>> +static void npu2_vfio_pci_remove(struct pci_dev *pdev)
>> +{
>> +    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>> +    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>> +    struct npu2_vfio_pci_device *npvdev;
>> +
>> +    npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, 
>> vdev);
>> +
>> +    vfio_pci_core_unregister_device(core_vpdev);
>> +    kfree(npvdev);
>> +}
>> +
>> +static const struct pci_device_id npu2_vfio_pci_table[] = {
>> +    { PCI_VDEVICE(IBM, 0x04ea) },
>> +    { 0, }
>> +};
>> +
>> +static struct pci_driver npu2_vfio_pci_driver = {
>> +    .name            = "npu2-vfio-pci",
>> +    .id_table        = npu2_vfio_pci_table,
>> +    .probe            = npu2_vfio_pci_probe,
>> +    .remove            = npu2_vfio_pci_remove,
>> +#ifdef CONFIG_PCI_IOV
>> +    .sriov_configure    = vfio_pci_core_sriov_configure,
>> +#endif
>> +    .err_handler        = &vfio_pci_core_err_handlers,
>> +};
>> +
>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
>> +{
>> +    if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev))
>> +        return &npu2_vfio_pci_driver;
>> +    return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver);
>> +#endif
>> +
>> +module_pci_driver(npu2_vfio_pci_driver);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> diff --git a/drivers/vfio/pci/npu2_vfio_pci.h 
>> b/drivers/vfio/pci/npu2_vfio_pci.h
>> new file mode 100644
>> index 000000000000..92010d340346
>> --- /dev/null
>> +++ b/drivers/vfio/pci/npu2_vfio_pci.h
>> @@ -0,0 +1,24 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights 
>> reserved.
>> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
>> + */
>> +
>> +#ifndef NPU2_VFIO_PCI_H
>> +#define NPU2_VFIO_PCI_H
>> +
>> +#include <linux/pci.h>
>> +#include <linux/module.h>
>> +
>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>> +#if defined(CONFIG_VFIO_PCI_NPU2) || 
>> defined(CONFIG_VFIO_PCI_NPU2_MODULE)
>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev);
>> +#else
>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
>> +{
>> +    return NULL;
>> +}
>> +#endif
>> +#endif
>> +
>> +#endif /* NPU2_VFIO_PCI_H */
>> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c 
>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>> similarity index 67%
>> rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c
>> rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>> index 6dce1e78ee82..84a5ac1ce8ac 100644
>> --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c
>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>> @@ -1,6 +1,6 @@
>>   // SPDX-License-Identifier: GPL-2.0-only
>>   /*
>> - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
>> + * VFIO PCI NVIDIA NVLink2 GPUs support.
>>    *
>>    * Copyright (C) 2018 IBM Corp.  All rights reserved.
>>    *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>> @@ -12,6 +12,9 @@
>>    *    Author: Alex Williamson <alex.williamson@redhat.com>
>>    */
>>   +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>> +
>> +#include <linux/module.h>
>>   #include <linux/io.h>
>>   #include <linux/pci.h>
>>   #include <linux/uaccess.h>
>> @@ -21,10 +24,15 @@
>>   #include <asm/kvm_ppc.h>
>>     #include "vfio_pci_core.h"
>> +#include "nvlink2gpu_vfio_pci.h"
>>     #define CREATE_TRACE_POINTS
>>   #include "nvlink2gpu_trace.h"
>>   +#define DRIVER_VERSION  "0.1"
>> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
>> +#define DRIVER_DESC     "NVLINK2GPU VFIO PCI - User Level 
>> meta-driver for NVIDIA NVLink2 GPUs"
>> +
>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
>>   @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data {
>>       struct notifier_block group_notifier;
>>   };
>>   +struct nv_vfio_pci_device {
>> +    struct vfio_pci_core_device    vdev;
>> +};
>> +
>>   static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev,
>>           char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>>   {
>> @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct 
>> notifier_block *nb,
>>       return NOTIFY_OK;
>>   }
>>   -int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device 
>> *vdev)
>> +static int
>> +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
>>   {
>>       int ret;
>>       u64 reg[2];
>> @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct 
>> vfio_pci_core_device *vdev)
>>         return ret;
>>   }
>> +
>> +static void nvlink2gpu_vfio_pci_release(void *device_data)
>> +{
>> +    struct vfio_pci_core_device *vdev = device_data;
>> +
>> +    mutex_lock(&vdev->reflck->lock);
>> +    if (!(--vdev->refcnt)) {
>> +        vfio_pci_vf_token_user_add(vdev, -1);
>> +        vfio_pci_core_spapr_eeh_release(vdev);
>> +        vfio_pci_core_disable(vdev);
>> +    }
>> +    mutex_unlock(&vdev->reflck->lock);
>> +
>> +    module_put(THIS_MODULE);
>> +}
>> +
>> +static int nvlink2gpu_vfio_pci_open(void *device_data)
>> +{
>> +    struct vfio_pci_core_device *vdev = device_data;
>> +    int ret = 0;
>> +
>> +    if (!try_module_get(THIS_MODULE))
>> +        return -ENODEV;
>> +
>> +    mutex_lock(&vdev->reflck->lock);
>> +
>> +    if (!vdev->refcnt) {
>> +        ret = vfio_pci_core_enable(vdev);
>> +        if (ret)
>> +            goto error;
>> +
>> +        ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
>> +        if (ret && ret != -ENODEV) {
>> +            pci_warn(vdev->pdev,
>> +                 "Failed to setup NVIDIA NV2 RAM region\n");
>> +            vfio_pci_core_disable(vdev);
>> +            goto error;
>> +        }
>> +        ret = 0;
>> +        vfio_pci_probe_mmaps(vdev);
>> +        vfio_pci_core_spapr_eeh_open(vdev);
>> +        vfio_pci_vf_token_user_add(vdev, 1);
>> +    }
>> +    vdev->refcnt++;
>> +error:
>> +    mutex_unlock(&vdev->reflck->lock);
>> +    if (ret)
>> +        module_put(THIS_MODULE);
>> +    return ret;
>> +}
>> +
>> +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = {
>> +    .name        = "nvlink2gpu-vfio-pci",
>> +    .open        = nvlink2gpu_vfio_pci_open,
>> +    .release    = nvlink2gpu_vfio_pci_release,
>> +    .ioctl        = vfio_pci_core_ioctl,
>> +    .read        = vfio_pci_core_read,
>> +    .write        = vfio_pci_core_write,
>> +    .mmap        = vfio_pci_core_mmap,
>> +    .request    = vfio_pci_core_request,
>> +    .match        = vfio_pci_core_match,
>> +};
>> +
>> +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev,
>> +        const struct pci_device_id *id)
>> +{
>> +    struct nv_vfio_pci_device *nvdev;
>> +    int ret;
>> +
>> +    nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL);
>> +    if (!nvdev)
>> +        return -ENOMEM;
>> +
>> +    ret = vfio_pci_core_register_device(&nvdev->vdev, pdev,
>> +            &nvlink2gpu_vfio_pci_ops);
>> +    if (ret)
>> +        goto out_free;
>> +
>> +    return 0;
>> +
>> +out_free:
>> +    kfree(nvdev);
>> +    return ret;
>> +}
>> +
>> +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev)
>> +{
>> +    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>> +    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>> +    struct nv_vfio_pci_device *nvdev;
>> +
>> +    nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, vdev);
>> +
>> +    vfio_pci_core_unregister_device(core_vpdev);
>> +    kfree(nvdev);
>> +}
>> +
>> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
>> +    { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla 
>> V100-SXM2-16GB */
>> +    { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla 
>> V100-SXM2-32GB */
>> +    { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla 
>> V100-SXM3-32GB */
>> +    { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla 
>> V100-SXM2-16GB */
>
>
> Where is this list from?
>
> Also, how is this supposed to work at the boot time? Will the kernel 
> try binding let's say this one and nouveau? Which one is going to win?

At boot time the nouveau driver will win, since the vfio drivers don't
declare MODULE_DEVICE_TABLE.
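
For archive readers, a minimal sketch of what this means in practice (the
device id is taken from the patch above; the commented-out line is the one
that is intentionally absent):

#include <linux/pci.h>

/* Sketch: why the regular GPU driver keeps the device at boot. */
static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
	{ PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* Tesla V100-SXM2-16GB */
	{ 0 }
};

/*
 * MODULE_DEVICE_TABLE(pci, nvlink2gpu_vfio_pci_table);
 *
 * Deliberately not emitted: without it no module alias is generated, so
 * udev/modprobe never auto-loads nvlink2gpu-vfio-pci for these ids and
 * nouveau wins the default match at boot.  Userspace has to bind the
 * device to this driver explicitly (e.g. via the driver_override sysfs
 * attribute) to hand it to VFIO.
 */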


>
>
>> +    { 0, }
>
>
> Why a comma?

I'll remove the comma.


>
>> +};
>
>
>
>> +
>> +static struct pci_driver nvlink2gpu_vfio_pci_driver = {
>> +    .name            = "nvlink2gpu-vfio-pci",
>> +    .id_table        = nvlink2gpu_vfio_pci_table,
>> +    .probe            = nvlink2gpu_vfio_pci_probe,
>> +    .remove            = nvlink2gpu_vfio_pci_remove,
>> +#ifdef CONFIG_PCI_IOV
>> +    .sriov_configure    = vfio_pci_core_sriov_configure,
>> +#endif
>
>
> What is this IOV business about?

It is carried over from vfio_pci:

#ifdef CONFIG_PCI_IOV
module_param(enable_sriov, bool, 0644);
MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV 
configuration.  Enabling SR-IOV on a PF typically requires support of 
the userspace PF driver, enabling VFs without such support may result in 
non-functional VFs or PF.");
#endif
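
The body of vfio_pci_core_sriov_configure() is not shown in this hunk; as a
rough, hypothetical sketch, a pci_driver sriov_configure callback
conventionally just toggles the VFs through the core SR-IOV helpers:

#include <linux/pci.h>

/*
 * Illustrative only -- not the actual vfio_pci_core_sriov_configure()
 * from this series, just the shape of the hook that the
 * "#ifdef CONFIG_PCI_IOV" block above wires up.
 */
static int example_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
{
	if (nr_virtfn == 0) {
		pci_disable_sriov(pdev);	/* tear down all VFs */
		return 0;
	}

	/* On success the callback returns the number of VFs enabled. */
	return pci_enable_sriov(pdev, nr_virtfn) ?: nr_virtfn;
}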


>
>
>> +    .err_handler        = &vfio_pci_core_err_handlers,
>> +};
>> +
>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
>> +{
>> +    if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
>> +        return &nvlink2gpu_vfio_pci_driver;
>
>
> Why do we need matching PCI ids here instead of looking at the FDT 
> which will work better?

What is FDT? And is it better to use it instead of match_id?


>
>
>> +    return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver);
>> +#endif
>> +
>> +module_pci_driver(nvlink2gpu_vfio_pci_driver);
>> +
>> +MODULE_VERSION(DRIVER_VERSION);
>> +MODULE_LICENSE("GPL v2");
>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>> +MODULE_DESCRIPTION(DRIVER_DESC);
>> diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h 
>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>> new file mode 100644
>> index 000000000000..ebd5b600b190
>> --- /dev/null
>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>> @@ -0,0 +1,24 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights 
>> reserved.
>> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
>> + */
>> +
>> +#ifndef NVLINK2GPU_VFIO_PCI_H
>> +#define NVLINK2GPU_VFIO_PCI_H
>> +
>> +#include <linux/pci.h>
>> +#include <linux/module.h>
>> +
>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>> +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || 
>> defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE)
>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev 
>> *pdev);
>> +#else
>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
>> +{
>> +    return NULL;
>> +}
>> +#endif
>> +#endif
>> +
>> +#endif /* NVLINK2GPU_VFIO_PCI_H */
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index dbc0a6559914..8e81ea039f31 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -27,6 +27,10 @@
>>   #include <linux/uaccess.h>
>>     #include "vfio_pci_core.h"
>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>> +#include "npu2_vfio_pci.h"
>> +#include "nvlink2gpu_vfio_pci.h"
>> +#endif
>>     #define DRIVER_VERSION  "0.2"
>>   #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
>> @@ -142,14 +146,48 @@ static const struct vfio_device_ops 
>> vfio_pci_ops = {
>>       .match        = vfio_pci_core_match,
>>   };
>>   +/*
>> + * This layer is used for backward compatibility. Hopefully it will be
>> + * removed in the future.
>> + */
>> +static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev 
>> *pdev)
>> +{
>> +    switch (pdev->vendor) {
>> +    case PCI_VENDOR_ID_NVIDIA:
>> +        switch (pdev->device) {
>> +        case 0x1db1:
>> +        case 0x1db5:
>> +        case 0x1db8:
>> +        case 0x1df5:
>> +            return get_nvlink2gpu_vfio_pci_driver(pdev);
>
> This does not really need a switch, could simply call these 
> get_xxxx_vfio_pci_driver. Thanks,

Maybe the result will be the same, but I don't think we need to send all
NVIDIA devices or IBM devices to this function.

We can maybe export the tables from the vendor vfio_pci drivers and match
them here.
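
For reference, the switch-less variant Alexey suggests could look roughly
like this (same helper names as in the patch; each helper already calls
pci_match_id() against its own table and returns NULL on a miss):

static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev *pdev)
{
	struct pci_driver *drv;

	/* Hypothetical sketch: let the vendor modules do the id matching. */
	drv = get_nvlink2gpu_vfio_pci_driver(pdev);
	if (drv)
		return drv;

	return get_npu2_vfio_pci_driver(pdev);
}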

>
>
>> +        default:
>> +            return NULL;
>> +        }
>> +    case PCI_VENDOR_ID_IBM:
>> +        switch (pdev->device) {
>> +        case 0x04ea:
>> +            return get_npu2_vfio_pci_driver(pdev);
>> +        default:
>> +            return NULL;
>> +        }
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>>   static int vfio_pci_probe(struct pci_dev *pdev, const struct 
>> pci_device_id *id)
>>   {
>>       struct vfio_pci_device *vpdev;
>> +    struct pci_driver *driver;
>>       int ret;
>>         if (vfio_pci_is_denylisted(pdev))
>>           return -EINVAL;
>>   +    driver = vfio_pci_get_compat_driver(pdev);
>> +    if (driver)
>> +        return driver->probe(pdev, id);
>> +
>>       vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
>>       if (!vpdev)
>>           return -ENOMEM;
>> @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, 
>> const struct pci_device_id *id)
>>     static void vfio_pci_remove(struct pci_dev *pdev)
>>   {
>> -    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>> -    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>> -    struct vfio_pci_device *vpdev;
>> -
>> -    vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
>> -
>> -    vfio_pci_core_unregister_device(core_vpdev);
>> -    kfree(vpdev);
>> +    struct pci_driver *driver;
>> +
>> +    driver = vfio_pci_get_compat_driver(pdev);
>> +    if (driver) {
>> +        driver->remove(pdev);
>> +    } else {
>> +        struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>> +        struct vfio_pci_core_device *core_vpdev;
>> +        struct vfio_pci_device *vpdev;
>> +
>> +        core_vpdev = vfio_device_data(vdev);
>> +        vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
>> +        vfio_pci_core_unregister_device(core_vpdev);
>> +        kfree(vpdev);
>> +    }
>>   }
>>     static int vfio_pci_sriov_configure(struct pci_dev *pdev, int 
>> nr_virtfn)
>> diff --git a/drivers/vfio/pci/vfio_pci_core.c 
>> b/drivers/vfio/pci/vfio_pci_core.c
>> index 4de8e352df9c..f9b39abe54cb 100644
>> --- a/drivers/vfio/pci/vfio_pci_core.c
>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>> @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct 
>> vfio_pci_core_device *vdev)
>>           }
>>       }
>>   -    if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
>> -        IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
>> -        ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
>> -        if (ret && ret != -ENODEV) {
>> -            pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
>> -            goto disable_exit;
>> -        }
>> -    }
>> -
>> -    if (pdev->vendor == PCI_VENDOR_ID_IBM &&
>> -        IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
>> -        ret = vfio_pci_ibm_npu2_init(vdev);
>> -        if (ret && ret != -ENODEV) {
>> -            pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
>> -            goto disable_exit;
>> -        }
>> -    }
>> -
>>       return 0;
>>     disable_exit:
>> diff --git a/drivers/vfio/pci/vfio_pci_core.h 
>> b/drivers/vfio/pci/vfio_pci_core.h
>> index 8989443c3086..31f3836e606e 100644
>> --- a/drivers/vfio/pci/vfio_pci_core.h
>> +++ b/drivers/vfio/pci/vfio_pci_core.h
>> @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct 
>> vfio_pci_core_device *vdev)
>>       return -ENODEV;
>>   }
>>   #endif
>> -#ifdef CONFIG_VFIO_PCI_NVLINK2
>> -extern int vfio_pci_nvidia_v100_nvlink2_init(struct 
>> vfio_pci_core_device *vdev);
>> -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev);
>> -#else
>> -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct 
>> vfio_pci_core_device *vdev)
>> -{
>> -    return -ENODEV;
>> -}
>> -
>> -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device 
>> *vdev)
>> -{
>> -    return -ENODEV;
>> -}
>> -#endif
>>     #ifdef CONFIG_S390
>>   extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device 
>> *vdev,
>>
>
Jason Gunthorpe March 10, 2021, 1:02 p.m. UTC | #3
On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote:

> > > +    .err_handler        = &vfio_pci_core_err_handlers,
> > > +};
> > > +
> > > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
> > > +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
> > > +{
> > > +    if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
> > > +        return &nvlink2gpu_vfio_pci_driver;
> > 
> > 
> > Why do we need matching PCI ids here instead of looking at the FDT which
> > will work better?
> 
> What is FDT? And is it better to use it instead of match_id?

This is emulating the device_driver match for the pci_driver.

I don't think we can combine FDT matching with pci_driver, can we?

Jason
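
Editorial aside: as Jason notes, struct pci_driver can only match on
pci_device_id, so a device-tree check would have to live in the vfio_pci
compat hook itself. A rough sketch under that assumption -- the property
name below is a placeholder, not taken from the patch or a confirmed
firmware binding:

#include <linux/of.h>
#include <linux/pci.h>

/* Hypothetical FDT-based check instead of PCI id matching. */
static bool is_nvlink2_gpu(struct pci_dev *pdev)
{
	struct device_node *dn = pdev->dev.of_node;

	/* "ibm,npu" is a placeholder property name, not verified here. */
	return dn && of_find_property(dn, "ibm,npu", NULL);
}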
Alexey Kardashevskiy March 10, 2021, 2:19 p.m. UTC | #4
On 10/03/2021 23:57, Max Gurtovoy wrote:
> 
> On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote:
>>
>>
>> On 09/03/2021 19:33, Max Gurtovoy wrote:
>>> The new drivers introduced are nvlink2gpu_vfio_pci.ko and
>>> npu2_vfio_pci.ko.
>>> The first will be responsible for providing special extensions for
>>> NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the
>>> future). The last will be responsible for POWER9 NPU2 unit (NVLink2 host
>>> bus adapter).
>>>
>>> Also, preserve backward compatibility for users that were binding
>>> NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will
>>> be dropped in the future
>>>
>>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
>>> ---
>>>   drivers/vfio/pci/Kconfig                      |  28 +++-
>>>   drivers/vfio/pci/Makefile                     |   7 +-
>>>   .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c}  | 144 ++++++++++++++++-
>>>   drivers/vfio/pci/npu2_vfio_pci.h              |  24 +++
>>>   ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++-
>>>   drivers/vfio/pci/nvlink2gpu_vfio_pci.h        |  24 +++
>>>   drivers/vfio/pci/vfio_pci.c                   |  61 ++++++-
>>>   drivers/vfio/pci/vfio_pci_core.c              |  18 ---
>>>   drivers/vfio/pci/vfio_pci_core.h              |  14 --
>>>   9 files changed, 422 insertions(+), 47 deletions(-)
>>>   rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%)
>>>   create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h
>>>   rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => 
>>> nvlink2gpu_vfio_pci.c} (67%)
>>>   create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>>>
>>> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
>>> index 829e90a2e5a3..88c89863a205 100644
>>> --- a/drivers/vfio/pci/Kconfig
>>> +++ b/drivers/vfio/pci/Kconfig
>>> @@ -48,8 +48,30 @@ config VFIO_PCI_IGD
>>>           To enable Intel IGD assignment through vfio-pci, say Y.
>>>   -config VFIO_PCI_NVLINK2
>>> -    def_bool y
>>> +config VFIO_PCI_NVLINK2GPU
>>> +    tristate "VFIO support for NVIDIA NVLINK2 GPUs"
>>>       depends on VFIO_PCI_CORE && PPC_POWERNV
>>>       help
>>> -      VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
>>> +      VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions
>>> +      for P9 Witherspoon machine.
>>> +
>>> +config VFIO_PCI_NPU2
>>> +    tristate "VFIO support for IBM NPU host bus adapter on P9"
>>> +    depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV
>>> +    help
>>> +      VFIO PCI specific extensions for IBM NVLink2 host bus adapter 
>>> on P9
>>> +      Witherspoon machine.
>>> +
>>> +config VFIO_PCI_DRIVER_COMPAT
>>> +    bool "VFIO PCI backward compatibility for vendor specific 
>>> extensions"
>>> +    default y
>>> +    depends on VFIO_PCI
>>> +    help
>>> +      Say Y here if you want to preserve VFIO PCI backward
>>> +      compatibility. vfio_pci.ko will continue to automatically use
>>> +      the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to
>>> +      a compatible device.
>>> +
>>> +      When N is selected the user must bind explicity to the module
>>> +      they want to handle the device and vfio_pci.ko will have no
>>> +      device specific special behaviors.
>>> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
>>> index f539f32c9296..86fb62e271fc 100644
>>> --- a/drivers/vfio/pci/Makefile
>>> +++ b/drivers/vfio/pci/Makefile
>>> @@ -2,10 +2,15 @@
>>>     obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
>>>   obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
>>> +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o
>>> +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o
>>>     vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o 
>>> vfio_pci_rdwr.o vfio_pci_config.o
>>>   vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
>>> -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o 
>>> vfio_pci_npu2.o
>>>   vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o
>>>     vfio-pci-y := vfio_pci.o
>>> +
>>> +npu2-vfio-pci-y := npu2_vfio_pci.o
>>> +
>>> +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o
>>> diff --git a/drivers/vfio/pci/vfio_pci_npu2.c 
>>> b/drivers/vfio/pci/npu2_vfio_pci.c
>>> similarity index 64%
>>> rename from drivers/vfio/pci/vfio_pci_npu2.c
>>> rename to drivers/vfio/pci/npu2_vfio_pci.c
>>> index 717745256ab3..7071bda0f2b6 100644
>>> --- a/drivers/vfio/pci/vfio_pci_npu2.c
>>> +++ b/drivers/vfio/pci/npu2_vfio_pci.c
>>> @@ -14,19 +14,28 @@
>>>    *    Author: Alex Williamson <alex.williamson@redhat.com>
>>>    */
>>>   +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>>> +
>>> +#include <linux/module.h>
>>>   #include <linux/io.h>
>>>   #include <linux/pci.h>
>>>   #include <linux/uaccess.h>
>>>   #include <linux/vfio.h>
>>> +#include <linux/list.h>
>>>   #include <linux/sched/mm.h>
>>>   #include <linux/mmu_context.h>
>>>   #include <asm/kvm_ppc.h>
>>>     #include "vfio_pci_core.h"
>>> +#include "npu2_vfio_pci.h"
>>>     #define CREATE_TRACE_POINTS
>>>   #include "npu2_trace.h"
>>>   +#define DRIVER_VERSION  "0.1"
>>> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
>>> +#define DRIVER_DESC     "NPU2 VFIO PCI - User Level meta-driver for 
>>> POWER9 NPU NVLink2 HBA"
>>> +
>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
>>>     struct vfio_pci_npu2_data {
>>> @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data {
>>>       unsigned int link_speed; /* The link speed from DT's 
>>> ibm,nvlink-speed */
>>>   };
>>>   +struct npu2_vfio_pci_device {
>>> +    struct vfio_pci_core_device    vdev;
>>> +};
>>> +
>>>   static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev,
>>>           char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>>>   {
>>> @@ -120,7 +133,7 @@ static const struct vfio_pci_regops 
>>> vfio_pci_npu2_regops = {
>>>       .add_capability = vfio_pci_npu2_add_capability,
>>>   };
>>>   -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>>> +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>>>   {
>>>       int ret;
>>>       struct vfio_pci_npu2_data *data;
>>> @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct 
>>> vfio_pci_core_device *vdev)
>>>         return ret;
>>>   }
>>> +
>>> +static void npu2_vfio_pci_release(void *device_data)
>>> +{
>>> +    struct vfio_pci_core_device *vdev = device_data;
>>> +
>>> +    mutex_lock(&vdev->reflck->lock);
>>> +    if (!(--vdev->refcnt)) {
>>> +        vfio_pci_vf_token_user_add(vdev, -1);
>>> +        vfio_pci_core_spapr_eeh_release(vdev);
>>> +        vfio_pci_core_disable(vdev);
>>> +    }
>>> +    mutex_unlock(&vdev->reflck->lock);
>>> +
>>> +    module_put(THIS_MODULE);
>>> +}
>>> +
>>> +static int npu2_vfio_pci_open(void *device_data)
>>> +{
>>> +    struct vfio_pci_core_device *vdev = device_data;
>>> +    int ret = 0;
>>> +
>>> +    if (!try_module_get(THIS_MODULE))
>>> +        return -ENODEV;
>>> +
>>> +    mutex_lock(&vdev->reflck->lock);
>>> +
>>> +    if (!vdev->refcnt) {
>>> +        ret = vfio_pci_core_enable(vdev);
>>> +        if (ret)
>>> +            goto error;
>>> +
>>> +        ret = vfio_pci_ibm_npu2_init(vdev);
>>> +        if (ret && ret != -ENODEV) {
>>> +            pci_warn(vdev->pdev,
>>> +                 "Failed to setup NVIDIA NV2 ATSD region\n");
>>> +            vfio_pci_core_disable(vdev);
>>> +            goto error;
>>> +        }
>>> +        ret = 0;
>>> +        vfio_pci_probe_mmaps(vdev);
>>> +        vfio_pci_core_spapr_eeh_open(vdev);
>>> +        vfio_pci_vf_token_user_add(vdev, 1);
>>> +    }
>>> +    vdev->refcnt++;
>>> +error:
>>> +    mutex_unlock(&vdev->reflck->lock);
>>> +    if (ret)
>>> +        module_put(THIS_MODULE);
>>> +    return ret;
>>> +}
>>> +
>>> +static const struct vfio_device_ops npu2_vfio_pci_ops = {
>>> +    .name        = "npu2-vfio-pci",
>>> +    .open        = npu2_vfio_pci_open,
>>> +    .release    = npu2_vfio_pci_release,
>>> +    .ioctl        = vfio_pci_core_ioctl,
>>> +    .read        = vfio_pci_core_read,
>>> +    .write        = vfio_pci_core_write,
>>> +    .mmap        = vfio_pci_core_mmap,
>>> +    .request    = vfio_pci_core_request,
>>> +    .match        = vfio_pci_core_match,
>>> +};
>>> +
>>> +static int npu2_vfio_pci_probe(struct pci_dev *pdev,
>>> +        const struct pci_device_id *id)
>>> +{
>>> +    struct npu2_vfio_pci_device *npvdev;
>>> +    int ret;
>>> +
>>> +    npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL);
>>> +    if (!npvdev)
>>> +        return -ENOMEM;
>>> +
>>> +    ret = vfio_pci_core_register_device(&npvdev->vdev, pdev,
>>> +            &npu2_vfio_pci_ops);
>>> +    if (ret)
>>> +        goto out_free;
>>> +
>>> +    return 0;
>>> +
>>> +out_free:
>>> +    kfree(npvdev);
>>> +    return ret;
>>> +}
>>> +
>>> +static void npu2_vfio_pci_remove(struct pci_dev *pdev)
>>> +{
>>> +    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>> +    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>>> +    struct npu2_vfio_pci_device *npvdev;
>>> +
>>> +    npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, 
>>> vdev);
>>> +
>>> +    vfio_pci_core_unregister_device(core_vpdev);
>>> +    kfree(npvdev);
>>> +}
>>> +
>>> +static const struct pci_device_id npu2_vfio_pci_table[] = {
>>> +    { PCI_VDEVICE(IBM, 0x04ea) },
>>> +    { 0, }
>>> +};
>>> +
>>> +static struct pci_driver npu2_vfio_pci_driver = {
>>> +    .name            = "npu2-vfio-pci",
>>> +    .id_table        = npu2_vfio_pci_table,
>>> +    .probe            = npu2_vfio_pci_probe,
>>> +    .remove            = npu2_vfio_pci_remove,
>>> +#ifdef CONFIG_PCI_IOV
>>> +    .sriov_configure    = vfio_pci_core_sriov_configure,
>>> +#endif
>>> +    .err_handler        = &vfio_pci_core_err_handlers,
>>> +};
>>> +
>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
>>> +{
>>> +    if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev))
>>> +        return &npu2_vfio_pci_driver;
>>> +    return NULL;
>>> +}
>>> +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver);
>>> +#endif
>>> +
>>> +module_pci_driver(npu2_vfio_pci_driver);
>>> +
>>> +MODULE_VERSION(DRIVER_VERSION);
>>> +MODULE_LICENSE("GPL v2");
>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>>> +MODULE_DESCRIPTION(DRIVER_DESC);
>>> diff --git a/drivers/vfio/pci/npu2_vfio_pci.h 
>>> b/drivers/vfio/pci/npu2_vfio_pci.h
>>> new file mode 100644
>>> index 000000000000..92010d340346
>>> --- /dev/null
>>> +++ b/drivers/vfio/pci/npu2_vfio_pci.h
>>> @@ -0,0 +1,24 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +/*
>>> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights 
>>> reserved.
>>> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
>>> + */
>>> +
>>> +#ifndef NPU2_VFIO_PCI_H
>>> +#define NPU2_VFIO_PCI_H
>>> +
>>> +#include <linux/pci.h>
>>> +#include <linux/module.h>
>>> +
>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>> +#if defined(CONFIG_VFIO_PCI_NPU2) || 
>>> defined(CONFIG_VFIO_PCI_NPU2_MODULE)
>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev);
>>> +#else
>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
>>> +{
>>> +    return NULL;
>>> +}
>>> +#endif
>>> +#endif
>>> +
>>> +#endif /* NPU2_VFIO_PCI_H */
>>> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c 
>>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>>> similarity index 67%
>>> rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c
>>> rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>>> index 6dce1e78ee82..84a5ac1ce8ac 100644
>>> --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c
>>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>>> @@ -1,6 +1,6 @@
>>>   // SPDX-License-Identifier: GPL-2.0-only
>>>   /*
>>> - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
>>> + * VFIO PCI NVIDIA NVLink2 GPUs support.
>>>    *
>>>    * Copyright (C) 2018 IBM Corp.  All rights reserved.
>>>    *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> @@ -12,6 +12,9 @@
>>>    *    Author: Alex Williamson <alex.williamson@redhat.com>
>>>    */
>>>   +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>>> +
>>> +#include <linux/module.h>
>>>   #include <linux/io.h>
>>>   #include <linux/pci.h>
>>>   #include <linux/uaccess.h>
>>> @@ -21,10 +24,15 @@
>>>   #include <asm/kvm_ppc.h>
>>>     #include "vfio_pci_core.h"
>>> +#include "nvlink2gpu_vfio_pci.h"
>>>     #define CREATE_TRACE_POINTS
>>>   #include "nvlink2gpu_trace.h"
>>>   +#define DRIVER_VERSION  "0.1"
>>> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
>>> +#define DRIVER_DESC     "NVLINK2GPU VFIO PCI - User Level 
>>> meta-driver for NVIDIA NVLink2 GPUs"
>>> +
>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
>>>   @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data {
>>>       struct notifier_block group_notifier;
>>>   };
>>>   +struct nv_vfio_pci_device {
>>> +    struct vfio_pci_core_device    vdev;
>>> +};
>>> +
>>>   static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev,
>>>           char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>>>   {
>>> @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct 
>>> notifier_block *nb,
>>>       return NOTIFY_OK;
>>>   }
>>>   -int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device 
>>> *vdev)
>>> +static int
>>> +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
>>>   {
>>>       int ret;
>>>       u64 reg[2];
>>> @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct 
>>> vfio_pci_core_device *vdev)
>>>         return ret;
>>>   }
>>> +
>>> +static void nvlink2gpu_vfio_pci_release(void *device_data)
>>> +{
>>> +    struct vfio_pci_core_device *vdev = device_data;
>>> +
>>> +    mutex_lock(&vdev->reflck->lock);
>>> +    if (!(--vdev->refcnt)) {
>>> +        vfio_pci_vf_token_user_add(vdev, -1);
>>> +        vfio_pci_core_spapr_eeh_release(vdev);
>>> +        vfio_pci_core_disable(vdev);
>>> +    }
>>> +    mutex_unlock(&vdev->reflck->lock);
>>> +
>>> +    module_put(THIS_MODULE);
>>> +}
>>> +
>>> +static int nvlink2gpu_vfio_pci_open(void *device_data)
>>> +{
>>> +    struct vfio_pci_core_device *vdev = device_data;
>>> +    int ret = 0;
>>> +
>>> +    if (!try_module_get(THIS_MODULE))
>>> +        return -ENODEV;
>>> +
>>> +    mutex_lock(&vdev->reflck->lock);
>>> +
>>> +    if (!vdev->refcnt) {
>>> +        ret = vfio_pci_core_enable(vdev);
>>> +        if (ret)
>>> +            goto error;
>>> +
>>> +        ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
>>> +        if (ret && ret != -ENODEV) {
>>> +            pci_warn(vdev->pdev,
>>> +                 "Failed to setup NVIDIA NV2 RAM region\n");
>>> +            vfio_pci_core_disable(vdev);
>>> +            goto error;
>>> +        }
>>> +        ret = 0;
>>> +        vfio_pci_probe_mmaps(vdev);
>>> +        vfio_pci_core_spapr_eeh_open(vdev);
>>> +        vfio_pci_vf_token_user_add(vdev, 1);
>>> +    }
>>> +    vdev->refcnt++;
>>> +error:
>>> +    mutex_unlock(&vdev->reflck->lock);
>>> +    if (ret)
>>> +        module_put(THIS_MODULE);
>>> +    return ret;
>>> +}
>>> +
>>> +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = {
>>> +    .name        = "nvlink2gpu-vfio-pci",
>>> +    .open        = nvlink2gpu_vfio_pci_open,
>>> +    .release    = nvlink2gpu_vfio_pci_release,
>>> +    .ioctl        = vfio_pci_core_ioctl,
>>> +    .read        = vfio_pci_core_read,
>>> +    .write        = vfio_pci_core_write,
>>> +    .mmap        = vfio_pci_core_mmap,
>>> +    .request    = vfio_pci_core_request,
>>> +    .match        = vfio_pci_core_match,
>>> +};
>>> +
>>> +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev,
>>> +        const struct pci_device_id *id)
>>> +{
>>> +    struct nv_vfio_pci_device *nvdev;
>>> +    int ret;
>>> +
>>> +    nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL);
>>> +    if (!nvdev)
>>> +        return -ENOMEM;
>>> +
>>> +    ret = vfio_pci_core_register_device(&nvdev->vdev, pdev,
>>> +            &nvlink2gpu_vfio_pci_ops);
>>> +    if (ret)
>>> +        goto out_free;
>>> +
>>> +    return 0;
>>> +
>>> +out_free:
>>> +    kfree(nvdev);
>>> +    return ret;
>>> +}
>>> +
>>> +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev)
>>> +{
>>> +    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>> +    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>>> +    struct nv_vfio_pci_device *nvdev;
>>> +
>>> +    nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, vdev);
>>> +
>>> +    vfio_pci_core_unregister_device(core_vpdev);
>>> +    kfree(nvdev);
>>> +}
>>> +
>>> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
>>> +    { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla 
>>> V100-SXM2-16GB */
>>> +    { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla 
>>> V100-SXM2-32GB */
>>> +    { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla 
>>> V100-SXM3-32GB */
>>> +    { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla 
>>> V100-SXM2-16GB */
>>
>>
>> Where is this list from?
>>
>> Also, how is this supposed to work at the boot time? Will the kernel 
>> try binding let's say this one and nouveau? Which one is going to win?
> 
> At boot time nouveau driver will win since the vfio drivers don't 
> declare MODULE_DEVICE_TABLE


ok but where is the list from anyway?


> 
> 
>>
>>
>>> +    { 0, }
>>
>>
>> Why a comma?
> 
> I'll remove the comma.
> 
> 
>>
>>> +};
>>
>>
>>
>>> +
>>> +static struct pci_driver nvlink2gpu_vfio_pci_driver = {
>>> +    .name            = "nvlink2gpu-vfio-pci",
>>> +    .id_table        = nvlink2gpu_vfio_pci_table,
>>> +    .probe            = nvlink2gpu_vfio_pci_probe,
>>> +    .remove            = nvlink2gpu_vfio_pci_remove,
>>> +#ifdef CONFIG_PCI_IOV
>>> +    .sriov_configure    = vfio_pci_core_sriov_configure,
>>> +#endif
>>
>>
>> What is this IOV business about?
> 
> from vfio_pci
> 
> #ifdef CONFIG_PCI_IOV
> module_param(enable_sriov, bool, 0644);
> MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV 
> configuration.  Enabling SR-IOV on a PF typically requires support of 
> the userspace PF driver, enabling VFs without such support may result in 
> non-functional VFs or PF.");
> #endif


I know what IOV is in general :) What I meant to say was that I am 
pretty sure these GPUs cannot do IOV so this does not need to be in 
these NVLink drivers.



> 
> 
>>
>>
>>> +    .err_handler        = &vfio_pci_core_err_handlers,
>>> +};
>>> +
>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
>>> +{
>>> +    if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
>>> +        return &nvlink2gpu_vfio_pci_driver;
>>
>>
>> Why do we need matching PCI ids here instead of looking at the FDT 
>> which will work better?
> 
> what is FDT ? any is it better to use it instead of match_id ?


Flattened Device Tree - a way for the firmware to pass the configuration 
to the OS. This data tells whether there are NVLinks and what they are 
linked to, so it is what defines whether the feature is available; it 
should work with any GPU in this form factor.
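
For illustration only (not from the patch), the platform check boils down 
to something like this; pnv_pci_get_npu_dev() is the powernv helper the 
existing nvlink2 code already relies on:

    /* Sketch: NVLink2 presence comes from the firmware-provided device
     * tree (is there an NPU2 linked to this GPU?), not from the PCI ID. */
    static bool gpu_has_nvlink2(struct pci_dev *pdev)
    {
            return pnv_pci_get_npu_dev(pdev, 0) != NULL;
    }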


> 
>>
>>
>>> +    return NULL;
>>> +}
>>> +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver);
>>> +#endif
>>> +
>>> +module_pci_driver(nvlink2gpu_vfio_pci_driver);
>>> +
>>> +MODULE_VERSION(DRIVER_VERSION);
>>> +MODULE_LICENSE("GPL v2");
>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>>> +MODULE_DESCRIPTION(DRIVER_DESC);
>>> diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h 
>>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>>> new file mode 100644
>>> index 000000000000..ebd5b600b190
>>> --- /dev/null
>>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>>> @@ -0,0 +1,24 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +/*
>>> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights 
>>> reserved.
>>> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
>>> + */
>>> +
>>> +#ifndef NVLINK2GPU_VFIO_PCI_H
>>> +#define NVLINK2GPU_VFIO_PCI_H
>>> +
>>> +#include <linux/pci.h>
>>> +#include <linux/module.h>
>>> +
>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>> +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || 
>>> defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE)
>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev 
>>> *pdev);
>>> +#else
>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
>>> +{
>>> +    return NULL;
>>> +}
>>> +#endif
>>> +#endif
>>> +
>>> +#endif /* NVLINK2GPU_VFIO_PCI_H */
>>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>>> index dbc0a6559914..8e81ea039f31 100644
>>> --- a/drivers/vfio/pci/vfio_pci.c
>>> +++ b/drivers/vfio/pci/vfio_pci.c
>>> @@ -27,6 +27,10 @@
>>>   #include <linux/uaccess.h>
>>>     #include "vfio_pci_core.h"
>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>> +#include "npu2_vfio_pci.h"
>>> +#include "nvlink2gpu_vfio_pci.h"
>>> +#endif
>>>     #define DRIVER_VERSION  "0.2"
>>>   #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
>>> @@ -142,14 +146,48 @@ static const struct vfio_device_ops 
>>> vfio_pci_ops = {
>>>       .match        = vfio_pci_core_match,
>>>   };
>>>   +/*
>>> + * This layer is used for backward compatibility. Hopefully it will be
>>> + * removed in the future.
>>> + */
>>> +static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev 
>>> *pdev)
>>> +{
>>> +    switch (pdev->vendor) {
>>> +    case PCI_VENDOR_ID_NVIDIA:
>>> +        switch (pdev->device) {
>>> +        case 0x1db1:
>>> +        case 0x1db5:
>>> +        case 0x1db8:
>>> +        case 0x1df5:
>>> +            return get_nvlink2gpu_vfio_pci_driver(pdev);
>>
>> This does not really need a switch, could simply call these 
>> get_xxxx_vfio_pci_driver. Thanks,
> 
> maybe the result will be the same but I don't think we need to send all 
> NVIDIA devices or IBM devices to this function.

We can tolerate this on POWER (the check is really cheap) and for 
everybody else this driver won't even compile.


> we can maybe export the tables from the vfio_vendor driver and match it 
> here.


I am still missing the point of device matching. It won't bind by 
default at boot time, and it won't make existing users' lives any 
easier as they use libvirt, which overrides this anyway.


>>
>>
>>> +        default:
>>> +            return NULL;
>>> +        }
>>> +    case PCI_VENDOR_ID_IBM:
>>> +        switch (pdev->device) {
>>> +        case 0x04ea:
>>> +            return get_npu2_vfio_pci_driver(pdev);
>>> +        default:
>>> +            return NULL;
>>> +        }
>>> +    }
>>> +
>>> +    return NULL;
>>> +}
>>> +
>>>   static int vfio_pci_probe(struct pci_dev *pdev, const struct 
>>> pci_device_id *id)
>>>   {
>>>       struct vfio_pci_device *vpdev;
>>> +    struct pci_driver *driver;
>>>       int ret;
>>>         if (vfio_pci_is_denylisted(pdev))
>>>           return -EINVAL;
>>>   +    driver = vfio_pci_get_compat_driver(pdev);
>>> +    if (driver)
>>> +        return driver->probe(pdev, id);
>>> +
>>>       vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
>>>       if (!vpdev)
>>>           return -ENOMEM;
>>> @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, 
>>> const struct pci_device_id *id)
>>>     static void vfio_pci_remove(struct pci_dev *pdev)
>>>   {
>>> -    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>> -    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>>> -    struct vfio_pci_device *vpdev;
>>> -
>>> -    vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
>>> -
>>> -    vfio_pci_core_unregister_device(core_vpdev);
>>> -    kfree(vpdev);
>>> +    struct pci_driver *driver;
>>> +
>>> +    driver = vfio_pci_get_compat_driver(pdev);
>>> +    if (driver) {
>>> +        driver->remove(pdev);
>>> +    } else {
>>> +        struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>> +        struct vfio_pci_core_device *core_vpdev;
>>> +        struct vfio_pci_device *vpdev;
>>> +
>>> +        core_vpdev = vfio_device_data(vdev);
>>> +        vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
>>> +        vfio_pci_core_unregister_device(core_vpdev);
>>> +        kfree(vpdev);
>>> +    }
>>>   }
>>>     static int vfio_pci_sriov_configure(struct pci_dev *pdev, int 
>>> nr_virtfn)
>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c 
>>> b/drivers/vfio/pci/vfio_pci_core.c
>>> index 4de8e352df9c..f9b39abe54cb 100644
>>> --- a/drivers/vfio/pci/vfio_pci_core.c
>>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>>> @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct 
>>> vfio_pci_core_device *vdev)
>>>           }
>>>       }
>>>   -    if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
>>> -        IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
>>> -        ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
>>> -        if (ret && ret != -ENODEV) {
>>> -            pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
>>> -            goto disable_exit;
>>> -        }
>>> -    }
>>> -
>>> -    if (pdev->vendor == PCI_VENDOR_ID_IBM &&
>>> -        IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
>>> -        ret = vfio_pci_ibm_npu2_init(vdev);
>>> -        if (ret && ret != -ENODEV) {
>>> -            pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
>>> -            goto disable_exit;
>>> -        }
>>> -    }
>>> -
>>>       return 0;
>>>     disable_exit:
>>> diff --git a/drivers/vfio/pci/vfio_pci_core.h 
>>> b/drivers/vfio/pci/vfio_pci_core.h
>>> index 8989443c3086..31f3836e606e 100644
>>> --- a/drivers/vfio/pci/vfio_pci_core.h
>>> +++ b/drivers/vfio/pci/vfio_pci_core.h
>>> @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct 
>>> vfio_pci_core_device *vdev)
>>>       return -ENODEV;
>>>   }
>>>   #endif
>>> -#ifdef CONFIG_VFIO_PCI_NVLINK2
>>> -extern int vfio_pci_nvidia_v100_nvlink2_init(struct 
>>> vfio_pci_core_device *vdev);
>>> -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev);
>>> -#else
>>> -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct 
>>> vfio_pci_core_device *vdev)
>>> -{
>>> -    return -ENODEV;
>>> -}
>>> -
>>> -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device 
>>> *vdev)
>>> -{
>>> -    return -ENODEV;
>>> -}
>>> -#endif
>>>     #ifdef CONFIG_S390
>>>   extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device 
>>> *vdev,
>>>
>>
Alexey Kardashevskiy March 10, 2021, 2:24 p.m. UTC | #5
On 11/03/2021 00:02, Jason Gunthorpe wrote:
> On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote:
> 
>>>> +    .err_handler        = &vfio_pci_core_err_handlers,
>>>> +};
>>>> +
>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
>>>> +{
>>>> +    if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
>>>> +        return &nvlink2gpu_vfio_pci_driver;
>>>
>>>
>>> Why do we need matching PCI ids here instead of looking at the FDT which
>>> will work better?
>>
>> what is FDT ? any is it better to use it instead of match_id ?
> 
> This is emulating the device_driver match for the pci_driver.


No it is not; it is device tree info which lets the kernel skip the Linux 
PCI discovery part (the firmware does it anyway), but it tells nothing 
about which drivers to bind.


> I don't think we can combine FDT matching with pci_driver, can we?

It is a C function calling another C function, all within vfio-pci; it 
is not called by the generic PCI code.
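
To sketch the idea (not proposed code; the helper is the same powernv one 
the nvlink2 code uses), the compat lookup could gate on the device tree 
instead of on PCI IDs:

    static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev *pdev)
    {
            /* Plain C inside vfio-pci, not driver-core matching: ask the
             * device tree whether this GPU has an NPU2 linked to it. */
            if (pnv_pci_get_npu_dev(pdev, 0))
                    return get_nvlink2gpu_vfio_pci_driver(pdev);

            return NULL;    /* NPU2 HBA handling omitted in this sketch */
    }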
Jason Gunthorpe March 10, 2021, 7:40 p.m. UTC | #6
On Thu, Mar 11, 2021 at 01:24:47AM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 11/03/2021 00:02, Jason Gunthorpe wrote:
> > On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote:
> > 
> > > > > +    .err_handler        = &vfio_pci_core_err_handlers,
> > > > > +};
> > > > > +
> > > > > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
> > > > > +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
> > > > > +{
> > > > > +    if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
> > > > > +        return &nvlink2gpu_vfio_pci_driver;
> > > > 
> > > > 
> > > > Why do we need matching PCI ids here instead of looking at the FDT which
> > > > will work better?
> > > 
> > > what is FDT ? any is it better to use it instead of match_id ?
> > 
> > This is emulating the device_driver match for the pci_driver.
> 
> No it is not, it is a device tree info which lets to skip the linux PCI
> discovery part (the firmware does it anyway) but it tells nothing about
> which drivers to bind.

I mean get_nvlink2gpu_vfio_pci_driver() is emulating the PCI match.

Max added a pci driver for NPU here:

+static struct pci_driver npu2_vfio_pci_driver = {
+	.name			= "npu2-vfio-pci",
+	.id_table		= npu2_vfio_pci_table,
+	.probe			= npu2_vfio_pci_probe,


New userspace should use driver_override with "npu2-vfio-pci" as the
string, not "vfio-pci".

The point of the get_npu2_vfio_pci_driver() is only optional
compatibility to redirect old userspace using "vfio-pci" in the
driver_override to the now split driver code so userspace doesn't see
any change in behavior.

If we don't do this then the vfio-pci driver override will disable the
npu2 special stuff, since Max took it all out of vfio-pci's
pci_driver.

It is supposed to match exactly the same match table as the pci_driver
above. We *don't* want different behavior from what the standard PCI
driver matcher will do.

Since we don't have any way to mix FDT discovery into the standard
PCI driver match, it will still attach the npu driver but not enable
any special support. This seems OK.

Jason
Max Gurtovoy March 11, 2021, 1:10 a.m. UTC | #7
On 3/10/2021 4:19 PM, Alexey Kardashevskiy wrote:
>
>
> On 10/03/2021 23:57, Max Gurtovoy wrote:
>>
>> On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote:
>>>
>>>
>>> On 09/03/2021 19:33, Max Gurtovoy wrote:
>>>> The new drivers introduced are nvlink2gpu_vfio_pci.ko and
>>>> npu2_vfio_pci.ko.
>>>> The first will be responsible for providing special extensions for
>>>> NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the
>>>> future). The last will be responsible for POWER9 NPU2 unit (NVLink2 
>>>> host
>>>> bus adapter).
>>>>
>>>> Also, preserve backward compatibility for users that were binding
>>>> NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer 
>>>> will
>>>> be dropped in the future
>>>>
>>>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
>>>> ---
>>>>   drivers/vfio/pci/Kconfig                      |  28 +++-
>>>>   drivers/vfio/pci/Makefile                     |   7 +-
>>>>   .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c}  | 144 
>>>> ++++++++++++++++-
>>>>   drivers/vfio/pci/npu2_vfio_pci.h              |  24 +++
>>>>   ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 
>>>> +++++++++++++++++-
>>>>   drivers/vfio/pci/nvlink2gpu_vfio_pci.h        |  24 +++
>>>>   drivers/vfio/pci/vfio_pci.c                   |  61 ++++++-
>>>>   drivers/vfio/pci/vfio_pci_core.c              |  18 ---
>>>>   drivers/vfio/pci/vfio_pci_core.h              |  14 --
>>>>   9 files changed, 422 insertions(+), 47 deletions(-)
>>>>   rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%)
>>>>   create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h
>>>>   rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => 
>>>> nvlink2gpu_vfio_pci.c} (67%)
>>>>   create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>>>>
>>>> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
>>>> index 829e90a2e5a3..88c89863a205 100644
>>>> --- a/drivers/vfio/pci/Kconfig
>>>> +++ b/drivers/vfio/pci/Kconfig
>>>> @@ -48,8 +48,30 @@ config VFIO_PCI_IGD
>>>>           To enable Intel IGD assignment through vfio-pci, say Y.
>>>>   -config VFIO_PCI_NVLINK2
>>>> -    def_bool y
>>>> +config VFIO_PCI_NVLINK2GPU
>>>> +    tristate "VFIO support for NVIDIA NVLINK2 GPUs"
>>>>       depends on VFIO_PCI_CORE && PPC_POWERNV
>>>>       help
>>>> -      VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 
>>>> GPUs
>>>> +      VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific 
>>>> extensions
>>>> +      for P9 Witherspoon machine.
>>>> +
>>>> +config VFIO_PCI_NPU2
>>>> +    tristate "VFIO support for IBM NPU host bus adapter on P9"
>>>> +    depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV
>>>> +    help
>>>> +      VFIO PCI specific extensions for IBM NVLink2 host bus 
>>>> adapter on P9
>>>> +      Witherspoon machine.
>>>> +
>>>> +config VFIO_PCI_DRIVER_COMPAT
>>>> +    bool "VFIO PCI backward compatibility for vendor specific 
>>>> extensions"
>>>> +    default y
>>>> +    depends on VFIO_PCI
>>>> +    help
>>>> +      Say Y here if you want to preserve VFIO PCI backward
>>>> +      compatibility. vfio_pci.ko will continue to automatically use
>>>> +      the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to
>>>> +      a compatible device.
>>>> +
>>>> +      When N is selected the user must bind explicity to the module
>>>> +      they want to handle the device and vfio_pci.ko will have no
>>>> +      device specific special behaviors.
>>>> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
>>>> index f539f32c9296..86fb62e271fc 100644
>>>> --- a/drivers/vfio/pci/Makefile
>>>> +++ b/drivers/vfio/pci/Makefile
>>>> @@ -2,10 +2,15 @@
>>>>     obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
>>>>   obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
>>>> +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o
>>>> +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o
>>>>     vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o 
>>>> vfio_pci_rdwr.o vfio_pci_config.o
>>>>   vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
>>>> -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o 
>>>> vfio_pci_npu2.o
>>>>   vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o
>>>>     vfio-pci-y := vfio_pci.o
>>>> +
>>>> +npu2-vfio-pci-y := npu2_vfio_pci.o
>>>> +
>>>> +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o
>>>> diff --git a/drivers/vfio/pci/vfio_pci_npu2.c 
>>>> b/drivers/vfio/pci/npu2_vfio_pci.c
>>>> similarity index 64%
>>>> rename from drivers/vfio/pci/vfio_pci_npu2.c
>>>> rename to drivers/vfio/pci/npu2_vfio_pci.c
>>>> index 717745256ab3..7071bda0f2b6 100644
>>>> --- a/drivers/vfio/pci/vfio_pci_npu2.c
>>>> +++ b/drivers/vfio/pci/npu2_vfio_pci.c
>>>> @@ -14,19 +14,28 @@
>>>>    *    Author: Alex Williamson <alex.williamson@redhat.com>
>>>>    */
>>>>   +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>>>> +
>>>> +#include <linux/module.h>
>>>>   #include <linux/io.h>
>>>>   #include <linux/pci.h>
>>>>   #include <linux/uaccess.h>
>>>>   #include <linux/vfio.h>
>>>> +#include <linux/list.h>
>>>>   #include <linux/sched/mm.h>
>>>>   #include <linux/mmu_context.h>
>>>>   #include <asm/kvm_ppc.h>
>>>>     #include "vfio_pci_core.h"
>>>> +#include "npu2_vfio_pci.h"
>>>>     #define CREATE_TRACE_POINTS
>>>>   #include "npu2_trace.h"
>>>>   +#define DRIVER_VERSION  "0.1"
>>>> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
>>>> +#define DRIVER_DESC     "NPU2 VFIO PCI - User Level meta-driver 
>>>> for POWER9 NPU NVLink2 HBA"
>>>> +
>>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
>>>>     struct vfio_pci_npu2_data {
>>>> @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data {
>>>>       unsigned int link_speed; /* The link speed from DT's 
>>>> ibm,nvlink-speed */
>>>>   };
>>>>   +struct npu2_vfio_pci_device {
>>>> +    struct vfio_pci_core_device    vdev;
>>>> +};
>>>> +
>>>>   static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev,
>>>>           char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>>>>   {
>>>> @@ -120,7 +133,7 @@ static const struct vfio_pci_regops 
>>>> vfio_pci_npu2_regops = {
>>>>       .add_capability = vfio_pci_npu2_add_capability,
>>>>   };
>>>>   -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>>>> +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
>>>>   {
>>>>       int ret;
>>>>       struct vfio_pci_npu2_data *data;
>>>> @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct 
>>>> vfio_pci_core_device *vdev)
>>>>         return ret;
>>>>   }
>>>> +
>>>> +static void npu2_vfio_pci_release(void *device_data)
>>>> +{
>>>> +    struct vfio_pci_core_device *vdev = device_data;
>>>> +
>>>> +    mutex_lock(&vdev->reflck->lock);
>>>> +    if (!(--vdev->refcnt)) {
>>>> +        vfio_pci_vf_token_user_add(vdev, -1);
>>>> +        vfio_pci_core_spapr_eeh_release(vdev);
>>>> +        vfio_pci_core_disable(vdev);
>>>> +    }
>>>> +    mutex_unlock(&vdev->reflck->lock);
>>>> +
>>>> +    module_put(THIS_MODULE);
>>>> +}
>>>> +
>>>> +static int npu2_vfio_pci_open(void *device_data)
>>>> +{
>>>> +    struct vfio_pci_core_device *vdev = device_data;
>>>> +    int ret = 0;
>>>> +
>>>> +    if (!try_module_get(THIS_MODULE))
>>>> +        return -ENODEV;
>>>> +
>>>> +    mutex_lock(&vdev->reflck->lock);
>>>> +
>>>> +    if (!vdev->refcnt) {
>>>> +        ret = vfio_pci_core_enable(vdev);
>>>> +        if (ret)
>>>> +            goto error;
>>>> +
>>>> +        ret = vfio_pci_ibm_npu2_init(vdev);
>>>> +        if (ret && ret != -ENODEV) {
>>>> +            pci_warn(vdev->pdev,
>>>> +                 "Failed to setup NVIDIA NV2 ATSD region\n");
>>>> +            vfio_pci_core_disable(vdev);
>>>> +            goto error;
>>>> +        }
>>>> +        ret = 0;
>>>> +        vfio_pci_probe_mmaps(vdev);
>>>> +        vfio_pci_core_spapr_eeh_open(vdev);
>>>> +        vfio_pci_vf_token_user_add(vdev, 1);
>>>> +    }
>>>> +    vdev->refcnt++;
>>>> +error:
>>>> +    mutex_unlock(&vdev->reflck->lock);
>>>> +    if (ret)
>>>> +        module_put(THIS_MODULE);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static const struct vfio_device_ops npu2_vfio_pci_ops = {
>>>> +    .name        = "npu2-vfio-pci",
>>>> +    .open        = npu2_vfio_pci_open,
>>>> +    .release    = npu2_vfio_pci_release,
>>>> +    .ioctl        = vfio_pci_core_ioctl,
>>>> +    .read        = vfio_pci_core_read,
>>>> +    .write        = vfio_pci_core_write,
>>>> +    .mmap        = vfio_pci_core_mmap,
>>>> +    .request    = vfio_pci_core_request,
>>>> +    .match        = vfio_pci_core_match,
>>>> +};
>>>> +
>>>> +static int npu2_vfio_pci_probe(struct pci_dev *pdev,
>>>> +        const struct pci_device_id *id)
>>>> +{
>>>> +    struct npu2_vfio_pci_device *npvdev;
>>>> +    int ret;
>>>> +
>>>> +    npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL);
>>>> +    if (!npvdev)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    ret = vfio_pci_core_register_device(&npvdev->vdev, pdev,
>>>> +            &npu2_vfio_pci_ops);
>>>> +    if (ret)
>>>> +        goto out_free;
>>>> +
>>>> +    return 0;
>>>> +
>>>> +out_free:
>>>> +    kfree(npvdev);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static void npu2_vfio_pci_remove(struct pci_dev *pdev)
>>>> +{
>>>> +    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>>> +    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>>>> +    struct npu2_vfio_pci_device *npvdev;
>>>> +
>>>> +    npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, 
>>>> vdev);
>>>> +
>>>> +    vfio_pci_core_unregister_device(core_vpdev);
>>>> +    kfree(npvdev);
>>>> +}
>>>> +
>>>> +static const struct pci_device_id npu2_vfio_pci_table[] = {
>>>> +    { PCI_VDEVICE(IBM, 0x04ea) },
>>>> +    { 0, }
>>>> +};
>>>> +
>>>> +static struct pci_driver npu2_vfio_pci_driver = {
>>>> +    .name            = "npu2-vfio-pci",
>>>> +    .id_table        = npu2_vfio_pci_table,
>>>> +    .probe            = npu2_vfio_pci_probe,
>>>> +    .remove            = npu2_vfio_pci_remove,
>>>> +#ifdef CONFIG_PCI_IOV
>>>> +    .sriov_configure    = vfio_pci_core_sriov_configure,
>>>> +#endif
>>>> +    .err_handler        = &vfio_pci_core_err_handlers,
>>>> +};
>>>> +
>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
>>>> +{
>>>> +    if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev))
>>>> +        return &npu2_vfio_pci_driver;
>>>> +    return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver);
>>>> +#endif
>>>> +
>>>> +module_pci_driver(npu2_vfio_pci_driver);
>>>> +
>>>> +MODULE_VERSION(DRIVER_VERSION);
>>>> +MODULE_LICENSE("GPL v2");
>>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>>>> +MODULE_DESCRIPTION(DRIVER_DESC);
>>>> diff --git a/drivers/vfio/pci/npu2_vfio_pci.h 
>>>> b/drivers/vfio/pci/npu2_vfio_pci.h
>>>> new file mode 100644
>>>> index 000000000000..92010d340346
>>>> --- /dev/null
>>>> +++ b/drivers/vfio/pci/npu2_vfio_pci.h
>>>> @@ -0,0 +1,24 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>>> +/*
>>>> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights 
>>>> reserved.
>>>> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
>>>> + */
>>>> +
>>>> +#ifndef NPU2_VFIO_PCI_H
>>>> +#define NPU2_VFIO_PCI_H
>>>> +
>>>> +#include <linux/pci.h>
>>>> +#include <linux/module.h>
>>>> +
>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>>> +#if defined(CONFIG_VFIO_PCI_NPU2) || 
>>>> defined(CONFIG_VFIO_PCI_NPU2_MODULE)
>>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev);
>>>> +#else
>>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
>>>> +{
>>>> +    return NULL;
>>>> +}
>>>> +#endif
>>>> +#endif
>>>> +
>>>> +#endif /* NPU2_VFIO_PCI_H */
>>>> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c 
>>>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>>>> similarity index 67%
>>>> rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c
>>>> rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>>>> index 6dce1e78ee82..84a5ac1ce8ac 100644
>>>> --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c
>>>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
>>>> @@ -1,6 +1,6 @@
>>>>   // SPDX-License-Identifier: GPL-2.0-only
>>>>   /*
>>>> - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
>>>> + * VFIO PCI NVIDIA NVLink2 GPUs support.
>>>>    *
>>>>    * Copyright (C) 2018 IBM Corp.  All rights reserved.
>>>>    *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> @@ -12,6 +12,9 @@
>>>>    *    Author: Alex Williamson <alex.williamson@redhat.com>
>>>>    */
>>>>   +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>>>> +
>>>> +#include <linux/module.h>
>>>>   #include <linux/io.h>
>>>>   #include <linux/pci.h>
>>>>   #include <linux/uaccess.h>
>>>> @@ -21,10 +24,15 @@
>>>>   #include <asm/kvm_ppc.h>
>>>>     #include "vfio_pci_core.h"
>>>> +#include "nvlink2gpu_vfio_pci.h"
>>>>     #define CREATE_TRACE_POINTS
>>>>   #include "nvlink2gpu_trace.h"
>>>>   +#define DRIVER_VERSION  "0.1"
>>>> +#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
>>>> +#define DRIVER_DESC     "NVLINK2GPU VFIO PCI - User Level 
>>>> meta-driver for NVIDIA NVLink2 GPUs"
>>>> +
>>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
>>>>   EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
>>>>   @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data {
>>>>       struct notifier_block group_notifier;
>>>>   };
>>>>   +struct nv_vfio_pci_device {
>>>> +    struct vfio_pci_core_device    vdev;
>>>> +};
>>>> +
>>>>   static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev,
>>>>           char __user *buf, size_t count, loff_t *ppos, bool iswrite)
>>>>   {
>>>> @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct 
>>>> notifier_block *nb,
>>>>       return NOTIFY_OK;
>>>>   }
>>>>   -int vfio_pci_nvidia_v100_nvlink2_init(struct 
>>>> vfio_pci_core_device *vdev)
>>>> +static int
>>>> +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
>>>>   {
>>>>       int ret;
>>>>       u64 reg[2];
>>>> @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct 
>>>> vfio_pci_core_device *vdev)
>>>>         return ret;
>>>>   }
>>>> +
>>>> +static void nvlink2gpu_vfio_pci_release(void *device_data)
>>>> +{
>>>> +    struct vfio_pci_core_device *vdev = device_data;
>>>> +
>>>> +    mutex_lock(&vdev->reflck->lock);
>>>> +    if (!(--vdev->refcnt)) {
>>>> +        vfio_pci_vf_token_user_add(vdev, -1);
>>>> +        vfio_pci_core_spapr_eeh_release(vdev);
>>>> +        vfio_pci_core_disable(vdev);
>>>> +    }
>>>> +    mutex_unlock(&vdev->reflck->lock);
>>>> +
>>>> +    module_put(THIS_MODULE);
>>>> +}
>>>> +
>>>> +static int nvlink2gpu_vfio_pci_open(void *device_data)
>>>> +{
>>>> +    struct vfio_pci_core_device *vdev = device_data;
>>>> +    int ret = 0;
>>>> +
>>>> +    if (!try_module_get(THIS_MODULE))
>>>> +        return -ENODEV;
>>>> +
>>>> +    mutex_lock(&vdev->reflck->lock);
>>>> +
>>>> +    if (!vdev->refcnt) {
>>>> +        ret = vfio_pci_core_enable(vdev);
>>>> +        if (ret)
>>>> +            goto error;
>>>> +
>>>> +        ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
>>>> +        if (ret && ret != -ENODEV) {
>>>> +            pci_warn(vdev->pdev,
>>>> +                 "Failed to setup NVIDIA NV2 RAM region\n");
>>>> +            vfio_pci_core_disable(vdev);
>>>> +            goto error;
>>>> +        }
>>>> +        ret = 0;
>>>> +        vfio_pci_probe_mmaps(vdev);
>>>> +        vfio_pci_core_spapr_eeh_open(vdev);
>>>> +        vfio_pci_vf_token_user_add(vdev, 1);
>>>> +    }
>>>> +    vdev->refcnt++;
>>>> +error:
>>>> +    mutex_unlock(&vdev->reflck->lock);
>>>> +    if (ret)
>>>> +        module_put(THIS_MODULE);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = {
>>>> +    .name        = "nvlink2gpu-vfio-pci",
>>>> +    .open        = nvlink2gpu_vfio_pci_open,
>>>> +    .release    = nvlink2gpu_vfio_pci_release,
>>>> +    .ioctl        = vfio_pci_core_ioctl,
>>>> +    .read        = vfio_pci_core_read,
>>>> +    .write        = vfio_pci_core_write,
>>>> +    .mmap        = vfio_pci_core_mmap,
>>>> +    .request    = vfio_pci_core_request,
>>>> +    .match        = vfio_pci_core_match,
>>>> +};
>>>> +
>>>> +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev,
>>>> +        const struct pci_device_id *id)
>>>> +{
>>>> +    struct nv_vfio_pci_device *nvdev;
>>>> +    int ret;
>>>> +
>>>> +    nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL);
>>>> +    if (!nvdev)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    ret = vfio_pci_core_register_device(&nvdev->vdev, pdev,
>>>> +            &nvlink2gpu_vfio_pci_ops);
>>>> +    if (ret)
>>>> +        goto out_free;
>>>> +
>>>> +    return 0;
>>>> +
>>>> +out_free:
>>>> +    kfree(nvdev);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev)
>>>> +{
>>>> +    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>>> +    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>>>> +    struct nv_vfio_pci_device *nvdev;
>>>> +
>>>> +    nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, 
>>>> vdev);
>>>> +
>>>> +    vfio_pci_core_unregister_device(core_vpdev);
>>>> +    kfree(nvdev);
>>>> +}
>>>> +
>>>> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
>>>> +    { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla 
>>>> V100-SXM2-16GB */
>>>> +    { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla 
>>>> V100-SXM2-32GB */
>>>> +    { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla 
>>>> V100-SXM3-32GB */
>>>> +    { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla 
>>>> V100-SXM2-16GB */
>>>
>>>
>>> Where is this list from?
>>>
>>> Also, how is this supposed to work at the boot time? Will the kernel 
>>> try binding let's say this one and nouveau? Which one is going to win?
>>
>> At boot time nouveau driver will win since the vfio drivers don't 
>> declare MODULE_DEVICE_TABLE
>
>
> ok but where is the list from anyway?

I did some checking and was told that the SXM devices were the ones that 
were integrated into P9.

If you or anyone on the mailing list has comments here, please add them 
and I'll double check.

>
>
>>
>>
>>>
>>>
>>>> +    { 0, }
>>>
>>>
>>> Why a comma?
>>
>> I'll remove the comma.
>>
>>
>>>
>>>> +};
>>>
>>>
>>>
>>>> +
>>>> +static struct pci_driver nvlink2gpu_vfio_pci_driver = {
>>>> +    .name            = "nvlink2gpu-vfio-pci",
>>>> +    .id_table        = nvlink2gpu_vfio_pci_table,
>>>> +    .probe            = nvlink2gpu_vfio_pci_probe,
>>>> +    .remove            = nvlink2gpu_vfio_pci_remove,
>>>> +#ifdef CONFIG_PCI_IOV
>>>> +    .sriov_configure    = vfio_pci_core_sriov_configure,
>>>> +#endif
>>>
>>>
>>> What is this IOV business about?
>>
>> from vfio_pci
>>
>> #ifdef CONFIG_PCI_IOV
>> module_param(enable_sriov, bool, 0644);
>> MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV 
>> configuration.  Enabling SR-IOV on a PF typically requires support of 
>> the userspace PF driver, enabling VFs without such support may result 
>> in non-functional VFs or PF.");
>> #endif
>
>
> I know what IOV is in general :) What I meant to say was that I am 
> pretty sure these GPUs cannot do IOV so this does not need to be in 
> these NVLink drivers.

Thanks.

I'll verify it and remove it for v4.


>
>
>
>>
>>
>>>
>>>
>>>> +    .err_handler        = &vfio_pci_core_err_handlers,
>>>> +};
>>>> +
>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev 
>>>> *pdev)
>>>> +{
>>>> +    if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
>>>> +        return &nvlink2gpu_vfio_pci_driver;
>>>
>>>
>>> Why do we need matching PCI ids here instead of looking at the FDT 
>>> which will work better?
>>
>> what is FDT ? any is it better to use it instead of match_id ?
>
>
> Flattened Device Tree - a way for the firmware to pass the 
> configuration to the OS. This data tells if there are NVLinks and what 
> they are linked to. This defines if the feature is available as it 
> should work with any GPU in this form factor.
>
>
>>
>>>
>>>
>>>> +    return NULL;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver);
>>>> +#endif
>>>> +
>>>> +module_pci_driver(nvlink2gpu_vfio_pci_driver);
>>>> +
>>>> +MODULE_VERSION(DRIVER_VERSION);
>>>> +MODULE_LICENSE("GPL v2");
>>>> +MODULE_AUTHOR(DRIVER_AUTHOR);
>>>> +MODULE_DESCRIPTION(DRIVER_DESC);
>>>> diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h 
>>>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>>>> new file mode 100644
>>>> index 000000000000..ebd5b600b190
>>>> --- /dev/null
>>>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
>>>> @@ -0,0 +1,24 @@
>>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>>> +/*
>>>> + * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights 
>>>> reserved.
>>>> + *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
>>>> + */
>>>> +
>>>> +#ifndef NVLINK2GPU_VFIO_PCI_H
>>>> +#define NVLINK2GPU_VFIO_PCI_H
>>>> +
>>>> +#include <linux/pci.h>
>>>> +#include <linux/module.h>
>>>> +
>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>>> +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || 
>>>> defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE)
>>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev 
>>>> *pdev);
>>>> +#else
>>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev 
>>>> *pdev)
>>>> +{
>>>> +    return NULL;
>>>> +}
>>>> +#endif
>>>> +#endif
>>>> +
>>>> +#endif /* NVLINK2GPU_VFIO_PCI_H */
>>>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>>>> index dbc0a6559914..8e81ea039f31 100644
>>>> --- a/drivers/vfio/pci/vfio_pci.c
>>>> +++ b/drivers/vfio/pci/vfio_pci.c
>>>> @@ -27,6 +27,10 @@
>>>>   #include <linux/uaccess.h>
>>>>     #include "vfio_pci_core.h"
>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>>> +#include "npu2_vfio_pci.h"
>>>> +#include "nvlink2gpu_vfio_pci.h"
>>>> +#endif
>>>>     #define DRIVER_VERSION  "0.2"
>>>>   #define DRIVER_AUTHOR   "Alex Williamson 
>>>> <alex.williamson@redhat.com>"
>>>> @@ -142,14 +146,48 @@ static const struct vfio_device_ops 
>>>> vfio_pci_ops = {
>>>>       .match        = vfio_pci_core_match,
>>>>   };
>>>>   +/*
>>>> + * This layer is used for backward compatibility. Hopefully it 
>>>> will be
>>>> + * removed in the future.
>>>> + */
>>>> +static struct pci_driver *vfio_pci_get_compat_driver(struct 
>>>> pci_dev *pdev)
>>>> +{
>>>> +    switch (pdev->vendor) {
>>>> +    case PCI_VENDOR_ID_NVIDIA:
>>>> +        switch (pdev->device) {
>>>> +        case 0x1db1:
>>>> +        case 0x1db5:
>>>> +        case 0x1db8:
>>>> +        case 0x1df5:
>>>> +            return get_nvlink2gpu_vfio_pci_driver(pdev);
>>>
>>> This does not really need a switch, could simply call these 
>>> get_xxxx_vfio_pci_driver. Thanks,
>>
>> maybe the result will be the same but I don't think we need to send 
>> all NVIDIA devices or IBM devices to this function.
>
> We can tolerate this on POWER (the check is really cheap) and for 
> everybody else this driver won't even compile.

I'll improve this function.

Thanks.
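
For what it's worth, a version without the nested switch could be as small 
as this (a sketch; the getters already return NULL when pci_match_id() 
fails):

    static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev *pdev)
    {
            struct pci_driver *drv;

            /* Let each vendor module match against its own id table
             * instead of duplicating the IDs here. */
            drv = get_nvlink2gpu_vfio_pci_driver(pdev);
            if (drv)
                    return drv;

            return get_npu2_vfio_pci_driver(pdev);
    }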


>
>
>> we can maybe export the tables from the vfio_vendor driver and match 
>> it here.
>
>
> I am still missing the point of device matching. It won't bind by 
> default at the boot time and it won't make the existing user life any 
> easier as they use libvirt which overrides this anyway.

We're trying to improve the subsystem to be more flexible while still 
preserving backward compatibility for the near future.


>
>
>>>
>>>
>>>> +        default:
>>>> +            return NULL;
>>>> +        }
>>>> +    case PCI_VENDOR_ID_IBM:
>>>> +        switch (pdev->device) {
>>>> +        case 0x04ea:
>>>> +            return get_npu2_vfio_pci_driver(pdev);
>>>> +        default:
>>>> +            return NULL;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return NULL;
>>>> +}
>>>> +
>>>>   static int vfio_pci_probe(struct pci_dev *pdev, const struct 
>>>> pci_device_id *id)
>>>>   {
>>>>       struct vfio_pci_device *vpdev;
>>>> +    struct pci_driver *driver;
>>>>       int ret;
>>>>         if (vfio_pci_is_denylisted(pdev))
>>>>           return -EINVAL;
>>>>   +    driver = vfio_pci_get_compat_driver(pdev);
>>>> +    if (driver)
>>>> +        return driver->probe(pdev, id);
>>>> +
>>>>       vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
>>>>       if (!vpdev)
>>>>           return -ENOMEM;
>>>> @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev 
>>>> *pdev, const struct pci_device_id *id)
>>>>     static void vfio_pci_remove(struct pci_dev *pdev)
>>>>   {
>>>> -    struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>>> -    struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
>>>> -    struct vfio_pci_device *vpdev;
>>>> -
>>>> -    vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
>>>> -
>>>> -    vfio_pci_core_unregister_device(core_vpdev);
>>>> -    kfree(vpdev);
>>>> +    struct pci_driver *driver;
>>>> +
>>>> +    driver = vfio_pci_get_compat_driver(pdev);
>>>> +    if (driver) {
>>>> +        driver->remove(pdev);
>>>> +    } else {
>>>> +        struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
>>>> +        struct vfio_pci_core_device *core_vpdev;
>>>> +        struct vfio_pci_device *vpdev;
>>>> +
>>>> +        core_vpdev = vfio_device_data(vdev);
>>>> +        vpdev = container_of(core_vpdev, struct vfio_pci_device, 
>>>> vdev);
>>>> +        vfio_pci_core_unregister_device(core_vpdev);
>>>> +        kfree(vpdev);
>>>> +    }
>>>>   }
>>>>     static int vfio_pci_sriov_configure(struct pci_dev *pdev, int 
>>>> nr_virtfn)
>>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c 
>>>> b/drivers/vfio/pci/vfio_pci_core.c
>>>> index 4de8e352df9c..f9b39abe54cb 100644
>>>> --- a/drivers/vfio/pci/vfio_pci_core.c
>>>> +++ b/drivers/vfio/pci/vfio_pci_core.c
>>>> @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct 
>>>> vfio_pci_core_device *vdev)
>>>>           }
>>>>       }
>>>>   -    if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
>>>> -        IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
>>>> -        ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
>>>> -        if (ret && ret != -ENODEV) {
>>>> -            pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM 
>>>> region\n");
>>>> -            goto disable_exit;
>>>> -        }
>>>> -    }
>>>> -
>>>> -    if (pdev->vendor == PCI_VENDOR_ID_IBM &&
>>>> -        IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
>>>> -        ret = vfio_pci_ibm_npu2_init(vdev);
>>>> -        if (ret && ret != -ENODEV) {
>>>> -            pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD 
>>>> region\n");
>>>> -            goto disable_exit;
>>>> -        }
>>>> -    }
>>>> -
>>>>       return 0;
>>>>     disable_exit:
>>>> diff --git a/drivers/vfio/pci/vfio_pci_core.h 
>>>> b/drivers/vfio/pci/vfio_pci_core.h
>>>> index 8989443c3086..31f3836e606e 100644
>>>> --- a/drivers/vfio/pci/vfio_pci_core.h
>>>> +++ b/drivers/vfio/pci/vfio_pci_core.h
>>>> @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct 
>>>> vfio_pci_core_device *vdev)
>>>>       return -ENODEV;
>>>>   }
>>>>   #endif
>>>> -#ifdef CONFIG_VFIO_PCI_NVLINK2
>>>> -extern int vfio_pci_nvidia_v100_nvlink2_init(struct 
>>>> vfio_pci_core_device *vdev);
>>>> -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev);
>>>> -#else
>>>> -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct 
>>>> vfio_pci_core_device *vdev)
>>>> -{
>>>> -    return -ENODEV;
>>>> -}
>>>> -
>>>> -static inline int vfio_pci_ibm_npu2_init(struct 
>>>> vfio_pci_core_device *vdev)
>>>> -{
>>>> -    return -ENODEV;
>>>> -}
>>>> -#endif
>>>>     #ifdef CONFIG_S390
>>>>   extern int vfio_pci_info_zdev_add_caps(struct 
>>>> vfio_pci_core_device *vdev,
>>>>
>>>
>
Alexey Kardashevskiy March 11, 2021, 1:20 a.m. UTC | #8
On 11/03/2021 06:40, Jason Gunthorpe wrote:
> On Thu, Mar 11, 2021 at 01:24:47AM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 11/03/2021 00:02, Jason Gunthorpe wrote:
>>> On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote:
>>>
>>>>>> +    .err_handler        = &vfio_pci_core_err_handlers,
>>>>>> +};
>>>>>> +
>>>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
>>>>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
>>>>>> +{
>>>>>> +    if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
>>>>>> +        return &nvlink2gpu_vfio_pci_driver;
>>>>>
>>>>>
>>>>> Why do we need matching PCI ids here instead of looking at the FDT which
>>>>> will work better?
>>>>
>>>> what is FDT ? any is it better to use it instead of match_id ?
>>>
>>> This is emulating the device_driver match for the pci_driver.
>>
>> No it is not, it is a device tree info which lets to skip the linux PCI
>> discovery part (the firmware does it anyway) but it tells nothing about
>> which drivers to bind.
> 
> I mean get_nvlink2gpu_vfio_pci_driver() is emulating the PCI match.
> 
> Max added a pci driver for NPU here:
> 
> +static struct pci_driver npu2_vfio_pci_driver = {
> +	.name			= "npu2-vfio-pci",
> +	.id_table		= npu2_vfio_pci_table,
> +	.probe			= npu2_vfio_pci_probe,
> 
> 
> new userspace should use driver_override with "npu-vfio-pci" as the
> string not "vfio-pci"
> 
> The point of the get_npu2_vfio_pci_driver() is only optional
> compatibility to redirect old userspace using "vfio-pci" in the
> driver_override to the now split driver code so userspace doesn't see
> any change in behavior.
> 
> If we don't do this then the vfio-pci driver override will disable the
> npu2 special stuff, since Max took it all out of vfio-pci's
> pci_driver.
> 
> It is supposed to match exactly the same match table as the pci_driver
> above. We *don't* want different behavior from what the standrd PCI
> driver matcher will do.


This is not a standard PCI driver though, and the main vfio-pci won't 
ever have a list to match. The IBM NPU PCI id is unlikely to ever change, 
but NVIDIA keeps making new devices which work in those P9 boxes - are 
you going to keep adding those ids to nvlink2gpu_vfio_pci_table? By the 
way, can the id list have only vendor ids and not device ids?


> Since we don't have any way to mix in FDT discovery to the standard
> PCI driver match it will still attach the npu driver but not enable
> any special support. This seems OK.
Jason Gunthorpe March 11, 2021, 1:34 a.m. UTC | #9
On Thu, Mar 11, 2021 at 12:20:33PM +1100, Alexey Kardashevskiy wrote:

> > It is supposed to match exactly the same match table as the pci_driver
> > above. We *don't* want different behavior from what the standrd PCI
> > driver matcher will do.
> 
> This is not a standard PCI driver though 

It is now; that is what this patch makes it into. This is why it now
has a struct pci_driver.

> and the main vfio-pci won't have a
> list to match ever.

?? vfio-pci uses driver_override or new_id to manage its match list

> IBM NPU PCI id is unlikely to change ever but NVIDIA keeps making
> new devices which work in those P9 boxes, are you going to keep
> adding those ids to nvlink2gpu_vfio_pci_table?

Certainly, as needed. PCI list updates are normal for the kernel.

> btw can the id list have only vendor ids and not have device ids?

The PCI matcher is quite flexible; see the other patch from Max for
the IGD.
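
For example (illustrative only, and deliberately broader than what this 
patch does), a vendor- or class-wide entry is possible:

    /* Illustrative only: match any NVIDIA 3D controller instead of
     * listing each SXM2/SXM3 device id one by one. */
    static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
            { PCI_DEVICE(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID),
              .class = PCI_CLASS_DISPLAY_3D << 8, .class_mask = 0xffff00 },
            { 0 }
    };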

But best practice is to be as narrow as possible as I hope this will
eventually impact module autoloading and other details.

Jason
Alexey Kardashevskiy March 11, 2021, 1:42 a.m. UTC | #10
On 11/03/2021 12:34, Jason Gunthorpe wrote:
> On Thu, Mar 11, 2021 at 12:20:33PM +1100, Alexey Kardashevskiy wrote:
> 
>>> It is supposed to match exactly the same match table as the pci_driver
>>> above. We *don't* want different behavior from what the standrd PCI
>>> driver matcher will do.
>>
>> This is not a standard PCI driver though
> 
> It is now, that is what this patch makes it into. This is why it now
> has a struct pci_driver.
> 
>> and the main vfio-pci won't have a
>> list to match ever.
> 
> ?? vfio-pci uses driver_override or new_id to manage its match list


Exactly, no list to update.


>> IBM NPU PCI id is unlikely to change ever but NVIDIA keeps making
>> new devices which work in those P9 boxes, are you going to keep
>> adding those ids to nvlink2gpu_vfio_pci_table?
> 
> Certainly, as needed. PCI list updates is normal for the kernel.
> 
>> btw can the id list have only vendor ids and not have device ids?
> 
> The PCI matcher is quite flexable, see the other patch from Max for
> the igd


Ah, cool - do this for NVIDIA GPUs then please; I just discovered another 
P9 system sold with NVIDIA T4s which is not in your list.


> But best practice is to be as narrow as possible as I hope this will
> eventually impact module autoloading and other details.

The amount of device specific knowledge is too little to tie it to 
device ids; it is a generic PCI driver with quirks. We do not have 
separate drivers for hardware which requires quirks.

And how do you hope this should impact autoloading?
Jason Gunthorpe March 11, 2021, 2 a.m. UTC | #11
On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote:
> > > btw can the id list have only vendor ids and not have device ids?
> > 
> > The PCI matcher is quite flexable, see the other patch from Max for
> > the igd
>  
> ah cool, do this for NVIDIA GPUs then please, I just discovered another P9
> system sold with NVIDIA T4s which is not in your list.

I think it will make things easier down the road if you maintain an
exact list <shrug>

> > But best practice is to be as narrow as possible as I hope this will
> > eventually impact module autoloading and other details.
> 
> The amount of device specific knowledge is too little to tie it up to device
> ids, it is a generic PCI driver with quirks. We do not have a separate
> drivers for the hardware which requires quirks.

It provides its own capability structure exposed to userspace; that is
absolutely not a "quirk".

> And how do you hope this should impact autoloading?

I would like to autoload the most specific vfio driver for the target
hardware.

If you someday need to support new GPU HW that needs a different VFIO
driver then you are really stuck, because things become indeterminate
if there are two drivers claiming the same ID. We don't have the concept
of "best match"; the driver core works on exact match.

Jason
Alexey Kardashevskiy March 11, 2021, 7:54 a.m. UTC | #12
On 11/03/2021 13:00, Jason Gunthorpe wrote:
> On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote:
>>>> btw can the id list have only vendor ids and not have device ids?
>>>
>>> The PCI matcher is quite flexable, see the other patch from Max for
>>> the igd
>>   
>> ah cool, do this for NVIDIA GPUs then please, I just discovered another P9
>> system sold with NVIDIA T4s which is not in your list.
> 
> I think it will make things easier down the road if you maintain an
> exact list <shrug>


Then why don't you do the exact list for Intel IGD? The commit log does 
not explain this detail.


>>> But best practice is to be as narrow as possible as I hope this will
>>> eventually impact module autoloading and other details.
>>
>> The amount of device specific knowledge is too little to tie it up to device
>> ids, it is a generic PCI driver with quirks. We do not have separate
>> drivers for the hardware which requires quirks.
> 
> It provides its own capability structure exposed to userspace, that is
> absolutely not a "quirk"
> 
>> And how do you hope this should impact autoloading?
> 
> I would like to autoload the most specific vfio driver for the target
> hardware.


Is there an idea of how it is going to work? For example, the Intel IGD 
driver and vfio-pci-igd - how should the system pick one? If there is no 
MODULE_DEVICE_TABLE in vfio-pci-xxx, is the user supposed to try binding 
all vfio-pci-xxx drivers until one binds?


> If you someday need to support new GPU HW that needs a different VFIO
> driver then you are really stuck because things become indeterminate
> if there are two devices claiming the ID. We don't have the concept of
> "best match", driver core works on exact match.
Max Gurtovoy March 11, 2021, 9:44 a.m. UTC | #13
On 3/11/2021 9:54 AM, Alexey Kardashevskiy wrote:
>
>
> On 11/03/2021 13:00, Jason Gunthorpe wrote:
>> On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote:
>>>>> btw can the id list have only vendor ids and not have device ids?
>>>>
>>>> The PCI matcher is quite flexible, see the other patch from Max for
>>>> the igd
>>>   ah cool, do this for NVIDIA GPUs then please, I just discovered 
>>> another P9
>>> system sold with NVIDIA T4s which is not in your list.
>>
>> I think it will make things easier down the road if you maintain an
>> exact list <shrug>
>
>
> Then why do not you do the exact list for Intel IGD? The commit log 
> does not explain this detail.

I expect the Intel team to review this series and give a more precise list.

I did the best I could in finding a proper configuration for igd.


>
>
>>>> But best practice is to be as narrow as possible as I hope this will
>>>> eventually impact module autoloading and other details.
>>>
>>> The amount of device specific knowledge is too little to tie it up 
>>> to device
>>> ids, it is a generic PCI driver with quirks. We do not have separate
>>> drivers for the hardware which requires quirks.
>>
>> It provides its own capability structure exposed to userspace, that is
>> absolutely not a "quirk"
>>
>>> And how do you hope this should impact autoloading?
>>
>> I would like to autoload the most specific vfio driver for the target
>> hardware.
>
>
> Is there an idea how it is going to work? For example, the Intel IGD 
> driver and vfio-pci-igd - how should the system pick one? If there is 
> no MODULE_DEVICE_TABLE in vfio-pci-xxx, is the user supposed to try 
> binding all vfio-pci-xxx drivers until some binds?

For example, in my local setup I did a POC patch that converts some 
drivers to be "manual binding only drivers".

So the IGD driver will have priority; the user will unbind the device 
from it, load igd-vfio-pci, and bind the device to it, which ends with probing.
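
Roughly, the manual flow looks like this (the device address and driver
name here are just an example):

    # release the device from the native driver and bind the vfio variant
    echo 0000:00:02.0 > /sys/bus/pci/devices/0000:00:02.0/driver/unbind
    modprobe igd-vfio-pci
    echo igd-vfio-pci > /sys/bus/pci/devices/0000:00:02.0/driver_override
    echo 0000:00:02.0 > /sys/bus/pci/drivers_probe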

For now we separated the driver core stuff until we all agree that this 
series is the right way to go + we also make sure it's backward compatible.

>
>
>> If you someday need to support new GPU HW that needs a different VFIO
>> driver then you are really stuck because things become indeterminate
>> if there are two devices claiming the ID. We don't have the concept of
>> "best match", driver core works on exact match.
>
>
>
Jason Gunthorpe March 11, 2021, 4:51 p.m. UTC | #14
On Thu, Mar 11, 2021 at 11:44:38AM +0200, Max Gurtovoy wrote:
> 
> On 3/11/2021 9:54 AM, Alexey Kardashevskiy wrote:
> > 
> > 
> > On 11/03/2021 13:00, Jason Gunthorpe wrote:
> > > On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote:
> > > > > > btw can the id list have only vendor ids and not have device ids?
> > > > > 
> > > > > The PCI matcher is quite flexible, see the other patch from Max for
> > > > > the igd
> > > >   ah cool, do this for NVIDIA GPUs then please, I just
> > > > discovered another P9
> > > > system sold with NVIDIA T4s which is not in your list.
> > > 
> > > I think it will make things easier down the road if you maintain an
> > > exact list <shrug>
> > 
> > 
> > Then why do not you do the exact list for Intel IGD? The commit log does
> > not explain this detail.
> 
> I expect Intel team to review this series and give a more precise list.
> 
> I did the best I could in finding a proper configuration for igd.

Right. Doing this retroactively is really hard.

Jason
Jason Gunthorpe March 11, 2021, 5:01 p.m. UTC | #15
On Thu, Mar 11, 2021 at 06:54:09PM +1100, Alexey Kardashevskiy wrote:

> Is there an idea how it is going to work? For example, the Intel IGD driver
> and vfio-pci-igd - how should the system pick one? If there is no
> MODULE_DEVICE_TABLE in vfio-pci-xxx, is the user supposed to try binding all
> vfio-pci-xxx drivers until some binds?

We must expose some MODULE_DEVICE_TABLE-like thing to userspace.

Compiling everything into one driver and using if statements was only
manageable with these tiny drivers - the stuff that is coming is made up
of big things that are infeasible to link directly into vfio_pci.ko.

I'm feeling some general consensus around this approach (vs trying to
make a subdriver) so we will start looking at exactly what form that
could take soon.

The general idea would be to have a selection of extended VFIO drivers
for PCI devices that can be loaded as an alternative to vfio-pci and
they provide additional uapi and behaviors that only work on specific
hardware. nvlink is a good example because it does provide new API and
additional HW specific behavior.

We also need a way for userspace to learn about the drivers automatically
and sort out how to load and bind them.

I was thinking about your earlier question about FDT - do you think we
could switch this to a platform_device and provide an of_match_table
that would select correctly? Did IBM enforce a useful compatible
string in the DT for these things?

Jason
Alex Williamson March 19, 2021, 3:23 p.m. UTC | #16
On Wed, 10 Mar 2021 14:57:57 +0200
Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
> On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote:
> > On 09/03/2021 19:33, Max Gurtovoy wrote:  
> >> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
> >> +    { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-16GB */
> >> +    { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-32GB */
> >> +    { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla V100-SXM3-32GB */
> >> +    { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla V100-SXM2-16GB */
> >
> >
> > Where is this list from?
> >
> > Also, how is this supposed to work at the boot time? Will the kernel 
> > try binding let's say this one and nouveau? Which one is going to win?  
> 
> At boot time nouveau driver will win since the vfio drivers don't 
> declare MODULE_DEVICE_TABLE
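
For reference, declaring it would be a one-liner like the following, and
that declaration is exactly what generates the modalias entries kmod uses
for autoloading:

    /* modpost would turn this into modalias strings such as
     * "pci:v000010DEd00001DB1sv*sd*bc*sc*i*" that udev/kmod use to
     * pick and load the module automatically */
    MODULE_DEVICE_TABLE(pci, nvlink2gpu_vfio_pci_table);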

This still seems troublesome, AIUI the MODULE_DEVICE_TABLE is
responsible for creating aliases so that kmod can figure out which
modules to load, but what happens if all these vfio-pci modules are
built into the kernel or the modules are already loaded?

In the former case, I think it boils down to link order while the
latter is generally considered even less deterministic since it depends
on module load order.  So if one of these vfio modules should get
loaded before the native driver, I think devices could bind here first.

Are there tricks/extensions we could use in driver overrides, for
example maybe a compatibility alias such that one of these vfio-pci
variants could match "vfio-pci"?  Perhaps that, along with some sort of
priority scheme to probe variants ahead of the base driver, though I'm
not sure how we'd get these variants loaded without something like
module aliases.  I know we're trying to avoid creating another level of
driver matching, but that's essentially what we have in the compat
option enabled here, and I'm not sure I see how userspace makes the
leap to understand what driver to use for a given device.  Thanks,

Alex
Jason Gunthorpe March 19, 2021, 4:17 p.m. UTC | #17
On Fri, Mar 19, 2021 at 09:23:41AM -0600, Alex Williamson wrote:
> On Wed, 10 Mar 2021 14:57:57 +0200
> Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
> > On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote:
> > > On 09/03/2021 19:33, Max Gurtovoy wrote:  
> > >> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
> > >> +    { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-16GB */
> > >> +    { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-32GB */
> > >> +    { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla V100-SXM3-32GB */
> > >> +    { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla V100-SXM2-16GB */
> > >
> > >
> > > Where is this list from?
> > >
> > > Also, how is this supposed to work at the boot time? Will the kernel 
> > > try binding let's say this one and nouveau? Which one is going to win?  
> > 
> > At boot time nouveau driver will win since the vfio drivers don't 
> > declare MODULE_DEVICE_TABLE
> 
> This still seems troublesome, AIUI the MODULE_DEVICE_TABLE is
> responsible for creating aliases so that kmod can figure out which
> modules to load, but what happens if all these vfio-pci modules are
> built into the kernel or the modules are already loaded?

I think we talked about this.. We still need a better way to control
binding of VFIO modules - now that we have device-specific modules we
must have these match tables to control what devices they connect
to.

Previously things used the binding of vfio_pci as the "switch" and
hardcoded all the matches inside it.

I'm still keen to try the "driver flavour" idea I outlined earlier,
but it is hard to say what will resonate with Greg.

> In the former case, I think it boils down to link order while the
> latter is generally considered even less deterministic since it depends
> on module load order.  So if one of these vfio modules should get
> loaded before the native driver, I think devices could bind here first.

At this point the answer is "don't link these statically" - we could
have a Kconfig option to prevent it.

> Are there tricks/extensions we could use in driver overrides, for
> example maybe a compatibility alias such that one of these vfio-pci
> variants could match "vfio-pci"?

driver_override is not really useful as soon as you have a match table,
as its operation is to defeat the match table entirely. :(

Again, this is still more of an outline of how things will look, as we
must get through this before we can attempt to do something in the driver
core with Greg.

We could revise this series to not register drivers at all and keep
the uAPI view exactly as is today. This would allow enough code to
show Greg how some driver flavour thing would work.

If something can't be done in the driver core, I'd propose to keep the
same basic outline Max has here, but make registering the "compat"
dynamic - it is basically a sub-driver design at that point and we
give up on module autoloading.

Jason
Christoph Hellwig March 19, 2021, 4:20 p.m. UTC | #18
On Fri, Mar 19, 2021 at 01:17:22PM -0300, Jason Gunthorpe wrote:
> I think we talked about this.. We still need a better way to control
> binding of VFIO modules - now that we have device-specific modules we
> must have these match tables to control what devices they connect
> to.
> 
> Previously things used the binding of vfio_pci as the "switch" and
> hardcoded all the matches inside it.
> 
> I'm still keen to try the "driver flavour" idea I outlined earlier,
> but it is hard to say what will resonate with Greg.

IMHO the only model that really works and makes sense is to turn the
whole model around and make vfio a library called by the actual driver
for the device.  That is, any device that needs device-specific
functionality simply needs a proper in-kernel driver, which then can be
switched to a vfio mode where all the normal subsystems are unbound
from the device, and VFIO functionality is bound to it, all while _the_
driver that controls the PCI ID is still in charge of it.

vfio_pci remains a separate driver not binding to any ID by default
and not having any device specific functionality.
Jason Gunthorpe March 19, 2021, 4:28 p.m. UTC | #19
On Fri, Mar 19, 2021 at 05:20:33PM +0100, Christoph Hellwig wrote:
> On Fri, Mar 19, 2021 at 01:17:22PM -0300, Jason Gunthorpe wrote:
> > I think we talked about this.. We still need a better way to control
> > binding of VFIO modules - now that we have device-specific modules we
> > must have these match tables to control what devices they connect
> > to.
> > 
> > Previously things used the binding of vfio_pci as the "switch" and
> > hardcoded all the matches inside it.
> > 
> > I'm still keen to try the "driver flavour" idea I outlined earlier,
> > but it is hard to say what will resonate with Greg.
> 
> IMHO the only model that really works and makes sense is to turn the
> whole model around and make vfio a library called by the actual driver
> > for the device.  That is, any device that needs device-specific
> > functionality simply needs a proper in-kernel driver, which then can be
> > switched to a vfio mode where all the normal subsystems are unbound
> > from the device, and VFIO functionality is bound to it, all while _the_
> driver that controls the PCI ID is still in charge of it.

Yes, this is what I want to strive for with Greg.

It would also resolve a lot of the uncomfortable code I see in VFIO
using the driver core. For instance, when a device is moved to 'vfio
mode' it can go through and *lock* the entire group of devices to
'vfio mode' or completely fail.

This would replace all the protective code that is all about ensuring
the admin doesn't improperly mix & match in-kernel and vfio drivers
within a security domain.

The wrinkle I don't yet have an easy answer to is how to load vfio_pci
as a universal "default" within the driver core lazy bind scheme and
still have working module autoloading... I'm hoping to get some
research into this..

Jason
Christoph Hellwig March 19, 2021, 4:34 p.m. UTC | #20
On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote:
> The wrinkle I don't yet have an easy answer to is how to load vfio_pci
> as a universal "default" within the driver core lazy bind scheme and
> still have working module autoloading... I'm hoping to get some
> research into this..

Should we even load it by default?  One answer would be that the sysfs
file to switch to vfio mode goes into the core PCI layer, and that core
PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci
for that device.
Alex Williamson March 19, 2021, 5:36 p.m. UTC | #21
On Fri, 19 Mar 2021 17:34:49 +0100
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote:
> > The wrinkle I don't yet have an easy answer to is how to load vfio_pci
> > as a universal "default" within the driver core lazy bind scheme and
> > still have working module autoloading... I'm hoping to get some
> > research into this..  

What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base
driver, which would load all the known variants in order to influence
the match, and therefore probe ordering?
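
Something along these lines in the base driver, where the exact variant
module names are placeholders:

    /* sketch: ask modprobe to pull in known variants before vfio-pci binds */
    MODULE_SOFTDEP("pre: npu2_vfio_pci nvlink2gpu_vfio_pci igd_vfio_pci");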

If we coupled that with wildcard support in driver_override, ex.
"vfio_pci*", and used consistent module naming, I think we'd only need
to teach userspace about this wildcard and binding to a specific module
would come for free.  This assumes we drop the per-variant id_table and
use the probe function to skip devices without the necessary
requirements, either wrong device or missing the tables we expect to
expose.

> Should we even load it by default?  One answer would be that the sysfs
> file to switch to vfio mode goes into the core PCI layer, and that core
> PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci
> for that device.

Generally we don't want to be the default driver for anything (I think
mdev devices are the exception).  Assignment to userspace or VM is a
niche use case.  Thanks,

Alex
Jason Gunthorpe March 19, 2021, 8:07 p.m. UTC | #22
On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote:
> On Fri, 19 Mar 2021 17:34:49 +0100
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote:
> > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci
> > > as a universal "default" within the driver core lazy bind scheme and
> > > still have working module autoloading... I'm hoping to get some
> > > research into this..  
> 
> What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base
> driver, which would load all the known variants in order to influence
> the match, and therefore probe ordering?

The way the driver core works is to first match against the already
loaded driver list, then trigger an event for module loading and when
new drivers are registered they bind to unbound devices.

So, the trouble is the event through userspace because the kernel
can't just go on to use vfio_pci until it knows userspace has failed
to satisfy the load request.

One answer is to have userspace udev have the "hook" here and when a
vfio flavour mod alias is requested on a PCI device it swaps in
vfio_pci if it can't find an alternative.

The dream would be a system with no vfio modules loaded could do some

 echo "vfio" > /sys/bus/pci/xxx/driver_flavour

And a module would be loaded and a struct vfio_device is created for
that device. Very easy for the user.

> If we coupled that with wildcard support in driver_override, ex.
> "vfio_pci*", and used consistent module naming, I think we'd only need
> to teach userspace about this wildcard and binding to a specific module
> would come for free.

What would the wildcard do?

> This assumes we drop the per-variant id_table and use the probe
> function to skip devices without the necessary requirements, either
> wrong device or missing the tables we expect to expose.

Without a module table how do we know which driver is which? 

Open coding a match table in probe() and returning failure feels hacky
to me.

> > Should we even load it by default?  One answer would be that the sysfs
> > file to switch to vfio mode goes into the core PCI layer, and that core
> > PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci
> > for that device.
> 
> Generally we don't want to be the default driver for anything (I think
> mdev devices are the exception).  Assignment to userspace or VM is a
> niche use case.  Thanks,

By "default" I mean if the user says device A is in "vfio" mode then
the kernel should
 - Search for a specific driver for this device and autoload it
 - If no specific driver is found then attach a default "universal"
   driver for it. vfio_pci is a universal driver.

vfio_platform is also a "universal" driver when in ACPI mode, in some
cases.

For the OF cases vfio_platform builds its own little subsystem complete
with autoloading:

                request_module("vfio-reset:%s", vdev->compat);
                vdev->of_reset = vfio_platform_lookup_reset(vdev->compat,
                                                        &vdev->reset_module);

And it is a good example of why I don't like this subsystem design
because vfio_platform doesn't do the driver loading for OF entirely
right, vdev->compat is a single string derived from the compatible
property:

        ret = device_property_read_string(dev, "compatible",
                                          &vdev->compat);
        if (ret)
                dev_err(dev, "Cannot retrieve compat for %s\n", vdev->name);

Unfortunately OF requires that compatible is a *list* of strings and a
correct driver is supposed to evaluate all of them. The driver core
does this all correctly, and this was lost when it was open coded
here.
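
A rough sketch of what evaluating the whole list would look like here,
where try_reset_lookup() stands in for the existing per-compat lookup:

        const char *compats[8];
        int i, n;

        /* read every "compatible" entry instead of only the first one */
        n = device_property_read_string_array(dev, "compatible", compats,
                                               ARRAY_SIZE(compats));
        for (i = 0; i < n; i++)
                if (try_reset_lookup(vdev, compats[i]))
                        break;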

We should NOT be avoiding the standard infrastructure for matching
drivers to devices by re-implementing it poorly.

Jason
Alex Williamson March 19, 2021, 9:08 p.m. UTC | #23
On Fri, 19 Mar 2021 17:07:49 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote:
> > On Fri, 19 Mar 2021 17:34:49 +0100
> > Christoph Hellwig <hch@lst.de> wrote:
> >   
> > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote:  
> > > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci
> > > > as a universal "default" within the driver core lazy bind scheme and
> > > > still have working module autoloading... I'm hoping to get some
> > > > research into this..    
> > 
> > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base
> > driver, which would load all the known variants in order to influence
> > the match, and therefore probe ordering?  
> 
> The way the driver core works is to first match against the already
> loaded driver list, then trigger an event for module loading and when
> new drivers are registered they bind to unbound devices.

The former is based on id_tables, the latter on MODULE_DEVICE_TABLE; we
don't have either of those.  As noted to Christoph, the cases where we
want a vfio driver to bind to anything automatically are the exception.
 
> So, the trouble is the event through userspace because the kernel
> can't just go on to use vfio_pci until it knows userspace has failed
> to satisfy the load request.

Given that we don't use MODULE_DEVICE_TABLE, vfio-pci doesn't autoload.
AFAIK, all tools like libvirt and driverctl that typically bind devices
to vfio-pci will manually load vfio-pci.  I think we can take advantage
of that.

> One answer is to have userspace udev have the "hook" here and when a
> vfio flavour mod alias is requested on a PCI device it swaps in
> vfio_pci if it can't find an alternative.
> 
> The dream would be a system with no vfio modules loaded could do some
> 
>  echo "vfio" > /sys/bus/pci/xxx/driver_flavour
> 
> And a module would be loaded and a struct vfio_device is created for
> that device. Very easy for the user.

This is like switching a device to a parallel universe where we do
want vfio drivers to bind automatically to devices.
 
> > If we coupled that with wildcard support in driver_override, ex.
> > "vfio_pci*", and used consistent module naming, I think we'd only need
> > to teach userspace about this wildcard and binding to a specific module
> > would come for free.  
> 
> What would the wildcard do?

It allows a driver_override to match more than one driver, not too
dissimilar to your driver_flavor above.  In this case it would match
all driver names starting with "vfio_pci".  For example if we had:

softdep vfio-pci pre: vfio-pci-foo vfio-pci-bar

Then we'd pre-seed the condition that drivers foo and bar precede the
base vfio-pci driver, each will match the device to the driver and have
an opportunity in their probe function to either claim or skip the
device.  Userspace could also set and exact driver_override, for
example if they want to force using the base vfio-pci driver or go
directly to a specific variant.
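
The wildcard itself could be as small as making the driver_override
comparison in pci_match_device() glob-aware instead of an exact strcmp();
a sketch of the idea, not actual kernel code:

    /* treat the override as a pattern, so "vfio_pci*" matches any variant */
    if (pdev->driver_override &&
        !glob_match(pdev->driver_override, drv->name))
            return NULL;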
 
> > This assumes we drop the per-variant id_table and use the probe
> > function to skip devices without the necessary requirements, either
> > wrong device or missing the tables we expect to expose.  
> 
> Without a module table how do we know which driver is which? 
> 
> Open coding a match table in probe() and returning failure feels hacky
> to me.

How's it any different than Max's get_foo_vfio_pci_driver() that calls
pci_match_id() with an internal match table?  It seems a better fit for
the existing use cases, for example the IGD variant can use a single
line table to exclude all except Intel VGA class devices in its probe
callback, then test availability of the extra regions we'd expose,
otherwise return -ENODEV.  The NVLink variant can use pci_match_id() in
the probe callback to filter out anything other than NVIDIA VGA or 3D
accelerator class devices, then check for associated FDT table, or
return -ENODEV.  We already use the vfio_pci probe function to exclude
devices in the deny-list and non-endpoint devices.  Many drivers
clearly place implicit trust in their id_table, others don't.  In the
case of meta drivers, I think it's fair to make use of the latter
approach.
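
To make that concrete, the probe-side filtering I'm describing is roughly
the following; the names and the class mask are illustrative:

    static const struct pci_device_id igd_filter[] = {
            /* Intel VGA class devices only */
            { .vendor = PCI_VENDOR_ID_INTEL, .device = PCI_ANY_ID,
              .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID,
              .class = PCI_CLASS_DISPLAY_VGA << 8, .class_mask = 0xffff00 },
            { 0, }
    };

    static int igd_vfio_pci_probe(struct pci_dev *pdev,
                                  const struct pci_device_id *id)
    {
            if (!pci_match_id(igd_filter, pdev))
                    return -ENODEV; /* let the next driver try its luck */
            /* then test for the OpRegion/firmware table, else -ENODEV */
            return 0;
    }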

> > > Should we even load it by default?  One answer would be that the sysfs
> > > file to switch to vfio mode goes into the core PCI layer, and that core
> > > PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci
> > > for that device.  
> > 
> > Generally we don't want to be the default driver for anything (I think
> > mdev devices are the exception).  Assignment to userspace or VM is a
> > niche use case.  Thanks,  
> 
> By "default" I mean if the user says device A is in "vfio" mode then
> the kernel should
>  - Search for a specific driver for this device and autoload it
>  - If no specific driver is found then attach a default "universal"
>    driver for it. vfio_pci is a universal driver.
> 
> vfio_platform is also a "universal" driver when in ACPI mode, in some
> cases.
> 
> For OF cases platform it builts its own little subsystem complete with
> autoloading:
> 
>                 request_module("vfio-reset:%s", vdev->compat);
>                 vdev->of_reset = vfio_platform_lookup_reset(vdev->compat,
>                                                         &vdev->reset_module);
> 
> And it is a good example of why I don't like this subsystem design
> because vfio_platform doesn't do the driver loading for OF entirely
> right, vdev->compat is a single string derived from the compatible
> property:
> 
>         ret = device_property_read_string(dev, "compatible",
>                                           &vdev->compat);
>         if (ret)
>                 dev_err(dev, "Cannot retrieve compat for %s\n", vdev->name);
> 
> Unfortunately OF requires that compatible is a *list* of strings and a
> correct driver is supposed to evaluate all of them. The driver core
> does this all correctly, and this was lost when it was open coded
> here.
> 
> We should NOT be avoiding the standard infrastructure for matching
> drivers to devices by re-implementing it poorly.

I take some blame for the request_module() behavior of vfio-platform,
but I think we're on the same page that we don't want to turn vfio-pci
into a nexus for loading variant drivers.  Whatever solution we use for
vfio-pci might translate to replacing that vfio-platform behavior.  As
above, I think it's possible to create that alternate universe of
driver matching with a simple wildcard and load ordering approach,
performing the more specific filtering in the probe callback with fall
through to the next matching driver.  Thanks,

Alex
Jason Gunthorpe March 19, 2021, 10:59 p.m. UTC | #24
On Fri, Mar 19, 2021 at 03:08:09PM -0600, Alex Williamson wrote:
> On Fri, 19 Mar 2021 17:07:49 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote:
> > > On Fri, 19 Mar 2021 17:34:49 +0100
> > > Christoph Hellwig <hch@lst.de> wrote:
> > >   
> > > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote:  
> > > > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci
> > > > > as a universal "default" within the driver core lazy bind scheme and
> > > > > still have working module autoloading... I'm hoping to get some
> > > > > research into this..    
> > > 
> > > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base
> > > driver, which would load all the known variants in order to influence
> > > the match, and therefore probe ordering?  
> > 
> > The way the driver core works is to first match against the already
> > loaded driver list, then trigger an event for module loading and when
> > new drivers are registered they bind to unbound devices.
> 
> The former is based on id_tables, the latter on MODULE_DEVICE_TABLE, we
> don't have either of those.

Well, today we don't, but Max here adds id_tables to the special
devices and a MODULE_DEVICE_TABLE would come too if we do the flavours
thing below.

My starting position is that everything should have these tables and
they should work properly.

> As noted to Christoph, the cases where we want a vfio driver to
> bind to anything automatically is the exception.

I agree vfio should not automatically claim devices, but once vfio is
told to claim a device everything thereafter should be
automatic.

> > One answer is to have userspace udev have the "hook" here and when a
> > vfio flavour mod alias is requested on a PCI device it swaps in
> > vfio_pci if it can't find an alternative.
> > 
> > The dream would be a system with no vfio modules loaded could do some
> > 
> >  echo "vfio" > /sys/bus/pci/xxx/driver_flavour
> > 
> > And a module would be loaded and a struct vfio_device is created for
> > that device. Very easy for the user.
> 
> This is like switching a device to a parallel universe where we do
> want vfio drivers to bind automatically to devices.

Yes.

If we do this I'd probably suggest that driver_override be bumped down
to some user compat and 'vfio > driver_override' would just set the
flavour.

As-is driver_override seems dangerous as overriding the matching table
could surely allow root userspace to crash the machine. In situations
with trusted boot/signed modules this shouldn't be possible.

> > > If we coupled that with wildcard support in driver_override, ex.
> > > "vfio_pci*", and used consistent module naming, I think we'd only need
> > > to teach userspace about this wildcard and binding to a specific module
> > > would come for free.  
> > 
> > What would the wildcard do?
> 
> It allows a driver_override to match more than one driver, not too
> dissimilar to your driver_flavor above.  In this case it would match
> all driver names starting with "vfio_pci".  For example if we had:
> 
> softdep vfio-pci pre: vfio-pci-foo vfio-pci-bar
>
> Then we'd pre-seed the condition that drivers foo and bar precede the
> base vfio-pci driver, each will match the device to the driver and have
> an opportunity in their probe function to either claim or skip the
> device.  Userspace could also set and exact driver_override, for
> example if they want to force using the base vfio-pci driver or go
> directly to a specific variant.

Okay, I see. The problem is that this makes 'vfio-pci' monolithic, in
normal situations it will load *everything*.

While that might not seem too bad with these simple drivers, at least
the mlx5 migration driver will have a large dependency tree and pull
in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko
and a bunch of other stuff in its orbit.

This is why I want to try for fine grained autoloading first. It
really is the elegant solution if we can work it out.

> > Open coding a match table in probe() and returning failure feels hacky
> > to me.
> 
> How's it any different than Max's get_foo_vfio_pci_driver() that calls
> pci_match_id() with an internal match table?  

Well, I think that is hacky too - but it is hacky only to service user
space compatibility, so let's put that aside.

> It seems a better fit for the existing use cases, for example the
> IGD variant can use a single line table to exclude all except Intel
> VGA class devices in its probe callback, then test availability of
> the extra regions we'd expose, otherwise return -ENODEV.

I don't think we should over-focus on these two firmware-triggered
examples. I looked at the Intel GPU driver and it already only reads
the firmware thing for certain PCI IDs, so we can absolutely generate a
narrow match table for it. The same is true for the NVIDIA GPU.

The fact this is hard or whatever is beside the point - future drivers
in this scheme should have exact match tables. 

The mlx5 sample is a good example, as it matches a very narrow NVMe
device that is properly labeled with a subvendor ID. It does not match
every NVMe device and then run code to figure it out. I think this is
the right thing to do as it is the only thing that would give us fine
grained module loading.

Even so, I'm not *so* worried about "over matching" - if IGD or the
nvidia stuff load on a wide set of devices then they can just not
enable their extended stuff. It wastes some kernel memory, but it is
OK.

And if some driver *really* gets stuck here the true answer is to
improve the driver core match capability.

> devices in the deny-list and non-endpoint devices.  Many drivers
> clearly place implicit trust in their id_table, others don't.  In the
> case of meta drivers, I think it's fair to make use of the latter
> approach.

Well, AFAIK, the driver core doesn't have a 'try probe, if it fails
then try another driver' approach. One device, one driver. Am I
missing something?

I would prefer not to propose to Greg such a radical change to how
driver loading works..

I also think the softdep/implicit loading/ordering will not be
welcomed, it feels weird to me.

> > We should NOT be avoiding the standard infrastructure for matching
> > drivers to devices by re-implementing it poorly.
> 
> I take some blame for the request_module() behavior of vfio-platform,
> but I think we're on the same page that we don't want to turn
> vfio-pci

Okay, that's good, we can explore the driver core side and see what
could work to decide if we can do fine-grained loading or not.

The question now is how to stage all this work. It is too big to do
all in one shot - can we reform Max's series into something mergeable
without the driver core part? For instance, removing the
pci_device_driver and dedicated modules and only doing the compat
path? It would look the same from userspace, but the internals would
be split into a library mode.

The patch to add in the device driver would then be small enough to go
along with future driver core changes, and if it fails we still have
several fallback plans to use the librarized version.

> into a nexus for loading variant drivers.  Whatever solution we use for
> vfio-pci might translate to replacing that vfio-platform behavior.  

Yes, I'd want to see this too. It shows it is a general enough
idea. Greg has been big on asking for lots of users for driver core
changes, so it is good we have more things.

Thanks,
Jason
Alex Williamson March 20, 2021, 4:40 a.m. UTC | #25
On Fri, 19 Mar 2021 19:59:43 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Mar 19, 2021 at 03:08:09PM -0600, Alex Williamson wrote:
> > On Fri, 19 Mar 2021 17:07:49 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote:  
> > > > On Fri, 19 Mar 2021 17:34:49 +0100
> > > > Christoph Hellwig <hch@lst.de> wrote:
> > > >     
> > > > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote:    
> > > > > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci
> > > > > > as a universal "default" within the driver core lazy bind scheme and
> > > > > > still have working module autoloading... I'm hoping to get some
> > > > > > research into this..      
> > > > 
> > > > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base
> > > > driver, which would load all the known variants in order to influence
> > > > the match, and therefore probe ordering?    
> > > 
> > > The way the driver core works is to first match against the already
> > > loaded driver list, then trigger an event for module loading and when
> > > new drivers are registered they bind to unbound devices.  
> > 
> > The former is based on id_tables, the latter on MODULE_DEVICE_TABLE, we
> > don't have either of those.  
> 
> Well, today we don't, but Max here adds id_table's to the special
> devices and a MODULE_DEVICE_TABLE would come too if we do the flavours
> thing below.

I think the id_tables are the wrong approach for IGD and NVLink
variants.
 
> My starting thinking is that everything should have these tables and
> they should work properly..

id_tables require ongoing maintenance whereas the existing variants
require only vendor + device class and some platform feature, like a
firmware or fdt table.  They're meant to only add extra regions to
vfio-pci base support, not extensively modify the device interface.
 
> > As noted to Christoph, the cases where we want a vfio driver to
> > bind to anything automatically is the exception.  
> 
> I agree vfio should not automatically claim devices, but once vfio is
> told to claim a device everything from there after should be
> automatic.
> 
> > > One answer is to have userspace udev have the "hook" here and when a
> > > vfio flavour mod alias is requested on a PCI device it swaps in
> > > vfio_pci if it can't find an alternative.
> > > 
> > > The dream would be a system with no vfio modules loaded could do some
> > > 
> > >  echo "vfio" > /sys/bus/pci/xxx/driver_flavour
> > > 
> > > And a module would be loaded and a struct vfio_device is created for
> > > that device. Very easy for the user.  
> > 
> > This is like switching a device to a parallel universe where we do
> > want vfio drivers to bind automatically to devices.  
> 
> Yes.
> 
> If we do this I'd probably suggest that driver_override be bumped down
> to some user compat and 'vfio > driver_override' would just set the
> flavour.
> 
> As-is driver_override seems dangerous as overriding the matching table
> could surely allow root userspace to crash the machine. In situations
> with trusted boot/signed modules this shouldn't be.

When we're dealing with meta-drivers that can bind to anything, we
shouldn't rely on the match, but should instead verify the driver is
appropriate in the probe callback.  Even without driver_override,
there's the new_id mechanism.  Either method allows the root user to
break driver binding.  Greg has previously stated something to the
effect that users get to keep all the pieces when they break something
by manipulating driver binding.

> > > > If we coupled that with wildcard support in driver_override, ex.
> > > > "vfio_pci*", and used consistent module naming, I think we'd only need
> > > > to teach userspace about this wildcard and binding to a specific module
> > > > would come for free.    
> > > 
> > > What would the wildcard do?  
> > 
> > It allows a driver_override to match more than one driver, not too
> > dissimilar to your driver_flavor above.  In this case it would match
> > all driver names starting with "vfio_pci".  For example if we had:
> > 
> > softdep vfio-pci pre: vfio-pci-foo vfio-pci-bar
> >
> > Then we'd pre-seed the condition that drivers foo and bar precede the
> > base vfio-pci driver, each will match the device to the driver and have
> > an opportunity in their probe function to either claim or skip the
> > device.  Userspace could also set and exact driver_override, for
> > example if they want to force using the base vfio-pci driver or go
> > directly to a specific variant.  
> 
> Okay, I see. The problem is that this makes 'vfio-pci' monolithic, in
> normal situations it will load *everything*.
> 
> While that might not seem too bad with these simple drivers, at least
> the mlx5 migration driver will have a large dependency tree and pull
> in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko
> and a bunch of other stuff in its orbit.

Luckily the mlx5 driver doesn't need to be covered by compatibility
support, so we don't need to set a softdep for it and the module could
be named such that a wildcard driver_override of vfio_pci* shouldn't
logically include that driver.  Users can manually create their own
modprobe.d softdep entry if they'd like to include it.  Otherwise
userspace would need to know to bind to it specifically.
 
> This is why I want to try for fine grained autoloading first. It
> really is the elegant solution if we can work it out.

I just don't see how we create a manageable change to userspace.

> > > Open coding a match table in probe() and returning failure feels hacky
> > > to me.  
> > 
> > How's it any different than Max's get_foo_vfio_pci_driver() that calls
> > pci_match_id() with an internal match table?    
> 
> Well, I think that is hacky too - but it is hacky only to service user
> space compatibility so let's put that aside

I don't see that dropping incompatible devices in the probe function
rather than the match via id_table is necessarily a hack.  I think
driver-core explicitly supports this (see below).

> > It seems a better fit for the existing use cases, for example the
> > IGD variant can use a single line table to exclude all except Intel
> > VGA class devices in its probe callback, then test availability of
> > the extra regions we'd expose, otherwise return -ENODEV.  
> 
> I don't think we should over-focus on these two firmware triggered
> examples. I looked at the Intel GPU driver and it already only reads
> the firmware thing for certain PCI ID's, we can absolutely generate a
> narrow match table for it. Same is true for the NVIDIA GPU.

I'm not sure we can make this assertion, both only care about the type
of device and existence of associated firmware tables.  No PCI IDs are
currently involved.

> The fact this is hard or whatever is beside the point - future drivers
> in this scheme should have exact match tables. 
> 
> The mlx5 sample is a good example, as it matches a very narrow NVMe
> device that is properly labeled with a subvendor ID. It does not match
> every NVMe device and then run code to figure it out. I think this is
> the right thing to do as it is the only thing that would give us fine
> grained module loading.

Sounds like the right thing to do for that device, if it's only designed
to run in this framework.  That's more like the mdev device model.

> Even so, I'm not *so* worried about "over matching" - if IGD or the
> nvidia stuff load on a wide set of devices then they can just not
> enable their extended stuff. It wastes some kernel memory, but it is
> OK.

I'd rather they bind to the base vfio-pci driver if their extended
features are not available.

> And if some driver *really* gets stuck here the true answer is to
> improve the driver core match capability.
> 
> > devices in the deny-list and non-endpoint devices.  Many drivers
> > clearly place implicit trust in their id_table, others don't.  In the
> > case of meta drivers, I think it's fair to make use of the latter
> > approach.  
> 
> Well, AFAIK, the driver core doesn't have a 'try probe, if it fails
> then try another driver' approach. One device, one driver. Am I
> missing something?

If the driver probe callback fails, really_probe() returns 0 with the
comment:

        /*
         * Ignore errors returned by ->probe so that the next driver can try
         * its luck.
         */
        ret = 0;

That allows bus_for_each_drv() to continue to iterate.

> 
> I would prefer not to propose to Greg such a radical change to how
> driver loading works..

Seems to be how it works already.
 
> I also think the softdep/implicit loading/ordering will not be
> welcomed, it feels weird to me.

AFAICT, it works within the existing driver-core, it's largely an
extension to pci-core driver_override support to enable wildcard
matching, ideally along with adding the same for all buses that support
driver_override.  Thanks,

Alex
Jason Gunthorpe March 21, 2021, 12:58 p.m. UTC | #26
On Fri, Mar 19, 2021 at 10:40:28PM -0600, Alex Williamson wrote:

> > Well, today we don't, but Max here adds id_table's to the special
> > devices and a MODULE_DEVICE_TABLE would come too if we do the flavours
> > thing below.
> 
> I think the id_tables are the wrong approach for IGD and NVLink
> variants.

I really disagree with this. Checking for some random bits in firmware
and assuming that every device made forever into the future works with
this check is not a good way to do compatibility. Christoph made the
same point.

We have good processes to maintain id tables, I don't see this as a
problem.

> > As-is driver_override seems dangerous as overriding the matching table
> > could surely allow root userspace to crash the machine. In situations
> > with trusted boot/signed modules this shouldn't be.
> 
> When we're dealing with meta-drivers that can bind to anything, we
> shouldn't rely on the match, but should instead verify the driver is
> appropriate in the probe callback.  Even without driver_override,
> there's the new_id mechanism.  Either method allows the root user to
> break driver binding.  Greg has previously stated something to the
> effect that users get to keep all the pieces when they break something
> by manipulating driver binding.

Yes, but that is a view where root is allowed to break the kernel; we
now have this optional other world where that is not allowed and root
access to lots of dangerous things is now disabled.

new_id and driver_override should probably be in that disable list
too..

> > While that might not seem too bad with these simple drivers, at least
> > the mlx5 migration driver will have a large dependency tree and pull
> > in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko
> > and a bunch of other stuff in its orbit.
> 
> Luckily the mlx5 driver doesn't need to be covered by compatibility
> support, so we don't need to set a softdep for it and the module could
> be named such that a wildcard driver_override of vfio_pci* shouldn't
> logically include that driver.  Users can manually create their own
> modprobe.d softdep entry if they'd like to include it.  Otherwise
> userspace would need to know to bind to it specifically.

But now you are giving up on the whole point, which was to
automatically load the correct specific module without special admin
involvement!

> > This is why I want to try for fine grained autoloading first. It
> > really is the elegant solution if we can work it out.
> 
> I just don't see how we create a manageable change to userspace.

I'm not sure I understand. Even if we add a new sysfs to set some
flavour then that is a pretty trivial change for userspace to move
from driver_override?

> > I don't think we should over-focus on these two firmware triggered
> > examples. I looked at the Intel GPU driver and it already only reads
> > the firmware thing for certain PCI ID's, we can absolutely generate a
> > narrow match table for it. Same is true for the NVIDIA GPU.
> 
> I'm not sure we can make this assertion, both only care about the type
> of device and existence of associated firmware tables.  

Well, I read through the Intel GPU driver and this is how I felt it
works. It doesn't even check the firmware bit unless certain PCI IDs
are matched first.

For NVIDIA GPU Max checked internally and we saw it looks very much
like how Intel GPU works. Only some PCI IDs trigger checking on the
feature the firmware thing is linked to.

My point is: the actual *drivers* consuming these firmware features do
*not* blindly match every PCI device and check for the firmware
bit. They all have narrow matches and further only try to use the
firmware thing for some subset of PCI IDs that the entire driver
supports.

Given that the actual drivers work this way there is no technical
reason vfio-pci can't do this as well.

We don't have to change them of course, they can stay as is if people
feel really strongly.

> > Even so, I'm not *so* worried about "over matching" - if IGD or the
> > nvidia stuff load on a wide set of devices then they can just not
> > enable their extended stuff. It wastes some kernel memory, but it is
> > OK.
> 
> I'd rather they bind to the base vfio-pci driver if their extended
> features are not available.

Sure it would be nice, but functionally it is no different.

> > And if some driver *really* gets stuck here the true answer is to
> > improve the driver core match capability.
> > 
> > > devices in the deny-list and non-endpoint devices.  Many drivers
> > > clearly place implicit trust in their id_table, others don't.  In the
> > > case of meta drivers, I think it's fair to make use of the latter
> > > approach.  
> > 
> > Well, AFAIK, the driver core doesn't have a 'try probe, if it fails
> > then try another driver' approach. One device, one driver. Am I
> > missing something?
> 
> If the driver probe callback fails, really_probe() returns 0 with the
> comment:
> 
>         /*
>          * Ignore errors returned by ->probe so that the next driver can try
>          * its luck.
>          */
>         ret = 0;
> 
> That allows bus_for_each_drv() to continue to iterate.

Er, but we have no reliable way to order drivers in the list so this
still assumes the system has exactly one driver match (even if some of
the match is now in code).

It won't work with a "universal" driver without more changes.

(and I couldn't find out why Cornelia added this long ago, or how or
even if it actually ended up being used)

> > I also think the softdep/implicit loading/ordering will not be
> > welcomed, it feels weird to me.
> 
> AFAICT, it works within the existing driver-core, it's largely an
> extension to pci-core driver_override support to enable wildcard
> matching, ideally along with adding the same for all buses that support
> driver_override.  Thanks,

It is the implicit ordering of module loading that is trouble.

Regards,
Jason
Christoph Hellwig March 22, 2021, 3:11 p.m. UTC | #27
On Fri, Mar 19, 2021 at 05:07:49PM -0300, Jason Gunthorpe wrote:
> The way the driver core works is to first match against the already
> loaded driver list, then trigger an event for module loading and when
> new drivers are registered they bind to unbound devices.
> 
> So, the trouble is the event through userspace because the kernel
> can't just go on to use vfio_pci until it knows userspace has failed
> to satisfy the load request.
> 
> One answer is to have userspace udev have the "hook" here and when a
> vfio flavour mod alias is requested on a PCI device it swaps in
> vfio_pci if it can't find an alternative.
> 
> The dream would be a system with no vfio modules loaded could do some
> 
>  echo "vfio" > /sys/bus/pci/xxx/driver_flavour
> 
> And a module would be loaded and a struct vfio_device is created for
> that device. Very easy for the user.

Maybe I did not communicate my suggestion last week very well.  My
idea is that there are not different pci_drivers for vfio vs. non-vfio
use, but different personalities of the same driver.

So the interface would still look somewhat like your suggestion above,
although I'd prefer something like:

   echo 1 > /sys/bus/pci/xxx/use_vfio

How would the flow look like for the various cases?

 a) if a driver is bound and it supports the enable_vfio method, that
    method is called, and everything is controlled by the driver, which uses
    symbols exported from vfio/vfio_pci to implement the functionality
 b) if a driver is bound, but does not support the enable_vfio method
    it is unbound and vfio_pci is bound instead, continue at c)
 c) use the normal current vfio flow

do the reverse on a

echo 0 > /sys/bus/pci/xxx/use_vfio
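
A very rough sketch of what the store side of such an attribute could
look like; use_vfio_store(), the ->enable_vfio method, pci_rebind_to_vfio()
and pci_unuse_vfio() are all hypothetical here:

    static ssize_t use_vfio_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
    {
            struct pci_dev *pdev = to_pci_dev(dev);
            bool enable;
            int ret;

            ret = kstrtobool(buf, &enable);
            if (ret)
                    return ret;

            if (!enable) {
                    ret = pci_unuse_vfio(pdev);     /* hypothetical reverse path */
                    return ret ? ret : count;
            }

            if (pdev->driver && pdev->driver->enable_vfio)
                    ret = pdev->driver->enable_vfio(pdev);  /* case a) */
            else
                    ret = pci_rebind_to_vfio(pdev);         /* cases b) and c) */

            return ret ? ret : count;
    }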
Alex Williamson March 22, 2021, 4:40 p.m. UTC | #28
On Sun, 21 Mar 2021 09:58:18 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Fri, Mar 19, 2021 at 10:40:28PM -0600, Alex Williamson wrote:
> 
> > > Well, today we don't, but Max here adds id_table's to the special
> > > devices and a MODULE_DEVICE_TABLE would come too if we do the flavours
> > > thing below.  
> > 
> > I think the id_tables are the wrong approach for IGD and NVLink
> > variants.  
> 
> I really disagree with this. Checking for some random bits in firmware
> and assuming that every device made forever into the future works with
> this check is not a good way to do compatibility. Christoph made the
> same point.
> 
> We have good processes to maintain id tables, I don't see this as a
> problem.

The base driver we're discussing here is a meta-driver that binds to
any PCI endpoint as directed by the user.  There is no id_table.  There
can't be any id_table unless you're expecting every device vendor to
submit the exact subset of devices they have tested and condone usage
with this interface.  The IGD extensions here only extend that
interface by providing userspace read-only access to a few additional
pieces of information that we've found to be necessary for certain
userspace drivers.  The actual device interface is unchanged.  In the
case of the NVLink extensions, AIUI these are mostly extensions of a
firmware defined interface for managing aspects of the interconnect to
the device.  It is actually the "random bits in firmware" that we want
to expose, the ID of the device is somewhat tangential, we just only
look for those firmware extensions in association to certain vendor
devices.

Of course if you start looking at features like migration support,
that's more than likely not simply an additional region with optional
information, it would need to interact with the actual state of the
device.  For those, I would very much support use of a specific
id_table.  That's not these.

> > > As-is driver_override seems dangerous as overriding the matching table
> > > could surely allow root userspace to crash the machine. In situations
> > > with trusted boot/signed modules this shouldn't be.  
> > 
> > When we're dealing with meta-drivers that can bind to anything, we
> > shouldn't rely on the match, but should instead verify the driver is
> > appropriate in the probe callback.  Even without driver_override,
> > there's the new_id mechanism.  Either method allows the root user to
> > break driver binding.  Greg has previously stated something to the
> > effect that users get to keep all the pieces when they break something
> > by manipulating driver binding.  
> 
> Yes, but that is a view where root is allowed to break the kernel, we
> now have this optional other world where that is not allowed and root
> access to lots of dangerous things are now disabled.
> 
> new_id and driver_override should probably be in that disable list
> too..

We don't have this other world yet, nor is it clear that we will have
it.  What sort of id_table is the base vfio-pci driver expected to use?
There's always a risk that hardware doesn't adhere to the spec or that
platform firmware might escalate an error that we'd otherwise consider
mundane from a userspace driver.

> > > While that might not seem too bad with these simple drivers, at least
> > > the mlx5 migration driver will have a large dependency tree and pull
> > > in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko
> > > and a bunch of other stuff in its orbit.  
> > 
> > Luckily the mlx5 driver doesn't need to be covered by compatibility
> > support, so we don't need to set a softdep for it and the module could
> > be named such that a wildcard driver_override of vfio_pci* shouldn't
> > logically include that driver.  Users can manually create their own
> > modprobe.d softdep entry if they'd like to include it.  Otherwise
> > userspace would need to know to bind to it specifically.  
> 
> But now you are giving up on the whole point, which was to
> automatically load the correct specific module without special admin
> involvement!

This series only exposed a temporary compatibility interface to provide
that anyway.  As I understood it, the long term solution was that
userspace would somehow learn which driver to use for which device.
That "somehow" isn't clear to me.

> > > This is why I want to try for fine grained autoloading first. It
> > > really is the elegant solution if we can work it out.  
> > 
> > I just don't see how we create a manageable change to userspace.  
> 
> I'm not sure I understand. Even if we add a new sysfs to set some
> flavour then that is a pretty trivial change for userspace to move
> from driver_override?

Perhaps for some definition of trivial that I'm not familiar with.
We're talking about changing libvirt and driverctl and every distro and
user that's created a custom script outside of those.  Even changing
from "vfio-pci" to "vfio-pci*" is a hurdle.

> > > I don't think we should over-focus on these two firmware triggered
> > > examples. I looked at the Intel GPU driver and it already only reads
> > > the firmware thing for certain PCI ID's, we can absolutely generate a
> > > narrow match table for it. Same is true for the NVIDIA GPU.  
> > 
> > I'm not sure we can make this assertion, both only care about the type
> > of device and existence of associated firmware tables.    
> 
> Well, I read through the Intel GPU driver and this is how I felt it
> works. It doesn't even check the firmware bit unless certain PCI IDs
> are matched first.

The IDs being only the PCI vendor ID and class code.  The entire IGD
extension is only meant to expose a vendor specific, graphics related
firmware table and collateral config space, so of course we'd restrict
it to that vendor for a graphics class device in approximately the
right location in the system.  There's a big difference between that
and a fixed id_table.
 
> For NVIDIA GPU Max checked internally and we saw it looks very much
> like how Intel GPU works. Only some PCI IDs trigger checking on the
> feature the firmware thing is linked to.

And as Alexey noted, the table came up incomplete.  But also those same
devices exist on platforms where this extension is completely
irrelevant.

> My point is: the actual *drivers* consuming these firmware features do
> *not* blindly match every PCI device and check for the firmware
> bit. They all have narrow matches and further only try to use the
> firmware thing for some subset of PCI IDs that the entire driver
> supports.

So because we don't check for an Intel specific graphics firmware table
when binding to a Realtek NIC, we can leap to the conclusion that there
must be a concise id_table we can create for IGD support?

> Given that the actual drivers work this way there is no technical
> reason vfio-pci can't do this as well.

There's a giant assumption above that I'm missing.  Are you expecting
that vendors are actually going to keep up with submitting device IDs
that they claim to have tested and support with vfio-pci and all other
devices won't be allowed to bind?  That would single handedly destroy
any non-enterprise use cases of vfio-pci.

> We don't have to change them of course, they can stay as is if people
> feel really strongly.
> 
> > > Even so, I'm not *so* worried about "over matching" - if IGD or the
> > > nvidia stuff load on a wide set of devices then they can just not
> > > enable their extended stuff. It wastes some kernel memory, but it is
> > > OK.  
> > 
> > I'd rather they bind to the base vfio-pci driver if their extended
> > features are not available.  
> 
> Sure it would be nice, but functionally it is no different.

Exactly, the device interface is not changed, so why is it such a
heinous misstep that we should test for the feature we're trying to
expose rather than a specific ID and fall through if we don't find it?

> > > And if some driver *really* gets stuck here the true answer is to
> > > improve the driver core match capability.
> > >   
> > > > devices in the deny-list and non-endpoint devices.  Many drivers
> > > > clearly place implicit trust in their id_table, others don't.  In the
> > > > case of meta drivers, I think it's fair to make use of the latter
> > > > approach.    
> > > 
> > > Well, AFAIK, the driver core doesn't have a 'try probe, if it fails
> > > then try another driver' approach. One device, one driver. Am I
> > > missing something?  
> > 
> > If the driver probe callback fails, really_probe() returns 0 with the
> > comment:
> > 
> >         /*
> >          * Ignore errors returned by ->probe so that the next driver can try
> >          * its luck.
> >          */
> >         ret = 0;
> > 
> > That allows bus_for_each_drv() to continue to iterate.  
> 
> Er, but we have no reliable way to order drivers in the list so this
> still assumes the system has exactly one driver match (even if some of
> the match is now in code).
> 
> It won't work with a "universal" driver without more changes.
> 
> (and I couldn't find out why Cornelia added this long ago, or how or
> even if it actually ended up being used)

You'd need to go further back than Conny touching it, the original
import into git had:

void driver_attach(struct device_driver * drv)
{
        struct bus_type * bus = drv->bus;
        struct list_head * entry;
        int error;

        if (!bus->match)
                return;

        list_for_each(entry, &bus->devices.list) {
                struct device * dev = container_of(entry, struct device, bus_list);
                if (!dev->driver) {
                        error = driver_probe_device(drv, dev);
                        if (error && (error != -ENODEV))
                                /* driver matched but the probe failed */
                                printk(KERN_WARNING
                                    "%s: probe of %s failed with error %d\n",
                                    drv->name, dev->bus_id, error);
                }
        }
}

So unless you want to do some bitkeeper archaeology, we've always
allowed driver probes to fail and fall through to the next one, not
even complaining with -ENODEV.  In practice it hasn't been an issue
because how many drivers do you expect to have that would even try to
claim a device.  Ordering is only important when there's a catch-all so
we need to figure out how to make that last among a class of drivers
that will attempt to claim a device.  The softdep is a bit of a hack to
do that, I'll admit, but I don't see how the alternate driver flavor
universe solves having a catch-all either.  Thanks,

Alex
Jason Gunthorpe March 22, 2021, 4:44 p.m. UTC | #29
On Mon, Mar 22, 2021 at 04:11:25PM +0100, Christoph Hellwig wrote:
> On Fri, Mar 19, 2021 at 05:07:49PM -0300, Jason Gunthorpe wrote:
> > The way the driver core works is to first match against the already
> > loaded driver list, then trigger an event for module loading and when
> > new drivers are registered they bind to unbound devices.
> > 
> > So, the trouble is the event through userspace because the kernel
> > can't just go on to use vfio_pci until it knows userspace has failed
> > to satisfy the load request.
> > 
> > One answer is to have userspace udev have the "hook" here and when a
> > vfio flavour mod alias is requested on a PCI device it swaps in
> > vfio_pci if it can't find an alternative.
> > 
> > The dream would be a system with no vfio modules loaded could do some
> > 
> >  echo "vfio" > /sys/bus/pci/xxx/driver_flavour
> > 
> > And a module would be loaded and a struct vfio_device is created for
> > that device. Very easy for the user.
> 
> Maybe I did not communicate my suggestion last week very well.  My
> idea is that there are no different pci_drivers vs vfio or not,
> but different personalities of the same driver.

This isn't quite the scenario that needs solving. Let's go back to
Max's V1 posting:

The mlx5_vfio_pci.c pci_driver matches this:

+	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042,
+			 PCI_VENDOR_ID_MELLANOX, PCI_ANY_ID) }, /* Virtio SNAP controllers */

This overlaps with the match table in
drivers/virtio/virtio_pci_common.c:

        { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },

So, if we do as you propose we have to add something mellanox specific
to virtio_pci_common which seems to me to be just repeating this whole
problem except in more drivers.
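
To make that concrete, the carve-out that would have to grow in
virtio_pci_common.c looks something like this; a sketch only, where
everything except the existing virtio probe entry point is
hypothetical:

#include <linux/pci.h>

/* hypothetical hooks that would have to leak into the generic driver */
static bool mlx5_snap_wants_vfio(struct pci_dev *pci_dev);
static int mlx5_vfio_personality_probe(struct pci_dev *pci_dev);

static int virtio_pci_probe(struct pci_dev *pci_dev,
			    const struct pci_device_id *id)
{
	/* vendor-specific branch in the common virtio PCI probe */
	if (pci_dev->subsystem_vendor == PCI_VENDOR_ID_MELLANOX &&
	    mlx5_snap_wants_vfio(pci_dev))
		return mlx5_vfio_personality_probe(pci_dev);

	/* ... the existing virtio probe path continues here ... */
	return 0;
}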

The general thing that is happening is people are adding VM
migration capability to existing standard PCI interfaces like VFIO,
NVMe, etc

At least in this mlx5 situation the PF driver provides the HW access
to do the migration and the vfio mlx5 driver provides all the protocol
and machinery specific to the PCI standard being migrated. They are
all a little different.

But you could imagine some other implementation where the VF might
have an extra non-standard BAR that is the migration control.

This is why I like having a full stand alone pci_driver as everyone
implementing this can provide the vfio_device that is appropriate for
the HW.

Jason
Christoph Hellwig March 23, 2021, 1:17 p.m. UTC | #30
On Mon, Mar 22, 2021 at 01:44:11PM -0300, Jason Gunthorpe wrote:
> This isn't quite the scenario that needs solving. Let's go back to
> Max's V1 posting:
> 
> The mlx5_vfio_pci.c pci_driver matches this:
> 
> +	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042,
> +			 PCI_VENDOR_ID_MELLANOX, PCI_ANY_ID) }, /* Virtio SNAP controllers */
> 
> This overlaps with the match table in
> drivers/virtio/virtio_pci_common.c:
> 
>         { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
> 
> So, if we do as you propose we have to add something mellanox specific
> to virtio_pci_common which seems to me to be just repeating this whole
> problem except in more drivers.

Oh, yikes.  

> The general thing that is happening is people are adding VM
> migration capability to existing standard PCI interfaces like VFIO,
> NVMe, etc

Well, if a migration capability is added to virtio (or NVMe) it should
be standardized and not vendor specific.
Jason Gunthorpe March 23, 2021, 1:42 p.m. UTC | #31
On Tue, Mar 23, 2021 at 02:17:09PM +0100, Christoph Hellwig wrote:
> On Mon, Mar 22, 2021 at 01:44:11PM -0300, Jason Gunthorpe wrote:
> > This isn't quite the scenario that needs solving. Let's go back to
> > Max's V1 posting:
> > 
> > The mlx5_vfio_pci.c pci_driver matches this:
> > 
> > +	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042,
> > +			 PCI_VENDOR_ID_MELLANOX, PCI_ANY_ID) }, /* Virtio SNAP controllers */
> > 
> > This overlaps with the match table in
> > drivers/virtio/virtio_pci_common.c:
> > 
> >         { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
> > 
> > So, if we do as you propose we have to add something mellanox specific
> > to virtio_pci_common which seems to me to be just repeating this whole
> > problem except in more drivers.
> 
> Oh, yikes.  

This is why I keep saying it is a VFIO driver - it has no relation to
the normal kernel drivers on the hypervisor. Even loading a normal
kernel driver and switching to a VFIO mode would be unacceptably
slow/disruptive.

The goal is to go directly to a VFIO mode driver with PCI driver auto
probing disabled to avoid attaching a regular driver. Big servers will
have 1000's of these things.

> > The general thing that is happening is people are adding VM
> > migration capability to existing standard PCI interfaces like VFIO,
> > NVMe, etc
> 
> Well, if a migration capability is added to virtio (or NVMe) it should
> be standardized and not vendor specific.

It would be nice, but it would be a challenging standard to write.

I think the industry is still in the pre-standards mode of trying to
even figure out how this stuff should work.

IMHO PCI sig needs to tackle a big part of this as we can't embed any
migration controls in the VF itself, it has to be secure for only
hypervisor use.

What we've got now is a Linux standard in VFIO where the uAPI to
manage migration is multi-vendor and we want to plug drivers into
that.

If in a few years the industry also develops HW standards then I
imagine using the same mechanism to plug in these standards based
implementation.

Jason
Jason Gunthorpe March 23, 2021, 7:32 p.m. UTC | #32
On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote:

> Of course if you start looking at features like migration support,
> that's more than likely not simply an additional region with optional
> information, it would need to interact with the actual state of the
> device.  For those, I would very much support use of a specific
> id_table.  That's not these.

What I don't understand is why do we need two different ways of
inserting vendor code?

> > new_id and driver_override should probably be in that disable list
> > too..
> 
> We don't have this other world yet, nor is it clear that we will have
> it.

We do today, it is obscure, but there is a whole set of config options
designed to disable the unsafe kernel features. Kernels booted with
secure boot and signed modules tend to enable a lot of them, for
instance. The people working on the IMA stuff tend to enable a lot
more as you can defeat the purpose of IMA if you can hijack the
kernel.

> What sort of id_table is the base vfio-pci driver expected to use?

If it has a match table it would be all match, this is why I called it
a "universal driver"

If we have a flavour then the flavour controls the activation of
VFIO, not new_id or driver_override, and in vfio flavour mode we can
have an all match table, if we can resolve how to choose between two
drivers with overlapping matches.

> > > > This is why I want to try for fine grained autoloading first. It
> > > > really is the elegant solution if we can work it out.  
> > > 
> > > I just don't see how we create a manageable change to userspace.  
> > 
> > I'm not sure I understand. Even if we add a new sysfs to set some
> > flavour then that is a pretty trivial change for userspace to move
> > from driver_override?
> 
> Perhaps for some definition of trivial that I'm not familiar with.
> We're talking about changing libvirt and driverctl and every distro and
> user that's created a custom script outside of those.  Even changing
> from "vfio-pci" to "vfio-pci*" is a hurdle.

Sure, but it isn't like a major architectural shift, nor is it
mandatory unless you start using this new hardware class.

Userspace changes when we add kernel functionality.. The kernel just
has to keep working the way it used to for old functionality.

> > Well, I read through the Intel GPU driver and this is how I felt it
> > works. It doesn't even check the firmware bit unless certain PCI IDs
> > are matched first.
> 
> The IDs being only the PCI vendor ID and class code.  

I don't mean how vfio works, I mean how the Intel GPU driver works.

eg:

psb_pci_probe()
 psb_driver_load()
  psb_intel_opregion_setup()
           if (memcmp(base, OPREGION_SIGNATURE, 16)) {

i915_pci_probe()
 i915_driver_probe()
  i915_driver_hw_probe()
   intel_opregion_setup()
	if (memcmp(buf, OPREGION_SIGNATURE, 16)) {

All of these memcmp's are protected by exact id_tables hung off the
pci_driver's id_table.

VFIO is the different case. In this case the ID match confirms that
the config space has the ASLS dword at the fixed offset. If the ID
doesn't match nothing should read the ASLS offset.
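
In code, that gate is roughly the following; a sketch, where the 0xfc
ASLS config offset is the one vfio_pci_igd.c reads today:

#include <linux/pci.h>

static bool igd_has_opregion(struct pci_dev *pdev)
{
	u32 asls;

	/* only an Intel VGA-class match licenses reading ASLS at all */
	if (pdev->vendor != PCI_VENDOR_ID_INTEL ||
	    (pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
		return false;

	pci_read_config_dword(pdev, 0xfc /* ASLS */, &asls);
	return asls != 0;	/* firmware published an OpRegion */
}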

> > For NVIDIA GPU Max checked internally and we saw it looks very much
> > like how Intel GPU works. Only some PCI IDs trigger checking on the
> > feature the firmware thing is linked to.
> 
> And as Alexey noted, the table came up incomplete.  But also those same
> devices exist on platforms where this extension is completely
> irrelevant.

I understood he meant that NVIDIA GPUs *without* NVLINK can exist, but
the ID table we have here is supposed to be the NVLINK compatible
ID's.

> So because we don't check for an Intel specific graphics firmware table
> when binding to a Realtek NIC, we can leap to the conclusion that there
> must be a concise id_table we can create for IGD support?

Concise? No, but we can see *today* what the ID table is supposed to
be by just looking at the three probe functions that touch
OPREGION_SIGNATURE.

> There's a giant assumption above that I'm missing.  Are you expecting
> that vendors are actually going to keep up with submitting device IDs
> that they claim to have tested and support with vfio-pci and all other
> devices won't be allowed to bind?  That would single handedly destroy
> any non-enterprise use cases of vfio-pci.

Why not? They do it for the in-tree GPU drivers today! The ID table
for Intel GPU is even in a *header file* and we can just #include it
into vfio igd as well.

> So unless you want to do some bitkeeper archaeology, we've always
> allowed driver probes to fail and fall through to the next one, not
> even complaining with -ENODEV.  In practice it hasn't been an issue
> because how many drivers do you expect to have that would even try to
> claim a device.  

Do you know of anything using this ability? It might be helpful

> Ordering is only important when there's a catch-all so we need to
> figure out how to make that last among a class of drivers that will
> attempt to claim a device.  The softdep is a bit of a hack to do
> that, I'll admit, but I don't see how the alternate driver flavor
> universe solves having a catch-all either.

Haven't entirely got there yet, but I think the catch all probably has
to be handled by userspace udev/kmod in some way, as it is the only
thing that knows if there is a more specific module to load. This is
the biggest problem..

And again, I feel this is all a big tangent, especially now that HCH
wants to delete the nvlink stuff we should just leave igd alone.

Jason
Alexey Kardashevskiy March 24, 2021, 2:39 a.m. UTC | #33
On 24/03/2021 06:32, Jason Gunthorpe wrote:

>>> For NVIDIA GPU Max checked internally and we saw it looks very much
>>> like how Intel GPU works. Only some PCI IDs trigger checking on the
>>> feature the firmware thing is linked to.
>>
>> And as Alexey noted, the table came up incomplete.  But also those same
>> devices exist on platforms where this extension is completely
>> irrelevant.
> 
> I understood he meant that NVIDIA GPUs *without* NVLINK can exist, but
> the ID table we have here is supposed to be the NVLINK compatible
> ID's.


I also meant there are more GPUs with NVLink (than in the proposed
list) which will work on P9.
Alex Williamson March 29, 2021, 11:10 p.m. UTC | #34
On Tue, 23 Mar 2021 16:32:13 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote:
> 
> > Of course if you start looking at features like migration support,
> > that's more than likely not simply an additional region with optional
> > information, it would need to interact with the actual state of the
> > device.  For those, I would very much support use of a specific
> > id_table.  That's not these.  
> 
> What I don't understand is why do we need two different ways of
> inserting vendor code?

Because a PCI id table only identifies the device, these drivers are
looking for a device in the context of firmware dependencies.

> > > new_id and driver_override should probably be in that disable list
> > > too..  
> > 
> > We don't have this other world yet, nor is it clear that we will have
> > it.  
> 
> We do today, it is obscure, but there is a whole set of config options
> designed to disable the unsafe kernel features. Kernels booted with
> secure boot and signed modules tend to enable a lot of them, for
> instance. The people working on the IMA stuff tend to enable a lot
> more as you can defeat the purpose of IMA if you can hijack the
> kernel.
> 
> > What sort of id_table is the base vfio-pci driver expected to use?  
> 
> If it has a match table it would be all match, this is why I called it
> a "universal driver"
> 
> If we have a flavour then the flavour controls the activation of
> VFIO, not new_id or driver_override, and in vfio flavour mode we can
> have an all match table, if we can resolve how to choose between two
> drivers with overlapping matches.
> 
> > > > > This is why I want to try for fine grained autoloading first. It
> > > > > really is the elegant solution if we can work it out.    
> > > > 
> > > > I just don't see how we create a manageable change to userspace.    
> > > 
> > > I'm not sure I understand. Even if we add a new sysfs to set some
> > > flavour then that is a pretty trivial change for userspace to move
> > > from driver_override?  
> > 
> > Perhaps for some definition of trivial that I'm not familiar with.
> > We're talking about changing libvirt and driverctl and every distro and
> > user that's created a custom script outside of those.  Even changing
> > from "vfio-pci" to "vfio-pci*" is a hurdle.  
> 
> Sure, but it isn't like a major architectural shift, nor is it
> mandatory unless you start using this new hardware class.
> 
> Userspace changes when we add kernel functionality.. The kernel just
> has to keep working the way it used to for old functionality.

Seems like we're bound to keep igd in the core as you propose below.

> > > Well, I read through the Intel GPU driver and this is how I felt it
> > > works. It doesn't even check the firmware bit unless certain PCI IDs
> > > are matched first.  
> > 
> > The IDs being only the PCI vendor ID and class code.    
> 
> I don't mean how vfio works, I mean how the Intel GPU driver works.
> 
> eg:
> 
> psb_pci_probe()
>  psb_driver_load()
>   psb_intel_opregion_setup()
>            if (memcmp(base, OPREGION_SIGNATURE, 16)) {
> 
> i915_pci_probe()
>  i915_driver_probe()
>   i915_driver_hw_probe()
>    intel_opregion_setup()
> 	if (memcmp(buf, OPREGION_SIGNATURE, 16)) {
> 
> All of these memcmp's are protected by exact id_tables hung off the
> pci_driver's id_table.
> 
> VFIO is the different case. In this case the ID match confirms that
> the config space has the ASLS dword at the fixed offset. If the ID
> doesn't match nothing should read the ASLS offset.
> 
> > > For NVIDIA GPU Max checked internally and we saw it looks very much
> > > like how Intel GPU works. Only some PCI IDs trigger checking on the
> > > feature the firmware thing is linked to.  
> > 
> > And as Alexey noted, the table came up incomplete.  But also those same
> > devices exist on platforms where this extension is completely
> > irrelevant.  
> 
> I understood he meant that NVIDIA GPUs *without* NVLINK can exist, but
> the ID table we have here is supposed to be the NVLINK compatible
> ID's.

Those IDs are just for the SXM2 variants of the device that can
exist on a variety of platforms, only one of which includes the
firmware tables to activate the vfio support.

> > So because we don't check for an Intel specific graphics firmware table
> > when binding to a Realtek NIC, we can leap to the conclusion that there
> > must be a concise id_table we can create for IGD support?  
> 
> Concise? No, but we can see *today* what the ID table is supposed to
> be by just looking at the three probe functions that touch
> OPREGION_SIGNATURE.
> 
> > There's a giant assumption above that I'm missing.  Are you expecting
> > that vendors are actually going to keep up with submitting device IDs
> > that they claim to have tested and support with vfio-pci and all other
> > devices won't be allowed to bind?  That would single handedly destroy
> > any non-enterprise use cases of vfio-pci.  
> 
> Why not? They do it for the in-tree GPU drivers today! The ID table
> for Intel GPU is even in a *header file* and we can just #include it
> into vfio igd as well.

Are you volunteering to maintain the vfio-pci-igd id_table, complete
with the implicit expectation that those devices are known to work?
Part of the disconnect we have here might be the intended level of
support.  There's a Kconfig option around vfio igd support for more
than one reason.

I think you're looking for a significant inflection in vendor's stated
support for vfio use cases, beyond the "best-effort, give it a try",
that we currently have.  In some ways I look forward to that, so long
as users can also use it as they do today (maybe not enterprise users).
I sort of see imposing an id_table on igd support as trying to impose
that "vendor condoned" use case before we actually have a vendor
condoning it (or signing up to maintain an id table).

> > So unless you want to do some bitkeeper archaeology, we've always
> > allowed driver probes to fail and fall through to the next one, not
> > even complaining with -ENODEV.  In practice it hasn't been an issue
> > because how many drivers do you expect to have that would even try to
> > claim a device.    
> 
> Do you know of anything using this ability? It might be helpful

I don't.

> > Ordering is only important when there's a catch-all so we need to
> > figure out how to make that last among a class of drivers that will
> > attempt to claim a device.  The softdep is a bit of a hack to do
> > that, I'll admit, but I don't see how the alternate driver flavor
> > universe solves having a catch-all either.  
> 
> Haven't entirely got there yet, but I think the catch all probably has
> to be handled by userspace udev/kmod in some way, as it is the only
> thing that knows if there is a more specific module to load. This is
> the biggest problem..
> 
> And again, I feel this is all a big tangent, especially now that HCH
> wants to delete the nvlink stuff we should just leave igd alone.

Determining which things stay in vfio-pci-core and which things are
split to variant drivers and how those variant drivers can match the
devices they intend to support seems very inline with this series.  If
igd stays as part of vfio-pci-core then I think we're drawing a
parallel to z-pci support, where a significant part of that support is
a set of extra data structures exposed through capabilities to support
userspace use of the device.  Therefore extra regions or data
structures through capabilities, where we're not changing device
access, except as required for the platform (not the device) seem to be
things that fit within the core, right?  Thanks,

Alex
Cornelia Huck April 1, 2021, 1:04 p.m. UTC | #35
On Mon, 29 Mar 2021 17:10:53 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Tue, 23 Mar 2021 16:32:13 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote:

> > > So unless you want to do some bitkeeper archaeology, we've always
> > > allowed driver probes to fail and fall through to the next one, not
> > > even complaining with -ENODEV.  In practice it hasn't been an issue
> > > because how many drivers do you expect to have that would even try to
> > > claim a device.      
> > 
> > Do you know of anything using this ability? It might be helpful  
> 
> I don't.

I've been trying to remember why I added that patch to ignore all
errors (rather than only -ENODEV), but I suspect it might have been
related to the concurrent probing stuff I tried to implement back then.
The one instance of drivers matching to the same id I recall (s390
ctc/lcs) is actually not handled on the individual device level, but in
the meta ccwgroup driver; I don't remember anything else in the s390
case.

> 
> > > Ordering is only important when there's a catch-all so we need to
> > > figure out how to make that last among a class of drivers that will
> > > attempt to claim a device.  The softdep is a bit of a hack to do
> > > that, I'll admit, but I don't see how the alternate driver flavor
> > > universe solves having a catch-all either.    
> > 
> > Haven't entirely got there yet, but I think the catch all probably has
> > to be handled by userspace udev/kmod in some way, as it is the only
> > thing that knows if there is a more specific module to load. This is
> > the biggest problem..
> > 
> > And again, I feel this is all a big tangent, especially now that HCH
> > wants to delete the nvlink stuff we should just leave igd alone.  
> 
> Determining which things stay in vfio-pci-core and which things are
> split to variant drivers and how those variant drivers can match the
> devices they intend to support seems very inline with this series.  If
> igd stays as part of vfio-pci-core then I think we're drawing a
> parallel to z-pci support, where a significant part of that support is
> a set of extra data structures exposed through capabilities to support
> userspace use of the device.  Therefore extra regions or data
> structures through capabilities, where we're not changing device
> access, except as required for the platform (not the device) seem to be
> things that fit within the core, right?  Thanks,
> 
> Alex

As we are only talking about extra data governed by a capability, I
don't really see a problem with keeping it in the vfio core.

For those devices that need more specialized treatment, maybe we need
some kind of priority-based matching? I.e., if we match a device with
drivers, start with the one with highest priority (the specialized
one), and have the generic driver at the lowest priority. A
higher-priority driver added later on should not affect already bound
devices (and would need manual intervention again).

[I think this has come up in other places in the past as well, but I
don't have any concrete pointers handy.]
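
As a purely illustrative sketch of what that selection could look like
(nothing like this exists today; the structure and helper below are
made up):

#include <linux/pci.h>

struct prioritized_pci_driver {
	struct pci_driver drv;
	int match_priority;	/* 0 = generic catch-all, higher = more specific */
};

static struct prioritized_pci_driver *
pick_best_driver(struct pci_dev *pdev,
		 struct prioritized_pci_driver **candidates, int n)
{
	struct prioritized_pci_driver *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (!pci_match_id(candidates[i]->drv.id_table, pdev))
			continue;
		if (!best ||
		    candidates[i]->match_priority > best->match_priority)
			best = candidates[i];
	}
	/* probe the winner first; fall back to lower priorities on -ENODEV */
	return best;
}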
Jason Gunthorpe April 1, 2021, 1:12 p.m. UTC | #36
On Mon, Mar 29, 2021 at 05:10:53PM -0600, Alex Williamson wrote:
> On Tue, 23 Mar 2021 16:32:13 -0300
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote:
> > 
> > > Of course if you start looking at features like migration support,
> > > that's more than likely not simply an additional region with optional
> > > information, it would need to interact with the actual state of the
> > > device.  For those, I would very much support use of a specific
> > > id_table.  That's not these.  
> > 
> > What I don't understand is why do we need two different ways of
> > inserting vendor code?
> 
> Because a PCI id table only identifies the device, these drivers are
> looking for a device in the context of firmware dependencies.

The firmware dependencies only exist for a defined list of ID's, so I
don't entirely agree with this statement. I agree with below though,
so lets leave it be.

> > I understood he meant that NVIDIA GPUs *without* NVLINK can exist, but
> > the ID table we have here is supposed to be the NVLINK compatible
> > ID's.
> 
> Those IDs are just for the SXM2 variants of the device that can
> exist on a variety of platforms, only one of which includes the
> firmware tables to activate the vfio support.

AFAIK, SXM2 is a special physical form factor that has the nvlink
physical connection - it is only for this specific generation of power
servers that can accept the specific nvlink those cards have.

> I think you're looking for a significant inflection in vendor's stated
> support for vfio use cases, beyond the "best-effort, give it a try",
> that we currently have.

I see, so they don't want to. Let's leave it then.

Though if Xe breaks everything they need to add/maintain a proper ID
table, not more hackery.

> > And again, I feel this is all a big tangent, especially now that HCH
> > wants to delete the nvlink stuff we should just leave igd alone.
> 
> Determining which things stay in vfio-pci-core and which things are
> split to variant drivers and how those variant drivers can match the
> devices they intend to support seems very inline with this series.  

IMHO, the main litmus test for core is if variant drivers will need it
or not.

No variant driver should be stacked on an igd device, or if it someday
is, it should implement the special igd hackery internally (and have a
proper ID table). So when we split it up igd goes into vfio_pci.ko as
some special behavior vfio_pci.ko's universal driver provides for IGD.

Every variant driver will still need the zdev data to be exposed to
userspace, and every PCI device on s390 has that extra information. So
zdev goes to vfio_pci_core.ko

Future things going into vfio_pci.ko need a really good reason why
they can't be variant drivers instead.

Jason
Alex Williamson April 1, 2021, 9:49 p.m. UTC | #37
On Thu, 1 Apr 2021 10:12:27 -0300
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Mon, Mar 29, 2021 at 05:10:53PM -0600, Alex Williamson wrote:
> > On Tue, 23 Mar 2021 16:32:13 -0300
> > Jason Gunthorpe <jgg@nvidia.com> wrote:
> >   
> > > On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote:
> > >   
> > > > Of course if you start looking at features like migration support,
> > > > that's more than likely not simply an additional region with optional
> > > > information, it would need to interact with the actual state of the
> > > > device.  For those, I would very much support use of a specific
> > > > id_table.  That's not these.    
> > > 
> > > What I don't understand is why do we need two different ways of
> > > inserting vendor code?  
> > 
> > Because a PCI id table only identifies the device, these drivers are
> > looking for a device in the context of firmware dependencies.  
> 
> The firmware dependencies only exist for a defined list of ID's, so I
> don't entirely agree with this statement. I agree with below though,
> so lets leave it be.
> 
> > > I understood he meant that NVIDIA GPUs *without* NVLINK can exist, but
> > > the ID table we have here is supposed to be the NVLINK compatible
> > > ID's.  
> > 
> > Those IDs are just for the SXM2 variants of the device that can
> > exist on a variety of platforms, only one of which includes the
> > firmware tables to activate the vfio support.  
> 
> AFAIK, SXM2 is a special physical form factor that has the nvlink
> physical connection - it is only for this specific generation of power
> servers that can accept the specific nvlink those cards have.

SXM2 is not unique to Power, there are various x86 systems that support
the interface, everything from NVIDIA's own line of DGX systems,
various vendor systems, all the way to VARs like Super Micro and
Gigabyte.

> > I think you're looking for a significant inflection in vendor's stated
> > support for vfio use cases, beyond the "best-effort, give it a try",
> > that we currently have.  
> 
> I see, so they don't want to. Let's leave it then.
> 
> Though if Xe breaks everything they need to add/maintain a proper ID
> table, not more hackery.

e4eccb853664 ("vfio/pci: Bypass IGD init in case of -ENODEV") is
supposed to enable Xe, where the IGD code is expected to return -ENODEV
and we go on with the base vfio-pci support.
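
That is the same pattern this series keeps for the other extensions:
treat -ENODEV from the vendor init as "feature not present" rather
than a failure.  A sketch, with the wrapper name made up and
vfio_pci_igd_init() being the existing helper:

#include "vfio_pci_core.h"

static int vfio_pci_maybe_enable_igd(struct vfio_pci_core_device *vdev)
{
	int ret = vfio_pci_igd_init(vdev);

	if (ret == -ENODEV)	/* no OpRegion (e.g. Xe): plain vfio-pci */
		return 0;
	if (ret)
		pci_warn(vdev->pdev, "Failed to setup Intel IGD regions\n");
	return ret;
}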
 
> > > And again, I feel this is all a big tangent, especially now that
> > > HCH wants to delete the nvlink stuff we should just leave igd
> > > alone.  
> > 
> > Determining which things stay in vfio-pci-core and which things are
> > split to variant drivers and how those variant drivers can match the
> > devices they intend to support seems very inline with this series.
> >   
> 
> IMHO, the main litmus test for core is if variant drivers will need it
> or not.
> 
> No variant driver should be stacked on an igd device, or if it someday
> is, it should implement the special igd hackery internally (and have a
> proper ID table). So when we split it up igd goes into vfio_pci.ko as
> some special behavior vfio_pci.ko's universal driver provides for IGD.
> 
> Every variant driver will still need the zdev data to be exposed to
> userspace, and every PCI device on s390 has that extra information. So
> zdev goes to vfio_pci_core.ko
> 
> Future things going into vfio_pci.ko need a really good reason why
> they can't be variant drivers instead.

That sounds fair.  Thanks,

Alex
diff mbox series

Patch

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 829e90a2e5a3..88c89863a205 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -48,8 +48,30 @@  config VFIO_PCI_IGD
 
 	  To enable Intel IGD assignment through vfio-pci, say Y.
 
-config VFIO_PCI_NVLINK2
-	def_bool y
+config VFIO_PCI_NVLINK2GPU
+	tristate "VFIO support for NVIDIA NVLINK2 GPUs"
 	depends on VFIO_PCI_CORE && PPC_POWERNV
 	help
-	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
+	  VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions
+	  for P9 Witherspoon machine.
+
+config VFIO_PCI_NPU2
+	tristate "VFIO support for IBM NPU host bus adapter on P9"
+	depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV
+	help
+	  VFIO PCI specific extensions for IBM NVLink2 host bus adapter on P9
+	  Witherspoon machine.
+
+config VFIO_PCI_DRIVER_COMPAT
+	bool "VFIO PCI backward compatibility for vendor specific extensions"
+	default y
+	depends on VFIO_PCI
+	help
+	  Say Y here if you want to preserve VFIO PCI backward
+	  compatibility. vfio_pci.ko will continue to automatically use
+	  the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to
+	  a compatible device.
+
+	  When N is selected the user must bind explicitly to the module
+	  they want to handle the device and vfio_pci.ko will have no
+	  device specific special behaviors.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index f539f32c9296..86fb62e271fc 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,10 +2,15 @@ 
 
 obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
+obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o
+obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o
 
 vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
-vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o vfio_pci_npu2.o
 vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o
 
 vfio-pci-y := vfio_pci.o
+
+npu2-vfio-pci-y := npu2_vfio_pci.o
+
+nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o
diff --git a/drivers/vfio/pci/vfio_pci_npu2.c b/drivers/vfio/pci/npu2_vfio_pci.c
similarity index 64%
rename from drivers/vfio/pci/vfio_pci_npu2.c
rename to drivers/vfio/pci/npu2_vfio_pci.c
index 717745256ab3..7071bda0f2b6 100644
--- a/drivers/vfio/pci/vfio_pci_npu2.c
+++ b/drivers/vfio/pci/npu2_vfio_pci.c
@@ -14,19 +14,28 @@ 
  *	Author: Alex Williamson <alex.williamson@redhat.com>
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
 #include <linux/io.h>
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
+#include <linux/list.h>
 #include <linux/sched/mm.h>
 #include <linux/mmu_context.h>
 #include <asm/kvm_ppc.h>
 
 #include "vfio_pci_core.h"
+#include "npu2_vfio_pci.h"
 
 #define CREATE_TRACE_POINTS
 #include "npu2_trace.h"
 
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
+#define DRIVER_DESC     "NPU2 VFIO PCI - User Level meta-driver for POWER9 NPU NVLink2 HBA"
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
 
 struct vfio_pci_npu2_data {
@@ -36,6 +45,10 @@  struct vfio_pci_npu2_data {
 	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
 };
 
+struct npu2_vfio_pci_device {
+	struct vfio_pci_core_device	vdev;
+};
+
 static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev,
 		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
 {
@@ -120,7 +133,7 @@  static const struct vfio_pci_regops vfio_pci_npu2_regops = {
 	.add_capability = vfio_pci_npu2_add_capability,
 };
 
-int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
+static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
 {
 	int ret;
 	struct vfio_pci_npu2_data *data;
@@ -220,3 +233,132 @@  int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
 
 	return ret;
 }
+
+static void npu2_vfio_pci_release(void *device_data)
+{
+	struct vfio_pci_core_device *vdev = device_data;
+
+	mutex_lock(&vdev->reflck->lock);
+	if (!(--vdev->refcnt)) {
+		vfio_pci_vf_token_user_add(vdev, -1);
+		vfio_pci_core_spapr_eeh_release(vdev);
+		vfio_pci_core_disable(vdev);
+	}
+	mutex_unlock(&vdev->reflck->lock);
+
+	module_put(THIS_MODULE);
+}
+
+static int npu2_vfio_pci_open(void *device_data)
+{
+	struct vfio_pci_core_device *vdev = device_data;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&vdev->reflck->lock);
+
+	if (!vdev->refcnt) {
+		ret = vfio_pci_core_enable(vdev);
+		if (ret)
+			goto error;
+
+		ret = vfio_pci_ibm_npu2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			pci_warn(vdev->pdev,
+				 "Failed to setup NVIDIA NV2 ATSD region\n");
+			vfio_pci_core_disable(vdev);
+			goto error;
+		}
+		ret = 0;
+		vfio_pci_probe_mmaps(vdev);
+		vfio_pci_core_spapr_eeh_open(vdev);
+		vfio_pci_vf_token_user_add(vdev, 1);
+	}
+	vdev->refcnt++;
+error:
+	mutex_unlock(&vdev->reflck->lock);
+	if (ret)
+		module_put(THIS_MODULE);
+	return ret;
+}
+
+static const struct vfio_device_ops npu2_vfio_pci_ops = {
+	.name		= "npu2-vfio-pci",
+	.open		= npu2_vfio_pci_open,
+	.release	= npu2_vfio_pci_release,
+	.ioctl		= vfio_pci_core_ioctl,
+	.read		= vfio_pci_core_read,
+	.write		= vfio_pci_core_write,
+	.mmap		= vfio_pci_core_mmap,
+	.request	= vfio_pci_core_request,
+	.match		= vfio_pci_core_match,
+};
+
+static int npu2_vfio_pci_probe(struct pci_dev *pdev,
+		const struct pci_device_id *id)
+{
+	struct npu2_vfio_pci_device *npvdev;
+	int ret;
+
+	npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL);
+	if (!npvdev)
+		return -ENOMEM;
+
+	ret = vfio_pci_core_register_device(&npvdev->vdev, pdev,
+			&npu2_vfio_pci_ops);
+	if (ret)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	kfree(npvdev);
+	return ret;
+}
+
+static void npu2_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
+	struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
+	struct npu2_vfio_pci_device *npvdev;
+
+	npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, vdev);
+
+	vfio_pci_core_unregister_device(core_vpdev);
+	kfree(npvdev);
+}
+
+static const struct pci_device_id npu2_vfio_pci_table[] = {
+	{ PCI_VDEVICE(IBM, 0x04ea) },
+	{ 0, }
+};
+
+static struct pci_driver npu2_vfio_pci_driver = {
+	.name			= "npu2-vfio-pci",
+	.id_table		= npu2_vfio_pci_table,
+	.probe			= npu2_vfio_pci_probe,
+	.remove			= npu2_vfio_pci_remove,
+#ifdef CONFIG_PCI_IOV
+	.sriov_configure	= vfio_pci_core_sriov_configure,
+#endif
+	.err_handler		= &vfio_pci_core_err_handlers,
+};
+
+#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
+struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
+{
+	if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev))
+		return &npu2_vfio_pci_driver;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver);
+#endif
+
+module_pci_driver(npu2_vfio_pci_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/npu2_vfio_pci.h b/drivers/vfio/pci/npu2_vfio_pci.h
new file mode 100644
index 000000000000..92010d340346
--- /dev/null
+++ b/drivers/vfio/pci/npu2_vfio_pci.h
@@ -0,0 +1,24 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights reserved.
+ *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
+ */
+
+#ifndef NPU2_VFIO_PCI_H
+#define NPU2_VFIO_PCI_H
+
+#include <linux/pci.h>
+#include <linux/module.h>
+
+#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
+#if defined(CONFIG_VFIO_PCI_NPU2) || defined(CONFIG_VFIO_PCI_NPU2_MODULE)
+struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev);
+#else
+struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev)
+{
+	return NULL;
+}
+#endif
+#endif
+
+#endif /* NPU2_VFIO_PCI_H */
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
similarity index 67%
rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c
rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c
index 6dce1e78ee82..84a5ac1ce8ac 100644
--- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c
+++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c
@@ -1,6 +1,6 @@ 
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
+ * VFIO PCI NVIDIA NVLink2 GPUs support.
  *
  * Copyright (C) 2018 IBM Corp.  All rights reserved.
  *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
@@ -12,6 +12,9 @@ 
  *	Author: Alex Williamson <alex.williamson@redhat.com>
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
 #include <linux/io.h>
 #include <linux/pci.h>
 #include <linux/uaccess.h>
@@ -21,10 +24,15 @@ 
 #include <asm/kvm_ppc.h>
 
 #include "vfio_pci_core.h"
+#include "nvlink2gpu_vfio_pci.h"
 
 #define CREATE_TRACE_POINTS
 #include "nvlink2gpu_trace.h"
 
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "Alexey Kardashevskiy <aik@ozlabs.ru>"
+#define DRIVER_DESC     "NVLINK2GPU VFIO PCI - User Level meta-driver for NVIDIA NVLink2 GPUs"
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
 EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
 
@@ -39,6 +47,10 @@  struct vfio_pci_nvgpu_data {
 	struct notifier_block group_notifier;
 };
 
+struct nv_vfio_pci_device {
+	struct vfio_pci_core_device	vdev;
+};
+
 static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev,
 		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
 {
@@ -207,7 +219,8 @@  static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
-int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
+static int
+vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
 {
 	int ret;
 	u64 reg[2];
@@ -293,3 +306,135 @@  int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
 
 	return ret;
 }
+
+static void nvlink2gpu_vfio_pci_release(void *device_data)
+{
+	struct vfio_pci_core_device *vdev = device_data;
+
+	mutex_lock(&vdev->reflck->lock);
+	if (!(--vdev->refcnt)) {
+		vfio_pci_vf_token_user_add(vdev, -1);
+		vfio_pci_core_spapr_eeh_release(vdev);
+		vfio_pci_core_disable(vdev);
+	}
+	mutex_unlock(&vdev->reflck->lock);
+
+	module_put(THIS_MODULE);
+}
+
+static int nvlink2gpu_vfio_pci_open(void *device_data)
+{
+	struct vfio_pci_core_device *vdev = device_data;
+	int ret = 0;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&vdev->reflck->lock);
+
+	if (!vdev->refcnt) {
+		ret = vfio_pci_core_enable(vdev);
+		if (ret)
+			goto error;
+
+		ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			pci_warn(vdev->pdev,
+				 "Failed to setup NVIDIA NV2 RAM region\n");
+			vfio_pci_core_disable(vdev);
+			goto error;
+		}
+		ret = 0;
+		vfio_pci_probe_mmaps(vdev);
+		vfio_pci_core_spapr_eeh_open(vdev);
+		vfio_pci_vf_token_user_add(vdev, 1);
+	}
+	vdev->refcnt++;
+error:
+	mutex_unlock(&vdev->reflck->lock);
+	if (ret)
+		module_put(THIS_MODULE);
+	return ret;
+}
+
+static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = {
+	.name		= "nvlink2gpu-vfio-pci",
+	.open		= nvlink2gpu_vfio_pci_open,
+	.release	= nvlink2gpu_vfio_pci_release,
+	.ioctl		= vfio_pci_core_ioctl,
+	.read		= vfio_pci_core_read,
+	.write		= vfio_pci_core_write,
+	.mmap		= vfio_pci_core_mmap,
+	.request	= vfio_pci_core_request,
+	.match		= vfio_pci_core_match,
+};
+
+static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev,
+		const struct pci_device_id *id)
+{
+	struct nv_vfio_pci_device *nvdev;
+	int ret;
+
+	nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL);
+	if (!nvdev)
+		return -ENOMEM;
+
+	ret = vfio_pci_core_register_device(&nvdev->vdev, pdev,
+			&nvlink2gpu_vfio_pci_ops);
+	if (ret)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	kfree(nvdev);
+	return ret;
+}
+
+static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
+	struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
+	struct nv_vfio_pci_device *nvdev;
+
+	nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, vdev);
+
+	vfio_pci_core_unregister_device(core_vpdev);
+	kfree(nvdev);
+}
+
+static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = {
+	{ PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-16GB */
+	{ PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-32GB */
+	{ PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla V100-SXM3-32GB */
+	{ PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla V100-SXM2-16GB */
+	{ 0, }
+};
+
+static struct pci_driver nvlink2gpu_vfio_pci_driver = {
+	.name			= "nvlink2gpu-vfio-pci",
+	.id_table		= nvlink2gpu_vfio_pci_table,
+	.probe			= nvlink2gpu_vfio_pci_probe,
+	.remove			= nvlink2gpu_vfio_pci_remove,
+#ifdef CONFIG_PCI_IOV
+	.sriov_configure	= vfio_pci_core_sriov_configure,
+#endif
+	.err_handler		= &vfio_pci_core_err_handlers,
+};
+
+#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
+struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
+{
+	if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev))
+		return &nvlink2gpu_vfio_pci_driver;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver);
+#endif
+
+module_pci_driver(nvlink2gpu_vfio_pci_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
new file mode 100644
index 000000000000..ebd5b600b190
--- /dev/null
+++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h
@@ -0,0 +1,24 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2020, Mellanox Technologies, Ltd.  All rights reserved.
+ *     Author: Max Gurtovoy <mgurtovoy@nvidia.com>
+ */
+
+#ifndef NVLINK2GPU_VFIO_PCI_H
+#define NVLINK2GPU_VFIO_PCI_H
+
+#include <linux/pci.h>
+#include <linux/module.h>
+
+#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
+#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE)
+struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev);
+#else
+struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev)
+{
+	return NULL;
+}
+#endif
+#endif
+
+#endif /* NVLINK2GPU_VFIO_PCI_H */
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index dbc0a6559914..8e81ea039f31 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -27,6 +27,10 @@ 
 #include <linux/uaccess.h>
 
 #include "vfio_pci_core.h"
+#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT
+#include "npu2_vfio_pci.h"
+#include "nvlink2gpu_vfio_pci.h"
+#endif
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -142,14 +146,48 @@  static const struct vfio_device_ops vfio_pci_ops = {
 	.match		= vfio_pci_core_match,
 };
 
+/*
+ * This layer is used for backward compatibility. Hopefully it will be
+ * removed in the future.
+ */
+static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev *pdev)
+{
+	switch (pdev->vendor) {
+	case PCI_VENDOR_ID_NVIDIA:
+		switch (pdev->device) {
+		case 0x1db1:
+		case 0x1db5:
+		case 0x1db8:
+		case 0x1df5:
+			return get_nvlink2gpu_vfio_pci_driver(pdev);
+		default:
+			return NULL;
+		}
+	case PCI_VENDOR_ID_IBM:
+		switch (pdev->device) {
+		case 0x04ea:
+			return get_npu2_vfio_pci_driver(pdev);
+		default:
+			return NULL;
+		}
+	}
+
+	return NULL;
+}
+
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct vfio_pci_device *vpdev;
+	struct pci_driver *driver;
 	int ret;
 
 	if (vfio_pci_is_denylisted(pdev))
 		return -EINVAL;
 
+	driver = vfio_pci_get_compat_driver(pdev);
+	if (driver)
+		return driver->probe(pdev, id);
+
 	vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL);
 	if (!vpdev)
 		return -ENOMEM;
@@ -167,14 +205,21 @@  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 static void vfio_pci_remove(struct pci_dev *pdev)
 {
-	struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
-	struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev);
-	struct vfio_pci_device *vpdev;
-
-	vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
-
-	vfio_pci_core_unregister_device(core_vpdev);
-	kfree(vpdev);
+	struct pci_driver *driver;
+
+	driver = vfio_pci_get_compat_driver(pdev);
+	if (driver) {
+		driver->remove(pdev);
+	} else {
+		struct vfio_device *vdev = dev_get_drvdata(&pdev->dev);
+		struct vfio_pci_core_device *core_vpdev;
+		struct vfio_pci_device *vpdev;
+
+		core_vpdev = vfio_device_data(vdev);
+		vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev);
+		vfio_pci_core_unregister_device(core_vpdev);
+		kfree(vpdev);
+	}
 }
 
 static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 4de8e352df9c..f9b39abe54cb 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -354,24 +354,6 @@  int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
 		}
 	}
 
-	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
-	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
-		ret = vfio_pci_nvidia_v100_nvlink2_init(vdev);
-		if (ret && ret != -ENODEV) {
-			pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
-			goto disable_exit;
-		}
-	}
-
-	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
-	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
-		ret = vfio_pci_ibm_npu2_init(vdev);
-		if (ret && ret != -ENODEV) {
-			pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
-			goto disable_exit;
-		}
-	}
-
 	return 0;
 
 disable_exit:
diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h
index 8989443c3086..31f3836e606e 100644
--- a/drivers/vfio/pci/vfio_pci_core.h
+++ b/drivers/vfio/pci/vfio_pci_core.h
@@ -204,20 +204,6 @@  static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
 	return -ENODEV;
 }
 #endif
-#ifdef CONFIG_VFIO_PCI_NVLINK2
-extern int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev);
-extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev);
-#else
-static inline int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev)
-{
-	return -ENODEV;
-}
-
-static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev)
-{
-	return -ENODEV;
-}
-#endif
 
 #ifdef CONFIG_S390
 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,