Message ID | 20210309083357.65467-9-mgurtovoy@nvidia.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Introduce vfio-pci-core subsystem | expand |
On 09/03/2021 19:33, Max Gurtovoy wrote: > The new drivers introduced are nvlink2gpu_vfio_pci.ko and > npu2_vfio_pci.ko. > The first will be responsible for providing special extensions for > NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the > future). The last will be responsible for POWER9 NPU2 unit (NVLink2 host > bus adapter). > > Also, preserve backward compatibility for users that were binding > NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will > be dropped in the future > > Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> > --- > drivers/vfio/pci/Kconfig | 28 +++- > drivers/vfio/pci/Makefile | 7 +- > .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} | 144 ++++++++++++++++- > drivers/vfio/pci/npu2_vfio_pci.h | 24 +++ > ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++- > drivers/vfio/pci/nvlink2gpu_vfio_pci.h | 24 +++ > drivers/vfio/pci/vfio_pci.c | 61 ++++++- > drivers/vfio/pci/vfio_pci_core.c | 18 --- > drivers/vfio/pci/vfio_pci_core.h | 14 -- > 9 files changed, 422 insertions(+), 47 deletions(-) > rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%) > create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h > rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} (67%) > create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h > > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig > index 829e90a2e5a3..88c89863a205 100644 > --- a/drivers/vfio/pci/Kconfig > +++ b/drivers/vfio/pci/Kconfig > @@ -48,8 +48,30 @@ config VFIO_PCI_IGD > > To enable Intel IGD assignment through vfio-pci, say Y. > > -config VFIO_PCI_NVLINK2 > - def_bool y > +config VFIO_PCI_NVLINK2GPU > + tristate "VFIO support for NVIDIA NVLINK2 GPUs" > depends on VFIO_PCI_CORE && PPC_POWERNV > help > - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs > + VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions > + for P9 Witherspoon machine. 
> + > +config VFIO_PCI_NPU2 > + tristate "VFIO support for IBM NPU host bus adapter on P9" > + depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV > + help > + VFIO PCI specific extensions for IBM NVLink2 host bus adapter on P9 > + Witherspoon machine. > + > +config VFIO_PCI_DRIVER_COMPAT > + bool "VFIO PCI backward compatibility for vendor specific extensions" > + default y > + depends on VFIO_PCI > + help > + Say Y here if you want to preserve VFIO PCI backward > + compatibility. vfio_pci.ko will continue to automatically use > + the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to > + a compatible device. > + > + When N is selected the user must bind explicity to the module > + they want to handle the device and vfio_pci.ko will have no > + device specific special behaviors. > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile > index f539f32c9296..86fb62e271fc 100644 > --- a/drivers/vfio/pci/Makefile > +++ b/drivers/vfio/pci/Makefile > @@ -2,10 +2,15 @@ > > obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o > obj-$(CONFIG_VFIO_PCI) += vfio-pci.o > +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o > +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o > > vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o > vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o > -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o vfio_pci_npu2.o > vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o > > vfio-pci-y := vfio_pci.o > + > +npu2-vfio-pci-y := npu2_vfio_pci.o > + > +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o > diff --git a/drivers/vfio/pci/vfio_pci_npu2.c b/drivers/vfio/pci/npu2_vfio_pci.c > similarity index 64% > rename from drivers/vfio/pci/vfio_pci_npu2.c > rename to drivers/vfio/pci/npu2_vfio_pci.c > index 717745256ab3..7071bda0f2b6 100644 > --- a/drivers/vfio/pci/vfio_pci_npu2.c > +++ b/drivers/vfio/pci/npu2_vfio_pci.c > @@ -14,19 +14,28 @@ > * Author: Alex Williamson 
<alex.williamson@redhat.com> > */ > > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include <linux/module.h> > #include <linux/io.h> > #include <linux/pci.h> > #include <linux/uaccess.h> > #include <linux/vfio.h> > +#include <linux/list.h> > #include <linux/sched/mm.h> > #include <linux/mmu_context.h> > #include <asm/kvm_ppc.h> > > #include "vfio_pci_core.h" > +#include "npu2_vfio_pci.h" > > #define CREATE_TRACE_POINTS > #include "npu2_trace.h" > > +#define DRIVER_VERSION "0.1" > +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" > +#define DRIVER_DESC "NPU2 VFIO PCI - User Level meta-driver for POWER9 NPU NVLink2 HBA" > + > EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); > > struct vfio_pci_npu2_data { > @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data { > unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */ > }; > > +struct npu2_vfio_pci_device { > + struct vfio_pci_core_device vdev; > +}; > + > static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev, > char __user *buf, size_t count, loff_t *ppos, bool iswrite) > { > @@ -120,7 +133,7 @@ static const struct vfio_pci_regops vfio_pci_npu2_regops = { > .add_capability = vfio_pci_npu2_add_capability, > }; > > -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) > +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) > { > int ret; > struct vfio_pci_npu2_data *data; > @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) > > return ret; > } > + > +static void npu2_vfio_pci_release(void *device_data) > +{ > + struct vfio_pci_core_device *vdev = device_data; > + > + mutex_lock(&vdev->reflck->lock); > + if (!(--vdev->refcnt)) { > + vfio_pci_vf_token_user_add(vdev, -1); > + vfio_pci_core_spapr_eeh_release(vdev); > + vfio_pci_core_disable(vdev); > + } > + mutex_unlock(&vdev->reflck->lock); > + > + module_put(THIS_MODULE); > +} > + > +static int npu2_vfio_pci_open(void *device_data) > +{ > + struct vfio_pci_core_device *vdev 
= device_data; > + int ret = 0; > + > + if (!try_module_get(THIS_MODULE)) > + return -ENODEV; > + > + mutex_lock(&vdev->reflck->lock); > + > + if (!vdev->refcnt) { > + ret = vfio_pci_core_enable(vdev); > + if (ret) > + goto error; > + > + ret = vfio_pci_ibm_npu2_init(vdev); > + if (ret && ret != -ENODEV) { > + pci_warn(vdev->pdev, > + "Failed to setup NVIDIA NV2 ATSD region\n"); > + vfio_pci_core_disable(vdev); > + goto error; > + } > + ret = 0; > + vfio_pci_probe_mmaps(vdev); > + vfio_pci_core_spapr_eeh_open(vdev); > + vfio_pci_vf_token_user_add(vdev, 1); > + } > + vdev->refcnt++; > +error: > + mutex_unlock(&vdev->reflck->lock); > + if (ret) > + module_put(THIS_MODULE); > + return ret; > +} > + > +static const struct vfio_device_ops npu2_vfio_pci_ops = { > + .name = "npu2-vfio-pci", > + .open = npu2_vfio_pci_open, > + .release = npu2_vfio_pci_release, > + .ioctl = vfio_pci_core_ioctl, > + .read = vfio_pci_core_read, > + .write = vfio_pci_core_write, > + .mmap = vfio_pci_core_mmap, > + .request = vfio_pci_core_request, > + .match = vfio_pci_core_match, > +}; > + > +static int npu2_vfio_pci_probe(struct pci_dev *pdev, > + const struct pci_device_id *id) > +{ > + struct npu2_vfio_pci_device *npvdev; > + int ret; > + > + npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL); > + if (!npvdev) > + return -ENOMEM; > + > + ret = vfio_pci_core_register_device(&npvdev->vdev, pdev, > + &npu2_vfio_pci_ops); > + if (ret) > + goto out_free; > + > + return 0; > + > +out_free: > + kfree(npvdev); > + return ret; > +} > + > +static void npu2_vfio_pci_remove(struct pci_dev *pdev) > +{ > + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); > + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); > + struct npu2_vfio_pci_device *npvdev; > + > + npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, vdev); > + > + vfio_pci_core_unregister_device(core_vpdev); > + kfree(npvdev); > +} > + > +static const struct pci_device_id npu2_vfio_pci_table[] = { > + { 
PCI_VDEVICE(IBM, 0x04ea) }, > + { 0, } > +}; > + > +static struct pci_driver npu2_vfio_pci_driver = { > + .name = "npu2-vfio-pci", > + .id_table = npu2_vfio_pci_table, > + .probe = npu2_vfio_pci_probe, > + .remove = npu2_vfio_pci_remove, > +#ifdef CONFIG_PCI_IOV > + .sriov_configure = vfio_pci_core_sriov_configure, > +#endif > + .err_handler = &vfio_pci_core_err_handlers, > +}; > + > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT > +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) > +{ > + if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev)) > + return &npu2_vfio_pci_driver; > + return NULL; > +} > +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver); > +#endif > + > +module_pci_driver(npu2_vfio_pci_driver); > + > +MODULE_VERSION(DRIVER_VERSION); > +MODULE_LICENSE("GPL v2"); > +MODULE_AUTHOR(DRIVER_AUTHOR); > +MODULE_DESCRIPTION(DRIVER_DESC); > diff --git a/drivers/vfio/pci/npu2_vfio_pci.h b/drivers/vfio/pci/npu2_vfio_pci.h > new file mode 100644 > index 000000000000..92010d340346 > --- /dev/null > +++ b/drivers/vfio/pci/npu2_vfio_pci.h > @@ -0,0 +1,24 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights reserved. 
> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> > + */ > + > +#ifndef NPU2_VFIO_PCI_H > +#define NPU2_VFIO_PCI_H > + > +#include <linux/pci.h> > +#include <linux/module.h> > + > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT > +#if defined(CONFIG_VFIO_PCI_NPU2) || defined(CONFIG_VFIO_PCI_NPU2_MODULE) > +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev); > +#else > +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) > +{ > + return NULL; > +} > +#endif > +#endif > + > +#endif /* NPU2_VFIO_PCI_H */ > diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c > similarity index 67% > rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c > rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c > index 6dce1e78ee82..84a5ac1ce8ac 100644 > --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c > +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c > @@ -1,6 +1,6 @@ > // SPDX-License-Identifier: GPL-2.0-only > /* > - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. > + * VFIO PCI NVIDIA NVLink2 GPUs support. > * > * Copyright (C) 2018 IBM Corp. All rights reserved. 
> * Author: Alexey Kardashevskiy <aik@ozlabs.ru> > @@ -12,6 +12,9 @@ > * Author: Alex Williamson <alex.williamson@redhat.com> > */ > > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include <linux/module.h> > #include <linux/io.h> > #include <linux/pci.h> > #include <linux/uaccess.h> > @@ -21,10 +24,15 @@ > #include <asm/kvm_ppc.h> > > #include "vfio_pci_core.h" > +#include "nvlink2gpu_vfio_pci.h" > > #define CREATE_TRACE_POINTS > #include "nvlink2gpu_trace.h" > > +#define DRIVER_VERSION "0.1" > +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" > +#define DRIVER_DESC "NVLINK2GPU VFIO PCI - User Level meta-driver for NVIDIA NVLink2 GPUs" > + > EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); > EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); > > @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data { > struct notifier_block group_notifier; > }; > > +struct nv_vfio_pci_device { > + struct vfio_pci_core_device vdev; > +}; > + > static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev, > char __user *buf, size_t count, loff_t *ppos, bool iswrite) > { > @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, > return NOTIFY_OK; > } > > -int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) > +static int > +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) > { > int ret; > u64 reg[2]; > @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) > > return ret; > } > + > +static void nvlink2gpu_vfio_pci_release(void *device_data) > +{ > + struct vfio_pci_core_device *vdev = device_data; > + > + mutex_lock(&vdev->reflck->lock); > + if (!(--vdev->refcnt)) { > + vfio_pci_vf_token_user_add(vdev, -1); > + vfio_pci_core_spapr_eeh_release(vdev); > + vfio_pci_core_disable(vdev); > + } > + mutex_unlock(&vdev->reflck->lock); > + > + module_put(THIS_MODULE); > +} > + > +static int nvlink2gpu_vfio_pci_open(void *device_data) > +{ > + struct 
vfio_pci_core_device *vdev = device_data; > + int ret = 0; > + > + if (!try_module_get(THIS_MODULE)) > + return -ENODEV; > + > + mutex_lock(&vdev->reflck->lock); > + > + if (!vdev->refcnt) { > + ret = vfio_pci_core_enable(vdev); > + if (ret) > + goto error; > + > + ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); > + if (ret && ret != -ENODEV) { > + pci_warn(vdev->pdev, > + "Failed to setup NVIDIA NV2 RAM region\n"); > + vfio_pci_core_disable(vdev); > + goto error; > + } > + ret = 0; > + vfio_pci_probe_mmaps(vdev); > + vfio_pci_core_spapr_eeh_open(vdev); > + vfio_pci_vf_token_user_add(vdev, 1); > + } > + vdev->refcnt++; > +error: > + mutex_unlock(&vdev->reflck->lock); > + if (ret) > + module_put(THIS_MODULE); > + return ret; > +} > + > +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = { > + .name = "nvlink2gpu-vfio-pci", > + .open = nvlink2gpu_vfio_pci_open, > + .release = nvlink2gpu_vfio_pci_release, > + .ioctl = vfio_pci_core_ioctl, > + .read = vfio_pci_core_read, > + .write = vfio_pci_core_write, > + .mmap = vfio_pci_core_mmap, > + .request = vfio_pci_core_request, > + .match = vfio_pci_core_match, > +}; > + > +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev, > + const struct pci_device_id *id) > +{ > + struct nv_vfio_pci_device *nvdev; > + int ret; > + > + nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL); > + if (!nvdev) > + return -ENOMEM; > + > + ret = vfio_pci_core_register_device(&nvdev->vdev, pdev, > + &nvlink2gpu_vfio_pci_ops); > + if (ret) > + goto out_free; > + > + return 0; > + > +out_free: > + kfree(nvdev); > + return ret; > +} > + > +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev) > +{ > + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); > + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); > + struct nv_vfio_pci_device *nvdev; > + > + nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, vdev); > + > + vfio_pci_core_unregister_device(core_vpdev); > + kfree(nvdev); > +} > + > +static const 
struct pci_device_id nvlink2gpu_vfio_pci_table[] = { > + { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-16GB */ > + { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-32GB */ > + { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla V100-SXM3-32GB */ > + { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla V100-SXM2-16GB */ Where is this list from? Also, how is this supposed to work at the boot time? Will the kernel try binding let's say this one and nouveau? Which one is going to win? > + { 0, } Why a comma? > +}; > + > +static struct pci_driver nvlink2gpu_vfio_pci_driver = { > + .name = "nvlink2gpu-vfio-pci", > + .id_table = nvlink2gpu_vfio_pci_table, > + .probe = nvlink2gpu_vfio_pci_probe, > + .remove = nvlink2gpu_vfio_pci_remove, > +#ifdef CONFIG_PCI_IOV > + .sriov_configure = vfio_pci_core_sriov_configure, > +#endif What is this IOV business about? > + .err_handler = &vfio_pci_core_err_handlers, > +}; > + > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT > +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) > +{ > + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) > + return &nvlink2gpu_vfio_pci_driver; Why do we need matching PCI ids here instead of looking at the FDT which will work better? > + return NULL; > +} > +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver); > +#endif > + > +module_pci_driver(nvlink2gpu_vfio_pci_driver); > + > +MODULE_VERSION(DRIVER_VERSION); > +MODULE_LICENSE("GPL v2"); > +MODULE_AUTHOR(DRIVER_AUTHOR); > +MODULE_DESCRIPTION(DRIVER_DESC); > diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h > new file mode 100644 > index 000000000000..ebd5b600b190 > --- /dev/null > +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h > @@ -0,0 +1,24 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights reserved. 
> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> > + */ > + > +#ifndef NVLINK2GPU_VFIO_PCI_H > +#define NVLINK2GPU_VFIO_PCI_H > + > +#include <linux/pci.h> > +#include <linux/module.h> > + > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT > +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE) > +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev); > +#else > +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) > +{ > + return NULL; > +} > +#endif > +#endif > + > +#endif /* NVLINK2GPU_VFIO_PCI_H */ > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index dbc0a6559914..8e81ea039f31 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -27,6 +27,10 @@ > #include <linux/uaccess.h> > > #include "vfio_pci_core.h" > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT > +#include "npu2_vfio_pci.h" > +#include "nvlink2gpu_vfio_pci.h" > +#endif > > #define DRIVER_VERSION "0.2" > #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" > @@ -142,14 +146,48 @@ static const struct vfio_device_ops vfio_pci_ops = { > .match = vfio_pci_core_match, > }; > > +/* > + * This layer is used for backward compatibility. Hopefully it will be > + * removed in the future. > + */ > +static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev *pdev) > +{ > + switch (pdev->vendor) { > + case PCI_VENDOR_ID_NVIDIA: > + switch (pdev->device) { > + case 0x1db1: > + case 0x1db5: > + case 0x1db8: > + case 0x1df5: > + return get_nvlink2gpu_vfio_pci_driver(pdev); This does not really need a switch, could simply call these get_xxxx_vfio_pci_driver. 
Thanks, > + default: > + return NULL; > + } > + case PCI_VENDOR_ID_IBM: > + switch (pdev->device) { > + case 0x04ea: > + return get_npu2_vfio_pci_driver(pdev); > + default: > + return NULL; > + } > + } > + > + return NULL; > +} > + > static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > { > struct vfio_pci_device *vpdev; > + struct pci_driver *driver; > int ret; > > if (vfio_pci_is_denylisted(pdev)) > return -EINVAL; > > + driver = vfio_pci_get_compat_driver(pdev); > + if (driver) > + return driver->probe(pdev, id); > + > vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL); > if (!vpdev) > return -ENOMEM; > @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > > static void vfio_pci_remove(struct pci_dev *pdev) > { > - struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); > - struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); > - struct vfio_pci_device *vpdev; > - > - vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); > - > - vfio_pci_core_unregister_device(core_vpdev); > - kfree(vpdev); > + struct pci_driver *driver; > + > + driver = vfio_pci_get_compat_driver(pdev); > + if (driver) { > + driver->remove(pdev); > + } else { > + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); > + struct vfio_pci_core_device *core_vpdev; > + struct vfio_pci_device *vpdev; > + > + core_vpdev = vfio_device_data(vdev); > + vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); > + vfio_pci_core_unregister_device(core_vpdev); > + kfree(vpdev); > + } > } > > static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index 4de8e352df9c..f9b39abe54cb 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) > } > } > > - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && > 
- IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { > - ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); > - if (ret && ret != -ENODEV) { > - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); > - goto disable_exit; > - } > - } > - > - if (pdev->vendor == PCI_VENDOR_ID_IBM && > - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { > - ret = vfio_pci_ibm_npu2_init(vdev); > - if (ret && ret != -ENODEV) { > - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); > - goto disable_exit; > - } > - } > - > return 0; > > disable_exit: > diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h > index 8989443c3086..31f3836e606e 100644 > --- a/drivers/vfio/pci/vfio_pci_core.h > +++ b/drivers/vfio/pci/vfio_pci_core.h > @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) > return -ENODEV; > } > #endif > -#ifdef CONFIG_VFIO_PCI_NVLINK2 > -extern int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev); > -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev); > -#else > -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) > -{ > - return -ENODEV; > -} > - > -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) > -{ > - return -ENODEV; > -} > -#endif > > #ifdef CONFIG_S390 > extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, >
On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote: > > > On 09/03/2021 19:33, Max Gurtovoy wrote: >> The new drivers introduced are nvlink2gpu_vfio_pci.ko and >> npu2_vfio_pci.ko. >> The first will be responsible for providing special extensions for >> NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the >> future). The last will be responsible for POWER9 NPU2 unit (NVLink2 host >> bus adapter). >> >> Also, preserve backward compatibility for users that were binding >> NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will >> be dropped in the future >> >> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >> --- >> drivers/vfio/pci/Kconfig | 28 +++- >> drivers/vfio/pci/Makefile | 7 +- >> .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} | 144 ++++++++++++++++- >> drivers/vfio/pci/npu2_vfio_pci.h | 24 +++ >> ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++- >> drivers/vfio/pci/nvlink2gpu_vfio_pci.h | 24 +++ >> drivers/vfio/pci/vfio_pci.c | 61 ++++++- >> drivers/vfio/pci/vfio_pci_core.c | 18 --- >> drivers/vfio/pci/vfio_pci_core.h | 14 -- >> 9 files changed, 422 insertions(+), 47 deletions(-) >> rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%) >> create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h >> rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => >> nvlink2gpu_vfio_pci.c} (67%) >> create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h >> >> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig >> index 829e90a2e5a3..88c89863a205 100644 >> --- a/drivers/vfio/pci/Kconfig >> +++ b/drivers/vfio/pci/Kconfig >> @@ -48,8 +48,30 @@ config VFIO_PCI_IGD >> To enable Intel IGD assignment through vfio-pci, say Y. 
>> -config VFIO_PCI_NVLINK2 >> - def_bool y >> +config VFIO_PCI_NVLINK2GPU >> + tristate "VFIO support for NVIDIA NVLINK2 GPUs" >> depends on VFIO_PCI_CORE && PPC_POWERNV >> help >> - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs >> + VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions >> + for P9 Witherspoon machine. >> + >> +config VFIO_PCI_NPU2 >> + tristate "VFIO support for IBM NPU host bus adapter on P9" >> + depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV >> + help >> + VFIO PCI specific extensions for IBM NVLink2 host bus adapter >> on P9 >> + Witherspoon machine. >> + >> +config VFIO_PCI_DRIVER_COMPAT >> + bool "VFIO PCI backward compatibility for vendor specific >> extensions" >> + default y >> + depends on VFIO_PCI >> + help >> + Say Y here if you want to preserve VFIO PCI backward >> + compatibility. vfio_pci.ko will continue to automatically use >> + the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to >> + a compatible device. >> + >> + When N is selected the user must bind explicity to the module >> + they want to handle the device and vfio_pci.ko will have no >> + device specific special behaviors. 
>> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile >> index f539f32c9296..86fb62e271fc 100644 >> --- a/drivers/vfio/pci/Makefile >> +++ b/drivers/vfio/pci/Makefile >> @@ -2,10 +2,15 @@ >> obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o >> obj-$(CONFIG_VFIO_PCI) += vfio-pci.o >> +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o >> +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o >> vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o >> vfio_pci_rdwr.o vfio_pci_config.o >> vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o >> -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o >> vfio_pci_npu2.o >> vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o >> vfio-pci-y := vfio_pci.o >> + >> +npu2-vfio-pci-y := npu2_vfio_pci.o >> + >> +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o >> diff --git a/drivers/vfio/pci/vfio_pci_npu2.c >> b/drivers/vfio/pci/npu2_vfio_pci.c >> similarity index 64% >> rename from drivers/vfio/pci/vfio_pci_npu2.c >> rename to drivers/vfio/pci/npu2_vfio_pci.c >> index 717745256ab3..7071bda0f2b6 100644 >> --- a/drivers/vfio/pci/vfio_pci_npu2.c >> +++ b/drivers/vfio/pci/npu2_vfio_pci.c >> @@ -14,19 +14,28 @@ >> * Author: Alex Williamson <alex.williamson@redhat.com> >> */ >> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> + >> +#include <linux/module.h> >> #include <linux/io.h> >> #include <linux/pci.h> >> #include <linux/uaccess.h> >> #include <linux/vfio.h> >> +#include <linux/list.h> >> #include <linux/sched/mm.h> >> #include <linux/mmu_context.h> >> #include <asm/kvm_ppc.h> >> #include "vfio_pci_core.h" >> +#include "npu2_vfio_pci.h" >> #define CREATE_TRACE_POINTS >> #include "npu2_trace.h" >> +#define DRIVER_VERSION "0.1" >> +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" >> +#define DRIVER_DESC "NPU2 VFIO PCI - User Level meta-driver for >> POWER9 NPU NVLink2 HBA" >> + >> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); >> struct vfio_pci_npu2_data { >> @@ -36,6 +45,10 @@ struct 
vfio_pci_npu2_data { >> unsigned int link_speed; /* The link speed from DT's >> ibm,nvlink-speed */ >> }; >> +struct npu2_vfio_pci_device { >> + struct vfio_pci_core_device vdev; >> +}; >> + >> static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev, >> char __user *buf, size_t count, loff_t *ppos, bool iswrite) >> { >> @@ -120,7 +133,7 @@ static const struct vfio_pci_regops >> vfio_pci_npu2_regops = { >> .add_capability = vfio_pci_npu2_add_capability, >> }; >> -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) >> +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) >> { >> int ret; >> struct vfio_pci_npu2_data *data; >> @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct >> vfio_pci_core_device *vdev) >> return ret; >> } >> + >> +static void npu2_vfio_pci_release(void *device_data) >> +{ >> + struct vfio_pci_core_device *vdev = device_data; >> + >> + mutex_lock(&vdev->reflck->lock); >> + if (!(--vdev->refcnt)) { >> + vfio_pci_vf_token_user_add(vdev, -1); >> + vfio_pci_core_spapr_eeh_release(vdev); >> + vfio_pci_core_disable(vdev); >> + } >> + mutex_unlock(&vdev->reflck->lock); >> + >> + module_put(THIS_MODULE); >> +} >> + >> +static int npu2_vfio_pci_open(void *device_data) >> +{ >> + struct vfio_pci_core_device *vdev = device_data; >> + int ret = 0; >> + >> + if (!try_module_get(THIS_MODULE)) >> + return -ENODEV; >> + >> + mutex_lock(&vdev->reflck->lock); >> + >> + if (!vdev->refcnt) { >> + ret = vfio_pci_core_enable(vdev); >> + if (ret) >> + goto error; >> + >> + ret = vfio_pci_ibm_npu2_init(vdev); >> + if (ret && ret != -ENODEV) { >> + pci_warn(vdev->pdev, >> + "Failed to setup NVIDIA NV2 ATSD region\n"); >> + vfio_pci_core_disable(vdev); >> + goto error; >> + } >> + ret = 0; >> + vfio_pci_probe_mmaps(vdev); >> + vfio_pci_core_spapr_eeh_open(vdev); >> + vfio_pci_vf_token_user_add(vdev, 1); >> + } >> + vdev->refcnt++; >> +error: >> + mutex_unlock(&vdev->reflck->lock); >> + if (ret) >> + module_put(THIS_MODULE); >> + 
return ret; >> +} >> + >> +static const struct vfio_device_ops npu2_vfio_pci_ops = { >> + .name = "npu2-vfio-pci", >> + .open = npu2_vfio_pci_open, >> + .release = npu2_vfio_pci_release, >> + .ioctl = vfio_pci_core_ioctl, >> + .read = vfio_pci_core_read, >> + .write = vfio_pci_core_write, >> + .mmap = vfio_pci_core_mmap, >> + .request = vfio_pci_core_request, >> + .match = vfio_pci_core_match, >> +}; >> + >> +static int npu2_vfio_pci_probe(struct pci_dev *pdev, >> + const struct pci_device_id *id) >> +{ >> + struct npu2_vfio_pci_device *npvdev; >> + int ret; >> + >> + npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL); >> + if (!npvdev) >> + return -ENOMEM; >> + >> + ret = vfio_pci_core_register_device(&npvdev->vdev, pdev, >> + &npu2_vfio_pci_ops); >> + if (ret) >> + goto out_free; >> + >> + return 0; >> + >> +out_free: >> + kfree(npvdev); >> + return ret; >> +} >> + >> +static void npu2_vfio_pci_remove(struct pci_dev *pdev) >> +{ >> + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >> + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >> + struct npu2_vfio_pci_device *npvdev; >> + >> + npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, >> vdev); >> + >> + vfio_pci_core_unregister_device(core_vpdev); >> + kfree(npvdev); >> +} >> + >> +static const struct pci_device_id npu2_vfio_pci_table[] = { >> + { PCI_VDEVICE(IBM, 0x04ea) }, >> + { 0, } >> +}; >> + >> +static struct pci_driver npu2_vfio_pci_driver = { >> + .name = "npu2-vfio-pci", >> + .id_table = npu2_vfio_pci_table, >> + .probe = npu2_vfio_pci_probe, >> + .remove = npu2_vfio_pci_remove, >> +#ifdef CONFIG_PCI_IOV >> + .sriov_configure = vfio_pci_core_sriov_configure, >> +#endif >> + .err_handler = &vfio_pci_core_err_handlers, >> +}; >> + >> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) >> +{ >> + if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev)) >> + return &npu2_vfio_pci_driver; >> + return NULL; >> +} >> 
+EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver); >> +#endif >> + >> +module_pci_driver(npu2_vfio_pci_driver); >> + >> +MODULE_VERSION(DRIVER_VERSION); >> +MODULE_LICENSE("GPL v2"); >> +MODULE_AUTHOR(DRIVER_AUTHOR); >> +MODULE_DESCRIPTION(DRIVER_DESC); >> diff --git a/drivers/vfio/pci/npu2_vfio_pci.h >> b/drivers/vfio/pci/npu2_vfio_pci.h >> new file mode 100644 >> index 000000000000..92010d340346 >> --- /dev/null >> +++ b/drivers/vfio/pci/npu2_vfio_pci.h >> @@ -0,0 +1,24 @@ >> +/* SPDX-License-Identifier: GPL-2.0 */ >> +/* >> + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights >> reserved. >> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> >> + */ >> + >> +#ifndef NPU2_VFIO_PCI_H >> +#define NPU2_VFIO_PCI_H >> + >> +#include <linux/pci.h> >> +#include <linux/module.h> >> + >> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >> +#if defined(CONFIG_VFIO_PCI_NPU2) || >> defined(CONFIG_VFIO_PCI_NPU2_MODULE) >> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev); >> +#else >> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) >> +{ >> + return NULL; >> +} >> +#endif >> +#endif >> + >> +#endif /* NPU2_VFIO_PCI_H */ >> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c >> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c >> similarity index 67% >> rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c >> rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c >> index 6dce1e78ee82..84a5ac1ce8ac 100644 >> --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c >> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c >> @@ -1,6 +1,6 @@ >> // SPDX-License-Identifier: GPL-2.0-only >> /* >> - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. >> + * VFIO PCI NVIDIA NVLink2 GPUs support. >> * >> * Copyright (C) 2018 IBM Corp. All rights reserved. 
>> * Author: Alexey Kardashevskiy <aik@ozlabs.ru> >> @@ -12,6 +12,9 @@ >> * Author: Alex Williamson <alex.williamson@redhat.com> >> */ >> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> + >> +#include <linux/module.h> >> #include <linux/io.h> >> #include <linux/pci.h> >> #include <linux/uaccess.h> >> @@ -21,10 +24,15 @@ >> #include <asm/kvm_ppc.h> >> #include "vfio_pci_core.h" >> +#include "nvlink2gpu_vfio_pci.h" >> #define CREATE_TRACE_POINTS >> #include "nvlink2gpu_trace.h" >> +#define DRIVER_VERSION "0.1" >> +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" >> +#define DRIVER_DESC "NVLINK2GPU VFIO PCI - User Level >> meta-driver for NVIDIA NVLink2 GPUs" >> + >> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); >> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); >> @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data { >> struct notifier_block group_notifier; >> }; >> +struct nv_vfio_pci_device { >> + struct vfio_pci_core_device vdev; >> +}; >> + >> static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev, >> char __user *buf, size_t count, loff_t *ppos, bool iswrite) >> { >> @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct >> notifier_block *nb, >> return NOTIFY_OK; >> } >> -int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device >> *vdev) >> +static int >> +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) >> { >> int ret; >> u64 reg[2]; >> @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct >> vfio_pci_core_device *vdev) >> return ret; >> } >> + >> +static void nvlink2gpu_vfio_pci_release(void *device_data) >> +{ >> + struct vfio_pci_core_device *vdev = device_data; >> + >> + mutex_lock(&vdev->reflck->lock); >> + if (!(--vdev->refcnt)) { >> + vfio_pci_vf_token_user_add(vdev, -1); >> + vfio_pci_core_spapr_eeh_release(vdev); >> + vfio_pci_core_disable(vdev); >> + } >> + mutex_unlock(&vdev->reflck->lock); >> + >> + module_put(THIS_MODULE); >> +} >> + >> +static int 
nvlink2gpu_vfio_pci_open(void *device_data) >> +{ >> + struct vfio_pci_core_device *vdev = device_data; >> + int ret = 0; >> + >> + if (!try_module_get(THIS_MODULE)) >> + return -ENODEV; >> + >> + mutex_lock(&vdev->reflck->lock); >> + >> + if (!vdev->refcnt) { >> + ret = vfio_pci_core_enable(vdev); >> + if (ret) >> + goto error; >> + >> + ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); >> + if (ret && ret != -ENODEV) { >> + pci_warn(vdev->pdev, >> + "Failed to setup NVIDIA NV2 RAM region\n"); >> + vfio_pci_core_disable(vdev); >> + goto error; >> + } >> + ret = 0; >> + vfio_pci_probe_mmaps(vdev); >> + vfio_pci_core_spapr_eeh_open(vdev); >> + vfio_pci_vf_token_user_add(vdev, 1); >> + } >> + vdev->refcnt++; >> +error: >> + mutex_unlock(&vdev->reflck->lock); >> + if (ret) >> + module_put(THIS_MODULE); >> + return ret; >> +} >> + >> +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = { >> + .name = "nvlink2gpu-vfio-pci", >> + .open = nvlink2gpu_vfio_pci_open, >> + .release = nvlink2gpu_vfio_pci_release, >> + .ioctl = vfio_pci_core_ioctl, >> + .read = vfio_pci_core_read, >> + .write = vfio_pci_core_write, >> + .mmap = vfio_pci_core_mmap, >> + .request = vfio_pci_core_request, >> + .match = vfio_pci_core_match, >> +}; >> + >> +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev, >> + const struct pci_device_id *id) >> +{ >> + struct nv_vfio_pci_device *nvdev; >> + int ret; >> + >> + nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL); >> + if (!nvdev) >> + return -ENOMEM; >> + >> + ret = vfio_pci_core_register_device(&nvdev->vdev, pdev, >> + &nvlink2gpu_vfio_pci_ops); >> + if (ret) >> + goto out_free; >> + >> + return 0; >> + >> +out_free: >> + kfree(nvdev); >> + return ret; >> +} >> + >> +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev) >> +{ >> + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >> + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >> + struct nv_vfio_pci_device *nvdev; >> + >> + nvdev = 
container_of(core_vpdev, struct nv_vfio_pci_device, vdev); >> + >> + vfio_pci_core_unregister_device(core_vpdev); >> + kfree(nvdev); >> +} >> + >> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = { >> + { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla >> V100-SXM2-16GB */ >> + { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla >> V100-SXM2-32GB */ >> + { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla >> V100-SXM3-32GB */ >> + { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla >> V100-SXM2-16GB */ > > > Where is this list from? > > Also, how is this supposed to work at the boot time? Will the kernel > try binding let's say this one and nouveau? Which one is going to win? At boot time nouveau driver will win since the vfio drivers don't declare MODULE_DEVICE_TABLE > > >> + { 0, } > > > Why a comma? I'll remove the comma. > >> +}; > > > >> + >> +static struct pci_driver nvlink2gpu_vfio_pci_driver = { >> + .name = "nvlink2gpu-vfio-pci", >> + .id_table = nvlink2gpu_vfio_pci_table, >> + .probe = nvlink2gpu_vfio_pci_probe, >> + .remove = nvlink2gpu_vfio_pci_remove, >> +#ifdef CONFIG_PCI_IOV >> + .sriov_configure = vfio_pci_core_sriov_configure, >> +#endif > > > What is this IOV business about? from vfio_pci #ifdef CONFIG_PCI_IOV module_param(enable_sriov, bool, 0644); MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF."); #endif > > >> + .err_handler = &vfio_pci_core_err_handlers, >> +}; >> + >> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) >> +{ >> + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) >> + return &nvlink2gpu_vfio_pci_driver; > > > Why do we need matching PCI ids here instead of looking at the FDT > which will work better? what is FDT ? 
any is it better to use it instead of match_id ? > > >> + return NULL; >> +} >> +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver); >> +#endif >> + >> +module_pci_driver(nvlink2gpu_vfio_pci_driver); >> + >> +MODULE_VERSION(DRIVER_VERSION); >> +MODULE_LICENSE("GPL v2"); >> +MODULE_AUTHOR(DRIVER_AUTHOR); >> +MODULE_DESCRIPTION(DRIVER_DESC); >> diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >> new file mode 100644 >> index 000000000000..ebd5b600b190 >> --- /dev/null >> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >> @@ -0,0 +1,24 @@ >> +/* SPDX-License-Identifier: GPL-2.0 */ >> +/* >> + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights >> reserved. >> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> >> + */ >> + >> +#ifndef NVLINK2GPU_VFIO_PCI_H >> +#define NVLINK2GPU_VFIO_PCI_H >> + >> +#include <linux/pci.h> >> +#include <linux/module.h> >> + >> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >> +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || >> defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE) >> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev >> *pdev); >> +#else >> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) >> +{ >> + return NULL; >> +} >> +#endif >> +#endif >> + >> +#endif /* NVLINK2GPU_VFIO_PCI_H */ >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c >> index dbc0a6559914..8e81ea039f31 100644 >> --- a/drivers/vfio/pci/vfio_pci.c >> +++ b/drivers/vfio/pci/vfio_pci.c >> @@ -27,6 +27,10 @@ >> #include <linux/uaccess.h> >> #include "vfio_pci_core.h" >> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >> +#include "npu2_vfio_pci.h" >> +#include "nvlink2gpu_vfio_pci.h" >> +#endif >> #define DRIVER_VERSION "0.2" >> #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" >> @@ -142,14 +146,48 @@ static const struct vfio_device_ops >> vfio_pci_ops = { >> .match = vfio_pci_core_match, >> }; >> +/* >> + * This layer is used for backward compatibility. 
Hopefully it will be >> + * removed in the future. >> + */ >> +static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev >> *pdev) >> +{ >> + switch (pdev->vendor) { >> + case PCI_VENDOR_ID_NVIDIA: >> + switch (pdev->device) { >> + case 0x1db1: >> + case 0x1db5: >> + case 0x1db8: >> + case 0x1df5: >> + return get_nvlink2gpu_vfio_pci_driver(pdev); > > This does not really need a switch, could simply call these > get_xxxx_vfio_pci_driver. Thanks, maybe the result will be the same but I don't think we need to send all NVIDIA devices or IBM devices to this function. we can maybe export the tables from the vfio_vendor driver and match it here. > > >> + default: >> + return NULL; >> + } >> + case PCI_VENDOR_ID_IBM: >> + switch (pdev->device) { >> + case 0x04ea: >> + return get_npu2_vfio_pci_driver(pdev); >> + default: >> + return NULL; >> + } >> + } >> + >> + return NULL; >> +} >> + >> static int vfio_pci_probe(struct pci_dev *pdev, const struct >> pci_device_id *id) >> { >> struct vfio_pci_device *vpdev; >> + struct pci_driver *driver; >> int ret; >> if (vfio_pci_is_denylisted(pdev)) >> return -EINVAL; >> + driver = vfio_pci_get_compat_driver(pdev); >> + if (driver) >> + return driver->probe(pdev, id); >> + >> vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL); >> if (!vpdev) >> return -ENOMEM; >> @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, >> const struct pci_device_id *id) >> static void vfio_pci_remove(struct pci_dev *pdev) >> { >> - struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >> - struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >> - struct vfio_pci_device *vpdev; >> - >> - vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); >> - >> - vfio_pci_core_unregister_device(core_vpdev); >> - kfree(vpdev); >> + struct pci_driver *driver; >> + >> + driver = vfio_pci_get_compat_driver(pdev); >> + if (driver) { >> + driver->remove(pdev); >> + } else { >> + struct vfio_device *vdev = 
dev_get_drvdata(&pdev->dev); >> + struct vfio_pci_core_device *core_vpdev; >> + struct vfio_pci_device *vpdev; >> + >> + core_vpdev = vfio_device_data(vdev); >> + vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); >> + vfio_pci_core_unregister_device(core_vpdev); >> + kfree(vpdev); >> + } >> } >> static int vfio_pci_sriov_configure(struct pci_dev *pdev, int >> nr_virtfn) >> diff --git a/drivers/vfio/pci/vfio_pci_core.c >> b/drivers/vfio/pci/vfio_pci_core.c >> index 4de8e352df9c..f9b39abe54cb 100644 >> --- a/drivers/vfio/pci/vfio_pci_core.c >> +++ b/drivers/vfio/pci/vfio_pci_core.c >> @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct >> vfio_pci_core_device *vdev) >> } >> } >> - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && >> - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { >> - ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); >> - if (ret && ret != -ENODEV) { >> - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); >> - goto disable_exit; >> - } >> - } >> - >> - if (pdev->vendor == PCI_VENDOR_ID_IBM && >> - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { >> - ret = vfio_pci_ibm_npu2_init(vdev); >> - if (ret && ret != -ENODEV) { >> - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); >> - goto disable_exit; >> - } >> - } >> - >> return 0; >> disable_exit: >> diff --git a/drivers/vfio/pci/vfio_pci_core.h >> b/drivers/vfio/pci/vfio_pci_core.h >> index 8989443c3086..31f3836e606e 100644 >> --- a/drivers/vfio/pci/vfio_pci_core.h >> +++ b/drivers/vfio/pci/vfio_pci_core.h >> @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct >> vfio_pci_core_device *vdev) >> return -ENODEV; >> } >> #endif >> -#ifdef CONFIG_VFIO_PCI_NVLINK2 >> -extern int vfio_pci_nvidia_v100_nvlink2_init(struct >> vfio_pci_core_device *vdev); >> -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev); >> -#else >> -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct >> vfio_pci_core_device *vdev) >> -{ >> - return -ENODEV; >> -} >> - >> -static inline int 
vfio_pci_ibm_npu2_init(struct vfio_pci_core_device >> *vdev) >> -{ >> - return -ENODEV; >> -} >> -#endif >> #ifdef CONFIG_S390 >> extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device >> *vdev, >> >
On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote: > > > + .err_handler = &vfio_pci_core_err_handlers, > > > +}; > > > + > > > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT > > > +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) > > > +{ > > > + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) > > > + return &nvlink2gpu_vfio_pci_driver; > > > > > > Why do we need matching PCI ids here instead of looking at the FDT which > > will work better? > > what is FDT ? and is it better to use it instead of match_id ? This is emulating the device_driver match for the pci_driver. I don't think we can combine FDT matching with pci_driver, can we? Jason
On 10/03/2021 23:57, Max Gurtovoy wrote: > > On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote: >> >> >> On 09/03/2021 19:33, Max Gurtovoy wrote: >>> The new drivers introduced are nvlink2gpu_vfio_pci.ko and >>> npu2_vfio_pci.ko. >>> The first will be responsible for providing special extensions for >>> NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the >>> future). The last will be responsible for POWER9 NPU2 unit (NVLink2 host >>> bus adapter). >>> >>> Also, preserve backward compatibility for users that were binding >>> NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will >>> be dropped in the future >>> >>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >>> --- >>> drivers/vfio/pci/Kconfig | 28 +++- >>> drivers/vfio/pci/Makefile | 7 +- >>> .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} | 144 ++++++++++++++++- >>> drivers/vfio/pci/npu2_vfio_pci.h | 24 +++ >>> ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++- >>> drivers/vfio/pci/nvlink2gpu_vfio_pci.h | 24 +++ >>> drivers/vfio/pci/vfio_pci.c | 61 ++++++- >>> drivers/vfio/pci/vfio_pci_core.c | 18 --- >>> drivers/vfio/pci/vfio_pci_core.h | 14 -- >>> 9 files changed, 422 insertions(+), 47 deletions(-) >>> rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%) >>> create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h >>> rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => >>> nvlink2gpu_vfio_pci.c} (67%) >>> create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>> >>> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig >>> index 829e90a2e5a3..88c89863a205 100644 >>> --- a/drivers/vfio/pci/Kconfig >>> +++ b/drivers/vfio/pci/Kconfig >>> @@ -48,8 +48,30 @@ config VFIO_PCI_IGD >>> To enable Intel IGD assignment through vfio-pci, say Y. 
>>> -config VFIO_PCI_NVLINK2 >>> - def_bool y >>> +config VFIO_PCI_NVLINK2GPU >>> + tristate "VFIO support for NVIDIA NVLINK2 GPUs" >>> depends on VFIO_PCI_CORE && PPC_POWERNV >>> help >>> - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs >>> + VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions >>> + for P9 Witherspoon machine. >>> + >>> +config VFIO_PCI_NPU2 >>> + tristate "VFIO support for IBM NPU host bus adapter on P9" >>> + depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV >>> + help >>> + VFIO PCI specific extensions for IBM NVLink2 host bus adapter >>> on P9 >>> + Witherspoon machine. >>> + >>> +config VFIO_PCI_DRIVER_COMPAT >>> + bool "VFIO PCI backward compatibility for vendor specific >>> extensions" >>> + default y >>> + depends on VFIO_PCI >>> + help >>> + Say Y here if you want to preserve VFIO PCI backward >>> + compatibility. vfio_pci.ko will continue to automatically use >>> + the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to >>> + a compatible device. >>> + >>> + When N is selected the user must bind explicity to the module >>> + they want to handle the device and vfio_pci.ko will have no >>> + device specific special behaviors. 
>>> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile >>> index f539f32c9296..86fb62e271fc 100644 >>> --- a/drivers/vfio/pci/Makefile >>> +++ b/drivers/vfio/pci/Makefile >>> @@ -2,10 +2,15 @@ >>> obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o >>> obj-$(CONFIG_VFIO_PCI) += vfio-pci.o >>> +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o >>> +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o >>> vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o >>> vfio_pci_rdwr.o vfio_pci_config.o >>> vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o >>> -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o >>> vfio_pci_npu2.o >>> vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o >>> vfio-pci-y := vfio_pci.o >>> + >>> +npu2-vfio-pci-y := npu2_vfio_pci.o >>> + >>> +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o >>> diff --git a/drivers/vfio/pci/vfio_pci_npu2.c >>> b/drivers/vfio/pci/npu2_vfio_pci.c >>> similarity index 64% >>> rename from drivers/vfio/pci/vfio_pci_npu2.c >>> rename to drivers/vfio/pci/npu2_vfio_pci.c >>> index 717745256ab3..7071bda0f2b6 100644 >>> --- a/drivers/vfio/pci/vfio_pci_npu2.c >>> +++ b/drivers/vfio/pci/npu2_vfio_pci.c >>> @@ -14,19 +14,28 @@ >>> * Author: Alex Williamson <alex.williamson@redhat.com> >>> */ >>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >>> + >>> +#include <linux/module.h> >>> #include <linux/io.h> >>> #include <linux/pci.h> >>> #include <linux/uaccess.h> >>> #include <linux/vfio.h> >>> +#include <linux/list.h> >>> #include <linux/sched/mm.h> >>> #include <linux/mmu_context.h> >>> #include <asm/kvm_ppc.h> >>> #include "vfio_pci_core.h" >>> +#include "npu2_vfio_pci.h" >>> #define CREATE_TRACE_POINTS >>> #include "npu2_trace.h" >>> +#define DRIVER_VERSION "0.1" >>> +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" >>> +#define DRIVER_DESC "NPU2 VFIO PCI - User Level meta-driver for >>> POWER9 NPU NVLink2 HBA" >>> + >>> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); >>> struct 
vfio_pci_npu2_data { >>> @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data { >>> unsigned int link_speed; /* The link speed from DT's >>> ibm,nvlink-speed */ >>> }; >>> +struct npu2_vfio_pci_device { >>> + struct vfio_pci_core_device vdev; >>> +}; >>> + >>> static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev, >>> char __user *buf, size_t count, loff_t *ppos, bool iswrite) >>> { >>> @@ -120,7 +133,7 @@ static const struct vfio_pci_regops >>> vfio_pci_npu2_regops = { >>> .add_capability = vfio_pci_npu2_add_capability, >>> }; >>> -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) >>> +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) >>> { >>> int ret; >>> struct vfio_pci_npu2_data *data; >>> @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct >>> vfio_pci_core_device *vdev) >>> return ret; >>> } >>> + >>> +static void npu2_vfio_pci_release(void *device_data) >>> +{ >>> + struct vfio_pci_core_device *vdev = device_data; >>> + >>> + mutex_lock(&vdev->reflck->lock); >>> + if (!(--vdev->refcnt)) { >>> + vfio_pci_vf_token_user_add(vdev, -1); >>> + vfio_pci_core_spapr_eeh_release(vdev); >>> + vfio_pci_core_disable(vdev); >>> + } >>> + mutex_unlock(&vdev->reflck->lock); >>> + >>> + module_put(THIS_MODULE); >>> +} >>> + >>> +static int npu2_vfio_pci_open(void *device_data) >>> +{ >>> + struct vfio_pci_core_device *vdev = device_data; >>> + int ret = 0; >>> + >>> + if (!try_module_get(THIS_MODULE)) >>> + return -ENODEV; >>> + >>> + mutex_lock(&vdev->reflck->lock); >>> + >>> + if (!vdev->refcnt) { >>> + ret = vfio_pci_core_enable(vdev); >>> + if (ret) >>> + goto error; >>> + >>> + ret = vfio_pci_ibm_npu2_init(vdev); >>> + if (ret && ret != -ENODEV) { >>> + pci_warn(vdev->pdev, >>> + "Failed to setup NVIDIA NV2 ATSD region\n"); >>> + vfio_pci_core_disable(vdev); >>> + goto error; >>> + } >>> + ret = 0; >>> + vfio_pci_probe_mmaps(vdev); >>> + vfio_pci_core_spapr_eeh_open(vdev); >>> + vfio_pci_vf_token_user_add(vdev, 1); >>> + } >>> + 
vdev->refcnt++; >>> +error: >>> + mutex_unlock(&vdev->reflck->lock); >>> + if (ret) >>> + module_put(THIS_MODULE); >>> + return ret; >>> +} >>> + >>> +static const struct vfio_device_ops npu2_vfio_pci_ops = { >>> + .name = "npu2-vfio-pci", >>> + .open = npu2_vfio_pci_open, >>> + .release = npu2_vfio_pci_release, >>> + .ioctl = vfio_pci_core_ioctl, >>> + .read = vfio_pci_core_read, >>> + .write = vfio_pci_core_write, >>> + .mmap = vfio_pci_core_mmap, >>> + .request = vfio_pci_core_request, >>> + .match = vfio_pci_core_match, >>> +}; >>> + >>> +static int npu2_vfio_pci_probe(struct pci_dev *pdev, >>> + const struct pci_device_id *id) >>> +{ >>> + struct npu2_vfio_pci_device *npvdev; >>> + int ret; >>> + >>> + npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL); >>> + if (!npvdev) >>> + return -ENOMEM; >>> + >>> + ret = vfio_pci_core_register_device(&npvdev->vdev, pdev, >>> + &npu2_vfio_pci_ops); >>> + if (ret) >>> + goto out_free; >>> + >>> + return 0; >>> + >>> +out_free: >>> + kfree(npvdev); >>> + return ret; >>> +} >>> + >>> +static void npu2_vfio_pci_remove(struct pci_dev *pdev) >>> +{ >>> + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >>> + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >>> + struct npu2_vfio_pci_device *npvdev; >>> + >>> + npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, >>> vdev); >>> + >>> + vfio_pci_core_unregister_device(core_vpdev); >>> + kfree(npvdev); >>> +} >>> + >>> +static const struct pci_device_id npu2_vfio_pci_table[] = { >>> + { PCI_VDEVICE(IBM, 0x04ea) }, >>> + { 0, } >>> +}; >>> + >>> +static struct pci_driver npu2_vfio_pci_driver = { >>> + .name = "npu2-vfio-pci", >>> + .id_table = npu2_vfio_pci_table, >>> + .probe = npu2_vfio_pci_probe, >>> + .remove = npu2_vfio_pci_remove, >>> +#ifdef CONFIG_PCI_IOV >>> + .sriov_configure = vfio_pci_core_sriov_configure, >>> +#endif >>> + .err_handler = &vfio_pci_core_err_handlers, >>> +}; >>> + >>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>> +struct 
pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) >>> +{ >>> + if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev)) >>> + return &npu2_vfio_pci_driver; >>> + return NULL; >>> +} >>> +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver); >>> +#endif >>> + >>> +module_pci_driver(npu2_vfio_pci_driver); >>> + >>> +MODULE_VERSION(DRIVER_VERSION); >>> +MODULE_LICENSE("GPL v2"); >>> +MODULE_AUTHOR(DRIVER_AUTHOR); >>> +MODULE_DESCRIPTION(DRIVER_DESC); >>> diff --git a/drivers/vfio/pci/npu2_vfio_pci.h >>> b/drivers/vfio/pci/npu2_vfio_pci.h >>> new file mode 100644 >>> index 000000000000..92010d340346 >>> --- /dev/null >>> +++ b/drivers/vfio/pci/npu2_vfio_pci.h >>> @@ -0,0 +1,24 @@ >>> +/* SPDX-License-Identifier: GPL-2.0 */ >>> +/* >>> + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights >>> reserved. >>> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> >>> + */ >>> + >>> +#ifndef NPU2_VFIO_PCI_H >>> +#define NPU2_VFIO_PCI_H >>> + >>> +#include <linux/pci.h> >>> +#include <linux/module.h> >>> + >>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>> +#if defined(CONFIG_VFIO_PCI_NPU2) || >>> defined(CONFIG_VFIO_PCI_NPU2_MODULE) >>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev); >>> +#else >>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) >>> +{ >>> + return NULL; >>> +} >>> +#endif >>> +#endif >>> + >>> +#endif /* NPU2_VFIO_PCI_H */ >>> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c >>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c >>> similarity index 67% >>> rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c >>> rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c >>> index 6dce1e78ee82..84a5ac1ce8ac 100644 >>> --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c >>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c >>> @@ -1,6 +1,6 @@ >>> // SPDX-License-Identifier: GPL-2.0-only >>> /* >>> - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. >>> + * VFIO PCI NVIDIA NVLink2 GPUs support. >>> * >>> * Copyright (C) 2018 IBM Corp. 
All rights reserved. >>> * Author: Alexey Kardashevskiy <aik@ozlabs.ru> >>> @@ -12,6 +12,9 @@ >>> * Author: Alex Williamson <alex.williamson@redhat.com> >>> */ >>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >>> + >>> +#include <linux/module.h> >>> #include <linux/io.h> >>> #include <linux/pci.h> >>> #include <linux/uaccess.h> >>> @@ -21,10 +24,15 @@ >>> #include <asm/kvm_ppc.h> >>> #include "vfio_pci_core.h" >>> +#include "nvlink2gpu_vfio_pci.h" >>> #define CREATE_TRACE_POINTS >>> #include "nvlink2gpu_trace.h" >>> +#define DRIVER_VERSION "0.1" >>> +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" >>> +#define DRIVER_DESC "NVLINK2GPU VFIO PCI - User Level >>> meta-driver for NVIDIA NVLink2 GPUs" >>> + >>> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); >>> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); >>> @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data { >>> struct notifier_block group_notifier; >>> }; >>> +struct nv_vfio_pci_device { >>> + struct vfio_pci_core_device vdev; >>> +}; >>> + >>> static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev, >>> char __user *buf, size_t count, loff_t *ppos, bool iswrite) >>> { >>> @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct >>> notifier_block *nb, >>> return NOTIFY_OK; >>> } >>> -int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device >>> *vdev) >>> +static int >>> +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) >>> { >>> int ret; >>> u64 reg[2]; >>> @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct >>> vfio_pci_core_device *vdev) >>> return ret; >>> } >>> + >>> +static void nvlink2gpu_vfio_pci_release(void *device_data) >>> +{ >>> + struct vfio_pci_core_device *vdev = device_data; >>> + >>> + mutex_lock(&vdev->reflck->lock); >>> + if (!(--vdev->refcnt)) { >>> + vfio_pci_vf_token_user_add(vdev, -1); >>> + vfio_pci_core_spapr_eeh_release(vdev); >>> + vfio_pci_core_disable(vdev); >>> + } >>> + 
mutex_unlock(&vdev->reflck->lock); >>> + >>> + module_put(THIS_MODULE); >>> +} >>> + >>> +static int nvlink2gpu_vfio_pci_open(void *device_data) >>> +{ >>> + struct vfio_pci_core_device *vdev = device_data; >>> + int ret = 0; >>> + >>> + if (!try_module_get(THIS_MODULE)) >>> + return -ENODEV; >>> + >>> + mutex_lock(&vdev->reflck->lock); >>> + >>> + if (!vdev->refcnt) { >>> + ret = vfio_pci_core_enable(vdev); >>> + if (ret) >>> + goto error; >>> + >>> + ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); >>> + if (ret && ret != -ENODEV) { >>> + pci_warn(vdev->pdev, >>> + "Failed to setup NVIDIA NV2 RAM region\n"); >>> + vfio_pci_core_disable(vdev); >>> + goto error; >>> + } >>> + ret = 0; >>> + vfio_pci_probe_mmaps(vdev); >>> + vfio_pci_core_spapr_eeh_open(vdev); >>> + vfio_pci_vf_token_user_add(vdev, 1); >>> + } >>> + vdev->refcnt++; >>> +error: >>> + mutex_unlock(&vdev->reflck->lock); >>> + if (ret) >>> + module_put(THIS_MODULE); >>> + return ret; >>> +} >>> + >>> +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = { >>> + .name = "nvlink2gpu-vfio-pci", >>> + .open = nvlink2gpu_vfio_pci_open, >>> + .release = nvlink2gpu_vfio_pci_release, >>> + .ioctl = vfio_pci_core_ioctl, >>> + .read = vfio_pci_core_read, >>> + .write = vfio_pci_core_write, >>> + .mmap = vfio_pci_core_mmap, >>> + .request = vfio_pci_core_request, >>> + .match = vfio_pci_core_match, >>> +}; >>> + >>> +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev, >>> + const struct pci_device_id *id) >>> +{ >>> + struct nv_vfio_pci_device *nvdev; >>> + int ret; >>> + >>> + nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL); >>> + if (!nvdev) >>> + return -ENOMEM; >>> + >>> + ret = vfio_pci_core_register_device(&nvdev->vdev, pdev, >>> + &nvlink2gpu_vfio_pci_ops); >>> + if (ret) >>> + goto out_free; >>> + >>> + return 0; >>> + >>> +out_free: >>> + kfree(nvdev); >>> + return ret; >>> +} >>> + >>> +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev) >>> +{ >>> + struct vfio_device *vdev = 
dev_get_drvdata(&pdev->dev); >>> + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >>> + struct nv_vfio_pci_device *nvdev; >>> + >>> + nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, vdev); >>> + >>> + vfio_pci_core_unregister_device(core_vpdev); >>> + kfree(nvdev); >>> +} >>> + >>> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = { >>> + { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla >>> V100-SXM2-16GB */ >>> + { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla >>> V100-SXM2-32GB */ >>> + { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla >>> V100-SXM3-32GB */ >>> + { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla >>> V100-SXM2-16GB */ >> >> >> Where is this list from? >> >> Also, how is this supposed to work at the boot time? Will the kernel >> try binding let's say this one and nouveau? Which one is going to win? > > At boot time nouveau driver will win since the vfio drivers don't > declare MODULE_DEVICE_TABLE ok but where is the list from anyway? > > >> >> >>> + { 0, } >> >> >> Why a comma? > > I'll remove the comma. > > >> >>> +}; >> >> >> >>> + >>> +static struct pci_driver nvlink2gpu_vfio_pci_driver = { >>> + .name = "nvlink2gpu-vfio-pci", >>> + .id_table = nvlink2gpu_vfio_pci_table, >>> + .probe = nvlink2gpu_vfio_pci_probe, >>> + .remove = nvlink2gpu_vfio_pci_remove, >>> +#ifdef CONFIG_PCI_IOV >>> + .sriov_configure = vfio_pci_core_sriov_configure, >>> +#endif >> >> >> What is this IOV business about? > > from vfio_pci > > #ifdef CONFIG_PCI_IOV > module_param(enable_sriov, bool, 0644); > MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV > configuration. 
Enabling SR-IOV on a PF typically requires support of > the userspace PF driver, enabling VFs without such support may result in > non-functional VFs or PF."); > #endif I know what IOV is in general :) What I meant to say was that I am pretty sure these GPUs cannot do IOV so this does not need to be in these NVLink drivers. > > >> >> >>> + .err_handler = &vfio_pci_core_err_handlers, >>> +}; >>> + >>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) >>> +{ >>> + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) >>> + return &nvlink2gpu_vfio_pci_driver; >> >> >> Why do we need matching PCI ids here instead of looking at the FDT >> which will work better? > > what is FDT ? any is it better to use it instead of match_id ? Flattened Device Tree - a way for the firmware to pass the configuration to the OS. This data tells if there are NVLinks and what they are linked to. This defines if the feature is available as it should work with any GPU in this form factor. > >> >> >>> + return NULL; >>> +} >>> +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver); >>> +#endif >>> + >>> +module_pci_driver(nvlink2gpu_vfio_pci_driver); >>> + >>> +MODULE_VERSION(DRIVER_VERSION); >>> +MODULE_LICENSE("GPL v2"); >>> +MODULE_AUTHOR(DRIVER_AUTHOR); >>> +MODULE_DESCRIPTION(DRIVER_DESC); >>> diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>> new file mode 100644 >>> index 000000000000..ebd5b600b190 >>> --- /dev/null >>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>> @@ -0,0 +1,24 @@ >>> +/* SPDX-License-Identifier: GPL-2.0 */ >>> +/* >>> + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights >>> reserved. 
>>> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> >>> + */ >>> + >>> +#ifndef NVLINK2GPU_VFIO_PCI_H >>> +#define NVLINK2GPU_VFIO_PCI_H >>> + >>> +#include <linux/pci.h> >>> +#include <linux/module.h> >>> + >>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>> +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || >>> defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE) >>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev >>> *pdev); >>> +#else >>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) >>> +{ >>> + return NULL; >>> +} >>> +#endif >>> +#endif >>> + >>> +#endif /* NVLINK2GPU_VFIO_PCI_H */ >>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c >>> index dbc0a6559914..8e81ea039f31 100644 >>> --- a/drivers/vfio/pci/vfio_pci.c >>> +++ b/drivers/vfio/pci/vfio_pci.c >>> @@ -27,6 +27,10 @@ >>> #include <linux/uaccess.h> >>> #include "vfio_pci_core.h" >>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>> +#include "npu2_vfio_pci.h" >>> +#include "nvlink2gpu_vfio_pci.h" >>> +#endif >>> #define DRIVER_VERSION "0.2" >>> #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" >>> @@ -142,14 +146,48 @@ static const struct vfio_device_ops >>> vfio_pci_ops = { >>> .match = vfio_pci_core_match, >>> }; >>> +/* >>> + * This layer is used for backward compatibility. Hopefully it will be >>> + * removed in the future. >>> + */ >>> +static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev >>> *pdev) >>> +{ >>> + switch (pdev->vendor) { >>> + case PCI_VENDOR_ID_NVIDIA: >>> + switch (pdev->device) { >>> + case 0x1db1: >>> + case 0x1db5: >>> + case 0x1db8: >>> + case 0x1df5: >>> + return get_nvlink2gpu_vfio_pci_driver(pdev); >> >> This does not really need a switch, could simply call these >> get_xxxx_vfio_pci_driver. Thanks, > > maybe the result will be the same but I don't think we need to send all > NVIDIA devices or IBM devices to this function. 
We can tolerate this on POWER (the check is really cheap) and for everybody else this driver won't even compile. > we can maybe export the tables from the vfio_vendor driver and match it > here. I am still missing the point of device matching. It won't bind by default at the boot time and it won't make the existing user life any easier as they use libvirt which overrides this anyway. >> >> >>> + default: >>> + return NULL; >>> + } >>> + case PCI_VENDOR_ID_IBM: >>> + switch (pdev->device) { >>> + case 0x04ea: >>> + return get_npu2_vfio_pci_driver(pdev); >>> + default: >>> + return NULL; >>> + } >>> + } >>> + >>> + return NULL; >>> +} >>> + >>> static int vfio_pci_probe(struct pci_dev *pdev, const struct >>> pci_device_id *id) >>> { >>> struct vfio_pci_device *vpdev; >>> + struct pci_driver *driver; >>> int ret; >>> if (vfio_pci_is_denylisted(pdev)) >>> return -EINVAL; >>> + driver = vfio_pci_get_compat_driver(pdev); >>> + if (driver) >>> + return driver->probe(pdev, id); >>> + >>> vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL); >>> if (!vpdev) >>> return -ENOMEM; >>> @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, >>> const struct pci_device_id *id) >>> static void vfio_pci_remove(struct pci_dev *pdev) >>> { >>> - struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >>> - struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >>> - struct vfio_pci_device *vpdev; >>> - >>> - vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); >>> - >>> - vfio_pci_core_unregister_device(core_vpdev); >>> - kfree(vpdev); >>> + struct pci_driver *driver; >>> + >>> + driver = vfio_pci_get_compat_driver(pdev); >>> + if (driver) { >>> + driver->remove(pdev); >>> + } else { >>> + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >>> + struct vfio_pci_core_device *core_vpdev; >>> + struct vfio_pci_device *vpdev; >>> + >>> + core_vpdev = vfio_device_data(vdev); >>> + vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); >>> + 
vfio_pci_core_unregister_device(core_vpdev); >>> + kfree(vpdev); >>> + } >>> } >>> static int vfio_pci_sriov_configure(struct pci_dev *pdev, int >>> nr_virtfn) >>> diff --git a/drivers/vfio/pci/vfio_pci_core.c >>> b/drivers/vfio/pci/vfio_pci_core.c >>> index 4de8e352df9c..f9b39abe54cb 100644 >>> --- a/drivers/vfio/pci/vfio_pci_core.c >>> +++ b/drivers/vfio/pci/vfio_pci_core.c >>> @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct >>> vfio_pci_core_device *vdev) >>> } >>> } >>> - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && >>> - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { >>> - ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); >>> - if (ret && ret != -ENODEV) { >>> - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); >>> - goto disable_exit; >>> - } >>> - } >>> - >>> - if (pdev->vendor == PCI_VENDOR_ID_IBM && >>> - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { >>> - ret = vfio_pci_ibm_npu2_init(vdev); >>> - if (ret && ret != -ENODEV) { >>> - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); >>> - goto disable_exit; >>> - } >>> - } >>> - >>> return 0; >>> disable_exit: >>> diff --git a/drivers/vfio/pci/vfio_pci_core.h >>> b/drivers/vfio/pci/vfio_pci_core.h >>> index 8989443c3086..31f3836e606e 100644 >>> --- a/drivers/vfio/pci/vfio_pci_core.h >>> +++ b/drivers/vfio/pci/vfio_pci_core.h >>> @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct >>> vfio_pci_core_device *vdev) >>> return -ENODEV; >>> } >>> #endif >>> -#ifdef CONFIG_VFIO_PCI_NVLINK2 >>> -extern int vfio_pci_nvidia_v100_nvlink2_init(struct >>> vfio_pci_core_device *vdev); >>> -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev); >>> -#else >>> -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct >>> vfio_pci_core_device *vdev) >>> -{ >>> - return -ENODEV; >>> -} >>> - >>> -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device >>> *vdev) >>> -{ >>> - return -ENODEV; >>> -} >>> -#endif >>> #ifdef CONFIG_S390 >>> extern int 
vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device >>> *vdev, >>> >>
On 11/03/2021 00:02, Jason Gunthorpe wrote: > On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote: >>>> + .err_handler = &vfio_pci_core_err_handlers, >>>> +}; >>>> + >>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) >>>> +{ >>>> + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) >>>> + return &nvlink2gpu_vfio_pci_driver; >>> >>> >>> Why do we need matching PCI ids here instead of looking at the FDT which >>> will work better? >> >> what is FDT ? any is it better to use it instead of match_id ? > > This is emulating the device_driver match for the pci_driver. No it is not; it is device tree information which allows skipping the Linux PCI discovery part (the firmware does it anyway), but it tells nothing about which drivers to bind. > I don't think we can combine FDT matching with pci_driver, can we? It is a C function calling another C function, all within vfio-pci; this is not called by the generic PCI code.
On Thu, Mar 11, 2021 at 01:24:47AM +1100, Alexey Kardashevskiy wrote: > > > On 11/03/2021 00:02, Jason Gunthorpe wrote: > > On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote: > > > > > > > + .err_handler = &vfio_pci_core_err_handlers, > > > > > +}; > > > > > + > > > > > +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT > > > > > +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) > > > > > +{ > > > > > + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) > > > > > + return &nvlink2gpu_vfio_pci_driver; > > > > > > > > > > > > Why do we need matching PCI ids here instead of looking at the FDT which > > > > will work better? > > > > > > what is FDT ? any is it better to use it instead of match_id ? > > > > This is emulating the device_driver match for the pci_driver. > > No it is not, it is a device tree info which lets to skip the linux PCI > discovery part (the firmware does it anyway) but it tells nothing about > which drivers to bind. I mean get_nvlink2gpu_vfio_pci_driver() is emulating the PCI match. Max added a pci driver for NPU here: +static struct pci_driver npu2_vfio_pci_driver = { + .name = "npu2-vfio-pci", + .id_table = npu2_vfio_pci_table, + .probe = npu2_vfio_pci_probe, new userspace should use driver_override with "npu-vfio-pci" as the string not "vfio-pci" The point of the get_npu2_vfio_pci_driver() is only optional compatibility to redirect old userspace using "vfio-pci" in the driver_override to the now split driver code so userspace doesn't see any change in behavior. If we don't do this then the vfio-pci driver override will disable the npu2 special stuff, since Max took it all out of vfio-pci's pci_driver. It is supposed to match exactly the same match table as the pci_driver above. We *don't* want different behavior from what the standrd PCI driver matcher will do. 
Since we don't have any way to mix FDT discovery into the standard PCI driver match, it will still attach the npu driver but not enable any special support. This seems OK. Jason
On 3/10/2021 4:19 PM, Alexey Kardashevskiy wrote: > > > On 10/03/2021 23:57, Max Gurtovoy wrote: >> >> On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote: >>> >>> >>> On 09/03/2021 19:33, Max Gurtovoy wrote: >>>> The new drivers introduced are nvlink2gpu_vfio_pci.ko and >>>> npu2_vfio_pci.ko. >>>> The first will be responsible for providing special extensions for >>>> NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the >>>> future). The last will be responsible for POWER9 NPU2 unit (NVLink2 >>>> host >>>> bus adapter). >>>> >>>> Also, preserve backward compatibility for users that were binding >>>> NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer >>>> will >>>> be dropped in the future >>>> >>>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> >>>> --- >>>> drivers/vfio/pci/Kconfig | 28 +++- >>>> drivers/vfio/pci/Makefile | 7 +- >>>> .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} | 144 >>>> ++++++++++++++++- >>>> drivers/vfio/pci/npu2_vfio_pci.h | 24 +++ >>>> ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 >>>> +++++++++++++++++- >>>> drivers/vfio/pci/nvlink2gpu_vfio_pci.h | 24 +++ >>>> drivers/vfio/pci/vfio_pci.c | 61 ++++++- >>>> drivers/vfio/pci/vfio_pci_core.c | 18 --- >>>> drivers/vfio/pci/vfio_pci_core.h | 14 -- >>>> 9 files changed, 422 insertions(+), 47 deletions(-) >>>> rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%) >>>> create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h >>>> rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => >>>> nvlink2gpu_vfio_pci.c} (67%) >>>> create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>>> >>>> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig >>>> index 829e90a2e5a3..88c89863a205 100644 >>>> --- a/drivers/vfio/pci/Kconfig >>>> +++ b/drivers/vfio/pci/Kconfig >>>> @@ -48,8 +48,30 @@ config VFIO_PCI_IGD >>>> To enable Intel IGD assignment through vfio-pci, say Y. 
>>>> -config VFIO_PCI_NVLINK2 >>>> - def_bool y >>>> +config VFIO_PCI_NVLINK2GPU >>>> + tristate "VFIO support for NVIDIA NVLINK2 GPUs" >>>> depends on VFIO_PCI_CORE && PPC_POWERNV >>>> help >>>> - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 >>>> GPUs >>>> + VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific >>>> extensions >>>> + for P9 Witherspoon machine. >>>> + >>>> +config VFIO_PCI_NPU2 >>>> + tristate "VFIO support for IBM NPU host bus adapter on P9" >>>> + depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV >>>> + help >>>> + VFIO PCI specific extensions for IBM NVLink2 host bus >>>> adapter on P9 >>>> + Witherspoon machine. >>>> + >>>> +config VFIO_PCI_DRIVER_COMPAT >>>> + bool "VFIO PCI backward compatibility for vendor specific >>>> extensions" >>>> + default y >>>> + depends on VFIO_PCI >>>> + help >>>> + Say Y here if you want to preserve VFIO PCI backward >>>> + compatibility. vfio_pci.ko will continue to automatically use >>>> + the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to >>>> + a compatible device. >>>> + >>>> + When N is selected the user must bind explicity to the module >>>> + they want to handle the device and vfio_pci.ko will have no >>>> + device specific special behaviors. 
>>>> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile >>>> index f539f32c9296..86fb62e271fc 100644 >>>> --- a/drivers/vfio/pci/Makefile >>>> +++ b/drivers/vfio/pci/Makefile >>>> @@ -2,10 +2,15 @@ >>>> obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o >>>> obj-$(CONFIG_VFIO_PCI) += vfio-pci.o >>>> +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o >>>> +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o >>>> vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o >>>> vfio_pci_rdwr.o vfio_pci_config.o >>>> vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o >>>> -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o >>>> vfio_pci_npu2.o >>>> vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o >>>> vfio-pci-y := vfio_pci.o >>>> + >>>> +npu2-vfio-pci-y := npu2_vfio_pci.o >>>> + >>>> +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o >>>> diff --git a/drivers/vfio/pci/vfio_pci_npu2.c >>>> b/drivers/vfio/pci/npu2_vfio_pci.c >>>> similarity index 64% >>>> rename from drivers/vfio/pci/vfio_pci_npu2.c >>>> rename to drivers/vfio/pci/npu2_vfio_pci.c >>>> index 717745256ab3..7071bda0f2b6 100644 >>>> --- a/drivers/vfio/pci/vfio_pci_npu2.c >>>> +++ b/drivers/vfio/pci/npu2_vfio_pci.c >>>> @@ -14,19 +14,28 @@ >>>> * Author: Alex Williamson <alex.williamson@redhat.com> >>>> */ >>>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >>>> + >>>> +#include <linux/module.h> >>>> #include <linux/io.h> >>>> #include <linux/pci.h> >>>> #include <linux/uaccess.h> >>>> #include <linux/vfio.h> >>>> +#include <linux/list.h> >>>> #include <linux/sched/mm.h> >>>> #include <linux/mmu_context.h> >>>> #include <asm/kvm_ppc.h> >>>> #include "vfio_pci_core.h" >>>> +#include "npu2_vfio_pci.h" >>>> #define CREATE_TRACE_POINTS >>>> #include "npu2_trace.h" >>>> +#define DRIVER_VERSION "0.1" >>>> +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" >>>> +#define DRIVER_DESC "NPU2 VFIO PCI - User Level meta-driver >>>> for POWER9 NPU NVLink2 HBA" >>>> + >>>> 
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); >>>> struct vfio_pci_npu2_data { >>>> @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data { >>>> unsigned int link_speed; /* The link speed from DT's >>>> ibm,nvlink-speed */ >>>> }; >>>> +struct npu2_vfio_pci_device { >>>> + struct vfio_pci_core_device vdev; >>>> +}; >>>> + >>>> static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev, >>>> char __user *buf, size_t count, loff_t *ppos, bool iswrite) >>>> { >>>> @@ -120,7 +133,7 @@ static const struct vfio_pci_regops >>>> vfio_pci_npu2_regops = { >>>> .add_capability = vfio_pci_npu2_add_capability, >>>> }; >>>> -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) >>>> +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) >>>> { >>>> int ret; >>>> struct vfio_pci_npu2_data *data; >>>> @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct >>>> vfio_pci_core_device *vdev) >>>> return ret; >>>> } >>>> + >>>> +static void npu2_vfio_pci_release(void *device_data) >>>> +{ >>>> + struct vfio_pci_core_device *vdev = device_data; >>>> + >>>> + mutex_lock(&vdev->reflck->lock); >>>> + if (!(--vdev->refcnt)) { >>>> + vfio_pci_vf_token_user_add(vdev, -1); >>>> + vfio_pci_core_spapr_eeh_release(vdev); >>>> + vfio_pci_core_disable(vdev); >>>> + } >>>> + mutex_unlock(&vdev->reflck->lock); >>>> + >>>> + module_put(THIS_MODULE); >>>> +} >>>> + >>>> +static int npu2_vfio_pci_open(void *device_data) >>>> +{ >>>> + struct vfio_pci_core_device *vdev = device_data; >>>> + int ret = 0; >>>> + >>>> + if (!try_module_get(THIS_MODULE)) >>>> + return -ENODEV; >>>> + >>>> + mutex_lock(&vdev->reflck->lock); >>>> + >>>> + if (!vdev->refcnt) { >>>> + ret = vfio_pci_core_enable(vdev); >>>> + if (ret) >>>> + goto error; >>>> + >>>> + ret = vfio_pci_ibm_npu2_init(vdev); >>>> + if (ret && ret != -ENODEV) { >>>> + pci_warn(vdev->pdev, >>>> + "Failed to setup NVIDIA NV2 ATSD region\n"); >>>> + vfio_pci_core_disable(vdev); >>>> + goto error; >>>> + } >>>> + ret = 0; >>>> + 
vfio_pci_probe_mmaps(vdev); >>>> + vfio_pci_core_spapr_eeh_open(vdev); >>>> + vfio_pci_vf_token_user_add(vdev, 1); >>>> + } >>>> + vdev->refcnt++; >>>> +error: >>>> + mutex_unlock(&vdev->reflck->lock); >>>> + if (ret) >>>> + module_put(THIS_MODULE); >>>> + return ret; >>>> +} >>>> + >>>> +static const struct vfio_device_ops npu2_vfio_pci_ops = { >>>> + .name = "npu2-vfio-pci", >>>> + .open = npu2_vfio_pci_open, >>>> + .release = npu2_vfio_pci_release, >>>> + .ioctl = vfio_pci_core_ioctl, >>>> + .read = vfio_pci_core_read, >>>> + .write = vfio_pci_core_write, >>>> + .mmap = vfio_pci_core_mmap, >>>> + .request = vfio_pci_core_request, >>>> + .match = vfio_pci_core_match, >>>> +}; >>>> + >>>> +static int npu2_vfio_pci_probe(struct pci_dev *pdev, >>>> + const struct pci_device_id *id) >>>> +{ >>>> + struct npu2_vfio_pci_device *npvdev; >>>> + int ret; >>>> + >>>> + npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL); >>>> + if (!npvdev) >>>> + return -ENOMEM; >>>> + >>>> + ret = vfio_pci_core_register_device(&npvdev->vdev, pdev, >>>> + &npu2_vfio_pci_ops); >>>> + if (ret) >>>> + goto out_free; >>>> + >>>> + return 0; >>>> + >>>> +out_free: >>>> + kfree(npvdev); >>>> + return ret; >>>> +} >>>> + >>>> +static void npu2_vfio_pci_remove(struct pci_dev *pdev) >>>> +{ >>>> + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >>>> + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >>>> + struct npu2_vfio_pci_device *npvdev; >>>> + >>>> + npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, >>>> vdev); >>>> + >>>> + vfio_pci_core_unregister_device(core_vpdev); >>>> + kfree(npvdev); >>>> +} >>>> + >>>> +static const struct pci_device_id npu2_vfio_pci_table[] = { >>>> + { PCI_VDEVICE(IBM, 0x04ea) }, >>>> + { 0, } >>>> +}; >>>> + >>>> +static struct pci_driver npu2_vfio_pci_driver = { >>>> + .name = "npu2-vfio-pci", >>>> + .id_table = npu2_vfio_pci_table, >>>> + .probe = npu2_vfio_pci_probe, >>>> + .remove = npu2_vfio_pci_remove, >>>> +#ifdef 
CONFIG_PCI_IOV >>>> + .sriov_configure = vfio_pci_core_sriov_configure, >>>> +#endif >>>> + .err_handler = &vfio_pci_core_err_handlers, >>>> +}; >>>> + >>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) >>>> +{ >>>> + if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev)) >>>> + return &npu2_vfio_pci_driver; >>>> + return NULL; >>>> +} >>>> +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver); >>>> +#endif >>>> + >>>> +module_pci_driver(npu2_vfio_pci_driver); >>>> + >>>> +MODULE_VERSION(DRIVER_VERSION); >>>> +MODULE_LICENSE("GPL v2"); >>>> +MODULE_AUTHOR(DRIVER_AUTHOR); >>>> +MODULE_DESCRIPTION(DRIVER_DESC); >>>> diff --git a/drivers/vfio/pci/npu2_vfio_pci.h >>>> b/drivers/vfio/pci/npu2_vfio_pci.h >>>> new file mode 100644 >>>> index 000000000000..92010d340346 >>>> --- /dev/null >>>> +++ b/drivers/vfio/pci/npu2_vfio_pci.h >>>> @@ -0,0 +1,24 @@ >>>> +/* SPDX-License-Identifier: GPL-2.0 */ >>>> +/* >>>> + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights >>>> reserved. 
>>>> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> >>>> + */ >>>> + >>>> +#ifndef NPU2_VFIO_PCI_H >>>> +#define NPU2_VFIO_PCI_H >>>> + >>>> +#include <linux/pci.h> >>>> +#include <linux/module.h> >>>> + >>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>>> +#if defined(CONFIG_VFIO_PCI_NPU2) || >>>> defined(CONFIG_VFIO_PCI_NPU2_MODULE) >>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev); >>>> +#else >>>> +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) >>>> +{ >>>> + return NULL; >>>> +} >>>> +#endif >>>> +#endif >>>> + >>>> +#endif /* NPU2_VFIO_PCI_H */ >>>> diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c >>>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c >>>> similarity index 67% >>>> rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c >>>> rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c >>>> index 6dce1e78ee82..84a5ac1ce8ac 100644 >>>> --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c >>>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c >>>> @@ -1,6 +1,6 @@ >>>> // SPDX-License-Identifier: GPL-2.0-only >>>> /* >>>> - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. >>>> + * VFIO PCI NVIDIA NVLink2 GPUs support. >>>> * >>>> * Copyright (C) 2018 IBM Corp. All rights reserved. 
>>>> * Author: Alexey Kardashevskiy <aik@ozlabs.ru> >>>> @@ -12,6 +12,9 @@ >>>> * Author: Alex Williamson <alex.williamson@redhat.com> >>>> */ >>>> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >>>> + >>>> +#include <linux/module.h> >>>> #include <linux/io.h> >>>> #include <linux/pci.h> >>>> #include <linux/uaccess.h> >>>> @@ -21,10 +24,15 @@ >>>> #include <asm/kvm_ppc.h> >>>> #include "vfio_pci_core.h" >>>> +#include "nvlink2gpu_vfio_pci.h" >>>> #define CREATE_TRACE_POINTS >>>> #include "nvlink2gpu_trace.h" >>>> +#define DRIVER_VERSION "0.1" >>>> +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" >>>> +#define DRIVER_DESC "NVLINK2GPU VFIO PCI - User Level >>>> meta-driver for NVIDIA NVLink2 GPUs" >>>> + >>>> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); >>>> EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); >>>> @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data { >>>> struct notifier_block group_notifier; >>>> }; >>>> +struct nv_vfio_pci_device { >>>> + struct vfio_pci_core_device vdev; >>>> +}; >>>> + >>>> static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev, >>>> char __user *buf, size_t count, loff_t *ppos, bool iswrite) >>>> { >>>> @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct >>>> notifier_block *nb, >>>> return NOTIFY_OK; >>>> } >>>> -int vfio_pci_nvidia_v100_nvlink2_init(struct >>>> vfio_pci_core_device *vdev) >>>> +static int >>>> +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) >>>> { >>>> int ret; >>>> u64 reg[2]; >>>> @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct >>>> vfio_pci_core_device *vdev) >>>> return ret; >>>> } >>>> + >>>> +static void nvlink2gpu_vfio_pci_release(void *device_data) >>>> +{ >>>> + struct vfio_pci_core_device *vdev = device_data; >>>> + >>>> + mutex_lock(&vdev->reflck->lock); >>>> + if (!(--vdev->refcnt)) { >>>> + vfio_pci_vf_token_user_add(vdev, -1); >>>> + vfio_pci_core_spapr_eeh_release(vdev); >>>> + vfio_pci_core_disable(vdev); >>>> 
+ } >>>> + mutex_unlock(&vdev->reflck->lock); >>>> + >>>> + module_put(THIS_MODULE); >>>> +} >>>> + >>>> +static int nvlink2gpu_vfio_pci_open(void *device_data) >>>> +{ >>>> + struct vfio_pci_core_device *vdev = device_data; >>>> + int ret = 0; >>>> + >>>> + if (!try_module_get(THIS_MODULE)) >>>> + return -ENODEV; >>>> + >>>> + mutex_lock(&vdev->reflck->lock); >>>> + >>>> + if (!vdev->refcnt) { >>>> + ret = vfio_pci_core_enable(vdev); >>>> + if (ret) >>>> + goto error; >>>> + >>>> + ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); >>>> + if (ret && ret != -ENODEV) { >>>> + pci_warn(vdev->pdev, >>>> + "Failed to setup NVIDIA NV2 RAM region\n"); >>>> + vfio_pci_core_disable(vdev); >>>> + goto error; >>>> + } >>>> + ret = 0; >>>> + vfio_pci_probe_mmaps(vdev); >>>> + vfio_pci_core_spapr_eeh_open(vdev); >>>> + vfio_pci_vf_token_user_add(vdev, 1); >>>> + } >>>> + vdev->refcnt++; >>>> +error: >>>> + mutex_unlock(&vdev->reflck->lock); >>>> + if (ret) >>>> + module_put(THIS_MODULE); >>>> + return ret; >>>> +} >>>> + >>>> +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = { >>>> + .name = "nvlink2gpu-vfio-pci", >>>> + .open = nvlink2gpu_vfio_pci_open, >>>> + .release = nvlink2gpu_vfio_pci_release, >>>> + .ioctl = vfio_pci_core_ioctl, >>>> + .read = vfio_pci_core_read, >>>> + .write = vfio_pci_core_write, >>>> + .mmap = vfio_pci_core_mmap, >>>> + .request = vfio_pci_core_request, >>>> + .match = vfio_pci_core_match, >>>> +}; >>>> + >>>> +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev, >>>> + const struct pci_device_id *id) >>>> +{ >>>> + struct nv_vfio_pci_device *nvdev; >>>> + int ret; >>>> + >>>> + nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL); >>>> + if (!nvdev) >>>> + return -ENOMEM; >>>> + >>>> + ret = vfio_pci_core_register_device(&nvdev->vdev, pdev, >>>> + &nvlink2gpu_vfio_pci_ops); >>>> + if (ret) >>>> + goto out_free; >>>> + >>>> + return 0; >>>> + >>>> +out_free: >>>> + kfree(nvdev); >>>> + return ret; >>>> +} >>>> + >>>> +static void 
nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev) >>>> +{ >>>> + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >>>> + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >>>> + struct nv_vfio_pci_device *nvdev; >>>> + >>>> + nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, >>>> vdev); >>>> + >>>> + vfio_pci_core_unregister_device(core_vpdev); >>>> + kfree(nvdev); >>>> +} >>>> + >>>> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = { >>>> + { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla >>>> V100-SXM2-16GB */ >>>> + { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla >>>> V100-SXM2-32GB */ >>>> + { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla >>>> V100-SXM3-32GB */ >>>> + { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla >>>> V100-SXM2-16GB */ >>> >>> >>> Where is this list from? >>> >>> Also, how is this supposed to work at the boot time? Will the kernel >>> try binding let's say this one and nouveau? Which one is going to win? >> >> At boot time nouveau driver will win since the vfio drivers don't >> declare MODULE_DEVICE_TABLE > > > ok but where is the list from anyway? I did some checkings and was told that the SXM devices where the ones that were integrated into P9. If you or anyone on the mailing list has some comments here, please add them and I'll do double check. > > >> >> >>> >>> >>>> + { 0, } >>> >>> >>> Why a comma? >> >> I'll remove the comma. >> >> >>> >>>> +}; >>> >>> >>> >>>> + >>>> +static struct pci_driver nvlink2gpu_vfio_pci_driver = { >>>> + .name = "nvlink2gpu-vfio-pci", >>>> + .id_table = nvlink2gpu_vfio_pci_table, >>>> + .probe = nvlink2gpu_vfio_pci_probe, >>>> + .remove = nvlink2gpu_vfio_pci_remove, >>>> +#ifdef CONFIG_PCI_IOV >>>> + .sriov_configure = vfio_pci_core_sriov_configure, >>>> +#endif >>> >>> >>> What is this IOV business about? 
>> >> from vfio_pci >> >> #ifdef CONFIG_PCI_IOV >> module_param(enable_sriov, bool, 0644); >> MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV >> configuration. Enabling SR-IOV on a PF typically requires support of >> the userspace PF driver, enabling VFs without such support may result >> in non-functional VFs or PF."); >> #endif > > > I know what IOV is in general :) What I meant to say was that I am > pretty sure these GPUs cannot do IOV so this does not need to be in > these NVLink drivers. Thanks. I'll verify it and remove for v4. > > > >> >> >>> >>> >>>> + .err_handler = &vfio_pci_core_err_handlers, >>>> +}; >>>> + >>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev >>>> *pdev) >>>> +{ >>>> + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) >>>> + return &nvlink2gpu_vfio_pci_driver; >>> >>> >>> Why do we need matching PCI ids here instead of looking at the FDT >>> which will work better? >> >> what is FDT ? any is it better to use it instead of match_id ? > > > Flattened Device Tree - a way for the firmware to pass the > configuration to the OS. This data tells if there are NVLinks and what > they are linked to. This defines if the feature is available as it > should work with any GPU in this form factor. 
> > >> >>> >>> >>>> + return NULL; >>>> +} >>>> +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver); >>>> +#endif >>>> + >>>> +module_pci_driver(nvlink2gpu_vfio_pci_driver); >>>> + >>>> +MODULE_VERSION(DRIVER_VERSION); >>>> +MODULE_LICENSE("GPL v2"); >>>> +MODULE_AUTHOR(DRIVER_AUTHOR); >>>> +MODULE_DESCRIPTION(DRIVER_DESC); >>>> diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>>> b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>>> new file mode 100644 >>>> index 000000000000..ebd5b600b190 >>>> --- /dev/null >>>> +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h >>>> @@ -0,0 +1,24 @@ >>>> +/* SPDX-License-Identifier: GPL-2.0 */ >>>> +/* >>>> + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights >>>> reserved. >>>> + * Author: Max Gurtovoy <mgurtovoy@nvidia.com> >>>> + */ >>>> + >>>> +#ifndef NVLINK2GPU_VFIO_PCI_H >>>> +#define NVLINK2GPU_VFIO_PCI_H >>>> + >>>> +#include <linux/pci.h> >>>> +#include <linux/module.h> >>>> + >>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>>> +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || >>>> defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE) >>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev >>>> *pdev); >>>> +#else >>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev >>>> *pdev) >>>> +{ >>>> + return NULL; >>>> +} >>>> +#endif >>>> +#endif >>>> + >>>> +#endif /* NVLINK2GPU_VFIO_PCI_H */ >>>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c >>>> index dbc0a6559914..8e81ea039f31 100644 >>>> --- a/drivers/vfio/pci/vfio_pci.c >>>> +++ b/drivers/vfio/pci/vfio_pci.c >>>> @@ -27,6 +27,10 @@ >>>> #include <linux/uaccess.h> >>>> #include "vfio_pci_core.h" >>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>>> +#include "npu2_vfio_pci.h" >>>> +#include "nvlink2gpu_vfio_pci.h" >>>> +#endif >>>> #define DRIVER_VERSION "0.2" >>>> #define DRIVER_AUTHOR "Alex Williamson >>>> <alex.williamson@redhat.com>" >>>> @@ -142,14 +146,48 @@ static const struct vfio_device_ops >>>> vfio_pci_ops = { >>>> .match = 
vfio_pci_core_match, >>>> }; >>>> +/* >>>> + * This layer is used for backward compatibility. Hopefully it >>>> will be >>>> + * removed in the future. >>>> + */ >>>> +static struct pci_driver *vfio_pci_get_compat_driver(struct >>>> pci_dev *pdev) >>>> +{ >>>> + switch (pdev->vendor) { >>>> + case PCI_VENDOR_ID_NVIDIA: >>>> + switch (pdev->device) { >>>> + case 0x1db1: >>>> + case 0x1db5: >>>> + case 0x1db8: >>>> + case 0x1df5: >>>> + return get_nvlink2gpu_vfio_pci_driver(pdev); >>> >>> This does not really need a switch, could simply call these >>> get_xxxx_vfio_pci_driver. Thanks, >> >> maybe the result will be the same but I don't think we need to send >> all NVIDIA devices or IBM devices to this function. > > We can tolerate this on POWER (the check is really cheap) and for > everybody else this driver won't even compile. I'll improve this function. Thanks. > > >> we can maybe export the tables from the vfio_vendor driver and match >> it here. > > > I am still missing the point of device matching. It won't bind by > default at the boot time and it won't make the existing user life any > easier as they use libvirt which overrides this anyway. we're trying to improve the subsystem to be more flexible and we still want to preserve backward compatibility for the near future. 
> > >>> >>> >>>> + default: >>>> + return NULL; >>>> + } >>>> + case PCI_VENDOR_ID_IBM: >>>> + switch (pdev->device) { >>>> + case 0x04ea: >>>> + return get_npu2_vfio_pci_driver(pdev); >>>> + default: >>>> + return NULL; >>>> + } >>>> + } >>>> + >>>> + return NULL; >>>> +} >>>> + >>>> static int vfio_pci_probe(struct pci_dev *pdev, const struct >>>> pci_device_id *id) >>>> { >>>> struct vfio_pci_device *vpdev; >>>> + struct pci_driver *driver; >>>> int ret; >>>> if (vfio_pci_is_denylisted(pdev)) >>>> return -EINVAL; >>>> + driver = vfio_pci_get_compat_driver(pdev); >>>> + if (driver) >>>> + return driver->probe(pdev, id); >>>> + >>>> vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL); >>>> if (!vpdev) >>>> return -ENOMEM; >>>> @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev >>>> *pdev, const struct pci_device_id *id) >>>> static void vfio_pci_remove(struct pci_dev *pdev) >>>> { >>>> - struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >>>> - struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); >>>> - struct vfio_pci_device *vpdev; >>>> - >>>> - vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); >>>> - >>>> - vfio_pci_core_unregister_device(core_vpdev); >>>> - kfree(vpdev); >>>> + struct pci_driver *driver; >>>> + >>>> + driver = vfio_pci_get_compat_driver(pdev); >>>> + if (driver) { >>>> + driver->remove(pdev); >>>> + } else { >>>> + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); >>>> + struct vfio_pci_core_device *core_vpdev; >>>> + struct vfio_pci_device *vpdev; >>>> + >>>> + core_vpdev = vfio_device_data(vdev); >>>> + vpdev = container_of(core_vpdev, struct vfio_pci_device, >>>> vdev); >>>> + vfio_pci_core_unregister_device(core_vpdev); >>>> + kfree(vpdev); >>>> + } >>>> } >>>> static int vfio_pci_sriov_configure(struct pci_dev *pdev, int >>>> nr_virtfn) >>>> diff --git a/drivers/vfio/pci/vfio_pci_core.c >>>> b/drivers/vfio/pci/vfio_pci_core.c >>>> index 4de8e352df9c..f9b39abe54cb 100644 >>>> --- 
a/drivers/vfio/pci/vfio_pci_core.c >>>> +++ b/drivers/vfio/pci/vfio_pci_core.c >>>> @@ -354,24 +354,6 @@ int vfio_pci_core_enable(struct >>>> vfio_pci_core_device *vdev) >>>> } >>>> } >>>> - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && >>>> - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { >>>> - ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); >>>> - if (ret && ret != -ENODEV) { >>>> - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM >>>> region\n"); >>>> - goto disable_exit; >>>> - } >>>> - } >>>> - >>>> - if (pdev->vendor == PCI_VENDOR_ID_IBM && >>>> - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { >>>> - ret = vfio_pci_ibm_npu2_init(vdev); >>>> - if (ret && ret != -ENODEV) { >>>> - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD >>>> region\n"); >>>> - goto disable_exit; >>>> - } >>>> - } >>>> - >>>> return 0; >>>> disable_exit: >>>> diff --git a/drivers/vfio/pci/vfio_pci_core.h >>>> b/drivers/vfio/pci/vfio_pci_core.h >>>> index 8989443c3086..31f3836e606e 100644 >>>> --- a/drivers/vfio/pci/vfio_pci_core.h >>>> +++ b/drivers/vfio/pci/vfio_pci_core.h >>>> @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct >>>> vfio_pci_core_device *vdev) >>>> return -ENODEV; >>>> } >>>> #endif >>>> -#ifdef CONFIG_VFIO_PCI_NVLINK2 >>>> -extern int vfio_pci_nvidia_v100_nvlink2_init(struct >>>> vfio_pci_core_device *vdev); >>>> -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev); >>>> -#else >>>> -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct >>>> vfio_pci_core_device *vdev) >>>> -{ >>>> - return -ENODEV; >>>> -} >>>> - >>>> -static inline int vfio_pci_ibm_npu2_init(struct >>>> vfio_pci_core_device *vdev) >>>> -{ >>>> - return -ENODEV; >>>> -} >>>> -#endif >>>> #ifdef CONFIG_S390 >>>> extern int vfio_pci_info_zdev_add_caps(struct >>>> vfio_pci_core_device *vdev, >>>> >>> >
On 11/03/2021 06:40, Jason Gunthorpe wrote: > On Thu, Mar 11, 2021 at 01:24:47AM +1100, Alexey Kardashevskiy wrote: >> >> >> On 11/03/2021 00:02, Jason Gunthorpe wrote: >>> On Wed, Mar 10, 2021 at 02:57:57PM +0200, Max Gurtovoy wrote: >>> >>>>>> + .err_handler = &vfio_pci_core_err_handlers, >>>>>> +}; >>>>>> + >>>>>> +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT >>>>>> +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) >>>>>> +{ >>>>>> + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) >>>>>> + return &nvlink2gpu_vfio_pci_driver; >>>>> >>>>> >>>>> Why do we need matching PCI ids here instead of looking at the FDT which >>>>> will work better? >>>> >>>> what is FDT ? any is it better to use it instead of match_id ? >>> >>> This is emulating the device_driver match for the pci_driver. >> >> No it is not, it is a device tree info which lets to skip the linux PCI >> discovery part (the firmware does it anyway) but it tells nothing about >> which drivers to bind. > > I mean get_nvlink2gpu_vfio_pci_driver() is emulating the PCI match. > > Max added a pci driver for NPU here: > > +static struct pci_driver npu2_vfio_pci_driver = { > + .name = "npu2-vfio-pci", > + .id_table = npu2_vfio_pci_table, > + .probe = npu2_vfio_pci_probe, > > > new userspace should use driver_override with "npu-vfio-pci" as the > string not "vfio-pci" > > The point of the get_npu2_vfio_pci_driver() is only optional > compatibility to redirect old userspace using "vfio-pci" in the > driver_override to the now split driver code so userspace doesn't see > any change in behavior. > > If we don't do this then the vfio-pci driver override will disable the > npu2 special stuff, since Max took it all out of vfio-pci's > pci_driver. > > It is supposed to match exactly the same match table as the pci_driver > above. We *don't* want different behavior from what the standrd PCI > driver matcher will do. 
This is not a standard PCI driver though and the main vfio-pci won't have a list to match ever. IBM NPU PCI id is unlikely to change ever but NVIDIA keeps making new devices which work in those P9 boxes, are you going to keep adding those ids to nvlink2gpu_vfio_pci_table? btw can the id list have only vendor ids and not have device ids? > Since we don't have any way to mix in FDT discovery to the standard > PCI driver match it will still attach the npu driver but not enable > any special support. This seems OK.
On Thu, Mar 11, 2021 at 12:20:33PM +1100, Alexey Kardashevskiy wrote: > > It is supposed to match exactly the same match table as the pci_driver > > above. We *don't* want different behavior from what the standard PCI > > driver matcher will do. > > This is not a standard PCI driver though It is now, that is what this patch makes it into. This is why it now has a struct pci_driver. > and the main vfio-pci won't have a > list to match ever. ?? vfio-pci uses driver_override or new_id to manage its match list > IBM NPU PCI id is unlikely to change ever but NVIDIA keeps making > new devices which work in those P9 boxes, are you going to keep > adding those ids to nvlink2gpu_vfio_pci_table? Certainly, as needed. PCI list updates are normal for the kernel. > btw can the id list have only vendor ids and not have device ids? The PCI matcher is quite flexible, see the other patch from Max for the igd But best practice is to be as narrow as possible as I hope this will eventually impact module autoloading and other details. Jason
On 11/03/2021 12:34, Jason Gunthorpe wrote: > On Thu, Mar 11, 2021 at 12:20:33PM +1100, Alexey Kardashevskiy wrote: > >>> It is supposed to match exactly the same match table as the pci_driver >>> above. We *don't* want different behavior from what the standrd PCI >>> driver matcher will do. >> >> This is not a standard PCI driver though > > It is now, that is what this patch makes it into. This is why it now > has a struct pci_driver. > >> and the main vfio-pci won't have a >> list to match ever. > > ?? vfio-pci uses driver_override or new_id to manage its match list Exactly, no list to update. >> IBM NPU PCI id is unlikely to change ever but NVIDIA keeps making >> new devices which work in those P9 boxes, are you going to keep >> adding those ids to nvlink2gpu_vfio_pci_table? > > Certainly, as needed. PCI list updates is normal for the kernel. > >> btw can the id list have only vendor ids and not have device ids? > > The PCI matcher is quite flexable, see the other patch from Max for > the igd ah cool, do this for NVIDIA GPUs then please, I just discovered another P9 system sold with NVIDIA T4s which is not in your list. > But best practice is to be as narrow as possible as I hope this will > eventually impact module autoloading and other details. The amount of device specific knowledge is too little to tie it up to device ids, it is a generic PCI driver with quirks. We do not have a separate drivers for the hardware which requires quirks. And how do you hope this should impact autoloading?
On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote: > > > btw can the id list have only vendor ids and not have device ids? > > > > The PCI matcher is quite flexable, see the other patch from Max for > > the igd > > ah cool, do this for NVIDIA GPUs then please, I just discovered another P9 > system sold with NVIDIA T4s which is not in your list. I think it will make things easier down the road if you maintain an exact list <shrug> > > But best practice is to be as narrow as possible as I hope this will > > eventually impact module autoloading and other details. > > The amount of device specific knowledge is too little to tie it up to device > ids, it is a generic PCI driver with quirks. We do not have a separate > drivers for the hardware which requires quirks. It provides its own capability structure exposed to userspace, that is absolutely not a "quirk" > And how do you hope this should impact autoloading? I would like to autoload the most specific vfio driver for the target hardware. If you someday need to support new GPU HW that needs a different VFIO driver then you are really stuck because things become indeterminate if there are two devices claiming the ID. We don't have the concept of "best match", driver core works on exact match. Jason
On 11/03/2021 13:00, Jason Gunthorpe wrote: > On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote: >>>> btw can the id list have only vendor ids and not have device ids? >>> >>> The PCI matcher is quite flexable, see the other patch from Max for >>> the igd >> >> ah cool, do this for NVIDIA GPUs then please, I just discovered another P9 >> system sold with NVIDIA T4s which is not in your list. > > I think it will make things easier down the road if you maintain an > exact list <shrug> Then why do not you do the exact list for Intel IGD? The commit log does not explain this detail. >>> But best practice is to be as narrow as possible as I hope this will >>> eventually impact module autoloading and other details. >> >> The amount of device specific knowledge is too little to tie it up to device >> ids, it is a generic PCI driver with quirks. We do not have a separate >> drivers for the hardware which requires quirks. > > It provides its own capability structure exposed to userspace, that is > absolutely not a "quirk" > >> And how do you hope this should impact autoloading? > > I would like to autoload the most specific vfio driver for the target > hardware. Is there an idea how it is going to work? For example, the Intel IGD driver and vfio-pci-igd - how should the system pick one? If there is no MODULE_DEVICE_TABLE in vfio-pci-xxx, is the user supposed to try binding all vfio-pci-xxx drivers until some binds? > If you someday need to support new GPU HW that needs a different VFIO > driver then you are really stuck because things become indeterminate > if there are two devices claiming the ID. We don't have the concept of > "best match", driver core works on exact match.
On 3/11/2021 9:54 AM, Alexey Kardashevskiy wrote: > > > On 11/03/2021 13:00, Jason Gunthorpe wrote: >> On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote: >>>>> btw can the id list have only vendor ids and not have device ids? >>>> >>>> The PCI matcher is quite flexable, see the other patch from Max for >>>> the igd >>> ah cool, do this for NVIDIA GPUs then please, I just discovered >>> another P9 >>> system sold with NVIDIA T4s which is not in your list. >> >> I think it will make things easier down the road if you maintain an >> exact list <shrug> > > > Then why do not you do the exact list for Intel IGD? The commit log > does not explain this detail. I expect Intel team to review this series and give a more precise list. I did the best I could in finding a proper configuration for igd. > > >>>> But best practice is to be as narrow as possible as I hope this will >>>> eventually impact module autoloading and other details. >>> >>> The amount of device specific knowledge is too little to tie it up >>> to device >>> ids, it is a generic PCI driver with quirks. We do not have a separate >>> drivers for the hardware which requires quirks. >> >> It provides its own capability structure exposed to userspace, that is >> absolutely not a "quirk" >> >>> And how do you hope this should impact autoloading? >> >> I would like to autoload the most specific vfio driver for the target >> hardware. > > > Is there an idea how it is going to work? For example, the Intel IGD > driver and vfio-pci-igd - how should the system pick one? If there is > no MODULE_DEVICE_TABLE in vfio-pci-xxx, is the user supposed to try > binding all vfio-pci-xxx drivers until some binds? For example, in my local setup I did a POC patch that convert some drivers to be "manual binding only drivers". So the IGD driver will have the priority, user will unbind the device from it, load igd-vfio-pci, bind the device to it, ends with probing. 
For now we separated the driver core stuff until we all agree that this series is the right way to go + we also make sure it's backward compatible. > > >> If you someday need to support new GPU HW that needs a different VFIO >> driver then you are really stuck because things become indeterminate >> if there are two devices claiming the ID. We don't have the concept of >> "best match", driver core works on exact match. > > >
On Thu, Mar 11, 2021 at 11:44:38AM +0200, Max Gurtovoy wrote: > > On 3/11/2021 9:54 AM, Alexey Kardashevskiy wrote: > > > > > > On 11/03/2021 13:00, Jason Gunthorpe wrote: > > > On Thu, Mar 11, 2021 at 12:42:56PM +1100, Alexey Kardashevskiy wrote: > > > > > > btw can the id list have only vendor ids and not have device ids? > > > > > > > > > > The PCI matcher is quite flexable, see the other patch from Max for > > > > > the igd > > > > ah cool, do this for NVIDIA GPUs then please, I just > > > > discovered another P9 > > > > system sold with NVIDIA T4s which is not in your list. > > > > > > I think it will make things easier down the road if you maintain an > > > exact list <shrug> > > > > > > Then why do not you do the exact list for Intel IGD? The commit log does > > not explain this detail. > > I expect Intel team to review this series and give a more precise list. > > I did the best I could in finding a proper configuration for igd. Right. Doing this retroactively is really hard. Jason
On Thu, Mar 11, 2021 at 06:54:09PM +1100, Alexey Kardashevskiy wrote: > Is there an idea how it is going to work? For example, the Intel IGD driver > and vfio-pci-igd - how should the system pick one? If there is no > MODULE_DEVICE_TABLE in vfio-pci-xxx, is the user supposed to try binding all > vfio-pci-xxx drivers until some binds? We must expose some MODULE_DEVICE_TABLE like thing to userspace. Compiling everything into one driver and using if statements was only manageable with these tiny drivers - the stuff that is coming are big things that are infeasible to link directly to vfio_pci.ko I'm feeling some general consensus around this approach (vs trying to make a subdriver) so we will start looking at exactly what form that could take soon. The general idea would be to have a selection of extended VFIO drivers for PCI devices that can be loaded as an alternative to vfio-pci and they provide additional uapi and behaviors that only work on specific hardware. nvlink is a good example because it does provide new API and additional HW specific behavior. A way for userspace to learn about the drivers automatically and sort out how to load and bind them. I was thinking about your earlier question about FDT - do you think we could switch this to a platform_device and provide an of_match_table that would select correctly? Did IBM enforce a useful compatible string in the DT for these things? Jason
On Wed, 10 Mar 2021 14:57:57 +0200 Max Gurtovoy <mgurtovoy@nvidia.com> wrote: > On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote: > > On 09/03/2021 19:33, Max Gurtovoy wrote: > >> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = { > >> + { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla > >> V100-SXM2-16GB */ > >> + { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla > >> V100-SXM2-32GB */ > >> + { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla > >> V100-SXM3-32GB */ > >> + { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla > >> V100-SXM2-16GB */ > > > > > > Where is this list from? > > > > Also, how is this supposed to work at the boot time? Will the kernel > > try binding let's say this one and nouveau? Which one is going to win? > > At boot time nouveau driver will win since the vfio drivers don't > declare MODULE_DEVICE_TABLE This still seems troublesome, AIUI the MODULE_DEVICE_TABLE is responsible for creating aliases so that kmod can figure out which modules to load, but what happens if all these vfio-pci modules are built into the kernel or the modules are already loaded? In the former case, I think it boils down to link order while the latter is generally considered even less deterministic since it depends on module load order. So if one of these vfio modules should get loaded before the native driver, I think devices could bind here first. Are there tricks/extensions we could use in driver overrides, for example maybe a compatibility alias such that one of these vfio-pci variants could match "vfio-pci"? Perhaps that, along with some sort of priority scheme to probe variants ahead of the base driver, though I'm not sure how we'd get these variants loaded without something like module aliases. 
I know we're trying to avoid creating another level of driver matching, but that's essentially what we have in the compat option enabled here, and I'm not sure I see how userspace makes the leap to understand what driver to use for a given device. Thanks, Alex
On Fri, Mar 19, 2021 at 09:23:41AM -0600, Alex Williamson wrote: > On Wed, 10 Mar 2021 14:57:57 +0200 > Max Gurtovoy <mgurtovoy@nvidia.com> wrote: > > On 3/10/2021 8:39 AM, Alexey Kardashevskiy wrote: > > > On 09/03/2021 19:33, Max Gurtovoy wrote: > > >> +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = { > > >> + { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla > > >> V100-SXM2-16GB */ > > >> + { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla > > >> V100-SXM2-32GB */ > > >> + { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla > > >> V100-SXM3-32GB */ > > >> + { PCI_VDEVICE(NVIDIA, 0x1DF5) }, /* GV100GL-B NVIDIA Tesla > > >> V100-SXM2-16GB */ > > > > > > > > > Where is this list from? > > > > > > Also, how is this supposed to work at the boot time? Will the kernel > > > try binding let's say this one and nouveau? Which one is going to win? > > > > At boot time nouveau driver will win since the vfio drivers don't > > declare MODULE_DEVICE_TABLE > > This still seems troublesome, AIUI the MODULE_DEVICE_TABLE is > responsible for creating aliases so that kmod can figure out which > modules to load, but what happens if all these vfio-pci modules are > built into the kernel or the modules are already loaded? I think we talked about this.. We still need a better way to control binding of VFIO modules - now that we have device-specific modules we must have these match tables to control what devices they connect to. Previously things used the binding of vfio_pci as the "switch" and hardcoded all the matches inside it. I'm still keen to try the "driver flavour" idea I outlined earlier, but it is hard to say what will resonate with Greg. > In the former case, I think it boils down to link order while the > latter is generally considered even less deterministic since it depends > on module load order. So if one of these vfio modules should get > loaded before the native driver, I think devices could bind here first. 
At this point - "don't link these statically", we could have a kconfig to prevent it. > Are there tricks/extensions we could use in driver overrides, for > example maybe a compatibility alias such that one of these vfio-pci > variants could match "vfio-pci"? driver override is not really useful as soon as you have a match table as its operation is to defeat the match table entirely. :( Again, this is still more of an outline how things will look as we must get through this before we can attempt to do something in the driver core with Greg. We could revise this series to not register drivers at all and keep the uAPI view exactly as is today. This would allow enough code to show Greg how some driver flavour thing would work. If something can't be done in the driver core, I'd propose to keep the same basic outline Max has here, but make registering the "compat" dynamic - it is basically a sub-driver design at that point and we give up achieving module autoloading. Jason
On Fri, Mar 19, 2021 at 01:17:22PM -0300, Jason Gunthorpe wrote: > I think we talked about this.. We still need a better way to control > binding of VFIO modules - now that we have device-specific modules we > must have these match tables to control what devices they connect > to. > > Previously things used the binding of vfio_pci as the "switch" and > hardcoded all the matches inside it. > > I'm still keen to try the "driver flavour" idea I outlined earlier, > but it is hard to say what will resonate with Greg. IMHO the only model that really works and makes sense is to turn the whole model around and make vfio a library called by the actual driver for the device. That is any device that needs device specific functionality simply needs a proper in-kernel driver, which then can be switched to a vfio mode where all the normal subsystems are unbound from the device, and VFIO functionality is found to it all while _the_ driver that controls the PCI ID is still in charge of it. vfio_pci remains a separate driver not binding to any ID by default and not having any device specific functionality.
On Fri, Mar 19, 2021 at 05:20:33PM +0100, Christoph Hellwig wrote: > On Fri, Mar 19, 2021 at 01:17:22PM -0300, Jason Gunthorpe wrote: > > I think we talked about this.. We still need a better way to control > > binding of VFIO modules - now that we have device-specific modules we > > must have these match tables to control what devices they connect > > to. > > > > Previously things used the binding of vfio_pci as the "switch" and > > hardcoded all the matches inside it. > > > > I'm still keen to try the "driver flavour" idea I outlined earlier, > > but it is hard to say what will resonate with Greg. > > IMHO the only model that really works and makes sense is to turn the > whole model around and make vfio a library called by the actual driver > for the device. That is any device that needs device specific > funtionality simply needs a proper in-kernel driver, which then can be > switched to a vfio mode where all the normal subsystems are unbound > from the device, and VFIO functionality is found to it all while _the_ > driver that controls the PCI ID is still in charge of it. Yes, this is what I want to strive for with Greg. It would also resolve alot of the uncomfortable code I see in VFIO using the driver core. For instance, when a device is moved to 'vfio mode' it can go through and *lock* the entire group of devices to 'vfio mode' or completely fail. This would replace all the protective code that is all about ensuring the admin doesn't improperly mix & match in-kernel and vfio drivers within a security domain. The wrinkle I don't yet have an easy answer to is how to load vfio_pci as a universal "default" within the driver core lazy bind scheme and still have working module autoloading... I'm hoping to get some research into this.. Jason
On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote: > The wrinkle I don't yet have an easy answer to is how to load vfio_pci > as a universal "default" within the driver core lazy bind scheme and > still have working module autoloading... I'm hoping to get some > research into this.. Should we even load it by default? One answer would be that the sysfs file to switch to vfio mode goes into the core PCI layer, and that core PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci for that device.
On Fri, 19 Mar 2021 17:34:49 +0100 Christoph Hellwig <hch@lst.de> wrote: > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote: > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci > > as a universal "default" within the driver core lazy bind scheme and > > still have working module autoloading... I'm hoping to get some > > research into this.. What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base driver, which would load all the known variants in order to influence the match, and therefore probe ordering? If we coupled that with wildcard support in driver_override, ex. "vfio_pci*", and used consistent module naming, I think we'd only need to teach userspace about this wildcard and binding to a specific module would come for free. This assumes we drop the per-variant id_table and use the probe function to skip devices without the necessary requirements, either wrong device or missing the tables we expect to expose. > Should we even load it by default? One answer would be that the sysfs > file to switch to vfio mode goes into the core PCI layer, and that core > PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci > for that device. Generally we don't want to be the default driver for anything (I think mdev devices are the exception). Assignment to userspace or VM is a niche use case. Thanks, Alex
On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote: > On Fri, 19 Mar 2021 17:34:49 +0100 > Christoph Hellwig <hch@lst.de> wrote: > > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote: > > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci > > > as a universal "default" within the driver core lazy bind scheme and > > > still have working module autoloading... I'm hoping to get some > > > research into this.. > > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base > driver, which would load all the known variants in order to influence > the match, and therefore probe ordering? The way the driver core works is to first match against the already loaded driver list, then trigger an event for module loading and when new drivers are registered they bind to unbound devices. So, the trouble is the event through userspace because the kernel can't just go on to use vfio_pci until it knows userspace has failed to satisfy the load request. One answer is to have userspace udev have the "hook" here and when a vfio flavour mod alias is requested on a PCI device it swaps in vfio_pci if it can't find an alternative. The dream would be a system with no vfio modules loaded could do some echo "vfio" > /sys/bus/pci/xxx/driver_flavour And a module would be loaded and a struct vfio_device is created for that device. Very easy for the user. > If we coupled that with wildcard support in driver_override, ex. > "vfio_pci*", and used consistent module naming, I think we'd only need > to teach userspace about this wildcard and binding to a specific module > would come for free. What would the wildcard do? > This assumes we drop the per-variant id_table and use the probe > function to skip devices without the necessary requirements, either > wrong device or missing the tables we expect to expose. Without a module table how do we know which driver is which? Open coding a match table in probe() and returning failure feels hacky to me. 
> > Should we even load it by default? One answer would be that the sysfs > > file to switch to vfio mode goes into the core PCI layer, and that core > > PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci > > for that device. > > Generally we don't want to be the default driver for anything (I think > mdev devices are the exception). Assignment to userspace or VM is a > niche use case. Thanks, By "default" I mean if the user says device A is in "vfio" mode then the kernel should - Search for a specific driver for this device and autoload it - If no specific driver is found then attach a default "universal" driver for it. vfio_pci is a universal driver. vfio_platform is also a "universal" driver when in ACPI mode, in some cases. For OF cases platform it builts its own little subsystem complete with autoloading: request_module("vfio-reset:%s", vdev->compat); vdev->of_reset = vfio_platform_lookup_reset(vdev->compat, &vdev->reset_module); And it is a good example of why I don't like this subsystem design because vfio_platform doesn't do the driver loading for OF entirely right, vdev->compat is a single string derived from the compatible property: ret = device_property_read_string(dev, "compatible", &vdev->compat); if (ret) dev_err(dev, "Cannot retrieve compat for %s\n", vdev->name); Unfortunately OF requires that compatible is a *list* of strings and a correct driver is supposed to evaluate all of them. The driver core does this all correctly, and this was lost when it was open coded here. We should NOT be avoiding the standard infrastructure for matching drivers to devices by re-implementing it poorly. Jason
On Fri, 19 Mar 2021 17:07:49 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote: > > On Fri, 19 Mar 2021 17:34:49 +0100 > > Christoph Hellwig <hch@lst.de> wrote: > > > > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote: > > > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci > > > > as a universal "default" within the driver core lazy bind scheme and > > > > still have working module autoloading... I'm hoping to get some > > > > research into this.. > > > > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base > > driver, which would load all the known variants in order to influence > > the match, and therefore probe ordering? > > The way the driver core works is to first match against the already > loaded driver list, then trigger an event for module loading and when > new drivers are registered they bind to unbound devices. The former is based on id_tables, the latter on MODULE_DEVICE_TABLE, we don't have either of those. As noted to Christoph, the cases where we want a vfio driver to bind to anything automatically is the exception. > So, the trouble is the event through userspace because the kernel > can't just go on to use vfio_pci until it knows userspace has failed > to satisfy the load request. Given that we don't use MODULE_DEVICE_TABLE, vfio-pci doesn't autoload. AFAIK, all tools like libvirt and driverctl that typically bind devices to vfio-pci will manually load vfio-pci. I think we can take advantage of that. > One answer is to have userspace udev have the "hook" here and when a > vfio flavour mod alias is requested on a PCI device it swaps in > vfio_pci if it can't find an alternative. > > The dream would be a system with no vfio modules loaded could do some > > echo "vfio" > /sys/bus/pci/xxx/driver_flavour > > And a module would be loaded and a struct vfio_device is created for > that device. Very easy for the user. 
This is like switching a device to a parallel universe where we do want vfio drivers to bind automatically to devices. > > If we coupled that with wildcard support in driver_override, ex. > > "vfio_pci*", and used consistent module naming, I think we'd only need > > to teach userspace about this wildcard and binding to a specific module > > would come for free. > > What would the wildcard do? It allows a driver_override to match more than one driver, not too dissimilar to your driver_flavor above. In this case it would match all driver names starting with "vfio_pci". For example if we had: softdep vfio-pci pre: vfio-pci-foo vfio-pci-bar Then we'd pre-seed the condition that drivers foo and bar precede the base vfio-pci driver, each will match the device to the driver and have an opportunity in their probe function to either claim or skip the device. Userspace could also set and exact driver_override, for example if they want to force using the base vfio-pci driver or go directly to a specific variant. > > This assumes we drop the per-variant id_table and use the probe > > function to skip devices without the necessary requirements, either > > wrong device or missing the tables we expect to expose. > > Without a module table how do we know which driver is which? > > Open coding a match table in probe() and returning failure feels hacky > to me. How's it any different than Max's get_foo_vfio_pci_driver() that calls pci_match_id() with an internal match table? It seems a better fit for the existing use cases, for example the IGD variant can use a single line table to exclude all except Intel VGA class devices in its probe callback, then test availability of the extra regions we'd expose, otherwise return -ENODEV. The NVLink variant can use pci_match_id() in the probe callback to filter out anything other than NVIDIA VGA or 3D accelerator class devices, then check for associated FDT table, or return -ENODEV. 
We already use the vfio_pci probe function to exclude devices in the deny-list and non-endpoint devices. Many drivers clearly place implicit trust in their id_table, others don't. In the case of meta drivers, I think it's fair to make use of the latter approach. > > > Should we even load it by default? One answer would be that the sysfs > > > file to switch to vfio mode goes into the core PCI layer, and that core > > > PCI code would contain a hack^H^H^H^Hhook to first load and bind vfio_pci > > > for that device. > > > > Generally we don't want to be the default driver for anything (I think > > mdev devices are the exception). Assignment to userspace or VM is a > > niche use case. Thanks, > > By "default" I mean if the user says device A is in "vfio" mode then > the kernel should > - Search for a specific driver for this device and autoload it > - If no specific driver is found then attach a default "universal" > driver for it. vfio_pci is a universal driver. > > vfio_platform is also a "universal" driver when in ACPI mode, in some > cases. > > For OF cases platform it builts its own little subsystem complete with > autoloading: > > request_module("vfio-reset:%s", vdev->compat); > vdev->of_reset = vfio_platform_lookup_reset(vdev->compat, > &vdev->reset_module); > > And it is a good example of why I don't like this subsystem design > because vfio_platform doesn't do the driver loading for OF entirely > right, vdev->compat is a single string derived from the compatible > property: > > ret = device_property_read_string(dev, "compatible", > &vdev->compat); > if (ret) > dev_err(dev, "Cannot retrieve compat for %s\n", vdev->name); > > Unfortunately OF requires that compatible is a *list* of strings and a > correct driver is supposed to evaluate all of them. The driver core > does this all correctly, and this was lost when it was open coded > here. > > We should NOT be avoiding the standard infrastructure for matching > drivers to devices by re-implementing it poorly. 
I take some blame for the request_module() behavior of vfio-platform, but I think we're on the same page that we don't want to turn vfio-pci into a nexus for loading variant drivers. Whatever solution we use for vfio-pci might translate to replacing that vfio-platform behavior. As above, I think it's possible to create that alternate universe of driver matching with a simple wildcard and load ordering approach, performing the more specific filtering in the probe callback with fall through to the next matching driver. Thanks, Alex
On Fri, Mar 19, 2021 at 03:08:09PM -0600, Alex Williamson wrote: > On Fri, 19 Mar 2021 17:07:49 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote: > > > On Fri, 19 Mar 2021 17:34:49 +0100 > > > Christoph Hellwig <hch@lst.de> wrote: > > > > > > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote: > > > > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci > > > > > as a universal "default" within the driver core lazy bind scheme and > > > > > still have working module autoloading... I'm hoping to get some > > > > > research into this.. > > > > > > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base > > > driver, which would load all the known variants in order to influence > > > the match, and therefore probe ordering? > > > > The way the driver core works is to first match against the already > > loaded driver list, then trigger an event for module loading and when > > new drivers are registered they bind to unbound devices. > > The former is based on id_tables, the latter on MODULE_DEVICE_TABLE, we > don't have either of those. Well, today we don't, but Max here adds id_table's to the special devices and a MODULE_DEVICE_TABLE would come too if we do the flavours thing below. My starting thinking is that everything should have these tables and they should work properly.. > As noted to Christoph, the cases where we want a vfio driver to > bind to anything automatically is the exception. I agree vfio should not automatically claim devices, but once vfio is told to claim a device everything from there after should be automatic. > > One answer is to have userspace udev have the "hook" here and when a > > vfio flavour mod alias is requested on a PCI device it swaps in > > vfio_pci if it can't find an alternative. 
> > > > The dream would be a system with no vfio modules loaded could do some > > > > echo "vfio" > /sys/bus/pci/xxx/driver_flavour > > > > And a module would be loaded and a struct vfio_device is created for > > that device. Very easy for the user. > > This is like switching a device to a parallel universe where we do > want vfio drivers to bind automatically to devices. Yes. If we do this I'd probably suggest that driver_override be bumped down to some user compat and 'vfio > driver_override' would just set the flavour. As-is driver_override seems dangerous as overriding the matching table could surely allow root userspace to crash the machine. In situations with trusted boot/signed modules this shouldn't be. > > > If we coupled that with wildcard support in driver_override, ex. > > > "vfio_pci*", and used consistent module naming, I think we'd only need > > > to teach userspace about this wildcard and binding to a specific module > > > would come for free. > > > > What would the wildcard do? > > It allows a driver_override to match more than one driver, not too > dissimilar to your driver_flavor above. In this case it would match > all driver names starting with "vfio_pci". For example if we had: > > softdep vfio-pci pre: vfio-pci-foo vfio-pci-bar > > Then we'd pre-seed the condition that drivers foo and bar precede the > base vfio-pci driver, each will match the device to the driver and have > an opportunity in their probe function to either claim or skip the > device. Userspace could also set and exact driver_override, for > example if they want to force using the base vfio-pci driver or go > directly to a specific variant. Okay, I see. The problem is that this makes 'vfio-pci' monolithic, in normal situations it will load *everything*. While that might not seem too bad with these simple drivers, at least the mlx5 migration driver will have a large dependency tree and pull in lots of other modules. 
Even Max's sample from v1 pulls in mlx5_core.ko and a bunch of other stuff in its orbit. This is why I want to try for fine grained autoloading first. It really is the elegant solution if we can work it out. > > Open coding a match table in probe() and returning failure feels hacky > > to me. > > How's it any different than Max's get_foo_vfio_pci_driver() that calls > pci_match_id() with an internal match table? Well, I think that is hacky too - but it is hacky only to service user space compatibility so let's put that aside > It seems a better fit for the existing use cases, for example the > IGD variant can use a single line table to exclude all except Intel > VGA class devices in its probe callback, then test availability of > the extra regions we'd expose, otherwise return -ENODEV. I don't think we should over-focus on these two firmware triggered examples. I looked at the Intel GPU driver and it already only reads the firmware thing for certain PCI ID's, we can absolutely generate a narrow match table for it. Same is true for the NVIDIA GPU. The fact this is hard or whatever is beside the point - future drivers in this scheme should have exact match tables. The mlx5 sample is a good example, as it matches a very narrow NVMe device that is properly labeled with a subvendor ID. It does not match every NVMe device and then run code to figure it out. I think this is the right thing to do as it is the only thing that would give us fine grained module loading. Even so, I'm not *so* worried about "over matching" - if IGD or the nvidia stuff load on a wide set of devices then they can just not enable their extended stuff. It wastes some kernel memory, but it is OK. And if some driver *really* gets stuck here the true answer is to improve the driver core match capability. > devices in the deny-list and non-endpoint devices. Many drivers > clearly place implicit trust in their id_table, others don't. 
In the > case of meta drivers, I think it's fair to make use of the latter > approach. Well, AFAIK, the driver core doesn't have a 'try probe, if it fails then try another driver' approach. One device, one driver. Am I missing something? I would prefer not to propose to Greg such a radical change to how driver loading works.. I also think the softdep/implicit loading/ordering will not be welcomed, it feels weird to me. > > We should NOT be avoiding the standard infrastructure for matching > > drivers to devices by re-implementing it poorly. > > I take some blame for the request_module() behavior of vfio-platform, > but I think we're on the same page that we don't want to turn > vfio-pci Okay, that's good, we can explore the driver core side and see what could work to decide if we can do fine-grained loading or not. The question is now how to stage all this work? It is too big to do all in one shot - can we reform Max's series into something mergeable without the driver core part? For instance removing the pci_device_driver and dedicated modules and only doing the compat path? It would look the same from userspace, but the internals would be split to a library mode. The patch to add in the device driver would then be small enough to go along with future driver core changes, and if it fails we still have several fallback plans to use the librarized version. > into a nexus for loading variant drivers. Whatever solution we use for > vfio-pci might translate to replacing that vfio-platform behavior. Yes, I'd want to see this too. It shows it is a general enough idea. Greg has been big on asking for lots of users for driver core changes, so it is good we have more things. Thanks, Jason
On Fri, 19 Mar 2021 19:59:43 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Fri, Mar 19, 2021 at 03:08:09PM -0600, Alex Williamson wrote: > > On Fri, 19 Mar 2021 17:07:49 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote: > > > > On Fri, 19 Mar 2021 17:34:49 +0100 > > > > Christoph Hellwig <hch@lst.de> wrote: > > > > > > > > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote: > > > > > > The wrinkle I don't yet have an easy answer to is how to load vfio_pci > > > > > > as a universal "default" within the driver core lazy bind scheme and > > > > > > still have working module autoloading... I'm hoping to get some > > > > > > research into this.. > > > > > > > > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base > > > > driver, which would load all the known variants in order to influence > > > > the match, and therefore probe ordering? > > > > > > The way the driver core works is to first match against the already > > > loaded driver list, then trigger an event for module loading and when > > > new drivers are registered they bind to unbound devices. > > > > The former is based on id_tables, the latter on MODULE_DEVICE_TABLE, we > > don't have either of those. > > Well, today we don't, but Max here adds id_table's to the special > devices and a MODULE_DEVICE_TABLE would come too if we do the flavours > thing below. I think the id_tables are the wrong approach for IGD and NVLink variants. > My starting thinking is that everything should have these tables and > they should work properly.. id_tables require ongoing maintenance whereas the existing variants require only vendor + device class and some platform feature, like a firmware or fdt table. They're meant to only add extra regions to vfio-pci base support, not extensively modify the device interface. 
> > As noted to Christoph, the cases where we want a vfio driver to > > bind to anything automatically is the exception. > > I agree vfio should not automatically claim devices, but once vfio is > told to claim a device everything from there after should be > automatic. > > > > One answer is to have userspace udev have the "hook" here and when a > > > vfio flavour mod alias is requested on a PCI device it swaps in > > > vfio_pci if it can't find an alternative. > > > > > > The dream would be a system with no vfio modules loaded could do some > > > > > > echo "vfio" > /sys/bus/pci/xxx/driver_flavour > > > > > > And a module would be loaded and a struct vfio_device is created for > > > that device. Very easy for the user. > > > > This is like switching a device to a parallel universe where we do > > want vfio drivers to bind automatically to devices. > > Yes. > > If we do this I'd probably suggest that driver_override be bumped down > to some user compat and 'vfio > driver_override' would just set the > flavour. > > As-is driver_override seems dangerous as overriding the matching table > could surely allow root userspace to crash the machine. In situations > with trusted boot/signed modules this shouldn't be. When we're dealing with meta-drivers that can bind to anything, we shouldn't rely on the match, but should instead verify the driver is appropriate in the probe callback. Even without driver_override, there's the new_id mechanism. Either method allows the root user to break driver binding. Greg has previously stated something to the effect that users get to keep all the pieces when they break something by manipulating driver binding. > > > > If we coupled that with wildcard support in driver_override, ex. > > > > "vfio_pci*", and used consistent module naming, I think we'd only need > > > > to teach userspace about this wildcard and binding to a specific module > > > > would come for free. > > > > > > What would the wildcard do? 
> > > > It allows a driver_override to match more than one driver, not too > > dissimilar to your driver_flavor above. In this case it would match > > all driver names starting with "vfio_pci". For example if we had: > > > > softdep vfio-pci pre: vfio-pci-foo vfio-pci-bar > > > > Then we'd pre-seed the condition that drivers foo and bar precede the > > base vfio-pci driver, each will match the device to the driver and have > > an opportunity in their probe function to either claim or skip the > > device. Userspace could also set and exact driver_override, for > > example if they want to force using the base vfio-pci driver or go > > directly to a specific variant. > > Okay, I see. The problem is that this makes 'vfio-pci' monolithic, in > normal situations it will load *everything*. > > While that might not seem too bad with these simple drivers, at least > the mlx5 migration driver will have a large dependency tree and pull > in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko > and a bunch of other stuff in its orbit. Luckily the mlx5 driver doesn't need to be covered by compatibility support, so we don't need to set a softdep for it and the module could be named such that a wildcard driver_override of vfio_pci* shouldn't logically include that driver. Users can manually create their own modprobe.d softdep entry if they'd like to include it. Otherwise userspace would need to know to bind to it specifically. > This is why I want to try for fine grained autoloading first. It > really is the elegant solution if we can work it out. I just don't see how we create a manageable change to userspace. > > > Open coding a match table in probe() and returning failure feels hacky > > > to me. > > > > How's it any different than Max's get_foo_vfio_pci_driver() that calls > > pci_match_id() with an internal match table? 
> > Well, I think that is hacky too - but it is hacky only to service user > space compatability so lets put that aside I don't see that dropping incompatible devices in the probe function rather than the match via id_table is necessarily a hack. I think driver-core explicitly supports this (see below). > > It seems a better fit for the existing use cases, for example the > > IGD variant can use a single line table to exclude all except Intel > > VGA class devices in its probe callback, then test availability of > > the extra regions we'd expose, otherwise return -ENODEV. > > I don't think we should over-focus on these two firmware triggered > examples. I looked at the Intel GPU driver and it already only reads > the firmware thing for certain PCI ID's, we can absolutely generate a > narrow match table for it. Same is true for the NVIDIA GPU. I'm not sure we can make this assertion, both only care about the type of device and existence of associated firmware tables. No PCI IDs are currently involved. > The fact this is hard or whatever is beside the point - future drivers > in this scheme should have exact match tables. > > The mlx5 sample is a good example, as it matches a very narrow NVMe > device that is properly labeled with a subvendor ID. It does not match > every NVMe device and then run code to figure it out. I think this is > the right thing to do as it is the only thing that would give us fine > grained module loading. Sounds like the right thing to do for that device, if it's only designed to run in this framework. That's more like the mdev device model > Even so, I'm not *so* worried about "over matching" - if IGD or the > nvidia stuff load on a wide set of devices then they can just not > enable their extended stuff. It wastes some kernel memory, but it is > OK. I'd rather they bind to the base vfio-pci driver if their extended features are not available. 
> And if some driver *really* gets stuck here the true answer is to > improve the driver core match capability. > > > devices in the deny-list and non-endpoint devices. Many drivers > > clearly place implicit trust in their id_table, others don't. In the > > case of meta drivers, I think it's fair to make use of the latter > > approach. > > Well, AFAIK, the driver core doesn't have a 'try probe, if it fails > then try another driver' approach. One device, one driver. Am I > missing something? If the driver probe callback fails, really_probe() returns 0 with the comment: /* * Ignore errors returned by ->probe so that the next driver can try * its luck. */ ret = 0; That allows bus_for_each_drv() to continue to iterate. > > I would prefer not to propose to Greg such a radical change to how > driver loading works.. Seems to be how it works already. > I also think the softdep/implicit loading/ordering will not be > welcomed, it feels weird to me. AFAICT, it works within the existing driver-core, it's largely an extension to pci-core driver_override support to enable wildcard matching, ideally along with adding the same for all buses that support driver_override. Thanks, Alex
On Fri, Mar 19, 2021 at 10:40:28PM -0600, Alex Williamson wrote: > > Well, today we don't, but Max here adds id_table's to the special > > devices and a MODULE_DEVICE_TABLE would come too if we do the flavours > > thing below. > > I think the id_tables are the wrong approach for IGD and NVLink > variants. I really disagree with this. Checking for some random bits in firmware and assuming that every device made forever into the future works with this check is not a good way to do compatibility. Christoph made the same point. We have good processes to maintain id tables, I don't see this as a problem. > > As-is driver_override seems dangerous as overriding the matching table > > could surely allow root userspace to crash the machine. In situations > > with trusted boot/signed modules this shouldn't be. > > When we're dealing with meta-drivers that can bind to anything, we > shouldn't rely on the match, but should instead verify the driver is > appropriate in the probe callback. Even without driver_override, > there's the new_id mechanism. Either method allows the root user to > break driver binding. Greg has previously stated something to the > effect that users get to keep all the pieces when they break something > by manipulating driver binding. Yes, but that is a view where root is allowed to break the kernel, we now have this optional other world where that is not allowed and root access to lots of dangerous things are now disabled. new_id and driver_override should probably be in that disable list too.. > > While that might not seem too bad with these simple drivers, at least > > the mlx5 migration driver will have a large dependency tree and pull > > in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko > > and a bunch of other stuff in its orbit. 
> > Luckily the mlx5 driver doesn't need to be covered by compatibility > support, so we don't need to set a softdep for it and the module could > be named such that a wildcard driver_override of vfio_pci* shouldn't > logically include that driver. Users can manually create their own > modprobe.d softdep entry if they'd like to include it. Otherwise > userspace would need to know to bind to it specifically. But now you are giving up on the whole point, which was to automatically load the correct specific module without special admin involvement! > > This is why I want to try for fine grained autoloading first. It > > really is the elegant solution if we can work it out. > > I just don't see how we create a manageable change to userspace. I'm not sure I understand. Even if we add a new sysfs to set some flavour then that is a pretty trivial change for userspace to move from driver_override? > > I don't think we should over-focus on these two firmware triggered > > examples. I looked at the Intel GPU driver and it already only reads > > the firmware thing for certain PCI ID's, we can absolutely generate a > > narrow match table for it. Same is true for the NVIDIA GPU. > > I'm not sure we can make this assertion, both only care about the type > of device and existence of associated firmware tables. Well, I read through the Intel GPU driver and this is how I felt it works. It doesn't even check the firmware bit unless certain PCI IDs are matched first. For NVIDIA GPU Max checked internally and we saw it looks very much like how Intel GPU works. Only some PCI IDs trigger checking on the feature the firmware thing is linked to. My point is: the actual *drivers* consuming these firmware features do *not* blindly match every PCI device and check for the firmware bit. They all have narrow matches and further only try to use the firmware thing for some subset of PCI IDs that the entire driver supports. 
Given that the actual drivers work this way there is no technical reason vfio-pci can't do this as well. We don't have to change them of course, they can stay as is if people feel really strongly. > > Even so, I'm not *so* worried about "over matching" - if IGD or the > > nvidia stuff load on a wide set of devices then they can just not > > enable their extended stuff. It wastes some kernel memory, but it is > > OK. > > I'd rather they bind to the base vfio-pci driver if their extended > features are not available. Sure it would be nice, but functionally it is no different. > > And if some driver *really* gets stuck here the true answer is to > > improve the driver core match capability. > > > > > devices in the deny-list and non-endpoint devices. Many drivers > > > clearly place implicit trust in their id_table, others don't. In the > > > case of meta drivers, I think it's fair to make use of the latter > > > approach. > > > > Well, AFAIK, the driver core doesn't have a 'try probe, if it fails > > then try another driver' approach. One device, one driver. Am I > > missing something? > > If the driver probe callback fails, really_probe() returns 0 with the > comment: > > /* > * Ignore errors returned by ->probe so that the next driver can try > * its luck. > */ > ret = 0; > > That allows bus_for_each_drv() to continue to iterate. Er, but we have no reliable way to order drivers in the list so this still assumes the system has exactly one driver match (even if some of the match is now in code). It won't work with a "universal" driver without more changes. (and I couldn't find out why Cornelia added this long ago, or how or even if it actually ended up being used) > > I also think the softdep/implicit loading/ordering will not be > > welcomed, it feels weird to me. 
> > AFAICT, it works within the existing driver-core, it's largely an > extension to pci-core driver_override support to enable wildcard > matching, ideally along with adding the same for all buses that support > driver_override. Thanks, It is the implicit ordering of module loading that is trouble. Regards, Jason
On Fri, Mar 19, 2021 at 05:07:49PM -0300, Jason Gunthorpe wrote: > The way the driver core works is to first match against the already > loaded driver list, then trigger an event for module loading and when > new drivers are registered they bind to unbound devices. > > So, the trouble is the event through userspace because the kernel > can't just go on to use vfio_pci until it knows userspace has failed > to satisfy the load request. > > One answer is to have userspace udev have the "hook" here and when a > vfio flavour mod alias is requested on a PCI device it swaps in > vfio_pci if it can't find an alternative. > > The dream would be a system with no vfio modules loaded could do some > > echo "vfio" > /sys/bus/pci/xxx/driver_flavour > > And a module would be loaded and a struct vfio_device is created for > that device. Very easy for the user. Maybe I did not communicate my suggestion last week very well. My idea is that there are no different pci_drivers vs vfio or not, but different personalities of the same driver. So the interface would still look somewhat like your suggestion above, although I'd prefer something like: echo 1 > /sys/bus/pci/xxx/use_vfio What would the flow look like for the various cases? a) if a driver is bound, and it supports the enable_vfio method that is called, and everything is controlled by the driver, which uses symbols exported from vfio/vfio_pci to implement the functionality b) if a driver is bound, but does not support the enable_vfio method it is unbound and vfio_pci is bound instead, continue at c) c) use the normal current vfio flow do the reverse on an echo 0 > /sys/bus/pci/xxx/use_vfio
On Sun, 21 Mar 2021 09:58:18 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Fri, Mar 19, 2021 at 10:40:28PM -0600, Alex Williamson wrote: > > > > Well, today we don't, but Max here adds id_table's to the special > > > devices and a MODULE_DEVICE_TABLE would come too if we do the flavours > > > thing below. > > > > I think the id_tables are the wrong approach for IGD and NVLink > > variants. > > I really disagree with this. Checking for some random bits in firmware > and assuming that every device made forever into the future works with > this check is not a good way to do compatibility. Christoph made the > same point. > > We have good processes to maintain id tables, I don't see this as a > problem. The base driver we're discussing here is a meta-driver that binds to any PCI endpoint as directed by the user. There is no id_table. There can't be any id_table unless you're expecting every device vendor to submit the exact subset of devices they have tested and condone usage with this interface. The IGD extensions here only extend that interface by providing userspace read-only access to a few additional pieces of information that we've found to be necessary for certain userspace drivers. The actual device interface is unchanged. In the case of the NVLink extensions, AIUI these are mostly extensions of a firmware defined interface for managing aspects of the interconnect to the device. It is actually the "random bits in firmware" that we want to expose, the ID of the device is somewhat tangential, we just only look for those firmware extensions in association to certain vendor devices. Of course if you start looking at features like migration support, that's more than likely not simply an additional region with optional information, it would need to interact with the actual state of the device. For those, I would very much support use of a specific id_table. That's not these. 
> > > As-is driver_override seems dangerous as overriding the matching table > > > could surely allow root userspace to crash the machine. In situations > > > with trusted boot/signed modules this shouldn't be. > > > > When we're dealing with meta-drivers that can bind to anything, we > > shouldn't rely on the match, but should instead verify the driver is > > appropriate in the probe callback. Even without driver_override, > > there's the new_id mechanism. Either method allows the root user to > > break driver binding. Greg has previously stated something to the > > effect that users get to keep all the pieces when they break something > > by manipulating driver binding. > > Yes, but that is a view where root is allowed to break the kernel, we > now have this optional other world where that is not allowed and root > access to lots of dangerous things are now disabled. > > new_id and driver_override should probably be in that disable list > too.. We don't have this other world yet, nor is it clear that we will have it. What sort of id_table is the base vfio-pci driver expected to use? There's always a risk that hardware doesn't adhere to the spec or that platform firmware might escalate an error that we'd otherwise consider mundane from a userspace driver. > > > While that might not seem too bad with these simple drivers, at least > > > the mlx5 migration driver will have a large dependency tree and pull > > > in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko > > > and a bunch of other stuff in its orbit. > > > > Luckily the mlx5 driver doesn't need to be covered by compatibility > > support, so we don't need to set a softdep for it and the module could > > be named such that a wildcard driver_override of vfio_pci* shouldn't > > logically include that driver. Users can manually create their own > > modprobe.d softdep entry if they'd like to include it. Otherwise > > userspace would need to know to bind to it specifically. 
> > But now you are giving up on the whole point, which was to > automatically load the correct specific module without special admin > involvement! This series only exposed a temporary compatibility interface to provide that anyway. As I understood it, the long term solution was that userspace would somehow learn which driver to use for which device. That "somehow" isn't clear to me. > > > This is why I want to try for fine grained autoloading first. It > > > really is the elegant solution if we can work it out. > > > > I just don't see how we create a manageable change to userspace. > > I'm not sure I understand. Even if we add a new sysfs to set some > flavour then that is a pretty trivial change for userspace to move > from driver_override? Perhaps for some definition of trivial that I'm not familiar with. We're talking about changing libvirt and driverctl and every distro and user that's created a custom script outside of those. Even changing from "vfio-pci" to "vfio-pci*" is a hurdle. > > > I don't think we should over-focus on these two firmware triggered > > > examples. I looked at the Intel GPU driver and it already only reads > > > the firmware thing for certain PCI ID's, we can absolutely generate a > > > narrow match table for it. Same is true for the NVIDIA GPU. > > > > I'm not sure we can make this assertion, both only care about the type > > of device and existence of associated firmware tables. > > Well, I read through the Intel GPU driver and this is how I felt it > works. It doesn't even check the firmware bit unless certain PCI IDs > are matched first. The IDs being only the PCI vendor ID and class code. The entire IGD extension is only meant to expose a vendor specific, graphics related firmware table and collateral config space, so of course we'd restrict it to that vendor for a graphics class device in approximately the right location in the system. There's a big difference between that and a fixed id_table. 
> For NVIDIA GPU Max checked internally and we saw it looks very much > like how Intel GPU works. Only some PCI IDs trigger checking on the > feature the firmware thing is linked to. And as Alexey noted, the table came up incomplete. But also those same devices exist on platforms where this extension is completely irrelevant. > My point is: the actual *drivers* consuming these firmware features do > *not* blindly match every PCI device and check for the firmware > bit. They all have narrow matches and further only try to use the > firmware thing for some subset of PCI IDs that the entire driver > supports. So because we don't check for an Intel specific graphics firmware table when binding to Realtek NIC, we can leap to the conclusion that there must be a concise id_table we can create for IGD support? > Given that the actual drivers work this way there is no technical > reason vfio-pci can't do this as well. There's a giant assumption above that I'm missing. Are you expecting that vendors are actually going to keep up with submitting device IDs that they claim to have tested and support with vfio-pci and all other devices won't be allowed to bind? That would single handedly destroy any non-enterprise use cases of vfio-pci. > We don't have to change them of course, they can stay as is if people > feel really strongly. > > > > Even so, I'm not *so* worried about "over matching" - if IGD or the > > > nvidia stuff load on a wide set of devices then they can just not > > > enable their extended stuff. It wastes some kernel memory, but it is > > > OK. > > > > I'd rather they bind to the base vfio-pci driver if their extended > > features are not available. > > Sure it would be nice, but functionally it is no different. Exactly, the device interface is not changed, so why is it such a heinous misstep that we should test for the feature we're trying to expose rather than a specific ID and fall through if we don't find it? 
> > > And if some driver *really* gets stuck here the true answer is to > > > improve the driver core match capability. > > > > > > > devices in the deny-list and non-endpoint devices. Many drivers > > > > clearly place implicit trust in their id_table, others don't. In the > > > > case of meta drivers, I think it's fair to make use of the latter > > > > approach. > > > > > > Well, AFAIK, the driver core doesn't have a 'try probe, if it fails > > > then try another driver' approach. One device, one driver. Am I > > > missing something? > > > > If the driver probe callback fails, really_probe() returns 0 with the > > comment: > > > > /* > > * Ignore errors returned by ->probe so that the next driver can try > > * its luck. > > */ > > ret = 0; > > > > That allows bus_for_each_drv() to continue to iterate. > > Er, but we have no reliable way to order drivers in the list so this > still assumes the system has exactly one driver match (even if some of > the match is now in code). > > It won't work with a "universal" driver without more changes. > > (and I couldn't find out why Cornelia added this long ago, or how or > even if it actually ended up being used) You'd need to go further back than Conny touching it, the original import into git had: void driver_attach(struct device_driver * drv) { struct bus_type * bus = drv->bus; struct list_head * entry; int error; if (!bus->match) return; list_for_each(entry, &bus->devices.list) { struct device * dev = container_of(entry, struct device, bus_list); if (!dev->driver) { error = driver_probe_device(drv, dev); if (error && (error != -ENODEV)) /* driver matched but the probe failed */ printk(KERN_WARNING "%s: probe of %s failed with error %d\n", drv->name, dev->bus_id, error); } } } So unless you want to do some bitkeeper archaeology, we've always allowed driver probes to fail and fall through to the next one, not even complaining with -ENODEV. 
In practice it hasn't been an issue because how many drivers do you expect to have that would even try to claim a device. Ordering is only important when there's a catch-all so we need to figure out how to make that last among a class of drivers that will attempt to claim a device. The softdep is a bit of a hack to do that, I'll admit, but I don't see how the alternate driver flavor universe solves having a catch-all either. Thanks, Alex
On Mon, Mar 22, 2021 at 04:11:25PM +0100, Christoph Hellwig wrote: > On Fri, Mar 19, 2021 at 05:07:49PM -0300, Jason Gunthorpe wrote: > > The way the driver core works is to first match against the already > > loaded driver list, then trigger an event for module loading and when > > new drivers are registered they bind to unbound devices. > > > > So, the trouble is the event through userspace because the kernel > > can't just go on to use vfio_pci until it knows userspace has failed > > to satisfy the load request. > > > > One answer is to have userspace udev have the "hook" here and when a > > vfio flavour mod alias is requested on a PCI device it swaps in > > vfio_pci if it can't find an alternative. > > > > The dream would be a system with no vfio modules loaded could do some > > > > echo "vfio" > /sys/bus/pci/xxx/driver_flavour > > > > And a module would be loaded and a struct vfio_device is created for > > that device. Very easy for the user. > > Maybe I did not communicate my suggestion last week very well. My > idea is that there are no different pci_drivers vs vfio or not, > but different personalities of the same driver. This isn't quite the scenario that needs solving. Lets go back to Max's V1 posting: The mlx5_vfio_pci.c pci_driver matches this: + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042, + PCI_VENDOR_ID_MELLANOX, PCI_ANY_ID) }, /* Virtio SNAP controllers */ This overlaps with the match table in drivers/virtio/virtio_pci_common.c: { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, So, if we do as you propose we have to add something mellanox specific to virtio_pci_common which seems to me to just repeating this whole problem except in more drivers. 
The general thing that is happening is people are adding VM migration capability to existing standard PCI interfaces like VFIO, NVMe, etc. At least in this mlx5 situation the PF driver provides the HW access to do the migration and the vfio mlx5 driver provides all the protocol and machinery specific to the PCI standard being migrated. They are all a little different. But you could imagine some other implementation where the VF might have an extra non-standard BAR that is the migration control. This is why I like having a full stand alone pci_driver as everyone implementing this can provide the vfio_device that is appropriate for the HW. Jason
On Mon, Mar 22, 2021 at 01:44:11PM -0300, Jason Gunthorpe wrote: > This isn't quite the scenario that needs solving. Lets go back to > Max's V1 posting: > > The mlx5_vfio_pci.c pci_driver matches this: > > + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042, > + PCI_VENDOR_ID_MELLANOX, PCI_ANY_ID) }, /* Virtio SNAP controllers */ > > This overlaps with the match table in > drivers/virtio/virtio_pci_common.c: > > { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, > > So, if we do as you propose we have to add something mellanox specific > to virtio_pci_common which seems to me to just repeating this whole > problem except in more drivers. Oh, yikes. > The general thing that that is happening is people are adding VM > migration capability to existing standard PCI interfaces like VFIO, > NVMe, etc Well, if a migration capability is added to virtio (or NVMe) it should be standardized and not vendor specific.
On Tue, Mar 23, 2021 at 02:17:09PM +0100, Christoph Hellwig wrote: > On Mon, Mar 22, 2021 at 01:44:11PM -0300, Jason Gunthorpe wrote: > > This isn't quite the scenario that needs solving. Lets go back to > > Max's V1 posting: > > > > The mlx5_vfio_pci.c pci_driver matches this: > > > > + { PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042, > > + PCI_VENDOR_ID_MELLANOX, PCI_ANY_ID) }, /* Virtio SNAP controllers */ > > > > This overlaps with the match table in > > drivers/virtio/virtio_pci_common.c: > > > > { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, > > > > So, if we do as you propose we have to add something mellanox specific > > to virtio_pci_common which seems to me to just repeating this whole > > problem except in more drivers. > > Oh, yikes. This is why I keep saying it is a VFIO driver - it has no relation to the normal kernel drivers on the hypervisor. Even loading a normal kernel driver and switching to a VFIO mode would be unacceptably slow/disruptive. The goal is to go directly to a VFIO mode driver with PCI driver auto probing disabled to avoid attaching a regular driver. Big servers will have 1000's of these things. > > The general thing that that is happening is people are adding VM > > migration capability to existing standard PCI interfaces like VFIO, > > NVMe, etc > > Well, if a migration capability is added to virtio (or NVMe) it should > be standardized and not vendor specific. It would be nice, but it would be a challenging standard to write. I think the industry is still in the pre-standards mode of trying to even figure out how this stuff should work. IMHO PCI sig needs to tackle a big part of this as we can't embed any migration controls in the VF itself, it has to be secure for only hypervisor use. What we've got now is a Linux standard in VFIO where the uAPI to manage migration is multi-vendor and we want to plug drivers into that. 
If in a few years the industry also develops HW standards then I imagine using the same mechanism to plug in these standards based implementation. Jason
On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote: > Of course if you start looking at features like migration support, > that's more than likely not simply an additional region with optional > information, it would need to interact with the actual state of the > device. For those, I would very much support use of a specific > id_table. That's not these. What I don't understand is why do we need two different ways of inserting vendor code? > > new_id and driver_override should probably be in that disable list > > too.. > > We don't have this other world yet, nor is it clear that we will have > it. We do today, it is obscure, but there is a whole set of config options designed to disable the unsafe kernel features. Kernels booted with secure boot and signed modules tend to enable a lot of them, for instance. The people working on the IMA stuff tend to enable a lot more as you can defeat the purpose of IMA if you can hijack the kernel. > What sort of id_table is the base vfio-pci driver expected to use? If it has a match table it would be all match, this is why I called it a "universal driver" If we have a flavour then the flavour controls the activation of VFIO, not new_id or driver_override, and in vfio flavour mode we can have an all match table, if we can resolve how to choose between two drivers with overlapping matches. > > > > This is why I want to try for fine grained autoloading first. It > > > > really is the elegant solution if we can work it out. > > > > > > I just don't see how we create a manageable change to userspace. > > > > I'm not sure I understand. Even if we add a new sysfs to set some > > flavour then that is a pretty trivial change for userspace to move > > from driver_override? > > Perhaps for some definition of trivial that I'm not familiar with. > We're talking about changing libvirt and driverctl and every distro and > user that's created a custom script outside of those. 
Even changing > from "vfio-pci" to "vfio-pci*" is a hurdle. Sure, but it isn't like a major architectural shift, nor is it mandatory unless you start using this new hardware class. Userspace changes when we add kernel functionality.. The kernel just has to keep working the way it used to for old functionality. > > Well, I read through the Intel GPU driver and this is how I felt it > > works. It doesn't even check the firmware bit unless certain PCI IDs > > are matched first. > > The IDs being only the PCI vendor ID and class code. I don't mean how vfio works, I mean how the Intel GPU driver works. eg: psb_pci_probe() psb_driver_load() psb_intel_opregion_setup() if (memcmp(base, OPREGION_SIGNATURE, 16)) { i915_pci_probe() i915_driver_probe() i915_driver_hw_probe() intel_opregion_setup() if (memcmp(buf, OPREGION_SIGNATURE, 16)) { All of these memcmp's are protected by exact id_tables hung off the pci_driver's id_table. VFIO is the different case. In this case the ID match confirms that the config space has the ASLS dword at the fixed offset. If the ID doesn't match nothing should read the ASLS offset. > > For NVIDIA GPU Max checked internally and we saw it looks very much > > like how Intel GPU works. Only some PCI IDs trigger checking on the > > feature the firmware thing is linked to. > > And as Alexey noted, the table came up incomplete. But also those same > devices exist on platforms where this extension is completely > irrelevant. I understood he meant that NVIDIA GPUs *without* NVLINK can exist, but the ID table we have here is supposed to be the NVLINK compatible ID's. > So because we don't check for an Intel specific graphics firmware table > when binding to Realtek NIC, we can leap to the conclusion that there > must be a concise id_table we can create for IGD support? Concise? No, but we can see *today* what the ID table is supposed to be by just looking at the three probe functions that touch OPREGION_SIGNATURE. 
> There's a giant assumption above that I'm missing. Are you expecting > that vendors are actually going to keep up with submitting device IDs > that they claim to have tested and support with vfio-pci and all other > devices won't be allowed to bind? That would single handedly destroy > any non-enterprise use cases of vfio-pci. Why not? They do it for the in-tree GPU drivers today! The ID table for Intel GPU is even in a *header file* and we can just #include it into vfio igd as well. > So unless you want to do some bitkeeper archaeology, we've always > allowed driver probes to fail and fall through to the next one, not > even complaining with -ENODEV. In practice it hasn't been an issue > because how many drivers do you expect to have that would even try to > claim a device. Do you know of anything using this ability? It might be helpful > Ordering is only important when there's a catch-all so we need to > figure out how to make that last among a class of drivers that will > attempt to claim a device. The softdep is a bit of a hack to do > that, I'll admit, but I don't see how the alternate driver flavor > universe solves having a catch-all either. Haven't entirely got there yet, but I think the catch all probably has to be handled by userspace udev/kmod in some way, as it is the only thing that knows if there is a more specific module to load. This is the biggest problem.. And again, I feel this is all a big tangent, especially now that HCH wants to delete the nvlink stuff we should just leave igd alone. Jason
On 24/03/2021 06:32, Jason Gunthorpe wrote: >>> For NVIDIA GPU Max checked internally and we saw it looks very much >>> like how Intel GPU works. Only some PCI IDs trigger checking on the >>> feature the firmware thing is linked to. >> >> And as Alexey noted, the table came up incomplete. But also those same >> devices exist on platforms where this extension is completely >> irrelevant. > > I understood he ment that NVIDI GPUs *without* NVLINK can exist, but > the ID table we have here is supposed to be the NVLINK compatible > ID's. I also meant there are more (than in the proposed list) GPUs with NVLink which will work on P9.
On Tue, 23 Mar 2021 16:32:13 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote: > > > Of course if you start looking at features like migration support, > > that's more than likely not simply an additional region with optional > > information, it would need to interact with the actual state of the > > device. For those, I would very much support use of a specific > > id_table. That's not these. > > What I don't understand is why do we need two different ways of > inserting vendor code? Because a PCI id table only identifies the device, these drivers are looking for a device in the context of firmware dependencies. > > > new_id and driver_override should probably be in that disable list > > > too.. > > > > We don't have this other world yet, nor is it clear that we will have > > it. > > We do today, it is obscure, but there is a whole set of config options > designed to disable the unsafe kernel features. Kernels booted with > secure boot and signed modules tend to enable a lot of them, for > instance. The people working on the IMA stuff tend to enable a lot > more as you can defeat the purpose of IMA if you can hijack the > kernel. > > > What sort of id_table is the base vfio-pci driver expected to use? > > If it has a match table it would be all match, this is why I called it > a "universal driver" > > If we have a flavour then the flavour controls the activation of > VFIO, not new_id or driver_override, and in vfio flavour mode we can > have an all match table, if we can resolve how to choose between two > drivers with overlapping matches. > > > > > > This is why I want to try for fine grained autoloading first. It > > > > > really is the elegant solution if we can work it out. > > > > > > > > I just don't see how we create a manageable change to userspace. > > > > > > I'm not sure I understand. 
Even if we add a new sysfs to set some > > > flavour then that is a pretty trivial change for userspace to move > > > from driver_override? > > > > Perhaps for some definition of trivial that I'm not familiar with. > > We're talking about changing libvirt and driverctl and every distro and > > user that's created a custom script outside of those. Even changing > > from "vfio-pci" to "vfio-pci*" is a hurdle. > > Sure, but it isn't like a major architectural shift, nor is it > mandatory unless you start using this new hardware class. > > Userspace changes when we add kernel functionality.. The kernel just > has to keep working the way it used to for old functionality. Seems like we're bound to keep igd in the core as you propose below. > > > Well, I read through the Intel GPU driver and this is how I felt it > > > works. It doesn't even check the firmware bit unless certain PCI IDs > > > are matched first. > > > > The IDs being only the PCI vendor ID and class code. > > I don't mean how vfio works, I mean how the Intel GPU driver works. > > eg: > > psb_pci_probe() > psb_driver_load() > psb_intel_opregion_setup() > if (memcmp(base, OPREGION_SIGNATURE, 16)) { > > i915_pci_probe() > i915_driver_probe() > i915_driver_hw_probe() > intel_opregion_setup() > if (memcmp(buf, OPREGION_SIGNATURE, 16)) { > > All of these memcmp's are protected by exact id_tables hung off the > pci_driver's id_table. > > VFIO is the different case. In this case the ID match confirms that > the config space has the ASLS dword at the fixed offset. If the ID > doesn't match nothing should read the ASLS offset. > > > > For NVIDIA GPU Max checked internally and we saw it looks very much > > > like how Intel GPU works. Only some PCI IDs trigger checking on the > > > feature the firmware thing is linked to. > > > > And as Alexey noted, the table came up incomplete. But also those same > > devices exist on platforms where this extension is completely > > irrelevant. 
> > I understood he ment that NVIDI GPUs *without* NVLINK can exist, but > the ID table we have here is supposed to be the NVLINK compatible > ID's. Those IDs are just for the SXM2 variants of the device that can exist on a variety of platforms, only one of which includes the firmware tables to activate the vfio support. > > So because we don't check for an Intel specific graphics firmware table > > when binding to Realtek NIC, we can leap to the conclusion that there > > must be a concise id_table we can create for IGD support? > > Concise? No, but we can see *today* what the ID table is supposed to > be by just loooking and the three probe functions that touch > OPREGION_SIGNATURE. > > > There's a giant assumption above that I'm missing. Are you expecting > > that vendors are actually going to keep up with submitting device IDs > > that they claim to have tested and support with vfio-pci and all other > > devices won't be allowed to bind? That would single handedly destroy > > any non-enterprise use cases of vfio-pci. > > Why not? They do it for the in-tree GPU drivers today! The ID table > for Intel GPU is even in a *header file* and we can just #include it > into vfio igd as well. Are you volunteering to maintain the vfio-pci-igd id_table, complete with the implicit expectation that those devices are known to work? Part of the disconnect we have here might be the intended level of support. There's a Kconfig option around vfio igd support for more than one reason. I think you're looking for a significant inflection in vendor's stated support for vfio use cases, beyond the "best-effort, give it a try", that we currently have. In some ways I look forward to that, so long as users can also use it as they do today (maybe not enterprise users). I sort of see imposing an id_table on igd support as trying to impose that "vendor condoned" use case before we actually have a vendor condoning it (or signing up to maintain an id table). 
> > So unless you want to do some bitkeeper archaeology, we've always > > allowed driver probes to fail and fall through to the next one, not > > even complaining with -ENODEV. In practice it hasn't been an issue > > because how many drivers do you expect to have that would even try to > > claim a device. > > Do you know of anything using this ability? It might be helpful I don't. > > Ordering is only important when there's a catch-all so we need to > > figure out how to make that last among a class of drivers that will > > attempt to claim a device. The softdep is a bit of a hack to do > > that, I'll admit, but I don't see how the alternate driver flavor > > universe solves having a catch-all either. > > Haven't entirely got there yet, but I think the catch all probably has > to be handled by userspace udev/kmod in some way, as it is the only > thing that knows if there is a more specific module to load. This is > the biggest problem.. > > And again, I feel this is all a big tangent, especially now that HCH > wants to delete the nvlink stuff we should just leave igd alone. Determining which things stay in vfio-pci-core and which things are split to variant drivers and how those variant drivers can match the devices they intend to support seems very inline with this series. If igd stays as part of vfio-pci-core then I think we're drawing a parallel to z-pci support, where a significant part of that support is a set of extra data structures exposed through capabilities to support userspace use of the device. Therefore extra regions or data structures through capabilities, where we're not changing device access, except as required for the platform (not the device) seem to be things that fit within the core, right? Thanks, Alex
On Mon, 29 Mar 2021 17:10:53 -0600 Alex Williamson <alex.williamson@redhat.com> wrote: > On Tue, 23 Mar 2021 16:32:13 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote: > > > So unless you want to do some bitkeeper archaeology, we've always > > > allowed driver probes to fail and fall through to the next one, not > > > even complaining with -ENODEV. In practice it hasn't been an issue > > > because how many drivers do you expect to have that would even try to > > > claim a device. > > > > Do you know of anything using this ability? It might be helpful > > I don't. I've been trying to remember why I added that patch to ignore all errors (rather than only -ENODEV), but I suspect it might have been related to the concurrent probing stuff I tried to implement back then. The one instance of drivers matching to the same id I recall (s390 ctc/lcs) is actually not handled on the individual device level, but in the meta ccwgroup driver; I don't remember anything else in the s390 case. > > > > Ordering is only important when there's a catch-all so we need to > > > figure out how to make that last among a class of drivers that will > > > attempt to claim a device. The softdep is a bit of a hack to do > > > that, I'll admit, but I don't see how the alternate driver flavor > > > universe solves having a catch-all either. > > > > Haven't entirely got there yet, but I think the catch all probably has > > to be handled by userspace udev/kmod in some way, as it is the only > > thing that knows if there is a more specific module to load. This is > > the biggest problem.. > > > > And again, I feel this is all a big tangent, especially now that HCH > > wants to delete the nvlink stuff we should just leave igd alone. 
> > Determining which things stay in vfio-pci-core and which things are > split to variant drivers and how those variant drivers can match the > devices they intend to support seems very inline with this series. If > igd stays as part of vfio-pci-core then I think we're drawing a > parallel to z-pci support, where a significant part of that support is > a set of extra data structures exposed through capabilities to support > userspace use of the device. Therefore extra regions or data > structures through capabilities, where we're not changing device > access, except as required for the platform (not the device) seem to be > things that fit within the core, right? Thanks, > > Alex As we are only talking about extra data governed by a capability, I don't really see a problem with keeping it in the vfio core. For those devices that need more specialized treatment, maybe we need some kind of priority-based matching? I.e., if we match a device with drivers, start with the one with highest priority (the specialized one), and have the generic driver at the lowest priority. A higher-priority driver added later one should not affect already bound devices (and would need manual intervention again.) [I think this has come up in other places in the past as well, but I don't have any concrete pointers handy.]
On Mon, Mar 29, 2021 at 05:10:53PM -0600, Alex Williamson wrote: > On Tue, 23 Mar 2021 16:32:13 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote: > > > > > Of course if you start looking at features like migration support, > > > that's more than likely not simply an additional region with optional > > > information, it would need to interact with the actual state of the > > > device. For those, I would very much support use of a specific > > > id_table. That's not these. > > > > What I don't understand is why do we need two different ways of > > inserting vendor code? > > Because a PCI id table only identifies the device, these drivers are > looking for a device in the context of firmware dependencies. The firmware dependencies only exist for a defined list of ID's, so I don't entirely agree with this statement. I agree with below though, so lets leave it be. > > I understood he ment that NVIDI GPUs *without* NVLINK can exist, but > > the ID table we have here is supposed to be the NVLINK compatible > > ID's. > > Those IDs are just for the SXM2 variants of the device that can > exist on a variety of platforms, only one of which includes the > firmware tables to activate the vfio support. AFAIK, SXM2 is a special physical form factor that has the nvlink physical connection - it is only for this specific generation of power servers that can accept the specific nvlink those cards have. > I think you're looking for a significant inflection in vendor's stated > support for vfio use cases, beyond the "best-effort, give it a try", > that we currently have. I see, so they don't want to. Lets leave it then. Though if Xe breaks everything they need to add/maintain a proper ID table, not more hackery. > > And again, I feel this is all a big tangent, especially now that HCH > > wants to delete the nvlink stuff we should just leave igd alone. 
> > Determining which things stay in vfio-pci-core and which things are > split to variant drivers and how those variant drivers can match the > devices they intend to support seems very inline with this series. IMHO, the main litmus test for core is if variant drivers will need it or not. No variant driver should be stacked on an igd device, or if it someday is, it should implement the special igd hackery internally (and have a proper ID table). So when we split it up igd goes into vfio_pci.ko as some special behavior vfio_pci.ko's universal driver provides for IGD. Every variant driver will still need the zdev data to be exposed to userspace, and every PCI device on s390 has that extra information. So zdev goes to vfio_pci_core.ko Future things going into vfio_pci.ko need a really good reason why they can't be variant drivers instead. Jason
On Thu, 1 Apr 2021 10:12:27 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Mon, Mar 29, 2021 at 05:10:53PM -0600, Alex Williamson wrote: > > On Tue, 23 Mar 2021 16:32:13 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Mon, Mar 22, 2021 at 10:40:16AM -0600, Alex Williamson wrote: > > > > > > > Of course if you start looking at features like migration support, > > > > that's more than likely not simply an additional region with optional > > > > information, it would need to interact with the actual state of the > > > > device. For those, I would very much support use of a specific > > > > id_table. That's not these. > > > > > > What I don't understand is why do we need two different ways of > > > inserting vendor code? > > > > Because a PCI id table only identifies the device, these drivers are > > looking for a device in the context of firmware dependencies. > > The firmware dependencies only exist for a defined list of ID's, so I > don't entirely agree with this statement. I agree with below though, > so lets leave it be. > > > > I understood he ment that NVIDI GPUs *without* NVLINK can exist, but > > > the ID table we have here is supposed to be the NVLINK compatible > > > ID's. > > > > Those IDs are just for the SXM2 variants of the device that can > > exist on a variety of platforms, only one of which includes the > > firmware tables to activate the vfio support. > > AFAIK, SXM2 is a special physical form factor that has the nvlink > physical connection - it is only for this specific generation of power > servers that can accept the specific nvlink those cards have. SXM2 is not unique to Power, there are various x86 systems that support the interface, everything from NVIDIA's own line of DGX systems, various vendor systems, all the way to VARs like Super Micro and Gigabyte. 
> > I think you're looking for a significant inflection in vendor's stated > > support for vfio use cases, beyond the "best-effort, give it a try", > > that we currently have. > > I see, so they don't want to. Lets leave it then. > > Though if Xe breaks everything they need to add/maintain a proper ID > table, not more hackery. e4eccb853664 ("vfio/pci: Bypass IGD init in case of -ENODEV") is supposed to enable Xe, where the IGD code is expected to return -ENODEV and we go on with the base vfio-pci support. > > > And again, I feel this is all a big tangent, especially now that > > > HCH wants to delete the nvlink stuff we should just leave igd > > > alone. > > > > Determining which things stay in vfio-pci-core and which things are > > split to variant drivers and how those variant drivers can match the > > devices they intend to support seems very inline with this series. > > > > IMHO, the main litmus test for core is if variant drivers will need it > or not. > > No variant driver should be stacked on an igd device, or if it someday > is, it should implement the special igd hackery internally (and have a > proper ID table). So when we split it up igd goes into vfio_pci.ko as > some special behavior vfio_pci.ko's universal driver provides for IGD. > > Every variant driver will still need the zdev data to be exposed to > userspace, and every PCI device on s390 has that extra information. So > vdev goes to vfio_pci_core.ko > > Future things going into vfio_pci.ko need a really good reason why > they can't be varian drivers instead. That sounds fair. Thanks, Alex
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 829e90a2e5a3..88c89863a205 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -48,8 +48,30 @@ config VFIO_PCI_IGD

 	  To enable Intel IGD assignment through vfio-pci, say Y.

-config VFIO_PCI_NVLINK2
-	def_bool y
+config VFIO_PCI_NVLINK2GPU
+	tristate "VFIO support for NVIDIA NVLINK2 GPUs"
 	depends on VFIO_PCI_CORE && PPC_POWERNV
 	help
-	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
+	  VFIO PCI driver for NVIDIA NVLINK2 GPUs with specific extensions
+	  for P9 Witherspoon machine.
+
+config VFIO_PCI_NPU2
+	tristate "VFIO support for IBM NPU host bus adapter on P9"
+	depends on VFIO_PCI_NVLINK2GPU && PPC_POWERNV
+	help
+	  VFIO PCI specific extensions for IBM NVLink2 host bus adapter on P9
+	  Witherspoon machine.
+
+config VFIO_PCI_DRIVER_COMPAT
+	bool "VFIO PCI backward compatibility for vendor specific extensions"
+	default y
+	depends on VFIO_PCI
+	help
+	  Say Y here if you want to preserve VFIO PCI backward
+	  compatibility. vfio_pci.ko will continue to automatically use
+	  the NVLINK2, NPU2 and IGD VFIO drivers when it is attached to
+	  a compatible device.
+
+	  When N is selected the user must bind explicitly to the module
+	  they want to handle the device and vfio_pci.ko will have no
+	  device specific special behaviors. 
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index f539f32c9296..86fb62e271fc 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,10 +2,15 @@ obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o +obj-$(CONFIG_VFIO_PCI_NPU2) += npu2-vfio-pci.o +obj-$(CONFIG_VFIO_PCI_NVLINK2GPU) += nvlink2gpu-vfio-pci.o vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o -vfio-pci-core-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2gpu.o vfio_pci_npu2.o vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o vfio-pci-y := vfio_pci.o + +npu2-vfio-pci-y := npu2_vfio_pci.o + +nvlink2gpu-vfio-pci-y := nvlink2gpu_vfio_pci.o diff --git a/drivers/vfio/pci/vfio_pci_npu2.c b/drivers/vfio/pci/npu2_vfio_pci.c similarity index 64% rename from drivers/vfio/pci/vfio_pci_npu2.c rename to drivers/vfio/pci/npu2_vfio_pci.c index 717745256ab3..7071bda0f2b6 100644 --- a/drivers/vfio/pci/vfio_pci_npu2.c +++ b/drivers/vfio/pci/npu2_vfio_pci.c @@ -14,19 +14,28 @@ * Author: Alex Williamson <alex.williamson@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> #include <linux/io.h> #include <linux/pci.h> #include <linux/uaccess.h> #include <linux/vfio.h> +#include <linux/list.h> #include <linux/sched/mm.h> #include <linux/mmu_context.h> #include <asm/kvm_ppc.h> #include "vfio_pci_core.h" +#include "npu2_vfio_pci.h" #define CREATE_TRACE_POINTS #include "npu2_trace.h" +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" +#define DRIVER_DESC "NPU2 VFIO PCI - User Level meta-driver for POWER9 NPU NVLink2 HBA" + EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); struct vfio_pci_npu2_data { @@ -36,6 +45,10 @@ struct vfio_pci_npu2_data { unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */ }; +struct npu2_vfio_pci_device { + struct vfio_pci_core_device vdev; +}; + 
static size_t vfio_pci_npu2_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -120,7 +133,7 @@ static const struct vfio_pci_regops vfio_pci_npu2_regops = { .add_capability = vfio_pci_npu2_add_capability, }; -int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) +static int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) { int ret; struct vfio_pci_npu2_data *data; @@ -220,3 +233,132 @@ int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) return ret; } + +static void npu2_vfio_pci_release(void *device_data) +{ + struct vfio_pci_core_device *vdev = device_data; + + mutex_lock(&vdev->reflck->lock); + if (!(--vdev->refcnt)) { + vfio_pci_vf_token_user_add(vdev, -1); + vfio_pci_core_spapr_eeh_release(vdev); + vfio_pci_core_disable(vdev); + } + mutex_unlock(&vdev->reflck->lock); + + module_put(THIS_MODULE); +} + +static int npu2_vfio_pci_open(void *device_data) +{ + struct vfio_pci_core_device *vdev = device_data; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&vdev->reflck->lock); + + if (!vdev->refcnt) { + ret = vfio_pci_core_enable(vdev); + if (ret) + goto error; + + ret = vfio_pci_ibm_npu2_init(vdev); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, + "Failed to setup NVIDIA NV2 ATSD region\n"); + vfio_pci_core_disable(vdev); + goto error; + } + ret = 0; + vfio_pci_probe_mmaps(vdev); + vfio_pci_core_spapr_eeh_open(vdev); + vfio_pci_vf_token_user_add(vdev, 1); + } + vdev->refcnt++; +error: + mutex_unlock(&vdev->reflck->lock); + if (ret) + module_put(THIS_MODULE); + return ret; +} + +static const struct vfio_device_ops npu2_vfio_pci_ops = { + .name = "npu2-vfio-pci", + .open = npu2_vfio_pci_open, + .release = npu2_vfio_pci_release, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, +}; + +static int 
npu2_vfio_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct npu2_vfio_pci_device *npvdev; + int ret; + + npvdev = kzalloc(sizeof(*npvdev), GFP_KERNEL); + if (!npvdev) + return -ENOMEM; + + ret = vfio_pci_core_register_device(&npvdev->vdev, pdev, + &npu2_vfio_pci_ops); + if (ret) + goto out_free; + + return 0; + +out_free: + kfree(npvdev); + return ret; +} + +static void npu2_vfio_pci_remove(struct pci_dev *pdev) +{ + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); + struct npu2_vfio_pci_device *npvdev; + + npvdev = container_of(core_vpdev, struct npu2_vfio_pci_device, vdev); + + vfio_pci_core_unregister_device(core_vpdev); + kfree(npvdev); +} + +static const struct pci_device_id npu2_vfio_pci_table[] = { + { PCI_VDEVICE(IBM, 0x04ea) }, + { 0, } +}; + +static struct pci_driver npu2_vfio_pci_driver = { + .name = "npu2-vfio-pci", + .id_table = npu2_vfio_pci_table, + .probe = npu2_vfio_pci_probe, + .remove = npu2_vfio_pci_remove, +#ifdef CONFIG_PCI_IOV + .sriov_configure = vfio_pci_core_sriov_configure, +#endif + .err_handler = &vfio_pci_core_err_handlers, +}; + +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) +{ + if (pci_match_id(npu2_vfio_pci_driver.id_table, pdev)) + return &npu2_vfio_pci_driver; + return NULL; +} +EXPORT_SYMBOL_GPL(get_npu2_vfio_pci_driver); +#endif + +module_pci_driver(npu2_vfio_pci_driver); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/npu2_vfio_pci.h b/drivers/vfio/pci/npu2_vfio_pci.h new file mode 100644 index 000000000000..92010d340346 --- /dev/null +++ b/drivers/vfio/pci/npu2_vfio_pci.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights reserved. 
+ * Author: Max Gurtovoy <mgurtovoy@nvidia.com> + */ + +#ifndef NPU2_VFIO_PCI_H +#define NPU2_VFIO_PCI_H + +#include <linux/pci.h> +#include <linux/module.h> + +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT +#if defined(CONFIG_VFIO_PCI_NPU2) || defined(CONFIG_VFIO_PCI_NPU2_MODULE) +struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev); +#else +static inline struct pci_driver *get_npu2_vfio_pci_driver(struct pci_dev *pdev) +{ + return NULL; +} +#endif +#endif + +#endif /* NPU2_VFIO_PCI_H */ diff --git a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c similarity index 67% rename from drivers/vfio/pci/vfio_pci_nvlink2gpu.c rename to drivers/vfio/pci/nvlink2gpu_vfio_pci.c index 6dce1e78ee82..84a5ac1ce8ac 100644 --- a/drivers/vfio/pci/vfio_pci_nvlink2gpu.c +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. + * VFIO PCI NVIDIA NVLink2 GPUs support. * * Copyright (C) 2018 IBM Corp. All rights reserved. 
* Author: Alexey Kardashevskiy <aik@ozlabs.ru> @@ -12,6 +12,9 @@ * Author: Alex Williamson <alex.williamson@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> #include <linux/io.h> #include <linux/pci.h> #include <linux/uaccess.h> @@ -21,10 +24,15 @@ #include <asm/kvm_ppc.h> #include "vfio_pci_core.h" +#include "nvlink2gpu_vfio_pci.h" #define CREATE_TRACE_POINTS #include "nvlink2gpu_trace.h" +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Alexey Kardashevskiy <aik@ozlabs.ru>" +#define DRIVER_DESC "NVLINK2GPU VFIO PCI - User Level meta-driver for NVIDIA NVLink2 GPUs" + EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); @@ -39,6 +47,10 @@ struct vfio_pci_nvgpu_data { struct notifier_block group_notifier; }; +struct nv_vfio_pci_device { + struct vfio_pci_core_device vdev; +}; + static size_t vfio_pci_nvgpu_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -207,7 +219,8 @@ static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, return NOTIFY_OK; } -int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) +static int +vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) { int ret; u64 reg[2]; @@ -293,3 +306,135 @@ int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) return ret; } + +static void nvlink2gpu_vfio_pci_release(void *device_data) +{ + struct vfio_pci_core_device *vdev = device_data; + + mutex_lock(&vdev->reflck->lock); + if (!(--vdev->refcnt)) { + vfio_pci_vf_token_user_add(vdev, -1); + vfio_pci_core_spapr_eeh_release(vdev); + vfio_pci_core_disable(vdev); + } + mutex_unlock(&vdev->reflck->lock); + + module_put(THIS_MODULE); +} + +static int nvlink2gpu_vfio_pci_open(void *device_data) +{ + struct vfio_pci_core_device *vdev = device_data; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&vdev->reflck->lock); + 
+ if (!vdev->refcnt) { + ret = vfio_pci_core_enable(vdev); + if (ret) + goto error; + + ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, + "Failed to setup NVIDIA NV2 RAM region\n"); + vfio_pci_core_disable(vdev); + goto error; + } + ret = 0; + vfio_pci_probe_mmaps(vdev); + vfio_pci_core_spapr_eeh_open(vdev); + vfio_pci_vf_token_user_add(vdev, 1); + } + vdev->refcnt++; +error: + mutex_unlock(&vdev->reflck->lock); + if (ret) + module_put(THIS_MODULE); + return ret; +} + +static const struct vfio_device_ops nvlink2gpu_vfio_pci_ops = { + .name = "nvlink2gpu-vfio-pci", + .open = nvlink2gpu_vfio_pci_open, + .release = nvlink2gpu_vfio_pci_release, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, +}; + +static int nvlink2gpu_vfio_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct nv_vfio_pci_device *nvdev; + int ret; + + nvdev = kzalloc(sizeof(*nvdev), GFP_KERNEL); + if (!nvdev) + return -ENOMEM; + + ret = vfio_pci_core_register_device(&nvdev->vdev, pdev, + &nvlink2gpu_vfio_pci_ops); + if (ret) + goto out_free; + + return 0; + +out_free: + kfree(nvdev); + return ret; +} + +static void nvlink2gpu_vfio_pci_remove(struct pci_dev *pdev) +{ + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); + struct nv_vfio_pci_device *nvdev; + + nvdev = container_of(core_vpdev, struct nv_vfio_pci_device, vdev); + + vfio_pci_core_unregister_device(core_vpdev); + kfree(nvdev); +} + +static const struct pci_device_id nvlink2gpu_vfio_pci_table[] = { + { PCI_VDEVICE(NVIDIA, 0x1DB1) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-16GB */ + { PCI_VDEVICE(NVIDIA, 0x1DB5) }, /* GV100GL-A NVIDIA Tesla V100-SXM2-32GB */ + { PCI_VDEVICE(NVIDIA, 0x1DB8) }, /* GV100GL-A NVIDIA Tesla V100-SXM3-32GB */ + { PCI_VDEVICE(NVIDIA, 
0x1DF5) }, /* GV100GL-B NVIDIA Tesla V100-SXM2-16GB */ + { 0, } +}; + +static struct pci_driver nvlink2gpu_vfio_pci_driver = { + .name = "nvlink2gpu-vfio-pci", + .id_table = nvlink2gpu_vfio_pci_table, + .probe = nvlink2gpu_vfio_pci_probe, + .remove = nvlink2gpu_vfio_pci_remove, +#ifdef CONFIG_PCI_IOV + .sriov_configure = vfio_pci_core_sriov_configure, +#endif + .err_handler = &vfio_pci_core_err_handlers, +}; + +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) +{ + if (pci_match_id(nvlink2gpu_vfio_pci_driver.id_table, pdev)) + return &nvlink2gpu_vfio_pci_driver; + return NULL; +} +EXPORT_SYMBOL_GPL(get_nvlink2gpu_vfio_pci_driver); +#endif + +module_pci_driver(nvlink2gpu_vfio_pci_driver); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/nvlink2gpu_vfio_pci.h b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h new file mode 100644 index 000000000000..ebd5b600b190 --- /dev/null +++ b/drivers/vfio/pci/nvlink2gpu_vfio_pci.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Mellanox Technologies, Ltd. All rights reserved. 
+ * Author: Max Gurtovoy <mgurtovoy@nvidia.com> + */ + +#ifndef NVLINK2GPU_VFIO_PCI_H +#define NVLINK2GPU_VFIO_PCI_H + +#include <linux/pci.h> +#include <linux/module.h> + +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT +#if defined(CONFIG_VFIO_PCI_NVLINK2GPU) || defined(CONFIG_VFIO_PCI_NVLINK2GPU_MODULE) +struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev); +#else +static inline struct pci_driver *get_nvlink2gpu_vfio_pci_driver(struct pci_dev *pdev) +{ + return NULL; +} +#endif +#endif + +#endif /* NVLINK2GPU_VFIO_PCI_H */ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index dbc0a6559914..8e81ea039f31 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -27,6 +27,10 @@ #include <linux/uaccess.h> #include "vfio_pci_core.h" +#ifdef CONFIG_VFIO_PCI_DRIVER_COMPAT +#include "npu2_vfio_pci.h" +#include "nvlink2gpu_vfio_pci.h" +#endif #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" @@ -142,14 +146,48 @@ static const struct vfio_device_ops vfio_pci_ops = { .match = vfio_pci_core_match, }; +/* + * This layer is used for backward compatibility. Hopefully it will be + * removed in the future. 
+ */ +static struct pci_driver *vfio_pci_get_compat_driver(struct pci_dev *pdev) +{ + switch (pdev->vendor) { + case PCI_VENDOR_ID_NVIDIA: + switch (pdev->device) { + case 0x1db1: + case 0x1db5: + case 0x1db8: + case 0x1df5: + return get_nvlink2gpu_vfio_pci_driver(pdev); + default: + return NULL; + } + case PCI_VENDOR_ID_IBM: + switch (pdev->device) { + case 0x04ea: + return get_npu2_vfio_pci_driver(pdev); + default: + return NULL; + } + } + + return NULL; +} + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_device *vpdev; + struct pci_driver *driver; int ret; if (vfio_pci_is_denylisted(pdev)) return -EINVAL; + driver = vfio_pci_get_compat_driver(pdev); + if (driver) + return driver->probe(pdev, id); + vpdev = kzalloc(sizeof(*vpdev), GFP_KERNEL); if (!vpdev) return -ENOMEM; @@ -167,14 +205,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void vfio_pci_remove(struct pci_dev *pdev) { - struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); - struct vfio_pci_core_device *core_vpdev = vfio_device_data(vdev); - struct vfio_pci_device *vpdev; - - vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); - - vfio_pci_core_unregister_device(core_vpdev); - kfree(vpdev); + struct pci_driver *driver; + + driver = vfio_pci_get_compat_driver(pdev); + if (driver) { + driver->remove(pdev); + } else { + struct vfio_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_core_device *core_vpdev; + struct vfio_pci_device *vpdev; + + core_vpdev = vfio_device_data(vdev); + vpdev = container_of(core_vpdev, struct vfio_pci_device, vdev); + vfio_pci_core_unregister_device(core_vpdev); + kfree(vpdev); + } } static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 4de8e352df9c..f9b39abe54cb 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ 
-354,24 +354,6 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) } } - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_nvidia_v100_nvlink2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); - goto disable_exit; - } - } - - if (pdev->vendor == PCI_VENDOR_ID_IBM && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_ibm_npu2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); - goto disable_exit; - } - } - return 0; disable_exit: diff --git a/drivers/vfio/pci/vfio_pci_core.h b/drivers/vfio/pci/vfio_pci_core.h index 8989443c3086..31f3836e606e 100644 --- a/drivers/vfio/pci/vfio_pci_core.h +++ b/drivers/vfio/pci/vfio_pci_core.h @@ -204,20 +204,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) return -ENODEV; } #endif -#ifdef CONFIG_VFIO_PCI_NVLINK2 -extern int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev); -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_nvidia_v100_nvlink2_init(struct vfio_pci_core_device *vdev) -{ - return -ENODEV; -} - -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_core_device *vdev) -{ - return -ENODEV; -} -#endif #ifdef CONFIG_S390 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
The new drivers introduced are nvlink2gpu_vfio_pci.ko and npu2_vfio_pci.ko. The first will be responsible for providing special extensions for NVIDIA GPUs with NVLINK2 support for P9 platform (and others in the future). The latter will be responsible for POWER9 NPU2 unit (NVLink2 host bus adapter). Also, preserve backward compatibility for users that were binding NVLINK2 devices to vfio_pci.ko. Hopefully this compatibility layer will be dropped in the future. Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> --- drivers/vfio/pci/Kconfig | 28 +++- drivers/vfio/pci/Makefile | 7 +- .../pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} | 144 ++++++++++++++++- drivers/vfio/pci/npu2_vfio_pci.h | 24 +++ ...pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} | 149 +++++++++++++++++- drivers/vfio/pci/nvlink2gpu_vfio_pci.h | 24 +++ drivers/vfio/pci/vfio_pci.c | 61 ++++++- drivers/vfio/pci/vfio_pci_core.c | 18 --- drivers/vfio/pci/vfio_pci_core.h | 14 -- 9 files changed, 422 insertions(+), 47 deletions(-) rename drivers/vfio/pci/{vfio_pci_npu2.c => npu2_vfio_pci.c} (64%) create mode 100644 drivers/vfio/pci/npu2_vfio_pci.h rename drivers/vfio/pci/{vfio_pci_nvlink2gpu.c => nvlink2gpu_vfio_pci.c} (67%) create mode 100644 drivers/vfio/pci/nvlink2gpu_vfio_pci.h