diff mbox series

[net-next,v2,1/1] virtual-bus: Implementation of Virtual Bus

Message ID 20191115223355.1277139-1-jeffrey.t.kirsher@intel.com (mailing list archive)
State RFC
Headers show
Series [net-next,v2,1/1] virtual-bus: Implementation of Virtual Bus | expand

Commit Message

Kirsher, Jeffrey T Nov. 15, 2019, 10:33 p.m. UTC
From: Dave Ertman <david.m.ertman@intel.com>

This is the initial implementation of the Virtual Bus,
virtbus_device and virtbus_driver.  The virtual bus is
a software based bus intended to support lightweight
devices and drivers and provide matching between them
and probing of the registered drivers.

The primary purpose of the virual bus is to provide
matching services and to pass the data pointer
contained in the virtbus_device to the virtbus_driver
during its probe call.  This will allow two separate
kernel objects to match up and start communication.

The bus will support probe/remove shutdown and
suspend/resume callbacks.

Kconfig and Makefile alterations are included

Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
Signed-off-by: Kiran Patil <kiran.patil@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
v2: Cleaned up the virtual bus interface based on feedback from Greg KH
    and provided a test driver and test virtual bus device as an example
    of how to implement the virtual bus.

 Documentation/driver-api/virtual_bus.rst      |  76 ++++
 drivers/bus/Kconfig                           |  14 +
 drivers/bus/Makefile                          |   1 +
 drivers/bus/virtual_bus.c                     | 326 ++++++++++++++++++
 include/linux/virtual_bus.h                   |  55 +++
 .../virtual_bus/virtual_bus_dev/Makefile      |   7 +
 .../virtual_bus_dev/virtual_bus_dev.c         |  67 ++++
 .../virtual_bus/virtual_bus_drv/Makefile      |   7 +
 .../virtual_bus_drv/virtual_bus_drv.c         | 101 ++++++
 9 files changed, 654 insertions(+)
 create mode 100644 Documentation/driver-api/virtual_bus.rst
 create mode 100644 drivers/bus/virtual_bus.c
 create mode 100644 include/linux/virtual_bus.h
 create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_dev/Makefile
 create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_dev/virtual_bus_dev.c
 create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_drv/Makefile
 create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_drv/virtual_bus_drv.c

Comments

Parav Pandit Nov. 15, 2019, 11:25 p.m. UTC | #1
Hi Jeff,

> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Sent: Friday, November 15, 2019 4:34 PM
> 
> From: Dave Ertman <david.m.ertman@intel.com>
> 
> This is the initial implementation of the Virtual Bus, virtbus_device and
> virtbus_driver.  The virtual bus is a software based bus intended to support
> lightweight devices and drivers and provide matching between them and
> probing of the registered drivers.
> 
> The primary purpose of the virual bus is to provide matching services and to
> pass the data pointer contained in the virtbus_device to the virtbus_driver
> during its probe call.  This will allow two separate kernel objects to match up
> and start communication.
> 
It is fundamental to know that rdma device created by virtbus_driver will be anchored to which bus for an non abusive use.
virtbus or parent pci bus?
I asked this question in v1 version of this patch.

Also since it says - 'to support lightweight devices', documenting that information is critical to avoid ambiguity.

Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1] whatever we want to call it, it overlaps with your comment about 'to support lightweight devices'.
Hence let's make things crystal clear weather the purpose is 'only matching service' or also 'lightweight devices'.
If this is only matching service, lets please remove lightweight devices part..

You additionally need modpost support for id table integration to modifo, modprobe and other tools.
A small patch similar to this one [2] is needed.
Please include in the series.

[..]

> +static const
> +struct virtbus_dev_id *virtbus_match_id(const struct virtbus_dev_id *id,
> +					struct virtbus_device *vdev)
> +{
> +	while (id->name[0]) {
> +		if (!strcmp(vdev->name, id->name)) {
> +			vdev->dev_id = id;
Matching function shouldn't be modifying the id.

> +			return id;
> +		}
> +		id++;
> +	}
> +	return NULL;
> +}
> +
> +#define to_virtbus_dev(x)	(container_of((x), struct virtbus_device, dev))
> +#define to_virtbus_drv(x)	(container_of((x), struct virtbus_driver, \
> +				 driver))
> +
> +/**
> + * virtbus_match - bind virtbus device to virtbus driver
> + * @dev: device
> + * @drv: driver
> + *
> + * Virtbus device IDs are always in "<name>.<instance>" format.
We might have to change this scheme depending on the first question I asked in the email about device anchoring.

> +
> +struct bus_type virtual_bus_type = {
> +	.name		= "virtbus",
> +	.match		= virtbus_match,
> +	.probe		= virtbus_probe,
> +	.remove		= virtbus_remove,
> +	.shutdown	= virtbus_shutdown,
> +	.suspend	= virtbus_suspend,
> +	.resume		= virtbus_resume,
> +};
Drop the tab alignment.

> +
> +/**
> + * virtbus_dev_register - add a virtual bus device
> + * @vdev: virtual bus device to add
> + */
> +int virtbus_dev_register(struct virtbus_device *vdev) {
> +	int ret;
> +
> +	if (!vdev)
> +		return -EINVAL;
No need for this check.
Driver shouldn't be called null device registration.

> +
> +	device_initialize(&vdev->dev);
> +
> +	vdev->dev.bus = &virtual_bus_type;
> +	/* All device IDs are automatically allocated */
> +	ret = ida_simple_get(&virtbus_dev_ida, 0, 0, GFP_KERNEL);
> +	if (ret < 0)
> +		return ret;
> +
This is bug, once device_initialize() is done, it must do put_device() and follow the release sequence.

> +	vdev->id = ret;
> +	dev_set_name(&vdev->dev, "%s.%d", vdev->name, vdev->id);
> +
> +	dev_dbg(&vdev->dev, "Registering VirtBus device '%s'\n",
I think 'virtbus' naming is better instead of 'VirtBus' all over. We don't do "Pci' in prints etc.

> +		dev_name(&vdev->dev));
> +
> +	ret = device_add(&vdev->dev);
> +	if (!ret)
> +		return ret;
> +
> +	/* Error adding virtual device */
> +	device_del(&vdev->dev);
> +	ida_simple_remove(&virtbus_dev_ida, vdev->id);
> +	vdev->id = VIRTBUS_DEVID_NONE;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(virtbus_dev_register);
> +
> +/**
> + * virtbus_dev_unregister - remove a virtual bus device
> + * vdev: virtual bus device we are removing  */ void
> +virtbus_dev_unregister(struct virtbus_device *vdev) {
> +	if (!IS_ERR_OR_NULL(vdev)) {
> +		device_del(&vdev->dev);
> +
> +		ida_simple_remove(&virtbus_dev_ida, vdev->id);
I believe this should be done in the release() because above device_del() may not ensure that all references to the devices are dropped.

> +		vdev->id = VIRTBUS_DEVID_NONE;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(virtbus_dev_unregister);
> +
> +struct virtbus_object {
> +	struct virtbus_device vdev;
> +	char name[];
> +};
> +
This shouldn't be needed once. More below.

> +/**
> + * virtbus_dev_release - Destroy a virtbus device
> + * @vdev: virtual device to release
> + *
> + * Note that the vdev->data which is separately allocated needs to be
> + * separately freed on it own.
> + */
> +static void virtbus_dev_release(struct device *dev) {
> +	struct virtbus_object *vo = container_of(dev, struct virtbus_object,
> +						 vdev.dev);
> +
> +	kfree(vo);
> +}
> +
> +/**
> + * virtbus_dev_alloc - allocate a virtbus device
> + * @name: name to associate with the vdev
> + * @data: pointer to data to be associated with this device  */ struct
> +virtbus_device *virtbus_dev_alloc(const char *name, void *data) {
> +	struct virtbus_object *vo;
> +
Data should not be used.
Caller needs to give a size of the object to allocate.
I discussed the example in detail with Jason in v1 of this patch. Please refer in that email.
It should be something like this.

/* size = sizeof(struct i40_virtbus_dev), and it is the first member */
virtbus_dev_alloc(size)
{
	[..]
}

struct i40_virtbus_dev {
	struct virbus_dev virtdev;
	/*... more fields that you want to share with other driver and to use in probe() */
};

irdma_probe(..)
{
	struct i40_virtbus_dev dev = container_of(dev, struct i40_virtbus_dev, dev);
}

[..]

> diff --git a/include/linux/virtual_bus.h b/include/linux/virtual_bus.h new file
> mode 100644 index 000000000000..b6f2406180f8
> --- /dev/null
> +++ b/include/linux/virtual_bus.h
> @@ -0,0 +1,55 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * virtual_bus.h - lightweight software bus
> + *
> + * Copyright (c) 2019-20 Intel Corporation
> + *
> + * Please see Documentation/driver-api/virtual_bus.rst for more
> +information  */
> +
> +#ifndef _VIRTUAL_BUS_H_
> +#define _VIRTUAL_BUS_H_
> +
> +#include <linux/device.h>
> +
> +#define VIRTBUS_DEVID_NONE	(-1)
> +#define VIRTBUS_NAME_SIZE	20
> +
> +struct virtbus_dev_id {
> +	char name[VIRTBUS_NAME_SIZE];
> +	u64 driver_data;
> +};
> +
> +struct virtbus_device {
> +	const char			*name;
> +	int				id;
> +	const struct virtbus_dev_id	*dev_id;
> +	struct device			dev;
Drop the tab based alignment and just please follow format of virtbus_driver you did below.
> +	void				*data;
Please drop data. we need only wrapper API virtbus_get/set_drvdata().
> +};

[1] https://lore.kernel.org/linux-rdma/20191107160448.20962-1-parav@mellanox.com/
[2] https://lore.kernel.org/patchwork/patch/1046991/
Parav Pandit Nov. 15, 2019, 11:42 p.m. UTC | #2
> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Sent: Friday, November 15, 2019 4:34 PM
> Subject: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> From: Dave Ertman <david.m.ertman@intel.com>
> 
> This is the initial implementation of the Virtual Bus, virtbus_device and
> virtbus_driver.  The virtual bus is a software based bus intended to support
> lightweight devices and drivers and provide matching between them and

When you wrote ' lightweight devices', you probably intent to say,
'multiple class of devices such as rdma, netdev etc for a single underlying parent device'.

'struct class' has clear meaning in kernel with above two examples and more.
Though it's not limited to these two classes, an example is always better. :-)

If so, please word that say, that avoids the confusion and it will be aligned to below primary purpose description.

> probing of the registered drivers.
> 
> The primary purpose of the virual bus is to provide matching services and to
> pass the data pointer contained in the virtbus_device to the virtbus_driver
> during its probe call.  This will allow two separate kernel objects to match up
> and start communication.
Greg KH Nov. 18, 2019, 7:48 a.m. UTC | #3
On Fri, Nov 15, 2019 at 02:33:55PM -0800, Jeff Kirsher wrote:
> From: Dave Ertman <david.m.ertman@intel.com>
> 
> This is the initial implementation of the Virtual Bus,
> virtbus_device and virtbus_driver.  The virtual bus is
> a software based bus intended to support lightweight
> devices and drivers and provide matching between them
> and probing of the registered drivers.
> 
> The primary purpose of the virual bus is to provide
> matching services and to pass the data pointer
> contained in the virtbus_device to the virtbus_driver
> during its probe call.  This will allow two separate
> kernel objects to match up and start communication.
> 
> The bus will support probe/remove shutdown and
> suspend/resume callbacks.
> 
> Kconfig and Makefile alterations are included
> 
> Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> Signed-off-by: Kiran Patil <kiran.patil@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
> v2: Cleaned up the virtual bus interface based on feedback from Greg KH
>     and provided a test driver and test virtual bus device as an example
>     of how to implement the virtual bus.
> 
>  Documentation/driver-api/virtual_bus.rst      |  76 ++++
>  drivers/bus/Kconfig                           |  14 +
>  drivers/bus/Makefile                          |   1 +
>  drivers/bus/virtual_bus.c                     | 326 ++++++++++++++++++
>  include/linux/virtual_bus.h                   |  55 +++
>  .../virtual_bus/virtual_bus_dev/Makefile      |   7 +
>  .../virtual_bus_dev/virtual_bus_dev.c         |  67 ++++
>  .../virtual_bus/virtual_bus_drv/Makefile      |   7 +
>  .../virtual_bus_drv/virtual_bus_drv.c         | 101 ++++++
>  9 files changed, 654 insertions(+)
>  create mode 100644 Documentation/driver-api/virtual_bus.rst
>  create mode 100644 drivers/bus/virtual_bus.c
>  create mode 100644 include/linux/virtual_bus.h
>  create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_dev/Makefile
>  create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_dev/virtual_bus_dev.c
>  create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_drv/Makefile
>  create mode 100644 tools/testing/selftests/virtual_bus/virtual_bus_drv/virtual_bus_drv.c
> 
> diff --git a/Documentation/driver-api/virtual_bus.rst b/Documentation/driver-api/virtual_bus.rst
> new file mode 100644
> index 000000000000..970e06267284
> --- /dev/null
> +++ b/Documentation/driver-api/virtual_bus.rst
> @@ -0,0 +1,76 @@
> +===============================
> +Virtual Bus Devices and Drivers
> +===============================
> +
> +See <linux/virtual_bus.h> for the models for virtbus_device and virtbus_driver.
> +This bus is meant to be a lightweight software based bus to attach generic
> +devices and drivers to so that a chunk of data can be passed between them.
> +
> +One use case example is an rdma driver needing to connect with several
> +different types of PCI LAN devices to be able to request resources from
> +them (queue sets).  Each LAN driver that supports rdma will register a
> +virtbus_device on the virtual bus for each physical function.  The rdma
> +driver will register as a virtbus_driver on the virtual bus to be
> +matched up with multiple virtbus_devices and receive a pointer to a
> +struct containing the callbacks that the PCI LAN drivers support for
> +registering with them.
> +
> +Sections in this document:
> +        Virtbus devices
> +        Virtbus drivers
> +        Device Enumeration
> +        Device naming and driver binding
> +        Virtual Bus API entry points
> +
> +Virtbus devices
> +~~~~~~~~~~~~~~~
> +Virtbus_devices are lightweight objects that support the minimal device
> +functionality.  Devices will accept a name, and then an automatically
> +generated index is concatenated onto it for the virtbus_device->name.
> +
> +The memory backing the "void *data" element of the virtbus_device is
> +expected to be allocated and freed outside the context of the bus
> +operations.  This memory is also expected to remain viable for the
> +duration of the time that the virtbus_device is registered to the
> +virtual bus. (e.g. from before the virtbus_dev_register until after
> +the paired virtbus_dev_unregister).
> +
> +The provided API for virtbus_dev_alloc is an efficient way of allocating
> +the memory for the virtbus_device (except for the data element) and
> +automatically freeing it when the device is removed from the bus.
> +
> +Virtbus drivers
> +~~~~~~~~~~~~~~~
> +Virtbus drivers register with the virtual bus to be matched with virtbus
> +devices.  They expect to be registered with a probe and remove callback,
> +and also support shutdown, suspend, and resume callbacks.  They otherwise
> +follow the standard driver behavior of having discovery and enumeration
> +handled in the bus infrastructure.
> +
> +Virtbus drivers register themselves with the API entry point virtbus_drv_reg
> +and unregister with virtbus_drv_unreg.
> +
> +Device Enumeration
> +~~~~~~~~~~~~~~~~~~
> +Enumeration is handled automatically by the bus infrastructure via the
> +ida_simple methods.
> +
> +Device naming and driver binding
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +The virtbus_device.dev.name is the canonical name for the device. It is
> +built from two other parts:
> +
> +        - virtbus_device.name (also used for matching).
> +        - virtbus_device.id (generated automatically from ida_simple calls)
> +
> +This allows for multiple virtbus_devices with the same name, which will all
> +be matched to the same virtbus_driver. Driver binding is performed by the
> +driver core, invoking driver probe() after finding a match between device and driver.
> +
> +Virtual Bus API entry points
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +struct virtbus_device *virtbus_dev_alloc(const char *name, void *data)
> +int virtbus_dev_register(struct virtbus_device *vdev)
> +void virtbus_dev_unregister(struct virtbus_device *vdev)
> +int virtbus_drv_register(struct virtbus_driver *vdrv, struct module *owner)
> +void virtbus_drv_unregister(struct virtbus_driver *vdrv)
> diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig
> index 6b331061d34b..30cef35b0c30 100644
> --- a/drivers/bus/Kconfig
> +++ b/drivers/bus/Kconfig
> @@ -193,4 +193,18 @@ config DA8XX_MSTPRI
>  
>  source "drivers/bus/fsl-mc/Kconfig"
>  
> +config VIRTUAL_BUS
> +       tristate "lightweight Virtual Bus"
> +       depends on PM
> +       help
> +         Provides a lightweight bus for virtbus_devices to be added to it
> +         and virtbus_drivers to be registered on it.  Will create a match
> +         between the driver and device, then call the driver's probe with
> +         the virtbus_device's struct (including a pointer for data).
> +         One example is the irdma driver needing to connect with various
> +         PCI LAN drivers to request resources (queues) to be able to perform
> +         its function.  The data in the virtbus_device created by the
> +         PCI LAN driver is a set of ops (function pointers) for the irdma
> +         driver to use to register and communicate with the PCI LAN driver.
> +
>  endmenu
> diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile
> index 16b43d3468c6..0b0ba53cbe5b 100644
> --- a/drivers/bus/Makefile
> +++ b/drivers/bus/Makefile
> @@ -33,3 +33,4 @@ obj-$(CONFIG_UNIPHIER_SYSTEM_BUS)	+= uniphier-system-bus.o
>  obj-$(CONFIG_VEXPRESS_CONFIG)	+= vexpress-config.o
>  
>  obj-$(CONFIG_DA8XX_MSTPRI)	+= da8xx-mstpri.o
> +obj-$(CONFIG_VIRTUAL_BUS)	+= virtual_bus.o
> diff --git a/drivers/bus/virtual_bus.c b/drivers/bus/virtual_bus.c
> new file mode 100644
> index 000000000000..c6eab1658391
> --- /dev/null
> +++ b/drivers/bus/virtual_bus.c
> @@ -0,0 +1,326 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * virtual_bus.c - lightweight software based bus for virtual devices
> + *
> + * Copyright (c) 2019-20 Intel Corporation
> + *
> + * Please see Documentation/driver-api/virtual_bus.rst for
> + * more information
> + */
> +
> +#include <linux/string.h>
> +#include <linux/virtual_bus.h>
> +#include <linux/of_irq.h>
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/pm_runtime.h>
> +#include <linux/pm_domain.h>
> +#include <linux/acpi.h>
> +#include <linux/device.h>
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("Lightweight Virtual Bus");
> +MODULE_AUTHOR("David Ertman <david.m.ertman@intel.com>");
> +MODULE_AUTHOR("Kiran Patil <kiran.patil@intel.com>");
> +
> +static DEFINE_IDA(virtbus_dev_ida);

Do you ever clean up this when unloaded?  I didn't see that happening
but I might have missed it.

> +
> +static const
> +struct virtbus_dev_id *virtbus_match_id(const struct virtbus_dev_id *id,
> +					struct virtbus_device *vdev)
> +{
> +	while (id->name[0]) {
> +		if (!strcmp(vdev->name, id->name)) {
> +			vdev->dev_id = id;

Why are you changing/setting the id?

> +			return id;
> +		}
> +		id++;
> +	}
> +	return NULL;
> +}
> +
> +#define to_virtbus_dev(x)	(container_of((x), struct virtbus_device, dev))
> +#define to_virtbus_drv(x)	(container_of((x), struct virtbus_driver, \
> +				 driver))
> +
> +/**
> + * virtbus_match - bind virtbus device to virtbus driver
> + * @dev: device
> + * @drv: driver
> + *
> + * Virtbus device IDs are always in "<name>.<instance>" format.
> + * Instances are automatically selected through an ida_simple_get so
> + * are positive integers. Names are taken from the device name field.
> + * Driver IDs are simple <name>.  Need to extract the name from the
> + * Virtual Device compare to name of the driver.
> + */
> +static int virtbus_match(struct device *dev, struct device_driver *drv)
> +{
> +	struct virtbus_driver *vdrv = to_virtbus_drv(drv);
> +	struct virtbus_device *vdev = to_virtbus_dev(dev);
> +
> +	if (vdrv->id_table)
> +		return virtbus_match_id(vdrv->id_table, vdev) != NULL;
> +
> +	return !strcmp(vdev->name, drv->name);
> +}
> +
> +/**
> + * virtbus_probe - call probe of the virtbus_drv
> + * @dev: device struct
> + */
> +static int virtbus_probe(struct device *dev)
> +{
> +	if (dev->driver->probe)
> +		return dev->driver->probe(dev);
> +
> +	return 0;
> +}
> +
> +static int virtbus_remove(struct device *dev)
> +{
> +	if (dev->driver->remove)
> +		return dev->driver->remove(dev);
> +
> +	return 0;
> +}
> +
> +static void virtbus_shutdown(struct device *dev)
> +{
> +	if (dev->driver->shutdown)
> +		dev->driver->shutdown(dev);
> +}
> +
> +static int virtbus_suspend(struct device *dev, pm_message_t state)
> +{
> +	if (dev->driver->suspend)
> +		return dev->driver->suspend(dev, state);
> +
> +	return 0;
> +}
> +
> +static int virtbus_resume(struct device *dev)
> +{
> +	if (dev->driver->resume)
> +		return dev->driver->resume(dev);
> +
> +	return 0;
> +}
> +
> +struct bus_type virtual_bus_type = {
> +	.name		= "virtbus",
> +	.match		= virtbus_match,
> +	.probe		= virtbus_probe,
> +	.remove		= virtbus_remove,
> +	.shutdown	= virtbus_shutdown,
> +	.suspend	= virtbus_suspend,
> +	.resume		= virtbus_resume,
> +};
> +
> +/**
> + * virtbus_dev_register - add a virtual bus device
> + * @vdev: virtual bus device to add
> + */
> +int virtbus_dev_register(struct virtbus_device *vdev)
> +{
> +	int ret;
> +
> +	if (!vdev)
> +		return -EINVAL;
> +
> +	device_initialize(&vdev->dev);
> +
> +	vdev->dev.bus = &virtual_bus_type;
> +	/* All device IDs are automatically allocated */
> +	ret = ida_simple_get(&virtbus_dev_ida, 0, 0, GFP_KERNEL);
> +	if (ret < 0)
> +		return ret;
> +
> +	vdev->id = ret;
> +	dev_set_name(&vdev->dev, "%s.%d", vdev->name, vdev->id);
> +
> +	dev_dbg(&vdev->dev, "Registering VirtBus device '%s'\n",
> +		dev_name(&vdev->dev));
> +
> +	ret = device_add(&vdev->dev);
> +	if (!ret)
> +		return ret;

This logic has tripped me up multiple times, it's an anti-pattern.
Please do:
	if (ret)
		goto device_add_error;

	return 0;

device_add_error:
	...

> +
> +	/* Error adding virtual device */
> +	device_del(&vdev->dev);
> +	ida_simple_remove(&virtbus_dev_ida, vdev->id);
> +	vdev->id = VIRTBUS_DEVID_NONE;
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(virtbus_dev_register);
> +
> +/**
> + * virtbus_dev_unregister - remove a virtual bus device
> + * vdev: virtual bus device we are removing
> + */
> +void virtbus_dev_unregister(struct virtbus_device *vdev)
> +{
> +	if (!IS_ERR_OR_NULL(vdev)) {
> +		device_del(&vdev->dev);
> +
> +		ida_simple_remove(&virtbus_dev_ida, vdev->id);
> +		vdev->id = VIRTBUS_DEVID_NONE;

Why set the id?  What will care/check this?

> +	}
> +}
> +EXPORT_SYMBOL_GPL(virtbus_dev_unregister);
> +
> +struct virtbus_object {
> +	struct virtbus_device vdev;
> +	char name[];
> +};

Why not use the name in the device structure?

thanks,

greg k-h
Greg KH Nov. 18, 2019, 7:49 a.m. UTC | #4
On Fri, Nov 15, 2019 at 02:33:55PM -0800, Jeff Kirsher wrote:
> From: Dave Ertman <david.m.ertman@intel.com>
> 
> This is the initial implementation of the Virtual Bus,
> virtbus_device and virtbus_driver.  The virtual bus is
> a software based bus intended to support lightweight
> devices and drivers and provide matching between them
> and probing of the registered drivers.
> 
> The primary purpose of the virual bus is to provide
> matching services and to pass the data pointer
> contained in the virtbus_device to the virtbus_driver
> during its probe call.  This will allow two separate
> kernel objects to match up and start communication.
> 
> The bus will support probe/remove shutdown and
> suspend/resume callbacks.
> 
> Kconfig and Makefile alterations are included
> 
> Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> Signed-off-by: Kiran Patil <kiran.patil@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
> v2: Cleaned up the virtual bus interface based on feedback from Greg KH
>     and provided a test driver and test virtual bus device as an example
>     of how to implement the virtual bus.

There is not a real user of this here, many of your exported functions
are not used at all, right?  I want to see this in "real use" to
actually determine how it works, and that's the only way you will know
if it solves your problem or not.

thanks,

greg k-h
Ertman, David M Nov. 18, 2019, 10:55 p.m. UTC | #5
> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Sunday, November 17, 2019 11:50 PM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>
> Cc: davem@davemloft.net; Ertman, David M <david.m.ertman@intel.com>;
> netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca;
> parav@mellanox.com; Patil, Kiran <kiran.patil@intel.com>
> Subject: Re: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> On Fri, Nov 15, 2019 at 02:33:55PM -0800, Jeff Kirsher wrote:
> > From: Dave Ertman <david.m.ertman@intel.com>
> >
> > This is the initial implementation of the Virtual Bus, virtbus_device
> > and virtbus_driver.  The virtual bus is a software based bus intended
> > to support lightweight devices and drivers and provide matching
> > between them and probing of the registered drivers.
> >
> > The primary purpose of the virual bus is to provide matching services
> > and to pass the data pointer contained in the virtbus_device to the
> > virtbus_driver during its probe call.  This will allow two separate
> > kernel objects to match up and start communication.
> >
> > The bus will support probe/remove shutdown and suspend/resume
> > callbacks.
> >
> > Kconfig and Makefile alterations are included
> >
> > Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> > Signed-off-by: Kiran Patil <kiran.patil@intel.com>
> > Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > ---
> > v2: Cleaned up the virtual bus interface based on feedback from Greg KH
> >     and provided a test driver and test virtual bus device as an example
> >     of how to implement the virtual bus.
> 
> There is not a real user of this here, many of your exported functions are not
> used at all, right?  I want to see this in "real use" to actually determine how it
> works, and that's the only way you will know if it solves your problem or not.
> 
> thanks,
> 
> greg k-h

I totally understand.  The ice, i40e, and irdma drivers will be available later this
week using the new virtbus.  I am implementing some changes suggested by both
you and Parav Pandit, otherwise it would already be ready :)

-Dave E
Ertman, David M Nov. 18, 2019, 10:57 p.m. UTC | #6
> -----Original Message-----
> From: Greg KH <gregkh@linuxfoundation.org>
> Sent: Sunday, November 17, 2019 11:49 PM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>
> Cc: davem@davemloft.net; Ertman, David M <david.m.ertman@intel.com>;
> netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca;
> parav@mellanox.com; Patil, Kiran <kiran.patil@intel.com>
> Subject: Re: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> On Fri, Nov 15, 2019 at 02:33:55PM -0800, Jeff Kirsher wrote:
> > From: Dave Ertman <david.m.ertman@intel.com>
> >
> > This is the initial implementation of the Virtual Bus, virtbus_device
> > and virtbus_driver.  The virtual bus is a software based bus intended
> > to support lightweight devices and drivers and provide matching
> > between them and probing of the registered drivers.
> >
> > The primary purpose of the virual bus is to provide matching services
> > and to pass the data pointer contained in the virtbus_device to the
> > virtbus_driver during its probe call.  This will allow two separate
> > kernel objects to match up and start communication.
> >
> > The bus will support probe/remove shutdown and suspend/resume
> > callbacks.
> >
> > Kconfig and Makefile alterations are included
> >
> > Signed-off-by: Dave Ertman <david.m.ertman@intel.com>
> > Signed-off-by: Kiran Patil <kiran.patil@intel.com>
> > Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > ---
> > v2: Cleaned up the virtual bus interface based on feedback from Greg KH
> >     and provided a test driver and test virtual bus device as an example
> >     of how to implement the virtual bus.
> >
> >  Documentation/driver-api/virtual_bus.rst      |  76 ++++
> >  drivers/bus/Kconfig                           |  14 +
> >  drivers/bus/Makefile                          |   1 +
> >  drivers/bus/virtual_bus.c                     | 326 ++++++++++++++++++
> >  include/linux/virtual_bus.h                   |  55 +++
> >  .../virtual_bus/virtual_bus_dev/Makefile      |   7 +
> >  .../virtual_bus_dev/virtual_bus_dev.c         |  67 ++++
> >  .../virtual_bus/virtual_bus_drv/Makefile      |   7 +
> >  .../virtual_bus_drv/virtual_bus_drv.c         | 101 ++++++
> >  9 files changed, 654 insertions(+)
> >  create mode 100644 Documentation/driver-api/virtual_bus.rst
> >  create mode 100644 drivers/bus/virtual_bus.c  create mode 100644
> > include/linux/virtual_bus.h  create mode 100644
> > tools/testing/selftests/virtual_bus/virtual_bus_dev/Makefile
> >  create mode 100644
> > tools/testing/selftests/virtual_bus/virtual_bus_dev/virtual_bus_dev.c
> >  create mode 100644
> > tools/testing/selftests/virtual_bus/virtual_bus_drv/Makefile
> >  create mode 100644
> > tools/testing/selftests/virtual_bus/virtual_bus_drv/virtual_bus_drv.c
> >
> > diff --git a/Documentation/driver-api/virtual_bus.rst
> > b/Documentation/driver-api/virtual_bus.rst
> > new file mode 100644
> > index 000000000000..970e06267284
> > --- /dev/null
> > +++ b/Documentation/driver-api/virtual_bus.rst
> > @@ -0,0 +1,76 @@
> > +===============================
> > +Virtual Bus Devices and Drivers
> > +===============================
> > +
> > +See <linux/virtual_bus.h> for the models for virtbus_device and
> virtbus_driver.
> > +This bus is meant to be a lightweight software based bus to attach
> > +generic devices and drivers to so that a chunk of data can be passed
> between them.
> > +
> > +One use case example is an rdma driver needing to connect with
> > +several different types of PCI LAN devices to be able to request
> > +resources from them (queue sets).  Each LAN driver that supports rdma
> > +will register a virtbus_device on the virtual bus for each physical
> > +function.  The rdma driver will register as a virtbus_driver on the
> > +virtual bus to be matched up with multiple virtbus_devices and
> > +receive a pointer to a struct containing the callbacks that the PCI
> > +LAN drivers support for registering with them.
> > +
> > +Sections in this document:
> > +        Virtbus devices
> > +        Virtbus drivers
> > +        Device Enumeration
> > +        Device naming and driver binding
> > +        Virtual Bus API entry points
> > +
> > +Virtbus devices
> > +~~~~~~~~~~~~~~~
> > +Virtbus_devices are lightweight objects that support the minimal
> > +device functionality.  Devices will accept a name, and then an
> > +automatically generated index is concatenated onto it for the
> virtbus_device->name.
> > +
> > +The memory backing the "void *data" element of the virtbus_device is
> > +expected to be allocated and freed outside the context of the bus
> > +operations.  This memory is also expected to remain viable for the
> > +duration of the time that the virtbus_device is registered to the
> > +virtual bus. (e.g. from before the virtbus_dev_register until after
> > +the paired virtbus_dev_unregister).
> > +
> > +The provided API for virtbus_dev_alloc is an efficient way of
> > +allocating the memory for the virtbus_device (except for the data
> > +element) and automatically freeing it when the device is removed from
> the bus.
> > +
> > +Virtbus drivers
> > +~~~~~~~~~~~~~~~
> > +Virtbus drivers register with the virtual bus to be matched with
> > +virtbus devices.  They expect to be registered with a probe and
> > +remove callback, and also support shutdown, suspend, and resume
> > +callbacks.  They otherwise follow the standard driver behavior of
> > +having discovery and enumeration handled in the bus infrastructure.
> > +
> > +Virtbus drivers register themselves with the API entry point
> > +virtbus_drv_reg and unregister with virtbus_drv_unreg.
> > +
> > +Device Enumeration
> > +~~~~~~~~~~~~~~~~~~
> > +Enumeration is handled automatically by the bus infrastructure via
> > +the ida_simple methods.
> > +
> > +Device naming and driver binding
> > +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > +The virtbus_device.dev.name is the canonical name for the device. It
> > +is built from two other parts:
> > +
> > +        - virtbus_device.name (also used for matching).
> > +        - virtbus_device.id (generated automatically from ida_simple
> > + calls)
> > +
> > +This allows for multiple virtbus_devices with the same name, which
> > +will all be matched to the same virtbus_driver. Driver binding is
> > +performed by the driver core, invoking driver probe() after finding a
> match between device and driver.
> > +
> > +Virtual Bus API entry points
> > +~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > +struct virtbus_device *virtbus_dev_alloc(const char *name, void
> > +*data) int virtbus_dev_register(struct virtbus_device *vdev) void
> > +virtbus_dev_unregister(struct virtbus_device *vdev) int
> > +virtbus_drv_register(struct virtbus_driver *vdrv, struct module
> > +*owner) void virtbus_drv_unregister(struct virtbus_driver *vdrv)
> > diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig index
> > 6b331061d34b..30cef35b0c30 100644
> > --- a/drivers/bus/Kconfig
> > +++ b/drivers/bus/Kconfig
> > @@ -193,4 +193,18 @@ config DA8XX_MSTPRI
> >
> >  source "drivers/bus/fsl-mc/Kconfig"
> >
> > +config VIRTUAL_BUS
> > +       tristate "lightweight Virtual Bus"
> > +       depends on PM
> > +       help
> > +         Provides a lightweight bus for virtbus_devices to be added to it
> > +         and virtbus_drivers to be registered on it.  Will create a match
> > +         between the driver and device, then call the driver's probe with
> > +         the virtbus_device's struct (including a pointer for data).
> > +         One example is the irdma driver needing to connect with various
> > +         PCI LAN drivers to request resources (queues) to be able to perform
> > +         its function.  The data in the virtbus_device created by the
> > +         PCI LAN driver is a set of ops (function pointers) for the irdma
> > +         driver to use to register and communicate with the PCI LAN driver.
> > +
> >  endmenu
> > diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile index
> > 16b43d3468c6..0b0ba53cbe5b 100644
> > --- a/drivers/bus/Makefile
> > +++ b/drivers/bus/Makefile
> > @@ -33,3 +33,4 @@ obj-$(CONFIG_UNIPHIER_SYSTEM_BUS)	+=
> uniphier-system-bus.o
> >  obj-$(CONFIG_VEXPRESS_CONFIG)	+= vexpress-config.o
> >
> >  obj-$(CONFIG_DA8XX_MSTPRI)	+= da8xx-mstpri.o
> > +obj-$(CONFIG_VIRTUAL_BUS)	+= virtual_bus.o
> > diff --git a/drivers/bus/virtual_bus.c b/drivers/bus/virtual_bus.c new
> > file mode 100644 index 000000000000..c6eab1658391
> > --- /dev/null
> > +++ b/drivers/bus/virtual_bus.c
> > @@ -0,0 +1,326 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * virtual_bus.c - lightweight software based bus for virtual devices
> > + *
> > + * Copyright (c) 2019-20 Intel Corporation
> > + *
> > + * Please see Documentation/driver-api/virtual_bus.rst for
> > + * more information
> > + */
> > +
> > +#include <linux/string.h>
> > +#include <linux/virtual_bus.h>
> > +#include <linux/of_irq.h>
> > +#include <linux/module.h>
> > +#include <linux/init.h>
> > +#include <linux/pm_runtime.h>
> > +#include <linux/pm_domain.h>
> > +#include <linux/acpi.h>
> > +#include <linux/device.h>
> > +
> > +MODULE_LICENSE("GPL v2");
> > +MODULE_DESCRIPTION("Lightweight Virtual Bus");
> MODULE_AUTHOR("David
> > +Ertman <david.m.ertman@intel.com>"); MODULE_AUTHOR("Kiran Patil
> > +<kiran.patil@intel.com>");
> > +
> > +static DEFINE_IDA(virtbus_dev_ida);
> 
> Do you ever clean up this when unloaded?  I didn't see that happening but I
> might have missed it.

I have added a ida_destroy() call to the module exit function.  Good catch!

> 
> > +
> > +static const
> > +struct virtbus_dev_id *virtbus_match_id(const struct virtbus_dev_id *id,
> > +					struct virtbus_device *vdev)
> > +{
> > +	while (id->name[0]) {
> > +		if (!strcmp(vdev->name, id->name)) {
> > +			vdev->dev_id = id;
> 
> Why are you changing/setting the id?

This is not the main id of the device.  I chose a bad name for this field.

This is copying the pointer to the element out of the id_table that was matched
into the struct of the virtbus_device, so that when the virtbus_device struct is
passed as a parameter to the virtbus_driver's probe, the correct driver_data can
be accessed.

I will change the name of this field to "matched_element" and add a comment
as to what is going on here.

> 
> > +			return id;
> > +		}
> > +		id++;
> > +	}
> > +	return NULL;
> > +}
> > +
> > +#define to_virtbus_dev(x)	(container_of((x), struct virtbus_device,
> dev))
> > +#define to_virtbus_drv(x)	(container_of((x), struct virtbus_driver, \
> > +				 driver))
> > +
> > +/**
> > + * virtbus_match - bind virtbus device to virtbus driver
> > + * @dev: device
> > + * @drv: driver
> > + *
> > + * Virtbus device IDs are always in "<name>.<instance>" format.
> > + * Instances are automatically selected through an ida_simple_get so
> > + * are positive integers. Names are taken from the device name field.
> > + * Driver IDs are simple <name>.  Need to extract the name from the
> > + * Virtual Device compare to name of the driver.
> > + */
> > +static int virtbus_match(struct device *dev, struct device_driver
> > +*drv) {
> > +	struct virtbus_driver *vdrv = to_virtbus_drv(drv);
> > +	struct virtbus_device *vdev = to_virtbus_dev(dev);
> > +
> > +	if (vdrv->id_table)
> > +		return virtbus_match_id(vdrv->id_table, vdev) != NULL;
> > +
> > +	return !strcmp(vdev->name, drv->name); }
> > +
> > +/**
> > + * virtbus_probe - call probe of the virtbus_drv
> > + * @dev: device struct
> > + */
> > +static int virtbus_probe(struct device *dev) {
> > +	if (dev->driver->probe)
> > +		return dev->driver->probe(dev);
> > +
> > +	return 0;
> > +}
> > +
> > +static int virtbus_remove(struct device *dev) {
> > +	if (dev->driver->remove)
> > +		return dev->driver->remove(dev);
> > +
> > +	return 0;
> > +}
> > +
> > +static void virtbus_shutdown(struct device *dev) {
> > +	if (dev->driver->shutdown)
> > +		dev->driver->shutdown(dev);
> > +}
> > +
> > +static int virtbus_suspend(struct device *dev, pm_message_t state) {
> > +	if (dev->driver->suspend)
> > +		return dev->driver->suspend(dev, state);
> > +
> > +	return 0;
> > +}
> > +
> > +static int virtbus_resume(struct device *dev) {
> > +	if (dev->driver->resume)
> > +		return dev->driver->resume(dev);
> > +
> > +	return 0;
> > +}
> > +
> > +struct bus_type virtual_bus_type = {
> > +	.name		= "virtbus",
> > +	.match		= virtbus_match,
> > +	.probe		= virtbus_probe,
> > +	.remove		= virtbus_remove,
> > +	.shutdown	= virtbus_shutdown,
> > +	.suspend	= virtbus_suspend,
> > +	.resume		= virtbus_resume,
> > +};
> > +
> > +/**
> > + * virtbus_dev_register - add a virtual bus device
> > + * @vdev: virtual bus device to add
> > + */
> > +int virtbus_dev_register(struct virtbus_device *vdev) {
> > +	int ret;
> > +
> > +	if (!vdev)
> > +		return -EINVAL;
> > +
> > +	device_initialize(&vdev->dev);
> > +
> > +	vdev->dev.bus = &virtual_bus_type;
> > +	/* All device IDs are automatically allocated */
> > +	ret = ida_simple_get(&virtbus_dev_ida, 0, 0, GFP_KERNEL);
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	vdev->id = ret;
> > +	dev_set_name(&vdev->dev, "%s.%d", vdev->name, vdev->id);
> > +
> > +	dev_dbg(&vdev->dev, "Registering VirtBus device '%s'\n",
> > +		dev_name(&vdev->dev));
> > +
> > +	ret = device_add(&vdev->dev);
> > +	if (!ret)
> > +		return ret;
> 
> This logic has tripped me up multiple times, it's an anti-pattern.
> Please do:
> 	if (ret)
> 		goto device_add_error;
> 
> 	return 0;
> 
> device_add_error:
> 	...
> 

Flow changed to match your example.

> > +
> > +	/* Error adding virtual device */
> > +	device_del(&vdev->dev);
> > +	ida_simple_remove(&virtbus_dev_ida, vdev->id);
> > +	vdev->id = VIRTBUS_DEVID_NONE;
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(virtbus_dev_register);
> > +
> > +/**
> > + * virtbus_dev_unregister - remove a virtual bus device
> > + * vdev: virtual bus device we are removing  */ void
> > +virtbus_dev_unregister(struct virtbus_device *vdev) {
> > +	if (!IS_ERR_OR_NULL(vdev)) {
> > +		device_del(&vdev->dev);
> > +
> > +		ida_simple_remove(&virtbus_dev_ida, vdev->id);
> > +		vdev->id = VIRTBUS_DEVID_NONE;
> 
> Why set the id?  What will care/check this?
> 

Removed, and removed the #define as well.  I had a thought on how I
was going to use this, but after further consideration, it didn't pan out
like I wanted.

> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(virtbus_dev_unregister);
> > +
> > +struct virtbus_object {
> > +	struct virtbus_device vdev;
> > +	char name[];
> > +};
> 
> Why not use the name in the device structure?
> 

I am implemting a change from Parav that will eliminate
the need for the entire virtbus_object.  So this name
field will be going away :)

> thanks,
> 
> greg k-h

-Dave E
Ertman, David M Nov. 19, 2019, 3:58 a.m. UTC | #7
> -----Original Message-----
> From: Parav Pandit <parav@mellanox.com>
> Sent: Friday, November 15, 2019 3:26 PM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; davem@davemloft.net;
> gregkh@linuxfoundation.org
> Cc: Ertman, David M <david.m.ertman@intel.com>;
> netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil, Kiran
> <kiran.patil@intel.com>
> Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> Hi Jeff,
> 
> > From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > Sent: Friday, November 15, 2019 4:34 PM
> >
> > From: Dave Ertman <david.m.ertman@intel.com>
> >
> > This is the initial implementation of the Virtual Bus, virtbus_device
> > and virtbus_driver.  The virtual bus is a software based bus intended
> > to support lightweight devices and drivers and provide matching
> > between them and probing of the registered drivers.
> >
> > The primary purpose of the virual bus is to provide matching services
> > and to pass the data pointer contained in the virtbus_device to the
> > virtbus_driver during its probe call.  This will allow two separate
> > kernel objects to match up and start communication.
> >
> It is fundamental to know that rdma device created by virtbus_driver will be
> anchored to which bus for an non abusive use.
> virtbus or parent pci bus?
> I asked this question in v1 version of this patch.

The model we will be using is a PCI LAN driver that will allocate and
register a virtbus_device.  The virtbus_device will be anchored to the virtual
bus, not the PCI bus.

The virtbus does not have a requirement that elements registering with it
have any association with another outside bus or device.

RDMA is not attached to any bus when it's init is called.  The virtbus_driver
that it will create will be attached to the virtual bus.

The RDMA driver will register a virtbus_driver object.  Its probe will
accept the data pointer from the virtbus_device that the PCI LAN driver
created.

> 
> Also since it says - 'to support lightweight devices', documenting that
> information is critical to avoid ambiguity.
> 
> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1]
> whatever we want to call it, it overlaps with your comment about 'to support
> lightweight devices'.
> Hence let's make things crystal clear weather the purpose is 'only matching
> service' or also 'lightweight devices'.
> If this is only matching service, lets please remove lightweight devices part..
> 

This is only for matching services for kernel objects, I will work on
phrasing this clearer.

> You additionally need modpost support for id table integration to modifo,
> modprobe and other tools.
> A small patch similar to this one [2] is needed.
> Please include in the series.
> 

modpost support added - thanks for that catch!

> [..]
> 
> > +static const
> > +struct virtbus_dev_id *virtbus_match_id(const struct virtbus_dev_id *id,
> > +					struct virtbus_device *vdev)
> > +{
> > +	while (id->name[0]) {
> > +		if (!strcmp(vdev->name, id->name)) {
> > +			vdev->dev_id = id;
> Matching function shouldn't be modifying the id.

This is not the main id of the virtbus_device.  This is copying the
element in the driver id_table that was matched into the virtbus_device
struct, so that when the virtbus_device struct is passed to the
virtbus_driver's probe, it can access the correct driver_data.

I chose a poor name for this field, I will change the name of this part of the
struct to matched_element and include a comment on what is going on here.

> 
> > +			return id;
> > +		}
> > +		id++;
> > +	}
> > +	return NULL;
> > +}
> > +
> > +#define to_virtbus_dev(x)	(container_of((x), struct virtbus_device,
> dev))
> > +#define to_virtbus_drv(x)	(container_of((x), struct virtbus_driver, \
> > +				 driver))
> > +
> > +/**
> > + * virtbus_match - bind virtbus device to virtbus driver
> > + * @dev: device
> > + * @drv: driver
> > + *
> > + * Virtbus device IDs are always in "<name>.<instance>" format.
> We might have to change this scheme depending on the first question I
> asked in the email about device anchoring.
> 
> > +
> > +struct bus_type virtual_bus_type = {
> > +	.name		= "virtbus",
> > +	.match		= virtbus_match,
> > +	.probe		= virtbus_probe,
> > +	.remove		= virtbus_remove,
> > +	.shutdown	= virtbus_shutdown,
> > +	.suspend	= virtbus_suspend,
> > +	.resume		= virtbus_resume,
> > +};
> Drop the tab alignment.
> 

Dropped :)

> > +
> > +/**
> > + * virtbus_dev_register - add a virtual bus device
> > + * @vdev: virtual bus device to add
> > + */
> > +int virtbus_dev_register(struct virtbus_device *vdev) {
> > +	int ret;
> > +
> > +	if (!vdev)
> > +		return -EINVAL;
> No need for this check.
> Driver shouldn't be called null device registration.

check removed.

> 
> > +
> > +	device_initialize(&vdev->dev);
> > +
> > +	vdev->dev.bus = &virtual_bus_type;
> > +	/* All device IDs are automatically allocated */
> > +	ret = ida_simple_get(&virtbus_dev_ida, 0, 0, GFP_KERNEL);
> > +	if (ret < 0)
> > +		return ret;
> > +
> This is bug, once device_initialize() is done, it must do put_device() and
> follow the release sequence.
> 

changed to use put_device().

> > +	vdev->id = ret;
> > +	dev_set_name(&vdev->dev, "%s.%d", vdev->name, vdev->id);
> > +
> > +	dev_dbg(&vdev->dev, "Registering VirtBus device '%s'\n",
> I think 'virtbus' naming is better instead of 'VirtBus' all over. We don't do "Pci'
> in prints etc.
> 

Changed to virtbus.

> > +		dev_name(&vdev->dev));
> > +
> > +	ret = device_add(&vdev->dev);
> > +	if (!ret)
> > +		return ret;
> > +
> > +	/* Error adding virtual device */
> > +	device_del(&vdev->dev);
> > +	ida_simple_remove(&virtbus_dev_ida, vdev->id);
> > +	vdev->id = VIRTBUS_DEVID_NONE;
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(virtbus_dev_register);
> > +
> > +/**
> > + * virtbus_dev_unregister - remove a virtual bus device
> > + * vdev: virtual bus device we are removing  */ void
> > +virtbus_dev_unregister(struct virtbus_device *vdev) {
> > +	if (!IS_ERR_OR_NULL(vdev)) {
> > +		device_del(&vdev->dev);
> > +
> > +		ida_simple_remove(&virtbus_dev_ida, vdev->id);
> I believe this should be done in the release() because above device_del()
> may not ensure that all references to the devices are dropped.
> 

ida_release moved to .release() function.

> > +		vdev->id = VIRTBUS_DEVID_NONE;
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(virtbus_dev_unregister);
> > +
> > +struct virtbus_object {
> > +	struct virtbus_device vdev;
> > +	char name[];
> > +};
> > +
> This shouldn't be needed once. More below.
> 
> > +/**
> > + * virtbus_dev_release - Destroy a virtbus device
> > + * @vdev: virtual device to release
> > + *
> > + * Note that the vdev->data which is separately allocated needs to be
> > + * separately freed on it own.
> > + */
> > +static void virtbus_dev_release(struct device *dev) {
> > +	struct virtbus_object *vo = container_of(dev, struct virtbus_object,
> > +						 vdev.dev);
> > +
> > +	kfree(vo);
> > +}
> > +
> > +/**
> > + * virtbus_dev_alloc - allocate a virtbus device
> > + * @name: name to associate with the vdev
> > + * @data: pointer to data to be associated with this device  */
> > +struct virtbus_device *virtbus_dev_alloc(const char *name, void *data) {
> > +	struct virtbus_object *vo;
> > +
> Data should not be used.
> Caller needs to give a size of the object to allocate.
> I discussed the example in detail with Jason in v1 of this patch. Please refer in
> that email.
> It should be something like this.
> 
> /* size = sizeof(struct i40_virtbus_dev), and it is the first member */
> virtbus_dev_alloc(size)
> {
> 	[..]
> }
> 
> struct i40_virtbus_dev {
> 	struct virbus_dev virtdev;
> 	/*... more fields that you want to share with other driver and to use
> in probe() */ };
> 
> irdma_probe(..)
> {
> 	struct i40_virtbus_dev dev = container_of(dev, struct
> i40_virtbus_dev, dev); }
> 
> [..]
> 
> > diff --git a/include/linux/virtual_bus.h b/include/linux/virtual_bus.h
> > new file mode 100644 index 000000000000..b6f2406180f8
> > --- /dev/null
> > +++ b/include/linux/virtual_bus.h
> > @@ -0,0 +1,55 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * virtual_bus.h - lightweight software bus
> > + *
> > + * Copyright (c) 2019-20 Intel Corporation
> > + *
> > + * Please see Documentation/driver-api/virtual_bus.rst for more
> > +information  */
> > +
> > +#ifndef _VIRTUAL_BUS_H_
> > +#define _VIRTUAL_BUS_H_
> > +
> > +#include <linux/device.h>
> > +
> > +#define VIRTBUS_DEVID_NONE	(-1)
> > +#define VIRTBUS_NAME_SIZE	20
> > +
> > +struct virtbus_dev_id {
> > +	char name[VIRTBUS_NAME_SIZE];
> > +	u64 driver_data;
> > +};
> > +
> > +struct virtbus_device {
> > +	const char			*name;
> > +	int				id;
> > +	const struct virtbus_dev_id	*dev_id;
> > +	struct device			dev;
> Drop the tab based alignment and just please follow format of virtbus_driver
> you did below.
> > +	void				*data;
> Please drop data. we need only wrapper API virtbus_get/set_drvdata().
> > +};
> 

Data dropped in favor of the device creator using a struct to contain the
virtbus_device and data field, and the virtbus_driver using a container_of()
to get to the data after receiving the virtbus_device struct in probe.

Function virtbus_dev_alloc removed from patch (since the device creator will
need to allocate for the container object).

> [1] https://lore.kernel.org/linux-rdma/20191107160448.20962-1-
> parav@mellanox.com/
> [2] https://lore.kernel.org/patchwork/patch/1046991/


Thanks for the feedback!

-Dave E
Jason Wang Nov. 19, 2019, 4:08 a.m. UTC | #8
On 2019/11/16 上午7:25, Parav Pandit wrote:
> Hi Jeff,
>
>> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>> Sent: Friday, November 15, 2019 4:34 PM
>>
>> From: Dave Ertman <david.m.ertman@intel.com>
>>
>> This is the initial implementation of the Virtual Bus, virtbus_device and
>> virtbus_driver.  The virtual bus is a software based bus intended to support
>> lightweight devices and drivers and provide matching between them and
>> probing of the registered drivers.
>>
>> The primary purpose of the virual bus is to provide matching services and to
>> pass the data pointer contained in the virtbus_device to the virtbus_driver
>> during its probe call.  This will allow two separate kernel objects to match up
>> and start communication.
>>
> It is fundamental to know that rdma device created by virtbus_driver will be anchored to which bus for an non abusive use.
> virtbus or parent pci bus?
> I asked this question in v1 version of this patch.
>
> Also since it says - 'to support lightweight devices', documenting that information is critical to avoid ambiguity.
>
> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1] whatever we want to call it, it overlaps with your comment about 'to support lightweight devices'.
> Hence let's make things crystal clear weather the purpose is 'only matching service' or also 'lightweight devices'.
> If this is only matching service, lets please remove lightweight devices part..


Yes, if it's matching + lightweight device, its function is almost a 
duplication of mdev. And I'm working on extending mdev[1] to be a 
generic module to support any types of virtual devices a while. The 
advantage of mdev is:

1) ready for the userspace driver (VFIO based)
2) have a sysfs/GUID based management interface

So for 1, it's not clear that how userspace driver would be supported 
here, or it's completely not being accounted in this series? For 2, it 
looks to me that this series leave it to the implementation, this means 
management to learn several vendor specific interfaces which seems a burden.

Note, technically Virtual Bus could be implemented on top of [1] with 
the full lifecycle API.

[1] https://lkml.org/lkml/2019/11/18/261


>
> You additionally need modpost support for id table integration to modifo, modprobe and other tools.
> A small patch similar to this one [2] is needed.
> Please include in the series.
>
> [..]


And probably a uevent method. But rethinking of this, matching through a 
single virtual bus seems not good. What if driver want to do some 
specific matching? E.g for virtio, we may want a vhost-net driver that 
only match networking device. With a single bus, it probably means you 
need another bus on top and provide the virtio specific matching there. 
This looks not straightforward as allowing multiple type of buses.

Thanks
Parav Pandit Nov. 19, 2019, 4:31 a.m. UTC | #9
Hi David,

> From: Ertman, David M <david.m.ertman@intel.com>
> Sent: Monday, November 18, 2019 9:59 PM
> Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> > -----Original Message-----
> > From: Parav Pandit <parav@mellanox.com>
> > Sent: Friday, November 15, 2019 3:26 PM
> > To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>;
> > davem@davemloft.net; gregkh@linuxfoundation.org
> > Cc: Ertman, David M <david.m.ertman@intel.com>;
> > netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> > nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil, Kiran
> > <kiran.patil@intel.com>
> > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual
> > Bus
> >
> > Hi Jeff,
> >
> > > From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > Sent: Friday, November 15, 2019 4:34 PM
> > >
> > > From: Dave Ertman <david.m.ertman@intel.com>
> > >
> > > This is the initial implementation of the Virtual Bus,
> > > virtbus_device and virtbus_driver.  The virtual bus is a software
> > > based bus intended to support lightweight devices and drivers and
> > > provide matching between them and probing of the registered drivers.
> > >
> > > The primary purpose of the virual bus is to provide matching
> > > services and to pass the data pointer contained in the
> > > virtbus_device to the virtbus_driver during its probe call.  This
> > > will allow two separate kernel objects to match up and start
> communication.
> > >
> > It is fundamental to know that rdma device created by virtbus_driver
> > will be anchored to which bus for an non abusive use.
> > virtbus or parent pci bus?
> > I asked this question in v1 version of this patch.
> 
> The model we will be using is a PCI LAN driver that will allocate and register a
> virtbus_device.  The virtbus_device will be anchored to the virtual bus, not the
> PCI bus.
o.k.

> 
> The virtbus does not have a requirement that elements registering with it have
> any association with another outside bus or device.
> 
This is what I want to capture in cover letter and documentation.

> RDMA is not attached to any bus when it's init is called.  The virtbus_driver that
> it will create will be attached to the virtual bus.
> 
> The RDMA driver will register a virtbus_driver object.  Its probe will accept the
> data pointer from the virtbus_device that the PCI LAN driver created.
> 
What I am saying that RDMA device created by the irdma driver or mlx5_ib driver should be anchored to the PCI device and not the virtbus device.

struct ib_device.dev.parent = &pci_dev->dev;

if this is not done, and if it is,

struct ib_device.dev.parent = &virtbus_dev->dev;

Than we are inviting huge burden as below.
(a) user compatibility with several tools, orchestration etc is broken, because rdma cannot be reached back to its PCI device as before.
This is some internal kernel change for 'better code handling', which is surfacing to rdma device name changing - systemd/udev broken, until all distros upgrade and implement this virtbus naming scheme.
Even with that orchestration tools shipped outside of distro are broken.

(b) virtbus must extend iommu support in intel, arm, amd, ppc systems otherwise straight away rdma is broken in those environments with this 'internal code restructure'.
These iommu doesn't support non PCI buses.

(c) anchoring on virtbus brings challenge to get unique id for persistent naming when irdma/mlx5_ib device is not created by 'user'.

This improvement by bus matching service != 'ethX to ens2f0 improvement of netdevs happened few years back'.
Hence, my input is,

irdma_virtubus_probe() {
	struct ib_device.dev.parent = &pci_dev->dev;
	ib_register_driver();
}

> >
> > Also since it says - 'to support lightweight devices', documenting
> > that information is critical to avoid ambiguity.
> >
> > Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1]
> > whatever we want to call it, it overlaps with your comment about 'to
> > support lightweight devices'.
> > Hence let's make things crystal clear weather the purpose is 'only
> > matching service' or also 'lightweight devices'.
> > If this is only matching service, lets please remove lightweight devices part..
> >
> 
> This is only for matching services for kernel objects, I will work on phrasing this
> clearer.
> 

Ok. Great. Due to above two fundamental points, we clearly write up the matching service.
Not sure naming virtbus to 'virbus_service' in bus_type is an extreme way to make this clear.

> > You additionally need modpost support for id table integration to
> > modifo, modprobe and other tools.
> > A small patch similar to this one [2] is needed.
> > Please include in the series.
> >
> 
> modpost support added - thanks for that catch!
> 
> > [..]
> >
> > > +static const
> > > +struct virtbus_dev_id *virtbus_match_id(const struct virtbus_dev_id *id,
> > > +					struct virtbus_device *vdev)
> > > +{
> > > +	while (id->name[0]) {
> > > +		if (!strcmp(vdev->name, id->name)) {
> > > +			vdev->dev_id = id;
> > Matching function shouldn't be modifying the id.
> 
> This is not the main id of the virtbus_device.  This is copying the element in the
> driver id_table that was matched into the virtbus_device struct, so that when
> the virtbus_device struct is passed to the virtbus_driver's probe, it can access
> the correct driver_data.
> 
> I chose a poor name for this field, I will change the name of this part of the
> struct to matched_element and include a comment on what is going on here.
> 
When virtbus_device is created, its class_id or device_id identifying the device is decided.
So it should be set in the create() routine.
Parav Pandit Nov. 19, 2019, 4:36 a.m. UTC | #10
Hi Jason Wang,

> From: Jason Wang <jasowang@redhat.com>
> Sent: Monday, November 18, 2019 10:08 PM
> 
> On 2019/11/16 上午7:25, Parav Pandit wrote:
> > Hi Jeff,
> >
> >> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> >> Sent: Friday, November 15, 2019 4:34 PM
> >>
> >> From: Dave Ertman <david.m.ertman@intel.com>
> >>
> >> This is the initial implementation of the Virtual Bus, virtbus_device
> >> and virtbus_driver.  The virtual bus is a software based bus intended
> >> to support lightweight devices and drivers and provide matching
> >> between them and probing of the registered drivers.
> >>
> >> The primary purpose of the virual bus is to provide matching services
> >> and to pass the data pointer contained in the virtbus_device to the
> >> virtbus_driver during its probe call.  This will allow two separate
> >> kernel objects to match up and start communication.
> >>
> > It is fundamental to know that rdma device created by virtbus_driver will be
> anchored to which bus for an non abusive use.
> > virtbus or parent pci bus?
> > I asked this question in v1 version of this patch.
> >
> > Also since it says - 'to support lightweight devices', documenting that
> information is critical to avoid ambiguity.
> >
> > Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1]
> whatever we want to call it, it overlaps with your comment about 'to support
> lightweight devices'.
> > Hence let's make things crystal clear weather the purpose is 'only matching
> service' or also 'lightweight devices'.
> > If this is only matching service, lets please remove lightweight devices part..
> 
> 
> Yes, if it's matching + lightweight device, its function is almost a duplication of
> mdev. And I'm working on extending mdev[1] to be a generic module to
> support any types of virtual devices a while. The advantage of mdev is:
> 
> 1) ready for the userspace driver (VFIO based)
> 2) have a sysfs/GUID based management interface
> 
> So for 1, it's not clear that how userspace driver would be supported here, or
> it's completely not being accounted in this series? For 2, it looks to me that this
> series leave it to the implementation, this means management to learn several
> vendor specific interfaces which seems a burden.
> 
> Note, technically Virtual Bus could be implemented on top of [1] with the full
> lifecycle API.
> 
> [1] https://lkml.org/lkml/2019/11/18/261
> 
> 
> >
> > You additionally need modpost support for id table integration to modifo,
> modprobe and other tools.
> > A small patch similar to this one [2] is needed.
> > Please include in the series.
> >
> > [..]
> 
> 
> And probably a uevent method. But rethinking of this, matching through a
> single virtual bus seems not good. What if driver want to do some specific
> matching? E.g for virtio, we may want a vhost-net driver that only match
> networking device. With a single bus, it probably means you need another bus
> on top and provide the virtio specific matching there.
> This looks not straightforward as allowing multiple type of buses.
> 
The purpose of the bus is to attach two drivers, mlx5_core (creator of netdevices) and mlx5_ib (create of rdma devices) on single PCI function.
Meaning 'multiple classes of devices' are created on top of single underlying parent device.

So bus is just the 'matching service' and nothing more. It is not meant to address virtio, mdev, sub functions usecases.
Parav Pandit Nov. 19, 2019, 4:39 a.m. UTC | #11
Hi David,

> Sent: Monday, November 18, 2019 10:32 PM
> To: Ertman, David M <david.m.ertman@intel.com>; Kirsher, Jeffrey T
> <jeffrey.t.kirsher@intel.com>; davem@davemloft.net;
> gregkh@linuxfoundation.org
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil, Kiran
> <kiran.patil@intel.com>
> Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> Hi David,
> 
> > From: Ertman, David M <david.m.ertman@intel.com>
> > Sent: Monday, November 18, 2019 9:59 PM
> > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual
> > Bus
> >
> > > -----Original Message-----
> > > From: Parav Pandit <parav@mellanox.com>
> > > Sent: Friday, November 15, 2019 3:26 PM
> > > To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>;
> > > davem@davemloft.net; gregkh@linuxfoundation.org
> > > Cc: Ertman, David M <david.m.ertman@intel.com>;
> > > netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> > > nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil, Kiran
> > > <kiran.patil@intel.com>
> > > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of
> > > Virtual Bus
> > >
> > > Hi Jeff,
> > >
> > > > From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > > Sent: Friday, November 15, 2019 4:34 PM
> > > >
> > > > From: Dave Ertman <david.m.ertman@intel.com>
> > > >
> > > > This is the initial implementation of the Virtual Bus,
> > > > virtbus_device and virtbus_driver.  The virtual bus is a software
> > > > based bus intended to support lightweight devices and drivers and
> > > > provide matching between them and probing of the registered drivers.
> > > >
> > > > The primary purpose of the virual bus is to provide matching
> > > > services and to pass the data pointer contained in the
> > > > virtbus_device to the virtbus_driver during its probe call.  This
> > > > will allow two separate kernel objects to match up and start
> > communication.
> > > >
> > > It is fundamental to know that rdma device created by virtbus_driver
> > > will be anchored to which bus for an non abusive use.
> > > virtbus or parent pci bus?
> > > I asked this question in v1 version of this patch.
> >
> > The model we will be using is a PCI LAN driver that will allocate and
> > register a virtbus_device.  The virtbus_device will be anchored to the
> > virtual bus, not the PCI bus.
> o.k.
> 
> >
> > The virtbus does not have a requirement that elements registering with
> > it have any association with another outside bus or device.
> >
> This is what I want to capture in cover letter and documentation.
> 
> > RDMA is not attached to any bus when it's init is called.  The
> > virtbus_driver that it will create will be attached to the virtual bus.
> >
> > The RDMA driver will register a virtbus_driver object.  Its probe will
> > accept the data pointer from the virtbus_device that the PCI LAN driver
> created.
> >
> What I am saying that RDMA device created by the irdma driver or mlx5_ib
> driver should be anchored to the PCI device and not the virtbus device.
> 
> struct ib_device.dev.parent = &pci_dev->dev;
> 
> if this is not done, and if it is,
> 
> struct ib_device.dev.parent = &virtbus_dev->dev;
> 
> Than we are inviting huge burden as below.
> (a) user compatibility with several tools, orchestration etc is broken, because
> rdma cannot be reached back to its PCI device as before.
> This is some internal kernel change for 'better code handling', which is
> surfacing to rdma device name changing - systemd/udev broken, until all
> distros upgrade and implement this virtbus naming scheme.
> Even with that orchestration tools shipped outside of distro are broken.
> 
> (b) virtbus must extend iommu support in intel, arm, amd, ppc systems
> otherwise straight away rdma is broken in those environments with this
> 'internal code restructure'.
> These iommu doesn't support non PCI buses.
> 
> (c) anchoring on virtbus brings challenge to get unique id for persistent naming
> when irdma/mlx5_ib device is not created by 'user'.
> 
> This improvement by bus matching service != 'ethX to ens2f0 improvement of
> netdevs happened few years back'.
> Hence, my input is,
> 
> irdma_virtubus_probe() {
> 	struct ib_device.dev.parent = &pci_dev->dev;
> 	ib_register_driver();
> }
> 
With this, I forgot to mention that, virtbus doesn't need PM callbacks, because PM core layer works on suspend/resume devices in reverse order of their creation.
Given that protocol devices (like rdma and netdev) devices shouldn't be anchored on virtbus, it doesn't need PM callbacks.
Please remove them.

suspend() will be called first on rdma device (because it was created last).
Jason Wang Nov. 19, 2019, 6:51 a.m. UTC | #12
On 2019/11/19 下午12:36, Parav Pandit wrote:
> Hi Jason Wang,
>
>> From: Jason Wang <jasowang@redhat.com>
>> Sent: Monday, November 18, 2019 10:08 PM
>>
>> On 2019/11/16 上午7:25, Parav Pandit wrote:
>>> Hi Jeff,
>>>
>>>> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>>>> Sent: Friday, November 15, 2019 4:34 PM
>>>>
>>>> From: Dave Ertman <david.m.ertman@intel.com>
>>>>
>>>> This is the initial implementation of the Virtual Bus, virtbus_device
>>>> and virtbus_driver.  The virtual bus is a software based bus intended
>>>> to support lightweight devices and drivers and provide matching
>>>> between them and probing of the registered drivers.
>>>>
>>>> The primary purpose of the virual bus is to provide matching services
>>>> and to pass the data pointer contained in the virtbus_device to the
>>>> virtbus_driver during its probe call.  This will allow two separate
>>>> kernel objects to match up and start communication.
>>>>
>>> It is fundamental to know that rdma device created by virtbus_driver will be
>> anchored to which bus for an non abusive use.
>>> virtbus or parent pci bus?
>>> I asked this question in v1 version of this patch.
>>>
>>> Also since it says - 'to support lightweight devices', documenting that
>> information is critical to avoid ambiguity.
>>> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1]
>> whatever we want to call it, it overlaps with your comment about 'to support
>> lightweight devices'.
>>> Hence let's make things crystal clear weather the purpose is 'only matching
>> service' or also 'lightweight devices'.
>>> If this is only matching service, lets please remove lightweight devices part..
>>
>> Yes, if it's matching + lightweight device, its function is almost a duplication of
>> mdev. And I'm working on extending mdev[1] to be a generic module to
>> support any types of virtual devices a while. The advantage of mdev is:
>>
>> 1) ready for the userspace driver (VFIO based)
>> 2) have a sysfs/GUID based management interface
>>
>> So for 1, it's not clear that how userspace driver would be supported here, or
>> it's completely not being accounted in this series? For 2, it looks to me that this
>> series leave it to the implementation, this means management to learn several
>> vendor specific interfaces which seems a burden.
>>
>> Note, technically Virtual Bus could be implemented on top of [1] with the full
>> lifecycle API.
>>
>> [1] https://lkml.org/lkml/2019/11/18/261
>>
>>
>>> You additionally need modpost support for id table integration to modifo,
>> modprobe and other tools.
>>> A small patch similar to this one [2] is needed.
>>> Please include in the series.
>>>
>>> [..]
>>
>> And probably a uevent method. But rethinking of this, matching through a
>> single virtual bus seems not good. What if driver want to do some specific
>> matching? E.g for virtio, we may want a vhost-net driver that only match
>> networking device. With a single bus, it probably means you need another bus
>> on top and provide the virtio specific matching there.
>> This looks not straightforward as allowing multiple type of buses.
>>
> The purpose of the bus is to attach two drivers,


Right, I just start to think whether it was generic to support the case 
as virtio or mdev to avoid function duplications.


>   mlx5_core (creator of netdevices) and mlx5_ib (create of rdma devices) on single PCI function.
> Meaning 'multiple classes of devices' are created on top of single underlying parent device.


This is not what I read, the doc said:

"
+One use case example is an rdma driver needing to connect with several
+different types of PCI LAN devices to be able to request resources from
+them (queue sets).  Each LAN driver that supports rdma will register a
+virtbus_device on the virtual bus for each physical function. The rdma
+driver will register as a virtbus_driver on the virtual bus to be
+matched up with multiple virtbus_devices and receive a pointer to a
+struct containing the callbacks that the PCI LAN drivers support for
+registering with them.

"

So it means to connect a single rdma driver with several RDMA capable 
LAN drivers on top of several PCI functions. If this is true, I'm not 
quite sure the advantage of using a bus since it's more like aggregation 
as what bond/team did.


>
> So bus is just the 'matching service' and nothing more. It is not meant to address virtio, mdev, sub functions usecases.


Probably, for virtio mdev we need more than just matching: life cycle 
management, cooperation with VFIO and we also want to be prepared for 
the device slicing (like sub functions).

Thanks
Parav Pandit Nov. 19, 2019, 7:13 a.m. UTC | #13
> From: Jason Wang <jasowang@redhat.com>
> Subject: Re: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> 
[..]

> 
> Probably, for virtio mdev we need more than just matching: life cycle
> management, cooperation with VFIO and we also want to be prepared for
> the device slicing (like sub functions).

Well I am revising my patches to life cycle sub functions via devlink interface for few reasons, as

(a) avoid mdev bus abuse (still named as mdev in your v13 series, though it is actually for vfio-mdev)
(b) support iommu
(c) manage and have coupling with devlink eswitch framework, which is very rich in several aspects
(d) get rid of limited sysfs interface for mdev creation, as netlink is standard and flexible to add params etc.

If you want to get a glimpse of old RFC work of my revised series, please refer to [1].

Jiri, Jason, me think that even virtio accelerated devices will need eswitch support. And hence, life cycling virtio accelerated devices via devlink makes a lot of sense to us.
This way user has single tool to choose what type of device he want to use (similar to ip link add link type).
So sub function flavour will be something like (virtio or sf).

So I am reviving my old RFC [1] back now in few days as actual patches based on series [2].

[1] https://lkml.org/lkml/2019/3/1/19
[2] https://lore.kernel.org/linux-rdma/20191107160448.20962-1-parav@mellanox.com/
Jason Wang Nov. 19, 2019, 7:37 a.m. UTC | #14
On 2019/11/19 下午3:13, Parav Pandit wrote:
>> From: Jason Wang <jasowang@redhat.com>
>> Subject: Re: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
>>
>>
> [..]
>
>> Probably, for virtio mdev we need more than just matching: life cycle
>> management, cooperation with VFIO and we also want to be prepared for
>> the device slicing (like sub functions).
> Well I am revising my patches to life cycle sub functions via devlink interface for few reasons, as
>
> (a) avoid mdev bus abuse (still named as mdev in your v13 series, though it is actually for vfio-mdev)


Yes, but it could be simply renamed to "vfio-mdev".


> (b) support iommu


That is already supported by mdev.


> (c) manage and have coupling with devlink eswitch framework, which is very rich in several aspects


Good point.


> (d) get rid of limited sysfs interface for mdev creation, as netlink is standard and flexible to add params etc.


Standard but net specific.


>
> If you want to get a glimpse of old RFC work of my revised series, please refer to [1].


Will do.


>
> Jiri, Jason, me think that even virtio accelerated devices will need eswitch support. And hence, life cycling virtio accelerated devices via devlink makes a lot of sense to us.
> This way user has single tool to choose what type of device he want to use (similar to ip link add link type).
> So sub function flavour will be something like (virtio or sf).


Networking is only one of the types that is supported in virtio-mdev. 
The codes are generic enough to support any kind of virtio device 
(block, scsi, crypto etc). Sysfs is less flexible but type independent. 
I agree that devlink is standard and feature richer but still network 
specific. It's probably hard to add devlink to other type of physical 
drivers. I'm thinking whether it's possible to combine syfs and devlink: 
e.g the mdev is available only after the sub fuction is created and 
fully configured by devlink.

Thanks


>
> So I am reviving my old RFC [1] back now in few days as actual patches based on series [2].
>
> [1] https://lkml.org/lkml/2019/3/1/19
> [2] https://lore.kernel.org/linux-rdma/20191107160448.20962-1-parav@mellanox.com/
>
Jason Wang Nov. 19, 2019, 8:04 a.m. UTC | #15
On 2019/11/18 下午3:48, Greg KH wrote:
> +Virtbus drivers
> +~~~~~~~~~~~~~~~
> +Virtbus drivers register with the virtual bus to be matched with virtbus
> +devices.  They expect to be registered with a probe and remove callback,
> +and also support shutdown, suspend, and resume callbacks.  They otherwise
> +follow the standard driver behavior of having discovery and enumeration
> +handled in the bus infrastructure.
> +
> +Virtbus drivers register themselves with the API entry point virtbus_drv_reg
> +and unregister with virtbus_drv_unreg.
> +
> +Device Enumeration
> +~~~~~~~~~~~~~~~~~~
> +Enumeration is handled automatically by the bus infrastructure via the
> +ida_simple methods.
> +
> +Device naming and driver binding
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +The virtbus_device.dev.name is the canonical name for the device. It is
> +built from two other parts:
> +
> +        - virtbus_device.name (also used for matching).
> +        - virtbus_device.id (generated automatically from ida_simple calls)
> +
> +This allows for multiple virtbus_devices with the same name, which will all
> +be matched to the same virtbus_driver. Driver binding is performed by the
> +driver core, invoking driver probe() after finding a match between device and driver.
> +
> +Virtual Bus API entry points
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +struct virtbus_device *virtbus_dev_alloc(const char *name, void *data)


Hi:

Several questions about the name parameter here:

- If we want to have multiple types of device to be attached, some 
convention is needed to avoid confusion during the match. But if we had 
such one (e.g prefix or suffix), it basically another bus?
- Who decides the name of this virtbus dev, is it under the control of 
userspace? If yes, a management interface is required.

Thanks


> +int virtbus_dev_register(struct virtbus_device *vdev)
> +void virtbus_dev_unregister(struct virtbus_device *vdev)
> +int virtbus_drv_register(struct virtbus_driver *vdrv, struct module *owner)
> +void virtbus_drv_unregister(struct virtbus_driver *vdrv)
Parav Pandit Nov. 19, 2019, 3:14 p.m. UTC | #16
> From: Jason Wang <jasowang@redhat.com>
> Sent: Tuesday, November 19, 2019 1:37 AM
> 
> On 2019/11/19 下午3:13, Parav Pandit wrote:
> >> From: Jason Wang <jasowang@redhat.com>
> >> Subject: Re: [net-next v2 1/1] virtual-bus: Implementation of Virtual
> >> Bus
> >>
> >>
> > [..]
> >
> >> Probably, for virtio mdev we need more than just matching: life cycle
> >> management, cooperation with VFIO and we also want to be prepared for
> >> the device slicing (like sub functions).
> > Well I am revising my patches to life cycle sub functions via devlink
> > interface for few reasons, as
> >
> > (a) avoid mdev bus abuse (still named as mdev in your v13 series,
> > though it is actually for vfio-mdev)
> 
> 
> Yes, but it could be simply renamed to "vfio-mdev".
> 
> 
> > (b) support iommu
> 
> 
> That is already supported by mdev.
> 
> 
> > (c) manage and have coupling with devlink eswitch framework, which is
> > very rich in several aspects
> 
> 
> Good point.
> 
> 
> > (d) get rid of limited sysfs interface for mdev creation, as netlink is
> standard and flexible to add params etc.
> 
> 
> Standard but net specific.
> 
> 
> >
> > If you want to get a glimpse of old RFC work of my revised series, please
> refer to [1].
> 
> 
> Will do.
> 
> 
> >
> > Jiri, Jason, me think that even virtio accelerated devices will need eswitch
> support. And hence, life cycling virtio accelerated devices via devlink makes a
> lot of sense to us.
> > This way user has single tool to choose what type of device he want to use
> (similar to ip link add link type).
> > So sub function flavour will be something like (virtio or sf).
> 
> 
> Networking is only one of the types that is supported in virtio-mdev.
> The codes are generic enough to support any kind of virtio device (block,
> scsi, crypto etc). Sysfs is less flexible but type independent.
> I agree that devlink is standard and feature richer but still network specific.
> It's probably hard to add devlink to other type of physical drivers. I'm
> thinking whether it's possible to combine syfs and devlink:
> e.g the mdev is available only after the sub fuction is created and fully
> configured by devlink.
> 

Nop. Devlink is NOT net specific. It works at the bus/device level.
Any block/scsi/crypto can register devlink instance and implement the necessary ops as long as device has bus.
Jason Gunthorpe Nov. 19, 2019, 4:46 p.m. UTC | #17
On Tue, Nov 19, 2019 at 03:37:03PM +0800, Jason Wang wrote:

> > Jiri, Jason, me think that even virtio accelerated devices will need eswitch support. And hence, life cycling virtio accelerated devices via devlink makes a lot of sense to us.
> > This way user has single tool to choose what type of device he want to use (similar to ip link add link type).
> > So sub function flavour will be something like (virtio or sf).
> 
> Networking is only one of the types that is supported in virtio-mdev. The
> codes are generic enough to support any kind of virtio device (block, scsi,
> crypto etc). Sysfs is less flexible but type independent. I agree that
> devlink is standard and feature richer but still network specific. It's
> probably hard to add devlink to other type of physical drivers. I'm thinking
> whether it's possible to combine syfs and devlink: e.g the mdev is available
> only after the sub fuction is created and fully configured by devlink.

The driver providing the virtio should really be in control of the
life cycle policy. For net related virtio that is clearly devlink.

Even for block we may find that network storage providers (ie some
HW accelerated virtio-blk-over-ethernet) will want to use devlink to
create a combination ethernet and accelerated virtio-block widget.

This is why the guid life cycle stuff should be in a library that can
be used by the drivers that need it (assuming all drivers don't just
use devlink as Parav proposes)

As always, this is all very hard to tell without actually seeing real
accelerated drivers implement this. 

Your patch series might be a bit premature in this regard.

Jason
Ertman, David M Nov. 19, 2019, 5:44 p.m. UTC | #18
> -----Original Message-----
> From: Parav Pandit <parav@mellanox.com>
> Sent: Monday, November 18, 2019 8:32 PM
> To: Ertman, David M <david.m.ertman@intel.com>; Kirsher, Jeffrey T
> <jeffrey.t.kirsher@intel.com>; davem@davemloft.net;
> gregkh@linuxfoundation.org
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil, Kiran
> <kiran.patil@intel.com>
> Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> Hi David,
> 
> > From: Ertman, David M <david.m.ertman@intel.com>
> > Sent: Monday, November 18, 2019 9:59 PM
> > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual
> > Bus
> >
> > > -----Original Message-----
> > > From: Parav Pandit <parav@mellanox.com>
> > > Sent: Friday, November 15, 2019 3:26 PM
> > > To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>;
> > > davem@davemloft.net; gregkh@linuxfoundation.org
> > > Cc: Ertman, David M <david.m.ertman@intel.com>;
> > > netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> > > nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil,
> Kiran
> > > <kiran.patil@intel.com>
> > > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of
> > > Virtual Bus
> > >
> > > Hi Jeff,
> > >
> > > > From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > > Sent: Friday, November 15, 2019 4:34 PM
> > > >
> > > > From: Dave Ertman <david.m.ertman@intel.com>
> > > >
> > > > This is the initial implementation of the Virtual Bus,
> > > > virtbus_device and virtbus_driver.  The virtual bus is a software
> > > > based bus intended to support lightweight devices and drivers and
> > > > provide matching between them and probing of the registered drivers.
> > > >
> > > > The primary purpose of the virual bus is to provide matching
> > > > services and to pass the data pointer contained in the
> > > > virtbus_device to the virtbus_driver during its probe call.  This
> > > > will allow two separate kernel objects to match up and start
> > communication.
> > > >
> > > It is fundamental to know that rdma device created by virtbus_driver
> > > will be anchored to which bus for an non abusive use.
> > > virtbus or parent pci bus?
> > > I asked this question in v1 version of this patch.
> >
> > The model we will be using is a PCI LAN driver that will allocate and
> > register a virtbus_device.  The virtbus_device will be anchored to the
> > virtual bus, not the PCI bus.
> o.k.
> 
> >
> > The virtbus does not have a requirement that elements registering with
> > it have any association with another outside bus or device.
> >
> This is what I want to capture in cover letter and documentation.
> 
> > RDMA is not attached to any bus when it's init is called.  The
> > virtbus_driver that it will create will be attached to the virtual bus.
> >
> > The RDMA driver will register a virtbus_driver object.  Its probe will
> > accept the data pointer from the virtbus_device that the PCI LAN driver
> created.
> >
> What I am saying that RDMA device created by the irdma driver or mlx5_ib
> driver should be anchored to the PCI device and not the virtbus device.
> 
> struct ib_device.dev.parent = &pci_dev->dev;
> 
> if this is not done, and if it is,
> 
> struct ib_device.dev.parent = &virtbus_dev->dev;
> 
> Than we are inviting huge burden as below.
> (a) user compatibility with several tools, orchestration etc is broken, because
> rdma cannot be reached back to its PCI device as before.
> This is some internal kernel change for 'better code handling', which is
> surfacing to rdma device name changing - systemd/udev broken, until all
> distros upgrade and implement this virtbus naming scheme.
> Even with that orchestration tools shipped outside of distro are broken.
> 
> (b) virtbus must extend iommu support in intel, arm, amd, ppc systems
> otherwise straight away rdma is broken in those environments with this
> 'internal code restructure'.
> These iommu doesn't support non PCI buses.
> 
> (c) anchoring on virtbus brings challenge to get unique id for persistent
> naming when irdma/mlx5_ib device is not created by 'user'.
> 
> This improvement by bus matching service != 'ethX to ens2f0 improvement
> of netdevs happened few years back'.
> Hence, my input is,
> 
> irdma_virtubus_probe() {
> 	struct ib_device.dev.parent = &pci_dev->dev;
> 	ib_register_driver();
> }
> 

Sounds reasonable.  In our actual consumer, we will set the parent device to the PCI device,
which I believe we are doing in the irdma driver already.
But, this is a consideration outside of the virtbus scope, since its purpose is only matching
up two kernel objects.

> > >
> > > Also since it says - 'to support lightweight devices', documenting
> > > that information is critical to avoid ambiguity.
> > >
> > > Since for a while I am working on the subbus/subdev_bus/xbus/mdev
> > > [1] whatever we want to call it, it overlaps with your comment about
> > > 'to support lightweight devices'.
> > > Hence let's make things crystal clear weather the purpose is 'only
> > > matching service' or also 'lightweight devices'.
> > > If this is only matching service, lets please remove lightweight devices
> part..
> > >
> >
> > This is only for matching services for kernel objects, I will work on
> > phrasing this clearer.
> >
> 
> Ok. Great. Due to above two fundamental points, we clearly write up the
> matching service.
> Not sure naming virtbus to 'virbus_service' in bus_type is an extreme way to
> make this clear.
> 
> > > You additionally need modpost support for id table integration to
> > > modifo, modprobe and other tools.
> > > A small patch similar to this one [2] is needed.
> > > Please include in the series.
> > >
> >
> > modpost support added - thanks for that catch!
> >
> > > [..]
> > >
> > > > +static const
> > > > +struct virtbus_dev_id *virtbus_match_id(const struct virtbus_dev_id
> *id,
> > > > +					struct virtbus_device *vdev) {
> > > > +	while (id->name[0]) {
> > > > +		if (!strcmp(vdev->name, id->name)) {
> > > > +			vdev->dev_id = id;
> > > Matching function shouldn't be modifying the id.
> >
> > This is not the main id of the virtbus_device.  This is copying the
> > element in the driver id_table that was matched into the
> > virtbus_device struct, so that when the virtbus_device struct is
> > passed to the virtbus_driver's probe, it can access the correct driver_data.
> >
> > I chose a poor name for this field, I will change the name of this
> > part of the struct to matched_element and include a comment on what is
> going on here.
> >
> When virtbus_device is created, its class_id or device_id identifying the
> device is decided.
> So it should be set in the create() routine.

That is where it is set.  What is happening in the match routine is the same thing that the
platform bus is doing in its id_table match, copying a piece of data specific to this match
into the device so that the driver can access it in the probe call.

Again, I *really* appreciate the feedback.  It has been very helpful!
Ertman, David M Nov. 19, 2019, 5:46 p.m. UTC | #19
> -----Original Message-----
> From: Parav Pandit <parav@mellanox.com>
> Sent: Monday, November 18, 2019 8:40 PM
> To: Parav Pandit <parav@mellanox.com>; Ertman, David M
> <david.m.ertman@intel.com>; Kirsher, Jeffrey T
> <jeffrey.t.kirsher@intel.com>; davem@davemloft.net;
> gregkh@linuxfoundation.org
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil, Kiran
> <kiran.patil@intel.com>
> Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> Hi David,
> 
> > Sent: Monday, November 18, 2019 10:32 PM
> > To: Ertman, David M <david.m.ertman@intel.com>; Kirsher, Jeffrey T
> > <jeffrey.t.kirsher@intel.com>; davem@davemloft.net;
> > gregkh@linuxfoundation.org
> > Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> > nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil, Kiran
> > <kiran.patil@intel.com>
> > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of Virtual
> > Bus
> >
> > Hi David,
> >
> > > From: Ertman, David M <david.m.ertman@intel.com>
> > > Sent: Monday, November 18, 2019 9:59 PM
> > > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of
> > > Virtual Bus
> > >
> > > > -----Original Message-----
> > > > From: Parav Pandit <parav@mellanox.com>
> > > > Sent: Friday, November 15, 2019 3:26 PM
> > > > To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>;
> > > > davem@davemloft.net; gregkh@linuxfoundation.org
> > > > Cc: Ertman, David M <david.m.ertman@intel.com>;
> > > > netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> > > > nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca; Patil,
> > > > Kiran <kiran.patil@intel.com>
> > > > Subject: RE: [net-next v2 1/1] virtual-bus: Implementation of
> > > > Virtual Bus
> > > >
> > > > Hi Jeff,
> > > >
> > > > > From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> > > > > Sent: Friday, November 15, 2019 4:34 PM
> > > > >
> > > > > From: Dave Ertman <david.m.ertman@intel.com>
> > > > >
> > > > > This is the initial implementation of the Virtual Bus,
> > > > > virtbus_device and virtbus_driver.  The virtual bus is a
> > > > > software based bus intended to support lightweight devices and
> > > > > drivers and provide matching between them and probing of the
> registered drivers.
> > > > >
> > > > > The primary purpose of the virual bus is to provide matching
> > > > > services and to pass the data pointer contained in the
> > > > > virtbus_device to the virtbus_driver during its probe call.
> > > > > This will allow two separate kernel objects to match up and
> > > > > start
> > > communication.
> > > > >
> > > > It is fundamental to know that rdma device created by
> > > > virtbus_driver will be anchored to which bus for an non abusive use.
> > > > virtbus or parent pci bus?
> > > > I asked this question in v1 version of this patch.
> > >
> > > The model we will be using is a PCI LAN driver that will allocate
> > > and register a virtbus_device.  The virtbus_device will be anchored
> > > to the virtual bus, not the PCI bus.
> > o.k.
> >
> > >
> > > The virtbus does not have a requirement that elements registering
> > > with it have any association with another outside bus or device.
> > >
> > This is what I want to capture in cover letter and documentation.
> >
> > > RDMA is not attached to any bus when it's init is called.  The
> > > virtbus_driver that it will create will be attached to the virtual bus.
> > >
> > > The RDMA driver will register a virtbus_driver object.  Its probe
> > > will accept the data pointer from the virtbus_device that the PCI
> > > LAN driver
> > created.
> > >
> > What I am saying that RDMA device created by the irdma driver or
> > mlx5_ib driver should be anchored to the PCI device and not the virtbus
> device.
> >
> > struct ib_device.dev.parent = &pci_dev->dev;
> >
> > if this is not done, and if it is,
> >
> > struct ib_device.dev.parent = &virtbus_dev->dev;
> >
> > Than we are inviting huge burden as below.
> > (a) user compatibility with several tools, orchestration etc is
> > broken, because rdma cannot be reached back to its PCI device as before.
> > This is some internal kernel change for 'better code handling', which
> > is surfacing to rdma device name changing - systemd/udev broken, until
> > all distros upgrade and implement this virtbus naming scheme.
> > Even with that orchestration tools shipped outside of distro are broken.
> >
> > (b) virtbus must extend iommu support in intel, arm, amd, ppc systems
> > otherwise straight away rdma is broken in those environments with this
> > 'internal code restructure'.
> > These iommu doesn't support non PCI buses.
> >
> > (c) anchoring on virtbus brings challenge to get unique id for
> > persistent naming when irdma/mlx5_ib device is not created by 'user'.
> >
> > This improvement by bus matching service != 'ethX to ens2f0
> > improvement of netdevs happened few years back'.
> > Hence, my input is,
> >
> > irdma_virtubus_probe() {
> > 	struct ib_device.dev.parent = &pci_dev->dev;
> > 	ib_register_driver();
> > }
> >
> With this, I forgot to mention that, virtbus doesn't need PM callbacks,
> because PM core layer works on suspend/resume devices in reverse order
> of their creation.
> Given that protocol devices (like rdma and netdev) devices shouldn't be
> anchored on virtbus, it doesn't need PM callbacks.
> Please remove them.
> 
> suspend() will be called first on rdma device (because it was created last).

This is only true in the rdma/PLCI LAN situation.  virtbus can be used on two
kernel objects that have no connection to another bus or device, but only use
the virtbus for connecting up.  In that case, those entities will need the PM
callbacks.

-Dave E
Ertman, David M Nov. 19, 2019, 5:50 p.m. UTC | #20
> -----Original Message-----
> From: Jason Wang <jasowang@redhat.com>
> Sent: Tuesday, November 19, 2019 12:05 AM
> To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; Ertman, David M
> <david.m.ertman@intel.com>
> Cc: Greg KH <gregkh@linuxfoundation.org>; davem@davemloft.net;
> netdev@vger.kernel.org; linux-rdma@vger.kernel.org;
> nhorman@redhat.com; sassmann@redhat.com; jgg@ziepe.ca;
> parav@mellanox.com; Patil, Kiran <kiran.patil@intel.com>
> Subject: Re: [net-next v2 1/1] virtual-bus: Implementation of Virtual Bus
> 
> 
> On 2019/11/18 下午3:48, Greg KH wrote:
> > +Virtbus drivers
> > +~~~~~~~~~~~~~~~
> > +Virtbus drivers register with the virtual bus to be matched with
> > +virtbus devices.  They expect to be registered with a probe and
> > +remove callback, and also support shutdown, suspend, and resume
> > +callbacks.  They otherwise follow the standard driver behavior of
> > +having discovery and enumeration handled in the bus infrastructure.
> > +
> > +Virtbus drivers register themselves with the API entry point
> > +virtbus_drv_reg and unregister with virtbus_drv_unreg.
> > +
> > +Device Enumeration
> > +~~~~~~~~~~~~~~~~~~
> > +Enumeration is handled automatically by the bus infrastructure via
> > +the ida_simple methods.
> > +
> > +Device naming and driver binding
> > +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > +The virtbus_device.dev.name is the canonical name for the device. It
> > +is built from two other parts:
> > +
> > +        - virtbus_device.name (also used for matching).
> > +        - virtbus_device.id (generated automatically from ida_simple
> > + calls)
> > +
> > +This allows for multiple virtbus_devices with the same name, which
> > +will all be matched to the same virtbus_driver. Driver binding is
> > +performed by the driver core, invoking driver probe() after finding a
> match between device and driver.
> > +
> > +Virtual Bus API entry points
> > +~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > +struct virtbus_device *virtbus_dev_alloc(const char *name, void
> > +*data)
> 
> 
> Hi:
> 
> Several questions about the name parameter here:
> 
> - If we want to have multiple types of device to be attached, some
> convention is needed to avoid confusion during the match. But if we had
> such one (e.g prefix or suffix), it basically another bus?
> - Who decides the name of this virtbus dev, is it under the control of
> userspace? If yes, a management interface is required.
> 
> Thanks
> 
This function has been removed from the API.  New patch set inbound
implementing changes that Parav suggested.

> 
> > +int virtbus_dev_register(struct virtbus_device *vdev) void
> > +virtbus_dev_unregister(struct virtbus_device *vdev) int
> > +virtbus_drv_register(struct virtbus_driver *vdrv, struct module
> > +*owner) void virtbus_drv_unregister(struct virtbus_driver *vdrv)
Jason Gunthorpe Nov. 19, 2019, 6:39 p.m. UTC | #21
On Tue, Nov 19, 2019 at 05:46:50PM +0000, Ertman, David M wrote:

> > With this, I forgot to mention that, virtbus doesn't need PM callbacks,
> > because PM core layer works on suspend/resume devices in reverse order
> > of their creation.
> > Given that protocol devices (like rdma and netdev) devices shouldn't be
> > anchored on virtbus, it doesn't need PM callbacks.
> > Please remove them.
> > 
> > suspend() will be called first on rdma device (because it was created last).
> 
> This is only true in the rdma/PLCI LAN situation.  virtbus can be used on two
> kernel objects that have no connection to another bus or device, but only use
> the virtbus for connecting up.  In that case, those entities will need the PM
> callbacks.

PM order is something to be careful of, the callbacks for a virtbus
driver have to happen before the PCI segment that driver is actually
on gets shutdown.

Is it possible to get PM callbacks directly from the PCI device
without binding a driver to it? I think not..

So, either a driver-specific path has to relay PM callbacks from the
owning PCI device to the attaching sub driver

Or, we have to rely on the driver core to manage PM for us and
instantiate a virtbus segment under the PCI device such that normal PM
callback ordering works properly. (this would be my preference)

Ie we shut down the virtbus under 01:00.0 then we shut down 01:00 ,
then PCI bus 01:00, then the root complex, etc.

Look to I2C for an example how this looks in sysfs

Jason
Michael S. Tsirkin Nov. 19, 2019, 6:58 p.m. UTC | #22
On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> As always, this is all very hard to tell without actually seeing real
> accelerated drivers implement this. 
> 
> Your patch series might be a bit premature in this regard.

Actually drivers implementing this have been posted, haven't they?
See e.g. https://lwn.net/Articles/804379/
Jason Gunthorpe Nov. 19, 2019, 7:03 p.m. UTC | #23
On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > As always, this is all very hard to tell without actually seeing real
> > accelerated drivers implement this. 
> > 
> > Your patch series might be a bit premature in this regard.
> 
> Actually drivers implementing this have been posted, haven't they?
> See e.g. https://lwn.net/Articles/804379/

Has the consumer half been posted?

Jason
Jason Gunthorpe Nov. 19, 2019, 7:15 p.m. UTC | #24
On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > As always, this is all very hard to tell without actually seeing real
> > accelerated drivers implement this. 
> > 
> > Your patch series might be a bit premature in this regard.
> 
> Actually drivers implementing this have been posted, haven't they?
> See e.g. https://lwn.net/Articles/804379/

Is that a real driver? It looks like another example quality
thing. 

For instance why do we need any of this if it has '#define
IFCVF_MDEV_LIMIT 1' ?

Surely for this HW just use vfio over the entire PCI function and be
done with it?

Jason
Michael S. Tsirkin Nov. 19, 2019, 9:33 p.m. UTC | #25
On Tue, Nov 19, 2019 at 03:15:47PM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > As always, this is all very hard to tell without actually seeing real
> > > accelerated drivers implement this. 
> > > 
> > > Your patch series might be a bit premature in this regard.
> > 
> > Actually drivers implementing this have been posted, haven't they?
> > See e.g. https://lwn.net/Articles/804379/
> 
> Is that a real driver? It looks like another example quality
> thing. 
> 
> For instance why do we need any of this if it has '#define
> IFCVF_MDEV_LIMIT 1' ?
> 
> Surely for this HW just use vfio over the entire PCI function and be
> done with it?
> 
> Jason

What this does is allow using it with unmodified virtio drivers within guests.
You won't get this with passthrough as it only implements parts of
virtio in hardware.
Michael S. Tsirkin Nov. 19, 2019, 9:34 p.m. UTC | #26
On Tue, Nov 19, 2019 at 03:03:40PM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > As always, this is all very hard to tell without actually seeing real
> > > accelerated drivers implement this. 
> > > 
> > > Your patch series might be a bit premature in this regard.
> > 
> > Actually drivers implementing this have been posted, haven't they?
> > See e.g. https://lwn.net/Articles/804379/
> 
> Has the consumer half been posted?
> 
> Jason 

Yes: https://lkml.org/lkml/2019/11/18/1068
Jason Gunthorpe Nov. 19, 2019, 11:10 p.m. UTC | #27
On Tue, Nov 19, 2019 at 04:33:40PM -0500, Michael S. Tsirkin wrote:
> On Tue, Nov 19, 2019 at 03:15:47PM -0400, Jason Gunthorpe wrote:
> > On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > > As always, this is all very hard to tell without actually seeing real
> > > > accelerated drivers implement this. 
> > > > 
> > > > Your patch series might be a bit premature in this regard.
> > > 
> > > Actually drivers implementing this have been posted, haven't they?
> > > See e.g. https://lwn.net/Articles/804379/
> > 
> > Is that a real driver? It looks like another example quality
> > thing. 
> > 
> > For instance why do we need any of this if it has '#define
> > IFCVF_MDEV_LIMIT 1' ?
> > 
> > Surely for this HW just use vfio over the entire PCI function and be
> > done with it?
> 
> What this does is allow using it with unmodified virtio drivers
> within guests.  You won't get this with passthrough as it only
> implements parts of virtio in hardware.

I don't mean use vfio to perform passthrough, I mean to use vfio to
implement the software parts in userspace while vfio to talk to the
hardware.

  kernel -> vfio -> user space virtio driver -> qemu -> guest

Generally we don't want to see things in the kernel that can be done
in userspace, and to me, at least for this driver, this looks
completely solvable in userspace.

Jason
Michael S. Tsirkin Nov. 20, 2019, 12:16 a.m. UTC | #28
On Tue, Nov 19, 2019 at 07:10:23PM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2019 at 04:33:40PM -0500, Michael S. Tsirkin wrote:
> > On Tue, Nov 19, 2019 at 03:15:47PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > > > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > > > As always, this is all very hard to tell without actually seeing real
> > > > > accelerated drivers implement this. 
> > > > > 
> > > > > Your patch series might be a bit premature in this regard.
> > > > 
> > > > Actually drivers implementing this have been posted, haven't they?
> > > > See e.g. https://lwn.net/Articles/804379/
> > > 
> > > Is that a real driver? It looks like another example quality
> > > thing. 
> > > 
> > > For instance why do we need any of this if it has '#define
> > > IFCVF_MDEV_LIMIT 1' ?
> > > 
> > > Surely for this HW just use vfio over the entire PCI function and be
> > > done with it?
> > 
> > What this does is allow using it with unmodified virtio drivers
> > within guests.  You won't get this with passthrough as it only
> > implements parts of virtio in hardware.
> 
> I don't mean use vfio to perform passthrough, I mean to use vfio to
> implement the software parts in userspace while vfio to talk to the
> hardware.

You repeated vfio twice here, hard to decode what you meant actually.

>   kernel -> vfio -> user space virtio driver -> qemu -> guest

Exactly what has been implemented for control path.
The interface between vfio and userspace is
based on virtio which is IMHO much better than
a vendor specific one. userspace stays vendor agnostic.

> Generally we don't want to see things in the kernel that can be done
> in userspace, and to me, at least for this driver, this looks
> completely solvable in userspace.

I don't think that extends as far as actively encouraging userspace
drivers poking at hardware in a vendor specific way.  That has lots of
security and portability implications and isn't appropriate for
everyone. It is kernel's job to abstract hardware away and present
a unified interface as far as possible.
Jason Gunthorpe Nov. 20, 2019, 1:46 a.m. UTC | #29
On Tue, Nov 19, 2019 at 07:16:21PM -0500, Michael S. Tsirkin wrote:
> On Tue, Nov 19, 2019 at 07:10:23PM -0400, Jason Gunthorpe wrote:
> > On Tue, Nov 19, 2019 at 04:33:40PM -0500, Michael S. Tsirkin wrote:
> > > On Tue, Nov 19, 2019 at 03:15:47PM -0400, Jason Gunthorpe wrote:
> > > > On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > > > > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > > > > As always, this is all very hard to tell without actually seeing real
> > > > > > accelerated drivers implement this. 
> > > > > > 
> > > > > > Your patch series might be a bit premature in this regard.
> > > > > 
> > > > > Actually drivers implementing this have been posted, haven't they?
> > > > > See e.g. https://lwn.net/Articles/804379/
> > > > 
> > > > Is that a real driver? It looks like another example quality
> > > > thing. 
> > > > 
> > > > For instance why do we need any of this if it has '#define
> > > > IFCVF_MDEV_LIMIT 1' ?
> > > > 
> > > > Surely for this HW just use vfio over the entire PCI function and be
> > > > done with it?
> > > 
> > > What this does is allow using it with unmodified virtio drivers
> > > within guests.  You won't get this with passthrough as it only
> > > implements parts of virtio in hardware.
> > 
> > I don't mean use vfio to perform passthrough, I mean to use vfio to
> > implement the software parts in userspace while vfio to talk to the
> > hardware.
> 
> You repeated vfio twice here, hard to decode what you meant actually.

'while using vifo to talk to the hardware'

> >   kernel -> vfio -> user space virtio driver -> qemu -> guest
>
> Exactly what has been implemented for control path.

I do not mean the modified mediated vfio this series proposes, I mean
vfio-pci, on a full PCI VF, exactly like we have today.

> The interface between vfio and userspace is
> based on virtio which is IMHO much better than
> a vendor specific one. userspace stays vendor agnostic.

Why is that even a good thing? It is much easier to provide drivers
via qemu/etc in user space then it is to make kernel upgrades. We've
learned this lesson many times.

This is why we have had the philosophy that if it doesn't need to be
in the kernel it should be in userspace.

> > Generally we don't want to see things in the kernel that can be done
> > in userspace, and to me, at least for this driver, this looks
> > completely solvable in userspace.
> 
> I don't think that extends as far as actively encouraging userspace
> drivers poking at hardware in a vendor specific way.  

Yes, it does, if you can implement your user space requirements using
vfio then why do you need a kernel driver?

The kernel needs to be involved when there are things only the kernel
can do. If IFC has such things they should be spelled out to justify
using a mediated device.

> That has lots of security and portability implications and isn't
> appropriate for everyone. 

This is already using vfio. It doesn't make sense to claim that using
vfio properly is somehow less secure or less portable.

What I find particularly ugly is that this 'IFC VF NIC' driver
pretends to be a mediated vfio device, but actually bypasses all the
mediated device ops for managing dma security and just directly plugs
the system IOMMU for the underlying PCI device into vfio.

I suppose this little hack is what is motivating this abuse of vfio in
the first place?

Frankly I think a kernel driver touching a PCI function for which vfio
is now controlling the system iommu for is a violation of the security
model, and I'm very surprised AlexW didn't NAK this idea.

Perhaps it is because none of the patches actually describe how the
DMA security model for this so-called mediated device works? :(

Or perhaps it is because this submission is split up so much it is
hard to see what is being proposed? (I note this IFC driver is the
first user of the mdev_set_iommu_device() function)

> It is kernel's job to abstract hardware away and present a unified
> interface as far as possible.

Sure, you could create a virtio accelerator driver framework in our
new drivers/accel I hear was started. That could make some sense, if
we had HW that actually required/benefited from kernel involvement.

Jason
Jason Wang Nov. 20, 2019, 3:15 a.m. UTC | #30
----- Original Message -----
> 
> 
> > From: Jason Wang <jasowang@redhat.com>
> > Sent: Tuesday, November 19, 2019 1:37 AM
> > 
> > On 2019/11/19 下午3:13, Parav Pandit wrote:
> > >> From: Jason Wang <jasowang@redhat.com>
> > >> Subject: Re: [net-next v2 1/1] virtual-bus: Implementation of Virtual
> > >> Bus
> > >>
> > >>
> > > [..]
> > >
> > >> Probably, for virtio mdev we need more than just matching: life cycle
> > >> management, cooperation with VFIO and we also want to be prepared for
> > >> the device slicing (like sub functions).
> > > Well I am revising my patches to life cycle sub functions via devlink
> > > interface for few reasons, as
> > >
> > > (a) avoid mdev bus abuse (still named as mdev in your v13 series,
> > > though it is actually for vfio-mdev)
> > 
> > 
> > Yes, but it could be simply renamed to "vfio-mdev".
> > 
> > 
> > > (b) support iommu
> > 
> > 
> > That is already supported by mdev.
> > 
> > 
> > > (c) manage and have coupling with devlink eswitch framework, which is
> > > very rich in several aspects
> > 
> > 
> > Good point.
> > 
> > 
> > > (d) get rid of limited sysfs interface for mdev creation, as netlink is
> > standard and flexible to add params etc.
> > 
> > 
> > Standard but net specific.
> > 
> > 
> > >
> > > If you want to get a glimpse of old RFC work of my revised series, please
> > refer to [1].
> > 
> > 
> > Will do.
> > 
> > 
> > >
> > > Jiri, Jason, me think that even virtio accelerated devices will need
> > > eswitch
> > support. And hence, life cycling virtio accelerated devices via devlink
> > makes a
> > lot of sense to us.
> > > This way user has single tool to choose what type of device he want to
> > > use
> > (similar to ip link add link type).
> > > So sub function flavour will be something like (virtio or sf).
> > 
> > 
> > Networking is only one of the types that is supported in virtio-mdev.
> > The codes are generic enough to support any kind of virtio device (block,
> > scsi, crypto etc). Sysfs is less flexible but type independent.
> > I agree that devlink is standard and feature richer but still network
> > specific.
> > It's probably hard to add devlink to other type of physical drivers. I'm
> > thinking whether it's possible to combine syfs and devlink:
> > e.g the mdev is available only after the sub fuction is created and fully
> > configured by devlink.
> > 
> 
> Nop. Devlink is NOT net specific. It works at the bus/device level.
> Any block/scsi/crypto can register devlink instance and implement the
> necessary ops as long as device has bus.
> 

Well, uapi/linux/devlink.h told me:

"
 * include/uapi/linux/devlink.h - Network physical device Netlink interface
"

And the userspace tool was packaged into iproute2, the command was
named as "TC", "PORT", "ESWITCH". All of those were strong hints that it
was network specific. Even for networking, only few vendors choose to
implement this.

So technically it could be extended but how hard it can be achieved in
reality?

I still don't see why devlink is conflicted with GUID/sysfs, you can
hook sysfs events to devlink or do post or pre configuration through
devlink. This is much more easier than forcing all vendors to use
devlink.

Thanks
Jason Wang Nov. 20, 2019, 3:24 a.m. UTC | #31
----- Original Message -----
> On Tue, Nov 19, 2019 at 03:37:03PM +0800, Jason Wang wrote:
> 
> > > Jiri, Jason, me think that even virtio accelerated devices will need
> > > eswitch support. And hence, life cycling virtio accelerated devices via
> > > devlink makes a lot of sense to us.
> > > This way user has single tool to choose what type of device he want to
> > > use (similar to ip link add link type).
> > > So sub function flavour will be something like (virtio or sf).
> > 
> > Networking is only one of the types that is supported in virtio-mdev. The
> > codes are generic enough to support any kind of virtio device (block, scsi,
> > crypto etc). Sysfs is less flexible but type independent. I agree that
> > devlink is standard and feature richer but still network specific. It's
> > probably hard to add devlink to other type of physical drivers. I'm
> > thinking
> > whether it's possible to combine syfs and devlink: e.g the mdev is
> > available
> > only after the sub fuction is created and fully configured by devlink.
> 
> The driver providing the virtio should really be in control of the
> life cycle policy. For net related virtio that is clearly devlink.

As replied in another thread, there were already existed devices
(Intel IFC VF) that doesn't use devlink.

> 
> Even for block we may find that network storage providers (ie some
> HW accelerated virtio-blk-over-ethernet) will want to use devlink to
> create a combination ethernet and accelerated virtio-block widget.
> 
>

Note, there's already commercial virtio-blk done at PF level provided
by Ali ECS instance. So it's looks pretty clear to me it's almost
impossible to have every vendors to use devlink. Tie virtio soluton to
devlink seems a burden and actually devlink doesn't conflict with the
simple sysfs interface.

Thanks
Jason Wang Nov. 20, 2019, 3:29 a.m. UTC | #32
----- Original Message -----
> On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > As always, this is all very hard to tell without actually seeing real
> > > accelerated drivers implement this.
> > > 
> > > Your patch series might be a bit premature in this regard.
> > 
> > Actually drivers implementing this have been posted, haven't they?
> > See e.g. https://lwn.net/Articles/804379/
> 
> Is that a real driver? It looks like another example quality
> thing.

I think the answer is obvious:

+static struct pci_driver ifcvf_driver = {
+	.name     = IFCVF_DRIVER_NAME,
+	.id_table = ifcvf_pci_ids,
+	.probe    = ifcvf_probe,
+	.remove   = ifcvf_remove,
+};

> 
> For instance why do we need any of this if it has '#define
> IFCVF_MDEV_LIMIT 1' ?

This is just because virtio was done at VF level.

Thanks

> 
> Surely for this HW just use vfio over the entire PCI function and be
> done with it?
> 
> Jason
> 
>
Parav Pandit Nov. 20, 2019, 3:38 a.m. UTC | #33
> From: Jason Wang <jasowang@redhat.com>
> Sent: Tuesday, November 19, 2019 9:15 PM
> 
> ----- Original Message -----
> >
> >
> > > From: Jason Wang <jasowang@redhat.com>
> > > Sent: Tuesday, November 19, 2019 1:37 AM
> > >
> >
> > Nop. Devlink is NOT net specific. It works at the bus/device level.
> > Any block/scsi/crypto can register devlink instance and implement the
> > necessary ops as long as device has bus.
> >
> 
> Well, uapi/linux/devlink.h told me:
> 
> "
>  * include/uapi/linux/devlink.h - Network physical device Netlink interface "
> 
> And the userspace tool was packaged into iproute2, the command was named
> as "TC", "PORT", "ESWITCH". All of those were strong hints that it was network
> specific. Even for networking, only few vendors choose to implement this.
> 
It is under iproute2 tool but it is not limited to networking.
Though today most users are networking drivers.

I do not know how ovs offloads are done without devlink by other vendors doing in-kernel drivers.

> So technically it could be extended but how hard it can be achieved in reality?
> 
What are the missing things?
I am extending it for subfunctions lifecycle. I see virtio as yet another flavour/type of subfunction.

> I still don't see why devlink is conflicted with GUID/sysfs, you can hook sysfs
It is not conflicting. If you look at what all devlink infrastructure provides, you will end up replicating all of it via sysfs..
It got syscaller support too, which is great for validation.
I have posted subfunction series with mdev and used devlink for all rest of the esw and mgmt. interface to utilize it.

sriov via sysfs and devlink sriov/esw handling has some severe locking issues, mainly because they are from two different interfaces.

> events to devlink or do post or pre configuration through devlink. This is much
> more easier than forcing all vendors to use devlink.
>
It is not about forcing. It is about leveraging existing kernel framework available without reinventing the wheel.
I am 100% sure, implementing health, dumps, traces, reporters, syscaller, monitors, interrupt configs, extending params via sysfs will be no-go.
sysfs is not meant for such things anymore. Any modern device management will need all of it.
Jason Wang Nov. 20, 2019, 3:59 a.m. UTC | #34
----- Original Message -----
> On Tue, Nov 19, 2019 at 07:16:21PM -0500, Michael S. Tsirkin wrote:
> > On Tue, Nov 19, 2019 at 07:10:23PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Nov 19, 2019 at 04:33:40PM -0500, Michael S. Tsirkin wrote:
> > > > On Tue, Nov 19, 2019 at 03:15:47PM -0400, Jason Gunthorpe wrote:
> > > > > On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > > > > > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > > > > > As always, this is all very hard to tell without actually seeing
> > > > > > > real
> > > > > > > accelerated drivers implement this.
> > > > > > > 
> > > > > > > Your patch series might be a bit premature in this regard.
> > > > > > 
> > > > > > Actually drivers implementing this have been posted, haven't they?
> > > > > > See e.g. https://lwn.net/Articles/804379/
> > > > > 
> > > > > Is that a real driver? It looks like another example quality
> > > > > thing.
> > > > > 
> > > > > For instance why do we need any of this if it has '#define
> > > > > IFCVF_MDEV_LIMIT 1' ?
> > > > > 
> > > > > Surely for this HW just use vfio over the entire PCI function and be
> > > > > done with it?
> > > > 
> > > > What this does is allow using it with unmodified virtio drivers
> > > > within guests.  You won't get this with passthrough as it only
> > > > implements parts of virtio in hardware.
> > > 
> > > I don't mean use vfio to perform passthrough, I mean to use vfio to
> > > implement the software parts in userspace while vfio to talk to the
> > > hardware.
> > 
> > You repeated vfio twice here, hard to decode what you meant actually.
> 
> 'while using vifo to talk to the hardware'
> 
> > >   kernel -> vfio -> user space virtio driver -> qemu -> guest
> >
> > Exactly what has been implemented for control path.
> 
> I do not mean the modified mediated vfio this series proposes, I mean
> vfio-pci, on a full PCI VF, exactly like we have today.
> 
> > The interface between vfio and userspace is
> > based on virtio which is IMHO much better than
> > a vendor specific one. userspace stays vendor agnostic.
> 
> Why is that even a good thing? It is much easier to provide drivers
> via qemu/etc in user space then it is to make kernel upgrades. We've
> learned this lesson many times.

For upgrades, since we had a unified interface. It could be done
through:

1) switch the datapath from hardware to software (e.g vhost)
2) unload and load the driver
3) switch teh datapath back

Having drivers in user space have other issues, there're a lot of
customers want to stick to kernel drivers.

> 
> This is why we have had the philosophy that if it doesn't need to be
> in the kernel it should be in userspace.

Let me clarify again. For this framework, it aims to support both
kernel driver and userspce driver. For this series, it only contains
the kernel driver part. What it did is to allow kernel virtio driver
to control vDPA devices. Then we can provide a unified interface for
all of the VM, containers and bare metal. For this use case, I don't
see a way to leave the driver in userspace other than injecting
traffic back through vhost/TAP which is ugly.

> 
> > > Generally we don't want to see things in the kernel that can be done
> > > in userspace, and to me, at least for this driver, this looks
> > > completely solvable in userspace.
> > 
> > I don't think that extends as far as actively encouraging userspace
> > drivers poking at hardware in a vendor specific way.
> 
> Yes, it does, if you can implement your user space requirements using
> vfio then why do you need a kernel driver?
>

VFIO is only for userspace driver, we want kernel virtio driver run as
well. That's why a unified API is designed for both.

> The kernel needs to be involved when there are things only the kernel
> can do. If IFC has such things they should be spelled out to justify
> using a mediated device.

Why? It allows a full functional virtio driver run on the host.


> 
> > That has lots of security and portability implications and isn't
> > appropriate for everyone.
> 
> This is already using vfio. It doesn't make sense to claim that using
> vfio properly is somehow less secure or less portable.
> 
> What I find particularly ugly is that this 'IFC VF NIC' driver
> pretends to be a mediated vfio device, but actually bypasses all the
> mediated device ops for managing dma security and just directly plugs
> the system IOMMU for the underlying PCI device into vfio.

Well, VFIO have multiple types of API. The design is to stick the VFIO
DMA model like container work for making DMA API work for userspace
driver. We can invent something our own but it must duplicate with the
exist API and it will be extra overhead when VFIO DMA API starts to
support stuffs like nesting or PASID.

So in conclusion for vhost-mdev:

- DMA is still done through VFIO manner e.g container fd etc.
- device API is totally virtio specific.

Compared with vfio-pci device, the only difference is the device API,
we don't use device fd but vhost-net fd, but of course we can switch
to use device fd. I'm sure we can settle this part down by having a
way that is acceptable by both sides.

> 
> I suppose this little hack is what is motivating this abuse of vfio in
> the first place?
> 
> Frankly I think a kernel driver touching a PCI function for which vfio
> is now controlling the system iommu for is a violation of the security
> model, and I'm very surprised AlexW didn't NAK this idea.
>
> Perhaps it is because none of the patches actually describe how the
> DMA security model for this so-called mediated device works? :(
>
> Or perhaps it is because this submission is split up so much it is
> hard to see what is being proposed? (I note this IFC driver is the
> first user of the mdev_set_iommu_device() function)
>

Are you objecting the mdev_set_iommu_deivce() stuffs here?

> > It is kernel's job to abstract hardware away and present a unified
> > interface as far as possible.
> 
> Sure, you could create a virtio accelerator driver framework in our
> new drivers/accel I hear was started. That could make some sense, if
> we had HW that actually required/benefited from kernel involvement.

The framework is not designed specifically for your card. It tries to be
generic to support every types of virtio hardware devices, it's not
tied to any bus (e.g PCI) and any vendor. So it's not only a question
of how to slice a PCIE ethernet device.

Thanks

> 
> Jason
> 
>
Jason Wang Nov. 20, 2019, 4:07 a.m. UTC | #35
On 2019/11/20 上午11:38, Parav Pandit wrote:
>
>> From: Jason Wang <jasowang@redhat.com>
>> Sent: Tuesday, November 19, 2019 9:15 PM
>>
>> ----- Original Message -----
>>>
>>>> From: Jason Wang <jasowang@redhat.com>
>>>> Sent: Tuesday, November 19, 2019 1:37 AM
>>>>
>>> Nop. Devlink is NOT net specific. It works at the bus/device level.
>>> Any block/scsi/crypto can register devlink instance and implement the
>>> necessary ops as long as device has bus.
>>>
>> Well, uapi/linux/devlink.h told me:
>>
>> "
>>   * include/uapi/linux/devlink.h - Network physical device Netlink interface "
>>
>> And the userspace tool was packaged into iproute2, the command was named
>> as "TC", "PORT", "ESWITCH". All of those were strong hints that it was network
>> specific. Even for networking, only few vendors choose to implement this.
>>
> It is under iproute2 tool but it is not limited to networking.
> Though today most users are networking drivers.
>
> I do not know how ovs offloads are done without devlink by other vendors doing in-kernel drivers.
>
>> So technically it could be extended but how hard it can be achieved in reality?
>>
> What are the missing things?
> I am extending it for subfunctions lifecycle. I see virtio as yet another flavour/type of subfunction.


Just to make sure we're on the same page. Sub function is only one of 
the possible cases for virtio. As I replied in another thread, we had 
already had NIC that does virtio at PF or VF level. For reality, I mean 
the effort spent on convincing all vendors to use devlink.


>
>> I still don't see why devlink is conflicted with GUID/sysfs, you can hook sysfs
> It is not conflicting. If you look at what all devlink infrastructure provides, you will end up replicating all of it via sysfs..


To clarify, I'm not saying duplicating all stuffs through sysfs. I meant 
whether we can:

1) create sub fucntion and do must to have pre configuration through 
devlink
2) only after sub function is created one more available instance was 
added and shown through sysfs
3) user can choose to create and use that mdev instance as it did for 
other type of device like vGPU
4) devlink can still use to report other stuffs


> It got syscaller support too, which is great for validation.
> I have posted subfunction series with mdev and used devlink for all rest of the esw and mgmt. interface to utilize it.
>
> sriov via sysfs and devlink sriov/esw handling has some severe locking issues, mainly because they are from two different interfaces.
>
>> events to devlink or do post or pre configuration through devlink. This is much
>> more easier than forcing all vendors to use devlink.
>>
> It is not about forcing. It is about leveraging existing kernel framework available without reinventing the wheel.
> I am 100% sure, implementing health, dumps, traces, reporters, syscaller, monitors, interrupt configs, extending params via sysfs will be no-go.
> sysfs is not meant for such things anymore. Any modern device management will need all of it.


I'm not familiar with other type of devices, but they should have their 
own vendor specific way to do that. That the real problems.

Thanks
Jason Wang Nov. 20, 2019, 5:34 a.m. UTC | #36
On 2019/11/20 上午11:59, Jason Wang wrote:
> Well, VFIO have multiple types of API. The design is to stick the VFIO
> DMA model like container work for making DMA API work for userspace
> driver. We can invent something our own but it must duplicate with the
> exist API and it will be extra overhead when VFIO DMA API starts to
> support stuffs like nesting or PASID.
>
> So in conclusion for vhost-mdev:
>
> - DMA is still done through VFIO manner e.g container fd etc.
> - device API is totally virtio specific.
>
> Compared with vfio-pci device, the only difference is the device API,
> we don't use device fd but vhost-net fd,


Correction here, device fd is used here instead of vhost-net fd.

Thanks


>   but of course we can switch
> to use device fd. I'm sure we can settle this part down by having a
> way that is acceptable by both sides.
Michael S. Tsirkin Nov. 20, 2019, 7:38 a.m. UTC | #37
On Tue, Nov 19, 2019 at 09:46:53PM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2019 at 07:16:21PM -0500, Michael S. Tsirkin wrote:
> > On Tue, Nov 19, 2019 at 07:10:23PM -0400, Jason Gunthorpe wrote:
> > > On Tue, Nov 19, 2019 at 04:33:40PM -0500, Michael S. Tsirkin wrote:
> > > > On Tue, Nov 19, 2019 at 03:15:47PM -0400, Jason Gunthorpe wrote:
> > > > > On Tue, Nov 19, 2019 at 01:58:42PM -0500, Michael S. Tsirkin wrote:
> > > > > > On Tue, Nov 19, 2019 at 12:46:32PM -0400, Jason Gunthorpe wrote:
> > > > > > > As always, this is all very hard to tell without actually seeing real
> > > > > > > accelerated drivers implement this. 
> > > > > > > 
> > > > > > > Your patch series might be a bit premature in this regard.
> > > > > > 
> > > > > > Actually drivers implementing this have been posted, haven't they?
> > > > > > See e.g. https://lwn.net/Articles/804379/
> > > > > 
> > > > > Is that a real driver? It looks like another example quality
> > > > > thing. 
> > > > > 
> > > > > For instance why do we need any of this if it has '#define
> > > > > IFCVF_MDEV_LIMIT 1' ?
> > > > > 
> > > > > Surely for this HW just use vfio over the entire PCI function and be
> > > > > done with it?
> > > > 
> > > > What this does is allow using it with unmodified virtio drivers
> > > > within guests.  You won't get this with passthrough as it only
> > > > implements parts of virtio in hardware.
> > > 
> > > I don't mean use vfio to perform passthrough, I mean to use vfio to
> > > implement the software parts in userspace while vfio to talk to the
> > > hardware.
> > 
> > You repeated vfio twice here, hard to decode what you meant actually.
> 
> 'while using vifo to talk to the hardware'

Sorry still have trouble reading that.

> > >   kernel -> vfio -> user space virtio driver -> qemu -> guest
> >
> > Exactly what has been implemented for control path.
> 
> I do not mean the modified mediated vfio this series proposes, I mean
> vfio-pci, on a full PCI VF, exactly like we have today.
> 
> > The interface between vfio and userspace is
> > based on virtio which is IMHO much better than
> > a vendor specific one. userspace stays vendor agnostic.
> 
> Why is that even a good thing? It is much easier to provide drivers
> via qemu/etc in user space then it is to make kernel upgrades. We've
> learned this lesson many times.
> 
> This is why we have had the philosophy that if it doesn't need to be
> in the kernel it should be in userspace.
> 
> > > Generally we don't want to see things in the kernel that can be done
> > > in userspace, and to me, at least for this driver, this looks
> > > completely solvable in userspace.
> > 
> > I don't think that extends as far as actively encouraging userspace
> > drivers poking at hardware in a vendor specific way.  
> 
> Yes, it does, if you can implement your user space requirements using
> vfio then why do you need a kernel driver?

People's requirements differ. You are happy with just pass through a VF
you can already use it. Case closed. There are enough people who have
a fixed userspace that people have built virtio accelerators,
now there's value in supporting that, and a vendor specific
userspace blob is not supporting that requirement.

> The kernel needs to be involved when there are things only the kernel
> can do. If IFC has such things they should be spelled out to justify
> using a mediated device.
> 
> > That has lots of security and portability implications and isn't
> > appropriate for everyone. 
> 
> This is already using vfio.

It's using the IOMMU parts since these are portable.
But the userspace interface is vendor-independent here.

> It doesn't make sense to claim that using
> vfio properly is somehow less secure or less portable.
> 
> What I find particularly ugly is that this 'IFC VF NIC' driver
> pretends to be a mediated vfio device, but actually bypasses all the
> mediated device ops for managing dma security and just directly plugs
> the system IOMMU for the underlying PCI device into vfio.
> 
> I suppose this little hack is what is motivating this abuse of vfio in
> the first place?
> 
> Frankly I think a kernel driver touching a PCI function for which vfio
> is now controlling the system iommu for is a violation of the security
> model, and I'm very surprised AlexW didn't NAK this idea.
>
> Perhaps it is because none of the patches actually describe how the
> DMA security model for this so-called mediated device works? :(

That can be improved, good point.

> Or perhaps it is because this submission is split up so much it is
> hard to see what is being proposed? (I note this IFC driver is the
> first user of the mdev_set_iommu_device() function)

I agree it's hard, but then 3 people seem to work on that
at the same time.

> > It is kernel's job to abstract hardware away and present a unified
> > interface as far as possible.
> 
> Sure, you could create a virtio accelerator driver framework in our
> new drivers/accel I hear was started. That could make some sense, if
> we had HW that actually required/benefited from kernel involvement.
> 
> Jason
Michael S. Tsirkin Nov. 20, 2019, 8:52 a.m. UTC | #38
On Wed, Nov 20, 2019 at 03:38:18AM +0000, Parav Pandit wrote:
> 
> 
> > From: Jason Wang <jasowang@redhat.com>
> > Sent: Tuesday, November 19, 2019 9:15 PM
> > 
> > ----- Original Message -----
> > >
> > >
> > > > From: Jason Wang <jasowang@redhat.com>
> > > > Sent: Tuesday, November 19, 2019 1:37 AM
> > > >
> > >
> > > Nop. Devlink is NOT net specific. It works at the bus/device level.
> > > Any block/scsi/crypto can register devlink instance and implement the
> > > necessary ops as long as device has bus.
> > >
> > 
> > Well, uapi/linux/devlink.h told me:
> > 
> > "
> >  * include/uapi/linux/devlink.h - Network physical device Netlink interface "
> > 
> > And the userspace tool was packaged into iproute2, the command was named
> > as "TC", "PORT", "ESWITCH". All of those were strong hints that it was network
> > specific. Even for networking, only few vendors choose to implement this.
> > 
> It is under iproute2 tool but it is not limited to networking.

Want to fix the documentation then?
That's sure to confuse people ...
Jiri Pirko Nov. 20, 2019, 12:03 p.m. UTC | #39
Wed, Nov 20, 2019 at 09:52:23AM CET, mst@redhat.com wrote:
>On Wed, Nov 20, 2019 at 03:38:18AM +0000, Parav Pandit wrote:
>> 
>> 
>> > From: Jason Wang <jasowang@redhat.com>
>> > Sent: Tuesday, November 19, 2019 9:15 PM
>> > 
>> > ----- Original Message -----
>> > >
>> > >
>> > > > From: Jason Wang <jasowang@redhat.com>
>> > > > Sent: Tuesday, November 19, 2019 1:37 AM
>> > > >
>> > >
>> > > Nop. Devlink is NOT net specific. It works at the bus/device level.
>> > > Any block/scsi/crypto can register devlink instance and implement the
>> > > necessary ops as long as device has bus.
>> > >
>> > 
>> > Well, uapi/linux/devlink.h told me:
>> > 
>> > "
>> >  * include/uapi/linux/devlink.h - Network physical device Netlink interface "
>> > 
>> > And the userspace tool was packaged into iproute2, the command was named
>> > as "TC", "PORT", "ESWITCH". All of those were strong hints that it was network
>> > specific. Even for networking, only few vendors choose to implement this.
>> > 
>> It is under iproute2 tool but it is not limited to networking.
>
>Want to fix the documentation then?
>That's sure to confuse people ...

Will do.

>
>-- 
>MST
>
Jason Gunthorpe Nov. 20, 2019, 1:03 p.m. UTC | #40
On Wed, Nov 20, 2019 at 02:38:08AM -0500, Michael S. Tsirkin wrote:
> > > I don't think that extends as far as actively encouraging userspace
> > > drivers poking at hardware in a vendor specific way.  
> > 
> > Yes, it does, if you can implement your user space requirements using
> > vfio then why do you need a kernel driver?
> 
> People's requirements differ. You are happy with just pass through a VF
> you can already use it. Case closed. There are enough people who have
> a fixed userspace that people have built virtio accelerators,
> now there's value in supporting that, and a vendor specific
> userspace blob is not supporting that requirement.

I have no idea what you are trying to explain here. I'm not advocating
for vfio pass through.

Jason
Jason Gunthorpe Nov. 20, 2019, 1:33 p.m. UTC | #41
On Tue, Nov 19, 2019 at 10:24:51PM -0500, Jason Wang wrote:

> > The driver providing the virtio should really be in control of the
> > life cycle policy. For net related virtio that is clearly devlink.
> 
> As replied in another thread, there were already existed devices
> (Intel IFC VF) that doesn't use devlink.

Why is that a justification? Drivers can learn to use devlink, it
isn't like it is set in stone.

Jason
Jason Gunthorpe Nov. 20, 2019, 1:38 p.m. UTC | #42
On Tue, Nov 19, 2019 at 10:59:20PM -0500, Jason Wang wrote:

> > > The interface between vfio and userspace is
> > > based on virtio which is IMHO much better than
> > > a vendor specific one. userspace stays vendor agnostic.
> > 
> > Why is that even a good thing? It is much easier to provide drivers
> > via qemu/etc in user space then it is to make kernel upgrades. We've
> > learned this lesson many times.
> 
> For upgrades, since we had a unified interface. It could be done
> through:
> 
> 1) switch the datapath from hardware to software (e.g vhost)
> 2) unload and load the driver
> 3) switch teh datapath back
> 
> Having drivers in user space have other issues, there're a lot of
> customers want to stick to kernel drivers.

So you want to support upgrade of kernel modules, but runtime
upgrading the userspace part is impossible? Seems very strange to me.

> > This is why we have had the philosophy that if it doesn't need to be
> > in the kernel it should be in userspace.
> 
> Let me clarify again. For this framework, it aims to support both
> kernel driver and userspce driver. For this series, it only contains
> the kernel driver part. What it did is to allow kernel virtio driver
> to control vDPA devices. Then we can provide a unified interface for
> all of the VM, containers and bare metal. For this use case, I don't
> see a way to leave the driver in userspace other than injecting
> traffic back through vhost/TAP which is ugly.

Binding to the other kernel virtio drivers is a reasonable
justification, but none of this comes through in the patch cover
letters or patch commit messages.

> > > That has lots of security and portability implications and isn't
> > > appropriate for everyone.
> > 
> > This is already using vfio. It doesn't make sense to claim that using
> > vfio properly is somehow less secure or less portable.
> > 
> > What I find particularly ugly is that this 'IFC VF NIC' driver
> > pretends to be a mediated vfio device, but actually bypasses all the
> > mediated device ops for managing dma security and just directly plugs
> > the system IOMMU for the underlying PCI device into vfio.
> 
> Well, VFIO have multiple types of API. The design is to stick the VFIO
> DMA model like container work for making DMA API work for userspace
> driver.

Well, it doesn't, that model, for security, is predicated on vfio
being the exclusive owner of the device. For instance if the kernel
driver were to perform DMA as well then security would be lost.

> > I suppose this little hack is what is motivating this abuse of vfio in
> > the first place?
> > 
> > Frankly I think a kernel driver touching a PCI function for which vfio
> > is now controlling the system iommu for is a violation of the security
> > model, and I'm very surprised AlexW didn't NAK this idea.
> >
> > Perhaps it is because none of the patches actually describe how the
> > DMA security model for this so-called mediated device works? :(
> >
> > Or perhaps it is because this submission is split up so much it is
> > hard to see what is being proposed? (I note this IFC driver is the
> > first user of the mdev_set_iommu_device() function)
> 
> Are you objecting the mdev_set_iommu_deivce() stuffs here?

I'm questioning if it fits the vfio PCI device security model, yes.

> > > It is kernel's job to abstract hardware away and present a unified
> > > interface as far as possible.
> > 
> > Sure, you could create a virtio accelerator driver framework in our
> > new drivers/accel I hear was started. That could make some sense, if
> > we had HW that actually required/benefited from kernel involvement.
> 
> The framework is not designed specifically for your card. It tries to be
> generic to support every types of virtio hardware devices, it's not
> tied to any bus (e.g PCI) and any vendor. So it's not only a question
> of how to slice a PCIE ethernet device.

That doesn't explain why this isn't some new driver subsystem and
instead treats vfio as a driver multiplexer.

Jason
Jason Gunthorpe Nov. 20, 2019, 1:41 p.m. UTC | #43
On Wed, Nov 20, 2019 at 12:07:59PM +0800, Jason Wang wrote:

> 1) create sub fucntion and do must to have pre configuration through devlink
> 2) only after sub function is created one more available instance was added
> and shown through sysfs
> 3) user can choose to create and use that mdev instance as it did for other
> type of device like vGPU
> 4) devlink can still use to report other stuffs

Why do we want the extra step #3? The user already indicated they want
a mdev via #1

I have the same question for the PF and VF cases, why doesn't a mdev
get created automatically when the VF is probed? Why does this need
the guid stuff?

The guid stuff was intended for, essentially, multi-function devices
that could be sliced up, I don't think it makes sense to use it for
single-function VF devices like the ICF driver.

Overall the guid thing should be optional. Drivers providing mdev
should be able to use another scheme, like devlink, to on demand
create their mdevs.

Jason
Michael S. Tsirkin Nov. 20, 2019, 1:43 p.m. UTC | #44
On Wed, Nov 20, 2019 at 09:03:19AM -0400, Jason Gunthorpe wrote:
> On Wed, Nov 20, 2019 at 02:38:08AM -0500, Michael S. Tsirkin wrote:
> > > > I don't think that extends as far as actively encouraging userspace
> > > > drivers poking at hardware in a vendor specific way.  
> > > 
> > > Yes, it does, if you can implement your user space requirements using
> > > vfio then why do you need a kernel driver?
> > 
> > People's requirements differ. You are happy with just pass through a VF
> > you can already use it. Case closed. There are enough people who have
> > a fixed userspace that people have built virtio accelerators,
> > now there's value in supporting that, and a vendor specific
> > userspace blob is not supporting that requirement.
> 
> I have no idea what you are trying to explain here. I'm not advocating
> for vfio pass through.

You seem to come from an RDMA background, used to userspace linking to
vendor libraries to do basic things like push bits out on the network,
because users live on the performance edge and rebuild their
userspace often anyway.

Lots of people are not like that, they would rather have the
vendor-specific driver live in the kernel, with userspace being
portable, thank you very much.
Michael S. Tsirkin Nov. 20, 2019, 2:15 p.m. UTC | #45
On Wed, Nov 20, 2019 at 09:38:35AM -0400, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2019 at 10:59:20PM -0500, Jason Wang wrote:
> 
> > > > The interface between vfio and userspace is
> > > > based on virtio which is IMHO much better than
> > > > a vendor specific one. userspace stays vendor agnostic.
> > > 
> > > Why is that even a good thing? It is much easier to provide drivers
> > > via qemu/etc in user space then it is to make kernel upgrades. We've
> > > learned this lesson many times.
> > 
> > For upgrades, since we had a unified interface. It could be done
> > through:
> > 
> > 1) switch the datapath from hardware to software (e.g vhost)
> > 2) unload and load the driver
> > 3) switch teh datapath back
> > 
> > Having drivers in user space have other issues, there're a lot of
> > customers want to stick to kernel drivers.
> 
> So you want to support upgrade of kernel modules, but runtime
> upgrading the userspace part is impossible? Seems very strange to me.

It's still true, you have to kill userspace to update a shared library.

Not to mention that things like rust encourage static builds so you
can't update a library even if you were willing to risk doing
that.

> > > This is why we have had the philosophy that if it doesn't need to be
> > > in the kernel it should be in userspace.
> > 
> > Let me clarify again. For this framework, it aims to support both
> > kernel driver and userspce driver. For this series, it only contains
> > the kernel driver part. What it did is to allow kernel virtio driver
> > to control vDPA devices. Then we can provide a unified interface for
> > all of the VM, containers and bare metal. For this use case, I don't
> > see a way to leave the driver in userspace other than injecting
> > traffic back through vhost/TAP which is ugly.
> 
> Binding to the other kernel virtio drivers is a reasonable
> justification, but none of this comes through in the patch cover
> letters or patch commit messages.

Yea this could have been communicated better.

> > > > That has lots of security and portability implications and isn't
> > > > appropriate for everyone.
> > > 
> > > This is already using vfio. It doesn't make sense to claim that using
> > > vfio properly is somehow less secure or less portable.
> > > 
> > > What I find particularly ugly is that this 'IFC VF NIC' driver
> > > pretends to be a mediated vfio device, but actually bypasses all the
> > > mediated device ops for managing dma security and just directly plugs
> > > the system IOMMU for the underlying PCI device into vfio.
> > 
> > Well, VFIO have multiple types of API. The design is to stick the VFIO
> > DMA model like container work for making DMA API work for userspace
> > driver.
> 
> Well, it doesn't, that model, for security, is predicated on vfio
> being the exclusive owner of the device. For instance if the kernel
> driver were to perform DMA as well then security would be lost.

The assumption at least IFC driver makes is that the kernel
driver does no DMA.

> > > I suppose this little hack is what is motivating this abuse of vfio in
> > > the first place?
> > > 
> > > Frankly I think a kernel driver touching a PCI function for which vfio
> > > is now controlling the system iommu for is a violation of the security
> > > model, and I'm very surprised AlexW didn't NAK this idea.
> > >
> > > Perhaps it is because none of the patches actually describe how the
> > > DMA security model for this so-called mediated device works? :(
> > >
> > > Or perhaps it is because this submission is split up so much it is
> > > hard to see what is being proposed? (I note this IFC driver is the
> > > first user of the mdev_set_iommu_device() function)
> > 
> > Are you objecting the mdev_set_iommu_deivce() stuffs here?
> 
> I'm questioning if it fits the vfio PCI device security model, yes.

If you look at the IFC patch you'll find it doesn't do DMA, that's
what makes it secure.

> > > > It is kernel's job to abstract hardware away and present a unified
> > > > interface as far as possible.
> > > 
> > > Sure, you could create a virtio accelerator driver framework in our
> > > new drivers/accel I hear was started. That could make some sense, if
> > > we had HW that actually required/benefited from kernel involvement.
> > 
> > The framework is not designed specifically for your card. It tries to be
> > generic to support every types of virtio hardware devices, it's not
> > tied to any bus (e.g PCI) and any vendor. So it's not only a question
> > of how to slice a PCIE ethernet device.
> 
> That doesn't explain why this isn't some new driver subsystem and
> instead treats vfio as a driver multiplexer.
> 
> Jason

That motivation's missing.
Jason Gunthorpe Nov. 20, 2019, 2:30 p.m. UTC | #46
On Wed, Nov 20, 2019 at 08:43:20AM -0500, Michael S. Tsirkin wrote:
> On Wed, Nov 20, 2019 at 09:03:19AM -0400, Jason Gunthorpe wrote:
> > On Wed, Nov 20, 2019 at 02:38:08AM -0500, Michael S. Tsirkin wrote:
> > > > > I don't think that extends as far as actively encouraging userspace
> > > > > drivers poking at hardware in a vendor specific way.  
> > > > 
> > > > Yes, it does, if you can implement your user space requirements using
> > > > vfio then why do you need a kernel driver?
> > > 
> > > People's requirements differ. You are happy with just pass through a VF
> > > you can already use it. Case closed. There are enough people who have
> > > a fixed userspace that people have built virtio accelerators,
> > > now there's value in supporting that, and a vendor specific
> > > userspace blob is not supporting that requirement.
> > 
> > I have no idea what you are trying to explain here. I'm not advocating
> > for vfio pass through.
> 
> You seem to come from an RDMA background, used to userspace linking to
> vendor libraries to do basic things like push bits out on the network,
> because users live on the performance edge and rebuild their
> userspace often anyway.
> 
> Lots of people are not like that, they would rather have the
> vendor-specific driver live in the kernel, with userspace being
> portable, thank you very much.

You are actually proposing a very RDMA like approach with a split
kernel/user driver design. Maybe the virtio user driver will turn out
to be 'portable'.

Based on the last 20 years of experience, the kernel component has
proven to be the larger burden and drag than the userspace part. I
think the high interest in DPDK, SPDK and others show this is a common
principle.

At the very least for new approaches like this it makes alot of sense
to have a user space driver until enough HW is available that a
proper, well thought out kernel side can be built.

For instance, this VFIO based approach might be very suitable to the
intel VF based ICF driver, but we don't yet have an example of non-VF
HW that might not be well suited to VFIO.

Jason
Michael S. Tsirkin Nov. 20, 2019, 2:57 p.m. UTC | #47
On Wed, Nov 20, 2019 at 10:30:54AM -0400, Jason Gunthorpe wrote:
> On Wed, Nov 20, 2019 at 08:43:20AM -0500, Michael S. Tsirkin wrote:
> > On Wed, Nov 20, 2019 at 09:03:19AM -0400, Jason Gunthorpe wrote:
> > > On Wed, Nov 20, 2019 at 02:38:08AM -0500, Michael S. Tsirkin wrote:
> > > > > > I don't think that extends as far as actively encouraging userspace
> > > > > > drivers poking at hardware in a vendor specific way.  
> > > > > 
> > > > > Yes, it does, if you can implement your user space requirements using
> > > > > vfio then why do you need a kernel driver?
> > > > 
> > > > People's requirements differ. You are happy with just pass through a VF
> > > > you can already use it. Case closed. There are enough people who have
> > > > a fixed userspace that people have built virtio accelerators,
> > > > now there's value in supporting that, and a vendor specific
> > > > userspace blob is not supporting that requirement.
> > > 
> > > I have no idea what you are trying to explain here. I'm not advocating
> > > for vfio pass through.
> > 
> > You seem to come from an RDMA background, used to userspace linking to
> > vendor libraries to do basic things like push bits out on the network,
> > because users live on the performance edge and rebuild their
> > userspace often anyway.
> > 
> > Lots of people are not like that, they would rather have the
> > vendor-specific driver live in the kernel, with userspace being
> > portable, thank you very much.
> 
> You are actually proposing a very RDMA like approach with a split
> kernel/user driver design. Maybe the virtio user driver will turn out
> to be 'portable'.
> 
> Based on the last 20 years of experience, the kernel component has
> proven to be the larger burden and drag than the userspace part. I
> think the high interest in DPDK, SPDK and others show this is a common
> principle.

And I guess the interest in BPF shows the opposite?
I don't see how this kind of argument proves anything.  DPDK/SPDK are
written by a group of developers who care about raw speed and nothing
else. I guess in that setting you want a userspace driver. I know you
work for a hardware company so to you it looks like that's all people
care about.  More power to you, but that need seems to be
addressed by dpdk.
But lots of people would rather have e.g. better security
than a 0.1% faster networking.

> At the very least for new approaches like this it makes alot of sense
> to have a user space driver until enough HW is available that a
> proper, well thought out kernel side can be built.

But hardware is available, driver has been posted by Intel.
Have you looked at that?

So I am not sure it's a good idea to discuss whether code is "proper" or
"so-called", that just does not sound like constructive criticism.
And I think it might be helpful if you look at the code and provide
comments, so far your comments are just on the cover letter and commit
logs. If you look at that you might find your answer to why Alex did not
nak this.

> For instance, this VFIO based approach might be very suitable to the
> intel VF based ICF driver, but we don't yet have an example of non-VF
> HW that might not be well suited to VFIO.
> 
> Jason

I don't think we should keep moving the goalposts like this.

If people write drivers and find some infrastruture useful,
and it looks more or less generic on the outset, then I don't
see why it's a bad idea to merge it.

*We don't want to decide how to support this hardware, write a userspace driver*
isn't a reasonable approach IMHO.
Jason Gunthorpe Nov. 20, 2019, 4:45 p.m. UTC | #48
On Wed, Nov 20, 2019 at 09:57:17AM -0500, Michael S. Tsirkin wrote:
> On Wed, Nov 20, 2019 at 10:30:54AM -0400, Jason Gunthorpe wrote:
> > On Wed, Nov 20, 2019 at 08:43:20AM -0500, Michael S. Tsirkin wrote:
> > > On Wed, Nov 20, 2019 at 09:03:19AM -0400, Jason Gunthorpe wrote:
> > > > On Wed, Nov 20, 2019 at 02:38:08AM -0500, Michael S. Tsirkin wrote:
> > > > > > > I don't think that extends as far as actively encouraging userspace
> > > > > > > drivers poking at hardware in a vendor specific way.  
> > > > > > 
> > > > > > Yes, it does, if you can implement your user space requirements using
> > > > > > vfio then why do you need a kernel driver?
> > > > > 
> > > > > People's requirements differ. You are happy with just pass through a VF
> > > > > you can already use it. Case closed. There are enough people who have
> > > > > a fixed userspace that people have built virtio accelerators,
> > > > > now there's value in supporting that, and a vendor specific
> > > > > userspace blob is not supporting that requirement.
> > > > 
> > > > I have no idea what you are trying to explain here. I'm not advocating
> > > > for vfio pass through.
> > > 
> > > You seem to come from an RDMA background, used to userspace linking to
> > > vendor libraries to do basic things like push bits out on the network,
> > > because users live on the performance edge and rebuild their
> > > userspace often anyway.
> > > 
> > > Lots of people are not like that, they would rather have the
> > > vendor-specific driver live in the kernel, with userspace being
> > > portable, thank you very much.
> > 
> > You are actually proposing a very RDMA like approach with a split
> > kernel/user driver design. Maybe the virtio user driver will turn out
> > to be 'portable'.
> > 
> > Based on the last 20 years of experience, the kernel component has
> > proven to be the larger burden and drag than the userspace part. I
> > think the high interest in DPDK, SPDK and others show this is a common
> > principle.
> 
> And I guess the interest in BPF shows the opposite?

There is room for both, I wouldn't discount either approach entirely
out of hand.

> > At the very least for new approaches like this it makes alot of sense
> > to have a user space driver until enough HW is available that a
> > proper, well thought out kernel side can be built.
> 
> But hardware is available, driver has been posted by Intel.
> Have you looked at that?

I'm not sure pointing at that driver is so helpful, it is very small
and mostly just reflects virtio ops into some undocumented register
pokes.

There is no explanation at all for the large scale architecture
choices:
 - Why vfio
 - Why mdev without providing a device IOMMU
 - Why use GUID lifecycle management for singlton function PF/VF
   drivers
 - Why not use devlink
 - Why not use vfio-pci with a userspace driver

These are legitimate questions and answers like "because we like it
this way" or "this is how the drivers are written today" isn't very
satisfying at all.

> > For instance, this VFIO based approach might be very suitable to the
> > intel VF based ICF driver, but we don't yet have an example of non-VF
> > HW that might not be well suited to VFIO.
>
> I don't think we should keep moving the goalposts like this.

It is ABI, it should be done as best we can as we have to live with it
for a long time. Right now HW is just starting to come to market with
VDPA and it feels rushed to design a whole subsystem style ABI around
one, quite simplistic, driver example.

> If people write drivers and find some infrastruture useful,
> and it looks more or less generic on the outset, then I don't
> see why it's a bad idea to merge it.

Because it is userspace ABI, caution is always justified when defining
new ABI.

Jason
Alex Williamson Nov. 20, 2019, 5:28 p.m. UTC | #49
On Wed, 20 Nov 2019 09:38:35 -0400
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Tue, Nov 19, 2019 at 10:59:20PM -0500, Jason Wang wrote:
> 
> > > > The interface between vfio and userspace is
> > > > based on virtio which is IMHO much better than
> > > > a vendor specific one. userspace stays vendor agnostic.  
> > > 
> > > Why is that even a good thing? It is much easier to provide drivers
> > > via qemu/etc in user space then it is to make kernel upgrades. We've
> > > learned this lesson many times.  
> > 
> > For upgrades, since we had a unified interface. It could be done
> > through:
> > 
> > 1) switch the datapath from hardware to software (e.g vhost)
> > 2) unload and load the driver
> > 3) switch teh datapath back
> > 
> > Having drivers in user space have other issues, there're a lot of
> > customers want to stick to kernel drivers.  
> 
> So you want to support upgrade of kernel modules, but runtime
> upgrading the userspace part is impossible? Seems very strange to me.
> 
> > > This is why we have had the philosophy that if it doesn't need to be
> > > in the kernel it should be in userspace.  
> > 
> > Let me clarify again. For this framework, it aims to support both
> > kernel driver and userspce driver. For this series, it only contains
> > the kernel driver part. What it did is to allow kernel virtio driver
> > to control vDPA devices. Then we can provide a unified interface for
> > all of the VM, containers and bare metal. For this use case, I don't
> > see a way to leave the driver in userspace other than injecting
> > traffic back through vhost/TAP which is ugly.  
> 
> Binding to the other kernel virtio drivers is a reasonable
> justification, but none of this comes through in the patch cover
> letters or patch commit messages.
> 
> > > > That has lots of security and portability implications and isn't
> > > > appropriate for everyone.  
> > > 
> > > This is already using vfio. It doesn't make sense to claim that using
> > > vfio properly is somehow less secure or less portable.
> > > 
> > > What I find particularly ugly is that this 'IFC VF NIC' driver
> > > pretends to be a mediated vfio device, but actually bypasses all the
> > > mediated device ops for managing dma security and just directly plugs
> > > the system IOMMU for the underlying PCI device into vfio.  
> > 
> > Well, VFIO have multiple types of API. The design is to stick the VFIO
> > DMA model like container work for making DMA API work for userspace
> > driver.  
> 
> Well, it doesn't, that model, for security, is predicated on vfio
> being the exclusive owner of the device. For instance if the kernel
> driver were to perform DMA as well then security would be lost.
> 
> > > I suppose this little hack is what is motivating this abuse of vfio in
> > > the first place?
> > > 
> > > Frankly I think a kernel driver touching a PCI function for which vfio
> > > is now controlling the system iommu for is a violation of the security
> > > model, and I'm very surprised AlexW didn't NAK this idea.
> > >
> > > Perhaps it is because none of the patches actually describe how the
> > > DMA security model for this so-called mediated device works? :(
> > >
> > > Or perhaps it is because this submission is split up so much it is
> > > hard to see what is being proposed? (I note this IFC driver is the
> > > first user of the mdev_set_iommu_device() function)  
> > 
> > Are you objecting the mdev_set_iommu_deivce() stuffs here?  
> 
> I'm questioning if it fits the vfio PCI device security model, yes.

The mdev IOMMU backing device model is for when an mdev device has
IOMMU based isolation, either via the PCI requester ID or via requester
ID + PASID.  For example, an SR-IOV VF may be used by a vendor to
provide IOMMU based translation and isolation, but the VF may not be
complete otherwise to provide a self contained device.  It might
require explicit coordination and interaction with the PF driver, ie.
mediation.  The IOMMU backing device is certainly not meant to share an
IOMMU address space with host drivers, except as necessary for the
mediation of the device.  The vfio model manages the IOMMU domain of
the backing device exclusively, any attempt to dual-host the device
respective to the IOMMU should fault in the dma/iommu-ops.  Thanks,

Alex
Jason Gunthorpe Nov. 20, 2019, 6:11 p.m. UTC | #50
On Wed, Nov 20, 2019 at 10:28:56AM -0700, Alex Williamson wrote:
> > > Are you objecting the mdev_set_iommu_deivce() stuffs here?  
> > 
> > I'm questioning if it fits the vfio PCI device security model, yes.
> 
> The mdev IOMMU backing device model is for when an mdev device has
> IOMMU based isolation, either via the PCI requester ID or via requester
> ID + PASID.  For example, an SR-IOV VF may be used by a vendor to
> provide IOMMU based translation and isolation, but the VF may not be
> complete otherwise to provide a self contained device.  It might
> require explicit coordination and interaction with the PF driver, ie.
> mediation.  

In this case the PF does not look to be involved, the ICF kernel
driver is only manipulating registers in the same VF that the vfio
owns the IOMMU for.

This is why I keep calling it a "so-called mediated device" because it
is absolutely not clear what the kernel driver is mediating. Nearly
all its work is providing a subsystem-style IOCTL interface under the
existing vfio multiplexer unrelated to vfio requirements for DMA.

> The IOMMU backing device is certainly not meant to share an IOMMU
> address space with host drivers, except as necessary for the
> mediation of the device.  The vfio model manages the IOMMU domain of
> the backing device exclusively, any attempt to dual-host the device
> respective to the IOMMU should fault in the dma/iommu-ops.  Thanks,

Sounds more reasonable if the kernel dma_ops are prevented while vfio
is using the device.

However, to me it feels wrong that just because a driver wishes to use
PASID or IOMMU features it should go through vfio and mediated
devices.

It is not even necessary as we have several examples already of
drivers using these features without vfio.

I feel like mdev is suffering from mission creep. I see people
proposing to use mdev for many wild things, the Mellanox SF stuff in
the other thread and this 'virtio subsystem' being the two that have
come up publicly this month.

Putting some boundaries on mdev usage would really help people know
when to use it. My top two from this discussion would be:

- mdev devices should only bind to vfio. It is not a general kernel
  driver matcher mechanism. It is not 'virtual-bus'.

- mdev & vfio are not a substitute for a proper kernel subsystem. We
  shouldn't export a complex subsystem-like ioctl API through
  vfio ioctl extensions. Make a proper subsystem, it is not so hard.

Maybe others agree?

Thanks,
Jason
Michael S. Tsirkin Nov. 20, 2019, 10:05 p.m. UTC | #51
On Wed, Nov 20, 2019 at 12:45:25PM -0400, Jason Gunthorpe wrote:
> > > For instance, this VFIO based approach might be very suitable to the
> > > intel VF based ICF driver, but we don't yet have an example of non-VF
> > > HW that might not be well suited to VFIO.
> >
> > I don't think we should keep moving the goalposts like this.
> 
> It is ABI, it should be done as best we can as we have to live with it
> for a long time. Right now HW is just starting to come to market with
> VDPA and it feels rushed to design a whole subsystem style ABI around
> one, quite simplistic, driver example.

Well one has to enable hardware in some way. It's not really reasonable
to ask for multiple devices to be available just so there's a driver and
people can use them. At this rate no one will want to be the first to
ship new devices ;)

> > If people write drivers and find some infrastruture useful,
> > and it looks more or less generic on the outset, then I don't
> > see why it's a bad idea to merge it.
> 
> Because it is userspace ABI, caution is always justified when defining
> new ABI.


Reasonable caution, sure. Asking Alex to block Intel's driver until
someone else catches up and ships competing hardware isn't reasonable
though. If that's your proposal I guess we'll have to agree to disagree.
Alex Williamson Nov. 20, 2019, 10:07 p.m. UTC | #52
On Wed, 20 Nov 2019 14:11:08 -0400
Jason Gunthorpe <jgg@ziepe.ca> wrote:

> On Wed, Nov 20, 2019 at 10:28:56AM -0700, Alex Williamson wrote:
> > > > Are you objecting the mdev_set_iommu_deivce() stuffs here?    
> > > 
> > > I'm questioning if it fits the vfio PCI device security model, yes.  
> > 
> > The mdev IOMMU backing device model is for when an mdev device has
> > IOMMU based isolation, either via the PCI requester ID or via requester
> > ID + PASID.  For example, an SR-IOV VF may be used by a vendor to
> > provide IOMMU based translation and isolation, but the VF may not be
> > complete otherwise to provide a self contained device.  It might
> > require explicit coordination and interaction with the PF driver, ie.
> > mediation.    
> 
> In this case the PF does not look to be involved, the ICF kernel
> driver is only manipulating registers in the same VF that the vfio
> owns the IOMMU for.

The mdev_set_iommu_device() call is probably getting caught up in the
confusion of mdev as it exists today being vfio specific.  What I
described in my reply is vfio specific.  The vfio iommu backend is
currently the only code that calls mdev_get_iommu_device(), JasonW
doesn't use it in the virtio-mdev code, so this seems like a stray vfio
specific interface that's setup by IFC but never used.

> This is why I keep calling it a "so-called mediated device" because it
> is absolutely not clear what the kernel driver is mediating. Nearly
> all its work is providing a subsystem-style IOCTL interface under the
> existing vfio multiplexer unrelated to vfio requirements for DMA.

Names don't always evolve well to what an interface becomes, see for
example vfio.  However, even in the vfio sense of mediated devices we
have protocol translation.  The mdev vendor driver translates vfio API
callbacks into hardware specific interactions.  Is this really much
different?

> > The IOMMU backing device is certainly not meant to share an IOMMU
> > address space with host drivers, except as necessary for the
> > mediation of the device.  The vfio model manages the IOMMU domain of
> > the backing device exclusively, any attempt to dual-host the device
> > respective to the IOMMU should fault in the dma/iommu-ops.  Thanks,  
> 
> Sounds more reasonable if the kernel dma_ops are prevented while vfio
> is using the device.

AFAIK we can't mix DMA ops and IOMMU ops at the same time and the
domain information necessary for the latter is owned within the vfio
IOMMU backend.

> However, to me it feels wrong that just because a driver wishes to use
> PASID or IOMMU features it should go through vfio and mediated
> devices.

I don't think I said this.  IOMMU backing of an mdev is an acceleration
feature as far as vfio-mdev is concerned.  There are clearly other ways
to use the IOMMU.

> It is not even necessary as we have several examples already of
> drivers using these features without vfio.

Of course.

> I feel like mdev is suffering from mission creep. I see people
> proposing to use mdev for many wild things, the Mellanox SF stuff in
> the other thread and this 'virtio subsystem' being the two that have
> come up publicly this month.

Tell me about it... ;)
 
> Putting some boundaries on mdev usage would really help people know
> when to use it. My top two from this discussion would be:
> 
> - mdev devices should only bind to vfio. It is not a general kernel
>   driver matcher mechanism. It is not 'virtual-bus'.

I think this requires the driver-core knowledge to really appreciate.
Otherwise there's apparently a common need to create sub-devices and
without closer inspection of the bus:driver API contract, it's too easy
to try to abstract the device:driver API via the bus.  mdev already has
a notion that the device itself can use any API, but the interface to
the bus is the vendor provided, vfio compatible callbacks.

> - mdev & vfio are not a substitute for a proper kernel subsystem. We
>   shouldn't export a complex subsystem-like ioctl API through
>   vfio ioctl extensions. Make a proper subsystem, it is not so hard.

This is not as clear to me, is "ioctl" used once or twice too often or
are you describing a defined structure of callbacks as an ioctl API?
The vfio mdev interface is just an extension of the file descriptor
based vfio device API.  The device needs to handle actual ioctls, but
JasonW's virtio-mdev series had their own set of callbacks.  Maybe a
concrete example of this item would be helpful.  Thanks,

Alex
Parav Pandit Nov. 20, 2019, 10:39 p.m. UTC | #53
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Wednesday, November 20, 2019 4:08 PM
> 
> On Wed, 20 Nov 2019 14:11:08 -0400
> Jason Gunthorpe <jgg@ziepe.ca> wrote:
> 
> > I feel like mdev is suffering from mission creep. I see people
> > proposing to use mdev for many wild things, the Mellanox SF stuff in
> > the other thread and this 'virtio subsystem' being the two that have
> > come up publicly this month.
> 
> Tell me about it... ;)
> 
Initial Mellanox sub function proposal was done using dedicated non-mdev subdev bus in [1] because mdev looked very vfio-ish.

Along the way mdev proposal was suggested at [2] by mdev maintainers to use.
The bus existed that detached two drivers (mdev and vfio_mdev), there was some motivation to attach other drivers.

After that we continued discussion and mdev extension using alias to have persistent naming in [3].

So far so good, but when we want to have actual use of mdev driver, it doesn't look right. :-)

> > Putting some boundaries on mdev usage would really help people know
> > when to use it. My top two from this discussion would be:
> >
> > - mdev devices should only bind to vfio. It is not a general kernel
> >   driver matcher mechanism. It is not 'virtual-bus'.
> 
So yes, we must define the scope of mdev and have right documentation to capture that.

If mdev is not supposed to be extended beyond vfio, why do you even need a bus? For iommu attachment?

[1] https://lkml.org/lkml/2019/3/1/19
[2] https://lkml.org/lkml/2019/3/7/696
[3] https://lkml.org/lkml/2019/8/26/854
Jason Gunthorpe Nov. 21, 2019, 1:38 a.m. UTC | #54
On Wed, Nov 20, 2019 at 05:05:00PM -0500, Michael S. Tsirkin wrote:
> On Wed, Nov 20, 2019 at 12:45:25PM -0400, Jason Gunthorpe wrote:
> > > > For instance, this VFIO based approach might be very suitable to the
> > > > intel VF based ICF driver, but we don't yet have an example of non-VF
> > > > HW that might not be well suited to VFIO.
> > >
> > > I don't think we should keep moving the goalposts like this.
> > 
> > It is ABI, it should be done as best we can as we have to live with it
> > for a long time. Right now HW is just starting to come to market with
> > VDPA and it feels rushed to design a whole subsystem style ABI around
> > one, quite simplistic, driver example.
> 
> Well one has to enable hardware in some way. It's not really reasonable
> to ask for multiple devices to be available just so there's a driver and
> people can use them.

Er, this has actually been a fairly standard ask for new subsystems.

I think virtio is well grounded here compared to other things I've
seen, but it should still be done with a lot more NIC community involvement.

> At this rate no one will want to be the first to ship new devices ;)

Why?
 
> > > If people write drivers and find some infrastruture useful,
> > > and it looks more or less generic on the outset, then I don't
> > > see why it's a bad idea to merge it.
> > 
> > Because it is userspace ABI, caution is always justified when defining
> > new ABI.
> 
> Reasonable caution, sure. Asking Alex to block Intel's driver until
> someone else catches up and ships competing hardware isn't reasonable
> though. If that's your proposal I guess we'll have to agree to disagree.

Vendors may be willing to participate, as Mellanox is doing,
pre-product.

Jason
Jason Gunthorpe Nov. 21, 2019, 3:03 a.m. UTC | #55
On Wed, Nov 20, 2019 at 03:07:32PM -0700, Alex Williamson wrote:

> > On Wed, Nov 20, 2019 at 10:28:56AM -0700, Alex Williamson wrote:
> > > > > Are you objecting the mdev_set_iommu_deivce() stuffs here?    
> > > > 
> > > > I'm questioning if it fits the vfio PCI device security model, yes.  
> > > 
> > > The mdev IOMMU backing device model is for when an mdev device has
> > > IOMMU based isolation, either via the PCI requester ID or via requester
> > > ID + PASID.  For example, an SR-IOV VF may be used by a vendor to
> > > provide IOMMU based translation and isolation, but the VF may not be
> > > complete otherwise to provide a self contained device.  It might
> > > require explicit coordination and interaction with the PF driver, ie.
> > > mediation.    
> > 
> > In this case the PF does not look to be involved, the ICF kernel
> > driver is only manipulating registers in the same VF that the vfio
> > owns the IOMMU for.
> 
> The mdev_set_iommu_device() call is probably getting caught up in the
> confusion of mdev as it exists today being vfio specific.  What I
> described in my reply is vfio specific.  The vfio iommu backend is
> currently the only code that calls mdev_get_iommu_device(), JasonW
> doesn't use it in the virtio-mdev code, so this seems like a stray vfio
> specific interface that's setup by IFC but never used.

I couldn't really say, it was the only thing I noticed in IFC that
seemed to have anything to do with identifying what IOMMU group to use
for the vfio interface..
 
> > This is why I keep calling it a "so-called mediated device" because it
> > is absolutely not clear what the kernel driver is mediating. Nearly
> > all its work is providing a subsystem-style IOCTL interface under the
> > existing vfio multiplexer unrelated to vfio requirements for DMA.
> 
> Names don't always evolve well to what an interface becomes, see for
> example vfio.  However, even in the vfio sense of mediated devices we
> have protocol translation.  The mdev vendor driver translates vfio API
> callbacks into hardware specific interactions.  Is this really much
> different?

I think the name was fine if you constrain 'mediated' to mean
'mediated IOMMU'

Broading to be basically any driver interface is starting to overlap
with the role of the driver core and subsystems in Linux.

> > However, to me it feels wrong that just because a driver wishes to use
> > PASID or IOMMU features it should go through vfio and mediated
> > devices.
> 
> I don't think I said this.  IOMMU backing of an mdev is an acceleration
> feature as far as vfio-mdev is concerned.  There are clearly other ways
> to use the IOMMU.

Sorry, I didn't mean to imply you said this, I was mearly reflecting
on the mission creep comment below. Often in private converstations
the use of mdev has been justified by 'because it uses IOMMU'

> > I feel like mdev is suffering from mission creep. I see people
> > proposing to use mdev for many wild things, the Mellanox SF stuff in
> > the other thread and this 'virtio subsystem' being the two that have
> > come up publicly this month.
> 
> Tell me about it... ;)
>  
> > Putting some boundaries on mdev usage would really help people know
> > when to use it. My top two from this discussion would be:
> > 
> > - mdev devices should only bind to vfio. It is not a general kernel
> >   driver matcher mechanism. It is not 'virtual-bus'.
> 
> I think this requires the driver-core knowledge to really appreciate.
> Otherwise there's apparently a common need to create sub-devices and
> without closer inspection of the bus:driver API contract, it's too easy
> to try to abstract the device:driver API via the bus.  mdev already has
> a notion that the device itself can use any API, but the interface to
> the bus is the vendor provided, vfio compatible callbacks.

But now that we are talking about this, I think there is a pretty
clear opinion forming that if you want to do kernel-kernel drivers
that is 'virtual bus' as proposed in this threads patch, not mdev.

Adding that knowledge to the mdev documentation would probably help
future people.

> > - mdev & vfio are not a substitute for a proper kernel subsystem. We
> >   shouldn't export a complex subsystem-like ioctl API through
> >   vfio ioctl extensions. Make a proper subsystem, it is not so hard.
> 
> This is not as clear to me, is "ioctl" used once or twice too often or
> are you describing a defined structure of callbacks as an ioctl API?
> The vfio mdev interface is just an extension of the file descriptor
> based vfio device API.  The device needs to handle actual ioctls, but
> JasonW's virtio-mdev series had their own set of callbacks.  Maybe a
> concrete example of this item would be helpful.  Thanks,

I did not intend it to be a clear opinion, more of a vauge guide for
documentation. I think as a maintainer you will be asked to make this
call.

The role of a subsystem in Linux is traditionally to take many
different kinds of HW devices and bring them to a common programming
API. Provide management and diagnostics, and expose some user ABI to
access the HW.

The role of vfio has traditionally been around secure device
assignment of a HW resource to a VM. I'm not totally clear on what the
role if mdev is seen to be, but all the mdev drivers in the tree seem
to make 'and pass it to KVM' a big part of their description.

So, looking at the virtio patches, I see some intended use is to map
some BAR pages into the VM. I see an ops struct to take different
kinds of HW devices to a common internal kernel API. I understand a
desire to bind kernel drivers that are not vfio to those ops, and I
see a user ioctl ABI based around those ops.

I also understand the BAR map is not registers, but just a write-only
doorbell page. So I suppose any interaction the guest will have with
the device prior to starting DMA is going to be software emulated in
qemu, and relayed into ioctls. (?) ie this is no longer strictly
"device assignment" but "accelerated device emulation".

Is virtio more vfio or more subsystem? The biggest thing that points
toward vfio is the intended use. The other items push away.

Frankly, when I look at what this virtio stuff is doing I see RDMA:
 - Both have a secure BAR pages for mmaping to userspace (or VM)
 - Both are prevented from interacting with the device at a register
   level and must call to the kernel - ie creating resources is a
   kernel call - for security.
 - Both create command request/response rings in userspace controlled
   memory and have HW DMA to read requests and DMA to generate responses
 - Both allow the work on the rings to DMA outside the ring to
   addresses controlled by userspace.
 - Both have to support a mixture of HW that uses on-device security
   or IOMMU based security.

(I actually gave a talk on how alot of modern HW is following the RDMA
 design patterns at plumbers, maybe video will come out soon)

We've had the same debate with RDMA. Like VFIO it has an extensible
file descriptor with a driver-specific path that can serve an
unlimited range of uses. We have had to come up with some sensability
and definition for "what is RDMA" and is appropriate for the FD.

Jason
Jason Wang Nov. 21, 2019, 3:52 a.m. UTC | #56
On 2019/11/20 下午9:38, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2019 at 10:59:20PM -0500, Jason Wang wrote:
>
>>>> The interface between vfio and userspace is
>>>> based on virtio which is IMHO much better than
>>>> a vendor specific one. userspace stays vendor agnostic.
>>> Why is that even a good thing? It is much easier to provide drivers
>>> via qemu/etc in user space then it is to make kernel upgrades. We've
>>> learned this lesson many times.
>> For upgrades, since we had a unified interface. It could be done
>> through:
>>
>> 1) switch the datapath from hardware to software (e.g vhost)
>> 2) unload and load the driver
>> 3) switch teh datapath back
>>
>> Having drivers in user space have other issues, there're a lot of
>> customers want to stick to kernel drivers.
> So you want to support upgrade of kernel modules, but runtime
> upgrading the userspace part is impossible? Seems very strange to me.


Since you're talking about kernel upgrades, so comes such technical 
possibility.


>
>>> This is why we have had the philosophy that if it doesn't need to be
>>> in the kernel it should be in userspace.
>> Let me clarify again. For this framework, it aims to support both
>> kernel driver and userspce driver. For this series, it only contains
>> the kernel driver part. What it did is to allow kernel virtio driver
>> to control vDPA devices. Then we can provide a unified interface for
>> all of the VM, containers and bare metal. For this use case, I don't
>> see a way to leave the driver in userspace other than injecting
>> traffic back through vhost/TAP which is ugly.
> Binding to the other kernel virtio drivers is a reasonable
> justification, but none of this comes through in the patch cover
> letters or patch commit messages.


In the cover letter it had (of course I'm not native speaker but I will 
try my best to make it more readable for next version).

"
There are hardwares that can do virtio datapath offloading while
having its own control path. This path tries to implement a mdev based
unified API to support using kernel virtio driver to drive those
devices. This is done by introducing a new mdev transport for virtio
(virtio_mdev) and register itself as a new kind of mdev driver. Then
it provides a unified way for kernel virtio driver to talk with mdev
device implementation.

Though the series only contains kernel driver support, the goal is to
make the transport generic enough to support userspace drivers. This
means vhost-mdev[1] could be built on top as well by reusing the
transport.

"


>
>>>> That has lots of security and portability implications and isn't
>>>> appropriate for everyone.
>>> This is already using vfio. It doesn't make sense to claim that using
>>> vfio properly is somehow less secure or less portable.
>>>
>>> What I find particularly ugly is that this 'IFC VF NIC' driver
>>> pretends to be a mediated vfio device, but actually bypasses all the
>>> mediated device ops for managing dma security and just directly plugs
>>> the system IOMMU for the underlying PCI device into vfio.
>> Well, VFIO have multiple types of API. The design is to stick the VFIO
>> DMA model like container work for making DMA API work for userspace
>> driver.
> Well, it doesn't, that model, for security, is predicated on vfio
> being the exclusive owner of the device. For instance if the kernel
> driver were to perform DMA as well then security would be lost.


It's the responsibility of the kernel mdev driver to preserve the DMA 
isolation. And it's possible that mdev needs communicate with the master 
(PF or other) using its own memory, this should be allowed.


> to
>>> I suppose this little hack is what is motivating this abuse of vfio in
>>> the first place?
>>>
>>> Frankly I think a kernel driver touching a PCI function for which vfio
>>> is now controlling the system iommu for is a violation of the security
>>> model, and I'm very surprised AlexW didn't NAK this idea.
>>>
>>> Perhaps it is because none of the patches actually describe how the
>>> DMA security model for this so-called mediated device works? :(
>>>
>>> Or perhaps it is because this submission is split up so much it is
>>> hard to see what is being proposed? (I note this IFC driver is the
>>> first user of the mdev_set_iommu_device() function)
>> Are you objecting the mdev_set_iommu_deivce() stuffs here?
> I'm questioning if it fits the vfio PCI device security model, yes.
>
>>>> It is kernel's job to abstract hardware away and present a unified
>>>> interface as far as possible.
>>> Sure, you could create a virtio accelerator driver framework in our
>>> new drivers/accel I hear was started. That could make some sense, if
>>> we had HW that actually required/benefited from kernel involvement.
>> The framework is not designed specifically for your card. It tries to be
>> generic to support every types of virtio hardware devices, it's not
>> tied to any bus (e.g PCI) and any vendor. So it's not only a question
>> of how to slice a PCIE ethernet device.
> That doesn't explain why this isn't some new driver subsystem


The vhost-mdev is a vfio-mdev device. It sticks to the VFIO programming 
model. Any reason to reinvent the wheel?


> and
> instead treats vfio as a driver multiplexer.


I fail to understand this. VFIO had already support PCI, AP, mdev, and 
possible other buses (e.g vmbus) in the future. VFIO is not PCI 
specific, why requires vfio-mdev to be PCI specific?

Thanks

>
> Jason
>
Jason Wang Nov. 21, 2019, 3:57 a.m. UTC | #57
On 2019/11/20 下午9:33, Jason Gunthorpe wrote:
> On Tue, Nov 19, 2019 at 10:24:51PM -0500, Jason Wang wrote:
>
>>> The driver providing the virtio should really be in control of the
>>> life cycle policy. For net related virtio that is clearly devlink.
>> As replied in another thread, there were already existed devices
>> (Intel IFC VF) that doesn't use devlink.
> Why is that a justification? Drivers can learn to use devlink, it
> isn't like it is set in stone.


Technically, I fully agree. But vendors has their right to to other way 
unless devlink is forced when creating netdevice.

Thanks


>
> Jason
>
Jason Wang Nov. 21, 2019, 4:06 a.m. UTC | #58
On 2019/11/20 下午9:41, Jason Gunthorpe wrote:
> On Wed, Nov 20, 2019 at 12:07:59PM +0800, Jason Wang wrote:
>
>> 1) create sub fucntion and do must to have pre configuration through devlink
>> 2) only after sub function is created one more available instance was added
>> and shown through sysfs
>> 3) user can choose to create and use that mdev instance as it did for other
>> type of device like vGPU
>> 4) devlink can still use to report other stuffs
> Why do we want the extra step #3? The user already indicated they want
> a mdev via #1


It's about the compatibility, but if you wish, I think we can develop 
devlink based lifecycle for mdev for sure.


>
> I have the same question for the PF and VF cases, why doesn't a mdev
> get created automatically when the VF is probed? Why does this need
> the guid stuff?


All you said here is possible, it's a design choice for the management 
interface.


>
> The guid stuff was intended for, essentially, multi-function devices
> that could be sliced up, I don't think it makes sense to use it for
> single-function VF devices like the ICF driver.


It doesn't harm, and indeed we have other choice, we can do it gradually 
on top.


>
> Overall the guid thing should be optional. Drivers providing mdev
> should be able to use another scheme, like devlink, to on demand
> create their mdevs.


Yes, that's for sure. I'm not against to devlink for mdev/subdev, I just 
say we should not make devlink the only choice for mdev/subdev.

Thanks


>
> Jason
>
Michael S. Tsirkin Nov. 21, 2019, 4:24 a.m. UTC | #59
On Wed, Nov 20, 2019 at 11:03:57PM -0400, Jason Gunthorpe wrote:
> Frankly, when I look at what this virtio stuff is doing I see RDMA:
>  - Both have a secure BAR pages for mmaping to userspace (or VM)
>  - Both are prevented from interacting with the device at a register
>    level and must call to the kernel - ie creating resources is a
>    kernel call - for security.
>  - Both create command request/response rings in userspace controlled
>    memory and have HW DMA to read requests and DMA to generate responses
>  - Both allow the work on the rings to DMA outside the ring to
>    addresses controlled by userspace.
>  - Both have to support a mixture of HW that uses on-device security
>    or IOMMU based security.

The main difference is userspace/drivers need to be portable with
virtio.
Jason Wang Nov. 21, 2019, 4:53 a.m. UTC | #60
On 2019/11/21 上午12:45, Jason Gunthorpe wrote:
> On Wed, Nov 20, 2019 at 09:57:17AM -0500, Michael S. Tsirkin wrote:
>> On Wed, Nov 20, 2019 at 10:30:54AM -0400, Jason Gunthorpe wrote:
>>> On Wed, Nov 20, 2019 at 08:43:20AM -0500, Michael S. Tsirkin wrote:
>>>> On Wed, Nov 20, 2019 at 09:03:19AM -0400, Jason Gunthorpe wrote:
>>>>> On Wed, Nov 20, 2019 at 02:38:08AM -0500, Michael S. Tsirkin wrote:
>>>>>>>> I don't think that extends as far as actively encouraging userspace
>>>>>>>> drivers poking at hardware in a vendor specific way.
>>>>>>> Yes, it does, if you can implement your user space requirements using
>>>>>>> vfio then why do you need a kernel driver?
>>>>>> People's requirements differ. You are happy with just pass through a VF
>>>>>> you can already use it. Case closed. There are enough people who have
>>>>>> a fixed userspace that people have built virtio accelerators,
>>>>>> now there's value in supporting that, and a vendor specific
>>>>>> userspace blob is not supporting that requirement.
>>>>> I have no idea what you are trying to explain here. I'm not advocating
>>>>> for vfio pass through.
>>>> You seem to come from an RDMA background, used to userspace linking to
>>>> vendor libraries to do basic things like push bits out on the network,
>>>> because users live on the performance edge and rebuild their
>>>> userspace often anyway.
>>>>
>>>> Lots of people are not like that, they would rather have the
>>>> vendor-specific driver live in the kernel, with userspace being
>>>> portable, thank you very much.
>>> You are actually proposing a very RDMA like approach with a split
>>> kernel/user driver design. Maybe the virtio user driver will turn out
>>> to be 'portable'.
>>>
>>> Based on the last 20 years of experience, the kernel component has
>>> proven to be the larger burden and drag than the userspace part. I
>>> think the high interest in DPDK, SPDK and others show this is a common
>>> principle.
>> And I guess the interest in BPF shows the opposite?
> There is room for both, I wouldn't discount either approach entirely
> out of hand.
>
>>> At the very least for new approaches like this it makes alot of sense
>>> to have a user space driver until enough HW is available that a
>>> proper, well thought out kernel side can be built.
>> But hardware is available, driver has been posted by Intel.
>> Have you looked at that?
> I'm not sure pointing at that driver is so helpful, it is very small
> and mostly just reflects virtio ops into some undocumented register
> pokes.


What do you expect to see then? The IFC driver is sufficient for 
demonstrating the design and implementation of the framework that is a 
vDPA driver. If you care about a better management API for mdev, we can 
discuss but it should be another topic which should not block this series.


>
> There is no explanation at all for the large scale architecture
> choices:


Most of the parts have been explained more or less in the cover letter.


>   - Why vfio


In cover letter it explains that userspace driver + vhost mdev is the 
goal. And VFIO is the most popular interface for developing userspace 
drivers. Having vendor specific userspace driver framework is possible 
but would be a pain for management and qemu.


>   - Why mdev without providing a device IOMMU


This is a question for mdev not directly related to the series . Either 
bus IOMMU or device IOMMU (as vGPU already did) is supported.


>   - Why use GUID lifecycle management for singlton function PF/VF
>     drivers


It was just because it's the only existed interface right now, and 
management has been taught to use this interface.


>   - Why not use devlink


Technically it's possible. But for explanation, it's just because I 
don't get any question before start the draft the new version. I can add 
this in the cover letter of next version.


>   - Why not use vfio-pci with a userspace driver


In cover letter, it explains that the series is for kernel virtio driver.


>
> These are legitimate questions and answers like "because we like it
> this way"


Where are stuffs like this?


>   or "this is how the drivers are written today" isn't very
> satisfying at all.


If you are talking about devlink + mdev. I would say for now, you're 
welcome to develop devlink based lifecycle for mdev.  But if you want to 
discuss devlink support for each type of devices, it's obvious not the 
correct place.


>
>>> For instance, this VFIO based approach might be very suitable to the
>>> intel VF based ICF driver, but we don't yet have an example of non-VF
>>> HW that might not be well suited to VFIO.


What's the reason that causes your HW not suited to VFIO? Mdev had 
already supported device IOMMU partially, let's just improve it if it 
doesn't meet your requirement. Or are there any fundamental barriers there?


>> I don't think we should keep moving the goalposts like this.
> It is ABI, it should be done as best we can as we have to live with it
> for a long time. Right now HW is just starting to come to market with
> VDPA and it feels rushed to design a whole subsystem style ABI around
> one, quite simplistic, driver example.


Well, I know there could be some special features in your hardware, 
let's just discuss here to seek a solution instead of keep saying "your 
framework does not fit our case" without any real details.


>
>> If people write drivers and find some infrastruture useful,
>> and it looks more or less generic on the outset, then I don't
>> see why it's a bad idea to merge it.
> Because it is userspace ABI, caution is always justified when defining
> new ABI.


Well, if you read vhost-mdev patch, you will see it doesn't invent any 
userspace ABI. VFIO ABI is completely followed there.

Thanks


>
> Jason
>
Jason Wang Nov. 21, 2019, 5:22 a.m. UTC | #61
On 2019/11/21 上午6:07, Alex Williamson wrote:
> On Wed, 20 Nov 2019 14:11:08 -0400
> Jason Gunthorpe<jgg@ziepe.ca>  wrote:
>
>> On Wed, Nov 20, 2019 at 10:28:56AM -0700, Alex Williamson wrote:
>>>>> Are you objecting the mdev_set_iommu_deivce() stuffs here?
>>>> I'm questioning if it fits the vfio PCI device security model, yes.
>>> The mdev IOMMU backing device model is for when an mdev device has
>>> IOMMU based isolation, either via the PCI requester ID or via requester
>>> ID + PASID.  For example, an SR-IOV VF may be used by a vendor to
>>> provide IOMMU based translation and isolation, but the VF may not be
>>> complete otherwise to provide a self contained device.  It might
>>> require explicit coordination and interaction with the PF driver, ie.
>>> mediation.
>> In this case the PF does not look to be involved, the ICF kernel
>> driver is only manipulating registers in the same VF that the vfio
>> owns the IOMMU for.
> The mdev_set_iommu_device() call is probably getting caught up in the
> confusion of mdev as it exists today being vfio specific.  What I
> described in my reply is vfio specific.  The vfio iommu backend is
> currently the only code that calls mdev_get_iommu_device(), JasonW
> doesn't use it in the virtio-mdev code, so this seems like a stray vfio
> specific interface that's setup by IFC but never used.
>

It will be used by userspace driver through vhost-mdev code for having a 
correct IOMMU when doing DMA mappings.

Thanks
Jason Wang Nov. 21, 2019, 6:59 a.m. UTC | #62
On 2019/11/21 上午2:11, Jason Gunthorpe wrote:
> On Wed, Nov 20, 2019 at 10:28:56AM -0700, Alex Williamson wrote:
>>>> Are you objecting the mdev_set_iommu_deivce() stuffs here?
>>> I'm questioning if it fits the vfio PCI device security model, yes.
>> The mdev IOMMU backing device model is for when an mdev device has
>> IOMMU based isolation, either via the PCI requester ID or via requester
>> ID + PASID.  For example, an SR-IOV VF may be used by a vendor to
>> provide IOMMU based translation and isolation, but the VF may not be
>> complete otherwise to provide a self contained device.  It might
>> require explicit coordination and interaction with the PF driver, ie.
>> mediation.
> In this case the PF does not look to be involved, the ICF kernel
> driver is only manipulating registers in the same VF that the vfio
> owns the IOMMU for.
>
> This is why I keep calling it a "so-called mediated device" because it
> is absolutely not clear what the kernel driver is mediating.


It tries to do mediation between virtio commands and real device. It 
works similar to mdev PCI device that do mediation between PCI commands 
and real device. This is exact what mediator pattern[1] did, no?

[1] https://en.wikipedia.org/wiki/Mediator_pattern


> Nearly
> all its work is providing a subsystem-style IOCTL interface under the
> existing vfio multiplexer unrelated to vfio requirements for DMA.


What do you mean by "unrelated to vfio", the ioctl() interface belongs 
its device ops is pretty device specific. And for IFC VF driver, it 
doesn't see ioctl, it can only see virtio commands.


>
>> The IOMMU backing device is certainly not meant to share an IOMMU
>> address space with host drivers, except as necessary for the
>> mediation of the device.  The vfio model manages the IOMMU domain of
>> the backing device exclusively, any attempt to dual-host the device
>> respective to the IOMMU should fault in the dma/iommu-ops.  Thanks,
> Sounds more reasonable if the kernel dma_ops are prevented while vfio
> is using the device.
>
> However, to me it feels wrong that just because a driver wishes to use
> PASID or IOMMU features it should go through vfio and mediated
> devices.
>
> It is not even necessary as we have several examples already of
> drivers using these features without vfio.


Confused, are you suggesting a new module to support fine grain DMA 
isolation to userspace driver? How different would it looks compared 
with exist VFIO then?


>
> I feel like mdev is suffering from mission creep. I see people
> proposing to use mdev for many wild things, the Mellanox SF stuff in
> the other thread and this 'virtio subsystem' being the two that have
> come up publicly this month.
>
> Putting some boundaries on mdev usage would really help people know
> when to use it.


And forbid people to extend it? Do you agree that there are lots of 
common requirements between:

- mediation between virtio and real device
- mediation between PCI and real device

?


> My top two from this discussion would be:
>
> - mdev devices should only bind to vfio. It is not a general kernel
>    driver matcher mechanism. It is not 'virtual-bus'.


It's still unclear to me why mdev must bind to vfio. Though they are 
coupled but the pretty loosely. I would argue that any device that is 
doing mediation between drivers and device could be done through mdev. 
Bind mdev to vfio means you need invent other things to support kernel 
driver and your parent need to be prepared for those two different APIs. 
Mdev devices it self won't be a bus, but it could provide helpers to 
build a mediated bus.


>
> - mdev & vfio are not a substitute for a proper kernel subsystem. We
>    shouldn't export a complex subsystem-like ioctl API through
>    vfio ioctl extensions.


I would say though e.g the regions based VFIO device API looks generic, 
it carries device/bus specific information there. It would be rather 
simple to switch back to region API and build vhost protocol on top. But 
does it really have a lot of differences?


>   Make a proper subsystem, it is not so hard.


Vhost is the subsystem bu then how to abstract the DMA there? It would 
be more than 99% similar to VFIO then.

Thanks


>
> Maybe others agree?
>
> Thanks,
> Jason
>
Jason Wang Nov. 21, 2019, 7:21 a.m. UTC | #63
On 2019/11/21 上午11:03, Jason Gunthorpe wrote:
> On Wed, Nov 20, 2019 at 03:07:32PM -0700, Alex Williamson wrote:
>
>>> On Wed, Nov 20, 2019 at 10:28:56AM -0700, Alex Williamson wrote:
>>>>>> Are you objecting the mdev_set_iommu_deivce() stuffs here?
>>>>> I'm questioning if it fits the vfio PCI device security model, yes.
>>>> The mdev IOMMU backing device model is for when an mdev device has
>>>> IOMMU based isolation, either via the PCI requester ID or via requester
>>>> ID + PASID.  For example, an SR-IOV VF may be used by a vendor to
>>>> provide IOMMU based translation and isolation, but the VF may not be
>>>> complete otherwise to provide a self contained device.  It might
>>>> require explicit coordination and interaction with the PF driver, ie.
>>>> mediation.
>>> In this case the PF does not look to be involved, the ICF kernel
>>> driver is only manipulating registers in the same VF that the vfio
>>> owns the IOMMU for.
>> The mdev_set_iommu_device() call is probably getting caught up in the
>> confusion of mdev as it exists today being vfio specific.  What I
>> described in my reply is vfio specific.  The vfio iommu backend is
>> currently the only code that calls mdev_get_iommu_device(), JasonW
>> doesn't use it in the virtio-mdev code, so this seems like a stray vfio
>> specific interface that's setup by IFC but never used.
> I couldn't really say, it was the only thing I noticed in IFC that
> seemed to have anything to do with identifying what IOMMU group to use
> for the vfio interface..
>   
>>> This is why I keep calling it a "so-called mediated device" because it
>>> is absolutely not clear what the kernel driver is mediating. Nearly
>>> all its work is providing a subsystem-style IOCTL interface under the
>>> existing vfio multiplexer unrelated to vfio requirements for DMA.
>> Names don't always evolve well to what an interface becomes, see for
>> example vfio.  However, even in the vfio sense of mediated devices we
>> have protocol translation.  The mdev vendor driver translates vfio API
>> callbacks into hardware specific interactions.  Is this really much
>> different?
> I think the name was fine if you constrain 'mediated' to mean
> 'mediated IOMMU'


But actually it does much more than just IOMMU.


>
> Broading to be basically any driver interface is starting to overlap
> with the role of the driver core and subsystems in Linux.
>
>>> However, to me it feels wrong that just because a driver wishes to use
>>> PASID or IOMMU features it should go through vfio and mediated
>>> devices.
>> I don't think I said this.  IOMMU backing of an mdev is an acceleration
>> feature as far as vfio-mdev is concerned.  There are clearly other ways
>> to use the IOMMU.
> Sorry, I didn't mean to imply you said this, I was mearly reflecting
> on the mission creep comment below. Often in private converstations
> the use of mdev has been justified by 'because it uses IOMMU'
>
>>> I feel like mdev is suffering from mission creep. I see people
>>> proposing to use mdev for many wild things, the Mellanox SF stuff in
>>> the other thread and this 'virtio subsystem' being the two that have
>>> come up publicly this month.
>> Tell me about it... ;)
>>   
>>> Putting some boundaries on mdev usage would really help people know
>>> when to use it. My top two from this discussion would be:
>>>
>>> - mdev devices should only bind to vfio. It is not a general kernel
>>>    driver matcher mechanism. It is not 'virtual-bus'.
>> I think this requires the driver-core knowledge to really appreciate.
>> Otherwise there's apparently a common need to create sub-devices and
>> without closer inspection of the bus:driver API contract, it's too easy
>> to try to abstract the device:driver API via the bus.  mdev already has
>> a notion that the device itself can use any API, but the interface to
>> the bus is the vendor provided, vfio compatible callbacks.
> But now that we are talking about this, I think there is a pretty
> clear opinion forming that if you want to do kernel-kernel drivers
> that is 'virtual bus' as proposed in this threads patch, not mdev.


This looks confused.

1) Virtual bus allows multiple different type of devices to be attached 
on a single bus, isn't this where you show your concern when I do 
similar thing for a single mdev-bus?
2) Virtual bus hide the communication through a void *, this is not 
impossible for mdev.
3) After decoupling vfio out of mdev, there's no fundamental difference 
between mdev and virtual bus except that mdev is coupled with sysfs 
interface and can talk to VFIO. And that's really what we want for 
virtio, not only for having a management interface but also for a 
unified framework/API between vhost(userspace) and virtio(kernel) driver.
4) In the cover letter of virtual-bus it said:


"
+One use case example is an rdma driver needing to connect with several
+different types of PCI LAN devices to be able to request resources from
+them (queue sets).  Each LAN driver that supports rdma will register a
+virtbus_device on the virtual bus for each physical function. The rdma
+driver will register as a virtbus_driver on the virtual bus to be
+matched up with multiple virtbus_devices and receive a pointer to a
+struct containing the callbacks that the PCI LAN drivers support for
+registering with them.

"

It did something like device aggregation. Ok, you might think it could 
be extended. But why mdev can't be extended?


>
> Adding that knowledge to the mdev documentation would probably help
> future people.
>
>>> - mdev & vfio are not a substitute for a proper kernel subsystem. We
>>>    shouldn't export a complex subsystem-like ioctl API through
>>>    vfio ioctl extensions. Make a proper subsystem, it is not so hard.
>> This is not as clear to me, is "ioctl" used once or twice too often or
>> are you describing a defined structure of callbacks as an ioctl API?
>> The vfio mdev interface is just an extension of the file descriptor
>> based vfio device API.  The device needs to handle actual ioctls, but
>> JasonW's virtio-mdev series had their own set of callbacks.  Maybe a
>> concrete example of this item would be helpful.  Thanks,
> I did not intend it to be a clear opinion, more of a vauge guide for
> documentation. I think as a maintainer you will be asked to make this
> call.
>
> The role of a subsystem in Linux is traditionally to take many
> different kinds of HW devices and bring them to a common programming
> API. Provide management and diagnostics, and expose some user ABI to
> access the HW.
>
> The role of vfio has traditionally been around secure device
> assignment of a HW resource to a VM. I'm not totally clear on what the
> role if mdev is seen to be, but all the mdev drivers in the tree seem
> to make 'and pass it to KVM' a big part of their description.
>
> So, looking at the virtio patches, I see some intended use is to map
> some BAR pages into the VM.


Nope, at least not for the current stage. It still depends on the 
virtio-net-pci emulatio in qemu to work. In the future, we will allow 
such mapping only for dorbell.


>   I see an ops struct to take different
> kinds of HW devices to a common internal kernel API. I understand a
> desire to bind kernel drivers that are not vfio to those ops, and I
> see a user ioctl ABI based around those ops.
>
> I also understand the BAR map is not registers, but just a write-only
> doorbell page. So I suppose any interaction the guest will have with
> the device prior to starting DMA is going to be software emulated in
> qemu, and relayed into ioctls. (?) ie this is no longer strictly
> "device assignment" but "accelerated device emulation".
>
> Is virtio more vfio or more subsystem? The biggest thing that points
> toward vfio is the intended use. The other items push away.
>
> Frankly, when I look at what this virtio stuff is doing I see RDMA:
>   - Both have a secure BAR pages for mmaping to userspace (or VM)
>   - Both are prevented from interacting with the device at a register
>     level and must call to the kernel - ie creating resources is a
>     kernel call - for security.
>   - Both create command request/response rings in userspace controlled
>     memory and have HW DMA to read requests and DMA to generate responses
>   - Both allow the work on the rings to DMA outside the ring to
>     addresses controlled by userspace.
>   - Both have to support a mixture of HW that uses on-device security
>     or IOMMU based security.
>
> (I actually gave a talk on how alot of modern HW is following the RDMA
>   design patterns at plumbers, maybe video will come out soon)
>
> We've had the same debate with RDMA. Like VFIO it has an extensible
> file descriptor with a driver-specific path that can serve an
> unlimited range of uses. We have had to come up with some sensability
> and definition for "what is RDMA" and is appropriate for the FD.


If you looking at my V13, after decoupling, you can register your own 
vhost driver on the mdev_virito bus with your own API if you don't like 
VFIO.

Thanks


>
> Jason
>
Jason Wang Nov. 21, 2019, 8:17 a.m. UTC | #64
On 2019/11/21 上午6:39, Parav Pandit wrote:
>> From: Alex Williamson<alex.williamson@redhat.com>
>> Sent: Wednesday, November 20, 2019 4:08 PM
>>
>> On Wed, 20 Nov 2019 14:11:08 -0400
>> Jason Gunthorpe<jgg@ziepe.ca>  wrote:
>>
>>> I feel like mdev is suffering from mission creep. I see people
>>> proposing to use mdev for many wild things, the Mellanox SF stuff in
>>> the other thread and this 'virtio subsystem' being the two that have
>>> come up publicly this month.
>> Tell me about it...;)
>>
> Initial Mellanox sub function proposal was done using dedicated non-mdev subdev bus in [1] because mdev looked very vfio-ish.
>
> Along the way mdev proposal was suggested at [2] by mdev maintainers to use.
> The bus existed that detached two drivers (mdev and vfio_mdev), there was some motivation to attach other drivers.
>
> After that we continued discussion and mdev extension using alias to have persistent naming in [3].
>
> So far so good, but when we want to have actual use of mdev driver, it doesn't look right.:-)
>

Want to implement devlink for mdev then? I think it may help in the case.

Thanks
Jason Gunthorpe Nov. 21, 2019, 1:44 p.m. UTC | #65
On Wed, Nov 20, 2019 at 11:24:03PM -0500, Michael S. Tsirkin wrote:
> On Wed, Nov 20, 2019 at 11:03:57PM -0400, Jason Gunthorpe wrote:
> > Frankly, when I look at what this virtio stuff is doing I see RDMA:
> >  - Both have a secure BAR pages for mmaping to userspace (or VM)
> >  - Both are prevented from interacting with the device at a register
> >    level and must call to the kernel - ie creating resources is a
> >    kernel call - for security.
> >  - Both create command request/response rings in userspace controlled
> >    memory and have HW DMA to read requests and DMA to generate responses
> >  - Both allow the work on the rings to DMA outside the ring to
> >    addresses controlled by userspace.
> >  - Both have to support a mixture of HW that uses on-device security
> >    or IOMMU based security.
> 
> The main difference is userspace/drivers need to be portable with
> virtio.

rdma also has a stable/portable user space library API that is
portable to multiple operating systems.

What you don't like is that RDMA userspace has driver-specific
code. Ie the kernel interface is not fully hardware independent.

Jason
Jason Gunthorpe Nov. 21, 2019, 2:17 p.m. UTC | #66
On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
> > The role of vfio has traditionally been around secure device
> > assignment of a HW resource to a VM. I'm not totally clear on what the
> > role if mdev is seen to be, but all the mdev drivers in the tree seem
> > to make 'and pass it to KVM' a big part of their description.
> > 
> > So, looking at the virtio patches, I see some intended use is to map
> > some BAR pages into the VM.
> 
> Nope, at least not for the current stage. It still depends on the
> virtio-net-pci emulatio in qemu to work. In the future, we will allow such
> mapping only for dorbell.

There has been a lot of emails today, but I think this is the main
point I want to respond to.

Using vfio when you don't even assign any part of the device BAR to
the VM is, frankly, a gigantic misuse, IMHO.

Just needing userspace DMA is not, in any way, a justification to use
vfio.

We have extensive library interfaces in the kernel to do userspace DMA
and subsystems like GPU and RDMA are full of example uses of this kind
of stuff. Everything from on-device IOMMU to system IOMMU to PASID. If
you find things missing then we need to improve those library
interfaces, not further abuse VFIO.

Further, I do not think it is wise to design the userspace ABI around
a simplistict implementation that can't do BAR assignment, and can't
support multiple virtio rings on single PCI function. This stuff is
clearly too premature.

My advice is to proceed as a proper subsystem with your own chardev,
own bus type, etc and maybe live in staging for a bit until 2-3
drivers are implementing the ABI (or at the very least agreeing with),
as is the typical process for Linux.

Building a new kernel ABI is hard (this is why I advised to use a
userspace driver). It has to go through the community process at the
usual pace.

Jason
Martin Habets Nov. 21, 2019, 3:10 p.m. UTC | #67
On 19/11/2019 04:08, Jason Wang wrote:
> 
> On 2019/11/16 上午7:25, Parav Pandit wrote:
>> Hi Jeff,
>>
>>> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>>> Sent: Friday, November 15, 2019 4:34 PM
>>>
>>> From: Dave Ertman <david.m.ertman@intel.com>
>>>
>>> This is the initial implementation of the Virtual Bus, virtbus_device and
>>> virtbus_driver.  The virtual bus is a software based bus intended to support
>>> lightweight devices and drivers and provide matching between them and
>>> probing of the registered drivers.
>>>
>>> The primary purpose of the virual bus is to provide matching services and to
>>> pass the data pointer contained in the virtbus_device to the virtbus_driver
>>> during its probe call.  This will allow two separate kernel objects to match up
>>> and start communication.
>>>
>> It is fundamental to know that rdma device created by virtbus_driver will be anchored to which bus for an non abusive use.
>> virtbus or parent pci bus?
>> I asked this question in v1 version of this patch.
>>
>> Also since it says - 'to support lightweight devices', documenting that information is critical to avoid ambiguity.
>>
>> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1] whatever we want to call it, it overlaps with your comment about 'to support lightweight devices'.
>> Hence let's make things crystal clear weather the purpose is 'only matching service' or also 'lightweight devices'.
>> If this is only matching service, lets please remove lightweight devices part..
> 
> 
> Yes, if it's matching + lightweight device, its function is almost a duplication of mdev. And I'm working on extending mdev[1] to be a generic module to support any types of virtual devices a while. The advantage of mdev is:
> 
> 1) ready for the userspace driver (VFIO based)
> 2) have a sysfs/GUID based management interface

In my view this virtual-bus is more generic and more flexible than mdev.
What for you are the advantages of mdev to me are some of it's disadvantages.

The way I see it we can provide rdma support in the driver using virtual-bus.
At the moment we would need separate mdev support in the driver for vdpa, but I hope at some point mdev
would become a layer on top of virtual-bus.
Besides these users we also support internal tools for our hardware factory provisioning, and for testing/debugging.
I could easily imagine such tools using a virtual-bus device. With mdev those interfaces would be more convoluted.

> So for 1, it's not clear that how userspace driver would be supported here, or it's completely not being accounted in this series? For 2, it looks to me that this series leave it to the implementation, this means management to learn several vendor specific interfaces which seems a burden.
> 
> Note, technically Virtual Bus could be implemented on top of [1] with the full lifecycle API.

Seems easier to me to do that the other way around: mdev could be implemented on top of virtual-bus.

Best regards,
Martin

> [1] https://lkml.org/lkml/2019/11/18/261
> 
> 
>>
>> You additionally need modpost support for id table integration to modifo, modprobe and other tools.
>> A small patch similar to this one [2] is needed.
>> Please include in the series.
>>
>> [..]
> 
> 
> And probably a uevent method. But rethinking of this, matching through a single virtual bus seems not good. What if driver want to do some specific matching? E.g for virtio, we may want a vhost-net driver that only match networking device. With a single bus, it probably means you need another bus on top and provide the virtio specific matching there. This looks not straightforward as allowing multiple type of buses.
> 
> Thanks
>
Jason Wang Nov. 22, 2019, 8:45 a.m. UTC | #68
On 2019/11/21 下午10:17, Jason Gunthorpe wrote:
> On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
>>> The role of vfio has traditionally been around secure device
>>> assignment of a HW resource to a VM. I'm not totally clear on what the
>>> role if mdev is seen to be, but all the mdev drivers in the tree seem
>>> to make 'and pass it to KVM' a big part of their description.
>>>
>>> So, looking at the virtio patches, I see some intended use is to map
>>> some BAR pages into the VM.
>> Nope, at least not for the current stage. It still depends on the
>> virtio-net-pci emulatio in qemu to work. In the future, we will allow such
>> mapping only for dorbell.
> There has been a lot of emails today, but I think this is the main
> point I want to respond to.
>
> Using vfio when you don't even assign any part of the device BAR to
> the VM is, frankly, a gigantic misuse, IMHO.


That's not a compelling point. If you go through the discussion on 
vhost-mdev from last year, the direct mapping of doorbell is accounted 
since that time[1]. It works since its stateless. Having an arbitrary 
BAR to be mapped directly to VM may cause lots of troubles for migration 
since it requires a vendor specific way to get the state of the device. 
I don't think having a vendor specific migration driver is acceptable in 
qemu. What's more, direct mapping through MMIO is even optional (see 
CONFIG_VFIO_PCI_MMAP) and vfio support buses without any MMIO region.


>
> Just needing userspace DMA is not, in any way, a justification to use
> vfio.
>
> We have extensive library interfaces in the kernel to do userspace DMA
> and subsystems like GPU and RDMA are full of example uses of this kind
> of stuff.


I'm not sure which library did you mean here. Is any of those library 
used by qemu? If not, what's the reason?

For virtio, we need a device agnostic API which supports migration. Is 
that something the library you mention here can provide?


>   Everything from on-device IOMMU to system IOMMU to PASID. If
> you find things missing then we need to improve those library
> interfaces, not further abuse VFIO.
>
> Further, I do not think it is wise to design the userspace ABI around
> a simplistict implementation that can't do BAR assignment,


Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and 
mmap() was kept their for mapping device regions.


> and can't
> support multiple virtio rings on single PCI function.


How do you know multiple virtio rings can't be supported? It should be 
address at the level of parent devices not virtio-mdev framework, no?


> This stuff is
> clearly too premature.


It depends on how mature you want. All the above two points looks 
invalid to me.


>
> My advice is to proceed as a proper subsystem with your own chardev,
> own bus type, etc and maybe live in staging for a bit until 2-3
> drivers are implementing the ABI (or at the very least agreeing with),
> as is the typical process for Linux.


I'm open to comments for sure, but looking at all the requirement for 
vDPA, most of the requirement could be settled through existed modules, 
that's not only a simplification for developing but also for management 
layer or userspace drivers.


>
> Building a new kernel ABI is hard (this is why I advised to use a
> userspace driver).


Well, it looks to me my clarification is ignored several times. There's 
no new ABI invented in the series, no?


> It has to go through the community process at the
> usual pace.


What do you mean by "usual pace"? It has been more than 1.5 year since 
the first version of vhost-mdev [1] that was posted on the list.

[1] https://lwn.net/Articles/750770/

Thanks

>
> Jason
>
Jason Wang Nov. 22, 2019, 9:13 a.m. UTC | #69
On 2019/11/21 下午11:10, Martin Habets wrote:
> On 19/11/2019 04:08, Jason Wang wrote:
>> On 2019/11/16 上午7:25, Parav Pandit wrote:
>>> Hi Jeff,
>>>
>>>> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>>>> Sent: Friday, November 15, 2019 4:34 PM
>>>>
>>>> From: Dave Ertman <david.m.ertman@intel.com>
>>>>
>>>> This is the initial implementation of the Virtual Bus, virtbus_device and
>>>> virtbus_driver.  The virtual bus is a software based bus intended to support
>>>> lightweight devices and drivers and provide matching between them and
>>>> probing of the registered drivers.
>>>>
>>>> The primary purpose of the virual bus is to provide matching services and to
>>>> pass the data pointer contained in the virtbus_device to the virtbus_driver
>>>> during its probe call.  This will allow two separate kernel objects to match up
>>>> and start communication.
>>>>
>>> It is fundamental to know that rdma device created by virtbus_driver will be anchored to which bus for an non abusive use.
>>> virtbus or parent pci bus?
>>> I asked this question in v1 version of this patch.
>>>
>>> Also since it says - 'to support lightweight devices', documenting that information is critical to avoid ambiguity.
>>>
>>> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1] whatever we want to call it, it overlaps with your comment about 'to support lightweight devices'.
>>> Hence let's make things crystal clear weather the purpose is 'only matching service' or also 'lightweight devices'.
>>> If this is only matching service, lets please remove lightweight devices part..
>>
>> Yes, if it's matching + lightweight device, its function is almost a duplication of mdev. And I'm working on extending mdev[1] to be a generic module to support any types of virtual devices a while. The advantage of mdev is:
>>
>> 1) ready for the userspace driver (VFIO based)
>> 2) have a sysfs/GUID based management interface
> In my view this virtual-bus is more generic and more flexible than mdev.


Even after the series [1] here?


> What for you are the advantages of mdev to me are some of it's disadvantages.
>
> The way I see it we can provide rdma support in the driver using virtual-bus.


Yes, but since it does matching only, you can do everything you want. 
But it looks to me Greg does not want a bus to be an API multiplexer. So 
if a dedicated bus is desired, it won't be much of code to have a bus on 
your own.


> At the moment we would need separate mdev support in the driver for vdpa, but I hope at some point mdev
> would become a layer on top of virtual-bus.
> Besides these users we also support internal tools for our hardware factory provisioning, and for testing/debugging.
> I could easily imagine such tools using a virtual-bus device. With mdev those interfaces would be more convoluted.


Can you give me an example?


>
>> So for 1, it's not clear that how userspace driver would be supported here, or it's completely not being accounted in this series? For 2, it looks to me that this series leave it to the implementation, this means management to learn several vendor specific interfaces which seems a burden.
>>
>> Note, technically Virtual Bus could be implemented on top of [1] with the full lifecycle API.
> Seems easier to me to do that the other way around: mdev could be implemented on top of virtual-bus.


Probably, without the part of parent_ops, they are almost equal.

Thanks


>
> Best regards,
> Martin
>
>> [1] https://lkml.org/lkml/2019/11/18/261
>>
>>
>>> You additionally need modpost support for id table integration to modifo, modprobe and other tools.
>>> A small patch similar to this one [2] is needed.
>>> Please include in the series.
>>>
>>> [..]
>>
>> And probably a uevent method. But rethinking of this, matching through a single virtual bus seems not good. What if driver want to do some specific matching? E.g for virtio, we may want a vhost-net driver that only match networking device. With a single bus, it probably means you need another bus on top and provide the virtio specific matching there. This looks not straightforward as allowing multiple type of buses.
>>
>> Thanks
>>
Parav Pandit Nov. 22, 2019, 4:19 p.m. UTC | #70
> From: Jason Wang <jasowang@redhat.com>
> Sent: Friday, November 22, 2019 3:14 AM
> 
> On 2019/11/21 下午11:10, Martin Habets wrote:
> > On 19/11/2019 04:08, Jason Wang wrote:
> >> On 2019/11/16 上午7:25, Parav Pandit wrote:
> >>> Hi Jeff,
> >>>
> >>>> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> >>>> Sent: Friday, November 15, 2019 4:34 PM
> >>>>
> >>>> From: Dave Ertman <david.m.ertman@intel.com>
> >>>>
> >>>> This is the initial implementation of the Virtual Bus,
> >>>> virtbus_device and virtbus_driver.  The virtual bus is a software
> >>>> based bus intended to support lightweight devices and drivers and
> >>>> provide matching between them and probing of the registered drivers.
> >>>>
> >>>> The primary purpose of the virual bus is to provide matching
> >>>> services and to pass the data pointer contained in the
> >>>> virtbus_device to the virtbus_driver during its probe call.  This
> >>>> will allow two separate kernel objects to match up and start
> communication.
> >>>>
> >>> It is fundamental to know that rdma device created by virtbus_driver will
> be anchored to which bus for an non abusive use.
> >>> virtbus or parent pci bus?
> >>> I asked this question in v1 version of this patch.
> >>>
> >>> Also since it says - 'to support lightweight devices', documenting that
> information is critical to avoid ambiguity.
> >>>
> >>> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1]
> whatever we want to call it, it overlaps with your comment about 'to support
> lightweight devices'.
> >>> Hence let's make things crystal clear weather the purpose is 'only
> matching service' or also 'lightweight devices'.
> >>> If this is only matching service, lets please remove lightweight devices
> part..
> >>
> >> Yes, if it's matching + lightweight device, its function is almost a duplication
> of mdev. And I'm working on extending mdev[1] to be a generic module to
> support any types of virtual devices a while. The advantage of mdev is:
> >>
> >> 1) ready for the userspace driver (VFIO based)
> >> 2) have a sysfs/GUID based management interface
> > In my view this virtual-bus is more generic and more flexible than mdev.
> 
> 
> Even after the series [1] here?
> 
> 
> > What for you are the advantages of mdev to me are some of it's
> disadvantages.
> >
> > The way I see it we can provide rdma support in the driver using virtual-bus.
> 
This is fine, because it is only used for matching service.

> 
> Yes, but since it does matching only, you can do everything you want.
> But it looks to me Greg does not want a bus to be an API multiplexer. So if a
> dedicated bus is desired, it won't be much of code to have a bus on your own.
> 
Right. virtbus shouldn't be a multiplexer.
Otherwise mdev can be improved (abused) exactly the way virtbus might. Where 'mdev m stands for multiplexer too'. :-)
No, we shouldn’t do that.

Listening to Greg and Jason G, I agree that virtbus shouldn't be a multiplexer.
There are few basic differences between subfunctions and matching service device object.
Subfunctions over period of time will have several attributes, few that I think of right away are:
1. BAR resource info, write combine info
2. irq vectors details
3. unique id assigned by user (while virtbus will not assign such user id as they are auto created for matching service for PF/VF)
4. rdma device created by matched driver resides on pci bus or parent device
While rdma and netdev created on over subfunctions are linked to their own 'struct device'.

Due to that sysfs view for these two different types of devices is bit different.
Putting both on same bus just doesn't appear right with above fundamental differences of core layer.

> 
> > At the moment we would need separate mdev support in the driver for
> > vdpa, but I hope at some point mdev would become a layer on top of virtual-
> bus.

How is it optimal to create multiple 'struct device' for single purpose?
Especially when one wants to create hundreds of such devices to begin with.
User facing tool should be able to select device type and place the device on right bus.
Jason Gunthorpe Nov. 22, 2019, 6:02 p.m. UTC | #71
On Fri, Nov 22, 2019 at 04:45:38PM +0800, Jason Wang wrote:
> 
> On 2019/11/21 下午10:17, Jason Gunthorpe wrote:
> > On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
> > > > The role of vfio has traditionally been around secure device
> > > > assignment of a HW resource to a VM. I'm not totally clear on what the
> > > > role if mdev is seen to be, but all the mdev drivers in the tree seem
> > > > to make 'and pass it to KVM' a big part of their description.
> > > > 
> > > > So, looking at the virtio patches, I see some intended use is to map
> > > > some BAR pages into the VM.
> > > Nope, at least not for the current stage. It still depends on the
> > > virtio-net-pci emulatio in qemu to work. In the future, we will allow such
> > > mapping only for dorbell.
> > There has been a lot of emails today, but I think this is the main
> > point I want to respond to.
> > 
> > Using vfio when you don't even assign any part of the device BAR to
> > the VM is, frankly, a gigantic misuse, IMHO.
> 
> That's not a compelling point. 

Well, this discussion is going nowhere.

> > Just needing userspace DMA is not, in any way, a justification to use
> > vfio.
> > 
> > We have extensive library interfaces in the kernel to do userspace DMA
> > and subsystems like GPU and RDMA are full of example uses of this kind
> > of stuff.
> 
> I'm not sure which library did you mean here. Is any of those library used
> by qemu? If not, what's the reason?

I mean the library functions in the kernel that vfio uses to implement
all the user dma stuff. Other subsystems use them too, it is not
exclusive to vfio.

> > Further, I do not think it is wise to design the userspace ABI around
> > a simplistict implementation that can't do BAR assignment,
> 
> Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> mmap() was kept their for mapping device regions.

The patches have a new file in include/uapi.

Everything in include/api is considered new user ABI.

> > My advice is to proceed as a proper subsystem with your own chardev,
> > own bus type, etc and maybe live in staging for a bit until 2-3
> > drivers are implementing the ABI (or at the very least agreeing with),
> > as is the typical process for Linux.
> 
> I'm open to comments for sure, but looking at all the requirement for vDPA,
> most of the requirement could be settled through existed modules, that's not
> only a simplification for developing but also for management layer or
> userspace drivers.

We've already got disagreement that the GUID based mdev approach is
desirable for management.

Performing userspace DMA from a kernel driver is absolutely not a
reason to use VFIO.

Is there any technical justification?

Jason
Tiwei Bie Nov. 23, 2019, 4:39 a.m. UTC | #72
On Fri, Nov 22, 2019 at 02:02:14PM -0400, Jason Gunthorpe wrote:
> On Fri, Nov 22, 2019 at 04:45:38PM +0800, Jason Wang wrote:
> > On 2019/11/21 下午10:17, Jason Gunthorpe wrote:
> > > On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
> > > > > The role of vfio has traditionally been around secure device
> > > > > assignment of a HW resource to a VM. I'm not totally clear on what the
> > > > > role if mdev is seen to be, but all the mdev drivers in the tree seem
> > > > > to make 'and pass it to KVM' a big part of their description.
> > > > > 
> > > > > So, looking at the virtio patches, I see some intended use is to map
> > > > > some BAR pages into the VM.
> > > > Nope, at least not for the current stage. It still depends on the
> > > > virtio-net-pci emulatio in qemu to work. In the future, we will allow such
> > > > mapping only for dorbell.
> > > There has been a lot of emails today, but I think this is the main
> > > point I want to respond to.
> > > 
> > > Using vfio when you don't even assign any part of the device BAR to
> > > the VM is, frankly, a gigantic misuse, IMHO.
> > 
> > That's not a compelling point. 
> 
> Well, this discussion is going nowhere.

You removed JasonW's other reply in above quote. He said it clearly
that we do want/need to assign parts of device BAR to the VM.

> 
> > > Just needing userspace DMA is not, in any way, a justification to use
> > > vfio.
> > > 
> > > We have extensive library interfaces in the kernel to do userspace DMA
> > > and subsystems like GPU and RDMA are full of example uses of this kind
> > > of stuff.
> > 
> > I'm not sure which library did you mean here. Is any of those library used
> > by qemu? If not, what's the reason?
> 
> I mean the library functions in the kernel that vfio uses to implement
> all the user dma stuff. Other subsystems use them too, it is not
> exclusive to vfio.

IIUC, your point is to suggest us invent new DMA API for userspace to
use instead of leveraging VFIO's well defined DMA API. Even if we don't
use VFIO at all, I would imagine it could be very VFIO-like (e.g. caps
for BAR + container/group for DMA) eventually.

> 
> > > Further, I do not think it is wise to design the userspace ABI around
> > > a simplistict implementation that can't do BAR assignment,
> > 
> > Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> > mmap() was kept their for mapping device regions.
> 
> The patches have a new file in include/uapi.

I guess you didn't look at the code. Just to clarify, there is no
new file introduced in include/uapi. Only small vhost extensions to
the existing vhost uapi are involved in vhost-mdev.

>
Michael S. Tsirkin Nov. 23, 2019, 4:48 p.m. UTC | #73
On Thu, Nov 21, 2019 at 10:17:32AM -0400, Jason Gunthorpe wrote:
> On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
> > > The role of vfio has traditionally been around secure device
> > > assignment of a HW resource to a VM. I'm not totally clear on what the
> > > role if mdev is seen to be, but all the mdev drivers in the tree seem
> > > to make 'and pass it to KVM' a big part of their description.
> > > 
> > > So, looking at the virtio patches, I see some intended use is to map
> > > some BAR pages into the VM.
> > 
> > Nope, at least not for the current stage. It still depends on the
> > virtio-net-pci emulatio in qemu to work. In the future, we will allow such
> > mapping only for dorbell.
> 
> There has been a lot of emails today, but I think this is the main
> point I want to respond to.
> 
> Using vfio when you don't even assign any part of the device BAR to
> the VM is, frankly, a gigantic misuse, IMHO.

That's something that should be fixed BTW.  Hardware supports this, so
it's possible, and VFIO should make it easy to add.
Does this put this comment to rest?


> Just needing userspace DMA is not, in any way, a justification to use
> vfio.
> 
> We have extensive library interfaces in the kernel to do userspace DMA
> and subsystems like GPU and RDMA are full of example uses of this kind
> of stuff. Everything from on-device IOMMU to system IOMMU to PASID. If
> you find things missing then we need to improve those library
> interfaces, not further abuse VFIO.
> 
> Further, I do not think it is wise to design the userspace ABI around
> a simplistict implementation that can't do BAR assignment,

This just should be added, IFC cna do BAR assignment.

> and can't
> support multiple virtio rings on single PCI function.

It can't support multiple virtio *devices* per function.  Sub functions
devices imho are not a must.  E.g. lots of people use SRIOV and are
quite happy.  So I don't see what is wrong with a device per function,
for starters.

> This stuff is
> clearly too premature.
>
> My advice is to proceed as a proper subsystem with your own chardev,
> own bus type, etc and maybe live in staging for a bit until 2-3
> drivers are implementing the ABI (or at the very least agreeing with),
> as is the typical process for Linux.
> 
> Building a new kernel ABI is hard (this is why I advised to use a
> userspace driver). It has to go through the community process at the
> usual pace.
> 
> Jason
Michael S. Tsirkin Nov. 23, 2019, 4:50 p.m. UTC | #74
On Thu, Nov 21, 2019 at 09:44:38AM -0400, Jason Gunthorpe wrote:
> On Wed, Nov 20, 2019 at 11:24:03PM -0500, Michael S. Tsirkin wrote:
> > On Wed, Nov 20, 2019 at 11:03:57PM -0400, Jason Gunthorpe wrote:
> > > Frankly, when I look at what this virtio stuff is doing I see RDMA:
> > >  - Both have a secure BAR pages for mmaping to userspace (or VM)
> > >  - Both are prevented from interacting with the device at a register
> > >    level and must call to the kernel - ie creating resources is a
> > >    kernel call - for security.
> > >  - Both create command request/response rings in userspace controlled
> > >    memory and have HW DMA to read requests and DMA to generate responses
> > >  - Both allow the work on the rings to DMA outside the ring to
> > >    addresses controlled by userspace.
> > >  - Both have to support a mixture of HW that uses on-device security
> > >    or IOMMU based security.
> > 
> > The main difference is userspace/drivers need to be portable with
> > virtio.
> 
> rdma also has a stable/portable user space library API that is
> portable to multiple operating systems.
> 
> What you don't like is that RDMA userspace has driver-specific
> code. Ie the kernel interface is not fully hardware independent.
> 
> Jason

Right. Not that I don't like it, it has some advantages too.
But it's addressing a different need which a vendor
specific userspace driver doesn't address.
Jason Gunthorpe Nov. 23, 2019, 11:09 p.m. UTC | #75
On Sat, Nov 23, 2019 at 12:39:51PM +0800, Tiwei Bie wrote:
> On Fri, Nov 22, 2019 at 02:02:14PM -0400, Jason Gunthorpe wrote:
> > On Fri, Nov 22, 2019 at 04:45:38PM +0800, Jason Wang wrote:
> > > On 2019/11/21 下午10:17, Jason Gunthorpe wrote:
> > > > On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
> > > > > > The role of vfio has traditionally been around secure device
> > > > > > assignment of a HW resource to a VM. I'm not totally clear on what the
> > > > > > role if mdev is seen to be, but all the mdev drivers in the tree seem
> > > > > > to make 'and pass it to KVM' a big part of their description.
> > > > > > 
> > > > > > So, looking at the virtio patches, I see some intended use is to map
> > > > > > some BAR pages into the VM.
> > > > > Nope, at least not for the current stage. It still depends on the
> > > > > virtio-net-pci emulatio in qemu to work. In the future, we will allow such
> > > > > mapping only for dorbell.
> > > > There has been a lot of emails today, but I think this is the main
> > > > point I want to respond to.
> > > > 
> > > > Using vfio when you don't even assign any part of the device BAR to
> > > > the VM is, frankly, a gigantic misuse, IMHO.
> > > 
> > > That's not a compelling point. 
> > 
> > Well, this discussion is going nowhere.
> 
> You removed JasonW's other reply in above quote. He said it clearly
> that we do want/need to assign parts of device BAR to the VM.

Generally we don't look at patches based on stuff that isn't in them.

> > I mean the library functions in the kernel that vfio uses to implement
> > all the user dma stuff. Other subsystems use them too, it is not
> > exclusive to vfio.
> 
> IIUC, your point is to suggest us invent new DMA API for userspace to
> use instead of leveraging VFIO's well defined DMA API. Even if we don't
> use VFIO at all, I would imagine it could be very VFIO-like (e.g. caps
> for BAR + container/group for DMA) eventually.

None of the other user dma subsystems seem to have the problems you
are imagining here. Perhaps you should try it first?
 
> > > > Further, I do not think it is wise to design the userspace ABI around
> > > > a simplistict implementation that can't do BAR assignment,
> > > 
> > > Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> > > mmap() was kept their for mapping device regions.
> > 
> > The patches have a new file in include/uapi.
> 
> I guess you didn't look at the code. Just to clarify, there is no
> new file introduced in include/uapi. Only small vhost extensions to
> the existing vhost uapi are involved in vhost-mdev.

You know, I review alot of patches every week, and sometimes I make
mistakes, but not this time. From the ICF cover letter:

https://lkml.org/lkml/2019/11/7/62

 drivers/vfio/mdev/mdev_core.c    |  21 ++
 drivers/vhost/Kconfig            |  12 +
 drivers/vhost/Makefile           |   3 +
 drivers/vhost/mdev.c             | 556 +++++++++++++++++++++++++++++++
 include/linux/mdev.h             |   5 +
 include/uapi/linux/vhost.h       |  21 ++
 include/uapi/linux/vhost_types.h |   8 +
      ^^^^^^^^^^^^^^

Perhaps you thought I ment ICF was adding uapi? My remarks cover all
three of the series involved here.

Jason
Michael S. Tsirkin Nov. 24, 2019, 11 a.m. UTC | #76
On Sat, Nov 23, 2019 at 07:09:48PM -0400, Jason Gunthorpe wrote:
> > > > > Further, I do not think it is wise to design the userspace ABI around
> > > > > a simplistict implementation that can't do BAR assignment,
> > > > 
> > > > Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> > > > mmap() was kept their for mapping device regions.
> > > 
> > > The patches have a new file in include/uapi.
> > 
> > I guess you didn't look at the code. Just to clarify, there is no
> > new file introduced in include/uapi. Only small vhost extensions to
> > the existing vhost uapi are involved in vhost-mdev.
> 
> You know, I review alot of patches every week, and sometimes I make
> mistakes, but not this time. From the ICF cover letter:
> 
> https://lkml.org/lkml/2019/11/7/62
> 
>  drivers/vfio/mdev/mdev_core.c    |  21 ++
>  drivers/vhost/Kconfig            |  12 +
>  drivers/vhost/Makefile           |   3 +
>  drivers/vhost/mdev.c             | 556 +++++++++++++++++++++++++++++++
>  include/linux/mdev.h             |   5 +
>  include/uapi/linux/vhost.h       |  21 ++
>  include/uapi/linux/vhost_types.h |   8 +
>       ^^^^^^^^^^^^^^
> 
> Perhaps you thought I ment ICF was adding uapi? My remarks cover all
> three of the series involved here.

Tiwei seems to be right - include/uapi/linux/vhost.h and
include/uapi/linux/vhost_types.h are both existing files.  vhost uapi
extensions included here are very modest. They
just add virtio spec things that vhost was missing.

Are you looking at a very old linux tree maybe?
vhost_types.h appeared around 4.20.
Tiwei Bie Nov. 24, 2019, 2:51 p.m. UTC | #77
On Sat, Nov 23, 2019 at 07:09:48PM -0400, Jason Gunthorpe wrote:
> On Sat, Nov 23, 2019 at 12:39:51PM +0800, Tiwei Bie wrote:
> > On Fri, Nov 22, 2019 at 02:02:14PM -0400, Jason Gunthorpe wrote:
> > > On Fri, Nov 22, 2019 at 04:45:38PM +0800, Jason Wang wrote:
> > > > On 2019/11/21 下午10:17, Jason Gunthorpe wrote:
> > > > > On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
> > > > > > > The role of vfio has traditionally been around secure device
> > > > > > > assignment of a HW resource to a VM. I'm not totally clear on what the
> > > > > > > role if mdev is seen to be, but all the mdev drivers in the tree seem
> > > > > > > to make 'and pass it to KVM' a big part of their description.
> > > > > > > 
> > > > > > > So, looking at the virtio patches, I see some intended use is to map
> > > > > > > some BAR pages into the VM.
> > > > > > Nope, at least not for the current stage. It still depends on the
> > > > > > virtio-net-pci emulatio in qemu to work. In the future, we will allow such
> > > > > > mapping only for dorbell.
> > > > > There has been a lot of emails today, but I think this is the main
> > > > > point I want to respond to.
> > > > > 
> > > > > Using vfio when you don't even assign any part of the device BAR to
> > > > > the VM is, frankly, a gigantic misuse, IMHO.
> > > > 
> > > > That's not a compelling point. 
> > > 
> > > Well, this discussion is going nowhere.
> > 
> > You removed JasonW's other reply in above quote. He said it clearly
> > that we do want/need to assign parts of device BAR to the VM.
> 
> Generally we don't look at patches based on stuff that isn't in them.

The hardware is ready, and it's something really necessary (for
the performance). It was planned to be added immediately after
current series. If you want, it certainly can be included right now.

> 
> > > I mean the library functions in the kernel that vfio uses to implement
> > > all the user dma stuff. Other subsystems use them too, it is not
> > > exclusive to vfio.
> > 
> > IIUC, your point is to suggest us invent new DMA API for userspace to
> > use instead of leveraging VFIO's well defined DMA API. Even if we don't
> > use VFIO at all, I would imagine it could be very VFIO-like (e.g. caps
> > for BAR + container/group for DMA) eventually.
> 
> None of the other user dma subsystems seem to have the problems you
> are imagining here. Perhaps you should try it first?

Actually VFIO DMA API wasn't used at the beginning of vhost-mdev. But
after the discussion in upstream during the RFC stage since the last
year, the conclusion is that leveraging VFIO's existing DMA API would
be the better choice and then vhost-mdev switched to that direction.

>  
> > > > > Further, I do not think it is wise to design the userspace ABI around
> > > > > a simplistict implementation that can't do BAR assignment,
> > > > 
> > > > Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> > > > mmap() was kept their for mapping device regions.
> > > 
> > > The patches have a new file in include/uapi.
> > 
> > I guess you didn't look at the code. Just to clarify, there is no
> > new file introduced in include/uapi. Only small vhost extensions to
> > the existing vhost uapi are involved in vhost-mdev.
> 
> You know, I review alot of patches every week, and sometimes I make
> mistakes, but not this time. From the ICF cover letter:
> 
> https://lkml.org/lkml/2019/11/7/62
> 
>  drivers/vfio/mdev/mdev_core.c    |  21 ++
>  drivers/vhost/Kconfig            |  12 +
>  drivers/vhost/Makefile           |   3 +
>  drivers/vhost/mdev.c             | 556 +++++++++++++++++++++++++++++++
>  include/linux/mdev.h             |   5 +
>  include/uapi/linux/vhost.h       |  21 ++
>  include/uapi/linux/vhost_types.h |   8 +
>       ^^^^^^^^^^^^^^
> 
> Perhaps you thought I ment ICF was adding uapi? My remarks cover all
> three of the series involved here.

No, I meant the same thing. Michael helped me explain that.
https://patchwork.ozlabs.org/patch/1195895/#2311180

> 
> Jason
Tiwei Bie Nov. 24, 2019, 2:56 p.m. UTC | #78
On Sun, Nov 24, 2019 at 06:00:23AM -0500, Michael S. Tsirkin wrote:
> On Sat, Nov 23, 2019 at 07:09:48PM -0400, Jason Gunthorpe wrote:
> > > > > > Further, I do not think it is wise to design the userspace ABI around
> > > > > > a simplistict implementation that can't do BAR assignment,
> > > > > 
> > > > > Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> > > > > mmap() was kept their for mapping device regions.
> > > > 
> > > > The patches have a new file in include/uapi.
> > > 
> > > I guess you didn't look at the code. Just to clarify, there is no
> > > new file introduced in include/uapi. Only small vhost extensions to
> > > the existing vhost uapi are involved in vhost-mdev.
> > 
> > You know, I review alot of patches every week, and sometimes I make
> > mistakes, but not this time. From the ICF cover letter:
> > 
> > https://lkml.org/lkml/2019/11/7/62
> > 
> >  drivers/vfio/mdev/mdev_core.c    |  21 ++
> >  drivers/vhost/Kconfig            |  12 +
> >  drivers/vhost/Makefile           |   3 +
> >  drivers/vhost/mdev.c             | 556 +++++++++++++++++++++++++++++++
> >  include/linux/mdev.h             |   5 +
> >  include/uapi/linux/vhost.h       |  21 ++
> >  include/uapi/linux/vhost_types.h |   8 +
> >       ^^^^^^^^^^^^^^
> > 
> > Perhaps you thought I ment ICF was adding uapi? My remarks cover all
> > three of the series involved here.
> 
> Tiwei seems to be right - include/uapi/linux/vhost.h and
> include/uapi/linux/vhost_types.h are both existing files.  vhost uapi
> extensions included here are very modest. They
> just add virtio spec things that vhost was missing.

Yeah, that's what I meant. Thanks!

> 
> Are you looking at a very old linux tree maybe?
> vhost_types.h appeared around 4.20.
> 
> -- 
> MST
>
Michael S. Tsirkin Nov. 24, 2019, 3:07 p.m. UTC | #79
On Sun, Nov 24, 2019 at 10:51:24PM +0800, Tiwei Bie wrote:
> On Sat, Nov 23, 2019 at 07:09:48PM -0400, Jason Gunthorpe wrote:
> > On Sat, Nov 23, 2019 at 12:39:51PM +0800, Tiwei Bie wrote:
> > > On Fri, Nov 22, 2019 at 02:02:14PM -0400, Jason Gunthorpe wrote:
> > > > On Fri, Nov 22, 2019 at 04:45:38PM +0800, Jason Wang wrote:
> > > > > On 2019/11/21 下午10:17, Jason Gunthorpe wrote:
> > > > > > On Thu, Nov 21, 2019 at 03:21:29PM +0800, Jason Wang wrote:
> > > > > > > > The role of vfio has traditionally been around secure device
> > > > > > > > assignment of a HW resource to a VM. I'm not totally clear on what the
> > > > > > > > role if mdev is seen to be, but all the mdev drivers in the tree seem
> > > > > > > > to make 'and pass it to KVM' a big part of their description.
> > > > > > > > 
> > > > > > > > So, looking at the virtio patches, I see some intended use is to map
> > > > > > > > some BAR pages into the VM.
> > > > > > > Nope, at least not for the current stage. It still depends on the
> > > > > > > virtio-net-pci emulatio in qemu to work. In the future, we will allow such
> > > > > > > mapping only for dorbell.
> > > > > > There has been a lot of emails today, but I think this is the main
> > > > > > point I want to respond to.
> > > > > > 
> > > > > > Using vfio when you don't even assign any part of the device BAR to
> > > > > > the VM is, frankly, a gigantic misuse, IMHO.
> > > > > 
> > > > > That's not a compelling point. 
> > > > 
> > > > Well, this discussion is going nowhere.
> > > 
> > > You removed JasonW's other reply in above quote. He said it clearly
> > > that we do want/need to assign parts of device BAR to the VM.
> > 
> > Generally we don't look at patches based on stuff that isn't in them.
> 
> The hardware is ready, and it's something really necessary (for
> the performance). It was planned to be added immediately after
> current series. If you want, it certainly can be included right now.

It can't hurt, for sure. Can be a separate patch if you feel
review is easier that way.

> > 
> > > > I mean the library functions in the kernel that vfio uses to implement
> > > > all the user dma stuff. Other subsystems use them too, it is not
> > > > exclusive to vfio.
> > > 
> > > IIUC, your point is to suggest us invent new DMA API for userspace to
> > > use instead of leveraging VFIO's well defined DMA API. Even if we don't
> > > use VFIO at all, I would imagine it could be very VFIO-like (e.g. caps
> > > for BAR + container/group for DMA) eventually.
> > 
> > None of the other user dma subsystems seem to have the problems you
> > are imagining here. Perhaps you should try it first?
> 
> Actually VFIO DMA API wasn't used at the beginning of vhost-mdev. But
> after the discussion in upstream during the RFC stage since the last
> year, the conclusion is that leveraging VFIO's existing DMA API would
> be the better choice and then vhost-mdev switched to that direction.
> 
> >  
> > > > > > Further, I do not think it is wise to design the userspace ABI around
> > > > > > a simplistict implementation that can't do BAR assignment,
> > > > > 
> > > > > Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> > > > > mmap() was kept their for mapping device regions.
> > > > 
> > > > The patches have a new file in include/uapi.
> > > 
> > > I guess you didn't look at the code. Just to clarify, there is no
> > > new file introduced in include/uapi. Only small vhost extensions to
> > > the existing vhost uapi are involved in vhost-mdev.
> > 
> > You know, I review alot of patches every week, and sometimes I make
> > mistakes, but not this time. From the ICF cover letter:
> > 
> > https://lkml.org/lkml/2019/11/7/62
> > 
> >  drivers/vfio/mdev/mdev_core.c    |  21 ++
> >  drivers/vhost/Kconfig            |  12 +
> >  drivers/vhost/Makefile           |   3 +
> >  drivers/vhost/mdev.c             | 556 +++++++++++++++++++++++++++++++
> >  include/linux/mdev.h             |   5 +
> >  include/uapi/linux/vhost.h       |  21 ++
> >  include/uapi/linux/vhost_types.h |   8 +
> >       ^^^^^^^^^^^^^^
> > 
> > Perhaps you thought I ment ICF was adding uapi? My remarks cover all
> > three of the series involved here.
> 
> No, I meant the same thing. Michael helped me explain that.
> https://patchwork.ozlabs.org/patch/1195895/#2311180
> 
> > 
> > Jason
Jason Gunthorpe Nov. 25, 2019, 12:07 a.m. UTC | #80
On Sun, Nov 24, 2019 at 06:00:23AM -0500, Michael S. Tsirkin wrote:
> On Sat, Nov 23, 2019 at 07:09:48PM -0400, Jason Gunthorpe wrote:
> > > > > > Further, I do not think it is wise to design the userspace ABI around
> > > > > > a simplistict implementation that can't do BAR assignment,
> > > > > 
> > > > > Again, the vhost-mdev follow the VFIO ABI, no new ABI is invented, and
> > > > > mmap() was kept their for mapping device regions.
> > > > 
> > > > The patches have a new file in include/uapi.
> > > 
> > > I guess you didn't look at the code. Just to clarify, there is no
> > > new file introduced in include/uapi. Only small vhost extensions to
> > > the existing vhost uapi are involved in vhost-mdev.
> > 
> > You know, I review alot of patches every week, and sometimes I make
> > mistakes, but not this time. From the ICF cover letter:
> > 
> > https://lkml.org/lkml/2019/11/7/62
> > 
> >  drivers/vfio/mdev/mdev_core.c    |  21 ++
> >  drivers/vhost/Kconfig            |  12 +
> >  drivers/vhost/Makefile           |   3 +
> >  drivers/vhost/mdev.c             | 556 +++++++++++++++++++++++++++++++
> >  include/linux/mdev.h             |   5 +
> >  include/uapi/linux/vhost.h       |  21 ++
> >  include/uapi/linux/vhost_types.h |   8 +
> >       ^^^^^^^^^^^^^^
> > 
> > Perhaps you thought I ment ICF was adding uapi? My remarks cover all
> > three of the series involved here.
> 
> Tiwei seems to be right - include/uapi/linux/vhost.h and
> include/uapi/linux/vhost_types.h are both existing files.  vhost uapi
> extensions included here are very modest. They
> just add virtio spec things that vhost was missing.

Sigh, fine whatever, I mispoke and called the 7 new ioctls a 'new
file' instead of 'new ioctls' when responding to someone who denied
they even existed. 

Anyhow why do both of you keep saying "small vhost extensions to the
existing vhost uapi" when these 7 new ioctls appear to be connected to
vfio_device_ops, and /dev/vfio ?

Oh, gross, this is running some existing ioctl interface over
/dev/vfio - the new uABI here is really putting all 10 new ioctls on
/dev/vfio that didn't exist there before.

Jason
Jason Gunthorpe Nov. 25, 2019, 12:09 a.m. UTC | #81
On Sun, Nov 24, 2019 at 10:51:24PM +0800, Tiwei Bie wrote:

> > > You removed JasonW's other reply in above quote. He said it clearly
> > > that we do want/need to assign parts of device BAR to the VM.
> > 
> > Generally we don't look at patches based on stuff that isn't in them.
> 
> The hardware is ready, and it's something really necessary (for
> the performance). It was planned to be added immediately after
> current series. If you want, it certainly can be included right now.

I don't think it makes a significant difference, there are enough
reasons already that this does not belong in vfio. Both Greg and I
already were very against using mdev as an alterative to the driver
core.

> > > IIUC, your point is to suggest us invent new DMA API for userspace to
> > > use instead of leveraging VFIO's well defined DMA API. Even if we don't
> > > use VFIO at all, I would imagine it could be very VFIO-like (e.g. caps
> > > for BAR + container/group for DMA) eventually.
> > 
> > None of the other user dma subsystems seem to have the problems you
> > are imagining here. Perhaps you should try it first?
> 
> Actually VFIO DMA API wasn't used at the beginning of vhost-mdev. But
> after the discussion in upstream during the RFC stage since the last
> year, the conclusion is that leveraging VFIO's existing DMA API would
> be the better choice and then vhost-mdev switched to that direction.

Well, unfortunately, I think that discussion may have led you
wrong. Do you have a link? Did you post an ICF driver that didn't use vfio?

Jason
Jason Wang Nov. 25, 2019, 12:59 p.m. UTC | #82
On 2019/11/25 上午8:09, Jason Gunthorpe wrote:
> On Sun, Nov 24, 2019 at 10:51:24PM +0800, Tiwei Bie wrote:
>
>>>> You removed JasonW's other reply in above quote. He said it clearly
>>>> that we do want/need to assign parts of device BAR to the VM.
>>> Generally we don't look at patches based on stuff that isn't in them.
>> The hardware is ready, and it's something really necessary (for
>> the performance). It was planned to be added immediately after
>> current series. If you want, it certainly can be included right now.
> I don't think it makes a significant difference, there are enough
> reasons already that this does not belong in vfio. Both Greg and I
> already were very against using mdev as an alterative to the driver
> core.


Don't get us wrong, in v13 this is what Greg said [1].

"
Also, see the other conversations we are having about a "virtual" bus
and devices.  I do not want to have two different ways of doing the same
thing in the kernel at the same time please.  Please work together with
the Intel developers to solve this in a unified way, as you both
need/want the same thing here.

Neither this, nor the other proposal can be accepted until you all agree
on the design and implementation.
"

[1] https://lkml.org/lkml/2019/11/18/521


>
>>>> IIUC, your point is to suggest us invent new DMA API for userspace to
>>>> use instead of leveraging VFIO's well defined DMA API. Even if we don't
>>>> use VFIO at all, I would imagine it could be very VFIO-like (e.g. caps
>>>> for BAR + container/group for DMA) eventually.
>>> None of the other user dma subsystems seem to have the problems you
>>> are imagining here. Perhaps you should try it first?
>> Actually VFIO DMA API wasn't used at the beginning of vhost-mdev. But
>> after the discussion in upstream during the RFC stage since the last
>> year, the conclusion is that leveraging VFIO's existing DMA API would
>> be the better choice and then vhost-mdev switched to that direction.
> Well, unfortunately, I think that discussion may have led you
> wrong. Do you have a link? Did you post an ICF driver that didn't use vfio?


Why do you think the driver posted in [2] use vfio?

[2] https://lkml.org/lkml/2019/11/21/479

Thanks


>
> Jason
>
Martin Habets Nov. 26, 2019, 12:26 p.m. UTC | #83
On 22/11/2019 16:19, Parav Pandit wrote:
> 
> 
>> From: Jason Wang <jasowang@redhat.com>
>> Sent: Friday, November 22, 2019 3:14 AM
>>
>> On 2019/11/21 下午11:10, Martin Habets wrote:
>>> On 19/11/2019 04:08, Jason Wang wrote:
>>>> On 2019/11/16 上午7:25, Parav Pandit wrote:
>>>>> Hi Jeff,
>>>>>
>>>>>> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>>>>>> Sent: Friday, November 15, 2019 4:34 PM
>>>>>>
>>>>>> From: Dave Ertman <david.m.ertman@intel.com>
>>>>>>
>>>>>> This is the initial implementation of the Virtual Bus,
>>>>>> virtbus_device and virtbus_driver.  The virtual bus is a software
>>>>>> based bus intended to support lightweight devices and drivers and
>>>>>> provide matching between them and probing of the registered drivers.
>>>>>>
>>>>>> The primary purpose of the virual bus is to provide matching
>>>>>> services and to pass the data pointer contained in the
>>>>>> virtbus_device to the virtbus_driver during its probe call.  This
>>>>>> will allow two separate kernel objects to match up and start
>> communication.
>>>>>>
>>>>> It is fundamental to know that rdma device created by virtbus_driver will
>> be anchored to which bus for an non abusive use.
>>>>> virtbus or parent pci bus?
>>>>> I asked this question in v1 version of this patch.
>>>>>
>>>>> Also since it says - 'to support lightweight devices', documenting that
>> information is critical to avoid ambiguity.
>>>>>
>>>>> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1]
>> whatever we want to call it, it overlaps with your comment about 'to support
>> lightweight devices'.
>>>>> Hence let's make things crystal clear weather the purpose is 'only
>> matching service' or also 'lightweight devices'.
>>>>> If this is only matching service, lets please remove lightweight devices
>> part..
>>>>
>>>> Yes, if it's matching + lightweight device, its function is almost a duplication
>> of mdev. And I'm working on extending mdev[1] to be a generic module to
>> support any types of virtual devices a while. The advantage of mdev is:
>>>>
>>>> 1) ready for the userspace driver (VFIO based)
>>>> 2) have a sysfs/GUID based management interface
>>> In my view this virtual-bus is more generic and more flexible than mdev.
>>
>>
>> Even after the series [1] here?

I have been following that series. It does make mdev more flexible, and almost turns it into a real bus.
Even with those improvements to mdev the virtual-bus is in my view still more generic and more flexible,
and hence more future-proof.

>>
>>> What for you are the advantages of mdev to me are some of it's
>> disadvantages.
>>>
>>> The way I see it we can provide rdma support in the driver using virtual-bus.
>>
> This is fine, because it is only used for matching service.
> 
>>
>> Yes, but since it does matching only, you can do everything you want.
>> But it looks to me Greg does not want a bus to be an API multiplexer. So if a
>> dedicated bus is desired, it won't be much of code to have a bus on your own.

I did not intend for it to be a multiplexer. And I very much prefer a generic bus over a any driver specific bus.

> Right. virtbus shouldn't be a multiplexer.
> Otherwise mdev can be improved (abused) exactly the way virtbus might. Where 'mdev m stands for multiplexer too'. :-)
> No, we shouldn’t do that.
> 
> Listening to Greg and Jason G, I agree that virtbus shouldn't be a multiplexer.
> There are few basic differences between subfunctions and matching service device object.
> Subfunctions over period of time will have several attributes, few that I think of right away are:
> 1. BAR resource info, write combine info
> 2. irq vectors details
> 3. unique id assigned by user (while virtbus will not assign such user id as they are auto created for matching service for PF/VF)
> 4. rdma device created by matched driver resides on pci bus or parent device
> While rdma and netdev created on over subfunctions are linked to their own 'struct device'.

This is more aligned with my thinking as well, although I do not call these items subfunctions.
There can be different devices for different users, where multiple can be active at the same time (with some constraints).

One important thing to note is that there may not not be a netdev device. What we traditionally call
a "network driver" will then only manage the virtualised devices.

> Due to that sysfs view for these two different types of devices is bit different.
> Putting both on same bus just doesn't appear right with above fundamental differences of core layer.

Can you explain which code layer you mean?

>>
>>> At the moment we would need separate mdev support in the driver for
>>> vdpa, but I hope at some point mdev would become a layer on top of virtual-
>> bus.
> 
> How is it optimal to create multiple 'struct device' for single purpose?
> Especially when one wants to create hundreds of such devices to begin with.
> User facing tool should be able to select device type and place the device on right bus.

At this point I think it is not possible to create a solution that is optimal right now for all use cases.
With the virtual bus we do have a solid foundation going forward, for the users we know now and for
future ones. Optimisation is something that needs to happen over time, without breaking existing users.

As for the user facing tool, the only one I know of that always works is "echo" into a sysfs file.

Best regards,
Martin
Jason Wang Nov. 27, 2019, 10:58 a.m. UTC | #84
On 2019/11/26 下午8:26, Martin Habets wrote:
> On 22/11/2019 16:19, Parav Pandit wrote:
>>
>>> From: Jason Wang <jasowang@redhat.com>
>>> Sent: Friday, November 22, 2019 3:14 AM
>>>
>>> On 2019/11/21 下午11:10, Martin Habets wrote:
>>>> On 19/11/2019 04:08, Jason Wang wrote:
>>>>> On 2019/11/16 上午7:25, Parav Pandit wrote:
>>>>>> Hi Jeff,
>>>>>>
>>>>>>> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>>>>>>> Sent: Friday, November 15, 2019 4:34 PM
>>>>>>>
>>>>>>> From: Dave Ertman <david.m.ertman@intel.com>
>>>>>>>
>>>>>>> This is the initial implementation of the Virtual Bus,
>>>>>>> virtbus_device and virtbus_driver.  The virtual bus is a software
>>>>>>> based bus intended to support lightweight devices and drivers and
>>>>>>> provide matching between them and probing of the registered drivers.
>>>>>>>
>>>>>>> The primary purpose of the virual bus is to provide matching
>>>>>>> services and to pass the data pointer contained in the
>>>>>>> virtbus_device to the virtbus_driver during its probe call.  This
>>>>>>> will allow two separate kernel objects to match up and start
>>> communication.
>>>>>> It is fundamental to know that rdma device created by virtbus_driver will
>>> be anchored to which bus for an non abusive use.
>>>>>> virtbus or parent pci bus?
>>>>>> I asked this question in v1 version of this patch.
>>>>>>
>>>>>> Also since it says - 'to support lightweight devices', documenting that
>>> information is critical to avoid ambiguity.
>>>>>> Since for a while I am working on the subbus/subdev_bus/xbus/mdev [1]
>>> whatever we want to call it, it overlaps with your comment about 'to support
>>> lightweight devices'.
>>>>>> Hence let's make things crystal clear weather the purpose is 'only
>>> matching service' or also 'lightweight devices'.
>>>>>> If this is only matching service, lets please remove lightweight devices
>>> part..
>>>>> Yes, if it's matching + lightweight device, its function is almost a duplication
>>> of mdev. And I'm working on extending mdev[1] to be a generic module to
>>> support any types of virtual devices a while. The advantage of mdev is:
>>>>> 1) ready for the userspace driver (VFIO based)
>>>>> 2) have a sysfs/GUID based management interface
>>>> In my view this virtual-bus is more generic and more flexible than mdev.
>>>
>>> Even after the series [1] here?
> I have been following that series. It does make mdev more flexible, and almost turns it into a real bus.
> Even with those improvements to mdev the virtual-bus is in my view still more generic and more flexible,
> and hence more future-proof.


So the only difference so far is after that series is:

1) mdev has sysfs support
2) mdev has support from vfio

For 1) we can decouple that part to be more flexible, for 2) I think you 
would still need that part other than inventing a new VFIO driver (e.g 
vfio-virtual-bus)?


>
>>>> What for you are the advantages of mdev to me are some of it's
>>> disadvantages.
>>>> The way I see it we can provide rdma support in the driver using virtual-bus.
>> This is fine, because it is only used for matching service.
>>
>>> Yes, but since it does matching only, you can do everything you want.
>>> But it looks to me Greg does not want a bus to be an API multiplexer. So if a
>>> dedicated bus is desired, it won't be much of code to have a bus on your own.
> I did not intend for it to be a multiplexer. And I very much prefer a generic bus over a any driver specific bus.
>
>> Right. virtbus shouldn't be a multiplexer.
>> Otherwise mdev can be improved (abused) exactly the way virtbus might. Where 'mdev m stands for multiplexer too'. :-)
>> No, we shouldn’t do that.
>>
>> Listening to Greg and Jason G, I agree that virtbus shouldn't be a multiplexer.
>> There are few basic differences between subfunctions and matching service device object.
>> Subfunctions over period of time will have several attributes, few that I think of right away are:
>> 1. BAR resource info, write combine info
>> 2. irq vectors details
>> 3. unique id assigned by user (while virtbus will not assign such user id as they are auto created for matching service for PF/VF)
>> 4. rdma device created by matched driver resides on pci bus or parent device
>> While rdma and netdev created on over subfunctions are linked to their own 'struct device'.
> This is more aligned with my thinking as well, although I do not call these items subfunctions.
> There can be different devices for different users, where multiple can be active at the same time (with some constraints).
>
> One important thing to note is that there may not not be a netdev device. What we traditionally call
> a "network driver" will then only manage the virtualised devices.
>
>> Due to that sysfs view for these two different types of devices is bit different.
>> Putting both on same bus just doesn't appear right with above fundamental differences of core layer.
> Can you explain which code layer you mean?
>
>>>> At the moment we would need separate mdev support in the driver for
>>>> vdpa, but I hope at some point mdev would become a layer on top of virtual-
>>> bus.
>> How is it optimal to create multiple 'struct device' for single purpose?
>> Especially when one wants to create hundreds of such devices to begin with.
>> User facing tool should be able to select device type and place the device on right bus.
> At this point I think it is not possible to create a solution that is optimal right now for all use cases.


Probably yes.


> With the virtual bus we do have a solid foundation going forward, for the users we know now and for
> future ones.


If I understand correctly, if multiplexer is not preferred. It would be 
hard to have a bus on your own code, there's no much code could be reused.

Thanks


>   Optimisation is something that needs to happen over time, without breaking existing users.
>
> As for the user facing tool, the only one I know of that always works is "echo" into a sysfs file.
>
> Best regards,
> Martin
>
Jason Wang Nov. 27, 2019, 11:03 a.m. UTC | #85
On 2019/11/27 下午6:58, Jason Wang wrote:
>
>> With the virtual bus we do have a solid foundation going forward, for 
>> the users we know now and for
>> future ones.
>
>
> If I understand correctly, if multiplexer is not preferred. It would 
> be hard to have a bus on your own code, there's no much code could be 
> reused.
>

Sorry, I meant "not hard to have" ...

Thanks


> Thanks
diff mbox series

Patch

diff --git a/Documentation/driver-api/virtual_bus.rst b/Documentation/driver-api/virtual_bus.rst
new file mode 100644
index 000000000000..970e06267284
--- /dev/null
+++ b/Documentation/driver-api/virtual_bus.rst
@@ -0,0 +1,76 @@ 
+===============================
+Virtual Bus Devices and Drivers
+===============================
+
+See <linux/virtual_bus.h> for the models for virtbus_device and virtbus_driver.
+This bus is meant to be a lightweight software based bus to attach generic
+devices and drivers to so that a chunk of data can be passed between them.
+
+One use case example is an rdma driver needing to connect with several
+different types of PCI LAN devices to be able to request resources from
+them (queue sets).  Each LAN driver that supports rdma will register a
+virtbus_device on the virtual bus for each physical function.  The rdma
+driver will register as a virtbus_driver on the virtual bus to be
+matched up with multiple virtbus_devices and receive a pointer to a
+struct containing the callbacks that the PCI LAN drivers support for
+registering with them.
+
+Sections in this document:
+        Virtbus devices
+        Virtbus drivers
+        Device Enumeration
+        Device naming and driver binding
+        Virtual Bus API entry points
+
+Virtbus devices
+~~~~~~~~~~~~~~~
+Virtbus_devices are lightweight objects that support the minimal device
+functionality.  Devices will accept a name, and then an automatically
+generated index is concatenated onto it for the virtbus_device->name.
+
+The memory backing the "void *data" element of the virtbus_device is
+expected to be allocated and freed outside the context of the bus
+operations.  This memory is also expected to remain viable for the
+duration of the time that the virtbus_device is registered to the
+virtual bus. (e.g. from before the virtbus_dev_register until after
+the paired virtbus_dev_unregister).
+
+The provided API for virtbus_dev_alloc is an efficient way of allocating
+the memory for the virtbus_device (except for the data element) and
+automatically freeing it when the device is removed from the bus.
+
+Virtbus drivers
+~~~~~~~~~~~~~~~
+Virtbus drivers register with the virtual bus to be matched with virtbus
+devices.  They expect to be registered with a probe and remove callback,
+and also support shutdown, suspend, and resume callbacks.  They otherwise
+follow the standard driver behavior of having discovery and enumeration
+handled in the bus infrastructure.
+
+Virtbus drivers register themselves with the API entry point virtbus_drv_reg
+and unregister with virtbus_drv_unreg.
+
+Device Enumeration
+~~~~~~~~~~~~~~~~~~
+Enumeration is handled automatically by the bus infrastructure via the
+ida_simple methods.
+
+Device naming and driver binding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The virtbus_device.dev.name is the canonical name for the device. It is
+built from two other parts:
+
+        - virtbus_device.name (also used for matching).
+        - virtbus_device.id (generated automatically from ida_simple calls)
+
+This allows for multiple virtbus_devices with the same name, which will all
+be matched to the same virtbus_driver. Driver binding is performed by the
+driver core, invoking driver probe() after finding a match between device and driver.
+
+Virtual Bus API entry points
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+struct virtbus_device *virtbus_dev_alloc(const char *name, void *data)
+int virtbus_dev_register(struct virtbus_device *vdev)
+void virtbus_dev_unregister(struct virtbus_device *vdev)
+int virtbus_drv_register(struct virtbus_driver *vdrv, struct module *owner)
+void virtbus_drv_unregister(struct virtbus_driver *vdrv)
diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig
index 6b331061d34b..30cef35b0c30 100644
--- a/drivers/bus/Kconfig
+++ b/drivers/bus/Kconfig
@@ -193,4 +193,18 @@  config DA8XX_MSTPRI
 
 source "drivers/bus/fsl-mc/Kconfig"
 
+config VIRTUAL_BUS
+       tristate "lightweight Virtual Bus"
+       depends on PM
+       help
+         Provides a lightweight bus for virtbus_devices to be added to it
+         and virtbus_drivers to be registered on it.  Will create a match
+         between the driver and device, then call the driver's probe with
+         the virtbus_device's struct (including a pointer for data).
+         One example is the irdma driver needing to connect with various
+         PCI LAN drivers to request resources (queues) to be able to perform
+         its function.  The data in the virtbus_device created by the
+         PCI LAN driver is a set of ops (function pointers) for the irdma
+         driver to use to register and communicate with the PCI LAN driver.
+
 endmenu
diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile
index 16b43d3468c6..0b0ba53cbe5b 100644
--- a/drivers/bus/Makefile
+++ b/drivers/bus/Makefile
@@ -33,3 +33,4 @@  obj-$(CONFIG_UNIPHIER_SYSTEM_BUS)	+= uniphier-system-bus.o
 obj-$(CONFIG_VEXPRESS_CONFIG)	+= vexpress-config.o
 
 obj-$(CONFIG_DA8XX_MSTPRI)	+= da8xx-mstpri.o
+obj-$(CONFIG_VIRTUAL_BUS)	+= virtual_bus.o
diff --git a/drivers/bus/virtual_bus.c b/drivers/bus/virtual_bus.c
new file mode 100644
index 000000000000..c6eab1658391
--- /dev/null
+++ b/drivers/bus/virtual_bus.c
@@ -0,0 +1,326 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtual_bus.c - lightweight software based bus for virtual devices
+ *
+ * Copyright (c) 2019-20 Intel Corporation
+ *
+ * Please see Documentation/driver-api/virtual_bus.rst for
+ * more information
+ */
+
+#include <linux/string.h>
+#include <linux/virtual_bus.h>
+#include <linux/of_irq.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pm_runtime.h>
+#include <linux/pm_domain.h>
+#include <linux/acpi.h>
+#include <linux/device.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Lightweight Virtual Bus");
+MODULE_AUTHOR("David Ertman <david.m.ertman@intel.com>");
+MODULE_AUTHOR("Kiran Patil <kiran.patil@intel.com>");
+
+static DEFINE_IDA(virtbus_dev_ida);
+
+static const
+struct virtbus_dev_id *virtbus_match_id(const struct virtbus_dev_id *id,
+					struct virtbus_device *vdev)
+{
+	while (id->name[0]) {
+		if (!strcmp(vdev->name, id->name)) {
+			vdev->dev_id = id;
+			return id;
+		}
+		id++;
+	}
+	return NULL;
+}
+
+#define to_virtbus_dev(x)	(container_of((x), struct virtbus_device, dev))
+#define to_virtbus_drv(x)	(container_of((x), struct virtbus_driver, \
+				 driver))
+
+/**
+ * virtbus_match - bind virtbus device to virtbus driver
+ * @dev: device
+ * @drv: driver
+ *
+ * Virtbus device IDs are always in "<name>.<instance>" format.
+ * Instances are automatically selected through an ida_simple_get so
+ * are positive integers. Names are taken from the device name field.
+ * Driver IDs are simple <name>.  Need to extract the name from the
+ * Virtual Device compare to name of the driver.
+ */
+static int virtbus_match(struct device *dev, struct device_driver *drv)
+{
+	struct virtbus_driver *vdrv = to_virtbus_drv(drv);
+	struct virtbus_device *vdev = to_virtbus_dev(dev);
+
+	if (vdrv->id_table)
+		return virtbus_match_id(vdrv->id_table, vdev) != NULL;
+
+	return !strcmp(vdev->name, drv->name);
+}
+
+/**
+ * virtbus_probe - call probe of the virtbus_drv
+ * @dev: device struct
+ */
+static int virtbus_probe(struct device *dev)
+{
+	if (dev->driver->probe)
+		return dev->driver->probe(dev);
+
+	return 0;
+}
+
+static int virtbus_remove(struct device *dev)
+{
+	if (dev->driver->remove)
+		return dev->driver->remove(dev);
+
+	return 0;
+}
+
+static void virtbus_shutdown(struct device *dev)
+{
+	if (dev->driver->shutdown)
+		dev->driver->shutdown(dev);
+}
+
+static int virtbus_suspend(struct device *dev, pm_message_t state)
+{
+	if (dev->driver->suspend)
+		return dev->driver->suspend(dev, state);
+
+	return 0;
+}
+
+static int virtbus_resume(struct device *dev)
+{
+	if (dev->driver->resume)
+		return dev->driver->resume(dev);
+
+	return 0;
+}
+
+struct bus_type virtual_bus_type = {
+	.name		= "virtbus",
+	.match		= virtbus_match,
+	.probe		= virtbus_probe,
+	.remove		= virtbus_remove,
+	.shutdown	= virtbus_shutdown,
+	.suspend	= virtbus_suspend,
+	.resume		= virtbus_resume,
+};
+
+/**
+ * virtbus_dev_register - add a virtual bus device
+ * @vdev: virtual bus device to add
+ */
+int virtbus_dev_register(struct virtbus_device *vdev)
+{
+	int ret;
+
+	if (!vdev)
+		return -EINVAL;
+
+	device_initialize(&vdev->dev);
+
+	vdev->dev.bus = &virtual_bus_type;
+	/* All device IDs are automatically allocated */
+	ret = ida_simple_get(&virtbus_dev_ida, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+
+	vdev->id = ret;
+	dev_set_name(&vdev->dev, "%s.%d", vdev->name, vdev->id);
+
+	dev_dbg(&vdev->dev, "Registering VirtBus device '%s'\n",
+		dev_name(&vdev->dev));
+
+	ret = device_add(&vdev->dev);
+	if (!ret)
+		return ret;
+
+	/* Error adding virtual device */
+	device_del(&vdev->dev);
+	ida_simple_remove(&virtbus_dev_ida, vdev->id);
+	vdev->id = VIRTBUS_DEVID_NONE;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtbus_dev_register);
+
+/**
+ * virtbus_dev_unregister - remove a virtual bus device
+ * vdev: virtual bus device we are removing
+ */
+void virtbus_dev_unregister(struct virtbus_device *vdev)
+{
+	if (!IS_ERR_OR_NULL(vdev)) {
+		device_del(&vdev->dev);
+
+		ida_simple_remove(&virtbus_dev_ida, vdev->id);
+		vdev->id = VIRTBUS_DEVID_NONE;
+	}
+}
+EXPORT_SYMBOL_GPL(virtbus_dev_unregister);
+
+struct virtbus_object {
+	struct virtbus_device vdev;
+	char name[];
+};
+
+/**
+ * virtbus_dev_release - Destroy a virtbus device
+ * @vdev: virtual device to release
+ *
+ * Note that the vdev->data which is separately allocated needs to be
+ * separately freed on it own.
+ */
+static void virtbus_dev_release(struct device *dev)
+{
+	struct virtbus_object *vo = container_of(dev, struct virtbus_object,
+						 vdev.dev);
+
+	kfree(vo);
+}
+
+/**
+ * virtbus_dev_alloc - allocate a virtbus device
+ * @name: name to associate with the vdev
+ * @data: pointer to data to be associated with this device
+ */
+struct virtbus_device *virtbus_dev_alloc(const char *name, void *data)
+{
+	struct virtbus_object *vo;
+
+	/* Create a virtbus object to contain the vdev and name.  This
+	 * avoids a problem with the const attribute of name in the vdev.
+	 * The virtbus_object will be allocated here and freed in the
+	 * release function.
+	 */
+	vo = kzalloc(sizeof(*vo) + strlen(name) + 1, GFP_KERNEL);
+	if (!vo)
+		return NULL;
+
+	strcpy(vo->name, name);
+	vo->vdev.name = vo->name;
+	vo->vdev.data = data;
+	vo->vdev.dev.release = virtbus_dev_release;
+
+	return &vo->vdev;
+}
+EXPORT_SYMBOL_GPL(virtbus_dev_alloc);
+
+static int virtbus_drv_probe(struct device *_dev)
+{
+	struct virtbus_driver *vdrv = to_virtbus_drv(_dev->driver);
+	struct virtbus_device *vdev = to_virtbus_dev(_dev);
+	int ret;
+
+	ret = dev_pm_domain_attach(_dev, true);
+	if (ret) {
+		dev_warn(_dev, "Failed to attatch to PM Domain : %d\n", ret);
+		return ret;
+	}
+
+	if (vdrv->probe) {
+		ret = vdrv->probe(vdev);
+		if (ret)
+			dev_pm_domain_detach(_dev, true);
+	}
+
+	return ret;
+}
+
+static int virtbus_drv_remove(struct device *_dev)
+{
+	struct virtbus_driver *vdrv = to_virtbus_drv(_dev->driver);
+	struct virtbus_device *vdev = to_virtbus_dev(_dev);
+	int ret = 0;
+
+	if (vdrv->remove)
+		ret = vdrv->remove(vdev);
+
+	dev_pm_domain_detach(_dev, true);
+
+	return ret;
+}
+
+static void virtbus_drv_shutdown(struct device *_dev)
+{
+	struct virtbus_driver *vdrv = to_virtbus_drv(_dev->driver);
+	struct virtbus_device *vdev = to_virtbus_dev(_dev);
+
+	if (vdrv->shutdown)
+		vdrv->shutdown(vdev);
+}
+
+static int virtbus_drv_suspend(struct device *_dev, pm_message_t state)
+{
+	struct virtbus_driver *vdrv = to_virtbus_drv(_dev->driver);
+	struct virtbus_device *vdev = to_virtbus_dev(_dev);
+
+	if (vdrv->suspend)
+		return vdrv->suspend(vdev, state);
+
+	return 0;
+}
+
+static int virtbus_drv_resume(struct device *_dev)
+{
+	struct virtbus_driver *vdrv = to_virtbus_drv(_dev->driver);
+	struct virtbus_device *vdev = to_virtbus_dev(_dev);
+
+	if (vdrv->resume)
+		return vdrv->resume(vdev);
+
+	return 0;
+}
+
+/**
+ * __virtbus_drv_register - register a driver for virtual bus devices
+ * @vdrv: virtbus_driver structure
+ * @owner: owning module/driver
+ */
+int __virtbus_drv_register(struct virtbus_driver *vdrv, struct module *owner)
+{
+	vdrv->driver.owner = owner;
+	vdrv->driver.bus = &virtual_bus_type;
+	vdrv->driver.probe = virtbus_drv_probe;
+	vdrv->driver.remove = virtbus_drv_remove;
+	vdrv->driver.shutdown = virtbus_drv_shutdown;
+	vdrv->driver.suspend = virtbus_drv_suspend;
+	vdrv->driver.resume = virtbus_drv_resume;
+
+	return driver_register(&vdrv->driver);
+}
+EXPORT_SYMBOL_GPL(__virtbus_drv_register);
+
+/**
+ * virtbus_drv_unregister - unregister a driver for virtual bus devices
+ * @drv: virtbus_driver structure
+ */
+void virtbus_drv_unregister(struct virtbus_driver *vdrv)
+{
+	driver_unregister(&vdrv->driver);
+}
+EXPORT_SYMBOL_GPL(virtbus_drv_unregister);
+
+static int __init virtual_bus_init(void)
+{
+	return bus_register(&virtual_bus_type);
+}
+
+static void __exit virtual_bus_exit(void)
+{
+	bus_unregister(&virtual_bus_type);
+}
+
+module_init(virtual_bus_init);
+module_exit(virtual_bus_exit);
diff --git a/include/linux/virtual_bus.h b/include/linux/virtual_bus.h
new file mode 100644
index 000000000000..b6f2406180f8
--- /dev/null
+++ b/include/linux/virtual_bus.h
@@ -0,0 +1,55 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * virtual_bus.h - lightweight software bus
+ *
+ * Copyright (c) 2019-20 Intel Corporation
+ *
+ * Please see Documentation/driver-api/virtual_bus.rst for more information
+ */
+
+#ifndef _VIRTUAL_BUS_H_
+#define _VIRTUAL_BUS_H_
+
+#include <linux/device.h>
+
+#define VIRTBUS_DEVID_NONE	(-1)
+#define VIRTBUS_NAME_SIZE	20
+
+struct virtbus_dev_id {
+	char name[VIRTBUS_NAME_SIZE];
+	u64 driver_data;
+};
+
+struct virtbus_device {
+	const char			*name;
+	int				id;
+	const struct virtbus_dev_id	*dev_id;
+	struct device			dev;
+	void				*data;
+};
+
+/* If the driver uses a id_table to match with virtbus_devices, then the
+ * memory for the table is expected to remain allocated for the duration
+ * of the pairing between driver and device.  The pointer for the matching
+ * element will be copied to the dev_id field of the virtbus_device.
+ */
+struct virtbus_driver {
+	int (*probe)(struct virtbus_device *);
+	int (*remove)(struct virtbus_device *);
+	void (*shutdown)(struct virtbus_device *);
+	int (*suspend)(struct virtbus_device *, pm_message_t);
+	int (*resume)(struct virtbus_device *);
+	struct device_driver driver;
+	const struct virtbus_dev_id *id_table;
+};
+
+int virtbus_dev_register(struct virtbus_device *vdev);
+void virtbus_dev_unregister(struct virtbus_device *vdev);
+struct virtbus_device *virtbus_dev_alloc(const char *name, void *devdata);
+int __virtbus_drv_register(struct virtbus_driver *vdrv, struct module *owner);
+void virtbus_drv_unregister(struct virtbus_driver *vdrv);
+
+#define virtbus_drv_register(vdrv) \
+	__virtbus_drv_register(vdrv, THIS_MODULE)
+
+#endif /* _VIRTUAL_BUS_H_ */
diff --git a/tools/testing/selftests/virtual_bus/virtual_bus_dev/Makefile b/tools/testing/selftests/virtual_bus/virtual_bus_dev/Makefile
new file mode 100644
index 000000000000..ddd5088eb26b
--- /dev/null
+++ b/tools/testing/selftests/virtual_bus/virtual_bus_dev/Makefile
@@ -0,0 +1,7 @@ 
+obj-m += virtual_bus_dev.o
+
+all:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
+
+clean:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
diff --git a/tools/testing/selftests/virtual_bus/virtual_bus_dev/virtual_bus_dev.c b/tools/testing/selftests/virtual_bus/virtual_bus_dev/virtual_bus_dev.c
new file mode 100644
index 000000000000..7257e599f12b
--- /dev/null
+++ b/tools/testing/selftests/virtual_bus/virtual_bus_dev/virtual_bus_dev.c
@@ -0,0 +1,67 @@ 
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/module.h>
+#include <linux/virtual_bus.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Dave Ertman");
+MODULE_DESCRIPTION("Test to create a device on virtual bus");
+MODULE_VERSION("1.0");
+
+static struct virtbus_device *vdev;
+static char *data;
+
+static int __init test_dev_init(void)
+{
+	int ret = 0;
+	static char *name;
+
+	printk(KERN_INFO "Loading Virtual Bus Test Device\n");
+
+	name = kzalloc(VIRTBUS_NAME_SIZE, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+	strcpy(name, "virtual_bus_dev");
+
+	data = kzalloc(128, GFP_KERNEL);
+	if (!data) {
+		kfree(name);
+		return -ENOMEM;
+	}
+	strcpy(data, "This is my data string - isn't it wonderful!");
+
+	vdev = virtbus_dev_alloc(name, data);
+	if (!vdev) {
+		kfree(name);
+		kfree(data);
+		return -EINVAL;
+	}
+
+	printk(KERN_ERR "Virtbus Device allocated:\n\t%s\n\t%s\n", vdev->name,
+	       (char *)vdev->data);
+
+	ret = virtbus_dev_register(vdev);
+	kfree(name);
+	if (ret) {
+		printk(KERN_ERR "FAILED TO ADD VIRTBUS DEVICE %d\n", ret);
+		return ret;
+	}
+
+	printk(KERN_INFO "Virtual Device created\n");
+	return ret;
+}
+
+static void __exit test_dev_exit(void)
+{
+	printk(KERN_INFO "Exiting Virtual Bus Test Device");
+
+	virtbus_dev_unregister(vdev);
+	kfree(data);
+
+	printk(KERN_INFO "Virtual Bus Test Device removed\n");
+}
+
+module_init(test_dev_init);
+module_exit(test_dev_exit);
diff --git a/tools/testing/selftests/virtual_bus/virtual_bus_drv/Makefile b/tools/testing/selftests/virtual_bus/virtual_bus_drv/Makefile
new file mode 100644
index 000000000000..a4b7467f7878
--- /dev/null
+++ b/tools/testing/selftests/virtual_bus/virtual_bus_drv/Makefile
@@ -0,0 +1,7 @@ 
+obj-m += virtual_bus_drv.o
+
+all:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
+
+clean:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
diff --git a/tools/testing/selftests/virtual_bus/virtual_bus_drv/virtual_bus_drv.c b/tools/testing/selftests/virtual_bus/virtual_bus_drv/virtual_bus_drv.c
new file mode 100644
index 000000000000..202288809b1c
--- /dev/null
+++ b/tools/testing/selftests/virtual_bus/virtual_bus_drv/virtual_bus_drv.c
@@ -0,0 +1,101 @@ 
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/module.h>
+#include <linux/virtual_bus.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dave Ertman");
+MODULE_DESCRIPTION("Test to register a driver on virtual bus");
+MODULE_VERSION("1.0");
+
+static int td_probe(struct virtbus_device *vdev)
+{
+	printk(KERN_ERR "VIRTBUS DRIVER PROBED\n");
+	printk(KERN_ERR "DATA IS %s", vdev->data ? "NOT NULL" : "NULL");
+	if (vdev->dev_id) {
+		printk(KERN_ERR "DEV_ID->DATA 0x%08x\n",
+			(uint)vdev->dev_id->driver_data);
+		if (vdev->dev_id->driver_data == 1)
+			printk(KERN_ERR "DATA STRING: %s\n",
+			       (char *)vdev->data);
+	} else {
+		printk(KERN_ERR "DEV_ID->DATA is NULL\n");
+	}
+
+	return 0;
+}
+
+static int td_remove(struct virtbus_device *vdev)
+{
+	printk(KERN_ERR "VIRTBUS DRIVER REMOVED\n");
+	return 0;
+}
+
+static void td_shutdown(struct virtbus_device *vdev)
+{
+	printk(KERN_ERR "VIRTBUS DRIVER SHUTDOWN\n");
+}
+
+static const struct virtbus_dev_id vdev_id_table[] = {
+
+	{
+		.name = "NOT THE NAME",
+		.driver_data = 0x00000000,
+	},
+	{
+		.name = "virtual_bus_dev",
+		.driver_data = 0x00000001,
+	},
+	{
+		.name = "ice_rdma",
+		.driver_data = 0x00000002,
+	},
+	{
+		.name = "YET AGAIN NOT NAME",
+		.driver_data = 0x00000003,
+	},
+};
+
+static struct virtbus_driver vdrv = {
+	.probe = td_probe,
+	.remove = td_remove,
+	.shutdown = td_shutdown,
+	.driver = {
+		.name = "virtual_bus_dev",
+	},
+};
+
+static int __init test_drv_init(void)
+{
+	int ret;
+
+	printk(KERN_INFO "Registering Virtual Bus Test Driver\n");
+
+	/* To do a simple match, leave the id_table as NULL */
+	vdrv.id_table = &vdev_id_table[0];
+
+	printk(KERN_ERR "name of 0 is %s\n", vdrv.id_table->name);
+
+	ret = virtbus_drv_register(&vdrv);
+
+	if (!ret)
+		printk(KERN_INFO "Virtual Driver registered\n");
+	else
+		printk(KERN_INFO "Virtual Driver FAILED!!\n");
+
+	return ret;
+}
+
+static void __exit test_drv_exit(void)
+{
+	printk(KERN_INFO "Exiting Virtual Bus Test Driver");
+
+	virtbus_drv_unregister(&vdrv);
+
+	printk(KERN_INFO "Virtual Bus Test Driver removed\n");
+}
+
+module_init(test_drv_init);
+module_exit(test_drv_exit);