diff mbox series

[3/5] vDPA: introduce vDPA bus

Message ID 20200116124231.20253-4-jasowang@redhat.com (mailing list archive)
State New, archived
Headers show
Series vDPA support | expand

Commit Message

Jason Wang Jan. 16, 2020, 12:42 p.m. UTC
vDPA device is a device that uses a datapath which complies with the
virtio specifications with vendor specific control path. vDPA devices
can be both physically located on the hardware or emulated by
software. vDPA hardware devices are usually implemented through PCIE
with the following types:

- PF (Physical Function) - A single Physical Function
- VF (Virtual Function) - Device that supports single root I/O
  virtualization (SR-IOV). Its Virtual Function (VF) represents a
  virtualized instance of the device that can be assigned to different
  partitions
- VDEV (Virtual Device) - With technologies such as Intel Scalable
  IOV, a virtual device composed by host OS utilizing one or more
  ADIs.
- SF (Sub function) - Vendor specific interface to slice the Physical
  Function to multiple sub functions that can be assigned to different
  partitions as virtual devices.

From a driver's perspective, depends on how and where the DMA
translation is done, vDPA devices are split into two types:

- Platform specific DMA translation - From the driver's perspective,
  the device can be used on a platform where device access to data in
  memory is limited and/or translated. An example is a PCIE vDPA whose
  DMA request was tagged via a bus (e.g PCIE) specific way. DMA
  translation and protection are done at PCIE bus IOMMU level.
- Device specific DMA translation - The device implements DMA
  isolation and protection through its own logic. An example is a vDPA
  device which uses on-chip IOMMU.

To hide the differences and complexity of the above types for a vDPA
device/IOMMU options and in order to present a generic virtio device
to the upper layer, a device agnostic framework is required.

This patch introduces a software vDPA bus which abstracts the
common attributes of vDPA device, vDPA bus driver and the
communication method (vdpa_config_ops) between the vDPA device
abstraction and the vDPA bus driver:

With the abstraction of vDPA bus and vDPA bus operations, the
difference and complexity of the under layer hardware is hidden from
upper layer. The vDPA bus drivers on top can use a unified
vdpa_config_ops to control different types of vDPA device.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 MAINTAINERS                  |   1 +
 drivers/virtio/Kconfig       |   2 +
 drivers/virtio/Makefile      |   1 +
 drivers/virtio/vdpa/Kconfig  |   9 ++
 drivers/virtio/vdpa/Makefile |   2 +
 drivers/virtio/vdpa/vdpa.c   | 141 ++++++++++++++++++++++++++
 include/linux/vdpa.h         | 191 +++++++++++++++++++++++++++++++++++
 7 files changed, 347 insertions(+)
 create mode 100644 drivers/virtio/vdpa/Kconfig
 create mode 100644 drivers/virtio/vdpa/Makefile
 create mode 100644 drivers/virtio/vdpa/vdpa.c
 create mode 100644 include/linux/vdpa.h

Comments

Jason Gunthorpe Jan. 16, 2020, 3:22 p.m. UTC | #1
On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
> vDPA device is a device that uses a datapath which complies with the
> virtio specifications with vendor specific control path. vDPA devices
> can be both physically located on the hardware or emulated by
> software. vDPA hardware devices are usually implemented through PCIE
> with the following types:
> 
> - PF (Physical Function) - A single Physical Function
> - VF (Virtual Function) - Device that supports single root I/O
>   virtualization (SR-IOV). Its Virtual Function (VF) represents a
>   virtualized instance of the device that can be assigned to different
>   partitions

> - VDEV (Virtual Device) - With technologies such as Intel Scalable
>   IOV, a virtual device composed by host OS utilizing one or more
>   ADIs.
> - SF (Sub function) - Vendor specific interface to slice the Physical
>   Function to multiple sub functions that can be assigned to different
>   partitions as virtual devices.

I really hope we don't end up with two different ways to spell this
same thing.

> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_VDPA) += vdpa.o
> diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
> new file mode 100644
> index 000000000000..2b0e4a9f105d
> +++ b/drivers/virtio/vdpa/vdpa.c
> @@ -0,0 +1,141 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * vDPA bus.
> + *
> + * Copyright (c) 2019, Red Hat. All rights reserved.
> + *     Author: Jason Wang <jasowang@redhat.com>

2020 tests days

> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/idr.h>
> +#include <linux/vdpa.h>
> +
> +#define MOD_VERSION  "0.1"

I think module versions are discouraged these days

> +#define MOD_DESC     "vDPA bus"
> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
> +#define MOD_LICENSE  "GPL v2"
> +
> +static DEFINE_IDA(vdpa_index_ida);
> +
> +struct device *vdpa_get_parent(struct vdpa_device *vdpa)
> +{
> +	return vdpa->dev.parent;
> +}
> +EXPORT_SYMBOL(vdpa_get_parent);
> +
> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
> +{
> +	vdpa->dev.parent = parent;
> +}
> +EXPORT_SYMBOL(vdpa_set_parent);
> +
> +struct vdpa_device *dev_to_vdpa(struct device *_dev)
> +{
> +	return container_of(_dev, struct vdpa_device, dev);
> +}
> +EXPORT_SYMBOL_GPL(dev_to_vdpa);
> +
> +struct device *vdpa_to_dev(struct vdpa_device *vdpa)
> +{
> +	return &vdpa->dev;
> +}
> +EXPORT_SYMBOL_GPL(vdpa_to_dev);

Why these trivial assessors? Seems unnecessary, or should at least be
static inlines in a header

> +int register_vdpa_device(struct vdpa_device *vdpa)
> +{

Usually we want to see symbols consistently prefixed with vdpa_*, is
there a reason why register/unregister are swapped?

> +	int err;
> +
> +	if (!vdpa_get_parent(vdpa))
> +		return -EINVAL;
> +
> +	if (!vdpa->config)
> +		return -EINVAL;
> +
> +	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
> +	if (err < 0)
> +		return -EFAULT;
> +
> +	vdpa->dev.bus = &vdpa_bus;
> +	device_initialize(&vdpa->dev);

IMHO device_initialize should not be called inside something called
register, toooften we find out that the caller drivers need the device
to be initialized earlier, ie to use the kref, or something.

I find the best flow is to have some init function that does the
device_initialize and sets the device_name that the driver can call
early.

Shouldn't there be a device/driver matching process of some kind?

Jason
Jason Wang Jan. 17, 2020, 3:03 a.m. UTC | #2
On 2020/1/16 下午11:22, Jason Gunthorpe wrote:
> On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
>> vDPA device is a device that uses a datapath which complies with the
>> virtio specifications with vendor specific control path. vDPA devices
>> can be both physically located on the hardware or emulated by
>> software. vDPA hardware devices are usually implemented through PCIE
>> with the following types:
>>
>> - PF (Physical Function) - A single Physical Function
>> - VF (Virtual Function) - Device that supports single root I/O
>>    virtualization (SR-IOV). Its Virtual Function (VF) represents a
>>    virtualized instance of the device that can be assigned to different
>>    partitions
>> - VDEV (Virtual Device) - With technologies such as Intel Scalable
>>    IOV, a virtual device composed by host OS utilizing one or more
>>    ADIs.
>> - SF (Sub function) - Vendor specific interface to slice the Physical
>>    Function to multiple sub functions that can be assigned to different
>>    partitions as virtual devices.
> I really hope we don't end up with two different ways to spell this
> same thing.


I think you meant ADI vs SF. It looks to me that ADI is limited to the 
scope of scalable IOV but SF not.


>
>> @@ -0,0 +1,2 @@
>> +# SPDX-License-Identifier: GPL-2.0
>> +obj-$(CONFIG_VDPA) += vdpa.o
>> diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
>> new file mode 100644
>> index 000000000000..2b0e4a9f105d
>> +++ b/drivers/virtio/vdpa/vdpa.c
>> @@ -0,0 +1,141 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * vDPA bus.
>> + *
>> + * Copyright (c) 2019, Red Hat. All rights reserved.
>> + *     Author: Jason Wang <jasowang@redhat.com>
> 2020 tests days


Will fix.


>
>> + *
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/idr.h>
>> +#include <linux/vdpa.h>
>> +
>> +#define MOD_VERSION  "0.1"
> I think module versions are discouraged these days


Will remove.


>
>> +#define MOD_DESC     "vDPA bus"
>> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
>> +#define MOD_LICENSE  "GPL v2"
>> +
>> +static DEFINE_IDA(vdpa_index_ida);
>> +
>> +struct device *vdpa_get_parent(struct vdpa_device *vdpa)
>> +{
>> +	return vdpa->dev.parent;
>> +}
>> +EXPORT_SYMBOL(vdpa_get_parent);
>> +
>> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
>> +{
>> +	vdpa->dev.parent = parent;
>> +}
>> +EXPORT_SYMBOL(vdpa_set_parent);
>> +
>> +struct vdpa_device *dev_to_vdpa(struct device *_dev)
>> +{
>> +	return container_of(_dev, struct vdpa_device, dev);
>> +}
>> +EXPORT_SYMBOL_GPL(dev_to_vdpa);
>> +
>> +struct device *vdpa_to_dev(struct vdpa_device *vdpa)
>> +{
>> +	return &vdpa->dev;
>> +}
>> +EXPORT_SYMBOL_GPL(vdpa_to_dev);
> Why these trivial assessors? Seems unnecessary, or should at least be
> static inlines in a header


Will fix.


>
>> +int register_vdpa_device(struct vdpa_device *vdpa)
>> +{
> Usually we want to see symbols consistently prefixed with vdpa_*, is
> there a reason why register/unregister are swapped?


I follow the name from virtio. I will switch to vdpa_*.


>
>> +	int err;
>> +
>> +	if (!vdpa_get_parent(vdpa))
>> +		return -EINVAL;
>> +
>> +	if (!vdpa->config)
>> +		return -EINVAL;
>> +
>> +	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
>> +	if (err < 0)
>> +		return -EFAULT;
>> +
>> +	vdpa->dev.bus = &vdpa_bus;
>> +	device_initialize(&vdpa->dev);
> IMHO device_initialize should not be called inside something called
> register, toooften we find out that the caller drivers need the device
> to be initialized earlier, ie to use the kref, or something.
>
> I find the best flow is to have some init function that does the
> device_initialize and sets the device_name that the driver can call
> early.


Ok, will do.


>
> Shouldn't there be a device/driver matching process of some kind?


The question is what do we want do match here.

1) "virtio" vs "vhost", I implemented matching method for this in mdev 
series, but it looks unnecessary for vDPA device driver to know about 
this. Anyway we can use sysfs driver bind/unbind to switch drivers
2) virtio device id and vendor id. I'm not sure we need this consider 
the two drivers so far (virtio/vhost) are all bus drivers.

Thanks


>
> Jason
>
Randy Dunlap Jan. 17, 2020, 4:16 a.m. UTC | #3
On 1/16/20 4:42 AM, Jason Wang wrote:
> diff --git a/drivers/virtio/vdpa/Kconfig b/drivers/virtio/vdpa/Kconfig
> new file mode 100644
> index 000000000000..3032727b4d98
> --- /dev/null
> +++ b/drivers/virtio/vdpa/Kconfig
> @@ -0,0 +1,9 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config VDPA
> +	tristate
> +        default n
> +        help
> +          Enable this module to support vDPA device that uses a

	                                        devices

> +          datapath which complies with virtio specifications with
> +          vendor specific control path.
> +

Use tab + 2 spaces for Kconfig indentation.
Jason Wang Jan. 17, 2020, 9:34 a.m. UTC | #4
On 2020/1/17 下午12:16, Randy Dunlap wrote:
> On 1/16/20 4:42 AM, Jason Wang wrote:
>> diff --git a/drivers/virtio/vdpa/Kconfig b/drivers/virtio/vdpa/Kconfig
>> new file mode 100644
>> index 000000000000..3032727b4d98
>> --- /dev/null
>> +++ b/drivers/virtio/vdpa/Kconfig
>> @@ -0,0 +1,9 @@
>> +# SPDX-License-Identifier: GPL-2.0-only
>> +config VDPA
>> +	tristate
>> +        default n
>> +        help
>> +          Enable this module to support vDPA device that uses a
> 	                                        devices
>
>> +          datapath which complies with virtio specifications with
>> +          vendor specific control path.
>> +
> Use tab + 2 spaces for Kconfig indentation.


Will fix.

Thanks

>
Michael S. Tsirkin Jan. 17, 2020, 12:13 p.m. UTC | #5
On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
> vDPA device is a device that uses a datapath which complies with the
> virtio specifications with vendor specific control path. vDPA devices
> can be both physically located on the hardware or emulated by
> software. vDPA hardware devices are usually implemented through PCIE
> with the following types:
> 
> - PF (Physical Function) - A single Physical Function
> - VF (Virtual Function) - Device that supports single root I/O
>   virtualization (SR-IOV). Its Virtual Function (VF) represents a
>   virtualized instance of the device that can be assigned to different
>   partitions
> - VDEV (Virtual Device) - With technologies such as Intel Scalable
>   IOV, a virtual device composed by host OS utilizing one or more
>   ADIs.
> - SF (Sub function) - Vendor specific interface to slice the Physical
>   Function to multiple sub functions that can be assigned to different
>   partitions as virtual devices.
> 
> >From a driver's perspective, depends on how and where the DMA
> translation is done, vDPA devices are split into two types:
> 
> - Platform specific DMA translation - From the driver's perspective,
>   the device can be used on a platform where device access to data in
>   memory is limited and/or translated. An example is a PCIE vDPA whose
>   DMA request was tagged via a bus (e.g PCIE) specific way. DMA
>   translation and protection are done at PCIE bus IOMMU level.
> - Device specific DMA translation - The device implements DMA
>   isolation and protection through its own logic. An example is a vDPA
>   device which uses on-chip IOMMU.
> 
> To hide the differences and complexity of the above types for a vDPA
> device/IOMMU options and in order to present a generic virtio device
> to the upper layer, a device agnostic framework is required.
> 
> This patch introduces a software vDPA bus which abstracts the
> common attributes of vDPA device, vDPA bus driver and the
> communication method (vdpa_config_ops) between the vDPA device
> abstraction and the vDPA bus driver:
> 
> With the abstraction of vDPA bus and vDPA bus operations, the
> difference and complexity of the under layer hardware is hidden from
> upper layer. The vDPA bus drivers on top can use a unified
> vdpa_config_ops to control different types of vDPA device.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  MAINTAINERS                  |   1 +
>  drivers/virtio/Kconfig       |   2 +
>  drivers/virtio/Makefile      |   1 +
>  drivers/virtio/vdpa/Kconfig  |   9 ++
>  drivers/virtio/vdpa/Makefile |   2 +
>  drivers/virtio/vdpa/vdpa.c   | 141 ++++++++++++++++++++++++++
>  include/linux/vdpa.h         | 191 +++++++++++++++++++++++++++++++++++
>  7 files changed, 347 insertions(+)
>  create mode 100644 drivers/virtio/vdpa/Kconfig
>  create mode 100644 drivers/virtio/vdpa/Makefile
>  create mode 100644 drivers/virtio/vdpa/vdpa.c
>  create mode 100644 include/linux/vdpa.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index d4bda9c900fa..578d2a581e3b 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -17540,6 +17540,7 @@ F:	tools/virtio/
>  F:	drivers/net/virtio_net.c
>  F:	drivers/block/virtio_blk.c
>  F:	include/linux/virtio*.h
> +F:	include/linux/vdpa.h
>  F:	include/uapi/linux/virtio_*.h
>  F:	drivers/crypto/virtio/
>  F:	mm/balloon_compaction.c
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index 078615cf2afc..9c4fdb64d9ac 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -96,3 +96,5 @@ config VIRTIO_MMIO_CMDLINE_DEVICES
>  	 If unsure, say 'N'.
>  
>  endif # VIRTIO_MENU
> +
> +source "drivers/virtio/vdpa/Kconfig"
> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
> index 3a2b5c5dcf46..fdf5eacd0d0a 100644
> --- a/drivers/virtio/Makefile
> +++ b/drivers/virtio/Makefile
> @@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
>  virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
>  obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
>  obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
> +obj-$(CONFIG_VDPA) += vdpa/
> diff --git a/drivers/virtio/vdpa/Kconfig b/drivers/virtio/vdpa/Kconfig
> new file mode 100644
> index 000000000000..3032727b4d98
> --- /dev/null
> +++ b/drivers/virtio/vdpa/Kconfig
> @@ -0,0 +1,9 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config VDPA
> +	tristate
> +        default n
> +        help
> +          Enable this module to support vDPA device that uses a
> +          datapath which complies with virtio specifications with
> +          vendor specific control path.
> +
> diff --git a/drivers/virtio/vdpa/Makefile b/drivers/virtio/vdpa/Makefile
> new file mode 100644
> index 000000000000..ee6a35e8a4fb
> --- /dev/null
> +++ b/drivers/virtio/vdpa/Makefile
> @@ -0,0 +1,2 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_VDPA) += vdpa.o
> diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
> new file mode 100644
> index 000000000000..2b0e4a9f105d
> --- /dev/null
> +++ b/drivers/virtio/vdpa/vdpa.c
> @@ -0,0 +1,141 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * vDPA bus.
> + *
> + * Copyright (c) 2019, Red Hat. All rights reserved.
> + *     Author: Jason Wang <jasowang@redhat.com>
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/idr.h>
> +#include <linux/vdpa.h>
> +
> +#define MOD_VERSION  "0.1"
> +#define MOD_DESC     "vDPA bus"
> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
> +#define MOD_LICENSE  "GPL v2"
> +
> +static DEFINE_IDA(vdpa_index_ida);
> +
> +struct device *vdpa_get_parent(struct vdpa_device *vdpa)
> +{
> +	return vdpa->dev.parent;
> +}
> +EXPORT_SYMBOL(vdpa_get_parent);
> +
> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
> +{
> +	vdpa->dev.parent = parent;
> +}
> +EXPORT_SYMBOL(vdpa_set_parent);
> +
> +struct vdpa_device *dev_to_vdpa(struct device *_dev)
> +{
> +	return container_of(_dev, struct vdpa_device, dev);
> +}
> +EXPORT_SYMBOL_GPL(dev_to_vdpa);
> +
> +struct device *vdpa_to_dev(struct vdpa_device *vdpa)
> +{
> +	return &vdpa->dev;
> +}
> +EXPORT_SYMBOL_GPL(vdpa_to_dev);
> +
> +static int vdpa_dev_probe(struct device *d)
> +{
> +	struct vdpa_device *dev = dev_to_vdpa(d);
> +	struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
> +	int ret = 0;
> +
> +	if (drv && drv->probe)
> +		ret = drv->probe(d);
> +
> +	return ret;
> +}
> +
> +static int vdpa_dev_remove(struct device *d)
> +{
> +	struct vdpa_device *dev = dev_to_vdpa(d);
> +	struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
> +
> +	if (drv && drv->remove)
> +		drv->remove(d);
> +
> +	return 0;
> +}
> +
> +static struct bus_type vdpa_bus = {
> +	.name  = "vdpa",
> +	.probe = vdpa_dev_probe,
> +	.remove = vdpa_dev_remove,
> +};
> +
> +int register_vdpa_device(struct vdpa_device *vdpa)
> +{
> +	int err;
> +
> +	if (!vdpa_get_parent(vdpa))
> +		return -EINVAL;
> +
> +	if (!vdpa->config)
> +		return -EINVAL;
> +
> +	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
> +	if (err < 0)
> +		return -EFAULT;
> +
> +	vdpa->dev.bus = &vdpa_bus;
> +	device_initialize(&vdpa->dev);
> +
> +	vdpa->index = err;
> +	dev_set_name(&vdpa->dev, "vdpa%u", vdpa->index);
> +
> +	err = device_add(&vdpa->dev);
> +	if (err)
> +		ida_simple_remove(&vdpa_index_ida, vdpa->index);
> +
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(register_vdpa_device);
> +
> +void unregister_vdpa_device(struct vdpa_device *vdpa)
> +{
> +	int index = vdpa->index;
> +
> +	device_unregister(&vdpa->dev);
> +	ida_simple_remove(&vdpa_index_ida, index);
> +}
> +EXPORT_SYMBOL_GPL(unregister_vdpa_device);
> +
> +int register_vdpa_driver(struct vdpa_driver *driver)
> +{
> +	driver->drv.bus = &vdpa_bus;
> +	return driver_register(&driver->drv);
> +}
> +EXPORT_SYMBOL_GPL(register_vdpa_driver);
> +
> +void unregister_vdpa_driver(struct vdpa_driver *driver)
> +{
> +	driver_unregister(&driver->drv);
> +}
> +EXPORT_SYMBOL_GPL(unregister_vdpa_driver);
> +
> +static int vdpa_init(void)
> +{
> +	if (bus_register(&vdpa_bus) != 0)
> +		panic("virtio bus registration failed");
> +	return 0;
> +}
> +
> +static void __exit vdpa_exit(void)
> +{
> +	bus_unregister(&vdpa_bus);
> +	ida_destroy(&vdpa_index_ida);
> +}
> +core_initcall(vdpa_init);
> +module_exit(vdpa_exit);
> +
> +MODULE_VERSION(MOD_VERSION);
> +MODULE_AUTHOR(MOD_AUTHOR);
> +MODULE_LICENSE(MOD_LICENSE);
> diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
> new file mode 100644
> index 000000000000..47760137ef66
> --- /dev/null
> +++ b/include/linux/vdpa.h
> @@ -0,0 +1,191 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_VDPA_H
> +#define _LINUX_VDPA_H
> +
> +#include <linux/device.h>
> +#include <linux/interrupt.h>
> +#include <linux/vhost_iotlb.h>
> +
> +/**
> + * vDPA callback definition.
> + * @callback: interrupt callback function
> + * @private: the data passed to the callback function
> + */
> +struct vdpa_callback {
> +	irqreturn_t (*callback)(void *data);
> +	void *private;
> +};
> +
> +/**
> + * vDPA device - representation of a vDPA device
> + * @dev: underlying device
> + * @config: the configuration ops for this device.
> + * @index: device index
> + */
> +struct vdpa_device {
> +	struct device dev;
> +	const struct vdpa_config_ops *config;
> +	int index;
> +};
> +
> +/**
> + * vDPA_config_ops - operations for configuring a vDPA device.
> + * Note: vDPA device drivers are required to implement all of the
> + * operations unless it is optional mentioned in the following list.
> + * @set_vq_address:		Set the address of virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index
> + *				@desc_area: address of desc area
> + *				@driver_area: address of driver area
> + *				@device_area: address of device area
> + *				Returns integer: success (0) or error (< 0)
> + * @set_vq_num:			Set the size of virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index
> + *				@num: the size of virtqueue
> + * @kick_vq:			Kick the virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index


This seems wrong: kicks are data path so drivers should not
do it in a vendor specific way. How about an API
returning the device/resource that can then be
mapped as appropriate?


> + * @set_vq_cb:			Set the interrupt callback function for
> + *				a virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index
> + *				@cb: virtio-vdev interrupt callback structure


Calls are data path too, I think we need some way to map MSI?

> + * @set_vq_ready:		Set ready status for a virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index
> + *				@ready: ready (true) not ready(false)
> + * @get_vq_ready:		Get ready status for a virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index
> + *				Returns boolean: ready (true) or not (false)
> + * @set_vq_state:		Set the state for a virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index
> + *				@state: virtqueue state (last_avail_idx)
> + *				Returns integer: success (0) or error (< 0)
> + * @get_vq_state:		Get the state for a virtqueue
> + *				@vdev: vdpa device
> + *				@idx: virtqueue index
> + *				Returns virtqueue state (last_avail_idx)
> + * @get_vq_align:		Get the virtqueue align requirement
> + *				for the device
> + *				@vdev: vdpa device
> + *				Returns virtqueue algin requirement


Where does this come from? Spec dictates that for a data path,
vendor specific values for this will break userspace ...

> + * @get_features:		Get virtio features supported by the device
> + *				@vdev: vdpa device
> + *				Returns the virtio features support by the
> + *				device
> + * @set_features:		Set virtio features supported by the driver
> + *				@vdev: vdpa device
> + *				@features: feature support by the driver
> + *				Returns integer: success (0) or error (< 0)
> + * @set_config_cb:		Set the config interrupt callback
> + *				@vdev: vdpa device
> + *				@cb: virtio-vdev interrupt callback structure
> + * @get_vq_num_max:		Get the max size of virtqueue
> + *				@vdev: vdpa device
> + *				Returns u16: max size of virtqueue


I'm not sure this has to be uniform across VQs.

> + * @get_device_id:		Get virtio device id
> + *				@vdev: vdpa device
> + *				Returns u32: virtio device id


is this the virtio ID? PCI ID?

> + * @get_vendor_id:		Get id for the vendor that provides this device
> + *				@vdev: vdpa device
> + *				Returns u32: virtio vendor id

what's the idea behind this? userspace normally doesn't interact with
this ... debugging?

> + * @get_status:			Get the device status
> + *				@vdev: vdpa device
> + *				Returns u8: virtio device status
> + * @set_status:			Set the device status
> + *				@vdev: vdpa device
> + *				@status: virtio device status
> + * @get_config:			Read from device specific configuration space
> + *				@vdev: vdpa device
> + *				@offset: offset from the beginning of
> + *				configuration space
> + *				@buf: buffer used to read to
> + *				@len: the length to read from
> + *				configuration space
> + * @set_config:			Write to device specific configuration space
> + *				@vdev: vdpa device
> + *				@offset: offset from the beginning of
> + *				configuration space
> + *				@buf: buffer used to write from
> + *				@len: the length to write to
> + *				configuration space
> + * @get_generation:		Get device config generation (optional)
> + *				@vdev: vdpa device
> + *				Returns u32: device generation
> + * @set_map:			Set device memory mapping, optional
> + *				and only needed for device that using
> + *				device specific DMA translation
> + *				(on-chip IOMMU)
> + *				@vdev: vdpa device
> + *				@iotlb: vhost memory mapping to be
> + *				used by the vDPA
> + *				Returns integer: success (0) or error (< 0)

OK so any change just swaps in a completely new mapping?
Wouldn't this make minor changes such as memory hotplug
quite expensive?

> + */
> +struct vdpa_config_ops {
> +	/* Virtqueue ops */
> +	int (*set_vq_address)(struct vdpa_device *vdev,
> +			      u16 idx, u64 desc_area, u64 driver_area,
> +			      u64 device_area);
> +	void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num);
> +	void (*kick_vq)(struct vdpa_device *vdev, u16 idx);
> +	void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx,
> +			  struct vdpa_callback *cb);
> +	void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready);
> +	bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx);
> +	int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state);
> +	u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx);
> +
> +	/* Device ops */
> +	u16 (*get_vq_align)(struct vdpa_device *vdev);
> +	u64 (*get_features)(struct vdpa_device *vdev);
> +	int (*set_features)(struct vdpa_device *vdev, u64 features);
> +	void (*set_config_cb)(struct vdpa_device *vdev,
> +			      struct vdpa_callback *cb);
> +	u16 (*get_vq_num_max)(struct vdpa_device *vdev);
> +	u32 (*get_device_id)(struct vdpa_device *vdev);
> +	u32 (*get_vendor_id)(struct vdpa_device *vdev);
> +	u8 (*get_status)(struct vdpa_device *vdev);
> +	void (*set_status)(struct vdpa_device *vdev, u8 status);
> +	void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
> +			   void *buf, unsigned int len);
> +	void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
> +			   const void *buf, unsigned int len);
> +	u32 (*get_generation)(struct vdpa_device *vdev);
> +
> +	/* Mem table */
> +	int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
> +};
> +
> +int register_vdpa_device(struct vdpa_device *vdpa);
> +void unregister_vdpa_device(struct vdpa_device *vdpa);
> +
> +struct device *vdpa_get_parent(struct vdpa_device *vdpa);
> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent);
> +
> +struct vdpa_device *dev_to_vdpa(struct device *_dev);
> +struct device *vdpa_to_dev(struct vdpa_device *vdpa);
> +
> +/**
> + * vdpa_driver - operations for a vDPA driver
> + * @driver: underlying device driver
> + * @probe: the function to call when a device is found.  Returns 0 or -errno.
> + * @remove: the function to call when a device is removed.
> + */
> +struct vdpa_driver {
> +	struct device_driver drv;
> +	int (*probe)(struct device *dev);
> +	void (*remove)(struct device *dev);
> +};
> +
> +int register_vdpa_driver(struct vdpa_driver *drv);
> +void unregister_vdpa_driver(struct vdpa_driver *drv);
> +
> +static inline struct vdpa_driver *drv_to_vdpa(struct device_driver *drv)
> +{
> +	return container_of(drv, struct vdpa_driver, drv);
> +}
> +
> +#endif /* _LINUX_VDPA_H */
> -- 
> 2.19.1
Jason Wang Jan. 17, 2020, 1:52 p.m. UTC | #6
On 2020/1/17 下午8:13, Michael S. Tsirkin wrote:
> On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
>> vDPA device is a device that uses a datapath which complies with the
>> virtio specifications with vendor specific control path. vDPA devices
>> can be both physically located on the hardware or emulated by
>> software. vDPA hardware devices are usually implemented through PCIE
>> with the following types:
>>
>> - PF (Physical Function) - A single Physical Function
>> - VF (Virtual Function) - Device that supports single root I/O
>>    virtualization (SR-IOV). Its Virtual Function (VF) represents a
>>    virtualized instance of the device that can be assigned to different
>>    partitions
>> - VDEV (Virtual Device) - With technologies such as Intel Scalable
>>    IOV, a virtual device composed by host OS utilizing one or more
>>    ADIs.
>> - SF (Sub function) - Vendor specific interface to slice the Physical
>>    Function to multiple sub functions that can be assigned to different
>>    partitions as virtual devices.
>>
>> >From a driver's perspective, depends on how and where the DMA
>> translation is done, vDPA devices are split into two types:
>>
>> - Platform specific DMA translation - From the driver's perspective,
>>    the device can be used on a platform where device access to data in
>>    memory is limited and/or translated. An example is a PCIE vDPA whose
>>    DMA request was tagged via a bus (e.g PCIE) specific way. DMA
>>    translation and protection are done at PCIE bus IOMMU level.
>> - Device specific DMA translation - The device implements DMA
>>    isolation and protection through its own logic. An example is a vDPA
>>    device which uses on-chip IOMMU.
>>
>> To hide the differences and complexity of the above types for a vDPA
>> device/IOMMU options and in order to present a generic virtio device
>> to the upper layer, a device agnostic framework is required.
>>
>> This patch introduces a software vDPA bus which abstracts the
>> common attributes of vDPA device, vDPA bus driver and the
>> communication method (vdpa_config_ops) between the vDPA device
>> abstraction and the vDPA bus driver:
>>
>> With the abstraction of vDPA bus and vDPA bus operations, the
>> difference and complexity of the under layer hardware is hidden from
>> upper layer. The vDPA bus drivers on top can use a unified
>> vdpa_config_ops to control different types of vDPA device.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   MAINTAINERS                  |   1 +
>>   drivers/virtio/Kconfig       |   2 +
>>   drivers/virtio/Makefile      |   1 +
>>   drivers/virtio/vdpa/Kconfig  |   9 ++
>>   drivers/virtio/vdpa/Makefile |   2 +
>>   drivers/virtio/vdpa/vdpa.c   | 141 ++++++++++++++++++++++++++
>>   include/linux/vdpa.h         | 191 +++++++++++++++++++++++++++++++++++
>>   7 files changed, 347 insertions(+)
>>   create mode 100644 drivers/virtio/vdpa/Kconfig
>>   create mode 100644 drivers/virtio/vdpa/Makefile
>>   create mode 100644 drivers/virtio/vdpa/vdpa.c
>>   create mode 100644 include/linux/vdpa.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index d4bda9c900fa..578d2a581e3b 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -17540,6 +17540,7 @@ F:	tools/virtio/
>>   F:	drivers/net/virtio_net.c
>>   F:	drivers/block/virtio_blk.c
>>   F:	include/linux/virtio*.h
>> +F:	include/linux/vdpa.h
>>   F:	include/uapi/linux/virtio_*.h
>>   F:	drivers/crypto/virtio/
>>   F:	mm/balloon_compaction.c
>> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
>> index 078615cf2afc..9c4fdb64d9ac 100644
>> --- a/drivers/virtio/Kconfig
>> +++ b/drivers/virtio/Kconfig
>> @@ -96,3 +96,5 @@ config VIRTIO_MMIO_CMDLINE_DEVICES
>>   	 If unsure, say 'N'.
>>   
>>   endif # VIRTIO_MENU
>> +
>> +source "drivers/virtio/vdpa/Kconfig"
>> diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
>> index 3a2b5c5dcf46..fdf5eacd0d0a 100644
>> --- a/drivers/virtio/Makefile
>> +++ b/drivers/virtio/Makefile
>> @@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
>>   virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
>>   obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
>>   obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
>> +obj-$(CONFIG_VDPA) += vdpa/
>> diff --git a/drivers/virtio/vdpa/Kconfig b/drivers/virtio/vdpa/Kconfig
>> new file mode 100644
>> index 000000000000..3032727b4d98
>> --- /dev/null
>> +++ b/drivers/virtio/vdpa/Kconfig
>> @@ -0,0 +1,9 @@
>> +# SPDX-License-Identifier: GPL-2.0-only
>> +config VDPA
>> +	tristate
>> +        default n
>> +        help
>> +          Enable this module to support vDPA device that uses a
>> +          datapath which complies with virtio specifications with
>> +          vendor specific control path.
>> +
>> diff --git a/drivers/virtio/vdpa/Makefile b/drivers/virtio/vdpa/Makefile
>> new file mode 100644
>> index 000000000000..ee6a35e8a4fb
>> --- /dev/null
>> +++ b/drivers/virtio/vdpa/Makefile
>> @@ -0,0 +1,2 @@
>> +# SPDX-License-Identifier: GPL-2.0
>> +obj-$(CONFIG_VDPA) += vdpa.o
>> diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
>> new file mode 100644
>> index 000000000000..2b0e4a9f105d
>> --- /dev/null
>> +++ b/drivers/virtio/vdpa/vdpa.c
>> @@ -0,0 +1,141 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * vDPA bus.
>> + *
>> + * Copyright (c) 2019, Red Hat. All rights reserved.
>> + *     Author: Jason Wang <jasowang@redhat.com>
>> + *
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/idr.h>
>> +#include <linux/vdpa.h>
>> +
>> +#define MOD_VERSION  "0.1"
>> +#define MOD_DESC     "vDPA bus"
>> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
>> +#define MOD_LICENSE  "GPL v2"
>> +
>> +static DEFINE_IDA(vdpa_index_ida);
>> +
>> +struct device *vdpa_get_parent(struct vdpa_device *vdpa)
>> +{
>> +	return vdpa->dev.parent;
>> +}
>> +EXPORT_SYMBOL(vdpa_get_parent);
>> +
>> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
>> +{
>> +	vdpa->dev.parent = parent;
>> +}
>> +EXPORT_SYMBOL(vdpa_set_parent);
>> +
>> +struct vdpa_device *dev_to_vdpa(struct device *_dev)
>> +{
>> +	return container_of(_dev, struct vdpa_device, dev);
>> +}
>> +EXPORT_SYMBOL_GPL(dev_to_vdpa);
>> +
>> +struct device *vdpa_to_dev(struct vdpa_device *vdpa)
>> +{
>> +	return &vdpa->dev;
>> +}
>> +EXPORT_SYMBOL_GPL(vdpa_to_dev);
>> +
>> +static int vdpa_dev_probe(struct device *d)
>> +{
>> +	struct vdpa_device *dev = dev_to_vdpa(d);
>> +	struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
>> +	int ret = 0;
>> +
>> +	if (drv && drv->probe)
>> +		ret = drv->probe(d);
>> +
>> +	return ret;
>> +}
>> +
>> +static int vdpa_dev_remove(struct device *d)
>> +{
>> +	struct vdpa_device *dev = dev_to_vdpa(d);
>> +	struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
>> +
>> +	if (drv && drv->remove)
>> +		drv->remove(d);
>> +
>> +	return 0;
>> +}
>> +
>> +static struct bus_type vdpa_bus = {
>> +	.name  = "vdpa",
>> +	.probe = vdpa_dev_probe,
>> +	.remove = vdpa_dev_remove,
>> +};
>> +
>> +int register_vdpa_device(struct vdpa_device *vdpa)
>> +{
>> +	int err;
>> +
>> +	if (!vdpa_get_parent(vdpa))
>> +		return -EINVAL;
>> +
>> +	if (!vdpa->config)
>> +		return -EINVAL;
>> +
>> +	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
>> +	if (err < 0)
>> +		return -EFAULT;
>> +
>> +	vdpa->dev.bus = &vdpa_bus;
>> +	device_initialize(&vdpa->dev);
>> +
>> +	vdpa->index = err;
>> +	dev_set_name(&vdpa->dev, "vdpa%u", vdpa->index);
>> +
>> +	err = device_add(&vdpa->dev);
>> +	if (err)
>> +		ida_simple_remove(&vdpa_index_ida, vdpa->index);
>> +
>> +	return err;
>> +}
>> +EXPORT_SYMBOL_GPL(register_vdpa_device);
>> +
>> +void unregister_vdpa_device(struct vdpa_device *vdpa)
>> +{
>> +	int index = vdpa->index;
>> +
>> +	device_unregister(&vdpa->dev);
>> +	ida_simple_remove(&vdpa_index_ida, index);
>> +}
>> +EXPORT_SYMBOL_GPL(unregister_vdpa_device);
>> +
>> +int register_vdpa_driver(struct vdpa_driver *driver)
>> +{
>> +	driver->drv.bus = &vdpa_bus;
>> +	return driver_register(&driver->drv);
>> +}
>> +EXPORT_SYMBOL_GPL(register_vdpa_driver);
>> +
>> +void unregister_vdpa_driver(struct vdpa_driver *driver)
>> +{
>> +	driver_unregister(&driver->drv);
>> +}
>> +EXPORT_SYMBOL_GPL(unregister_vdpa_driver);
>> +
>> +static int vdpa_init(void)
>> +{
>> +	if (bus_register(&vdpa_bus) != 0)
>> +		panic("virtio bus registration failed");
>> +	return 0;
>> +}
>> +
>> +static void __exit vdpa_exit(void)
>> +{
>> +	bus_unregister(&vdpa_bus);
>> +	ida_destroy(&vdpa_index_ida);
>> +}
>> +core_initcall(vdpa_init);
>> +module_exit(vdpa_exit);
>> +
>> +MODULE_VERSION(MOD_VERSION);
>> +MODULE_AUTHOR(MOD_AUTHOR);
>> +MODULE_LICENSE(MOD_LICENSE);
>> diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
>> new file mode 100644
>> index 000000000000..47760137ef66
>> --- /dev/null
>> +++ b/include/linux/vdpa.h
>> @@ -0,0 +1,191 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _LINUX_VDPA_H
>> +#define _LINUX_VDPA_H
>> +
>> +#include <linux/device.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/vhost_iotlb.h>
>> +
>> +/**
>> + * vDPA callback definition.
>> + * @callback: interrupt callback function
>> + * @private: the data passed to the callback function
>> + */
>> +struct vdpa_callback {
>> +	irqreturn_t (*callback)(void *data);
>> +	void *private;
>> +};
>> +
>> +/**
>> + * vDPA device - representation of a vDPA device
>> + * @dev: underlying device
>> + * @config: the configuration ops for this device.
>> + * @index: device index
>> + */
>> +struct vdpa_device {
>> +	struct device dev;
>> +	const struct vdpa_config_ops *config;
>> +	int index;
>> +};
>> +
>> +/**
>> + * vDPA_config_ops - operations for configuring a vDPA device.
>> + * Note: vDPA device drivers are required to implement all of the
>> + * operations unless it is optional mentioned in the following list.
>> + * @set_vq_address:		Set the address of virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>> + *				@desc_area: address of desc area
>> + *				@driver_area: address of driver area
>> + *				@device_area: address of device area
>> + *				Returns integer: success (0) or error (< 0)
>> + * @set_vq_num:			Set the size of virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>> + *				@num: the size of virtqueue
>> + * @kick_vq:			Kick the virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>
> This seems wrong: kicks are data path so drivers should not
> do it in a vendor specific way.


I'm not sure I get this since the doorbell is pretty vendor specific.

The idea here is to start form simple and common cases that can work for 
both kernel virtio drivers and vhost:

- For kernel, kick_vq() is called from vq->notify() directly
- For vhost, vhost is in charge of hook eventfd to kick_vq()


>   How about an API
> returning the device/resource that can then be
> mapped as appropriate?
>

Yes, this could be a further optimization on top but not a must (only 
work for e.g the doorbell does not share MMIO space with other 
functions). For vhost we need something like this and need to hook it to 
mmap() of vhost file descriptor.


>> + * @set_vq_cb:			Set the interrupt callback function for
>> + *				a virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>> + *				@cb: virtio-vdev interrupt callback structure
>
> Calls are data path too, I think we need some way to map MSI?


Similarly, this could be a optimization on top, and we can start from 
simple and common cases:

- For kernel, the vq callback could be mapped to MSI interrupt handler 
directly
- For vhost, eventfd wakeup could be hook in the cb here


>
>> + * @set_vq_ready:		Set ready status for a virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>> + *				@ready: ready (true) not ready(false)
>> + * @get_vq_ready:		Get ready status for a virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>> + *				Returns boolean: ready (true) or not (false)
>> + * @set_vq_state:		Set the state for a virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>> + *				@state: virtqueue state (last_avail_idx)
>> + *				Returns integer: success (0) or error (< 0)
>> + * @get_vq_state:		Get the state for a virtqueue
>> + *				@vdev: vdpa device
>> + *				@idx: virtqueue index
>> + *				Returns virtqueue state (last_avail_idx)
>> + * @get_vq_align:		Get the virtqueue align requirement
>> + *				for the device
>> + *				@vdev: vdpa device
>> + *				Returns virtqueue algin requirement
>
> Where does this come from? Spec dictates that for a data path,
> vendor specific values for this will break userspace ...


It comes from the align parameter of vring_create_virtqueue(). We can 
expose the alignment to userspace if necessary. If it's not necessary, I 
can drop this method here.


>
>> + * @get_features:		Get virtio features supported by the device
>> + *				@vdev: vdpa device
>> + *				Returns the virtio features support by the
>> + *				device
>> + * @set_features:		Set virtio features supported by the driver
>> + *				@vdev: vdpa device
>> + *				@features: feature support by the driver
>> + *				Returns integer: success (0) or error (< 0)
>> + * @set_config_cb:		Set the config interrupt callback
>> + *				@vdev: vdpa device
>> + *				@cb: virtio-vdev interrupt callback structure
>> + * @get_vq_num_max:		Get the max size of virtqueue
>> + *				@vdev: vdpa device
>> + *				Returns u16: max size of virtqueue
>
> I'm not sure this has to be uniform across VQs.


Let me add an index parameter to this.


>
>> + * @get_device_id:		Get virtio device id
>> + *				@vdev: vdpa device
>> + *				Returns u32: virtio device id
>
> is this the virtio ID? PCI ID?


Virtio ID


>
>> + * @get_vendor_id:		Get id for the vendor that provides this device
>> + *				@vdev: vdpa device
>> + *				Returns u32: virtio vendor id
> what's the idea behind this? userspace normally doesn't interact with
> this ... debugging?


This allows some vendor specific driver on top of vDPA bus. If this is 
not interested, I can drop this.


>
>> + * @get_status:			Get the device status
>> + *				@vdev: vdpa device
>> + *				Returns u8: virtio device status
>> + * @set_status:			Set the device status
>> + *				@vdev: vdpa device
>> + *				@status: virtio device status
>> + * @get_config:			Read from device specific configuration space
>> + *				@vdev: vdpa device
>> + *				@offset: offset from the beginning of
>> + *				configuration space
>> + *				@buf: buffer used to read to
>> + *				@len: the length to read from
>> + *				configuration space
>> + * @set_config:			Write to device specific configuration space
>> + *				@vdev: vdpa device
>> + *				@offset: offset from the beginning of
>> + *				configuration space
>> + *				@buf: buffer used to write from
>> + *				@len: the length to write to
>> + *				configuration space
>> + * @get_generation:		Get device config generation (optional)
>> + *				@vdev: vdpa device
>> + *				Returns u32: device generation
>> + * @set_map:			Set device memory mapping, optional
>> + *				and only needed for device that using
>> + *				device specific DMA translation
>> + *				(on-chip IOMMU)
>> + *				@vdev: vdpa device
>> + *				@iotlb: vhost memory mapping to be
>> + *				used by the vDPA
>> + *				Returns integer: success (0) or error (< 0)
> OK so any change just swaps in a completely new mapping?
> Wouldn't this make minor changes such as memory hotplug
> quite expensive?


My understanding is that the incremental updating of the on chip IOMMU 
may degrade the  performance. So vendor vDPA drivers may want to know 
all the mappings at once. Technically, we can keep the incremental API 
here and let the vendor vDPA drivers to record the full mapping 
internally which may slightly increase the complexity of vendor driver. 
We need more inputs from vendors here.

Thanks


>
>> + */
>> +struct vdpa_config_ops {
>> +	/* Virtqueue ops */
>> +	int (*set_vq_address)(struct vdpa_device *vdev,
>> +			      u16 idx, u64 desc_area, u64 driver_area,
>> +			      u64 device_area);
>> +	void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num);
>> +	void (*kick_vq)(struct vdpa_device *vdev, u16 idx);
>> +	void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx,
>> +			  struct vdpa_callback *cb);
>> +	void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready);
>> +	bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx);
>> +	int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state);
>> +	u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx);
>> +
>> +	/* Device ops */
>> +	u16 (*get_vq_align)(struct vdpa_device *vdev);
>> +	u64 (*get_features)(struct vdpa_device *vdev);
>> +	int (*set_features)(struct vdpa_device *vdev, u64 features);
>> +	void (*set_config_cb)(struct vdpa_device *vdev,
>> +			      struct vdpa_callback *cb);
>> +	u16 (*get_vq_num_max)(struct vdpa_device *vdev);
>> +	u32 (*get_device_id)(struct vdpa_device *vdev);
>> +	u32 (*get_vendor_id)(struct vdpa_device *vdev);
>> +	u8 (*get_status)(struct vdpa_device *vdev);
>> +	void (*set_status)(struct vdpa_device *vdev, u8 status);
>> +	void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
>> +			   void *buf, unsigned int len);
>> +	void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
>> +			   const void *buf, unsigned int len);
>> +	u32 (*get_generation)(struct vdpa_device *vdev);
>> +
>> +	/* Mem table */
>> +	int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
>> +};
>> +
>> +int register_vdpa_device(struct vdpa_device *vdpa);
>> +void unregister_vdpa_device(struct vdpa_device *vdpa);
>> +
>> +struct device *vdpa_get_parent(struct vdpa_device *vdpa);
>> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent);
>> +
>> +struct vdpa_device *dev_to_vdpa(struct device *_dev);
>> +struct device *vdpa_to_dev(struct vdpa_device *vdpa);
>> +
>> +/**
>> + * vdpa_driver - operations for a vDPA driver
>> + * @driver: underlying device driver
>> + * @probe: the function to call when a device is found.  Returns 0 or -errno.
>> + * @remove: the function to call when a device is removed.
>> + */
>> +struct vdpa_driver {
>> +	struct device_driver drv;
>> +	int (*probe)(struct device *dev);
>> +	void (*remove)(struct device *dev);
>> +};
>> +
>> +int register_vdpa_driver(struct vdpa_driver *drv);
>> +void unregister_vdpa_driver(struct vdpa_driver *drv);
>> +
>> +static inline struct vdpa_driver *drv_to_vdpa(struct device_driver *drv)
>> +{
>> +	return container_of(drv, struct vdpa_driver, drv);
>> +}
>> +
>> +#endif /* _LINUX_VDPA_H */
>> -- 
>> 2.19.1
Jason Gunthorpe Jan. 17, 2020, 1:54 p.m. UTC | #7
On Fri, Jan 17, 2020 at 11:03:12AM +0800, Jason Wang wrote:
> 
> On 2020/1/16 下午11:22, Jason Gunthorpe wrote:
> > On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
> > > vDPA device is a device that uses a datapath which complies with the
> > > virtio specifications with vendor specific control path. vDPA devices
> > > can be both physically located on the hardware or emulated by
> > > software. vDPA hardware devices are usually implemented through PCIE
> > > with the following types:
> > > 
> > > - PF (Physical Function) - A single Physical Function
> > > - VF (Virtual Function) - Device that supports single root I/O
> > >    virtualization (SR-IOV). Its Virtual Function (VF) represents a
> > >    virtualized instance of the device that can be assigned to different
> > >    partitions
> > > - VDEV (Virtual Device) - With technologies such as Intel Scalable
> > >    IOV, a virtual device composed by host OS utilizing one or more
> > >    ADIs.
> > > - SF (Sub function) - Vendor specific interface to slice the Physical
> > >    Function to multiple sub functions that can be assigned to different
> > >    partitions as virtual devices.
> > I really hope we don't end up with two different ways to spell this
> > same thing.
> 
> I think you meant ADI vs SF. It looks to me that ADI is limited to the scope
> of scalable IOV but SF not.

I think if one looks carefully you'd find that SF and ADI are using
very similar techiniques. For instance we'd also like to use the code
reorg of the MSIX vector setup with SFs that Intel is calling IMS.

Really SIOV is simply a bundle of pre-existing stuff under a tidy
name, whatever code skeleton we come up with for SFs should be re-used
for ADI.

> > Shouldn't there be a device/driver matching process of some kind?
> 
> 
> The question is what do we want do match here.
> 
> 1) "virtio" vs "vhost", I implemented matching method for this in mdev
> series, but it looks unnecessary for vDPA device driver to know about this.
> Anyway we can use sysfs driver bind/unbind to switch drivers
> 2) virtio device id and vendor id. I'm not sure we need this consider the
> two drivers so far (virtio/vhost) are all bus drivers.

As we seem to be contemplating some dynamic creation of vdpa devices I
think upon creation time it should be specified what mode they should
run it and then all driver binding and autoloading should happen
automatically. Telling the user to bind/unbind is a very poor
experience.

Jason
Shahaf Shuler Jan. 19, 2020, 9:07 a.m. UTC | #8
Friday, January 17, 2020 4:13 PM, Rob Miller:
Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
>>On 2020/1/17 下午8:13, Michael S. Tsirkin wrote:
>>> On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:

[...]

>>> + * @set_map:                        Set device memory mapping, optional
>>> + *                          and only needed for device that using
>>> + *                          device specific DMA translation
>>> + *                          (on-chip IOMMU)
>>> + *                          @vdev: vdpa device
>>> + *                          @iotlb: vhost memory mapping to be
>>> + *                          used by the vDPA
>>> + *                          Returns integer: success (0) or error (< 0)
>> OK so any change just swaps in a completely new mapping?
>> Wouldn't this make minor changes such as memory hotplug
>> quite expensive?

What is the concern? Traversing the rb tree or fully replace the on-chip IOMMU translations? 
If the latest, then I think we can take such optimization on the driver level (i.e. to update only the diff between the two mapping). 
If the first one, then I think memory hotplug is a heavy flow regardless. Do you think the extra cycles for the tree traverse will be visible in any way? 

>
>My understanding is that the incremental updating of the on chip IOMMU 
>may degrade the  performance. So vendor vDPA drivers may want to know 
>all the mappings at once. 

Yes exact. For Mellanox case for instance many optimization can be performed on a given memory layout.

>Technically, we can keep the incremental API 
>here and let the vendor vDPA drivers to record the full mapping 
>internally which may slightly increase the complexity of vendor driver. 

What will be the trigger for the driver to know it received the last mapping on this series and it can now push it to the on-chip IOMMU?

>We need more inputs from vendors here.
>
>Thanks
Michael S. Tsirkin Jan. 19, 2020, 9:59 a.m. UTC | #9
On Sun, Jan 19, 2020 at 09:07:09AM +0000, Shahaf Shuler wrote:
> >Technically, we can keep the incremental API 
> >here and let the vendor vDPA drivers to record the full mapping 
> >internally which may slightly increase the complexity of vendor driver. 
> 
> What will be the trigger for the driver to know it received the last mapping on this series and it can now push it to the on-chip IOMMU?

Some kind of invalidate API?
Jason Wang Jan. 20, 2020, 7:50 a.m. UTC | #10
On 2020/1/17 下午9:54, Jason Gunthorpe wrote:
> On Fri, Jan 17, 2020 at 11:03:12AM +0800, Jason Wang wrote:
>> On 2020/1/16 下午11:22, Jason Gunthorpe wrote:
>>> On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
>>>> vDPA device is a device that uses a datapath which complies with the
>>>> virtio specifications with vendor specific control path. vDPA devices
>>>> can be both physically located on the hardware or emulated by
>>>> software. vDPA hardware devices are usually implemented through PCIE
>>>> with the following types:
>>>>
>>>> - PF (Physical Function) - A single Physical Function
>>>> - VF (Virtual Function) - Device that supports single root I/O
>>>>     virtualization (SR-IOV). Its Virtual Function (VF) represents a
>>>>     virtualized instance of the device that can be assigned to different
>>>>     partitions
>>>> - VDEV (Virtual Device) - With technologies such as Intel Scalable
>>>>     IOV, a virtual device composed by host OS utilizing one or more
>>>>     ADIs.
>>>> - SF (Sub function) - Vendor specific interface to slice the Physical
>>>>     Function to multiple sub functions that can be assigned to different
>>>>     partitions as virtual devices.
>>> I really hope we don't end up with two different ways to spell this
>>> same thing.
>> I think you meant ADI vs SF. It looks to me that ADI is limited to the scope
>> of scalable IOV but SF not.
> I think if one looks carefully you'd find that SF and ADI are using
> very similar techiniques. For instance we'd also like to use the code
> reorg of the MSIX vector setup with SFs that Intel is calling IMS.
>
> Really SIOV is simply a bundle of pre-existing stuff under a tidy
> name, whatever code skeleton we come up with for SFs should be re-used
> for ADI.


Ok, but do you prefer to mention ADI only for the next version?


>
>>> Shouldn't there be a device/driver matching process of some kind?
>>
>> The question is what do we want do match here.
>>
>> 1) "virtio" vs "vhost", I implemented matching method for this in mdev
>> series, but it looks unnecessary for vDPA device driver to know about this.
>> Anyway we can use sysfs driver bind/unbind to switch drivers
>> 2) virtio device id and vendor id. I'm not sure we need this consider the
>> two drivers so far (virtio/vhost) are all bus drivers.
> As we seem to be contemplating some dynamic creation of vdpa devices I
> think upon creation time it should be specified what mode they should
> run it and then all driver binding and autoloading should happen
> automatically. Telling the user to bind/unbind is a very poor
> experience.
>
> Jason


Ok, I will add the type (virtio vs vhost) and driver matching method back.

Thanks
Jason Wang Jan. 20, 2020, 8:19 a.m. UTC | #11
On 2020/1/17 下午10:12, Rob Miller wrote:
>
>
>     >> + * @get_vendor_id:          Get id for the vendor that
>     provides this device
>     >> + *                          @vdev: vdpa device
>     >> + *                          Returns u32: virtio vendor id
>     > what's the idea behind this? userspace normally doesn't interact
>     with
>     > this ... debugging?
>
>
>     This allows some vendor specific driver on top of vDPA bus. If
>     this is
>     not interested, I can drop this.
>
> RJM>] wouldn't  usage of get_device_id & get_vendor_id, beyond 
> reporting, tend to possibly leading to vendor specific code in the 
> framework instead of leaving the framework agnostic and leave the 
> vendor specific stuff to the vendor's vDPA device driver?


For virtio device id, I think it is needed for kernel/userspace to know 
which driver to load (e.g loading virtio-net for networking devic).

For virtio vendor id, it was needed by kernel virtio driver, and virtio 
bus can match driver based on virtio vendor id. So it doesn't prevent 
3rd vendor specific driver for virtio device.

Maybe we can report VIRTIO_DEV_ANY_ID as vendor id to forbid vendor 
specific stuffs.

Thanks
Jason Wang Jan. 20, 2020, 8:43 a.m. UTC | #12
On 2020/1/19 下午5:07, Shahaf Shuler wrote:
> Friday, January 17, 2020 4:13 PM, Rob Miller:
> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
>>> On 2020/1/17 下午8:13, Michael S. Tsirkin wrote:
>>>> On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
> [...]
>
>>>> + * @set_map:                        Set device memory mapping, optional
>>>> + *                          and only needed for device that using
>>>> + *                          device specific DMA translation
>>>> + *                          (on-chip IOMMU)
>>>> + *                          @vdev: vdpa device
>>>> + *                          @iotlb: vhost memory mapping to be
>>>> + *                          used by the vDPA
>>>> + *                          Returns integer: success (0) or error (< 0)
>>> OK so any change just swaps in a completely new mapping?
>>> Wouldn't this make minor changes such as memory hotplug
>>> quite expensive?
> What is the concern? Traversing the rb tree or fully replace the on-chip IOMMU translations?
> If the latest, then I think we can take such optimization on the driver level (i.e. to update only the diff between the two mapping).


This is similar to the design of platform IOMMU part of vhost-vdpa. We 
decide to send diffs to platform IOMMU there. If it's ok to do that in 
driver, we can replace set_map with incremental API like map()/unmap().

Then driver need to maintain rbtree itself.


> If the first one, then I think memory hotplug is a heavy flow regardless. Do you think the extra cycles for the tree traverse will be visible in any way?


I think if the driver can pause the DMA during the time for setting up 
new mapping, it should be fine.


>   
>
>> My understanding is that the incremental updating of the on chip IOMMU
>> may degrade the  performance. So vendor vDPA drivers may want to know
>> all the mappings at once.
> Yes exact. For Mellanox case for instance many optimization can be performed on a given memory layout.
>
>> Technically, we can keep the incremental API
>> here and let the vendor vDPA drivers to record the full mapping
>> internally which may slightly increase the complexity of vendor driver.
> What will be the trigger for the driver to know it received the last mapping on this series and it can now push it to the on-chip IOMMU?


For GPA->HVA(HPA) mapping, we can have flag for this.

But for GIOVA_>HVA(HPA) mapping which could be changed by guest, it 
looks to me there's no concept of "last mapping" there. I guess in this 
case, mappings needs to be set from the ground. This could be expensive 
but consider most application uses static mappings (e.g dpdk in guest). 
It should be ok.

Thanks


>
>> We need more inputs from vendors here.
>>
>> Thanks
>
Jason Wang Jan. 20, 2020, 8:44 a.m. UTC | #13
On 2020/1/19 下午5:59, Michael S. Tsirkin wrote:
> On Sun, Jan 19, 2020 at 09:07:09AM +0000, Shahaf Shuler wrote:
>>> Technically, we can keep the incremental API
>>> here and let the vendor vDPA drivers to record the full mapping
>>> internally which may slightly increase the complexity of vendor driver.
>> What will be the trigger for the driver to know it received the last mapping on this series and it can now push it to the on-chip IOMMU?
> Some kind of invalidate API?
>

The problem is how to deal with the case of vIOMMU. When vIOMMU is 
enabling there's no concept of last mapping.

Thanks
Michael S. Tsirkin Jan. 20, 2020, 12:09 p.m. UTC | #14
On Mon, Jan 20, 2020 at 04:44:34PM +0800, Jason Wang wrote:
> 
> On 2020/1/19 下午5:59, Michael S. Tsirkin wrote:
> > On Sun, Jan 19, 2020 at 09:07:09AM +0000, Shahaf Shuler wrote:
> > > > Technically, we can keep the incremental API
> > > > here and let the vendor vDPA drivers to record the full mapping
> > > > internally which may slightly increase the complexity of vendor driver.
> > > What will be the trigger for the driver to know it received the last mapping on this series and it can now push it to the on-chip IOMMU?
> > Some kind of invalidate API?
> > 
> 
> The problem is how to deal with the case of vIOMMU. When vIOMMU is enabling
> there's no concept of last mapping.
> 
> Thanks

Most IOMMUs have a translation cache so have an invalidate API too.
Michael S. Tsirkin Jan. 20, 2020, 12:17 p.m. UTC | #15
On Fri, Jan 17, 2020 at 01:54:42PM +0000, Jason Gunthorpe wrote:
> > 1) "virtio" vs "vhost", I implemented matching method for this in mdev
> > series, but it looks unnecessary for vDPA device driver to know about this.
> > Anyway we can use sysfs driver bind/unbind to switch drivers
> > 2) virtio device id and vendor id. I'm not sure we need this consider the
> > two drivers so far (virtio/vhost) are all bus drivers.
> 
> As we seem to be contemplating some dynamic creation of vdpa devices I
> think upon creation time it should be specified what mode they should
> run it and then all driver binding and autoloading should happen
> automatically. Telling the user to bind/unbind is a very poor
> experience.

Maybe but OTOH it's an existing interface. I think we can reasonably
start with bind/unbind and then add ability to specify
the mode later. bind/unbind come from core so they will be
maintained anyway.
Jason Gunthorpe Jan. 20, 2020, 5:49 p.m. UTC | #16
On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> This is similar to the design of platform IOMMU part of vhost-vdpa. We
> decide to send diffs to platform IOMMU there. If it's ok to do that in
> driver, we can replace set_map with incremental API like map()/unmap().
> 
> Then driver need to maintain rbtree itself.

I think we really need to see two modes, one where there is a fixed
translation without dynamic vIOMMU driven changes and one that
supports vIOMMU.

There are different optimization goals in the drivers for these two
configurations.

> > If the first one, then I think memory hotplug is a heavy flow
> > regardless. Do you think the extra cycles for the tree traverse
> > will be visible in any way?
> 
> I think if the driver can pause the DMA during the time for setting up new
> mapping, it should be fine.

This is very tricky for any driver if the mapping change hits the
virtio rings. :(

Even a IOMMU using driver is going to have problems with that..

Jason
Jason Gunthorpe Jan. 20, 2020, 5:50 p.m. UTC | #17
On Mon, Jan 20, 2020 at 07:17:26AM -0500, Michael S. Tsirkin wrote:
> On Fri, Jan 17, 2020 at 01:54:42PM +0000, Jason Gunthorpe wrote:
> > > 1) "virtio" vs "vhost", I implemented matching method for this in mdev
> > > series, but it looks unnecessary for vDPA device driver to know about this.
> > > Anyway we can use sysfs driver bind/unbind to switch drivers
> > > 2) virtio device id and vendor id. I'm not sure we need this consider the
> > > two drivers so far (virtio/vhost) are all bus drivers.
> > 
> > As we seem to be contemplating some dynamic creation of vdpa devices I
> > think upon creation time it should be specified what mode they should
> > run it and then all driver binding and autoloading should happen
> > automatically. Telling the user to bind/unbind is a very poor
> > experience.
> 
> Maybe but OTOH it's an existing interface. I think we can reasonably
> start with bind/unbind and then add ability to specify
> the mode later. bind/unbind come from core so they will be
> maintained anyway.

Existing where? For vfio? vfio is the only thing I am aware doing
that, and this is not vfio..

Jason
Shahaf Shuler Jan. 20, 2020, 8:51 p.m. UTC | #18
Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> 
> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > This is similar to the design of platform IOMMU part of vhost-vdpa. We
> > decide to send diffs to platform IOMMU there. If it's ok to do that in
> > driver, we can replace set_map with incremental API like map()/unmap().
> >
> > Then driver need to maintain rbtree itself.
> 
> I think we really need to see two modes, one where there is a fixed
> translation without dynamic vIOMMU driven changes and one that supports
> vIOMMU.
> 
> There are different optimization goals in the drivers for these two
> configurations.

+1.
It will be best to have one API for static config (i.e. mapping can be set only before virtio device gets active), and one API for dynamic changes that can be set after the virtio device is active. 

> 
> > > If the first one, then I think memory hotplug is a heavy flow
> > > regardless. Do you think the extra cycles for the tree traverse will
> > > be visible in any way?
> >
> > I think if the driver can pause the DMA during the time for setting up
> > new mapping, it should be fine.
> 
> This is very tricky for any driver if the mapping change hits the virtio rings. :(
> 
> Even a IOMMU using driver is going to have problems with that..
> 
> Jason
Michael S. Tsirkin Jan. 20, 2020, 9:25 p.m. UTC | #19
On Mon, Jan 20, 2020 at 08:51:43PM +0000, Shahaf Shuler wrote:
> Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
> > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > 
> > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > This is similar to the design of platform IOMMU part of vhost-vdpa. We
> > > decide to send diffs to platform IOMMU there. If it's ok to do that in
> > > driver, we can replace set_map with incremental API like map()/unmap().
> > >
> > > Then driver need to maintain rbtree itself.
> > 
> > I think we really need to see two modes, one where there is a fixed
> > translation without dynamic vIOMMU driven changes and one that supports
> > vIOMMU.
> > 
> > There are different optimization goals in the drivers for these two
> > configurations.
> 
> +1.
> It will be best to have one API for static config (i.e. mapping can be
> set only before virtio device gets active), and one API for dynamic
> changes that can be set after the virtio device is active. 

Frankly I don't see when we'd use the static one.
Memory hotplug is enabled for most guests...

> > 
> > > > If the first one, then I think memory hotplug is a heavy flow
> > > > regardless. Do you think the extra cycles for the tree traverse will
> > > > be visible in any way?
> > >
> > > I think if the driver can pause the DMA during the time for setting up
> > > new mapping, it should be fine.
> > 
> > This is very tricky for any driver if the mapping change hits the virtio rings. :(
> > 
> > Even a IOMMU using driver is going to have problems with that..
> > 
> > Jason
Shahaf Shuler Jan. 20, 2020, 9:47 p.m. UTC | #20
Monday, January 20, 2020 11:25 PM, Michael S. Tsirkin:
> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> 
> On Mon, Jan 20, 2020 at 08:51:43PM +0000, Shahaf Shuler wrote:
> > Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
> > > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > >
> > > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > > This is similar to the design of platform IOMMU part of
> > > > vhost-vdpa. We decide to send diffs to platform IOMMU there. If
> > > > it's ok to do that in driver, we can replace set_map with incremental API
> like map()/unmap().
> > > >
> > > > Then driver need to maintain rbtree itself.
> > >
> > > I think we really need to see two modes, one where there is a fixed
> > > translation without dynamic vIOMMU driven changes and one that
> > > supports vIOMMU.
> > >
> > > There are different optimization goals in the drivers for these two
> > > configurations.
> >
> > +1.
> > It will be best to have one API for static config (i.e. mapping can be
> > set only before virtio device gets active), and one API for dynamic
> > changes that can be set after the virtio device is active.
> 
> Frankly I don't see when we'd use the static one.
> Memory hotplug is enabled for most guests...

The fact memory hotplug is enabled doesn't necessarily means there is not cold-plugged memory on the hot plugged slots. 
So your claim is majority of guests are deployed w/o any cold-plugged memory? 

> 
> > >
> > > > > If the first one, then I think memory hotplug is a heavy flow
> > > > > regardless. Do you think the extra cycles for the tree traverse
> > > > > will be visible in any way?
> > > >
> > > > I think if the driver can pause the DMA during the time for
> > > > setting up new mapping, it should be fine.
> > >
> > > This is very tricky for any driver if the mapping change hits the
> > > virtio rings. :(
> > >
> > > Even a IOMMU using driver is going to have problems with that..
> > >
> > > Jason
Michael S. Tsirkin Jan. 20, 2020, 9:48 p.m. UTC | #21
On Mon, Jan 20, 2020 at 05:49:39PM +0000, Jason Gunthorpe wrote:
> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > This is similar to the design of platform IOMMU part of vhost-vdpa. We
> > decide to send diffs to platform IOMMU there. If it's ok to do that in
> > driver, we can replace set_map with incremental API like map()/unmap().
> > 
> > Then driver need to maintain rbtree itself.
> 
> I think we really need to see two modes, one where there is a fixed
> translation without dynamic vIOMMU driven changes and one that
> supports vIOMMU.
> 
> There are different optimization goals in the drivers for these two
> configurations.
> 
> > > If the first one, then I think memory hotplug is a heavy flow
> > > regardless. Do you think the extra cycles for the tree traverse
> > > will be visible in any way?
> > 
> > I think if the driver can pause the DMA during the time for setting up new
> > mapping, it should be fine.
> 
> This is very tricky for any driver if the mapping change hits the
> virtio rings. :(
> 
> Even a IOMMU using driver is going to have problems with that..
> 
> Jason

I think for starters we can assume this doesn't happen,
so any change doesn't affect any buffers in use.
Certainly true e.g. for memory hotplug.
Michael S. Tsirkin Jan. 20, 2020, 9:56 p.m. UTC | #22
On Mon, Jan 20, 2020 at 05:50:55PM +0000, Jason Gunthorpe wrote:
> On Mon, Jan 20, 2020 at 07:17:26AM -0500, Michael S. Tsirkin wrote:
> > On Fri, Jan 17, 2020 at 01:54:42PM +0000, Jason Gunthorpe wrote:
> > > > 1) "virtio" vs "vhost", I implemented matching method for this in mdev
> > > > series, but it looks unnecessary for vDPA device driver to know about this.
> > > > Anyway we can use sysfs driver bind/unbind to switch drivers
> > > > 2) virtio device id and vendor id. I'm not sure we need this consider the
> > > > two drivers so far (virtio/vhost) are all bus drivers.
> > > 
> > > As we seem to be contemplating some dynamic creation of vdpa devices I
> > > think upon creation time it should be specified what mode they should
> > > run it and then all driver binding and autoloading should happen
> > > automatically. Telling the user to bind/unbind is a very poor
> > > experience.
> > 
> > Maybe but OTOH it's an existing interface. I think we can reasonably
> > start with bind/unbind and then add ability to specify
> > the mode later. bind/unbind come from core so they will be
> > maintained anyway.
> 
> Existing where?

Driver core.

> For vfio? vfio is the only thing I am aware doing
> that, and this is not vfio..
> 
> Jason


vfio is not doing anything. anyone can use a combination
of unbind and driver_override to attach a driver to a device.

It's not a great interface but it's there without any code,
and it will stay there without maintainance overhead
if we later add a nicer one.
Michael S. Tsirkin Jan. 20, 2020, 9:59 p.m. UTC | #23
On Mon, Jan 20, 2020 at 09:47:18PM +0000, Shahaf Shuler wrote:
> Monday, January 20, 2020 11:25 PM, Michael S. Tsirkin:
> > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > 
> > On Mon, Jan 20, 2020 at 08:51:43PM +0000, Shahaf Shuler wrote:
> > > Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
> > > > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > > >
> > > > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > > > This is similar to the design of platform IOMMU part of
> > > > > vhost-vdpa. We decide to send diffs to platform IOMMU there. If
> > > > > it's ok to do that in driver, we can replace set_map with incremental API
> > like map()/unmap().
> > > > >
> > > > > Then driver need to maintain rbtree itself.
> > > >
> > > > I think we really need to see two modes, one where there is a fixed
> > > > translation without dynamic vIOMMU driven changes and one that
> > > > supports vIOMMU.
> > > >
> > > > There are different optimization goals in the drivers for these two
> > > > configurations.
> > >
> > > +1.
> > > It will be best to have one API for static config (i.e. mapping can be
> > > set only before virtio device gets active), and one API for dynamic
> > > changes that can be set after the virtio device is active.
> > 
> > Frankly I don't see when we'd use the static one.
> > Memory hotplug is enabled for most guests...
> 
> The fact memory hotplug is enabled doesn't necessarily means there is not cold-plugged memory on the hot plugged slots. 
> So your claim is majority of guests are deployed w/o any cold-plugged memory? 

Sorry for not being clear. I was merely saying that dynamic one
can't be optional, and static one can. So how about we
start just with the dynamic one, then add the static one
as a later optimization?


> > 
> > > >
> > > > > > If the first one, then I think memory hotplug is a heavy flow
> > > > > > regardless. Do you think the extra cycles for the tree traverse
> > > > > > will be visible in any way?
> > > > >
> > > > > I think if the driver can pause the DMA during the time for
> > > > > setting up new mapping, it should be fine.
> > > >
> > > > This is very tricky for any driver if the mapping change hits the
> > > > virtio rings. :(
> > > >
> > > > Even a IOMMU using driver is going to have problems with that..
> > > >
> > > > Jason
Jason Wang Jan. 21, 2020, 3:32 a.m. UTC | #24
On 2020/1/20 下午8:09, Michael S. Tsirkin wrote:
> On Mon, Jan 20, 2020 at 04:44:34PM +0800, Jason Wang wrote:
>> On 2020/1/19 下午5:59, Michael S. Tsirkin wrote:
>>> On Sun, Jan 19, 2020 at 09:07:09AM +0000, Shahaf Shuler wrote:
>>>>> Technically, we can keep the incremental API
>>>>> here and let the vendor vDPA drivers to record the full mapping
>>>>> internally which may slightly increase the complexity of vendor driver.
>>>> What will be the trigger for the driver to know it received the last mapping on this series and it can now push it to the on-chip IOMMU?
>>> Some kind of invalidate API?
>>>
>> The problem is how to deal with the case of vIOMMU. When vIOMMU is enabling
>> there's no concept of last mapping.
>>
>> Thanks
> Most IOMMUs have a translation cache so have an invalidate API too.


Ok, then I get you.

But in this case, when vIOMMU is enabled, each new map became a "last 
mapping".

Thanks

>
Jason Wang Jan. 21, 2020, 4 a.m. UTC | #25
On 2020/1/21 上午1:49, Jason Gunthorpe wrote:
> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
>> This is similar to the design of platform IOMMU part of vhost-vdpa. We
>> decide to send diffs to platform IOMMU there. If it's ok to do that in
>> driver, we can replace set_map with incremental API like map()/unmap().
>>
>> Then driver need to maintain rbtree itself.
> I think we really need to see two modes, one where there is a fixed
> translation without dynamic vIOMMU driven changes and one that
> supports vIOMMU.


I think in this case, you meant the method proposed by Shahaf that sends 
diffs of "fixed translation" to device?

It would be kind of tricky to deal with the following case for example:

old map [4G, 16G) new map [4G, 8G)

If we do

1) flush [4G, 16G)
2) add [4G, 8G)

There could be a window between 1) and 2).

It requires the IOMMU that can do

1) remove [8G, 16G)
2) flush [8G, 16G)
3) change [4G, 8G)

....

>
> There are different optimization goals in the drivers for these two
> configurations.
>
>>> If the first one, then I think memory hotplug is a heavy flow
>>> regardless. Do you think the extra cycles for the tree traverse
>>> will be visible in any way?
>> I think if the driver can pause the DMA during the time for setting up new
>> mapping, it should be fine.
> This is very tricky for any driver if the mapping change hits the
> virtio rings. :(
>
> Even a IOMMU using driver is going to have problems with that..
>
> Jason


Or I wonder whether ATS/PRI can help here. E.g during I/O page fault, 
driver/device can wait for the new mapping to be set and then replay the 
DMA.

Thanks

>
Michael S. Tsirkin Jan. 21, 2020, 5:47 a.m. UTC | #26
On Tue, Jan 21, 2020 at 12:00:57PM +0800, Jason Wang wrote:
> 
> On 2020/1/21 上午1:49, Jason Gunthorpe wrote:
> > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > This is similar to the design of platform IOMMU part of vhost-vdpa. We
> > > decide to send diffs to platform IOMMU there. If it's ok to do that in
> > > driver, we can replace set_map with incremental API like map()/unmap().
> > > 
> > > Then driver need to maintain rbtree itself.
> > I think we really need to see two modes, one where there is a fixed
> > translation without dynamic vIOMMU driven changes and one that
> > supports vIOMMU.
> 
> 
> I think in this case, you meant the method proposed by Shahaf that sends
> diffs of "fixed translation" to device?
> 
> It would be kind of tricky to deal with the following case for example:
> 
> old map [4G, 16G) new map [4G, 8G)
> 
> If we do
> 
> 1) flush [4G, 16G)
> 2) add [4G, 8G)
> 
> There could be a window between 1) and 2).
> 
> It requires the IOMMU that can do
> 
> 1) remove [8G, 16G)
> 2) flush [8G, 16G)
> 3) change [4G, 8G)
> 
> ....

Basically what I had in mind is something like qemu memory api

0. begin
1. remove [8G, 16G)
2. add [4G, 8G)
3. commit

Anyway, I'm fine with a one-shot API for now, we can
improve it later.

> > 
> > There are different optimization goals in the drivers for these two
> > configurations.
> > 
> > > > If the first one, then I think memory hotplug is a heavy flow
> > > > regardless. Do you think the extra cycles for the tree traverse
> > > > will be visible in any way?
> > > I think if the driver can pause the DMA during the time for setting up new
> > > mapping, it should be fine.
> > This is very tricky for any driver if the mapping change hits the
> > virtio rings. :(
> > 
> > Even a IOMMU using driver is going to have problems with that..
> > 
> > Jason
> 
> 
> Or I wonder whether ATS/PRI can help here. E.g during I/O page fault,
> driver/device can wait for the new mapping to be set and then replay the
> DMA.
> 
> Thanks
>
Shahaf Shuler Jan. 21, 2020, 6:01 a.m. UTC | #27
Tuesday, January 21, 2020 12:00 AM, Michael S. Tsirkin:
> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> 
> On Mon, Jan 20, 2020 at 09:47:18PM +0000, Shahaf Shuler wrote:
> > Monday, January 20, 2020 11:25 PM, Michael S. Tsirkin:
> > > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > >
> > > On Mon, Jan 20, 2020 at 08:51:43PM +0000, Shahaf Shuler wrote:
> > > > Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
> > > > > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > > > >
> > > > > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > > > > This is similar to the design of platform IOMMU part of
> > > > > > vhost-vdpa. We decide to send diffs to platform IOMMU there.
> > > > > > If it's ok to do that in driver, we can replace set_map with
> > > > > > incremental API
> > > like map()/unmap().
> > > > > >
> > > > > > Then driver need to maintain rbtree itself.
> > > > >
> > > > > I think we really need to see two modes, one where there is a
> > > > > fixed translation without dynamic vIOMMU driven changes and one
> > > > > that supports vIOMMU.
> > > > >
> > > > > There are different optimization goals in the drivers for these
> > > > > two configurations.
> > > >
> > > > +1.
> > > > It will be best to have one API for static config (i.e. mapping
> > > > can be set only before virtio device gets active), and one API for
> > > > dynamic changes that can be set after the virtio device is active.
> > >
> > > Frankly I don't see when we'd use the static one.
> > > Memory hotplug is enabled for most guests...
> >
> > The fact memory hotplug is enabled doesn't necessarily means there is not
> cold-plugged memory on the hot plugged slots.
> > So your claim is majority of guests are deployed w/o any cold-plugged
> memory?
> 
> Sorry for not being clear. I was merely saying that dynamic one can't be
> optional, and static one can. So how about we start just with the dynamic
> one, then add the static one as a later optimization?

Since we have the use case (cold plugged memory to guest, e.g. when populated w/ hugepages) I think we should start w/ both. The static one can be optional for drivers. 

Moreover am not yet clear about the suggested API for dynamic, can you share the prototype you have in mind?
Also will it be :
1. multiple add_map and then flag the driver to set
Or
2. each add_map should be set by the driver as stand alone.
Jason Wang Jan. 21, 2020, 7:57 a.m. UTC | #28
On 2020/1/21 下午2:01, Shahaf Shuler wrote:
> Tuesday, January 21, 2020 12:00 AM, Michael S. Tsirkin:
>> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
>>
>> On Mon, Jan 20, 2020 at 09:47:18PM +0000, Shahaf Shuler wrote:
>>> Monday, January 20, 2020 11:25 PM, Michael S. Tsirkin:
>>>> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
>>>>
>>>> On Mon, Jan 20, 2020 at 08:51:43PM +0000, Shahaf Shuler wrote:
>>>>> Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
>>>>>> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
>>>>>>
>>>>>> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
>>>>>>> This is similar to the design of platform IOMMU part of
>>>>>>> vhost-vdpa. We decide to send diffs to platform IOMMU there.
>>>>>>> If it's ok to do that in driver, we can replace set_map with
>>>>>>> incremental API
>>>> like map()/unmap().
>>>>>>> Then driver need to maintain rbtree itself.
>>>>>> I think we really need to see two modes, one where there is a
>>>>>> fixed translation without dynamic vIOMMU driven changes and one
>>>>>> that supports vIOMMU.
>>>>>>
>>>>>> There are different optimization goals in the drivers for these
>>>>>> two configurations.
>>>>> +1.
>>>>> It will be best to have one API for static config (i.e. mapping
>>>>> can be set only before virtio device gets active), and one API for
>>>>> dynamic changes that can be set after the virtio device is active.
>>>> Frankly I don't see when we'd use the static one.
>>>> Memory hotplug is enabled for most guests...
>>> The fact memory hotplug is enabled doesn't necessarily means there is not
>> cold-plugged memory on the hot plugged slots.
>>> So your claim is majority of guests are deployed w/o any cold-plugged
>> memory?
>>
>> Sorry for not being clear. I was merely saying that dynamic one can't be
>> optional, and static one can. So how about we start just with the dynamic
>> one, then add the static one as a later optimization?
> Since we have the use case (cold plugged memory to guest, e.g. when populated w/ hugepages) I think we should start w/ both. The static one can be optional for drivers.
>
> Moreover am not yet clear about the suggested API for dynamic, can you share the prototype you have in mind?
> Also will it be :
> 1. multiple add_map and then flag the driver to set
> Or
> 2. each add_map should be set by the driver as stand alone.


For dynamic one, it looks to me that introducing add_map()/del_map() bus 
operations is much more cleaner than reusing current set_map() one.

Thanks


>
Jason Wang Jan. 21, 2020, 8 a.m. UTC | #29
On 2020/1/21 下午1:47, Michael S. Tsirkin wrote:
> On Tue, Jan 21, 2020 at 12:00:57PM +0800, Jason Wang wrote:
>> On 2020/1/21 上午1:49, Jason Gunthorpe wrote:
>>> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
>>>> This is similar to the design of platform IOMMU part of vhost-vdpa. We
>>>> decide to send diffs to platform IOMMU there. If it's ok to do that in
>>>> driver, we can replace set_map with incremental API like map()/unmap().
>>>>
>>>> Then driver need to maintain rbtree itself.
>>> I think we really need to see two modes, one where there is a fixed
>>> translation without dynamic vIOMMU driven changes and one that
>>> supports vIOMMU.
>>
>> I think in this case, you meant the method proposed by Shahaf that sends
>> diffs of "fixed translation" to device?
>>
>> It would be kind of tricky to deal with the following case for example:
>>
>> old map [4G, 16G) new map [4G, 8G)
>>
>> If we do
>>
>> 1) flush [4G, 16G)
>> 2) add [4G, 8G)
>>
>> There could be a window between 1) and 2).
>>
>> It requires the IOMMU that can do
>>
>> 1) remove [8G, 16G)
>> 2) flush [8G, 16G)
>> 3) change [4G, 8G)
>>
>> ....
> Basically what I had in mind is something like qemu memory api
>
> 0. begin
> 1. remove [8G, 16G)
> 2. add [4G, 8G)
> 3. commit


This sounds more flexible e.g driver may choose to implement static 
mapping one through commit. But a question here, it looks to me this 
still requires the DMA to be synced with at least commit here. Otherwise 
device may get DMA fault? Or device is expected to be paused DMA during 
begin?

Thanks


>
> Anyway, I'm fine with a one-shot API for now, we can
> improve it later.
>
>>> There are different optimization goals in the drivers for these two
>>> configurations.
>>>
>>>>> If the first one, then I think memory hotplug is a heavy flow
>>>>> regardless. Do you think the extra cycles for the tree traverse
>>>>> will be visible in any way?
>>>> I think if the driver can pause the DMA during the time for setting up new
>>>> mapping, it should be fine.
>>> This is very tricky for any driver if the mapping change hits the
>>> virtio rings. :(
>>>
>>> Even a IOMMU using driver is going to have problems with that..
>>>
>>> Jason
>>
>> Or I wonder whether ATS/PRI can help here. E.g during I/O page fault,
>> driver/device can wait for the new mapping to be set and then replay the
>> DMA.
>>
>> Thanks
>>
>
Michael S. Tsirkin Jan. 21, 2020, 8:15 a.m. UTC | #30
On Tue, Jan 21, 2020 at 04:00:38PM +0800, Jason Wang wrote:
> 
> On 2020/1/21 下午1:47, Michael S. Tsirkin wrote:
> > On Tue, Jan 21, 2020 at 12:00:57PM +0800, Jason Wang wrote:
> > > On 2020/1/21 上午1:49, Jason Gunthorpe wrote:
> > > > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > > > This is similar to the design of platform IOMMU part of vhost-vdpa. We
> > > > > decide to send diffs to platform IOMMU there. If it's ok to do that in
> > > > > driver, we can replace set_map with incremental API like map()/unmap().
> > > > > 
> > > > > Then driver need to maintain rbtree itself.
> > > > I think we really need to see two modes, one where there is a fixed
> > > > translation without dynamic vIOMMU driven changes and one that
> > > > supports vIOMMU.
> > > 
> > > I think in this case, you meant the method proposed by Shahaf that sends
> > > diffs of "fixed translation" to device?
> > > 
> > > It would be kind of tricky to deal with the following case for example:
> > > 
> > > old map [4G, 16G) new map [4G, 8G)
> > > 
> > > If we do
> > > 
> > > 1) flush [4G, 16G)
> > > 2) add [4G, 8G)
> > > 
> > > There could be a window between 1) and 2).
> > > 
> > > It requires the IOMMU that can do
> > > 
> > > 1) remove [8G, 16G)
> > > 2) flush [8G, 16G)
> > > 3) change [4G, 8G)
> > > 
> > > ....
> > Basically what I had in mind is something like qemu memory api
> > 
> > 0. begin
> > 1. remove [8G, 16G)
> > 2. add [4G, 8G)
> > 3. commit
> 
> 
> This sounds more flexible e.g driver may choose to implement static mapping
> one through commit. But a question here, it looks to me this still requires
> the DMA to be synced with at least commit here. Otherwise device may get DMA
> fault? Or device is expected to be paused DMA during begin?
> 
> Thanks

For example, commit might switch one set of tables for another,
without need to pause DMA.

> 
> > 
> > Anyway, I'm fine with a one-shot API for now, we can
> > improve it later.
> > 
> > > > There are different optimization goals in the drivers for these two
> > > > configurations.
> > > > 
> > > > > > If the first one, then I think memory hotplug is a heavy flow
> > > > > > regardless. Do you think the extra cycles for the tree traverse
> > > > > > will be visible in any way?
> > > > > I think if the driver can pause the DMA during the time for setting up new
> > > > > mapping, it should be fine.
> > > > This is very tricky for any driver if the mapping change hits the
> > > > virtio rings. :(
> > > > 
> > > > Even a IOMMU using driver is going to have problems with that..
> > > > 
> > > > Jason
> > > 
> > > Or I wonder whether ATS/PRI can help here. E.g during I/O page fault,
> > > driver/device can wait for the new mapping to be set and then replay the
> > > DMA.
> > > 
> > > Thanks
> > > 
> >
Jason Wang Jan. 21, 2020, 8:35 a.m. UTC | #31
On 2020/1/21 下午4:15, Michael S. Tsirkin wrote:
> On Tue, Jan 21, 2020 at 04:00:38PM +0800, Jason Wang wrote:
>> On 2020/1/21 下午1:47, Michael S. Tsirkin wrote:
>>> On Tue, Jan 21, 2020 at 12:00:57PM +0800, Jason Wang wrote:
>>>> On 2020/1/21 上午1:49, Jason Gunthorpe wrote:
>>>>> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
>>>>>> This is similar to the design of platform IOMMU part of vhost-vdpa. We
>>>>>> decide to send diffs to platform IOMMU there. If it's ok to do that in
>>>>>> driver, we can replace set_map with incremental API like map()/unmap().
>>>>>>
>>>>>> Then driver need to maintain rbtree itself.
>>>>> I think we really need to see two modes, one where there is a fixed
>>>>> translation without dynamic vIOMMU driven changes and one that
>>>>> supports vIOMMU.
>>>> I think in this case, you meant the method proposed by Shahaf that sends
>>>> diffs of "fixed translation" to device?
>>>>
>>>> It would be kind of tricky to deal with the following case for example:
>>>>
>>>> old map [4G, 16G) new map [4G, 8G)
>>>>
>>>> If we do
>>>>
>>>> 1) flush [4G, 16G)
>>>> 2) add [4G, 8G)
>>>>
>>>> There could be a window between 1) and 2).
>>>>
>>>> It requires the IOMMU that can do
>>>>
>>>> 1) remove [8G, 16G)
>>>> 2) flush [8G, 16G)
>>>> 3) change [4G, 8G)
>>>>
>>>> ....
>>> Basically what I had in mind is something like qemu memory api
>>>
>>> 0. begin
>>> 1. remove [8G, 16G)
>>> 2. add [4G, 8G)
>>> 3. commit
>>
>> This sounds more flexible e.g driver may choose to implement static mapping
>> one through commit. But a question here, it looks to me this still requires
>> the DMA to be synced with at least commit here. Otherwise device may get DMA
>> fault? Or device is expected to be paused DMA during begin?
>>
>> Thanks
> For example, commit might switch one set of tables for another,
> without need to pause DMA.


Yes, I think that works but need confirmation from Shahaf or Jason.

Thanks



>
>>> Anyway, I'm fine with a one-shot API for now, we can
>>> improve it later.
>>>
>>>>> There are different optimization goals in the drivers for these two
>>>>> configurations.
>>>>>
>>>>>>> If the first one, then I think memory hotplug is a heavy flow
>>>>>>> regardless. Do you think the extra cycles for the tree traverse
>>>>>>> will be visible in any way?
>>>>>> I think if the driver can pause the DMA during the time for setting up new
>>>>>> mapping, it should be fine.
>>>>> This is very tricky for any driver if the mapping change hits the
>>>>> virtio rings. :(
>>>>>
>>>>> Even a IOMMU using driver is going to have problems with that..
>>>>>
>>>>> Jason
>>>> Or I wonder whether ATS/PRI can help here. E.g during I/O page fault,
>>>> driver/device can wait for the new mapping to be set and then replay the
>>>> DMA.
>>>>
>>>> Thanks
>>>>
Tian, Kevin Jan. 21, 2020, 8:40 a.m. UTC | #32
> From: Jason Wang <jasowang@redhat.com>
> Sent: Friday, January 17, 2020 11:03 AM
> 
> 
> On 2020/1/16 下午11:22, Jason Gunthorpe wrote:
> > On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
> >> vDPA device is a device that uses a datapath which complies with the
> >> virtio specifications with vendor specific control path. vDPA devices
> >> can be both physically located on the hardware or emulated by
> >> software. vDPA hardware devices are usually implemented through PCIE
> >> with the following types:
> >>
> >> - PF (Physical Function) - A single Physical Function
> >> - VF (Virtual Function) - Device that supports single root I/O
> >>    virtualization (SR-IOV). Its Virtual Function (VF) represents a
> >>    virtualized instance of the device that can be assigned to different
> >>    partitions
> >> - VDEV (Virtual Device) - With technologies such as Intel Scalable
> >>    IOV, a virtual device composed by host OS utilizing one or more
> >>    ADIs.

the concept of VDEV includes both software bits and ADIs. If you
only take about hardware types, using ADI is more accurate.

> >> - SF (Sub function) - Vendor specific interface to slice the Physical
> >>    Function to multiple sub functions that can be assigned to different
> >>    partitions as virtual devices.
> > I really hope we don't end up with two different ways to spell this
> > same thing.
> 
> 
> I think you meant ADI vs SF. It looks to me that ADI is limited to the
> scope of scalable IOV but SF not.

ADI is just a term for minimally assignable resource in Scalable IOV. 
'assignable' implies several things, e.g. the resource can be independently 
mapped to/accessed by user space or guest, DMAs between two
ADIs are isolated, operating one ADI doesn't affecting another ADI,
etc.  I'm not clear about  other vendor specific interfaces, but supposing
they need match the similar requirements. Then do we really want to
differentiate ADI vs. SF? What about merging them with ADI as just
one example of finer-grained slicing?

> 
> 
> >
> >> @@ -0,0 +1,2 @@
> >> +# SPDX-License-Identifier: GPL-2.0
> >> +obj-$(CONFIG_VDPA) += vdpa.o
> >> diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
> >> new file mode 100644
> >> index 000000000000..2b0e4a9f105d
> >> +++ b/drivers/virtio/vdpa/vdpa.c
> >> @@ -0,0 +1,141 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +/*
> >> + * vDPA bus.
> >> + *
> >> + * Copyright (c) 2019, Red Hat. All rights reserved.
> >> + *     Author: Jason Wang <jasowang@redhat.com>
> > 2020 tests days
> 
> 
> Will fix.
> 
> 
> >
> >> + *
> >> + */
> >> +
> >> +#include <linux/module.h>
> >> +#include <linux/idr.h>
> >> +#include <linux/vdpa.h>
> >> +
> >> +#define MOD_VERSION  "0.1"
> > I think module versions are discouraged these days
> 
> 
> Will remove.
> 
> 
> >
> >> +#define MOD_DESC     "vDPA bus"
> >> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
> >> +#define MOD_LICENSE  "GPL v2"
> >> +
> >> +static DEFINE_IDA(vdpa_index_ida);
> >> +
> >> +struct device *vdpa_get_parent(struct vdpa_device *vdpa)
> >> +{
> >> +	return vdpa->dev.parent;
> >> +}
> >> +EXPORT_SYMBOL(vdpa_get_parent);
> >> +
> >> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
> >> +{
> >> +	vdpa->dev.parent = parent;
> >> +}
> >> +EXPORT_SYMBOL(vdpa_set_parent);
> >> +
> >> +struct vdpa_device *dev_to_vdpa(struct device *_dev)
> >> +{
> >> +	return container_of(_dev, struct vdpa_device, dev);
> >> +}
> >> +EXPORT_SYMBOL_GPL(dev_to_vdpa);
> >> +
> >> +struct device *vdpa_to_dev(struct vdpa_device *vdpa)
> >> +{
> >> +	return &vdpa->dev;
> >> +}
> >> +EXPORT_SYMBOL_GPL(vdpa_to_dev);
> > Why these trivial assessors? Seems unnecessary, or should at least be
> > static inlines in a header
> 
> 
> Will fix.
> 
> 
> >
> >> +int register_vdpa_device(struct vdpa_device *vdpa)
> >> +{
> > Usually we want to see symbols consistently prefixed with vdpa_*, is
> > there a reason why register/unregister are swapped?
> 
> 
> I follow the name from virtio. I will switch to vdpa_*.
> 
> 
> >
> >> +	int err;
> >> +
> >> +	if (!vdpa_get_parent(vdpa))
> >> +		return -EINVAL;
> >> +
> >> +	if (!vdpa->config)
> >> +		return -EINVAL;
> >> +
> >> +	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
> >> +	if (err < 0)
> >> +		return -EFAULT;
> >> +
> >> +	vdpa->dev.bus = &vdpa_bus;
> >> +	device_initialize(&vdpa->dev);
> > IMHO device_initialize should not be called inside something called
> > register, toooften we find out that the caller drivers need the device
> > to be initialized earlier, ie to use the kref, or something.
> >
> > I find the best flow is to have some init function that does the
> > device_initialize and sets the device_name that the driver can call
> > early.
> 
> 
> Ok, will do.
> 
> 
> >
> > Shouldn't there be a device/driver matching process of some kind?
> 
> 
> The question is what do we want do match here.
> 
> 1) "virtio" vs "vhost", I implemented matching method for this in mdev
> series, but it looks unnecessary for vDPA device driver to know about
> this. Anyway we can use sysfs driver bind/unbind to switch drivers
> 2) virtio device id and vendor id. I'm not sure we need this consider
> the two drivers so far (virtio/vhost) are all bus drivers.
> 
> Thanks
> 
> 
> >
> > Jason
> >
Jason Wang Jan. 21, 2020, 9:41 a.m. UTC | #33
On 2020/1/21 下午4:40, Tian, Kevin wrote:
>> From: Jason Wang <jasowang@redhat.com>
>> Sent: Friday, January 17, 2020 11:03 AM
>>
>>
>> On 2020/1/16 下午11:22, Jason Gunthorpe wrote:
>>> On Thu, Jan 16, 2020 at 08:42:29PM +0800, Jason Wang wrote:
>>>> vDPA device is a device that uses a datapath which complies with the
>>>> virtio specifications with vendor specific control path. vDPA devices
>>>> can be both physically located on the hardware or emulated by
>>>> software. vDPA hardware devices are usually implemented through PCIE
>>>> with the following types:
>>>>
>>>> - PF (Physical Function) - A single Physical Function
>>>> - VF (Virtual Function) - Device that supports single root I/O
>>>>     virtualization (SR-IOV). Its Virtual Function (VF) represents a
>>>>     virtualized instance of the device that can be assigned to different
>>>>     partitions
>>>> - VDEV (Virtual Device) - With technologies such as Intel Scalable
>>>>     IOV, a virtual device composed by host OS utilizing one or more
>>>>     ADIs.
> the concept of VDEV includes both software bits and ADIs. If you
> only take about hardware types, using ADI is more accurate.


Ok.


>
>>>> - SF (Sub function) - Vendor specific interface to slice the Physical
>>>>     Function to multiple sub functions that can be assigned to different
>>>>     partitions as virtual devices.
>>> I really hope we don't end up with two different ways to spell this
>>> same thing.
>>
>> I think you meant ADI vs SF. It looks to me that ADI is limited to the
>> scope of scalable IOV but SF not.
> ADI is just a term for minimally assignable resource in Scalable IOV.
> 'assignable' implies several things, e.g. the resource can be independently
> mapped to/accessed by user space or guest, DMAs between two
> ADIs are isolated, operating one ADI doesn't affecting another ADI,
> etc.  I'm not clear about  other vendor specific interfaces, but supposing
> they need match the similar requirements. Then do we really want to
> differentiate ADI vs. SF? What about merging them with ADI as just
> one example of finer-grained slicing?


I think so. That what Jason G want as well.

Thanks


>
>>
>>>> @@ -0,0 +1,2 @@
>>>> +# SPDX-License-Identifier: GPL-2.0
>>>> +obj-$(CONFIG_VDPA) += vdpa.o
>>>> diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
>>>> new file mode 100644
>>>> index 000000000000..2b0e4a9f105d
>>>> +++ b/drivers/virtio/vdpa/vdpa.c
>>>> @@ -0,0 +1,141 @@
>>>> +// SPDX-License-Identifier: GPL-2.0-only
>>>> +/*
>>>> + * vDPA bus.
>>>> + *
>>>> + * Copyright (c) 2019, Red Hat. All rights reserved.
>>>> + *     Author: Jason Wang <jasowang@redhat.com>
>>> 2020 tests days
>>
>> Will fix.
>>
>>
>>>> + *
>>>> + */
>>>> +
>>>> +#include <linux/module.h>
>>>> +#include <linux/idr.h>
>>>> +#include <linux/vdpa.h>
>>>> +
>>>> +#define MOD_VERSION  "0.1"
>>> I think module versions are discouraged these days
>>
>> Will remove.
>>
>>
>>>> +#define MOD_DESC     "vDPA bus"
>>>> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
>>>> +#define MOD_LICENSE  "GPL v2"
>>>> +
>>>> +static DEFINE_IDA(vdpa_index_ida);
>>>> +
>>>> +struct device *vdpa_get_parent(struct vdpa_device *vdpa)
>>>> +{
>>>> +	return vdpa->dev.parent;
>>>> +}
>>>> +EXPORT_SYMBOL(vdpa_get_parent);
>>>> +
>>>> +void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
>>>> +{
>>>> +	vdpa->dev.parent = parent;
>>>> +}
>>>> +EXPORT_SYMBOL(vdpa_set_parent);
>>>> +
>>>> +struct vdpa_device *dev_to_vdpa(struct device *_dev)
>>>> +{
>>>> +	return container_of(_dev, struct vdpa_device, dev);
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(dev_to_vdpa);
>>>> +
>>>> +struct device *vdpa_to_dev(struct vdpa_device *vdpa)
>>>> +{
>>>> +	return &vdpa->dev;
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(vdpa_to_dev);
>>> Why these trivial assessors? Seems unnecessary, or should at least be
>>> static inlines in a header
>>
>> Will fix.
>>
>>
>>>> +int register_vdpa_device(struct vdpa_device *vdpa)
>>>> +{
>>> Usually we want to see symbols consistently prefixed with vdpa_*, is
>>> there a reason why register/unregister are swapped?
>>
>> I follow the name from virtio. I will switch to vdpa_*.
>>
>>
>>>> +	int err;
>>>> +
>>>> +	if (!vdpa_get_parent(vdpa))
>>>> +		return -EINVAL;
>>>> +
>>>> +	if (!vdpa->config)
>>>> +		return -EINVAL;
>>>> +
>>>> +	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
>>>> +	if (err < 0)
>>>> +		return -EFAULT;
>>>> +
>>>> +	vdpa->dev.bus = &vdpa_bus;
>>>> +	device_initialize(&vdpa->dev);
>>> IMHO device_initialize should not be called inside something called
>>> register, toooften we find out that the caller drivers need the device
>>> to be initialized earlier, ie to use the kref, or something.
>>>
>>> I find the best flow is to have some init function that does the
>>> device_initialize and sets the device_name that the driver can call
>>> early.
>>
>> Ok, will do.
>>
>>
>>> Shouldn't there be a device/driver matching process of some kind?
>>
>> The question is what do we want do match here.
>>
>> 1) "virtio" vs "vhost", I implemented matching method for this in mdev
>> series, but it looks unnecessary for vDPA device driver to know about
>> this. Anyway we can use sysfs driver bind/unbind to switch drivers
>> 2) virtio device id and vendor id. I'm not sure we need this consider
>> the two drivers so far (virtio/vhost) are all bus drivers.
>>
>> Thanks
>>
>>
>>> Jason
>>>
Shahaf Shuler Jan. 21, 2020, 11:09 a.m. UTC | #34
Tuesday, January 21, 2020 10:35 AM, Jason Wang:
> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> 
> 
> On 2020/1/21 下午4:15, Michael S. Tsirkin wrote:
> > On Tue, Jan 21, 2020 at 04:00:38PM +0800, Jason Wang wrote:
> >> On 2020/1/21 下午1:47, Michael S. Tsirkin wrote:
> >>> On Tue, Jan 21, 2020 at 12:00:57PM +0800, Jason Wang wrote:
> >>>> On 2020/1/21 上午1:49, Jason Gunthorpe wrote:
> >>>>> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> >>>>>> This is similar to the design of platform IOMMU part of
> >>>>>> vhost-vdpa. We decide to send diffs to platform IOMMU there. If
> >>>>>> it's ok to do that in driver, we can replace set_map with incremental
> API like map()/unmap().
> >>>>>>
> >>>>>> Then driver need to maintain rbtree itself.
> >>>>> I think we really need to see two modes, one where there is a
> >>>>> fixed translation without dynamic vIOMMU driven changes and one
> >>>>> that supports vIOMMU.
> >>>> I think in this case, you meant the method proposed by Shahaf that
> >>>> sends diffs of "fixed translation" to device?
> >>>>
> >>>> It would be kind of tricky to deal with the following case for example:
> >>>>
> >>>> old map [4G, 16G) new map [4G, 8G)
> >>>>
> >>>> If we do
> >>>>
> >>>> 1) flush [4G, 16G)
> >>>> 2) add [4G, 8G)
> >>>>
> >>>> There could be a window between 1) and 2).
> >>>>
> >>>> It requires the IOMMU that can do
> >>>>
> >>>> 1) remove [8G, 16G)
> >>>> 2) flush [8G, 16G)
> >>>> 3) change [4G, 8G)
> >>>>
> >>>> ....
> >>> Basically what I had in mind is something like qemu memory api
> >>>
> >>> 0. begin
> >>> 1. remove [8G, 16G)
> >>> 2. add [4G, 8G)
> >>> 3. commit
> >>
> >> This sounds more flexible e.g driver may choose to implement static
> >> mapping one through commit. But a question here, it looks to me this
> >> still requires the DMA to be synced with at least commit here.
> >> Otherwise device may get DMA fault? Or device is expected to be paused
> DMA during begin?
> >>
> >> Thanks
> > For example, commit might switch one set of tables for another,
> > without need to pause DMA.
> 
> 
> Yes, I think that works but need confirmation from Shahaf or Jason.

From my side, as I wrote, I would like to see the suggested function prototype along w/ the definition of the expectation from driver upon calling those. 
It is not 100% clear to me what should be the outcome of remove/flush/change/commit

> 
> Thanks
> 
> 
> 
> >
> >>> Anyway, I'm fine with a one-shot API for now, we can improve it
> >>> later.
> >>>
> >>>>> There are different optimization goals in the drivers for these
> >>>>> two configurations.
> >>>>>
> >>>>>>> If the first one, then I think memory hotplug is a heavy flow
> >>>>>>> regardless. Do you think the extra cycles for the tree traverse
> >>>>>>> will be visible in any way?
> >>>>>> I think if the driver can pause the DMA during the time for
> >>>>>> setting up new mapping, it should be fine.
> >>>>> This is very tricky for any driver if the mapping change hits the
> >>>>> virtio rings. :(
> >>>>>
> >>>>> Even a IOMMU using driver is going to have problems with that..
> >>>>>
> >>>>> Jason
> >>>> Or I wonder whether ATS/PRI can help here. E.g during I/O page
> >>>> fault, driver/device can wait for the new mapping to be set and
> >>>> then replay the DMA.
> >>>>
> >>>> Thanks
> >>>>
Jason Gunthorpe Jan. 21, 2020, 2:05 p.m. UTC | #35
On Tue, Jan 21, 2020 at 03:15:43AM -0500, Michael S. Tsirkin wrote:
> > This sounds more flexible e.g driver may choose to implement static mapping
> > one through commit. But a question here, it looks to me this still requires
> > the DMA to be synced with at least commit here. Otherwise device may get DMA
> > fault? Or device is expected to be paused DMA during begin?
> > 
> > Thanks
> 
> For example, commit might switch one set of tables for another,
> without need to pause DMA.

I'm not aware of any hardware that can do something like this
completely atomically..

Any mapping change API has to be based around add/remove regions
without any active DMA (ie active DMA is a guest error the guest can
be crashed if it does this)

Jason
Jason Gunthorpe Jan. 21, 2020, 2:07 p.m. UTC | #36
On Mon, Jan 20, 2020 at 04:25:23PM -0500, Michael S. Tsirkin wrote:
> On Mon, Jan 20, 2020 at 08:51:43PM +0000, Shahaf Shuler wrote:
> > Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
> > > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > > 
> > > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > > This is similar to the design of platform IOMMU part of vhost-vdpa. We
> > > > decide to send diffs to platform IOMMU there. If it's ok to do that in
> > > > driver, we can replace set_map with incremental API like map()/unmap().
> > > >
> > > > Then driver need to maintain rbtree itself.
> > > 
> > > I think we really need to see two modes, one where there is a fixed
> > > translation without dynamic vIOMMU driven changes and one that supports
> > > vIOMMU.
> > > 
> > > There are different optimization goals in the drivers for these two
> > > configurations.
> > 
> > +1.
> > It will be best to have one API for static config (i.e. mapping can be
> > set only before virtio device gets active), and one API for dynamic
> > changes that can be set after the virtio device is active. 
> 
> Frankly I don't see when we'd use the static one.
> Memory hotplug is enabled for most guests...

If someone wants to run a full performance application, like dpdk,
then they may wish to trade memory hotplug in that VM for more
performance.

Perhaps Shahaf can quantify the performance delta?

Jason
Jason Gunthorpe Jan. 21, 2020, 2:12 p.m. UTC | #37
On Mon, Jan 20, 2020 at 04:56:06PM -0500, Michael S. Tsirkin wrote:
> > For vfio? vfio is the only thing I am aware doing
> > that, and this is not vfio..
> 
> vfio is not doing anything. anyone can use a combination
> of unbind and driver_override to attach a driver to a device.
> 
> It's not a great interface but it's there without any code,
> and it will stay there without maintainance overhead
> if we later add a nicer one.

Well, it is not a great interface, and it is only really used in
normal cases by vfio.

I don't think it is a good idea to design new subsystems with that
idea in mind, particularly since detatching the vdpa driver would not
trigger destruction of the underlying dynamic resource (ie the SF).

We need a way to trigger that destruction..

Jason
Michael S. Tsirkin Jan. 21, 2020, 2:15 p.m. UTC | #38
On Tue, Jan 21, 2020 at 02:12:05PM +0000, Jason Gunthorpe wrote:
> On Mon, Jan 20, 2020 at 04:56:06PM -0500, Michael S. Tsirkin wrote:
> > > For vfio? vfio is the only thing I am aware doing
> > > that, and this is not vfio..
> > 
> > vfio is not doing anything. anyone can use a combination
> > of unbind and driver_override to attach a driver to a device.
> > 
> > It's not a great interface but it's there without any code,
> > and it will stay there without maintainance overhead
> > if we later add a nicer one.
> 
> Well, it is not a great interface, and it is only really used in
> normal cases by vfio.
> 
> I don't think it is a good idea to design new subsystems with that
> idea in mind, particularly since detatching the vdpa driver would not
> trigger destruction of the underlying dynamic resource (ie the SF).
> 
> We need a way to trigger that destruction..
> 
> Jason 

You wanted a netlink command for this, right?
Michael S. Tsirkin Jan. 21, 2020, 2:16 p.m. UTC | #39
On Tue, Jan 21, 2020 at 02:07:59PM +0000, Jason Gunthorpe wrote:
> On Mon, Jan 20, 2020 at 04:25:23PM -0500, Michael S. Tsirkin wrote:
> > On Mon, Jan 20, 2020 at 08:51:43PM +0000, Shahaf Shuler wrote:
> > > Monday, January 20, 2020 7:50 PM, Jason Gunthorpe:
> > > > Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
> > > > 
> > > > On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
> > > > > This is similar to the design of platform IOMMU part of vhost-vdpa. We
> > > > > decide to send diffs to platform IOMMU there. If it's ok to do that in
> > > > > driver, we can replace set_map with incremental API like map()/unmap().
> > > > >
> > > > > Then driver need to maintain rbtree itself.
> > > > 
> > > > I think we really need to see two modes, one where there is a fixed
> > > > translation without dynamic vIOMMU driven changes and one that supports
> > > > vIOMMU.
> > > > 
> > > > There are different optimization goals in the drivers for these two
> > > > configurations.
> > > 
> > > +1.
> > > It will be best to have one API for static config (i.e. mapping can be
> > > set only before virtio device gets active), and one API for dynamic
> > > changes that can be set after the virtio device is active. 
> > 
> > Frankly I don't see when we'd use the static one.
> > Memory hotplug is enabled for most guests...
> 
> If someone wants to run a full performance application, like dpdk,
> then they may wish to trade memory hotplug in that VM for more
> performance.

Right. But let let's get basic functionality working first.

> Perhaps Shahaf can quantify the performance delta?
> 
> Jason
Jason Gunthorpe Jan. 21, 2020, 2:16 p.m. UTC | #40
On Tue, Jan 21, 2020 at 09:15:14AM -0500, Michael S. Tsirkin wrote:
> On Tue, Jan 21, 2020 at 02:12:05PM +0000, Jason Gunthorpe wrote:
> > On Mon, Jan 20, 2020 at 04:56:06PM -0500, Michael S. Tsirkin wrote:
> > > > For vfio? vfio is the only thing I am aware doing
> > > > that, and this is not vfio..
> > > 
> > > vfio is not doing anything. anyone can use a combination
> > > of unbind and driver_override to attach a driver to a device.
> > > 
> > > It's not a great interface but it's there without any code,
> > > and it will stay there without maintainance overhead
> > > if we later add a nicer one.
> > 
> > Well, it is not a great interface, and it is only really used in
> > normal cases by vfio.
> > 
> > I don't think it is a good idea to design new subsystems with that
> > idea in mind, particularly since detatching the vdpa driver would not
> > trigger destruction of the underlying dynamic resource (ie the SF).
> > 
> > We need a way to trigger that destruction..
> > 
> > Jason 
> 
> You wanted a netlink command for this, right?

It is my suggestion.

Based on experiance here we started out with sysfs and it was OK, but
slow. When we added container support the entire sysfs thing
completely exploded and we had to replace it with netlink.

Jason
Michael S. Tsirkin Jan. 21, 2020, 2:17 p.m. UTC | #41
On Tue, Jan 21, 2020 at 02:05:04PM +0000, Jason Gunthorpe wrote:
> On Tue, Jan 21, 2020 at 03:15:43AM -0500, Michael S. Tsirkin wrote:
> > > This sounds more flexible e.g driver may choose to implement static mapping
> > > one through commit. But a question here, it looks to me this still requires
> > > the DMA to be synced with at least commit here. Otherwise device may get DMA
> > > fault? Or device is expected to be paused DMA during begin?
> > > 
> > > Thanks
> > 
> > For example, commit might switch one set of tables for another,
> > without need to pause DMA.
> 
> I'm not aware of any hardware that can do something like this
> completely atomically..

FWIW VTD can do this atomically.

> Any mapping change API has to be based around add/remove regions
> without any active DMA (ie active DMA is a guest error the guest can
> be crashed if it does this)
> 
> Jason

Right, lots of cases are well served by only changing parts of
mapping that aren't in active use. Memory hotplug is such a case.
That's not the same as a completely static mapping.
Jason Wang Jan. 22, 2020, 6:18 a.m. UTC | #42
On 2020/1/21 下午10:17, Michael S. Tsirkin wrote:
> On Tue, Jan 21, 2020 at 02:05:04PM +0000, Jason Gunthorpe wrote:
>> On Tue, Jan 21, 2020 at 03:15:43AM -0500, Michael S. Tsirkin wrote:
>>>> This sounds more flexible e.g driver may choose to implement static mapping
>>>> one through commit. But a question here, it looks to me this still requires
>>>> the DMA to be synced with at least commit here. Otherwise device may get DMA
>>>> fault? Or device is expected to be paused DMA during begin?
>>>>
>>>> Thanks
>>> For example, commit might switch one set of tables for another,
>>> without need to pause DMA.
>> I'm not aware of any hardware that can do something like this
>> completely atomically..
> FWIW VTD can do this atomically.
>
>> Any mapping change API has to be based around add/remove regions
>> without any active DMA (ie active DMA is a guest error the guest can
>> be crashed if it does this)
>>
>> Jason
> Right, lots of cases are well served by only changing parts of
> mapping that aren't in active use. Memory hotplug is such a case.
> That's not the same as a completely static mapping.


For hotplug it should be fine with current Qemu since it belongs to 
different memory regions. So each dimm should have its own dedicated map 
entries in IOMMU.

But I'm not sure if the merging logic in current vhost memory listener 
may cause any trouble, we may need to disable it.

Thanks


>
Jason Wang Jan. 22, 2020, 6:36 a.m. UTC | #43
On 2020/1/21 下午7:09, Shahaf Shuler wrote:
> Tuesday, January 21, 2020 10:35 AM, Jason Wang:
>> Subject: Re: [PATCH 3/5] vDPA: introduce vDPA bus
>>
>>
>> On 2020/1/21 下午4:15, Michael S. Tsirkin wrote:
>>> On Tue, Jan 21, 2020 at 04:00:38PM +0800, Jason Wang wrote:
>>>> On 2020/1/21 下午1:47, Michael S. Tsirkin wrote:
>>>>> On Tue, Jan 21, 2020 at 12:00:57PM +0800, Jason Wang wrote:
>>>>>> On 2020/1/21 上午1:49, Jason Gunthorpe wrote:
>>>>>>> On Mon, Jan 20, 2020 at 04:43:53PM +0800, Jason Wang wrote:
>>>>>>>> This is similar to the design of platform IOMMU part of
>>>>>>>> vhost-vdpa. We decide to send diffs to platform IOMMU there. If
>>>>>>>> it's ok to do that in driver, we can replace set_map with incremental
>> API like map()/unmap().
>>>>>>>> Then driver need to maintain rbtree itself.
>>>>>>> I think we really need to see two modes, one where there is a
>>>>>>> fixed translation without dynamic vIOMMU driven changes and one
>>>>>>> that supports vIOMMU.
>>>>>> I think in this case, you meant the method proposed by Shahaf that
>>>>>> sends diffs of "fixed translation" to device?
>>>>>>
>>>>>> It would be kind of tricky to deal with the following case for example:
>>>>>>
>>>>>> old map [4G, 16G) new map [4G, 8G)
>>>>>>
>>>>>> If we do
>>>>>>
>>>>>> 1) flush [4G, 16G)
>>>>>> 2) add [4G, 8G)
>>>>>>
>>>>>> There could be a window between 1) and 2).
>>>>>>
>>>>>> It requires the IOMMU that can do
>>>>>>
>>>>>> 1) remove [8G, 16G)
>>>>>> 2) flush [8G, 16G)
>>>>>> 3) change [4G, 8G)
>>>>>>
>>>>>> ....
>>>>> Basically what I had in mind is something like qemu memory api
>>>>>
>>>>> 0. begin
>>>>> 1. remove [8G, 16G)
>>>>> 2. add [4G, 8G)
>>>>> 3. commit
>>>> This sounds more flexible e.g driver may choose to implement static
>>>> mapping one through commit. But a question here, it looks to me this
>>>> still requires the DMA to be synced with at least commit here.
>>>> Otherwise device may get DMA fault? Or device is expected to be paused
>> DMA during begin?
>>>> Thanks
>>> For example, commit might switch one set of tables for another,
>>> without need to pause DMA.
>> Yes, I think that works but need confirmation from Shahaf or Jason.
>  From my side, as I wrote, I would like to see the suggested function prototype along w/ the definition of the expectation from driver upon calling those.
> It is not 100% clear to me what should be the outcome of remove/flush/change/commit


Right, I can do this in next version after the discussion is converged.

Thanks


>
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index d4bda9c900fa..578d2a581e3b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17540,6 +17540,7 @@  F:	tools/virtio/
 F:	drivers/net/virtio_net.c
 F:	drivers/block/virtio_blk.c
 F:	include/linux/virtio*.h
+F:	include/linux/vdpa.h
 F:	include/uapi/linux/virtio_*.h
 F:	drivers/crypto/virtio/
 F:	mm/balloon_compaction.c
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 078615cf2afc..9c4fdb64d9ac 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -96,3 +96,5 @@  config VIRTIO_MMIO_CMDLINE_DEVICES
 	 If unsure, say 'N'.
 
 endif # VIRTIO_MENU
+
+source "drivers/virtio/vdpa/Kconfig"
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 3a2b5c5dcf46..fdf5eacd0d0a 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -6,3 +6,4 @@  virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
 virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
 obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
+obj-$(CONFIG_VDPA) += vdpa/
diff --git a/drivers/virtio/vdpa/Kconfig b/drivers/virtio/vdpa/Kconfig
new file mode 100644
index 000000000000..3032727b4d98
--- /dev/null
+++ b/drivers/virtio/vdpa/Kconfig
@@ -0,0 +1,9 @@ 
+# SPDX-License-Identifier: GPL-2.0-only
+config VDPA
+	tristate
+        default n
+        help
+          Enable this module to support vDPA device that uses a
+          datapath which complies with virtio specifications with
+          vendor specific control path.
+
diff --git a/drivers/virtio/vdpa/Makefile b/drivers/virtio/vdpa/Makefile
new file mode 100644
index 000000000000..ee6a35e8a4fb
--- /dev/null
+++ b/drivers/virtio/vdpa/Makefile
@@ -0,0 +1,2 @@ 
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_VDPA) += vdpa.o
diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c
new file mode 100644
index 000000000000..2b0e4a9f105d
--- /dev/null
+++ b/drivers/virtio/vdpa/vdpa.c
@@ -0,0 +1,141 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bus.
+ *
+ * Copyright (c) 2019, Red Hat. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <linux/vdpa.h>
+
+#define MOD_VERSION  "0.1"
+#define MOD_DESC     "vDPA bus"
+#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
+#define MOD_LICENSE  "GPL v2"
+
+static DEFINE_IDA(vdpa_index_ida);
+
+struct device *vdpa_get_parent(struct vdpa_device *vdpa)
+{
+	return vdpa->dev.parent;
+}
+EXPORT_SYMBOL(vdpa_get_parent);
+
+void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent)
+{
+	vdpa->dev.parent = parent;
+}
+EXPORT_SYMBOL(vdpa_set_parent);
+
+struct vdpa_device *dev_to_vdpa(struct device *_dev)
+{
+	return container_of(_dev, struct vdpa_device, dev);
+}
+EXPORT_SYMBOL_GPL(dev_to_vdpa);
+
+struct device *vdpa_to_dev(struct vdpa_device *vdpa)
+{
+	return &vdpa->dev;
+}
+EXPORT_SYMBOL_GPL(vdpa_to_dev);
+
+static int vdpa_dev_probe(struct device *d)
+{
+	struct vdpa_device *dev = dev_to_vdpa(d);
+	struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
+	int ret = 0;
+
+	if (drv && drv->probe)
+		ret = drv->probe(d);
+
+	return ret;
+}
+
+static int vdpa_dev_remove(struct device *d)
+{
+	struct vdpa_device *dev = dev_to_vdpa(d);
+	struct vdpa_driver *drv = drv_to_vdpa(dev->dev.driver);
+
+	if (drv && drv->remove)
+		drv->remove(d);
+
+	return 0;
+}
+
+static struct bus_type vdpa_bus = {
+	.name  = "vdpa",
+	.probe = vdpa_dev_probe,
+	.remove = vdpa_dev_remove,
+};
+
+int register_vdpa_device(struct vdpa_device *vdpa)
+{
+	int err;
+
+	if (!vdpa_get_parent(vdpa))
+		return -EINVAL;
+
+	if (!vdpa->config)
+		return -EINVAL;
+
+	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
+	if (err < 0)
+		return -EFAULT;
+
+	vdpa->dev.bus = &vdpa_bus;
+	device_initialize(&vdpa->dev);
+
+	vdpa->index = err;
+	dev_set_name(&vdpa->dev, "vdpa%u", vdpa->index);
+
+	err = device_add(&vdpa->dev);
+	if (err)
+		ida_simple_remove(&vdpa_index_ida, vdpa->index);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(register_vdpa_device);
+
+void unregister_vdpa_device(struct vdpa_device *vdpa)
+{
+	int index = vdpa->index;
+
+	device_unregister(&vdpa->dev);
+	ida_simple_remove(&vdpa_index_ida, index);
+}
+EXPORT_SYMBOL_GPL(unregister_vdpa_device);
+
+int register_vdpa_driver(struct vdpa_driver *driver)
+{
+	driver->drv.bus = &vdpa_bus;
+	return driver_register(&driver->drv);
+}
+EXPORT_SYMBOL_GPL(register_vdpa_driver);
+
+void unregister_vdpa_driver(struct vdpa_driver *driver)
+{
+	driver_unregister(&driver->drv);
+}
+EXPORT_SYMBOL_GPL(unregister_vdpa_driver);
+
+static int vdpa_init(void)
+{
+	if (bus_register(&vdpa_bus) != 0)
+		panic("virtio bus registration failed");
+	return 0;
+}
+
+static void __exit vdpa_exit(void)
+{
+	bus_unregister(&vdpa_bus);
+	ida_destroy(&vdpa_index_ida);
+}
+core_initcall(vdpa_init);
+module_exit(vdpa_exit);
+
+MODULE_VERSION(MOD_VERSION);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_LICENSE(MOD_LICENSE);
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
new file mode 100644
index 000000000000..47760137ef66
--- /dev/null
+++ b/include/linux/vdpa.h
@@ -0,0 +1,191 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VDPA_H
+#define _LINUX_VDPA_H
+
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/vhost_iotlb.h>
+
+/**
+ * vDPA callback definition.
+ * @callback: interrupt callback function
+ * @private: the data passed to the callback function
+ */
+struct vdpa_callback {
+	irqreturn_t (*callback)(void *data);
+	void *private;
+};
+
+/**
+ * vDPA device - representation of a vDPA device
+ * @dev: underlying device
+ * @config: the configuration ops for this device.
+ * @index: device index
+ */
+struct vdpa_device {
+	struct device dev;
+	const struct vdpa_config_ops *config;
+	int index;
+};
+
+/**
+ * vDPA_config_ops - operations for configuring a vDPA device.
+ * Note: vDPA device drivers are required to implement all of the
+ * operations unless it is optional mentioned in the following list.
+ * @set_vq_address:		Set the address of virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				@desc_area: address of desc area
+ *				@driver_area: address of driver area
+ *				@device_area: address of device area
+ *				Returns integer: success (0) or error (< 0)
+ * @set_vq_num:			Set the size of virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				@num: the size of virtqueue
+ * @kick_vq:			Kick the virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ * @set_vq_cb:			Set the interrupt callback function for
+ *				a virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				@cb: virtio-vdev interrupt callback structure
+ * @set_vq_ready:		Set ready status for a virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				@ready: ready (true) not ready(false)
+ * @get_vq_ready:		Get ready status for a virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				Returns boolean: ready (true) or not (false)
+ * @set_vq_state:		Set the state for a virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				@state: virtqueue state (last_avail_idx)
+ *				Returns integer: success (0) or error (< 0)
+ * @get_vq_state:		Get the state for a virtqueue
+ *				@vdev: vdpa device
+ *				@idx: virtqueue index
+ *				Returns virtqueue state (last_avail_idx)
+ * @get_vq_align:		Get the virtqueue align requirement
+ *				for the device
+ *				@vdev: vdpa device
+ *				Returns virtqueue algin requirement
+ * @get_features:		Get virtio features supported by the device
+ *				@vdev: vdpa device
+ *				Returns the virtio features support by the
+ *				device
+ * @set_features:		Set virtio features supported by the driver
+ *				@vdev: vdpa device
+ *				@features: feature support by the driver
+ *				Returns integer: success (0) or error (< 0)
+ * @set_config_cb:		Set the config interrupt callback
+ *				@vdev: vdpa device
+ *				@cb: virtio-vdev interrupt callback structure
+ * @get_vq_num_max:		Get the max size of virtqueue
+ *				@vdev: vdpa device
+ *				Returns u16: max size of virtqueue
+ * @get_device_id:		Get virtio device id
+ *				@vdev: vdpa device
+ *				Returns u32: virtio device id
+ * @get_vendor_id:		Get id for the vendor that provides this device
+ *				@vdev: vdpa device
+ *				Returns u32: virtio vendor id
+ * @get_status:			Get the device status
+ *				@vdev: vdpa device
+ *				Returns u8: virtio device status
+ * @set_status:			Set the device status
+ *				@vdev: vdpa device
+ *				@status: virtio device status
+ * @get_config:			Read from device specific configuration space
+ *				@vdev: vdpa device
+ *				@offset: offset from the beginning of
+ *				configuration space
+ *				@buf: buffer used to read to
+ *				@len: the length to read from
+ *				configuration space
+ * @set_config:			Write to device specific configuration space
+ *				@vdev: vdpa device
+ *				@offset: offset from the beginning of
+ *				configuration space
+ *				@buf: buffer used to write from
+ *				@len: the length to write to
+ *				configuration space
+ * @get_generation:		Get device config generation (optional)
+ *				@vdev: vdpa device
+ *				Returns u32: device generation
+ * @set_map:			Set device memory mapping, optional
+ *				and only needed for device that using
+ *				device specific DMA translation
+ *				(on-chip IOMMU)
+ *				@vdev: vdpa device
+ *				@iotlb: vhost memory mapping to be
+ *				used by the vDPA
+ *				Returns integer: success (0) or error (< 0)
+ */
+struct vdpa_config_ops {
+	/* Virtqueue ops */
+	int (*set_vq_address)(struct vdpa_device *vdev,
+			      u16 idx, u64 desc_area, u64 driver_area,
+			      u64 device_area);
+	void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num);
+	void (*kick_vq)(struct vdpa_device *vdev, u16 idx);
+	void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx,
+			  struct vdpa_callback *cb);
+	void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready);
+	bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx);
+	int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state);
+	u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx);
+
+	/* Device ops */
+	u16 (*get_vq_align)(struct vdpa_device *vdev);
+	u64 (*get_features)(struct vdpa_device *vdev);
+	int (*set_features)(struct vdpa_device *vdev, u64 features);
+	void (*set_config_cb)(struct vdpa_device *vdev,
+			      struct vdpa_callback *cb);
+	u16 (*get_vq_num_max)(struct vdpa_device *vdev);
+	u32 (*get_device_id)(struct vdpa_device *vdev);
+	u32 (*get_vendor_id)(struct vdpa_device *vdev);
+	u8 (*get_status)(struct vdpa_device *vdev);
+	void (*set_status)(struct vdpa_device *vdev, u8 status);
+	void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
+			   void *buf, unsigned int len);
+	void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
+			   const void *buf, unsigned int len);
+	u32 (*get_generation)(struct vdpa_device *vdev);
+
+	/* Mem table */
+	int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb);
+};
+
+int register_vdpa_device(struct vdpa_device *vdpa);
+void unregister_vdpa_device(struct vdpa_device *vdpa);
+
+struct device *vdpa_get_parent(struct vdpa_device *vdpa);
+void vdpa_set_parent(struct vdpa_device *vdpa, struct device *parent);
+
+struct vdpa_device *dev_to_vdpa(struct device *_dev);
+struct device *vdpa_to_dev(struct vdpa_device *vdpa);
+
+/**
+ * vdpa_driver - operations for a vDPA driver
+ * @driver: underlying device driver
+ * @probe: the function to call when a device is found.  Returns 0 or -errno.
+ * @remove: the function to call when a device is removed.
+ */
+struct vdpa_driver {
+	struct device_driver drv;
+	int (*probe)(struct device *dev);
+	void (*remove)(struct device *dev);
+};
+
+int register_vdpa_driver(struct vdpa_driver *drv);
+void unregister_vdpa_driver(struct vdpa_driver *drv);
+
+static inline struct vdpa_driver *drv_to_vdpa(struct device_driver *drv)
+{
+	return container_of(drv, struct vdpa_driver, drv);
+}
+
+#endif /* _LINUX_VDPA_H */