diff mbox series

[4/5] virtio: introduce a vDPA based transport

Message ID 20200116124231.20253-5-jasowang@redhat.com (mailing list archive)
State New, archived
Headers show
Series vDPA support | expand

Commit Message

Jason Wang Jan. 16, 2020, 12:42 p.m. UTC
This patch introduces a vDPA transport for virtio. This is used to
use kernel virtio driver to drive the mediated device that is capable
of populating virtqueue directly.

A new virtio-vdpa driver will be registered to the vDPA bus, when a
new virtio-vdpa device is probed, it will register the device with
vdpa based config ops. This means it is a software transport between
vDPA driver and vDPA device. The transport was implemented through
bus_ops of vDPA parent.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/virtio/Kconfig       |  13 ++
 drivers/virtio/Makefile      |   1 +
 drivers/virtio/virtio_vdpa.c | 400 +++++++++++++++++++++++++++++++++++
 3 files changed, 414 insertions(+)
 create mode 100644 drivers/virtio/virtio_vdpa.c

Comments

Jason Gunthorpe Jan. 16, 2020, 3:38 p.m. UTC | #1
On Thu, Jan 16, 2020 at 08:42:30PM +0800, Jason Wang wrote:
> diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> new file mode 100644
> index 000000000000..86936e5e7ec3
> +++ b/drivers/virtio/virtio_vdpa.c
> @@ -0,0 +1,400 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * VIRTIO based driver for vDPA device
> + *
> + * Copyright (c) 2020, Red Hat. All rights reserved.
> + *     Author: Jason Wang <jasowang@redhat.com>
> + *
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/virtio.h>
> +#include <linux/vdpa.h>
> +#include <linux/virtio_config.h>
> +#include <linux/virtio_ring.h>
> +
> +#define MOD_VERSION  "0.1"
> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
> +#define MOD_DESC     "vDPA bus driver for virtio devices"
> +#define MOD_LICENSE  "GPL v2"
> +
> +#define to_virtio_vdpa_device(dev) \
> +	container_of(dev, struct virtio_vdpa_device, vdev)

Should be a static function

> +struct virtio_vdpa_device {
> +	struct virtio_device vdev;
> +	struct vdpa_device *vdpa;
> +	u64 features;
> +
> +	/* The lock to protect virtqueue list */
> +	spinlock_t lock;
> +	/* List of virtio_vdpa_vq_info */
> +	struct list_head virtqueues;
> +};
> +
> +struct virtio_vdpa_vq_info {
> +	/* the actual virtqueue */
> +	struct virtqueue *vq;
> +
> +	/* the list node for the virtqueues list */
> +	struct list_head node;
> +};
> +
> +static struct vdpa_device *vd_get_vdpa(struct virtio_device *vdev)
> +{
> +	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> +	struct vdpa_device *vdpa = vd_dev->vdpa;
> +
> +	return vdpa;

Bit of a long way to say

  return to_virtio_vdpa_device(vdev)->vdpa

?

> +err_vq:
> +	vring_del_virtqueue(vq);
> +error_new_virtqueue:
> +	ops->set_vq_ready(vdpa, index, 0);
> +	WARN_ON(ops->get_vq_ready(vdpa, index));

A warn_on during error unwind? Sketchy, deserves a comment I think

> +static void virtio_vdpa_release_dev(struct device *_d)
> +{
> +	struct virtio_device *vdev =
> +	       container_of(_d, struct virtio_device, dev);
> +	struct virtio_vdpa_device *vd_dev =
> +	       container_of(vdev, struct virtio_vdpa_device, vdev);
> +	struct vdpa_device *vdpa = vd_dev->vdpa;
> +
> +	devm_kfree(&vdpa->dev, vd_dev);
> +}

It is unusual for the release function to not be owned by the
subsystem, through the class. I'm not sure there are enough module ref
counts to ensure that this function is not unloaded?

Usually to make this all work sanely the subsytem provides some
allocation function

 vdpa_dev = vdpa_alloc_dev(parent, ops, sizeof(struct virtio_vdpa_device))
 struct virtio_vdpa_device *priv = vdpa_priv(vdpa_dev)

Then the subsystem naturally owns all the memory.

Otherwise it gets tricky to ensure that the module doesn't unload
before all the krefs are put.

> +
> +static int virtio_vdpa_probe(struct device *dev)
> +{
> +	struct vdpa_device *vdpa = dev_to_vdpa(dev);

The probe function for a class should accept the classes type already,
no casting.

> +	const struct vdpa_config_ops *ops = vdpa->config;
> +	struct virtio_vdpa_device *vd_dev;
> +	int rc;
> +
> +	vd_dev = devm_kzalloc(dev, sizeof(*vd_dev), GFP_KERNEL);
> +	if (!vd_dev)
> +		return -ENOMEM;

This is not right, the struct device lifetime is controled by a kref,
not via devm. If you want to use a devm unwind then the unwind is
put_device, not devm_kfree.

In this simple situation I don't see a reason to use devm.

> +	vd_dev->vdev.dev.parent = &vdpa->dev;
> +	vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
> +	vd_dev->vdev.config = &virtio_vdpa_config_ops;
> +	vd_dev->vdpa = vdpa;
> +	INIT_LIST_HEAD(&vd_dev->virtqueues);
> +	spin_lock_init(&vd_dev->lock);
> +
> +	vd_dev->vdev.id.device = ops->get_device_id(vdpa);
> +	if (vd_dev->vdev.id.device == 0)
> +		return -ENODEV;
> +
> +	vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
> +	rc = register_virtio_device(&vd_dev->vdev);
> +	if (rc)
> +		put_device(dev);

And a ugly unwind like this is why you want to have device_initialize()
exposed to the driver, so there is a clear pairing that calling
device_initialize() must be followed by put_device. This should also
use the goto unwind style

> +	else
> +		dev_set_drvdata(dev, vd_dev);
> +
> +	return rc;
> +}
> +
> +static void virtio_vdpa_remove(struct device *dev)
> +{

Remove should also already accept the right type

> +	struct virtio_vdpa_device *vd_dev = dev_get_drvdata(dev);
> +
> +	unregister_virtio_device(&vd_dev->vdev);
> +}
> +
> +static struct vdpa_driver virtio_vdpa_driver = {
> +	.drv = {
> +		.name	= "virtio_vdpa",
> +	},
> +	.probe	= virtio_vdpa_probe,
> +	.remove = virtio_vdpa_remove,
> +};

Still a little unclear on binding, is this supposed to bind to all
vdpa devices?

Where is the various THIS_MODULE's I expect to see in a scheme like
this?

All function pointers must be protected by a held module reference
count, ie the above probe/remove and all the pointers in ops.

> +static int __init virtio_vdpa_init(void)
> +{
> +	return register_vdpa_driver(&virtio_vdpa_driver);
> +}
> +
> +static void __exit virtio_vdpa_exit(void)
> +{
> +	unregister_vdpa_driver(&virtio_vdpa_driver);
> +}
> +
> +module_init(virtio_vdpa_init)
> +module_exit(virtio_vdpa_exit)

Best to provide the usual 'module_pci_driver' like scheme for this
boiler plate.

> +MODULE_VERSION(MOD_VERSION);
> +MODULE_LICENSE(MOD_LICENSE);
> +MODULE_AUTHOR(MOD_AUTHOR);
> +MODULE_DESCRIPTION(MOD_DESC);

Why the indirection with 2nd defines?

Jason
Randy Dunlap Jan. 17, 2020, 4:10 a.m. UTC | #2
Hi,

On 1/16/20 4:42 AM, Jason Wang wrote:
> diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
> index 9c4fdb64d9ac..b4276999d17d 100644
> --- a/drivers/virtio/Kconfig
> +++ b/drivers/virtio/Kconfig
> @@ -43,6 +43,19 @@ config VIRTIO_PCI_LEGACY
>  
>  	  If unsure, say Y.
>  
> +config VIRTIO_VDPA
> +	tristate "vDPA driver for virtio devices"
> +	depends on VDPA && VIRTIO
> +	default n
> +	help
> +	  This driver provides support for virtio based paravirtual

	                                   virtio-based

> +	  device driver over vDPA bus. For this to be useful, you need
> +	  an appropriate vDPA device implementation that operates on a
> +          physical device to allow the datapath of virtio to be

use tab + 2 spaces above for indentation, not lots of spaces.

> +	  offloaded to hardware.
> +
> +	  If unsure, say M.
> +
>  config VIRTIO_PMEM
>  	tristate "Support for virtio pmem driver"
>  	depends on VIRTIO
Jason Wang Jan. 17, 2020, 9:32 a.m. UTC | #3
On 2020/1/16 下午11:38, Jason Gunthorpe wrote:
> On Thu, Jan 16, 2020 at 08:42:30PM +0800, Jason Wang wrote:
>> diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
>> new file mode 100644
>> index 000000000000..86936e5e7ec3
>> +++ b/drivers/virtio/virtio_vdpa.c
>> @@ -0,0 +1,400 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * VIRTIO based driver for vDPA device
>> + *
>> + * Copyright (c) 2020, Red Hat. All rights reserved.
>> + *     Author: Jason Wang <jasowang@redhat.com>
>> + *
>> + */
>> +
>> +#include <linux/init.h>
>> +#include <linux/module.h>
>> +#include <linux/device.h>
>> +#include <linux/kernel.h>
>> +#include <linux/slab.h>
>> +#include <linux/uuid.h>
>> +#include <linux/virtio.h>
>> +#include <linux/vdpa.h>
>> +#include <linux/virtio_config.h>
>> +#include <linux/virtio_ring.h>
>> +
>> +#define MOD_VERSION  "0.1"
>> +#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
>> +#define MOD_DESC     "vDPA bus driver for virtio devices"
>> +#define MOD_LICENSE  "GPL v2"
>> +
>> +#define to_virtio_vdpa_device(dev) \
>> +	container_of(dev, struct virtio_vdpa_device, vdev)
> Should be a static function


Ok.


>
>> +struct virtio_vdpa_device {
>> +	struct virtio_device vdev;
>> +	struct vdpa_device *vdpa;
>> +	u64 features;
>> +
>> +	/* The lock to protect virtqueue list */
>> +	spinlock_t lock;
>> +	/* List of virtio_vdpa_vq_info */
>> +	struct list_head virtqueues;
>> +};
>> +
>> +struct virtio_vdpa_vq_info {
>> +	/* the actual virtqueue */
>> +	struct virtqueue *vq;
>> +
>> +	/* the list node for the virtqueues list */
>> +	struct list_head node;
>> +};
>> +
>> +static struct vdpa_device *vd_get_vdpa(struct virtio_device *vdev)
>> +{
>> +	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
>> +	struct vdpa_device *vdpa = vd_dev->vdpa;
>> +
>> +	return vdpa;
> Bit of a long way to say
>
>    return to_virtio_vdpa_device(vdev)->vdpa
>
> ?


Right.


>
>> +err_vq:
>> +	vring_del_virtqueue(vq);
>> +error_new_virtqueue:
>> +	ops->set_vq_ready(vdpa, index, 0);
>> +	WARN_ON(ops->get_vq_ready(vdpa, index));
> A warn_on during error unwind? Sketchy, deserves a comment I think


Yes, it's a hint of bug in the vDPA driver. Will add a comment.


>
>> +static void virtio_vdpa_release_dev(struct device *_d)
>> +{
>> +	struct virtio_device *vdev =
>> +	       container_of(_d, struct virtio_device, dev);
>> +	struct virtio_vdpa_device *vd_dev =
>> +	       container_of(vdev, struct virtio_vdpa_device, vdev);
>> +	struct vdpa_device *vdpa = vd_dev->vdpa;
>> +
>> +	devm_kfree(&vdpa->dev, vd_dev);
>> +}
> It is unusual for the release function to not be owned by the
> subsystem, through the class.


This is how virtio_pci and virtio_mmio work now. Virtio devices may have 
different transports which require different release functions. I think 
this is the reason why virtio


> I'm not sure there are enough module ref
> counts to ensure that this function is not unloaded?


Let me double check this.


>
> Usually to make this all work sanely the subsytem provides some
> allocation function
>
>   vdpa_dev = vdpa_alloc_dev(parent, ops, sizeof(struct virtio_vdpa_device))
>   struct virtio_vdpa_device *priv = vdpa_priv(vdpa_dev)
>
> Then the subsystem naturally owns all the memory.
>
> Otherwise it gets tricky to ensure that the module doesn't unload
> before all the krefs are put.


I see.


>
>> +
>> +static int virtio_vdpa_probe(struct device *dev)
>> +{
>> +	struct vdpa_device *vdpa = dev_to_vdpa(dev);
> The probe function for a class should accept the classes type already,
> no casting.


Right.


>
>> +	const struct vdpa_config_ops *ops = vdpa->config;
>> +	struct virtio_vdpa_device *vd_dev;
>> +	int rc;
>> +
>> +	vd_dev = devm_kzalloc(dev, sizeof(*vd_dev), GFP_KERNEL);
>> +	if (!vd_dev)
>> +		return -ENOMEM;
> This is not right, the struct device lifetime is controled by a kref,
> not via devm. If you want to use a devm unwind then the unwind is
> put_device, not devm_kfree.


I'm not sure I get the point here. The lifetime is bound to underlying 
vDPA device and devres allow to be freed before the vpda device is 
released. But I agree using devres of underlying vdpa device looks wired.


>
> In this simple situation I don't see a reason to use devm.
>
>> +	vd_dev->vdev.dev.parent = &vdpa->dev;
>> +	vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
>> +	vd_dev->vdev.config = &virtio_vdpa_config_ops;
>> +	vd_dev->vdpa = vdpa;
>> +	INIT_LIST_HEAD(&vd_dev->virtqueues);
>> +	spin_lock_init(&vd_dev->lock);
>> +
>> +	vd_dev->vdev.id.device = ops->get_device_id(vdpa);
>> +	if (vd_dev->vdev.id.device == 0)
>> +		return -ENODEV;
>> +
>> +	vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
>> +	rc = register_virtio_device(&vd_dev->vdev);
>> +	if (rc)
>> +		put_device(dev);
> And a ugly unwind like this is why you want to have device_initialize()
> exposed to the driver,


In this context, which "driver" did you mean here? (Note, virtio-vdpa is 
the driver for vDPA bus here).


>   so there is a clear pairing that calling
> device_initialize() must be followed by put_device. This should also
> use the goto unwind style
>
>> +	else
>> +		dev_set_drvdata(dev, vd_dev);
>> +
>> +	return rc;
>> +}
>> +
>> +static void virtio_vdpa_remove(struct device *dev)
>> +{
> Remove should also already accept the right type


Yes.


>
>> +	struct virtio_vdpa_device *vd_dev = dev_get_drvdata(dev);
>> +
>> +	unregister_virtio_device(&vd_dev->vdev);
>> +}
>> +
>> +static struct vdpa_driver virtio_vdpa_driver = {
>> +	.drv = {
>> +		.name	= "virtio_vdpa",
>> +	},
>> +	.probe	= virtio_vdpa_probe,
>> +	.remove = virtio_vdpa_remove,
>> +};
> Still a little unclear on binding, is this supposed to bind to all
> vdpa devices?


Yes, it expected to drive all vDPA devices.


>
> Where is the various THIS_MODULE's I expect to see in a scheme like
> this?
>
> All function pointers must be protected by a held module reference
> count, ie the above probe/remove and all the pointers in ops.


Will double check, since I don't see this in other virtio transport 
drivers (PCI or MMIO).


>
>> +static int __init virtio_vdpa_init(void)
>> +{
>> +	return register_vdpa_driver(&virtio_vdpa_driver);
>> +}
>> +
>> +static void __exit virtio_vdpa_exit(void)
>> +{
>> +	unregister_vdpa_driver(&virtio_vdpa_driver);
>> +}
>> +
>> +module_init(virtio_vdpa_init)
>> +module_exit(virtio_vdpa_exit)
> Best to provide the usual 'module_pci_driver' like scheme for this
> boiler plate.


Ok.


>
>> +MODULE_VERSION(MOD_VERSION);
>> +MODULE_LICENSE(MOD_LICENSE);
>> +MODULE_AUTHOR(MOD_AUTHOR);
>> +MODULE_DESCRIPTION(MOD_DESC);
> Why the indirection with 2nd defines?


Will fix.

Thanks


>
> Jason
>
Jason Gunthorpe Jan. 17, 2020, 2 p.m. UTC | #4
On Fri, Jan 17, 2020 at 05:32:35PM +0800, Jason Wang wrote:
> > 
> > > +	const struct vdpa_config_ops *ops = vdpa->config;
> > > +	struct virtio_vdpa_device *vd_dev;
> > > +	int rc;
> > > +
> > > +	vd_dev = devm_kzalloc(dev, sizeof(*vd_dev), GFP_KERNEL);
> > > +	if (!vd_dev)
> > > +		return -ENOMEM;
> > This is not right, the struct device lifetime is controled by a kref,
> > not via devm. If you want to use a devm unwind then the unwind is
> > put_device, not devm_kfree.
> 
> I'm not sure I get the point here. The lifetime is bound to underlying vDPA
> device and devres allow to be freed before the vpda device is released. But
> I agree using devres of underlying vdpa device looks wired.

Once device_initialize is called the only way to free a struct device
is via put_device, while here you have a devm trigger that will
unconditionally do kfree on a struct device without respecting the
reference count.

reference counted memory must never be allocated with devm.

> > > +	vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
> > > +	vd_dev->vdev.config = &virtio_vdpa_config_ops;
> > > +	vd_dev->vdpa = vdpa;
> > > +	INIT_LIST_HEAD(&vd_dev->virtqueues);
> > > +	spin_lock_init(&vd_dev->lock);
> > > +
> > > +	vd_dev->vdev.id.device = ops->get_device_id(vdpa);
> > > +	if (vd_dev->vdev.id.device == 0)
> > > +		return -ENODEV;
> > > +
> > > +	vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
> > > +	rc = register_virtio_device(&vd_dev->vdev);
> > > +	if (rc)
> > > +		put_device(dev);
> > And a ugly unwind like this is why you want to have device_initialize()
> > exposed to the driver,
> 
> In this context, which "driver" did you mean here? (Note, virtio-vdpa is the
> driver for vDPA bus here).

'driver' is the thing using the 'core' library calls to implement a
device, so here the 'vd_dev' is the driver and
'register_virtio_device' is the core

> > 
> > Where is the various THIS_MODULE's I expect to see in a scheme like
> > this?
> > 
> > All function pointers must be protected by a held module reference
> > count, ie the above probe/remove and all the pointers in ops.
> 
> Will double check, since I don't see this in other virtio transport drivers
> (PCI or MMIO).

pci_register_driver is a macro that provides a THIS_MODULE, and the
pci core code sets driver.owner, then the rest of the stuff related to
driver ops is supposed to work against that to protect the driver ops.

For the device module refcounting you either need to ensure that
'unregister' is a strong fence and guanentees that no device ops are
called past unregister (noting that this is impossible for release),
or you need to hold the module lock until release.

It is common to see non-core subsystems get this stuff wrong.

Jason
Jason Wang Jan. 20, 2020, 7:52 a.m. UTC | #5
On 2020/1/17 下午10:00, Jason Gunthorpe wrote:
> On Fri, Jan 17, 2020 at 05:32:35PM +0800, Jason Wang wrote:
>>>> +	const struct vdpa_config_ops *ops = vdpa->config;
>>>> +	struct virtio_vdpa_device *vd_dev;
>>>> +	int rc;
>>>> +
>>>> +	vd_dev = devm_kzalloc(dev, sizeof(*vd_dev), GFP_KERNEL);
>>>> +	if (!vd_dev)
>>>> +		return -ENOMEM;
>>> This is not right, the struct device lifetime is controled by a kref,
>>> not via devm. If you want to use a devm unwind then the unwind is
>>> put_device, not devm_kfree.
>> I'm not sure I get the point here. The lifetime is bound to underlying vDPA
>> device and devres allow to be freed before the vpda device is released. But
>> I agree using devres of underlying vdpa device looks wired.
> Once device_initialize is called the only way to free a struct device
> is via put_device, while here you have a devm trigger that will
> unconditionally do kfree on a struct device without respecting the
> reference count.
>
> reference counted memory must never be allocated with devm.


Right, fixed.


>
>>>> +	vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
>>>> +	vd_dev->vdev.config = &virtio_vdpa_config_ops;
>>>> +	vd_dev->vdpa = vdpa;
>>>> +	INIT_LIST_HEAD(&vd_dev->virtqueues);
>>>> +	spin_lock_init(&vd_dev->lock);
>>>> +
>>>> +	vd_dev->vdev.id.device = ops->get_device_id(vdpa);
>>>> +	if (vd_dev->vdev.id.device == 0)
>>>> +		return -ENODEV;
>>>> +
>>>> +	vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
>>>> +	rc = register_virtio_device(&vd_dev->vdev);
>>>> +	if (rc)
>>>> +		put_device(dev);
>>> And a ugly unwind like this is why you want to have device_initialize()
>>> exposed to the driver,
>> In this context, which "driver" did you mean here? (Note, virtio-vdpa is the
>> driver for vDPA bus here).
> 'driver' is the thing using the 'core' library calls to implement a
> device, so here the 'vd_dev' is the driver and
> 'register_virtio_device' is the core


Ok.


>
>>> Where is the various THIS_MODULE's I expect to see in a scheme like
>>> this?
>>>
>>> All function pointers must be protected by a held module reference
>>> count, ie the above probe/remove and all the pointers in ops.
>> Will double check, since I don't see this in other virtio transport drivers
>> (PCI or MMIO).
> pci_register_driver is a macro that provides a THIS_MODULE, and the
> pci core code sets driver.owner, then the rest of the stuff related to
> driver ops is supposed to work against that to protect the driver ops.
>
> For the device module refcounting you either need to ensure that
> 'unregister' is a strong fence and guanentees that no device ops are
> called past unregister (noting that this is impossible for release),
> or you need to hold the module lock until release.
>
> It is common to see non-core subsystems get this stuff wrong.
>
> Jason


Ok. I see.

Thanks
diff mbox series

Patch

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 9c4fdb64d9ac..b4276999d17d 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -43,6 +43,19 @@  config VIRTIO_PCI_LEGACY
 
 	  If unsure, say Y.
 
+config VIRTIO_VDPA
+	tristate "vDPA driver for virtio devices"
+	depends on VDPA && VIRTIO
+	default n
+	help
+	  This driver provides support for virtio based paravirtual
+	  device driver over vDPA bus. For this to be useful, you need
+	  an appropriate vDPA device implementation that operates on a
+          physical device to allow the datapath of virtio to be
+	  offloaded to hardware.
+
+	  If unsure, say M.
+
 config VIRTIO_PMEM
 	tristate "Support for virtio pmem driver"
 	depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index fdf5eacd0d0a..3407ac03fe60 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -6,4 +6,5 @@  virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
 virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o
 obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
+obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o
 obj-$(CONFIG_VDPA) += vdpa/
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
new file mode 100644
index 000000000000..86936e5e7ec3
--- /dev/null
+++ b/drivers/virtio/virtio_vdpa.c
@@ -0,0 +1,400 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VIRTIO based driver for vDPA device
+ *
+ * Copyright (c) 2020, Red Hat. All rights reserved.
+ *     Author: Jason Wang <jasowang@redhat.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/virtio.h>
+#include <linux/vdpa.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+
+#define MOD_VERSION  "0.1"
+#define MOD_AUTHOR   "Jason Wang <jasowang@redhat.com>"
+#define MOD_DESC     "vDPA bus driver for virtio devices"
+#define MOD_LICENSE  "GPL v2"
+
+#define to_virtio_vdpa_device(dev) \
+	container_of(dev, struct virtio_vdpa_device, vdev)
+
+struct virtio_vdpa_device {
+	struct virtio_device vdev;
+	struct vdpa_device *vdpa;
+	u64 features;
+
+	/* The lock to protect virtqueue list */
+	spinlock_t lock;
+	/* List of virtio_vdpa_vq_info */
+	struct list_head virtqueues;
+};
+
+struct virtio_vdpa_vq_info {
+	/* the actual virtqueue */
+	struct virtqueue *vq;
+
+	/* the list node for the virtqueues list */
+	struct list_head node;
+};
+
+static struct vdpa_device *vd_get_vdpa(struct virtio_device *vdev)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+	struct vdpa_device *vdpa = vd_dev->vdpa;
+
+	return vdpa;
+}
+
+static void virtio_vdpa_get(struct virtio_device *vdev, unsigned offset,
+			    void *buf, unsigned len)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	ops->get_config(vdpa, offset, buf, len);
+}
+
+static void virtio_vdpa_set(struct virtio_device *vdev, unsigned offset,
+			    const void *buf, unsigned len)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	ops->set_config(vdpa, offset, buf, len);
+}
+
+static u32 virtio_vdpa_generation(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	if (ops->get_generation)
+		return ops->get_generation(vdpa);
+
+	return 0;
+}
+
+static u8 virtio_vdpa_get_status(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	return ops->get_status(vdpa);
+}
+
+static void virtio_vdpa_set_status(struct virtio_device *vdev, u8 status)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	return ops->set_status(vdpa, status);
+}
+
+static void virtio_vdpa_reset(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	return ops->set_status(vdpa, 0);
+}
+
+static bool virtio_vdpa_notify(struct virtqueue *vq)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	ops->kick_vq(vdpa, vq->index);
+
+	return true;
+}
+
+static irqreturn_t virtio_vdpa_config_cb(void *private)
+{
+	struct virtio_vdpa_device *vd_dev = private;
+
+	virtio_config_changed(&vd_dev->vdev);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
+{
+	struct virtio_vdpa_vq_info *info = private;
+
+	return vring_interrupt(0, info->vq);
+}
+
+static struct virtqueue *
+virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
+		     void (*callback)(struct virtqueue *vq),
+		     const char *name, bool ctx)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+	struct virtio_vdpa_vq_info *info;
+	struct vdpa_callback cb;
+	struct virtqueue *vq;
+	u64 desc_addr, driver_addr, device_addr;
+	unsigned long flags;
+	u32 align, num;
+	int err;
+
+	if (!name)
+		return NULL;
+
+	/* Queue shouldn't already be set up. */
+	if (ops->get_vq_ready(vdpa, index))
+		return ERR_PTR(-ENOENT);
+
+	/* Allocate and fill out our active queue description */
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	num = ops->get_vq_num_max(vdpa);
+	if (num == 0) {
+		err = -ENOENT;
+		goto error_new_virtqueue;
+	}
+
+	/* Create the vring */
+	align = ops->get_vq_align(vdpa);
+	vq = vring_create_virtqueue(index, num, align, vdev,
+				    true, true, ctx,
+				    virtio_vdpa_notify, callback, name);
+	if (!vq) {
+		err = -ENOMEM;
+		goto error_new_virtqueue;
+	}
+
+	/* Setup virtqueue callback */
+	cb.callback = virtio_vdpa_virtqueue_cb;
+	cb.private = info;
+	ops->set_vq_cb(vdpa, index, &cb);
+	ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
+
+	desc_addr = virtqueue_get_desc_addr(vq);
+	driver_addr = virtqueue_get_avail_addr(vq);
+	device_addr = virtqueue_get_used_addr(vq);
+
+	if (ops->set_vq_address(vdpa, index,
+				desc_addr, driver_addr,
+				device_addr)) {
+		err = -EINVAL;
+		goto err_vq;
+	}
+
+	ops->set_vq_ready(vdpa, index, 1);
+
+	vq->priv = info;
+	info->vq = vq;
+
+	spin_lock_irqsave(&vd_dev->lock, flags);
+	list_add(&info->node, &vd_dev->virtqueues);
+	spin_unlock_irqrestore(&vd_dev->lock, flags);
+
+	return vq;
+
+err_vq:
+	vring_del_virtqueue(vq);
+error_new_virtqueue:
+	ops->set_vq_ready(vdpa, index, 0);
+	WARN_ON(ops->get_vq_ready(vdpa, index));
+	kfree(info);
+	return ERR_PTR(err);
+}
+
+static void virtio_vdpa_del_vq(struct virtqueue *vq)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
+	struct vdpa_device *vdpa = vd_dev->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+	struct virtio_vdpa_vq_info *info = vq->priv;
+	unsigned int index = vq->index;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vd_dev->lock, flags);
+	list_del(&info->node);
+	spin_unlock_irqrestore(&vd_dev->lock, flags);
+
+	/* Select and deactivate the queue */
+	ops->set_vq_ready(vdpa, index, 0);
+	WARN_ON(ops->get_vq_ready(vdpa, index));
+
+	vring_del_virtqueue(vq);
+
+	kfree(info);
+}
+
+static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
+{
+	struct virtqueue *vq, *n;
+
+	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+		virtio_vdpa_del_vq(vq);
+}
+
+static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+				struct virtqueue *vqs[],
+				vq_callback_t *callbacks[],
+				const char * const names[],
+				const bool *ctx,
+				struct irq_affinity *desc)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+	struct vdpa_callback cb;
+	int i, err, queue_idx = 0;
+
+	for (i = 0; i < nvqs; ++i) {
+		if (!names[i]) {
+			vqs[i] = NULL;
+			continue;
+		}
+
+		vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++,
+					      callbacks[i], names[i], ctx ?
+					      ctx[i] : false);
+		if (IS_ERR(vqs[i])) {
+			err = PTR_ERR(vqs[i]);
+			goto err_setup_vq;
+		}
+	}
+
+	cb.callback = virtio_vdpa_config_cb;
+	cb.private = vd_dev;
+	ops->set_config_cb(vdpa, &cb);
+
+	return 0;
+
+err_setup_vq:
+	virtio_vdpa_del_vqs(vdev);
+	return err;
+}
+
+static u64 virtio_vdpa_get_features(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	return ops->get_features(vdpa);
+}
+
+static int virtio_vdpa_finalize_features(struct virtio_device *vdev)
+{
+	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	/* Give virtio_ring a chance to accept features. */
+	vring_transport_features(vdev);
+
+	return ops->set_features(vdpa, vdev->features);
+}
+
+static const char *virtio_vdpa_bus_name(struct virtio_device *vdev)
+{
+	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
+	struct vdpa_device *vdpa = vd_dev->vdpa;
+
+	return dev_name(vdpa_to_dev(vdpa));
+}
+
+static const struct virtio_config_ops virtio_vdpa_config_ops = {
+	.get		= virtio_vdpa_get,
+	.set		= virtio_vdpa_set,
+	.generation	= virtio_vdpa_generation,
+	.get_status	= virtio_vdpa_get_status,
+	.set_status	= virtio_vdpa_set_status,
+	.reset		= virtio_vdpa_reset,
+	.find_vqs	= virtio_vdpa_find_vqs,
+	.del_vqs	= virtio_vdpa_del_vqs,
+	.get_features	= virtio_vdpa_get_features,
+	.finalize_features = virtio_vdpa_finalize_features,
+	.bus_name	= virtio_vdpa_bus_name,
+};
+
+static void virtio_vdpa_release_dev(struct device *_d)
+{
+	struct virtio_device *vdev =
+	       container_of(_d, struct virtio_device, dev);
+	struct virtio_vdpa_device *vd_dev =
+	       container_of(vdev, struct virtio_vdpa_device, vdev);
+	struct vdpa_device *vdpa = vd_dev->vdpa;
+
+	devm_kfree(&vdpa->dev, vd_dev);
+}
+
+static int virtio_vdpa_probe(struct device *dev)
+{
+	struct vdpa_device *vdpa = dev_to_vdpa(dev);
+	const struct vdpa_config_ops *ops = vdpa->config;
+	struct virtio_vdpa_device *vd_dev;
+	int rc;
+
+	vd_dev = devm_kzalloc(dev, sizeof(*vd_dev), GFP_KERNEL);
+	if (!vd_dev)
+		return -ENOMEM;
+
+	vd_dev->vdev.dev.parent = &vdpa->dev;
+	vd_dev->vdev.dev.release = virtio_vdpa_release_dev;
+	vd_dev->vdev.config = &virtio_vdpa_config_ops;
+	vd_dev->vdpa = vdpa;
+	INIT_LIST_HEAD(&vd_dev->virtqueues);
+	spin_lock_init(&vd_dev->lock);
+
+	vd_dev->vdev.id.device = ops->get_device_id(vdpa);
+	if (vd_dev->vdev.id.device == 0)
+		return -ENODEV;
+
+	vd_dev->vdev.id.vendor = ops->get_vendor_id(vdpa);
+	rc = register_virtio_device(&vd_dev->vdev);
+	if (rc)
+		put_device(dev);
+	else
+		dev_set_drvdata(dev, vd_dev);
+
+	return rc;
+}
+
+static void virtio_vdpa_remove(struct device *dev)
+{
+	struct virtio_vdpa_device *vd_dev = dev_get_drvdata(dev);
+
+	unregister_virtio_device(&vd_dev->vdev);
+}
+
+static struct vdpa_driver virtio_vdpa_driver = {
+	.drv = {
+		.name	= "virtio_vdpa",
+	},
+	.probe	= virtio_vdpa_probe,
+	.remove = virtio_vdpa_remove,
+};
+
+static int __init virtio_vdpa_init(void)
+{
+	return register_vdpa_driver(&virtio_vdpa_driver);
+}
+
+static void __exit virtio_vdpa_exit(void)
+{
+	unregister_vdpa_driver(&virtio_vdpa_driver);
+}
+
+module_init(virtio_vdpa_init)
+module_exit(virtio_vdpa_exit)
+
+MODULE_VERSION(MOD_VERSION);
+MODULE_LICENSE(MOD_LICENSE);
+MODULE_AUTHOR(MOD_AUTHOR);
+MODULE_DESCRIPTION(MOD_DESC);