diff mbox series

[08/10] vfio-iommufd: Support iommufd for emulated VFIO devices

Message ID 8-v1-4991695894d8+211-vfio_iommufd_jgg@nvidia.com (mailing list archive)
State New, archived
Headers show
Series Connect VFIO to IOMMUFD | expand

Commit Message

Jason Gunthorpe Oct. 25, 2022, 6:50 p.m. UTC
Emulated VFIO devices are calling vfio_register_emulated_iommu_dev() and
consist of all the mdev drivers.

Like the physical drivers, support for iommufd is provided by the driver
supplying the correct standard ops. Provide ops from the core that
duplicate what vfio_register_emulated_iommu_dev() does.

Emulated drivers are where it is more likely to see variation in the
iommufd support ops. For instance IDXD will probably need to set up both an
iommufd_device context linked to a PASID and an iommufd_access context to
support all their mdev operations.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c  |   3 +
 drivers/s390/cio/vfio_ccw_ops.c   |   3 +
 drivers/s390/crypto/vfio_ap_ops.c |   3 +
 drivers/vfio/container.c          | 108 ++++++-----------------------
 drivers/vfio/iommufd.c            |  57 ++++++++++++++++
 drivers/vfio/vfio.h               |  10 ++-
 drivers/vfio/vfio_main.c          | 110 +++++++++++++++++++++++++++++-
 include/linux/vfio.h              |  14 ++++
 8 files changed, 217 insertions(+), 91 deletions(-)

Comments

Tian, Kevin Nov. 1, 2022, 8:37 a.m. UTC | #1
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, October 26, 2022 2:51 AM
> 
> Emulated VFIO devices are calling vfio_register_emulated_iommu_dev() and
> consist of all the mdev drivers.
> 
> Like the physical drivers, support for iommufd is provided by the driver
> supplying the correct correct standard ops. Provide ops from the core that
> duplicate what vfio_register_emulated_iommu_dev() does.
> 
> Emulated drivers are where it is more likely to see variation in the
> iommfd support ops. For instance IDXD will probably need to setup both a
> iommfd_device context linked to a PASID and an iommufd_access context to
> support all their mdev operations.
> 
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/gpu/drm/i915/gvt/kvmgt.c  |   3 +
>  drivers/s390/cio/vfio_ccw_ops.c   |   3 +
>  drivers/s390/crypto/vfio_ap_ops.c |   3 +
>  drivers/vfio/container.c          | 108 ++++++-----------------------
>  drivers/vfio/iommufd.c            |  57 ++++++++++++++++
>  drivers/vfio/vfio.h               |  10 ++-
>  drivers/vfio/vfio_main.c          | 110 +++++++++++++++++++++++++++++-
>  include/linux/vfio.h              |  14 ++++
>  8 files changed, 217 insertions(+), 91 deletions(-)

mtty, mdpy and mbochs?

> diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c
> index 8772dad6808539..0388f2e33447eb 100644
> --- a/drivers/vfio/container.c
> +++ b/drivers/vfio/container.c
> @@ -540,113 +540,45 @@ void vfio_group_unuse_container(struct
> vfio_group *group)
>  	fput(group->opened_file);
>  }
> 
> -/*
> - * Pin contiguous user pages and return their associated host pages for local
> - * domain only.
> - * @device [in]  : device
> - * @iova [in]    : starting IOVA of user pages to be pinned.
> - * @npage [in]   : count of pages to be pinned.  This count should not
> - *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
> - * @prot [in]    : protection flags
> - * @pages[out]   : array of host pages
> - * Return error or number of pages pinned.
> - *
> - * A driver may only call this function if the vfio_device was created
> - * by vfio_register_emulated_iommu_dev().
> - */
> -int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
> -		   int npage, int prot, struct page **pages)
> +int vfio_container_pin_pages(struct vfio_container *container,
> +			     struct iommu_group *iommu_group, dma_addr_t
> iova,
> +			     int npage, int prot, struct page **pages)
>  {
> -	struct vfio_container *container;
> -	struct vfio_group *group = device->group;
> -	struct vfio_iommu_driver *driver;
> -	int ret;
> -
> -	if (!pages || !npage || !vfio_assert_device_open(device))
> -		return -EINVAL;
> +	/* group->container cannot change while a vfio device is open */
> +	struct vfio_iommu_driver *driver = container->iommu_driver;
> 
>  	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
>  		return -E2BIG;
> 
>  	/* group->container cannot change while a vfio device is open */
> -	container = group->container;
>  	driver = container->iommu_driver;

duplicated comment and assignment.

Actually, I'm not sure whether the comment should be put within this
container helper and other two. There is no group reference in these
helpers then it sounds like the comment makes more sense to be in the
caller side?

> +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int
> npage)
> +{
> +	if (WARN_ON(!vfio_assert_device_open(device)))
> +		return;
> +
> +	if (device->group->container) {
> +		vfio_container_unpin_pages(device->group->container, iova,
> +					   npage);
> +	} else if (device->iommufd_access) {

be consistent with other two helpers i.e. if-if instead of if-else

> +		if (WARN_ON(iova > ULONG_MAX))
> +			return;

Is there a reason why this is a WARN_ON only in unpin but not in pin?

> +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
> +		size_t len, bool write)
> +{
> +	if (!data || len <= 0 || !vfio_assert_device_open(device))
> +		return -EINVAL;
> +
> +	if (device->group->container)
> +		return vfio_container_dma_rw(device->group->container,
> iova,
> +					     data, len, write);
> +
> +	if (device->iommufd_access) {
> +		unsigned int flags = 0;
> +
> +		if (iova > ULONG_MAX)
> +			return -EINVAL;
> +
> +		/* VFIO historically tries to auto-detect a kthread */
> +		if (!current->mm)
> +			flags |= IOMMUFD_ACCESS_RW_KTHREAD;

Can you elaborate why this cannot be put in iommufd as the default
policy similar to what vfio container does?
Jason Gunthorpe Nov. 1, 2022, 12:49 p.m. UTC | #2
On Tue, Nov 01, 2022 at 08:37:39AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, October 26, 2022 2:51 AM
> > 
> > Emulated VFIO devices are calling vfio_register_emulated_iommu_dev() and
> > consist of all the mdev drivers.
> > 
> > Like the physical drivers, support for iommufd is provided by the driver
> > supplying the correct correct standard ops. Provide ops from the core that
> > duplicate what vfio_register_emulated_iommu_dev() does.
> > 
> > Emulated drivers are where it is more likely to see variation in the
> > iommfd support ops. For instance IDXD will probably need to setup both a
> > iommfd_device context linked to a PASID and an iommufd_access context to
> > support all their mdev operations.
> > 
> > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > ---
> >  drivers/gpu/drm/i915/gvt/kvmgt.c  |   3 +
> >  drivers/s390/cio/vfio_ccw_ops.c   |   3 +
> >  drivers/s390/crypto/vfio_ap_ops.c |   3 +
> >  drivers/vfio/container.c          | 108 ++++++-----------------------
> >  drivers/vfio/iommufd.c            |  57 ++++++++++++++++
> >  drivers/vfio/vfio.h               |  10 ++-
> >  drivers/vfio/vfio_main.c          | 110 +++++++++++++++++++++++++++++-
> >  include/linux/vfio.h              |  14 ++++
> >  8 files changed, 217 insertions(+), 91 deletions(-)
> 
> mtty, mdpy and mbochs?

They don't call rw or pin_pages, so they don't need to do
anything:


	/*
	 * If the driver doesn't provide this op then it means the device does
	 * not do DMA at all. So nothing to do.
	 */
	if (!vdev->ops->bind_iommufd)
		return 0;

> > +int vfio_container_pin_pages(struct vfio_container *container,
> > +			     struct iommu_group *iommu_group, dma_addr_t
> > iova,
> > +			     int npage, int prot, struct page **pages)
> >  {
> > -	struct vfio_container *container;
> > -	struct vfio_group *group = device->group;
> > -	struct vfio_iommu_driver *driver;
> > -	int ret;
> > -
> > -	if (!pages || !npage || !vfio_assert_device_open(device))
> > -		return -EINVAL;
> > +	/* group->container cannot change while a vfio device is open */
> > +	struct vfio_iommu_driver *driver = container->iommu_driver;
> > 
> >  	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
> >  		return -E2BIG;
> > 
> >  	/* group->container cannot change while a vfio device is open */
> > -	container = group->container;
> >  	driver = container->iommu_driver;
> 
> duplicated comment and assignment.
> 
> Actually, I'm not sure whether the comment should be put within this
> container helper and other two. There is no group reference in these
> helpers then it sounds like the comment makes more sense to be in the
> caller side?

Yeah, that is better

> > +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int
> > npage)
> > +{
> > +	if (WARN_ON(!vfio_assert_device_open(device)))
> > +		return;
> > +
> > +	if (device->group->container) {
> > +		vfio_container_unpin_pages(device->group->container, iova,
> > +					   npage);
> > +	} else if (device->iommufd_access) {
> 
> be consistent with other two helpers i.e. if-if instead of if-else

Done

> > +		if (WARN_ON(iova > ULONG_MAX))
> > +			return;
> 
> Is there a reason why this is a WARN_ON only in unpin but not in pin?

This is how it has always been. I suppose someone once thought it
would be OK for the driver to do racy stuff during pin - but clearly
that is not the case. Let's fix it while we are here.

> > +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
> > +		size_t len, bool write)
> > +{
> > +	if (!data || len <= 0 || !vfio_assert_device_open(device))
> > +		return -EINVAL;
> > +
> > +	if (device->group->container)
> > +		return vfio_container_dma_rw(device->group->container,
> > iova,
> > +					     data, len, write);
> > +
> > +	if (device->iommufd_access) {
> > +		unsigned int flags = 0;
> > +
> > +		if (iova > ULONG_MAX)
> > +			return -EINVAL;
> > +
> > +		/* VFIO historically tries to auto-detect a kthread */
> > +		if (!current->mm)
> > +			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
> 
> Can you elaborate why this cannot be put in iommufd as the default
> policy similar to what vfio container does?

Snooping in kernel structs to try to guess the calling execution
context is bad design. The caller should know its own context and it
should declare positively what it is. Someday this should be lifted
out of VFIO as well and into the drivers.

Jason
Tian, Kevin Nov. 3, 2022, 4:52 a.m. UTC | #3
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, November 1, 2022 8:49 PM
> > > ---
> > >  drivers/gpu/drm/i915/gvt/kvmgt.c  |   3 +
> > >  drivers/s390/cio/vfio_ccw_ops.c   |   3 +
> > >  drivers/s390/crypto/vfio_ap_ops.c |   3 +
> > >  drivers/vfio/container.c          | 108 ++++++-----------------------
> > >  drivers/vfio/iommufd.c            |  57 ++++++++++++++++
> > >  drivers/vfio/vfio.h               |  10 ++-
> > >  drivers/vfio/vfio_main.c          | 110 +++++++++++++++++++++++++++++-
> > >  include/linux/vfio.h              |  14 ++++
> > >  8 files changed, 217 insertions(+), 91 deletions(-)
> >
> > mtty, mdpy and mbochs?
> 
> They don't call rw or pin_pages, so they don't need to do
> anything:
> 
> 
> 	/*
> 	 * If the driver doesn't provide this op then it means the device does
> 	 * not do DMA at all. So nothing to do.
> 	 */
> 	if (!vdev->ops->bind_iommufd)
> 		return 0;
> 

OK, I see the point of this check here.

btw It'd be good to document in vfio_device_ops that driver must provide
this op if the device does DMA.

> > > +
> > > +		/* VFIO historically tries to auto-detect a kthread */
> > > +		if (!current->mm)
> > > +			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
> >
> > Can you elaborate why this cannot be put in iommufd as the default
> > policy similar to what vfio container does?
> 
> Snooping in kernel structs to try to guess the calling execution
> context is bad design. The caller should know its own context and it
> should declare positively what it is. Someday this should be lifted
> out of VFIO as well and into the drivers.
> 

with the last sentence it makes more sense. otherwise I didn't see
why putting the guess in vfio makes real difference from doing it
in iommufd as there is no vfio specific state referenced for making
this decision.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 7a45e5360caf2d..579b230a0f58d9 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1474,6 +1474,9 @@  static const struct vfio_device_ops intel_vgpu_dev_ops = {
 	.mmap		= intel_vgpu_mmap,
 	.ioctl		= intel_vgpu_ioctl,
 	.dma_unmap	= intel_vgpu_dma_unmap,
+	.bind_iommufd	= vfio_iommufd_emulated_bind,
+	.unbind_iommufd = vfio_iommufd_emulated_unbind,
+	.attach_ioas	= vfio_iommufd_emulated_attach_ioas,
 };
 
 static int intel_vgpu_probe(struct mdev_device *mdev)
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 6ae4d012d80084..560453d99c24fc 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -588,6 +588,9 @@  static const struct vfio_device_ops vfio_ccw_dev_ops = {
 	.ioctl = vfio_ccw_mdev_ioctl,
 	.request = vfio_ccw_mdev_request,
 	.dma_unmap = vfio_ccw_dma_unmap,
+	.bind_iommufd = vfio_iommufd_emulated_bind,
+	.unbind_iommufd = vfio_iommufd_emulated_unbind,
+	.attach_ioas = vfio_iommufd_emulated_attach_ioas,
 };
 
 struct mdev_driver vfio_ccw_mdev_driver = {
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 0b4cc8c597ae67..bb7776d207924f 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1789,6 +1789,9 @@  static const struct vfio_device_ops vfio_ap_matrix_dev_ops = {
 	.close_device = vfio_ap_mdev_close_device,
 	.ioctl = vfio_ap_mdev_ioctl,
 	.dma_unmap = vfio_ap_mdev_dma_unmap,
+	.bind_iommufd = vfio_iommufd_emulated_bind,
+	.unbind_iommufd = vfio_iommufd_emulated_unbind,
+	.attach_ioas = vfio_iommufd_emulated_attach_ioas,
 };
 
 static struct mdev_driver vfio_ap_matrix_driver = {
diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c
index 8772dad6808539..0388f2e33447eb 100644
--- a/drivers/vfio/container.c
+++ b/drivers/vfio/container.c
@@ -540,113 +540,45 @@  void vfio_group_unuse_container(struct vfio_group *group)
 	fput(group->opened_file);
 }
 
-/*
- * Pin contiguous user pages and return their associated host pages for local
- * domain only.
- * @device [in]  : device
- * @iova [in]    : starting IOVA of user pages to be pinned.
- * @npage [in]   : count of pages to be pinned.  This count should not
- *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
- * @prot [in]    : protection flags
- * @pages[out]   : array of host pages
- * Return error or number of pages pinned.
- *
- * A driver may only call this function if the vfio_device was created
- * by vfio_register_emulated_iommu_dev().
- */
-int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
-		   int npage, int prot, struct page **pages)
+int vfio_container_pin_pages(struct vfio_container *container,
+			     struct iommu_group *iommu_group, dma_addr_t iova,
+			     int npage, int prot, struct page **pages)
 {
-	struct vfio_container *container;
-	struct vfio_group *group = device->group;
-	struct vfio_iommu_driver *driver;
-	int ret;
-
-	if (!pages || !npage || !vfio_assert_device_open(device))
-		return -EINVAL;
+	/* group->container cannot change while a vfio device is open */
+	struct vfio_iommu_driver *driver = container->iommu_driver;
 
 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
 		return -E2BIG;
 
 	/* group->container cannot change while a vfio device is open */
-	container = group->container;
 	driver = container->iommu_driver;
-	if (likely(driver && driver->ops->pin_pages))
-		ret = driver->ops->pin_pages(container->iommu_data,
-					     group->iommu_group, iova,
-					     npage, prot, pages);
-	else
-		ret = -ENOTTY;
-
-	return ret;
+	if (unlikely(!driver || !driver->ops->pin_pages))
+		return -ENOTTY;
+	return driver->ops->pin_pages(container->iommu_data, iommu_group, iova,
+				      npage, prot, pages);
 }
-EXPORT_SYMBOL(vfio_pin_pages);
 
-/*
- * Unpin contiguous host pages for local domain only.
- * @device [in]  : device
- * @iova [in]    : starting address of user pages to be unpinned.
- * @npage [in]   : count of pages to be unpinned.  This count should not
- *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
- */
-void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
+void vfio_container_unpin_pages(struct vfio_container *container,
+				dma_addr_t iova, int npage)
 {
-	struct vfio_container *container;
-	struct vfio_iommu_driver *driver;
-
 	if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES))
 		return;
 
-	if (WARN_ON(!vfio_assert_device_open(device)))
-		return;
-
-	/* group->container cannot change while a vfio device is open */
-	container = device->group->container;
-	driver = container->iommu_driver;
-
-	driver->ops->unpin_pages(container->iommu_data, iova, npage);
+	container->iommu_driver->ops->unpin_pages(container->iommu_data, iova,
+						  npage);
 }
-EXPORT_SYMBOL(vfio_unpin_pages);
 
-/*
- * This interface allows the CPUs to perform some sort of virtual DMA on
- * behalf of the device.
- *
- * CPUs read/write from/into a range of IOVAs pointing to user space memory
- * into/from a kernel buffer.
- *
- * As the read/write of user space memory is conducted via the CPUs and is
- * not a real device DMA, it is not necessary to pin the user space memory.
- *
- * @device [in]		: VFIO device
- * @iova [in]		: base IOVA of a user space buffer
- * @data [in]		: pointer to kernel buffer
- * @len [in]		: kernel buffer length
- * @write		: indicate read or write
- * Return error code on failure or 0 on success.
- */
-int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
-		size_t len, bool write)
+int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova,
+			  void *data, size_t len, bool write)
 {
-	struct vfio_container *container;
-	struct vfio_iommu_driver *driver;
-	int ret = 0;
-
-	if (!data || len <= 0 || !vfio_assert_device_open(device))
-		return -EINVAL;
-
 	/* group->container cannot change while a vfio device is open */
-	container = device->group->container;
-	driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver = container->iommu_driver;
 
-	if (likely(driver && driver->ops->dma_rw))
-		ret = driver->ops->dma_rw(container->iommu_data,
-					  iova, data, len, write);
-	else
-		ret = -ENOTTY;
-	return ret;
+	if (unlikely(!driver || !driver->ops->dma_rw))
+		return -ENOTTY;
+	return driver->ops->dma_rw(container->iommu_data, iova, data, len,
+				   write);
 }
-EXPORT_SYMBOL(vfio_dma_rw);
 
 int __init vfio_container_init(void)
 {
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 8280bb32ee677c..40eb6931ab2321 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -102,3 +102,60 @@  int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id)
 	return iommufd_device_attach(vdev->iommufd_device, pt_id, flags);
 }
 EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas);
+
+/*
+ * The emulated standard ops mean that vfio_device is going to use the
+ * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this
+ * ops set should call vfio_register_emulated_iommu_dev().
+ */
+
+static void vfio_emulated_unmap(void *data, unsigned long iova,
+				unsigned long length)
+{
+	struct vfio_device *vdev = data;
+
+	vdev->ops->dma_unmap(vdev, iova, length);
+}
+
+static const struct iommufd_access_ops vfio_user_ops = {
+	.unmap = vfio_emulated_unmap,
+};
+
+int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
+			       struct iommufd_ctx *ictx, u32 *out_device_id)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	vdev->iommufd_ictx = ictx;
+	iommufd_ctx_get(ictx);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind);
+
+void vfio_iommufd_emulated_unbind(struct vfio_device *vdev)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (vdev->iommufd_access) {
+		iommufd_access_destroy(vdev->iommufd_access);
+		vdev->iommufd_access = NULL;
+	}
+	iommufd_ctx_put(vdev->iommufd_ictx);
+	vdev->iommufd_ictx = NULL;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind);
+
+int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id)
+{
+	struct iommufd_access *user;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	user = iommufd_access_create(vdev->iommufd_ictx, *pt_id, &vfio_user_ops,
+				     vdev);
+	if (IS_ERR(user))
+		return PTR_ERR(user);
+	vdev->iommufd_access = user;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas);
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 809f2e8523968e..d57a08afb5cf5c 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -111,8 +111,6 @@  struct vfio_iommu_driver {
 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops);
 
-bool vfio_assert_device_open(struct vfio_device *device);
-
 struct vfio_container *vfio_container_from_file(struct file *filep);
 int vfio_group_use_container(struct vfio_group *group);
 void vfio_group_unuse_container(struct vfio_group *group);
@@ -121,6 +119,14 @@  int vfio_container_attach_group(struct vfio_container *container,
 void vfio_group_detach_container(struct vfio_group *group);
 void vfio_device_container_register(struct vfio_device *device);
 void vfio_device_container_unregister(struct vfio_device *device);
+int vfio_container_pin_pages(struct vfio_container *container,
+			     struct iommu_group *iommu_group, dma_addr_t iova,
+			     int npage, int prot, struct page **pages);
+void vfio_container_unpin_pages(struct vfio_container *container,
+				dma_addr_t iova, int npage);
+int vfio_container_dma_rw(struct vfio_container *container, dma_addr_t iova,
+			  void *data, size_t len, bool write);
+
 int __init vfio_container_init(void);
 void vfio_container_cleanup(void);
 
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index bfbda04af1ffda..9b837efbddb6db 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -766,7 +766,7 @@  static int vfio_group_ioctl_set_container(struct vfio_group *group,
 static const struct file_operations vfio_device_fops;
 
 /* true if the vfio_device has open_device() called but not close_device() */
-bool vfio_assert_device_open(struct vfio_device *device)
+static bool vfio_assert_device_open(struct vfio_device *device)
 {
 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
 }
@@ -1861,6 +1861,114 @@  int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
 }
 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
 
+/*
+ * Pin contiguous user pages and return their associated host pages for local
+ * domain only.
+ * @device [in]  : device
+ * @iova [in]    : starting IOVA of user pages to be pinned.
+ * @npage [in]   : count of pages to be pinned.  This count should not
+ *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
+ * @prot [in]    : protection flags
+ * @pages[out]   : array of host pages
+ * Return error or number of pages pinned.
+ *
+ * A driver may only call this function if the vfio_device was created
+ * by vfio_register_emulated_iommu_dev() due to vfio_container_pin_pages().
+ */
+int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
+		   int npage, int prot, struct page **pages)
+{
+	if (!pages || !npage || !vfio_assert_device_open(device))
+		return -EINVAL;
+	if (device->group->container)
+		return vfio_container_pin_pages(device->group->container,
+						device->group->iommu_group,
+						iova, npage, prot, pages);
+	if (device->iommufd_access) {
+		int ret;
+
+		if (iova > ULONG_MAX)
+			return -EINVAL;
+		ret = iommufd_access_pin_pages(
+			device->iommufd_access, iova, npage * PAGE_SIZE, pages,
+			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
+		if (ret)
+			return ret;
+		return npage;
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(vfio_pin_pages);
+
+/*
+ * Unpin contiguous host pages for local domain only.
+ * @device [in]  : device
+ * @iova [in]    : starting address of user pages to be unpinned.
+ * @npage [in]   : count of pages to be unpinned.  This count should not
+ *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
+ */
+void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
+{
+	if (WARN_ON(!vfio_assert_device_open(device)))
+		return;
+
+	if (device->group->container) {
+		vfio_container_unpin_pages(device->group->container, iova,
+					   npage);
+	} else if (device->iommufd_access) {
+		if (WARN_ON(iova > ULONG_MAX))
+			return;
+		iommufd_access_unpin_pages(device->iommufd_access, iova,
+					   npage * PAGE_SIZE);
+	}
+}
+EXPORT_SYMBOL(vfio_unpin_pages);
+
+/*
+ * This interface allows the CPUs to perform some sort of virtual DMA on
+ * behalf of the device.
+ *
+ * CPUs read/write from/into a range of IOVAs pointing to user space memory
+ * into/from a kernel buffer.
+ *
+ * As the read/write of user space memory is conducted via the CPUs and is
+ * not a real device DMA, it is not necessary to pin the user space memory.
+ *
+ * @device [in]		: VFIO device
+ * @iova [in]		: base IOVA of a user space buffer
+ * @data [in]		: pointer to kernel buffer
+ * @len [in]		: kernel buffer length
+ * @write		: indicate read or write
+ * Return error code on failure or 0 on success.
+ */
+int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
+		size_t len, bool write)
+{
+	if (!data || len <= 0 || !vfio_assert_device_open(device))
+		return -EINVAL;
+
+	if (device->group->container)
+		return vfio_container_dma_rw(device->group->container, iova,
+					     data, len, write);
+
+	if (device->iommufd_access) {
+		unsigned int flags = 0;
+
+		if (iova > ULONG_MAX)
+			return -EINVAL;
+
+		/* VFIO historically tries to auto-detect a kthread */
+		if (!current->mm)
+			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
+		if (write)
+			flags |= IOMMUFD_ACCESS_RW_WRITE;
+		return iommufd_access_rw(device->iommufd_access, iova, data,
+					 len, flags);
+	}
+	return -EINVAL;
+}
+EXPORT_SYMBOL(vfio_dma_rw);
+
 /*
  * Module/class support
  */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index a7fc4d747dc226..d5f84f98c0fa8f 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -19,6 +19,7 @@ 
 struct kvm;
 struct iommufd_ctx;
 struct iommufd_device;
+struct iommufd_access;
 
 /*
  * VFIO devices can be placed in a set, this allows all devices to share this
@@ -56,8 +57,10 @@  struct vfio_device {
 	struct completion comp;
 	struct list_head group_next;
 	struct list_head iommu_entry;
+	struct iommufd_access *iommufd_access;
 #if IS_ENABLED(CONFIG_IOMMUFD)
 	struct iommufd_device *iommufd_device;
+	struct iommufd_ctx *iommufd_ictx;
 	bool iommufd_attached;
 #endif
 };
@@ -111,6 +114,10 @@  int vfio_iommufd_physical_bind(struct vfio_device *vdev,
 			       struct iommufd_ctx *ictx, u32 *out_device_id);
 void vfio_iommufd_physical_unbind(struct vfio_device *vdev);
 int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
+int vfio_iommufd_emulated_bind(struct vfio_device *vdev,
+			       struct iommufd_ctx *ictx, u32 *out_device_id);
+void vfio_iommufd_emulated_unbind(struct vfio_device *vdev);
+int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
 #else
 #define vfio_iommufd_physical_bind                                      \
 	((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx,   \
@@ -119,6 +126,13 @@  int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
 	((void (*)(struct vfio_device *vdev)) NULL)
 #define vfio_iommufd_physical_attach_ioas \
 	((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
+#define vfio_iommufd_emulated_bind                                      \
+	((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx,   \
+		  u32 *out_device_id)) NULL)
+#define vfio_iommufd_emulated_unbind \
+	((void (*)(struct vfio_device *vdev)) NULL)
+#define vfio_iommufd_emulated_attach_ioas \
+	((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL)
 #endif
 
 /**