
[v5,13/19] iommufd: Add kAPI toward external drivers for physical devices

Message ID 13-v5-4001c2997bd0+30c-iommufd_jgg@nvidia.com (mailing list archive)
State New
Series IOMMUFD Generic interface

Commit Message

Jason Gunthorpe Nov. 16, 2022, 9 p.m. UTC
Add the four functions external drivers need to connect physical DMA to
the IOMMUFD:

iommufd_device_bind() / iommufd_device_unbind()
  Register the device with iommufd and establish security isolation.

iommufd_device_attach() / iommufd_device_detach()
  Connect a bound device to a page table.

Binding a device creates a device object ID in the uAPI, however the
generic API provides no IOCTLs to manipulate them.
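
A minimal usage sketch from a consuming driver's point of view
(hypothetical caller: the ictx, dev and ioas_id here stand in for
whatever the driver already has in hand):

	struct iommufd_device *idev;
	u32 dev_id;
	u32 pt_id = ioas_id;	/* in: IOAS to attach; out: HW pagetable ID */
	int rc;

	idev = iommufd_device_bind(ictx, dev, &dev_id);
	if (IS_ERR(idev))
		return PTR_ERR(idev);

	rc = iommufd_device_attach(idev, &pt_id, 0);
	if (rc)
		goto err_unbind;

	/* ... the device can now do DMA through the IOAS mappings ... */

	iommufd_device_detach(idev);
err_unbind:
	iommufd_device_unbind(idev);
	return rc;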

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Yi Liu <yi.l.liu@intel.com>
Tested-by: Lixiao Yang <lixiao.yang@intel.com>
Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/Makefile          |   1 +
 drivers/iommu/iommufd/device.c          | 417 ++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |   5 +
 drivers/iommu/iommufd/main.c            |   3 +
 include/linux/iommufd.h                 |  13 +
 5 files changed, 439 insertions(+)
 create mode 100644 drivers/iommu/iommufd/device.c

Comments

Eric Auger Nov. 27, 2022, 9:13 p.m. UTC | #1
Hi Jason,

On 11/16/22 22:00, Jason Gunthorpe wrote:
> Add the four functions external drivers need to connect physical DMA to
> the IOMMUFD:
>
> iommufd_device_bind() / iommufd_device_unbind()
>   Register the device with iommufd and establish security isolation.
>
> iommufd_device_attach() / iommufd_device_detach()
>   Connect a bound device to a page table.
>
> Binding a device creates a device object ID in the uAPI, however the
> generic API provides no IOCTLs to manipulate them.
(yet)
>
> Tested-by: Nicolin Chen <nicolinc@nvidia.com>
> Tested-by: Yi Liu <yi.l.liu@intel.com>
> Tested-by: Lixiao Yang <lixiao.yang@intel.com>
> Tested-by: Matthew Rosato <mjrosato@linux.ibm.com>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> ---
>  drivers/iommu/iommufd/Makefile          |   1 +
>  drivers/iommu/iommufd/device.c          | 417 ++++++++++++++++++++++++
>  drivers/iommu/iommufd/iommufd_private.h |   5 +
>  drivers/iommu/iommufd/main.c            |   3 +
>  include/linux/iommufd.h                 |  13 +
>  5 files changed, 439 insertions(+)
>  create mode 100644 drivers/iommu/iommufd/device.c
>
> diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
> index e13e971aa28c60..ca28a135b9675f 100644
> --- a/drivers/iommu/iommufd/Makefile
> +++ b/drivers/iommu/iommufd/Makefile
> @@ -1,5 +1,6 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>  iommufd-y := \
> +	device.o \
>  	hw_pagetable.o \
>  	io_pagetable.o \
>  	ioas.o \
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> new file mode 100644
> index 00000000000000..a71f5740773f84
> --- /dev/null
> +++ b/drivers/iommu/iommufd/device.c
> @@ -0,0 +1,417 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
> + */
> +#include <linux/iommufd.h>
> +#include <linux/slab.h>
> +#include <linux/iommu.h>
> +#include <linux/irqdomain.h>
> +
> +#include "iommufd_private.h"
> +
> +/*
> + * An iommufd_device object represents the binding relationship between a
> + * consuming driver and the iommufd. These objects are created/destroyed by
> + * external drivers, not by userspace.
> + */
> +struct iommufd_device {
> +	struct iommufd_object obj;
> +	struct iommufd_ctx *ictx;
> +	struct iommufd_hw_pagetable *hwpt;
> +	/* Head at iommufd_hw_pagetable::devices */
> +	struct list_head devices_item;
> +	/* always the physical device */
> +	struct device *dev;
> +	struct iommu_group *group;
> +	bool enforce_cache_coherency;
> +};
> +
> +void iommufd_device_destroy(struct iommufd_object *obj)
> +{
> +	struct iommufd_device *idev =
> +		container_of(obj, struct iommufd_device, obj);
> +
> +	iommu_device_release_dma_owner(idev->dev);
> +	iommu_group_put(idev->group);
> +	iommufd_ctx_put(idev->ictx);
> +}
> +
> +/**
> + * iommufd_device_bind - Bind a physical device to an iommu fd
> + * @ictx: iommufd file descriptor
> + * @dev: Pointer to a physical PCI device struct
not a PCI dev anymore
> + * @id: Output ID number to return to userspace for this device
> + *
> + * A successful bind establishes an ownership over the device and returns
> + * struct iommufd_device pointer, otherwise returns error pointer.
> + *
> + * A driver using this API must set driver_managed_dma and must not touch
> + * the device until this routine succeeds and establishes ownership.
> + *
> + * Binding a PCI device places the entire RID under iommufd control.
> + *
> + * The caller must undo this with iommufd_device_unbind()
> + */
> +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
> +					   struct device *dev, u32 *id)
> +{
> +	struct iommufd_device *idev;
> +	struct iommu_group *group;
> +	int rc;
> +
> +	/*
> +	 * iommufd always sets IOMMU_CACHE because we offer no way for userspace
> +	 * to restore cache coherency.
> +	 */
> +	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
> +		return ERR_PTR(-EINVAL);
> +
> +	group = iommu_group_get(dev);
> +	if (!group)
> +		return ERR_PTR(-ENODEV);
> +
> +	rc = iommu_device_claim_dma_owner(dev, ictx);
> +	if (rc)
> +		goto out_group_put;
> +
> +	idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
> +	if (IS_ERR(idev)) {
> +		rc = PTR_ERR(idev);
> +		goto out_release_owner;
> +	}
> +	idev->ictx = ictx;
> +	iommufd_ctx_get(ictx);
> +	idev->dev = dev;
> +	idev->enforce_cache_coherency =
> +		device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
> +	/* The calling driver is a user until iommufd_device_unbind() */
> +	refcount_inc(&idev->obj.users);
> +	/* group refcount moves into iommufd_device */
> +	idev->group = group;
> +
> +	/*
> +	 * If the caller fails after this success it must call
> +	 * iommufd_device_unbind() which is safe since we hold this refcount.
> +	 * This also means the device is a leaf in the graph and no other object
> +	 * can take a reference on it.
> +	 */
> +	iommufd_object_finalize(ictx, &idev->obj);
> +	*id = idev->obj.id;
> +	return idev;
> +
> +out_release_owner:
> +	iommu_device_release_dma_owner(dev);
> +out_group_put:
> +	iommu_group_put(group);
> +	return ERR_PTR(rc);
> +}
> +EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
> +
> +/**
> + * iommufd_device_unbind - Undo iommufd_device_bind()
> + * @idev: Device returned by iommufd_device_bind()
> + *
> + * Release the device from iommufd control. The DMA ownership will return back
> + * to unowned with DMA controlled by the DMA API. This invalidates the
> + * iommufd_device pointer, other APIs that consume it must not be called
> + * concurrently.
> + */
> +void iommufd_device_unbind(struct iommufd_device *idev)
> +{
> +	bool was_destroyed;
> +
> +	was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj);
> +	WARN_ON(!was_destroyed);
> +}
> +EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
> +
> +static int iommufd_device_setup_msi(struct iommufd_device *idev,
> +				    struct iommufd_hw_pagetable *hwpt,
> +				    phys_addr_t sw_msi_start,
> +				    unsigned int flags)
> +{
> +	int rc;
> +
> +	/*
> +	 * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it
rather means that the *IOMMU* implements IRQ remapping.

irq_domain_check_msi_remap() instead means the MSI controller implements that functionality (a given device id is able to trigger MSI #n and this #n gets translated into actual MSI #m)
So what you want is to prevent an assigned device from being able to DMA into an MSI doorbell that is not protected by either the IOMMU or the MSI controller. If this happens, it means the DMA can generate any kind of MSI traffic that can jeopardize the host or other VMs.

To me this is independent of the fact that the IOMMU translates MSI write requests (targeting the address where the MSI message is written), so independent of the fact that a SW MSI reserved region is exposed.

That's why in vfio_iommu_type1.c the msi_remap capability is checked first:

        msi_remap = irq_domain_check_msi_remap() ||
                    iommu_group_for_each_dev(iommu_group, (void *)IOMMU_CAP_INTR_REMAP,
                                             vfio_iommu_device_capable);

        if (!vfio_allow_unsafe_interrupts && !msi_remap) {
                pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
                       __func__);
                ret = -EPERM;
                goto out_detach;
        }


and afterwards resv_msi is checked to see if we need to create the so-called MSI cookie.
This MSI cookie indicates that the MSI writes are translated by the IOMMU and that somebody must create a mapping for those transactions.
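
Concretely, in vfio_iommu_type1_attach_group() that is roughly
(simplified):

        if (resv_msi) {
                ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
                if (ret && ret != -ENODEV)
                        goto out_detach;
        }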

> +	 * creates the MSI window by default in the iommu domain. Nothing
> +	 * further to do.
> +	 */
> +	if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP))
> +		return 0;
> +
> +	/*
> +	 * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every
> +	 * allocated iommu_domain will block interrupts by default and this
It sounds like there is confusion between IRQ remapping and the fact that MSI
writes are not bypassed by the IOMMU.
> +	 * special flow is needed to turn them back on. iommu_dma_prepare_msi()
> +	 * will install pages into our domain after request_irq() to make this
> +	 * work.
> +	 *
> +	 * FIXME: This is conceptually broken for iommufd since we want to allow
> +	 * userspace to change the domains, eg switch from an identity IOAS to a
> +	 * DMA IOAS. There is currently no way to create a MSI window that
> +	 * matches what the IRQ layer actually expects in a newly created
> +	 * domain.
> +	 */
> +	if (irq_domain_check_msi_remap()) {
> +		if (WARN_ON(!sw_msi_start))
> +			return -EPERM;
> +		/*
> +		 * iommu_get_msi_cookie() can only be called once per domain,
> +		 * it returns -EBUSY on later calls.
> +		 */
> +		if (hwpt->msi_cookie)
> +			return 0;
> +		rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
> +		if (rc)
> +			return rc;
> +		hwpt->msi_cookie = true;
> +		return 0;
> +	}
> +
> +	/*
> +	 * Otherwise the platform has a MSI window that is not isolated. For
> +	 * historical compat with VFIO allow a module parameter to ignore the
> +	 * insecurity.
> +	 */
> +	if (!(flags & IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT))
> +		return -EPERM;
> +
> +	dev_warn(
> +		idev->dev,
> +		"Device interrupts cannot be isolated by the IOMMU, this platform in insecure. Use an \"allow_unsafe_interrupts\" module parameter to override\n");
> +	return 0;
> +}
> +
> +static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
> +					   struct iommu_group *group)
> +{
> +	struct iommufd_device *cur_dev;
> +
> +	list_for_each_entry(cur_dev, &hwpt->devices, devices_item)
> +		if (cur_dev->group == group)
> +			return true;
> +	return false;
> +}
> +
> +static int iommufd_device_do_attach(struct iommufd_device *idev,
> +				    struct iommufd_hw_pagetable *hwpt,
> +				    unsigned int flags)
> +{
> +	phys_addr_t sw_msi_start = 0;
> +	int rc;
> +
> +	mutex_lock(&hwpt->devices_lock);
> +
> +	/*
> +	 * Try to upgrade the domain we have, it is an iommu driver bug to
> +	 * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail
> +	 * enforce_cache_coherency when there are no devices attached to the
> +	 * domain.
> +	 */
> +	if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) {
> +		if (hwpt->domain->ops->enforce_cache_coherency)
> +			hwpt->enforce_cache_coherency =
> +				hwpt->domain->ops->enforce_cache_coherency(
> +					hwpt->domain);
> +		if (!hwpt->enforce_cache_coherency) {
> +			WARN_ON(list_empty(&hwpt->devices));
> +			rc = -EINVAL;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
> +						   idev->group, &sw_msi_start);
> +	if (rc)
> +		goto out_unlock;
> +
so in the case of any IOMMU_RESV_MSI, iommufd_device_setup_msi() will be
called with *sw_msi_start = 0 which will return -EPERM?
I don't think this is what we want. In that case I think we want the
RESV_MSI region to be taken into account as a RESV region but we don't
need the MSI cookie.
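
(For context, iopt_table_enforce_group_resv_regions() only records the
base of SW MSI regions while it reserves every region - simplified
paraphrase of the logic, not the exact code:

        list_for_each_entry(resv, &resv_regions, list) {
                if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI)
                        *sw_msi_start = resv->start;

                rc = iopt_reserve_iova(iopt, resv->start,
                                       resv->length - 1 + resv->start,
                                       device);
                if (rc)
                        goto out_reserved;
        }

so a pure HW IOMMU_RESV_MSI region is reserved in the iopt but leaves
*sw_msi_start at 0.)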
> +	rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start, flags);
> +	if (rc)
> +		goto out_iova;
> +
> +	/*
> +	 * FIXME: Hack around missing a device-centric iommu api, only attach to
> +	 * the group once for the first device that is in the group.
> +	 */
> +	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> +		rc = iommu_attach_group(hwpt->domain, idev->group);
> +		if (rc)
> +			goto out_iova;
> +
> +		if (list_empty(&hwpt->devices)) {
> +			rc = iopt_table_add_domain(&hwpt->ioas->iopt,
> +						   hwpt->domain);
> +			if (rc)
> +				goto out_detach;
> +		}
> +	}
> +
> +	idev->hwpt = hwpt;
> +	refcount_inc(&hwpt->obj.users);
> +	list_add(&idev->devices_item, &hwpt->devices);
> +	mutex_unlock(&hwpt->devices_lock);
> +	return 0;
> +
> +out_detach:
> +	iommu_detach_group(hwpt->domain, idev->group);
> +out_iova:
> +	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
> +out_unlock:
> +	mutex_unlock(&hwpt->devices_lock);
> +	return rc;
> +}
> +
> +/*
> + * When automatically managing the domains we search for a compatible domain in
> + * the iopt and if one is found use it, otherwise create a new domain.
> + * Automatic domain selection will never pick a manually created domain.
> + */
> +static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
> +					  struct iommufd_ioas *ioas,
> +					  unsigned int flags)
> +{
> +	struct iommufd_hw_pagetable *hwpt;
> +	int rc;
> +
> +	/*
> +	 * There is no differentiation when domains are allocated, so any domain
> +	 * that is willing to attach to the device is interchangeable with any
> +	 * other.
> +	 */
> +	mutex_lock(&ioas->mutex);
> +	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
> +		if (!hwpt->auto_domain)
> +			continue;
> +
> +		rc = iommufd_device_do_attach(idev, hwpt, flags);
> +
> +		/*
> +		 * -EINVAL means the domain is incompatible with the device.
> +		 * Other error codes should propagate to userspace as failure.
> +		 * Success means the domain is attached.
> +		 */
> +		if (rc == -EINVAL)
> +			continue;
> +		goto out_unlock;
> +	}
> +
> +	hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev);
> +	if (IS_ERR(hwpt)) {
> +		rc = PTR_ERR(hwpt);
> +		goto out_unlock;
> +	}
> +	hwpt->auto_domain = true;
> +
> +	rc = iommufd_device_do_attach(idev, hwpt, flags);
> +	if (rc)
> +		goto out_abort;
> +	list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list);
> +
> +	mutex_unlock(&ioas->mutex);
> +	iommufd_object_finalize(idev->ictx, &hwpt->obj);
> +	return 0;
> +
> +out_abort:
> +	iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
> +out_unlock:
> +	mutex_unlock(&ioas->mutex);
> +	return rc;
> +}
> +
> +/**
> + * iommufd_device_attach - Connect a device to an iommu_domain
> + * @idev: device to attach
> + * @pt_id: Input an IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
> + *         Output the IOMMUFD_OBJ_HW_PAGETABLE ID
> + * @flags: Optional flags
> + *
> + * This connects the device to an iommu_domain, either automatically or manually
> + * selected. Once this completes the device could do DMA.
> + *
> + * The caller should return the resulting pt_id back to userspace.
> + * This function is undone by calling iommufd_device_detach().
> + */
> +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
> +			  unsigned int flags)
> +{
> +	struct iommufd_object *pt_obj;
> +	int rc;
> +
> +	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
> +	if (IS_ERR(pt_obj))
> +		return PTR_ERR(pt_obj);
> +
> +	switch (pt_obj->type) {
> +	case IOMMUFD_OBJ_HW_PAGETABLE: {
> +		struct iommufd_hw_pagetable *hwpt =
> +			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
> +
> +		rc = iommufd_device_do_attach(idev, hwpt, flags);
> +		if (rc)
> +			goto out_put_pt_obj;
> +
> +		mutex_lock(&hwpt->ioas->mutex);
> +		list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
> +		mutex_unlock(&hwpt->ioas->mutex);
> +		break;
> +	}
> +	case IOMMUFD_OBJ_IOAS: {
> +		struct iommufd_ioas *ioas =
> +			container_of(pt_obj, struct iommufd_ioas, obj);
> +
> +		rc = iommufd_device_auto_get_domain(idev, ioas, flags);
> +		if (rc)
> +			goto out_put_pt_obj;
> +		break;
> +	}
> +	default:
> +		rc = -EINVAL;
> +		goto out_put_pt_obj;
> +	}
> +
> +	refcount_inc(&idev->obj.users);
> +	*pt_id = idev->hwpt->obj.id;
> +	rc = 0;
> +
> +out_put_pt_obj:
> +	iommufd_put_object(pt_obj);
> +	return rc;
> +}
> +EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
> +
> +/**
> + * iommufd_device_detach - Disconnect a device to an iommu_domain
from
> + * @idev: device to detach
> + *
> + * Undo iommufd_device_attach(). This disconnects the idev from the previously
> + * attached pt_id. The device returns back to a blocked DMA translation.
> + */
> +void iommufd_device_detach(struct iommufd_device *idev)
> +{
> +	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
> +
> +	mutex_lock(&hwpt->ioas->mutex);
> +	mutex_lock(&hwpt->devices_lock);
> +	list_del(&idev->devices_item);
> +	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
> +		if (list_empty(&hwpt->devices)) {
> +			iopt_table_remove_domain(&hwpt->ioas->iopt,
> +						 hwpt->domain);
> +			list_del(&hwpt->hwpt_item);
> +		}
> +		iommu_detach_group(hwpt->domain, idev->group);
> +	}
> +	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
> +	mutex_unlock(&hwpt->devices_lock);
> +	mutex_unlock(&hwpt->ioas->mutex);
> +
> +	if (hwpt->auto_domain)
> +		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
> +	else
> +		refcount_dec(&hwpt->obj.users);
> +
> +	idev->hwpt = NULL;
> +
> +	refcount_dec(&idev->obj.users);
> +}
> +EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index bb5cbd8f4e5991..73345886d969e5 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -103,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
>  enum iommufd_object_type {
>  	IOMMUFD_OBJ_NONE,
>  	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
> +	IOMMUFD_OBJ_DEVICE,
>  	IOMMUFD_OBJ_HW_PAGETABLE,
>  	IOMMUFD_OBJ_IOAS,
>  };
> @@ -229,6 +230,8 @@ struct iommufd_hw_pagetable {
>  	struct iommufd_ioas *ioas;
>  	struct iommu_domain *domain;
>  	bool auto_domain : 1;
> +	bool enforce_cache_coherency : 1;
> +	bool msi_cookie : 1;
>  	/* Head at iommufd_ioas::hwpt_list */
>  	struct list_head hwpt_item;
>  	struct mutex devices_lock;
> @@ -240,6 +243,8 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
>  			   struct device *dev);
>  void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
>  
> +void iommufd_device_destroy(struct iommufd_object *obj);
> +
>  struct iommufd_access {
>  	unsigned long iova_alignment;
>  	u32 iopt_access_list_id;
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 3eab714b8e12a3..8a114ddbdfcde2 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -352,6 +352,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx)
>  EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
>  
>  static const struct iommufd_object_ops iommufd_object_ops[] = {
> +	[IOMMUFD_OBJ_DEVICE] = {
> +		.destroy = iommufd_device_destroy,
> +	},
>  	[IOMMUFD_OBJ_IOAS] = {
>  		.destroy = iommufd_ioas_destroy,
>  	},
> diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
> index 26e09d539737bb..31efacd8a46cce 100644
> --- a/include/linux/iommufd.h
> +++ b/include/linux/iommufd.h
> @@ -9,10 +9,23 @@
>  #include <linux/types.h>
>  #include <linux/errno.h>
>  #include <linux/err.h>
> +#include <linux/device.h>
>  
> +struct iommufd_device;
>  struct iommufd_ctx;
>  struct file;
>  
> +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
> +					   struct device *dev, u32 *id);
> +void iommufd_device_unbind(struct iommufd_device *idev);
> +
> +enum {
> +	IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT = 1 << 0,
> +};
> +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
> +			  unsigned int flags);
> +void iommufd_device_detach(struct iommufd_device *idev);
> +
>  enum {
>  	IOMMUFD_ACCESS_RW_READ = 0,
>  	IOMMUFD_ACCESS_RW_WRITE = 1 << 0,
Thanks

Eric
Jason Gunthorpe Nov. 28, 2022, 12:14 a.m. UTC | #2
On Sun, Nov 27, 2022 at 10:13:55PM +0100, Eric Auger wrote:
> > +static int iommufd_device_setup_msi(struct iommufd_device *idev,
> > +				    struct iommufd_hw_pagetable *hwpt,
> > +				    phys_addr_t sw_msi_start,
> > +				    unsigned int flags)
> > +{
> > +	int rc;
> > +
> > +	/*
> > +	 * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it
> rather means that the *IOMMU* implements IRQ remapping.

Not really. The name is a mess, but as it is implemented, it means the
platform is implementing MSI security. How exactly that is done is not
really defined, and it doesn't really belong as an iommu property.
However the security is created, it is done in a way that is
transparent to the iommu_domain user.

MSI security means that the originating device (eg the RID for PCI) is
validated when the MSI is processed. RIDs can only use MSIs they are
authorized to use.

It doesn't matter how it is done, whether it is remapping HW, fancy
iommu_domain tricks, or built into the MSI controller. Set this flag
if the platform is secure and doesn't need the code triggered by
irq_domain_check_msi_remap().

> irq_domain_check_msi_remap() instead means the MSI controller
> implements that functionality (a given device id is able to trigger

Not quite, it means that MSI isolation is available, however it is not
transparent and the iommu_domain user must do the little dance that
follows.

It does not mean the MSI controller implements the security
functionality because it is not set on x86.

Logically on x86 we should consider the remapping logic as part of the
MSI controller even if x86 makes them separated for historical
reasons.

> MSI #n and this #n gets translated into actual MSI #m) So what you
> want is to prevent an assigned device from being able to DMA into an
> MSI doorbell that is not protected by either the IOMMU or the MSI
> controller. If this happens, it means the DMA can generate any kind
> of MSI traffic that can jeopardize the host or other VMs.

I don't know of any real systems that work like this. ARM standard GIC
uses a shared ITS page, the IOMMU does nothing to provide MSI
security. MSI security is built into the GIC because it validates the
device ID as part of processing the MSI. The IOMMU is only involved
to grant access to the shared ITS page.

Intel is similar, it provides security through the MSI controller's
remapping logic, the only difference is that on Intel the MSI window
is always present in every iommu_domain (because it occurs before the
IOMMU), and in ARM you have to do the little dance.

Even if the iommu is to be involved it is all hidden from this layer.

> and afterwards resv_msi is checked to see if we need to create the
> so-called MSI cookie. This MSI cookie indicates that the MSI writes are
> translated by the IOMMU and that somebody must create a mapping for those
> transactions.

The cookie is always necessary if we are going to use the
non-transparent mode. That is what makes it the non transparent
mode. We have to supply the reserved IOVA range from one part of the
iommu driver to another part.
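
As a sketch, the two halves of the non-transparent flow look like this
(helper names from the dma-iommu code, simplified):

	/* the iommu_domain user reserves an IOVA window for the doorbell */
	rc = iommu_get_msi_cookie(domain, sw_msi_start);

	/* the irq layer later maps the doorbell into that window via the
	 * cookie when the MSI is set up */
	rc = iommu_dma_prepare_msi(msi_desc, msi_doorbell_phys);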

> > +	 * creates the MSI window by default in the iommu domain. Nothing
> > +	 * further to do.
> > +	 */
> > +	if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP))
> > +		return 0;
> > +
> > +	/*
> > +	 * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every
> > +	 * allocated iommu_domain will block interrupts by default and this
> It sounds like there is confusion between IRQ remapping and the fact that MSI
> writes are not bypassed by the IOMMU.

I don't think I'm confused :)

> > +static int iommufd_device_do_attach(struct iommufd_device *idev,
> > +				    struct iommufd_hw_pagetable *hwpt,
> > +				    unsigned int flags)
> > +{
> > +	phys_addr_t sw_msi_start = 0;
> > +	int rc;
> > +
> > +	mutex_lock(&hwpt->devices_lock);
> > +
> > +	/*
> > +	 * Try to upgrade the domain we have, it is an iommu driver bug to
> > +	 * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail
> > +	 * enforce_cache_coherency when there are no devices attached to the
> > +	 * domain.
> > +	 */
> > +	if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) {
> > +		if (hwpt->domain->ops->enforce_cache_coherency)
> > +			hwpt->enforce_cache_coherency =
> > +				hwpt->domain->ops->enforce_cache_coherency(
> > +					hwpt->domain);
> > +		if (!hwpt->enforce_cache_coherency) {
> > +			WARN_ON(list_empty(&hwpt->devices));
> > +			rc = -EINVAL;
> > +			goto out_unlock;
> > +		}
> > +	}
> > +
> > +	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
> > +						   idev->group, &sw_msi_start);
> > +	if (rc)
> > +		goto out_unlock;
> > +
> so in the case of any IOMMU_RESV_MSI, iommufd_device_setup_msi() will be
> called with *sw_msi_start = 0 which will return -EPERM?
> I don't think this is what we want. In that case I think we want the
> RESV_MSI region to be taken into account as a RESV region but we don't
> need the MSI cookie.

This was sort of sloppily copied from VFIO - we should just delete
it. There is no driver that sets both, and once the platform asserts
irq_domain_check_msi_remap() it is going down the non-transparent path
anyhow and must set a cookie to work. [again the names doesn't make
any sense for the functionality]

Failing with EPERM is probably not so bad since the platform is using
an invalid configuration. I'm kind of inclined to leave this for right
now since it has all been tested and I don't want to make a
regression. But I can try to send a patch to tidy it a bit more, maybe
add an appropriate WARN_ON.

The whole thing is actually backwards. The IOMMU_CAP_INTR_REMAP should
have been some global irq_has_secure_msi(), set by everything with
interrupt remapping, and ARM should set it too.

Then the domain should have had a domain cap
IOMMU_CAP_REQUIRE_MSI_WINDOW_SETUP to do the little dance ARM drivers
need.

And even better would have been to not have the little dance in the
first place, as we don't really need the iommu_domain user to convey
the fixed MSI window from one part of the iommu driver to another.

We may try to fix this up when we get to doing nesting on ARM, or
maybe just leave the confusing sort of working thing as-is. I don't
know.

Jason
Eric Auger Nov. 28, 2022, 10:55 a.m. UTC | #3
Hi Jason,

On 11/28/22 01:14, Jason Gunthorpe wrote:
> On Sun, Nov 27, 2022 at 10:13:55PM +0100, Eric Auger wrote:
>>> +static int iommufd_device_setup_msi(struct iommufd_device *idev,
>>> +				    struct iommufd_hw_pagetable *hwpt,
>>> +				    phys_addr_t sw_msi_start,
>>> +				    unsigned int flags)
>>> +{
>>> +	int rc;
>>> +
>>> +	/*
>>> +	 * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it
>> rather means that the *IOMMU* implements IRQ remapping.
> Not really. The name is a mess, but as it is implemented, it means the
> platform is implementing MSI security. How exactly that is done is not
> really defined, and it doesn't really belong as an iommu property.
> However the security is created, it is done in a way that is
> transparent to the iommu_domain user.
Some 'ARM platforms' implement what you call MSI security but they do
not advertise IOMMU_CAP_INTR_REMAP

Besides refering to include/linux/iommu.h:
IOMMU_CAP_INTR_REMAP,           /* IOMMU supports interrupt isolation */

>
> MSI security means that the originating device (eg the RID for PCI) is
> validated when the MSI is processed. RIDs can only use MSIs they are
> authorized to use.
agreed this is what I described below.
>
> It doesn't matter how it is done, if it remapping HW, fancy
> iommu_domain tricks, or built into the MSI controller. Set this flag
> if the platform is secure and doesn't need the code triggered by
> irq_domain_check_msi_remap().
this is not what is implemented as of now. If the IOMMU does support
interrupt isolation, it advertises IOMMU_CAP_INTR_REMAP. On ARM this
feature is implemented by the ITS MSI controller instead and the only
way to retrieve the info whether the device MSIs are directed to that
kind of MSI controller is to use irq_domain_check_msi_remap().
>
>> irq_domain_check_msi_remap() instead means the MSI controller
>> implements that functionality (a given device id is able to trigger
> Not quite, it means that MSI isolation is available, however it is not
> transparent and the iommu_domain user must do the little dance that
> follows.
No I do not agree on that point. The 'little dance' is needed because
the SMMU does not bypass MSI writes as done on Intel. And someone must
take care of the MSI transaction mapping. This is the role of the MSI
cookie stuff. To me this is independent of the above discussion of whether
MSI isolation is implemented.
>
> It does not mean the MSI controller implements the security
> functionality because it is not set on x86.
>
> Logically on x86 we should consider the remapping logic as part of the
> MSI controller even if x86 keeps them separate for historical
> reasons.
>
>> MSI #n and this #n gets translated into actual MSI #m) So what you
>> want is to prevent an assigned device from being able to DMA into an
>> MSI doorbell that is not protected by either the IOMMU or the MSI
>> controller. If this happens this means the DMA can generate any kind
>> of MSI traffic that can jeopardize the host or other VMs
> I don't know of any real systems that work like this. ARM standard GIC
> uses a shared ITS page, the IOMMU does nothing to provide MSI
> security. MSI security is built into the GIC because it validates the
> device ID as part of processing the MSI. The IOMMU is only involved
> to grant access to the shared ITS page.

?? Yeah that's what I said. The SMMU does nothing about MSI security.
The ITS does.
>
> Intel is similar, it provides security through the MSI controller's
> remapping logic, the only difference is that on Intel the MSI window
> is always present in every iommu_domain (because it occurs before the
> IOMMU), and in ARM you have to do the little dance.
On Intel the MSI window [FEE0_0000h - FEF0_0000h] is bypassed by the
IOMMU. On ARM MSI transactions are translated except in case of HW MSI
RESV regions (I think).
>
> Even if the iommu is to be involved it is all hidden from this layer.
>
>> and afterwards resv_msi is checked to see if we need to create the
>> so-called MSI cookie. This MSI cookie indicates that the MSI writes are
>> translated by the IOMMU and that somebody must create a mapping for those
>> transactions.
> The cookie is always necessary if we are going to use the
> non-transparent mode. That is what makes it the non transparent
> mode. We have to supply the reserved IOVA range from one part of the
> iommu driver to another part.
>
>>> +	 * creates the MSI window by default in the iommu domain. Nothing
>>> +	 * further to do.
>>> +	 */
>>> +	if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP))
>>> +		return 0;
>>> +
>>> +	/*
>>> +	 * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every
>>> +	 * allocated iommu_domain will block interrupts by default and this
>> It sounds like there is confusion between IRQ remapping and the fact that MSI
>> writes are not bypassed by the IOMMU.
> I don't think I'm confused :)
As soon as there is an SW MSI RESV region, and only in that case, you need
to put the MSI cookie in place (because it indicates the IOMMU
translates MSI transactions).

The fact the platform provides MSI security (through IOMMU or MSI
controller) looks orthogonal to me.
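
(The SW MSI region itself is what the SMMU drivers advertise from their
resv region hook, roughly as below - simplified, and the exact
iommu_alloc_resv_region() signature varies by kernel version:

        region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
                                         prot, IOMMU_RESV_SW_MSI);
        if (!region)
                return;
        list_add_tail(&region->list, head);

and it is exactly this IOMMU_RESV_SW_MSI type that triggers the cookie
setup.)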
>
>>> +static int iommufd_device_do_attach(struct iommufd_device *idev,
>>> +				    struct iommufd_hw_pagetable *hwpt,
>>> +				    unsigned int flags)
>>> +{
>>> +	phys_addr_t sw_msi_start = 0;
>>> +	int rc;
>>> +
>>> +	mutex_lock(&hwpt->devices_lock);
>>> +
>>> +	/*
>>> +	 * Try to upgrade the domain we have, it is an iommu driver bug to
>>> +	 * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail
>>> +	 * enforce_cache_coherency when there are no devices attached to the
>>> +	 * domain.
>>> +	 */
>>> +	if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) {
>>> +		if (hwpt->domain->ops->enforce_cache_coherency)
>>> +			hwpt->enforce_cache_coherency =
>>> +				hwpt->domain->ops->enforce_cache_coherency(
>>> +					hwpt->domain);
>>> +		if (!hwpt->enforce_cache_coherency) {
>>> +			WARN_ON(list_empty(&hwpt->devices));
>>> +			rc = -EINVAL;
>>> +			goto out_unlock;
>>> +		}
>>> +	}
>>> +
>>> +	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
>>> +						   idev->group, &sw_msi_start);
>>> +	if (rc)
>>> +		goto out_unlock;
>>> +
>> so in the case of any IOMMU_RESV_MSI, iommufd_device_setup_msi() will be
>> called with *sw_msi_start = 0 which will return -EPERM?
>> I don't think this is what we want. In that case I think we want the
>> RESV_MSI region to be taken into account as a RESV region but we don't
>> need the MSI cookie.
> This was sort of sloppily copied from VFIO - we should just delete
> it. There is no driver that sets both, and once the platform asserts
> irq_domain_check_msi_remap() it is going down the non-transparent path
> anyhow and must set a cookie to work. [again the names doesn't make
> any sense for the functionality]
>
> Failing with EPERM is probably not so bad since the platform is using
> an invalid configuration. I'm kind of inclined to leave this for right
I don't understand why it is invalid. A HW MSI RESV region is a valid
config, and I am not sure you tested with that kind of setup, did you?
> now since it has all been tested and I don't want to make a
> regression. But I can try to send a patch to tidy it a bit more, maybe
> add an appropriate WARN_ON.
>
> The whole thing is actually backwards. The IOMMU_CAP_INTR_REMAP should
> have been some global irq_has_secure_msi(), set by everything with
> interrupt remapping, and ARM should set it too.
You are revisiting this IOMMU_CAP_INTR_REMAP definition ... this is not
what is documented in the header file.

>
> Then the domain should have had a domain cap
> IOMMU_CAP_REQUIRE_MSI_WINDOW_SETUP to do the little dance ARM drivers
> need.
>
> And even better would have been to not have the little dance in the
> first place, as we don't really need the iommu_domain user to convey
> the fixed MSI window from one part of the iommu driver to another.
>
> We may try to fix this up when we get to doing nesting on ARM, or
> maybe just leave the confusing sort of working thing as-is. I don't
> know.
>
> Jason
>
Eric
Jason Gunthorpe Nov. 28, 2022, 1:20 p.m. UTC | #4
On Mon, Nov 28, 2022 at 11:55:41AM +0100, Eric Auger wrote:

> > Not really. The name is a mess, but as it is implemented, it means the
> > platform is implementing MSI security. How exactly that is done is not
> > really defined, and it doesn't really belong as an iommu property.
> > However the security is created, it is done in a way that is
> > transparent to the iommu_domain user.
> Some 'ARM platforms' implement what you call MSI security but they do
> not advertise IOMMU_CAP_INTR_REMAP

Sounds like a bug.
 
> Besides refering to include/linux/iommu.h:
> IOMMU_CAP_INTR_REMAP,           /* IOMMU supports interrupt isolation */

Documentation doesn't match code.

> > It doesn't matter how it is done, whether it is remapping HW, fancy
> > iommu_domain tricks, or built into the MSI controller. Set this flag
> > if the platform is secure and doesn't need the code triggered by
> > irq_domain_check_msi_remap().

> this is not what is implemented as of now. If the IOMMU does support
> interrupt isolation, it advertises IOMMU_CAP_INTR_REMAP. On ARM this
> feature is implemented by the ITS MSI controller instead and the only
> way to retrieve the info whether the device MSIs are directed to that
> kind of MSI controller is to use irq_domain_check_msi_remap().

It is important to keep the Linux design separate from what the
architecture papers describe. In Linux the IOMMU is represented by
the iommu_domain and the iommu_ops. On x86 neither of these objects
play any role in interrupt delivery. Yes, the x86 architecture papers
place some of the interrupt logic inside what they consider the iommu
block, but that is just some historical stuff and shouldn't impact the
SW design.

If we had put the IRTE bits inside the irqchip layer instead of in the
iommu driver, it would have made a lot more sense.

The fact that ARM was allowed to be different (or rather the x86 mess
wasn't cleaned up before the ARM mess was overlayed on top) is why
this is so confusing. They are doing the same things, just in
unnecessarily different ways.

> >> irq_domain_check_msi_remap() instead means the MSI controller
> >> implements that functionality (a given device id is able to trigger
> > Not quite, it means that MSI isolation is available, however it is not
> > transparent and the iommu_domain user must do the little dance that
> > follows.
> No I do not agree on that point. The 'little dance' is needed because
> the SMMU does not bypass MSI writes as done on Intel. And someone must
> take care of the MSI transaction mapping. This is the role of the MSI
> cookie stuff. To me this is independent of the above discussion of whether
> MSI isolation is implemented.

OK, so you are worried that someone who sets
allow_unsafe_interrupts=1 will not get the iommu_get_msi_cookie()
call done even though they still need it? That does seem wrong.

> > This was sort of sloppily copied from VFIO - we should just delete
> > it. There is no driver that sets both, and once the platform asserts
> > irq_domain_check_msi_remap() it is going down the non-transparent path
> > anyhow and must set a cookie to work. [again the names doesn't make
> > any sense for the functionality]
> >
> > Failing with EPERM is probably not so bad since the platform is using
> > an invalid configuration. I'm kind of inclined to leave this for
> > right

> I don't understand why it is invalid. A HW MSI RESV region is a valid
> config, and I am not sure you tested with that kind of setup, did you?

Why would it be a valid config? No driver sets both..

HW MSI RESV should set IOMMU_CAP_INTR_REMAP like Intel does.

irq_domain_check_msi_remap() is only for SW MSI RESV regions.

This is what the code implements, and yes it makes no sense.

Jason
Eric Auger Nov. 28, 2022, 2:17 p.m. UTC | #5
On 11/28/22 14:20, Jason Gunthorpe wrote:
> On Mon, Nov 28, 2022 at 11:55:41AM +0100, Eric Auger wrote:
>
>>> Not really. The name is a mess, but as it is implemented, it means the
>>> platform is implementing MSI security. How exactly that is done is not
>>> really defined, and it doesn't really belong as an iommu property.
>>> However the security is created, it is done in a way that is
>>> transparent to the iommu_domain user.
>> Some 'ARM platforms' implement what you call MSI security but they do
>> not advertise IOMMU_CAP_INTR_REMAP
> Sounds like a bug.
>  
>> Besides refering to include/linux/iommu.h:
>> IOMMU_CAP_INTR_REMAP,           /* IOMMU supports interrupt isolation */
> Documentation doesn't match code.
>
>>> It doesn't matter how it is done, whether it is remapping HW, fancy
>>> iommu_domain tricks, or built into the MSI controller. Set this flag
>>> if the platform is secure and doesn't need the code triggered by
>>> irq_domain_check_msi_remap().
>> this is not what is implemented as of now. If the IOMMU does support
>> interrupt isolation, it advertises IOMMU_CAP_INTR_REMAP. On ARM this
>> feature is implemented by the ITS MSI controller instead and the only
>> way to retrieve the info whether the device MSIs are directed to that
>> kind of MSI controller is to use irq_domain_check_msi_remap().
> It is important to keep the Linux design separate from what the
> architecture papers describe. In Linux the IOMMU is represented by
> the iommu_domain and the iommu_ops. On x86 neither of these objects
> play any role in interrupt delivery. Yes, the x86 architecture papers
> place some of the interrupt logic inside what they consider the iommu
> block, but that is just some historical stuff and shouldn't impact the
> SW design.
>
> If we had put the IRTE bits inside the irqchip layer instead of in the
> iommu driver, it would have made a lot more sense.
>
> The fact that ARM was allowed to be different (or rather the x86 mess
> wasn't cleaned up before the ARM mess was overlayed on top) is why
> this is so confusing. They are doing the same things, just in
> unnecessarily different ways.

fair enough. But that's a separate discussion that needs to happen with
iommu and irqchip maintainers I think. At the moment things are
implemented that way.
>
>>>> irq_domain_check_msi_remap() instead means the MSI controller
>>>> implements that functionality (a given device id is able to trigger
>>> Not quite, it means that MSI isolation is available, however it is not
>>> transparent and the iommu_domain user must do the little dance that
>>> follows.
>> No I do not agree on that point. The 'little dance' is needed because
>> the SMMU does not bypass MSI writes as done on Intel. And someone must
>> take care of the MSI transaction mapping. This is the role of the MSI
>> cookie stuff. To me this is independent of the above discussion of whether
>> MSI isolation is implemented.
> OK, so you are worried that someone who sets
> allow_unsafe_interrupts=1 will not get the iommu_get_msi_cookie()
> call done even though they still need it? That does seem wrong.
yes
>
>>> This was sort of sloppily copied from VFIO - we should just delete
>>> it. There is no driver that sets both, and once the platform asserts
>>> irq_domain_check_msi_remap() it is going down the non-transparent path
>>> anyhow and must set a cookie to work. [again the names doesn't make
>>> any sense for the functionality]
>>>
>>> Failing with EPERM is probably not so bad since the platform is using
>>> an invalid configuration. I'm kind of inclined to leave this for
>>> right
>> I don't understand why it is invalid? HW MSI RESV region is a valid
>> config and not sure you tested with that kind of setup, did you?
> Why would it be a valid config? No driver sets both..
>
> HW MSI RESV should set IOMMU_CAP_INTR_REMAP like Intel does.
>
> irq_domain_check_msi_remap() is only for SW MSI RESV regions.
In theory no. Nothing prevents having an MSI isolation capable
controller (such as the ITS) with an IOMMU HW which wouldn't translate
MSIs. In the past there has been this kind of problem, I think, with
HiSilicon HW:
https://lore.kernel.org/all/20171006140450.89652-1-shameerali.kolothum.thodi@huawei.com/
This was eventually addressed differently, but Shameer may give more details ...

Eric

>
> This is what the code implements, and yes it makes no sense.
>
> Jason
>
Jason Gunthorpe Nov. 29, 2022, 1:09 a.m. UTC | #6
On Mon, Nov 28, 2022 at 03:17:09PM +0100, Eric Auger wrote:

> > The fact that ARM was allowed to be different (or rather the x86 mess
> > wasn't cleaned up before the ARM mess was overlayed on top) is why
> > this is so confusing. They are doing the same things, just in
> > unnecessarily different ways.
> 
> fair enough. But that's a separate discussion that needs to happen with
> iommu and irqchip maintainers I think. At the moment things are
> implemented that way.

Here, this should fix it all up:

https://github.com/jgunthorpe/linux/commits/secure_msi

What do you think?

Jason

Patch

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index e13e971aa28c60..ca28a135b9675f 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,5 +1,6 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 iommufd-y := \
+	device.o \
 	hw_pagetable.o \
 	io_pagetable.o \
 	ioas.o \
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
new file mode 100644
index 00000000000000..a71f5740773f84
--- /dev/null
+++ b/drivers/iommu/iommufd/device.c
@@ -0,0 +1,417 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/iommufd.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/irqdomain.h>
+
+#include "iommufd_private.h"
+
+/*
+ * An iommufd_device object represents the binding relationship between a
+ * consuming driver and the iommufd. These objects are created/destroyed by
+ * external drivers, not by userspace.
+ */
+struct iommufd_device {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct iommufd_hw_pagetable *hwpt;
+	/* Head at iommufd_hw_pagetable::devices */
+	struct list_head devices_item;
+	/* always the physical device */
+	struct device *dev;
+	struct iommu_group *group;
+	bool enforce_cache_coherency;
+};
+
+void iommufd_device_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_device *idev =
+		container_of(obj, struct iommufd_device, obj);
+
+	iommu_device_release_dma_owner(idev->dev);
+	iommu_group_put(idev->group);
+	iommufd_ctx_put(idev->ictx);
+}
+
+/**
+ * iommufd_device_bind - Bind a physical device to an iommu fd
+ * @ictx: iommufd file descriptor
+ * @dev: Pointer to a physical PCI device struct
+ * @id: Output ID number to return to userspace for this device
+ *
+ * A successful bind establishes an ownership over the device and returns
+ * struct iommufd_device pointer, otherwise returns error pointer.
+ *
+ * A driver using this API must set driver_managed_dma and must not touch
+ * the device until this routine succeeds and establishes ownership.
+ *
+ * Binding a PCI device places the entire RID under iommufd control.
+ *
+ * The caller must undo this with iommufd_device_unbind()
+ */
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+					   struct device *dev, u32 *id)
+{
+	struct iommufd_device *idev;
+	struct iommu_group *group;
+	int rc;
+
+	/*
+	 * iommufd always sets IOMMU_CACHE because we offer no way for userspace
+	 * to restore cache coherency.
+	 */
+	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
+		return ERR_PTR(-EINVAL);
+
+	group = iommu_group_get(dev);
+	if (!group)
+		return ERR_PTR(-ENODEV);
+
+	rc = iommu_device_claim_dma_owner(dev, ictx);
+	if (rc)
+		goto out_group_put;
+
+	idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
+	if (IS_ERR(idev)) {
+		rc = PTR_ERR(idev);
+		goto out_release_owner;
+	}
+	idev->ictx = ictx;
+	iommufd_ctx_get(ictx);
+	idev->dev = dev;
+	idev->enforce_cache_coherency =
+		device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
+	/* The calling driver is a user until iommufd_device_unbind() */
+	refcount_inc(&idev->obj.users);
+	/* group refcount moves into iommufd_device */
+	idev->group = group;
+
+	/*
+	 * If the caller fails after this success it must call
+	 * iommufd_device_unbind() which is safe since we hold this refcount.
+	 * This also means the device is a leaf in the graph and no other object
+	 * can take a reference on it.
+	 */
+	iommufd_object_finalize(ictx, &idev->obj);
+	*id = idev->obj.id;
+	return idev;
+
+out_release_owner:
+	iommu_device_release_dma_owner(dev);
+out_group_put:
+	iommu_group_put(group);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD);
+
+/**
+ * iommufd_device_unbind - Undo iommufd_device_bind()
+ * @idev: Device returned by iommufd_device_bind()
+ *
+ * Release the device from iommufd control. The DMA ownership will return back
+ * to unowned with DMA controlled by the DMA API. This invalidates the
+ * iommufd_device pointer, other APIs that consume it must not be called
+ * concurrently.
+ */
+void iommufd_device_unbind(struct iommufd_device *idev)
+{
+	bool was_destroyed;
+
+	was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj);
+	WARN_ON(!was_destroyed);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD);
+
+static int iommufd_device_setup_msi(struct iommufd_device *idev,
+				    struct iommufd_hw_pagetable *hwpt,
+				    phys_addr_t sw_msi_start,
+				    unsigned int flags)
+{
+	int rc;
+
+	/*
+	 * IOMMU_CAP_INTR_REMAP means that the platform is isolating MSI, and it
+	 * creates the MSI window by default in the iommu domain. Nothing
+	 * further to do.
+	 */
+	if (device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP))
+		return 0;
+
+	/*
+	 * On ARM systems that set the global IRQ_DOMAIN_FLAG_MSI_REMAP every
+	 * allocated iommu_domain will block interrupts by default and this
+	 * special flow is needed to turn them back on. iommu_dma_prepare_msi()
+	 * will install pages into our domain after request_irq() to make this
+	 * work.
+	 *
+	 * FIXME: This is conceptually broken for iommufd since we want to allow
+	 * userspace to change the domains, eg switch from an identity IOAS to a
+	 * DMA IOAS. There is currently no way to create a MSI window that
+	 * matches what the IRQ layer actually expects in a newly created
+	 * domain.
+	 */
+	if (irq_domain_check_msi_remap()) {
+		if (WARN_ON(!sw_msi_start))
+			return -EPERM;
+		/*
+		 * iommu_get_msi_cookie() can only be called once per domain,
+		 * it returns -EBUSY on later calls.
+		 */
+		if (hwpt->msi_cookie)
+			return 0;
+		rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+		if (rc)
+			return rc;
+		hwpt->msi_cookie = true;
+		return 0;
+	}
+
+	/*
+	 * Otherwise the platform has a MSI window that is not isolated. For
+	 * historical compat with VFIO allow a module parameter to ignore the
+	 * insecurity.
+	 */
+	if (!(flags & IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT))
+		return -EPERM;
+
+	dev_warn(
+		idev->dev,
+		"Device interrupts cannot be isolated by the IOMMU, this platform in insecure. Use an \"allow_unsafe_interrupts\" module parameter to override\n");
+	return 0;
+}
+
+static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt,
+					   struct iommu_group *group)
+{
+	struct iommufd_device *cur_dev;
+
+	list_for_each_entry(cur_dev, &hwpt->devices, devices_item)
+		if (cur_dev->group == group)
+			return true;
+	return false;
+}
+
+static int iommufd_device_do_attach(struct iommufd_device *idev,
+				    struct iommufd_hw_pagetable *hwpt,
+				    unsigned int flags)
+{
+	phys_addr_t sw_msi_start = 0;
+	int rc;
+
+	mutex_lock(&hwpt->devices_lock);
+
+	/*
+	 * Try to upgrade the domain we have, it is an iommu driver bug to
+	 * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail
+	 * enforce_cache_coherency when there are no devices attached to the
+	 * domain.
+	 */
+	if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) {
+		if (hwpt->domain->ops->enforce_cache_coherency)
+			hwpt->enforce_cache_coherency =
+				hwpt->domain->ops->enforce_cache_coherency(
+					hwpt->domain);
+		if (!hwpt->enforce_cache_coherency) {
+			WARN_ON(list_empty(&hwpt->devices));
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
+	rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev,
+						   idev->group, &sw_msi_start);
+	if (rc)
+		goto out_unlock;
+
+	rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start, flags);
+	if (rc)
+		goto out_iova;
+
+	/*
+	 * FIXME: Hack around missing a device-centric iommu api, only attach to
+	 * the group once for the first device that is in the group.
+	 */
+	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
+		rc = iommu_attach_group(hwpt->domain, idev->group);
+		if (rc)
+			goto out_iova;
+
+		if (list_empty(&hwpt->devices)) {
+			rc = iopt_table_add_domain(&hwpt->ioas->iopt,
+						   hwpt->domain);
+			if (rc)
+				goto out_detach;
+		}
+	}
+
+	idev->hwpt = hwpt;
+	refcount_inc(&hwpt->obj.users);
+	list_add(&idev->devices_item, &hwpt->devices);
+	mutex_unlock(&hwpt->devices_lock);
+	return 0;
+
+out_detach:
+	iommu_detach_group(hwpt->domain, idev->group);
+out_iova:
+	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+out_unlock:
+	mutex_unlock(&hwpt->devices_lock);
+	return rc;
+}
+
+/*
+ * When automatically managing the domains we search for a compatible domain in
+ * the iopt and if one is found use it, otherwise create a new domain.
+ * Automatic domain selection will never pick a manually created domain.
+ */
+static int iommufd_device_auto_get_domain(struct iommufd_device *idev,
+					  struct iommufd_ioas *ioas,
+					  unsigned int flags)
+{
+	struct iommufd_hw_pagetable *hwpt;
+	int rc;
+
+	/*
+	 * There is no differentiation when domains are allocated, so any domain
+	 * that is willing to attach to the device is interchangeable with any
+	 * other.
+	 */
+	mutex_lock(&ioas->mutex);
+	list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
+		if (!hwpt->auto_domain)
+			continue;
+
+		rc = iommufd_device_do_attach(idev, hwpt, flags);
+
+		/*
+		 * -EINVAL means the domain is incompatible with the device.
+		 * Other error codes should propagate to userspace as failure.
+		 * Success means the domain is attached.
+		 */
+		if (rc == -EINVAL)
+			continue;
+		goto out_unlock;
+	}
+
+	hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev);
+	if (IS_ERR(hwpt)) {
+		rc = PTR_ERR(hwpt);
+		goto out_unlock;
+	}
+	hwpt->auto_domain = true;
+
+	rc = iommufd_device_do_attach(idev, hwpt, flags);
+	if (rc)
+		goto out_abort;
+	list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list);
+
+	mutex_unlock(&ioas->mutex);
+	iommufd_object_finalize(idev->ictx, &hwpt->obj);
+	return 0;
+
+out_abort:
+	iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
+out_unlock:
+	mutex_unlock(&ioas->mutex);
+	return rc;
+}
+
+/**
+ * iommufd_device_attach - Connect a device to an iommu_domain
+ * @idev: device to attach
+ * @pt_id: Input an IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
+ *         Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @flags: Optional flags
+ *
+ * This connects the device to an iommu_domain, either automatically or manually
+ * selected. Once this completes the device could do DMA.
+ *
+ * The caller should return the resulting pt_id back to userspace.
+ * This function is undone by calling iommufd_device_detach().
+ */
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
+			  unsigned int flags)
+{
+	struct iommufd_object *pt_obj;
+	int rc;
+
+	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
+	if (IS_ERR(pt_obj))
+		return PTR_ERR(pt_obj);
+
+	switch (pt_obj->type) {
+	case IOMMUFD_OBJ_HW_PAGETABLE: {
+		struct iommufd_hw_pagetable *hwpt =
+			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
+
+		rc = iommufd_device_do_attach(idev, hwpt, flags);
+		if (rc)
+			goto out_put_pt_obj;
+
+		mutex_lock(&hwpt->ioas->mutex);
+		list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
+		mutex_unlock(&hwpt->ioas->mutex);
+		break;
+	}
+	case IOMMUFD_OBJ_IOAS: {
+		struct iommufd_ioas *ioas =
+			container_of(pt_obj, struct iommufd_ioas, obj);
+
+		rc = iommufd_device_auto_get_domain(idev, ioas, flags);
+		if (rc)
+			goto out_put_pt_obj;
+		break;
+	}
+	default:
+		rc = -EINVAL;
+		goto out_put_pt_obj;
+	}
+
+	refcount_inc(&idev->obj.users);
+	*pt_id = idev->hwpt->obj.id;
+	rc = 0;
+
+out_put_pt_obj:
+	iommufd_put_object(pt_obj);
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
+
+/**
+ * iommufd_device_detach - Disconnect a device to an iommu_domain
+ * @idev: device to detach
+ *
+ * Undo iommufd_device_attach(). This disconnects the idev from the previously
+ * attached pt_id. The device returns back to a blocked DMA translation.
+ */
+void iommufd_device_detach(struct iommufd_device *idev)
+{
+	struct iommufd_hw_pagetable *hwpt = idev->hwpt;
+
+	mutex_lock(&hwpt->ioas->mutex);
+	mutex_lock(&hwpt->devices_lock);
+	list_del(&idev->devices_item);
+	if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) {
+		if (list_empty(&hwpt->devices)) {
+			iopt_table_remove_domain(&hwpt->ioas->iopt,
+						 hwpt->domain);
+			list_del(&hwpt->hwpt_item);
+		}
+		iommu_detach_group(hwpt->domain, idev->group);
+	}
+	iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+	mutex_unlock(&hwpt->devices_lock);
+	mutex_unlock(&hwpt->ioas->mutex);
+
+	if (hwpt->auto_domain)
+		iommufd_object_destroy_user(idev->ictx, &hwpt->obj);
+	else
+		refcount_dec(&hwpt->obj.users);
+
+	idev->hwpt = NULL;
+
+	refcount_dec(&idev->obj.users);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD);
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index bb5cbd8f4e5991..73345886d969e5 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -103,6 +103,7 @@  static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
 enum iommufd_object_type {
 	IOMMUFD_OBJ_NONE,
 	IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+	IOMMUFD_OBJ_DEVICE,
 	IOMMUFD_OBJ_HW_PAGETABLE,
 	IOMMUFD_OBJ_IOAS,
 };
@@ -229,6 +230,8 @@  struct iommufd_hw_pagetable {
 	struct iommufd_ioas *ioas;
 	struct iommu_domain *domain;
 	bool auto_domain : 1;
+	bool enforce_cache_coherency : 1;
+	bool msi_cookie : 1;
 	/* Head at iommufd_ioas::hwpt_list */
 	struct list_head hwpt_item;
 	struct mutex devices_lock;
@@ -240,6 +243,8 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			   struct device *dev);
 void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
 
+void iommufd_device_destroy(struct iommufd_object *obj);
+
 struct iommufd_access {
 	unsigned long iova_alignment;
 	u32 iopt_access_list_id;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 3eab714b8e12a3..8a114ddbdfcde2 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -352,6 +352,9 @@  void iommufd_ctx_put(struct iommufd_ctx *ictx)
 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
 
 static const struct iommufd_object_ops iommufd_object_ops[] = {
+	[IOMMUFD_OBJ_DEVICE] = {
+		.destroy = iommufd_device_destroy,
+	},
 	[IOMMUFD_OBJ_IOAS] = {
 		.destroy = iommufd_ioas_destroy,
 	},
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 26e09d539737bb..31efacd8a46cce 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -9,10 +9,23 @@ 
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/device.h>
 
+struct iommufd_device;
 struct iommufd_ctx;
 struct file;
 
+struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
+					   struct device *dev, u32 *id);
+void iommufd_device_unbind(struct iommufd_device *idev);
+
+enum {
+	IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT = 1 << 0,
+};
+int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id,
+			  unsigned int flags);
+void iommufd_device_detach(struct iommufd_device *idev);
+
 enum {
 	IOMMUFD_ACCESS_RW_READ = 0,
 	IOMMUFD_ACCESS_RW_WRITE = 1 << 0,
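
For illustration, a consuming driver would typically gate the new attach
flag on its own module parameter, along the lines of VFIO's
allow_unsafe_interrupts (hypothetical caller, not part of this patch):

	unsigned int flags = 0;

	if (allow_unsafe_interrupts)
		flags |= IOMMUFD_ATTACH_FLAGS_ALLOW_UNSAFE_INTERRUPT;

	rc = iommufd_device_attach(idev, &pt_id, flags);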