diff mbox series

[v2,3/6] iommufd: Initializing and releasing IO page fault data

Message ID 20231026024930.382898-4-baolu.lu@linux.intel.com (mailing list archive)
State New
Headers show
Series IOMMUFD: Deliver IO page faults to user space | expand

Commit Message

Baolu Lu Oct. 26, 2023, 2:49 a.m. UTC
Add some housekeeping code for IO page fault delivery. Add a fault field
in the iommufd_hw_pagetable structure to store pending IO page faults and
other related data.

The fault field is allocated and initialized when an IOPF-capable user
HWPT is allocated. It is indicated by the IOMMU_HWPT_ALLOC_IOPF_CAPABLE
flag being set in the allocation user data. The fault field exists until
the HWPT is destroyed. This also means that you can determine whether a
HWPT is IOPF-capable by checking the fault field.

When an IOPF-capable HWPT is attached to a device (could also be a PASID of
a device in the future), the iommufd device pointer is saved for the pasid
of the device. The pointer is recalled and all pending iopf groups are
discarded after the HWPT is detached from the device.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/iommu.h                   |  6 +++
 drivers/iommu/iommufd/iommufd_private.h | 10 ++++
 drivers/iommu/iommufd/device.c          | 69 +++++++++++++++++++++++--
 drivers/iommu/iommufd/hw_pagetable.c    | 56 +++++++++++++++++++-
 4 files changed, 137 insertions(+), 4 deletions(-)

Comments

Joel Granados Dec. 12, 2023, 1:10 p.m. UTC | #1
On Thu, Oct 26, 2023 at 10:49:27AM +0800, Lu Baolu wrote:
> Add some housekeeping code for IO page fault dilivery. Add a fault field
> in the iommufd_hw_pagetable structure to store pending IO page faults and
> other related data.
> 
> The fault field is allocated and initialized when an IOPF-capable user
> HWPT is allocated. It is indicated by the IOMMU_HWPT_ALLOC_IOPF_CAPABLE
> flag being set in the allocation user data. The fault field exists until
> the HWPT is destroyed. This also means that you can determine whether a
> HWPT is IOPF-capable by checking the fault field.
> 
> When an IOPF-capable HWPT is attached to a device (could also be a PASID of
> a device in the future), the iommufd device pointer is saved for the pasid
> of the device. The pointer is recalled and all pending iopf groups are
> discarded after the HWPT is detached from the device.
> 
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>  include/linux/iommu.h                   |  6 +++
>  drivers/iommu/iommufd/iommufd_private.h | 10 ++++
>  drivers/iommu/iommufd/device.c          | 69 +++++++++++++++++++++++--
>  drivers/iommu/iommufd/hw_pagetable.c    | 56 +++++++++++++++++++-
>  4 files changed, 137 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 615d8a5f9dee..600ca3842c8a 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -130,6 +130,12 @@ struct iopf_group {
>  	struct work_struct work;
>  	struct device *dev;
>  	struct iommu_domain *domain;
> +
> +	/*
> +	 * Used by iopf handlers, like iommufd, to hook the iopf group
> +	 * on its own lists.
> +	 */
> +	struct list_head node;
>  };
>  
>  /**
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 1bd412cff2d6..0dbaa2dc5b22 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -230,6 +230,15 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
>  
>  int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
>  
> +struct hw_pgtable_fault {
> +	struct iommufd_ctx *ictx;
> +	struct iommufd_hw_pagetable *hwpt;
> +	/* Protect below iopf lists. */
> +	struct mutex mutex;
> +	struct list_head deliver;
> +	struct list_head response;
> +};
> +
>  /*
>   * A HW pagetable is called an iommu_domain inside the kernel. This user object
>   * allows directly creating and inspecting the domains. Domains that have kernel
> @@ -239,6 +248,7 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
>  struct iommufd_hw_pagetable {
>  	struct iommufd_object obj;
>  	struct iommu_domain *domain;
> +	struct hw_pgtable_fault *fault;
>  
>  	void (*abort)(struct iommufd_object *obj);
>  	void (*destroy)(struct iommufd_object *obj);
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index 645ab5d290fe..0a8e03d5e7c5 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -456,6 +456,16 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
>  	if (rc)
>  		goto err_unlock;
>  
> +	if (hwpt->fault) {
> +		void *curr;
> +
> +		curr = iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
I'm hitting an error here when I try to attach to a hwpt that I created
previously with the `IOMMU_HWPT_ALLOC_IOPF_CAPABLE` flag.

I get an -ENODEV from iopf_pasid_cookie_set which is triggered by
dev->iommu->fault_param being 0x0.

I looked around and I see that the fault param gets set in
iopf_queue_add_device which is called from iommu_dev_enable_feature
only. Furthermore iommu_dev_enable_feature is only called in idxd and
uacce drivers.

Questions:
1. Should iopf_queue_add_device get called from the
   IOMMU_HWPT_ALLOC_IOPF_CAPABLE ioctl call? This makes sense to me as
   this is where the device and the IOPF are related from user space.
2. This is not intended to work only with idxd and uacce. right?

Best
> +		if (IS_ERR(curr)) {
> +			rc = PTR_ERR(curr);
> +			goto err_unresv;
> +		}
> +	}
> +
>  	/*
>  	 * Only attach to the group once for the first device that is in the
>  	 * group. All the other devices will follow this attachment. The user
> @@ -466,17 +476,20 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
>  	if (list_empty(&idev->igroup->device_list)) {
>  		rc = iommufd_group_setup_msi(idev->igroup, hwpt);
>  		if (rc)
> -			goto err_unresv;
> +			goto err_unset;
>  
>  		rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
>  		if (rc)
> -			goto err_unresv;
> +			goto err_unset;
>  		idev->igroup->hwpt = hwpt;
>  	}
>  	refcount_inc(&hwpt->obj.users);
>  	list_add_tail(&idev->group_item, &idev->igroup->device_list);
>  	mutex_unlock(&idev->igroup->lock);
>  	return 0;
> +err_unset:
> +	if (hwpt->fault)
> +		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
>  err_unresv:
>  	iommufd_device_remove_rr(idev, hwpt);
>  err_unlock:
> @@ -484,6 +497,30 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
>  	return rc;
>  }
>  
> +/*
> + * Discard all pending page faults. Called when a hw pagetable is detached
> + * from a device. The iommu core guarantees that all page faults have been
> + * responded, hence there's no need to respond it again.
> + */
> +static void iommufd_hw_pagetable_discard_iopf(struct iommufd_hw_pagetable *hwpt)
> +{
> +	struct iopf_group *group, *next;
> +
> +	if (!hwpt->fault)
> +		return;
> +
> +	mutex_lock(&hwpt->fault->mutex);
> +	list_for_each_entry_safe(group, next, &hwpt->fault->deliver, node) {
> +		list_del(&group->node);
> +		iopf_free_group(group);
> +	}
> +	list_for_each_entry_safe(group, next, &hwpt->fault->response, node) {
> +		list_del(&group->node);
> +		iopf_free_group(group);
> +	}
> +	mutex_unlock(&hwpt->fault->mutex);
> +}
> +
>  struct iommufd_hw_pagetable *
>  iommufd_hw_pagetable_detach(struct iommufd_device *idev)
>  {
> @@ -491,6 +528,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)
>  
>  	mutex_lock(&idev->igroup->lock);
>  	list_del(&idev->group_item);
> +	if (hwpt->fault)
> +		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
>  	if (list_empty(&idev->igroup->device_list)) {
>  		iommu_detach_group(hwpt->domain, idev->igroup->group);
>  		idev->igroup->hwpt = NULL;
> @@ -498,6 +537,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)
>  	iommufd_device_remove_rr(idev, hwpt);
>  	mutex_unlock(&idev->igroup->lock);
>  
> +	iommufd_hw_pagetable_discard_iopf(hwpt);
> +
>  	/* Caller must destroy hwpt */
>  	return hwpt;
>  }
> @@ -563,9 +604,24 @@ iommufd_device_do_replace(struct iommufd_device *idev,
>  	if (rc)
>  		goto err_unresv;
>  
> +	if (old_hwpt->fault) {
> +		iommufd_hw_pagetable_discard_iopf(old_hwpt);
> +		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
> +	}
> +
> +	if (hwpt->fault) {
> +		void *curr;
> +
> +		curr = iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
> +		if (IS_ERR(curr)) {
> +			rc = PTR_ERR(curr);
> +			goto err_unresv;
> +		}
> +	}
> +
>  	rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
>  	if (rc)
> -		goto err_unresv;
> +		goto err_unset;
>  
>  	if (iommufd_hw_pagetable_compare_ioas(old_hwpt, hwpt)) {
>  		list_for_each_entry(cur, &igroup->device_list, group_item)
> @@ -583,8 +639,15 @@ iommufd_device_do_replace(struct iommufd_device *idev,
>  					      &old_hwpt->obj.users));
>  	mutex_unlock(&idev->igroup->lock);
>  
> +	iommufd_hw_pagetable_discard_iopf(old_hwpt);
> +
>  	/* Caller must destroy old_hwpt */
>  	return old_hwpt;
> +err_unset:
> +	if (hwpt->fault)
> +		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
> +	if (old_hwpt->fault)
> +		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
>  err_unresv:
>  	if (iommufd_hw_pagetable_compare_ioas(old_hwpt, hwpt)) {
>  		list_for_each_entry(cur, &igroup->device_list, group_item)
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index 72c46de1396b..9f94c824cf86 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -38,9 +38,38 @@ static void iommufd_kernel_managed_hwpt_destroy(struct iommufd_object *obj)
>  	refcount_dec(&hwpt->ioas->obj.users);
>  }
>  
> +static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
> +{
> +	struct hw_pgtable_fault *fault;
> +
> +	fault = kzalloc(sizeof(*fault), GFP_KERNEL);
> +	if (!fault)
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD(&fault->deliver);
> +	INIT_LIST_HEAD(&fault->response);
> +	mutex_init(&fault->mutex);
> +
> +	return fault;
> +}
> +
> +static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
> +{
> +	WARN_ON(!list_empty(&fault->deliver));
> +	WARN_ON(!list_empty(&fault->response));
> +
> +	kfree(fault);
> +}
> +
>  void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
>  {
> -	container_of(obj, struct iommufd_hw_pagetable, obj)->destroy(obj);
> +	struct iommufd_hw_pagetable *hwpt =
> +		container_of(obj, struct iommufd_hw_pagetable, obj);
> +
> +	if (hwpt->fault)
> +		hw_pagetable_fault_free(hwpt->fault);
> +
> +	hwpt->destroy(obj);
>  }
>  
>  static void iommufd_user_managed_hwpt_abort(struct iommufd_object *obj)
> @@ -289,6 +318,17 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx,
>  	return ERR_PTR(rc);
>  }
>  
> +static int iommufd_hw_pagetable_iopf_handler(struct iopf_group *group)
> +{
> +	struct iommufd_hw_pagetable *hwpt = group->domain->fault_data;
> +
> +	mutex_lock(&hwpt->fault->mutex);
> +	list_add_tail(&group->node, &hwpt->fault->deliver);
> +	mutex_unlock(&hwpt->fault->mutex);
> +
> +	return 0;
> +}
> +
>  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>  {
>  	struct iommufd_hw_pagetable *(*alloc_fn)(
> @@ -364,6 +404,20 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>  		goto out_unlock;
>  	}
>  
> +	if (cmd->flags & IOMMU_HWPT_ALLOC_IOPF_CAPABLE) {
> +		hwpt->fault = hw_pagetable_fault_alloc();
> +		if (IS_ERR(hwpt->fault)) {
> +			rc = PTR_ERR(hwpt->fault);
> +			hwpt->fault = NULL;
> +			goto out_hwpt;
> +		}
> +
> +		hwpt->fault->ictx = ucmd->ictx;
> +		hwpt->fault->hwpt = hwpt;
> +		hwpt->domain->iopf_handler = iommufd_hw_pagetable_iopf_handler;
> +		hwpt->domain->fault_data = hwpt;
> +	}
> +
>  	cmd->out_hwpt_id = hwpt->obj.id;
>  	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
>  	if (rc)
> -- 
> 2.34.1
>
Jason Gunthorpe Dec. 12, 2023, 2:12 p.m. UTC | #2
On Tue, Dec 12, 2023 at 02:10:08PM +0100, Joel Granados wrote:

> > diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> > index 645ab5d290fe..0a8e03d5e7c5 100644
> > --- a/drivers/iommu/iommufd/device.c
> > +++ b/drivers/iommu/iommufd/device.c
> > @@ -456,6 +456,16 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
> >  	if (rc)
> >  		goto err_unlock;
> >  
> > +	if (hwpt->fault) {
> > +		void *curr;
> > +
> > +		curr = iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
> I'm hitting an error here when I try to attach to a hwpt that I created
> previously with the `IOMMU_HWPT_ALLOC_IOPF_CAPABLE` flag.
> 
> I get an -ENODEV from iopf_pasid_cookie_set which is triggered by
> dev->iommu->fault_param being 0x0.
> 
> I looked around and I see that the fault param gets set in
> iopf_queue_add_device which is called from iommu_dev_enable_feature
> only. Furthermore iommu_dev_enable_feature is only called in idxd and
> uacce drivers.
> 
> Questions:
> 1. Should iopf_queue_add_device get called from the
>    IOMMU_HWPT_ALLOC_IOPF_CAPABLE ioctl call? This make sense to me as
>    this is where the device and the IOPF are related from user space.

It probably needs to call the set feature thing in the short term.

In the medium term I would like the drivers to manage the iopf based
on domain attachment not explicit feature asks

> 2. This is not intended to work only with idxd and uacce. right?

It should work everywhere, I suspect Intel Team didn't hit this
because they are testing IDXD SIOV? Can you guys also test it as a PF
assignment?

Jason
Baolu Lu Dec. 13, 2023, 2:04 a.m. UTC | #3
On 12/12/23 10:12 PM, Jason Gunthorpe wrote:
> On Tue, Dec 12, 2023 at 02:10:08PM +0100, Joel Granados wrote:
> 
>>> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
>>> index 645ab5d290fe..0a8e03d5e7c5 100644
>>> --- a/drivers/iommu/iommufd/device.c
>>> +++ b/drivers/iommu/iommufd/device.c
>>> @@ -456,6 +456,16 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
>>>   	if (rc)
>>>   		goto err_unlock;
>>>   
>>> +	if (hwpt->fault) {
>>> +		void *curr;
>>> +
>>> +		curr = iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
>> I'm hitting an error here when I try to attach to a hwpt that I created
>> previously with the `IOMMU_HWPT_ALLOC_IOPF_CAPABLE` flag.
>>
>> I get an -ENODEV from iopf_pasid_cookie_set which is triggered by
>> dev->iommu->fault_param being 0x0.
>>
>> I looked around and I see that the fault param gets set in
>> iopf_queue_add_device which is called from iommu_dev_enable_feature
>> only. Furthermore iommu_dev_enable_feature is only called in idxd and
>> uacce drivers.
>>
>> Questions:
>> 1. Should iopf_queue_add_device get called from the
>>     IOMMU_HWPT_ALLOC_IOPF_CAPABLE ioctl call? This make sense to me as
>>     this is where the device and the IOPF are related from user space.
> It probably needs to call the set feature thing in the short term.
> 
> In the medium term I would like the drivers to manage the iopf based
> on domain attachment not explicit feature asks

Yes, it's the same as my plan.

> 
>> 2. This is not intended to work only with idxd and uacce. right?
> It should work everywhere, I suspect Intel Team didn't hit this
> because they are testing IDXD SIOV?

Yes.

> Can you guys also test it as a PF
> assignment?

For PF assignment, probably the driver (vfio-pci) needs to enable iopf.

Best regards,
baolu
Tian, Kevin Dec. 13, 2023, 2:15 a.m. UTC | #4
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Wednesday, December 13, 2023 10:05 AM
> >
> >> 2. This is not intended to work only with idxd and uacce. right?
> > It should work everywhere, I suspect Intel Team didn't hit this
> > because they are testing IDXD SIOV?
> 
> Yes.
> 
> > Can you guys also test it as a PF
> > assignment?
> 
> For PF assignment, probably the driver (vfio-pci) needs to enable iopf.
> 

We haven't merged anything for SIOV yet.

so the base of this series should be PCI functions (PF or VF) and vfio-pci
has to be extended with whatever required to support iopf.
Jason Gunthorpe Dec. 13, 2023, 1:19 p.m. UTC | #5
On Wed, Dec 13, 2023 at 02:15:28AM +0000, Tian, Kevin wrote:
> > From: Baolu Lu <baolu.lu@linux.intel.com>
> > Sent: Wednesday, December 13, 2023 10:05 AM
> > >
> > >> 2. This is not intended to work only with idxd and uacce. right?
> > > It should work everywhere, I suspect Intel Team didn't hit this
> > > because they are testing IDXD SIOV?
> > 
> > Yes.
> > 
> > > Can you guys also test it as a PF
> > > assignment?
> > 
> > For PF assignment, probably the driver (vfio-pci) needs to enable iopf.
> > 
> 
> We haven't merged anything for SIOV yet.
> 
> so the base of this series should be PCI functions (PF or VF) and vfio-pci
> has to be extended with whatever required to support iopf.

Right. I suggest you target full idxd device assignment to a guest
with working PRI/etc as a validation.

Jason
diff mbox series

Patch

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 615d8a5f9dee..600ca3842c8a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -130,6 +130,12 @@  struct iopf_group {
 	struct work_struct work;
 	struct device *dev;
 	struct iommu_domain *domain;
+
+	/*
+	 * Used by iopf handlers, like iommufd, to hook the iopf group
+	 * on its own lists.
+	 */
+	struct list_head node;
 };
 
 /**
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 1bd412cff2d6..0dbaa2dc5b22 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -230,6 +230,15 @@  int iommufd_option_rlimit_mode(struct iommu_option *cmd,
 
 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
 
+struct hw_pgtable_fault {
+	struct iommufd_ctx *ictx;
+	struct iommufd_hw_pagetable *hwpt;
+	/* Protect below iopf lists. */
+	struct mutex mutex;
+	struct list_head deliver;
+	struct list_head response;
+};
+
 /*
  * A HW pagetable is called an iommu_domain inside the kernel. This user object
  * allows directly creating and inspecting the domains. Domains that have kernel
@@ -239,6 +248,7 @@  int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
 struct iommufd_hw_pagetable {
 	struct iommufd_object obj;
 	struct iommu_domain *domain;
+	struct hw_pgtable_fault *fault;
 
 	void (*abort)(struct iommufd_object *obj);
 	void (*destroy)(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 645ab5d290fe..0a8e03d5e7c5 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -456,6 +456,16 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 	if (rc)
 		goto err_unlock;
 
+	if (hwpt->fault) {
+		void *curr;
+
+		curr = iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
+		if (IS_ERR(curr)) {
+			rc = PTR_ERR(curr);
+			goto err_unresv;
+		}
+	}
+
 	/*
 	 * Only attach to the group once for the first device that is in the
 	 * group. All the other devices will follow this attachment. The user
@@ -466,17 +476,20 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 	if (list_empty(&idev->igroup->device_list)) {
 		rc = iommufd_group_setup_msi(idev->igroup, hwpt);
 		if (rc)
-			goto err_unresv;
+			goto err_unset;
 
 		rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
 		if (rc)
-			goto err_unresv;
+			goto err_unset;
 		idev->igroup->hwpt = hwpt;
 	}
 	refcount_inc(&hwpt->obj.users);
 	list_add_tail(&idev->group_item, &idev->igroup->device_list);
 	mutex_unlock(&idev->igroup->lock);
 	return 0;
+err_unset:
+	if (hwpt->fault)
+		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
 err_unresv:
 	iommufd_device_remove_rr(idev, hwpt);
 err_unlock:
@@ -484,6 +497,30 @@  int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
 	return rc;
 }
 
+/*
+ * Discard all pending page faults. Called when a hw pagetable is detached
+ * from a device. The iommu core guarantees that all page faults have been
+ * responded, hence there's no need to respond it again.
+ */
+static void iommufd_hw_pagetable_discard_iopf(struct iommufd_hw_pagetable *hwpt)
+{
+	struct iopf_group *group, *next;
+
+	if (!hwpt->fault)
+		return;
+
+	mutex_lock(&hwpt->fault->mutex);
+	list_for_each_entry_safe(group, next, &hwpt->fault->deliver, node) {
+		list_del(&group->node);
+		iopf_free_group(group);
+	}
+	list_for_each_entry_safe(group, next, &hwpt->fault->response, node) {
+		list_del(&group->node);
+		iopf_free_group(group);
+	}
+	mutex_unlock(&hwpt->fault->mutex);
+}
+
 struct iommufd_hw_pagetable *
 iommufd_hw_pagetable_detach(struct iommufd_device *idev)
 {
@@ -491,6 +528,8 @@  iommufd_hw_pagetable_detach(struct iommufd_device *idev)
 
 	mutex_lock(&idev->igroup->lock);
 	list_del(&idev->group_item);
+	if (hwpt->fault)
+		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
 	if (list_empty(&idev->igroup->device_list)) {
 		iommu_detach_group(hwpt->domain, idev->igroup->group);
 		idev->igroup->hwpt = NULL;
@@ -498,6 +537,8 @@  iommufd_hw_pagetable_detach(struct iommufd_device *idev)
 	iommufd_device_remove_rr(idev, hwpt);
 	mutex_unlock(&idev->igroup->lock);
 
+	iommufd_hw_pagetable_discard_iopf(hwpt);
+
 	/* Caller must destroy hwpt */
 	return hwpt;
 }
@@ -563,9 +604,24 @@  iommufd_device_do_replace(struct iommufd_device *idev,
 	if (rc)
 		goto err_unresv;
 
+	if (old_hwpt->fault) {
+		iommufd_hw_pagetable_discard_iopf(old_hwpt);
+		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
+	}
+
+	if (hwpt->fault) {
+		void *curr;
+
+		curr = iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
+		if (IS_ERR(curr)) {
+			rc = PTR_ERR(curr);
+			goto err_unresv;
+		}
+	}
+
 	rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
 	if (rc)
-		goto err_unresv;
+		goto err_unset;
 
 	if (iommufd_hw_pagetable_compare_ioas(old_hwpt, hwpt)) {
 		list_for_each_entry(cur, &igroup->device_list, group_item)
@@ -583,8 +639,15 @@  iommufd_device_do_replace(struct iommufd_device *idev,
 					      &old_hwpt->obj.users));
 	mutex_unlock(&idev->igroup->lock);
 
+	iommufd_hw_pagetable_discard_iopf(old_hwpt);
+
 	/* Caller must destroy old_hwpt */
 	return old_hwpt;
+err_unset:
+	if (hwpt->fault)
+		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, NULL);
+	if (old_hwpt->fault)
+		iopf_pasid_cookie_set(idev->dev, IOMMU_NO_PASID, idev);
 err_unresv:
 	if (iommufd_hw_pagetable_compare_ioas(old_hwpt, hwpt)) {
 		list_for_each_entry(cur, &igroup->device_list, group_item)
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 72c46de1396b..9f94c824cf86 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -38,9 +38,38 @@  static void iommufd_kernel_managed_hwpt_destroy(struct iommufd_object *obj)
 	refcount_dec(&hwpt->ioas->obj.users);
 }
 
+static struct hw_pgtable_fault *hw_pagetable_fault_alloc(void)
+{
+	struct hw_pgtable_fault *fault;
+
+	fault = kzalloc(sizeof(*fault), GFP_KERNEL);
+	if (!fault)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&fault->deliver);
+	INIT_LIST_HEAD(&fault->response);
+	mutex_init(&fault->mutex);
+
+	return fault;
+}
+
+static void hw_pagetable_fault_free(struct hw_pgtable_fault *fault)
+{
+	WARN_ON(!list_empty(&fault->deliver));
+	WARN_ON(!list_empty(&fault->response));
+
+	kfree(fault);
+}
+
 void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
 {
-	container_of(obj, struct iommufd_hw_pagetable, obj)->destroy(obj);
+	struct iommufd_hw_pagetable *hwpt =
+		container_of(obj, struct iommufd_hw_pagetable, obj);
+
+	if (hwpt->fault)
+		hw_pagetable_fault_free(hwpt->fault);
+
+	hwpt->destroy(obj);
 }
 
 static void iommufd_user_managed_hwpt_abort(struct iommufd_object *obj)
@@ -289,6 +318,17 @@  iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx,
 	return ERR_PTR(rc);
 }
 
+static int iommufd_hw_pagetable_iopf_handler(struct iopf_group *group)
+{
+	struct iommufd_hw_pagetable *hwpt = group->domain->fault_data;
+
+	mutex_lock(&hwpt->fault->mutex);
+	list_add_tail(&group->node, &hwpt->fault->deliver);
+	mutex_unlock(&hwpt->fault->mutex);
+
+	return 0;
+}
+
 int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 {
 	struct iommufd_hw_pagetable *(*alloc_fn)(
@@ -364,6 +404,20 @@  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 		goto out_unlock;
 	}
 
+	if (cmd->flags & IOMMU_HWPT_ALLOC_IOPF_CAPABLE) {
+		hwpt->fault = hw_pagetable_fault_alloc();
+		if (IS_ERR(hwpt->fault)) {
+			rc = PTR_ERR(hwpt->fault);
+			hwpt->fault = NULL;
+			goto out_hwpt;
+		}
+
+		hwpt->fault->ictx = ucmd->ictx;
+		hwpt->fault->hwpt = hwpt;
+		hwpt->domain->iopf_handler = iommufd_hw_pagetable_iopf_handler;
+		hwpt->domain->fault_data = hwpt;
+	}
+
 	cmd->out_hwpt_id = hwpt->obj.id;
 	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
 	if (rc)