
[v4,09/10] iommu: Make iommu_queue_iopf() more generic

Message ID 20230825023026.132919-10-baolu.lu@linux.intel.com (mailing list archive)
State New, archived
Series iommu: Prepare to deliver page faults to user space

Commit Message

Baolu Lu Aug. 25, 2023, 2:30 a.m. UTC
Make iommu_queue_iopf() more generic by making the iopf_group a minimal
set of iopf's that a domain's iopf handler should handle and respond to.
Add a domain pointer to struct iopf_group so that the handler can
retrieve and use it directly.

Change iommu_queue_iopf() to forward groups of iopf's to the domain's
iopf handler. This is also a necessary step to decouple the sva iopf
handling code from this interface.

The iopf handling framework in the core requires that the domain's
lifetime covers all possible iopf's. This has been documented in the
comments for iommu_queue_iopf(), which is the entry point of the
framework. Add some debugging code to enforce this requirement.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/iommu.h      |  4 ++--
 drivers/iommu/iommu-sva.h  |  6 ++---
 drivers/iommu/io-pgfault.c | 49 ++++++++++++++++++++++++++++----------
 drivers/iommu/iommu-sva.c  |  3 +--
 drivers/iommu/iommu.c      | 23 ++++++++++++++++++
 5 files changed, 65 insertions(+), 20 deletions(-)
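
A minimal sketch of the dispatch flow described above, condensed from the
hunks quoted in the review below. The function name is illustrative and the
assembly of the iopf_group from the partial list is elided, so this is not
the exact patch code.

/* Sketch only: resolve the target domain and forward the whole group. */
static int iopf_dispatch_sketch(struct iopf_group *group,
				struct iommu_fault *fault, struct device *dev)
{
	struct iommu_domain *domain;

	if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)
		domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0);
	else
		domain = iommu_get_domain_for_dev(dev);

	if (!domain || !domain->iopf_handler) {
		dev_warn_ratelimited(dev,
			"iopf from pasid %d received without handler installed\n",
			fault->prm.pasid);
		return -ENODEV;
	}

	group->domain = domain;			/* new field added by this patch */
	return domain->iopf_handler(group);	/* SVA's handler queues a work item */
}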

Comments

Tian, Kevin Aug. 25, 2023, 8:17 a.m. UTC | #1
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Friday, August 25, 2023 10:30 AM
> 
> +	if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)
> +		domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0);
> +	else
> +		domain = iommu_get_domain_for_dev(dev);
> +
> +	if (!domain || !domain->iopf_handler) {
> +		dev_warn_ratelimited(dev,
> +			"iopf from pasid %d received without handler installed\n",

"without domain attached or handler installed"

> 
> +int iommu_sva_handle_iopf(struct iopf_group *group)
> +{
> +	struct iommu_fault_param *fault_param = group->dev->iommu->fault_param;
> +
> +	INIT_WORK(&group->work, iopf_handler);
> +	if (!queue_work(fault_param->queue->wq, &group->work))
> +		return -EBUSY;
> +
> +	return 0;
> +}

this function is generic so the name should not be tied to 'sva'.

> +
>  /**
>   * iopf_queue_flush_dev - Ensure that all queued faults have been
> processed
>   * @dev: the endpoint whose faults need to be flushed.

Presumably we also need a flush callback per domain: given that the use
of the workqueue is now optional, flush_workqueue() might not be
sufficient.

> 
> +static void assert_no_pending_iopf(struct device *dev, ioasid_t pasid)
> +{
> +	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;
> +	struct iopf_fault *iopf;
> +
> +	if (!iopf_param)
> +		return;
> +
> +	mutex_lock(&iopf_param->lock);
> +	list_for_each_entry(iopf, &iopf_param->partial, list) {
> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
> +			break;
> +	}

partial list is protected by dev_iommu lock.

> +
> +	list_for_each_entry(iopf, &iopf_param->faults, list) {
> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
> +			break;
> +	}
> +	mutex_unlock(&iopf_param->lock);
> +}
> +
>  void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
>  {
>  	struct iommu_group *group;
> @@ -1959,6 +1980,7 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
>  	if (!group)
>  		return;
> 
> +	assert_no_pending_iopf(dev, IOMMU_NO_PASID);
>  	mutex_lock(&group->mutex);
>  	if (WARN_ON(domain != group->domain) ||
>  	    WARN_ON(list_count_nodes(&group->devices) != 1))
> @@ -3269,6 +3291,7 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
>  {
>  	struct iommu_group *group = iommu_group_get(dev);
> 
> +	assert_no_pending_iopf(dev, pasid);

this doesn't look correct. A sane driver will stop triggering new
page requests before calling detach, but there are still pending ones
that are not drained until iopf_queue_flush_dev() is called by
ops->remove_dev_pasid().

then this check will cause a false warning.

>  	mutex_lock(&group->mutex);
>  	__iommu_remove_group_pasid(group, pasid);
>  	WARN_ON(xa_erase(&group->pasid_array, pasid) != domain);
> --
> 2.34.1
Baolu Lu Aug. 26, 2023, 7:32 a.m. UTC | #2
On 8/25/23 4:17 PM, Tian, Kevin wrote:
>> From: Lu Baolu <baolu.lu@linux.intel.com>
>> Sent: Friday, August 25, 2023 10:30 AM
>>
>> +	if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)
>> +		domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0);
>> +	else
>> +		domain = iommu_get_domain_for_dev(dev);
>> +
>> +	if (!domain || !domain->iopf_handler) {
>> +		dev_warn_ratelimited(dev,
>> +			"iopf from pasid %d received without handler installed\n",
> 
> "without domain attached or handler installed"

Okay.

> 
>>
>> +int iommu_sva_handle_iopf(struct iopf_group *group)
>> +{
>> +	struct iommu_fault_param *fault_param = group->dev->iommu->fault_param;
>> +
>> +	INIT_WORK(&group->work, iopf_handler);
>> +	if (!queue_work(fault_param->queue->wq, &group->work))
>> +		return -EBUSY;
>> +
>> +	return 0;
>> +}
> 
> this function is generic so the name should not tie to 'sva'.

Currently only sva uses it. It's fine to make it generic later when a
new use comes along. Does that work for you?

Best regards,
baolu
Baolu Lu Aug. 26, 2023, 8:01 a.m. UTC | #3
On 8/25/23 4:17 PM, Tian, Kevin wrote:
>> +
>>   /**
>>    * iopf_queue_flush_dev - Ensure that all queued faults have been
>> processed
>>    * @dev: the endpoint whose faults need to be flushed.
> Presumably we also need a flush callback per domain given now
> the use of workqueue is optional then flush_workqueue() might
> not be sufficient.
> 

The iopf_queue_flush_dev() function flushes all pending faults from the
IOMMU queue for a specific device. It has no means to flush fault queues
outside of the iommu core.

The iopf_queue_flush_dev() function is typically called when a domain is
being detached from a PASID. Hence it's necessary to flush the pending
faults from top to bottom. For example, iommufd should flush pending
faults in its fault queues after detaching the domain from the pasid.

The fault_param->lock mutex is sufficient to avoid the race condition if
the workqueue is not used. However, if the workqueue is used, then it is
possible for a workqueue thread to be in the middle of delivering a
fault while the fault queue is being flushed.
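
A minimal sketch of that ordering (the helper name is illustrative; the
concrete version shows up in the iopf_queue_flush_dev() patch later in
this thread):

static void iopf_flush_dev_sketch(struct iommu_fault_param *fault_param)
{
	/* 1. Wait for any work item that is still delivering a fault. */
	flush_workqueue(fault_param->queue->wq);

	/* 2. No new deliveries can race now; drain the lists under the lock. */
	mutex_lock(&fault_param->lock);
	/* ... discard partial entries and respond to pending faults ... */
	mutex_unlock(&fault_param->lock);
}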

Best regards,
baolu
Baolu Lu Aug. 26, 2023, 8:04 a.m. UTC | #4
On 8/25/23 4:17 PM, Tian, Kevin wrote:
>> +static void assert_no_pending_iopf(struct device *dev, ioasid_t pasid)
>> +{
>> +	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;
>> +	struct iopf_fault *iopf;
>> +
>> +	if (!iopf_param)
>> +		return;
>> +
>> +	mutex_lock(&iopf_param->lock);
>> +	list_for_each_entry(iopf, &iopf_param->partial, list) {
>> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
>> +			break;
>> +	}
> partial list is protected by dev_iommu lock.
> 

Ah, do you mind elaborating a bit more? In my mind, partial list is
protected by dev_iommu->fault_param->lock.

Best regards,
baolu
Baolu Lu Aug. 26, 2023, 8:08 a.m. UTC | #5
On 8/25/23 4:17 PM, Tian, Kevin wrote:
>> +
>> +	list_for_each_entry(iopf, &iopf_param->faults, list) {
>> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
>> +			break;
>> +	}
>> +	mutex_unlock(&iopf_param->lock);
>> +}
>> +
>>   void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
>>   {
>>   	struct iommu_group *group;
>> @@ -1959,6 +1980,7 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
>>   	if (!group)
>>   		return;
>>
>> +	assert_no_pending_iopf(dev, IOMMU_NO_PASID);
>>   	mutex_lock(&group->mutex);
>>   	if (WARN_ON(domain != group->domain) ||
>>   	    WARN_ON(list_count_nodes(&group->devices) != 1))
>> @@ -3269,6 +3291,7 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
>>   {
>>   	struct iommu_group *group = iommu_group_get(dev);
>>
>> +	assert_no_pending_iopf(dev, pasid);
> this doesn't look correct. A sane driver will stop triggering new
> page request before calling detach but there are still pending ones
> not drained until iopf_queue_flush_dev() called by
> ops->remove_dev_pasid().
> 
> then this check will cause false warning.
> 

You are right. It is not only incorrect but also pointless. The iommu
driver should flush the iopf queues in the path of detaching domains. I
will remove it if no objection.

Best regards,
baolu
Tian, Kevin Aug. 30, 2023, 7:34 a.m. UTC | #6
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Saturday, August 26, 2023 3:32 PM
> 
> On 8/25/23 4:17 PM, Tian, Kevin wrote:
> >> From: Lu Baolu <baolu.lu@linux.intel.com>
> >> Sent: Friday, August 25, 2023 10:30 AM
> >>
> >> +int iommu_sva_handle_iopf(struct iopf_group *group)
> >> +{
> >> +	struct iommu_fault_param *fault_param = group->dev->iommu->fault_param;
> >> +
> >> +	INIT_WORK(&group->work, iopf_handler);
> >> +	if (!queue_work(fault_param->queue->wq, &group->work))
> >> +		return -EBUSY;
> >> +
> >> +	return 0;
> >> +}
> >
> > this function is generic so the name should not tie to 'sva'.
> 
> Currently only sva uses it. It's fine to make it generic later when any
> new use comes. Does it work to you?
> 

It's fine given you moved this function to sva.c in the next patch.
Tian, Kevin Aug. 30, 2023, 7:43 a.m. UTC | #7
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Saturday, August 26, 2023 4:01 PM
> 
> On 8/25/23 4:17 PM, Tian, Kevin wrote:
> >> +
> >>   /**
> >>    * iopf_queue_flush_dev - Ensure that all queued faults have been
> >> processed
> >>    * @dev: the endpoint whose faults need to be flushed.
> > Presumably we also need a flush callback per domain given now
> > the use of workqueue is optional then flush_workqueue() might
> > not be sufficient.
> >
> 
> The iopf_queue_flush_dev() function flushes all pending faults from the
> IOMMU queue for a specific device. It has no means to flush fault queues
> out of iommu core.
> 
> The iopf_queue_flush_dev() function is typically called when a domain is
> detaching from a PASID. Hence it's necessary to flush the pending faults
> from top to bottom. For example, iommufd should flush pending faults in
> its fault queues after detaching the domain from the pasid.
> 

Is there an ordering problem? The last step of intel_svm_drain_prq()
in the detaching path issues a set of descriptors to drain page requests
and responses in hardware. It cannot complete unless all software queues
are drained, and it's counter-intuitive to drain a software queue after
the hardware draining has already been completed.

btw just flushing requests is probably insufficient in the iommufd case
since the responses are received asynchronously. It requires an interface
to drain both requests and responses (presumably with timeouts in case
of a malicious guest which never responds) in the detach path.

it's not a problem for sva as responses are synchronously delivered after
handling the mm fault. So it's fine not to touch it in this series, but
certainly this area needs more work when moving to support iommufd.
Tian, Kevin Aug. 30, 2023, 7:55 a.m. UTC | #8
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Saturday, August 26, 2023 4:04 PM
> 
> On 8/25/23 4:17 PM, Tian, Kevin wrote:
> >> +static void assert_no_pending_iopf(struct device *dev, ioasid_t pasid)
> >> +{
> >> +	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;
> >> +	struct iopf_fault *iopf;
> >> +
> >> +	if (!iopf_param)
> >> +		return;
> >> +
> >> +	mutex_lock(&iopf_param->lock);
> >> +	list_for_each_entry(iopf, &iopf_param->partial, list) {
> >> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
> >> +			break;
> >> +	}
> > partial list is protected by dev_iommu lock.
> >
> 
> Ah, do you mind elaborating a bit more? In my mind, partial list is
> protected by dev_iommu->fault_param->lock.
> 

well, that's not how the code is currently written. iommu_queue_iopf()
doesn't hold dev_iommu->fault_param->lock to update the partial list.

while at it, it looks like there is also a mislocking issue in
iopf_queue_discard_partial(), which only acquires queue->lock.

So we have three places touching the partial list all with different locks:

- iommu_queue_iopf() relies on dev_iommu->lock
- iopf_queue_discard_partial() relies on queue->lock
- this new assert function uses dev_iommu->fault_param->lock
Tian, Kevin Aug. 30, 2023, 8:50 a.m. UTC | #9
> From: Tian, Kevin
> Sent: Wednesday, August 30, 2023 3:44 PM
> 
> > From: Baolu Lu <baolu.lu@linux.intel.com>
> > Sent: Saturday, August 26, 2023 4:01 PM
> >
> > On 8/25/23 4:17 PM, Tian, Kevin wrote:
> > >> +
> > >>   /**
> > >>    * iopf_queue_flush_dev - Ensure that all queued faults have been
> > >> processed
> > >>    * @dev: the endpoint whose faults need to be flushed.
> > > Presumably we also need a flush callback per domain given now
> > > the use of workqueue is optional then flush_workqueue() might
> > > not be sufficient.
> > >
> >
> > The iopf_queue_flush_dev() function flushes all pending faults from the
> > IOMMU queue for a specific device. It has no means to flush fault queues
> > out of iommu core.
> >
> > The iopf_queue_flush_dev() function is typically called when a domain is
> > detaching from a PASID. Hence it's necessary to flush the pending faults
> > from top to bottom. For example, iommufd should flush pending faults in
> > its fault queues after detaching the domain from the pasid.
> >
> 
> Is there an ordering problem? The last step of intel_svm_drain_prq()
> in the detaching path issues a set of descriptors to drain page requests
> and responses in hardware. It cannot complete if not all software queues
> are drained and it's counter-intuitive to drain a software queue after
> the hardware draining has already been completed.

to be clear, it's correct to drain request queues from bottom to top as the
lower level queue is the input to the higher level queue. But for responses,
the lowest level draining needs to wait for responses from the higher levels.
It's interesting that the intel-iommu driver combines draining hw page
requests and responses in one step in intel_svm_drain_prq(). this also needs
some consideration with regard to iommufd...

> 
> btw just flushing requests is probably insufficient in iommufd case since
> the responses are received asynchronously. It requires an interface to
> drain both requests and responses (presumably with timeouts in case
> of a malicious guest which never responds) in the detach path.
> 
> it's not a problem for sva as responses are synchrounsly delivered after
> handling mm fault. So fine to not touch it in this series but certainly
> this area needs more work when moving to support iommufd. 
Vasant Hegde Aug. 30, 2023, 11:02 a.m. UTC | #10
Tian, Baolu,

On 8/30/2023 1:13 PM, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Saturday, August 26, 2023 4:01 PM
>>
>> On 8/25/23 4:17 PM, Tian, Kevin wrote:
>>>> +
>>>>   /**
>>>>    * iopf_queue_flush_dev - Ensure that all queued faults have been
>>>> processed
>>>>    * @dev: the endpoint whose faults need to be flushed.
>>> Presumably we also need a flush callback per domain given now
>>> the use of workqueue is optional then flush_workqueue() might
>>> not be sufficient.
>>>
>>
>> The iopf_queue_flush_dev() function flushes all pending faults from the
>> IOMMU queue for a specific device. It has no means to flush fault queues
>> out of iommu core.
>>
>> The iopf_queue_flush_dev() function is typically called when a domain is
>> detaching from a PASID. Hence it's necessary to flush the pending faults
>> from top to bottom. For example, iommufd should flush pending faults in
>> its fault queues after detaching the domain from the pasid.
>>
> 
> Is there an ordering problem? The last step of intel_svm_drain_prq()
> in the detaching path issues a set of descriptors to drain page requests
> and responses in hardware. It cannot complete if not all software queues
> are drained and it's counter-intuitive to drain a software queue after 
> the hardware draining has already been completed.
> 
> btw just flushing requests is probably insufficient in iommufd case since
> the responses are received asynchronously. It requires an interface to
> drain both requests and responses (presumably with timeouts in case
> of a malicious guest which never responds) in the detach path.
> 
> it's not a problem for sva as responses are synchrounsly delivered after
> handling mm fault. So fine to not touch it in this series but certainly
> this area needs more work when moving to support iommufd. 
Jean-Philippe Brucker Aug. 30, 2023, 12:49 p.m. UTC | #11
On Wed, Aug 30, 2023 at 04:32:47PM +0530, Vasant Hegde wrote:
> Tian, Baolu,
> 
> On 8/30/2023 1:13 PM, Tian, Kevin wrote:
> >> From: Baolu Lu <baolu.lu@linux.intel.com>
> >> Sent: Saturday, August 26, 2023 4:01 PM
> >>
> >> On 8/25/23 4:17 PM, Tian, Kevin wrote:
> >>>> +
> >>>>   /**
> >>>>    * iopf_queue_flush_dev - Ensure that all queued faults have been
> >>>> processed
> >>>>    * @dev: the endpoint whose faults need to be flushed.
> >>> Presumably we also need a flush callback per domain given now
> >>> the use of workqueue is optional then flush_workqueue() might
> >>> not be sufficient.
> >>>
> >>
> >> The iopf_queue_flush_dev() function flushes all pending faults from the
> >> IOMMU queue for a specific device. It has no means to flush fault queues
> >> out of iommu core.
> >>
> >> The iopf_queue_flush_dev() function is typically called when a domain is
> >> detaching from a PASID. Hence it's necessary to flush the pending faults
> >> from top to bottom. For example, iommufd should flush pending faults in
> >> its fault queues after detaching the domain from the pasid.
> >>
> > 
> > Is there an ordering problem? The last step of intel_svm_drain_prq()
> > in the detaching path issues a set of descriptors to drain page requests
> > and responses in hardware. It cannot complete if not all software queues
> > are drained and it's counter-intuitive to drain a software queue after 
> > the hardware draining has already been completed.
> > 
> > btw just flushing requests is probably insufficient in iommufd case since
> > the responses are received asynchronously. It requires an interface to
> > drain both requests and responses (presumably with timeouts in case
> > of a malicious guest which never responds) in the detach path.
> > 
> > it's not a problem for sva as responses are synchrounsly delivered after
> > handling mm fault. So fine to not touch it in this series but certainly
> > this area needs more work when moving to support iommufd. 
Vasant Hegde Aug. 31, 2023, 6:57 a.m. UTC | #12
Hi Jean,

On 8/30/2023 6:19 PM, Jean-Philippe Brucker wrote:
> On Wed, Aug 30, 2023 at 04:32:47PM +0530, Vasant Hegde wrote:
>> Tian, Baolu,
>>
>> On 8/30/2023 1:13 PM, Tian, Kevin wrote:
>>>> From: Baolu Lu <baolu.lu@linux.intel.com>
>>>> Sent: Saturday, August 26, 2023 4:01 PM
>>>>
>>>> On 8/25/23 4:17 PM, Tian, Kevin wrote:
>>>>>> +
>>>>>>   /**
>>>>>>    * iopf_queue_flush_dev - Ensure that all queued faults have been
>>>>>> processed
>>>>>>    * @dev: the endpoint whose faults need to be flushed.
>>>>> Presumably we also need a flush callback per domain given now
>>>>> the use of workqueue is optional then flush_workqueue() might
>>>>> not be sufficient.
>>>>>
>>>>
>>>> The iopf_queue_flush_dev() function flushes all pending faults from the
>>>> IOMMU queue for a specific device. It has no means to flush fault queues
>>>> out of iommu core.
>>>>
>>>> The iopf_queue_flush_dev() function is typically called when a domain is
>>>> detaching from a PASID. Hence it's necessary to flush the pending faults
>>>> from top to bottom. For example, iommufd should flush pending faults in
>>>> its fault queues after detaching the domain from the pasid.
>>>>
>>>
>>> Is there an ordering problem? The last step of intel_svm_drain_prq()
>>> in the detaching path issues a set of descriptors to drain page requests
>>> and responses in hardware. It cannot complete if not all software queues
>>> are drained and it's counter-intuitive to drain a software queue after 
>>> the hardware draining has already been completed.
>>>
>>> btw just flushing requests is probably insufficient in iommufd case since
>>> the responses are received asynchronously. It requires an interface to
>>> drain both requests and responses (presumably with timeouts in case
>>> of a malicious guest which never responds) in the detach path.
>>>
>>> it's not a problem for sva as responses are synchrounsly delivered after
>>> handling mm fault. So fine to not touch it in this series but certainly
>>> this area needs more work when moving to support iommufd. 
Baolu Lu Aug. 31, 2023, 9:27 a.m. UTC | #13
On 2023/8/30 15:43, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Saturday, August 26, 2023 4:01 PM
>>
>> On 8/25/23 4:17 PM, Tian, Kevin wrote:
>>>> +
>>>>    /**
>>>>     * iopf_queue_flush_dev - Ensure that all queued faults have been
>>>> processed
>>>>     * @dev: the endpoint whose faults need to be flushed.
>>> Presumably we also need a flush callback per domain given now
>>> the use of workqueue is optional then flush_workqueue() might
>>> not be sufficient.
>>>
>>
>> The iopf_queue_flush_dev() function flushes all pending faults from the
>> IOMMU queue for a specific device. It has no means to flush fault queues
>> out of iommu core.
>>
>> The iopf_queue_flush_dev() function is typically called when a domain is
>> detaching from a PASID. Hence it's necessary to flush the pending faults
>> from top to bottom. For example, iommufd should flush pending faults in
>> its fault queues after detaching the domain from the pasid.
>>
> 
> Is there an ordering problem? The last step of intel_svm_drain_prq()
> in the detaching path issues a set of descriptors to drain page requests
> and responses in hardware. It cannot complete if not all software queues
> are drained and it's counter-intuitive to drain a software queue after
> the hardware draining has already been completed.
> 
> btw just flushing requests is probably insufficient in iommufd case since
> the responses are received asynchronously. It requires an interface to
> drain both requests and responses (presumably with timeouts in case
> of a malicious guest which never responds) in the detach path.

You are right. Good catch.

To put it simply, iopf_queue_flush_dev() is insufficient to support the
case of forwarding iopf's over iommufd. Do I understand it right?

Perhaps we should drain the partial list and the response pending list?
With these two lists drained, no more iopf's for the specific pasid will
be forwarded up, and page responses from the upper layer will be dropped.

> 
> it's not a problem for sva as responses are synchrounsly delivered after
> handling mm fault. So fine to not touch it in this series but certainly
> this area needs more work when moving to support iommufd. 
Baolu Lu Aug. 31, 2023, 9:42 a.m. UTC | #14
On 2023/8/30 16:50, Tian, Kevin wrote:
>> From: Tian, Kevin
>> Sent: Wednesday, August 30, 2023 3:44 PM
>>
>>> From: Baolu Lu<baolu.lu@linux.intel.com>
>>> Sent: Saturday, August 26, 2023 4:01 PM
>>>
>>> On 8/25/23 4:17 PM, Tian, Kevin wrote:
>>>>> +
>>>>>    /**
>>>>>     * iopf_queue_flush_dev - Ensure that all queued faults have been
>>>>> processed
>>>>>     * @dev: the endpoint whose faults need to be flushed.
>>>> Presumably we also need a flush callback per domain given now
>>>> the use of workqueue is optional then flush_workqueue() might
>>>> not be sufficient.
>>>>
>>> The iopf_queue_flush_dev() function flushes all pending faults from the
>>> IOMMU queue for a specific device. It has no means to flush fault queues
>>> out of iommu core.
>>>
>>> The iopf_queue_flush_dev() function is typically called when a domain is
>>> detaching from a PASID. Hence it's necessary to flush the pending faults
>>> from top to bottom. For example, iommufd should flush pending faults in
>>> its fault queues after detaching the domain from the pasid.
>>>
>> Is there an ordering problem? The last step of intel_svm_drain_prq()
>> in the detaching path issues a set of descriptors to drain page requests
>> and responses in hardware. It cannot complete if not all software queues
>> are drained and it's counter-intuitive to drain a software queue after
>> the hardware draining has already been completed.
> to be clear it's correct to drain request queues from bottom to top as the
> lower level queue is the input to the higher level queue. But for response
> the lowest draining needs to wait for response from higher levels. It's
> interesting that intel-iommu driver combines draining hw page requests
> and responses in one step in intel_svm_drain_prq(). this also needs some
> consideration regarding to iommufd...
> 

I agree with you. For the responses, we can iterate over the list of
page requests pending a response. If a fault matches the pasid and the
device, we can drain it by responding with IOMMU_PAGE_RESP_INVALID to the
device.

After that, the responses for the drained faults will be dropped by the
iommu_page_response() interface.
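
A minimal sketch of that draining step (hypothetical helper; essentially
what the iopf_queue_flush_dev() change later in this thread does):

static void iopf_drain_responses_sketch(struct device *dev, ioasid_t pasid,
					struct iommu_fault_param *fault_param)
{
	const struct iommu_ops *ops = dev_iommu_ops(dev);
	struct iommu_page_response resp;
	struct iopf_fault *iopf, *next;

	mutex_lock(&fault_param->lock);
	list_for_each_entry_safe(iopf, next, &fault_param->faults, list) {
		if (iopf->fault.prm.pasid != pasid)
			continue;

		/* Auto-respond so the upper layer's late response is dropped. */
		memset(&resp, 0, sizeof(resp));
		resp.pasid = pasid;
		resp.grpid = iopf->fault.prm.grpid;
		resp.code = IOMMU_PAGE_RESP_INVALID;
		ops->page_response(dev, iopf, &resp);

		list_del(&iopf->list);
		kfree(iopf);
	}
	mutex_unlock(&fault_param->lock);
}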

Best regards,
baolu
Baolu Lu Aug. 31, 2023, 11:24 a.m. UTC | #15
On 2023/8/30 15:55, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Saturday, August 26, 2023 4:04 PM
>>
>> On 8/25/23 4:17 PM, Tian, Kevin wrote:
>>>> +static void assert_no_pending_iopf(struct device *dev, ioasid_t pasid)
>>>> +{
>>>> +	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;
>>>> +	struct iopf_fault *iopf;
>>>> +
>>>> +	if (!iopf_param)
>>>> +		return;
>>>> +
>>>> +	mutex_lock(&iopf_param->lock);
>>>> +	list_for_each_entry(iopf, &iopf_param->partial, list) {
>>>> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
>>>> +			break;
>>>> +	}
>>> partial list is protected by dev_iommu lock.
>>>
>>
>> Ah, do you mind elaborating a bit more? In my mind, partial list is
>> protected by dev_iommu->fault_param->lock.
>>
> 
> well, it's not how the code is currently written. iommu_queue_iopf()
> doesn't hold dev_iommu->fault_param->lock to update the partial
> list.
> 
> while at it looks there is also a mislocking in iopf_queue_discard_partial()
> which only acquires queue->lock.
> 
> So we have three places touching the partial list all with different locks:
> 
> - iommu_queue_iopf() relies on dev_iommu->lock
> - iopf_queue_discard_partial() relies on queue->lock
> - this new assert function uses dev_iommu->fault_param->lock

Yeah, I see your point now. Thanks for the explanation.

So, my understanding is that dev_iommu->lock protects the
dev_iommu->fault_param pointer itself, while dev_iommu->fault_param->lock
protects the lists inside it, as sketched below.
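
A minimal sketch of that convention (the helper name is illustrative; the
real code in the follow-up patch additionally uses a reference count so it
does not need to hold both locks across the list walks):

static void iopf_walk_faults_sketch(struct device *dev, ioasid_t pasid)
{
	struct dev_iommu *param = dev->iommu;
	struct iommu_fault_param *fault_param;
	struct iopf_fault *iopf;

	mutex_lock(&param->lock);		/* guards the fault_param pointer */
	fault_param = param->fault_param;
	if (!fault_param) {
		mutex_unlock(&param->lock);
		return;
	}

	mutex_lock(&fault_param->lock);		/* guards the partial/faults lists */
	list_for_each_entry(iopf, &fault_param->partial, list) {
		if (iopf->fault.prm.pasid == pasid)
			break;			/* inspect or drain the entry */
	}
	mutex_unlock(&fault_param->lock);
	mutex_unlock(&param->lock);
}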

Is this locking mechanism different from what you think?

Best regards,
baolu
Tian, Kevin Sept. 1, 2023, 2:49 a.m. UTC | #16
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Thursday, August 31, 2023 5:28 PM
> 
> On 2023/8/30 15:43, Tian, Kevin wrote:
> >> From: Baolu Lu <baolu.lu@linux.intel.com>
> >> Sent: Saturday, August 26, 2023 4:01 PM
> >>
> >> On 8/25/23 4:17 PM, Tian, Kevin wrote:
> >>>> +
> >>>>    /**
> >>>>     * iopf_queue_flush_dev - Ensure that all queued faults have been
> >>>> processed
> >>>>     * @dev: the endpoint whose faults need to be flushed.
> >>> Presumably we also need a flush callback per domain given now
> >>> the use of workqueue is optional then flush_workqueue() might
> >>> not be sufficient.
> >>>
> >>
> >> The iopf_queue_flush_dev() function flushes all pending faults from the
> >> IOMMU queue for a specific device. It has no means to flush fault queues
> >> out of iommu core.
> >>
> >> The iopf_queue_flush_dev() function is typically called when a domain is
> >> detaching from a PASID. Hence it's necessary to flush the pending faults
> >> from top to bottom. For example, iommufd should flush pending faults in
> >> its fault queues after detaching the domain from the pasid.
> >>
> >
> > Is there an ordering problem? The last step of intel_svm_drain_prq()
> > in the detaching path issues a set of descriptors to drain page requests
> > and responses in hardware. It cannot complete if not all software queues
> > are drained and it's counter-intuitive to drain a software queue after
> > the hardware draining has already been completed.
> >
> > btw just flushing requests is probably insufficient in iommufd case since
> > the responses are received asynchronously. It requires an interface to
> > drain both requests and responses (presumably with timeouts in case
> > of a malicious guest which never responds) in the detach path.
> 
> You are right. Good catch.
> 
> To put it simply, iopf_queue_flush_dev() is insufficient to support the
> case of forwarding iopf's over iommufd. Do I understand it right?

yes

> 
> Perhaps we should drain the partial list and the response pending list?
> With these two lists drained, no more iopf's for the specific pasid will
> be forwarded up, and page response from upper layer will be dropped.
> 

correct.
Tian, Kevin Sept. 1, 2023, 2:50 a.m. UTC | #17
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Thursday, August 31, 2023 7:25 PM
> 
> On 2023/8/30 15:55, Tian, Kevin wrote:
> >> From: Baolu Lu <baolu.lu@linux.intel.com>
> >> Sent: Saturday, August 26, 2023 4:04 PM
> >>
> >> On 8/25/23 4:17 PM, Tian, Kevin wrote:
> >>>> +static void assert_no_pending_iopf(struct device *dev, ioasid_t pasid)
> >>>> +{
> >>>> +	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;
> >>>> +	struct iopf_fault *iopf;
> >>>> +
> >>>> +	if (!iopf_param)
> >>>> +		return;
> >>>> +
> >>>> +	mutex_lock(&iopf_param->lock);
> >>>> +	list_for_each_entry(iopf, &iopf_param->partial, list) {
> >>>> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
> >>>> +			break;
> >>>> +	}
> >>> partial list is protected by dev_iommu lock.
> >>>
> >>
> >> Ah, do you mind elaborating a bit more? In my mind, partial list is
> >> protected by dev_iommu->fault_param->lock.
> >>
> >
> > well, it's not how the code is currently written. iommu_queue_iopf()
> > doesn't hold dev_iommu->fault_param->lock to update the partial
> > list.
> >
> > while at it looks there is also a mislocking in iopf_queue_discard_partial()
> > which only acquires queue->lock.
> >
> > So we have three places touching the partial list all with different locks:
> >
> > - iommu_queue_iopf() relies on dev_iommu->lock
> > - iopf_queue_discard_partial() relies on queue->lock
> > - this new assert function uses dev_iommu->fault_param->lock
> 
> Yeah, I see your point now. Thanks for the explanation.
> 
> So, my understanding is that dev_iommu->lock protects the whole
> pointer of dev_iommu->fault_param, while dev_iommu->fault_param->lock
> protects the lists inside it.
> 

yes. let's use fault_param->lock consistently for those lists in all paths.
Baolu Lu Sept. 5, 2023, 5:19 a.m. UTC | #18
On 2023/9/1 10:49, Tian, Kevin wrote:
>> From: Baolu Lu<baolu.lu@linux.intel.com>
>> Sent: Thursday, August 31, 2023 5:28 PM
>>
>> On 2023/8/30 15:43, Tian, Kevin wrote:
>>>> From: Baolu Lu<baolu.lu@linux.intel.com>
>>>> Sent: Saturday, August 26, 2023 4:01 PM
>>>>
>>>> On 8/25/23 4:17 PM, Tian, Kevin wrote:
>>>>>> +
>>>>>>     /**
>>>>>>      * iopf_queue_flush_dev - Ensure that all queued faults have been
>>>>>> processed
>>>>>>      * @dev: the endpoint whose faults need to be flushed.
>>>>> Presumably we also need a flush callback per domain given now
>>>>> the use of workqueue is optional then flush_workqueue() might
>>>>> not be sufficient.
>>>>>
>>>> The iopf_queue_flush_dev() function flushes all pending faults from the
>>>> IOMMU queue for a specific device. It has no means to flush fault queues
>>>> out of iommu core.
>>>>
>>>> The iopf_queue_flush_dev() function is typically called when a domain is
>>>> detaching from a PASID. Hence it's necessary to flush the pending faults
>>>> from top to bottom. For example, iommufd should flush pending faults in
>>>> its fault queues after detaching the domain from the pasid.
>>>>
>>> Is there an ordering problem? The last step of intel_svm_drain_prq()
>>> in the detaching path issues a set of descriptors to drain page requests
>>> and responses in hardware. It cannot complete if not all software queues
>>> are drained and it's counter-intuitive to drain a software queue after
>>> the hardware draining has already been completed.
>>>
>>> btw just flushing requests is probably insufficient in iommufd case since
>>> the responses are received asynchronously. It requires an interface to
>>> drain both requests and responses (presumably with timeouts in case
>>> of a malicious guest which never responds) in the detach path.
>> You are right. Good catch.
>>
>> To put it simply, iopf_queue_flush_dev() is insufficient to support the
>> case of forwarding iopf's over iommufd. Do I understand it right?
> yes

I added the patch below to address the iopf_queue_flush_dev() issue. What
do you think of it?

iommu: Improve iopf_queue_flush_dev()

The iopf_queue_flush_dev() is called by the iommu driver before releasing
a PASID. It ensures that all pending faults for this PASID have been
handled or cancelled, and won't hit the address space that reuses this
PASID. The driver must make sure that no new fault is added to the queue.

The SMMUv3 driver doesn't use it because it only implements the
Arm-specific stall fault model where DMA transactions are held in the SMMU
while waiting for the OS to handle iopf's. Since a device driver must
complete all DMA transactions before detaching domain, there are no
pending iopf's with the stall model. PRI support requires adding a call to
iopf_queue_flush_dev() after flushing the hardware page fault queue.

The current implementation of iopf_queue_flush_dev() is a simplified
version. It is only suitable for the SVA case, in which the processing of
iopf's is implemented in the inner loop of the iommu subsystem.

Improve this interface to make it also work for handling iopf's outside
of the iommu core.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
  include/linux/iommu.h      |  4 ++--
  drivers/iommu/intel/svm.c  |  2 +-
  drivers/iommu/io-pgfault.c | 40 ++++++++++++++++++++++++++++++++++++--
  3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 77ad33ffe3ac..465e23e945d0 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1275,7 +1275,7 @@ iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm)
  #ifdef CONFIG_IOMMU_IOPF
  int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev);
  int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev);
-int iopf_queue_flush_dev(struct device *dev);
+int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid);
  struct iopf_queue *iopf_queue_alloc(const char *name);
  void iopf_queue_free(struct iopf_queue *queue);
  int iopf_queue_discard_partial(struct iopf_queue *queue);
@@ -1295,7 +1295,7 @@ iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
  	return -ENODEV;
  }

-static inline int iopf_queue_flush_dev(struct device *dev)
+static inline int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid)
  {
  	return -ENODEV;
  }
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 780c5bd73ec2..4c3f4533e337 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -495,7 +495,7 @@ void intel_drain_pasid_prq(struct device *dev, u32 pasid)
  		goto prq_retry;
  	}

-	iopf_queue_flush_dev(dev);
+	iopf_queue_flush_dev(dev, pasid);

  	/*
  	 * Perform steps described in VT-d spec CH7.10 to drain page
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 3e6845bc5902..84728fb89ac7 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -309,17 +309,53 @@ EXPORT_SYMBOL_GPL(iommu_page_response);
   *
   * Return: 0 on success and <0 on error.
   */
-int iopf_queue_flush_dev(struct device *dev)
+int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid)
  {
  	struct iommu_fault_param *iopf_param = iopf_get_dev_fault_param(dev);
+	const struct iommu_ops *ops = dev_iommu_ops(dev);
+	struct iommu_page_response resp;
+	struct iopf_fault *iopf, *next;
+	int ret = 0;

  	if (!iopf_param)
  		return -ENODEV;

  	flush_workqueue(iopf_param->queue->wq);
+
+	mutex_lock(&iopf_param->lock);
+	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
+		if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) ||
+		    iopf->fault.prm.pasid != pasid)
+			break;
+
+		list_del(&iopf->list);
+		kfree(iopf);
+	}
+
+	list_for_each_entry_safe(iopf, next, &iopf_param->faults, list) {
+		if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) ||
+		    iopf->fault.prm.pasid != pasid)
+			continue;
+
+		memset(&resp, 0, sizeof(struct iommu_page_response));
+		resp.pasid = iopf->fault.prm.pasid;
+		resp.grpid = iopf->fault.prm.grpid;
+		resp.code = IOMMU_PAGE_RESP_INVALID;
+
+		if (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)
+			resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
+
+		ret = ops->page_response(dev, iopf, &resp);
+		if (ret)
+			break;
+
+		list_del(&iopf->list);
+		kfree(iopf);
+	}
+	mutex_unlock(&iopf_param->lock);
  	iopf_put_dev_fault_param(iopf_param);

-	return 0;
+	return ret;
  }
  EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);

Best regards,
baolu
Baolu Lu Sept. 5, 2023, 5:24 a.m. UTC | #19
On 2023/9/1 10:50, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Thursday, August 31, 2023 7:25 PM
>>
>> On 2023/8/30 15:55, Tian, Kevin wrote:
>>>> From: Baolu Lu <baolu.lu@linux.intel.com>
>>>> Sent: Saturday, August 26, 2023 4:04 PM
>>>>
>>>> On 8/25/23 4:17 PM, Tian, Kevin wrote:
>>>>>> +static void assert_no_pending_iopf(struct device *dev, ioasid_t pasid)
>>>>>> +{
>>>>>> +	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;
>>>>>> +	struct iopf_fault *iopf;
>>>>>> +
>>>>>> +	if (!iopf_param)
>>>>>> +		return;
>>>>>> +
>>>>>> +	mutex_lock(&iopf_param->lock);
>>>>>> +	list_for_each_entry(iopf, &iopf_param->partial, list) {
>>>>>> +		if (WARN_ON(iopf->fault.prm.pasid == pasid))
>>>>>> +			break;
>>>>>> +	}
>>>>> partial list is protected by dev_iommu lock.
>>>>>
>>>>
>>>> Ah, do you mind elaborating a bit more? In my mind, partial list is
>>>> protected by dev_iommu->fault_param->lock.
>>>>
>>>
>>> well, it's not how the code is currently written. iommu_queue_iopf()
>>> doesn't hold dev_iommu->fault_param->lock to update the partial
>>> list.
>>>
>>> while at it looks there is also a mislocking in iopf_queue_discard_partial()
>>> which only acquires queue->lock.
>>>
>>> So we have three places touching the partial list all with different locks:
>>>
>>> - iommu_queue_iopf() relies on dev_iommu->lock
>>> - iopf_queue_discard_partial() relies on queue->lock
>>> - this new assert function uses dev_iommu->fault_param->lock
>>
>> Yeah, I see your point now. Thanks for the explanation.
>>
>> So, my understanding is that dev_iommu->lock protects the whole
>> pointer of dev_iommu->fault_param, while dev_iommu->fault_param->lock
>> protects the lists inside it.
>>
> 
> yes. let's use fault_param->lock consistently for those lists in all paths.

Hi Kevin,

I am trying to address this issue in the patch below. Does it look sane
to you?

iommu: Consolidate per-device fault data management

The per-device fault data is a data structure that is used to store
information about faults that occur on a device. This data is allocated
when IOPF is enabled on the device and freed when IOPF is disabled. The
data is used in the paths of iopf reporting, handling, responding, and
draining.

The fault data is protected by two locks:

- dev->iommu->lock: This lock is used to protect the allocation and
   freeing of the fault data.
- dev->iommu->fault_parameter->lock: This lock is used to protect the
   fault data itself.

Improve the iopf code to enforce this locking scheme and add a reference
counter to the fault data to avoid use-after-free issues.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
  include/linux/iommu.h      |   3 +
  drivers/iommu/io-pgfault.c | 122 +++++++++++++++++++++++--------------
  2 files changed, 79 insertions(+), 46 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1697ac168f05..77ad33ffe3ac 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -480,6 +480,8 @@ struct iommu_device {
  /**
   * struct iommu_fault_param - per-device IOMMU fault data
   * @lock: protect pending faults list
+ * @users: user counter to manage the lifetime of the data, this field
+ *         is protected by dev->iommu->lock.
   * @dev: the device that owns this param
   * @queue: IOPF queue
   * @queue_list: index into queue->devices
@@ -489,6 +491,7 @@ struct iommu_device {
   */
  struct iommu_fault_param {
  	struct mutex lock;
+	int users;

  	struct device *dev;
  	struct iopf_queue *queue;
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 7e5c6798ce24..3e6845bc5902 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -26,6 +26,49 @@ void iopf_free_group(struct iopf_group *group)
  }
  EXPORT_SYMBOL_GPL(iopf_free_group);

+/*
+ * Return the fault parameter of a device if it exists. Otherwise, return NULL.
+ * On a successful return, the caller takes a reference of this parameter and
+ * should put it after use by calling iopf_put_dev_fault_param().
+ */
+static struct iommu_fault_param *iopf_get_dev_fault_param(struct device *dev)
+{
+	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_param *fault_param;
+
+	if (!param)
+		return NULL;
+
+	mutex_lock(&param->lock);
+	fault_param = param->fault_param;
+	if (fault_param)
+		fault_param->users++;
+	mutex_unlock(&param->lock);
+
+	return fault_param;
+}
+
+/* Caller must hold a reference of the fault parameter. */
+static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param)
+{
+	struct device *dev = fault_param->dev;
+	struct dev_iommu *param = dev->iommu;
+
+	mutex_lock(&param->lock);
+	if (WARN_ON(fault_param->users <= 0 ||
+		    fault_param != param->fault_param)) {
+		mutex_unlock(&param->lock);
+		return;
+	}
+
+	if (--fault_param->users == 0) {
+		param->fault_param = NULL;
+		kfree(fault_param);
+		put_device(dev);
+	}
+	mutex_unlock(&param->lock);
+}
+
  /**
   * iommu_handle_iopf - IO Page Fault handler
   * @fault: fault event
@@ -72,23 +115,14 @@ static int iommu_handle_iopf(struct iommu_fault *fault, struct device *dev)
  	struct iopf_group *group;
  	struct iommu_domain *domain;
  	struct iopf_fault *iopf, *next;
-	struct iommu_fault_param *iopf_param;
-	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;

-	lockdep_assert_held(&param->lock);
+	lockdep_assert_held(&iopf_param->lock);

  	if (fault->type != IOMMU_FAULT_PAGE_REQ)
  		/* Not a recoverable page fault */
  		return -EOPNOTSUPP;

-	/*
-	 * As long as we're holding param->lock, the queue can't be unlinked
-	 * from the device and therefore cannot disappear.
-	 */
-	iopf_param = param->fault_param;
-	if (!iopf_param)
-		return -ENODEV;
-
  	if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
  		iopf = kzalloc(sizeof(*iopf), GFP_KERNEL);
  		if (!iopf)
@@ -167,18 +201,15 @@ static int iommu_handle_iopf(struct iommu_fault *fault, struct device *dev)
   */
  int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
  {
-	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_param *fault_param;
  	struct iopf_fault *evt_pending = NULL;
-	struct iommu_fault_param *fparam;
  	int ret = 0;

-	if (!param || !evt)
+	fault_param = iopf_get_dev_fault_param(dev);
+	if (!fault_param)
  		return -EINVAL;

-	/* we only report device fault if there is a handler registered */
-	mutex_lock(&param->lock);
-	fparam = param->fault_param;
-
+	mutex_lock(&fault_param->lock);
  	if (evt->fault.type == IOMMU_FAULT_PAGE_REQ &&
  	    (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
  		evt_pending = kmemdup(evt, sizeof(struct iopf_fault),
@@ -187,20 +218,18 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt)
  			ret = -ENOMEM;
  			goto done_unlock;
  		}
-		mutex_lock(&fparam->lock);
-		list_add_tail(&evt_pending->list, &fparam->faults);
-		mutex_unlock(&fparam->lock);
+		list_add_tail(&evt_pending->list, &fault_param->faults);
  	}

  	ret = iommu_handle_iopf(&evt->fault, dev);
  	if (ret && evt_pending) {
-		mutex_lock(&fparam->lock);
  		list_del(&evt_pending->list);
-		mutex_unlock(&fparam->lock);
  		kfree(evt_pending);
  	}
  done_unlock:
-	mutex_unlock(&param->lock);
+	mutex_unlock(&fault_param->lock);
+	iopf_put_dev_fault_param(fault_param);
+
  	return ret;
  }
  EXPORT_SYMBOL_GPL(iommu_report_device_fault);
@@ -212,19 +241,20 @@ int iommu_page_response(struct device *dev,
  	int ret = -EINVAL;
  	struct iopf_fault *evt;
  	struct iommu_fault_page_request *prm;
-	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_param *fault_param;
  	const struct iommu_ops *ops = dev_iommu_ops(dev);
  	bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID;

  	if (!ops->page_response)
  		return -ENODEV;

-	if (!param || !param->fault_param)
-		return -EINVAL;
+	fault_param = iopf_get_dev_fault_param(dev);
+	if (!fault_param)
+		return -ENODEV;

  	/* Only send response if there is a fault report pending */
-	mutex_lock(&param->fault_param->lock);
-	if (list_empty(&param->fault_param->faults)) {
+	mutex_lock(&fault_param->lock);
+	if (list_empty(&fault_param->faults)) {
  		dev_warn_ratelimited(dev, "no pending PRQ, drop response\n");
  		goto done_unlock;
  	}
@@ -232,7 +262,7 @@ int iommu_page_response(struct device *dev,
  	 * Check if we have a matching page request pending to respond,
  	 * otherwise return -EINVAL
  	 */
-	list_for_each_entry(evt, &param->fault_param->faults, list) {
+	list_for_each_entry(evt, &fault_param->faults, list) {
  		prm = &evt->fault.prm;
  		if (prm->grpid != msg->grpid)
  			continue;
@@ -260,7 +290,9 @@ int iommu_page_response(struct device *dev,
  	}

  done_unlock:
-	mutex_unlock(&param->fault_param->lock);
+	mutex_unlock(&fault_param->lock);
+	iopf_put_dev_fault_param(fault_param);
+
  	return ret;
  }
  EXPORT_SYMBOL_GPL(iommu_page_response);
@@ -279,22 +311,15 @@ EXPORT_SYMBOL_GPL(iommu_page_response);
   */
  int iopf_queue_flush_dev(struct device *dev)
  {
-	int ret = 0;
-	struct iommu_fault_param *iopf_param;
-	struct dev_iommu *param = dev->iommu;
+	struct iommu_fault_param *iopf_param = iopf_get_dev_fault_param(dev);

-	if (!param)
+	if (!iopf_param)
  		return -ENODEV;

-	mutex_lock(&param->lock);
-	iopf_param = param->fault_param;
-	if (iopf_param)
-		flush_workqueue(iopf_param->queue->wq);
-	else
-		ret = -ENODEV;
-	mutex_unlock(&param->lock);
+	flush_workqueue(iopf_param->queue->wq);
+	iopf_put_dev_fault_param(iopf_param);

-	return ret;
+	return 0;
  }
  EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);

@@ -318,11 +343,13 @@ int iopf_queue_discard_partial(struct iopf_queue *queue)

  	mutex_lock(&queue->lock);
  	list_for_each_entry(iopf_param, &queue->devices, queue_list) {
+		mutex_lock(&iopf_param->lock);
  		list_for_each_entry_safe(iopf, next, &iopf_param->partial,
  					 list) {
  			list_del(&iopf->list);
  			kfree(iopf);
  		}
+		mutex_unlock(&iopf_param->lock);
  	}
  	mutex_unlock(&queue->lock);
  	return 0;
@@ -361,6 +388,7 @@ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev)
  	INIT_LIST_HEAD(&fault_param->faults);
  	INIT_LIST_HEAD(&fault_param->partial);
  	fault_param->dev = dev;
+	fault_param->users = 1;
  	list_add(&fault_param->queue_list, &queue->devices);
  	fault_param->queue = queue;

@@ -413,9 +441,11 @@ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
  	list_for_each_entry_safe(iopf, next, &fault_param->partial, list)
  		kfree(iopf);

-	param->fault_param = NULL;
-	kfree(fault_param);
-	put_device(dev);
+	if (--fault_param->users == 0) {
+		param->fault_param = NULL;
+		kfree(fault_param);
+		put_device(dev);
+	}
  unlock:
  	mutex_unlock(&param->lock);
  	mutex_unlock(&queue->lock);
Tian, Kevin Sept. 11, 2023, 6:35 a.m. UTC | #20
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Tuesday, September 5, 2023 1:20 PM
> 
> I added below patch to address the iopf_queue_flush_dev() issue. What do
> you think of this?
> 
> iommu: Improve iopf_queue_flush_dev()
> 
> The iopf_queue_flush_dev() is called by the iommu driver before releasing
> a PASID. It ensures that all pending faults for this PASID have been
> handled or cancelled, and won't hit the address space that reuses this
> PASID. The driver must make sure that no new fault is added to the queue.
> 
> The SMMUv3 driver doesn't use it because it only implements the
> Arm-specific stall fault model where DMA transactions are held in the SMMU
> while waiting for the OS to handle iopf's. Since a device driver must
> complete all DMA transactions before detaching domain, there are no
> pending iopf's with the stall model. PRI support requires adding a call to
> iopf_queue_flush_dev() after flushing the hardware page fault queue.
> 
> The current implementation of iopf_queue_flush_dev() is a simplified
> version. It is only suitable for SVA case in which the processing of iopf
> is implemented in the inner loop of the iommu subsystem.
> 
> Improve this interface to make it also work for handling iopf out of the
> iommu core.
> 
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>   include/linux/iommu.h      |  4 ++--
>   drivers/iommu/intel/svm.c  |  2 +-
>   drivers/iommu/io-pgfault.c | 40 ++++++++++++++++++++++++++++++++++++--
>   3 files changed, 41 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 77ad33ffe3ac..465e23e945d0 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -1275,7 +1275,7 @@ iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm)
>   #ifdef CONFIG_IOMMU_IOPF
>   int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev);
>   int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev);
> -int iopf_queue_flush_dev(struct device *dev);
> +int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid);
>   struct iopf_queue *iopf_queue_alloc(const char *name);
>   void iopf_queue_free(struct iopf_queue *queue);
>   int iopf_queue_discard_partial(struct iopf_queue *queue);
> @@ -1295,7 +1295,7 @@ iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev)
>   	return -ENODEV;
>   }
> 
> -static inline int iopf_queue_flush_dev(struct device *dev)
> +static inline int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid)
>   {
>   	return -ENODEV;
>   }
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index 780c5bd73ec2..4c3f4533e337 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -495,7 +495,7 @@ void intel_drain_pasid_prq(struct device *dev, u32 pasid)
>   		goto prq_retry;
>   	}
> 
> -	iopf_queue_flush_dev(dev);
> +	iopf_queue_flush_dev(dev, pasid);
> 
>   	/*
>   	 * Perform steps described in VT-d spec CH7.10 to drain page
> diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
> index 3e6845bc5902..84728fb89ac7 100644
> --- a/drivers/iommu/io-pgfault.c
> +++ b/drivers/iommu/io-pgfault.c
> @@ -309,17 +309,53 @@ EXPORT_SYMBOL_GPL(iommu_page_response);
>    *
>    * Return: 0 on success and <0 on error.
>    */
> -int iopf_queue_flush_dev(struct device *dev)
> +int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid)
>   {
>   	struct iommu_fault_param *iopf_param = iopf_get_dev_fault_param(dev);
> +	const struct iommu_ops *ops = dev_iommu_ops(dev);
> +	struct iommu_page_response resp;
> +	struct iopf_fault *iopf, *next;
> +	int ret = 0;
> 
>   	if (!iopf_param)
>   		return -ENODEV;
> 
>   	flush_workqueue(iopf_param->queue->wq);
> +
> +	mutex_lock(&iopf_param->lock);
> +	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
> +		if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) ||
> +		    iopf->fault.prm.pasid != pasid)
> +			break;
> +
> +		list_del(&iopf->list);
> +		kfree(iopf);
> +	}
> +
> +	list_for_each_entry_safe(iopf, next, &iopf_param->faults, list) {
> +		if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) ||
> +		    iopf->fault.prm.pasid != pasid)
> +			continue;
> +
> +		memset(&resp, 0, sizeof(struct iommu_page_response));
> +		resp.pasid = iopf->fault.prm.pasid;
> +		resp.grpid = iopf->fault.prm.grpid;
> +		resp.code = IOMMU_PAGE_RESP_INVALID;
> +
> +		if (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)
> +			resp.flags = IOMMU_PAGE_RESP_PASID_VALID;

Out of curiosity: is it a valid configuration to have REQUEST_PASID_VALID
set but RESP_PASID_VALID cleared? I'm unclear why another response
flag is required beyond what the request flag has already told us...

> +
> +		ret = ops->page_response(dev, iopf, &resp);
> +		if (ret)
> +			break;
> +
> +		list_del(&iopf->list);
> +		kfree(iopf);
> +	}
> +	mutex_unlock(&iopf_param->lock);
>   	iopf_put_dev_fault_param(iopf_param);
> 
> -	return 0;
> +	return ret;
>   }
>   EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);
> 

This looks OK. Another nit is that the warning of "no pending PRQ"
in iommu_page_response() should be removed, given that with the above
change it's expected for iommufd responses to be received after this
flush operation in the iommu core.
Tian, Kevin Sept. 11, 2023, 6:57 a.m. UTC | #21
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Tuesday, September 5, 2023 1:24 PM
> 
> Hi Kevin,
> 
> I am trying to address this issue in below patch. Does it looks sane to
> you?
> 
> iommu: Consolidate per-device fault data management
> 
> The per-device fault data is a data structure that is used to store
> information about faults that occur on a device. This data is allocated
> when IOPF is enabled on the device and freed when IOPF is disabled. The
> data is used in the paths of iopf reporting, handling, responding, and
> draining.
> 
> The fault data is protected by two locks:
> 
> - dev->iommu->lock: This lock is used to protect the allocation and
>    freeing of the fault data.
> - dev->iommu->fault_parameter->lock: This lock is used to protect the
>    fault data itself.
> 
> Improve the iopf code to enforce this lock mechanism and add a reference
> counter in the fault data to avoid use-after-free issue.
> 

Can you elaborate on the use-after-free issue and why a new user count
is required?

btw a Fixes tag is required given this mislocking issue has been there
for quite some time...
Baolu Lu Sept. 11, 2023, 12:26 p.m. UTC | #22
Hi Kevin,

Thanks for looking at my patches.

On 2023/9/11 14:35, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Tuesday, September 5, 2023 1:20 PM
>>
>> I added below patch to address the iopf_queue_flush_dev() issue. What do
>> you think of this?
>>
>> iommu: Improve iopf_queue_flush_dev()
>>
>> The iopf_queue_flush_dev() is called by the iommu driver before releasing
>> a PASID. It ensures that all pending faults for this PASID have been
>> handled or cancelled, and won't hit the address space that reuses this
>> PASID. The driver must make sure that no new fault is added to the queue.
>>
>> The SMMUv3 driver doesn't use it because it only implements the
>> Arm-specific stall fault model where DMA transactions are held in the SMMU
>> while waiting for the OS to handle iopf's. Since a device driver must
>> complete all DMA transactions before detaching domain, there are no
>> pending iopf's with the stall model. PRI support requires adding a call to
>> iopf_queue_flush_dev() after flushing the hardware page fault queue.
>>
>> The current implementation of iopf_queue_flush_dev() is a simplified
>> version. It is only suitable for SVA case in which the processing of iopf
>> is implemented in the inner loop of the iommu subsystem.
>>
>> Improve this interface to make it also work for handling iopf out of the
>> iommu core.
>>
>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>> ---
>>    include/linux/iommu.h      |  4 ++--
>>    drivers/iommu/intel/svm.c  |  2 +-
>>    drivers/iommu/io-pgfault.c | 40 ++++++++++++++++++++++++++++++++++++--
>>    3 files changed, 41 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index 77ad33ffe3ac..465e23e945d0 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -1275,7 +1275,7 @@ iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm)
>>    #ifdef CONFIG_IOMMU_IOPF
>>    int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev);
>>    int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev);
>> -int iopf_queue_flush_dev(struct device *dev);
>> +int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid);
>>    struct iopf_queue *iopf_queue_alloc(const char *name);
>>    void iopf_queue_free(struct iopf_queue *queue);
>>    int iopf_queue_discard_partial(struct iopf_queue *queue);
>> @@ -1295,7 +1295,7 @@ iopf_queue_remove_device(struct iopf_queue
>> *queue,
>> struct device *dev)
>>    	return -ENODEV;
>>    }
>>
>> -static inline int iopf_queue_flush_dev(struct device *dev)
>> +static inline int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid)
>>    {
>>    	return -ENODEV;
>>    }
>> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
>> index 780c5bd73ec2..4c3f4533e337 100644
>> --- a/drivers/iommu/intel/svm.c
>> +++ b/drivers/iommu/intel/svm.c
>> @@ -495,7 +495,7 @@ void intel_drain_pasid_prq(struct device *dev, u32
>> pasid)
>>    		goto prq_retry;
>>    	}
>>
>> -	iopf_queue_flush_dev(dev);
>> +	iopf_queue_flush_dev(dev, pasid);
>>
>>    	/*
>>    	 * Perform steps described in VT-d spec CH7.10 to drain page
>> diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
>> index 3e6845bc5902..84728fb89ac7 100644
>> --- a/drivers/iommu/io-pgfault.c
>> +++ b/drivers/iommu/io-pgfault.c
>> @@ -309,17 +309,53 @@ EXPORT_SYMBOL_GPL(iommu_page_response);
>>     *
>>     * Return: 0 on success and <0 on error.
>>     */
>> -int iopf_queue_flush_dev(struct device *dev)
>> +int iopf_queue_flush_dev(struct device *dev, ioasid_t pasid)
>>    {
>>    	struct iommu_fault_param *iopf_param =
>> iopf_get_dev_fault_param(dev);
>> +	const struct iommu_ops *ops = dev_iommu_ops(dev);
>> +	struct iommu_page_response resp;
>> +	struct iopf_fault *iopf, *next;
>> +	int ret = 0;
>>
>>    	if (!iopf_param)
>>    		return -ENODEV;
>>
>>    	flush_workqueue(iopf_param->queue->wq);
>> +
>> +	mutex_lock(&iopf_param->lock);
>> +	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
>> +		if (!(iopf->fault.prm.flags &
>> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) ||
>> +		    iopf->fault.prm.pasid != pasid)
>> +			break;
>> +
>> +		list_del(&iopf->list);
>> +		kfree(iopf);
>> +	}
>> +
>> +	list_for_each_entry_safe(iopf, next, &iopf_param->faults, list) {
>> +		if (!(iopf->fault.prm.flags &
>> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) ||
>> +		    iopf->fault.prm.pasid != pasid)
>> +			continue;
>> +
>> +		memset(&resp, 0, sizeof(struct iommu_page_response));
>> +		resp.pasid = iopf->fault.prm.pasid;
>> +		resp.grpid = iopf->fault.prm.grpid;
>> +		resp.code = IOMMU_PAGE_RESP_INVALID;
>> +
>> +		if (iopf->fault.prm.flags &
>> IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)
>> +			resp.flags = IOMMU_PAGE_RESP_PASID_VALID;
> 
> Out of curiosity. Is it a valid configuration which has REQUEST_PASID_VALID
> set but RESP_PASID_VALID cleared? I'm unclear why another response
> flag is required beyond what the request flag has told...

This seems to have uncovered a bug in the VT-d driver.

The PCIe spec (Section 10.4.2.2) states:

"
If a Page Request has a PASID, the corresponding PRG Response Message
may optionally contain one as well.

If the PRG Response PASID Required bit is Clear, PRG Response Messages
do not have a PASID. If the PRG Response PASID Required bit is Set, PRG
Response Messages have a PASID if the Page Request also had one. The
Function is permitted to use the PASID value from the prefix in
conjunction with the PRG Index to match requests and responses.
"

The "PRG Response PASID Required bit" is a read-only field in the PCI
page request status register. It is represented by
"pdev->pasid_required".

So the code below in the VT-d driver is not correct:

static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
                                struct page_req_dsc *desc)
{
[...]

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present) {
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
	}
[...]

The right logic should be

	if (pdev->pasid_required)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
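
For completeness, the surrounding flag-setting would then read roughly as
below (a sketch only; it assumes the struct pci_dev is obtained via
to_pci_dev(dev), which may or may not match how the driver actually gets at
pasid_required):

	struct pci_dev *pdev = to_pci_dev(dev);	/* assumption: dev is a PCI device */

	if (desc->lpig)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
	if (desc->pasid_present)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
	/*
	 * Base the response-needs-PASID flag on the read-only "PRG Response
	 * PASID Required" bit rather than on the request carrying a PASID.
	 */
	if (pdev->pasid_required)
		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;

That way REQUEST_PASID_VALID keeps describing the incoming request, while
RESPONSE_NEEDS_PASID purely reflects the device's PRG Response PASID
Required capability.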

Thoughts?

>> +
>> +		ret = ops->page_response(dev, iopf, &resp);
>> +		if (ret)
>> +			break;
>> +
>> +		list_del(&iopf->list);
>> +		kfree(iopf);
>> +	}
>> +	mutex_unlock(&iopf_param->lock);
>>    	iopf_put_dev_fault_param(iopf_param);
>>
>> -	return 0;
>> +	return ret;
>>    }
>>    EXPORT_SYMBOL_GPL(iopf_queue_flush_dev);
>>
> 
> This looks OK. Another nit is that the warning of "no pending PRQ"
> in iommu_page_response() should be removed given with above
> change it's expected for iommufd responses to be received after this
> flush operation in iommu core.

Yeah! Addressed.

Best regards,
baolu
Baolu Lu Sept. 11, 2023, 12:46 p.m. UTC | #23
On 2023/9/11 14:57, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Tuesday, September 5, 2023 1:24 PM
>>
>> Hi Kevin,
>>
>> I am trying to address this issue in below patch. Does it looks sane to
>> you?
>>
>> iommu: Consolidate per-device fault data management
>>
>> The per-device fault data is a data structure that is used to store
>> information about faults that occur on a device. This data is allocated
>> when IOPF is enabled on the device and freed when IOPF is disabled. The
>> data is used in the paths of iopf reporting, handling, responding, and
>> draining.
>>
>> The fault data is protected by two locks:
>>
>> - dev->iommu->lock: This lock is used to protect the allocation and
>>     freeing of the fault data.
>> - dev->iommu->fault_parameter->lock: This lock is used to protect the
>>     fault data itself.
>>
>> Improve the iopf code to enforce this lock mechanism and add a reference
>> counter in the fault data to avoid use-after-free issue.
>>
> 
> Can you elaborate the use-after-free issue and why a new user count
> is required?

I was concerned that when iommufd uses iopf, page fault report/response
may occur simultaneously with PRI being enabled or disabled.

Currently, this is not an issue because enabling/disabling PRI happens in
its own path. In the future, we may discard this interface and instead
enable PRI when attaching the first PRI-capable domain and disable it when
detaching the last PRI-capable domain.

> 
> btw a Fix tag is required given this mislocking issue has been there for
> quite some time...

I don't see any real issue fixed by this change. It's only lock
refactoring that follows the code refactoring and prepares it for iommufd
use. Did I miss anything?

Best regards,
baolu
Tian, Kevin Sept. 13, 2023, 2:25 a.m. UTC | #24
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Monday, September 11, 2023 8:27 PM
> 
> >
> > Out of curiosity. Is it a valid configuration which has
> REQUEST_PASID_VALID
> > set but RESP_PASID_VALID cleared? I'm unclear why another response
> > flag is required beyond what the request flag has told...
> 
> This seems to have uncovered a bug in VT-d driver.
> 
> The PCIe spec (Section 10.4.2.2) states:
> 
> "
> If a Page Request has a PASID, the corresponding PRG Response Message
> may optionally contain one as well.
> 
> If the PRG Response PASID Required bit is Clear, PRG Response Messages
> do not have a PASID. If the PRG Response PASID Required bit is Set, PRG
> Response Messages have a PASID if the Page Request also had one. The
> Function is permitted to use the PASID value from the prefix in
> conjunction with the PRG Index to match requests and responses.
> "
> 
> The "PRG Response PASID Required bit" is a read-only field in the PCI
> page request status register. It is represented by
> "pdev->pasid_required".
> 
> So below code in VT-d driver is not correct:
> 
> 542 static int intel_svm_prq_report(struct intel_iommu *iommu, struct
> device *dev,
> 543                                 struct page_req_dsc *desc)
> 544 {
> 
> [...]
> 
> 556
> 557         if (desc->lpig)
> 558                 event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> 559         if (desc->pasid_present) {
> 560                 event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> 561                 event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
> 562         }
> [...]
> 
> The right logic should be
> 
> 	if (pdev->pasid_required)
> 		event.fault.prm.flags |=
> IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
> 
> Thoughts?
> 

Yes, it's the right fix. We haven't seen any bug reports, probably because
all SVM-capable devices have pasid_required set?
Tian, Kevin Sept. 13, 2023, 2:34 a.m. UTC | #25
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Monday, September 11, 2023 8:46 PM
> 
> On 2023/9/11 14:57, Tian, Kevin wrote:
> >> From: Baolu Lu <baolu.lu@linux.intel.com>
> >> Sent: Tuesday, September 5, 2023 1:24 PM
> >>
> >> Hi Kevin,
> >>
> >> I am trying to address this issue in below patch. Does it looks sane to
> >> you?
> >>
> >> iommu: Consolidate per-device fault data management
> >>
> >> The per-device fault data is a data structure that is used to store
> >> information about faults that occur on a device. This data is allocated
> >> when IOPF is enabled on the device and freed when IOPF is disabled. The
> >> data is used in the paths of iopf reporting, handling, responding, and
> >> draining.
> >>
> >> The fault data is protected by two locks:
> >>
> >> - dev->iommu->lock: This lock is used to protect the allocation and
> >>     freeing of the fault data.
> >> - dev->iommu->fault_parameter->lock: This lock is used to protect the
> >>     fault data itself.
> >>
> >> Improve the iopf code to enforce this lock mechanism and add a
> reference
> >> counter in the fault data to avoid use-after-free issue.
> >>
> >
> > Can you elaborate the use-after-free issue and why a new user count
> > is required?
> 
> I was concerned that when iommufd uses iopf, page fault report/response
> may occur simultaneously with enable/disable PRI.
> 
> Currently, this is not an issue as the enable/disable PRI is in its own
> path. In the future, we may discard this interface and enable PRI when
> attaching the first PRI-capable domain, and disable it when detaching
> the last PRI-capable domain.

Then let's not do it now until there is a real need after you have a
thorough design for iommufd.

> 
> >
> > btw a Fix tag is required given this mislocking issue has been there for
> > quite some time...
> 
> I don't see any real issue fixed by this change. It's only a lock
> refactoring after the code refactoring and preparing it for iommufd use.
> Perhaps I missed anything?
> 

mislocking already exists today for the partial list:

  - iommu_queue_iopf() uses dev->iommu->lock;
  - iopf_queue_discard_partial() uses queue->lock;
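
For reference, the two paths currently look roughly like this (excerpted
and simplified, not verbatim):

	/* iommu_queue_iopf(): partial faults are added under dev->iommu->lock */
	struct dev_iommu *param = dev->iommu;

	lockdep_assert_held(&param->lock);
	...
	list_add(&iopf->list, &iopf_param->partial);

	/* iopf_queue_discard_partial(): the same lists are freed under queue->lock */
	mutex_lock(&queue->lock);
	list_for_each_entry(iopf_param, &queue->devices, queue_list) {
		list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
			list_del(&iopf->list);
			kfree(iopf);
		}
	}
	mutex_unlock(&queue->lock);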
Baolu Lu Sept. 13, 2023, 2:44 a.m. UTC | #26
On 9/13/23 10:25 AM, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Monday, September 11, 2023 8:27 PM
>>
>>>
>>> Out of curiosity. Is it a valid configuration which has
>> REQUEST_PASID_VALID
>>> set but RESP_PASID_VALID cleared? I'm unclear why another response
>>> flag is required beyond what the request flag has told...
>>
>> This seems to have uncovered a bug in VT-d driver.
>>
>> The PCIe spec (Section 10.4.2.2) states:
>>
>> "
>> If a Page Request has a PASID, the corresponding PRG Response Message
>> may optionally contain one as well.
>>
>> If the PRG Response PASID Required bit is Clear, PRG Response Messages
>> do not have a PASID. If the PRG Response PASID Required bit is Set, PRG
>> Response Messages have a PASID if the Page Request also had one. The
>> Function is permitted to use the PASID value from the prefix in
>> conjunction with the PRG Index to match requests and responses.
>> "
>>
>> The "PRG Response PASID Required bit" is a read-only field in the PCI
>> page request status register. It is represented by
>> "pdev->pasid_required".
>>
>> So below code in VT-d driver is not correct:
>>
>> 542 static int intel_svm_prq_report(struct intel_iommu *iommu, struct
>> device *dev,
>> 543                                 struct page_req_dsc *desc)
>> 544 {
>>
>> [...]
>>
>> 556
>> 557         if (desc->lpig)
>> 558                 event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
>> 559         if (desc->pasid_present) {
>> 560                 event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
>> 561                 event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
>> 562         }
>> [...]
>>
>> The right logic should be
>>
>> 	if (pdev->pasid_required)
>> 		event.fault.prm.flags |=
>> IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
>>
>> Thoughts?
>>
> 
> yes, it's the right fix. We haven't seen any bug report probably because
> all SVM-capable devices have pasid_required set? 
Baolu Lu Sept. 13, 2023, 4:23 a.m. UTC | #27
On 9/13/23 10:34 AM, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Monday, September 11, 2023 8:46 PM
>>
>> On 2023/9/11 14:57, Tian, Kevin wrote:
>>>> From: Baolu Lu <baolu.lu@linux.intel.com>
>>>> Sent: Tuesday, September 5, 2023 1:24 PM
>>>>
>>>> Hi Kevin,
>>>>
>>>> I am trying to address this issue in below patch. Does it looks sane to
>>>> you?
>>>>
>>>> iommu: Consolidate per-device fault data management
>>>>
>>>> The per-device fault data is a data structure that is used to store
>>>> information about faults that occur on a device. This data is allocated
>>>> when IOPF is enabled on the device and freed when IOPF is disabled. The
>>>> data is used in the paths of iopf reporting, handling, responding, and
>>>> draining.
>>>>
>>>> The fault data is protected by two locks:
>>>>
>>>> - dev->iommu->lock: This lock is used to protect the allocation and
>>>>      freeing of the fault data.
>>>> - dev->iommu->fault_parameter->lock: This lock is used to protect the
>>>>      fault data itself.
>>>>
>>>> Improve the iopf code to enforce this lock mechanism and add a
>> reference
>>>> counter in the fault data to avoid use-after-free issue.
>>>>
>>>
>>> Can you elaborate the use-after-free issue and why a new user count
>>> is required?
>>
>> I was concerned that when iommufd uses iopf, page fault report/response
>> may occur simultaneously with enable/disable PRI.
>>
>> Currently, this is not an issue as the enable/disable PRI is in its own
>> path. In the future, we may discard this interface and enable PRI when
>> attaching the first PRI-capable domain, and disable it when detaching
>> the last PRI-capable domain.
> 
> Then let's not do it now until there is a real need after you have a
> thorough design for iommufd.

Okay, fair enough.

> 
>>
>>>
>>> btw a Fix tag is required given this mislocking issue has been there for
>>> quite some time...
>>
>> I don't see any real issue fixed by this change. It's only a lock
>> refactoring after the code refactoring and preparing it for iommufd use.
>> Perhaps I missed anything?
>>
> 
> mislocking already exists today for the partial list:
> 
>    - iommu_queue_iopf() uses dev->iommu->lock;
>    - iopf_queue_discard_partial() uses queue->lock;

So, if it's worth it, let me try splitting out a separate fix patch.

Best regards,
baolu
Baolu Lu Sept. 13, 2023, 6:18 a.m. UTC | #28
On 2023/9/13 10:34, Tian, Kevin wrote:
>> From: Baolu Lu<baolu.lu@linux.intel.com>
>> Sent: Monday, September 11, 2023 8:46 PM
>>
>> On 2023/9/11 14:57, Tian, Kevin wrote:
>>>> From: Baolu Lu<baolu.lu@linux.intel.com>
>>>> Sent: Tuesday, September 5, 2023 1:24 PM
>>>>
>>>> Hi Kevin,
>>>>
>>>> I am trying to address this issue in below patch. Does it looks sane to
>>>> you?
>>>>
>>>> iommu: Consolidate per-device fault data management
>>>>
>>>> The per-device fault data is a data structure that is used to store
>>>> information about faults that occur on a device. This data is allocated
>>>> when IOPF is enabled on the device and freed when IOPF is disabled. The
>>>> data is used in the paths of iopf reporting, handling, responding, and
>>>> draining.
>>>>
>>>> The fault data is protected by two locks:
>>>>
>>>> - dev->iommu->lock: This lock is used to protect the allocation and
>>>>      freeing of the fault data.
>>>> - dev->iommu->fault_parameter->lock: This lock is used to protect the
>>>>      fault data itself.
>>>>
>>>> Improve the iopf code to enforce this lock mechanism and add a
>> reference
>>>> counter in the fault data to avoid use-after-free issue.
>>>>
>>> Can you elaborate the use-after-free issue and why a new user count
>>> is required?
>> I was concerned that when iommufd uses iopf, page fault report/response
>> may occur simultaneously with enable/disable PRI.
>>
>> Currently, this is not an issue as the enable/disable PRI is in its own
>> path. In the future, we may discard this interface and enable PRI when
>> attaching the first PRI-capable domain, and disable it when detaching
>> the last PRI-capable domain.
> Then let's not do it now until there is a real need after you have a
> thorough design for iommufd.

I revisited this part of the code and found that it's still valuable to
make the code clean and simple. The fault parameter is accessed in various
paths, such as reporting an iopf, responding to an iopf, draining iopf's,
and adding or removing a queue. In each path, we need to repeat the
locking code below:

	mutex_lock(&dev->iommu->lock);
	fault_param = dev->iommu->fault_param;
	if (!fault_param) {
		mutex_unlock(&dev->iommu->lock);
		return -ENODEV;
	}

	/* use the fault parameter */
	... ...

	mutex_unlock(&dev->iommu->lock);

The order of the locks is also important. Otherwise, a possible deadlock
issue will be reported by lockdep.

By consolidating the above code into iopf_get/put_dev_fault_param()
helpers, it can be simplified to:

	fault_param = iopf_get_dev_fault_param(dev);
	if (!fault_param)
		return -ENODEV;

	/* use the fault parameter */
	... ...

	iopf_put_dev_fault_param(fault_param);

The lock ordering issue goes away, and the code becomes simpler and
easier to maintain.
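
For illustration, the helpers could look roughly like the below (a minimal
sketch assuming struct iommu_fault_param gains a 'users' refcount; not
necessarily the exact implementation):

	static struct iommu_fault_param *iopf_get_dev_fault_param(struct device *dev)
	{
		struct iommu_fault_param *fault_param;

		mutex_lock(&dev->iommu->lock);
		fault_param = dev->iommu->fault_param;
		/* Fail if IOPF was never enabled or is being torn down. */
		if (fault_param && !refcount_inc_not_zero(&fault_param->users))
			fault_param = NULL;
		mutex_unlock(&dev->iommu->lock);

		return fault_param;
	}

	static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param)
	{
		/* The disable path drops the initial reference; free on last put. */
		if (refcount_dec_and_test(&fault_param->users))
			kfree(fault_param);
	}

Callers then pair a successful get with a put, as in the simplified
snippet above, without worrying about the dev->iommu->lock ordering.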

Best regards,
baolu

Patch

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 19420ccd43a7..687dfde2c874 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -128,6 +128,7 @@  struct iopf_group {
 	struct list_head faults;
 	struct work_struct work;
 	struct device *dev;
+	struct iommu_domain *domain;
 };
 
 /**
@@ -197,8 +198,7 @@  struct iommu_domain {
 	unsigned long pgsize_bitmap;	/* Bitmap of page sizes in use */
 	struct iommu_domain_geometry geometry;
 	struct iommu_dma_cookie *iova_cookie;
-	enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault,
-						      void *data);
+	int (*iopf_handler)(struct iopf_group *group);
 	void *fault_data;
 	union {
 		struct {
diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h
index de7819c796ce..27c8da115b41 100644
--- a/drivers/iommu/iommu-sva.h
+++ b/drivers/iommu/iommu-sva.h
@@ -22,8 +22,7 @@  int iopf_queue_flush_dev(struct device *dev);
 struct iopf_queue *iopf_queue_alloc(const char *name);
 void iopf_queue_free(struct iopf_queue *queue);
 int iopf_queue_discard_partial(struct iopf_queue *queue);
-enum iommu_page_response_code
-iommu_sva_handle_iopf(struct iommu_fault *fault, void *data);
+int iommu_sva_handle_iopf(struct iopf_group *group);
 
 #else /* CONFIG_IOMMU_SVA */
 static inline int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev)
@@ -62,8 +61,7 @@  static inline int iopf_queue_discard_partial(struct iopf_queue *queue)
 	return -ENODEV;
 }
 
-static inline enum iommu_page_response_code
-iommu_sva_handle_iopf(struct iommu_fault *fault, void *data)
+static inline int iommu_sva_handle_iopf(struct iopf_group *group)
 {
 	return IOMMU_PAGE_RESP_INVALID;
 }
diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c
index 09e05f483b4f..06330d922925 100644
--- a/drivers/iommu/io-pgfault.c
+++ b/drivers/iommu/io-pgfault.c
@@ -13,6 +13,9 @@ 
 
 #include "iommu-sva.h"
 
+enum iommu_page_response_code
+iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm);
+
 static void iopf_free_group(struct iopf_group *group)
 {
 	struct iopf_fault *iopf, *next;
@@ -45,23 +48,18 @@  static void iopf_handler(struct work_struct *work)
 {
 	struct iopf_fault *iopf;
 	struct iopf_group *group;
-	struct iommu_domain *domain;
 	enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS;
 
 	group = container_of(work, struct iopf_group, work);
-	domain = iommu_get_domain_for_dev_pasid(group->dev,
-				group->last_fault.fault.prm.pasid, 0);
-	if (!domain || !domain->iopf_handler)
-		status = IOMMU_PAGE_RESP_INVALID;
-
 	list_for_each_entry(iopf, &group->faults, list) {
 		/*
 		 * For the moment, errors are sticky: don't handle subsequent
 		 * faults in the group if there is an error.
 		 */
-		if (status == IOMMU_PAGE_RESP_SUCCESS)
-			status = domain->iopf_handler(&iopf->fault,
-						      domain->fault_data);
+		if (status != IOMMU_PAGE_RESP_SUCCESS)
+			break;
+
+		status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm);
 	}
 
 	iopf_complete_group(group->dev, &group->last_fault, status);
@@ -112,6 +110,7 @@  int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev)
 {
 	int ret;
 	struct iopf_group *group;
+	struct iommu_domain *domain;
 	struct iopf_fault *iopf, *next;
 	struct iommu_fault_param *iopf_param;
 	struct dev_iommu *param = dev->iommu;
@@ -143,6 +142,19 @@  int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev)
 		return 0;
 	}
 
+	if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)
+		domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0);
+	else
+		domain = iommu_get_domain_for_dev(dev);
+
+	if (!domain || !domain->iopf_handler) {
+		dev_warn_ratelimited(dev,
+			"iopf from pasid %d received without handler installed\n",
+			 fault->prm.pasid);
+		ret = -ENODEV;
+		goto cleanup_partial;
+	}
+
 	group = kzalloc(sizeof(*group), GFP_KERNEL);
 	if (!group) {
 		/*
@@ -157,8 +169,8 @@  int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev)
 	group->dev = dev;
 	group->last_fault.fault = *fault;
 	INIT_LIST_HEAD(&group->faults);
+	group->domain = domain;
 	list_add(&group->last_fault.list, &group->faults);
-	INIT_WORK(&group->work, iopf_handler);
 
 	/* See if we have partial faults for this group */
 	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
@@ -167,9 +179,11 @@  int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev)
 			list_move(&iopf->list, &group->faults);
 	}
 
-	queue_work(iopf_param->queue->wq, &group->work);
-	return 0;
+	ret = domain->iopf_handler(group);
+	if (ret)
+		iopf_free_group(group);
 
+	return ret;
 cleanup_partial:
 	list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) {
 		if (iopf->fault.prm.grpid == fault->prm.grpid) {
@@ -181,6 +195,17 @@  int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_queue_iopf);
 
+int iommu_sva_handle_iopf(struct iopf_group *group)
+{
+	struct iommu_fault_param *fault_param = group->dev->iommu->fault_param;
+
+	INIT_WORK(&group->work, iopf_handler);
+	if (!queue_work(fault_param->queue->wq, &group->work))
+		return -EBUSY;
+
+	return 0;
+}
+
 /**
  * iopf_queue_flush_dev - Ensure that all queued faults have been processed
  * @dev: the endpoint whose faults need to be flushed.
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index b78671a8a914..ba0d5b7e106a 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -149,11 +149,10 @@  EXPORT_SYMBOL_GPL(iommu_sva_get_pasid);
  * I/O page fault handler for SVA
  */
 enum iommu_page_response_code
-iommu_sva_handle_iopf(struct iommu_fault *fault, void *data)
+iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm)
 {
 	vm_fault_t ret;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm = data;
 	unsigned int access_flags = 0;
 	unsigned int fault_flags = FAULT_FLAG_REMOTE;
 	struct iommu_fault_page_request *prm = &fault->prm;
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index a7f6d0ec0400..0704a0f36349 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1951,6 +1951,27 @@  int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
 	return 0;
 }
 
+static void assert_no_pending_iopf(struct device *dev, ioasid_t pasid)
+{
+	struct iommu_fault_param *iopf_param = dev->iommu->fault_param;
+	struct iopf_fault *iopf;
+
+	if (!iopf_param)
+		return;
+
+	mutex_lock(&iopf_param->lock);
+	list_for_each_entry(iopf, &iopf_param->partial, list) {
+		if (WARN_ON(iopf->fault.prm.pasid == pasid))
+			break;
+	}
+
+	list_for_each_entry(iopf, &iopf_param->faults, list) {
+		if (WARN_ON(iopf->fault.prm.pasid == pasid))
+			break;
+	}
+	mutex_unlock(&iopf_param->lock);
+}
+
 void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 {
 	struct iommu_group *group;
@@ -1959,6 +1980,7 @@  void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 	if (!group)
 		return;
 
+	assert_no_pending_iopf(dev, IOMMU_NO_PASID);
 	mutex_lock(&group->mutex);
 	if (WARN_ON(domain != group->domain) ||
 	    WARN_ON(list_count_nodes(&group->devices) != 1))
@@ -3269,6 +3291,7 @@  void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
 {
 	struct iommu_group *group = iommu_group_get(dev);
 
+	assert_no_pending_iopf(dev, pasid);
 	mutex_lock(&group->mutex);
 	__iommu_remove_group_pasid(group, pasid);
 	WARN_ON(xa_erase(&group->pasid_array, pasid) != domain);