diff mbox series

[1/2] iommu: Prevent RESV_DIRECT devices from blocking domains

Message ID 20230607035145.343698-2-baolu.lu@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series Prevent RESV_DIRECT devices from user assignment | expand

Commit Message

Baolu Lu June 7, 2023, 3:51 a.m. UTC
The IOMMU_RESV_DIRECT flag indicates that a memory region must be mapped
1:1 at all times. This means that the region must always be accessible to
the device, even if the device is attached to a blocking domain. This is
equal to saying that IOMMU_RESV_DIRECT flag prevents devices from being
attached to blocking domains.

This also implies that devices that implement RESV_DIRECT regions will be
prevented from being assigned to user space since taking the DMA ownership
immediately switches to a blocking domain.

The rule of preventing devices with the IOMMU_RESV_DIRECT regions from
being assigned to user space has existed in the Intel IOMMU driver for
a long time. Now, this rule is being lifted up to a general core rule,
as other architectures like AMD and ARM also have RMRR-like reserved
regions. This has been discussed on the community mailing list; refer
to the link below for more details.

Other places using unmanaged domains for kernel DMA must follow the
iommu_get_resv_regions() and setup IOMMU_RESV_DIRECT - we do not restrict
them in the core code.

Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/linux-iommu/BN9PR11MB5276E84229B5BD952D78E9598C639@BN9PR11MB5276.namprd11.prod.outlook.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 include/linux/iommu.h |  2 ++
 drivers/iommu/iommu.c | 39 +++++++++++++++++++++++++++++----------
 2 files changed, 31 insertions(+), 10 deletions(-)

Comments

Liu, Jingqi June 12, 2023, 8:28 a.m. UTC | #1
On 6/7/2023 11:51 AM, Lu Baolu wrote:
> The IOMMU_RESV_DIRECT flag indicates that a memory region must be mapped
> 1:1 at all times. This means that the region must always be accessible to
> the device, even if the device is attached to a blocking domain. This is
> equal to saying that IOMMU_RESV_DIRECT flag prevents devices from being
> attached to blocking domains.
>
> This also implies that devices that implement RESV_DIRECT regions will be
> prevented from being assigned to user space since taking the DMA ownership
> immediately switches to a blocking domain.
>
> The rule of preventing devices with the IOMMU_RESV_DIRECT regions from
> being assigned to user space has existed in the Intel IOMMU driver for
> a long time. Now, this rule is being lifted up to a general core rule,
> as other architectures like AMD and ARM also have RMRR-like reserved
> regions. This has been discussed in the community mailing list and refer
> to below link for more details.
>
> Other places using unmanaged domains for kernel DMA must follow the
> iommu_get_resv_regions() and setup IOMMU_RESV_DIRECT - we do not restrict
> them in the core code.
>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Link: https://lore.kernel.org/linux-iommu/BN9PR11MB5276E84229B5BD952D78E9598C639@BN9PR11MB5276.namprd11.prod.outlook.com
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>   include/linux/iommu.h |  2 ++
>   drivers/iommu/iommu.c | 39 +++++++++++++++++++++++++++++----------
>   2 files changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index d31642596675..fd18019ac951 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -409,6 +409,7 @@ struct iommu_fault_param {
>    * @priv:	 IOMMU Driver private data
>    * @max_pasids:  number of PASIDs this device can consume
>    * @attach_deferred: the dma domain attachment is deferred
> + * @requires_direct: The driver requested IOMMU_RESV_DIRECT
>    *
>    * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
>    *	struct iommu_group	*iommu_group;
> @@ -422,6 +423,7 @@ struct dev_iommu {
>   	void				*priv;
>   	u32				max_pasids;
>   	u32				attach_deferred:1;
> +	u32				requires_direct:1;
>   };
>   
>   int iommu_device_register(struct iommu_device *iommu,
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 9e0228ef612b..e59de7852067 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -959,12 +959,7 @@ static int iommu_create_device_direct_mappings(struct iommu_domain *domain,
>   	unsigned long pg_size;
>   	int ret = 0;
>   
> -	if (!iommu_is_dma_domain(domain))
> -		return 0;
> -
> -	BUG_ON(!domain->pgsize_bitmap);
> -
> -	pg_size = 1UL << __ffs(domain->pgsize_bitmap);
> +	pg_size = domain->pgsize_bitmap ? 1UL << __ffs(domain->pgsize_bitmap) : 0;
Would it be better to add the following check here?
     if (WARN_ON(!pg_size))
             return -EINVAL;

Instead of checking it later in the loop as follows.
     if (WARN_ON_ONCE(!pg_size)) {
             ret = -EINVAL;
             goto out;
     }

Thanks,
Jingqi
>   	INIT_LIST_HEAD(&mappings);
>   
>   	iommu_get_resv_regions(dev, &mappings);
> @@ -974,13 +969,22 @@ static int iommu_create_device_direct_mappings(struct iommu_domain *domain,
>   		dma_addr_t start, end, addr;
>   		size_t map_size = 0;
>   
> +		if (entry->type == IOMMU_RESV_DIRECT)
> +			dev->iommu->requires_direct = 1;
> +
> +		if ((entry->type != IOMMU_RESV_DIRECT &&
> +		     entry->type != IOMMU_RESV_DIRECT_RELAXABLE) ||
> +		    !iommu_is_dma_domain(domain))
> +			continue;
> +
> +		if (WARN_ON_ONCE(!pg_size)) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
>   		start = ALIGN(entry->start, pg_size);
>   		end   = ALIGN(entry->start + entry->length, pg_size);
>   
> -		if (entry->type != IOMMU_RESV_DIRECT &&
> -		    entry->type != IOMMU_RESV_DIRECT_RELAXABLE)
> -			continue;
> -
>   		for (addr = start; addr <= end; addr += pg_size) {
>   			phys_addr_t phys_addr;
>   
> @@ -2121,6 +2125,21 @@ static int __iommu_device_set_domain(struct iommu_group *group,
>   {
>   	int ret;
>   
> +	/*
> +	 * If the driver has requested IOMMU_RESV_DIRECT then we cannot allow
> +	 * the blocking domain to be attached as it does not contain the
> +	 * required 1:1 mapping. This test effectively excludes the device from
> +	 * being used with iommu_group_claim_dma_owner() which will block vfio
> +	 * and iommufd as well.
> +	 */
> +	if (dev->iommu->requires_direct &&
> +	    (new_domain->type == IOMMU_DOMAIN_BLOCKED ||
> +	     new_domain == group->blocking_domain)) {
> +		dev_warn(dev,
> +			 "Firmware has requested this device have a 1:1 IOMMU mapping, rejecting configuring the device without a 1:1 mapping. Contact your platform vendor.\n");
> +		return -EINVAL;
> +	}
> +
>   	if (dev->iommu->attach_deferred) {
>   		if (new_domain == group->default_domain)
>   			return 0;
Baolu Lu June 13, 2023, 3:14 a.m. UTC | #2
On 6/12/23 4:28 PM, Liu, Jingqi wrote:
> On 6/7/2023 11:51 AM, Lu Baolu wrote:
>> The IOMMU_RESV_DIRECT flag indicates that a memory region must be mapped
>> 1:1 at all times. This means that the region must always be accessible to
>> the device, even if the device is attached to a blocking domain. This is
>> equal to saying that IOMMU_RESV_DIRECT flag prevents devices from being
>> attached to blocking domains.
>>
>> This also implies that devices that implement RESV_DIRECT regions will be
>> prevented from being assigned to user space since taking the DMA 
>> ownership
>> immediately switches to a blocking domain.
>>
>> The rule of preventing devices with the IOMMU_RESV_DIRECT regions from
>> being assigned to user space has existed in the Intel IOMMU driver for
>> a long time. Now, this rule is being lifted up to a general core rule,
>> as other architectures like AMD and ARM also have RMRR-like reserved
>> regions. This has been discussed in the community mailing list and refer
>> to below link for more details.
>>
>> Other places using unmanaged domains for kernel DMA must follow the
>> iommu_get_resv_regions() and setup IOMMU_RESV_DIRECT - we do not restrict
>> them in the core code.
>>
>> Cc: Robin Murphy <robin.murphy@arm.com>
>> Cc: Alex Williamson <alex.williamson@redhat.com>
>> Cc: Kevin Tian <kevin.tian@intel.com>
>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>> Link: 
>> https://lore.kernel.org/linux-iommu/BN9PR11MB5276E84229B5BD952D78E9598C639@BN9PR11MB5276.namprd11.prod.outlook.com
>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>> ---
>>   include/linux/iommu.h |  2 ++
>>   drivers/iommu/iommu.c | 39 +++++++++++++++++++++++++++++----------
>>   2 files changed, 31 insertions(+), 10 deletions(-)
>>
>> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
>> index d31642596675..fd18019ac951 100644
>> --- a/include/linux/iommu.h
>> +++ b/include/linux/iommu.h
>> @@ -409,6 +409,7 @@ struct iommu_fault_param {
>>    * @priv:     IOMMU Driver private data
>>    * @max_pasids:  number of PASIDs this device can consume
>>    * @attach_deferred: the dma domain attachment is deferred
>> + * @requires_direct: The driver requested IOMMU_RESV_DIRECT
>>    *
>>    * TODO: migrate other per device data pointers under 
>> iommu_dev_data, e.g.
>>    *    struct iommu_group    *iommu_group;
>> @@ -422,6 +423,7 @@ struct dev_iommu {
>>       void                *priv;
>>       u32                max_pasids;
>>       u32                attach_deferred:1;
>> +    u32                requires_direct:1;
>>   };
>>   int iommu_device_register(struct iommu_device *iommu,
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 9e0228ef612b..e59de7852067 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -959,12 +959,7 @@ static int 
>> iommu_create_device_direct_mappings(struct iommu_domain *domain,
>>       unsigned long pg_size;
>>       int ret = 0;
>> -    if (!iommu_is_dma_domain(domain))
>> -        return 0;
>> -
>> -    BUG_ON(!domain->pgsize_bitmap);
>> -
>> -    pg_size = 1UL << __ffs(domain->pgsize_bitmap);
>> +    pg_size = domain->pgsize_bitmap ? 1UL << 
>> __ffs(domain->pgsize_bitmap) : 0;
> Would it be better to add the following check here?
>      if (WARN_ON(!pg_size))
>              return -EINVAL;
> 
> Instead of checking latter in the loop as follows.
>      if (WARN_ON_ONCE(!pg_size)) {
>              ret = -EINVAL;
>              goto out;
>      }

I am afraid no. Only the paging domains need a valid pg_size. That's the
reason why I put it after the iommu_is_dma_domain() check. The previous
code has the same behavior too.

Best regards,
baolu
Robin Murphy June 19, 2023, 1:33 p.m. UTC | #3
On 07/06/2023 4:51 am, Lu Baolu wrote:
> The IOMMU_RESV_DIRECT flag indicates that a memory region must be mapped
> 1:1 at all times. This means that the region must always be accessible to
> the device, even if the device is attached to a blocking domain. This is
> equal to saying that IOMMU_RESV_DIRECT flag prevents devices from being
> attached to blocking domains.
> 
> This also implies that devices that implement RESV_DIRECT regions will be
> prevented from being assigned to user space since taking the DMA ownership
> immediately switches to a blocking domain.
> 
> The rule of preventing devices with the IOMMU_RESV_DIRECT regions from
> being assigned to user space has existed in the Intel IOMMU driver for
> a long time. Now, this rule is being lifted up to a general core rule,
> as other architectures like AMD and ARM also have RMRR-like reserved
> regions. This has been discussed in the community mailing list and refer
> to below link for more details.
> 
> Other places using unmanaged domains for kernel DMA must follow the
> iommu_get_resv_regions() and setup IOMMU_RESV_DIRECT - we do not restrict
> them in the core code.
> 
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Kevin Tian <kevin.tian@intel.com>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Link: https://lore.kernel.org/linux-iommu/BN9PR11MB5276E84229B5BD952D78E9598C639@BN9PR11MB5276.namprd11.prod.outlook.com
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>   include/linux/iommu.h |  2 ++
>   drivers/iommu/iommu.c | 39 +++++++++++++++++++++++++++++----------
>   2 files changed, 31 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index d31642596675..fd18019ac951 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -409,6 +409,7 @@ struct iommu_fault_param {
>    * @priv:	 IOMMU Driver private data
>    * @max_pasids:  number of PASIDs this device can consume
>    * @attach_deferred: the dma domain attachment is deferred
> + * @requires_direct: The driver requested IOMMU_RESV_DIRECT
>    *
>    * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
>    *	struct iommu_group	*iommu_group;
> @@ -422,6 +423,7 @@ struct dev_iommu {
>   	void				*priv;
>   	u32				max_pasids;
>   	u32				attach_deferred:1;
> +	u32				requires_direct:1;
>   };
>   
>   int iommu_device_register(struct iommu_device *iommu,
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 9e0228ef612b..e59de7852067 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -959,12 +959,7 @@ static int iommu_create_device_direct_mappings(struct iommu_domain *domain,
>   	unsigned long pg_size;
>   	int ret = 0;
>   
> -	if (!iommu_is_dma_domain(domain))
> -		return 0;
> -
> -	BUG_ON(!domain->pgsize_bitmap);
> -
> -	pg_size = 1UL << __ffs(domain->pgsize_bitmap);
> +	pg_size = domain->pgsize_bitmap ? 1UL << __ffs(domain->pgsize_bitmap) : 0;
>   	INIT_LIST_HEAD(&mappings);
>   
>   	iommu_get_resv_regions(dev, &mappings);
> @@ -974,13 +969,22 @@ static int iommu_create_device_direct_mappings(struct iommu_domain *domain,
>   		dma_addr_t start, end, addr;
>   		size_t map_size = 0;
>   
> +		if (entry->type == IOMMU_RESV_DIRECT)
> +			dev->iommu->requires_direct = 1;
> +
> +		if ((entry->type != IOMMU_RESV_DIRECT &&
> +		     entry->type != IOMMU_RESV_DIRECT_RELAXABLE) ||
> +		    !iommu_is_dma_domain(domain))
> +			continue;
> +
> +		if (WARN_ON_ONCE(!pg_size)) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
>   		start = ALIGN(entry->start, pg_size);
>   		end   = ALIGN(entry->start + entry->length, pg_size);
>   
> -		if (entry->type != IOMMU_RESV_DIRECT &&
> -		    entry->type != IOMMU_RESV_DIRECT_RELAXABLE)
> -			continue;
> -
>   		for (addr = start; addr <= end; addr += pg_size) {
>   			phys_addr_t phys_addr;
>   
> @@ -2121,6 +2125,21 @@ static int __iommu_device_set_domain(struct iommu_group *group,
>   {
>   	int ret;
>   
> +	/*
> +	 * If the driver has requested IOMMU_RESV_DIRECT then we cannot allow
> +	 * the blocking domain to be attached as it does not contain the
> +	 * required 1:1 mapping. This test effectively exclusive the device from
> +	 * being used with iommu_group_claim_dma_owner() which will block vfio
> +	 * and iommufd as well.
> +	 */
> +	if (dev->iommu->requires_direct &&
> +	    (new_domain->type == IOMMU_DOMAIN_BLOCKED ||

Given the notion elsewhere that we want to use the blocking domain as a 
last resort to handle an attach failure, at face value it looks suspect 
that failing to attach to a blocking domain could also be a thing. I 
guess technically this is failing at a slightly different level so maybe 
it does work out OK, but it's still smelly.

The main thing, though, is that not everything implements the 
IOMMU_DOMAIN_BLOCKED optimisation, so a nominal blocking domain could be 
IOMMU_DOMAIN_UNMANAGED as well. FWIW I'd prefer to make the RESV_DIRECT 
check explicit in __iommu_take_dma_ownership() rather than hide it in an 
implementation detail; that's going to be a lot clearer to reason about 
as time goes on.

Thanks,
Robin.

> +	     new_domain == group->blocking_domain)) {
> +		dev_warn(dev,
> +			 "Firmware has requested this device have a 1:1 IOMMU mapping, rejecting configuring the device without a 1:1 mapping. Contact your platform vendor.\n");
> +		return -EINVAL;
> +	}
> +
>   	if (dev->iommu->attach_deferred) {
>   		if (new_domain == group->default_domain)
>   			return 0;
Jason Gunthorpe June 19, 2023, 1:41 p.m. UTC | #4
On Mon, Jun 19, 2023 at 02:33:18PM +0100, Robin Murphy wrote:
> > @@ -2121,6 +2125,21 @@ static int __iommu_device_set_domain(struct iommu_group *group,
> >   {
> >   	int ret;
> > +	/*
> > +	 * If the driver has requested IOMMU_RESV_DIRECT then we cannot allow
> > +	 * the blocking domain to be attached as it does not contain the
> > +	 * required 1:1 mapping. This test effectively exclusive the device from
> > +	 * being used with iommu_group_claim_dma_owner() which will block vfio
> > +	 * and iommufd as well.
> > +	 */
> > +	if (dev->iommu->requires_direct &&
> > +	    (new_domain->type == IOMMU_DOMAIN_BLOCKED ||
> 
> Given the notion elsewhere that we want to use the blocking domain as a last
> resort to handle an attach failure,

We shouldn't do that for cases where requires_direct is true, the last
resort will have to be the static identity domain.

> at face value it looks suspect that failing to attach to a blocking
> domain could also be a thing. I guess technically this is failing at
> a slightly different level so maybe it does work out OK, but it's
> still smelly.

It basically says that this driver doesn't support blocking domains on
this device. What we don't want is for the driver to fail blocking or
identity attaches.
 
> The main thing, though, is that not everything implements the
> IOMMU_DOMAIN_BLOCKED optimisation, so a nominal blocking domain could be
> IOMMU_DOMAIN_UNMANAGED as well. 

Yes, it should check new_domain == group->blocking_domain as well.

> FWIW I'd prefer to make the RESV_DIRECT check explicit in
> __iommu_take_dma_ownership() rather than hide it in an
> implementation detail; that's going to be a lot clearer to reason
> about as time goes on.

We want to completely forbid blocking domains at all on these devices
because they are not supported (by FW request). I don't really like
the idea that we go and assume the only users of blocking domains are
also using take_dma_ownership() - that feels like a future bug waiting
to happen.

Jason
Robin Murphy June 19, 2023, 2:20 p.m. UTC | #5
On 19/06/2023 2:41 pm, Jason Gunthorpe wrote:
> On Mon, Jun 19, 2023 at 02:33:18PM +0100, Robin Murphy wrote:
>>> @@ -2121,6 +2125,21 @@ static int __iommu_device_set_domain(struct iommu_group *group,
>>>    {
>>>    	int ret;
>>> +	/*
>>> +	 * If the driver has requested IOMMU_RESV_DIRECT then we cannot allow
>>> +	 * the blocking domain to be attached as it does not contain the
>>> +	 * required 1:1 mapping. This test effectively exclusive the device from
>>> +	 * being used with iommu_group_claim_dma_owner() which will block vfio
>>> +	 * and iommufd as well.
>>> +	 */
>>> +	if (dev->iommu->requires_direct &&
>>> +	    (new_domain->type == IOMMU_DOMAIN_BLOCKED ||
>>
>> Given the notion elsewhere that we want to use the blocking domain as a last
>> resort to handle an attach failure,
> 
> We shouldn't do that for cases where requires_direct is true, the last
> resort will have to be the static identity domain.
> 
>> at face value it looks suspect that failing to attach to a blocking
>> domain could also be a thing. I guess technically this is failing at
>> a slightly different level so maybe it does work out OK, but it's
>> still smelly.
> 
> It basically says that this driver doesn't support blocking domains on
> this device. What we don't want is for the driver to fail blocking or
> identity attaches.

Is that really the relevant semantic though? I thought the point was to 
prevent userspace (or anyone else for that matter) taking ownership of a 
device with reserved regions which we can't trust them to honour. Not 
least because the series is entitled "Prevent RESV_DIRECT devices from 
user assignment", not anything about attaching to blocking domains. Plus 
the existing intel-iommu behaviour being generalised is specific to 
IOMMU_DOMAIN_UNMANAGED.

>> The main thing, though, is that not everything implements the
>> IOMMU_DOMAIN_BLOCKED optimisation, so a nominal blocking domain could be
>> IOMMU_DOMAIN_UNMANAGED as well.
> 
> Yes, it should check new_domain == group->blocking_domain as well.
> 
>> FWIW I'd prefer to make the RESV_DIRECT check explicit in
>> __iommu_take_dma_ownership() rather than hide it in an
>> implementation detail; that's going to be a lot clearer to reason
>> about as time goes on.
> 
> We want to completely forbid blocking domains at all on these devices
> because they are not supported (by FW request). I don't really like
> the idea that we go and assume the only users of blocking domains are
> also using take_dma_ownership() - that feels like a future bug waiting
> to happen.

On reflection, I don't think that aspect actually matters anyway - 
nobody can explicitly request attachment to a blocking domain, so if the 
only time they're used is when the IOMMU driver has already had a 
catastrophic internal failure such that we decide to declare the device 
toasted and deliberately put it into an unusable state, blocking its 
reserved regions doesn't seem like a big deal. In fact if anything it 
kind of feels like the right thing to do for that situation. We're 
saying that we want the device to stop accessing memory because things 
might be in an inconsistent state which we can't trust; who says that 
mappings of RESV_DIRECT regions haven't also gone wonky? Having BLOCKED 
mean that the device truly cannot access - and thus potentially corrupt 
- *any* memory anywhere seems like the most robust and useful behaviour.

Thanks,
Robin.
Jason Gunthorpe June 19, 2023, 3:30 p.m. UTC | #6
On Mon, Jun 19, 2023 at 03:20:30PM +0100, Robin Murphy wrote:
> On 19/06/2023 2:41 pm, Jason Gunthorpe wrote:
> > On Mon, Jun 19, 2023 at 02:33:18PM +0100, Robin Murphy wrote:
> > > > @@ -2121,6 +2125,21 @@ static int __iommu_device_set_domain(struct iommu_group *group,
> > > >    {
> > > >    	int ret;
> > > > +	/*
> > > > +	 * If the driver has requested IOMMU_RESV_DIRECT then we cannot allow
> > > > +	 * the blocking domain to be attached as it does not contain the
> > > > +	 * required 1:1 mapping. This test effectively exclusive the device from
> > > > +	 * being used with iommu_group_claim_dma_owner() which will block vfio
> > > > +	 * and iommufd as well.
> > > > +	 */
> > > > +	if (dev->iommu->requires_direct &&
> > > > +	    (new_domain->type == IOMMU_DOMAIN_BLOCKED ||
> > > 
> > > Given the notion elsewhere that we want to use the blocking domain as a last
> > > resort to handle an attach failure,
> > 
> > We shouldn't do that for cases where requires_direct is true, the last
> > resort will have to be the static identity domain.
> > 
> > > at face value it looks suspect that failing to attach to a blocking
> > > domain could also be a thing. I guess technically this is failing at
> > > a slightly different level so maybe it does work out OK, but it's
> > > still smelly.
> > 
> > It basically says that this driver doesn't support blocking domains on
> > this device. What we don't want is for the driver to fail blocking or
> > identity attaches.
> 
> Is that really the relevant semantic though?

It is the semantic I have had in mind. The end goal:

1) Driver never fails identity or blocking attachments
2) Identity and blocking domains are global static so always available
3) Core code puts restrictions on when identity and blocking domains
   can be used (eg trusted, reserved regions)
4) Catastrophic error recovery ends up moving to either blocking
   (preferred) or identity.

> I thought the point was to prevent userspace (or anyone else for
> that matter) taking ownership of a device with reserved regions
> which we can't trust them to honour.

Yes, this is the practical side effect of enforcing the rules.

The other side effect is that it removes another special case where a
driver has special behavior tied to IOMMU_DOMAIN_DMA.

> assignment", not anything about attaching to blocking domains. Plus the
> existing intel-iommu behaviour being generalised is specific to
> IOMMU_DOMAIN_UNMANAGED.

Being specific to UNMANAGED is a driver mistake, IMHO. It is a hack to
make it only work with VFIO and avoid dma-iommu.

> On reflection, I don't think that aspect actually matters anyway - nobody
> can explicitly request attachment to a blocking domain

They are used as part of the ownership mechanism, blocking domains are
effectively explicitly requested by detaching domains from owned
groups.

Yes, take_ownership is very carefully a sufficient place to put the
check with today's design, but it feels wrong because the blocking
domain compatibility has conceptually nothing to do with ownership. If
we use blocking domains in the future then the check will be in the
wrong place.

> so if the only time they're used is when the IOMMU driver has
> already had a catastrophic internal failure such that we decide to
> declare the device toasted and deliberately put it into an unusable
> state, blocking its reserved regions doesn't seem like a big deal.

I think we should discuss then when we get to actually implementing
the error recovery flow we want. I do like blocking in general for the
reasons you give, and that was my first plan.. But if the BIOS will
crash or something if we don't do the reserved region maps that isn't
so good either. IDK would like to hear from the people using this BIOS
feature.

Thanks,
Jason
Tian, Kevin June 27, 2023, 7:54 a.m. UTC | #7
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Tuesday, June 13, 2023 11:15 AM
> 
> On 6/12/23 4:28 PM, Liu, Jingqi wrote:
> > On 6/7/2023 11:51 AM, Lu Baolu wrote:
> >> -
> >> -    BUG_ON(!domain->pgsize_bitmap);
> >> -
> >> -    pg_size = 1UL << __ffs(domain->pgsize_bitmap);
> >> +    pg_size = domain->pgsize_bitmap ? 1UL <<
> >> __ffs(domain->pgsize_bitmap) : 0;
> > Would it be better to add the following check here?
> >      if (WARN_ON(!pg_size))
> >              return -EINVAL;
> >
> > Instead of checking latter in the loop as follows.
> >      if (WARN_ON_ONCE(!pg_size)) {
> >              ret = -EINVAL;
> >              goto out;
> >      }
> 
> I am afraid no. Only the paging domains need a valid pg_size. That's the
> reason why I put it after the iommu_is_dma_domain() check. The previous
> code has the same behavior too.
> 

You could also add the dma domain check here. pg_size is static,
so it makes more sense to verify it once instead of in a loop.
Baolu Lu June 27, 2023, 8:01 a.m. UTC | #8
On 2023/6/27 15:54, Tian, Kevin wrote:
>> From: Baolu Lu <baolu.lu@linux.intel.com>
>> Sent: Tuesday, June 13, 2023 11:15 AM
>>
>> On 6/12/23 4:28 PM, Liu, Jingqi wrote:
>>> On 6/7/2023 11:51 AM, Lu Baolu wrote:
>>>> -
>>>> -    BUG_ON(!domain->pgsize_bitmap);
>>>> -
>>>> -    pg_size = 1UL << __ffs(domain->pgsize_bitmap);
>>>> +    pg_size = domain->pgsize_bitmap ? 1UL <<
>>>> __ffs(domain->pgsize_bitmap) : 0;
>>> Would it be better to add the following check here?
>>>       if (WARN_ON(!pg_size))
>>>               return -EINVAL;
>>>
>>> Instead of checking latter in the loop as follows.
>>>       if (WARN_ON_ONCE(!pg_size)) {
>>>               ret = -EINVAL;
>>>               goto out;
>>>       }
>>
>> I am afraid no. Only the paging domains need a valid pg_size. That's the
>> reason why I put it after the iommu_is_dma_domain() check. The previous
>> code has the same behavior too.
>>
> 
> You could also add the dma domain check here. pg_size is static
> then it makes more sense to verify it once instead of in a loop.

Agreed. Does below additional change make sense?

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index e59de7852067..3be88b5f36bb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -962,6 +962,9 @@ static int 
iommu_create_device_direct_mappings(struct iommu_domain *domain,
         pg_size = domain->pgsize_bitmap ? 1UL << 
__ffs(domain->pgsize_bitmap) : 0;
         INIT_LIST_HEAD(&mappings);

+       if (WARN_ON_ONCE((domain->type & __IOMMU_DOMAIN_PAGING) && 
!pg_size))
+               return -EINVAL;
+
         iommu_get_resv_regions(dev, &mappings);

         /* We need to consider overlapping regions for different devices */
@@ -977,11 +980,6 @@ static int 
iommu_create_device_direct_mappings(struct iommu_domain *domain,
                     !iommu_is_dma_domain(domain))
                         continue;

-               if (WARN_ON_ONCE(!pg_size)) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-
                 start = ALIGN(entry->start, pg_size);
                 end   = ALIGN(entry->start + entry->length, pg_size);

Best regards,
baolu
Tian, Kevin June 27, 2023, 8:10 a.m. UTC | #9
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Monday, June 19, 2023 11:30 PM
> 
> On Mon, Jun 19, 2023 at 03:20:30PM +0100, Robin Murphy wrote:
> 
> > so if the only time they're used is when the IOMMU driver has
> > already had a catastrophic internal failure such that we decide to
> > declare the device toasted and deliberately put it into an unusable
> > state, blocking its reserved regions doesn't seem like a big deal.
> 
> I think we should discuss then when we get to actually implementing
> the error recovery flow we want. I do like blocking in general for the
> reasons you give, and that was my first plan.. But if the BIOS will
> crash or something if we don't do the reserved region maps that isn't
> so good either. IDK would like to hear from the people using this BIOS
> feature.
> 

The only devices with RMRR which I'm aware of on Intel platforms are
GPU and USB. However they are all RESV_DIRECT_RELAXABLE type.

Here is one reference from the Xen hypervisor. It has a concept called
quarantine domain (similar to blocking domain) when a device is
de-assigned w/o an owner. The quarantine domain has no mappings
except the ones identity-mapped for RMRR types. I'm not sure whether
they observed real examples of RMRR devices which are not GPU/USB.

I guess the reason for not going fully identity or fully blocked is the
same worry: the BIOS may go insane, while Xen still wants to quarantine
the device as much as possible.
Tian, Kevin June 27, 2023, 8:15 a.m. UTC | #10
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Tuesday, June 27, 2023 4:01 PM
> 
> On 2023/6/27 15:54, Tian, Kevin wrote:
> >> From: Baolu Lu <baolu.lu@linux.intel.com>
> >> Sent: Tuesday, June 13, 2023 11:15 AM
> >>
> >> On 6/12/23 4:28 PM, Liu, Jingqi wrote:
> >>> On 6/7/2023 11:51 AM, Lu Baolu wrote:
> >>>> -
> >>>> -    BUG_ON(!domain->pgsize_bitmap);
> >>>> -
> >>>> -    pg_size = 1UL << __ffs(domain->pgsize_bitmap);
> >>>> +    pg_size = domain->pgsize_bitmap ? 1UL <<
> >>>> __ffs(domain->pgsize_bitmap) : 0;
> >>> Would it be better to add the following check here?
> >>>       if (WARN_ON(!pg_size))
> >>>               return -EINVAL;
> >>>
> >>> Instead of checking latter in the loop as follows.
> >>>       if (WARN_ON_ONCE(!pg_size)) {
> >>>               ret = -EINVAL;
> >>>               goto out;
> >>>       }
> >>
> >> I am afraid no. Only the paging domains need a valid pg_size. That's the
> >> reason why I put it after the iommu_is_dma_domain() check. The
> previous
> >> code has the same behavior too.
> >>
> >
> > You could also add the dma domain check here. pg_size is static
> > then it makes more sense to verify it once instead of in a loop.
> 
> Agreed. Does below additional change make sense?
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index e59de7852067..3be88b5f36bb 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -962,6 +962,9 @@ static int
> iommu_create_device_direct_mappings(struct iommu_domain *domain,
>          pg_size = domain->pgsize_bitmap ? 1UL <<
> __ffs(domain->pgsize_bitmap) : 0;
>          INIT_LIST_HEAD(&mappings);
> 
> +       if (WARN_ON_ONCE((domain->type & __IOMMU_DOMAIN_PAGING)
> &&
> !pg_size))
> +               return -EINVAL;

what's the reason of not using iommu_is_dma_domain()? this is called
in the probe path only for the default domain. Otherwise if you change
like this then you also want to change the check in the loop later to be
consistent.

> +
>          iommu_get_resv_regions(dev, &mappings);
> 
>          /* We need to consider overlapping regions for different devices */
> @@ -977,11 +980,6 @@ static int
> iommu_create_device_direct_mappings(struct iommu_domain *domain,
>                      !iommu_is_dma_domain(domain))
>                          continue;
> 
> -               if (WARN_ON_ONCE(!pg_size)) {
> -                       ret = -EINVAL;
> -                       goto out;
> -               }
> -
>                  start = ALIGN(entry->start, pg_size);
>                  end   = ALIGN(entry->start + entry->length, pg_size);
> 
> Best regards,
> baolu
Baolu Lu June 27, 2023, 8:21 a.m. UTC | #11
On 2023/6/27 16:15, Tian, Kevin wrote:
>> From: Baolu Lu<baolu.lu@linux.intel.com>
>> Sent: Tuesday, June 27, 2023 4:01 PM
>>
>> On 2023/6/27 15:54, Tian, Kevin wrote:
>>>> From: Baolu Lu<baolu.lu@linux.intel.com>
>>>> Sent: Tuesday, June 13, 2023 11:15 AM
>>>>
>>>> On 6/12/23 4:28 PM, Liu, Jingqi wrote:
>>>>> On 6/7/2023 11:51 AM, Lu Baolu wrote:
>>>>>> -
>>>>>> -    BUG_ON(!domain->pgsize_bitmap);
>>>>>> -
>>>>>> -    pg_size = 1UL << __ffs(domain->pgsize_bitmap);
>>>>>> +    pg_size = domain->pgsize_bitmap ? 1UL <<
>>>>>> __ffs(domain->pgsize_bitmap) : 0;
>>>>> Would it be better to add the following check here?
>>>>>        if (WARN_ON(!pg_size))
>>>>>                return -EINVAL;
>>>>>
>>>>> Instead of checking latter in the loop as follows.
>>>>>        if (WARN_ON_ONCE(!pg_size)) {
>>>>>                ret = -EINVAL;
>>>>>                goto out;
>>>>>        }
>>>> I am afraid no. Only the paging domains need a valid pg_size. That's the
>>>> reason why I put it after the iommu_is_dma_domain() check. The
>> previous
>>>> code has the same behavior too.
>>>>
>>> You could also add the dma domain check here. pg_size is static
>>> then it makes more sense to verify it once instead of in a loop.
>> Agreed. Does below additional change make sense?
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index e59de7852067..3be88b5f36bb 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -962,6 +962,9 @@ static int
>> iommu_create_device_direct_mappings(struct iommu_domain *domain,
>>           pg_size = domain->pgsize_bitmap ? 1UL <<
>> __ffs(domain->pgsize_bitmap) : 0;
>>           INIT_LIST_HEAD(&mappings);
>>
>> +       if (WARN_ON_ONCE((domain->type & __IOMMU_DOMAIN_PAGING)
>> &&
>> !pg_size))
>> +               return -EINVAL;
> what's the reason of not using iommu_is_dma_domain()? this is called
> in the probe path only for the default domain. Otherwise if you change
> like this then you also want to change the check in the loop later to be
> consistent.
> 

Yes. iommu_is_dma_domain() is better.

Best regards,
baolu
Jason Gunthorpe June 27, 2023, 3:47 p.m. UTC | #12
On Tue, Jun 27, 2023 at 04:01:01PM +0800, Baolu Lu wrote:

> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index e59de7852067..3be88b5f36bb 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -962,6 +962,9 @@ static int iommu_create_device_direct_mappings(struct
> iommu_domain *domain,
>         pg_size = domain->pgsize_bitmap ? 1UL <<
> __ffs(domain->pgsize_bitmap) : 0;
>         INIT_LIST_HEAD(&mappings);
> 
> +       if (WARN_ON_ONCE((domain->type & __IOMMU_DOMAIN_PAGING) &&
> !pg_size))
> +               return -EINVAL;

Calling this function with an identity domain is expected, it must
return 0.

Jason
Jason Gunthorpe June 27, 2023, 3:49 p.m. UTC | #13
On Tue, Jun 27, 2023 at 08:10:48AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Monday, June 19, 2023 11:30 PM
> > 
> > On Mon, Jun 19, 2023 at 03:20:30PM +0100, Robin Murphy wrote:
> > 
> > > so if the only time they're used is when the IOMMU driver has
> > > already had a catastrophic internal failure such that we decide to
> > > declare the device toasted and deliberately put it into an unusable
> > > state, blocking its reserved regions doesn't seem like a big deal.
> > 
> > I think we should discuss then when we get to actually implementing
> > the error recovery flow we want. I do like blocking in general for the
> > reasons you give, and that was my first plan.. But if the BIOS will
> > crash or something if we don't do the reserved region maps that isn't
> > so good either. IDK would like to hear from the people using this BIOS
> > feature.
> > 
> 
> The only devices with RMRR which I'm aware of on Intel platforms are
> GPU and USB. However they are all RESV_DIRECT_RELAXABLE type.
> 
> Here is one reference from the Xen hypervisor. It has a concept called
> quarantine domain (similar to blocking domain) when a device is
> de-assigned w/o an owner. The quarantine domain has no mappings
> except the ones identity-mapped for RMRR types. I'm not sure whether
> they observed real examples of RMRR devices which are not GPU/USB.

I guess, it seems a bit hard core, but we could spend the cost to
build such a domain during probe..

At least for our cases we already do expect that DMA is halted before
we start messing with the domains so identity may not be the same
issue as with Xen..

Jason
diff mbox series

Patch

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index d31642596675..fd18019ac951 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -409,6 +409,7 @@  struct iommu_fault_param {
  * @priv:	 IOMMU Driver private data
  * @max_pasids:  number of PASIDs this device can consume
  * @attach_deferred: the dma domain attachment is deferred
+ * @requires_direct: The driver requested IOMMU_RESV_DIRECT
  *
  * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
  *	struct iommu_group	*iommu_group;
@@ -422,6 +423,7 @@  struct dev_iommu {
 	void				*priv;
 	u32				max_pasids;
 	u32				attach_deferred:1;
+	u32				requires_direct:1;
 };
 
 int iommu_device_register(struct iommu_device *iommu,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 9e0228ef612b..e59de7852067 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -959,12 +959,7 @@  static int iommu_create_device_direct_mappings(struct iommu_domain *domain,
 	unsigned long pg_size;
 	int ret = 0;
 
-	if (!iommu_is_dma_domain(domain))
-		return 0;
-
-	BUG_ON(!domain->pgsize_bitmap);
-
-	pg_size = 1UL << __ffs(domain->pgsize_bitmap);
+	pg_size = domain->pgsize_bitmap ? 1UL << __ffs(domain->pgsize_bitmap) : 0;
 	INIT_LIST_HEAD(&mappings);
 
 	iommu_get_resv_regions(dev, &mappings);
@@ -974,13 +969,22 @@  static int iommu_create_device_direct_mappings(struct iommu_domain *domain,
 		dma_addr_t start, end, addr;
 		size_t map_size = 0;
 
+		if (entry->type == IOMMU_RESV_DIRECT)
+			dev->iommu->requires_direct = 1;
+
+		if ((entry->type != IOMMU_RESV_DIRECT &&
+		     entry->type != IOMMU_RESV_DIRECT_RELAXABLE) ||
+		    !iommu_is_dma_domain(domain))
+			continue;
+
+		if (WARN_ON_ONCE(!pg_size)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
 		start = ALIGN(entry->start, pg_size);
 		end   = ALIGN(entry->start + entry->length, pg_size);
 
-		if (entry->type != IOMMU_RESV_DIRECT &&
-		    entry->type != IOMMU_RESV_DIRECT_RELAXABLE)
-			continue;
-
 		for (addr = start; addr <= end; addr += pg_size) {
 			phys_addr_t phys_addr;
 
@@ -2121,6 +2125,21 @@  static int __iommu_device_set_domain(struct iommu_group *group,
 {
 	int ret;
 
+	/*
+	 * If the driver has requested IOMMU_RESV_DIRECT then we cannot allow
+	 * the blocking domain to be attached as it does not contain the
+	 * required 1:1 mapping. This test effectively excludes the device from
+	 * being used with iommu_group_claim_dma_owner() which will block vfio
+	 * and iommufd as well.
+	 */
+	if (dev->iommu->requires_direct &&
+	    (new_domain->type == IOMMU_DOMAIN_BLOCKED ||
+	     new_domain == group->blocking_domain)) {
+		dev_warn(dev,
+			 "Firmware has requested this device have a 1:1 IOMMU mapping, rejecting configuring the device without a 1:1 mapping. Contact your platform vendor.\n");
+		return -EINVAL;
+	}
+
 	if (dev->iommu->attach_deferred) {
 		if (new_domain == group->default_domain)
 			return 0;