diff mbox series

[v3,02/14] iommu: Report domain nesting info

Message ID 1592988927-48009-3-git-send-email-yi.l.liu@intel.com (mailing list archive)
State New, archived
Headers show
Series vfio: expose virtual Shared Virtual Addressing to VMs | expand

Commit Message

Yi Liu June 24, 2020, 8:55 a.m. UTC
IOMMUs that support nesting translation need to report their capability
info to userspace, e.g. the format of first level/stage paging structures.

This patch reports nesting info via DOMAIN_ATTR_NESTING. A caller can get
the nesting info after setting DOMAIN_ATTR_NESTING.

v2 -> v3:
*) remove cap/ecap_mask in iommu_nesting_info.
*) reuse DOMAIN_ATTR_NESTING to get nesting info.
*) return an empty iommu_nesting_info for SMMU drivers per Jean's
   suggestion.

Cc: Kevin Tian <kevin.tian@intel.com>
CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Eric Auger <eric.auger@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
---
 drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
 drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--
 include/uapi/linux/iommu.h  | 59 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 4 deletions(-)

Comments

Jean-Philippe Brucker June 26, 2020, 7:47 a.m. UTC | #1
On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> IOMMUs that support nesting translation needs report the capability info
> to userspace, e.g. the format of first level/stage paging structures.
> 
> This patch reports nesting info by DOMAIN_ATTR_NESTING. Caller can get
> nesting info after setting DOMAIN_ATTR_NESTING.
> 
> v2 -> v3:
> *) remvoe cap/ecap_mask in iommu_nesting_info.
> *) reuse DOMAIN_ATTR_NESTING to get nesting info.
> *) return an empty iommu_nesting_info for SMMU drivers per Jean'
>    suggestion.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
>  drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--

Looks reasonable to me. Please move the SMMU changes to a separate patch
and Cc the SMMU maintainers:

Cc: Will Deacon <will@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>

Thanks,
Jean

>  include/uapi/linux/iommu.h  | 59 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 113 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index f578677..0c45d4d 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -3019,6 +3019,32 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
>  	return group;
>  }
>  
> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
> +					void *data)
> +{
> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> +	u32 size;
> +
> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> +		return -ENODEV;
> +
> +	size = sizeof(struct iommu_nesting_info);
> +
> +	/*
> +	 * if provided buffer size is not equal to the size, should
> +	 * return 0 and also the expected buffer size to caller.
> +	 */
> +	if (info->size != size) {
> +		info->size = size;
> +		return 0;
> +	}
> +
> +	/* report an empty iommu_nesting_info for now */
> +	memset(info, 0x0, size);
> +	info->size = size;
> +	return 0;
> +}
> +
>  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  				    enum iommu_attr attr, void *data)
>  {
> @@ -3028,8 +3054,7 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  	case IOMMU_DOMAIN_UNMANAGED:
>  		switch (attr) {
>  		case DOMAIN_ATTR_NESTING:
> -			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
> -			return 0;
> +			return arm_smmu_domain_nesting_info(smmu_domain, data);
>  		default:
>  			return -ENODEV;
>  		}
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 243bc4c..908607d 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -1506,6 +1506,32 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
>  	return group;
>  }
>  
> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
> +					void *data)
> +{
> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> +	u32 size;
> +
> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> +		return -ENODEV;
> +
> +	size = sizeof(struct iommu_nesting_info);
> +
> +	/*
> +	 * if provided buffer size is not equal to the size, should
> +	 * return 0 and also the expected buffer size to caller.
> +	 */
> +	if (info->size != size) {
> +		info->size = size;
> +		return 0;
> +	}
> +
> +	/* report an empty iommu_nesting_info for now */
> +	memset(info, 0x0, size);
> +	info->size = size;
> +	return 0;
> +}
> +
>  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  				    enum iommu_attr attr, void *data)
>  {
> @@ -1515,8 +1541,7 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  	case IOMMU_DOMAIN_UNMANAGED:
>  		switch (attr) {
>  		case DOMAIN_ATTR_NESTING:
> -			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
> -			return 0;
> +			return arm_smmu_domain_nesting_info(smmu_domain, data);
>  		default:
>  			return -ENODEV;
>  		}
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> index 1afc661..898c99a 100644
> --- a/include/uapi/linux/iommu.h
> +++ b/include/uapi/linux/iommu.h
> @@ -332,4 +332,63 @@ struct iommu_gpasid_bind_data {
>  	} vendor;
>  };
>  
> +/*
> + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> + *				user space should check it before using
> + *				nesting capability.
> + *
> + * @size:	size of the whole structure
> + * @format:	PASID table entry format, the same definition with
> + *		@format of struct iommu_gpasid_bind_data.
> + * @features:	supported nesting features.
> + * @flags:	currently reserved for future extension.
> + * @data:	vendor specific cap info.
> + *
> + * +---------------+----------------------------------------------------+
> + * | feature       |  Notes                                             |
> + * +===============+====================================================+
> + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> + * |               |  in the system should be allocated by host kernel  |
> + * +---------------+----------------------------------------------------+
> + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> + * |               |  either be a host PASID passed in bind request or  |
> + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> + * +---------------+----------------------------------------------------+
> + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> + * +---------------+----------------------------------------------------+
> + *
> + */
> +struct iommu_nesting_info {
> +	__u32	size;
> +	__u32	format;
> +	__u32	features;
> +#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
> +#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
> +#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
> +	__u32	flags;
> +	__u8	data[];
> +};
> +
> +/*
> + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
> + *
> + *
> + * @flags:	VT-d specific flags. Currently reserved for future
> + *		extension.
> + * @addr_width:	The output addr width of first level/stage translation
> + * @pasid_bits:	Maximum supported PASID bits, 0 represents no PASID
> + *		support.
> + * @cap_reg:	Describe basic capabilities as defined in VT-d capability
> + *		register.
> + * @ecap_reg:	Describe the extended capabilities as defined in VT-d
> + *		extended capability register.
> + */
> +struct iommu_nesting_info_vtd {
> +	__u32	flags;
> +	__u16	addr_width;
> +	__u16	pasid_bits;
> +	__u64	cap_reg;
> +	__u64	ecap_reg;
> +};
> +
>  #endif /* _UAPI_IOMMU_H */
> -- 
> 2.7.4
>
Robin Murphy June 26, 2020, 4:04 p.m. UTC | #2
On 2020-06-26 08:47, Jean-Philippe Brucker wrote:
> On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
>> IOMMUs that support nesting translation needs report the capability info
>> to userspace, e.g. the format of first level/stage paging structures.
>>
>> This patch reports nesting info by DOMAIN_ATTR_NESTING. Caller can get
>> nesting info after setting DOMAIN_ATTR_NESTING.
>>
>> v2 -> v3:
>> *) remvoe cap/ecap_mask in iommu_nesting_info.
>> *) reuse DOMAIN_ATTR_NESTING to get nesting info.
>> *) return an empty iommu_nesting_info for SMMU drivers per Jean'
>>     suggestion.
>>
>> Cc: Kevin Tian <kevin.tian@intel.com>
>> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
>> Cc: Alex Williamson <alex.williamson@redhat.com>
>> Cc: Eric Auger <eric.auger@redhat.com>
>> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
>> Cc: Joerg Roedel <joro@8bytes.org>
>> Cc: Lu Baolu <baolu.lu@linux.intel.com>
>> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
>> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
>> ---
>>   drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
>>   drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--
> 
> Looks reasonable to me. Please move the SMMU changes to a separate patch
> and Cc the SMMU maintainers:

Cheers Jean, I'll admit I've been skipping over a lot of these patches 
lately :)

A couple of comments below...

> 
> Cc: Will Deacon <will@kernel.org>
> Cc: Robin Murphy <robin.murphy@arm.com>
> 
> Thanks,
> Jean
> 
>>   include/uapi/linux/iommu.h  | 59 +++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 113 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index f578677..0c45d4d 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -3019,6 +3019,32 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
>>   	return group;
>>   }
>>   
>> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
>> +					void *data)
>> +{
>> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
>> +	u32 size;
>> +
>> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
>> +		return -ENODEV;
>> +
>> +	size = sizeof(struct iommu_nesting_info);
>> +
>> +	/*
>> +	 * if provided buffer size is not equal to the size, should
>> +	 * return 0 and also the expected buffer size to caller.
>> +	 */
>> +	if (info->size != size) {
>> +		info->size = size;
>> +		return 0;
>> +	}
>> +
>> +	/* report an empty iommu_nesting_info for now */
>> +	memset(info, 0x0, size);
>> +	info->size = size;
>> +	return 0;
>> +}
>> +
>>   static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>>   				    enum iommu_attr attr, void *data)
>>   {
>> @@ -3028,8 +3054,7 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>>   	case IOMMU_DOMAIN_UNMANAGED:
>>   		switch (attr) {
>>   		case DOMAIN_ATTR_NESTING:
>> -			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
>> -			return 0;
>> +			return arm_smmu_domain_nesting_info(smmu_domain, data);
>>   		default:
>>   			return -ENODEV;
>>   		}
>> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
>> index 243bc4c..908607d 100644
>> --- a/drivers/iommu/arm-smmu.c
>> +++ b/drivers/iommu/arm-smmu.c
>> @@ -1506,6 +1506,32 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
>>   	return group;
>>   }
>>   
>> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
>> +					void *data)
>> +{
>> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
>> +	u32 size;
>> +
>> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
>> +		return -ENODEV;
>> +
>> +	size = sizeof(struct iommu_nesting_info);
>> +
>> +	/*
>> +	 * if provided buffer size is not equal to the size, should
>> +	 * return 0 and also the expected buffer size to caller.
>> +	 */
>> +	if (info->size != size) {
>> +		info->size = size;
>> +		return 0;
>> +	}
>> +
>> +	/* report an empty iommu_nesting_info for now */
>> +	memset(info, 0x0, size);
>> +	info->size = size;
>> +	return 0;
>> +}
>> +
>>   static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>>   				    enum iommu_attr attr, void *data)
>>   {
>> @@ -1515,8 +1541,7 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>>   	case IOMMU_DOMAIN_UNMANAGED:
>>   		switch (attr) {
>>   		case DOMAIN_ATTR_NESTING:
>> -			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
>> -			return 0;
>> +			return arm_smmu_domain_nesting_info(smmu_domain, data);
>>   		default:
>>   			return -ENODEV;
>>   		}
>> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
>> index 1afc661..898c99a 100644
>> --- a/include/uapi/linux/iommu.h
>> +++ b/include/uapi/linux/iommu.h
>> @@ -332,4 +332,63 @@ struct iommu_gpasid_bind_data {
>>   	} vendor;
>>   };
>>   
>> +/*
>> + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
>> + *				user space should check it before using
>> + *				nesting capability.
>> + *
>> + * @size:	size of the whole structure
>> + * @format:	PASID table entry format, the same definition with
>> + *		@format of struct iommu_gpasid_bind_data.
>> + * @features:	supported nesting features.
>> + * @flags:	currently reserved for future extension.
>> + * @data:	vendor specific cap info.
>> + *
>> + * +---------------+----------------------------------------------------+
>> + * | feature       |  Notes                                             |
>> + * +===============+====================================================+
>> + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
>> + * |               |  in the system should be allocated by host kernel  |
>> + * +---------------+----------------------------------------------------+
>> + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
>> + * |               |  either be a host PASID passed in bind request or  |
>> + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
>> + * +---------------+----------------------------------------------------+
>> + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
>> + * +---------------+----------------------------------------------------+
>> + *
>> + */
>> +struct iommu_nesting_info {
>> +	__u32	size;
>> +	__u32	format;
>> +	__u32	features;
>> +#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
>> +#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
>> +#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
>> +	__u32	flags;
>> +	__u8	data[];
>> +};
>> +
>> +/*
>> + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
>> + *
>> + *
>> + * @flags:	VT-d specific flags. Currently reserved for future
>> + *		extension.
>> + * @addr_width:	The output addr width of first level/stage translation
>> + * @pasid_bits:	Maximum supported PASID bits, 0 represents no PASID
>> + *		support.
>> + * @cap_reg:	Describe basic capabilities as defined in VT-d capability
>> + *		register.
>> + * @ecap_reg:	Describe the extended capabilities as defined in VT-d
>> + *		extended capability register.
>> + */
>> +struct iommu_nesting_info_vtd {
>> +	__u32	flags;
>> +	__u16	addr_width;

I think this might be worth promoting to a generic feature - Arm has the 
same notion of intermediate address size, and I'd imagine that pretty 
much any other two-stage translation system would as well (either 
explicitly or implicitly). It also comes close to something the DPDK 
folks raised where they wanted parity with a feature that currently 
scrapes AGAW out of some VT-d-specific place, so abstracting it to 
completely generic code, in a way that could eventually be generalised 
to reporting info for non-nested domains too, would be really nice.

What would also be cool is if the user was able to pass in a structure 
with preferred values for the address size and other capabilities when 
they request nesting in the first place. Right now we'll always set up 
the maximum possible sized page table for any domain, but if we knew 
ahead of time how many bits the user actually cared about then we could 
potentially be more efficient (e.g. use fewer levels of pagetable or a 
different translation granule).

Robin.

>> +	__u16	pasid_bits;
>> +	__u64	cap_reg;
>> +	__u64	ecap_reg;
>> +};
>> +
>>   #endif /* _UAPI_IOMMU_H */
>> -- 
>> 2.7.4
>>
Yi Liu June 27, 2020, 6:14 a.m. UTC | #3
> From: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Sent: Friday, June 26, 2020 3:48 PM
> 
> On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> > IOMMUs that support nesting translation needs report the capability
> > info to userspace, e.g. the format of first level/stage paging structures.
> >
> > This patch reports nesting info by DOMAIN_ATTR_NESTING. Caller can get
> > nesting info after setting DOMAIN_ATTR_NESTING.
> >
> > v2 -> v3:
> > *) remvoe cap/ecap_mask in iommu_nesting_info.
> > *) reuse DOMAIN_ATTR_NESTING to get nesting info.
> > *) return an empty iommu_nesting_info for SMMU drivers per Jean'
> >    suggestion.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Lu Baolu <baolu.lu@linux.intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> >  drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
> >  drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--
> 
> Looks reasonable to me. Please move the SMMU changes to a separate patch
> and Cc the SMMU maintainers:
> 
> Cc: Will Deacon <will@kernel.org>
> Cc: Robin Murphy <robin.murphy@arm.com>

got you. will do it.

Regards,
Yi Liu

> Thanks,
> Jean
> 
> >  include/uapi/linux/iommu.h  | 59
> > +++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 113 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> > index f578677..0c45d4d 100644
> > --- a/drivers/iommu/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm-smmu-v3.c
> > @@ -3019,6 +3019,32 @@ static struct iommu_group
> *arm_smmu_device_group(struct device *dev)
> >  	return group;
> >  }
> >
> > +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> *smmu_domain,
> > +					void *data)
> > +{
> > +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> > +	u32 size;
> > +
> > +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> > +		return -ENODEV;
> > +
> > +	size = sizeof(struct iommu_nesting_info);
> > +
> > +	/*
> > +	 * if provided buffer size is not equal to the size, should
> > +	 * return 0 and also the expected buffer size to caller.
> > +	 */
> > +	if (info->size != size) {
> > +		info->size = size;
> > +		return 0;
> > +	}
> > +
> > +	/* report an empty iommu_nesting_info for now */
> > +	memset(info, 0x0, size);
> > +	info->size = size;
> > +	return 0;
> > +}
> > +
> >  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  				    enum iommu_attr attr, void *data)  { @@ -
> 3028,8 +3054,7 @@
> > static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  	case IOMMU_DOMAIN_UNMANAGED:
> >  		switch (attr) {
> >  		case DOMAIN_ATTR_NESTING:
> > -			*(int *)data = (smmu_domain->stage ==
> ARM_SMMU_DOMAIN_NESTED);
> > -			return 0;
> > +			return arm_smmu_domain_nesting_info(smmu_domain,
> data);
> >  		default:
> >  			return -ENODEV;
> >  		}
> > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index
> > 243bc4c..908607d 100644
> > --- a/drivers/iommu/arm-smmu.c
> > +++ b/drivers/iommu/arm-smmu.c
> > @@ -1506,6 +1506,32 @@ static struct iommu_group
> *arm_smmu_device_group(struct device *dev)
> >  	return group;
> >  }
> >
> > +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> *smmu_domain,
> > +					void *data)
> > +{
> > +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> > +	u32 size;
> > +
> > +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> > +		return -ENODEV;
> > +
> > +	size = sizeof(struct iommu_nesting_info);
> > +
> > +	/*
> > +	 * if provided buffer size is not equal to the size, should
> > +	 * return 0 and also the expected buffer size to caller.
> > +	 */
> > +	if (info->size != size) {
> > +		info->size = size;
> > +		return 0;
> > +	}
> > +
> > +	/* report an empty iommu_nesting_info for now */
> > +	memset(info, 0x0, size);
> > +	info->size = size;
> > +	return 0;
> > +}
> > +
> >  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  				    enum iommu_attr attr, void *data)  { @@ -
> 1515,8 +1541,7 @@
> > static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  	case IOMMU_DOMAIN_UNMANAGED:
> >  		switch (attr) {
> >  		case DOMAIN_ATTR_NESTING:
> > -			*(int *)data = (smmu_domain->stage ==
> ARM_SMMU_DOMAIN_NESTED);
> > -			return 0;
> > +			return arm_smmu_domain_nesting_info(smmu_domain,
> data);
> >  		default:
> >  			return -ENODEV;
> >  		}
> > diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > index 1afc661..898c99a 100644
> > --- a/include/uapi/linux/iommu.h
> > +++ b/include/uapi/linux/iommu.h
> > @@ -332,4 +332,63 @@ struct iommu_gpasid_bind_data {
> >  	} vendor;
> >  };
> >
> > +/*
> > + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> > + *				user space should check it before using
> > + *				nesting capability.
> > + *
> > + * @size:	size of the whole structure
> > + * @format:	PASID table entry format, the same definition with
> > + *		@format of struct iommu_gpasid_bind_data.
> > + * @features:	supported nesting features.
> > + * @flags:	currently reserved for future extension.
> > + * @data:	vendor specific cap info.
> > + *
> > + * +---------------+----------------------------------------------------+
> > + * | feature       |  Notes                                             |
> > + * +===============+====================================================+
> > + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> > + * |               |  in the system should be allocated by host kernel  |
> > + * +---------------+----------------------------------------------------+
> > + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> > + * |               |  either be a host PASID passed in bind request or  |
> > + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> > + * +---------------+----------------------------------------------------+
> > + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> > + * +---------------+----------------------------------------------------+
> > + *
> > + */
> > +struct iommu_nesting_info {
> > +	__u32	size;
> > +	__u32	format;
> > +	__u32	features;
> > +#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
> > +#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
> > +#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
> > +	__u32	flags;
> > +	__u8	data[];
> > +};
> > +
> > +/*
> > + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
> > + *
> > + *
> > + * @flags:	VT-d specific flags. Currently reserved for future
> > + *		extension.
> > + * @addr_width:	The output addr width of first level/stage translation
> > + * @pasid_bits:	Maximum supported PASID bits, 0 represents no PASID
> > + *		support.
> > + * @cap_reg:	Describe basic capabilities as defined in VT-d capability
> > + *		register.
> > + * @ecap_reg:	Describe the extended capabilities as defined in VT-d
> > + *		extended capability register.
> > + */
> > +struct iommu_nesting_info_vtd {
> > +	__u32	flags;
> > +	__u16	addr_width;
> > +	__u16	pasid_bits;
> > +	__u64	cap_reg;
> > +	__u64	ecap_reg;
> > +};
> > +
> >  #endif /* _UAPI_IOMMU_H */
> > --
> > 2.7.4
> >
Yi Liu June 27, 2020, 6:53 a.m. UTC | #4
Hi Robin,

> From: Robin Murphy <robin.murphy@arm.com>
> Sent: Saturday, June 27, 2020 12:05 AM
> 
> On 2020-06-26 08:47, Jean-Philippe Brucker wrote:
> > On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> >> IOMMUs that support nesting translation needs report the capability
> >> info to userspace, e.g. the format of first level/stage paging structures.
> >>
> >> This patch reports nesting info by DOMAIN_ATTR_NESTING. Caller can
> >> get nesting info after setting DOMAIN_ATTR_NESTING.
> >>
> >> v2 -> v3:
> >> *) remvoe cap/ecap_mask in iommu_nesting_info.
> >> *) reuse DOMAIN_ATTR_NESTING to get nesting info.
> >> *) return an empty iommu_nesting_info for SMMU drivers per Jean'
> >>     suggestion.
> >>
> >> Cc: Kevin Tian <kevin.tian@intel.com>
> >> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> >> Cc: Alex Williamson <alex.williamson@redhat.com>
> >> Cc: Eric Auger <eric.auger@redhat.com>
> >> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> >> Cc: Joerg Roedel <joro@8bytes.org>
> >> Cc: Lu Baolu <baolu.lu@linux.intel.com>
> >> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> >> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> >> ---
> >>   drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
> >>   drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--
> >
> > Looks reasonable to me. Please move the SMMU changes to a separate
> > patch and Cc the SMMU maintainers:
> 
> Cheers Jean, I'll admit I've been skipping over a lot of these patches lately :)
> 
> A couple of comments below...
> 
> >
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Robin Murphy <robin.murphy@arm.com>
> >
> > Thanks,
> > Jean
> >
> >>   include/uapi/linux/iommu.h  | 59
> +++++++++++++++++++++++++++++++++++++++++++++
> >>   3 files changed, 113 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/iommu/arm-smmu-v3.c
> >> b/drivers/iommu/arm-smmu-v3.c index f578677..0c45d4d 100644
> >> --- a/drivers/iommu/arm-smmu-v3.c
> >> +++ b/drivers/iommu/arm-smmu-v3.c
> >> @@ -3019,6 +3019,32 @@ static struct iommu_group
> *arm_smmu_device_group(struct device *dev)
> >>   	return group;
> >>   }
> >>
> >> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> *smmu_domain,
> >> +					void *data)
> >> +{
> >> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> >> +	u32 size;
> >> +
> >> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> >> +		return -ENODEV;
> >> +
> >> +	size = sizeof(struct iommu_nesting_info);
> >> +
> >> +	/*
> >> +	 * if provided buffer size is not equal to the size, should
> >> +	 * return 0 and also the expected buffer size to caller.
> >> +	 */
> >> +	if (info->size != size) {
> >> +		info->size = size;
> >> +		return 0;
> >> +	}
> >> +
> >> +	/* report an empty iommu_nesting_info for now */
> >> +	memset(info, 0x0, size);
> >> +	info->size = size;
> >> +	return 0;
> >> +}
> >> +
> >>   static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >>   				    enum iommu_attr attr, void *data)
> >>   {
> >> @@ -3028,8 +3054,7 @@ static int arm_smmu_domain_get_attr(struct
> iommu_domain *domain,
> >>   	case IOMMU_DOMAIN_UNMANAGED:
> >>   		switch (attr) {
> >>   		case DOMAIN_ATTR_NESTING:
> >> -			*(int *)data = (smmu_domain->stage ==
> ARM_SMMU_DOMAIN_NESTED);
> >> -			return 0;
> >> +			return arm_smmu_domain_nesting_info(smmu_domain,
> data);
> >>   		default:
> >>   			return -ENODEV;
> >>   		}
> >> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> >> index 243bc4c..908607d 100644
> >> --- a/drivers/iommu/arm-smmu.c
> >> +++ b/drivers/iommu/arm-smmu.c
> >> @@ -1506,6 +1506,32 @@ static struct iommu_group
> *arm_smmu_device_group(struct device *dev)
> >>   	return group;
> >>   }
> >>
> >> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> *smmu_domain,
> >> +					void *data)
> >> +{
> >> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> >> +	u32 size;
> >> +
> >> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> >> +		return -ENODEV;
> >> +
> >> +	size = sizeof(struct iommu_nesting_info);
> >> +
> >> +	/*
> >> +	 * if provided buffer size is not equal to the size, should
> >> +	 * return 0 and also the expected buffer size to caller.
> >> +	 */
> >> +	if (info->size != size) {
> >> +		info->size = size;
> >> +		return 0;
> >> +	}
> >> +
> >> +	/* report an empty iommu_nesting_info for now */
> >> +	memset(info, 0x0, size);
> >> +	info->size = size;
> >> +	return 0;
> >> +}
> >> +
> >>   static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >>   				    enum iommu_attr attr, void *data)
> >>   {
> >> @@ -1515,8 +1541,7 @@ static int arm_smmu_domain_get_attr(struct
> iommu_domain *domain,
> >>   	case IOMMU_DOMAIN_UNMANAGED:
> >>   		switch (attr) {
> >>   		case DOMAIN_ATTR_NESTING:
> >> -			*(int *)data = (smmu_domain->stage ==
> ARM_SMMU_DOMAIN_NESTED);
> >> -			return 0;
> >> +			return arm_smmu_domain_nesting_info(smmu_domain,
> data);
> >>   		default:
> >>   			return -ENODEV;
> >>   		}
> >> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> >> index 1afc661..898c99a 100644
> >> --- a/include/uapi/linux/iommu.h
> >> +++ b/include/uapi/linux/iommu.h
> >> @@ -332,4 +332,63 @@ struct iommu_gpasid_bind_data {
> >>   	} vendor;
> >>   };
> >>
> >> +/*
> >> + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> >> + *				user space should check it before using
> >> + *				nesting capability.
> >> + *
> >> + * @size:	size of the whole structure
> >> + * @format:	PASID table entry format, the same definition with
> >> + *		@format of struct iommu_gpasid_bind_data.
> >> + * @features:	supported nesting features.
> >> + * @flags:	currently reserved for future extension.
> >> + * @data:	vendor specific cap info.
> >> + *
> >> + * +---------------+----------------------------------------------------+
> >> + * | feature       |  Notes                                             |
> >> + * +===============+====================================================+
> >> + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> >> + * |               |  in the system should be allocated by host kernel  |
> >> + * +---------------+----------------------------------------------------+
> >> + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> >> + * |               |  either be a host PASID passed in bind request or  |
> >> + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> >> + * +---------------+----------------------------------------------------+
> >> + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> >> + * +---------------+----------------------------------------------------+
> >> + *
> >> + */
> >> +struct iommu_nesting_info {
> >> +	__u32	size;
> >> +	__u32	format;
> >> +	__u32	features;
> >> +#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
> >> +#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
> >> +#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
> >> +	__u32	flags;
> >> +	__u8	data[];
> >> +};
> >> +
> >> +/*
> >> + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
> >> + *
> >> + *
> >> + * @flags:	VT-d specific flags. Currently reserved for future
> >> + *		extension.
> >> + * @addr_width:	The output addr width of first level/stage translation
> >> + * @pasid_bits:	Maximum supported PASID bits, 0 represents no PASID
> >> + *		support.
> >> + * @cap_reg:	Describe basic capabilities as defined in VT-d capability
> >> + *		register.
> >> + * @ecap_reg:	Describe the extended capabilities as defined in VT-d
> >> + *		extended capability register.
> >> + */
> >> +struct iommu_nesting_info_vtd {
> >> +	__u32	flags;
> >> +	__u16	addr_width;
> 
> I think this might be worth promoting to a generic feature - Arm has the same
> notion of intermediate address size, and I'd imagine that pretty much any other
> two-stage translation system would as well (either explicitly or implicitly).
> It also
> comes close to something the DPDK folks raised where they wanted parity with a
> feature that currently scrapes AGAW out of some VT-d-specific place, so
> abstracting it to completely generic code, in a way that could eventually be
> generalised to reporting info for non-nested domains too, would be really nice.

got you. I can do that.

> What would also be cool is if the user was able to pass in a structure with
> preferred values for the address size and other capabilities when they request
> nesting in the first place. Right now we'll always set up the maximum possible
> sized page table for any domain, but if we knew ahead of time how many bits the
> user actually cared about then we could potentially be more efficient (e.g. use
> fewer levels of pagetable or a different translation granule).

Agreed, though I guess that would only apply to the configurable caps (like
addr_width; a domain could have a different addr_width per user request). I
think it could be done as a follow-up optimization. Here, we report all the
nesting-related caps to the user, so the user can either do a pre-check or
expose the correct capabilities to the guest according to what the hardware
supports. This is necessary because nesting requires the guest to maintain
its page tables in a form the hardware supports.

Regards,
Yi Liu

> Robin.
> 
> >> +	__u16	pasid_bits;
> >> +	__u64	cap_reg;
> >> +	__u64	ecap_reg;
> >> +};
> >> +
> >>   #endif /* _UAPI_IOMMU_H */
> >> --
> >> 2.7.4
> >>
Stefan Hajnoczi June 29, 2020, 9:24 a.m. UTC | #5
On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> +/*
> + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> + *				user space should check it before using
> + *				nesting capability.
> + *
> + * @size:	size of the whole structure
> + * @format:	PASID table entry format, the same definition with
> + *		@format of struct iommu_gpasid_bind_data.
> + * @features:	supported nesting features.
> + * @flags:	currently reserved for future extension.
> + * @data:	vendor specific cap info.
> + *
> + * +---------------+----------------------------------------------------+
> + * | feature       |  Notes                                             |
> + * +===============+====================================================+
> + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> + * |               |  in the system should be allocated by host kernel  |
> + * +---------------+----------------------------------------------------+
> + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> + * |               |  either be a host PASID passed in bind request or  |
> + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> + * +---------------+----------------------------------------------------+
> + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> + * +---------------+----------------------------------------------------+

This feature description is vague about what CACHE_INVLD does and how to
use it. If I understand correctly, the presence of this feature means
that VFIO_IOMMU_NESTING_OP_CACHE_INVLD must be used?

The same kind of clarification could be done for SYSWIDE_PASID and
BIND_PGTBL too.

Stefan
Yi Liu June 29, 2020, 12:23 p.m. UTC | #6
Hi Stefan,

> From: Stefan Hajnoczi <stefanha@gmail.com>
> Sent: Monday, June 29, 2020 5:25 PM
> 
> On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> > +/*
> > + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> > + *				user space should check it before using
> > + *				nesting capability.
> > + *
> > + * @size:	size of the whole structure
> > + * @format:	PASID table entry format, the same definition with
> > + *		@format of struct iommu_gpasid_bind_data.
> > + * @features:	supported nesting features.
> > + * @flags:	currently reserved for future extension.
> > + * @data:	vendor specific cap info.
> > + *
> > + * +---------------+----------------------------------------------------+
> > + * | feature       |  Notes                                             |
> > + * +===============+====================================================+
> > + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> > + * |               |  in the system should be allocated by host kernel  |
> > + * +---------------+----------------------------------------------------+
> > + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> > + * |               |  either be a host PASID passed in bind request or  |
> > + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> > + * +---------------+----------------------------------------------------+
> > + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> > + * +---------------+----------------------------------------------------+
> 
> This feature description is vague about what CACHE_INVLD does and how to
> use it. If I understand correctly, the presence of this feature means
> that VFIO_IOMMU_NESTING_OP_CACHE_INVLD must be used?
>
> The same kind of clarification could be done for SYSWIDE_PASID and
> BIND_PGTBL too.

For SYSWIDE_PASID and BIND_PGTBL, yes, the presence of the feature bit
means it must be used. So the two are requirements on user space if it
wants to set up nesting. For CACHE_INVLD, however, the bit only indicates
availability. How about removing CACHE_INVLD, since the presence of
BIND_PGTBL should already indicate support for CACHE_INVLD?

Regards,
Yi Liu

> Stefan
Tian, Kevin June 30, 2020, 1:20 a.m. UTC | #7
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Saturday, June 27, 2020 2:53 PM
> 
> Hi Robin,
> 
> > From: Robin Murphy <robin.murphy@arm.com>
> > Sent: Saturday, June 27, 2020 12:05 AM
> >
> > On 2020-06-26 08:47, Jean-Philippe Brucker wrote:
> > > On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> > >> IOMMUs that support nesting translation needs report the capability
> > >> info to userspace, e.g. the format of first level/stage paging structures.
> > >>
> > >> This patch reports nesting info by DOMAIN_ATTR_NESTING. Caller can
> > >> get nesting info after setting DOMAIN_ATTR_NESTING.
> > >>
> > >> v2 -> v3:
> > >> *) remvoe cap/ecap_mask in iommu_nesting_info.
> > >> *) reuse DOMAIN_ATTR_NESTING to get nesting info.
> > >> *) return an empty iommu_nesting_info for SMMU drivers per Jean'
> > >>     suggestion.
> > >>
> > >> Cc: Kevin Tian <kevin.tian@intel.com>
> > >> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > >> Cc: Alex Williamson <alex.williamson@redhat.com>
> > >> Cc: Eric Auger <eric.auger@redhat.com>
> > >> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > >> Cc: Joerg Roedel <joro@8bytes.org>
> > >> Cc: Lu Baolu <baolu.lu@linux.intel.com>
> > >> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > >> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > >> ---
> > >>   drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
> > >>   drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--
> > >
> > > Looks reasonable to me. Please move the SMMU changes to a separate
> > > patch and Cc the SMMU maintainers:
> >
> > Cheers Jean, I'll admit I've been skipping over a lot of these patches lately :)
> >
> > A couple of comments below...
> >
> > >
> > > Cc: Will Deacon <will@kernel.org>
> > > Cc: Robin Murphy <robin.murphy@arm.com>
> > >
> > > Thanks,
> > > Jean
> > >
> > >>   include/uapi/linux/iommu.h  | 59
> > +++++++++++++++++++++++++++++++++++++++++++++
> > >>   3 files changed, 113 insertions(+), 4 deletions(-)
> > >>
> > >> diff --git a/drivers/iommu/arm-smmu-v3.c
> > >> b/drivers/iommu/arm-smmu-v3.c index f578677..0c45d4d 100644
> > >> --- a/drivers/iommu/arm-smmu-v3.c
> > >> +++ b/drivers/iommu/arm-smmu-v3.c
> > >> @@ -3019,6 +3019,32 @@ static struct iommu_group
> > *arm_smmu_device_group(struct device *dev)
> > >>   	return group;
> > >>   }
> > >>
> > >> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> > *smmu_domain,
> > >> +					void *data)
> > >> +{
> > >> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *)
> data;
> > >> +	u32 size;
> > >> +
> > >> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> > >> +		return -ENODEV;
> > >> +
> > >> +	size = sizeof(struct iommu_nesting_info);
> > >> +
> > >> +	/*
> > >> +	 * if provided buffer size is not equal to the size, should
> > >> +	 * return 0 and also the expected buffer size to caller.
> > >> +	 */
> > >> +	if (info->size != size) {
> > >> +		info->size = size;
> > >> +		return 0;
> > >> +	}
> > >> +
> > >> +	/* report an empty iommu_nesting_info for now */
> > >> +	memset(info, 0x0, size);
> > >> +	info->size = size;
> > >> +	return 0;
> > >> +}
> > >> +
> > >>   static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> > >>   				    enum iommu_attr attr, void *data)
> > >>   {
> > >> @@ -3028,8 +3054,7 @@ static int arm_smmu_domain_get_attr(struct
> > iommu_domain *domain,
> > >>   	case IOMMU_DOMAIN_UNMANAGED:
> > >>   		switch (attr) {
> > >>   		case DOMAIN_ATTR_NESTING:
> > >> -			*(int *)data = (smmu_domain->stage ==
> > ARM_SMMU_DOMAIN_NESTED);
> > >> -			return 0;
> > >> +			return
> arm_smmu_domain_nesting_info(smmu_domain,
> > data);
> > >>   		default:
> > >>   			return -ENODEV;
> > >>   		}
> > >> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> > >> index 243bc4c..908607d 100644
> > >> --- a/drivers/iommu/arm-smmu.c
> > >> +++ b/drivers/iommu/arm-smmu.c
> > >> @@ -1506,6 +1506,32 @@ static struct iommu_group
> > *arm_smmu_device_group(struct device *dev)
> > >>   	return group;
> > >>   }
> > >>
> > >> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> > *smmu_domain,
> > >> +					void *data)
> > >> +{
> > >> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *)
> data;
> > >> +	u32 size;
> > >> +
> > >> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> > >> +		return -ENODEV;
> > >> +
> > >> +	size = sizeof(struct iommu_nesting_info);
> > >> +
> > >> +	/*
> > >> +	 * if provided buffer size is not equal to the size, should
> > >> +	 * return 0 and also the expected buffer size to caller.
> > >> +	 */
> > >> +	if (info->size != size) {
> > >> +		info->size = size;
> > >> +		return 0;
> > >> +	}
> > >> +
> > >> +	/* report an empty iommu_nesting_info for now */
> > >> +	memset(info, 0x0, size);
> > >> +	info->size = size;
> > >> +	return 0;
> > >> +}
> > >> +
> > >>   static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> > >>   				    enum iommu_attr attr, void *data)
> > >>   {
> > >> @@ -1515,8 +1541,7 @@ static int arm_smmu_domain_get_attr(struct
> > iommu_domain *domain,
> > >>   	case IOMMU_DOMAIN_UNMANAGED:
> > >>   		switch (attr) {
> > >>   		case DOMAIN_ATTR_NESTING:
> > >> -			*(int *)data = (smmu_domain->stage ==
> > ARM_SMMU_DOMAIN_NESTED);
> > >> -			return 0;
> > >> +			return
> arm_smmu_domain_nesting_info(smmu_domain,
> > data);
> > >>   		default:
> > >>   			return -ENODEV;
> > >>   		}
> > >> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > >> index 1afc661..898c99a 100644
> > >> --- a/include/uapi/linux/iommu.h
> > >> +++ b/include/uapi/linux/iommu.h
> > >> @@ -332,4 +332,63 @@ struct iommu_gpasid_bind_data {
> > >>   	} vendor;
> > >>   };
> > >>
> > >> +/*
> > >> + * struct iommu_nesting_info - Information for nesting-capable
> IOMMU.
> > >> + *				user space should check it before using
> > >> + *				nesting capability.
> > >> + *
> > >> + * @size:	size of the whole structure
> > >> + * @format:	PASID table entry format, the same definition with
> > >> + *		@format of struct iommu_gpasid_bind_data.
> > >> + * @features:	supported nesting features.
> > >> + * @flags:	currently reserved for future extension.
> > >> + * @data:	vendor specific cap info.
> > >> + *
> > >> + * +---------------+----------------------------------------------------+
> > >> + * | feature       |  Notes                                             |
> > >> + *
> > >>
> > ++===============+============================================
> > =======
> > >> +=+
> > >> + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs
> used  |
> > >> + * |               |  in the system should be allocated by host kernel  |
> > >> + * +---------------+----------------------------------------------------+
> > >> + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> > >> + * |               |  either be a host PASID passed in bind request or  |
> > >> + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> > >> + * +---------------+----------------------------------------------------+
> > >> + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU
> |
> > >> + *
> > >> ++---------------+---------------------------------------------------
> > >> +-+
> > >> + *
> > >> + */
> > >> +struct iommu_nesting_info {
> > >> +	__u32	size;
> > >> +	__u32	format;
> > >> +	__u32	features;
> > >> +#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
> > >> +#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
> > >> +#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
> > >> +	__u32	flags;
> > >> +	__u8	data[];
> > >> +};
> > >> +
> > >> +/*
> > >> + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
> > >> + *
> > >> + *
> > >> + * @flags:	VT-d specific flags. Currently reserved for future
> > >> + *		extension.
> > >> + * @addr_width:	The output addr width of first level/stage translation
> > >> + * @pasid_bits:	Maximum supported PASID bits, 0 represents no
> PASID
> > >> + *		support.
> > >> + * @cap_reg:	Describe basic capabilities as defined in VT-d
> capability
> > >> + *		register.
> > >> + * @ecap_reg:	Describe the extended capabilities as defined in VT-d
> > >> + *		extended capability register.
> > >> + */
> > >> +struct iommu_nesting_info_vtd {
> > >> +	__u32	flags;
> > >> +	__u16	addr_width;
> >
> > I think this might be worth promoting to a generic feature - Arm has the
> same
> > notion of intermediate address size, and I'd imagine that pretty much any
> other
> > two-stage translation system would as well (either explicitly or implicitly).
> > It also
> > comes close to something the DPDK folks raised where they wanted parity
> with a
> > feature that currently scrapes AGAW out of some VT-d-specific place, so
> > abstracting it to completely generic code, in a way that could eventually be
> > generalised to reporting info for non-nested domains too, would be really
> nice.
> 
> got you. I can do that.
> 
> > What would also be cool is if the user was able to pass in a structure with
> > preferred values for the address size and other capabilities when they
> request
> > nesting in the first place. Right now we'll always set up the maximum
> possible
> > sized page table for any domain, but if we knew ahead of time how many
> bits the
> > user actually cared about then we could potentially be more efficient (e.g.
> use
> > fewer levels of pagetable or a different translation granule).
> 
> agreed, and I guess only the configurable caps (like the addr_width, domain
> could have different addr_width per user request). I think it may be an
> optimization afterward. Here, we report all the nesting related caps to user,
> thus user could either do pre-check or expose correct capability to guest per
> hardware support. This is necesary as nesting requires guest to maintain
> page
> tables per hw supporting.
> 

Yes, this likely requires a new uAPI, so it could come as an incremental patch
later. We may reuse the same structure as defined here for communicating the
preferred values.

Thanks
Kevin
Tian, Kevin June 30, 2020, 2 a.m. UTC | #8
> From: Liu, Yi L <yi.l.liu@intel.com>
> Sent: Monday, June 29, 2020 8:23 PM
> 
> Hi Stefan,
> 
> > From: Stefan Hajnoczi <stefanha@gmail.com>
> > Sent: Monday, June 29, 2020 5:25 PM
> >
> > On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> > > +/*
> > > + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> > > + *				user space should check it before using
> > > + *				nesting capability.
> > > + *
> > > + * @size:	size of the whole structure
> > > + * @format:	PASID table entry format, the same definition with
> > > + *		@format of struct iommu_gpasid_bind_data.
> > > + * @features:	supported nesting features.
> > > + * @flags:	currently reserved for future extension.
> > > + * @data:	vendor specific cap info.
> > > + *
> > > + * +---------------+----------------------------------------------------+
> > > + * | feature       |  Notes                                             |
> > > + *
> >
> +===============+===============================================
> ====
> > =+
> > > + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs
> used  |
> > > + * |               |  in the system should be allocated by host kernel  |
> > > + * +---------------+----------------------------------------------------+
> > > + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> > > + * |               |  either be a host PASID passed in bind request or  |
> > > + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> > > + * +---------------+----------------------------------------------------+
> > > + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU
> |
> > > + * +---------------+----------------------------------------------------+
> >
> > This feature description is vague about what CACHE_INVLD does and how
> to
> > use it. If I understand correctly, the presence of this feature means
> > that VFIO_IOMMU_NESTING_OP_CACHE_INVLD must be used?
> >
> > The same kind of clarification could be done for SYSWIDE_PASID and
> > BIND_PGTBL too.
> 
> For SYSWIDE_PASID and BIND_PGTBL, yes, presence of the feature bit
> means must use. So the two are requirements to user space if it wants
> to setup nesting. While for CACHE_INVLD, it's kind of availability
> here. How about removing CACHE_INVLD as presence of BIND_PGTBL should
> indicates support of CACHE_INVLD?
> 

So far this assumption is correct, but it may not hold going forward.
For example, a vendor might find a way to allow the owner of the 1st-level page
table to directly invalidate the cache w/o going through the host IOMMU driver.
From this angle I feel explicitly reporting this capability is more robust.

Regarding the description, how about the below?

--
SYSWIDE_PASID: PASIDs are managed system-wide, instead of per device.
When a device is assigned to userspace or a VM, the proper uAPI (provided by
the userspace driver framework, e.g. VFIO) must be used to allocate/free PASIDs
for the assigned device.

BIND_PGTBL: The owner of the first-level/stage-1 page table must explicitly
bind the page table to the associated PASID (either the one specified in the
bind request or the default PASID of the iommu domain), through
VFIO_IOMMU_NESTING_OP.

CACHE_INVLD: The owner of the first-level/stage-1 page table must
explicitly invalidate the IOMMU cache through VFIO_IOMMU_NESTING_OP,
according to vendor-specific requirements, when changing the page table.
--

Thanks
Kevin
Yi Liu June 30, 2020, 3:45 a.m. UTC | #9
> From: Tian, Kevin <kevin.tian@intel.com>
> Sent: Tuesday, June 30, 2020 10:01 AM
>
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Monday, June 29, 2020 8:23 PM
> >
> > Hi Stefan,
> >
> > > From: Stefan Hajnoczi <stefanha@gmail.com>
> > > Sent: Monday, June 29, 2020 5:25 PM
> > >
> > > On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> > > > +/*
> > > > + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> > > > + *				user space should check it before using
> > > > + *				nesting capability.
> > > > + *
> > > > + * @size:	size of the whole structure
> > > > + * @format:	PASID table entry format, the same definition with
> > > > + *		@format of struct iommu_gpasid_bind_data.
> > > > + * @features:	supported nesting features.
> > > > + * @flags:	currently reserved for future extension.
> > > > + * @data:	vendor specific cap info.
> > > > + *
> > > > + * +---------------+----------------------------------------------------+
> > > > + * | feature       |  Notes                                             |
> > > > + *
> > >
> > +===============+===============================================
> > ====
> > > =+
> > > > + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs
> > used  |
> > > > + * |               |  in the system should be allocated by host kernel  |
> > > > + * +---------------+----------------------------------------------------+
> > > > + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> > > > + * |               |  either be a host PASID passed in bind request or  |
> > > > + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> > > > + * +---------------+----------------------------------------------------+
> > > > + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU
> > |
> > > > + * +---------------+----------------------------------------------------+
> > >
> > > This feature description is vague about what CACHE_INVLD does and how
> > to
> > > use it. If I understand correctly, the presence of this feature means
> > > that VFIO_IOMMU_NESTING_OP_CACHE_INVLD must be used?
> > >
> > > The same kind of clarification could be done for SYSWIDE_PASID and
> > > BIND_PGTBL too.
> >
> > For SYSWIDE_PASID and BIND_PGTBL, yes, presence of the feature bit
> > means must use. So the two are requirements to user space if it wants
> > to setup nesting. While for CACHE_INVLD, it's kind of availability
> > here. How about removing CACHE_INVLD as presence of BIND_PGTBL should
> > indicates support of CACHE_INVLD?
> >
> 
> So far this assumption is correct but it may not be true when thinking forward.
> For example, a vendor might find a way to allow the owner of 1st-level page
> table to directly invalidate cache w/o going through host IOMMU driver. From
> this angle I feel explicitly reporting this capability is more robust.

I see. Explicitly requiring the 1st-level page table owner to do cache
invalidation after modifying the page table seems fair to me.

> Regarding to the description, what about below?
> 
> --
> SYSWIDE_PASID: PASIDs are managed in system-wide, instead of per device.
> When a device is assigned to userspace or VM, proper uAPI (provided by
> userspace driver framework, e.g. VFIO) must be used to allocate/free PASIDs
> for the assigned device.
> 
> BIND_PGTBL: The owner of the first-level/stage-1 page table must explicitly
> bind the page table to associated PASID (either the one specified in bind
> request or the default PASID of the iommu domain), through VFIO_IOMMU
> _NESTING_OP
> 
> CACHE_INVLD: The owner of the first-level/stage-1 page table must
> explicitly invalidate the IOMMU cache through VFIO_IOMMU_NESTING_OP,
> according to vendor-specific requirement when changing the page table.
> --

thanks for the statements, will apply.

Regards,
Yi Liu

> Thanks
> Kevin
> 
>
Alex Williamson July 2, 2020, 5:54 p.m. UTC | #10
On Wed, 24 Jun 2020 01:55:15 -0700
Liu Yi L <yi.l.liu@intel.com> wrote:

> IOMMUs that support nesting translation needs report the capability info
> to userspace, e.g. the format of first level/stage paging structures.
> 
> This patch reports nesting info by DOMAIN_ATTR_NESTING. Caller can get
> nesting info after setting DOMAIN_ATTR_NESTING.
> 
> v2 -> v3:
> *) remvoe cap/ecap_mask in iommu_nesting_info.
> *) reuse DOMAIN_ATTR_NESTING to get nesting info.
> *) return an empty iommu_nesting_info for SMMU drivers per Jean'
>    suggestion.
> 
> Cc: Kevin Tian <kevin.tian@intel.com>
> CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Eric Auger <eric.auger@redhat.com>
> Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Cc: Joerg Roedel <joro@8bytes.org>
> Cc: Lu Baolu <baolu.lu@linux.intel.com>
> Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
>  drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--
>  include/uapi/linux/iommu.h  | 59 +++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 113 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index f578677..0c45d4d 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -3019,6 +3019,32 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
>  	return group;
>  }
>  
> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
> +					void *data)
> +{
> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> +	u32 size;
> +
> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> +		return -ENODEV;
> +
> +	size = sizeof(struct iommu_nesting_info);
> +
> +	/*
> +	 * if provided buffer size is not equal to the size, should
> +	 * return 0 and also the expected buffer size to caller.
> +	 */
> +	if (info->size != size) {
> +		info->size = size;
> +		return 0;
> +	}
> +
> +	/* report an empty iommu_nesting_info for now */
> +	memset(info, 0x0, size);
> +	info->size = size;
> +	return 0;
> +}
> +
>  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  				    enum iommu_attr attr, void *data)
>  {
> @@ -3028,8 +3054,7 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  	case IOMMU_DOMAIN_UNMANAGED:
>  		switch (attr) {
>  		case DOMAIN_ATTR_NESTING:
> -			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
> -			return 0;
> +			return arm_smmu_domain_nesting_info(smmu_domain, data);
>  		default:
>  			return -ENODEV;
>  		}
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 243bc4c..908607d 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -1506,6 +1506,32 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev)
>  	return group;
>  }
>  
> +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
> +					void *data)
> +{
> +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> +	u32 size;
> +
> +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> +		return -ENODEV;
> +
> +	size = sizeof(struct iommu_nesting_info);
> +
> +	/*
> +	 * if provided buffer size is not equal to the size, should
> +	 * return 0 and also the expected buffer size to caller.
> +	 */
> +	if (info->size != size) {
> +		info->size = size;
> +		return 0;
> +	}
> +
> +	/* report an empty iommu_nesting_info for now */
> +	memset(info, 0x0, size);
> +	info->size = size;
> +	return 0;
> +}
> +
>  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  				    enum iommu_attr attr, void *data)
>  {
> @@ -1515,8 +1541,7 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
>  	case IOMMU_DOMAIN_UNMANAGED:
>  		switch (attr) {
>  		case DOMAIN_ATTR_NESTING:
> -			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
> -			return 0;
> +			return arm_smmu_domain_nesting_info(smmu_domain, data);
>  		default:
>  			return -ENODEV;
>  		}
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> index 1afc661..898c99a 100644
> --- a/include/uapi/linux/iommu.h
> +++ b/include/uapi/linux/iommu.h
> @@ -332,4 +332,63 @@ struct iommu_gpasid_bind_data {
>  	} vendor;
>  };
>  
> +/*
> + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> + *				user space should check it before using
> + *				nesting capability.
> + *
> + * @size:	size of the whole structure
> + * @format:	PASID table entry format, the same definition with
> + *		@format of struct iommu_gpasid_bind_data.
> + * @features:	supported nesting features.
> + * @flags:	currently reserved for future extension.
> + * @data:	vendor specific cap info.
> + *
> + * +---------------+----------------------------------------------------+
> + * | feature       |  Notes                                             |
> + * +===============+====================================================+
> + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> + * |               |  in the system should be allocated by host kernel  |
> + * +---------------+----------------------------------------------------+
> + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> + * |               |  either be a host PASID passed in bind request or  |
> + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> + * +---------------+----------------------------------------------------+
> + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> + * +---------------+----------------------------------------------------+

Agree with the previous comments on these descriptions and Kevin's
suggestions.

> + *
> + */
> +struct iommu_nesting_info {
> +	__u32	size;
> +	__u32	format;
> +	__u32	features;
> +#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
> +#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
> +#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
> +	__u32	flags;
> +	__u8	data[];

How does the user determine which vendor structure is provided in
data[]?  Thanks,

Alex

> +};
> +
> +/*
> + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
> + *
> + *
> + * @flags:	VT-d specific flags. Currently reserved for future
> + *		extension.
> + * @addr_width:	The output addr width of first level/stage translation
> + * @pasid_bits:	Maximum supported PASID bits, 0 represents no PASID
> + *		support.
> + * @cap_reg:	Describe basic capabilities as defined in VT-d capability
> + *		register.
> + * @ecap_reg:	Describe the extended capabilities as defined in VT-d
> + *		extended capability register.
> + */
> +struct iommu_nesting_info_vtd {
> +	__u32	flags;
> +	__u16	addr_width;
> +	__u16	pasid_bits;
> +	__u64	cap_reg;
> +	__u64	ecap_reg;
> +};
> +
>  #endif /* _UAPI_IOMMU_H */
Yi Liu July 3, 2020, 3:53 a.m. UTC | #11
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Friday, July 3, 2020 1:55 AM
> 
> On Wed, 24 Jun 2020 01:55:15 -0700
> Liu Yi L <yi.l.liu@intel.com> wrote:
> 
> > IOMMUs that support nesting translation needs report the capability
> > info to userspace, e.g. the format of first level/stage paging structures.
> >
> > This patch reports nesting info by DOMAIN_ATTR_NESTING. Caller can get
> > nesting info after setting DOMAIN_ATTR_NESTING.
> >
> > v2 -> v3:
> > *) remvoe cap/ecap_mask in iommu_nesting_info.
> > *) reuse DOMAIN_ATTR_NESTING to get nesting info.
> > *) return an empty iommu_nesting_info for SMMU drivers per Jean'
> >    suggestion.
> >
> > Cc: Kevin Tian <kevin.tian@intel.com>
> > CC: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Eric Auger <eric.auger@redhat.com>
> > Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
> > Cc: Joerg Roedel <joro@8bytes.org>
> > Cc: Lu Baolu <baolu.lu@linux.intel.com>
> > Signed-off-by: Liu Yi L <yi.l.liu@intel.com>
> > Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
> > ---
> >  drivers/iommu/arm-smmu-v3.c | 29 ++++++++++++++++++++--
> >  drivers/iommu/arm-smmu.c    | 29 ++++++++++++++++++++--
> >  include/uapi/linux/iommu.h  | 59
> > +++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 113 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> > index f578677..0c45d4d 100644
> > --- a/drivers/iommu/arm-smmu-v3.c
> > +++ b/drivers/iommu/arm-smmu-v3.c
> > @@ -3019,6 +3019,32 @@ static struct iommu_group
> *arm_smmu_device_group(struct device *dev)
> >  	return group;
> >  }
> >
> > +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> *smmu_domain,
> > +					void *data)
> > +{
> > +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> > +	u32 size;
> > +
> > +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> > +		return -ENODEV;
> > +
> > +	size = sizeof(struct iommu_nesting_info);
> > +
> > +	/*
> > +	 * if provided buffer size is not equal to the size, should
> > +	 * return 0 and also the expected buffer size to caller.
> > +	 */
> > +	if (info->size != size) {
> > +		info->size = size;
> > +		return 0;
> > +	}
> > +
> > +	/* report an empty iommu_nesting_info for now */
> > +	memset(info, 0x0, size);
> > +	info->size = size;
> > +	return 0;
> > +}
> > +
> >  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  				    enum iommu_attr attr, void *data)  { @@ -
> 3028,8 +3054,7 @@
> > static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  	case IOMMU_DOMAIN_UNMANAGED:
> >  		switch (attr) {
> >  		case DOMAIN_ATTR_NESTING:
> > -			*(int *)data = (smmu_domain->stage ==
> ARM_SMMU_DOMAIN_NESTED);
> > -			return 0;
> > +			return arm_smmu_domain_nesting_info(smmu_domain,
> data);
> >  		default:
> >  			return -ENODEV;
> >  		}
> > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index
> > 243bc4c..908607d 100644
> > --- a/drivers/iommu/arm-smmu.c
> > +++ b/drivers/iommu/arm-smmu.c
> > @@ -1506,6 +1506,32 @@ static struct iommu_group
> *arm_smmu_device_group(struct device *dev)
> >  	return group;
> >  }
> >
> > +static int arm_smmu_domain_nesting_info(struct arm_smmu_domain
> *smmu_domain,
> > +					void *data)
> > +{
> > +	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
> > +	u32 size;
> > +
> > +	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
> > +		return -ENODEV;
> > +
> > +	size = sizeof(struct iommu_nesting_info);
> > +
> > +	/*
> > +	 * if provided buffer size is not equal to the size, should
> > +	 * return 0 and also the expected buffer size to caller.
> > +	 */
> > +	if (info->size != size) {
> > +		info->size = size;
> > +		return 0;
> > +	}
> > +
> > +	/* report an empty iommu_nesting_info for now */
> > +	memset(info, 0x0, size);
> > +	info->size = size;
> > +	return 0;
> > +}
> > +
> >  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  				    enum iommu_attr attr, void *data)  { @@ -
> 1515,8 +1541,7 @@
> > static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
> >  	case IOMMU_DOMAIN_UNMANAGED:
> >  		switch (attr) {
> >  		case DOMAIN_ATTR_NESTING:
> > -			*(int *)data = (smmu_domain->stage ==
> ARM_SMMU_DOMAIN_NESTED);
> > -			return 0;
> > +			return arm_smmu_domain_nesting_info(smmu_domain,
> data);
> >  		default:
> >  			return -ENODEV;
> >  		}
> > diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > index 1afc661..898c99a 100644
> > --- a/include/uapi/linux/iommu.h
> > +++ b/include/uapi/linux/iommu.h
> > @@ -332,4 +332,63 @@ struct iommu_gpasid_bind_data {
> >  	} vendor;
> >  };
> >
> > +/*
> > + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> > + *				user space should check it before using
> > + *				nesting capability.
> > + *
> > + * @size:	size of the whole structure
> > + * @format:	PASID table entry format, the same definition with
> > + *		@format of struct iommu_gpasid_bind_data.
> > + * @features:	supported nesting features.
> > + * @flags:	currently reserved for future extension.
> > + * @data:	vendor specific cap info.
> > + *
> > + * +---------------+----------------------------------------------------+
> > + * | feature       |  Notes                                             |
> > + *
> >
> ++===============+============================================
> ========
> > ++
> > + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> > + * |               |  in the system should be allocated by host kernel  |
> > + * +---------------+----------------------------------------------------+
> > + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> > + * |               |  either be a host PASID passed in bind request or  |
> > + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> > + * +---------------+----------------------------------------------------+
> > + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> > + * +---------------+----------------------------------------------------+
> 
> Agree with the previous comments on these descriptions and Kevin's suggestions.

I see. will follow the suggestions.

> > + *
> > + */
> > +struct iommu_nesting_info {
> > +	__u32	size;
> > +	__u32	format;
> > +	__u32	features;
> > +#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
> > +#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
> > +#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
> > +	__u32	flags;
> > +	__u8	data[];
> 
> How does the user determine which vendor structure is provided in data[]?

It can be deduced from the @format field, which follows the same definition
as @format in struct iommu_gpasid_bind_data:

struct iommu_gpasid_bind_data {
        __u32 argsz;
#define IOMMU_GPASID_BIND_VERSION_1     1
        __u32 version;
#define IOMMU_PASID_FORMAT_INTEL_VTD    1
        __u32 format;
#define IOMMU_SVA_GPASID_VAL    (1 << 0) /* guest PASID valid */
        __u64 flags;
        __u64 gpgd;
        __u64 hpasid;
        __u64 gpasid;
        __u32 addr_width;
        __u8  padding[12];
        /* Vendor specific data */
        union {
                struct iommu_gpasid_bind_data_vtd vtd;
        } vendor;
};

Regards,
Yi Liu

> Thanks,
> 
> Alex
> 
> > +};
> > +
> > +/*
> > + * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
> > + *
> > + *
> > + * @flags:	VT-d specific flags. Currently reserved for future
> > + *		extension.
> > + * @addr_width:	The output addr width of first level/stage translation
> > + * @pasid_bits:	Maximum supported PASID bits, 0 represents no PASID
> > + *		support.
> > + * @cap_reg:	Describe basic capabilities as defined in VT-d capability
> > + *		register.
> > + * @ecap_reg:	Describe the extended capabilities as defined in VT-d
> > + *		extended capability register.
> > + */
> > +struct iommu_nesting_info_vtd {
> > +	__u32	flags;
> > +	__u16	addr_width;
> > +	__u16	pasid_bits;
> > +	__u64	cap_reg;
> > +	__u64	ecap_reg;
> > +};
> > +
> >  #endif /* _UAPI_IOMMU_H */
Stefan Hajnoczi July 3, 2020, 9:59 a.m. UTC | #12
On Tue, Jun 30, 2020 at 02:00:49AM +0000, Tian, Kevin wrote:
> > From: Liu, Yi L <yi.l.liu@intel.com>
> > Sent: Monday, June 29, 2020 8:23 PM
> > 
> > Hi Stefan,
> > 
> > > From: Stefan Hajnoczi <stefanha@gmail.com>
> > > Sent: Monday, June 29, 2020 5:25 PM
> > >
> > > On Wed, Jun 24, 2020 at 01:55:15AM -0700, Liu Yi L wrote:
> > > > +/*
> > > > + * struct iommu_nesting_info - Information for nesting-capable IOMMU.
> > > > + *				user space should check it before using
> > > > + *				nesting capability.
> > > > + *
> > > > + * @size:	size of the whole structure
> > > > + * @format:	PASID table entry format, the same definition with
> > > > + *		@format of struct iommu_gpasid_bind_data.
> > > > + * @features:	supported nesting features.
> > > > + * @flags:	currently reserved for future extension.
> > > > + * @data:	vendor specific cap info.
> > > > + *
> > > > + * +---------------+----------------------------------------------------+
> > > > + * | feature       |  Notes                                             |
> > > > + * +===============+====================================================+
> > > > + * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
> > > > + * |               |  in the system should be allocated by host kernel  |
> > > > + * +---------------+----------------------------------------------------+
> > > > + * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
> > > > + * |               |  either be a host PASID passed in bind request or  |
> > > > + * |               |  default PASIDs (e.g. default PASID of aux-domain) |
> > > > + * +---------------+----------------------------------------------------+
> > > > + * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
> > > > + * +---------------+----------------------------------------------------+
> > >
> > > This feature description is vague about what CACHE_INVLD does and how to
> > > use it. If I understand correctly, the presence of this feature means
> > > that VFIO_IOMMU_NESTING_OP_CACHE_INVLD must be used?
> > >
> > > The same kind of clarification could be done for SYSWIDE_PASID and
> > > BIND_PGTBL too.
> > 
> > For SYSWIDE_PASID and BIND_PGTBL, yes, presence of the feature bit
> > means it must be used. So the two are requirements on user space if it
> > wants to set up nesting. For CACHE_INVLD, it is more an indication of
> > availability. How about removing CACHE_INVLD, as presence of BIND_PGTBL
> > should indicate support of CACHE_INVLD?
> > 
> 
> So far this assumption is correct but it may not be true when thinking forward.
> For example, a vendor might find a way to allow the owner of 1st-level page
> table to directly invalidate cache w/o going through host IOMMU driver. From
> this angle I feel explicitly reporting this capability is more robust.
> 
> Regarding to the description, what about below?
> 
> --
> SYSWIDE_PASID: PASIDs are managed system-wide, instead of per device.
> When a device is assigned to userspace or a VM, the proper uAPI (provided by
> the userspace driver framework, e.g. VFIO) must be used to allocate/free
> PASIDs for the assigned device.
> 
> BIND_PGTBL: The owner of the first-level/stage-1 page table must explicitly
> bind the page table to the associated PASID (either the one specified in the
> bind request or the default PASID of the iommu domain), through
> VFIO_IOMMU_NESTING_OP.
> 
> CACHE_INVLD: The owner of the first-level/stage-1 page table must
> explicitly invalidate the IOMMU cache through VFIO_IOMMU_NESTING_OP,
> according to vendor-specific requirement when changing the page table.
> --

Mentioning the API to allocate/free PASIDs and VFIO_IOMMU_NESTING_OP has
made this clearer. This lets someone reading the documentation know
where to look for further information on using these features.

Thank you!

Stefan
diff mbox series

Patch

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index f578677..0c45d4d 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -3019,6 +3019,32 @@  static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
+static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
+					void *data)
+{
+	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
+	u32 size;
+
+	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+		return -ENODEV;
+
+	size = sizeof(struct iommu_nesting_info);
+
+	/*
+	 * if provided buffer size is not equal to the size, should
+	 * return 0 and also the expected buffer size to caller.
+	 */
+	if (info->size != size) {
+		info->size = size;
+		return 0;
+	}
+
+	/* report an empty iommu_nesting_info for now */
+	memset(info, 0x0, size);
+	info->size = size;
+	return 0;
+}
+
 static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 				    enum iommu_attr attr, void *data)
 {
@@ -3028,8 +3054,7 @@  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 	case IOMMU_DOMAIN_UNMANAGED:
 		switch (attr) {
 		case DOMAIN_ATTR_NESTING:
-			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
-			return 0;
+			return arm_smmu_domain_nesting_info(smmu_domain, data);
 		default:
 			return -ENODEV;
 		}
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 243bc4c..908607d 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -1506,6 +1506,32 @@  static struct iommu_group *arm_smmu_device_group(struct device *dev)
 	return group;
 }
 
+static int arm_smmu_domain_nesting_info(struct arm_smmu_domain *smmu_domain,
+					void *data)
+{
+	struct iommu_nesting_info *info = (struct iommu_nesting_info *) data;
+	u32 size;
+
+	if (!info || smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+		return -ENODEV;
+
+	size = sizeof(struct iommu_nesting_info);
+
+	/*
+	 * if provided buffer size is not equal to the size, should
+	 * return 0 and also the expected buffer size to caller.
+	 */
+	if (info->size != size) {
+		info->size = size;
+		return 0;
+	}
+
+	/* report an empty iommu_nesting_info for now */
+	memset(info, 0x0, size);
+	info->size = size;
+	return 0;
+}
+
 static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 				    enum iommu_attr attr, void *data)
 {
@@ -1515,8 +1541,7 @@  static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 	case IOMMU_DOMAIN_UNMANAGED:
 		switch (attr) {
 		case DOMAIN_ATTR_NESTING:
-			*(int *)data = (smmu_domain->stage == ARM_SMMU_DOMAIN_NESTED);
-			return 0;
+			return arm_smmu_domain_nesting_info(smmu_domain, data);
 		default:
 			return -ENODEV;
 		}
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 1afc661..898c99a 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -332,4 +332,63 @@  struct iommu_gpasid_bind_data {
 	} vendor;
 };
 
+/*
+ * struct iommu_nesting_info - Information for nesting-capable IOMMU.
+ *				user space should check it before using
+ *				nesting capability.
+ *
+ * @size:	size of the whole structure
+ * @format:	PASID table entry format, the same definition with
+ *		@format of struct iommu_gpasid_bind_data.
+ * @features:	supported nesting features.
+ * @flags:	currently reserved for future extension.
+ * @data:	vendor specific cap info.
+ *
+ * +---------------+----------------------------------------------------+
+ * | feature       |  Notes                                             |
+ * +===============+====================================================+
+ * | SYSWIDE_PASID |  Kernel manages PASID in system wide, PASIDs used  |
+ * |               |  in the system should be allocated by host kernel  |
+ * +---------------+----------------------------------------------------+
+ * | BIND_PGTBL    |  bind page tables to host PASID, the PASID could   |
+ * |               |  either be a host PASID passed in bind request or  |
+ * |               |  default PASIDs (e.g. default PASID of aux-domain) |
+ * +---------------+----------------------------------------------------+
+ * | CACHE_INVLD   |  mandatory feature for nesting capable IOMMU       |
+ * +---------------+----------------------------------------------------+
+ *
+ */
+struct iommu_nesting_info {
+	__u32	size;
+	__u32	format;
+	__u32	features;
+#define IOMMU_NESTING_FEAT_SYSWIDE_PASID	(1 << 0)
+#define IOMMU_NESTING_FEAT_BIND_PGTBL		(1 << 1)
+#define IOMMU_NESTING_FEAT_CACHE_INVLD		(1 << 2)
+	__u32	flags;
+	__u8	data[];
+};
+
+/*
+ * struct iommu_nesting_info_vtd - Intel VT-d specific nesting info
+ *
+ *
+ * @flags:	VT-d specific flags. Currently reserved for future
+ *		extension.
+ * @addr_width:	The output addr width of first level/stage translation
+ * @pasid_bits:	Maximum supported PASID bits, 0 represents no PASID
+ *		support.
+ * @cap_reg:	Describe basic capabilities as defined in VT-d capability
+ *		register.
+ * @ecap_reg:	Describe the extended capabilities as defined in VT-d
+ *		extended capability register.
+ */
+struct iommu_nesting_info_vtd {
+	__u32	flags;
+	__u16	addr_width;
+	__u16	pasid_bits;
+	__u64	cap_reg;
+	__u64	ecap_reg;
+};
+
 #endif /* _UAPI_IOMMU_H */