[v6,09/22] vfio: VFIO_IOMMU_BIND/UNBIND_MSI

Message ID 20190317172232.1068-10-eric.auger@redhat.com (mailing list archive)
State New, archived
Series SMMUv3 Nested Stage Setup

Commit Message

Eric Auger March 17, 2019, 5:22 p.m. UTC
This patch adds the VFIO_IOMMU_BIND/UNBIND_MSI ioctls, which aim
to pass/withdraw the guest MSI binding to/from the host.

Signed-off-by: Eric Auger <eric.auger@redhat.com>

---
v3 -> v4:
- add UNBIND
- unwind on BIND error

v2 -> v3:
- adapt to new proto of bind_guest_msi
- directly use vfio_iommu_for_each_dev

v1 -> v2:
- s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
---
 drivers/vfio/vfio_iommu_type1.c | 58 +++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h       | 29 +++++++++++++++++
 2 files changed, 87 insertions(+)
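
For illustration only (this is not part of the series): a hedged sketch of
how a userspace caller such as QEMU might invoke the new BIND ioctl,
assuming the series is applied so that <linux/vfio.h> carries the new
definitions; "container_fd" is an invented name for an open VFIO type1
container file descriptor.

#include <linux/vfio.h>
#include <sys/ioctl.h>

static int bind_guest_msi(int container_fd, __u64 giova, __u64 gdb, __u64 size)
{
	struct vfio_iommu_type1_bind_msi bind = {
		.argsz = sizeof(bind),
		.flags = 0,
		.iova  = giova,	/* gIOVA the guest programs into the device */
		.gpa   = gdb,	/* guest doorbell address (stage 1 output) */
		.size  = size,	/* size of the doorbell window */
	};

	return ioctl(container_fd, VFIO_IOMMU_BIND_MSI, &bind);
}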

Comments

Alex Williamson March 21, 2019, 11:01 p.m. UTC | #1
On Sun, 17 Mar 2019 18:22:19 +0100
Eric Auger <eric.auger@redhat.com> wrote:

> This patch adds the VFIO_IOMMU_BIND/UNBIND_MSI ioctl which aim
> to pass/withdraw the guest MSI binding to/from the host.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> 
> ---
> v3 -> v4:
> - add UNBIND
> - unwind on BIND error
> 
> v2 -> v3:
> - adapt to new proto of bind_guest_msi
> - directly use vfio_iommu_for_each_dev
> 
> v1 -> v2:
> - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
> ---
>  drivers/vfio/vfio_iommu_type1.c | 58 +++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       | 29 +++++++++++++++++
>  2 files changed, 87 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 12a40b9db6aa..66513679081b 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1710,6 +1710,25 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
>  	return iommu_cache_invalidate(d, dev, &ustruct->info);
>  }
>  
> +static int vfio_bind_msi_fn(struct device *dev, void *data)
> +{
> +	struct vfio_iommu_type1_bind_msi *ustruct =
> +		(struct vfio_iommu_type1_bind_msi *)data;
> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
> +
> +	return iommu_bind_guest_msi(d, dev, ustruct->iova,
> +				    ustruct->gpa, ustruct->size);
> +}
> +
> +static int vfio_unbind_msi_fn(struct device *dev, void *data)
> +{
> +	dma_addr_t *iova = (dma_addr_t *)data;
> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);

Same as previous, we can encapsulate domain in our own struct to avoid
a lookup.
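
A minimal sketch of that suggestion (hypothetical names, in the context of
vfio_iommu_type1.c): the ioctl handler resolves the domain once and passes
it through the callback data, so the per-device function no longer calls
iommu_get_domain_for_dev():

/* Hedged sketch only, not the actual series code. */
struct vfio_msi_bind_data {
	struct iommu_domain *domain;			/* resolved once */
	struct vfio_iommu_type1_bind_msi *ustruct;	/* user arguments */
};

static int vfio_bind_msi_fn(struct device *dev, void *data)
{
	struct vfio_msi_bind_data *bind = data;

	return iommu_bind_guest_msi(bind->domain, dev, bind->ustruct->iova,
				    bind->ustruct->gpa, bind->ustruct->size);
}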

> +
> +	iommu_unbind_guest_msi(d, dev, *iova);

Is it strange that iommu-core is exposing these interfaces at a device
level if every one of them requires us to walk all the devices?  Thanks,

Alex

> +	return 0;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -1814,6 +1833,45 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  					      vfio_cache_inv_fn);
>  		mutex_unlock(&iommu->lock);
>  		return ret;
> +	} else if (cmd == VFIO_IOMMU_BIND_MSI) {
> +		struct vfio_iommu_type1_bind_msi ustruct;
> +		int ret;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_bind_msi,
> +				    size);
> +
> +		if (copy_from_user(&ustruct, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (ustruct.argsz < minsz || ustruct.flags)
> +			return -EINVAL;
> +
> +		mutex_lock(&iommu->lock);
> +		ret = vfio_iommu_for_each_dev(iommu, &ustruct,
> +					      vfio_bind_msi_fn);
> +		if (ret)
> +			vfio_iommu_for_each_dev(iommu, &ustruct.iova,
> +						vfio_unbind_msi_fn);
> +		mutex_unlock(&iommu->lock);
> +		return ret;
> +	} else if (cmd == VFIO_IOMMU_UNBIND_MSI) {
> +		struct vfio_iommu_type1_unbind_msi ustruct;
> +		int ret;
> +
> +		minsz = offsetofend(struct vfio_iommu_type1_unbind_msi,
> +				    iova);
> +
> +		if (copy_from_user(&ustruct, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (ustruct.argsz < minsz || ustruct.flags)
> +			return -EINVAL;
> +
> +		mutex_lock(&iommu->lock);
> +		ret = vfio_iommu_for_each_dev(iommu, &ustruct.iova,
> +					      vfio_unbind_msi_fn);
> +		mutex_unlock(&iommu->lock);
> +		return ret;
>  	}
>  
>  	return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 29f0ef2d805d..6763389b6adc 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -789,6 +789,35 @@ struct vfio_iommu_type1_cache_invalidate {
>  };
>  #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 24)
>  
> +/**
> + * VFIO_IOMMU_BIND_MSI - _IOWR(VFIO_TYPE, VFIO_BASE + 25,
> + *			struct vfio_iommu_type1_bind_msi)
> + *
> + * Pass a stage 1 MSI doorbell mapping to the host so that this
> + * latter can build a nested stage2 mapping
> + */
> +struct vfio_iommu_type1_bind_msi {
> +	__u32   argsz;
> +	__u32   flags;
> +	__u64	iova;
> +	__u64	gpa;
> +	__u64	size;
> +};
> +#define VFIO_IOMMU_BIND_MSI      _IO(VFIO_TYPE, VFIO_BASE + 25)
> +
> +/**
> + * VFIO_IOMMU_UNBIND_MSI - _IOWR(VFIO_TYPE, VFIO_BASE + 26,
> + *			struct vfio_iommu_type1_unbind_msi)
> + *
> + * Unregister an MSI mapping
> + */
> +struct vfio_iommu_type1_unbind_msi {
> +	__u32   argsz;
> +	__u32   flags;
> +	__u64	iova;
> +};
> +#define VFIO_IOMMU_UNBIND_MSI      _IO(VFIO_TYPE, VFIO_BASE + 26)
> +
>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>  
>  /*
Eric Auger March 22, 2019, 9:30 a.m. UTC | #2
Hi Alex,
On 3/22/19 12:01 AM, Alex Williamson wrote:
> On Sun, 17 Mar 2019 18:22:19 +0100
> Eric Auger <eric.auger@redhat.com> wrote:
> 
>> This patch adds the VFIO_IOMMU_BIND/UNBIND_MSI ioctl which aim
>> to pass/withdraw the guest MSI binding to/from the host.
>>
>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>
>> ---
>> v3 -> v4:
>> - add UNBIND
>> - unwind on BIND error
>>
>> v2 -> v3:
>> - adapt to new proto of bind_guest_msi
>> - directly use vfio_iommu_for_each_dev
>>
>> v1 -> v2:
>> - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 58 +++++++++++++++++++++++++++++++++
>>  include/uapi/linux/vfio.h       | 29 +++++++++++++++++
>>  2 files changed, 87 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 12a40b9db6aa..66513679081b 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -1710,6 +1710,25 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
>>  	return iommu_cache_invalidate(d, dev, &ustruct->info);
>>  }
>>  
>> +static int vfio_bind_msi_fn(struct device *dev, void *data)
>> +{
>> +	struct vfio_iommu_type1_bind_msi *ustruct =
>> +		(struct vfio_iommu_type1_bind_msi *)data;
>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
>> +
>> +	return iommu_bind_guest_msi(d, dev, ustruct->iova,
>> +				    ustruct->gpa, ustruct->size);
>> +}
>> +
>> +static int vfio_unbind_msi_fn(struct device *dev, void *data)
>> +{
>> +	dma_addr_t *iova = (dma_addr_t *)data;
>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
> 
> Same as previous, we can encapsulate domain in our own struct to avoid
> a lookup.
> 
>> +
>> +	iommu_unbind_guest_msi(d, dev, *iova);
> 
> Is it strange that iommu-core is exposing these interfaces at a device
> level if every one of them requires us to walk all the devices?  Thanks,

Hum, this per-device API was devised in response to Robin's comments on

[RFC v2 12/20] dma-iommu: Implement NESTED_MSI cookie.

"
But that then seems to reveal a somewhat bigger problem - if the callers
are simply registering IPAs, and relying on the ITS driver to grab an
entry and fill in a PA later, then how does either one know *which* PA
is supposed to belong to a given IPA in the case where you have multiple
devices with different ITS targets assigned to the same guest? (and if
it's possible to assume a guest will use per-device stage 1 mappings and
present it with a single vITS backed by multiple pITSes, I think things
start breaking even harder.)
"

However, looking back at the problem, I wonder whether there really was
an issue with the iommu_domain based API.

If my understanding is correct, when assigned devices are protected by a
vIOMMU then they necessarily end up in separate host iommu domains even
if they belong to the same iommu_domain on the guest. And there can then
only be a single device in each such host iommu_domain.

If this is confirmed, there is an unambiguous association between 1
physical iommu_domain, 1 device, 1 S1 mapping and 1 physical MSI
controller.

I added the device handle to disambiguate those associations. The
gIOVA -> gDB mapping is associated with a device handle. Then when the
host needs the stage 1 mapping for this device, to build the nested
mapping towards the physical DB, it can easily grab the gIOVA -> gDB
stage 1 mapping registered for this device.

The correctness looks more obvious to me, at least.
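
To make that association concrete, here is a hedged sketch of the kind of
per-device record this implies (kernel-side fragment relying on
<linux/device.h>, <linux/list.h> and <linux/types.h>; the names are
illustrative, not the series' actual structures):

/* Illustrative only: one registered gIOVA -> gDB stage 1 MSI binding,
 * keyed by the device handle, as described above. */
struct guest_msi_binding {
	struct device		*dev;	/* assigned device owning the binding */
	dma_addr_t		giova;	/* gIOVA programmed into the device */
	phys_addr_t		gdb;	/* guest doorbell address (gDB) */
	size_t			size;	/* size of the doorbell window */
	struct list_head	next;	/* bindings registered in the domain */
};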

Thanks

Eric

> 
> Alex
> 
>> +	return 0;
>> +}
>> +
>>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>>  				   unsigned int cmd, unsigned long arg)
>>  {
>> @@ -1814,6 +1833,45 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>  					      vfio_cache_inv_fn);
>>  		mutex_unlock(&iommu->lock);
>>  		return ret;
>> +	} else if (cmd == VFIO_IOMMU_BIND_MSI) {
>> +		struct vfio_iommu_type1_bind_msi ustruct;
>> +		int ret;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_bind_msi,
>> +				    size);
>> +
>> +		if (copy_from_user(&ustruct, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (ustruct.argsz < minsz || ustruct.flags)
>> +			return -EINVAL;
>> +
>> +		mutex_lock(&iommu->lock);
>> +		ret = vfio_iommu_for_each_dev(iommu, &ustruct,
>> +					      vfio_bind_msi_fn);
>> +		if (ret)
>> +			vfio_iommu_for_each_dev(iommu, &ustruct.iova,
>> +						vfio_unbind_msi_fn);
>> +		mutex_unlock(&iommu->lock);
>> +		return ret;
>> +	} else if (cmd == VFIO_IOMMU_UNBIND_MSI) {
>> +		struct vfio_iommu_type1_unbind_msi ustruct;
>> +		int ret;
>> +
>> +		minsz = offsetofend(struct vfio_iommu_type1_unbind_msi,
>> +				    iova);
>> +
>> +		if (copy_from_user(&ustruct, (void __user *)arg, minsz))
>> +			return -EFAULT;
>> +
>> +		if (ustruct.argsz < minsz || ustruct.flags)
>> +			return -EINVAL;
>> +
>> +		mutex_lock(&iommu->lock);
>> +		ret = vfio_iommu_for_each_dev(iommu, &ustruct.iova,
>> +					      vfio_unbind_msi_fn);
>> +		mutex_unlock(&iommu->lock);
>> +		return ret;
>>  	}
>>  
>>  	return -ENOTTY;
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 29f0ef2d805d..6763389b6adc 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -789,6 +789,35 @@ struct vfio_iommu_type1_cache_invalidate {
>>  };
>>  #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 24)
>>  
>> +/**
>> + * VFIO_IOMMU_BIND_MSI - _IOWR(VFIO_TYPE, VFIO_BASE + 25,
>> + *			struct vfio_iommu_type1_bind_msi)
>> + *
>> + * Pass a stage 1 MSI doorbell mapping to the host so that this
>> + * latter can build a nested stage2 mapping
>> + */
>> +struct vfio_iommu_type1_bind_msi {
>> +	__u32   argsz;
>> +	__u32   flags;
>> +	__u64	iova;
>> +	__u64	gpa;
>> +	__u64	size;
>> +};
>> +#define VFIO_IOMMU_BIND_MSI      _IO(VFIO_TYPE, VFIO_BASE + 25)
>> +
>> +/**
>> + * VFIO_IOMMU_UNBIND_MSI - _IOWR(VFIO_TYPE, VFIO_BASE + 26,
>> + *			struct vfio_iommu_type1_unbind_msi)
>> + *
>> + * Unregister an MSI mapping
>> + */
>> +struct vfio_iommu_type1_unbind_msi {
>> +	__u32   argsz;
>> +	__u32   flags;
>> +	__u64	iova;
>> +};
>> +#define VFIO_IOMMU_UNBIND_MSI      _IO(VFIO_TYPE, VFIO_BASE + 26)
>> +
>>  /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>>  
>>  /*
>
Alex Williamson March 22, 2019, 10:09 p.m. UTC | #3
On Fri, 22 Mar 2019 10:30:02 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Alex,
> On 3/22/19 12:01 AM, Alex Williamson wrote:
> > On Sun, 17 Mar 2019 18:22:19 +0100
> > Eric Auger <eric.auger@redhat.com> wrote:
> >   
> >> This patch adds the VFIO_IOMMU_BIND/UNBIND_MSI ioctl which aim
> >> to pass/withdraw the guest MSI binding to/from the host.
> >>
> >> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>
> >> ---
> >> v3 -> v4:
> >> - add UNBIND
> >> - unwind on BIND error
> >>
> >> v2 -> v3:
> >> - adapt to new proto of bind_guest_msi
> >> - directly use vfio_iommu_for_each_dev
> >>
> >> v1 -> v2:
> >> - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 58 +++++++++++++++++++++++++++++++++
> >>  include/uapi/linux/vfio.h       | 29 +++++++++++++++++
> >>  2 files changed, 87 insertions(+)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 12a40b9db6aa..66513679081b 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -1710,6 +1710,25 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
> >>  	return iommu_cache_invalidate(d, dev, &ustruct->info);
> >>  }
> >>  
> >> +static int vfio_bind_msi_fn(struct device *dev, void *data)
> >> +{
> >> +	struct vfio_iommu_type1_bind_msi *ustruct =
> >> +		(struct vfio_iommu_type1_bind_msi *)data;
> >> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
> >> +
> >> +	return iommu_bind_guest_msi(d, dev, ustruct->iova,
> >> +				    ustruct->gpa, ustruct->size);
> >> +}
> >> +
> >> +static int vfio_unbind_msi_fn(struct device *dev, void *data)
> >> +{
> >> +	dma_addr_t *iova = (dma_addr_t *)data;
> >> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);  
> > 
> > Same as previous, we can encapsulate domain in our own struct to avoid
> > a lookup.
> >   
> >> +
> >> +	iommu_unbind_guest_msi(d, dev, *iova);  
> > 
> > Is it strange that iommu-core is exposing these interfaces at a device
> > level if every one of them requires us to walk all the devices?  Thanks,  
> 
> Hum this per device API was devised in response of Robin's comments on
> 
> [RFC v2 12/20] dma-iommu: Implement NESTED_MSI cookie.
> 
> "
> But that then seems to reveal a somewhat bigger problem - if the callers
> are simply registering IPAs, and relying on the ITS driver to grab an
> entry and fill in a PA later, then how does either one know *which* PA
> is supposed to belong to a given IPA in the case where you have multiple
> devices with different ITS targets assigned to the same guest? (and if
> it's possible to assume a guest will use per-device stage 1 mappings and
> present it with a single vITS backed by multiple pITSes, I think things
> start breaking even harder.)
> "
> 
> However looking back into the problem I wonder if there was an issue
> with the iommu_domain based API.
> 
> If my understanding is correct, when assigned devices are protected by a
> vIOMMU then they necessarily end up in separate host iommu domains even
> if they belong to the same iommu_domain on the guest. And there can only
> be a single device in this iommu_domain.

Don't forget that a container represents the IOMMU context in a vfio
environment; groups are associated with containers, and a group may
contain one or more devices.  When a vIOMMU comes into play, we still
only have an IOMMU context per container.  If we have multiple devices
in a group, we run into problems with vIOMMU.  We can resolve this by
requiring that the user ignore all but one device in the group,
or making sure that the devices in the group have the same IOMMU
context.  The latter we could do in QEMU if PCIe-to-PCI bridges there
masked the per-device address space as they do on real hardware (i.e.
there is no requester ID on conventional PCI, all transactions appear to
the IOMMU with the bridge requester ID).  So I raise this question
because vfio's minimum domain granularity is a group.

> If this is confirmed, there is a non ambiguous association between 1
> physical iommu_domain, 1 device, 1 S1 mapping and 1 physical MSI
> controller.
> 
> I added the device handle handle to disambiguate those associations. The
> gIOVA ->gDB mapping is associated with a device handle. Then when the
> host needs a stage 1 mapping for this device, to build the nested
> mapping towards the physical DB it can easily grab the gIOVA->gDB stage
> 1 mapping registered for this device.
> 
> The correctness looks more obvious to me, at least.

Except all devices within all groups within the same container
necessarily share the same IOMMU context, so from that perspective, it
appears to impose non-trivial redundancy on the caller.  Thanks,

Alex
Eric Auger April 3, 2019, 2:30 p.m. UTC | #4
Hi Alex,

On 3/22/19 11:09 PM, Alex Williamson wrote:
> On Fri, 22 Mar 2019 10:30:02 +0100
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Alex,
>> On 3/22/19 12:01 AM, Alex Williamson wrote:
>>> On Sun, 17 Mar 2019 18:22:19 +0100
>>> Eric Auger <eric.auger@redhat.com> wrote:
>>>   
>>>> This patch adds the VFIO_IOMMU_BIND/UNBIND_MSI ioctl which aim
>>>> to pass/withdraw the guest MSI binding to/from the host.
>>>>
>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>
>>>> ---
>>>> v3 -> v4:
>>>> - add UNBIND
>>>> - unwind on BIND error
>>>>
>>>> v2 -> v3:
>>>> - adapt to new proto of bind_guest_msi
>>>> - directly use vfio_iommu_for_each_dev
>>>>
>>>> v1 -> v2:
>>>> - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
>>>> ---
>>>>  drivers/vfio/vfio_iommu_type1.c | 58 +++++++++++++++++++++++++++++++++
>>>>  include/uapi/linux/vfio.h       | 29 +++++++++++++++++
>>>>  2 files changed, 87 insertions(+)
>>>>
>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>> index 12a40b9db6aa..66513679081b 100644
>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>> @@ -1710,6 +1710,25 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
>>>>  	return iommu_cache_invalidate(d, dev, &ustruct->info);
>>>>  }
>>>>  
>>>> +static int vfio_bind_msi_fn(struct device *dev, void *data)
>>>> +{
>>>> +	struct vfio_iommu_type1_bind_msi *ustruct =
>>>> +		(struct vfio_iommu_type1_bind_msi *)data;
>>>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
>>>> +
>>>> +	return iommu_bind_guest_msi(d, dev, ustruct->iova,
>>>> +				    ustruct->gpa, ustruct->size);
>>>> +}
>>>> +
>>>> +static int vfio_unbind_msi_fn(struct device *dev, void *data)
>>>> +{
>>>> +	dma_addr_t *iova = (dma_addr_t *)data;
>>>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);  
>>>
>>> Same as previous, we can encapsulate domain in our own struct to avoid
>>> a lookup.
>>>   
>>>> +
>>>> +	iommu_unbind_guest_msi(d, dev, *iova);  
>>>
>>> Is it strange that iommu-core is exposing these interfaces at a device
>>> level if every one of them requires us to walk all the devices?  Thanks,  
>>
>> Hum this per device API was devised in response of Robin's comments on
>>
>> [RFC v2 12/20] dma-iommu: Implement NESTED_MSI cookie.
>>
>> "
>> But that then seems to reveal a somewhat bigger problem - if the callers
>> are simply registering IPAs, and relying on the ITS driver to grab an
>> entry and fill in a PA later, then how does either one know *which* PA
>> is supposed to belong to a given IPA in the case where you have multiple
>> devices with different ITS targets assigned to the same guest? (and if
>> it's possible to assume a guest will use per-device stage 1 mappings and
>> present it with a single vITS backed by multiple pITSes, I think things
>> start breaking even harder.)
>> "
>>
>> However looking back into the problem I wonder if there was an issue
>> with the iommu_domain based API.
>>
>> If my understanding is correct, when assigned devices are protected by a
>> vIOMMU then they necessarily end up in separate host iommu domains even
>> if they belong to the same iommu_domain on the guest. And there can only
>> be a single device in this iommu_domain.
> 
> Don't forget that a container represents the IOMMU context in a vfio
> environment, groups are associated with containers and a group may
> contain one or more devices.  When a vIOMMU comes into play, we still
> only have an IOMMU context per container.  If we have multiple devices
> in a group, we run into problems with vIOMMU.  We can resolve this by
> requiring that the user ignore all but one device in the group,
> or making sure that the devices in the group have the same IOMMU
> context.  The latter we could do in QEMU if PCIe-to-PCI bridges there
> masked the per-device address space as it does on real hardware (ie.
> there is no requester ID on conventional PCI, all transactions appear to
> the IOMMU with the bridge requester ID).  So I raise this question
> because vfio's minimum domain granularity is a group.
> 
>> If this is confirmed, there is a non ambiguous association between 1
>> physical iommu_domain, 1 device, 1 S1 mapping and 1 physical MSI
>> controller.
>>
>> I added the device handle handle to disambiguate those associations. The
>> gIOVA ->gDB mapping is associated with a device handle. Then when the
>> host needs a stage 1 mapping for this device, to build the nested
>> mapping towards the physical DB it can easily grab the gIOVA->gDB stage
>> 1 mapping registered for this device.
>>
>> The correctness looks more obvious to me, at least.
> 
> Except all devices within all groups within the same container
> necessarily share the same IOMMU context, so from that perspective, it
> appears to impose non-trivial redundancy on the caller.  Thanks,

Taking into consideration the case where we could have several devices
attached to the same host iommu group, each of them possibly using
different host MSI doorbells, I think I am in trouble.

Let's assume that using the pcie-to-pci bridge trick on guest side they
end up in the same container and in the same guest iommu group.

At the moment there is a single MSI controller on the guest, so the same
gIOVA/gDB S1 mapping is going to be created by the guest iommu domain
and both devices are programmed with gIOVA. If dev0 and dev1 are
attached to different host MSI controllers, I would need to build the 2
nested bindings:
dev0: MSI nested binding: gIOVA -> gDB -> hDB0
dev1: MSI nested binding: gIOVA -> gDB -> hDB1
(on guest there is a single MSI controller at the moment)

which is not possible as the devices belong to the same host iommu group
and share the same mapping.

The solution would be to instantiate 2 MSI controllers on guest side, in
which case we would end up with
dev0: gIOVA0 -> gDB0 -> hDB0
dev1: gIOVA1 -> gDB1 -> hDB1

Isn't this somehow what we do with the IOMMU RID topology? We need to take
into account the host topology (2 devices belonging to the same group)
to force the same on guest by introducing a PCIe-to-PCI bridge. Here we
would need to say, those assigned devices are attached to different MSI
domains on host, so we need the same on guest.

Anyway, the current container based IOCTL would fail to implement that
because I would register gIOVA0 -> gDB0 and gIOVA1 -> gDB1 for each
device within the container, which would definitely fail to build the
correct association. So I think I would need a device based IOCTL anyway,
one that would aim to tell: this assigned device uses this S1 MSI binding.
All the notification mechanisms we have in qemu are based on the container,
so this would oblige us to have a device based notification mechanism.

So I wonder whether it wouldn't be sensible to restrict this use case
and say we support nested mode only if we have a single assigned device
within the container?

Thoughts?

Eric



> 
> Alex
>
Alex Williamson April 3, 2019, 5:38 p.m. UTC | #5
On Wed, 3 Apr 2019 16:30:15 +0200
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Alex,
> 
> On 3/22/19 11:09 PM, Alex Williamson wrote:
> > On Fri, 22 Mar 2019 10:30:02 +0100
> > Auger Eric <eric.auger@redhat.com> wrote:
> >   
> >> Hi Alex,
> >> On 3/22/19 12:01 AM, Alex Williamson wrote:  
> >>> On Sun, 17 Mar 2019 18:22:19 +0100
> >>> Eric Auger <eric.auger@redhat.com> wrote:
> >>>     
> >>>> This patch adds the VFIO_IOMMU_BIND/UNBIND_MSI ioctl which aim
> >>>> to pass/withdraw the guest MSI binding to/from the host.
> >>>>
> >>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> >>>>
> >>>> ---
> >>>> v3 -> v4:
> >>>> - add UNBIND
> >>>> - unwind on BIND error
> >>>>
> >>>> v2 -> v3:
> >>>> - adapt to new proto of bind_guest_msi
> >>>> - directly use vfio_iommu_for_each_dev
> >>>>
> >>>> v1 -> v2:
> >>>> - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
> >>>> ---
> >>>>  drivers/vfio/vfio_iommu_type1.c | 58 +++++++++++++++++++++++++++++++++
> >>>>  include/uapi/linux/vfio.h       | 29 +++++++++++++++++
> >>>>  2 files changed, 87 insertions(+)
> >>>>
> >>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>>> index 12a40b9db6aa..66513679081b 100644
> >>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>> @@ -1710,6 +1710,25 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
> >>>>  	return iommu_cache_invalidate(d, dev, &ustruct->info);
> >>>>  }
> >>>>  
> >>>> +static int vfio_bind_msi_fn(struct device *dev, void *data)
> >>>> +{
> >>>> +	struct vfio_iommu_type1_bind_msi *ustruct =
> >>>> +		(struct vfio_iommu_type1_bind_msi *)data;
> >>>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
> >>>> +
> >>>> +	return iommu_bind_guest_msi(d, dev, ustruct->iova,
> >>>> +				    ustruct->gpa, ustruct->size);
> >>>> +}
> >>>> +
> >>>> +static int vfio_unbind_msi_fn(struct device *dev, void *data)
> >>>> +{
> >>>> +	dma_addr_t *iova = (dma_addr_t *)data;
> >>>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);    
> >>>
> >>> Same as previous, we can encapsulate domain in our own struct to avoid
> >>> a lookup.
> >>>     
> >>>> +
> >>>> +	iommu_unbind_guest_msi(d, dev, *iova);    
> >>>
> >>> Is it strange that iommu-core is exposing these interfaces at a device
> >>> level if every one of them requires us to walk all the devices?  Thanks,    
> >>
> >> Hum this per device API was devised in response of Robin's comments on
> >>
> >> [RFC v2 12/20] dma-iommu: Implement NESTED_MSI cookie.
> >>
> >> "
> >> But that then seems to reveal a somewhat bigger problem - if the callers
> >> are simply registering IPAs, and relying on the ITS driver to grab an
> >> entry and fill in a PA later, then how does either one know *which* PA
> >> is supposed to belong to a given IPA in the case where you have multiple
> >> devices with different ITS targets assigned to the same guest? (and if
> >> it's possible to assume a guest will use per-device stage 1 mappings and
> >> present it with a single vITS backed by multiple pITSes, I think things
> >> start breaking even harder.)
> >> "
> >>
> >> However looking back into the problem I wonder if there was an issue
> >> with the iommu_domain based API.
> >>
> >> If my understanding is correct, when assigned devices are protected by a
> >> vIOMMU then they necessarily end up in separate host iommu domains even
> >> if they belong to the same iommu_domain on the guest. And there can only
> >> be a single device in this iommu_domain.  
> > 
> > Don't forget that a container represents the IOMMU context in a vfio
> > environment, groups are associated with containers and a group may
> > contain one or more devices.  When a vIOMMU comes into play, we still
> > only have an IOMMU context per container.  If we have multiple devices
> > in a group, we run into problems with vIOMMU.  We can resolve this by
> > requiring that the user ignore all but one device in the group,
> > or making sure that the devices in the group have the same IOMMU
> > context.  The latter we could do in QEMU if PCIe-to-PCI bridges there
> > masked the per-device address space as it does on real hardware (ie.
> > there is no requester ID on conventional PCI, all transactions appear to
> > the IOMMU with the bridge requester ID).  So I raise this question
> > because vfio's minimum domain granularity is a group.
> >   
> >> If this is confirmed, there is a non ambiguous association between 1
> >> physical iommu_domain, 1 device, 1 S1 mapping and 1 physical MSI
> >> controller.
> >>
> >> I added the device handle handle to disambiguate those associations. The
> >> gIOVA ->gDB mapping is associated with a device handle. Then when the
> >> host needs a stage 1 mapping for this device, to build the nested
> >> mapping towards the physical DB it can easily grab the gIOVA->gDB stage
> >> 1 mapping registered for this device.
> >>
> >> The correctness looks more obvious to me, at least.  
> > 
> > Except all devices within all groups within the same container
> > necessarily share the same IOMMU context, so from that perspective, it
> > appears to impose non-trivial redundancy on the caller.  Thanks,  
> 
> Taking into consideration the case where we could have several devices
> attached to the same host iommu group, each of them possibly using
> different host MSI doorbells, I think I am in trouble.
> 
> Let's assume that using the pcie-to-pci bridge trick on guest side they
> end up in the same container and in the same guest iommu group.
> 
> At the moment there is a single MSI controller on guest, so the same
> gIOVA/gDB S1 mapping is going to be created by the guest iommu dommain
> and both devices are programmed with gIOVA. If dev0 and dev1 are
> attached to different host MSI controllers, I would need to build the 2
> nested bindings:
> dev0: MSI nested binding: gIOVA -> gDB -> hDB0
> dev1: MSI nested binding: gIOVA -> gDB -> hDB1
> (on guest there is a single MSI controller at the moment)
> 
> which is not possible as the devices belong to the same host iommu group
> and share the same mapping.
> 
> The solution would be to instantiate 2 MSI controllers on guest side, in
> which case we would end up with
> dev0: gIOVA0 -> gDB0 -> hDB0
> dev1: gIOVA1 -> gDB1 -> hDB1
> 
> Isn't it somehow what we do with the IOMMU RID topology. We need to take
> into account the host topology (2 devices belonging to the same group)
> to force the same on guest by introducing a PCIe-to-PCI bridge. Here we
> would need to say, those assigned devices are attached to different MSI
> domains on host, so we need the same on guest.
> 
> Anyway, the current container based IOCTL would fail to implement that
> because I would register gIOVA0 -> gDB0 and gIOVA1 -> gDB1 for each
> device within the container which would definitively fail to build the
> correct association. So I think I would need anyway a device based IOTCL
> that would aim to tell: this assigned device uses this S1 MSI binding.
> All the notification mechanism we have in qemu is based on container, so
> this would obliged to have device based notification mechanism.
> 
> So I wonder whether it wouldn't be sensible to restrict this use case
> and say we support nested mode only if we have a single assigned device
> within the container?
> 
> Thoughts?

We've essentially done that with vIOMMU up to this point already; it's
not been possible to assign multiple devices from the same group to a
VM with intel-iommu, amd-iommu, or smmu due to the requirement of
separate address spaces per device.  It's only when we introduce
address space aliasing with bridges that we can even consider this
possibility, and it's a configuration which smmu doesn't properly
support even on bare metal.  I hope we can consider that to be simply a
gap in the implementation that will get fixed and not an architectural
problem.

As we discussed offline though, I wonder if we're attempting to support
more than necessary with your scenarios above.  If devices within the
same group can be verified to share a host MSI controller, do we still
have an issue mapping them to a single guest MSI controller?  When we
talked we were headed down a path where, if a group is necessarily
associated to a single IOMMU, perhaps that necessarily means that a
group is also associated to a single MSI controller.  I've since
thought of a configuration where a group could span physical IOMMU
devices, NVLink.  As essentially a secondary bus interface for a
device, NVLink can cause devices with arbitrary PCI hierarchy
connections to be non-isolated, and ideally our grouping would
know to account for that.  However, if it could be determined
that a group associates to a single MSI controller, do we still have an
issue with multiple devices within the group?

My issue with the per device interface for what is fundamentally an
IOVA mapping is that vfio does not support per device mappings.  We
support mappings at the container level, where the minimum set of
devices we can attach to a container is a group.  Therefore to create
an interface that purports to support device level mappings is not
accurate.  Maybe MSI controllers will restrict our configuration but
I'd rather not design the interface around the wrong level of mapping
granularity.  Thanks,

Alex
Eric Auger April 4, 2019, 6:55 a.m. UTC | #6
Hi Marc, Robin, Alex,

On 4/3/19 7:38 PM, Alex Williamson wrote:
> On Wed, 3 Apr 2019 16:30:15 +0200
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Alex,
>>
>> On 3/22/19 11:09 PM, Alex Williamson wrote:
>>> On Fri, 22 Mar 2019 10:30:02 +0100
>>> Auger Eric <eric.auger@redhat.com> wrote:
>>>   
>>>> Hi Alex,
>>>> On 3/22/19 12:01 AM, Alex Williamson wrote:  
>>>>> On Sun, 17 Mar 2019 18:22:19 +0100
>>>>> Eric Auger <eric.auger@redhat.com> wrote:
>>>>>     
>>>>>> This patch adds the VFIO_IOMMU_BIND/UNBIND_MSI ioctl which aim
>>>>>> to pass/withdraw the guest MSI binding to/from the host.
>>>>>>
>>>>>> Signed-off-by: Eric Auger <eric.auger@redhat.com>
>>>>>>
>>>>>> ---
>>>>>> v3 -> v4:
>>>>>> - add UNBIND
>>>>>> - unwind on BIND error
>>>>>>
>>>>>> v2 -> v3:
>>>>>> - adapt to new proto of bind_guest_msi
>>>>>> - directly use vfio_iommu_for_each_dev
>>>>>>
>>>>>> v1 -> v2:
>>>>>> - s/vfio_iommu_type1_guest_msi_binding/vfio_iommu_type1_bind_guest_msi
>>>>>> ---
>>>>>>  drivers/vfio/vfio_iommu_type1.c | 58 +++++++++++++++++++++++++++++++++
>>>>>>  include/uapi/linux/vfio.h       | 29 +++++++++++++++++
>>>>>>  2 files changed, 87 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>>>> index 12a40b9db6aa..66513679081b 100644
>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>>> @@ -1710,6 +1710,25 @@ static int vfio_cache_inv_fn(struct device *dev, void *data)
>>>>>>  	return iommu_cache_invalidate(d, dev, &ustruct->info);
>>>>>>  }
>>>>>>  
>>>>>> +static int vfio_bind_msi_fn(struct device *dev, void *data)
>>>>>> +{
>>>>>> +	struct vfio_iommu_type1_bind_msi *ustruct =
>>>>>> +		(struct vfio_iommu_type1_bind_msi *)data;
>>>>>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
>>>>>> +
>>>>>> +	return iommu_bind_guest_msi(d, dev, ustruct->iova,
>>>>>> +				    ustruct->gpa, ustruct->size);
>>>>>> +}
>>>>>> +
>>>>>> +static int vfio_unbind_msi_fn(struct device *dev, void *data)
>>>>>> +{
>>>>>> +	dma_addr_t *iova = (dma_addr_t *)data;
>>>>>> +	struct iommu_domain *d = iommu_get_domain_for_dev(dev);    
>>>>>
>>>>> Same as previous, we can encapsulate domain in our own struct to avoid
>>>>> a lookup.
>>>>>     
>>>>>> +
>>>>>> +	iommu_unbind_guest_msi(d, dev, *iova);    
>>>>>
>>>>> Is it strange that iommu-core is exposing these interfaces at a device
>>>>> level if every one of them requires us to walk all the devices?  Thanks,    
>>>>
>>>> Hum this per device API was devised in response of Robin's comments on
>>>>
>>>> [RFC v2 12/20] dma-iommu: Implement NESTED_MSI cookie.
>>>>
>>>> "
>>>> But that then seems to reveal a somewhat bigger problem - if the callers
>>>> are simply registering IPAs, and relying on the ITS driver to grab an
>>>> entry and fill in a PA later, then how does either one know *which* PA
>>>> is supposed to belong to a given IPA in the case where you have multiple
>>>> devices with different ITS targets assigned to the same guest? (and if
>>>> it's possible to assume a guest will use per-device stage 1 mappings and
>>>> present it with a single vITS backed by multiple pITSes, I think things
>>>> start breaking even harder.)
>>>> "
>>>>
>>>> However looking back into the problem I wonder if there was an issue
>>>> with the iommu_domain based API.
>>>>
>>>> If my understanding is correct, when assigned devices are protected by a
>>>> vIOMMU then they necessarily end up in separate host iommu domains even
>>>> if they belong to the same iommu_domain on the guest. And there can only
>>>> be a single device in this iommu_domain.  
>>>
>>> Don't forget that a container represents the IOMMU context in a vfio
>>> environment, groups are associated with containers and a group may
>>> contain one or more devices.  When a vIOMMU comes into play, we still
>>> only have an IOMMU context per container.  If we have multiple devices
>>> in a group, we run into problems with vIOMMU.  We can resolve this by
>>> requiring that the user ignore all but one device in the group,
>>> or making sure that the devices in the group have the same IOMMU
>>> context.  The latter we could do in QEMU if PCIe-to-PCI bridges there
>>> masked the per-device address space as it does on real hardware (ie.
>>> there is no requester ID on conventional PCI, all transactions appear to
>>> the IOMMU with the bridge requester ID).  So I raise this question
>>> because vfio's minimum domain granularity is a group.
>>>   
>>>> If this is confirmed, there is a non ambiguous association between 1
>>>> physical iommu_domain, 1 device, 1 S1 mapping and 1 physical MSI
>>>> controller.
>>>>
>>>> I added the device handle handle to disambiguate those associations. The
>>>> gIOVA ->gDB mapping is associated with a device handle. Then when the
>>>> host needs a stage 1 mapping for this device, to build the nested
>>>> mapping towards the physical DB it can easily grab the gIOVA->gDB stage
>>>> 1 mapping registered for this device.
>>>>
>>>> The correctness looks more obvious to me, at least.  
>>>
>>> Except all devices within all groups within the same container
>>> necessarily share the same IOMMU context, so from that perspective, it
>>> appears to impose non-trivial redundancy on the caller.  Thanks,  
>>
>> Taking into consideration the case where we could have several devices
>> attached to the same host iommu group, each of them possibly using
>> different host MSI doorbells, I think I am in trouble.
>>
>> Let's assume that using the pcie-to-pci bridge trick on guest side they
>> end up in the same container and in the same guest iommu group.
>>
>> At the moment there is a single MSI controller on guest, so the same
>> gIOVA/gDB S1 mapping is going to be created by the guest iommu dommain
>> and both devices are programmed with gIOVA. If dev0 and dev1 are
>> attached to different host MSI controllers, I would need to build the 2
>> nested bindings:
>> dev0: MSI nested binding: gIOVA -> gDB -> hDB0
>> dev1: MSI nested binding: gIOVA -> gDB -> hDB1
>> (on guest there is a single MSI controller at the moment)
>>
>> which is not possible as the devices belong to the same host iommu group
>> and share the same mapping.
>>
>> The solution would be to instantiate 2 MSI controllers on guest side, in
>> which case we would end up with
>> dev0: gIOVA0 -> gDB0 -> hDB0
>> dev1: gIOVA1 -> gDB1 -> hDB1
>>
>> Isn't it somehow what we do with the IOMMU RID topology. We need to take
>> into account the host topology (2 devices belonging to the same group)
>> to force the same on guest by introducing a PCIe-to-PCI bridge. Here we
>> would need to say, those assigned devices are attached to different MSI
>> domains on host, so we need the same on guest.
>>
>> Anyway, the current container based IOCTL would fail to implement that
>> because I would register gIOVA0 -> gDB0 and gIOVA1 -> gDB1 for each
>> device within the container which would definitively fail to build the
>> correct association. So I think I would need anyway a device based IOTCL
>> that would aim to tell: this assigned device uses this S1 MSI binding.
>> All the notification mechanism we have in qemu is based on container, so
>> this would obliged to have device based notification mechanism.
>>
>> So I wonder whether it wouldn't be sensible to restrict this use case
>> and say we support nested mode only if we have a single assigned device
>> within the container?
>>
>> Thoughts?
> 
> We've essentially done that with vIOMMU up to this point already, it's
> not been possible to assign multiple devices from the same group to a
> VM with intel-iommu, amd-iommu, or smmu due to the requirement of
> separate address spaces per device.  It's only when we introduce
> address space aliasing with bridges that we can even consider this
> possibility, and it's a configuration which smmu doesn't properly
> support even on bare metal.  I hope we can consider that to be simply a
> gap in the implementation that will get fixed and not an architectural
> problem.
> 
> As we discussed offline though, I wonder if we're attempting to support
> more than necessary with your scenarios above.  If devices within the
> same group can be verified to share a host MSI controller, do we still
> have an issue mapping them to a single guest MSI controller?  When we
> talked we were headed down a path that if a group is necessarily
> associated to a single IOMMU, perhaps that necessarily means that a
> group is also associated to a single MSI controller.  I've since
> thought of a configuration where a group could span physical IOMMU
> devices, NVLink.  As essentially a secondary bus interface for a
> device, NVLink can cause devices with arbitrary PCI hierarchy
> connections to be non-isolated, and ideally our grouping would
> understand to account for that.  However, if it could be determined
> that a group associates to a single MSI controller, do we still have an
> issue with multiple devices within the group?

Marc, Robin,

Do you think it is a reasonable assumption to consider that devices within
the same host iommu group share the same MSI doorbell?
> 
> My issue with the per device interface for what is fundamentally an
> IOVA mapping is that vfio does not support per device mappings.  We
> support mappings at the container level, where the minimum set of
> devices we can attach to a container is a group.  Therefore to create
> an interface that purports to support device level mappings is not
> accurate.  Maybe MSI controllers will restrict our configuration but
> I'd rather not design the interface around the wrong level of mapping
> granularity.  Thanks,

Alex,

In the nested case, the vfio container is used to set up stage 2 only,
whereas stage 1 is owned by the guest. Here the mapping we pass to
the host is a stage 1 mapping; this information is used to build the
correct S2 mapping. Then the mapping remains the same for the whole
container. If dev1 decides to use gIOVA0 it will reach hDB0. Anyway the
granularity of the mapping cannot change.

So this rather tells the host that this assigned device uses this given
gIOVA to reach a guest MSI doorbell.

Thanks

Eric
> 
> Alex
>
Vincent Stehlé April 10, 2019, 12:35 p.m. UTC | #7
On Thu, Apr 04, 2019 at 08:55:25AM +0200, Auger Eric wrote:
> Hi Marc, Robin, Alex,
(..)
> Do you think this is a reasonable assumption to consider devices within
> the same host iommu group share the same MSI doorbell?

Hi Eric,

I am not sure this assumption always holds.

Marc, Robin and Alex can correct me, but for example I think the following
topology is valid for Arm systems:

 +------------+  +------------+
 | Endpoint A |  | Endpoint B |
 +------------+  +------------+
            v     v
          /---------\
         |  Non-ACS  |
         |  Switch   |
          \---------/
               v
       +---------------+
       |     PCIe      |
       | Root Complex  |
       +---------------+
               v
         +-----------+
         |   SMMU    |
         +-----------+
               v
  +--------------------------+
  |   System interconnect    |
  +--------------------------+
        v              v
  +-----------+  +-----------+
  |   ITS A   |  |   ITS B   |
  +-----------+  +-----------+

All PCIe Endpoints and ITSes could be in the same ITS Group 0, meaning
devices could send their MSIs to any ITS in hardware.

For Linux the two PCIe Endpoints would be in the same iommu group, because
the switch in this example does not support ACS.

I think the devicetree msi-map property could be used to "map" the RID of
Endpoint A to ITS A and the RID of Endpoint B to ITS B, which would violate
the assumption.

See the monolithic example in [1], the example system in [2], appendices
D, E and F in [3] and the msi-map property in [4].

Best regards,
Vincent.

[1] https://static.docs.arm.com/100336/0102/corelink_gic600_generic_interrupt_controller_technical_reference_manual_100336_0102_00_en.pdf
[2] http://infocenter.arm.com/help/topic/com.arm.doc.den0049d/DEN0049D_IO_Remapping_Table.pdf
[3] https://static.docs.arm.com/den0029/50/Q1-DEN0029B_SBSA_5.0.pdf
[4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/pci/pci-msi.txt
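
For illustration, here is a hedged devicetree fragment (the phandles, unit
address and RID ranges are invented) showing how the msi-map property from
[4] could route the two endpoints to different ITSes, as described above:

	pcie@40000000 {
		/* msi-map = <rid-base phandle msi-base length>;
		 * Endpoint A (RID 0x100) -> ITS A,
		 * Endpoint B (RID 0x108) -> ITS B. */
		msi-map = <0x100 &its_a 0x100 0x8>,
			  <0x108 &its_b 0x108 0x8>;
	};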
Eric Auger April 10, 2019, 1:02 p.m. UTC | #8
Hi Vincent,

On 4/10/19 2:35 PM, Vincent Stehlé wrote:
> On Thu, Apr 04, 2019 at 08:55:25AM +0200, Auger Eric wrote:
>> Hi Marc, Robin, Alex,
> (..)
>> Do you think this is a reasonable assumption to consider devices within
>> the same host iommu group share the same MSI doorbell?
> 
> Hi Eric,
> 
> I am not sure this assumption always hold.
> 
> Marc, Robin and Alex can correct me, but for example I think the following
> topology is valid for Arm systems:
> 
>  +------------+  +------------+
>  | Endpoint A |  | Endpoint B |
>  +------------+  +------------+
>             v     v
>           /---------\
>          |  Non-ACS  |
>          |  Switch   |
>           \---------/
>                v
>        +---------------+
>        |     PCIe      |
>        | Root Complex  |
>        +---------------+
>                v
>          +-----------+
>          |   SMMU    |
>          +-----------+
>                v
>   +--------------------------+
>   |   System interconnect    |
>   +--------------------------+
>         v              v
>   +-----------+  +-----------+
>   |   ITS A   |  |   ITS B   |
>   +-----------+  +-----------+
> 
> All PCIe Endpoints and ITS could be in the same ITS Group 0, meaning
> devices could send their MSI at any ITS in hardware.
> 
> For Linux the two PCIe Endpoints would be in the same iommu group, because
> the switch in this example does not support ACS.
> 
> I think the devicetree msi-map property could be used to "map" the RID of
> Endpoint A to ITS A and the RID of Endpoint B to ITS B, which would violate
> the assumption.
> 
> See the monolithic example in [1], the example system in [2], appendices
> D, E and F in [3] and the msi-map property in [4].

Thank you for the review & links.

I understand the above topology is perfectly valid. Now the question is:
is it sufficiently common to care about it?

At the moment VFIO/vIOMMU assignment of devices belonging to the same
group isn't upstream yet. Work is ongoing by Alex to support it. It uses
a PCIe-to-PCI bridge on the guest side, and it looks like this topology
is not supported by the SMMUv3 driver. Then comes the trouble of using
several ITSes in nested mode.

If this topology is sufficiently rare, I propose we do not support it
in this VFIO/vIOMMU use case. In v7 I introduced a check that aims to
verify that devices attached to the same nested iommu_domain share the
same msi_domain.
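
A hedged sketch of what that kind of check could look like (the helper name
and the "msi_dom" field are invented, not the actual v7 code):

static int vfio_check_msi_domain(struct vfio_domain *vdom, struct device *dev)
{
	struct irq_domain *msi_dom = dev_get_msi_domain(dev);

	if (!vdom->msi_dom)			/* first device records it */
		vdom->msi_dom = msi_dom;
	else if (vdom->msi_dom != msi_dom)	/* mismatch: reject the attach */
		return -EINVAL;
	return 0;
}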

Thanks

Eric
> 
> Best regards,
> Vincent.
> 
> [1] https://static.docs.arm.com/100336/0102/corelink_gic600_generic_interrupt_controller_technical_reference_manual_100336_0102_00_en.pdf
> [2] http://infocenter.arm.com/help/topic/com.arm.doc.den0049d/DEN0049D_IO_Remapping_Table.pdf
> [3] https://static.docs.arm.com/den0029/50/Q1-DEN0029B_SBSA_5.0.pdf
> [4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/pci/pci-msi.txt
>
Marc Zyngier April 10, 2019, 1:15 p.m. UTC | #9
Hi Vincent,

On 10/04/2019 13:35, Vincent Stehlé wrote:
> On Thu, Apr 04, 2019 at 08:55:25AM +0200, Auger Eric wrote:
>> Hi Marc, Robin, Alex,
> (..)
>> Do you think this is a reasonable assumption to consider devices within
>> the same host iommu group share the same MSI doorbell?
> 
> Hi Eric,
> 
> I am not sure this assumption always hold.
> 
> Marc, Robin and Alex can correct me, but for example I think the following
> topology is valid for Arm systems:
> 
>  +------------+  +------------+
>  | Endpoint A |  | Endpoint B |
>  +------------+  +------------+
>             v     v
>           /---------\
>          |  Non-ACS  |
>          |  Switch   |
>           \---------/
>                v
>        +---------------+
>        |     PCIe      |
>        | Root Complex  |
>        +---------------+
>                v
>          +-----------+
>          |   SMMU    |
>          +-----------+
>                v
>   +--------------------------+
>   |   System interconnect    |
>   +--------------------------+
>         v              v
>   +-----------+  +-----------+
>   |   ITS A   |  |   ITS B   |
>   +-----------+  +-----------+
> 
> All PCIe Endpoints and ITS could be in the same ITS Group 0, meaning
> devices could send their MSI at any ITS in hardware.
> 
> For Linux the two PCIe Endpoints would be in the same iommu group, because
> the switch in this example does not support ACS.
> 
> I think the devicetree msi-map property could be used to "map" the RID of
> Endpoint A to ITS A and the RID of Endpoint B to ITS B, which would violate
> the assumption.
> 
> See the monolithic example in [1], the example system in [2], appendices
> D, E and F in [3] and the msi-map property in [4].

I think we are all in agreement that this is a possible topology. It is
just that it doesn't exist in any real-life implementation we know of
(the ITS tends to be close to the RC and not downstream of the
interconnect).

Given the complexity of what we're trying to put together, I'd rather
start with a small step which supports the commonly implemented topologies,
and later address the odd ones if they actually crop up.

Thanks,

	M.

Patch

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 12a40b9db6aa..66513679081b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1710,6 +1710,25 @@  static int vfio_cache_inv_fn(struct device *dev, void *data)
 	return iommu_cache_invalidate(d, dev, &ustruct->info);
 }
 
+static int vfio_bind_msi_fn(struct device *dev, void *data)
+{
+	struct vfio_iommu_type1_bind_msi *ustruct =
+		(struct vfio_iommu_type1_bind_msi *)data;
+	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
+
+	return iommu_bind_guest_msi(d, dev, ustruct->iova,
+				    ustruct->gpa, ustruct->size);
+}
+
+static int vfio_unbind_msi_fn(struct device *dev, void *data)
+{
+	dma_addr_t *iova = (dma_addr_t *)data;
+	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
+
+	iommu_unbind_guest_msi(d, dev, *iova);
+	return 0;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -1814,6 +1833,45 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 					      vfio_cache_inv_fn);
 		mutex_unlock(&iommu->lock);
 		return ret;
+	} else if (cmd == VFIO_IOMMU_BIND_MSI) {
+		struct vfio_iommu_type1_bind_msi ustruct;
+		int ret;
+
+		minsz = offsetofend(struct vfio_iommu_type1_bind_msi,
+				    size);
+
+		if (copy_from_user(&ustruct, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (ustruct.argsz < minsz || ustruct.flags)
+			return -EINVAL;
+
+		mutex_lock(&iommu->lock);
+		ret = vfio_iommu_for_each_dev(iommu, &ustruct,
+					      vfio_bind_msi_fn);
+		if (ret)
+			vfio_iommu_for_each_dev(iommu, &ustruct.iova,
+						vfio_unbind_msi_fn);
+		mutex_unlock(&iommu->lock);
+		return ret;
+	} else if (cmd == VFIO_IOMMU_UNBIND_MSI) {
+		struct vfio_iommu_type1_unbind_msi ustruct;
+		int ret;
+
+		minsz = offsetofend(struct vfio_iommu_type1_unbind_msi,
+				    iova);
+
+		if (copy_from_user(&ustruct, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (ustruct.argsz < minsz || ustruct.flags)
+			return -EINVAL;
+
+		mutex_lock(&iommu->lock);
+		ret = vfio_iommu_for_each_dev(iommu, &ustruct.iova,
+					      vfio_unbind_msi_fn);
+		mutex_unlock(&iommu->lock);
+		return ret;
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 29f0ef2d805d..6763389b6adc 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -789,6 +789,35 @@  struct vfio_iommu_type1_cache_invalidate {
 };
 #define VFIO_IOMMU_CACHE_INVALIDATE      _IO(VFIO_TYPE, VFIO_BASE + 24)
 
+/**
+ * VFIO_IOMMU_BIND_MSI - _IOWR(VFIO_TYPE, VFIO_BASE + 25,
+ *			struct vfio_iommu_type1_bind_msi)
+ *
+ * Pass a stage 1 MSI doorbell mapping to the host so that this
+ * latter can build a nested stage2 mapping
+ */
+struct vfio_iommu_type1_bind_msi {
+	__u32   argsz;
+	__u32   flags;
+	__u64	iova;
+	__u64	gpa;
+	__u64	size;
+};
+#define VFIO_IOMMU_BIND_MSI      _IO(VFIO_TYPE, VFIO_BASE + 25)
+
+/**
+ * VFIO_IOMMU_UNBIND_MSI - _IOWR(VFIO_TYPE, VFIO_BASE + 26,
+ *			struct vfio_iommu_type1_unbind_msi)
+ *
+ * Unregister an MSI mapping
+ */
+struct vfio_iommu_type1_unbind_msi {
+	__u32   argsz;
+	__u32   flags;
+	__u64	iova;
+};
+#define VFIO_IOMMU_UNBIND_MSI      _IO(VFIO_TYPE, VFIO_BASE + 26)
+
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*