diff mbox series

[v2,04/19] iommufd: Allow pt_id to carry viommu_id for IOMMU_HWPT_ALLOC

Message ID 2d469a5279ef05820d5993df752d32239878338d.1724776335.git.nicolinc@nvidia.com (mailing list archive)
State New, archived
Headers show
Series iommufd: Add VIOMMU infrastructure (Part-1) | expand

Commit Message

Nicolin Chen Aug. 27, 2024, 4:59 p.m. UTC
Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act like
a nested parent HWPT to allocate a nested HWPT.

Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc.

Also, associate a viommu with the nested HWPT being allocated.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 24 ++++++++++++++++++++++--
 drivers/iommu/iommufd/iommufd_private.h |  1 +
 include/uapi/linux/iommufd.h            | 12 ++++++------
 3 files changed, 29 insertions(+), 8 deletions(-)

Comments

Jason Gunthorpe Sept. 5, 2024, 3:56 p.m. UTC | #1
On Tue, Aug 27, 2024 at 09:59:41AM -0700, Nicolin Chen wrote:
> Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act like
> a nested parent HWPT to allocate a nested HWPT.
> 
> Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc.
> 
> Also, associate a viommu to an allocating nested HWPT.
> 
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>  drivers/iommu/iommufd/hw_pagetable.c    | 24 ++++++++++++++++++++++--
>  drivers/iommu/iommufd/iommufd_private.h |  1 +
>  include/uapi/linux/iommufd.h            | 12 ++++++------
>  3 files changed, 29 insertions(+), 8 deletions(-)

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Yi Liu Sept. 26, 2024, 8:50 a.m. UTC | #2
On 2024/8/28 00:59, Nicolin Chen wrote:
> Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act like
> a nested parent HWPT to allocate a nested HWPT.
> 
> Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc.
> 
> Also, associate a viommu to an allocating nested HWPT.

It is still not quite clear to me what the vIOMMU obj stands for. Here, it is
a wrapper of the s2 hwpt IIUC. But in the cover letter, a vIOMMU obj can be
instantiated per vIOMMU unit in the VM. Does it mean each vIOMMU of a VM can
only have one s2 HWPT?

> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
>   drivers/iommu/iommufd/hw_pagetable.c    | 24 ++++++++++++++++++++++--
>   drivers/iommu/iommufd/iommufd_private.h |  1 +
>   include/uapi/linux/iommufd.h            | 12 ++++++------
>   3 files changed, 29 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index c21bb59c4022..06adbcc304bc 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -57,6 +57,9 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
>   		container_of(obj, struct iommufd_hwpt_nested, common.obj);
>   
>   	__iommufd_hwpt_destroy(&hwpt_nested->common);
> +
> +	if (hwpt_nested->viommu)
> +		refcount_dec(&hwpt_nested->viommu->obj.users);
>   	refcount_dec(&hwpt_nested->parent->common.obj.users);
>   }
>   
> @@ -213,6 +216,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
>    */
>   static struct iommufd_hwpt_nested *
>   iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
> +			  struct iommufd_viommu *viommu,
>   			  struct iommufd_hwpt_paging *parent,
>   			  struct iommufd_device *idev, u32 flags,
>   			  const struct iommu_user_data *user_data)
> @@ -234,13 +238,16 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
>   		return ERR_CAST(hwpt_nested);
>   	hwpt = &hwpt_nested->common;
>   
> +	if (viommu)
> +		refcount_inc(&viommu->obj.users);
> +	hwpt_nested->viommu = viommu;
>   	refcount_inc(&parent->common.obj.users);
>   	hwpt_nested->parent = parent;
>   
>   	hwpt->domain = ops->domain_alloc_user(idev->dev,
>   					      flags & ~IOMMU_HWPT_FAULT_ID_VALID,
>   					      parent->common.domain,
> -					      NULL, user_data);
> +					      viommu, user_data);
>   	if (IS_ERR(hwpt->domain)) {
>   		rc = PTR_ERR(hwpt->domain);
>   		hwpt->domain = NULL;
> @@ -307,7 +314,7 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>   		struct iommufd_hwpt_nested *hwpt_nested;
>   
>   		hwpt_nested = iommufd_hwpt_nested_alloc(
> -			ucmd->ictx,
> +			ucmd->ictx, NULL,
>   			container_of(pt_obj, struct iommufd_hwpt_paging,
>   				     common.obj),
>   			idev, cmd->flags, &user_data);
> @@ -316,6 +323,19 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
>   			goto out_unlock;
>   		}
>   		hwpt = &hwpt_nested->common;
> +	} else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
> +		struct iommufd_hwpt_nested *hwpt_nested;
> +		struct iommufd_viommu *viommu;
> +
> +		viommu = container_of(pt_obj, struct iommufd_viommu, obj);
> +		hwpt_nested = iommufd_hwpt_nested_alloc(
> +			ucmd->ictx, viommu, viommu->hwpt, idev,
> +			cmd->flags, &user_data);
> +		if (IS_ERR(hwpt_nested)) {
> +			rc = PTR_ERR(hwpt_nested);
> +			goto out_unlock;
> +		}
> +		hwpt = &hwpt_nested->common;
>   	} else {
>   		rc = -EINVAL;
>   		goto out_put_pt;
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 154f7ba5f45c..1f2a1c133b9a 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -313,6 +313,7 @@ struct iommufd_hwpt_paging {
>   struct iommufd_hwpt_nested {
>   	struct iommufd_hw_pagetable common;
>   	struct iommufd_hwpt_paging *parent;
> +	struct iommufd_viommu *viommu;
>   };
>   
>   static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index ac77903b5cc4..51ce6a019c34 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -430,7 +430,7 @@ enum iommu_hwpt_data_type {
>    * @size: sizeof(struct iommu_hwpt_alloc)
>    * @flags: Combination of enum iommufd_hwpt_alloc_flags
>    * @dev_id: The device to allocate this HWPT for
> - * @pt_id: The IOAS or HWPT to connect this HWPT to
> + * @pt_id: The IOAS or HWPT or VIOMMU to connect this HWPT to
>    * @out_hwpt_id: The ID of the new HWPT
>    * @__reserved: Must be 0
>    * @data_type: One of enum iommu_hwpt_data_type
> @@ -449,11 +449,11 @@ enum iommu_hwpt_data_type {
>    * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
>    * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
>    *
> - * A user-managed nested HWPT will be created from a given parent HWPT via
> - * @pt_id, in which the parent HWPT must be allocated previously via the
> - * same ioctl from a given IOAS (@pt_id). In this case, the @data_type
> - * must be set to a pre-defined type corresponding to an I/O page table
> - * type supported by the underlying IOMMU hardware.
> + * A user-managed nested HWPT will be created from a given VIOMMU (wrapping a
> + * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be
> + * allocated previously via the same ioctl from a given IOAS (@pt_id). In this
> + * case, the @data_type must be set to a pre-defined type corresponding to an
> + * I/O page table type supported by the underlying IOMMU hardware.
>    *
>    * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
>    * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
Nicolin Chen Sept. 26, 2024, 8:10 p.m. UTC | #3
On Thu, Sep 26, 2024 at 04:50:46PM +0800, Yi Liu wrote:
> On 2024/8/28 00:59, Nicolin Chen wrote:
> > Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act like
> > a nested parent HWPT to allocate a nested HWPT.
> > 
> > Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc.
> > 
> > Also, associate a viommu to an allocating nested HWPT.
> 
> it still not quite clear to me what vIOMMU obj stands for. Here, it is a
> wrapper of s2 hpwt IIUC. But in the cover letter, vIOMMU obj can instanced
> per the vIOMMU units in VM.

Yea, the implementation in this version is merely a wrapper. I
had a general introduction of vIOMMU in the other reply. And I
will put something similar in the next version of the series,
so the idea would be bigger than a wrapper.

> Does it mean each vIOMMU of VM can only have
> one s2 HWPT?

Giving some examples here:
 - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
   kernel holding one S2 HWPT.
 - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
   kernel that can hold two different S2 HWPTs, or share one S2
   HWPT (saving memory).

Thanks
Nic
Tian, Kevin Sept. 27, 2024, 12:43 a.m. UTC | #4
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, September 27, 2024 4:11 AM
> 
> On Thu, Sep 26, 2024 at 04:50:46PM +0800, Yi Liu wrote:
> > On 2024/8/28 00:59, Nicolin Chen wrote:
> > > Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act
> like
> > > a nested parent HWPT to allocate a nested HWPT.
> > >
> > > Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its
> kdoc.
> > >
> > > Also, associate a viommu to an allocating nested HWPT.
> >
> > it still not quite clear to me what vIOMMU obj stands for. Here, it is a
> > wrapper of s2 hpwt IIUC. But in the cover letter, vIOMMU obj can instanced
> > per the vIOMMU units in VM.
> 
> Yea, the implementation in this version is merely a wrapper. I
> had a general introduction of vIOMMU in the other reply. And I
> will put something similar in the next version of the series,
> so the idea would be bigger than a wrapper.
> 
> > Does it mean each vIOMMU of VM can only have
> > one s2 HWPT?
> 
> Giving some examples here:
>  - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
>    kernel holding one S2 HWPT.
>  - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
>    kernel that can hold two different S2 HWPTs, or share one S2
>    HWPT (saving memory).
> 

this is not consistent with previous discussion.

even for 1 vIOMMU per VM there could be multiple vIOMMU objects
created in the kernel in case the devices connected to the VM-visible
vIOMMU are located behind different physical SMMUs.

we don't expect one vIOMMU object to span multiple physical ones.
Nicolin Chen Sept. 27, 2024, 1:25 a.m. UTC | #5
On Fri, Sep 27, 2024 at 12:43:16AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Friday, September 27, 2024 4:11 AM
> >
> > On Thu, Sep 26, 2024 at 04:50:46PM +0800, Yi Liu wrote:
> > > On 2024/8/28 00:59, Nicolin Chen wrote:
> > > > Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act
> > like
> > > > a nested parent HWPT to allocate a nested HWPT.
> > > >
> > > > Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its
> > kdoc.
> > > >
> > > > Also, associate a viommu to an allocating nested HWPT.
> > >
> > > it still not quite clear to me what vIOMMU obj stands for. Here, it is a
> > > wrapper of s2 hpwt IIUC. But in the cover letter, vIOMMU obj can instanced
> > > per the vIOMMU units in VM.
> >
> > Yea, the implementation in this version is merely a wrapper. I
> > had a general introduction of vIOMMU in the other reply. And I
> > will put something similar in the next version of the series,
> > so the idea would be bigger than a wrapper.
> >
> > > Does it mean each vIOMMU of VM can only have
> > > one s2 HWPT?
> >
> > Giving some examples here:
> >  - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
> >    kernel holding one S2 HWPT.
> >  - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
> >    kernel that can hold two different S2 HWPTs, or share one S2
> >    HWPT (saving memory).
> >
> 
> this is not consistent with previous discussion.
> 
> even for 1 vIOMMU per VM there could be multiple vIOMMU objects
> created in the kernel in case the devices connected to the VM-visible
> vIOMMU locate behind different physical SMMUs.
> 
> we don't expect one vIOMMU object to span multiple physical ones.

I think it's consistent, yet we had different perspectives for a
virtual IOMMU instance in the VM: Jason's suggested design for a
VM is to have 1-to-1 mapping between virtual IOMMU instances and
physical IOMMU instances. So, one vIOMMU is backed by one pIOMMU
only, i.e. one vIOMMU object in the kernel.

Your case seems to be the model where a VM has one giant virtual
IOMMU instance backed by multiple physical IOMMUs, in which case
all the passthrough devices, regardless of their associated pIOMMUs,
are connected to this shared virtual IOMMU. And yes, this shared
virtual IOMMU can have multiple vIOMMU objects.

Regarding these two models, I had listed their pros/cons at (2):
https://lore.kernel.org/qemu-devel/cover.1719361174.git.nicolinc@nvidia.com/

(Not 100% sure) VT-d might not have something like vCMDQ, so it
can stay in the shared model to simplify certain things, though
I feel it may face a similar problem, like having to map multiple
physical MMIO regions to a single virtual region (not doable!), if
some day Intel has a similar HW-accelerated feature?

Thanks
Nic
Tian, Kevin Sept. 27, 2024, 2:23 a.m. UTC | #6
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Friday, September 27, 2024 9:26 AM
> 
> On Fri, Sep 27, 2024 at 12:43:16AM +0000, Tian, Kevin wrote:
> > > From: Nicolin Chen <nicolinc@nvidia.com>
> > > Sent: Friday, September 27, 2024 4:11 AM
> > >
> > > On Thu, Sep 26, 2024 at 04:50:46PM +0800, Yi Liu wrote:
> > > > On 2024/8/28 00:59, Nicolin Chen wrote:
> > > > > Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can
> act
> > > like
> > > > > a nested parent HWPT to allocate a nested HWPT.
> > > > >
> > > > > Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update
> its
> > > kdoc.
> > > > >
> > > > > Also, associate a viommu to an allocating nested HWPT.
> > > >
> > > > it still not quite clear to me what vIOMMU obj stands for. Here, it is a
> > > > wrapper of s2 hpwt IIUC. But in the cover letter, vIOMMU obj can
> instanced
> > > > per the vIOMMU units in VM.
> > >
> > > Yea, the implementation in this version is merely a wrapper. I
> > > had a general introduction of vIOMMU in the other reply. And I
> > > will put something similar in the next version of the series,
> > > so the idea would be bigger than a wrapper.
> > >
> > > > Does it mean each vIOMMU of VM can only have
> > > > one s2 HWPT?
> > >
> > > Giving some examples here:
> > >  - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
> > >    kernel holding one S2 HWPT.
> > >  - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
> > >    kernel that can hold two different S2 HWPTs, or share one S2
> > >    HWPT (saving memory).
> > >
> >
> > this is not consistent with previous discussion.
> >
> > even for 1 vIOMMU per VM there could be multiple vIOMMU objects
> > created in the kernel in case the devices connected to the VM-visible
> > vIOMMU locate behind different physical SMMUs.
> >
> > we don't expect one vIOMMU object to span multiple physical ones.
> 
> I think it's consistent, yet we had different perspectives for a
> virtual IOMMU instance in the VM: Jason's suggested design for a
> VM is to have 1-to-1 mapping between virtual IOMMU instances and
> physical IOMMU instances. So, one vIOMMU is backed by one pIOMMU
> only, i.e. one vIOMMU object in the kernel.
> 
> Your case seems to be the model where a VM has one giant virtual
> IOMMU instance backed by multiple physical IOMMUs, in which case
> all the passthrough devices, regardless their associated pIOMMUs,
> are connected to this shared virtual IOMMU. And yes, this shared
> virtual IOMMU can have multiple vIOMMU objects.

yes.

sorry that I should not use "inconsistent" in the last reply. It's more
about completeness for what the design allows. 
Nicolin Chen Sept. 27, 2024, 2:57 a.m. UTC | #7
On Fri, Sep 27, 2024 at 02:23:16AM +0000, Tian, Kevin wrote:

> > > > > Does it mean each vIOMMU of VM can only have
> > > > > one s2 HWPT?
> > > >
> > > > Giving some examples here:
> > > >  - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
> > > >    kernel holding one S2 HWPT.
> > > >  - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
> > > >    kernel that can hold two different S2 HWPTs, or share one S2
> > > >    HWPT (saving memory).
> > > >
> > >
> > > this is not consistent with previous discussion.
> > >
> > > even for 1 vIOMMU per VM there could be multiple vIOMMU objects
> > > created in the kernel in case the devices connected to the VM-visible
> > > vIOMMU locate behind different physical SMMUs.
> > >
> > > we don't expect one vIOMMU object to span multiple physical ones.
> >
> > I think it's consistent, yet we had different perspectives for a
> > virtual IOMMU instance in the VM: Jason's suggested design for a
> > VM is to have 1-to-1 mapping between virtual IOMMU instances and
> > physical IOMMU instances. So, one vIOMMU is backed by one pIOMMU
> > only, i.e. one vIOMMU object in the kernel.
> >
> > Your case seems to be the model where a VM has one giant virtual
> > IOMMU instance backed by multiple physical IOMMUs, in which case
> > all the passthrough devices, regardless their associated pIOMMUs,
> > are connected to this shared virtual IOMMU. And yes, this shared
> > virtual IOMMU can have multiple vIOMMU objects.
> 
> yes.
> 
> sorry that I should not use "inconsistent" in the last reply. It's more
> about completeness for what the design allows. 
Yi Liu Sept. 27, 2024, 5:38 a.m. UTC | #8
On 2024/9/27 04:10, Nicolin Chen wrote:
> On Thu, Sep 26, 2024 at 04:50:46PM +0800, Yi Liu wrote:
>> On 2024/8/28 00:59, Nicolin Chen wrote:
>>> Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act like
>>> a nested parent HWPT to allocate a nested HWPT.
>>>
>>> Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc.
>>>
>>> Also, associate a viommu to an allocating nested HWPT.
>>
>> it still not quite clear to me what vIOMMU obj stands for. Here, it is a
>> wrapper of s2 hpwt IIUC. But in the cover letter, vIOMMU obj can instanced
>> per the vIOMMU units in VM.
> 
> Yea, the implementation in this version is merely a wrapper. I
> had a general introduction of vIOMMU in the other reply. And I
> will put something similar in the next version of the series,
> so the idea would be bigger than a wrapper.

yep. It would be good to see it. Otherwise, it is really confusing what
the vIOMMU obj exactly means in concept. :)

>> Does it mean each vIOMMU of VM can only have
>> one s2 HWPT?
> 
> Giving some examples here:
>   - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
>     kernel holding one S2 HWPT.
>   - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
>     kernel that can hold two different S2 HWPTs, or share one S2
>     HWPT (saving memory).

So if you have two devices assigned to a VM, then you may have two
vIOMMUs or one vIOMMU exposed to guest. This depends on whether the two
devices are behind the same physical IOMMU. If it's two vIOMMUs, the two
can share the s2 hwpt if their physical IOMMU is compatible. is it?

To achieve the above, you need to know the physical IOMMUs of the
assigned devices, and hence be able to tell whether the physical IOMMUs
are the same and whether they are compatible. How would userspace know
such info?
Nicolin Chen Sept. 27, 2024, 6:02 a.m. UTC | #9
On Fri, Sep 27, 2024 at 01:38:08PM +0800, Yi Liu wrote:
> > > Does it mean each vIOMMU of VM can only have
> > > one s2 HWPT?
> > 
> > Giving some examples here:
> >   - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
> >     kernel holding one S2 HWPT.
> >   - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
> >     kernel that can hold two different S2 HWPTs, or share one S2
> >     HWPT (saving memory).
> 
> So if you have two devices assigned to a VM, then you may have two
> vIOMMUs or one vIOMMU exposed to guest. This depends on whether the two
> devices are behind the same physical IOMMU. If it's two vIOMMUs, the two
> can share the s2 hwpt if their physical IOMMU is compatible. is it?

Yes.

> To achieve the above, you need to know if the physical IOMMUs of the
> assigned devices, hence be able to tell if physical IOMMUs are the
> same and if they are compatible. How would userspace know such infos?

My draft implementation with QEMU does something like this:
 - List all viommu-matched iommu nodes under /sys/class/iommu: LINKs
 - Get PCI device's /sys/bus/pci/devices/0000:00:00.0/iommu: LINK0
 - Compare the LINK0 against the LINKs

So far we don't have an ID for a physical IOMMU instance; if we did,
returning it via the hw_info call could be an alternative.

QEMU then does the routing to assign PCI buses and IORT (or DT).
This part is suggested now to move to libvirt though. So, I think
at the end of the day, libvirt would run the sys check and assign
a device to the corresponding pci bus backed by the correct IOMMU.

Here is an example where two devices behind iommu0 and a third
device behind iommu1 are assigned to a VM:
  -device pxb-pcie,id=pcie.viommu0,bus=pcie.0.... \   # bus for viommu0
  -device pxb-pcie,id=pcie.viommu1,bus=pcie.0.... \   # bus for viommu1
  -device pcie-root-port,id=pcie.viommu0p0,bus=pcie.viommu0... \
  -device pcie-root-port,id=pcie.viommu0p1,bus=pcie.viommu0... \
  -device pcie-root-port,id=pcie.viommu1p0,bus=pcie.viommu1... \
  -device vfio-pci,bus=pcie.viommu0p0... \  # connect to bus for viommu0
  -device vfio-pci,bus=pcie.viommu0p1... \  # connect to bus for viommu0
  -device vfio-pci,bus=pcie.viommu1p0...    # connect to bus for viommu1

To check compatibility for sharing a stage-2 HWPT, basically we would
attach the device to one of the stage-2 HWPTs from the list that the VMM
should keep. This attach runs all the compatibility tests, down to
the IOMMU driver. If it fails, just allocate a new stage-2 HWPT.

Thanks
Nic
Jason Gunthorpe Sept. 27, 2024, 11:59 a.m. UTC | #10
On Thu, Sep 26, 2024 at 11:02:37PM -0700, Nicolin Chen wrote:
> On Fri, Sep 27, 2024 at 01:38:08PM +0800, Yi Liu wrote:
> > > > Does it mean each vIOMMU of VM can only have
> > > > one s2 HWPT?
> > > 
> > > Giving some examples here:
> > >   - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
> > >     kernel holding one S2 HWPT.
> > >   - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
> > >     kernel that can hold two different S2 HWPTs, or share one S2
> > >     HWPT (saving memory).
> > 
> > So if you have two devices assigned to a VM, then you may have two
> > vIOMMUs or one vIOMMU exposed to guest. This depends on whether the two
> > devices are behind the same physical IOMMU. If it's two vIOMMUs, the two
> > can share the s2 hwpt if their physical IOMMU is compatible. is it?
> 
> Yes.
> 
> > To achieve the above, you need to know if the physical IOMMUs of the
> > assigned devices, hence be able to tell if physical IOMMUs are the
> > same and if they are compatible. How would userspace know such infos?
> 
> My draft implementation with QEMU does something like this:
>  - List all viommu-matched iommu nodes under /sys/class/iommu: LINKs
>  - Get PCI device's /sys/bus/pci/devices/0000:00:00.0/iommu: LINK0
>  - Compare the LINK0 against the LINKs
> 
> We so far don't have an ID for physical IOMMU instance, which can
> be an alternative to return via the hw_info call, otherwise.

We could return the sys/class/iommu string from some get_info or
something

> For compatibility to share a stage-2 HWPT, basically we would do
> a device attach to one of the stage-2 HWPT from the list that VMM
> should keep. This attach has all the compatibility test, down to
> the IOMMU driver. If it fails, just allocate a new stage-2 HWPT.

Ideally just creating the viommu should validate the passed in hwpt is
compatible without attaching.

Jason
Yi Liu Sept. 27, 2024, 12:12 p.m. UTC | #11
On 2024/9/27 14:02, Nicolin Chen wrote:
> On Fri, Sep 27, 2024 at 01:38:08PM +0800, Yi Liu wrote:
>>>> Does it mean each vIOMMU of VM can only have
>>>> one s2 HWPT?
>>>
>>> Giving some examples here:
>>>    - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
>>>      kernel holding one S2 HWPT.
>>>    - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
>>>      kernel that can hold two different S2 HWPTs, or share one S2
>>>      HWPT (saving memory).
>>
>> So if you have two devices assigned to a VM, then you may have two
>> vIOMMUs or one vIOMMU exposed to guest. This depends on whether the two
>> devices are behind the same physical IOMMU. If it's two vIOMMUs, the two
>> can share the s2 hwpt if their physical IOMMU is compatible. is it?
> 
> Yes.
> 
>> To achieve the above, you need to know if the physical IOMMUs of the
>> assigned devices, hence be able to tell if physical IOMMUs are the
>> same and if they are compatible. How would userspace know such infos?
> 
> My draft implementation with QEMU does something like this:
>   - List all viommu-matched iommu nodes under /sys/class/iommu: LINKs
>   - Get PCI device's /sys/bus/pci/devices/0000:00:00.0/iommu: LINK0
>   - Compare the LINK0 against the LINKs
> 
> We so far don't have an ID for physical IOMMU instance, which can
> be an alternative to return via the hw_info call, otherwise.

intel platform has a kind of ID for the physical IOMMUs.

ls /sys/class/iommu/
dmar0  dmar1  dmar10  dmar11  dmar12  dmar13  dmar14  dmar15  dmar16 
dmar17  dmar18  dmar19  dmar2  dmar3  dmar4  dmar5  dmar6  dmar7  dmar8 
dmar9  iommufd_selftest_iommu.0

> QEMU then does the routing to assign PCI buses and IORT (or DT).
> This part is suggested now to move to libvirt though. So, I think
> at the end of the day, libvirt would run the sys check and assign
> a device to the corresponding pci bus backed by the correct IOMMU.

and also give the correct viommu for the device.

> 
> This gives an example showing two devices behind iommu0 and third
> device behind iommu1 are assigned to a VM:
>    -device pxb-pcie.id=pcie.viommu0,bus=pcie.0.... \   # bus for viommu0
>    -device pxb-pcie.id=pcie.viommu1,bus=pcie.0.... \   # bus for viommu1
>    -device pcie-root-port,id=pcie.viommu0p0,bus=pcie.viommu0... \
>    -device pcie-root-port,id=pcie.viommu0p1,bus=pcie.viommu0... \
>    -device pcie-root-port,id=pcie.viommu1p0,bus=pcie.viommu1... \
>    -device vfio-pci,bus=pcie.viommu0p0... \  # connect to bus for viommu0
>    -device vfio-pci,bus=pcie.viommu0p1... \  # connect to bus for viommu0
>    -device vfio-pci,bus=pcie.viommu1p0...    # connect to bus for viommu1

is the viommu# an "-object" or just hints to describe the relationship
between device and viommu and build the IORT?

I'm considering what it would look like if the QEMU Intel vIOMMU is going
to use the viommu obj. Currently, we only support one virtual VT-d due to
some considerations like hot-plug. Per your conversation with Kevin, it
seems to be supported. So there is no strict connection between vIOMMU
and vIOMMU obj. But the vIOMMU obj can only be connected with one pIOMMU.
Right?

https://lore.kernel.org/linux-iommu/ZvYJl1AQWXWX0BQL@Asurada-Nvidia/

> For compatibility to share a stage-2 HWPT, basically we would do
> a device attach to one of the stage-2 HWPT from the list that VMM
> should keep. This attach has all the compatibility test, down to
> the IOMMU driver. If it fails, just allocate a new stage-2 HWPT.

yeah. I think this was covered by Zhenzhong's QEMU series.
Nicolin Chen Sept. 27, 2024, 7:52 p.m. UTC | #12
On Fri, Sep 27, 2024 at 08:59:25AM -0300, Jason Gunthorpe wrote:
> On Thu, Sep 26, 2024 at 11:02:37PM -0700, Nicolin Chen wrote:
> > On Fri, Sep 27, 2024 at 01:38:08PM +0800, Yi Liu wrote:
> > > > > Does it mean each vIOMMU of VM can only have
> > > > > one s2 HWPT?
> > > > 
> > > > Giving some examples here:
> > > >   - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
> > > >     kernel holding one S2 HWPT.
> > > >   - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
> > > >     kernel that can hold two different S2 HWPTs, or share one S2
> > > >     HWPT (saving memory).
> > > 
> > > So if you have two devices assigned to a VM, then you may have two
> > > vIOMMUs or one vIOMMU exposed to guest. This depends on whether the two
> > > devices are behind the same physical IOMMU. If it's two vIOMMUs, the two
> > > can share the s2 hwpt if their physical IOMMU is compatible. is it?
> > 
> > Yes.
> > 
> > > To achieve the above, you need to know if the physical IOMMUs of the
> > > assigned devices, hence be able to tell if physical IOMMUs are the
> > > same and if they are compatible. How would userspace know such infos?
> > 
> > My draft implementation with QEMU does something like this:
> >  - List all viommu-matched iommu nodes under /sys/class/iommu: LINKs
> >  - Get PCI device's /sys/bus/pci/devices/0000:00:00.0/iommu: LINK0
> >  - Compare the LINK0 against the LINKs
> > 
> > We so far don't have an ID for physical IOMMU instance, which can
> > be an alternative to return via the hw_info call, otherwise.
> 
> We could return the sys/class/iommu string from some get_info or
> something

I had a patch doing an ida alloc for each iommu_dev and returning
the ID via hw_info. It wasn't useful at that time, as we went for
fail-n-retry for S2 HWPT allocations on multi-pIOMMU platforms.

Perhaps that could be cleaner than returning a string?

> > For compatibility to share a stage-2 HWPT, basically we would do
> > a device attach to one of the stage-2 HWPT from the list that VMM
> > should keep. This attach has all the compatibility test, down to
> > the IOMMU driver. If it fails, just allocate a new stage-2 HWPT.
> 
> Ideally just creating the viommu should validate the passed in hwpt is
> compatible without attaching.

I think I should add a validation between hwpt->domain->owner and
dev_iommu_ops(idev->dev) then!

Thanks
Nicolin
Nicolin Chen Sept. 27, 2024, 8:29 p.m. UTC | #13
On Fri, Sep 27, 2024 at 08:12:40PM +0800, Yi Liu wrote:
> External email: Use caution opening links or attachments
> 
> 
> On 2024/9/27 14:02, Nicolin Chen wrote:
> > On Fri, Sep 27, 2024 at 01:38:08PM +0800, Yi Liu wrote:
> > > > > Does it mean each vIOMMU of VM can only have
> > > > > one s2 HWPT?
> > > > 
> > > > Giving some examples here:
> > > >    - If a VM has 1 vIOMMU, there will be 1 vIOMMU object in the
> > > >      kernel holding one S2 HWPT.
> > > >    - If a VM has 2 vIOMMUs, there will be 2 vIOMMU objects in the
> > > >      kernel that can hold two different S2 HWPTs, or share one S2
> > > >      HWPT (saving memory).
> > > 
> > > So if you have two devices assigned to a VM, then you may have two
> > > vIOMMUs or one vIOMMU exposed to guest. This depends on whether the two
> > > devices are behind the same physical IOMMU. If it's two vIOMMUs, the two
> > > can share the s2 hwpt if their physical IOMMU is compatible. is it?
> > 
> > Yes.
> > 
> > > To achieve the above, you need to know if the physical IOMMUs of the
> > > assigned devices, hence be able to tell if physical IOMMUs are the
> > > same and if they are compatible. How would userspace know such infos?
> > 
> > My draft implementation with QEMU does something like this:
> >   - List all viommu-matched iommu nodes under /sys/class/iommu: LINKs
> >   - Get PCI device's /sys/bus/pci/devices/0000:00:00.0/iommu: LINK0
> >   - Compare the LINK0 against the LINKs
> > 
> > We so far don't have an ID for physical IOMMU instance, which can
> > be an alternative to return via the hw_info call, otherwise.
> 
> intel platform has a kind of ID for the physical IOMMUs.
> 
> ls /sys/class/iommu/
> dmar0  dmar1  dmar10  dmar11  dmar12  dmar13  dmar14  dmar15  dmar16
> dmar17  dmar18  dmar19  dmar2  dmar3  dmar4  dmar5  dmar6  dmar7  dmar8
> dmar9  iommufd_selftest_iommu.0

Wow, that's a lot of IOMMU devices. I somehow had an impression
that Intel uses one physical IOMMU..

Yea, we need something in the core. I had one patch previously:
https://github.com/nicolinc/iommufd/commit/b7520901184fd9fa127abb88c1f0be16b9967cff

> > QEMU then does the routing to assign PCI buses and IORT (or DT).
> > This part is suggested now to move to libvirt though. So, I think
> > at the end of the day, libvirt would run the sys check and assign
> > a device to the corresponding pci bus backed by the correct IOMMU.
> 
> and also give the correct viommu for the device.

In this design, a pxb bus is exclusively created for a viommu
instance, meaning so long as a device is assigned to the correct
bus number, it'll be linked to the correct viommu.

> > This gives an example showing two devices behind iommu0 and third
> > device behind iommu1 are assigned to a VM:
> >    -device pxb-pcie.id=pcie.viommu0,bus=pcie.0.... \   # bus for viommu0
> >    -device pxb-pcie.id=pcie.viommu1,bus=pcie.0.... \   # bus for viommu1
> >    -device pcie-root-port,id=pcie.viommu0p0,bus=pcie.viommu0... \
> >    -device pcie-root-port,id=pcie.viommu0p1,bus=pcie.viommu0... \
> >    -device pcie-root-port,id=pcie.viommu1p0,bus=pcie.viommu1... \
> >    -device vfio-pci,bus=pcie.viommu0p0... \  # connect to bus for viommu0
> >    -device vfio-pci,bus=pcie.viommu0p1... \  # connect to bus for viommu0
> >    -device vfio-pci,bus=pcie.viommu1p0...    # connect to bus for viommu1
> 
> is the viommu# an "-object" or just hints to describe the relationship
> between device and viommu and build the IORT?

Yes. Eric actually suggested something better for the relationship
between pxb-pcie with viommu:

-device
pxb-pcie,bus_nr=100,id=pci.12,numa_node=0,bus=pcie.0,addr=0x3,iommu=<id>
from:
https://lore.kernel.org/qemu-devel/9c3e95c2-1035-4a55-89a3-97165ef32f18@redhat.com/

This would likely help the IORT or Device Tree building.

Currently, the ARM VIRT machine doesn't create a vSMMU via a "-device"
string, i.e. it is not a pluggable module yet. I recall Intel does. So,
you guys are one step ahead.

> I'm considering how it would look like if the QEMU Intel vIOMMU is going
> to use the viommu obj. Currently, we only support one virtual VT-d due to
> some considerations like hot-plug. Per your conversation with Kevin, it
> seems to be supported. So there is no strict connection between vIOMMU
> and vIOMMU obj. But the vIOMMU obj can only be connected with one pIOMMU.
> right?

Yes. Most of my earlier vSMMU versions did something similar, e.g.
one shared vSMMU instance in QEMU holding a list of S2 hwpts.
With this new iommufd viommu object, it would be a list of viommu
objs. Eric suggested that HostIOMMUDevice could store any pIOMMU
info. So, the compatibility check can be done with that (or the
old-fashioned way of trying a device attach).

The invalidation on the other hand needs to identify each trapped
invalidation request to distribute it to the correct viommu. This
is also one of the cons of this shared viommu model: invalidation
inefficiency -- there can be some cases where we fail to identify
which viommu to distribute so we have to broadcast to all viommus.
With a multi-viommu-instance model, invalidations are distributed
naturally by the guest kernel.

Thanks
Nicolin
diff mbox series

Patch

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index c21bb59c4022..06adbcc304bc 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -57,6 +57,9 @@  void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
 		container_of(obj, struct iommufd_hwpt_nested, common.obj);
 
 	__iommufd_hwpt_destroy(&hwpt_nested->common);
+
+	if (hwpt_nested->viommu)
+		refcount_dec(&hwpt_nested->viommu->obj.users);
 	refcount_dec(&hwpt_nested->parent->common.obj.users);
 }
 
@@ -213,6 +216,7 @@  iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
  */
 static struct iommufd_hwpt_nested *
 iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
+			  struct iommufd_viommu *viommu,
 			  struct iommufd_hwpt_paging *parent,
 			  struct iommufd_device *idev, u32 flags,
 			  const struct iommu_user_data *user_data)
@@ -234,13 +238,16 @@  iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 		return ERR_CAST(hwpt_nested);
 	hwpt = &hwpt_nested->common;
 
+	if (viommu)
+		refcount_inc(&viommu->obj.users);
+	hwpt_nested->viommu = viommu;
 	refcount_inc(&parent->common.obj.users);
 	hwpt_nested->parent = parent;
 
 	hwpt->domain = ops->domain_alloc_user(idev->dev,
 					      flags & ~IOMMU_HWPT_FAULT_ID_VALID,
 					      parent->common.domain,
-					      NULL, user_data);
+					      viommu, user_data);
 	if (IS_ERR(hwpt->domain)) {
 		rc = PTR_ERR(hwpt->domain);
 		hwpt->domain = NULL;
@@ -307,7 +314,7 @@  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 		struct iommufd_hwpt_nested *hwpt_nested;
 
 		hwpt_nested = iommufd_hwpt_nested_alloc(
-			ucmd->ictx,
+			ucmd->ictx, NULL,
 			container_of(pt_obj, struct iommufd_hwpt_paging,
 				     common.obj),
 			idev, cmd->flags, &user_data);
@@ -316,6 +323,19 @@  int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
 			goto out_unlock;
 		}
 		hwpt = &hwpt_nested->common;
+	} else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) {
+		struct iommufd_hwpt_nested *hwpt_nested;
+		struct iommufd_viommu *viommu;
+
+		viommu = container_of(pt_obj, struct iommufd_viommu, obj);
+		hwpt_nested = iommufd_hwpt_nested_alloc(
+			ucmd->ictx, viommu, viommu->hwpt, idev,
+			cmd->flags, &user_data);
+		if (IS_ERR(hwpt_nested)) {
+			rc = PTR_ERR(hwpt_nested);
+			goto out_unlock;
+		}
+		hwpt = &hwpt_nested->common;
 	} else {
 		rc = -EINVAL;
 		goto out_put_pt;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 154f7ba5f45c..1f2a1c133b9a 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -313,6 +313,7 @@  struct iommufd_hwpt_paging {
 struct iommufd_hwpt_nested {
 	struct iommufd_hw_pagetable common;
 	struct iommufd_hwpt_paging *parent;
+	struct iommufd_viommu *viommu;
 };
 
 static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index ac77903b5cc4..51ce6a019c34 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -430,7 +430,7 @@  enum iommu_hwpt_data_type {
  * @size: sizeof(struct iommu_hwpt_alloc)
  * @flags: Combination of enum iommufd_hwpt_alloc_flags
  * @dev_id: The device to allocate this HWPT for
- * @pt_id: The IOAS or HWPT to connect this HWPT to
+ * @pt_id: The IOAS or HWPT or VIOMMU to connect this HWPT to
  * @out_hwpt_id: The ID of the new HWPT
  * @__reserved: Must be 0
  * @data_type: One of enum iommu_hwpt_data_type
@@ -449,11 +449,11 @@  enum iommu_hwpt_data_type {
  * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
  * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
  *
- * A user-managed nested HWPT will be created from a given parent HWPT via
- * @pt_id, in which the parent HWPT must be allocated previously via the
- * same ioctl from a given IOAS (@pt_id). In this case, the @data_type
- * must be set to a pre-defined type corresponding to an I/O page table
- * type supported by the underlying IOMMU hardware.
+ * A user-managed nested HWPT will be created from a given VIOMMU (wrapping a
+ * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be
+ * allocated previously via the same ioctl from a given IOAS (@pt_id). In this
+ * case, the @data_type must be set to a pre-defined type corresponding to an
+ * I/O page table type supported by the underlying IOMMU hardware.
  *
  * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
  * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr